Skip to content

Commit

Permalink
TextStorage class
Browse files Browse the repository at this point in the history
  • Loading branch information
blindsphynx committed Nov 23, 2023
1 parent 342728e commit 9a927bb
Showing 1 changed file with 40 additions and 0 deletions.
40 changes: 40 additions & 0 deletions app/main/reports/text_storage/text_storage.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
from app.main.reports.docx_uploader import DocxUploader


class TextStorage:
    """Collect chapter headers, chapter text and PDF start pages for a report.

    The document is loaded and parsed through ``DocxUploader`` at construction
    time. ``parse_chapters`` must be called before ``parse_headers_and_pages``,
    because the latter only annotates entries already in ``self.chapters``.
    """

    def __init__(self, filepath: str, pdf_filepath: str = '') -> None:
        """Upload and parse the document and pre-build its chapter structure.

        Args:
            filepath: Path to the .docx file handed to ``DocxUploader.upload``.
            pdf_filepath: Path to the PDF rendering of the same document,
                passed through to ``DocxUploader.upload`` unchanged.
        """
        # Filled by parse_chapters(): dicts with "header", "start_page", "text".
        self.chapters = []
        # Not populated anywhere in this class; kept for interface compatibility.
        self.headers = []
        self.docx = DocxUploader()
        self.docx.upload(filepath, pdf_filepath)
        self.docx.parse()
        # NOTE(review): "VKR" is presumably a document-type key understood by
        # DocxUploader -- confirm against make_chapters / make_headers.
        self.docx.chapters = self.docx.make_chapters("VKR")
        self.docx.headers = self.docx.make_headers("VKR")

    def parse_headers_and_pages(self) -> None:
        """Set ``start_page`` on each chapter whose header occurs in a PDF page.

        Pages are visited in the mapping's iteration order. Every page whose
        text contains the header overwrites ``start_page``, so the last
        matching page wins.
        """
        # NOTE(review): last-match-wins presumably skips an earlier table of
        # contents hit -- confirm this is the intended meaning of start_page.
        text_on_page = self.docx.pdf_file.get_text_on_page()
        # Iterate key/text pairs directly: looking the key up again through
        # values.index(text) returns the FIRST page with identical text, which
        # is the wrong page whenever two pages share the same text.
        for page, page_text in text_on_page.items():
            for chapter in self.chapters:
                if chapter["header"] in page_text:
                    chapter["start_page"] = page

    def parse_chapters(self) -> None:
        """Append one entry to ``self.chapters`` per non-empty heading chapter.

        A source chapter qualifies when it has at least one child and its
        style contains ``"heading"``. The entry's text is the concatenation of
        the children's texts; ``start_page`` starts at 0 until
        ``parse_headers_and_pages`` fills it in.
        """
        for chapter in self.docx.chapters:
            children = chapter["child"]
            # Truthiness also skips a None child list instead of failing on it.
            if not children or "heading" not in chapter["style"]:
                continue
            body = "".join(child["styled_text"]["text"] for child in children)
            self.chapters.append({
                "header": chapter["styled_text"]["text"],
                "start_page": 0,
                "text": body,
            })


if __name__ == "__main__":
    # Manual smoke run against a local sample report: build the storage,
    # extract chapters, resolve their start pages, then dump every entry.
    source_docx = "/home/vilka/Downloads/petrov.docx"
    source_pdf = "/home/vilka/Downloads/petrov.pdf"

    storage = TextStorage(source_docx, source_pdf)
    # Chapters must exist before pages can be attached to them.
    storage.parse_chapters()
    storage.parse_headers_and_pages()

    for entry in storage.chapters:
        print(entry)

0 comments on commit 9a927bb

Please sign in to comment.