Skip to content

Commit

Permalink
refactoring && bug fixes
Browse files: browse the repository at this point in the history
  • Loading branch information
blindsphynx committed Dec 7, 2023
1 parent 433c729 commit c9c3694
Showing 1 changed file with 27 additions and 8 deletions.
35 changes: 27 additions & 8 deletions app/main/reports/parse_file/parse_file.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,19 @@
from app.main.reports.docx_uploader import DocxUploader
import re


def parse_headers_and_pages(chapters, docx):
    """Record, for each chapter, the page on which its header text occurs.

    Args:
        chapters: list of dicts that each have a ``"header"`` string; a
            ``"start_page"`` key is written on every chapter whose header
            is found.
        docx: object exposing ``pdf_file.get_text_on_page()``, which must
            return a mapping of page -> text of that page.

    Returns:
        The same ``chapters`` list, mutated in place.

    Notes:
        * If a header occurs on several pages, the last matching page in
          the mapping's iteration order is the one that is kept.
        * Chapters whose header is never found are left untouched.
    """
    text_on_page = docx.pdf_file.get_text_on_page()

    for page, text in text_on_page.items():
        # Re-join words that were hyphenated across a line break.
        text = re.sub(r"(-\n)", "", text)
        # Turn "whitespace + newline" into a single space so that a header
        # wrapped over several lines can still match as one string.
        text = re.sub(r"\s\n", " ", text)
        # "СОДЕРЖАНИЕ" is Russian for "contents"; presumably the table of
        # contents lists every header and would produce false matches, so
        # that page is skipped — TODO confirm.
        if "СОДЕРЖАНИЕ" in text:
            continue
        for chapter in chapters:
            if chapter["header"] in text:
                chapter["start_page"] = page
    # NOTE(review): the original carried the trailing note "1.3, 1.4, 3.4.2"
    # here — presumably section numbers whose headers are still not matched;
    # confirm with the author.
    return chapters


def parse_chapters(docx):
Expand All @@ -20,5 +26,18 @@ def parse_chapters(docx):
temp_text = ""
for i in range(len(chapter["child"])):
temp_text += chapter["child"][i]["styled_text"]["text"]
chapters.append({"header": head, "start_page": 0, "text": temp_text})
chapters.append({"header": head, "start_page": 0, "text": temp_text})
return chapters


if __name__ == "__main__":
    # Manual smoke run: upload and parse a .docx, build chapters and headers,
    # then print the chapters together with the pages found for them.
    import sys

    # The document path may be passed as the first command-line argument; the
    # original developer-local path stays as the fallback so existing runs
    # behave as before.
    file = sys.argv[1] if len(sys.argv) > 1 else "/home/vilka/Downloads/petrov.docx"
    parsed_file = DocxUploader()
    parsed_file.upload(file)
    parsed_file.parse()
    parsed_file.make_chapters("VKR")
    parsed_file.make_headers("VKR")
    chapters = parse_chapters(parsed_file)
    chapters_with_headers = parse_headers_and_pages(chapters, parsed_file)
    # Only the first half of the chapters is printed, exactly as the original
    # index loop did.
    for chapter in chapters_with_headers[: len(chapters_with_headers) // 2]:
        print(chapter)

0 comments on commit c9c3694

Please sign in to comment.