Skip to content

Commit

Permalink
refactoring && bug fixes
Browse files Browse the repository at this point in the history
  • Loading branch information
blindsphynx committed Dec 7, 2023
1 parent c9c3694 commit 494cb39
Showing 1 changed file with 1 addition and 16 deletions.
17 changes: 1 addition & 16 deletions app/main/reports/parse_file/parse_file.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,8 @@
from app.main.reports.docx_uploader import DocxUploader
import re


def parse_headers_and_pages(chapters, docx):
text_on_page = docx.pdf_file.get_text_on_page()

for page, text in text_on_page.items():
text = re.sub(r"(-\n)", "", text)
text = re.sub(r"\s\n", " ", text)
Expand All @@ -13,7 +11,7 @@ def parse_headers_and_pages(chapters, docx):
for chapter in chapters:
if chapter["header"] in text:
chapter["start_page"] = page
return chapters # 1.3, 1.4, 3.4.2
return chapters


def parse_chapters(docx):
Expand All @@ -28,16 +26,3 @@ def parse_chapters(docx):
temp_text += chapter["child"][i]["styled_text"]["text"]
chapters.append({"header": head, "start_page": 0, "text": temp_text})
return chapters


if __name__ == "__main__":
    # Manual debug run: feed one local .docx file through the uploader,
    # build the chapter list, attach start pages, and print part of it.
    docx_path = "/home/vilka/Downloads/petrov.docx"

    uploader = DocxUploader()
    uploader.upload(docx_path)
    uploader.parse()
    # Both structure-building passes are invoked with the same "VKR" argument.
    uploader.make_chapters("VKR")
    uploader.make_headers("VKR")

    located = parse_headers_and_pages(parse_chapters(uploader), uploader)
    # Only the first half of the resulting list is printed.
    for entry in located[: len(located) // 2]:
        print(entry)

0 comments on commit 494cb39

Please sign in to comment.