Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

465 class document uploader #470

Closed
wants to merge 12 commits into from
3 changes: 2 additions & 1 deletion app/main/checks/report_checks/banned_words_in_literature.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ def __init__(self, file_info, banned_words=["wikipedia"]):
self.literature_header = []
self.banned_words = [morph.normal_forms(word)[0] for word in banned_words]
self.name_pattern = r'список[ \t]*(использованных|использованной|)[ \t]*(источников|литературы)'
self.md_name_pattern = r'<h2>список[ \t]*(использованных|использованной|)[ \t]*(источников|литературы)</h2>'

def late_init_vkr(self):
self.literature_header = self.file.find_literature_vkr(self.file_type['report_type'])
Expand Down Expand Up @@ -83,6 +84,6 @@ def start_of_literature_chapter(self, ):
start_index = 0
for i in range(len(self.file.paragraphs)):
text_string = self.file.paragraphs[i].to_string().lower().split('\n')[1]
if re.fullmatch(self.name_pattern, text_string):
if re.fullmatch(f'{self.name_pattern}|{self.md_name_pattern}', text_string):
start_index = i
return start_index
5 changes: 4 additions & 1 deletion app/main/checks/report_checks/image_references.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,10 @@ def check(self):
def search_references(self):
array_of_references = set()
for i in range(0, self.last_child_number):
detected_references = re.findall(r'[Рр]ис\. [\d .,]+', self.file.paragraphs[i].paragraph_text)
if isinstance(self.file.paragraphs[i], str):
detected_references = re.findall(r'[Рр]ис\. [\d .,]+', self.file.paragraphs[i])
else:
detected_references = re.findall(r'[Рр]ис\. [\d .,]+', self.file.paragraphs[i].paragraph_text)
if detected_references:
for reference in detected_references:
for one_part in re.split(r'[Рр]ис\.|,| ', reference):
Expand Down
22 changes: 16 additions & 6 deletions app/main/checks/report_checks/literature_references.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ def __init__(self, file_info, min_ref=1, max_ref=1000):
self.headers = []
self.literature_header = []
self.name_pattern = r'список[ \t]*(использованных|использованной|)[ \t]*(источников|литературы)'
self.md_name_pattern = r"<h2>(Список использованных источников|Список использованной литературы)<\/h2>"
self.min_ref = min_ref
self.max_ref = max_ref

Expand Down Expand Up @@ -77,7 +78,10 @@ def check(self):
def search_references(self, start_par):
array_of_references = set()
for i in range(0, start_par):
detected_references = re.findall(r'\[[\d \-,]+\]', self.file.paragraphs[i].to_string().split('\n')[1])
if isinstance(self.file.paragraphs[i], str):
detected_references = re.findall(r'\[[\d \-,]+\]', self.file.paragraphs[i])
else:
detected_references = re.findall(r'\[[\d \-,]+\]', self.file.paragraphs[i].to_string().split('\n')[1])
if detected_references:
for reference in detected_references:
for one_part in re.split(r'[\[\],]', reference):
Expand All @@ -92,10 +96,16 @@ def search_references(self, start_par):
def find_start_paragraph(self):
    """Return the index of the first paragraph that is the bibliography heading.

    Paragraphs are read from ``self.file.paragraphs``. Two paragraph kinds
    are handled:

    * ``str`` items (the branch introduced together with ``md_name_pattern``)
      are matched as a whole against ``self.md_name_pattern``;
    * any other item is expected to provide ``to_string()``; the second line
      of that string is matched against ``self.name_pattern``.

    Matching is case-insensitive. The text is lowercased as before, and the
    pattern is additionally compiled with ``re.IGNORECASE`` because
    ``md_name_pattern`` contains capital letters and therefore could never
    match lowercased text on its own.

    Returns:
        The index of the first matching paragraph, or ``0`` when no paragraph
        matches (the same fallback value the original code used).
    """
    for index, paragraph in enumerate(self.file.paragraphs):
        if isinstance(paragraph, str):
            text = paragraph
            pattern = self.md_name_pattern
        else:
            # The heading text is taken from the second line of to_string().
            text = paragraph.to_string().split('\n')[1]
            pattern = self.name_pattern
        if re.fullmatch(pattern, text.lower(), re.IGNORECASE):
            return index
    return 0

def count_sources_vkr(self, header):
Expand Down Expand Up @@ -142,4 +152,4 @@ def search_literature_start_pdf(self):
if re.search('приложение а[\n .]', lowercase_str):
end_page = i
break
return start_page, end_page
return start_page, end_page
5 changes: 4 additions & 1 deletion app/main/checks/report_checks/table_references.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,10 @@ def check(self):
def search_references(self):
array_of_references = set()
for i in range(0, self.last_child_number):
detected_references = re.findall(r'таблиц[аеыу][\d .]+', self.file.paragraphs[i].paragraph_text)
if isinstance(self.file.paragraphs[i], str):
detected_references = re.findall(r'таблиц[аеыу][\d .]+', self.file.paragraphs[i])
else:
detected_references = re.findall(r'таблиц[аеыу][\d .]+', self.file.paragraphs[i].paragraph_text)
if detected_references:
for reference in detected_references:
for one_part in re.split(r'таблиц[аеыу]| ', reference):
Expand Down
12 changes: 11 additions & 1 deletion app/main/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@

from main.presentations import PresentationPPTX
from main.reports.docx_uploader import DocxUploader
from main.reports.md_uploader import MdUpload
from utils import convert_to

logger = logging.getLogger('root_logger')
Expand All @@ -19,15 +20,24 @@ def parse(filepath, pdf_filepath):
logger.info(f"Презентация {filepath} старого формата. Временно преобразована в pptx для обработки.")
new_filepath = convert_to(filepath, target_format='pptx')
file_object = PresentationPPTX(new_filepath)
elif tmp_filepath.endswith(('.doc', '.odt', '.docx')):
elif tmp_filepath.endswith(('.doc', '.odt', '.docx', )):
new_filepath = filepath
if tmp_filepath.endswith(('.doc', '.odt')):
logger.info(f"Отчёт {filepath} старого формата. Временно преобразован в docx для обработки.")
new_filepath = convert_to(filepath, target_format='docx')

docx = DocxUploader()
docx.upload(new_filepath, pdf_filepath)
docx.parse()
file_object = docx

elif tmp_filepath.endswith('.md' ):
new_filepath = filepath
doc = MdUpload(new_filepath)
md_text = doc.upload()
doc.parse(md_text)
file_object = doc

else:
raise ValueError("Файл с недопустимым именем или недопустимого формата: " + filepath)
# Если была конвертация, то удаляем временный файл.
Expand Down
7 changes: 1 addition & 6 deletions app/main/reports/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -66,11 +66,6 @@ Proof-of-concept парсинг файлов `.docx` с выводом стру
$ python3 -m app.main.mse22.pdf_document text_from_pages --filename path_to_file
```

## `MD`

Парсинг файлов `.md` с выводом структуры файла в текстовом виде в stdout.

```bash
$ python3 -m app.main.reports.md_uploader md_parser --mdfile path_to_md_file
```

```
35 changes: 35 additions & 0 deletions app/main/reports/document_uploader.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
from abc import ABC, abstractmethod

class DocumentUploader(ABC):
    """Abstract base class listing the methods a document uploader must provide.

    Every method below is abstract, so a subclass can only be instantiated
    once it overrides all of them. The bodies here do nothing and return
    ``None``.
    """

    @abstractmethod
    def upload(self):
        """Load the source document into the uploader.

        NOTE(review): the call site in ``parse`` invokes
        ``DocxUploader.upload(new_filepath, pdf_filepath)`` with two
        arguments, while this declaration takes none — confirm the
        intended signature.
        """

    @abstractmethod
    def parse(self):
        """Parse the previously uploaded document.

        What state is populated is defined by the concrete uploader.
        """

    @abstractmethod
    def parse_effective_styles(self):
        """Presumably resolves the effective styles of the document — confirm
        against the concrete implementations."""

    @abstractmethod
    def page_counter(self):
        """Presumably reports the number of pages in the document — confirm
        against the concrete implementations."""

    @abstractmethod
    def make_headers(self, work_type):
        """Presumably builds the header list for the given ``work_type`` —
        confirm against the concrete implementations."""

    @abstractmethod
    def make_chapters(self, work_type):
        """Presumably builds the chapter list for the given ``work_type`` —
        confirm against the concrete implementations."""

    @abstractmethod
    def find_header_page(self, work_type):
        """Presumably locates the page(s) of headers for the given
        ``work_type`` — confirm against the concrete implementations."""

    @abstractmethod
    def find_literature_vkr(self, work_type):
        """Locate the literature section for the given ``work_type``.

        Checks call this as ``self.file.find_literature_vkr(report_type)``
        and store the result as the literature header.
        """
3 changes: 2 additions & 1 deletion app/main/reports/docx_uploader/docx_uploader.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,10 @@
from .style import Style
from .table import Table, Cell
from ..pdf_document.pdf_document_manager import PdfDocumentManager
from ..document_uploader import DocumentUploader


class DocxUploader:
class DocxUploader(DocumentUploader):
def __init__(self):
self.inline_shapes = []
self.core_properties = None
Expand Down
Loading
Loading