moevm · MarinaProsche · Oct 5, 2023 · Oct 5, 2023 · Oct 9, 2023 · Oct 9, 2023
diff --git a/app/main/checks/report_checks/banned_words_in_literature.py b/app/main/checks/report_checks/banned_words_in_literature.py
@@ -13,6 +13,7 @@ def __init__(self, file_info, banned_words=["wikipedia"]):
         self.literature_header = []
         self.banned_words = [morph.normal_forms(word)[0] for word in banned_words]
         self.name_pattern = r'список[ \t]*(использованных|использованной|)[ \t]*(источников|литературы)'
+        self.md_name_pattern = r'<h2>список[ \t]*(использованных|использованной|)[ \t]*(источников|литературы)</h2>'
 
     def late_init_vkr(self):
         self.literature_header = self.file.find_literature_vkr(self.file_type['report_type'])
@@ -83,6 +84,6 @@ def start_of_literature_chapter(self, ):
         start_index = 0
         for i in range(len(self.file.paragraphs)):
             text_string = self.file.paragraphs[i].to_string().lower().split('\n')[1]
-            if re.fullmatch(self.name_pattern, text_string):
+            if re.fullmatch(f'{self.name_pattern}|{self.md_name_pattern}', text_string):
                 start_index = i
         return start_index
diff --git a/app/main/checks/report_checks/image_references.py b/app/main/checks/report_checks/image_references.py
@@ -63,7 +63,10 @@ def check(self):
     def search_references(self):
         array_of_references = set()
         for i in range(0, self.last_child_number):
-            detected_references = re.findall(r'[Рр]ис\. [\d .,]+', self.file.paragraphs[i].paragraph_text)
+            if isinstance(self.file.paragraphs[i], str):
+                detected_references = re.findall(r'[Рр]ис\. [\d .,]+', self.file.paragraphs[i])
+            else:    
+                detected_references = re.findall(r'[Рр]ис\. [\d .,]+', self.file.paragraphs[i].paragraph_text)
             if detected_references:
                 for reference in detected_references:
                     for one_part in re.split(r'[Рр]ис\.|,| ', reference):

diff --git a/app/main/checks/report_checks/literature_references.py b/app/main/checks/report_checks/literature_references.py
@@ -12,6 +12,7 @@ def __init__(self, file_info, min_ref=1, max_ref=1000):
         self.headers = []
         self.literature_header = []
         self.name_pattern = r'список[ \t]*(использованных|использованной|)[ \t]*(источников|литературы)'
+        self.md_name_pattern = r"<h2>(Список использованных источников|Список использованной литературы)<\/h2>"
         self.min_ref = min_ref
         self.max_ref = max_ref
 
@@ -77,7 +78,10 @@ def check(self):
     def search_references(self, start_par):
         array_of_references = set()
         for i in range(0, start_par):
-            detected_references = re.findall(r'\[[\d \-,]+\]', self.file.paragraphs[i].to_string().split('\n')[1])
+            if isinstance(self.file.paragraphs[i], str):
+                detected_references = re.findall(r'\[[\d \-,]+\]', self.file.paragraphs[i])
+            else:    
+                detected_references = re.findall(r'\[[\d \-,]+\]', self.file.paragraphs[i].to_string().split('\n')[1])
             if detected_references:
                 for reference in detected_references:
                     for one_part in re.split(r'[\[\],]', reference):
@@ -92,10 +96,16 @@ def search_references(self, start_par):
     def find_start_paragraph(self):
+        """Return the index of the paragraph holding the literature-list heading.
+
+        Paragraphs that are plain ``str`` objects are matched against
+        ``self.md_name_pattern``; every other paragraph is matched against
+        ``self.name_pattern`` using the second line of its ``to_string()``
+        output. Scanning stops at the first paragraph that matches.
+
+        Returns:
+            Index of the first matching paragraph, or 0 when nothing matches
+            (which is indistinguishable from a match at index 0).
+        """
         start_index = 0
         for i in range(len(self.file.paragraphs)):
-            text_string = self.file.paragraphs[i].to_string().lower().split('\n')[1]
-            if re.fullmatch(self.name_pattern, text_string):
-                start_index = i
-                break
+            paragraph = self.file.paragraphs[i]
+            if isinstance(paragraph, str):
+                pattern = self.md_name_pattern
+                text_string = paragraph.lower()
+            else:
+                pattern = self.name_pattern
+                text_string = paragraph.to_string().lower().split('\n')[1]
+            # The text is lower-cased, but a pattern may contain upper-case
+            # letters; without IGNORECASE such a pattern could never match.
+            if re.fullmatch(pattern, text_string, re.IGNORECASE):
+                start_index = i
+                break
         return start_index
 
     def count_sources_vkr(self, header):
@@ -142,4 +152,4 @@ def search_literature_start_pdf(self):
             if re.search('приложение а[\n .]', lowercase_str):
                 end_page = i
                 break
-        return start_page, end_page
+        return start_page, end_page
diff --git a/app/main/checks/report_checks/table_references.py b/app/main/checks/report_checks/table_references.py
@@ -63,7 +63,10 @@ def check(self):
     def search_references(self):
         array_of_references = set()
         for i in range(0, self.last_child_number):
-            detected_references = re.findall(r'таблиц[аеыу][\d .]+', self.file.paragraphs[i].paragraph_text)
+            if  isinstance(self.file.paragraphs[i], str):
+                detected_references = re.findall(r'таблиц[аеыу][\d .]+', self.file.paragraphs[i])
+            else:    
+                detected_references = re.findall(r'таблиц[аеыу][\d .]+', self.file.paragraphs[i].paragraph_text)
             if detected_references:
                 for reference in detected_references:
                     for one_part in re.split(r'таблиц[аеыу]| ', reference):

diff --git a/app/main/parser.py b/app/main/parser.py
@@ -5,6 +5,7 @@
 
 from main.presentations import PresentationPPTX
 from main.reports.docx_uploader import DocxUploader
+from main.reports.md_uploader import MdUpload
 from utils import convert_to
 
 logger = logging.getLogger('root_logger')
@@ -19,15 +20,24 @@ def parse(filepath, pdf_filepath):
                 logger.info(f"Презентация {filepath} старого формата. Временно преобразована в pptx для обработки.")
                 new_filepath = convert_to(filepath, target_format='pptx')
             file_object = PresentationPPTX(new_filepath)
-        elif tmp_filepath.endswith(('.doc', '.odt', '.docx')):
+        elif tmp_filepath.endswith(('.doc', '.odt', '.docx', )):
             new_filepath = filepath
             if tmp_filepath.endswith(('.doc', '.odt')):
                 logger.info(f"Отчёт {filepath} старого формата. Временно преобразован в docx для обработки.")
                 new_filepath = convert_to(filepath, target_format='docx')
+
             docx = DocxUploader()
             docx.upload(new_filepath, pdf_filepath)
             docx.parse()
             file_object = docx
+
+        elif tmp_filepath.endswith('.md' ):
+            new_filepath = filepath
+            doc = MdUpload(new_filepath)
+            md_text = doc.upload()
+            doc.parse(md_text)
+            file_object = doc
+
         else:
             raise ValueError("Файл с недопустимым именем или недопустимого формата: " + filepath)
         # Если была конвертация, то удаляем временный файл.

diff --git a/app/main/reports/README.md b/app/main/reports/README.md
@@ -66,11 +66,6 @@ Proof-of-concept парсинг файлов `.docx` с выводом стру
 $ python3 -m app.main.mse22.pdf_document text_from_pages --filename path_to_file
 ```
 
-## `MD`
-
-Парсинг файлов `.md` с выводом структуры файла в текстовом виде в stdout.
-
 ```bash
 $ python3 -m app.main.reports.md_uploader md_parser --mdfile path_to_md_file
-```
-
+```
diff --git a/app/main/reports/document_uploader.py b/app/main/reports/document_uploader.py
@@ -0,0 +1,35 @@
+from abc import ABC, abstractmethod
+
+class DocumentUploader(ABC):
+    """Abstract interface for report uploaders.
+
+    Every method is abstract, so a subclass can be instantiated only after
+    overriding all of them. The stubs themselves do nothing and return None.
+
+    NOTE(review): parser.py calls ``upload``/``parse`` on concrete uploaders
+    with differing arguments (e.g. ``upload(path, pdf_path)``,
+    ``parse(md_text)``), which the zero-argument signatures declared here do
+    not reflect - confirm the intended contract.
+    """
+
+    @abstractmethod
+    def upload(self):
+        """Abstract hook; presumably loads the source document - see subclasses."""
+
+    @abstractmethod
+    def parse(self):
+        """Abstract hook; presumably builds the parsed structure - see subclasses."""
+
+    @abstractmethod
+    def parse_effective_styles(self):
+        """Abstract hook; behaviour is defined by the concrete uploader."""
+
+    @abstractmethod
+    def page_counter(self):
+        """Abstract hook; presumably reports a page count - TODO confirm."""
+
+    @abstractmethod
+    def make_headers(self, work_type):
+        """Abstract hook taking ``work_type``; defined by the concrete uploader."""
+
+    @abstractmethod
+    def make_chapters(self, work_type):
+        """Abstract hook taking ``work_type``; defined by the concrete uploader."""
+
+    @abstractmethod
+    def find_header_page(self, work_type):
+        """Abstract hook taking ``work_type``; defined by the concrete uploader."""
+
+    @abstractmethod
+    def find_literature_vkr(self, work_type):
+        """Abstract hook taking ``work_type``; defined by the concrete uploader."""
diff --git a/app/main/reports/docx_uploader/docx_uploader.py b/app/main/reports/docx_uploader/docx_uploader.py
@@ -9,9 +9,10 @@
 from .style import Style
 from .table import Table, Cell
 from ..pdf_document.pdf_document_manager import PdfDocumentManager
+from ..document_uploader import DocumentUploader
 
 
-class DocxUploader:
+class DocxUploader(DocumentUploader):
     def __init__(self):
         self.inline_shapes = []
         self.core_properties = None