From 423c3bfb79975bb1740c0b5cfb9002c2d6cc06d4 Mon Sep 17 00:00:00 2001 From: Marina Date: Thu, 5 Oct 2023 14:28:02 +0300 Subject: [PATCH 01/10] change parser --- app/main/parser.py | 12 +++- app/main/reports/document_uploader.py | 19 ++++++ .../reports/docx_uploader/docx_uploader.py | 3 +- app/main/reports/md_uploader/__init__.py | 1 + app/main/reports/md_uploader/__main__.py | 21 +++++++ app/main/reports/md_uploader/md_uploader.py | 63 +++++++++++++++++++ app/server.py | 2 +- requirements.txt | 1 + 8 files changed, 119 insertions(+), 3 deletions(-) create mode 100644 app/main/reports/document_uploader.py create mode 100644 app/main/reports/md_uploader/__init__.py create mode 100644 app/main/reports/md_uploader/__main__.py create mode 100644 app/main/reports/md_uploader/md_uploader.py diff --git a/app/main/parser.py b/app/main/parser.py index 5cf671de..b185af4c 100644 --- a/app/main/parser.py +++ b/app/main/parser.py @@ -5,6 +5,7 @@ from main.presentations import PresentationPPTX from main.reports.docx_uploader import DocxUploader +from main.reports.md_uploader import MdUpload from utils import convert_to logger = logging.getLogger('root_logger') @@ -19,15 +20,24 @@ def parse(filepath, pdf_filepath): logger.info(f"Презентация {filepath} старого формата. Временно преобразована в pptx для обработки.") new_filepath = convert_to(filepath, target_format='pptx') file_object = PresentationPPTX(new_filepath) - elif tmp_filepath.endswith(('.doc', '.odt', '.docx')): + elif tmp_filepath.endswith(('.doc', '.odt', '.docx', )): new_filepath = filepath if tmp_filepath.endswith(('.doc', '.odt')): logger.info(f"Отчёт {filepath} старого формата. Временно преобразован в docx для обработки.") new_filepath = convert_to(filepath, target_format='docx') + docx = DocxUploader() docx.upload(new_filepath, pdf_filepath) docx.parse() file_object = docx + + elif tmp_filepath.endswith('.md' ): + new_filepath = filepath + doc = MdUpload(new_filepath) + md_text = doc.upload() + doc.parse(md_text) + file_object = doc + else: raise ValueError("Файл с недопустимым именем или недопустимого формата: " + filepath) # Если была конвертация, то удаляем временный файл. diff --git a/app/main/reports/document_uploader.py b/app/main/reports/document_uploader.py new file mode 100644 index 00000000..a9455545 --- /dev/null +++ b/app/main/reports/document_uploader.py @@ -0,0 +1,19 @@ +from abc import ABC, abstractmethod + +class DocumentUploader(ABC): + + @abstractmethod + def upload(self): + pass + + @abstractmethod + def parse(self): + pass + + # @abstractmethod + # def make_chapters(self): + # pass + + # @abstractmethod + # def make_headers(self): + # pass diff --git a/app/main/reports/docx_uploader/docx_uploader.py b/app/main/reports/docx_uploader/docx_uploader.py index 14efdc05..93e5e7a3 100644 --- a/app/main/reports/docx_uploader/docx_uploader.py +++ b/app/main/reports/docx_uploader/docx_uploader.py @@ -9,9 +9,10 @@ from .style import Style from .table import Table, Cell from ..pdf_document.pdf_document_manager import PdfDocumentManager +from ..document_uploader import DocumentUploader -class DocxUploader: +class DocxUploader(DocumentUploader): def __init__(self): self.inline_shapes = [] self.core_properties = None diff --git a/app/main/reports/md_uploader/__init__.py b/app/main/reports/md_uploader/__init__.py new file mode 100644 index 00000000..31cb93c1 --- /dev/null +++ b/app/main/reports/md_uploader/__init__.py @@ -0,0 +1 @@ +from .md_uploader import MdUpload diff --git a/app/main/reports/md_uploader/__main__.py b/app/main/reports/md_uploader/__main__.py new file mode 100644 index 00000000..f69ae975 --- /dev/null +++ b/app/main/reports/md_uploader/__main__.py @@ -0,0 +1,21 @@ +import argparse + +from .md_uploader import main as md_uploader_main + + +def parse_args(): + parser = argparse.ArgumentParser(description='File md parser') + subparsers = parser.add_subparsers() + md_parser = subparsers.add_parser('md_parser', help='md document') + md_parser.add_argument('--mdfile', type=str, required=True, help='path to md file') + md_parser.set_defaults(func=md_uploader_main) + return parser.parse_args() + + +def main(): + args = parse_args() + args.func(args) + + +if __name__ == '__main__': + main() \ No newline at end of file diff --git a/app/main/reports/md_uploader/md_uploader.py b/app/main/reports/md_uploader/md_uploader.py new file mode 100644 index 00000000..5380864e --- /dev/null +++ b/app/main/reports/md_uploader/md_uploader.py @@ -0,0 +1,63 @@ +import markdown #installation: pip install markdown +import re + +from ..document_uploader import DocumentUploader + +class MdUpload(DocumentUploader): + def __init__(self, path_to_md_file): + self.path_to_md_file = path_to_md_file + self.headers = [] + self.chapters = [] + self.paragraphs = [] + self.html_text = '' + self.tables = [] + self.chapter_with_text = [] + + def upload(self): + with open(self.path_to_md_file, "r", encoding="utf-8") as f: + md_text = f.read() + return md_text + + def parse(self, md_text): + self.html_text = markdown.markdown(md_text) + self.paragraphs = self.html_text.split('\n') + + def get_headers(self): + header_regex = "

(.*?)<\/h1>" + self.headers = re.findall(header_regex, self.html_text) + + def get_chapters(self): + chapter_regex = "

(.*?)<\/h2>" + self.chapters = re.findall(chapter_regex, self.html_text) + + def get_chapter_with_text(self): + text = self.html_text + chapter_name = '' + for chapter in self.chapters: + self.split_chapter = text.split("

" + chapter + "

") + self.chapter_with_text.append(chapter_name + self.split_chapter[-2]) + chapter_name = chapter + text = self.split_chapter[-1] + self.chapter_with_text.append(chapter_name + text) + + def get_tables_size(self): + count_table_line = 0 + count_paragraph = len(self.paragraphs) + for line in self.paragraphs: + if "|" in line: + count_table_line +=1 + return round(count_table_line/count_paragraph, 4) + + def parse_md_file(self): + md_text = self.upload() + self.parse(md_text) + self.get_headers() + self.get_chapters() + self.get_chapter_with_text() + self.get_tables_size() + return f"Заголовки:\n{self.headers}\n\nГлавы:\n{self.chapters}\n\nГлавы с текстом:\n{self.chapter_with_text}\n\nДоля таблиц в тексте:\n{self.get_tables_size()}" + +def main(args): + md_file = MdUpload(args.mdfile) + print(md_file.parse_md_file()) + diff --git a/app/server.py b/app/server.py index 1bfe9127..6be4334b 100644 --- a/app/server.py +++ b/app/server.py @@ -34,7 +34,7 @@ UPLOAD_FOLDER = '/usr/src/project/files' ALLOWED_EXTENSIONS = { 'pres': {'ppt', 'pptx', 'odp'}, - 'report': {'doc', 'odt', 'docx'} + 'report': {'doc', 'odt', 'docx', 'md'} } DOCUMENT_TYPES = {'Лабораторная работа', 'Курсовая работа', 'ВКР'} TABLE_COLUMNS = ['Solution', 'User', 'File', 'Criteria', 'Check added', 'LMS date', 'Score'] diff --git a/requirements.txt b/requirements.txt index 6ec862a4..cde73eef 100644 --- a/requirements.txt +++ b/requirements.txt @@ -27,3 +27,4 @@ pdfplumber==0.6.1 pytest~=7.1.2 filetype==1.2.0 language-tool-python==2.7.1 +markdown From be66bf9c7de076339cb5c6127a3306e2bfa7b473 Mon Sep 17 00:00:00 2001 From: Marina Date: Thu, 5 Oct 2023 14:47:27 +0300 Subject: [PATCH 02/10] change mime_type = False --- app/server.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/app/server.py b/app/server.py index 6be4334b..2ba6c80a 100644 --- a/app/server.py +++ b/app/server.py @@ -178,7 +178,7 @@ def run_task(): if get_file_len(file) * 2 + db_methods.get_storage() > app.config['MAX_SYSTEM_STORAGE']: logger.critical('Storage overload has occured') return 'storage_overload' - file_check_response = check_file(file, extension, ALLOWED_EXTENSIONS[file_ext_type], check_mime=True) + file_check_response = check_file(file, extension, ALLOWED_EXTENSIONS[file_ext_type], check_mime=False) if file_check_response != "ok": logger.info('Пользователь загрузил файл с ошибочным расширением: ' + file_check_response) return file_check_response From e20e27a602204a60fe3929a998007724731416bd Mon Sep 17 00:00:00 2001 From: Marina Date: Mon, 9 Oct 2023 16:33:16 +0300 Subject: [PATCH 03/10] add if MIME-check --- app/server.py | 9 +++++++-- app/utils/check_file.py | 6 +++--- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/app/server.py b/app/server.py index 2ba6c80a..956ac08f 100644 --- a/app/server.py +++ b/app/server.py @@ -36,6 +36,10 @@ 'pres': {'ppt', 'pptx', 'odp'}, 'report': {'doc', 'odt', 'docx', 'md'} } + +# сохраняем те форматы, в которых НЕ НУЖНО проверять MIME: +NOT_MIME_TYPE = {'md'} + DOCUMENT_TYPES = {'Лабораторная работа', 'Курсовая работа', 'ВКР'} TABLE_COLUMNS = ['Solution', 'User', 'File', 'Criteria', 'Check added', 'LMS date', 'Score'] URL_DOMEN = os.environ.get('URL_DOMEN', f"http://localhost:{os.environ.get('WEB_PORT', 8080)}") @@ -171,6 +175,7 @@ def run_task(): file_type = request.form.get('file_type', 'pres') filename, extension = file.filename.rsplit('.', 1) file_ext_type = current_user.file_type['type'] + check_mime = True if extension not in NOT_MIME_TYPE else False if not file: logger.critical("request doesn't include file") @@ -178,13 +183,13 @@ def run_task(): if get_file_len(file) * 2 + db_methods.get_storage() > app.config['MAX_SYSTEM_STORAGE']: logger.critical('Storage overload has occured') return 'storage_overload' - file_check_response = check_file(file, extension, ALLOWED_EXTENSIONS[file_ext_type], check_mime=False) + file_check_response = check_file(file, extension, ALLOWED_EXTENSIONS[file_ext_type], check_mime) if file_check_response != "ok": logger.info('Пользователь загрузил файл с ошибочным расширением: ' + file_check_response) return file_check_response if pdf_file: - pdf_file_check_response = check_file(pdf_file, pdf_file.filename.rsplit('.', 1)[1], "pdf", check_mime=True) + pdf_file_check_response = check_file(pdf_file, pdf_file.filename.rsplit('.', 1)[1], "pdf", check_mime) if pdf_file_check_response != "ok": logger.info('Пользователь загрузил файл с ошибочным расширением: pdf_' + pdf_file_check_response) return "pdf_" + pdf_file_check_response diff --git a/app/utils/check_file.py b/app/utils/check_file.py index c559ae54..83ac4e6d 100644 --- a/app/utils/check_file.py +++ b/app/utils/check_file.py @@ -1,11 +1,11 @@ import filetype -def check_file(file, file_extension, allowed_extensions, check_mime=True): +def check_file(file, file_extension, allowed_extensions, check_mime): if not file_extension in allowed_extensions: return "not_allowed_extension" - + # Проверяем MIME тип (библиотека автоматически умеет переводить MIME в реальное расширение файла). if check_mime and file_extension != filetype.guess_extension(file): return "mime_type_does_not_match_extension" - return "ok" \ No newline at end of file + return "ok" From 595487c81fda8122725e634a5430f02158cb4a86 Mon Sep 17 00:00:00 2001 From: Marina Date: Mon, 9 Oct 2023 19:49:29 +0300 Subject: [PATCH 04/10] add new abstract method --- app/main/reports/document_uploader.py | 4 ++++ app/main/reports/md_uploader/md_uploader.py | 15 +++++++++++++++ 2 files changed, 19 insertions(+) diff --git a/app/main/reports/document_uploader.py b/app/main/reports/document_uploader.py index a9455545..49ce9604 100644 --- a/app/main/reports/document_uploader.py +++ b/app/main/reports/document_uploader.py @@ -10,6 +10,10 @@ def upload(self): def parse(self): pass + @abstractmethod + def page_counter(self): + pass + # @abstractmethod # def make_chapters(self): # pass diff --git a/app/main/reports/md_uploader/md_uploader.py b/app/main/reports/md_uploader/md_uploader.py index 5380864e..f5a2b20a 100644 --- a/app/main/reports/md_uploader/md_uploader.py +++ b/app/main/reports/md_uploader/md_uploader.py @@ -12,6 +12,8 @@ def __init__(self, path_to_md_file): self.html_text = '' self.tables = [] self.chapter_with_text = [] + self.literature_header = [] + self.headers_page = 1 def upload(self): with open(self.path_to_md_file, "r", encoding="utf-8") as f: @@ -22,6 +24,9 @@ def parse(self, md_text): self.html_text = markdown.markdown(md_text) self.paragraphs = self.html_text.split('\n') + def page_counter(self): + return 5 + def get_headers(self): header_regex = "

(.*?)<\/h1>" self.headers = re.findall(header_regex, self.html_text) @@ -48,6 +53,16 @@ def get_tables_size(self): count_table_line +=1 return round(count_table_line/count_paragraph, 4) + def find_literature_vkr(self, work_type): + if not self.literature_header: + for header in self.chapters: + if header.lower() == "список использованных источников" or header == "список литературы": + self.literature_header = header + return self.literature_header + + def find_header_page(self, work_type): + return self.headers_page + def parse_md_file(self): md_text = self.upload() self.parse(md_text) From 45c78d4d610429695509330d1002e99c24a9ebbc Mon Sep 17 00:00:00 2001 From: Marina Date: Wed, 11 Oct 2023 15:36:38 +0300 Subject: [PATCH 05/10] problem with len(self.headers) --- app/main/check_packs/pack_config.py | 2 +- app/main/reports/README.md | 7 ++ app/main/reports/md_uploader/md_uploader.py | 133 ++++++++++++++++---- 3 files changed, 117 insertions(+), 25 deletions(-) diff --git a/app/main/check_packs/pack_config.py b/app/main/check_packs/pack_config.py index 0639689b..7a29ddcc 100644 --- a/app/main/check_packs/pack_config.py +++ b/app/main/check_packs/pack_config.py @@ -52,5 +52,5 @@ 'pres': BaseCriterionPack(BASE_PRES_CRITERION, DEFAULT_PRES_TYPE_INFO, min_score=1.0, name="BasePresentationCriterionPack"), 'report': BaseCriterionPack(BASE_REPORT_CRITERION, DEFAULT_REPORT_TYPE_INFO, min_score=1.0, - name="BaseReportCriterionPack") + name="BaseReportCriterionPackMd") } diff --git a/app/main/reports/README.md b/app/main/reports/README.md index 16904d47..18b4b5d3 100644 --- a/app/main/reports/README.md +++ b/app/main/reports/README.md @@ -65,3 +65,10 @@ Proof-of-concept парсинг файлов `.docx` с выводом стру ```bash $ python3 -m app.main.mse22.pdf_document text_from_pages --filename path_to_file ``` +## `MD` + +Парсинг файлов `.md` с выводом структуры файла в текстовом виде в stdout. + +```bash +$ python3 -m app.main.reports.md_uploader md_parser --mdfile path_to_md_file +``` \ No newline at end of file diff --git a/app/main/reports/md_uploader/md_uploader.py b/app/main/reports/md_uploader/md_uploader.py index f5a2b20a..10f3878c 100644 --- a/app/main/reports/md_uploader/md_uploader.py +++ b/app/main/reports/md_uploader/md_uploader.py @@ -6,14 +6,16 @@ class MdUpload(DocumentUploader): def __init__(self, path_to_md_file): self.path_to_md_file = path_to_md_file + self.paragraphs = [] + self.headers_main = [] self.headers = [] self.chapters = [] - self.paragraphs = [] self.html_text = '' self.tables = [] self.chapter_with_text = [] self.literature_header = [] self.headers_page = 1 + self.styled_paragraphs = [] def upload(self): with open(self.path_to_md_file, "r", encoding="utf-8") as f: @@ -21,29 +23,60 @@ def upload(self): return md_text def parse(self, md_text): - self.html_text = markdown.markdown(md_text) - self.paragraphs = self.html_text.split('\n') + self.html_text = markdown.markdown(md_text) + self.paragraphs = self.make_paragraphs(self.html_text) + + def make_paragraphs(self, html_text): + self.paragraphs = html_text.split('\n') + return self.paragraphs def page_counter(self): - return 5 + return 5 - def get_headers(self): - header_regex = "

(.*?)<\/h1>" - self.headers = re.findall(header_regex, self.html_text) + def get_main_headers(self): + header_main_regex = "

(.*?)<\/h1>" + self.headers_main = re.findall(header_main_regex, self.html_text) - def get_chapters(self): - chapter_regex = "

(.*?)<\/h2>" - self.chapters = re.findall(chapter_regex, self.html_text) - - def get_chapter_with_text(self): - text = self.html_text - chapter_name = '' - for chapter in self.chapters: - self.split_chapter = text.split("

" + chapter + "

") - self.chapter_with_text.append(chapter_name + self.split_chapter[-2]) - chapter_name = chapter - text = self.split_chapter[-1] - self.chapter_with_text.append(chapter_name + text) + def make_headers(self, work_type): + headers_regex = "

(.*?)<\/h2>" + self.headers = re.findall(headers_regex, self.html_text) + return self.headers + + def parse_effective_styles(self): + for par in self.paragraphs: + if len(par.strip()) > 0: + paragraph = {"text": par, "runs": []} + if '

' not in paragraph['text'] and '

' not in paragraph["text"]: + paragraph["runs"].append({"text": par, "style": 'body text'}) + self.styled_paragraphs.append(paragraph) + elif '

' in paragraph["text"]: + paragraph["runs"].append({"text": par, "style": "heading 2"}) + self.styled_paragraphs.append(paragraph) + return self.styled_paragraphs + + + def make_chapters(self, work_type): + if not self.chapters: + if work_type == 'VKR': + # find headers + header_ind = -1 + par_num = 0 + head_par_ind = -1 + for par_ind in range(len(self.styled_paragraphs)): + head_par_ind += 1 + style_name = self.styled_paragraphs[par_ind]['runs'][0]['style'] + if "heading" in style_name: + header_ind += 1 + par_num = 0 + self.chapters.append({"style": style_name, "text": self.styled_paragraphs[par_ind]["text"].strip(), + "styled_text": self.styled_paragraphs[par_ind], "number": head_par_ind, + "child": []}) + elif header_ind >= 0: + par_num += 1 + self.chapters[header_ind]["child"].append( + {"style": style_name, "text": self.styled_paragraphs[par_ind]["text"], + "styled_text": self.styled_paragraphs[par_ind], "number": head_par_ind}) + return self.chapters def get_tables_size(self): count_table_line = 0 @@ -63,16 +96,68 @@ def find_literature_vkr(self, work_type): def find_header_page(self, work_type): return self.headers_page + def late_init_vkr(self): + self.headers = self.make_chapters(work_type='VKR') + def parse_md_file(self): md_text = self.upload() self.parse(md_text) - self.get_headers() - self.get_chapters() - self.get_chapter_with_text() + self.make_headers(work_type="VKR") self.get_tables_size() - return f"Заголовки:\n{self.headers}\n\nГлавы:\n{self.chapters}\n\nГлавы с текстом:\n{self.chapter_with_text}\n\nДоля таблиц в тексте:\n{self.get_tables_size()}" + self.parse_effective_styles() + self.make_chapters(work_type="VKR") + self.late_init_vkr() + return f"Заголовки:\n{self.headers}\n\nГлавы:\n\n\nДоля таблиц в тексте:\n{self.get_tables_size()}\n\nParagraphs" + + def late_init_vkr(self): + self.headers = self.make_chapters(work_type='VKR') def main(args): md_file = MdUpload(args.mdfile) print(md_file.parse_md_file()) + +# [ +# [ +# "simple_check" +# ], +# [ +# "banned_words_in_literature" +# ], +# [ +# "short_sections_check" +# ], +# [ +# "banned_words_check" +# ], +# [ +# "right_words_check" +# ], +# [ +# "banned_words_in_literature" +# ], +# [ +# "literature_references" +# ], +# [ +# "table_references" +# ], +# [ +# "main_character_check" +# ], +# [ +# "needed_headers_check" +# ], +# [ +# "header_check" +# ], +# [ +# "report_section_component" +# ], +# [ +# "main_text_check" +# ], +# [ +# "spelling_check" +# ] +# ] From dc92232b22aa309aa9efc0637e40bbd8454bc284 Mon Sep 17 00:00:00 2001 From: Marina Date: Fri, 20 Oct 2023 14:32:03 +0300 Subject: [PATCH 06/10] fix len() --- app/main/reports/md_uploader/md_uploader.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/app/main/reports/md_uploader/md_uploader.py b/app/main/reports/md_uploader/md_uploader.py index 10f3878c..2294087a 100644 --- a/app/main/reports/md_uploader/md_uploader.py +++ b/app/main/reports/md_uploader/md_uploader.py @@ -25,6 +25,7 @@ def upload(self): def parse(self, md_text): self.html_text = markdown.markdown(md_text) self.paragraphs = self.make_paragraphs(self.html_text) + self.parse_effective_styles() def make_paragraphs(self, html_text): self.paragraphs = html_text.split('\n') @@ -38,9 +39,10 @@ def get_main_headers(self): self.headers_main = re.findall(header_main_regex, self.html_text) def make_headers(self, work_type): - headers_regex = "

(.*?)<\/h2>" - self.headers = re.findall(headers_regex, self.html_text) - return self.headers + if not self.headers_page: + headers_regex = "

(.*?)<\/h2>" + self.headers = re.findall(headers_regex, self.html_text) + return self.headers def parse_effective_styles(self): for par in self.paragraphs: @@ -63,6 +65,7 @@ def make_chapters(self, work_type): par_num = 0 head_par_ind = -1 for par_ind in range(len(self.styled_paragraphs)): + head_par_ind += 1 style_name = self.styled_paragraphs[par_ind]['runs'][0]['style'] if "heading" in style_name: @@ -107,10 +110,8 @@ def parse_md_file(self): self.parse_effective_styles() self.make_chapters(work_type="VKR") self.late_init_vkr() - return f"Заголовки:\n{self.headers}\n\nГлавы:\n\n\nДоля таблиц в тексте:\n{self.get_tables_size()}\n\nParagraphs" + return f"Заголовки:\n{len(self.styled_paragraphs)}\n\nГлавы:\n\n\nДоля таблиц в тексте:\n{self.get_tables_size()}\n\nParagraphs" - def late_init_vkr(self): - self.headers = self.make_chapters(work_type='VKR') def main(args): md_file = MdUpload(args.mdfile) From fc870184f1c11a02078bf8c14254e176ce59d9b1 Mon Sep 17 00:00:00 2001 From: Marina Date: Thu, 26 Oct 2023 16:11:31 +0300 Subject: [PATCH 07/10] changed all checks for md --- .../banned_words_in_literature.py | 3 +- .../checks/report_checks/image_references.py | 5 +- .../report_checks/literature_references.py | 24 ++- .../checks/report_checks/table_references.py | 5 +- app/main/reports/md_uploader/md_uploader.py | 173 +++++++++++------- .../pdf_document/pdf_document_manager.py | 2 +- requirements.txt | 1 + 7 files changed, 135 insertions(+), 78 deletions(-) diff --git a/app/main/checks/report_checks/banned_words_in_literature.py b/app/main/checks/report_checks/banned_words_in_literature.py index 5671785c..6e9f8358 100644 --- a/app/main/checks/report_checks/banned_words_in_literature.py +++ b/app/main/checks/report_checks/banned_words_in_literature.py @@ -13,6 +13,7 @@ def __init__(self, file_info, banned_words=["wikipedia"]): self.literature_header = [] self.banned_words = [morph.normal_forms(word)[0] for word in banned_words] self.name_pattern = r'список[ \t]*(использованных|использованной|)[ \t]*(источников|литературы)' + self.md_name_pattern = r'

список[ \t]*(использованных|использованной|)[ \t]*(источников|литературы)

' def late_init_vkr(self): self.literature_header = self.file.find_literature_vkr(self.file_type['report_type']) @@ -83,6 +84,6 @@ def start_of_literature_chapter(self, ): start_index = 0 for i in range(len(self.file.paragraphs)): text_string = self.file.paragraphs[i].to_string().lower().split('\n')[1] - if re.fullmatch(self.name_pattern, text_string): + if re.fullmatch(f'{self.name_pattern}|{self.md_name_pattern}', text_string): start_index = i return start_index diff --git a/app/main/checks/report_checks/image_references.py b/app/main/checks/report_checks/image_references.py index 7bcb6256..79e46fd7 100644 --- a/app/main/checks/report_checks/image_references.py +++ b/app/main/checks/report_checks/image_references.py @@ -63,7 +63,10 @@ def check(self): def search_references(self): array_of_references = set() for i in range(0, self.last_child_number): - detected_references = re.findall(r'[Рр]ис\. [\d .,]+', self.file.paragraphs[i].paragraph_text) + if isinstance(self.file.paragraphs[i], str): + detected_references = re.findall(r'[Рр]ис\. [\d .,]+', self.file.paragraphs[i]) + else: + detected_references = re.findall(r'[Рр]ис\. [\d .,]+', self.file.paragraphs[i].paragraph_text) if detected_references: for reference in detected_references: for one_part in re.split(r'[Рр]ис\.|,| ', reference): diff --git a/app/main/checks/report_checks/literature_references.py b/app/main/checks/report_checks/literature_references.py index b4ed3335..0f2b5131 100644 --- a/app/main/checks/report_checks/literature_references.py +++ b/app/main/checks/report_checks/literature_references.py @@ -12,6 +12,7 @@ def __init__(self, file_info): self.headers = [] self.literature_header = [] self.name_pattern = r'список[ \t]*(использованных|использованной|)[ \t]*(источников|литературы)' + self.md_name_pattern = r"

(Список использованных источников|Список использованной литературы)<\/h2>" def late_init_vkr(self): self.headers = self.file.make_chapters(self.file_type['report_type']) @@ -72,7 +73,12 @@ def check(self): def search_references(self, start_par): array_of_references = set() for i in range(0, start_par): - detected_references = re.findall(r'\[[\d \-,]+\]', self.file.paragraphs[i].to_string().split('\n')[1]) + if isinstance(self.file.paragraphs[i], str): + print(111111111111111111111111111111111111111111111111111) + print(self.file.paragraphs[i]) + detected_references = re.findall(r'\[[\d \-,]+\]', self.file.paragraphs[i]) + else: + detected_references = re.findall(r'\[[\d \-,]+\]', self.file.paragraphs[i].to_string().split('\n')[1]) if detected_references: for reference in detected_references: for one_part in re.split(r'[\[\],]', reference): @@ -87,10 +93,18 @@ def search_references(self, start_par): def find_start_paragraph(self): start_index = 0 for i in range(len(self.file.paragraphs)): - text_string = self.file.paragraphs[i].to_string().lower().split('\n')[1] - if re.fullmatch(self.name_pattern, text_string): - start_index = i - break + if isinstance(self.file.paragraphs[i], str): + text_string = self.file.paragraphs[i].lower() + if re.fullmatch(self.md_name_pattern, text_string): + start_index = i + print(222222222222222222222222222222222) + (print(start_index)) + break + else: + text_string = self.file.paragraphs[i].to_string().lower().split('\n')[1] + if re.fullmatch(self.name_pattern, text_string): + start_index = i + break return start_index def count_sources_vkr(self, header): diff --git a/app/main/checks/report_checks/table_references.py b/app/main/checks/report_checks/table_references.py index d390872b..43aa51b9 100644 --- a/app/main/checks/report_checks/table_references.py +++ b/app/main/checks/report_checks/table_references.py @@ -63,7 +63,10 @@ def check(self): def search_references(self): array_of_references = set() for i in range(0, self.last_child_number): - detected_references = re.findall(r'таблиц[аеыу][\d .]+', self.file.paragraphs[i].paragraph_text) + if isinstance(self.file.paragraphs[i], str): + detected_references = re.findall(r'таблиц[аеыу][\d .]+', self.file.paragraphs[i]) + else: + detected_references = re.findall(r'таблиц[аеыу][\d .]+', self.file.paragraphs[i].paragraph_text) if detected_references: for reference in detected_references: for one_part in re.split(r'таблиц[аеыу]| ', reference): diff --git a/app/main/reports/md_uploader/md_uploader.py b/app/main/reports/md_uploader/md_uploader.py index 2294087a..223197ee 100644 --- a/app/main/reports/md_uploader/md_uploader.py +++ b/app/main/reports/md_uploader/md_uploader.py @@ -1,21 +1,33 @@ import markdown #installation: pip install markdown +from md2pdf.core import md2pdf #installation: pip install md2pdf import re +# from functools import reduce +from PIL import Image +from io import BytesIO +import requests +# from ..docx_uploader.inline_shape import InlineShape from ..document_uploader import DocumentUploader +from ..pdf_document.pdf_document_manager import PdfDocumentManager + class MdUpload(DocumentUploader): def __init__(self, path_to_md_file): + self.pdf_file = None self.path_to_md_file = path_to_md_file self.paragraphs = [] self.headers_main = [] self.headers = [] self.chapters = [] self.html_text = '' + self.count = 0 self.tables = [] self.chapter_with_text = [] self.literature_header = [] self.headers_page = 1 self.styled_paragraphs = [] + self.first_lines = [] + self.inline_shapes = [] def upload(self): with open(self.path_to_md_file, "r", encoding="utf-8") as f: @@ -26,36 +38,79 @@ def parse(self, md_text): self.html_text = markdown.markdown(md_text) self.paragraphs = self.make_paragraphs(self.html_text) self.parse_effective_styles() - + self.pdf_filepath = self.path_to_md_file.split('.')[0]+'.pdf' + self.pdf_file = PdfDocumentManager(self.path_to_md_file, md2pdf(self.pdf_filepath, md_file_path=self.path_to_md_file)) + def make_paragraphs(self, html_text): self.paragraphs = html_text.split('\n') return self.paragraphs def page_counter(self): - return 5 + if not self.count: + for k, v in self.pdf_file.text_on_page.items(): + line = v[:20] if len(v) > 21 else v + if re.search('ПРИЛОЖЕНИЕ [А-Я]', line.strip()): + break + self.count += 1 + line = '' + lines = v.split("\n") + for i in range(len(lines)): + if i > 1: + break + if i > 0: + line += " " + line += lines[i].strip() + self.first_lines.append(line.lower()) + return self.count def get_main_headers(self): header_main_regex = "

(.*?)<\/h1>" self.headers_main = re.findall(header_main_regex, self.html_text) def make_headers(self, work_type): - if not self.headers_page: - headers_regex = "

(.*?)<\/h2>" - self.headers = re.findall(headers_regex, self.html_text) - return self.headers + if not self.headers: + if work_type == 'VKR': + # find first pages + headers = [ + {"name": "Титульный лист", "marker": False, "key": "санкт-петербургский государственный", + "main_character": True, "page": 0}, + {"name": "Задание на выпускную квалификационную работу", "marker": False, "key": "задание", + "main_character": True, "page": 0}, + {"name": "Календарный план", "marker": False, "key": "календарный план", "main_character": True, + "page": 0}, + {"name": "Реферат", "marker": False, "key": "реферат", "main_character": False, "page": 0}, + {"name": "Abstract", "marker": False, "key": "abstract", "main_character": False, "page": 0}, + {"name": "Содержание", "marker": False, "key": "содержание", "main_character": False, "page": 0}] + for page in range(1, self.count if self.page_counter() < 2 * len(headers) else 2 * len(headers)): + page_text = (self.pdf_file.get_text_on_page()[page].lower()) + for i in range(len(headers)): + if not headers[i]["marker"]: + if page_text.find(headers[i]["key"]) >= 0: + headers[i]["marker"] = True + headers[i]["page"] = page + break + self.headers = headers + return self.headers def parse_effective_styles(self): for par in self.paragraphs: if len(par.strip()) > 0: paragraph = {"text": par, "runs": []} - if '

' not in paragraph['text'] and '

' not in paragraph["text"]: - paragraph["runs"].append({"text": par, "style": 'body text'}) - self.styled_paragraphs.append(paragraph) - elif '

' in paragraph["text"]: - paragraph["runs"].append({"text": par, "style": "heading 2"}) - self.styled_paragraphs.append(paragraph) - return self.styled_paragraphs - + if '

' in paragraph["text"]: + paragraph["runs"].append({"text": par, "style": "heading 2"}) + elif 'Таблица' in paragraph["text"]: + if '|' in self.paragraphs[self.paragraphs.index(par)+1]: + paragraph['runs'].append({"text": par, "style": "вкр_подпись таблицы"}) + elif ' in paragraph[= 0: self.literature_header = header return self.literature_header def find_header_page(self, work_type): return self.headers_page - def late_init_vkr(self): - self.headers = self.make_chapters(work_type='VKR') + # def get_paragraph_indices_by_style(self, style_list): + # result = [] + # for template_style in style_list: + # matched_pars = [] + # for i in range(len(self.styled_paragraphs)): + # par = self.styled_paragraphs[i] + # if reduce(lambda prev, run: prev and run["style"].matches(template_style), par["runs"], True): + # matched_pars.append(i) + # result.append(matched_pars) + # print(result) + # print('!!!!!!!!!!!') + # return result + def parse_md_file(self): md_text = self.upload() self.parse(md_text) self.make_headers(work_type="VKR") self.get_tables_size() - self.parse_effective_styles() self.make_chapters(work_type="VKR") - self.late_init_vkr() - return f"Заголовки:\n{len(self.styled_paragraphs)}\n\nГлавы:\n\n\nДоля таблиц в тексте:\n{self.get_tables_size()}\n\nParagraphs" + self.find_images() + self.find_literature_vkr(work_type="VKR") + return f"Заголовки:\n{self.headers_main}\n\nГлавы\n{self.chapters}\n\nИзображения:\n\n{self.inline_shapes}" def main(args): md_file = MdUpload(args.mdfile) print(md_file.parse_md_file()) - -# [ -# [ -# "simple_check" -# ], -# [ -# "banned_words_in_literature" -# ], -# [ -# "short_sections_check" -# ], -# [ -# "banned_words_check" -# ], -# [ -# "right_words_check" -# ], -# [ -# "banned_words_in_literature" -# ], -# [ -# "literature_references" -# ], -# [ -# "table_references" -# ], -# [ -# "main_character_check" -# ], -# [ -# "needed_headers_check" -# ], -# [ -# "header_check" -# ], -# [ -# "report_section_component" -# ], -# [ -# "main_text_check" -# ], -# [ -# "spelling_check" -# ] -# ] - diff --git a/app/main/reports/pdf_document/pdf_document_manager.py b/app/main/reports/pdf_document/pdf_document_manager.py index ddc125e0..1500756f 100644 --- a/app/main/reports/pdf_document/pdf_document_manager.py +++ b/app/main/reports/pdf_document/pdf_document_manager.py @@ -4,7 +4,7 @@ class PdfDocumentManager: - def __init__(self, path_to_file, pdf_filepath=''): + def __init__(self, path_to_file, pdf_filepath): if not pdf_filepath: self.pdf_file = pdfplumber.open(convert_to(path_to_file, target_format='pdf')) else: diff --git a/requirements.txt b/requirements.txt index cde73eef..f29f2e0b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -28,3 +28,4 @@ pytest~=7.1.2 filetype==1.2.0 language-tool-python==2.7.1 markdown +md2pdf From 7f666ceb069a37c359819d406d32ab7aeed09675 Mon Sep 17 00:00:00 2001 From: Marina Date: Thu, 26 Oct 2023 18:43:31 +0300 Subject: [PATCH 08/10] full_test_of_md-file 1.0 --- app/main/check_packs/pack_config.py | 2 +- .../report_checks/literature_references.py | 4 -- app/main/reports/document_uploader.py | 24 +++++-- app/main/reports/md_uploader/md_uploader.py | 66 +++++++++++++++---- 4 files changed, 71 insertions(+), 25 deletions(-) diff --git a/app/main/check_packs/pack_config.py b/app/main/check_packs/pack_config.py index 7a29ddcc..0639689b 100644 --- a/app/main/check_packs/pack_config.py +++ b/app/main/check_packs/pack_config.py @@ -52,5 +52,5 @@ 'pres': BaseCriterionPack(BASE_PRES_CRITERION, DEFAULT_PRES_TYPE_INFO, min_score=1.0, name="BasePresentationCriterionPack"), 'report': BaseCriterionPack(BASE_REPORT_CRITERION, DEFAULT_REPORT_TYPE_INFO, min_score=1.0, - name="BaseReportCriterionPackMd") + name="BaseReportCriterionPack") } diff --git a/app/main/checks/report_checks/literature_references.py b/app/main/checks/report_checks/literature_references.py index 0f2b5131..5b0eb37f 100644 --- a/app/main/checks/report_checks/literature_references.py +++ b/app/main/checks/report_checks/literature_references.py @@ -74,8 +74,6 @@ def search_references(self, start_par): array_of_references = set() for i in range(0, start_par): if isinstance(self.file.paragraphs[i], str): - print(111111111111111111111111111111111111111111111111111) - print(self.file.paragraphs[i]) detected_references = re.findall(r'\[[\d \-,]+\]', self.file.paragraphs[i]) else: detected_references = re.findall(r'\[[\d \-,]+\]', self.file.paragraphs[i].to_string().split('\n')[1]) @@ -97,8 +95,6 @@ def find_start_paragraph(self): text_string = self.file.paragraphs[i].lower() if re.fullmatch(self.md_name_pattern, text_string): start_index = i - print(222222222222222222222222222222222) - (print(start_index)) break else: text_string = self.file.paragraphs[i].to_string().lower().split('\n')[1] diff --git a/app/main/reports/document_uploader.py b/app/main/reports/document_uploader.py index 49ce9604..a67600f0 100644 --- a/app/main/reports/document_uploader.py +++ b/app/main/reports/document_uploader.py @@ -10,14 +10,26 @@ def upload(self): def parse(self): pass + @abstractmethod + def parse_effective_styles(self): + pass + @abstractmethod def page_counter(self): pass - # @abstractmethod - # def make_chapters(self): - # pass + @abstractmethod + def make_headers(self, work_type): + pass - # @abstractmethod - # def make_headers(self): - # pass + @abstractmethod + def make_chapters(self, work_type): + pass + + @abstractmethod + def find_header_page(self, work_type): + pass + + @abstractmethod + def find_literature_vkr(self, work_type): + pass diff --git a/app/main/reports/md_uploader/md_uploader.py b/app/main/reports/md_uploader/md_uploader.py index 223197ee..25346ffc 100644 --- a/app/main/reports/md_uploader/md_uploader.py +++ b/app/main/reports/md_uploader/md_uploader.py @@ -1,3 +1,55 @@ +'''Available checks for md-file: +pack "BaseReportCriterionPackMd" + +[ + [ + "simple_check" + ], + [ + "banned_words_in_literature" + ], + [ + "page_counter" + ], + [ + "short_sections_check" + ], + [ + "banned_words_check" + ], + [ + "right_words_check" + ], + [ + "banned_words_in_literature" + ], + [ + "literature_references" + ], + [ + "image_references" + ], + [ + "table_references" + ], + [ + "first_pages_check" + ], + [ + "main_character_check" + ], + [ + "needed_headers_check" + ], + [ + "report_section_component" + ], + [ + "spelling_check" + ] +] +''' + import markdown #installation: pip install markdown from md2pdf.core import md2pdf #installation: pip install md2pdf import re @@ -168,20 +220,6 @@ def find_literature_vkr(self, work_type): def find_header_page(self, work_type): return self.headers_page - - # def get_paragraph_indices_by_style(self, style_list): - # result = [] - # for template_style in style_list: - # matched_pars = [] - # for i in range(len(self.styled_paragraphs)): - # par = self.styled_paragraphs[i] - # if reduce(lambda prev, run: prev and run["style"].matches(template_style), par["runs"], True): - # matched_pars.append(i) - # result.append(matched_pars) - # print(result) - # print('!!!!!!!!!!!') - # return result - def parse_md_file(self): md_text = self.upload() From 80d73b8b912bf909107e26895315d3b2b503b3fc Mon Sep 17 00:00:00 2001 From: Marina Date: Wed, 8 Nov 2023 13:43:31 +0300 Subject: [PATCH 09/10] resolve conflicts 2 --- app/main/reports/md_uploader/md_uploader.py | 56 --------------------- 1 file changed, 56 deletions(-) diff --git a/app/main/reports/md_uploader/md_uploader.py b/app/main/reports/md_uploader/md_uploader.py index e3de1e51..25346ffc 100644 --- a/app/main/reports/md_uploader/md_uploader.py +++ b/app/main/reports/md_uploader/md_uploader.py @@ -1,4 +1,3 @@ -<<<<<<< HEAD '''Available checks for md-file: pack "BaseReportCriterionPackMd" @@ -83,27 +82,10 @@ def __init__(self, path_to_md_file): self.inline_shapes = [] def upload(self): -======= -import markdown #installation: pip install markdown -import re - -class MdUpload: - def __init__(self, path_to_md_file): - self.path_to_md_file = path_to_md_file - self.headers = [] - self.chapters = [] - self.paragraphs = [] - self.html_text = '' - self.tables = [] - self.chapter_with_text = [] - - def read_md_file(self): ->>>>>>> master with open(self.path_to_md_file, "r", encoding="utf-8") as f: md_text = f.read() return md_text -<<<<<<< HEAD def parse(self, md_text): self.html_text = markdown.markdown(md_text) self.paragraphs = self.make_paragraphs(self.html_text) @@ -219,29 +201,6 @@ def find_images(self): total_height += width self.inline_shapes.append((width, height)) return self.inline_shapes -======= - def get_html_from_md(self, md_text): - self.html_text = markdown.markdown(md_text) - self.paragraphs = self.html_text.split('\n') - - def get_headers(self): - header_regex = "

(.*?)<\/h1>" - self.headers = re.findall(header_regex, self.html_text) - - def get_chapters(self): - chapter_regex = "

(.*?)<\/h2>" - self.chapters = re.findall(chapter_regex, self.html_text) - - def get_chapter_with_text(self): - text = self.html_text - chapter_name = '' - for chapter in self.chapters: - self.split_chapter = text.split("

" + chapter + "

") - self.chapter_with_text.append(chapter_name + self.split_chapter[-2]) - chapter_name = chapter - text = self.split_chapter[-1] - self.chapter_with_text.append(chapter_name + text) ->>>>>>> master def get_tables_size(self): count_table_line = 0 @@ -251,7 +210,6 @@ def get_tables_size(self): count_table_line +=1 return round(count_table_line/count_paragraph, 4) -<<<<<<< HEAD def find_literature_vkr(self, work_type): if not self.literature_header: for header in self.make_chapters(work_type): @@ -273,21 +231,7 @@ def parse_md_file(self): self.find_literature_vkr(work_type="VKR") return f"Заголовки:\n{self.headers_main}\n\nГлавы\n{self.chapters}\n\nИзображения:\n\n{self.inline_shapes}" -======= - def parse_md_file(self): - md_text = self.read_md_file() - self.get_html_from_md(md_text) - self.get_headers() - self.get_chapters() - self.get_chapter_with_text() - self.get_tables_size() - return f"Заголовки:\n{self.headers}\n\nГлавы:\n{self.chapters}\n\nГлавы с текстом:\n{self.chapter_with_text}\n\nДоля таблиц в тексте:\n{self.get_tables_size()}" ->>>>>>> master def main(args): md_file = MdUpload(args.mdfile) print(md_file.parse_md_file()) -<<<<<<< HEAD -======= - ->>>>>>> master From a52a9def71dd100b844cf58a99030b57662d36d0 Mon Sep 17 00:00:00 2001 From: Marina Date: Tue, 20 Feb 2024 15:32:12 +0300 Subject: [PATCH 10/10] MIME for md returned --- app/server.py | 8 ++------ app/utils/check_file.py | 12 ++++++++---- requirements.txt | 2 +- 3 files changed, 11 insertions(+), 11 deletions(-) diff --git a/app/server.py b/app/server.py index 956ac08f..edbe6284 100644 --- a/app/server.py +++ b/app/server.py @@ -37,9 +37,6 @@ 'report': {'doc', 'odt', 'docx', 'md'} } -# сохраняем те форматы, в которых НЕ НУЖНО проверять MIME: -NOT_MIME_TYPE = {'md'} - DOCUMENT_TYPES = {'Лабораторная работа', 'Курсовая работа', 'ВКР'} TABLE_COLUMNS = ['Solution', 'User', 'File', 'Criteria', 'Check added', 'LMS date', 'Score'] URL_DOMEN = os.environ.get('URL_DOMEN', f"http://localhost:{os.environ.get('WEB_PORT', 8080)}") @@ -175,7 +172,6 @@ def run_task(): file_type = request.form.get('file_type', 'pres') filename, extension = file.filename.rsplit('.', 1) file_ext_type = current_user.file_type['type'] - check_mime = True if extension not in NOT_MIME_TYPE else False if not file: logger.critical("request doesn't include file") @@ -183,13 +179,13 @@ def run_task(): if get_file_len(file) * 2 + db_methods.get_storage() > app.config['MAX_SYSTEM_STORAGE']: logger.critical('Storage overload has occured') return 'storage_overload' - file_check_response = check_file(file, extension, ALLOWED_EXTENSIONS[file_ext_type], check_mime) + file_check_response = check_file(file, extension, ALLOWED_EXTENSIONS[file_ext_type], check_mime=True) if file_check_response != "ok": logger.info('Пользователь загрузил файл с ошибочным расширением: ' + file_check_response) return file_check_response if pdf_file: - pdf_file_check_response = check_file(pdf_file, pdf_file.filename.rsplit('.', 1)[1], "pdf", check_mime) + pdf_file_check_response = check_file(pdf_file, pdf_file.filename.rsplit('.', 1)[1], "pdf", check_mime=True) if pdf_file_check_response != "ok": logger.info('Пользователь загрузил файл с ошибочным расширением: pdf_' + pdf_file_check_response) return "pdf_" + pdf_file_check_response diff --git a/app/utils/check_file.py b/app/utils/check_file.py index 83ac4e6d..309abfd7 100644 --- a/app/utils/check_file.py +++ b/app/utils/check_file.py @@ -1,11 +1,15 @@ import filetype -def check_file(file, file_extension, allowed_extensions, check_mime): +def check_file(file, file_extension, allowed_extensions, check_mime=True): if not file_extension in allowed_extensions: return "not_allowed_extension" - # Проверяем MIME тип (библиотека автоматически умеет переводить MIME в реальное расширение файла). - if check_mime and file_extension != filetype.guess_extension(file): - return "mime_type_does_not_match_extension" + if check_mime: + if file_extension == 'md': + if file.mimetype != 'text/plain': + return "mime_type_does_not_match_extension" + else: + if file_extension != filetype.guess_extension(file): + return "mime_type_does_not_match_extension" return "ok" diff --git a/requirements.txt b/requirements.txt index 617a5755..226de495 100644 --- a/requirements.txt +++ b/requirements.txt @@ -27,5 +27,5 @@ pdfplumber==0.6.1 pytest~=7.1.2 filetype==1.2.0 language-tool-python==2.7.1 -md2pdf +md2pdf==1.0.1 markdown==3.4.4