From bc6e8576af0b384ef9134d512784cd830c466c36 Mon Sep 17 00:00:00 2001 From: Marina Date: Sun, 29 Sep 2024 18:50:07 +0300 Subject: [PATCH] fix fitz and table_share --- Dockerfile | 1 + app/main/check_packs/pack_config.py | 2 +- .../reports/pdf_document/pdf_document_manager.py | 13 ++++++++----- requirements.txt | 3 +-- 4 files changed, 11 insertions(+), 8 deletions(-) diff --git a/Dockerfile b/Dockerfile index d0ad254e..15a7ce48 100644 --- a/Dockerfile +++ b/Dockerfile @@ -18,6 +18,7 @@ ADD ./scripts/local_start.sh ./scripts/ ADD ./db_versioning ./db_versioning/ ADD ./app ./app/ COPY --from=frontend_build /app/src ./src/ +RUN pip install --upgrade PyMuPDF==1.24.10 ENV PYTHONPATH "${PYTHONPATH}:/usr/src/project/app" diff --git a/app/main/check_packs/pack_config.py b/app/main/check_packs/pack_config.py index efc7453d..d91e8702 100644 --- a/app/main/check_packs/pack_config.py +++ b/app/main/check_packs/pack_config.py @@ -44,7 +44,7 @@ ["spelling_check"], ["max_abstract_size_check"], ["theme_in_report_check"], - # ["table_share_check"], + ["table_share_check"], ['key_words_report_check'], ["empty_task_page_check"], ["template_name"], diff --git a/app/main/reports/pdf_document/pdf_document_manager.py b/app/main/reports/pdf_document/pdf_document_manager.py index 3143a695..ebd84bcc 100644 --- a/app/main/reports/pdf_document/pdf_document_manager.py +++ b/app/main/reports/pdf_document/pdf_document_manager.py @@ -1,7 +1,7 @@ # import pdfplumber import fitz -# Version of PyMuPDF is important for find_tables() method (now it's PyMuPDF==1.23.6) +# Version of PyMuPDF is important for find_tables() method (now it's PyMuPDF==1.24.10) from app.utils import convert_to @@ -22,7 +22,7 @@ def __init__(self, path_to_file, pdf_filepath=''): def get_text_on_page(self): return {page_num + 1: page.get_text() for page_num, page in enumerate(self.pages)} # return {page + 1: self.pages[page].extract_text() for page in range(self.page_count)} - + # def get_text_on_page(self): # return {page + 1: self.pages[page].extract_text() for page in range(self.page_count_all)} @@ -35,15 +35,18 @@ def page_table(self, page_without_pril): for table in tables: table_coord = table.bbox total_height += (table_coord[3] - table_coord[1]) - return total_height + return total_height + def get_image_num(self): return len(self.pdf_file.get_page_images(0)) def page_images(self, page_without_pril): total_height = 0 for page_num in range(page_without_pril): - page = self.pdf_file[page_num] - images = self.pdf_file.get_page_images(page) + page = self.pages[page_num] + # page = self.pdf_file[page_num] + # images = self.pdf_file.get_page_images(page) + images = page.get_images() for image in images: image_coord = page.get_image_bbox(image[7], transform=0) # might be [1.0, 1.0, -1.0, -1.0] image_height = image_coord[3] - image_coord[1] diff --git a/requirements.txt b/requirements.txt index c8b82d84..9a812674 100644 --- a/requirements.txt +++ b/requirements.txt @@ -25,7 +25,6 @@ docx2python~=2.0.4 oauthlib~=3.1.0 pdfplumber==0.6.1 pytest~=7.1.2 -PyMuPDF~=1.22.5 PyPDF2~=3.0.1 configparser~=5.3.0 pytz~=2023.3 @@ -35,4 +34,4 @@ filetype==1.2.0 language-tool-python==2.7.1 markdown==3.4.4 md2pdf==1.0.1 -# PyMuPDF==1.23.6 +PyMuPDF==1.24.10