Skip to content

Commit

Permalink
fix fitz and table_share
Browse files Browse the repository at this point in the history
  • Loading branch information
MarinaProsche committed Sep 29, 2024
1 parent 1ff4b07 commit bc6e857
Show file tree
Hide file tree
Showing 4 changed files with 11 additions and 8 deletions.
1 change: 1 addition & 0 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ ADD ./scripts/local_start.sh ./scripts/
ADD ./db_versioning ./db_versioning/
ADD ./app ./app/
COPY --from=frontend_build /app/src ./src/
RUN pip install --upgrade PyMuPDF==1.24.10

ENV PYTHONPATH "${PYTHONPATH}:/usr/src/project/app"

Expand Down
2 changes: 1 addition & 1 deletion app/main/check_packs/pack_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@
["spelling_check"],
["max_abstract_size_check"],
["theme_in_report_check"],
# ["table_share_check"],
["table_share_check"],
['key_words_report_check'],
["empty_task_page_check"],
["template_name"],
Expand Down
13 changes: 8 additions & 5 deletions app/main/reports/pdf_document/pdf_document_manager.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# import pdfplumber
import fitz

# Version of PyMuPDF is important for find_tables() method (now it's PyMuPDF==1.23.6)
# Version of PyMuPDF is important for find_tables() method (now it's PyMuPDF==1.24.10)

from app.utils import convert_to

Expand All @@ -22,7 +22,7 @@ def __init__(self, path_to_file, pdf_filepath=''):
# Return a dict mapping 1-based page numbers (enumerate index + 1) to the
# value of page.get_text() for every page in self.pages.
def get_text_on_page(self):
return {page_num + 1: page.get_text() for page_num, page in enumerate(self.pages)}
# return {page + 1: self.pages[page].extract_text() for page in range(self.page_count)}

# def get_text_on_page(self):
# return {page + 1: self.pages[page].extract_text() for page in range(self.page_count_all)}

Expand All @@ -35,15 +35,18 @@ def page_table(self, page_without_pril):
for table in tables:
table_coord = table.bbox
total_height += (table_coord[3] - table_coord[1])
return total_height
return total_height

# Return the length of the list produced by self.pdf_file.get_page_images(0).
# NOTE(review): the argument is the literal 0, so only that one page is
# queried, not the whole document; confirm this is the intended count.
def get_image_num(self):
return len(self.pdf_file.get_page_images(0))

def page_images(self, page_without_pril):
total_height = 0
for page_num in range(page_without_pril):
page = self.pdf_file[page_num]
images = self.pdf_file.get_page_images(page)
page = self.pages[page_num]
# page = self.pdf_file[page_num]
# images = self.pdf_file.get_page_images(page)
images = page.get_images()
for image in images:
image_coord = page.get_image_bbox(image[7], transform=0) # might be [1.0, 1.0, -1.0, -1.0]
image_height = image_coord[3] - image_coord[1]
Expand Down
3 changes: 1 addition & 2 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,6 @@ docx2python~=2.0.4
oauthlib~=3.1.0
pdfplumber==0.6.1
pytest~=7.1.2
PyMuPDF~=1.22.5
PyPDF2~=3.0.1
configparser~=5.3.0
pytz~=2023.3
Expand All @@ -35,4 +34,4 @@ filetype==1.2.0
language-tool-python==2.7.1
markdown==3.4.4
md2pdf==1.0.1
# PyMuPDF==1.23.6
PyMuPDF==1.24.10

0 comments on commit bc6e857

Please sign in to comment.