Skip to content

Commit

Permalink
feat(pdf_parse_union_core_v2): reintegrate para_split_v3 and add page…
Browse files Browse the repository at this point in the history
… range support

- Reintegrate para_split_v3 into the pdf_parse_union_core_v2 process
- Add support for specifying page range in doc_analyze_by_custom_model
- Implement garbage collection and memory cleaning after processing
- Refine image loading from PDF, including handling out-of-range pages
  • Loading branch information
myhloli committed Oct 10, 2024
1 parent 675f8e6 commit 6f63e70
Show file tree
Hide file tree
Showing 3 changed files with 145 additions and 20 deletions.
50 changes: 33 additions & 17 deletions magic_pdf/model/doc_analyze_by_custom_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import numpy as np
from loguru import logger

from magic_pdf.libs.clean_memory import clean_memory
from magic_pdf.libs.config_reader import get_local_models_dir, get_device, get_table_recog_config
from magic_pdf.model.model_list import MODEL
import magic_pdf.model as model_config
Expand All @@ -23,7 +24,7 @@ def remove_duplicates_dicts(lst):
return unique_dicts


def load_images_from_pdf(pdf_bytes: bytes, dpi=200) -> list:
def load_images_from_pdf(pdf_bytes: bytes, dpi=200, start_page_id=0, end_page_id=None) -> list:
try:
from PIL import Image
except ImportError:
Expand All @@ -32,18 +33,28 @@ def load_images_from_pdf(pdf_bytes: bytes, dpi=200) -> list:

images = []
with fitz.open("pdf", pdf_bytes) as doc:
pdf_page_num = doc.page_count
end_page_id = end_page_id if end_page_id is not None and end_page_id >= 0 else pdf_page_num - 1
if end_page_id > pdf_page_num - 1:
logger.warning("end_page_id is out of range, use images length")
end_page_id = pdf_page_num - 1

for index in range(0, doc.page_count):
page = doc[index]
mat = fitz.Matrix(dpi / 72, dpi / 72)
pm = page.get_pixmap(matrix=mat, alpha=False)
if start_page_id <= index <= end_page_id:
page = doc[index]
mat = fitz.Matrix(dpi / 72, dpi / 72)
pm = page.get_pixmap(matrix=mat, alpha=False)

# If the width or height exceeds 9000 after scaling, do not scale further.
if pm.width > 9000 or pm.height > 9000:
pm = page.get_pixmap(matrix=fitz.Matrix(1, 1), alpha=False)

# If the width or height exceeds 9000 after scaling, do not scale further.
if pm.width > 9000 or pm.height > 9000:
pm = page.get_pixmap(matrix=fitz.Matrix(1, 1), alpha=False)
img = Image.frombytes("RGB", (pm.width, pm.height), pm.samples)
img = np.array(img)
img_dict = {"img": img, "width": pm.width, "height": pm.height}
else:
img_dict = {"img": [], "width": 0, "height": 0}

img = Image.frombytes("RGB", (pm.width, pm.height), pm.samples)
img = np.array(img)
img_dict = {"img": img, "width": pm.width, "height": pm.height}
images.append(img_dict)
return images

Expand Down Expand Up @@ -111,14 +122,14 @@ def doc_analyze(pdf_bytes: bytes, ocr: bool = False, show_log: bool = False,
model_manager = ModelSingleton()
custom_model = model_manager.get_model(ocr, show_log, lang)

images = load_images_from_pdf(pdf_bytes)

# end_page_id = end_page_id if end_page_id else len(images) - 1
end_page_id = end_page_id if end_page_id is not None and end_page_id >= 0 else len(images) - 1
with fitz.open("pdf", pdf_bytes) as doc:
pdf_page_num = doc.page_count
end_page_id = end_page_id if end_page_id is not None and end_page_id >= 0 else pdf_page_num - 1
if end_page_id > pdf_page_num - 1:
logger.warning("end_page_id is out of range, use images length")
end_page_id = pdf_page_num - 1

if end_page_id > len(images) - 1:
logger.warning("end_page_id is out of range, use images length")
end_page_id = len(images) - 1
images = load_images_from_pdf(pdf_bytes, start_page_id=start_page_id, end_page_id=end_page_id)

model_json = []
doc_analyze_start = time.time()
Expand All @@ -135,6 +146,11 @@ def doc_analyze(pdf_bytes: bytes, ocr: bool = False, show_log: bool = False,
page_dict = {"layout_dets": result, "page_info": page_info}
model_json.append(page_dict)

gc_start = time.time()
clean_memory()
gc_time = round(time.time() - gc_start, 2)
logger.info(f"gc time: {gc_time}")

doc_analyze_time = round(time.time() - doc_analyze_start, 2)
doc_analyze_speed = round( (end_page_id + 1 - start_page_id) / doc_analyze_time, 2)
logger.info(f"doc analyze time: {round(time.time() - doc_analyze_start, 2)},"
Expand Down
110 changes: 110 additions & 0 deletions magic_pdf/para/para_split_v3.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,110 @@
import copy

from magic_pdf.libs.Constants import LINES_DELETED, CROSS_PAGE

LINE_STOP_FLAG = ('.', '!', '?', '。', '!', '?')


def __process_blocks(blocks):
    """Partition the page's text blocks into consecutive groups.

    Walks *blocks* in order, collecting 'text' blocks into the current
    group.  The current group is closed whenever the *next* block is a
    'title' or 'interline_equation', so paragraph merging never crosses
    those boundaries.  Non-text blocks are not placed in any group.

    Side effect: every text block gains a 'bbox_fs' key — a tight
    bounding box over its lines (a copy of the block's own 'bbox' when
    it has no lines).

    Returns a list of groups, each a list of text-block dicts.
    """
    groups = []
    pending = []

    for idx, blk in enumerate(blocks):
        if blk['type'] != 'text':
            continue

        # Tight bbox over the block's lines; fall back to the block bbox.
        blk['bbox_fs'] = copy.deepcopy(blk['bbox'])
        lines = blk['lines']
        if lines:
            blk['bbox_fs'] = [
                min(ln['bbox'][0] for ln in lines),
                min(ln['bbox'][1] for ln in lines),
                max(ln['bbox'][2] for ln in lines),
                max(ln['bbox'][3] for ln in lines),
            ]

        pending.append(blk)

        # A following title/equation terminates the current group.
        nxt = blocks[idx + 1] if idx + 1 < len(blocks) else None
        if nxt is not None and nxt['type'] in ('title', 'interline_equation'):
            groups.append(pending)
            pending = []

    # Flush the trailing group, if any.
    if pending:
        groups.append(pending)

    return groups


def __merge_2_blocks(block1, block2):
    """Merge *block1* (the later block) into *block2* (the earlier one)
    when block1 looks like the continuation of block2's paragraph.

    Both heuristics must hold:
      * block1's first line starts flush with the block's left edge
        (within half a line height — i.e. no paragraph indent), and
      * block2's last line runs to the block's right edge and its last
        span does not end with sentence-final punctuation.

    On merge, block1's lines are appended to block2, block1 is emptied
    and flagged LINES_DELETED; spans moved across a page boundary are
    flagged CROSS_PAGE.  Returns (block1, block2) in all cases.
    """
    # Fix: also require block2 to have lines — the original indexed
    # block2['lines'][-1] unconditionally, raising IndexError for a
    # line-less block.
    if len(block1['lines']) > 0 and len(block2['lines']) > 0:
        first_line = block1['lines'][0]
        line_height = first_line['bbox'][3] - first_line['bbox'][1]
        # Continuation lines start at the block's left edge (no indent).
        if abs(block1['bbox_fs'][0] - first_line['bbox'][0]) < line_height / 2:
            last_line = block2['lines'][-1]
            if len(last_line['spans']) > 0:
                last_span = last_line['spans'][-1]
                line_height = last_line['bbox'][3] - last_line['bbox'][1]
                # Previous paragraph is "unfinished": its last line fills the
                # width and does not end with sentence-final punctuation.
                if abs(block2['bbox_fs'][2] - last_line['bbox'][2]) < line_height and not last_span['content'].endswith(LINE_STOP_FLAG):
                    if block1['page_num'] != block2['page_num']:
                        # Mark spans that migrate across a page boundary.
                        for line in block1['lines']:
                            for span in line['spans']:
                                span[CROSS_PAGE] = True
                    block2['lines'].extend(block1['lines'])
                    block1['lines'] = []
                    block1[LINES_DELETED] = True

    return block1, block2


def __para_merge_page(blocks):
    """Merge continuation paragraphs within each text-block group.

    Groups are produced by __process_blocks; within a group the blocks
    are visited from last to first so each block is merged into its
    predecessor and chains of continuations collapse toward the
    earliest block.
    """
    page_text_blocks_groups = __process_blocks(blocks)
    for text_blocks_group in page_text_blocks_groups:
        if len(text_blocks_group) > 1:
            # Iterate backwards down to index 1 only: index 0 has no
            # predecessor to merge into.  (The original looped to 0 and
            # skipped it with a no-op else/continue branch.)
            for i in range(len(text_blocks_group) - 1, 0, -1):
                current_block = text_blocks_group[i]
                prev_block = text_blocks_group[i - 1]
                __merge_2_blocks(current_block, prev_block)


def para_split(pdf_info_dict, debug_mode=False):
    """Run paragraph merging across every page of *pdf_info_dict*.

    Deep-copies each page's 'preproc_blocks' (the originals are left
    untouched), tags each copied block with its page key, merges
    continuation paragraphs — including across page boundaries — and
    writes the result back as each page's 'para_blocks'.

    *debug_mode* is accepted for interface compatibility; it is
    currently unused.
    """
    all_blocks = []
    for page_num, page in pdf_info_dict.items():
        blocks = copy.deepcopy(page['preproc_blocks'])
        for block in blocks:
            block['page_num'] = page_num
        all_blocks.extend(blocks)

    __para_merge_page(all_blocks)

    # Bucket blocks by page in one pass; the original rescanned
    # all_blocks once per page, i.e. O(pages * blocks).  Order within
    # each page is preserved.
    blocks_by_page = {}
    for block in all_blocks:
        blocks_by_page.setdefault(block['page_num'], []).append(block)

    for page_num, page in pdf_info_dict.items():
        page['para_blocks'] = blocks_by_page.get(page_num, [])


if __name__ == '__main__':
    # Smoke-test __process_blocks with a mixed block sequence.
    # Fix: text blocks must carry 'bbox' and 'lines' keys — the original
    # sample omitted them, so __process_blocks raised KeyError: 'bbox'.
    def _text_block(content):
        """Build a minimal text block acceptable to __process_blocks."""
        return {'type': 'text', 'content': content,
                'bbox': [0, 0, 100, 10], 'lines': []}

    input_blocks = [
        _text_block('这是第一段'),
        _text_block('这是第二段'),
        {'type': 'title', 'content': '这是一个标题'},
        _text_block('这是第三段'),
        {'type': 'interline_equation', 'content': '这是一个公式'},
        _text_block('这是第四段'),
        {'type': 'image', 'content': '这是一张图片'},
        _text_block('这是第五段'),
        {'type': 'table', 'content': '这是一张表格'}
    ]

    # Print the resulting grouping for visual inspection.
    for group_index, group in enumerate(__process_blocks(input_blocks)):
        print(f"Group {group_index}: {group}")
5 changes: 2 additions & 3 deletions magic_pdf/pdf_parse_union_core_v2.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
from magic_pdf.libs.local_math import float_equal
from magic_pdf.libs.ocr_content_type import ContentType
from magic_pdf.model.magic_model import MagicModel
from magic_pdf.para.para_split_v3 import para_split
from magic_pdf.pre_proc.citationmarker_remove import remove_citation_marker
from magic_pdf.pre_proc.construct_page_dict import ocr_construct_page_component_v2
from magic_pdf.pre_proc.cut_image import ocr_cut_image_and_table
Expand Down Expand Up @@ -435,9 +436,7 @@ def pdf_parse_union(pdf_bytes,
pdf_info_dict[f"page_{page_id}"] = page_info

"""分段"""
# para_split(pdf_info_dict, debug_mode=debug_mode)
for page_num, page in pdf_info_dict.items():
page['para_blocks'] = page['preproc_blocks']
para_split(pdf_info_dict, debug_mode=debug_mode)

"""dict转list"""
pdf_info_list = dict_to_list(pdf_info_dict)
Expand Down

0 comments on commit 6f63e70

Please sign in to comment.