From faac0a4d17caf3d9c145a99fb2a596925ad975e2 Mon Sep 17 00:00:00 2001 From: icecraft Date: Thu, 12 Sep 2024 14:12:47 +0800 Subject: [PATCH 1/3] fix: 1. resolve incorrect pair relation of figure and footnote, 2. resolve incorrect pair relation of table and caption #590 --- magic_pdf/libs/boxbase.py | 19 +++++ magic_pdf/model/magic_model.py | 141 ++++++++++++++++++++++----------- magic_pdf/tools/common.py | 2 +- 3 files changed, 116 insertions(+), 46 deletions(-) diff --git a/magic_pdf/libs/boxbase.py b/magic_pdf/libs/boxbase.py index 90f46ef2..0472328f 100644 --- a/magic_pdf/libs/boxbase.py +++ b/magic_pdf/libs/boxbase.py @@ -426,3 +426,22 @@ def dist(point1, point2): elif top: return y2 - y1b return 0.0 + + +def box_area(bbox): + return (bbox[2] - bbox[0]) * (bbox[3] - bbox[1]) + + +def get_overlap_area(bbox1, bbox2): + """Return the area of the intersection of bbox1 and bbox2 (0.0 if they do not overlap).""" + # Determine the coordinates of the intersection rectangle + x_left = max(bbox1[0], bbox2[0]) + y_top = max(bbox1[1], bbox2[1]) + x_right = min(bbox1[2], bbox2[2]) + y_bottom = min(bbox1[3], bbox2[3]) + + if x_right < x_left or y_bottom < y_top: + return 0.0 + + # Area of the intersection rectangle + return (x_right - x_left) * (y_bottom - y_top) diff --git a/magic_pdf/model/magic_model.py b/magic_pdf/model/magic_model.py index 61dc3a43..bd8e061a 100644 --- a/magic_pdf/model/magic_model.py +++ b/magic_pdf/model/magic_model.py @@ -1,8 +1,9 @@ import json from magic_pdf.libs.boxbase import (_is_in, _is_part_overlap, bbox_distance, - bbox_relative_pos, calculate_iou, - calculate_overlap_area_in_bbox1_area_ratio) + bbox_relative_pos, box_area, calculate_iou, + calculate_overlap_area_in_bbox1_area_ratio, + get_overlap_area) from magic_pdf.libs.commons import fitz, join_path from magic_pdf.libs.coordinate_transform import get_scale_ratio from magic_pdf.libs.local_math import float_gt @@ -12,6 +13,7 @@ from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter CAPATION_OVERLAP_AREA_RATIO = 0.6 +MERGE_BOX_OVERLAP_AREA_RATIO = 1.1 
class MagicModel: @@ -124,49 +126,51 @@ def __fix_footnote(self): tables.append(obj) if len(footnotes) * len(figures) == 0: continue - dis_figure_footnote = {} - dis_table_footnote = {} - - for i in range(len(footnotes)): - for j in range(len(figures)): - pos_flag_count = sum( - list( - map( - lambda x: 1 if x else 0, - bbox_relative_pos( - footnotes[i]['bbox'], figures[j]['bbox'] - ), - ) + dis_figure_footnote = {} + dis_table_footnote = {} + + for i in range(len(footnotes)): + for j in range(len(figures)): + pos_flag_count = sum( + list( + map( + lambda x: 1 if x else 0, + bbox_relative_pos( + footnotes[i]['bbox'], figures[j]['bbox'] + ), ) ) - if pos_flag_count > 1: - continue - dis_figure_footnote[i] = min( - bbox_distance(figures[j]['bbox'], footnotes[i]['bbox']), - dis_figure_footnote.get(i, float('inf')), - ) - for i in range(len(footnotes)): - for j in range(len(tables)): - pos_flag_count = sum( - list( - map( - lambda x: 1 if x else 0, - bbox_relative_pos( - footnotes[i]['bbox'], tables[j]['bbox'] - ), - ) + ) + if pos_flag_count > 1: + continue + dis_figure_footnote[i] = min( + bbox_distance(figures[j]['bbox'], footnotes[i]['bbox']), + dis_figure_footnote.get(i, float('inf')), + ) + for i in range(len(footnotes)): + for j in range(len(tables)): + pos_flag_count = sum( + list( + map( + lambda x: 1 if x else 0, + bbox_relative_pos( + footnotes[i]['bbox'], tables[j]['bbox'] + ), ) ) - if pos_flag_count > 1: - continue + ) + if pos_flag_count > 1: + continue - dis_table_footnote[i] = min( - bbox_distance(tables[j]['bbox'], footnotes[i]['bbox']), - dis_table_footnote.get(i, float('inf')), - ) - for i in range(len(footnotes)): - if dis_table_footnote.get(i, float('inf')) > dis_figure_footnote[i]: - footnotes[i]['category_id'] = CategoryId.ImageFootnote + dis_table_footnote[i] = min( + bbox_distance(tables[j]['bbox'], footnotes[i]['bbox']), + dis_table_footnote.get(i, float('inf')), + ) + for i in range(len(footnotes)): + if i not in dis_figure_footnote: + 
continue + if dis_table_footnote.get(i, float('inf')) > dis_figure_footnote[i]: + footnotes[i]['category_id'] = CategoryId.ImageFootnote def __reduct_overlap(self, bboxes): N = len(bboxes) @@ -191,6 +195,44 @@ def __tie_up_category_by_distance( 筛选出所有和 merged bbox 有 overlap 且 overlap 面积大于 object 的面积的 subjects。 再求出筛选出的 subjects 和 object 的最短距离 """ + def search_overlap_between_boxes( + subject_idx, object_idx + ): + idxes = [subject_idx, object_idx] + x0s = [all_bboxes[idx]['bbox'][0] for idx in idxes] + y0s = [all_bboxes[idx]['bbox'][1] for idx in idxes] + x1s = [all_bboxes[idx]['bbox'][2] for idx in idxes] + y1s = [all_bboxes[idx]['bbox'][3] for idx in idxes] + + merged_bbox = [ + min(x0s), + min(y0s), + max(x1s), + max(y1s), + ] + ratio = 0 + + other_objects = list( + map( + lambda x: {'bbox': x['bbox'], 'score': x['score']}, + filter( + lambda x: x['category_id'] + not in (object_category_id, subject_category_id), + self.__model_list[page_no]['layout_dets'], + ), + ) + ) + for other_object in other_objects: + ratio = max( + ratio, + get_overlap_area( + merged_bbox, other_object['bbox'] + ) * 1.0 / box_area(all_bboxes[object_idx]['bbox']) + ) + if ratio >= MERGE_BOX_OVERLAP_AREA_RATIO: + break + + return ratio def may_find_other_nearest_bbox(subject_idx, object_idx): ret = float('inf') @@ -299,6 +341,15 @@ def expand_bbbox(idxes): ): continue + subject_idx, object_idx = i, j + if all_bboxes[j]['category_id'] == subject_category_id: + subject_idx, object_idx = j, i + + if search_overlap_between_boxes(subject_idx, object_idx) >= MERGE_BOX_OVERLAP_AREA_RATIO: + dis[i][j] = float('inf') + dis[j][i] = dis[i][j] + continue + dis[i][j] = bbox_distance(all_bboxes[i]['bbox'], all_bboxes[j]['bbox']) dis[j][i] = dis[i][j] @@ -627,13 +678,13 @@ def remove_duplicate_spans(spans): span['type'] = ContentType.Image elif category_id == 5: # 获取table模型结果 - latex = layout_det.get("latex", None) - html = layout_det.get("html", None) + latex = layout_det.get('latex', None) + html = 
layout_det.get('html', None) if latex: - span["latex"] = latex + span['latex'] = latex elif html: - span["html"] = html - span["type"] = ContentType.Table + span['html'] = html + span['type'] = ContentType.Table elif category_id == 13: span['content'] = layout_det['latex'] span['type'] = ContentType.InlineEquation diff --git a/magic_pdf/tools/common.py b/magic_pdf/tools/common.py index 6d7a381b..419457ec 100644 --- a/magic_pdf/tools/common.py +++ b/magic_pdf/tools/common.py @@ -46,7 +46,7 @@ def do_parse( end_page_id=None, ): if debug_able: - logger.warning("debug mode is on") + logger.warning('debug mode is on') f_dump_content_list = True f_draw_model_bbox = True From 85ed25875e0e59e214fa496b2c8427c9795cac07 Mon Sep 17 00:00:00 2001 From: myhloli Date: Thu, 12 Sep 2024 14:00:03 +0000 Subject: [PATCH 2/3] Update version.py with new version --- magic_pdf/libs/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/magic_pdf/libs/version.py b/magic_pdf/libs/version.py index 777f190d..8088f751 100644 --- a/magic_pdf/libs/version.py +++ b/magic_pdf/libs/version.py @@ -1 +1 @@ -__version__ = "0.8.0" +__version__ = "0.8.1" From 80082d3d804bb5bd91b46bc08800ab20222b9733 Mon Sep 17 00:00:00 2001 From: samqin123 <103937568+samqin123@users.noreply.github.com> Date: Mon, 30 Sep 2024 09:24:55 +0800 Subject: [PATCH 3/3] Update magic_pdf_parse_main.py --- demo/magic_pdf_parse_main.py | 54 +++++++++++++++--------------------- 1 file changed, 22 insertions(+), 32 deletions(-) diff --git a/demo/magic_pdf_parse_main.py b/demo/magic_pdf_parse_main.py index 95b84ef0..d74b0ed8 100644 --- a/demo/magic_pdf_parse_main.py +++ b/demo/magic_pdf_parse_main.py @@ -1,7 +1,9 @@ +## 以下代码是可以对目录中pdf文件进行循环处理,方便使用。修改前,由于文件打开后没有关闭,造成内存报错,感谢i12345的帮助,修改了代码,提交PR,请作者完善,造福大家。 + import os import json import copy - +import glob from loguru import logger from magic_pdf.pipe.UNIPipe import UNIPipe @@ -12,8 +14,6 @@ model_config.__use_inside_model__ = True -# todo: 设备类型选择 (?) 
- def json_md_dump( pipe, md_writer, @@ -46,23 +46,13 @@ def json_md_dump( path=f"{pdf_name}.md" ) - def pdf_parse_main( pdf_path: str, - parse_method: str = 'auto', + parse_method: str = 'ocr', model_json_path: str = None, is_json_md_dump: bool = True, output_dir: str = None ): - """ - 执行从 pdf 转换到 json、md 的过程,输出 md 和 json 文件到 pdf 文件所在的目录 - - :param pdf_path: .pdf 文件的路径,可以是相对路径,也可以是绝对路径 - :param parse_method: 解析方法, 共 auto、ocr、txt 三种,默认 auto,如果效果不好,可以尝试 ocr - :param model_json_path: 已经存在的模型数据文件,如果为空则使用内置模型,pdf 和 model_json 务必对应 - :param is_json_md_dump: 是否将解析后的数据写入到 .json 和 .md 文件中,默认 True,会将不同阶段的数据写入到不同的 .json 文件中(共3个.json文件),md内容会保存到 .md 文件中 - :param output_dir: 输出结果的目录地址,会生成一个以 pdf 文件名命名的文件夹并保存所有结果 - """ try: pdf_name = os.path.basename(pdf_path).split(".")[0] pdf_path_parent = os.path.dirname(pdf_path) @@ -77,21 +67,19 @@ def pdf_parse_main( # 获取图片的父路径,为的是以相对路径保存到 .md 和 conent_list.json 文件中 image_path_parent = os.path.basename(output_image_path) - pdf_bytes = open(pdf_path, "rb").read() # 读取 pdf 文件的二进制数据 + # 使用 with open 自动处理文件的打开和关闭 + with open(pdf_path, "rb") as pdf_file: + pdf_bytes = pdf_file.read() if model_json_path: - # 读取已经被模型解析后的pdf文件的 json 原始数据,list 类型 - model_json = json.loads(open(model_json_path, "r", encoding="utf-8").read()) + with open(model_json_path, "r", encoding="utf-8") as model_file: + model_json = json.load(model_file) else: model_json = [] # 执行解析步骤 - # image_writer = DiskReaderWriter(output_image_path) image_writer, md_writer = DiskReaderWriter(output_image_path), DiskReaderWriter(output_path) - # 选择解析方式 - # jso_useful_key = {"_pdf_type": "", "model_list": model_json} - # pipe = UNIPipe(pdf_bytes, jso_useful_key, image_writer) if parse_method == "auto": jso_useful_key = {"_pdf_type": "", "model_list": model_json} pipe = UNIPipe(pdf_bytes, jso_useful_key, image_writer) @@ -103,34 +91,36 @@ def pdf_parse_main( logger.error("unknown parse method, only auto, ocr, txt allowed") exit(1) - # 执行分类 pipe.pipe_classify() - # 如果没有传入模型数据,则使用内置模型解析 if 
not model_json: if model_config.__use_inside_model__: - pipe.pipe_analyze() # 解析 + pipe.pipe_analyze() else: logger.error("need model list input") exit(1) - # 执行解析 pipe.pipe_parse() - # 保存 text 和 md 格式的结果 content_list = pipe.pipe_mk_uni_format(image_path_parent, drop_mode="none") md_content = pipe.pipe_mk_markdown(image_path_parent, drop_mode="none") - if is_json_md_dump: json_md_dump(pipe, md_writer, pdf_name, content_list, md_content) - except Exception as e: logger.exception(e) - -# 测试 if __name__ == '__main__': - pdf_path = r"C:\Users\XYTK2\Desktop\2024-2016-gb-cd-300.pdf" - pdf_parse_main(pdf_path) + pdf_directory = r"E:/MinerU/pdf_to_go" + pdf_files = glob.glob(os.path.join(pdf_directory, "*.pdf")) + output_dir = r"E:/MinerU/pdfoutput" + + # 打印PDF文件列表 + print("本次处理的PDF 文件列表:") + for pdf_path in pdf_files: + print(pdf_path) + + # 循环处理每个PDF文件 + for pdf_path in pdf_files: + pdf_parse_main(pdf_path, output_dir=output_dir)