From faac0a4d17caf3d9c145a99fb2a596925ad975e2 Mon Sep 17 00:00:00 2001
From: icecraft <xurui1@pjlab.org.cn>
Date: Thu, 12 Sep 2024 14:12:47 +0800
Subject: [PATCH 1/3] fix: 1. resolve incorrect pair relation of figure and
 footnote, 2. resolve incorrect pair relation of table and caption #590

---
 magic_pdf/libs/boxbase.py      |  19 +++++
 magic_pdf/model/magic_model.py | 141 ++++++++++++++++++++++-----------
 magic_pdf/tools/common.py      |   2 +-
 3 files changed, 116 insertions(+), 46 deletions(-)

diff --git a/magic_pdf/libs/boxbase.py b/magic_pdf/libs/boxbase.py
index 90f46ef2..0472328f 100644
--- a/magic_pdf/libs/boxbase.py
+++ b/magic_pdf/libs/boxbase.py
@@ -426,3 +426,22 @@ def dist(point1, point2):
     elif top:
         return y2 - y1b
     return 0.0
+
+
+def box_area(bbox):
+    return (bbox[2] - bbox[0]) * (bbox[3] - bbox[1])
+
+
+def get_overlap_area(bbox1, bbox2):
+    """Return the area of the intersection of bbox1 and bbox2 (0.0 if they are disjoint)."""
+    # Determine the coordinates of the intersection rectangle
+    x_left = max(bbox1[0], bbox2[0])
+    y_top = max(bbox1[1], bbox2[1])
+    x_right = min(bbox1[2], bbox2[2])
+    y_bottom = min(bbox1[3], bbox2[3])
+
+    if x_right < x_left or y_bottom < y_top:
+        return 0.0
+
+    # Area of the intersection rectangle
+    return (x_right - x_left) * (y_bottom - y_top)
diff --git a/magic_pdf/model/magic_model.py b/magic_pdf/model/magic_model.py
index 61dc3a43..bd8e061a 100644
--- a/magic_pdf/model/magic_model.py
+++ b/magic_pdf/model/magic_model.py
@@ -1,8 +1,9 @@
 import json
 
 from magic_pdf.libs.boxbase import (_is_in, _is_part_overlap, bbox_distance,
-                                    bbox_relative_pos, calculate_iou,
-                                    calculate_overlap_area_in_bbox1_area_ratio)
+                                    bbox_relative_pos, box_area, calculate_iou,
+                                    calculate_overlap_area_in_bbox1_area_ratio,
+                                    get_overlap_area)
 from magic_pdf.libs.commons import fitz, join_path
 from magic_pdf.libs.coordinate_transform import get_scale_ratio
 from magic_pdf.libs.local_math import float_gt
@@ -12,6 +13,7 @@
 from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
 
 CAPATION_OVERLAP_AREA_RATIO = 0.6
+MERGE_BOX_OVERLAP_AREA_RATIO = 1.1
 
 
 class MagicModel:
@@ -124,49 +126,51 @@ def __fix_footnote(self):
                     tables.append(obj)
                 if len(footnotes) * len(figures) == 0:
                     continue
-                dis_figure_footnote = {}
-                dis_table_footnote = {}
-
-                for i in range(len(footnotes)):
-                    for j in range(len(figures)):
-                        pos_flag_count = sum(
-                            list(
-                                map(
-                                    lambda x: 1 if x else 0,
-                                    bbox_relative_pos(
-                                        footnotes[i]['bbox'], figures[j]['bbox']
-                                    ),
-                                )
+            dis_figure_footnote = {}
+            dis_table_footnote = {}
+
+            for i in range(len(footnotes)):
+                for j in range(len(figures)):
+                    pos_flag_count = sum(
+                        list(
+                            map(
+                                lambda x: 1 if x else 0,
+                                bbox_relative_pos(
+                                    footnotes[i]['bbox'], figures[j]['bbox']
+                                ),
                             )
                         )
-                        if pos_flag_count > 1:
-                            continue
-                        dis_figure_footnote[i] = min(
-                            bbox_distance(figures[j]['bbox'], footnotes[i]['bbox']),
-                            dis_figure_footnote.get(i, float('inf')),
-                        )
-                for i in range(len(footnotes)):
-                    for j in range(len(tables)):
-                        pos_flag_count = sum(
-                            list(
-                                map(
-                                    lambda x: 1 if x else 0,
-                                    bbox_relative_pos(
-                                        footnotes[i]['bbox'], tables[j]['bbox']
-                                    ),
-                                )
+                    )
+                    if pos_flag_count > 1:
+                        continue
+                    dis_figure_footnote[i] = min(
+                        bbox_distance(figures[j]['bbox'], footnotes[i]['bbox']),
+                        dis_figure_footnote.get(i, float('inf')),
+                    )
+            for i in range(len(footnotes)):
+                for j in range(len(tables)):
+                    pos_flag_count = sum(
+                        list(
+                            map(
+                                lambda x: 1 if x else 0,
+                                bbox_relative_pos(
+                                    footnotes[i]['bbox'], tables[j]['bbox']
+                                ),
                             )
                         )
-                        if pos_flag_count > 1:
-                            continue
+                    )
+                    if pos_flag_count > 1:
+                        continue
 
-                        dis_table_footnote[i] = min(
-                            bbox_distance(tables[j]['bbox'], footnotes[i]['bbox']),
-                            dis_table_footnote.get(i, float('inf')),
-                        )
-                for i in range(len(footnotes)):
-                    if dis_table_footnote.get(i, float('inf')) > dis_figure_footnote[i]:
-                        footnotes[i]['category_id'] = CategoryId.ImageFootnote
+                    dis_table_footnote[i] = min(
+                        bbox_distance(tables[j]['bbox'], footnotes[i]['bbox']),
+                        dis_table_footnote.get(i, float('inf')),
+                    )
+            for i in range(len(footnotes)):
+                if i not in dis_figure_footnote:
+                    continue
+                if dis_table_footnote.get(i, float('inf')) > dis_figure_footnote[i]:
+                    footnotes[i]['category_id'] = CategoryId.ImageFootnote
 
     def __reduct_overlap(self, bboxes):
         N = len(bboxes)
@@ -191,6 +195,44 @@ def __tie_up_category_by_distance(
         筛选出所有和 merged bbox 有 overlap 且 overlap 面积大于 object 的面积的 subjects。
         再求出筛选出的 subjects 和 object 的最短距离
         """
+        def search_overlap_between_boxes(
+            subject_idx, object_idx
+        ):
+            idxes = [subject_idx, object_idx]
+            x0s = [all_bboxes[idx]['bbox'][0] for idx in idxes]
+            y0s = [all_bboxes[idx]['bbox'][1] for idx in idxes]
+            x1s = [all_bboxes[idx]['bbox'][2] for idx in idxes]
+            y1s = [all_bboxes[idx]['bbox'][3] for idx in idxes]
+
+            merged_bbox = [
+                min(x0s),
+                min(y0s),
+                max(x1s),
+                max(y1s),
+            ]
+            ratio = 0
+
+            other_objects = list(
+                map(
+                    lambda x: {'bbox': x['bbox'], 'score': x['score']},
+                    filter(
+                        lambda x: x['category_id']
+                        not in (object_category_id, subject_category_id),
+                        self.__model_list[page_no]['layout_dets'],
+                    ),
+                )
+            )
+            for other_object in other_objects:
+                ratio = max(
+                    ratio,
+                    get_overlap_area(
+                        merged_bbox, other_object['bbox']
+                    ) * 1.0 / box_area(all_bboxes[object_idx]['bbox'])
+                )
+                if ratio >= MERGE_BOX_OVERLAP_AREA_RATIO:
+                    break
+
+            return ratio
 
         def may_find_other_nearest_bbox(subject_idx, object_idx):
             ret = float('inf')
@@ -299,6 +341,15 @@ def expand_bbbox(idxes):
                 ):
                     continue
 
+                subject_idx, object_idx = i, j
+                if all_bboxes[j]['category_id'] == subject_category_id:
+                    subject_idx, object_idx = j, i
+
+                if search_overlap_between_boxes(subject_idx, object_idx) >= MERGE_BOX_OVERLAP_AREA_RATIO:
+                    dis[i][j] = float('inf')
+                    dis[j][i] = dis[i][j]
+                    continue
+
                 dis[i][j] = bbox_distance(all_bboxes[i]['bbox'], all_bboxes[j]['bbox'])
                 dis[j][i] = dis[i][j]
 
@@ -627,13 +678,13 @@ def remove_duplicate_spans(spans):
                     span['type'] = ContentType.Image
                 elif category_id == 5:
                     # 获取table模型结果
-                    latex = layout_det.get("latex", None)
-                    html = layout_det.get("html", None)
+                    latex = layout_det.get('latex', None)
+                    html = layout_det.get('html', None)
                     if latex:
-                        span["latex"] = latex
+                        span['latex'] = latex
                     elif html:
-                        span["html"] = html
-                    span["type"] = ContentType.Table
+                        span['html'] = html
+                    span['type'] = ContentType.Table
                 elif category_id == 13:
                     span['content'] = layout_det['latex']
                     span['type'] = ContentType.InlineEquation
diff --git a/magic_pdf/tools/common.py b/magic_pdf/tools/common.py
index 6d7a381b..419457ec 100644
--- a/magic_pdf/tools/common.py
+++ b/magic_pdf/tools/common.py
@@ -46,7 +46,7 @@ def do_parse(
     end_page_id=None,
 ):
     if debug_able:
-        logger.warning("debug mode is on")
+        logger.warning('debug mode is on')
         f_dump_content_list = True
         f_draw_model_bbox = True
 

From 85ed25875e0e59e214fa496b2c8427c9795cac07 Mon Sep 17 00:00:00 2001
From: myhloli <moe@myhloli.com>
Date: Thu, 12 Sep 2024 14:00:03 +0000
Subject: [PATCH 2/3] Update version.py with new version

---
 magic_pdf/libs/version.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/magic_pdf/libs/version.py b/magic_pdf/libs/version.py
index 777f190d..8088f751 100644
--- a/magic_pdf/libs/version.py
+++ b/magic_pdf/libs/version.py
@@ -1 +1 @@
-__version__ = "0.8.0"
+__version__ = "0.8.1"

From 80082d3d804bb5bd91b46bc08800ab20222b9733 Mon Sep 17 00:00:00 2001
From: samqin123 <103937568+samqin123@users.noreply.github.com>
Date: Mon, 30 Sep 2024 09:24:55 +0800
Subject: [PATCH 3/3] Update magic_pdf_parse_main.py

---
 demo/magic_pdf_parse_main.py | 54 +++++++++++++++---------------------
 1 file changed, 22 insertions(+), 32 deletions(-)

diff --git a/demo/magic_pdf_parse_main.py b/demo/magic_pdf_parse_main.py
index 95b84ef0..d74b0ed8 100644
--- a/demo/magic_pdf_parse_main.py
+++ b/demo/magic_pdf_parse_main.py
@@ -1,7 +1,9 @@
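+## The code below loops over the PDF files in a directory for convenient batch processing. Before this change, files were opened but never closed, which caused memory errors; thanks to i12345 for the help. The code was modified and submitted as a PR; maintainers, please refine it for everyone's benefit.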
+
 import os
 import json
 import copy
-
+import glob
 from loguru import logger
 
 from magic_pdf.pipe.UNIPipe import UNIPipe
@@ -12,8 +14,6 @@
 
 model_config.__use_inside_model__ = True
 
-# todo: 设备类型选择 （？）
-
 def json_md_dump(
         pipe,
         md_writer,
@@ -46,23 +46,13 @@ def json_md_dump(
         path=f"{pdf_name}.md"
     )
 
-
 def pdf_parse_main(
         pdf_path: str,
-        parse_method: str = 'auto',
+        parse_method: str = 'ocr',
         model_json_path: str = None,
         is_json_md_dump: bool = True,
         output_dir: str = None
 ):
-    """
-    执行从 pdf 转换到 json、md 的过程，输出 md 和 json 文件到 pdf 文件所在的目录
-
-    :param pdf_path: .pdf 文件的路径，可以是相对路径，也可以是绝对路径
-    :param parse_method: 解析方法， 共 auto、ocr、txt 三种，默认 auto，如果效果不好，可以尝试 ocr
-    :param model_json_path: 已经存在的模型数据文件，如果为空则使用内置模型，pdf 和 model_json 务必对应
-    :param is_json_md_dump: 是否将解析后的数据写入到 .json 和 .md 文件中，默认 True，会将不同阶段的数据写入到不同的 .json 文件中（共3个.json文件），md内容会保存到 .md 文件中
-    :param output_dir: 输出结果的目录地址，会生成一个以 pdf 文件名命名的文件夹并保存所有结果
-    """
     try:
         pdf_name = os.path.basename(pdf_path).split(".")[0]
         pdf_path_parent = os.path.dirname(pdf_path)
@@ -77,21 +67,19 @@ def pdf_parse_main(
         # 获取图片的父路径，为的是以相对路径保存到 .md 和 conent_list.json 文件中
         image_path_parent = os.path.basename(output_image_path)
 
-        pdf_bytes = open(pdf_path, "rb").read()  # 读取 pdf 文件的二进制数据
+        # Use `with open` so the file is opened and closed automatically
+        with open(pdf_path, "rb") as pdf_file:
+            pdf_bytes = pdf_file.read()
 
         if model_json_path:
-            # 读取已经被模型解析后的pdf文件的 json 原始数据，list 类型
-            model_json = json.loads(open(model_json_path, "r", encoding="utf-8").read())
+            with open(model_json_path, "r", encoding="utf-8") as model_file:
+                model_json = json.load(model_file)
         else:
             model_json = []
 
         # 执行解析步骤
-        # image_writer = DiskReaderWriter(output_image_path)
         image_writer, md_writer = DiskReaderWriter(output_image_path), DiskReaderWriter(output_path)
 
-        # 选择解析方式
-        # jso_useful_key = {"_pdf_type": "", "model_list": model_json}
-        # pipe = UNIPipe(pdf_bytes, jso_useful_key, image_writer)
         if parse_method == "auto":
             jso_useful_key = {"_pdf_type": "", "model_list": model_json}
             pipe = UNIPipe(pdf_bytes, jso_useful_key, image_writer)
@@ -103,34 +91,36 @@ def pdf_parse_main(
             logger.error("unknown parse method, only auto, ocr, txt allowed")
             exit(1)
 
-        # 执行分类
         pipe.pipe_classify()
 
-        # 如果没有传入模型数据，则使用内置模型解析
         if not model_json:
             if model_config.__use_inside_model__:
-                pipe.pipe_analyze()  # 解析
+                pipe.pipe_analyze()
             else:
                 logger.error("need model list input")
                 exit(1)
 
-        # 执行解析
         pipe.pipe_parse()
 
-        # 保存 text 和 md 格式的结果
         content_list = pipe.pipe_mk_uni_format(image_path_parent, drop_mode="none")
         md_content = pipe.pipe_mk_markdown(image_path_parent, drop_mode="none")
 
-
         if is_json_md_dump:
             json_md_dump(pipe, md_writer, pdf_name, content_list, md_content)
 
-
     except Exception as e:
         logger.exception(e)
 
-
-# 测试
 if __name__ == '__main__':
-    pdf_path = r"C:\Users\XYTK2\Desktop\2024-2016-gb-cd-300.pdf"
-    pdf_parse_main(pdf_path)
+    pdf_directory = r"E:/MinerU/pdf_to_go"
+    pdf_files = glob.glob(os.path.join(pdf_directory, "*.pdf"))
+    output_dir = r"E:/MinerU/pdfoutput"
+
+    # Print the list of PDF files
+    print("本次处理的PDF 文件列表:")
+    for pdf_path in pdf_files:
+        print(pdf_path)
+
+    # Process each PDF file in turn
+    for pdf_path in pdf_files:
+        pdf_parse_main(pdf_path, output_dir=output_dir)