Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update magic_pdf_parse_main.py #676

Open
wants to merge 4 commits into
base: dev
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
54 changes: 22 additions & 32 deletions demo/magic_pdf_parse_main.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
## 以下代码是可以对目录中pdf文件进行循环处理,方便使用。修改前,由于文件打开后没有关闭,造成内存报错,感谢i12345的帮助,修改了代码,提交PR,请作者完善,造福大家。

import os
import json
import copy

import glob
from loguru import logger

from magic_pdf.pipe.UNIPipe import UNIPipe
Expand All @@ -12,8 +14,6 @@

model_config.__use_inside_model__ = True

# todo: 设备类型选择 (?)

def json_md_dump(
pipe,
md_writer,
Expand Down Expand Up @@ -46,23 +46,13 @@ def json_md_dump(
path=f"{pdf_name}.md"
)


def pdf_parse_main(
pdf_path: str,
parse_method: str = 'auto',
parse_method: str = 'ocr',
model_json_path: str = None,
is_json_md_dump: bool = True,
output_dir: str = None
):
"""
执行从 pdf 转换到 json、md 的过程,输出 md 和 json 文件到 pdf 文件所在的目录

:param pdf_path: .pdf 文件的路径,可以是相对路径,也可以是绝对路径
:param parse_method: 解析方法, 共 auto、ocr、txt 三种,默认 auto,如果效果不好,可以尝试 ocr
:param model_json_path: 已经存在的模型数据文件,如果为空则使用内置模型,pdf 和 model_json 务必对应
:param is_json_md_dump: 是否将解析后的数据写入到 .json 和 .md 文件中,默认 True,会将不同阶段的数据写入到不同的 .json 文件中(共3个.json文件),md内容会保存到 .md 文件中
:param output_dir: 输出结果的目录地址,会生成一个以 pdf 文件名命名的文件夹并保存所有结果
"""
try:
pdf_name = os.path.basename(pdf_path).split(".")[0]
pdf_path_parent = os.path.dirname(pdf_path)
Expand All @@ -77,21 +67,19 @@ def pdf_parse_main(
# Take the base name of the image output directory so that images are referenced by relative path in the .md and content_list.json files
image_path_parent = os.path.basename(output_image_path)

pdf_bytes = open(pdf_path, "rb").read() # 读取 pdf 文件的二进制数据
# 使用 with open 自动处理文件的打开和关闭
with open(pdf_path, "rb") as pdf_file:
pdf_bytes = pdf_file.read()

if model_json_path:
# 读取已经被模型解析后的pdf文件的 json 原始数据,list 类型
model_json = json.loads(open(model_json_path, "r", encoding="utf-8").read())
with open(model_json_path, "r", encoding="utf-8") as model_file:
model_json = json.load(model_file)
else:
model_json = []

# 执行解析步骤
# image_writer = DiskReaderWriter(output_image_path)
image_writer, md_writer = DiskReaderWriter(output_image_path), DiskReaderWriter(output_path)

# 选择解析方式
# jso_useful_key = {"_pdf_type": "", "model_list": model_json}
# pipe = UNIPipe(pdf_bytes, jso_useful_key, image_writer)
if parse_method == "auto":
jso_useful_key = {"_pdf_type": "", "model_list": model_json}
pipe = UNIPipe(pdf_bytes, jso_useful_key, image_writer)
Expand All @@ -103,34 +91,36 @@ def pdf_parse_main(
logger.error("unknown parse method, only auto, ocr, txt allowed")
exit(1)

# 执行分类
pipe.pipe_classify()

# 如果没有传入模型数据,则使用内置模型解析
if not model_json:
if model_config.__use_inside_model__:
pipe.pipe_analyze() # 解析
pipe.pipe_analyze()
else:
logger.error("need model list input")
exit(1)

# 执行解析
pipe.pipe_parse()

# 保存 text 和 md 格式的结果
content_list = pipe.pipe_mk_uni_format(image_path_parent, drop_mode="none")
md_content = pipe.pipe_mk_markdown(image_path_parent, drop_mode="none")


if is_json_md_dump:
json_md_dump(pipe, md_writer, pdf_name, content_list, md_content)


except Exception as e:
logger.exception(e)


# 测试
if __name__ == '__main__':
pdf_path = r"C:\Users\XYTK2\Desktop\2024-2016-gb-cd-300.pdf"
pdf_parse_main(pdf_path)
pdf_directory = r"E:/MinerU/pdf_to_go"
pdf_files = glob.glob(os.path.join(pdf_directory, "*.pdf"))
output_dir = r"E:/MinerU/pdfoutput"

# 打印PDF文件列表
print("本次处理的PDF 文件列表:")
for pdf_path in pdf_files:
print(pdf_path)

# 循环处理每个PDF文件
for pdf_path in pdf_files:
pdf_parse_main(pdf_path, output_dir=output_dir)
19 changes: 19 additions & 0 deletions magic_pdf/libs/boxbase.py
Original file line number Diff line number Diff line change
Expand Up @@ -426,3 +426,22 @@ def dist(point1, point2):
elif top:
return y2 - y1b
return 0.0


def box_area(bbox):
    """Return the area of a box indexed as ``[x0, y0, x1, y1]``.

    The result is the signed product of the x extent and the y extent;
    no normalisation of the corner order is performed.
    """
    width = bbox[2] - bbox[0]
    height = bbox[3] - bbox[1]
    return width * height


def get_overlap_area(bbox1, bbox2):
    """Return the area of the intersection of ``bbox1`` and ``bbox2``.

    Both boxes are indexed as ``[x0, y0, x1, y1]``. The value returned is
    the absolute overlap area, not a ratio: callers that need a ratio must
    divide by the area of the reference box themselves.

    :param bbox1: first box
    :param bbox2: second box
    :return: ``0.0`` when the boxes do not intersect, otherwise the
        width times the height of the intersection rectangle (which is
        zero when the boxes merely touch along an edge or corner)
    """
    # Corners of the intersection rectangle.
    x_left = max(bbox1[0], bbox2[0])
    y_top = max(bbox1[1], bbox2[1])
    x_right = min(bbox1[2], bbox2[2])
    y_bottom = min(bbox1[3], bbox2[3])

    # An inverted rectangle means the boxes are disjoint on that axis.
    if x_right < x_left or y_bottom < y_top:
        return 0.0

    return (x_right - x_left) * (y_bottom - y_top)
2 changes: 1 addition & 1 deletion magic_pdf/libs/version.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.8.0"
__version__ = "0.8.1"
141 changes: 96 additions & 45 deletions magic_pdf/model/magic_model.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
import json

from magic_pdf.libs.boxbase import (_is_in, _is_part_overlap, bbox_distance,
bbox_relative_pos, calculate_iou,
calculate_overlap_area_in_bbox1_area_ratio)
bbox_relative_pos, box_area, calculate_iou,
calculate_overlap_area_in_bbox1_area_ratio,
get_overlap_area)
from magic_pdf.libs.commons import fitz, join_path
from magic_pdf.libs.coordinate_transform import get_scale_ratio
from magic_pdf.libs.local_math import float_gt
Expand All @@ -12,6 +13,7 @@
from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter

CAPATION_OVERLAP_AREA_RATIO = 0.6
MERGE_BOX_OVERLAP_AREA_RATIO = 1.1


class MagicModel:
Expand Down Expand Up @@ -124,49 +126,51 @@ def __fix_footnote(self):
tables.append(obj)
if len(footnotes) * len(figures) == 0:
continue
dis_figure_footnote = {}
dis_table_footnote = {}

for i in range(len(footnotes)):
for j in range(len(figures)):
pos_flag_count = sum(
list(
map(
lambda x: 1 if x else 0,
bbox_relative_pos(
footnotes[i]['bbox'], figures[j]['bbox']
),
)
dis_figure_footnote = {}
dis_table_footnote = {}

for i in range(len(footnotes)):
for j in range(len(figures)):
pos_flag_count = sum(
list(
map(
lambda x: 1 if x else 0,
bbox_relative_pos(
footnotes[i]['bbox'], figures[j]['bbox']
),
)
)
if pos_flag_count > 1:
continue
dis_figure_footnote[i] = min(
bbox_distance(figures[j]['bbox'], footnotes[i]['bbox']),
dis_figure_footnote.get(i, float('inf')),
)
for i in range(len(footnotes)):
for j in range(len(tables)):
pos_flag_count = sum(
list(
map(
lambda x: 1 if x else 0,
bbox_relative_pos(
footnotes[i]['bbox'], tables[j]['bbox']
),
)
)
if pos_flag_count > 1:
continue
dis_figure_footnote[i] = min(
bbox_distance(figures[j]['bbox'], footnotes[i]['bbox']),
dis_figure_footnote.get(i, float('inf')),
)
for i in range(len(footnotes)):
for j in range(len(tables)):
pos_flag_count = sum(
list(
map(
lambda x: 1 if x else 0,
bbox_relative_pos(
footnotes[i]['bbox'], tables[j]['bbox']
),
)
)
if pos_flag_count > 1:
continue
)
if pos_flag_count > 1:
continue

dis_table_footnote[i] = min(
bbox_distance(tables[j]['bbox'], footnotes[i]['bbox']),
dis_table_footnote.get(i, float('inf')),
)
for i in range(len(footnotes)):
if dis_table_footnote.get(i, float('inf')) > dis_figure_footnote[i]:
footnotes[i]['category_id'] = CategoryId.ImageFootnote
dis_table_footnote[i] = min(
bbox_distance(tables[j]['bbox'], footnotes[i]['bbox']),
dis_table_footnote.get(i, float('inf')),
)
for i in range(len(footnotes)):
if i not in dis_figure_footnote:
continue
if dis_table_footnote.get(i, float('inf')) > dis_figure_footnote[i]:
footnotes[i]['category_id'] = CategoryId.ImageFootnote

def __reduct_overlap(self, bboxes):
N = len(bboxes)
Expand All @@ -191,6 +195,44 @@ def __tie_up_category_by_distance(
筛选出所有和 merged bbox 有 overlap 且 overlap 面积大于 object 的面积的 subjects。
再求出筛选出的 subjects 和 object 的最短距离
"""
def search_overlap_between_boxes(
    subject_idx, object_idx
):
    """Measure how much foreign layout content sits between a pair.

    The subject and object boxes (looked up in ``all_bboxes``) are merged
    into their common bounding rectangle. Every layout detection on the
    current page whose category is neither the subject nor the object
    category is intersected with that rectangle, and the overlap area is
    expressed relative to the area of the object box.

    ``all_bboxes``, ``page_no``, ``subject_category_id`` and
    ``object_category_id`` are free variables taken from the enclosing
    scope.

    :param subject_idx: index of the subject box in ``all_bboxes``
    :param object_idx: index of the object box in ``all_bboxes``
    :return: the largest ratio seen; scanning stops early once the ratio
        reaches ``MERGE_BOX_OVERLAP_AREA_RATIO``. Returns ``0`` when there
        are no other detections or the object box has no positive area.
    """
    subject_bbox = all_bboxes[subject_idx]['bbox']
    object_bbox = all_bboxes[object_idx]['bbox']

    # Smallest rectangle that contains both the subject and the object.
    merged_bbox = [
        min(subject_bbox[0], object_bbox[0]),
        min(subject_bbox[1], object_bbox[1]),
        max(subject_bbox[2], object_bbox[2]),
        max(subject_bbox[3], object_bbox[3]),
    ]

    # The denominator does not change inside the loop, so compute it once.
    # A degenerate object box would otherwise raise ZeroDivisionError; a
    # negative area can never lift the ratio above its initial 0, so both
    # cases report "no blocking overlap".
    object_area = box_area(object_bbox)
    if object_area <= 0:
        return 0

    ratio = 0
    paired_categories = (object_category_id, subject_category_id)
    for layout_det in self.__model_list[page_no]['layout_dets']:
        # Only detections of other categories count as foreign content.
        if layout_det['category_id'] in paired_categories:
            continue
        ratio = max(
            ratio,
            get_overlap_area(merged_bbox, layout_det['bbox'])
            * 1.0 / object_area,
        )
        if ratio >= MERGE_BOX_OVERLAP_AREA_RATIO:
            break

    return ratio

def may_find_other_nearest_bbox(subject_idx, object_idx):
ret = float('inf')
Expand Down Expand Up @@ -299,6 +341,15 @@ def expand_bbbox(idxes):
):
continue

subject_idx, object_idx = i, j
if all_bboxes[j]['category_id'] == subject_category_id:
subject_idx, object_idx = j, i

if search_overlap_between_boxes(subject_idx, object_idx) >= MERGE_BOX_OVERLAP_AREA_RATIO:
dis[i][j] = float('inf')
dis[j][i] = dis[i][j]
continue

dis[i][j] = bbox_distance(all_bboxes[i]['bbox'], all_bboxes[j]['bbox'])
dis[j][i] = dis[i][j]

Expand Down Expand Up @@ -627,13 +678,13 @@ def remove_duplicate_spans(spans):
span['type'] = ContentType.Image
elif category_id == 5:
# 获取table模型结果
latex = layout_det.get("latex", None)
html = layout_det.get("html", None)
latex = layout_det.get('latex', None)
html = layout_det.get('html', None)
if latex:
span["latex"] = latex
span['latex'] = latex
elif html:
span["html"] = html
span["type"] = ContentType.Table
span['html'] = html
span['type'] = ContentType.Table
elif category_id == 13:
span['content'] = layout_det['latex']
span['type'] = ContentType.InlineEquation
Expand Down
2 changes: 1 addition & 1 deletion magic_pdf/tools/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ def do_parse(
end_page_id=None,
):
if debug_able:
logger.warning("debug mode is on")
logger.warning('debug mode is on')
f_dump_content_list = True
f_draw_model_bbox = True

Expand Down
Loading