Skip to content

Commit

Permalink
refactor(magic_pdf): improve line sorting and block indexing
Browse files Browse the repository at this point in the history
- Insert lines into blocks based on median line height
- Calculate block index using line indices median
- Remove virtual line information for table and image blocks
- Enhance line sorting algorithm for different block types
- Add line height calculation function
  • Loading branch information
myhloli committed Sep 29, 2024
1 parent 4c9bf8a commit 564c4ce
Show file tree
Hide file tree
Showing 2 changed files with 100 additions and 18 deletions.
4 changes: 4 additions & 0 deletions magic_pdf/libs/draw_bbox.py
Original file line number Diff line number Diff line change
Expand Up @@ -341,6 +341,10 @@ def draw_line_sort_bbox(pdf_info, pdf_bytes, out_path, filename):
bbox = block['bbox']
index = block['index']
page_line_list.append({'index': index, 'bbox': bbox})
# for line in block['lines']:
# bbox = line['bbox']
# index = line['index']
# page_line_list.append({'index': index, 'bbox': bbox})
sorted_bboxes = sorted(page_line_list, key=lambda x: x['index'])
layout_bbox_list.append(sorted_bbox['bbox'] for sorted_bbox in sorted_bboxes)
pdf_docs = fitz.open('pdf', pdf_bytes)
Expand Down
114 changes: 96 additions & 18 deletions magic_pdf/pdf_parse_union_core_v2.py
Original file line number Diff line number Diff line change
Expand Up @@ -150,37 +150,99 @@ def do_predict(boxes: List[List[int]], model) -> List[int]:

def cal_block_index(fix_blocks, sorted_bboxes):
    """Assign a reading-order index to every block, and to each of its lines.

    Args:
        fix_blocks: list of block dicts. Each block is read for ``'type'``,
            ``'bbox'`` and ``'lines'`` (a list of dicts carrying ``'bbox'``).
        sorted_bboxes: list of bboxes in reading order. The position of a
            bbox in this list is used as its index.

    Returns:
        The same ``fix_blocks`` list, mutated in place: every block gains an
        ``'index'`` key, every line gains an ``'index'`` key, and blocks of
        type ``'table'`` / ``'image'`` have their ``'lines'`` key removed.

    Raises:
        ValueError: if a line bbox (or the bbox of a block without lines) is
            not present in ``sorted_bboxes``.
    """
    for block in fix_blocks:
        if len(block['lines']) == 0:
            # No lines to aggregate: the block is located by its own bbox.
            block['index'] = sorted_bboxes.index(block['bbox'])
        else:
            line_index_list = []
            for line in block['lines']:
                line['index'] = sorted_bboxes.index(line['bbox'])
                line_index_list.append(line['index'])
            # The block takes the median of its lines' positions.
            block['index'] = statistics.median(line_index_list)

        # Drop the virtual line information from table and image blocks.
        if block['type'] in ['table', 'image']:
            del block['lines']

    return fix_blocks


def sort_lines_by_model(fix_blocks, page_w, page_h):
def insert_lines_into_block(block_bbox, line_height, page_w, page_h):
    """Split a block's bbox into vertically stacked virtual line bboxes.

    Args:
        block_bbox: ``(x0, y0, x1, y1)`` of the block.
        line_height: reference height of one body-text line on the page.
        page_w: page width, used for the width-based heuristics.
        page_h: page height, used for the height-based heuristic.

    Returns:
        A list of ``[x0, y, x1, y + h]`` boxes stacked from ``y0`` towards
        ``y1``. When the block is not split, the list holds the single box
        ``[x0, y0, x1, y1]``.

    Degenerate inputs (a non-positive ``line_height`` where it is needed as a
    divisor, or a zero-width block in the aspect-ratio check) return the
    unsplit block instead of raising ``ZeroDivisionError`` or producing an
    empty list.
    """
    x0, y0, x1, y1 = block_bbox
    whole_block = [[x0, y0, x1, y1]]

    block_height = y1 - y0
    block_width = x1 - x0

    # Blocks no taller than three body lines are kept as a single line.
    if block_height <= line_height * 3:
        return whole_block

    if block_height > page_h * 0.25 and page_w * 0.5 > block_width > page_w * 0.25:
        # Possibly a two-column layout: slice finely, one box per body line.
        if line_height <= 0:
            # No usable reference height to slice by; keep the block whole.
            return whole_block
        lines = int(block_height / line_height)
    elif block_width > page_w * 0.4:
        # Wider than 0.4 of the page width: split into three lines.
        lines = 3
        line_height = block_height / lines
    elif block_width > page_w * 0.25:
        # Otherwise, wider than a quarter of the page: split into two lines.
        lines = 2
        line_height = block_height / lines
    elif block_width == 0 or block_height / block_width > 1.2:
        # Tall, slender blocks (including zero-width ones) are not split.
        return whole_block
    else:
        # Not slender: still split into two lines.
        lines = 2
        line_height = block_height / lines

    # Emit the boxes starting at y0, advancing one line height each time.
    lines_positions = []
    current_y = y0
    for _ in range(lines):
        lines_positions.append([x0, current_y, x1, current_y + line_height])
        current_y += line_height
    return lines_positions


def sort_lines_by_model(fix_blocks, page_w, page_h, line_height):
page_line_list = []
for block in fix_blocks:
if block['type'] in ['text', 'title', 'interline_equation']:
if len(block['lines']) == 0: # 没有line的block(一般是图片形式的文本块),就直接用block的bbox来排序
if len(block['lines']) == 0:
bbox = block['bbox']
page_line_list.append(bbox)
lines = insert_lines_into_block(bbox, line_height, page_w, page_h)
for line in lines:
block['lines'].append({'bbox': line, 'spans': []})
page_line_list.extend(lines)
else:
for line in block['lines']:
bbox = line['bbox']
page_line_list.append(bbox)
elif block['type'] in ['table', 'image']: # 简单的把表和图都当成一个line处理
elif block['type'] in ['table', 'image']:
bbox = block['bbox']
page_line_list.append(bbox)
lines = insert_lines_into_block(bbox, line_height, page_w, page_h)
block['lines'] = []
for line in lines:
block['lines'].append({'bbox': line, 'spans': []})
page_line_list.extend(lines)

# 使用layoutreader排序
x_scale = 1000.0 / page_w
Expand Down Expand Up @@ -222,6 +284,19 @@ def sort_lines_by_model(fix_blocks, page_w, page_h):
return sorted_bboxes


def get_line_height(blocks, default_line_height=10):
    """Return the median line height of the text-like blocks.

    Args:
        blocks: list of block dicts. Only blocks whose ``'type'`` is
            ``'text'``, ``'title'`` or ``'interline_equation'`` are read;
            each of their ``'lines'`` contributes ``bbox[3] - bbox[1]``,
            truncated to an int.
        default_line_height: value returned when no line is found.
            Defaults to 10, the previously hard-coded fallback.

    Returns:
        The median of the collected integer heights, or
        ``default_line_height`` when there are none.
    """
    text_like_types = ('text', 'title', 'interline_equation')
    page_line_height_list = [
        int(line['bbox'][3] - line['bbox'][1])
        for block in blocks
        if block['type'] in text_like_types
        for line in block['lines']
    ]
    if page_line_height_list:
        return statistics.median(page_line_height_list)
    return default_line_height


def parse_page_core(pdf_docs, magic_model, page_id, pdf_bytes_md5, imageWriter, parse_mode):
need_drop = False
drop_reason = []
Expand Down Expand Up @@ -286,8 +361,11 @@ def parse_page_core(pdf_docs, magic_model, page_id, pdf_bytes_md5, imageWriter,
'''对block进行fix操作'''
fix_blocks = fix_block_spans(block_with_spans, img_blocks, table_blocks)

'''获取所有line并计算正文line的高度'''
line_height = get_line_height(fix_blocks)

'''获取所有line并对line排序'''
sorted_bboxes = sort_lines_by_model(fix_blocks, page_w, page_h)
sorted_bboxes = sort_lines_by_model(fix_blocks, page_w, page_h, line_height)

'''根据line的中位数算block的序列关系'''
fix_blocks = cal_block_index(fix_blocks, sorted_bboxes)
Expand Down

0 comments on commit 564c4ce

Please sign in to comment.