Skip to content

Commit

Permalink
Merge pull request #18 from opendatalab/dev
Browse files Browse the repository at this point in the history
fix: Fix missing short text in complex code blocks
  • Loading branch information
sixgad authored Nov 14, 2024
2 parents 1c44bd5 + 068cd0c commit 8a0f5b5
Show file tree
Hide file tree
Showing 4 changed files with 19 additions and 1 deletion.
3 changes: 3 additions & 0 deletions magic_html/extractors/base_extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -760,6 +760,9 @@ def delete_by_link_density(
elemtext = trim(elem.text_content())
result, templist = link_density_test(elem, elemtext, favor_precision)
if result is True and img_div_check(elem):
# 保留table中的链接
if tagname in ['ul', 'li', 'div', 'p'] and ancestor_node_check(elem, ['td']):
continue
deletions.append(elem)
elif backtracking is True and len(templist) > 0: # if?
myelems[elemtext].append(elem)
Expand Down
8 changes: 8 additions & 0 deletions magic_html/readability_plus.py
Original file line number Diff line number Diff line change
Expand Up @@ -448,6 +448,14 @@ def sanitize(self, node, candidates):
reason = "less than 3x <p>s than <input>s"
to_remove = True
elif content_length < MIN_LEN and counts["img"] == 0:
# 代码块内容过短,导致删除
if el.tag in ['code', 'pre']:
continue
if ancestor_node_check(el, ['code', 'pre']):
continue
# 保留table中的链接
if el.tag in ['ul', 'li', 'div', 'p'] and ancestor_node_check(el, ['td']):
continue
reason = (
"too short content length %s without a single image"
% content_length
Expand Down
7 changes: 7 additions & 0 deletions magic_html/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -173,6 +173,13 @@ def fromstring_bytes(htmlobject):
return tree


def ancestor_node_check(node: HtmlElement, tags: list):
    """Report whether ``node`` has an ancestor whose tag is one of ``tags``.

    Args:
        node: Element whose ancestor chain is queried via ``node.xpath``.
        tags: Tag names to look for; they are tried in the given order.

    Returns:
        True as soon as one tag yields a non-empty XPath result, otherwise
        False (including when ``tags`` is empty).
    """
    # One ``ancestor::<tag>[1]`` query per tag; any() is lazy, so no further
    # queries run once a match has been found.
    return any(node.xpath(f'ancestor::{name}[1]') for name in tags)


def load_html(htmlobject):
if isinstance(htmlobject, HtmlElement):
return htmlobject
Expand Down
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
Brotli
-cchardet
+cchardet==2.2.0a2
charset_normalizer
lxml<5.2.0
numpy
Expand Down

0 comments on commit 8a0f5b5

Please sign in to comment.