From 895cffea36f181562437153b7084bacd6e020220 Mon Sep 17 00:00:00 2001
From: Adrien Barbaresi
Date: Fri, 29 Nov 2024 16:40:58 +0100
Subject: [PATCH 1/4] tests: extend coverage
---
tests/cli_tests.py | 15 ++++++++++++++-
tests/unit_tests.py | 13 +++++++++++--
tests/xml_tei_tests.py | 29 ++++++++++++++++++++++++++++-
trafilatura/external.py | 2 +-
trafilatura/readability_lxml.py | 28 ++++++++++------------------
trafilatura/utils.py | 2 +-
6 files changed, 65 insertions(+), 24 deletions(-)
diff --git a/tests/cli_tests.py b/tests/cli_tests.py
index aeba580a..8bcad9e7 100644
--- a/tests/cli_tests.py
+++ b/tests/cli_tests.py
@@ -225,6 +225,14 @@ def test_sysoutput():
options = settings.args_to_extractor(args)
assert options.format == "markdown" and options.formatting is True
assert cli_utils.process_result("DADIDA", args, -1, options) == -1
+
+ # with counter
+ with open(
+ path.join(RESOURCES_DIR, "httpbin_sample.html"), "r", encoding="utf-8"
+ ) as f:
+ teststring = f.read()
+ assert cli_utils.process_result(teststring, args, 1, options) == 2
+
# test keeping dir structure
testargs = ["", "-i", "myinputdir/", "-o", "test/", "--keep-dirs"]
with patch.object(sys, "argv", testargs):
@@ -377,6 +385,9 @@ def test_cli_pipeline():
def test_file_processing():
"Test file processing pipeline on actual directories."
+ backup = settings.MAX_FILES_PER_DIRECTORY
+ settings.MAX_FILES_PER_DIRECTORY = 0
+
# dry-run file processing pipeline
testargs = ["", "--parallel", "1", "--input-dir", "/dev/null"]
with patch.object(sys, "argv", testargs):
@@ -393,6 +404,8 @@ def test_file_processing():
for f in cli_utils.generate_filelist(args.input_dir):
cli_utils.file_processing(f, args, options=options)
+ settings.MAX_FILES_PER_DIRECTORY = backup
+
def test_cli_config_file():
"Test if the configuration file is loaded correctly from the CLI."
@@ -497,7 +510,7 @@ def test_crawling():
testargs = ["", "--crawl", ""]
with patch.object(sys, "argv", testargs):
args = cli.parse_args(testargs)
- cli_utils.cli_crawler(args)
+ cli.process_args(args)
testargs = ["", "--crawl", " "]
with patch.object(sys, "argv", testargs):
diff --git a/tests/unit_tests.py b/tests/unit_tests.py
index e7c12dc0..48565c30 100644
--- a/tests/unit_tests.py
+++ b/tests/unit_tests.py
@@ -23,7 +23,7 @@
import trafilatura.htmlprocessing
from trafilatura import bare_extraction, extract, xml
from trafilatura.core import Extractor
-from trafilatura.external import sanitize_tree, try_justext
+from trafilatura.external import sanitize_tree, try_justext, try_readability
from trafilatura.main_extractor import (handle_formatting, handle_image,
handle_lists, handle_paragraphs, handle_quotes,
handle_table, handle_textelem)
@@ -815,7 +815,8 @@ def test_htmlprocessing():
def test_extraction_options():
'''Test the different parameters available in extract() and bare_extraction()'''
- my_html = ''
+ my_html = ''
+
with pytest.raises(ValueError) as err:
extract(my_html, output_format="python")
assert extract(my_html, config=NEW_CONFIG) is None
@@ -824,9 +825,17 @@ def test_extraction_options():
assert extract(my_html, only_with_metadata=True, output_format='xml', config=ZERO_CONFIG) is None
assert extract(my_html, target_language='de', config=ZERO_CONFIG) is None
assert extract(my_html, target_language='de', fast=True, config=ZERO_CONFIG) is None
+
+ # justext hardening
assert etree.tostring(try_justext(html.fromstring(my_html), None, 'de')) == b''
+ assert etree.tostring(try_justext(None, None, 'de')) == b'
' + 'Text. '*10 + '
'
+ result = etree.tostring(try_readability(html.fromstring(my_html)))
+ assert len(result) > 10 and 'Test' not in result
+
my_html = '' + 'ABC def ghi jkl.
'*1000 + 'Posted on 1st Dec 2019<.
'
assert bare_extraction(my_html, config=ZERO_CONFIG, with_metadata=True).date is not None
assert bare_extraction(my_html, config=NEW_CONFIG, with_metadata=True).date is None
diff --git a/tests/xml_tei_tests.py b/tests/xml_tei_tests.py
index 5a36b4bd..160c151b 100644
--- a/tests/xml_tei_tests.py
+++ b/tests/xml_tei_tests.py
@@ -6,7 +6,7 @@
from lxml.etree import Element, SubElement, XMLParser, fromstring, tostring
from trafilatura.metadata import Document
-from trafilatura.xml import (check_tei, write_fullheader,
+from trafilatura.xml import (check_tei, replace_element_text, write_fullheader,
_handle_unwanted_tails, _move_element_one_level_up,
_wrap_unwanted_siblings_of_div)
@@ -472,9 +472,36 @@ def test_handling_of_text_content_in_div():
assert cleaned.find(".//p").text == "tail"
+def test_replace_element_text():
+ elem = Element("head")
+ elem.text = "Title"
+ elem.set("rend", "h1")
+ assert replace_element_text(elem, True) == "# Title"
+
+ elem = Element("hi")
+ elem.text = "Text"
+ elem.set("rend", "#b")
+ assert replace_element_text(elem, True) == "**Text**"
+
+ elem = Element("item")
+ elem.text = "Test text"
+ elem.tag = "item"
+ assert replace_element_text(elem, True) == "- Test text\n"
+
+ elem = Element("ref")
+ elem.text = "Link"
+ elem.set("target", "https://example.com")
+ assert replace_element_text(elem, True) == "[Link](https://example.com)"
+
+ elem = Element("ref")
+ elem.text = "Link"
+ assert replace_element_text(elem, True) == "[Link]"
+
+
if __name__ == "__main__":
test_publisher_added_before_availability_in_publicationStmt()
test_unwanted_siblings_of_div_removed()
test_tail_on_p_like_elements_removed()
test_head_with_children_converted_to_ab()
test_ab_with_p_parent_resolved()
+ test_replace_element_text()
diff --git a/trafilatura/external.py b/trafilatura/external.py
index 49801869..4dd1f090 100644
--- a/trafilatura/external.py
+++ b/trafilatura/external.py
@@ -138,7 +138,7 @@ def try_justext(tree: HtmlElement, url: str, target_language: str) -> _Element:
# extract
try:
paragraphs = custom_justext(tree, justext_stoplist)
- except ValueError as err: # not an XML element: HtmlComment
+ except Exception as err:
LOGGER.error('justext %s %s', err, url)
else:
for paragraph in paragraphs:
diff --git a/trafilatura/readability_lxml.py b/trafilatura/readability_lxml.py
index 96742bd0..48f651c1 100644
--- a/trafilatura/readability_lxml.py
+++ b/trafilatura/readability_lxml.py
@@ -355,7 +355,7 @@ def sanitize(self, node: HtmlElement, candidates: Dict[HtmlElement, Candidate])
)
elem.drop_tree()
elif elem.text_content().count(",") < 10:
- to_remove = False
+ to_remove = True
counts = {
kind: len(elem.findall(f".//{kind}")) for kind in TEXT_CLEAN_ELEMS
}
@@ -376,41 +376,32 @@ def sanitize(self, node: HtmlElement, candidates: Dict[HtmlElement, Candidate])
# continue
if counts["p"] and counts["img"] > 1 + counts["p"] * 1.3:
reason = f'too many images ({counts["img"]})'
- to_remove = True
elif counts["li"] > counts["p"] and elem.tag not in LIST_TAGS:
reason = "more <li>s than <p>s"
- to_remove = True
elif counts["input"] > (counts["p"] / 3):
reason = "less than 3x <p>s than <input>s"
- to_remove = True
elif content_length < self.min_text_length and counts["img"] == 0:
reason = f"too short content length {content_length} without a single image"
- to_remove = True
elif content_length < self.min_text_length and counts["img"] > 2:
reason = (
f"too short content length {content_length} and too many images"
)
- to_remove = True
elif weight < 25 and link_density > 0.2:
reason = (
f"too many links {link_density:.3f} for its weight {weight}"
)
- to_remove = True
elif weight >= 25 and link_density > 0.5:
reason = (
f"too many links {link_density:.3f} for its weight {weight}"
)
- to_remove = True
elif (counts["embed"] == 1 and content_length < 75) or counts[
"embed"
] > 1:
reason = (
"