From 895cffea36f181562437153b7084bacd6e020220 Mon Sep 17 00:00:00 2001 From: Adrien Barbaresi Date: Fri, 29 Nov 2024 16:40:58 +0100 Subject: [PATCH 1/4] tests: extend coverage --- tests/cli_tests.py | 15 ++++++++++++++- tests/unit_tests.py | 13 +++++++++++-- tests/xml_tei_tests.py | 29 ++++++++++++++++++++++++++++- trafilatura/external.py | 2 +- trafilatura/readability_lxml.py | 28 ++++++++++------------------ trafilatura/utils.py | 2 +- 6 files changed, 65 insertions(+), 24 deletions(-) diff --git a/tests/cli_tests.py b/tests/cli_tests.py index aeba580a..8bcad9e7 100644 --- a/tests/cli_tests.py +++ b/tests/cli_tests.py @@ -225,6 +225,14 @@ def test_sysoutput(): options = settings.args_to_extractor(args) assert options.format == "markdown" and options.formatting is True assert cli_utils.process_result("DADIDA", args, -1, options) == -1 + + # with counter + with open( + path.join(RESOURCES_DIR, "httpbin_sample.html"), "r", encoding="utf-8" + ) as f: + teststring = f.read() + assert cli_utils.process_result(teststring, args, 1, options) == 2 + # test keeping dir structure testargs = ["", "-i", "myinputdir/", "-o", "test/", "--keep-dirs"] with patch.object(sys, "argv", testargs): @@ -377,6 +385,9 @@ def test_cli_pipeline(): def test_file_processing(): "Test file processing pipeline on actual directories." + backup = settings.MAX_FILES_PER_DIRECTORY + settings.MAX_FILES_PER_DIRECTORY = 0 + # dry-run file processing pipeline testargs = ["", "--parallel", "1", "--input-dir", "/dev/null"] with patch.object(sys, "argv", testargs): @@ -393,6 +404,8 @@ def test_file_processing(): for f in cli_utils.generate_filelist(args.input_dir): cli_utils.file_processing(f, args, options=options) + settings.MAX_FILES_PER_DIRECTORY = backup + def test_cli_config_file(): "Test if the configuration file is loaded correctly from the CLI." @@ -497,7 +510,7 @@ def test_crawling(): testargs = ["", "--crawl", ""] with patch.object(sys, "argv", testargs): args = cli.parse_args(testargs) - cli_utils.cli_crawler(args) + cli.process_args(args) testargs = ["", "--crawl", " "] with patch.object(sys, "argv", testargs): diff --git a/tests/unit_tests.py b/tests/unit_tests.py index e7c12dc0..48565c30 100644 --- a/tests/unit_tests.py +++ b/tests/unit_tests.py @@ -23,7 +23,7 @@ import trafilatura.htmlprocessing from trafilatura import bare_extraction, extract, xml from trafilatura.core import Extractor -from trafilatura.external import sanitize_tree, try_justext +from trafilatura.external import sanitize_tree, try_justext, try_readability from trafilatura.main_extractor import (handle_formatting, handle_image, handle_lists, handle_paragraphs, handle_quotes, handle_table, handle_textelem) @@ -815,7 +815,8 @@ def test_htmlprocessing(): def test_extraction_options(): '''Test the different parameters available in extract() and bare_extraction()''' - my_html = '

Text.

' + my_html = '

Text.

' + with pytest.raises(ValueError) as err: extract(my_html, output_format="python") assert extract(my_html, config=NEW_CONFIG) is None @@ -824,9 +825,17 @@ def test_extraction_options(): assert extract(my_html, only_with_metadata=True, output_format='xml', config=ZERO_CONFIG) is None assert extract(my_html, target_language='de', config=ZERO_CONFIG) is None assert extract(my_html, target_language='de', fast=True, config=ZERO_CONFIG) is None + + # justext hardening assert etree.tostring(try_justext(html.fromstring(my_html), None, 'de')) == b'' + assert etree.tostring(try_justext(None, None, 'de')) == b'' # assert extract(my_html) is None + # readability + my_html = '

' + 'Text. '*10 + 'Test

' + result = etree.tostring(try_readability(html.fromstring(my_html))) + assert len(result) > 10 and 'Test' not in result + my_html = '' + '

ABC def ghi jkl.

'*1000 + '

Posted on 1st Dec 2019<.

' assert bare_extraction(my_html, config=ZERO_CONFIG, with_metadata=True).date is not None assert bare_extraction(my_html, config=NEW_CONFIG, with_metadata=True).date is None diff --git a/tests/xml_tei_tests.py b/tests/xml_tei_tests.py index 5a36b4bd..160c151b 100644 --- a/tests/xml_tei_tests.py +++ b/tests/xml_tei_tests.py @@ -6,7 +6,7 @@ from lxml.etree import Element, SubElement, XMLParser, fromstring, tostring from trafilatura.metadata import Document -from trafilatura.xml import (check_tei, write_fullheader, +from trafilatura.xml import (check_tei, replace_element_text, write_fullheader, _handle_unwanted_tails, _move_element_one_level_up, _wrap_unwanted_siblings_of_div) @@ -472,9 +472,36 @@ def test_handling_of_text_content_in_div(): assert cleaned.find(".//p").text == "tail" +def test_replace_element_text(): + elem = Element("head") + elem.text = "Title" + elem.set("rend", "h1") + assert replace_element_text(elem, True) == "# Title" + + elem = Element("hi") + elem.text = "Text" + elem.set("rend", "#b") + assert replace_element_text(elem, True) == "**Text**" + + elem = Element("item") + elem.text = "Test text" + elem.tag = "item" + assert replace_element_text(elem, True) == "- Test text\n" + + elem = Element("ref") + elem.text = "Link" + elem.set("target", "https://example.com") + assert replace_element_text(elem, True) == "[Link](https://example.com)" + + elem = Element("ref") + elem.text = "Link" + assert replace_element_text(elem, True) == "[Link]" + + if __name__ == "__main__": test_publisher_added_before_availability_in_publicationStmt() test_unwanted_siblings_of_div_removed() test_tail_on_p_like_elements_removed() test_head_with_children_converted_to_ab() test_ab_with_p_parent_resolved() + test_replace_element_text() diff --git a/trafilatura/external.py b/trafilatura/external.py index 49801869..4dd1f090 100644 --- a/trafilatura/external.py +++ b/trafilatura/external.py @@ -138,7 +138,7 @@ def try_justext(tree: HtmlElement, url: str, target_language: str) -> _Element: # extract try: paragraphs = custom_justext(tree, justext_stoplist) - except ValueError as err: # not an XML element: HtmlComment + except Exception as err: LOGGER.error('justext %s %s', err, url) else: for paragraph in paragraphs: diff --git a/trafilatura/readability_lxml.py b/trafilatura/readability_lxml.py index 96742bd0..48f651c1 100644 --- a/trafilatura/readability_lxml.py +++ b/trafilatura/readability_lxml.py @@ -355,7 +355,7 @@ def sanitize(self, node: HtmlElement, candidates: Dict[HtmlElement, Candidate]) ) elem.drop_tree() elif elem.text_content().count(",") < 10: - to_remove = False + to_remove = True counts = { kind: len(elem.findall(f".//{kind}")) for kind in TEXT_CLEAN_ELEMS } @@ -376,41 +376,32 @@ def sanitize(self, node: HtmlElement, candidates: Dict[HtmlElement, Candidate]) # continue if counts["p"] and counts["img"] > 1 + counts["p"] * 1.3: reason = f'too many images ({counts["img"]})' - to_remove = True elif counts["li"] > counts["p"] and elem.tag not in LIST_TAGS: reason = "more
  • s than

    s" - to_remove = True elif counts["input"] > (counts["p"] / 3): reason = "less than 3x

    s than s" - to_remove = True elif content_length < self.min_text_length and counts["img"] == 0: reason = f"too short content length {content_length} without a single image" - to_remove = True elif content_length < self.min_text_length and counts["img"] > 2: reason = ( f"too short content length {content_length} and too many images" ) - to_remove = True elif weight < 25 and link_density > 0.2: reason = ( f"too many links {link_density:.3f} for its weight {weight}" ) - to_remove = True elif weight >= 25 and link_density > 0.5: reason = ( f"too many links {link_density:.3f} for its weight {weight}" ) - to_remove = True elif (counts["embed"] == 1 and content_length < 75) or counts[ "embed" ] > 1: reason = ( "s with too short content length, or too many s" ) - to_remove = True elif not content_length: reason = "no content" - to_remove = True # find x non empty preceding and succeeding siblings siblings = [] @@ -430,17 +421,18 @@ def sanitize(self, node: HtmlElement, candidates: Dict[HtmlElement, Candidate]) if siblings and sum(siblings) > 1000: to_remove = False allowed.update(elem.iter("table", "ul", "div", "section")) + else: + to_remove = False if to_remove: elem.drop_tree() - if LOGGER.isEnabledFor(logging.DEBUG): - LOGGER.debug( - "Removed %6.3f %s with weight %s cause it has %s.", - score, - elem.tag, - weight, - reason or "", - ) + LOGGER.debug( + "Removed %6.3f %s with weight %s cause it has %s.", + score, + elem.tag, + weight, + reason or "", + ) self.doc = node return _tostring(self.doc) diff --git a/trafilatura/utils.py b/trafilatura/utils.py index aae37d7f..7db53889 100644 --- a/trafilatura/utils.py +++ b/trafilatura/utils.py @@ -405,7 +405,7 @@ def language_classifier(temp_text: str, temp_comments: str) -> Optional[str]: if len(temp_text) > len(temp_comments) else py3langid.classify(temp_comments) ) - else: + else: # pragma: no cover LOGGER.warning('Language detector not installed, skipping detection') result = None return result # type: ignore[no-any-return] From 8b06b2d3bfb35b6074ef00ffc51f587ce83d55ed Mon Sep 17 00:00:00 2001 From: Adrien Barbaresi Date: Fri, 29 Nov 2024 16:49:33 +0100 Subject: [PATCH 2/4] fix tests --- tests/cli_tests.py | 2 +- tests/unit_tests.py | 5 ++++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/tests/cli_tests.py b/tests/cli_tests.py index 8bcad9e7..b7ffada5 100644 --- a/tests/cli_tests.py +++ b/tests/cli_tests.py @@ -510,7 +510,7 @@ def test_crawling(): testargs = ["", "--crawl", ""] with patch.object(sys, "argv", testargs): args = cli.parse_args(testargs) - cli.process_args(args) + cli.process_args(args) testargs = ["", "--crawl", " "] with patch.object(sys, "argv", testargs): diff --git a/tests/unit_tests.py b/tests/unit_tests.py index 48565c30..9990a107 100644 --- a/tests/unit_tests.py +++ b/tests/unit_tests.py @@ -832,9 +832,12 @@ def test_extraction_options(): # assert extract(my_html) is None # readability + my_html = '

    ' + 'Text. '*10 + '

    ' + result = etree.tostring(try_readability(html.fromstring(my_html))) + assert len(result) > 10 and b'Text' in result my_html = '

    ' + 'Text. '*10 + 'Test

    ' result = etree.tostring(try_readability(html.fromstring(my_html))) - assert len(result) > 10 and 'Test' not in result + assert b'Test' not in result my_html = '' + '

    ABC def ghi jkl.

    '*1000 + '

    Posted on 1st Dec 2019<.

    ' assert bare_extraction(my_html, config=ZERO_CONFIG, with_metadata=True).date is not None From a01d68a857300da6597742047d974f3822f2dee8 Mon Sep 17 00:00:00 2001 From: Adrien Barbaresi Date: Fri, 29 Nov 2024 17:01:17 +0100 Subject: [PATCH 3/4] fix cli test --- tests/cli_tests.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/cli_tests.py b/tests/cli_tests.py index b7ffada5..8d094445 100644 --- a/tests/cli_tests.py +++ b/tests/cli_tests.py @@ -510,7 +510,7 @@ def test_crawling(): testargs = ["", "--crawl", ""] with patch.object(sys, "argv", testargs): args = cli.parse_args(testargs) - cli.process_args(args) + cli_utils.cli_crawler(args) testargs = ["", "--crawl", " "] with patch.object(sys, "argv", testargs): @@ -522,7 +522,7 @@ def test_crawling(): args = cli.parse_args(testargs) f = io.StringIO() with redirect_stdout(f): - cli_utils.cli_crawler(args) + cli.process_args(args) assert f.getvalue() == "https://httpbun.com/html\n" spider.URL_STORE = UrlStore(compressed=False, strict=False) From 0407def8a7da58f649e8ab3e1b0e81d10d690668 Mon Sep 17 00:00:00 2001 From: Adrien Barbaresi Date: Fri, 29 Nov 2024 17:29:41 +0100 Subject: [PATCH 4/4] update actions --- .github/workflows/tests.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index e990ae57..ffef9b99 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -98,7 +98,7 @@ jobs: run: python -m pip install -e ".[all]" - name: Type checking - if: ${{ matrix.env.MINIMAL == 'false' && matrix.python-version == '3.12' }} + if: ${{ matrix.env.MINIMAL == 'false' && matrix.python-version == '3.13' }} run: | mypy -p trafilatura @@ -110,7 +110,7 @@ jobs: # coverage - name: Upload coverage to Codecov - if: ${{ matrix.env.MINIMAL == 'false' && matrix.python-version == '3.12' }} + if: ${{ matrix.env.MINIMAL == 'false' && matrix.python-version == '3.13' }} uses: codecov/codecov-action@v4 env: CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}