Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

tests: extend coverage #753

Merged
merged 4 commits into from
Nov 29, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions .github/workflows/tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,7 @@ jobs:
run: python -m pip install -e ".[all]"

- name: Type checking
if: ${{ matrix.env.MINIMAL == 'false' && matrix.python-version == '3.12' }}
if: ${{ matrix.env.MINIMAL == 'false' && matrix.python-version == '3.13' }}
run: |
mypy -p trafilatura

Expand All @@ -110,7 +110,7 @@ jobs:

# coverage
- name: Upload coverage to Codecov
if: ${{ matrix.env.MINIMAL == 'false' && matrix.python-version == '3.12' }}
if: ${{ matrix.env.MINIMAL == 'false' && matrix.python-version == '3.13' }}
uses: codecov/codecov-action@v4
env:
CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
Expand Down
15 changes: 14 additions & 1 deletion tests/cli_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -225,6 +225,14 @@ def test_sysoutput():
options = settings.args_to_extractor(args)
assert options.format == "markdown" and options.formatting is True
assert cli_utils.process_result("DADIDA", args, -1, options) == -1

# with counter
with open(
path.join(RESOURCES_DIR, "httpbin_sample.html"), "r", encoding="utf-8"
) as f:
teststring = f.read()
assert cli_utils.process_result(teststring, args, 1, options) == 2

# test keeping dir structure
testargs = ["", "-i", "myinputdir/", "-o", "test/", "--keep-dirs"]
with patch.object(sys, "argv", testargs):
Expand Down Expand Up @@ -377,6 +385,9 @@ def test_cli_pipeline():

def test_file_processing():
"Test file processing pipeline on actual directories."
backup = settings.MAX_FILES_PER_DIRECTORY
settings.MAX_FILES_PER_DIRECTORY = 0

# dry-run file processing pipeline
testargs = ["", "--parallel", "1", "--input-dir", "/dev/null"]
with patch.object(sys, "argv", testargs):
Expand All @@ -393,6 +404,8 @@ def test_file_processing():
for f in cli_utils.generate_filelist(args.input_dir):
cli_utils.file_processing(f, args, options=options)

settings.MAX_FILES_PER_DIRECTORY = backup


def test_cli_config_file():
"Test if the configuration file is loaded correctly from the CLI."
Expand Down Expand Up @@ -509,7 +522,7 @@ def test_crawling():
args = cli.parse_args(testargs)
f = io.StringIO()
with redirect_stdout(f):
cli_utils.cli_crawler(args)
cli.process_args(args)
assert f.getvalue() == "https://httpbun.com/html\n"

spider.URL_STORE = UrlStore(compressed=False, strict=False)
Expand Down
16 changes: 14 additions & 2 deletions tests/unit_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@
import trafilatura.htmlprocessing
from trafilatura import bare_extraction, extract, xml
from trafilatura.core import Extractor
from trafilatura.external import sanitize_tree, try_justext
from trafilatura.external import sanitize_tree, try_justext, try_readability
from trafilatura.main_extractor import (handle_formatting, handle_image,
handle_lists, handle_paragraphs, handle_quotes,
handle_table, handle_textelem)
Expand Down Expand Up @@ -815,7 +815,8 @@ def test_htmlprocessing():

def test_extraction_options():
'''Test the different parameters available in extract() and bare_extraction()'''
my_html = '<html><head><meta http-equiv="content-language" content="EN"/></head><body><div="article-body"><p>Text.<!-- comment --></p></div></body></html>'
my_html = '<html><head><meta http-equiv="content-language" content="EN"/></head><body><div="article-body"><p>Text.<!-- comment --><?php echo "This is a PHP processing instruction"; ?></p></div></body></html>'

with pytest.raises(ValueError) as err:
extract(my_html, output_format="python")
assert extract(my_html, config=NEW_CONFIG) is None
Expand All @@ -824,9 +825,20 @@ def test_extraction_options():
assert extract(my_html, only_with_metadata=True, output_format='xml', config=ZERO_CONFIG) is None
assert extract(my_html, target_language='de', config=ZERO_CONFIG) is None
assert extract(my_html, target_language='de', fast=True, config=ZERO_CONFIG) is None

# justext hardening
assert etree.tostring(try_justext(html.fromstring(my_html), None, 'de')) == b'<body/>'
assert etree.tostring(try_justext(None, None, 'de')) == b'<body/>'
# assert extract(my_html) is None

# readability
my_html = '<html><body><p>' + 'Text. '*10 + '</p></body></html>'
result = etree.tostring(try_readability(html.fromstring(my_html)))
assert len(result) > 10 and b'Text' in result
my_html = '<html><body><p>' + 'Text. '*10 + '<embed>Test</embed></p></body></html>'
result = etree.tostring(try_readability(html.fromstring(my_html)))
assert b'Test' not in result

my_html = '<html><head/><body>' + '<p>ABC def ghi jkl.</p>'*1000 + '<p>Posted on 1st Dec 2019<.</p></body></html>'
assert bare_extraction(my_html, config=ZERO_CONFIG, with_metadata=True).date is not None
assert bare_extraction(my_html, config=NEW_CONFIG, with_metadata=True).date is None
Expand Down
29 changes: 28 additions & 1 deletion tests/xml_tei_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
from lxml.etree import Element, SubElement, XMLParser, fromstring, tostring

from trafilatura.metadata import Document
from trafilatura.xml import (check_tei, write_fullheader,
from trafilatura.xml import (check_tei, replace_element_text, write_fullheader,
_handle_unwanted_tails, _move_element_one_level_up,
_wrap_unwanted_siblings_of_div)

Expand Down Expand Up @@ -472,9 +472,36 @@ def test_handling_of_text_content_in_div():
assert cleaned.find(".//p").text == "tail"


def test_replace_element_text():
    """Check the strings returned by replace_element_text() for several tags.

    Each case builds a fresh element from a tag name, a text and optional
    attributes, then compares the result (second argument set to True)
    with the expected Markdown-style string.
    """
    cases = (
        # (tag, text, attributes, expected output)
        ("head", "Title", {"rend": "h1"}, "# Title"),
        ("hi", "Text", {"rend": "#b"}, "**Text**"),
        ("item", "Test text", {}, "- Test text\n"),
        ("ref", "Link", {"target": "https://example.com"}, "[Link](https://example.com)"),
        # a reference without a target attribute
        ("ref", "Link", {}, "[Link]"),
    )
    for tag, text, attributes, expected in cases:
        node = Element(tag)
        node.text = text
        for key, value in attributes.items():
            node.set(key, value)
        assert replace_element_text(node, True) == expected


if __name__ == "__main__":
    # Run the selected tests in order when the module is executed directly.
    for run_test in (
        test_publisher_added_before_availability_in_publicationStmt,
        test_unwanted_siblings_of_div_removed,
        test_tail_on_p_like_elements_removed,
        test_head_with_children_converted_to_ab,
        test_ab_with_p_parent_resolved,
        test_replace_element_text,
    ):
        run_test()
2 changes: 1 addition & 1 deletion trafilatura/external.py
Original file line number Diff line number Diff line change
Expand Up @@ -138,7 +138,7 @@ def try_justext(tree: HtmlElement, url: str, target_language: str) -> _Element:
# extract
try:
paragraphs = custom_justext(tree, justext_stoplist)
except ValueError as err: # not an XML element: HtmlComment
except Exception as err:
LOGGER.error('justext %s %s', err, url)
else:
for paragraph in paragraphs:
Expand Down
28 changes: 10 additions & 18 deletions trafilatura/readability_lxml.py
Original file line number Diff line number Diff line change
Expand Up @@ -355,7 +355,7 @@ def sanitize(self, node: HtmlElement, candidates: Dict[HtmlElement, Candidate])
)
elem.drop_tree()
elif elem.text_content().count(",") < 10:
to_remove = False
to_remove = True
counts = {
kind: len(elem.findall(f".//{kind}")) for kind in TEXT_CLEAN_ELEMS
}
Expand All @@ -376,41 +376,32 @@ def sanitize(self, node: HtmlElement, candidates: Dict[HtmlElement, Candidate])
# continue
if counts["p"] and counts["img"] > 1 + counts["p"] * 1.3:
reason = f'too many images ({counts["img"]})'
to_remove = True
elif counts["li"] > counts["p"] and elem.tag not in LIST_TAGS:
reason = "more <li>s than <p>s"
to_remove = True
elif counts["input"] > (counts["p"] / 3):
reason = "less than 3x <p>s than <input>s"
to_remove = True
elif content_length < self.min_text_length and counts["img"] == 0:
reason = f"too short content length {content_length} without a single image"
to_remove = True
elif content_length < self.min_text_length and counts["img"] > 2:
reason = (
f"too short content length {content_length} and too many images"
)
to_remove = True
elif weight < 25 and link_density > 0.2:
reason = (
f"too many links {link_density:.3f} for its weight {weight}"
)
to_remove = True
elif weight >= 25 and link_density > 0.5:
reason = (
f"too many links {link_density:.3f} for its weight {weight}"
)
to_remove = True
elif (counts["embed"] == 1 and content_length < 75) or counts[
"embed"
] > 1:
reason = (
"<embed>s with too short content length, or too many <embed>s"
)
to_remove = True
elif not content_length:
reason = "no content"
to_remove = True

# find x non empty preceding and succeeding siblings
siblings = []
Expand All @@ -430,17 +421,18 @@ def sanitize(self, node: HtmlElement, candidates: Dict[HtmlElement, Candidate])
if siblings and sum(siblings) > 1000:
to_remove = False
allowed.update(elem.iter("table", "ul", "div", "section"))
else:
to_remove = False

if to_remove:
elem.drop_tree()
if LOGGER.isEnabledFor(logging.DEBUG):
LOGGER.debug(
"Removed %6.3f %s with weight %s cause it has %s.",
score,
elem.tag,
weight,
reason or "",
)
LOGGER.debug(
"Removed %6.3f %s with weight %s cause it has %s.",
score,
elem.tag,
weight,
reason or "",
)

self.doc = node
return _tostring(self.doc)
Expand Down
2 changes: 1 addition & 1 deletion trafilatura/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -405,7 +405,7 @@ def language_classifier(temp_text: str, temp_comments: str) -> Optional[str]:
if len(temp_text) > len(temp_comments)
else py3langid.classify(temp_comments)
)
else:
else: # pragma: no cover
LOGGER.warning('Language detector not installed, skipping detection')
result = None
return result # type: ignore[no-any-return]
Expand Down
Loading