From a324216805e2431335550c355bf8918fdb81dceb Mon Sep 17 00:00:00 2001 From: Adrien Barbaresi Date: Thu, 19 Oct 2023 16:51:19 +0200 Subject: [PATCH 1/2] add htmldate extensive search to config --- tests/resources/newsettings.cfg | 2 ++ tests/unit_tests.py | 4 ++++ trafilatura/core.py | 8 ++++++++ trafilatura/settings.cfg | 2 ++ 4 files changed, 16 insertions(+) diff --git a/tests/resources/newsettings.cfg b/tests/resources/newsettings.cfg index f167025a..9614d4bc 100644 --- a/tests/resources/newsettings.cfg +++ b/tests/resources/newsettings.cfg @@ -31,3 +31,5 @@ EXTRACTION_TIMEOUT = 0 MIN_DUPLCHECK_SIZE = 10 MAX_REPETITIONS = 3 +# Extraction option for Htmldate +EXTENSIVE_DATE_SEARCH = off diff --git a/tests/unit_tests.py b/tests/unit_tests.py index 050d2588..73224f80 100644 --- a/tests/unit_tests.py +++ b/tests/unit_tests.py @@ -711,6 +711,10 @@ def test_extraction_options(): assert etree.tostring(try_justext(html.fromstring(my_html), None, 'de')) == b'' # assert extract(my_html) is None + my_html = '' + '

ABC def ghi jkl.

'*1000 + '

Posted on 1st Dec 2019<.

' + assert bare_extraction(my_html, config=ZERO_CONFIG)["date"] is not None + assert bare_extraction(my_html, config=NEW_CONFIG)["date"] is None + def test_precision_recall(): '''test precision- and recall-oriented settings''' diff --git a/trafilatura/core.py b/trafilatura/core.py index 9097c30e..5aa45ae3 100644 --- a/trafilatura/core.py +++ b/trafilatura/core.py @@ -913,11 +913,18 @@ def bare_extraction(filecontent, url=None, no_fallback=False, # fast=False, # extract metadata if necessary if output_format != 'txt': + + extensive_htmldate = config.getboolean('DEFAULT', 'EXTENSIVE_DATE_SEARCH') + if not date_extraction_params and not extensive_htmldate: + date_extraction_params = {"extensive_search": False} + document = extract_metadata(tree, url, date_extraction_params, no_fallback, author_blacklist) + # cut short if extracted URL in blacklist if document.url in url_blacklist: LOGGER.warning('blacklisted URL: %s', url) raise ValueError + # cut short if core elements are missing if only_with_metadata is True and any( x is None for x in @@ -925,6 +932,7 @@ def bare_extraction(filecontent, url=None, no_fallback=False, # fast=False, ): LOGGER.error('no metadata for URL %s', url) raise ValueError + else: document = Document() diff --git a/trafilatura/settings.cfg b/trafilatura/settings.cfg index 29573a8c..457d4ff4 100644 --- a/trafilatura/settings.cfg +++ b/trafilatura/settings.cfg @@ -26,3 +26,5 @@ EXTRACTION_TIMEOUT = 30 MIN_DUPLCHECK_SIZE = 100 MAX_REPETITIONS = 2 +# Extraction option for Htmldate +EXTENSIVE_DATE_SEARCH = on From a0bfa5432908d61a3c77c2e487bf03f23949ef61 Mon Sep 17 00:00:00 2001 From: Adrien Barbaresi Date: Tue, 24 Oct 2023 16:20:12 +0200 Subject: [PATCH 2/2] simplify syntax --- trafilatura/core.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/trafilatura/core.py b/trafilatura/core.py index 5aa45ae3..2b91fbb3 100644 --- a/trafilatura/core.py +++ b/trafilatura/core.py @@ -914,9 +914,10 @@ def bare_extraction(filecontent, url=None, no_fallback=False, # fast=False, # extract metadata if necessary if output_format != 'txt': - extensive_htmldate = config.getboolean('DEFAULT', 'EXTENSIVE_DATE_SEARCH') - if not date_extraction_params and not extensive_htmldate: - date_extraction_params = {"extensive_search": False} + if not date_extraction_params: + date_extraction_params = { + "extensive_search": config.getboolean('DEFAULT', 'EXTENSIVE_DATE_SEARCH'), + } document = extract_metadata(tree, url, date_extraction_params, no_fallback, author_blacklist)