From a324216805e2431335550c355bf8918fdb81dceb Mon Sep 17 00:00:00 2001
From: Adrien Barbaresi
Date: Thu, 19 Oct 2023 16:51:19 +0200
Subject: [PATCH 1/2] add htmldate extensive search to config
---
tests/resources/newsettings.cfg | 2 ++
tests/unit_tests.py | 4 ++++
trafilatura/core.py | 8 ++++++++
trafilatura/settings.cfg | 2 ++
4 files changed, 16 insertions(+)
diff --git a/tests/resources/newsettings.cfg b/tests/resources/newsettings.cfg
index f167025a..9614d4bc 100644
--- a/tests/resources/newsettings.cfg
+++ b/tests/resources/newsettings.cfg
@@ -31,3 +31,5 @@ EXTRACTION_TIMEOUT = 0
MIN_DUPLCHECK_SIZE = 10
MAX_REPETITIONS = 3
+# Htmldate option: enable its extensive date search (passed on as "extensive_search")
+EXTENSIVE_DATE_SEARCH = off
diff --git a/tests/unit_tests.py b/tests/unit_tests.py
index 050d2588..73224f80 100644
--- a/tests/unit_tests.py
+++ b/tests/unit_tests.py
@@ -711,6 +711,10 @@ def test_extraction_options():
assert etree.tostring(try_justext(html.fromstring(my_html), None, 'de')) == b''
# assert extract(my_html) is None
+ my_html = '<html><body>' + '<p>ABC def ghi jkl.</p>'*1000 + '<p>Posted on 1st Dec 2019<.</p></body></html>'
+ assert bare_extraction(my_html, config=ZERO_CONFIG)["date"] is not None
+ assert bare_extraction(my_html, config=NEW_CONFIG)["date"] is None
+
def test_precision_recall():
'''test precision- and recall-oriented settings'''
diff --git a/trafilatura/core.py b/trafilatura/core.py
index 9097c30e..5aa45ae3 100644
--- a/trafilatura/core.py
+++ b/trafilatura/core.py
@@ -913,11 +913,18 @@ def bare_extraction(filecontent, url=None, no_fallback=False, # fast=False,
# extract metadata if necessary
if output_format != 'txt':
+
+ extensive_htmldate = config.getboolean('DEFAULT', 'EXTENSIVE_DATE_SEARCH')
+ if not date_extraction_params and not extensive_htmldate:
+ date_extraction_params = {"extensive_search": False}
+
document = extract_metadata(tree, url, date_extraction_params, no_fallback, author_blacklist)
+
# cut short if extracted URL in blacklist
if document.url in url_blacklist:
LOGGER.warning('blacklisted URL: %s', url)
raise ValueError
+
# cut short if core elements are missing
if only_with_metadata is True and any(
x is None for x in
@@ -925,6 +932,7 @@ def bare_extraction(filecontent, url=None, no_fallback=False, # fast=False,
):
LOGGER.error('no metadata for URL %s', url)
raise ValueError
+
else:
document = Document()
diff --git a/trafilatura/settings.cfg b/trafilatura/settings.cfg
index 29573a8c..457d4ff4 100644
--- a/trafilatura/settings.cfg
+++ b/trafilatura/settings.cfg
@@ -26,3 +26,5 @@ EXTRACTION_TIMEOUT = 30
MIN_DUPLCHECK_SIZE = 100
MAX_REPETITIONS = 2
+# Htmldate option: enable its extensive date search (passed on as "extensive_search")
+EXTENSIVE_DATE_SEARCH = on
From a0bfa5432908d61a3c77c2e487bf03f23949ef61 Mon Sep 17 00:00:00 2001
From: Adrien Barbaresi
Date: Tue, 24 Oct 2023 16:20:12 +0200
Subject: [PATCH 2/2] simplify syntax
---
trafilatura/core.py | 7 ++++---
1 file changed, 4 insertions(+), 3 deletions(-)
diff --git a/trafilatura/core.py b/trafilatura/core.py
index 5aa45ae3..2b91fbb3 100644
--- a/trafilatura/core.py
+++ b/trafilatura/core.py
@@ -914,9 +914,10 @@ def bare_extraction(filecontent, url=None, no_fallback=False, # fast=False,
# extract metadata if necessary
if output_format != 'txt':
- extensive_htmldate = config.getboolean('DEFAULT', 'EXTENSIVE_DATE_SEARCH')
- if not date_extraction_params and not extensive_htmldate:
- date_extraction_params = {"extensive_search": False}
+ if not date_extraction_params:
+ date_extraction_params = {
+ "extensive_search": config.getboolean('DEFAULT', 'EXTENSIVE_DATE_SEARCH'),
+ }
document = extract_metadata(tree, url, date_extraction_params, no_fallback, author_blacklist)