From b0107793454f7c6334d87f1228e0ff24d19c02b7 Mon Sep 17 00:00:00 2001
From: Cody
Date: Wed, 11 Dec 2024 19:30:24 +0800
Subject: [PATCH] syntax: add extract_with_metadata method (#765)
* add extract_with_metadata method in core and corresponding ut
* regroup code
* add method comment
---------
Co-authored-by: CodyInnowhere
---
tests/unit_tests.py | 56 ++++++++++++-
trafilatura/__init__.py | 3 +-
trafilatura/core.py | 177 ++++++++++++++++++++++++++++++++++++----
3 files changed, 219 insertions(+), 17 deletions(-)
diff --git a/tests/unit_tests.py b/tests/unit_tests.py
index 29887a6f..ac8e60b3 100644
--- a/tests/unit_tests.py
+++ b/tests/unit_tests.py
@@ -21,7 +21,7 @@
from charset_normalizer import detect
import trafilatura.htmlprocessing
-from trafilatura import bare_extraction, extract, xml
+from trafilatura import bare_extraction, extract, extract_with_metadata, xml
from trafilatura.core import Extractor
from trafilatura.external import sanitize_tree, try_justext, try_readability
from trafilatura.main_extractor import (handle_formatting, handle_image,
@@ -443,6 +443,59 @@ def test_formatting():
assert '1) The in
Operator' in my_result and 'The easiest way to check if a Python string contains a substring is to use the in
operator. The in
operator is used to check data structures for membership in Python. It returns a Boolean (either True
or False
) and can be used as follows:
' in my_result
+def test_extract_with_metadata():
+ '''Check that extract_with_metadata returns a Document carrying both the formatted text and the metadata'''
+ url = 'http://aa.bb/cc.html'
+ my_document = html.fromstring("""
+
+
+
+ AAA,
BBB
, CCC.
+
+
+
+ """)
+ parsed_doc = extract_with_metadata(my_document, output_format='txt', include_formatting=True, fast=True, url=url)
+ content = parsed_doc.text
+ assert 'AAA' in content and 'BBB' in content and 'CCC' in content
+ assert url == parsed_doc.url and parsed_doc.date is None and parsed_doc.title is None
+
+ my_document = html.fromstring("""
+ title
+
+
+ May 24, 2021
+ AAA,
BBB
, CCC.
+
+
+
+ """)
+ parsed_doc = extract_with_metadata(my_document, output_format='txt', include_formatting=True, fast=True, url=url)
+ content = parsed_doc.text
+ assert 'AAA' in content and 'BBB' in content and 'CCC' in content
+ assert url == parsed_doc.url and '2021-05-24' == parsed_doc.date and 'title' == parsed_doc.title
+
+ parsed_doc = extract_with_metadata(my_document, output_format='xml')
+ assert 'AAA, BBB , CCC.' == parsed_doc.raw_text and 'ee7d2fb6fcf2837d' == parsed_doc.fingerprint
+ content = parsed_doc.text
+ assert 'AAA' in content and 'BBB' in content and 'CCC' in content
+
+ my_document = html.fromstring("""
+
+
+
+ AAA,
BBB
, CCC.
+
+
+
+ """)
+ parsed_doc = extract_with_metadata(my_document, target_language='en', fast=True)
+ assert parsed_doc is None
+
+ with pytest.raises(ValueError):  # "python" is not an accepted output_format for this entry point
+ extract_with_metadata(my_document, output_format="python")
+
+
def test_external():
'''Test external components'''
options = DEFAULT_OPTIONS
@@ -1644,6 +1697,7 @@ def test_deprecations():
test_trim()
test_input()
test_formatting()
+ test_extract_with_metadata()
test_exotic_tags()
test_images()
test_links()
diff --git a/trafilatura/__init__.py b/trafilatura/__init__.py
index 7017238a..94cf04d5 100644
--- a/trafilatura/__init__.py
+++ b/trafilatura/__init__.py
@@ -13,7 +13,7 @@
import logging
from .baseline import baseline, html2txt
-from .core import bare_extraction, extract
+from .core import bare_extraction, extract, extract_with_metadata
from .downloads import fetch_response, fetch_url
from .metadata import extract_metadata
from .utils import load_html
@@ -25,6 +25,7 @@
"baseline",
"extract",
"extract_metadata",
+ "extract_with_metadata",
"fetch_response",
"fetch_url",
"html2txt",
diff --git a/trafilatura/core.py b/trafilatura/core.py
index 79e424e1..dc1a01fd 100644
--- a/trafilatura/core.py
+++ b/trafilatura/core.py
@@ -193,19 +193,7 @@ def bare_extraction(
"""
# deprecations
- if no_fallback:
- fast = no_fallback
- warnings.warn(
- '"no_fallback" will be deprecated in a future version, use "fast" instead',
- PendingDeprecationWarning
- )
- if as_dict:
- warnings.warn(
- '"as_dict" will be deprecated, use the .as_dict() method on bare_extraction results',
- PendingDeprecationWarning
- )
- if max_tree_size:
- raise ValueError("max_tree_size is deprecated, use settings.cfg file instead")
+ _check_deprecation(no_fallback=no_fallback, as_dict=as_dict, max_tree_size=max_tree_size)
# regroup extraction options
if not options or not isinstance(options, Extractor):
@@ -424,16 +412,174 @@ def extract(
A string in the desired format or None.
"""
+ document = _internal_extraction(
+ filecontent=filecontent,
+ url=url,
+ record_id=record_id,
+ fast=fast,
+ no_fallback=no_fallback,
+ favor_precision=favor_precision,
+ favor_recall=favor_recall,
+ include_comments=include_comments,
+ output_format=output_format,
+ tei_validation=tei_validation,
+ target_language=target_language,
+ include_tables=include_tables,
+ include_images=include_images,
+ include_formatting=include_formatting,
+ include_links=include_links,
+ deduplicate=deduplicate,
+ date_extraction_params=date_extraction_params,
+ with_metadata=with_metadata,
+ only_with_metadata=only_with_metadata,
+ max_tree_size=max_tree_size,
+ url_blacklist=url_blacklist,
+ author_blacklist=author_blacklist,
+ settingsfile=settingsfile,
+ prune_xpath=prune_xpath,
+ config=config,
+ options=options)
+ return document.text if document is not None else None
+
+
+def extract_with_metadata(
+ filecontent: Any,
+ url: Optional[str] = None,
+ record_id: Optional[str] = None,
+ fast: bool = False,
+ favor_precision: bool = False,
+ favor_recall: bool = False,
+ include_comments: bool = True,
+ output_format: str = "txt",
+ tei_validation: bool = False,
+ target_language: Optional[str] = None,
+ include_tables: bool = True,
+ include_images: bool = False,
+ include_formatting: bool = False,
+ include_links: bool = False,
+ deduplicate: bool = False,
+ date_extraction_params: Optional[Dict[str, Any]] = None,
+ url_blacklist: Optional[Set[str]] = None,
+ author_blacklist: Optional[Set[str]] = None,
+ settingsfile: Optional[str] = None,
+ prune_xpath: Optional[Any] = None,
+ config: Any = DEFAULT_CONFIG,
+ options: Optional[Extractor] = None,
+) -> Optional[Document]:
+ """Extract the main text, convert it to the chosen output format
+ and return it together with the document metadata.
+ Metadata extraction is always requested (with_metadata=True, only_with_metadata=False).
+
+ Args:
+ filecontent: HTML code as string.
+ url: URL of the webpage.
+ record_id: Add an ID to the metadata.
+ fast: Use faster heuristics and skip backup extraction
+ (the deprecated "no_fallback" alias is not accepted by this function).
+ favor_precision: prefer less text but correct extraction.
+ favor_recall: when unsure, prefer more text.
+ include_comments: Extract comments along with the main text.
+ output_format: Define an output format:
+ "csv", "html", "json", "markdown", "txt", "xml", and "xmltei".
+ tei_validation: Validate the XML-TEI output with respect to the TEI standard.
+ target_language: Define a language to discard invalid documents (ISO 639-1 format).
+ include_tables: Take into account information within the HTML <table> element.
+ include_images: Take images into account (experimental).
+ include_formatting: Keep structural elements related to formatting
+ (only valuable if output_format is set to XML).
+ include_links: Keep links along with their targets (experimental).
+ deduplicate: Remove duplicate segments and documents.
+ date_extraction_params: Provide extraction parameters to htmldate as dict().
+ url_blacklist: Provide a blacklist of URLs as set() to filter out documents.
+ author_blacklist: Provide a blacklist of Author Names as set() to filter out authors.
+ settingsfile: Use a configuration file to override the standard settings.
+ prune_xpath: Provide an XPath expression to prune the tree before extraction.
+ can be str or list of str.
+ config: Directly provide a configparser configuration.
+ options: Directly provide a whole extractor configuration.
+
+ Returns:
+ A Document holding the metadata, with the formatted output in its "text" attribute, or None.
+ """
+ return _internal_extraction(
+ filecontent=filecontent,
+ url=url,
+ record_id=record_id,
+ fast=fast,
+ favor_precision=favor_precision,
+ favor_recall=favor_recall,
+ include_comments=include_comments,
+ output_format=output_format,
+ tei_validation=tei_validation,
+ target_language=target_language,
+ include_tables=include_tables,
+ include_images=include_images,
+ include_formatting=include_formatting,
+ include_links=include_links,
+ deduplicate=deduplicate,
+ date_extraction_params=date_extraction_params,
+ with_metadata=True,
+ only_with_metadata=False,
+ url_blacklist=url_blacklist,
+ author_blacklist=author_blacklist,
+ settingsfile=settingsfile,
+ prune_xpath=prune_xpath,
+ config=config,
+ options=options)
+
+
+def _check_deprecation(
+ no_fallback: bool = False,
+ as_dict: bool = False,
+ max_tree_size: Optional[int] = None,
+) -> None:
+ '''Warn about to-be-deprecated params and raise on removed ones; returns nothing, so the "fast" assignment below stays local and callers must apply no_fallback themselves'''
if no_fallback:
fast = no_fallback
warnings.warn(
'"no_fallback" will be deprecated in a future version, use "fast" instead',
PendingDeprecationWarning
)
-
+ if as_dict:
+ warnings.warn(
+ '"as_dict" will be deprecated, use the .as_dict() method on bare_extraction results',
+ PendingDeprecationWarning
+ )
if max_tree_size:
raise ValueError("max_tree_size is deprecated, use settings.cfg file instead")
+
+def _internal_extraction(
+ filecontent: Any,
+ url: Optional[str] = None,
+ record_id: Optional[str] = None,
+ fast: bool = False,
+ no_fallback: bool = False,
+ favor_precision: bool = False,
+ favor_recall: bool = False,
+ include_comments: bool = True,
+ output_format: str = "txt",
+ tei_validation: bool = False,
+ target_language: Optional[str] = None,
+ include_tables: bool = True,
+ include_images: bool = False,
+ include_formatting: bool = False,
+ include_links: bool = False,
+ deduplicate: bool = False,
+ date_extraction_params: Optional[Dict[str, Any]] = None,
+ with_metadata: bool = False,
+ only_with_metadata: bool = False,
+ max_tree_size: Optional[int] = None,
+ url_blacklist: Optional[Set[str]] = None,
+ author_blacklist: Optional[Set[str]] = None,
+ settingsfile: Optional[str] = None,
+ prune_xpath: Optional[Any] = None,
+ config: Any = DEFAULT_CONFIG,
+ options: Optional[Extractor] = None,
+) -> Optional[Document]:
+ '''Shared backend of extract() and extract_with_metadata(): returns the Document with its "text" set to the formatted output, or None'''
+ _check_deprecation(no_fallback=no_fallback, as_dict=False, max_tree_size=max_tree_size)
+ fast = no_fallback or fast  # the helper only warns and cannot rebind this local: apply the deprecated alias here
# regroup extraction options
if not options or not isinstance(options, Extractor):
options = Extractor(
@@ -485,4 +631,5 @@ def extract(
)
# return
- return determine_returnstring(document, options)
+ document.text = determine_returnstring(document, options)
+ return document