diff --git a/tests/unit_tests.py b/tests/unit_tests.py index 29887a6f..ac8e60b3 100644 --- a/tests/unit_tests.py +++ b/tests/unit_tests.py @@ -21,7 +21,7 @@ from charset_normalizer import detect import trafilatura.htmlprocessing -from trafilatura import bare_extraction, extract, xml +from trafilatura import bare_extraction, extract, extract_with_metadata, xml from trafilatura.core import Extractor from trafilatura.external import sanitize_tree, try_justext, try_readability from trafilatura.main_extractor import (handle_formatting, handle_image, @@ -443,6 +443,59 @@ def test_formatting(): assert '1) The in Operator' in my_result and '

The easiest way to check if a Python string contains a substring is to use the in operator. The in operator is used to check data structures for membership in Python. It returns a Boolean (either True or False) and can be used as follows:

' in my_result +def test_extract_with_metadata(): + '''Test extract_with_metadata method''' + url = 'http://aa.bb/cc.html' + my_document = html.fromstring(""" + + +
+

AAA,

BBB

, CCC.

+
+ + + """) + parsed_doc = extract_with_metadata(my_document, output_format='txt', include_formatting=True, fast=True, url=url) + content = parsed_doc.text + assert 'AAA' in content and 'BBB' in content and 'CCC' in content + assert url == parsed_doc.url and parsed_doc.date is None and parsed_doc.title is None + + my_document = html.fromstring(""" + title + +
+
May 24, 2021
+

AAA,

BBB

, CCC.

+
+ + + """) + parsed_doc = extract_with_metadata(my_document, output_format='txt', include_formatting=True, fast=True, url=url) + content = parsed_doc.text + assert 'AAA' in content and 'BBB' in content and 'CCC' in content + assert url == parsed_doc.url and '2021-05-24' == parsed_doc.date and 'title' == parsed_doc.title + + parsed_doc = extract_with_metadata(my_document, output_format='xml') + assert 'AAA, BBB , CCC.' == parsed_doc.raw_text and 'ee7d2fb6fcf2837d' == parsed_doc.fingerprint + content = parsed_doc.text + assert 'AAA' in content and 'BBB' in content and 'CCC' in content + + my_document = html.fromstring(""" + + +
+

AAA,

BBB

, CCC.

+
+ + + """) + parsed_doc = extract_with_metadata(my_document, target_language='en', fast=True) + assert parsed_doc is None + + with pytest.raises(ValueError) as err: + extract_with_metadata(my_document, output_format="python") + + def test_external(): '''Test external components''' options = DEFAULT_OPTIONS @@ -1644,6 +1697,7 @@ def test_deprecations(): test_trim() test_input() test_formatting() + test_extract_with_metadata() test_exotic_tags() test_images() test_links() diff --git a/trafilatura/__init__.py b/trafilatura/__init__.py index 7017238a..94cf04d5 100644 --- a/trafilatura/__init__.py +++ b/trafilatura/__init__.py @@ -13,7 +13,7 @@ import logging from .baseline import baseline, html2txt -from .core import bare_extraction, extract +from .core import bare_extraction, extract, extract_with_metadata from .downloads import fetch_response, fetch_url from .metadata import extract_metadata from .utils import load_html @@ -25,6 +25,7 @@ "baseline", "extract", "extract_metadata", + "extract_with_metadata", "fetch_response", "fetch_url", "html2txt", diff --git a/trafilatura/core.py b/trafilatura/core.py index 79e424e1..dc1a01fd 100644 --- a/trafilatura/core.py +++ b/trafilatura/core.py @@ -193,19 +193,7 @@ def bare_extraction( """ # deprecations - if no_fallback: - fast = no_fallback - warnings.warn( - '"no_fallback" will be deprecated in a future version, use "fast" instead', - PendingDeprecationWarning - ) - if as_dict: - warnings.warn( - '"as_dict" will be deprecated, use the .as_dict() method on bare_extraction results', - PendingDeprecationWarning - ) - if max_tree_size: - raise ValueError("max_tree_size is deprecated, use settings.cfg file instead") + _check_deprecation(no_fallback=no_fallback, as_dict=as_dict, max_tree_size=max_tree_size) # regroup extraction options if not options or not isinstance(options, Extractor): @@ -424,16 +412,174 @@ def extract( A string in the desired format or None. 
""" + document = _internal_extraction( + filecontent=filecontent, + url=url, + record_id=record_id, + fast=fast, + no_fallback=no_fallback, + favor_precision=favor_precision, + favor_recall=favor_recall, + include_comments=include_comments, + output_format=output_format, + tei_validation=tei_validation, + target_language=target_language, + include_tables=include_tables, + include_images=include_images, + include_formatting=include_formatting, + include_links=include_links, + deduplicate=deduplicate, + date_extraction_params=date_extraction_params, + with_metadata=with_metadata, + only_with_metadata=only_with_metadata, + max_tree_size=max_tree_size, + url_blacklist=url_blacklist, + author_blacklist=author_blacklist, + settingsfile=settingsfile, + prune_xpath=prune_xpath, + config=config, + options=options) + return document.text if document is not None else None + + +def extract_with_metadata( + filecontent: Any, + url: Optional[str] = None, + record_id: Optional[str] = None, + fast: bool = False, + favor_precision: bool = False, + favor_recall: bool = False, + include_comments: bool = True, + output_format: str = "txt", + tei_validation: bool = False, + target_language: Optional[str] = None, + include_tables: bool = True, + include_images: bool = False, + include_formatting: bool = False, + include_links: bool = False, + deduplicate: bool = False, + date_extraction_params: Optional[Dict[str, Any]] = None, + url_blacklist: Optional[Set[str]] = None, + author_blacklist: Optional[Set[str]] = None, + settingsfile: Optional[str] = None, + prune_xpath: Optional[Any] = None, + config: Any = DEFAULT_CONFIG, + options: Optional[Extractor] = None, +) -> Optional[Document]: + """Main function exposed by the package: + Wrapper for text extraction and conversion to chosen output format. + This method also returns document metadata. + + Args: + filecontent: HTML code as string. + url: URL of the webpage. + record_id: Add an ID to the metadata. 
+ fast: Use faster heuristics and skip backup extraction. + favor_precision: prefer less text but correct extraction. + favor_recall: when unsure, prefer more text. + include_comments: Extract comments along with the main text. + output_format: Define an output format: + "csv", "html", "json", "markdown", "txt", "xml", and "xmltei". + tei_validation: Validate the XML-TEI output with respect to the TEI standard. + target_language: Define a language to discard invalid documents (ISO 639-1 format). + include_tables: Take into account information within the HTML <table> element. + include_images: Take images into account (experimental). + include_formatting: Keep structural elements related to formatting + (only valuable if output_format is set to XML). + include_links: Keep links along with their targets (experimental). + deduplicate: Remove duplicate segments and documents. + date_extraction_params: Provide extraction parameters to htmldate as dict(). + url_blacklist: Provide a blacklist of URLs as set() to filter out documents. + author_blacklist: Provide a blacklist of Author Names as set() to filter out authors. + settingsfile: Use a configuration file to override the standard settings. + prune_xpath: Provide an XPath expression to prune the tree before extraction. + can be str or list of str. + config: Directly provide a configparser configuration. + options: Directly provide a whole extractor configuration. + + Returns: + Document metadata with content string in the desired format or None.
+ """ + return _internal_extraction( + filecontent=filecontent, + url=url, + record_id=record_id, + fast=fast, + favor_precision=favor_precision, + favor_recall=favor_recall, + include_comments=include_comments, + output_format=output_format, + tei_validation=tei_validation, + target_language=target_language, + include_tables=include_tables, + include_images=include_images, + include_formatting=include_formatting, + include_links=include_links, + deduplicate=deduplicate, + date_extraction_params=date_extraction_params, + with_metadata=True, + only_with_metadata=False, + url_blacklist=url_blacklist, + author_blacklist=author_blacklist, + settingsfile=settingsfile, + prune_xpath=prune_xpath, + config=config, + options=options) + + +def _check_deprecation( + no_fallback: bool = False, + as_dict: bool = False, + max_tree_size: Optional[int] = None, +)-> None: + '''Check deprecated or to-be-deprecated params''' if no_fallback: fast = no_fallback warnings.warn( '"no_fallback" will be deprecated in a future version, use "fast" instead', PendingDeprecationWarning ) - + if as_dict: + warnings.warn( + '"as_dict" will be deprecated, use the .as_dict() method on bare_extraction results', + PendingDeprecationWarning + ) if max_tree_size: raise ValueError("max_tree_size is deprecated, use settings.cfg file instead") + +def _internal_extraction( + filecontent: Any, + url: Optional[str] = None, + record_id: Optional[str] = None, + fast: bool = False, + no_fallback: bool = False, + favor_precision: bool = False, + favor_recall: bool = False, + include_comments: bool = True, + output_format: str = "txt", + tei_validation: bool = False, + target_language: Optional[str] = None, + include_tables: bool = True, + include_images: bool = False, + include_formatting: bool = False, + include_links: bool = False, + deduplicate: bool = False, + date_extraction_params: Optional[Dict[str, Any]] = None, + with_metadata: bool = False, + only_with_metadata: bool = False, + max_tree_size: 
Optional[int] = None, + url_blacklist: Optional[Set[str]] = None, + author_blacklist: Optional[Set[str]] = None, + settingsfile: Optional[str] = None, + prune_xpath: Optional[Any] = None, + config: Any = DEFAULT_CONFIG, + options: Optional[Extractor] = None, +) -> Optional[Document]: + '''Internal method to do the extraction''' + _check_deprecation(no_fallback=no_fallback, as_dict=False, max_tree_size=max_tree_size) + # regroup extraction options if not options or not isinstance(options, Extractor): options = Extractor( @@ -485,4 +631,5 @@ def extract( ) # return - return determine_returnstring(document, options) + document.text = determine_returnstring(document, options) + return document