From 12c53781a374afab2eeca7d5da03e65b8dd8d5e6 Mon Sep 17 00:00:00 2001 From: CodyInnowhere Date: Thu, 28 Nov 2024 17:23:06 +0800 Subject: [PATCH 1/3] refine table markdown output --- trafilatura/xml.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/trafilatura/xml.py b/trafilatura/xml.py index 646c8426..2c4ca80e 100644 --- a/trafilatura/xml.py +++ b/trafilatura/xml.py @@ -288,6 +288,9 @@ def replace_element_text(element: _Element, include_formatting: bool) -> str: if element.tag == "cell" and elem_text and len(element) > 0: if element[0].tag == 'p': elem_text = f"{elem_text} " + elif element.tag == 'cell' and elem_text: + # add | before first cell + elem_text = f"{elem_text}" if element.getprevious() is not None else f"| {elem_text}" # lists elif element.tag == "item" and elem_text: elem_text = f"- {elem_text}\n" @@ -324,7 +327,7 @@ def process_element(element: _Element, returnlist: List[str], include_formatting returnlist.append(f'{"|" * (max_span - cell_count)}\n') # if this is a head row, draw the separator below if element.xpath("./cell[@role='head']"): - returnlist.append(f'\n{"---|" * max_span}\n') + returnlist.append(f'\n|{"---|" * max_span}\n') else: returnlist.append("\n") elif element.tag != "cell": @@ -337,7 +340,7 @@ def process_element(element: _Element, returnlist: List[str], include_formatting # Common elements (Now processes end-tag logic correctly) if element.tag in NEWLINE_ELEMS and not element.xpath("ancestor::cell"): # spacing hack - returnlist.append("\n\u2424\n" if include_formatting else "\n") + returnlist.append("\n\u2424\n" if include_formatting and element.tag != 'row' else "\n") elif element.tag == "cell": returnlist.append(" | ") elif element.tag not in SPECIAL_FORMATTING: From 17d79192315fbf0178d4adfff325e00f61c4870c Mon Sep 17 00:00:00 2001 From: CodyInnowhere Date: Fri, 29 Nov 2024 11:19:19 +0800 Subject: [PATCH 2/3] fix ut --- tests/unit_tests.py | 18 +++++++++--------- trafilatura/xml.py | 2 +- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/tests/unit_tests.py b/tests/unit_tests.py index 00046804..e7c12dc0 100644 --- a/tests/unit_tests.py +++ b/tests/unit_tests.py @@ -1022,7 +1022,7 @@ def test_table_processing(): you buy they buy ''' in my_result - assert extract(htmlstring, fast=True, output_format='txt').startswith("Present Tense | I buy | you buy |") + assert extract(htmlstring, fast=True, output_format='txt').startswith("| Present Tense | I buy | you buy |") # table with links # todo: further tests and adjustments htmlstring = '' @@ -1112,12 +1112,12 @@ def test_table_processing(): assert "1" in result and "2" in result # table headers in non-XML formats htmlstring = '
head 1head 2
12
' - assert "---|---|" in extract(htmlstring, fast=True, output_format='txt', config=ZERO_CONFIG, include_tables=True) + assert "|---|---|" in extract(htmlstring, fast=True, output_format='txt', config=ZERO_CONFIG, include_tables=True) # remove new lines in table cells in text format htmlstring = '
cell
1
cell

2

' result = extract(htmlstring, fast=True, output_format='txt', config=ZERO_CONFIG, include_tables=True) - assert "cell 1 | cell 2 |" in result + assert "| cell 1 | cell 2 |" in result # only one header row is allowed in text format htmlstring = '
ab
cd
' @@ -1127,15 +1127,15 @@ def test_table_processing(): # handle colspan by appending columns in text format htmlstring = '
ab
cde
' result = extract(htmlstring, fast=True, output_format='txt', config=ZERO_CONFIG, include_tables=True) - assert "a | b | |" in result + assert "| a | b | |" in result htmlstring = '
ab
cde
' result = extract(htmlstring, fast=True, output_format='txt', config=ZERO_CONFIG, include_tables=True) - assert "a | b | |" in result + assert "| a | b | |" in result htmlstring = '
ab
cde
' result = extract(htmlstring, fast=True, output_format='txt', config=ZERO_CONFIG, include_tables=True) - assert "a | b | |" in result + assert "| a | b | |" in result # MemoryError: https://github.com/adbar/trafilatura/issues/657 htmlstring = '
ab
cde
' @@ -1149,16 +1149,16 @@ def test_table_processing(): # wrong span info htmlstring = '
ab
cde
' result = extract(htmlstring, fast=True, output_format='txt', config=ZERO_CONFIG, include_tables=True) - assert "a | b | |" in result + assert "| a | b | |" in result htmlstring = '
ab
cde
' result = extract(htmlstring, fast=True, output_format='txt', config=ZERO_CONFIG, include_tables=True) - assert "a | b | |" in result + assert "| a | b | |" in result # links: this gets through (for now) htmlstring = '' result = extract(htmlstring, fast=True, output_format='txt', config=ZERO_CONFIG, include_tables=True) - assert result == "a |" + assert result == "| a |" # link: this is filtered out htmlstring = f'' diff --git a/trafilatura/xml.py b/trafilatura/xml.py index 2c4ca80e..953a5f9f 100644 --- a/trafilatura/xml.py +++ b/trafilatura/xml.py @@ -287,7 +287,7 @@ def replace_element_text(element: _Element, include_formatting: bool) -> str: # cells if element.tag == "cell" and elem_text and len(element) > 0: if element[0].tag == 'p': - elem_text = f"{elem_text} " + elem_text = f"{elem_text} " if element.getprevious() is not None else f"| {elem_text} " elif element.tag == 'cell' and elem_text: # add | before first cell elem_text = f"{elem_text}" if element.getprevious() is not None else f"| {elem_text}" From e466953d7d56a94464df2d4e3b7e5268700b9df8 Mon Sep 17 00:00:00 2001 From: CodyInnowhere Date: Mon, 2 Dec 2024 14:53:15 +0800 Subject: [PATCH 3/3] extract image in textnode --- tests/unit_tests.py | 7 +++++++ trafilatura/htmlprocessing.py | 4 +++- trafilatura/main_extractor.py | 10 +++++++++- trafilatura/utils.py | 14 ++++++++++++++ 4 files changed, 33 insertions(+), 2 deletions(-) diff --git a/tests/unit_tests.py b/tests/unit_tests.py index 9990a107..7e9beae7 100644 --- a/tests/unit_tests.py +++ b/tests/unit_tests.py @@ -483,6 +483,7 @@ def test_images(): assert is_image_file('test.txt') is False assert is_image_file('test.jpg'*2000) is False # length threshold # tag with attributes + assert handle_image(None) is None assert handle_image(html.fromstring('')) is not None assert handle_image(html.fromstring('text')) is not None assert handle_image(html.fromstring('')) is None @@ -494,6 +495,12 @@ def test_images(): assert '![Example image](test.jpg)' in extract(teststring, include_images=True, fast=True) assert '' in extract(teststring, include_images=True, fast=True, output_format='xml', config=ZERO_CONFIG) assert extract('
text
', include_images=True, fast=True) == '![a title text](test.jpg)' + assert extract('

text

', include_images=True, fast=True) == '![a title text](test.jpg)' + assert extract('

text

', include_images=True, fast=True) == '' + assert extract('

text

', include_images=True, fast=True) == '![a title text](test.jpg)' + assert extract('

text

', include_images=True, fast=True) == '![a title text](test.jpg)' + + assert handle_image(html.fromstring('text')) is None # CNN example mydoc = html.fromstring('Harry and Meghan last March, in their final royal engagement.') diff --git a/trafilatura/htmlprocessing.py b/trafilatura/htmlprocessing.py index af855ee2..d78734fe 100644 --- a/trafilatura/htmlprocessing.py +++ b/trafilatura/htmlprocessing.py @@ -20,7 +20,7 @@ MANUALLY_CLEANED, MANUALLY_STRIPPED, ) -from .utils import textfilter, trim +from .utils import textfilter, trim, is_image_element from .xml import META_ATTRIBUTES, delete_element @@ -226,6 +226,8 @@ def handle_textnode( preserve_spaces: bool = False, ) -> Optional[_Element]: "Convert, format, and probe potential text elements." + if elem.tag == "graphic" and is_image_element(elem): + return elem if elem.tag == "done" or (len(elem) == 0 and not elem.text and not elem.tail): return None diff --git a/trafilatura/main_extractor.py b/trafilatura/main_extractor.py index eb50338e..2a950bec 100644 --- a/trafilatura/main_extractor.py +++ b/trafilatura/main_extractor.py @@ -331,6 +331,11 @@ def handle_paragraphs(element: _Element, potential_tags: Set[str], options: Extr # else: # newsub.tail = processed_child.text newsub.text, newsub.tail = processed_child.text, processed_child.tail + + if processed_child.tag == 'graphic': + image_elem = handle_image(processed_child) + if image_elem is not None: + newsub = image_elem processed_element.append(newsub) child.tag = "done" # finish @@ -437,8 +442,11 @@ def handle_table(table_elem: _Element, potential_tags: Set[str], options: Extrac return None -def handle_image(element: _Element) -> Optional[_Element]: +def handle_image(element: Optional[_Element]) -> Optional[_Element]: "Process image elements and their relevant attributes." + if element is None: + return None + processed_element = Element(element.tag) for attr in ("data-src", "src"): diff --git a/trafilatura/utils.py b/trafilatura/utils.py index 7db53889..8cb09793 100644 --- a/trafilatura/utils.py +++ b/trafilatura/utils.py @@ -346,6 +346,20 @@ def trim(string: str) -> str: return "" +def is_image_element(element: _Element) -> bool: + '''Check if an element is a valid img element''' + for attr in ("data-src", "src"): + src = element.get(attr, "") + if is_image_file(src): + return True + else: + # take the first corresponding attribute + for attr, value in element.attrib.items(): + if attr.startswith("data-src") and is_image_file(value): + return True + return False + + def is_image_file(imagesrc: Optional[str]) -> bool: '''Check if the observed string corresponds to a valid image extension. Use a length threshold and apply a regex on the content.'''