From c510ac56678328c51f1daccdfb391b61ee5ecfbd Mon Sep 17 00:00:00 2001 From: CodyInnowhere Date: Fri, 13 Dec 2024 19:14:44 +0800 Subject: [PATCH] more robust table extraction fix type check --- tests/unit_tests.py | 64 +++++++++++++++++++++++++++++++++++ trafilatura/main_extractor.py | 5 ++- trafilatura/utils.py | 15 ++++++++ trafilatura/xml.py | 27 +++++++++------ 4 files changed, 100 insertions(+), 11 deletions(-) diff --git a/tests/unit_tests.py b/tests/unit_tests.py index ac8e60b3..b43362c8 100644 --- a/tests/unit_tests.py +++ b/tests/unit_tests.py @@ -1286,6 +1286,70 @@ def test_table_processing(): result = extract(htmlstring, fast=True, output_format='txt', config=ZERO_CONFIG, include_tables=True) assert result == "| a | b | c |\n| a | b c | |\n| a | b c | |" + htmlstring = """ +
+ + + + + + + +
abc
aimga

b

c

d
+
+ """ + result = extract(htmlstring, fast=True, output_format='markdown', config=ZERO_CONFIG, + include_images=True, include_tables=True) + assert result == "| a | b | c |\n| a ![img](http://aa.bb/c.jpg) a | b c | d |" + + htmlstring = """ +
+ + + + + + + +
abc
imga

b

c

d
+
+ """ + result = extract(htmlstring, fast=True, output_format='markdown', config=ZERO_CONFIG, + include_images=True, include_tables=True) + assert result == "| a | b | c |\n| ![img](http://aa.bb/c.jpg) a | b c | d |" + + htmlstring = """ +
+ + + + + + + +
abc
imga

b

c

d
+
+ """ + result = extract(htmlstring, fast=True, output_format='markdown', config=ZERO_CONFIG, + include_images=True, include_tables=True) + assert result == "| a | b | c |\n| ![img](http://aa.bb/c.jpg) a | b c | d |" + + htmlstring = """ +
+ + + + + + + +
abc
img1aimg2

b

c

d
+
+ """ + result = extract(htmlstring, fast=True, output_format='markdown', config=ZERO_CONFIG, + include_images=True, include_tables=True) + assert result == "| a | b | c |\n| ![img1](http://aa.bb/c.jpg) a ![img2](http://aa.bb/c.jpg) | b c | d |" + def test_list_processing(): options = DEFAULT_OPTIONS diff --git a/trafilatura/main_extractor.py b/trafilatura/main_extractor.py index 4af74329..a3b1d481 100644 --- a/trafilatura/main_extractor.py +++ b/trafilatura/main_extractor.py @@ -18,7 +18,7 @@ link_density_test_tables, process_node, prune_unwanted_nodes) from .settings import TAG_CATALOG, Extractor -from .utils import FORMATTING_PROTECTED, is_image_file, text_chars_test, trim +from .utils import FORMATTING_PROTECTED, copy_attributes, is_image_file, text_chars_test, trim from .xml import delete_element from .xpaths import (BODY_XPATH, COMMENTS_DISCARD_XPATH, COMMENTS_XPATH, DISCARD_IMAGE_ELEMENTS, OVERALL_DISCARD_XPATH, @@ -157,6 +157,8 @@ def define_newelem(processed_elem: _Element, orig_elem: _Element) -> None: if processed_elem is not None: childelem = SubElement(orig_elem, processed_elem.tag) childelem.text, childelem.tail = processed_elem.text, processed_elem.tail + if processed_elem.tag == 'graphic': + copy_attributes(childelem, processed_elem) def handle_lists(element: _Element, options: Extractor) -> Optional[_Element]: @@ -488,6 +490,7 @@ def handle_image(element: Optional[_Element], options: Optional[Extractor] = Non link = re.sub(r"^//", "http://", link) processed_element.set("src", link) + processed_element.tail = element.tail return processed_element diff --git a/trafilatura/utils.py b/trafilatura/utils.py index bd1eee7c..84fcfd3c 100644 --- a/trafilatura/utils.py +++ b/trafilatura/utils.py @@ -454,3 +454,18 @@ def text_chars_test(string: Optional[str]) -> bool: # or not re.search(r'\w', string) # return string is not None and len(string) != 0 and not string.isspace() return bool(string) and not string.isspace() # type: ignore[union-attr] + + +def copy_attributes(dest_elem: _Element, src_elem: _Element) -> None: + '''Copy attributes from src element to dest element''' + for key in src_elem.keys(): + dest_elem.set(key, src_elem.attrib[key]) + + +def is_in_table_cell(elem: _Element) -> bool: + '''Check whether an element is in a table cell''' + while elem is not None: + if elem.tag == 'cell': + return True + elem = elem.getparent() + return False diff --git a/trafilatura/xml.py b/trafilatura/xml.py index f5e6c57e..a31e70da 100644 --- a/trafilatura/xml.py +++ b/trafilatura/xml.py @@ -17,7 +17,7 @@ fromstring, tostring, DTD) from .settings import Document, Extractor -from .utils import sanitize, sanitize_tree, text_chars_test +from .utils import is_in_table_cell, sanitize, sanitize_tree, text_chars_test LOGGER = logging.getLogger(__name__) @@ -288,12 +288,8 @@ def replace_element_text(element: _Element, include_formatting: bool) -> str: if element.tag == "cell": elem_text = elem_text.strip() - if elem_text and len(element) > 0: - if element[0].tag == 'p': - elem_text = f"{elem_text} " if element.getprevious() is not None else f"| {elem_text} " - elif elem_text: - # add | before first cell - elem_text = f"{elem_text}" if element.getprevious() is not None else f"| {elem_text}" + if elem_text: + elem_text = f"{elem_text} " # lists elif element.tag == "item" and elem_text: elem_text = f"- {elem_text}\n" @@ -302,18 +298,29 @@ def replace_element_text(element: _Element, include_formatting: bool) -> str: def process_element(element: _Element, returnlist: List[str], include_formatting: bool) -> None: "Recursively convert a LXML element and its children to a flattened string representation." + if element.tag == 'cell' and element.getprevious() is None: + returnlist.append('| ') + if element.text: # this is the text that comes before the first child returnlist.append(replace_element_text(element, include_formatting)) + if element.tail and element.tag != 'graphic' and is_in_table_cell(element): + # if element is in table cell, append tail after element text when element is not graphic since we deal with + # graphic tail alone, textless elements like lb should be processed here too, otherwise process tail at the end + returnlist.append(element.tail.strip()) + for child in element: process_element(child, returnlist, include_formatting) - if not element.text and not element.tail: + if not element.text: if element.tag == "graphic": # add source, default to '' text = f'{element.get("title", "")} {element.get("alt", "")}' returnlist.append(f'![{text.strip()}]({element.get("src", "")})') + + if element.tail: + returnlist.append(f' {element.tail.strip()}') # newlines for textless elements elif element.tag in NEWLINE_ELEMS: # add line after table head @@ -350,8 +357,8 @@ def process_element(element: _Element, returnlist: List[str], include_formatting returnlist.append(" ") # this is text that comes after the closing tag, so it should be after any NEWLINE_ELEMS - if element.tail: - returnlist.append(element.tail.strip() if element.tag == 'cell' else element.tail) + if element.tail and not is_in_table_cell(element): + returnlist.append(element.tail) def xmltotxt(xmloutput: Optional[_Element], include_formatting: bool) -> str: