more robust table extraction

fix type check
adbar · Dec 13, 2024 · c510ac5 · c510ac5
1 parent b010779
commit c510ac5
Show file tree

Hide file tree

Showing 4 changed files with 100 additions and 11 deletions.
diff --git a/tests/unit_tests.py b/tests/unit_tests.py
@@ -1286,6 +1286,70 @@ def test_table_processing():
     result = extract(htmlstring, fast=True, output_format='txt', config=ZERO_CONFIG, include_tables=True)
     assert result == "| a | b | c |\n| a | b c | |\n| a | b c | |"
 
+    htmlstring = """
+                 <html><body><article>
+                 <table>
+                 <tr><td>a</td><td>b</td><td>c</td></tr>
+                 <tr>
+                    <td>a<img src="http://aa.bb/c.jpg" alt="img"/><span>a</span></td>
+                    <td><p>b</p><p>c</p></td>
+                    <td>d</td>
+                 </tr>
+                 </table>
+                 </article></body></html>
+                 """
+    result = extract(htmlstring, fast=True, output_format='markdown', config=ZERO_CONFIG,
+                     include_images=True, include_tables=True)
+    assert result == "| a | b | c |\n| a ![img](http://aa.bb/c.jpg) a | b c | d |"
+
+    htmlstring = """
+                 <html><body><article>
+                 <table>
+                 <tr><td>a</td><td>b</td><td>c</td></tr>
+                 <tr>
+                    <td><a href="http://aa.bb/"><img src="http://aa.bb/c.jpg" alt="img"/><span>a</span></a></td>
+                    <td><p>b</p><p>c</p></td>
+                    <td>d</td>
+                 </tr>
+                 </table>
+                 </article></body></html>
+                 """
+    result = extract(htmlstring, fast=True, output_format='markdown', config=ZERO_CONFIG,
+                     include_images=True, include_tables=True)
+    assert result == "| a | b | c |\n| ![img](http://aa.bb/c.jpg) a | b c | d |"
+
+    htmlstring = """
+                 <html><body><article>
+                 <table>
+                 <tr><td>a</td><td>b</td><td>c</td></tr>
+                 <tr>
+                    <td><img src="http://aa.bb/c.jpg" alt="img"/><span>a</span></td>
+                    <td><p>b</p><p>c</p></td>
+                    <td>d</td>
+                 </tr>
+                 </table>
+                 </article></body></html>
+                 """
+    result = extract(htmlstring, fast=True, output_format='markdown', config=ZERO_CONFIG,
+                     include_images=True, include_tables=True)
+    assert result == "| a | b | c |\n| ![img](http://aa.bb/c.jpg) a | b c | d |"
+
+    htmlstring = """
+                 <html><body><article>
+                 <table>
+                 <tr><td>a</td><td>b</td><td>c</td></tr>
+                 <tr>
+                    <td><img src="http://aa.bb/c.jpg" alt="img1"/><span>a</span><img src="http://aa.bb/c.jpg" alt="img2"/></td>
+                    <td><p>b</p><p>c</p></td>
+                    <td>d</td>
+                 </tr>
+                 </table>
+                 </article></body></html>
+                 """
+    result = extract(htmlstring, fast=True, output_format='markdown', config=ZERO_CONFIG,
+                     include_images=True, include_tables=True)
+    assert result == "| a | b | c |\n| ![img1](http://aa.bb/c.jpg) a ![img2](http://aa.bb/c.jpg) | b c | d |"
+
 
 def test_list_processing():
     options = DEFAULT_OPTIONS

diff --git a/trafilatura/main_extractor.py b/trafilatura/main_extractor.py
@@ -18,7 +18,7 @@
                              link_density_test_tables, process_node,
                              prune_unwanted_nodes)
 from .settings import TAG_CATALOG, Extractor
-from .utils import FORMATTING_PROTECTED, is_image_file, text_chars_test, trim
+from .utils import FORMATTING_PROTECTED, copy_attributes, is_image_file, text_chars_test, trim
 from .xml import delete_element
 from .xpaths import (BODY_XPATH, COMMENTS_DISCARD_XPATH, COMMENTS_XPATH,
                      DISCARD_IMAGE_ELEMENTS, OVERALL_DISCARD_XPATH,
@@ -157,6 +157,8 @@ def define_newelem(processed_elem: _Element, orig_elem: _Element) -> None:
     if processed_elem is not None:
         childelem = SubElement(orig_elem, processed_elem.tag)
         childelem.text, childelem.tail = processed_elem.text, processed_elem.tail
+        if processed_elem.tag == 'graphic':
+            copy_attributes(childelem, processed_elem)
 
 
 def handle_lists(element: _Element, options: Extractor) -> Optional[_Element]:
@@ -488,6 +490,7 @@ def handle_image(element: Optional[_Element], options: Optional[Extractor] = Non
             link = re.sub(r"^//", "http://", link)
         processed_element.set("src", link)
 
+    processed_element.tail = element.tail
     return processed_element
 
 

diff --git a/trafilatura/utils.py b/trafilatura/utils.py
@@ -454,3 +454,18 @@ def text_chars_test(string: Optional[str]) -> bool:
     # or not re.search(r'\w', string)
     # return string is not None and len(string) != 0 and not string.isspace()
     return bool(string) and not string.isspace()  # type: ignore[union-attr]
+
+
+def copy_attributes(dest_elem: _Element, src_elem: _Element) -> None:
+    '''Copy every attribute of src_elem to dest_elem, setting each key on dest_elem to the value from src_elem.'''
+    for key in src_elem.keys():
+        dest_elem.set(key, src_elem.attrib[key])
+
+
+def is_in_table_cell(elem: _Element) -> bool:
+    '''Check whether an element is itself a "cell" element or has a "cell" element among its ancestors.'''
+    while elem is not None:  # climb the parent chain, starting with elem itself
+        if elem.tag == 'cell':
+            return True
+        elem = elem.getparent()
+    return False
diff --git a/trafilatura/xml.py b/trafilatura/xml.py
@@ -17,7 +17,7 @@
                         fromstring, tostring, DTD)
 
 from .settings import Document, Extractor
-from .utils import sanitize, sanitize_tree, text_chars_test
+from .utils import is_in_table_cell, sanitize, sanitize_tree, text_chars_test
 
 
 LOGGER = logging.getLogger(__name__)
@@ -288,12 +288,8 @@ def replace_element_text(element: _Element, include_formatting: bool) -> str:
     if element.tag == "cell":
         elem_text = elem_text.strip()
 
-        if elem_text and len(element) > 0:
-            if element[0].tag == 'p':
-                elem_text = f"{elem_text} " if element.getprevious() is not None else f"| {elem_text} "
-        elif elem_text:
-            # add | before first cell
-            elem_text = f"{elem_text}" if element.getprevious() is not None else f"| {elem_text}"
+        if elem_text:
+            elem_text = f"{elem_text} "
     # lists
     elif element.tag == "item" and elem_text:
         elem_text = f"- {elem_text}\n"
@@ -302,18 +298,29 @@ def replace_element_text(element: _Element, include_formatting: bool) -> str:
 
 def process_element(element: _Element, returnlist: List[str], include_formatting: bool) -> None:
     "Recursively convert a LXML element and its children to a flattened string representation."
+    if element.tag == 'cell' and element.getprevious() is None:
+        returnlist.append('| ')
+
     if element.text:
         # this is the text that comes before the first child
         returnlist.append(replace_element_text(element, include_formatting))
 
+    if element.tail and element.tag != 'graphic' and is_in_table_cell(element):
+        # Inside a table cell, append the tail right after the element's own text (this includes textless elements
+        # such as lb); graphic tails are handled with the graphic itself, and tails outside cells are appended last.
+        returnlist.append(element.tail.strip())
+
     for child in element:
         process_element(child, returnlist, include_formatting)
 
-    if not element.text and not element.tail:
+    if not element.text:
         if element.tag == "graphic":
             # add source, default to ''
             text = f'{element.get("title", "")} {element.get("alt", "")}'
             returnlist.append(f'![{text.strip()}]({element.get("src", "")})')
+
+            if element.tail:
+                returnlist.append(f' {element.tail.strip()}')
         # newlines for textless elements
         elif element.tag in NEWLINE_ELEMS:
             # add line after table head
@@ -350,8 +357,8 @@ def process_element(element: _Element, returnlist: List[str], include_formatting
         returnlist.append(" ")
 
     # this is text that comes after the closing tag, so it should be after any NEWLINE_ELEMS
-    if element.tail:
-        returnlist.append(element.tail.strip() if element.tag == 'cell' else element.tail)
+    if element.tail and not is_in_table_cell(element):
+        returnlist.append(element.tail)
 
 
 def xmltotxt(xmloutput: Optional[_Element], include_formatting: bool) -> str: