From 12c53781a374afab2eeca7d5da03e65b8dd8d5e6 Mon Sep 17 00:00:00 2001
From: CodyInnowhere <lostcody@CodyInnowheredeMacBook-Pro.local>
Date: Thu, 28 Nov 2024 17:23:06 +0800
Subject: [PATCH 1/3] refine table markdown output

---
 trafilatura/xml.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)
diff --git a/trafilatura/xml.py b/trafilatura/xml.py
index 646c8426..2c4ca80e 100644
--- a/trafilatura/xml.py
+++ b/trafilatura/xml.py
@@ -288,6 +288,9 @@ def replace_element_text(element: _Element, include_formatting: bool) -> str:
     if element.tag == "cell" and elem_text and len(element) > 0:
         if element[0].tag == 'p':
             elem_text = f"{elem_text} "
+    elif element.tag == 'cell' and elem_text:
+        # add | before first cell
+        elem_text = f"{elem_text}" if element.getprevious() is not None else f"| {elem_text}"
     # lists
     elif element.tag == "item" and elem_text:
         elem_text = f"- {elem_text}\n"
@@ -324,7 +327,7 @@ def process_element(element: _Element, returnlist: List[str], include_formatting
                     returnlist.append(f'{"|" * (max_span - cell_count)}\n')
                 # if this is a head row, draw the separator below
                 if element.xpath("./cell[@role='head']"):
-                    returnlist.append(f'\n{"---|" * max_span}\n')
+                    returnlist.append(f'\n|{"---|" * max_span}\n')
             else:
                 returnlist.append("\n")
         elif element.tag != "cell":
@@ -337,7 +340,7 @@ def process_element(element: _Element, returnlist: List[str], include_formatting
     # Common elements (Now processes end-tag logic correctly)
     if element.tag in NEWLINE_ELEMS and not element.xpath("ancestor::cell"):
         # spacing hack
-        returnlist.append("\n\u2424\n" if include_formatting else "\n")
+        returnlist.append("\n\u2424\n" if include_formatting and element.tag != 'row' else "\n")
     elif element.tag == "cell":
         returnlist.append(" | ")
     elif element.tag not in SPECIAL_FORMATTING:

From 17d79192315fbf0178d4adfff325e00f61c4870c Mon Sep 17 00:00:00 2001
From: CodyInnowhere <lostcody@CodyInnowheredeMacBook-Pro.local>
Date: Fri, 29 Nov 2024 11:19:19 +0800
Subject: [PATCH 2/3] fix unit tests for leading pipe in table output

---
 tests/unit_tests.py | 18 +++++++++---------
 trafilatura/xml.py  |  2 +-
 2 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/tests/unit_tests.py b/tests/unit_tests.py
index 00046804..e7c12dc0 100644
--- a/tests/unit_tests.py
+++ b/tests/unit_tests.py
@@ -1022,7 +1022,7 @@ def test_table_processing():
         <cell>you buy</cell>
         <cell>they buy</cell>
       </row>''' in my_result
-    assert extract(htmlstring, fast=True, output_format='txt').startswith("Present Tense | I buy | you buy |")
+    assert extract(htmlstring, fast=True, output_format='txt').startswith("| Present Tense | I buy | you buy |")
     # table with links
     # todo: further tests and adjustments
     htmlstring = '<html><body><article><table><tr><td><a href="test.html">' + 'ABCD'*100 + '</a></td></tr></table></article></body></html>'
@@ -1112,12 +1112,12 @@ def test_table_processing():
     assert "1" in result and "2" in result
     # table headers in non-XML formats
     htmlstring = '<html><body><article><table><tr><th>head 1</th><th>head 2</th></tr><tr><td>1</td><td>2</td></tr></table></article></body></html>'
-    assert "---|---|" in extract(htmlstring, fast=True, output_format='txt', config=ZERO_CONFIG, include_tables=True)
+    assert "|---|---|" in extract(htmlstring, fast=True, output_format='txt', config=ZERO_CONFIG, include_tables=True)
 
     # remove new lines in table cells in text format
     htmlstring = '<html><body><article><table><tr><td>cell<br>1</td><td>cell<p>2</p></td></tr></table></article></body></html>'
     result = extract(htmlstring, fast=True, output_format='txt', config=ZERO_CONFIG, include_tables=True)
-    assert "cell 1 | cell 2 |" in result
+    assert "| cell 1 | cell 2 |" in result
 
     # only one header row is allowed in text format
     htmlstring = '<html><body><article><table><tr><th>a</th><th>b</th></tr><tr><th>c</th><th>d</th></tr></table></article></body></html>'
@@ -1127,15 +1127,15 @@ def test_table_processing():
     # handle colspan by appending columns in text format
     htmlstring = '<html><body><article><table><tr><td colspan="2">a</td><td>b</td></tr><tr><td>c</td><td>d</td><td>e</td></tr></table></article></body></html>'
     result = extract(htmlstring, fast=True, output_format='txt', config=ZERO_CONFIG, include_tables=True)
-    assert "a | b | |" in result
+    assert "| a | b | |" in result
 
     htmlstring = '<html><body><article><table><tr><td span="2">a</td><td>b</td></tr><tr><td>c</td><td>d</td><td>e</td></tr></table></article></body></html>'
     result = extract(htmlstring, fast=True, output_format='txt', config=ZERO_CONFIG, include_tables=True)
-    assert "a | b | |" in result
+    assert "| a | b | |" in result
 
     htmlstring = '<html><body><article><table><tr><td span="2.1">a</td><td>b</td></tr><tr><td>c</td><td>d</td><td>e</td></tr></table></article></body></html>'
     result = extract(htmlstring, fast=True, output_format='txt', config=ZERO_CONFIG, include_tables=True)
-    assert "a | b | |" in result
+    assert "| a | b | |" in result
 
     # MemoryError: https://github.com/adbar/trafilatura/issues/657
     htmlstring = '<html><body><article><table><tr><td colspan="9007199254740991">a</td><td>b</td></tr><tr><td>c</td><td>d</td><td>e</td></tr></table></article></body></html>'
@@ -1149,16 +1149,16 @@ def test_table_processing():
     # wrong span info
     htmlstring = '<html><body><article><table><tr><td span="-1">a</td><td>b</td></tr><tr><td>c</td><td>d</td><td>e</td></tr></table></article></body></html>'
     result = extract(htmlstring, fast=True, output_format='txt', config=ZERO_CONFIG, include_tables=True)
-    assert "a | b | |" in result
+    assert "| a | b | |" in result
 
     htmlstring = '<html><body><article><table><tr><td span="abc">a</td><td>b</td></tr><tr><td>c</td><td>d</td><td>e</td></tr></table></article></body></html>'
     result = extract(htmlstring, fast=True, output_format='txt', config=ZERO_CONFIG, include_tables=True)
-    assert "a | b | |" in result
+    assert "| a | b | |" in result
 
     # links: this gets through (for now)
     htmlstring = '<html><body><article><table><tr><td><a href="link.html">a</a></td></tr></table></article></body></html>'
     result = extract(htmlstring, fast=True, output_format='txt', config=ZERO_CONFIG, include_tables=True)
-    assert result == "a |"
+    assert result == "| a |"
 
     # link: this is filtered out
     htmlstring = f'<html><body><article><table><tr><td><a href="link.html">{"abc"*100}</a></td></tr></table></article></body></html>'
diff --git a/trafilatura/xml.py b/trafilatura/xml.py
index 2c4ca80e..953a5f9f 100644
--- a/trafilatura/xml.py
+++ b/trafilatura/xml.py
@@ -287,7 +287,7 @@ def replace_element_text(element: _Element, include_formatting: bool) -> str:
     # cells
     if element.tag == "cell" and elem_text and len(element) > 0:
         if element[0].tag == 'p':
-            elem_text = f"{elem_text} "
+            elem_text = f"{elem_text} " if element.getprevious() is not None else f"| {elem_text} "
     elif element.tag == 'cell' and elem_text:
         # add | before first cell
         elem_text = f"{elem_text}" if element.getprevious() is not None else f"| {elem_text}"

From e466953d7d56a94464df2d4e3b7e5268700b9df8 Mon Sep 17 00:00:00 2001
From: CodyInnowhere <lostcody@CodyInnowheredeMacBook-Pro.local>
Date: Mon, 2 Dec 2024 14:53:15 +0800
Subject: [PATCH 3/3] extract image in textnode

---
 tests/unit_tests.py           |  7 +++++++
 trafilatura/htmlprocessing.py |  4 +++-
 trafilatura/main_extractor.py | 10 +++++++++-
 trafilatura/utils.py          | 14 ++++++++++++++
 4 files changed, 33 insertions(+), 2 deletions(-)

diff --git a/tests/unit_tests.py b/tests/unit_tests.py
index 9990a107..7e9beae7 100644
--- a/tests/unit_tests.py
+++ b/tests/unit_tests.py
@@ -483,6 +483,7 @@ def test_images():
     assert is_image_file('test.txt') is False
     assert is_image_file('test.jpg'*2000) is False  # length threshold
     # tag with attributes
+    assert handle_image(None) is None
     assert handle_image(html.fromstring('<img src="test.jpg"/>')) is not None
     assert handle_image(html.fromstring('<img data-src="test.jpg" alt="text" title="a title"/>')) is not None
     assert handle_image(html.fromstring('<img other="test.jpg"/>')) is None
@@ -494,6 +495,12 @@ def test_images():
     assert '![Example image](test.jpg)' in extract(teststring, include_images=True, fast=True)
     assert '<graphic src="test.jpg" title="Example image"/>' in extract(teststring, include_images=True, fast=True, output_format='xml', config=ZERO_CONFIG)
     assert extract('<html><body><article><img data-src="test.jpg" alt="text" title="a title"/></article></body></html>', include_images=True, fast=True) == '![a title text](test.jpg)'
+    assert extract('<html><body><article><p><img data-src="test.jpg" alt="text" title="a title"/></p></article></body></html>', include_images=True, fast=True) == '![a title text](test.jpg)'
+    assert extract('<html><body><article><p><img other="test.jpg" alt="text" title="a title"/></p></article></body></html>', include_images=True, fast=True) == ''
+    assert extract('<html><body><article><div><p><img data-src="test.jpg" alt="text" title="a title"/></p></div></article></body></html>', include_images=True, fast=True) == '![a title text](test.jpg)'
+    assert extract('<html><body><article><div><p><img data-src-small="test.jpg" alt="text" title="a title"/></p></div></article></body></html>', include_images=True, fast=True) == '![a title text](test.jpg)'
+
+    assert handle_image(html.fromstring('<img src="data:image/jpeg;base64,iVBORw0KGgoAAAANSUhEUgAAAAUAAAAFCAYAAACNbyblAAAAHElEQVQI12P4//8/w38GIAXDIBKE0DHxgljNBAAO9TXL0Y4OHwAAAABJRU5ErkJggg==" alt="text"></img>')) is None
 
     # CNN example
     mydoc = html.fromstring('<img class="media__image media__image--responsive" alt="Harry and Meghan last March, in their final royal engagement." data-src-mini="//cdn.cnn.com/cnnnext/dam/assets/210307091919-harry-meghan-commonwealth-day-small-169.jpg" data-src-xsmall="//cdn.cnn.com/cnnnext/dam/assets/210307091919-harry-meghan-commonwealth-day-medium-plus-169.jpg" data-src-small="//cdn.cnn.com/cnnnext/dam/assets/210307091919-harry-meghan-commonwealth-day-large-169.jpg" data-src-medium="//cdn.cnn.com/cnnnext/dam/assets/210307091919-harry-meghan-commonwealth-day-exlarge-169.jpg" data-src-large="//cdn.cnn.com/cnnnext/dam/assets/210307091919-harry-meghan-commonwealth-day-super-169.jpg" data-src-full16x9="//cdn.cnn.com/cnnnext/dam/assets/210307091919-harry-meghan-commonwealth-day-full-169.jpg" data-src-mini1x1="//cdn.cnn.com/cnnnext/dam/assets/210307091919-harry-meghan-commonwealth-day-small-11.jpg" data-demand-load="loaded" data-eq-pts="mini: 0, xsmall: 221, small: 308, medium: 461, large: 781" src="//cdn.cnn.com/cnnnext/dam/assets/210307091919-harry-meghan-commonwealth-day-exlarge-169.jpg" data-eq-state="mini xsmall small medium" data-src="//cdn.cnn.com/cnnnext/dam/assets/210307091919-harry-meghan-commonwealth-day-exlarge-169.jpg">')
diff --git a/trafilatura/htmlprocessing.py b/trafilatura/htmlprocessing.py
index af855ee2..d78734fe 100644
--- a/trafilatura/htmlprocessing.py
+++ b/trafilatura/htmlprocessing.py
@@ -20,7 +20,7 @@
     MANUALLY_CLEANED,
     MANUALLY_STRIPPED,
 )
-from .utils import textfilter, trim
+from .utils import textfilter, trim, is_image_element
 from .xml import META_ATTRIBUTES, delete_element
 
 
@@ -226,6 +226,8 @@ def handle_textnode(
     preserve_spaces: bool = False,
 ) -> Optional[_Element]:
     "Convert, format, and probe potential text elements."
+    if elem.tag == "graphic" and is_image_element(elem):
+        return elem
     if elem.tag == "done" or (len(elem) == 0 and not elem.text and not elem.tail):
         return None
 
diff --git a/trafilatura/main_extractor.py b/trafilatura/main_extractor.py
index eb50338e..2a950bec 100644
--- a/trafilatura/main_extractor.py
+++ b/trafilatura/main_extractor.py
@@ -331,6 +331,11 @@ def handle_paragraphs(element: _Element, potential_tags: Set[str], options: Extr
             #    else:
             #        newsub.tail = processed_child.text
             newsub.text, newsub.tail = processed_child.text, processed_child.tail
+
+            if processed_child.tag == 'graphic':
+                image_elem = handle_image(processed_child)
+                if image_elem is not None:
+                    newsub = image_elem
             processed_element.append(newsub)
         child.tag = "done"
     # finish
@@ -437,8 +442,11 @@ def handle_table(table_elem: _Element, potential_tags: Set[str], options: Extrac
     return None
 
 
-def handle_image(element: _Element) -> Optional[_Element]:
+def handle_image(element: Optional[_Element]) -> Optional[_Element]:
     "Process image elements and their relevant attributes."
+    if element is None:
+        return None
+
     processed_element = Element(element.tag)
 
     for attr in ("data-src", "src"):
diff --git a/trafilatura/utils.py b/trafilatura/utils.py
index 7db53889..8cb09793 100644
--- a/trafilatura/utils.py
+++ b/trafilatura/utils.py
@@ -346,6 +346,20 @@ def trim(string: str) -> str:
         return ""
 
 
+def is_image_element(element: _Element) -> bool:
+    '''Check if an element carries a usable image source: "data-src" and
+       "src" are probed first, then any other "data-src*" attribute.'''
+    # is_image_file() decides whether an attribute value counts as an image
+    if any(is_image_file(element.get(attr, "")) for attr in ("data-src", "src")):
+        return True
+    # otherwise accept the first attribute whose name starts with "data-src"
+    # (e.g. "data-src-small") and whose value passes the same check
+    return any(
+        attr.startswith("data-src") and is_image_file(value)
+        for attr, value in element.attrib.items()
+    )
+
+
 def is_image_file(imagesrc: Optional[str]) -> bool:
     '''Check if the observed string corresponds to a valid image extension.
        Use a length threshold and apply a regex on the content.'''