diff --git a/tests/unit_tests.py b/tests/unit_tests.py
index ac8e60b3..b43362c8 100644
--- a/tests/unit_tests.py
+++ b/tests/unit_tests.py
@@ -1286,6 +1286,70 @@ def test_table_processing():
result = extract(htmlstring, fast=True, output_format='txt', config=ZERO_CONFIG, include_tables=True)
assert result == "| a | b | c |\n| a | b c | |\n| a | b c | |"
+ htmlstring = """
+
+
+ a | b | c |
+
+ aa |
+ b c |
+ d |
+
+
+
+ """
+ result = extract(htmlstring, fast=True, output_format='markdown', config=ZERO_CONFIG,
+ include_images=True, include_tables=True)
+ assert result == "| a | b | c |\n| a ![img](http://aa.bb/c.jpg) a | b c | d |"
+
+ htmlstring = """
+
+
+ a | b | c |
+
+ a |
+ b c |
+ d |
+
+
+
+ """
+ result = extract(htmlstring, fast=True, output_format='markdown', config=ZERO_CONFIG,
+ include_images=True, include_tables=True)
+ assert result == "| a | b | c |\n| ![img](http://aa.bb/c.jpg) a | b c | d |"
+
+ htmlstring = """
+
+
+ a | b | c |
+
+ a |
+ b c |
+ d |
+
+
+
+ """
+ result = extract(htmlstring, fast=True, output_format='markdown', config=ZERO_CONFIG,
+ include_images=True, include_tables=True)
+ assert result == "| a | b | c |\n| ![img](http://aa.bb/c.jpg) a | b c | d |"
+
+ htmlstring = """
+
+
+ a | b | c |
+
+ a |
+ b c |
+ d |
+
+
+
+ """
+ result = extract(htmlstring, fast=True, output_format='markdown', config=ZERO_CONFIG,
+ include_images=True, include_tables=True)
+ assert result == "| a | b | c |\n| ![img1](http://aa.bb/c.jpg) a ![img2](http://aa.bb/c.jpg) | b c | d |"
+
def test_list_processing():
options = DEFAULT_OPTIONS
diff --git a/trafilatura/main_extractor.py b/trafilatura/main_extractor.py
index 4af74329..a3b1d481 100644
--- a/trafilatura/main_extractor.py
+++ b/trafilatura/main_extractor.py
@@ -18,7 +18,7 @@
link_density_test_tables, process_node,
prune_unwanted_nodes)
from .settings import TAG_CATALOG, Extractor
-from .utils import FORMATTING_PROTECTED, is_image_file, text_chars_test, trim
+from .utils import FORMATTING_PROTECTED, copy_attributes, is_image_file, text_chars_test, trim
from .xml import delete_element
from .xpaths import (BODY_XPATH, COMMENTS_DISCARD_XPATH, COMMENTS_XPATH,
DISCARD_IMAGE_ELEMENTS, OVERALL_DISCARD_XPATH,
@@ -157,6 +157,8 @@ def define_newelem(processed_elem: _Element, orig_elem: _Element) -> None:
if processed_elem is not None:
childelem = SubElement(orig_elem, processed_elem.tag)
childelem.text, childelem.tail = processed_elem.text, processed_elem.tail
+ if processed_elem.tag == 'graphic':
+ copy_attributes(childelem, processed_elem)
def handle_lists(element: _Element, options: Extractor) -> Optional[_Element]:
@@ -488,6 +490,7 @@ def handle_image(element: Optional[_Element], options: Optional[Extractor] = Non
link = re.sub(r"^//", "http://", link)
processed_element.set("src", link)
+ processed_element.tail = element.tail
return processed_element
diff --git a/trafilatura/utils.py b/trafilatura/utils.py
index bd1eee7c..84fcfd3c 100644
--- a/trafilatura/utils.py
+++ b/trafilatura/utils.py
@@ -454,3 +454,18 @@ def text_chars_test(string: Optional[str]) -> bool:
# or not re.search(r'\w', string)
# return string is not None and len(string) != 0 and not string.isspace()
return bool(string) and not string.isspace() # type: ignore[union-attr]
+
+
+def copy_attributes(dest_elem: _Element, src_elem: _Element) -> None:
+    '''Copy every attribute of src_elem onto dest_elem, overwriting values already set under the same name.'''
+    for name, value in src_elem.items():
+        dest_elem.set(name, value)
+
+
+def is_in_table_cell(elem: _Element) -> bool:
+    '''Check whether an element is a table cell itself or is nested inside one.'''
+    # climb from elem through its ancestors; a separate Optional local keeps the parameter's annotation truthful
+    node: Optional[_Element] = elem
+    while node is not None and node.tag != 'cell':
+        node = node.getparent()
+    return node is not None
diff --git a/trafilatura/xml.py b/trafilatura/xml.py
index f5e6c57e..a31e70da 100644
--- a/trafilatura/xml.py
+++ b/trafilatura/xml.py
@@ -17,7 +17,7 @@
fromstring, tostring, DTD)
from .settings import Document, Extractor
-from .utils import sanitize, sanitize_tree, text_chars_test
+from .utils import is_in_table_cell, sanitize, sanitize_tree, text_chars_test
LOGGER = logging.getLogger(__name__)
@@ -288,12 +288,8 @@ def replace_element_text(element: _Element, include_formatting: bool) -> str:
if element.tag == "cell":
elem_text = elem_text.strip()
- if elem_text and len(element) > 0:
- if element[0].tag == 'p':
- elem_text = f"{elem_text} " if element.getprevious() is not None else f"| {elem_text} "
- elif elem_text:
- # add | before first cell
- elem_text = f"{elem_text}" if element.getprevious() is not None else f"| {elem_text}"
+ if elem_text:
+ elem_text = f"{elem_text} "
# lists
elif element.tag == "item" and elem_text:
elem_text = f"- {elem_text}\n"
@@ -302,18 +298,29 @@ def replace_element_text(element: _Element, include_formatting: bool) -> str:
def process_element(element: _Element, returnlist: List[str], include_formatting: bool) -> None:
"Recursively convert a LXML element and its children to a flattened string representation."
+ if element.tag == 'cell' and element.getprevious() is None:
+ returnlist.append('| ')
+
if element.text:
# this is the text that comes before the first child
returnlist.append(replace_element_text(element, include_formatting))
+ if element.tail and element.tag != 'graphic' and is_in_table_cell(element):
+        # Inside a table cell, emit the stripped tail right after the element's own text; this also covers textless
+        # elements such as lb. Graphic tails are emitted with the image markup below, other tails at the end.
+ returnlist.append(element.tail.strip())
+
for child in element:
process_element(child, returnlist, include_formatting)
- if not element.text and not element.tail:
+ if not element.text:
if element.tag == "graphic":
# add source, default to ''
text = f'{element.get("title", "")} {element.get("alt", "")}'
returnlist.append(f'![{text.strip()}]({element.get("src", "")})')
+
+ if element.tail:
+ returnlist.append(f' {element.tail.strip()}')
# newlines for textless elements
elif element.tag in NEWLINE_ELEMS:
# add line after table head
@@ -350,8 +357,8 @@ def process_element(element: _Element, returnlist: List[str], include_formatting
returnlist.append(" ")
# this is text that comes after the closing tag, so it should be after any NEWLINE_ELEMS
- if element.tail:
- returnlist.append(element.tail.strip() if element.tag == 'cell' else element.tail)
+ if element.tail and not is_in_table_cell(element):
+ returnlist.append(element.tail)
def xmltotxt(xmloutput: Optional[_Element], include_formatting: bool) -> str: