Skip to content

Commit

Permalink
more robust table extraction
Browse files Browse the repository at this point in the history
fix type check
  • Loading branch information
CodyInnowhere authored and CodyInnowhere committed Dec 13, 2024
1 parent b010779 commit c510ac5
Show file tree
Hide file tree
Showing 4 changed files with 100 additions and 11 deletions.
64 changes: 64 additions & 0 deletions tests/unit_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -1286,6 +1286,70 @@ def test_table_processing():
result = extract(htmlstring, fast=True, output_format='txt', config=ZERO_CONFIG, include_tables=True)
assert result == "| a | b | c |\n| a | b c | |\n| a | b c | |"

htmlstring = """
<html><body><article>
<table>
<tr><td>a</td><td>b</td><td>c</td></tr>
<tr>
<td>a<img src="http://aa.bb/c.jpg" alt="img"/><span>a</span></td>
<td><p>b</p><p>c</p></td>
<td>d</td>
</tr>
</table>
</article></body></html>
"""
result = extract(htmlstring, fast=True, output_format='markdown', config=ZERO_CONFIG,
include_images=True, include_tables=True)
assert result == "| a | b | c |\n| a ![img](http://aa.bb/c.jpg) a | b c | d |"

htmlstring = """
<html><body><article>
<table>
<tr><td>a</td><td>b</td><td>c</td></tr>
<tr>
<td><a href="http://aa.bb/"><img src="http://aa.bb/c.jpg" alt="img"/><span>a</span></a></td>
<td><p>b</p><p>c</p></td>
<td>d</td>
</tr>
</table>
</article></body></html>
"""
result = extract(htmlstring, fast=True, output_format='markdown', config=ZERO_CONFIG,
include_images=True, include_tables=True)
assert result == "| a | b | c |\n| ![img](http://aa.bb/c.jpg) a | b c | d |"

htmlstring = """
<html><body><article>
<table>
<tr><td>a</td><td>b</td><td>c</td></tr>
<tr>
<td><img src="http://aa.bb/c.jpg" alt="img"/><span>a</span></td>
<td><p>b</p><p>c</p></td>
<td>d</td>
</tr>
</table>
</article></body></html>
"""
result = extract(htmlstring, fast=True, output_format='markdown', config=ZERO_CONFIG,
include_images=True, include_tables=True)
assert result == "| a | b | c |\n| ![img](http://aa.bb/c.jpg) a | b c | d |"

htmlstring = """
<html><body><article>
<table>
<tr><td>a</td><td>b</td><td>c</td></tr>
<tr>
<td><img src="http://aa.bb/c.jpg" alt="img1"/><span>a</span><img src="http://aa.bb/c.jpg" alt="img2"/></td>
<td><p>b</p><p>c</p></td>
<td>d</td>
</tr>
</table>
</article></body></html>
"""
result = extract(htmlstring, fast=True, output_format='markdown', config=ZERO_CONFIG,
include_images=True, include_tables=True)
assert result == "| a | b | c |\n| ![img1](http://aa.bb/c.jpg) a ![img2](http://aa.bb/c.jpg) | b c | d |"


def test_list_processing():
options = DEFAULT_OPTIONS
Expand Down
5 changes: 4 additions & 1 deletion trafilatura/main_extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
link_density_test_tables, process_node,
prune_unwanted_nodes)
from .settings import TAG_CATALOG, Extractor
from .utils import FORMATTING_PROTECTED, is_image_file, text_chars_test, trim
from .utils import FORMATTING_PROTECTED, copy_attributes, is_image_file, text_chars_test, trim
from .xml import delete_element
from .xpaths import (BODY_XPATH, COMMENTS_DISCARD_XPATH, COMMENTS_XPATH,
DISCARD_IMAGE_ELEMENTS, OVERALL_DISCARD_XPATH,
Expand Down Expand Up @@ -157,6 +157,8 @@ def define_newelem(processed_elem: _Element, orig_elem: _Element) -> None:
if processed_elem is not None:
childelem = SubElement(orig_elem, processed_elem.tag)
childelem.text, childelem.tail = processed_elem.text, processed_elem.tail
if processed_elem.tag == 'graphic':
copy_attributes(childelem, processed_elem)


def handle_lists(element: _Element, options: Extractor) -> Optional[_Element]:
Expand Down Expand Up @@ -488,6 +490,7 @@ def handle_image(element: Optional[_Element], options: Optional[Extractor] = Non
link = re.sub(r"^//", "http://", link)
processed_element.set("src", link)

processed_element.tail = element.tail
return processed_element


Expand Down
15 changes: 15 additions & 0 deletions trafilatura/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -454,3 +454,18 @@ def text_chars_test(string: Optional[str]) -> bool:
# or not re.search(r'\w', string)
# return string is not None and len(string) != 0 and not string.isspace()
return bool(string) and not string.isspace() # type: ignore[union-attr]


def copy_attributes(dest_elem: _Element, src_elem: _Element) -> None:
    '''Transfer every attribute of the source element onto the destination.

    Attributes already present on the destination under the same name are
    overwritten; attributes found only on the destination are left untouched.
    The source element is not modified.
    '''
    for attr_name, attr_value in src_elem.items():
        dest_elem.set(attr_name, attr_value)


def is_in_table_cell(elem: _Element) -> bool:
    '''Tell whether the element is a table cell or is nested inside one.

    Walks from the element itself up through its parents (via getparent())
    and stops at the first node whose tag is 'cell'. Returns False when the
    chain of parents ends without meeting such a node.
    '''
    current = elem
    # climb until a cell is found or the top of the tree is passed
    while current is not None and current.tag != 'cell':
        current = current.getparent()
    return current is not None
27 changes: 17 additions & 10 deletions trafilatura/xml.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
fromstring, tostring, DTD)

from .settings import Document, Extractor
from .utils import sanitize, sanitize_tree, text_chars_test
from .utils import is_in_table_cell, sanitize, sanitize_tree, text_chars_test


LOGGER = logging.getLogger(__name__)
Expand Down Expand Up @@ -288,12 +288,8 @@ def replace_element_text(element: _Element, include_formatting: bool) -> str:
if element.tag == "cell":
elem_text = elem_text.strip()

if elem_text and len(element) > 0:
if element[0].tag == 'p':
elem_text = f"{elem_text} " if element.getprevious() is not None else f"| {elem_text} "
elif elem_text:
# add | before first cell
elem_text = f"{elem_text}" if element.getprevious() is not None else f"| {elem_text}"
if elem_text:
elem_text = f"{elem_text} "
# lists
elif element.tag == "item" and elem_text:
elem_text = f"- {elem_text}\n"
Expand All @@ -302,18 +298,29 @@ def replace_element_text(element: _Element, include_formatting: bool) -> str:

def process_element(element: _Element, returnlist: List[str], include_formatting: bool) -> None:
"Recursively convert a LXML element and its children to a flattened string representation."
if element.tag == 'cell' and element.getprevious() is None:
returnlist.append('| ')

if element.text:
# this is the text that comes before the first child
returnlist.append(replace_element_text(element, include_formatting))

if element.tail and element.tag != 'graphic' and is_in_table_cell(element):
# Inside a table cell, emit the tail right after the element's own text. Graphic elements are excluded
# because their tail is appended together with the image markup below; textless elements such as lb are
# handled here as well. Outside of table cells, the tail is appended at the end of this function.
returnlist.append(element.tail.strip())

for child in element:
process_element(child, returnlist, include_formatting)

if not element.text and not element.tail:
if not element.text:
if element.tag == "graphic":
# add source, default to ''
text = f'{element.get("title", "")} {element.get("alt", "")}'
returnlist.append(f'![{text.strip()}]({element.get("src", "")})')

if element.tail:
returnlist.append(f' {element.tail.strip()}')
# newlines for textless elements
elif element.tag in NEWLINE_ELEMS:
# add line after table head
Expand Down Expand Up @@ -350,8 +357,8 @@ def process_element(element: _Element, returnlist: List[str], include_formatting
returnlist.append(" ")

# this is text that comes after the closing tag, so it should be after any NEWLINE_ELEMS
if element.tail:
returnlist.append(element.tail.strip() if element.tag == 'cell' else element.tail)
if element.tail and not is_in_table_cell(element):
returnlist.append(element.tail)


def xmltotxt(xmloutput: Optional[_Element], include_formatting: bool) -> str:
Expand Down

0 comments on commit c510ac5

Please sign in to comment.