Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

more robust table extraction #767

Merged
merged 4 commits into from
Dec 23, 2024
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
64 changes: 64 additions & 0 deletions tests/unit_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -1286,6 +1286,70 @@ def test_table_processing():
result = extract(htmlstring, fast=True, output_format='txt', config=ZERO_CONFIG, include_tables=True)
assert result == "| a | b | c |\n| a | b c | |\n| a | b c | |"

htmlstring = """
<html><body><article>
<table>
<tr><td>a</td><td>b</td><td>c</td></tr>
<tr>
<td>a<img src="http://aa.bb/c.jpg" alt="img"/><span>a</span></td>
<td><p>b</p><p>c</p></td>
<td>d</td>
</tr>
</table>
</article></body></html>
"""
result = extract(htmlstring, fast=True, output_format='markdown', config=ZERO_CONFIG,
include_images=True, include_tables=True)
assert result == "| a | b | c |\n| a ![img](http://aa.bb/c.jpg) a | b c | d |"

htmlstring = """
<html><body><article>
<table>
<tr><td>a</td><td>b</td><td>c</td></tr>
<tr>
<td><a href="http://aa.bb/"><img src="http://aa.bb/c.jpg" alt="img"/><span>a</span></a></td>
<td><p>b</p><p>c</p></td>
<td>d</td>
</tr>
</table>
</article></body></html>
"""
result = extract(htmlstring, fast=True, output_format='markdown', config=ZERO_CONFIG,
include_images=True, include_tables=True)
assert result == "| a | b | c |\n| ![img](http://aa.bb/c.jpg) a | b c | d |"

htmlstring = """
<html><body><article>
<table>
<tr><td>a</td><td>b</td><td>c</td></tr>
<tr>
<td><img src="http://aa.bb/c.jpg" alt="img"/><span>a</span></td>
<td><p>b</p><p>c</p></td>
<td>d</td>
</tr>
</table>
</article></body></html>
"""
result = extract(htmlstring, fast=True, output_format='markdown', config=ZERO_CONFIG,
include_images=True, include_tables=True)
assert result == "| a | b | c |\n| ![img](http://aa.bb/c.jpg) a | b c | d |"

htmlstring = """
<html><body><article>
<table>
<tr><td>a</td><td>b</td><td>c</td></tr>
<tr>
<td><img src="http://aa.bb/c.jpg" alt="img1"/><span>a</span><img src="http://aa.bb/c.jpg" alt="img2"/></td>
<td><p>b</p><p>c</p></td>
<td>d</td>
</tr>
</table>
</article></body></html>
"""
result = extract(htmlstring, fast=True, output_format='markdown', config=ZERO_CONFIG,
include_images=True, include_tables=True)
assert result == "| a | b | c |\n| ![img1](http://aa.bb/c.jpg) a ![img2](http://aa.bb/c.jpg) | b c | d |"


def test_list_processing():
options = DEFAULT_OPTIONS
Expand Down
5 changes: 4 additions & 1 deletion trafilatura/main_extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
link_density_test_tables, process_node,
prune_unwanted_nodes)
from .settings import TAG_CATALOG, Extractor
from .utils import FORMATTING_PROTECTED, is_image_file, text_chars_test, trim
from .utils import FORMATTING_PROTECTED, copy_attributes, is_image_file, text_chars_test, trim
from .xml import delete_element
from .xpaths import (BODY_XPATH, COMMENTS_DISCARD_XPATH, COMMENTS_XPATH,
DISCARD_IMAGE_ELEMENTS, OVERALL_DISCARD_XPATH,
Expand Down Expand Up @@ -157,6 +157,8 @@ def define_newelem(processed_elem: _Element, orig_elem: _Element) -> None:
if processed_elem is not None:
childelem = SubElement(orig_elem, processed_elem.tag)
childelem.text, childelem.tail = processed_elem.text, processed_elem.tail
if processed_elem.tag == 'graphic':
copy_attributes(childelem, processed_elem)


def handle_lists(element: _Element, options: Extractor) -> Optional[_Element]:
Expand Down Expand Up @@ -488,6 +490,7 @@ def handle_image(element: Optional[_Element], options: Optional[Extractor] = Non
link = re.sub(r"^//", "http://", link)
processed_element.set("src", link)

processed_element.tail = element.tail
return processed_element


Expand Down
11 changes: 11 additions & 0 deletions trafilatura/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -454,3 +454,14 @@ def text_chars_test(string: Optional[str]) -> bool:
# or not re.search(r'\w', string)
# return string is not None and len(string) != 0 and not string.isspace()
return bool(string) and not string.isspace() # type: ignore[union-attr]


def copy_attributes(dest_elem: _Element, src_elem: _Element) -> None:
    '''Copy every attribute of the src element onto the dest element.

    A key already set on dest_elem is overwritten when src_elem carries the
    same key; keys that exist only on dest_elem are left untouched.
    src_elem itself is not modified.
    '''
    # items() yields (name, value) pairs directly, avoiding a second
    # attrib[key] lookup for each attribute
    for key, value in src_elem.items():
        dest_elem.set(key, value)


def is_in_table_cell(elem: _Element) -> bool:
    '''Check whether an element is nested inside a table cell.

    Returns True if at least one ancestor of elem has the tag "cell",
    False otherwise.
    '''
    # The path must be relative to elem: a leading "//" is an absolute
    # location path evaluated from the document root, so it would match
    # whenever any non-empty cell exists in the tree, whatever elem is.
    # bool() turns the XPath node list into the annotated return type.
    return bool(elem.xpath('./ancestor::cell'))
adbar marked this conversation as resolved.
Show resolved Hide resolved
27 changes: 17 additions & 10 deletions trafilatura/xml.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
fromstring, tostring, DTD)

from .settings import Document, Extractor
from .utils import sanitize, sanitize_tree, text_chars_test
from .utils import is_in_table_cell, sanitize, sanitize_tree, text_chars_test


LOGGER = logging.getLogger(__name__)
Expand Down Expand Up @@ -288,12 +288,8 @@ def replace_element_text(element: _Element, include_formatting: bool) -> str:
if element.tag == "cell":
elem_text = elem_text.strip()

if elem_text and len(element) > 0:
if element[0].tag == 'p':
elem_text = f"{elem_text} " if element.getprevious() is not None else f"| {elem_text} "
elif elem_text:
# add | before first cell
elem_text = f"{elem_text}" if element.getprevious() is not None else f"| {elem_text}"
if elem_text:
elem_text = f"{elem_text} "
# lists
elif element.tag == "item" and elem_text:
elem_text = f"- {elem_text}\n"
Expand All @@ -302,18 +298,29 @@ def replace_element_text(element: _Element, include_formatting: bool) -> str:

def process_element(element: _Element, returnlist: List[str], include_formatting: bool) -> None:
"Recursively convert a LXML element and its children to a flattened string representation."
if element.tag == 'cell' and element.getprevious() is None:
returnlist.append('| ')

if element.text:
# this is the text that comes before the first child
returnlist.append(replace_element_text(element, include_formatting))

if element.tail and element.tag != 'graphic' and is_in_table_cell(element):
# if element is in table cell, append tail after element text when element is not graphic since we deal with
# graphic tail alone, textless elements like lb should be processed here too, otherwise process tail at the end
returnlist.append(element.tail.strip())

for child in element:
process_element(child, returnlist, include_formatting)

if not element.text and not element.tail:
if not element.text:
if element.tag == "graphic":
# add source, default to ''
text = f'{element.get("title", "")} {element.get("alt", "")}'
returnlist.append(f'![{text.strip()}]({element.get("src", "")})')

if element.tail:
returnlist.append(f' {element.tail.strip()}')
# newlines for textless elements
elif element.tag in NEWLINE_ELEMS:
# add line after table head
Expand Down Expand Up @@ -350,8 +357,8 @@ def process_element(element: _Element, returnlist: List[str], include_formatting
returnlist.append(" ")

# this is text that comes after the closing tag, so it should be after any NEWLINE_ELEMS
if element.tail:
returnlist.append(element.tail.strip() if element.tag == 'cell' else element.tail)
if element.tail and not is_in_table_cell(element):
returnlist.append(element.tail)


def xmltotxt(xmloutput: Optional[_Element], include_formatting: bool) -> str:
Expand Down
Loading