Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

preserve space in certain elements #429

Merged
merged 10 commits into from
Nov 20, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 9 additions & 9 deletions tests/unit_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -1065,7 +1065,7 @@ def test_code_blocks():
</code></pre>
</div>'''
testresult = extract(highlightjs, config=ZERO_CONFIG, output_format='xml')
assert '<code>code\nhighlighted more code\n</code>' in testresult and 'quote' not in testresult
assert '<code>code\n\nhighlighted more code\n</code>' in testresult and 'quote' not in testresult
github = '''<div class="highlight highlight-source-shell notranslate position-relative overflow-auto" dir="auto"><pre>$ pip install PyGithub</pre><div class="zeroclipboard-container position-absolute right-0 top-0">
<clipboard-copy aria-label="Copy" class="ClipboardButton btn js-clipboard-copy m-2 p-0 tooltipped-no-delay" data-copy-feedback="Copied!" data-tooltip-direction="w" value="$ pip install PyGithub" tabindex="0" role="button" style="display: inherit;">
<svg aria-hidden="true" height="16" viewBox="0 0 16 16" version="1.1" width="16" data-view-component="true" class="octicon octicon-copy js-clipboard-copy-icon m-2">
Expand Down Expand Up @@ -1093,14 +1093,14 @@ def test_code_blocks():
</div>'''
testresult = extract(w3schools, config=ZERO_CONFIG, output_format='xml')
expected = '''<code>
class Person:<lb/> def __init__(self, name, age):<lb/>
self.name = name<lb/> self.age = age<lb/><lb/>p1 = Person("John",
36)<lb/>
<lb/>print(p1.name)<lb/>print(p1.age) </code>'''
class Person:<lb/>\xa0 def __init__(self, name, age):<lb/>\xa0\xa0\xa0
self.name = name<lb/>\xa0\xa0\xa0 self.age = age<lb/><lb/>p1 = Person("John",
36)<lb/>
<lb/>print(p1.name)<lb/>print(p1.age) </code>'''
assert expected in testresult and 'quote' not in testresult
pip = '''<div><p>Code:</p>
<pre lang="python3"><span class="kn">import</span> <span class="nn">openai</span>
<span class="kn">from</span> <span class="nn">openai_function_call</span> <span class="kn">import</span> <span class="n">openai_function</span></pre></div>'''
<pre lang="python3"><span class="kn">import</span> <span class="nn">openai</span>
<span class="kn">from</span> <span class="nn">openai_function_call</span> <span class="kn">import</span> <span class="n">openai_function</span></pre></div>'''
expected = '''<code>import openai
from openai_function_call import openai_function</code>'''
testresult = extract(pip, config=ZERO_CONFIG, output_format='xml')
Expand All @@ -1111,8 +1111,8 @@ class Person:<lb/> def __init__(self, name, age):<lb/>
testresult = extract(medium_js, config=ZERO_CONFIG, output_format='xml')
assert expected in testresult and 'quote' not in testresult
medium_ssr = '''<div><p>Code:</p>
<pre class="lw lx ly lz ma nq nr ns bo nt ba bj"><span id="fe48" class="nu mo ev nr b bf nv nw l nx ny">import openai_function<br><br>@openai_functiondef sum(a:int, b:int):<br/> &quot;&quot;&quot;Sum description adds a + b&quot;&quot;&quot;</span></pre>'''
expected = '<code>import openai_function<lb/><lb/>@openai_functiondef sum(a:int, b:int):<lb/> """Sum description adds a + b"""</code>'
<pre class="lw lx ly lz ma nq nr ns bo nt ba bj"><span id="fe48" class="nu mo ev nr b bf nv nw l nx ny">import openai_function<br><br>@openai_function<br>def sum(a:int, b:int):<br> &quot;&quot;&quot;Sum description adds a + b&quot;&quot;&quot;</span></pre>'''
expected = '''<code>import openai_function<lb/><lb/>@openai_function<lb/>def sum(a:int, b:int):<lb/> """Sum description adds a + b"""</code>'''
testresult = extract(medium_ssr, config=ZERO_CONFIG, output_format='xml')
assert expected in testresult and 'quote' not in testresult
code_el = '''<div><p>Code:</p>
Expand Down
6 changes: 2 additions & 4 deletions trafilatura/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@
process_node, prune_unwanted_nodes, tree_cleaning)
from .metadata import Document, extract_metadata
from .settings import DEFAULT_CONFIG, TAG_CATALOG, use_config
from .utils import is_image_file, load_html, normalize_unicode, trim, txttocsv
from .utils import is_image_file, load_html, normalize_unicode, trim, txttocsv, FORMATTING_PROTECTED
from .xml import (build_json_output, build_tei_output, build_xml_output,
control_xml_output, remove_empty_elements, strip_double_tags,
xmltotxt)
Expand All @@ -38,8 +38,6 @@

LOGGER = logging.getLogger(__name__)

FORMATTING_PROTECTED = {'cell', 'head', 'hi', 'item', 'p', 'quote', 'td'}
SPACING_PROTECTED = {'code', 'hi', 'ref'}
P_FORMATTING = {'hi', 'ref'}
TABLE_ELEMS = {'td', 'th'}
TABLE_ALL = {'td', 'th', 'hi'}
Expand Down Expand Up @@ -942,7 +940,7 @@ def bare_extraction(filecontent, url=None, no_fallback=False, # fast=False,
include_comments, include_formatting, include_links,
include_images, include_tables, deduplicate,
target_language)

# prune all xpath expressions that user specified
# no backup as this is under the full control of the user
if prune_xpath is not None:
Expand Down
56 changes: 43 additions & 13 deletions trafilatura/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,9 @@

STRIP_EXTENSION = re.compile(r"\.[^/?#]{2,63}$")

FORMATTING_PROTECTED = {'cell', 'head', 'hi', 'item', 'p', 'quote', 'ref', 'td'}
SPACING_PROTECTED = {'code', 'pre'}


def handle_compressed_file(filecontent):
"""Tell if a file's magic number corresponds to the GZip format
Expand Down Expand Up @@ -257,29 +260,56 @@ def normalize_unicode(string, unicodeform='NFC'):


@lru_cache(maxsize=1024)
def line_processing(line):
def line_processing(line, preserve_space=False, trailing_space=False):
'''Remove HTML space entities, then discard incompatible unicode
and invalid XML characters on line level'''
# spacing HTML entities: https://www.w3.org/MarkUp/html-spec/html-spec_13.html
# unique code spaces
line = line.replace('&#13;', '\r').replace('&#10;', '\n').replace('&nbsp;', '\u00A0').replace(';cs;', ' ')
# remove newlines that are not related to punctuation or markup
# remove non-printable chars and normalize space characters (including Unicode spaces)
line = trim(remove_control_characters(LINES_TRIMMING.sub(r' ', line)))
# prune empty lines
if all(map(str.isspace, line)):
line = None
return line


def sanitize(text):
new_line = remove_control_characters(line.replace('&#13;', '\r').replace('&#10;', '\n').replace('&nbsp;', '\u00A0'))
if not preserve_space:
# remove newlines that are not related to punctuation or markup
# remove non-printable chars and normalize space characters (including Unicode spaces)
new_line = trim(LINES_TRIMMING.sub(r" ", new_line))
# prune empty lines
if all(map(str.isspace, new_line)):
new_line = None
elif trailing_space:
space_before = " " if line[0] == " " else ""
space_after = " " if line[-1] == " " else ""
new_line = "".join([space_before, new_line, space_after])
return new_line


def sanitize(text, preserve_space=False, trailing_space=False):
'''Convert text and discard incompatible and invalid characters'''
# consider all text as a single line
if trailing_space:
return line_processing(text, preserve_space, True)
# process line by line
try:
return '\n'.join(filter(None, (line_processing(l) for l in text.splitlines())))
return '\n'.join(filter(None, (line_processing(l, preserve_space) for l in text.splitlines())))
except AttributeError:
return None


def sanitize_tree(tree):
    '''Run sanitize() over the text and tail of every element in the tree,
    modifying the elements in place, and return the same tree.'''
    for node in tree.iter():
        ancestor = node.getparent()
        # the element's own tag and its parent's tag ("" when there is no parent)
        tags = (node.tag, "" if ancestor is None else ancestor.tag)

        # passed to sanitize() as preserve_space: set when the element
        # or its direct parent is a spacing-protected tag
        keep_space = any(tag in SPACING_PROTECTED for tag in tags)
        # passed to sanitize() as trailing_space: set when the element or its
        # direct parent is a formatting-protected tag, or when spacing is preserved
        keep_edges = keep_space or any(tag in FORMATTING_PROTECTED for tag in tags)

        # empty or missing text/tail is left untouched
        for attr in ("text", "tail"):
            content = getattr(node, attr)
            if content:
                setattr(node, attr, sanitize(content, keep_space, keep_edges))
    return tree


@lru_cache(maxsize=1024)
def trim(string):
'''Remove unnecessary spaces within a text string'''
Expand Down
5 changes: 3 additions & 2 deletions trafilatura/xml.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@

from . import __version__
from .filters import text_chars_test
from .utils import sanitize
from .utils import sanitize, sanitize_tree

LOGGER = logging.getLogger(__name__)
# validation
Expand Down Expand Up @@ -117,8 +117,9 @@ def build_xml_output(docmeta):

def control_xml_output(output_tree, output_format, tei_validation, docmeta):
'''Make sure the XML output is conform and valid if required'''
control_string = sanitize(tostring(output_tree, encoding='unicode'))
output_tree = sanitize_tree(output_tree)
# necessary for cleaning
control_string = tostring(output_tree, encoding='unicode')
output_tree = fromstring(control_string, CONTROL_PARSER)
# validate
if output_format == 'xmltei' and tei_validation is True:
Expand Down
Loading