Skip to content

Commit

Permalink
Add <br> to HTML output and fix margin collapsing order
Browse files Browse the repository at this point in the history
  • Loading branch information
phoerious committed Dec 3, 2024
1 parent 52caefd commit 5d42529
Show file tree
Hide file tree
Showing 2 changed files with 32 additions and 14 deletions.
17 changes: 14 additions & 3 deletions resiliparse/resiliparse/extract/html2text.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -205,7 +205,7 @@ cdef void _extract_cb(vector[shared_ptr[ExtractNode]]& extract_nodes, ExtractCon
elif ctx.node.type != LXB_DOM_NODE_TYPE_ELEMENT:
return

elif ctx.node.local_name in [LXB_TAG_BR, LXB_TAG_HR]:
elif ctx.node.local_name == LXB_TAG_BR and ctx.opts.preserve_formatting == FormattingOpts.FORMAT_BASIC:
_ensure_text_contents(extract_nodes)
deref(current_node).collapse_margins = False

Expand Down Expand Up @@ -302,6 +302,7 @@ cdef string _serialize_extract_nodes(vector[shared_ptr[ExtractNode]]& extract_no
cdef bint bullet_inserted = False
cdef size_t list_depth = 0
cdef size_t margin_size = 0
cdef size_t uncollapsed_margin_count = 0
cdef vector[size_t] list_numbering
cdef string list_item_indent = <const char*>b' '
cdef const char* element_name = NULL
Expand All @@ -314,6 +315,9 @@ cdef string _serialize_extract_nodes(vector[shared_ptr[ExtractNode]]& extract_no

# Basic and minimal HTML formatting
if opts.preserve_formatting >= FormattingOpts.FORMAT_BASIC:
if current_node.make_block and not current_node.collapse_margins:
uncollapsed_margin_count += 1

# List tags
if (current_node.tag_id in [LXB_TAG_UL, LXB_TAG_OL]
or (current_node.tag_id == LXB_TAG_LI and list_depth == 0)):
Expand Down Expand Up @@ -361,19 +365,25 @@ cdef string _serialize_extract_nodes(vector[shared_ptr[ExtractNode]]& extract_no
if current_node.pre_depth:
current_node.make_block = False

# Explicit line breaks
if current_node.tag_id == LXB_TAG_BR:
output.append(b'<br>')

# Add a select number of start/end tags if minimal HTML formatting is on.
if opts.preserve_formatting == FormattingOpts.FORMAT_MINIMAL_HTML and (
current_node.tag_id in [LXB_TAG_H1, LXB_TAG_H2, LXB_TAG_H3, LXB_TAG_H4, LXB_TAG_H5, LXB_TAG_H6, LXB_TAG_P]
or (current_node.tag_id in [LXB_TAG_UL, LXB_TAG_OL] and opts.list_bullets)):

# Add margin before start tag and skip after
if not current_node.is_end_tag and not current_node.pre_depth:
if (not current_node.is_end_tag and not current_node.pre_depth) or (
uncollapsed_margin_count and current_node.collapse_margins):
if current_node.collapse_margins:
margin_size = max(margin_size, <size_t>(current_node.make_block + current_node.make_big_block))
else:
margin_size += <size_t>(current_node.make_block + current_node.make_big_block)
_make_margin(output, margin_size, current_node, opts)
current_node.make_block = False
uncollapsed_margin_count = 0

# Indent if in list (indent ul and ol start tags one level less)
if opts.list_bullets:
Expand Down Expand Up @@ -419,6 +429,7 @@ cdef string _serialize_extract_nodes(vector[shared_ptr[ExtractNode]]& extract_no

# Make margins and indents
_make_margin(output, margin_size, current_node, opts)
uncollapsed_margin_count = 0

# Indent list items if basic formatting is used (follow-up lines without bullets are indented more)
if list_depth and opts.preserve_formatting == FormattingOpts.FORMAT_BASIC:
Expand Down Expand Up @@ -748,7 +759,7 @@ def extract_plain_text(html,
If ``preserve_formatting`` is ``True``, line breaks, paragraphs, other block-level elements,
list elements, and pre-formatted text will be preserved. Use the special value ``'minimal_html'`` to
add reduced HTML markup to the formatted output, preserving headings (``<h1-6>``), paragraphs (``<p>``),
lists (``<ul>``, ``<ol>``), ``<pre>`` text, and links (``<a>``, if ``links=True``).
lists (``<ul>``, ``<ol>``), ``<pre>`` text, ``<br>`` line breaks, and links (``<a>``, if ``links=True``).
Extraction of particular elements and attributes such as links, alt texts, or form fields
can be configured individually by setting the corresponding parameter to ``True``.
Expand Down
29 changes: 18 additions & 11 deletions tests/resiliparse/extract/test_html2text.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,7 @@ def test_basic_extraction():
</ul>
foo bar
<p>baz
<p>baz<br>
bar</p>
Copyright (C) 2021 Foo Bar"""
Expand All @@ -118,7 +118,7 @@ def test_basic_extraction():
</ul>
foo bar
<p>baz
<p>baz<br>
bar</p>
Some image Cannot display object
Expand All @@ -133,7 +133,7 @@ def test_basic_extraction():
foo bar
<p>baz
<p>baz<br>
bar</p>
Some image Cannot display object
Expand Down Expand Up @@ -500,27 +500,34 @@ def test_html_escaping():
"Hello World link (https://example.com/?foo=bar&bar=baz) Some code <html>& foo <html> <html>& <html>& [ <html>& ]"


def test_linebreaks():
def test_margin_collapsing():
html = """\
<p>Hello
World</p>
<p>Hello<br>World<br><br><br><br>!</p>
<div>Hello<br>World<br><br><br><br>!</div>"""
<p>Hello<br>World<br><br><br><br></p>
<p>Hello<br>World<br><br><br><br></p>
<div>Hello World</div>"""

assert extract_plain_text(html, preserve_formatting=True) == """\
Hello World
Hello\nWorld\n\n\n\n!
Hello\nWorld\n\n\n\n!"""
Hello\nWorld\n\n\n
Hello\nWorld\n\n\n
Hello World"""

assert extract_plain_text(html, preserve_formatting='minimal_html') == """\
<p>Hello World</p>
<p>Hello\nWorld\n\n\n\n!</p>
<p>Hello<br>
World<br><br><br><br></p>
<p>Hello<br>
World<br><br><br><br></p>
Hello\nWorld\n\n\n\n!"""
Hello World"""


def test_real_word_data():
Expand Down

0 comments on commit 5d42529

Please sign in to comment.