Skip to content

Commit

Permalink
Fix newline on formatting tags
Browse files Browse the repository at this point in the history
  • Loading branch information
freddyheppell committed Oct 13, 2023
1 parent d4d6852 commit 5b3fb27
Show file tree
Hide file tree
Showing 4 changed files with 27 additions and 5 deletions.
20 changes: 18 additions & 2 deletions src/extractor/parse/content.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,14 +4,15 @@
from urllib.parse import urljoin, urlparse, urlunparse

import pandas as pd
from bs4 import BeautifulSoup, NavigableString
from bs4 import BeautifulSoup, Comment, NavigableString

from extractor.extractors.data.images import MediaUse, ResolvableMediaUse
from extractor.extractors.data.links import Link, ResolvableLink
from extractor.extractors.media import get_caption
from extractor.util.str import squash_whitespace

EXCLUDED_CONTENT_TAGS = {"figcaption"}
NEWLINE_TAGS = {"br", "p"}


InternalLinks = List[ResolvableLink]
Expand Down Expand Up @@ -117,6 +118,21 @@ def extract_images(doc: BeautifulSoup, self_link: str) -> Images:
return media_uses


def _get_text(doc: BeautifulSoup) -> str:
    """Return the text of ``doc`` with newlines at line-breaking tags.

    Walks every descendant of ``doc`` in the order ``doc.descendants``
    yields them. String nodes are copied to the output verbatim; a tag whose
    name is in ``NEWLINE_TAGS`` contributes a single ``"\\n"`` at the point
    where the tag itself is reached. All other tags contribute nothing
    themselves (their string descendants are still collected).

    Args:
        doc: The parsed document (or fragment) to extract text from.

    Returns:
        The concatenated text. No whitespace normalisation is done here.
    """
    # Collect the pieces and join once, rather than growing a string with
    # ``+=`` on every node.
    parts = []
    for node in doc.descendants:
        if isinstance(node, NavigableString):
            # Comment is a subtype of NavigableString, so it has to be
            # excluded explicitly or its text would leak into the output.
            # NOTE(review): other NavigableString subtypes are not filtered
            # here -- confirm that is intended.
            if not isinstance(node, Comment):
                parts.append(node)
        elif node.name in NEWLINE_TAGS:
            parts.append("\n")
    return "".join(parts)


def extract_content_data(doc: BeautifulSoup, self_link: str) -> pd.Series:
"""Extract the links, embeds, images and text content of the document.
Expand All @@ -139,6 +155,6 @@ def extract_content_data(doc: BeautifulSoup, self_link: str) -> pd.Series:
if child.name in EXCLUDED_CONTENT_TAGS:
child.extract()

content_text = squash_whitespace(doc_c.get_text(separator="\n"))
content_text = squash_whitespace(_get_text(doc_c))

return pd.Series([content_text, internal_links, external_links, embeds, images])
6 changes: 5 additions & 1 deletion tests/parse/test_content.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,7 @@ def test_extract_content(datadir: Path):
assert (
text == "The first paragraph.\n"
"The second paragraph.\n"
"The third paragraph.\n"
"Not in a paragraph.\n"
"Heavily nested."
)
Expand All @@ -101,7 +102,10 @@ def test_extract_content_br_newline(datadir: Path):
content_series = extract_content_data(doc, "https://example.org/home")
text = content_series[0]

assert text == "Before break\nAfter break"
assert (
text
== "Before break\nAfter break. Don't break here.\nBefore break\nAfter break."
)


def test_extract_content_whitespace_collapse(datadir: Path):
Expand Down
3 changes: 2 additions & 1 deletion tests/parse/test_content/content_extraction.html
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,8 @@
<img src="/example-image.png" alt="Some alt text" />
<figcaption>A caption</figcaption>
</figure>
<p>The second paragraph.</p>
<!-- a comment, I should be ignored -->
<p>The second paragraph.</p><p>The third paragraph.</p>
Not in a paragraph.
<div>
<div>
Expand Down
3 changes: 2 additions & 1 deletion tests/parse/test_content/whitespace_br.html
Original file line number Diff line number Diff line change
@@ -1 +1,2 @@
<p>Before break<br />After break</p>
<p>Before break<br />After break. Don't <strong>break here</strong>.</p>
<p>Before break<br>After break.</p>

0 comments on commit 5b3fb27

Please sign in to comment.