Fix newline on formatting tags

GateNLP · Oct 13, 2023 · 5b3fb27 · 5b3fb27
1 parent d4d6852
commit 5b3fb27
Show file tree

Hide file tree

Showing 4 changed files with 27 additions and 5 deletions.
diff --git a/src/extractor/parse/content.py b/src/extractor/parse/content.py
@@ -4,14 +4,15 @@
 from urllib.parse import urljoin, urlparse, urlunparse
 
 import pandas as pd
-from bs4 import BeautifulSoup, NavigableString
+from bs4 import BeautifulSoup, Comment, NavigableString
 
 from extractor.extractors.data.images import MediaUse, ResolvableMediaUse
 from extractor.extractors.data.links import Link, ResolvableLink
 from extractor.extractors.media import get_caption
 from extractor.util.str import squash_whitespace
 
 EXCLUDED_CONTENT_TAGS = {"figcaption"}
+NEWLINE_TAGS = {"br", "p"}
 
 
 InternalLinks = List[ResolvableLink]
@@ -117,6 +118,21 @@ def extract_images(doc: BeautifulSoup, self_link: str) -> Images:
     return media_uses
 
 
+def _get_text(doc: BeautifulSoup) -> str:
+    """Return the text of ``doc``, with a newline for each NEWLINE_TAGS element.
+
+    Joins every non-comment string in ``doc.descendants`` as-is (no separator).
+    """
+    parts = []
+    for node in doc.descendants:
+        # Comment subclasses NavigableString, so it must be excluded explicitly.
+        if isinstance(node, NavigableString) and not isinstance(node, Comment):
+            parts.append(node)
+        elif node.name in NEWLINE_TAGS:
+            parts.append("\n")
+    return "".join(parts)
+
+
 def extract_content_data(doc: BeautifulSoup, self_link: str) -> pd.Series:
     """Extract the links, embeds, images and text content of the document.
 
@@ -139,6 +155,6 @@ def extract_content_data(doc: BeautifulSoup, self_link: str) -> pd.Series:
         if child.name in EXCLUDED_CONTENT_TAGS:
             child.extract()
 
-    content_text = squash_whitespace(doc_c.get_text(separator="\n"))
+    content_text = squash_whitespace(_get_text(doc_c))
 
     return pd.Series([content_text, internal_links, external_links, embeds, images])
diff --git a/tests/parse/test_content.py b/tests/parse/test_content.py
@@ -90,6 +90,7 @@ def test_extract_content(datadir: Path):
     assert (
         text == "The first paragraph.\n"
         "The second paragraph.\n"
+        "The third paragraph.\n"
         "Not in a paragraph.\n"
         "Heavily nested."
     )
@@ -101,7 +102,10 @@ def test_extract_content_br_newline(datadir: Path):
     content_series = extract_content_data(doc, "https://example.org/home")
     text = content_series[0]
 
-    assert text == "Before break\nAfter break"
+    assert (
+        text
+        == "Before break\nAfter break. Don't break here.\nBefore break\nAfter break."
+    )
 
 
 def test_extract_content_whitespace_collapse(datadir: Path):

diff --git a/tests/parse/test_content/content_extraction.html b/tests/parse/test_content/content_extraction.html
@@ -3,7 +3,8 @@
   <img src="/example-image.png" alt="Some alt text" />
   <figcaption>A caption</figcaption>
 </figure>
-<p>The second paragraph.</p>
+<!-- a comment, I should be ignored -->
+<p>The second paragraph.</p><p>The third paragraph.</p>
 Not in a paragraph.
 <div>
   <div>

diff --git a/tests/parse/test_content/whitespace_br.html b/tests/parse/test_content/whitespace_br.html
@@ -1 +1,2 @@
-<p>Before break<br />After break</p>
+<p>Before break<br />After break. Don't <strong>break here</strong>.</p>
+<p>Before break<br>After break.</p>