Skip to content

Commit

Permalink
fixed missing spaces inside some tags
Browse files Browse the repository at this point in the history
Signed-off-by: Jonas Kalderstam <[email protected]>
  • Loading branch information
spacecowboy committed Dec 18, 2024
1 parent 235228e commit 32cb38f
Show file tree
Hide file tree
Showing 2 changed files with 21 additions and 18 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,6 @@ class HtmlLinearizer {
if (blockStyle.shouldSoftWrap) {
node.appendCorrectlyNormalizedWhiteSpace(
linearTextBuilder,
stripLeading = linearTextBuilder.endsWithWhitespace,
)
} else {
append(node.wholeText)
Expand Down Expand Up @@ -113,7 +112,6 @@ class HtmlLinearizer {
withLinearTextAnnotation(LinearTextAnnotationH1) {
element.appendCorrectlyNormalizedWhiteSpaceRecursively(
linearTextBuilder,
stripLeading = linearTextBuilder.endsWithWhitespace,
)
}
}
Expand All @@ -124,7 +122,6 @@ class HtmlLinearizer {
withLinearTextAnnotation(LinearTextAnnotationH2) {
element.appendCorrectlyNormalizedWhiteSpaceRecursively(
linearTextBuilder,
stripLeading = linearTextBuilder.endsWithWhitespace,
)
}
}
Expand All @@ -135,7 +132,6 @@ class HtmlLinearizer {
withLinearTextAnnotation(LinearTextAnnotationH3) {
element.appendCorrectlyNormalizedWhiteSpaceRecursively(
linearTextBuilder,
stripLeading = linearTextBuilder.endsWithWhitespace,
)
}
}
Expand All @@ -146,7 +142,6 @@ class HtmlLinearizer {
withLinearTextAnnotation(LinearTextAnnotationH4) {
element.appendCorrectlyNormalizedWhiteSpaceRecursively(
linearTextBuilder,
stripLeading = linearTextBuilder.endsWithWhitespace,
)
}
}
Expand All @@ -157,7 +152,6 @@ class HtmlLinearizer {
withLinearTextAnnotation(LinearTextAnnotationH5) {
element.appendCorrectlyNormalizedWhiteSpaceRecursively(
linearTextBuilder,
stripLeading = linearTextBuilder.endsWithWhitespace,
)
}
}
Expand All @@ -168,7 +162,6 @@ class HtmlLinearizer {
withLinearTextAnnotation(LinearTextAnnotationH6) {
element.appendCorrectlyNormalizedWhiteSpaceRecursively(
linearTextBuilder,
stripLeading = linearTextBuilder.endsWithWhitespace,
)
}
}
Expand Down Expand Up @@ -925,13 +918,10 @@ class HtmlLinearizer {
* Can't use JSoup's text() method because that strips invisible characters
* such as ZWNJ which are crucial for several languages.
*/
fun TextNode.appendCorrectlyNormalizedWhiteSpace(
builder: LinearTextBuilder,
stripLeading: Boolean,
) {
fun TextNode.appendCorrectlyNormalizedWhiteSpace(builder: LinearTextBuilder) {
wholeText.asUTF8Sequence()
.dropWhile {
stripLeading && isCollapsableWhiteSpace(it)
builder.endsWithWhitespace && isCollapsableWhiteSpace(it)
}
.fold(false) { lastWasWhite, char ->
if (isCollapsableWhiteSpace(char)) {
Expand All @@ -946,17 +936,13 @@ fun TextNode.appendCorrectlyNormalizedWhiteSpace(
}
}

fun Element.appendCorrectlyNormalizedWhiteSpaceRecursively(
builder: LinearTextBuilder,
stripLeading: Boolean,
) {
fun Element.appendCorrectlyNormalizedWhiteSpaceRecursively(builder: LinearTextBuilder) {
for (child in childNodes()) {
when (child) {
is TextNode -> child.appendCorrectlyNormalizedWhiteSpace(builder, stripLeading)
is TextNode -> child.appendCorrectlyNormalizedWhiteSpace(builder)
is Element ->
child.appendCorrectlyNormalizedWhiteSpaceRecursively(
builder,
stripLeading,
)
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,23 @@ class HtmlLinearizerTest {
assertEquals(LinearText("Hello, world!", LinearTextBlockStyle.TEXT), result[0])
}

@Test
fun `spaces inside headers are kept`() {
    // The single space between </a> and <small> is the only separator between
    // the two words inside the <h2>; the linearized text must still contain it.
    val markup =
        """
        <html><body><h2><a href="http://example.com">Link</a> <small>small</small></h2></body></html>
        """.trimIndent()

    val elements = linearizer.linearize(markup, "https://example.com").elements

    // The whole header is expected to collapse into exactly one text element.
    assertEquals(1, elements.size)

    // NOTE(review): 0 and 9 presumably span the 10 characters of "Link small"
    // with an inclusive end index; confirm against LinearTextAnnotation.
    val expectedHeader =
        LinearText(
            "Link small",
            LinearTextBlockStyle.TEXT,
            LinearTextAnnotation(LinearTextAnnotationH2, 0, 9),
        )
    assertEquals(expectedHeader, elements[0])
}

@Test
fun `should return annotations with bold, italic, and underline`() {
val html = "<html><body><b><i><u>Hello, world!</u></i></b></body></html>"
Expand Down

0 comments on commit 32cb38f

Please sign in to comment.