Skip to content

Commit

Permalink
Double-truncate to avoid inefficient encodeString() usage (#413)
Browse files Browse the repository at this point in the history
  • Loading branch information
MichaelChirico authored Nov 2, 2023
1 parent 1c4453f commit e088a1c
Show file tree
Hide file tree
Showing 4 changed files with 57 additions and 8 deletions.
1 change: 1 addition & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
# xml2 (development version)

* Remove unused dependencies on glue, withr and lifecycle (@mgirlich).
* `print()` is faster for very long `xml_nodeset` inputs (#366, @michaelchirico).

# xml2 1.3.5

Expand Down
29 changes: 21 additions & 8 deletions R/xml_nodeset.R
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,23 @@ as.character.xml_nodeset <- function(x, ...) {
xml_nodeset(NextMethod())
}

#' Wrapper for encodeString() that takes width into consideration
#'
#' encodeString() is relatively expensive to run (see #366), so
#' avoid doing so to very wide inputs by first trimming inputs
#' to approximately the correct width, then encoding. A second
#' round of truncation occurs after encoding to account for
#' any newly-inserted characters bumping an input too wide.
#' @noRd
encode_with_width <- function(x, width) {
truncate_raw <- nchar(x) > width
x[truncate_raw] <- substr(x[truncate_raw], 1L, width - 3L)
x <- encodeString(x)
truncate_encoded <- truncate_raw | nchar(x) > width
x[truncate_encoded] <- paste(substr(x[truncate_encoded], 1L, width - 3L), "...")
x
}

show_nodes <- function(x, width = getOption("width"), max_n = 20) {
stopifnot(inherits(x, "xml_nodeset"))

Expand All @@ -46,20 +63,16 @@ show_nodes <- function(x, width = getOption("width"), max_n = 20) {
return()
}

if (n > max_n) {
trunc <- n > max_n
if (trunc) {
n <- max_n
x <- x[seq_len(n)]
trunc <- TRUE
} else {
trunc <- FALSE
}

label <- format(paste0("[", seq_len(n), "]"), justify = "right")
contents <- encodeString(vapply(x, as.character, FUN.VALUE = character(1)))
contents <- vapply(x, as.character, FUN.VALUE = character(1L))

desc <- paste0(label, " ", contents)
needs_trunc <- nchar(desc) > width
desc[needs_trunc] <- paste(substr(desc[needs_trunc], 1, width - 3), "...")
desc <- encode_with_width(paste(label, contents), width)

cat(desc, sep = "\n")
if (trunc) {
Expand Down
19 changes: 19 additions & 0 deletions tests/testthat/_snaps/xml_nodeset.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,3 +15,22 @@
[9] <div id="more_if_no_javascript"><a href="/search/">More</a></div>
[10] <div class="magnifyingglass navbarSprite"></div>

---

Code
print(x, width = 13L)
Output
{xml_document}
<doc>
[1] <a>123 ...
[2] <b>123 ...
[3] <c>12\ ...
Code
print(x, width = 14L)
Output
{xml_document}
<doc>
[1] <a>1234 ...
[2] <b>1234 ...
[3] <c>12\\ ...

16 changes: 16 additions & 0 deletions tests/testthat/test-xml_nodeset.R
Original file line number Diff line number Diff line change
Expand Up @@ -71,8 +71,24 @@ test_that("methods work on empty nodesets", {
})

test_that("print method is correct", {
skip_if(getOption("width") < 20L, "Screen too narrow")

x <- read_html(test_path("lego.html.bz2"))
body <- xml_find_first(x, "//body")
divs <- xml_find_all(body, ".//div")[1:10]
expect_snapshot(print(divs))

# double-substring() logic
s <- c(
"123456789\\", # always too wide, '\' never encoded
"12345", # always fits
"12\\45" # doesn't fit when '\' is encoded
)
# embed as text on nodes <a>,<b>,<c>
s <- sprintf("<%1$s>%2$s</%1$s>", letters[1:3], s)
x <- read_xml(sprintf("<doc>%s</doc>", paste(s, collapse="")))
expect_snapshot({
print(x, width = 13L)
print(x, width = 14L)
})
})

0 comments on commit e088a1c

Please sign in to comment.