From b9f65ba66f44e194cd2e182108118a9ce83f58b8 Mon Sep 17 00:00:00 2001 From: Henrik Bengtsson Date: Fri, 10 Nov 2023 12:15:20 -0800 Subject: [PATCH] Make xml_serialize()/xml_unserialize() work also for HTML documents (#408) Fixes #407 --- NEWS.md | 3 +++ R/xml_serialize.R | 10 ++++++++-- tests/testthat/test-xml_serialize.R | 15 +++++++++++++++ 3 files changed, 26 insertions(+), 2 deletions(-) diff --git a/NEWS.md b/NEWS.md index 4ba49c77..86553566 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,6 +1,9 @@ # xml2 (development version) +* `xml_serialize()` now includes the document type so that `xml_unserialize()` works also for HTML documents (#407, @HenrikBengtsson). + * Remove unused dependencies on glue, withr and lifecycle (@mgirlich). + * `print()` is faster for very long `xml_nodeset` inputs (#366, @michaelchirico). # xml2 1.3.5 diff --git a/R/xml_serialize.R b/R/xml_serialize.R index 1869268e..d2cdcb9b 100644 --- a/R/xml_serialize.R +++ b/R/xml_serialize.R @@ -22,7 +22,7 @@ xml_serialize.xml_document <- function(object, connection, ...) { connection <- file(connection, "w", raw = TRUE) on.exit(close(connection)) } - serialize(structure(as.character(object, ...), class = "xml_serialized_document"), connection) + serialize(structure(as.character(object, ...), doc_type = doc_type(object), class = "xml_serialized_document"), connection) } #' @export @@ -64,7 +64,13 @@ xml_unserialize <- function(connection, ...) { # Select only the root res <- xml_find_first(x, "/node()") } else if (inherits(object, "xml_serialized_document")) { - res <- read_xml(unclass(object), ...) + read_xml_int <- function(object, as_html = FALSE, ...) { + if (missing(as_html)) { + as_html <- identical(attr(object, "doc_type", exact = TRUE), "html") + } + read_xml(unclass(object), as_html = as_html, ...) + } + res <- read_xml_int(unclass(object), ...) } else { abort("Not a serialized xml2 object") } diff --git a/tests/testthat/test-xml_serialize.R b/tests/testthat/test-xml_serialize.R index 395dfb0c..825ead6a 100644 --- a/tests/testthat/test-xml_serialize.R +++ b/tests/testthat/test-xml_serialize.R @@ -2,6 +2,7 @@ x <- read_xml(" 123 456 ") + test_that("xml_serialize and xml_unserialize work with xml_document input", { out <- xml_unserialize(xml_serialize(x, NULL)) expect_identical(as.character(x), as.character(out)) @@ -37,6 +38,20 @@ test_that("xml_serialize and xml_unserialize work with xml_nodeset input", { expect_identical(as.character(xml_unserialize(f)), as.character(b)) }) +test_that("xml_serialize and xml_unserialize work with HTML-based xml_document input", { + file <- system.file("extdata", "r-project.html", package = "xml2") + x <- read_html(file) + + out <- xml_unserialize(xml_serialize(x, NULL)) + expect_identical(as.character(x), as.character(out)) + + f <- tempfile() + on.exit(unlink(f)) + + xml_serialize(x, f) + expect_identical(as.character(xml_unserialize(f)), as.character(x)) +}) + test_that("xml_unserialize throws an error if given a invalid object", { expect_error(xml_unserialize(serialize(1, NULL)), "Not a serialized xml2 object") })