Open
Description
I am writing a small function to find all links on a page, and attempting to use url_absolute()
to convert relative links to absolute links.
I've run into an issue when the original URL passed to read_html()
redirects to a different location: xml_url(doc) does not include the trailing slash of the
redirected location, so links normalized with url_absolute(link, base = xml_url(doc)) are incorrect.
small reprex:
library(xml2)
url <- "https://docs.posit.co/connect/admin"
x <- read_html(url)
# A single relative link is hard-coded to keep the example small.
links <- "../admin/appendix/branding/index.html"
# in actuality,
# links <- x |> xml_find_all(".//a[@href]") |> xml_attr("href", default = "")
# Resolve the relative link against the URL recorded on the document.
# note the "/connect/" is swallowed
x2 <- url_absolute(links, base = xml_url(x))
x2
#> [1] "https://docs.posit.co/admin/appendix/branding/index.html"
read_html(x2)
#> Error in open.connection(x, "rb"): cannot open the connection
# because we need to add a trailing slash to the base url
x2 <- url_absolute(links, base = paste0(xml_url(x), "/"))
x2
#> [1] "https://docs.posit.co/connect/admin/appendix/branding/index.html"
read_html(x2)
#> {html_document}
#> <html xmlns="http://www.w3.org/1999/xhtml" lang="en" xml:lang="en">
#> [1] <head>\n<meta http-equiv="Content-Type" content="text/html; charset=UTF-8 ...
#> [2] <body class="nav-sidebar floating nav-fixed">\n\n<div id="quarto-search-r ...
# because the original request was redirected to a different location
# (the server's `location:` header and curl's effective URL both end in "/")
system("curl -I https://docs.posit.co/connect/admin | grep location:", intern = TRUE)
#> [1] "location: /connect/admin/\r"
system('curl -I -L -o /dev/null -s -w "%{url_effective}\n" https://docs.posit.co/connect/admin', intern = TRUE)
#> [1] "https://docs.posit.co/connect/admin/"
Created on 2025-02-27 with reprex v2.1.1
Metadata
Metadata
Assignees
Labels
No labels