diff --git a/src/renderer/html_handlebars/hbs_renderer.rs b/src/renderer/html_handlebars/hbs_renderer.rs index 8ea2f49efc..49c0a5e114 100644 --- a/src/renderer/html_handlebars/hbs_renderer.rs +++ b/src/renderer/html_handlebars/hbs_renderer.rs @@ -57,8 +57,12 @@ impl HtmlHandlebars { let content = ch.content.clone(); let content = utils::render_markdown(&content, ctx.html_config.curly_quotes); - let fixed_content = - utils::render_markdown_with_path(&ch.content, ctx.html_config.curly_quotes, Some(path)); + let fixed_content = utils::render_markdown_with_path( + &ch.content, + ctx.html_config.curly_quotes, + Some(path), + ctx.html_config.redirect, + ); if !ctx.is_index && ctx.html_config.print.page_break { // Add page break between chapters // See https://developer.mozilla.org/en-US/docs/Web/CSS/break-before and https://developer.mozilla.org/en-US/docs/Web/CSS/page-break-before @@ -66,7 +70,25 @@ impl HtmlHandlebars { print_content .push_str(r#"
"#); } - print_content.push_str(&fixed_content); + let path_id = { + let mut base = path.display().to_string(); + if base.ends_with(".md") { + base.replace_range(base.len() - 3.., ""); + } + &base + .replace("/", "-") + .replace("\\", "-") + .to_ascii_lowercase() + }; + + // We have to build header links in advance so that we can know the ranges + // for the headers in one page. + // Insert a dummy div to make sure that we can locate the specific page. + print_content.push_str(&(format!(r#"
"#, &path_id))); + print_content.push_str(&build_header_links( + &build_print_element_id(&fixed_content, &path_id), + Some(path_id), + )); // Update the context with data for this file let ctx_path = path @@ -211,7 +233,7 @@ impl HtmlHandlebars { code_config: &Code, edition: Option, ) -> String { - let rendered = build_header_links(&rendered); + let rendered = build_header_links(&rendered, None); let rendered = fix_code_blocks(&rendered); let rendered = add_playground_pre(&rendered, playground_config, edition); let rendered = hide_lines(&rendered, code_config); @@ -809,9 +831,43 @@ fn make_data( Ok(data) } +/// Goes through part of the rendered print page HTML, +/// add path id prefix to all the elements id as well as footnote links. +fn build_print_element_id(html: &str, path_id: &str) -> String { + let all_id = Regex::new(r#"(<[^>]*?id=")([^"]+?)""#).unwrap(); + let footnote_id = Regex::new( + r##"(]*?class="footnote-reference"[^>]*?>[^<]*?]*?href="#)([^"]+?)""##, + ) + .unwrap(); + + if path_id.is_empty() { + return html.to_string(); + } + + let temp_html = all_id + .replace_all(html, |caps: &Captures<'_>| { + let mut fixed = String::new(); + fixed.push_str(&path_id); + fixed.push_str("-"); + fixed.push_str(&caps[2]); + format!("{}{}\"", &caps[1], fixed) + }) + .into_owned(); + + footnote_id + .replace_all(&temp_html, |caps: &Captures<'_>| { + let mut fixed = String::new(); + fixed.push_str(&path_id); + fixed.push_str("-"); + fixed.push_str(&caps[2]); + format!("{}{}\"", &caps[1], fixed) + }) + .into_owned() +} + /// Goes through the rendered HTML, making sure all header tags have /// an anchor respectively so people can link to sections directly. -fn build_header_links(html: &str) -> String { +fn build_header_links(html: &str, path_id: Option<&str>) -> String { static BUILD_HEADER_LINKS: Lazy = Lazy::new(|| { Regex::new(r#"(.*?)"#).unwrap() }); @@ -840,6 +896,7 @@ fn build_header_links(html: &str) -> String { caps.get(2).map(|x| x.as_str().to_string()), caps.get(3).map(|x| x.as_str().to_string()), &mut id_counter, + path_id, ) }) .into_owned() @@ -847,14 +904,21 @@ fn build_header_links(html: &str) -> String { /// Insert a sinle link into a header, making sure each link gets its own /// unique ID by appending an auto-incremented number (if necessary). +/// +/// For `print.html`, we will add a path id prefix. fn insert_link_into_header( level: usize, content: &str, id: Option, classes: Option, id_counter: &mut HashMap, + path_id: Option<&str>, ) -> String { - let id = id.unwrap_or_else(|| utils::unique_id_from_content(content, id_counter)); + let id = if let Some(path_id) = path_id { + id.unwrap_or_else(|| utils::unique_id_from_content_with_path(content, id_counter, path_id)) + } else { + id.unwrap_or_else(|| utils::unique_id_from_content(content, id_counter)) + }; let classes = classes .map(|s| format!(" class=\"{s}\"")) .unwrap_or_default(); @@ -1142,7 +1206,7 @@ mod tests { ]; for (src, should_be) in inputs { - let got = build_header_links(src); + let got = build_header_links(src, None); assert_eq!(got, should_be); } } diff --git a/src/utils/mod.rs b/src/utils/mod.rs index 9156916ea6..bc586e5645 100644 --- a/src/utils/mod.rs +++ b/src/utils/mod.rs @@ -12,7 +12,7 @@ use regex::Regex; use std::borrow::Cow; use std::collections::HashMap; use std::fmt::Write; -use std::path::Path; +use std::path::{Component, Path, PathBuf}; pub use self::string::{ take_anchored_lines, take_lines, take_rustdoc_include_anchored_lines, @@ -83,29 +83,114 @@ pub fn unique_id_from_content(content: &str, id_counter: &mut HashMap, + path_id: &str, +) -> String { + unique_id_from_content(&format!("{} {}", path_id, content), id_counter) +} + +/// https://stackoverflow.com/a/68233480 +/// Improve the path to try remove and solve .. token. +/// +/// This assumes that `a/b/../c` is `a/c` which might be different from +/// what the OS would have chosen when b is a link. This is OK +/// for broot verb arguments but can't be generally used elsewhere +/// +/// This function ensures a given path ending with '/' will also +/// end with '/' after normalization. +pub fn normalize_path>(path: P) -> String { + let ends_with_slash = path.as_ref().to_str().map_or(false, |s| s.ends_with('/')); + let mut normalized = PathBuf::new(); + for component in path.as_ref().components() { + match &component { + Component::ParentDir => { + if !normalized.pop() { + normalized.push(component); + } + } + Component::CurDir => {} + _ => { + normalized.push(component); + } + } + } + if ends_with_slash { + normalized.push(""); + } + normalized.to_str().unwrap().replace("\\", "/").to_string() +} + +/// Return the normalized path id. +pub fn normalize_path_id(mut path: String) -> String { + path = path + .replace("/", "-") + .replace(".html#", "-") + .replace("#", "-") + .to_ascii_lowercase(); + if path.ends_with(".html") { + path.replace_range(path.len() - 5.., ""); + } + path +} + /// Fix links to the correct location. /// /// This adjusts links, such as turning `.md` extensions to `.html`. /// /// `path` is the path to the page being rendered relative to the root of the /// book. This is used for the `print.html` page so that links on the print -/// page go to the original location. Normal page rendering sets `path` to -/// None. Ideally, print page links would link to anchors on the print page, -/// but that is very difficult. -fn adjust_links<'a>(event: Event<'a>, path: Option<&Path>) -> Event<'a> { +/// page go to the anchors that has a path id prefix. Normal page rendering +/// sets `path` to None. +fn adjust_links<'a>( + event: Event<'a>, + path: Option<&Path>, + redirects: HashMap, +) -> Event<'a> { static SCHEME_LINK: Lazy = Lazy::new(|| Regex::new(r"^[a-z][a-z0-9+.-]*:").unwrap()); - static MD_LINK: Lazy = - Lazy::new(|| Regex::new(r"(?P.*)\.md(?P#.*)?").unwrap()); + static HTML_MD_LINK: Lazy = + Lazy::new(|| Regex::new(r"(?P.*)\.(html|md)(?P#.*)?").unwrap()); fn fix<'a>(dest: CowStr<'a>, path: Option<&Path>) -> CowStr<'a> { + // Don't modify links with schemes like `https`. + if !SCHEME_LINK.is_match(&dest) { + // This is a relative link, adjust it as necessary. + let mut fixed_link = String::new(); + if let Some(path) = path { + let base = path + .parent() + .expect("path can't be empty") + .to_str() + .expect("utf-8 paths only"); + if !base.is_empty() { + write!(fixed_link, "{}/", base).unwrap(); + } + } + fixed_link.push_str(&dest); + return CowStr::from(fixed_link); + } + dest + } + + fn fix_a_links<'a>( + dest: CowStr<'a>, + path: Option<&Path>, + redirects: HashMap, + ) -> CowStr<'a> { if dest.starts_with('#') { // Fragment-only link. if let Some(path) = path { let mut base = path.display().to_string(); if base.ends_with(".md") { - base.replace_range(base.len() - 3.., ".html"); + base.replace_range(base.len() - 3.., ""); } - return format!("{}{}", base, dest).into(); + return format!( + "#{}{}", + normalize_path_id(normalize_path(base)), + dest.replace("#", "-") + ) + .into(); } else { return dest; } @@ -125,7 +210,7 @@ fn adjust_links<'a>(event: Event<'a>, path: Option<&Path>) -> Event<'a> { } } - if let Some(caps) = MD_LINK.captures(&dest) { + if let Some(caps) = HTML_MD_LINK.captures(&dest) { fixed_link.push_str(&caps["link"]); fixed_link.push_str(".html"); if let Some(anchor) = caps.name("anchor") { @@ -134,12 +219,84 @@ fn adjust_links<'a>(event: Event<'a>, path: Option<&Path>) -> Event<'a> { } else { fixed_link.push_str(&dest); }; + + let mut normalized_path = normalize_path(&fixed_link); + + // Judge if the html link is inside the book. + if !normalized_path.starts_with("../") && !normalized_path.contains("/../") { + // In `print.html`, print page links would all link to anchors on the print page. + if let Some(_) = path { + // Fix redirect links + let normalized_path_split: Vec<&str> = normalized_path.split('#').collect(); + for (original, redirect) in &redirects { + if normalize_path(original.trim_start_matches('/')) + .eq_ignore_ascii_case(&normalized_path) + || normalize_path(original.trim_start_matches('/')) + .eq_ignore_ascii_case(&normalized_path_split[0]) + { + let mut unnormalized_path = String::new(); + if SCHEME_LINK.is_match(&redirect) { + unnormalized_path = redirect.to_string(); + } else { + let base = PathBuf::from(normalized_path_split[0]) + .parent() + .expect("path can't be empty") + .to_str() + .expect("utf-8 paths only") + .to_owned(); + + let normalized_base = + normalize_path(base).trim_matches('/').to_owned(); + if !normalized_base.is_empty() { + write!(unnormalized_path, "{}/{}", normalized_base, redirect) + .unwrap(); + } else { + unnormalized_path = + redirect.to_string().trim_start_matches('/').to_string(); + } + } + + // original without anchors, need to append link anchors + if !original.contains("#") { + for i in 1..normalized_path_split.len() { + if !unnormalized_path.contains("#") { + unnormalized_path.push('#'); + } else { + unnormalized_path.push('-'); + } + unnormalized_path.push_str(normalized_path_split[i]); + } + } + + if !SCHEME_LINK.is_match(&redirect) { + normalized_path = normalize_path(unnormalized_path); + } else { + return CowStr::from(unnormalized_path); + } + break; + } + } + // Check again to make sure anchors are the html links inside the book. + if normalized_path.starts_with("../") || normalized_path.contains("/../") { + return CowStr::from(normalized_path); + } + let mut fixed_anchor_for_print = String::new(); + fixed_anchor_for_print.push_str("#"); + fixed_anchor_for_print.push_str(&normalize_path_id(normalized_path)); + return CowStr::from(fixed_anchor_for_print); + } + } + // In normal page rendering, links to anchors on another page. return CowStr::from(fixed_link); } dest } - fn fix_html<'a>(html: CowStr<'a>, path: Option<&Path>) -> CowStr<'a> { + fn fix_html<'a>( + html: CowStr<'a>, + path: Option<&Path>, + redirects: HashMap, + ) -> CowStr<'a> { // This is a terrible hack, but should be reasonably reliable. Nobody // should ever parse a tag with a regex. However, there isn't anything // in Rust that I know of that is suitable for handling partial html @@ -148,33 +305,44 @@ fn adjust_links<'a>(event: Event<'a>, path: Option<&Path>) -> Event<'a> { // There are dozens of HTML tags/attributes that contain paths, so // feel free to add more tags if desired; these are the only ones I // care about right now. - static HTML_LINK: Lazy = - Lazy::new(|| Regex::new(r#"(<(?:a|img) [^>]*?(?:src|href)=")([^"]+?)""#).unwrap()); + static A_LINK: Lazy = + Lazy::new(|| Regex::new(r#"(]*?href=")([^"]+?)""#).unwrap()); + static IMG_LINK: Lazy = + Lazy::new(|| Regex::new(r#"(]*?src=")([^"]+?)""#).unwrap()); - HTML_LINK + let temp_html = IMG_LINK .replace_all(&html, |caps: ®ex::Captures<'_>| { let fixed = fix(caps[2].into(), path); format!("{}{}\"", &caps[1], fixed) }) + .into_owned(); + + A_LINK + .replace_all(&temp_html, |caps: ®ex::Captures<'_>| { + let fixed = fix_a_links(caps[2].into(), path, redirects.clone()); + format!("{}{}\"", &caps[1], fixed) + }) .into_owned() .into() } match event { - Event::Start(Tag::Link(link_type, dest, title)) => { - Event::Start(Tag::Link(link_type, fix(dest, path), title)) - } + Event::Start(Tag::Link(link_type, dest, title)) => Event::Start(Tag::Link( + link_type, + fix_a_links(dest, path, redirects), + title, + )), Event::Start(Tag::Image(link_type, dest, title)) => { Event::Start(Tag::Image(link_type, fix(dest, path), title)) } - Event::Html(html) => Event::Html(fix_html(html, path)), + Event::Html(html) => Event::Html(fix_html(html, path, redirects)), _ => event, } } /// Wrapper around the pulldown-cmark parser for rendering markdown to HTML. pub fn render_markdown(text: &str, curly_quotes: bool) -> String { - render_markdown_with_path(text, curly_quotes, None) + render_markdown_with_path(text, curly_quotes, None, HashMap::new()) } pub fn new_cmark_parser(text: &str, curly_quotes: bool) -> Parser<'_, '_> { @@ -190,12 +358,17 @@ pub fn new_cmark_parser(text: &str, curly_quotes: bool) -> Parser<'_, '_> { Parser::new_ext(text, opts) } -pub fn render_markdown_with_path(text: &str, curly_quotes: bool, path: Option<&Path>) -> String { +pub fn render_markdown_with_path( + text: &str, + curly_quotes: bool, + path: Option<&Path>, + redirects: HashMap, +) -> String { let mut s = String::with_capacity(text.len() * 3 / 2); let p = new_cmark_parser(text, curly_quotes); let events = p .map(clean_codeblock_headers) - .map(|event| adjust_links(event, path)) + .map(|event| adjust_links(event, path, redirects.clone())) .flat_map(|event| { let (a, b) = wrap_tables(event); a.into_iter().chain(b) diff --git a/tests/rendered_output.rs b/tests/rendered_output.rs index 7626b9e8ac..6c45464313 100644 --- a/tests/rendered_output.rs +++ b/tests/rendered_output.rs @@ -126,11 +126,11 @@ fn check_correct_relative_links_in_print_page() { assert_contains_strings( first.join("print.html"), &[ - r##"the first section,"##, + r##"the first section,"##, r##"outside"##, r##"Some image"##, - r##"fragment link"##, - r##"HTML Link"##, + r##"fragment link"##, + r##"HTML Link"##, r##"raw html"##, ], );