From 94e339c3ce019dd0e7586c0b335f4144e8b132f8 Mon Sep 17 00:00:00 2001 From: Yumi Izumi Date: Fri, 6 Dec 2024 14:58:59 -0500 Subject: [PATCH] fix: hrefs with whitespace wrapped in <> Refs: #34 --- Cargo.lock | 9 +++++++++ Cargo.toml | 3 +++ src/parser.rs | 1 + src/structs.rs | 10 ++++++++++ src/to_md.rs | 25 +++++++++++++++++++++++-- 5 files changed, 46 insertions(+), 2 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 79a68c1..b5381e6 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5,3 +5,12 @@ version = 3 [[package]] name = "html2md-rs" version = "0.10.0" +dependencies = [ + "percent-encoding", +] + +[[package]] +name = "percent-encoding" +version = "2.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e3148f5046208a5d56bcfc03053e3ca6334e51da8dfb19b6cdc8b306fae3283e" diff --git a/Cargo.toml b/Cargo.toml index ed8c3b5..3a365b9 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -11,3 +11,6 @@ repository = "https://github.com/izyumidev/html2md-rs" license = "MIT" keywords = ["html", "markdown", "html2md", "html-to-markdown", "html-to-md"] categories = ["parsing"] + +[dependencies] +percent-encoding = "2.3.1" diff --git a/src/parser.rs b/src/parser.rs index 01a5686..7de0773 100644 --- a/src/parser.rs +++ b/src/parser.rs @@ -607,6 +607,7 @@ fn issue_31() { attributes: Some(Attributes { id: None, class: None, + href: None, attributes: std::collections::HashMap::from([ ( "src".to_string(), diff --git a/src/structs.rs b/src/structs.rs index b53619d..da09458 100644 --- a/src/structs.rs +++ b/src/structs.rs @@ -138,6 +138,7 @@ impl Node { pub struct Attributes { pub(crate) id: Option<String>, pub(crate) class: Option<String>, + pub(crate) href: Option<String>, pub(crate) attributes: HashMap<String, AttributeValues>, } @@ -147,6 +148,7 @@ impl Attributes { Attributes { id: None, class: None, + href: None, attributes: HashMap::new(), } } @@ -173,6 +175,14 @@ impl Attributes { self.class.as_ref() } + /// Return the href attribute of the element + pub fn get_href(&self) -> Option<String> { + 
self.get("href").and_then(|value| match value { + AttributeValues::String(href) => Some(href), + _ => None, + }) + } + /// Returns the attributes of the element pub fn contains(&self, key: &str) -> bool { match key { diff --git a/src/to_md.rs b/src/to_md.rs index 59b3be4..0ada48a 100644 --- a/src/to_md.rs +++ b/src/to_md.rs @@ -124,10 +124,19 @@ pub fn to_md_with_config(node: Node, config: &ToMdConfig) -> String { tail.push('*'); } A => { - if let Some(link) = node.attributes.as_ref().and_then(|attrs| attrs.get("href")) + if let Some(link) = node.attributes.as_ref().and_then(|attrs| attrs.get_href()) { + let link = percent_encoding::percent_decode(link.as_bytes()) + .decode_utf8() + .map(|s| s.to_string()) + .unwrap_or(link); + res.push('['); - tail.push_str(&format!("]({})", link)); + if link.contains(' ') { + tail.push_str(&format!("](<{}>)", link)); + } else { + tail.push_str(&format!("]({})", link)); + } } else { res.push('['); tail.push(']'); @@ -231,6 +240,18 @@ pub fn to_md_with_config(node: Node, config: &ToMdConfig) -> String { res } +// https://github.com/izyuumi/html2md-rs/issues/34 +#[test] +fn issue34() { + let input = "<p><a href=\"/my%20uri\">link</a></p>"; + let expected = "[link](</my uri>)\n"; + assert_eq!(safe_from_html_to_md(input.to_string()).unwrap(), expected); + + let input = "<p><a href=\"/myuri\">link</a></p>"; + let expected = "[link](/myuri)\n"; + assert_eq!(safe_from_html_to_md(input.to_string()).unwrap(), expected); +} + /// Converts a string of HTML to a markdown string. /// /// Panics if the HTML is invalid.