From 8a218739ec26643b2ef7e57fdbc51750ef590202 Mon Sep 17 00:00:00 2001 From: Kornel Date: Thu, 31 Oct 2024 19:47:32 +0000 Subject: [PATCH] HTML doesn't allow changing encoding multiple times --- src/rewriter/mod.rs | 44 ++++++++++++++++++++++++++------------------ 1 file changed, 26 insertions(+), 18 deletions(-) diff --git a/src/rewriter/mod.rs b/src/rewriter/mod.rs index 106c6175..0031f4d6 100644 --- a/src/rewriter/mod.rs +++ b/src/rewriter/mod.rs @@ -261,22 +261,29 @@ impl<'h, O: OutputSink, H: HandlerTypes> Debug for HtmlRewriter<'h, O, H> { fn handler_adjust_charset_on_meta_tag<'h, H: HandlerTypes>( encoding: SharedEncoding, ) -> (Cow<'h, crate::Selector>, ElementContentHandlers<'h, H>) { + let mut set_once = false; let handler = move |el: &mut Element<'_, '_, H>| { - let attr_charset = el - .get_attribute("charset") - .and_then(|cs| Encoding::for_label_no_replacement(cs.as_bytes())) - .and_then(AsciiCompatibleEncoding::new); - - let attr_http_equiv = el - .get_attribute("http-equiv") - .filter(|http_equiv| http_equiv.eq_ignore_ascii_case("Content-Type")) - .and_then(|_| el.get_attribute("content")) - .and_then(|ct| ct.parse::().ok()) - .as_ref() - .and_then(AsciiCompatibleEncoding::from_mimetype); - - if let Some(charset) = attr_charset.or(attr_http_equiv) { + if set_once { + return Ok(()); + } + + let charset = el.get_attribute("charset").and_then(|cs| { + AsciiCompatibleEncoding::new(Encoding::for_label_no_replacement(cs.as_bytes())?) + }); + + let charset = charset.or_else(|| { + el.get_attribute("http-equiv") + .filter(|http_equiv| http_equiv.eq_ignore_ascii_case("Content-Type")) + .and_then(|_| { + AsciiCompatibleEncoding::from_mimetype( + &el.get_attribute("content")?.parse::().ok()?, + ) + }) + }); + + if let Some(charset) = charset { encoding.set(charset); + set_once = true; } Ok(()) @@ -734,10 +741,11 @@ mod tests { }; let html: Vec = [ - r#"I love "#.as_bytes().to_vec(), - vec![0xd5, 0xec, 0xb3, 0xcb, 0xdc], - br"!".to_vec(), - ].into_iter().concat(); + r#""#.as_bytes(), + br#"I love "#, // second one should be ignored + &[0xd5, 0xec, 0xb3, 0xcb, 0xdc], + br"!", + ].concat(); let expected: Vec = html .iter()