From a7b897b3502a0309bd91e74c8dfcc515c658b6e2 Mon Sep 17 00:00:00 2001 From: Kornel Date: Fri, 22 Nov 2024 16:27:39 +0000 Subject: [PATCH] Clarifications --- src/html/mod.rs | 9 ++- src/rewritable_units/text_encoder.rs | 117 ++++++++++++++++++--------- 2 files changed, 82 insertions(+), 44 deletions(-) diff --git a/src/html/mod.rs b/src/html/mod.rs index 73dfe336..7a33027e 100644 --- a/src/html/mod.rs +++ b/src/html/mod.rs @@ -18,12 +18,13 @@ pub use self::text_type::TextType; pub(crate) fn escape_body_text(mut content: &str, output_handler: &mut impl FnMut(&str)) { loop { if let Some(pos) = memchr3(b'&', b'<', b'>', content.as_bytes()) { - let Some((chunk_before, (matched, rest))) = content - .split_at_checked(pos) - .and_then(|(before, rest)| Some((before, rest.split_at_checked(1)?))) - else { + let Some((chunk_before, rest)) = content.split_at_checked(pos) else { return; }; + let Some((matched, rest)) = rest.split_at_checked(1) else { + return; + }; + content = rest; let matched = matched.as_bytes()[0]; diff --git a/src/rewritable_units/text_encoder.rs b/src/rewritable_units/text_encoder.rs index 3fd002d0..c7b969da 100644 --- a/src/rewritable_units/text_encoder.rs +++ b/src/rewritable_units/text_encoder.rs @@ -125,9 +125,36 @@ impl<'output_handler> StreamingHandlerSinkInner<'output_handler> { } } +/// Temporary buffer used for encoding_rs output enum Buffer { + /// Stack buffer avoids heap allocation, and lets go back quickly to the ASCII fast path. + Stack([u8; 63]), // leave a byte for the enum's tag, so that the enum has 64-byte size + /// Used when encoding_rs asks for a larger buffer, or the content is large enough for small buffer roundtrips to add up Heap(Vec), - Stack([u8; 63]), // leave a byte for the tag +} + +impl Buffer { + /// Arbitrary limit when to switch from a small on-stack buffer to heap allocation + const CONTENT_WRITE_LENGTH_LONG_ENOUGH_TO_USE_LARGER_BUFFER: usize = 1 << 20; + + /// Arbitrary, about a page size + const DEFAULT_HEAP_BUFFER_SIZE: usize = 4096; + + fn buffer_for_length(&mut self, len: usize) -> &mut [u8] { + let buffer = match self { + Buffer::Heap(buf) => buf.as_mut_slice(), + // Long non-ASCII content could take lots of roundtrips through the encoder + buf if content.len() >= Self::CONTENT_WRITE_LENGTH_LONG_ENOUGH_TO_USE_LARGER_BUFFER => { + *buf = Buffer::Heap(vec![0; Self::DEFAULT_HEAP_BUFFER_SIZE]); + match buf { + Buffer::Heap(buf) => buf.as_mut(), + _ => unreachable!(), + } + } + Buffer::Stack(buf) => buf.as_mut_slice(), + }; + buffer + } } struct TextEncoder { @@ -152,6 +179,7 @@ impl TextEncoder { #[inline(never)] fn encode(&mut self, mut content: &str, output_handler: &mut dyn FnMut(&[u8])) { loop { + // First, fast path for ASCII-only prefix debug_assert!(!self.encoder.has_pending_state()); // ASCII-compatible encodings are not supposed to have it let ascii_len = Encoding::ascii_valid_up_to(content.as_bytes()); if let Some((ascii, remainder)) = content.split_at_checked(ascii_len) { @@ -164,20 +192,12 @@ impl TextEncoder { content = remainder; } - let buffer = match &mut self.buffer { - Buffer::Heap(buf) => buf.as_mut_slice(), - // Long non-ASCII content could take lots of roundtrips through the encoder - buf if content.len() >= 1 << 20 => { - *buf = Buffer::Heap(vec![0; 4096]); - match buf { - Buffer::Heap(buf) => buf.as_mut(), - _ => unreachable!(), - } - } - Buffer::Stack(buf) => buf.as_mut_slice(), - }; + // Now the content starts with non-ASCII byte, so encoding_rs may need a buffer to convert to. + let buffer = self.buffer.buffer_for_length(content.len()); + // last == true is needed only for the stateful ISO-JP encoding, which this library doesn't allow let (result, read, written, _) = self.encoder.encode_from_utf8(content, buffer, false); + if written > 0 && written <= buffer.len() { (output_handler)(&buffer[..written]); } @@ -185,20 +205,21 @@ impl TextEncoder { return; } content = &content[read..]; + match result { CoderResult::InputEmpty => { debug_assert!(content.is_empty()); return; } + // we've made progress, and can try again without growing the buffer + CoderResult::OutputFull if written > 0 => {} CoderResult::OutputFull => { - match &mut self.buffer { - Buffer::Heap(buf) if buf.len() >= 1024 => { - if written == 0 { - panic!("encoding_rs infinite loop"); // encoding_rs only needs a dozen bytes - } - } - buf => *buf = Buffer::Heap(vec![0; 1024]), - } + // encoding_rs only needs a dozen bytes. If a large buffer is insufficient, it must be a bug. + assert!( + buffer.len() < Buffer::DEFAULT_HEAP_BUFFER_SIZE, + "encoding_rs infinite loop" + ); + self.buffer = Buffer::Heap(vec![0; Buffer::DEFAULT_HEAP_BUFFER_SIZE]); } } } @@ -213,45 +234,60 @@ const fn utf8_width(b: u8) -> u8 { b.leading_ones() as _ } +/// Stitches together UTF-8 from byte writes that may split UTF-8 sequences into multiple fragments struct IncompleteUtf8Resync { - bytes: [u8; 4], - len: u8, + /// Buffers an incomplete UTF-8 sequence + char_bytes: [u8; 4], + /// Number of bytes in `bytes` + char_len: u8, } impl IncompleteUtf8Resync { pub fn new() -> Self { Self { - bytes: [0; 4], - len: 0, + char_bytes: [0; 4], + char_len: 0, } } + /// Returns a valid UTF-8 fragment, and not-yet-checked remainder of the bytes. + /// + /// Call `discard_incomplete()` after the last write to flush any partially-written chars. pub fn utf8_bytes_to_slice<'buf, 'src: 'buf>( &'buf mut self, mut content: &'src [u8], ) -> Result<(&'buf str, &'src [u8]), Utf8Error> { - if self.len > 0 { - let mut found_end_byte = false; + // There may be incomplete char buffered from previous write, that must be continued now + if self.char_len > 0 { + let mut must_emit_now = false; while let Some((&next_byte, rest)) = content.split_first() { if is_continuation_byte(next_byte) { - if let Some(buf) = self.bytes.get_mut(self.len as usize) { + if let Some(buf) = self.char_bytes.get_mut(self.char_len as usize) { *buf = next_byte; - self.len += 1; + self.char_len += 1; content = rest; continue; } + // overlong sequences fall here, and will be checked when the char_bytes is flushed } - found_end_byte = true; + must_emit_now = true; break; } - if found_end_byte || self.len >= utf8_width(self.bytes[0]) { - let char_buf = self.bytes.get(..self.len as usize).ok_or(Utf8Error)?; - self.len = 0; - std::str::from_utf8(char_buf) - .map_err(|_| Utf8Error) - .map(|ch| (ch, content)) + if self.char_len >= utf8_width(self.char_bytes[0]) { + must_emit_now = true; + } + + if must_emit_now { + let char_buf = self + .char_bytes + .get(..self.char_len as usize) + .ok_or(Utf8Error)?; + self.char_len = 0; + let ch = std::str::from_utf8(char_buf).map_err(|_| Utf8Error)?; + Ok((ch, content)) } else { + // a partial write has ended without fully completing a char (it's possible to write 1 byte at a time) debug_assert!(content.is_empty()); Ok(("", b"")) } @@ -264,11 +300,12 @@ impl IncompleteUtf8Resync { let (valid, invalid) = content .split_at_checked(err.valid_up_to()) .ok_or(Utf8Error)?; - self.bytes + // save the incomplete bytes from the end for the next write + self.char_bytes .get_mut(..invalid.len()) .ok_or(Utf8Error)? .copy_from_slice(invalid); - self.len = invalid.len() as _; + self.char_len = invalid.len() as _; // valid_up_to promises it is valid debug_assert!(std::str::from_utf8(valid).is_ok()); let valid = unsafe { std::str::from_utf8_unchecked(valid) }; @@ -280,8 +317,8 @@ impl IncompleteUtf8Resync { /// True if there were incomplete invalid bytes in the buffer pub fn discard_incomplete(&mut self) -> bool { - if self.len > 0 { - self.len = 0; + if self.char_len > 0 { + self.char_len = 0; true } else { false