From a7b897b3502a0309bd91e74c8dfcc515c658b6e2 Mon Sep 17 00:00:00 2001
From: Kornel <kornel@cloudflare.com>
Date: Fri, 22 Nov 2024 16:27:39 +0000
Subject: [PATCH] Clarifications

---
 src/html/mod.rs                      |   9 ++-
 src/rewritable_units/text_encoder.rs | 117 ++++++++++++++++++---------
 2 files changed, 82 insertions(+), 44 deletions(-)
diff --git a/src/html/mod.rs b/src/html/mod.rs
index 73dfe336..7a33027e 100644
--- a/src/html/mod.rs
+++ b/src/html/mod.rs
@@ -18,12 +18,13 @@ pub use self::text_type::TextType;
 pub(crate) fn escape_body_text(mut content: &str, output_handler: &mut impl FnMut(&str)) {
     loop {
         if let Some(pos) = memchr3(b'&', b'<', b'>', content.as_bytes()) {
-            let Some((chunk_before, (matched, rest))) = content
-                .split_at_checked(pos)
-                .and_then(|(before, rest)| Some((before, rest.split_at_checked(1)?)))
-            else {
+            let Some((chunk_before, rest)) = content.split_at_checked(pos) else {
                 return;
             };
+            let Some((matched, rest)) = rest.split_at_checked(1) else {
+                return;
+            };
+
             content = rest;
             let matched = matched.as_bytes()[0];
 
diff --git a/src/rewritable_units/text_encoder.rs b/src/rewritable_units/text_encoder.rs
index 3fd002d0..c7b969da 100644
--- a/src/rewritable_units/text_encoder.rs
+++ b/src/rewritable_units/text_encoder.rs
@@ -125,9 +125,36 @@ impl<'output_handler> StreamingHandlerSinkInner<'output_handler> {
     }
 }
 
+/// Temporary buffer used for encoding_rs output
 enum Buffer {
+    /// Stack buffer avoids heap allocation, and lets us go back quickly to the ASCII fast path.
+    Stack([u8; 63]), // leave a byte for the enum's tag, so that the enum has 64-byte size
+    /// Used when encoding_rs asks for a larger buffer, or the content is large enough for small buffer roundtrips to add up
     Heap(Vec<u8>),
-    Stack([u8; 63]), // leave a byte for the tag
+}
+
+impl Buffer {
+    /// Arbitrary limit when to switch from a small on-stack buffer to heap allocation
+    const CONTENT_WRITE_LENGTH_LONG_ENOUGH_TO_USE_LARGER_BUFFER: usize = 1 << 20;
+
+    /// Arbitrary, about a page size
+    const DEFAULT_HEAP_BUFFER_SIZE: usize = 4096;
+
+    fn buffer_for_length(&mut self, len: usize) -> &mut [u8] {
+        let buffer = match self {
+            Buffer::Heap(buf) => buf.as_mut_slice(),
+            // Still on the stack buffer, but `len` bytes of non-ASCII content would take lots of roundtrips through the encoder: switch to the heap
+            buf if len >= Self::CONTENT_WRITE_LENGTH_LONG_ENOUGH_TO_USE_LARGER_BUFFER => {
+                *buf = Buffer::Heap(vec![0; Self::DEFAULT_HEAP_BUFFER_SIZE]);
+                match buf {
+                    Buffer::Heap(buf) => buf.as_mut(),
+                    _ => unreachable!(),
+                }
+            }
+            Buffer::Stack(buf) => buf.as_mut_slice(),
+        };
+        buffer
+    }
 }
 
 struct TextEncoder {
@@ -152,6 +179,7 @@ impl TextEncoder {
     #[inline(never)]
     fn encode(&mut self, mut content: &str, output_handler: &mut dyn FnMut(&[u8])) {
         loop {
+            // First, fast path for ASCII-only prefix
             debug_assert!(!self.encoder.has_pending_state()); // ASCII-compatible encodings are not supposed to have it
             let ascii_len = Encoding::ascii_valid_up_to(content.as_bytes());
             if let Some((ascii, remainder)) = content.split_at_checked(ascii_len) {
@@ -164,20 +192,12 @@ impl TextEncoder {
                 content = remainder;
             }
 
-            let buffer = match &mut self.buffer {
-                Buffer::Heap(buf) => buf.as_mut_slice(),
-                // Long non-ASCII content could take lots of roundtrips through the encoder
-                buf if content.len() >= 1 << 20 => {
-                    *buf = Buffer::Heap(vec![0; 4096]);
-                    match buf {
-                        Buffer::Heap(buf) => buf.as_mut(),
-                        _ => unreachable!(),
-                    }
-                }
-                Buffer::Stack(buf) => buf.as_mut_slice(),
-            };
+            // Now the content starts with a non-ASCII byte, so encoding_rs may need a buffer to convert to.
+            let buffer = self.buffer.buffer_for_length(content.len());
 
+            // last == true is needed only for the stateful ISO-2022-JP encoding, which this library doesn't allow
             let (result, read, written, _) = self.encoder.encode_from_utf8(content, buffer, false);
+
             if written > 0 && written <= buffer.len() {
                 (output_handler)(&buffer[..written]);
             }
@@ -185,20 +205,21 @@ impl TextEncoder {
                 return;
             }
             content = &content[read..];
+
             match result {
                 CoderResult::InputEmpty => {
                     debug_assert!(content.is_empty());
                     return;
                 }
+                // we've made progress, and can try again without growing the buffer
+                CoderResult::OutputFull if written > 0 => {}
                 CoderResult::OutputFull => {
-                    match &mut self.buffer {
-                        Buffer::Heap(buf) if buf.len() >= 1024 => {
-                            if written == 0 {
-                                panic!("encoding_rs infinite loop"); // encoding_rs only needs a dozen bytes
-                            }
-                        }
-                        buf => *buf = Buffer::Heap(vec![0; 1024]),
-                    }
+                    // encoding_rs only needs a dozen bytes. If a large buffer is insufficient, it must be a bug.
+                    assert!(
+                        buffer.len() < Buffer::DEFAULT_HEAP_BUFFER_SIZE,
+                        "encoding_rs infinite loop"
+                    );
+                    self.buffer = Buffer::Heap(vec![0; Buffer::DEFAULT_HEAP_BUFFER_SIZE]);
                 }
             }
         }
@@ -213,45 +234,60 @@ const fn utf8_width(b: u8) -> u8 {
     b.leading_ones() as _
 }
 
+/// Stitches together UTF-8 from byte writes that may split UTF-8 sequences into multiple fragments
 struct IncompleteUtf8Resync {
-    bytes: [u8; 4],
-    len: u8,
+    /// Buffers an incomplete UTF-8 sequence
+    char_bytes: [u8; 4],
+    /// Number of bytes currently buffered in `char_bytes`
+    char_len: u8,
 }
 
 impl IncompleteUtf8Resync {
     pub fn new() -> Self {
         Self {
-            bytes: [0; 4],
-            len: 0,
+            char_bytes: [0; 4],
+            char_len: 0,
         }
     }
 
+    /// Returns a valid UTF-8 fragment, and not-yet-checked remainder of the bytes.
+    ///
+    /// Call `discard_incomplete()` after the last write to drop any partially-written char and learn whether one was left over.
     pub fn utf8_bytes_to_slice<'buf, 'src: 'buf>(
         &'buf mut self,
         mut content: &'src [u8],
     ) -> Result<(&'buf str, &'src [u8]), Utf8Error> {
-        if self.len > 0 {
-            let mut found_end_byte = false;
+        // There may be incomplete char buffered from previous write, that must be continued now
+        if self.char_len > 0 {
+            let mut must_emit_now = false;
             while let Some((&next_byte, rest)) = content.split_first() {
                 if is_continuation_byte(next_byte) {
-                    if let Some(buf) = self.bytes.get_mut(self.len as usize) {
+                    if let Some(buf) = self.char_bytes.get_mut(self.char_len as usize) {
                         *buf = next_byte;
-                        self.len += 1;
+                        self.char_len += 1;
                         content = rest;
                         continue;
                     }
+                    // sequences longer than 4 bytes fall through here, and will be checked when `char_bytes` is emitted
                 }
-                found_end_byte = true;
+                must_emit_now = true;
                 break;
             }
 
-            if found_end_byte || self.len >= utf8_width(self.bytes[0]) {
-                let char_buf = self.bytes.get(..self.len as usize).ok_or(Utf8Error)?;
-                self.len = 0;
-                std::str::from_utf8(char_buf)
-                    .map_err(|_| Utf8Error)
-                    .map(|ch| (ch, content))
+            if self.char_len >= utf8_width(self.char_bytes[0]) {
+                must_emit_now = true;
+            }
+
+            if must_emit_now {
+                let char_buf = self
+                    .char_bytes
+                    .get(..self.char_len as usize)
+                    .ok_or(Utf8Error)?;
+                self.char_len = 0;
+                let ch = std::str::from_utf8(char_buf).map_err(|_| Utf8Error)?;
+                Ok((ch, content))
             } else {
+                // a partial write has ended without fully completing a char (it's possible to write 1 byte at a time)
                 debug_assert!(content.is_empty());
                 Ok(("", b""))
             }
@@ -264,11 +300,12 @@ impl IncompleteUtf8Resync {
                     let (valid, invalid) = content
                         .split_at_checked(err.valid_up_to())
                         .ok_or(Utf8Error)?;
-                    self.bytes
+                    // save the incomplete bytes from the end for the next write
+                    self.char_bytes
                         .get_mut(..invalid.len())
                         .ok_or(Utf8Error)?
                         .copy_from_slice(invalid);
-                    self.len = invalid.len() as _;
+                    self.char_len = invalid.len() as _;
                     // valid_up_to promises it is valid
                     debug_assert!(std::str::from_utf8(valid).is_ok());
                     let valid = unsafe { std::str::from_utf8_unchecked(valid) };
@@ -280,8 +317,8 @@ impl IncompleteUtf8Resync {
 
     /// True if there were incomplete invalid bytes in the buffer
     pub fn discard_incomplete(&mut self) -> bool {
-        if self.len > 0 {
-            self.len = 0;
+        if self.char_len > 0 {
+            self.char_len = 0;
             true
         } else {
             false