Skip to content

Commit

Permalink
Clarifications
Browse files Browse the repository at this point in the history
  • Loading branch information
kornelski committed Nov 22, 2024
1 parent 59164a0 commit 7fd8c21
Show file tree
Hide file tree
Showing 2 changed files with 82 additions and 44 deletions.
9 changes: 5 additions & 4 deletions src/html/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,12 +18,13 @@ pub use self::text_type::TextType;
pub(crate) fn escape_body_text(mut content: &str, output_handler: &mut impl FnMut(&str)) {
loop {
if let Some(pos) = memchr3(b'&', b'<', b'>', content.as_bytes()) {
let Some((chunk_before, (matched, rest))) = content
.split_at_checked(pos)
.and_then(|(before, rest)| Some((before, rest.split_at_checked(1)?)))
else {
let Some((chunk_before, rest)) = content.split_at_checked(pos) else {
return;
};
let Some((matched, rest)) = rest.split_at_checked(1) else {
return;
};

content = rest;
let matched = matched.as_bytes()[0];

Expand Down
117 changes: 77 additions & 40 deletions src/rewritable_units/text_encoder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -125,9 +125,36 @@ impl<'output_handler> StreamingHandlerSinkInner<'output_handler> {
}
}

/// Temporary buffer used for encoding_rs output
enum Buffer {
/// Stack buffer avoids heap allocation, and lets go back quickly to the ASCII fast path.
Stack([u8; 63]), // leave a byte for the enum's tag, so that the enum has 64-byte size
/// Used when encoding_rs asks for a larger buffer, or the content is large enough for small buffer roundtrips to add up
Heap(Vec<u8>),
Stack([u8; 63]), // leave a byte for the tag
}

impl Buffer {
/// Arbitrary limit when to switch from a small on-stack buffer to heap allocation
const CONTENT_WRITE_LENGTH_LONG_ENOUGH_TO_USE_LARGER_BUFFER: usize = 1 << 20;

/// Arbitrary, about a page size
const DEFAULT_HEAP_BUFFER_SIZE: usize = 4096;

fn buffer_for_length(&mut self, content_len: usize) -> &mut [u8] {
let buffer = match self {
Buffer::Heap(buf) => buf.as_mut_slice(),
// Long non-ASCII content could take lots of roundtrips through the encoder
buf if content_len >= Self::CONTENT_WRITE_LENGTH_LONG_ENOUGH_TO_USE_LARGER_BUFFER => {
*buf = Buffer::Heap(vec![0; Self::DEFAULT_HEAP_BUFFER_SIZE]);
match buf {
Buffer::Heap(buf) => buf.as_mut(),
_ => unreachable!(),
}
}
Buffer::Stack(buf) => buf.as_mut_slice(),
};
buffer
}
}

struct TextEncoder {
Expand All @@ -152,6 +179,7 @@ impl TextEncoder {
#[inline(never)]
fn encode(&mut self, mut content: &str, output_handler: &mut dyn FnMut(&[u8])) {
loop {
// First, fast path for ASCII-only prefix
debug_assert!(!self.encoder.has_pending_state()); // ASCII-compatible encodings are not supposed to have it
let ascii_len = Encoding::ascii_valid_up_to(content.as_bytes());
if let Some((ascii, remainder)) = content.split_at_checked(ascii_len) {
Expand All @@ -164,41 +192,34 @@ impl TextEncoder {
content = remainder;
}

let buffer = match &mut self.buffer {
Buffer::Heap(buf) => buf.as_mut_slice(),
// Long non-ASCII content could take lots of roundtrips through the encoder
buf if content.len() >= 1 << 20 => {
*buf = Buffer::Heap(vec![0; 4096]);
match buf {
Buffer::Heap(buf) => buf.as_mut(),
_ => unreachable!(),
}
}
Buffer::Stack(buf) => buf.as_mut_slice(),
};
// Now the content starts with non-ASCII byte, so encoding_rs may need a buffer to convert to.
let buffer = self.buffer.buffer_for_length(content.len());

// last == true is needed only for the stateful ISO-JP encoding, which this library doesn't allow
let (result, read, written, _) = self.encoder.encode_from_utf8(content, buffer, false);

if written > 0 && written <= buffer.len() {
(output_handler)(&buffer[..written]);
}
if read >= content.len() {
return;
}
content = &content[read..];

match result {
CoderResult::InputEmpty => {
debug_assert!(content.is_empty());
return;
}
// we've made progress, and can try again without growing the buffer
CoderResult::OutputFull if written > 0 => {}
CoderResult::OutputFull => {
match &mut self.buffer {
Buffer::Heap(buf) if buf.len() >= 1024 => {
if written == 0 {
panic!("encoding_rs infinite loop"); // encoding_rs only needs a dozen bytes
}
}
buf => *buf = Buffer::Heap(vec![0; 1024]),
}
// encoding_rs only needs a dozen bytes. If a large buffer is insufficient, it must be a bug.
assert!(
buffer.len() < Buffer::DEFAULT_HEAP_BUFFER_SIZE,
"encoding_rs infinite loop"
);
self.buffer = Buffer::Heap(vec![0; Buffer::DEFAULT_HEAP_BUFFER_SIZE]);
}
}
}
Expand All @@ -213,45 +234,60 @@ const fn utf8_width(b: u8) -> u8 {
b.leading_ones() as _
}

/// Stitches together UTF-8 from byte writes that may split UTF-8 sequences into multiple fragments
struct IncompleteUtf8Resync {
bytes: [u8; 4],
len: u8,
/// Buffers an incomplete UTF-8 sequence
char_bytes: [u8; 4],
/// Number of bytes in `bytes`
char_len: u8,
}

impl IncompleteUtf8Resync {
pub fn new() -> Self {
Self {
bytes: [0; 4],
len: 0,
char_bytes: [0; 4],
char_len: 0,
}
}

/// Returns a valid UTF-8 fragment, and not-yet-checked remainder of the bytes.
///
/// Call `discard_incomplete()` after the last write to flush any partially-written chars.
pub fn utf8_bytes_to_slice<'buf, 'src: 'buf>(
&'buf mut self,
mut content: &'src [u8],
) -> Result<(&'buf str, &'src [u8]), Utf8Error> {
if self.len > 0 {
let mut found_end_byte = false;
// There may be incomplete char buffered from previous write, that must be continued now
if self.char_len > 0 {
let mut must_emit_now = false;
while let Some((&next_byte, rest)) = content.split_first() {
if is_continuation_byte(next_byte) {
if let Some(buf) = self.bytes.get_mut(self.len as usize) {
if let Some(buf) = self.char_bytes.get_mut(self.char_len as usize) {
*buf = next_byte;
self.len += 1;
self.char_len += 1;
content = rest;
continue;
}
// overlong sequences fall here, and will be checked when the char_bytes is flushed
}
found_end_byte = true;
must_emit_now = true;
break;
}

if found_end_byte || self.len >= utf8_width(self.bytes[0]) {
let char_buf = self.bytes.get(..self.len as usize).ok_or(Utf8Error)?;
self.len = 0;
std::str::from_utf8(char_buf)
.map_err(|_| Utf8Error)
.map(|ch| (ch, content))
if self.char_len >= utf8_width(self.char_bytes[0]) {
must_emit_now = true;
}

if must_emit_now {
let char_buf = self
.char_bytes
.get(..self.char_len as usize)
.ok_or(Utf8Error)?;
self.char_len = 0;
let ch = std::str::from_utf8(char_buf).map_err(|_| Utf8Error)?;
Ok((ch, content))
} else {
// a partial write has ended without fully completing a char (it's possible to write 1 byte at a time)
debug_assert!(content.is_empty());
Ok(("", b""))
}
Expand All @@ -264,11 +300,12 @@ impl IncompleteUtf8Resync {
let (valid, invalid) = content
.split_at_checked(err.valid_up_to())
.ok_or(Utf8Error)?;
self.bytes
// save the incomplete bytes from the end for the next write
self.char_bytes
.get_mut(..invalid.len())
.ok_or(Utf8Error)?
.copy_from_slice(invalid);
self.len = invalid.len() as _;
self.char_len = invalid.len() as _;
// valid_up_to promises it is valid
debug_assert!(std::str::from_utf8(valid).is_ok());
let valid = unsafe { std::str::from_utf8_unchecked(valid) };
Expand All @@ -280,8 +317,8 @@ impl IncompleteUtf8Resync {

/// True if there were incomplete invalid bytes in the buffer
pub fn discard_incomplete(&mut self) -> bool {
if self.len > 0 {
self.len = 0;
if self.char_len > 0 {
self.char_len = 0;
true
} else {
false
Expand Down

0 comments on commit 7fd8c21

Please sign in to comment.