Skip to content

Commit

Permalink
Avoid re-validating UTF-8 in FromUtf8Error::into_utf8_lossy
Browse files Browse the repository at this point in the history
Refactor `into_utf8_lossy` to copy valid UTF-8 bytes into the buffer,
avoiding double validation of bytes.
Add tests that mirror the `String::from_utf8_lossy` tests.
  • Loading branch information
okaneco committed Sep 21, 2024
1 parent 8c2c9a9 commit b94c5a1
Show file tree
Hide file tree
Showing 3 changed files with 63 additions and 1 deletion.
26 changes: 25 additions & 1 deletion library/alloc/src/string.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2081,7 +2081,31 @@ impl FromUtf8Error {
#[cfg(not(no_global_oom_handling))]
#[unstable(feature = "string_from_utf8_lossy_owned", issue = "129436")]
pub fn into_utf8_lossy(self) -> String {
// Consumes the error and builds a `String` from its bytes, substituting
// U+FFFD for each invalid sequence. The prefix already covered by
// `valid_up_to` is copied as-is instead of being validated a second time.
//
// NOTE(review): the next line looks like the pre-change body (the single
// deletion this diff reports for the file). The +/- markers appear to have
// been lost in extraction, so it is presumably not part of the new body.
String::from_utf8_lossy_owned(self.bytes)
// U+FFFD REPLACEMENT CHARACTER; pushed once per invalid chunk in the loop below.
const REPLACEMENT: &str = "\u{FFFD}";

let mut res = {
// Starting size only, not an upper bound: the result can outgrow it,
// because a single invalid byte is replaced by the three-byte U+FFFD.
let mut v = Vec::with_capacity(self.bytes.len());

// `Utf8Error::valid_up_to` returns the length of the prefix that was
// validated as UTF-8, so `bytes[..valid_up_to]` is valid.
// Copy the valid bytes into the output buffer.
v.extend_from_slice(&self.bytes[..self.error.valid_up_to()]);

// SAFETY: This is safe because the only bytes present in the buffer
// were validated as UTF-8 by the call to `String::from_utf8` which
// produced this `FromUtf8Error`.
unsafe { String::from_utf8_unchecked(v) }
};

// Only the tail after the already-validated prefix still needs checking.
// Each chunk exposes a valid run and the (possibly empty) invalid bytes after it.
let iter = self.bytes[self.error.valid_up_to()..].utf8_chunks();

for chunk in iter {
res.push_str(chunk.valid());
// An empty `invalid()` means the chunk had no malformed bytes, so nothing
// is substituted for it.
if !chunk.invalid().is_empty() {
res.push_str(REPLACEMENT);
}
}

res
}

/// Returns the bytes that were attempted to convert to a `String`.
Expand Down
1 change: 1 addition & 0 deletions library/alloc/tests/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
#![feature(iter_next_chunk)]
#![feature(round_char_boundary)]
#![feature(slice_partition_dedup)]
#![feature(string_from_utf8_lossy_owned)]
#![feature(string_remove_matches)]
#![feature(const_btree_len)]
#![feature(const_trait_impl)]
Expand Down
37 changes: 37 additions & 0 deletions library/alloc/tests/string.rs
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,43 @@ fn test_from_utf8_lossy() {
);
}

#[test]
fn test_fromutf8error_into_lossy() {
    // Strict conversion first; on failure, fall back to the lossy conversion
    // offered by the returned `FromUtf8Error`.
    fn lossy(bytes: &[u8]) -> String {
        match String::from_utf8(bytes.to_vec()) {
            Ok(text) => text,
            Err(err) => err.into_utf8_lossy(),
        }
    }

    // (input bytes, expected string) pairs, mirroring the
    // `String::from_utf8_lossy` test cases.
    let cases: &[(&[u8], &str)] = &[
        // Already-valid input is returned unchanged.
        (b"hello", "hello"),
        ("ศไทย中华Việt Nam".as_bytes(), "ศไทย中华Việt Nam"),
        // Truncated or invalid sequences become U+FFFD.
        (b"Hello\xC2 There\xFF Goodbye", "Hello\u{FFFD} There\u{FFFD} Goodbye"),
        (
            b"Hello\xC0\x80 There\xE6\x83 Goodbye",
            "Hello\u{FFFD}\u{FFFD} There\u{FFFD} Goodbye",
        ),
        (b"\xF5foo\xF5\x80bar", "\u{FFFD}foo\u{FFFD}\u{FFFD}bar"),
        (b"\xF1foo\xF1\x80bar\xF1\x80\x80baz", "\u{FFFD}foo\u{FFFD}bar\u{FFFD}baz"),
        (b"\xF4foo\xF4\x80bar\xF4\xBFbaz", "\u{FFFD}foo\u{FFFD}bar\u{FFFD}\u{FFFD}baz"),
        (
            b"\xF0\x80\x80\x80foo\xF0\x90\x80\x80bar",
            "\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}foo\u{10000}bar",
        ),
        // surrogates
        (
            b"\xED\xA0\x80foo\xED\xBF\xBFbar",
            "\u{FFFD}\u{FFFD}\u{FFFD}foo\u{FFFD}\u{FFFD}\u{FFFD}bar",
        ),
    ];

    for &(input, expected) in cases {
        assert_eq!(lossy(input), expected);
    }
}

#[test]
fn test_from_utf16() {
let pairs = [
Expand Down

0 comments on commit b94c5a1

Please sign in to comment.