Skip to content

Commit

Permalink
Fix incorrect reading of correct utf8
Browse files Browse the repository at this point in the history
  • Loading branch information
algesten committed Feb 8, 2025
1 parent 8b46d37 commit 0efd27e
Show file tree
Hide file tree
Showing 2 changed files with 68 additions and 30 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
# Unreleased

* Fix incorrect reading of valid utf8 (#992)
* ureq::Error wrapped as io::Error should pass through body chain (#984)
* send_json should set content-length header (#983)

Expand Down
97 changes: 67 additions & 30 deletions src/body/lossy.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,43 +11,20 @@ pub struct LossyUtf8Reader<R> {
reader: R,
ended: bool,
input: ConsumeBuf,
valid_len: usize,
}
impl<R> LossyUtf8Reader<R> {
    /// Wraps `reader` in a fresh `LossyUtf8Reader`.
    ///
    /// The new reader has not yet observed end-of-input (`ended` is `false`)
    /// and has no validated bytes pending (`valid_len` is `0`).
    pub(crate) fn new(reader: R) -> Self {
        // Staging buffer for bytes pulled from the inner reader.
        // Presumably the argument is the initial capacity in bytes; verify
        // against `ConsumeBuf::new`.
        let input = ConsumeBuf::new(8);

        Self {
            reader,
            ended: false,
            input,
            valid_len: 0,
        }
    }
}

impl<R: io::Read> io::Read for LossyUtf8Reader<R> {
fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
// Match the input buffer size
if !self.ended {
let total_len = self.input.unconsumed().len() + self.input.free_mut().len();
let wanted_len = buf.len().max(MIN_BUF);
if wanted_len < total_len {
self.input.add_space(total_len - wanted_len);
}
}

// Fill up to a point where we definitely will make progress.
while !self.ended && self.input.unconsumed().len() < MIN_BUF {
let amount = self.reader.read(self.input.free_mut())?;
self.input.add_filled(amount);

if amount == 0 {
self.ended = true;
}
}

if self.ended && self.input.unconsumed().is_empty() {
return Ok(0);
}

let valid_len = match utf8::decode(self.input.unconsumed()) {
fn process_input(&mut self) -> usize {
match utf8::decode(self.input.unconsumed()) {
Ok(_) => {
// Entire input is valid
self.input.unconsumed().len()
Expand Down Expand Up @@ -85,14 +62,47 @@ impl<R: io::Read> io::Read for LossyUtf8Reader<R> {
}
}
},
};
assert!(valid_len > 0);
}
}
}

// `io::Read` implementation that hands out bytes from the internal `input`
// buffer, limited to the prefix that `process_input` has reported as ready.
impl<R: io::Read> io::Read for LossyUtf8Reader<R> {
/// Copies up to `buf.len()` bytes of already-processed input into `buf`.
///
/// Returns `Ok(0)` once the inner reader has reported end-of-input and the
/// internal buffer holds no unconsumed bytes. An error from the inner
/// reader is propagated unchanged.
///
/// # Panics
///
/// Panics if `process_input` reports a length of zero while there is still
/// unconsumed input (see the `assert!` below).
fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
// Match the input buffer size
// NOTE(review): space is added only when the current buffer is already
// *larger* than `wanted_len`; this condition looks like it may be
// inverted. Confirm against the semantics of `ConsumeBuf::add_space`.
if !self.ended {
let total_len = self.input.unconsumed().len() + self.input.free_mut().len();
let wanted_len = buf.len().max(MIN_BUF);
if wanted_len < total_len {
self.input.add_space(total_len - wanted_len);
}
}

// Fill up to a point where we definitely will make progress.
// A zero-length read from the inner reader is treated as end-of-input.
while !self.ended && self.input.unconsumed().len() < MIN_BUF {
let amount = self.reader.read(self.input.free_mut())?;
self.input.add_filled(amount);

// NOTE(review): no local named `valid_len` is declared in this function and
// `src` is bound again further down; this line looks like a leftover from
// the pre-refactor version of `read` — confirm and remove.
let src = &self.input.unconsumed()[..valid_len];
if amount == 0 {
self.ended = true;
}
}

// Nothing buffered and nothing more to read: signal EOF to the caller.
if self.ended && self.input.unconsumed().is_empty() {
return Ok(0);
}

// Only re-run processing once the previously reported prefix has been
// fully handed out; a non-zero `valid_len` is carried over from an earlier
// call whose `buf` was too small to take everything.
if self.valid_len == 0 {
self.valid_len = self.process_input();
assert!(self.valid_len > 0);
}

// Hand out as much of the ready prefix as fits in the caller's buffer.
let src = &self.input.unconsumed()[..self.valid_len];
let max = src.len().min(buf.len());
buf[..max].copy_from_slice(&src[..max]);
self.input.consume(max);

// Remember how many ready bytes are still owed to the caller.
self.valid_len -= max;

Ok(max)
}
}
Expand Down Expand Up @@ -150,6 +160,33 @@ mod test {
assert_eq!(do_reader(&mut [&[97, 97, 97, 195]]), "aaa?");
}

#[test]
fn hiragana() {
    // Valid multi-byte (3 bytes per char) UTF-8 must pass through unchanged.
    let text = "あいうえお";
    let decoded = do_reader(&mut [text.as_bytes()]);
    assert_eq!(decoded, "あいうえお");
}

#[test]
fn emoji() {
    // Valid UTF-8 symbols must come back exactly as they went in.
    let text = "✅✅✅";
    let decoded = do_reader(&mut [text.as_bytes()]);
    assert_eq!(decoded, "✅✅✅");
}

#[test]
fn leftover() {
    // A single three-byte character read through a two-byte buffer: the
    // bytes must arrive split across two reads, followed by EOF.
    let text = "あ";
    assert_eq!(text.as_bytes(), &[227, 129, 130]);

    let mut out = [0; 2];
    let mut reader = LossyUtf8Reader::new(text.as_bytes());

    // First read fills the whole output buffer.
    let first = reader.read(&mut out).unwrap();
    assert_eq!(first, 2);
    assert_eq!(&out[..], &[227, 129]);

    // Second read delivers the one remaining byte.
    let second = reader.read(&mut out).unwrap();
    assert_eq!(second, 1);
    assert_eq!(&out[..1], &[130]);

    // Third read reports end-of-input.
    let third = reader.read(&mut out).unwrap();
    assert_eq!(third, 0);
}

struct TestReader<'a>(&'a mut [&'a [u8]]);

impl<'a> io::Read for TestReader<'a> {
Expand Down

0 comments on commit 0efd27e

Please sign in to comment.