Skip to content

RUST-1998 Remove lossy utf8 as a decoder option #550

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 8 commits into from
Jun 9, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 0 additions & 7 deletions src/datetime.rs
Original file line number Diff line number Diff line change
Expand Up @@ -386,13 +386,6 @@ impl crate::DateTime {
}
}

#[deprecated(since = "2.3.0", note = "Use try_to_rfc3339_string instead.")]
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is completely unrelated but I happened to notice it and figured we should remove deprecated methods while we're doing a major version release. I did a grep and didn't find any others.

/// Convert this [`DateTime`] to an RFC 3339 formatted string. Panics if it could not be
/// represented in that format.
pub fn to_rfc3339_string(self) -> String {
self.try_to_rfc3339_string().unwrap()
}

/// Convert this [`DateTime`] to an RFC 3339 formatted string.
pub fn try_to_rfc3339_string(self) -> Result<String> {
self.to_time_0_3()
Expand Down
30 changes: 1 addition & 29 deletions src/de.rs
Original file line number Diff line number Diff line change
Expand Up @@ -137,40 +137,12 @@ where
from_slice(bytes.as_slice())
}

/// Deserialize an instance of type `T` from an I/O stream of BSON, replacing any invalid UTF-8
/// sequences with the Unicode replacement character.
///
/// This is mainly useful when reading raw BSON returned from a MongoDB server, which
/// in rare cases can contain invalidly truncated strings (<https://jira.mongodb.org/browse/SERVER-24007>).
/// For most use cases, [`crate::from_reader`] can be used instead.
pub fn from_reader_utf8_lossy<R, T>(reader: R) -> Result<T>
where
T: DeserializeOwned,
R: Read,
{
let bytes = reader_to_vec(reader)?;
from_slice_utf8_lossy(bytes.as_slice())
}

/// Deserialize an instance of type `T` from a slice of BSON bytes.
pub fn from_slice<'de, T>(bytes: &'de [u8]) -> Result<T>
where
T: Deserialize<'de>,
{
from_raw(raw::Deserializer::new(bytes, false)?)
}

/// Deserialize an instance of type `T` from a slice of BSON bytes, replacing any invalid UTF-8
/// sequences with the Unicode replacement character.
///
/// This is mainly useful when reading raw BSON returned from a MongoDB server, which
/// in rare cases can contain invalidly truncated strings (<https://jira.mongodb.org/browse/SERVER-24007>).
/// For most use cases, [`crate::from_slice`] can be used instead.
pub fn from_slice_utf8_lossy<'de, T>(bytes: &'de [u8]) -> Result<T>
where
T: Deserialize<'de>,
{
from_raw(raw::Deserializer::new(bytes, true)?)
from_raw(raw::Deserializer::new(bytes)?)
}

pub(crate) fn from_raw<'de, T: Deserialize<'de>>(
Expand Down
8 changes: 4 additions & 4 deletions src/de/raw.rs
Original file line number Diff line number Diff line change
Expand Up @@ -50,11 +50,11 @@ struct DeserializerOptions {
}

impl<'de> Deserializer<'de> {
pub(crate) fn new(buf: &'de [u8], utf8_lossy: bool) -> Result<Self> {
pub(crate) fn new(buf: &'de [u8]) -> Result<Self> {
Ok(Self {
element: RawElement::toplevel(buf)?,
options: DeserializerOptions {
utf8_lossy,
utf8_lossy: false,
human_readable: false,
},
})
Expand All @@ -71,7 +71,7 @@ impl<'de> Deserializer<'de> {
V: serde::de::Visitor<'de>,
{
if self.options.utf8_lossy {
if let Some(lossy) = self.element.value_utf8_lossy()? {
if let Some(lossy) = self.element.value_utf8_lossy_inner()? {
return match lossy {
Utf8LossyBson::String(s) => visitor.visit_string(s),
Utf8LossyBson::RegularExpression(re) => {
Expand Down Expand Up @@ -178,7 +178,7 @@ impl<'de> Deserializer<'de> {

fn get_string(&self) -> Result<Cow<'de, str>> {
if self.options.utf8_lossy {
let value = self.element.value_utf8_lossy()?;
let value = self.element.value_utf8_lossy_inner()?;
let s = match value {
Some(Utf8LossyBson::String(s)) => s,
_ => {
Expand Down
17 changes: 3 additions & 14 deletions src/document.rs
Original file line number Diff line number Diff line change
Expand Up @@ -695,9 +695,9 @@ impl Document {
Ok(())
}

fn decode<R: Read + ?Sized>(reader: &mut R, utf_lossy: bool) -> crate::de::Result<Document> {
fn decode<R: Read + ?Sized>(reader: &mut R) -> crate::de::Result<Document> {
let buf = crate::de::reader_to_vec(reader)?;
crate::de::from_raw(crate::de::RawDeserializer::new(&buf, utf_lossy)?)
crate::de::from_raw(crate::de::RawDeserializer::new(&buf)?)
}

/// Attempts to deserialize a [`Document`] from a byte stream.
Expand Down Expand Up @@ -729,18 +729,7 @@ impl Document {
/// # }
/// ```
pub fn from_reader<R: Read>(mut reader: R) -> crate::de::Result<Document> {
Self::decode(&mut reader, false)
}

/// Attempt to deserialize a [`Document`] that may contain invalid UTF-8 strings from a byte
/// stream.
///
/// This is mainly useful when reading raw BSON returned from a MongoDB server, which
/// in rare cases can contain invalidly truncated strings (<https://jira.mongodb.org/browse/SERVER-24007>).
/// For most use cases, `Document::from_reader` can be used instead.
#[deprecated = "use bson::serde_helpers::Utf8LossyDeserialization"]
pub fn from_reader_utf8_lossy<R: Read>(mut reader: R) -> crate::de::Result<Document> {
Self::decode(&mut reader, true)
Self::decode(&mut reader)
}
}

Expand Down
15 changes: 15 additions & 0 deletions src/error.rs
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,11 @@ pub enum ErrorKind {
/// The kind of error that occurred.
kind: ValueAccessErrorKind,
},

/// A wrapped deserialization error.
/// TODO RUST-1406: collapse this
#[error("Deserialization error")]
DeError(crate::de::Error),
}

impl From<ErrorKind> for Error {
Expand All @@ -62,6 +67,16 @@ impl From<ErrorKind> for Error {
}
}

impl From<crate::de::Error> for Error {
fn from(value: crate::de::Error) -> Self {
Self {
kind: ErrorKind::DeError(value),
key: None,
index: None,
}
}
}

/// The types of errors that can occur when attempting to access a value in a document.
#[derive(Debug, Error)]
#[non_exhaustive]
Expand Down
3 changes: 0 additions & 3 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -308,9 +308,6 @@ pub use self::{
uuid::{Uuid, UuidRepresentation},
};

#[allow(deprecated)]
pub use self::de::{from_reader_utf8_lossy, from_slice_utf8_lossy};

#[macro_use]
mod macros;
mod base64;
Expand Down
14 changes: 14 additions & 0 deletions src/raw/array.rs
Original file line number Diff line number Diff line change
Expand Up @@ -212,6 +212,20 @@ impl RawArray {
pub fn is_empty(&self) -> bool {
self.doc.is_empty()
}

/// Gets an iterator over the elements in the [`RawArray`],
/// which yields `Result<RawElement<'_>>` values. These hold a
/// reference to the underlying array but do not explicitly
/// resolve the values.
///
/// This iterator, which underpins the implementation of the
/// default iterator, produces `RawElement` objects that hold a
/// view onto the array but do not parse out or construct
/// values until the `.value()` or `.try_into()` methods are
/// called.
///
/// The returned iterator borrows from `self`; the `'_` in the return type
/// makes that borrow visible at the call site.
pub fn iter_elements(&self) -> RawIter<'_> {
    RawIter::new(&self.doc)
}
}

impl std::fmt::Debug for RawArray {
Expand Down
53 changes: 53 additions & 0 deletions src/raw/document.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,11 @@ use crate::{
de::MIN_BSON_DOCUMENT_SIZE,
error::{Error, Result},
raw::{serde::OwnedOrBorrowedRawDocument, RAW_DOCUMENT_NEWTYPE},
Bson,
DateTime,
JavaScriptCodeWithScope,
RawBson,
RawJavaScriptCodeWithScope,
Timestamp,
};

Expand Down Expand Up @@ -508,6 +512,55 @@ impl RawDocument {
let bytes = self.cstring_bytes_at(start_at)?;
try_to_str(bytes)
}

/// Copy this into a [`Document`], returning an error if invalid BSON is encountered.
///
/// This is a thin wrapper over the `TryInto<Document>` conversion for `&RawDocument`.
///
/// # Errors
///
/// Returns whatever error that conversion produces.
pub fn to_document(&self) -> RawResult<Document> {
    TryInto::try_into(self)
}

/// Copy this into a [`Document`], returning an error if invalid BSON is encountered. Any
/// invalid UTF-8 sequences will be replaced with the Unicode replacement character.
///
/// Each top-level element is read with `value_utf8_lossy` and then converted recursively by
/// `deep_utf8_lossy`, so nested arrays and documents get the same lossy treatment.
///
/// # Errors
///
/// Stops at, and returns, the first error raised while iterating or converting an element.
pub fn to_document_utf8_lossy(&self) -> RawResult<Document> {
    self.iter_elements().try_fold(
        Document::new(),
        |mut converted, entry| -> RawResult<Document> {
            let entry = entry?;
            let value = deep_utf8_lossy(entry.value_utf8_lossy()?)?;
            converted.insert(entry.key(), value);
            Ok(converted)
        },
    )
}
}

/// Recursively converts an owned [`RawBson`] value into a [`Bson`] value, reading every nested
/// element through `value_utf8_lossy`.
///
/// * Arrays are converted element by element.
/// * Embedded documents, and the scope of a JavaScript-code-with-scope value, are converted with
///   [`RawDocument::to_document_utf8_lossy`], which calls back into this function for each
///   element. This keeps the element walk in one place instead of repeating it per variant.
/// * Every other variant is converted with its `TryInto<Bson>` implementation.
///
/// # Errors
///
/// Returns the first error raised while iterating, reading, or converting an element.
fn deep_utf8_lossy(src: RawBson) -> RawResult<Bson> {
    match src {
        RawBson::Array(arr) => arr
            .iter_elements()
            .map(|elem| -> RawResult<Bson> { deep_utf8_lossy(elem?.value_utf8_lossy()?) })
            .collect::<RawResult<Vec<_>>>()
            .map(Bson::Array),
        RawBson::Document(doc) => doc.to_document_utf8_lossy().map(Bson::Document),
        RawBson::JavaScriptCodeWithScope(RawJavaScriptCodeWithScope { code, scope }) => {
            let scope = scope.to_document_utf8_lossy()?;
            Ok(Bson::JavaScriptCodeWithScope(JavaScriptCodeWithScope {
                code,
                scope,
            }))
        }
        v => v.try_into(),
    }
}

impl<'de: 'a, 'a> Deserialize<'de> for &'a RawDocument {
Expand Down
17 changes: 8 additions & 9 deletions src/raw/document_buf.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
use std::{
borrow::{Borrow, Cow},
convert::{TryFrom, TryInto},
convert::TryFrom,
iter::FromIterator,
ops::Deref,
};
Expand Down Expand Up @@ -65,7 +65,7 @@ pub struct RawDocumentBuf {

impl RawDocumentBuf {
/// Creates a new, empty [`RawDocumentBuf`].
pub fn new() -> RawDocumentBuf {
pub fn new() -> Self {
let mut data = Vec::new();
data.extend(MIN_BSON_DOCUMENT_SIZE.to_le_bytes());
data.push(0);
Expand All @@ -89,11 +89,16 @@ impl RawDocumentBuf {
/// let doc = RawDocumentBuf::from_bytes(b"\x05\0\0\0\0".to_vec())?;
/// # Ok::<(), bson::error::Error>(())
/// ```
pub fn from_bytes(data: Vec<u8>) -> Result<RawDocumentBuf> {
pub fn from_bytes(data: Vec<u8>) -> Result<Self> {
let _ = RawDocument::from_bytes(data.as_slice())?;
Ok(Self { data })
}

/// Creates a [`RawDocumentBuf`] from the bytes supplied by `reader`.
///
/// The bytes are collected with `crate::de::reader_to_vec` and then handed to
/// [`RawDocumentBuf::from_bytes`], so the same validation applies as when constructing from a
/// `Vec<u8>` directly.
///
/// # Errors
///
/// Returns an error if collecting the bytes fails or if `from_bytes` rejects them.
pub fn from_reader<R: std::io::Read>(reader: R) -> Result<Self> {
    Self::from_bytes(crate::de::reader_to_vec(reader)?)
}

/// Create a [`RawDocumentBuf`] from a [`Document`].
///
/// ```
Expand Down Expand Up @@ -213,12 +218,6 @@ impl RawDocumentBuf {
.expect("key should not contain interior null byte")
})
}

/// Convert this [`RawDocumentBuf`] to a [`Document`], returning an error
/// if invalid BSON is encountered.
pub fn to_document(&self) -> Result<Document> {
self.as_ref().try_into()
}
}

impl Default for RawDocumentBuf {
Expand Down
28 changes: 27 additions & 1 deletion src/raw/iter.rs
Original file line number Diff line number Diff line change
Expand Up @@ -265,7 +265,14 @@ impl<'a> RawElement<'a> {
})
}

pub(crate) fn value_utf8_lossy(&self) -> Result<Option<Utf8LossyBson<'a>>> {
pub fn value_utf8_lossy(&self) -> Result<RawBson> {
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I opted to expose this method rather than the lower-level value_utf8_lossy_inner because in my estimation the confluence between needing to deal with invalid utf8 and needing to save the memory cost of copying a single element is very small, and outside of that this is a strictly more useful interface.

match self.value_utf8_lossy_inner()? {
Some(v) => Ok(v.into()),
None => Ok(self.value()?.to_raw_bson()),
}
}

pub(crate) fn value_utf8_lossy_inner(&self) -> Result<Option<Utf8LossyBson<'a>>> {
Ok(Some(match self.kind {
ElementType::String => Utf8LossyBson::String(self.read_utf8_lossy()),
ElementType::JavaScriptCode => Utf8LossyBson::JavaScriptCode(self.read_utf8_lossy()),
Expand Down Expand Up @@ -452,3 +459,22 @@ pub(crate) struct Utf8LossyJavaScriptCodeWithScope<'a> {
pub(crate) code: String,
pub(crate) scope: &'a RawDocument,
}

/// Converts a lossily-decoded element into the matching [`RawBson`] variant.
///
/// Every payload is moved across as-is, except the scope of a JavaScript-code-with-scope value,
/// which is converted with `to_raw_document_buf`.
impl<'a> From<Utf8LossyBson<'a>> for RawBson {
    fn from(lossy: Utf8LossyBson<'a>) -> Self {
        match lossy {
            Utf8LossyBson::String(text) => Self::String(text),
            Utf8LossyBson::JavaScriptCode(code) => Self::JavaScriptCode(code),
            Utf8LossyBson::JavaScriptCodeWithScope(with_scope) => {
                let Utf8LossyJavaScriptCodeWithScope { code, scope } = with_scope;
                Self::JavaScriptCodeWithScope(super::RawJavaScriptCodeWithScope {
                    code,
                    scope: scope.to_raw_document_buf(),
                })
            }
            Utf8LossyBson::Symbol(symbol) => Self::Symbol(symbol),
            Utf8LossyBson::DbPointer(pointer) => Self::DbPointer(pointer),
            Utf8LossyBson::RegularExpression(regex) => Self::RegularExpression(regex),
        }
    }
}
48 changes: 47 additions & 1 deletion src/serde_helpers.rs
Original file line number Diff line number Diff line change
Expand Up @@ -817,7 +817,8 @@ where
///
/// This wrapper type has no impact on serialization. Serializing a `Utf8LossyDeserialization<T>`
/// will call the `serialize` method for the wrapped `T`.
#[derive(PartialEq, Eq, PartialOrd, Ord, Debug)]
#[derive(Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Debug, Hash, Default)]
#[repr(transparent)]
pub struct Utf8LossyDeserialization<T>(pub T);

pub(crate) const UTF8_LOSSY_NEWTYPE: &str = "$__bson_private_utf8_lossy";
Expand Down Expand Up @@ -852,3 +853,48 @@ impl<'de, T: Deserialize<'de>> Deserialize<'de> for Utf8LossyDeserialization<T>
deserializer.deserialize_newtype_struct(UTF8_LOSSY_NEWTYPE, V(PhantomData))
}
}

impl<T: std::fmt::Display> std::fmt::Display for Utf8LossyDeserialization<T> {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
self.0.fmt(f)
}
}

impl<T> From<T> for Utf8LossyDeserialization<T> {
fn from(value: T) -> Self {
Self(value)
}
}

impl<T> Deref for Utf8LossyDeserialization<T> {
type Target = T;

fn deref(&self) -> &Self::Target {
&self.0
}
}

impl<T> DerefMut for Utf8LossyDeserialization<T> {
fn deref_mut(&mut self) -> &mut Self::Target {
&mut self.0
}
}

/// Forwards `AsRef<R>` to the wrapped value for every `R` the wrapped type can be viewed as.
impl<T, R> AsRef<R> for Utf8LossyDeserialization<T>
where
    R: ?Sized,
    <Utf8LossyDeserialization<T> as Deref>::Target: AsRef<R>,
{
    fn as_ref(&self) -> &R {
        // The `Deref` target is `T`, so the bound above amounts to `T: AsRef<R>`.
        <T as AsRef<R>>::as_ref(&self.0)
    }
}

impl<T, R: ?Sized> AsMut<R> for Utf8LossyDeserialization<T>
where
<Utf8LossyDeserialization<T> as Deref>::Target: AsMut<R>,
{
fn as_mut(&mut self) -> &mut R {
self.deref_mut().as_mut()
}
}
Loading