diff --git a/src/datetime.rs b/src/datetime.rs index eca1522b..b1394c32 100644 --- a/src/datetime.rs +++ b/src/datetime.rs @@ -386,13 +386,6 @@ impl crate::DateTime { } } - #[deprecated(since = "2.3.0", note = "Use try_to_rfc3339_string instead.")] - /// Convert this [`DateTime`] to an RFC 3339 formatted string. Panics if it could not be - /// represented in that format. - pub fn to_rfc3339_string(self) -> String { - self.try_to_rfc3339_string().unwrap() - } - /// Convert this [`DateTime`] to an RFC 3339 formatted string. pub fn try_to_rfc3339_string(self) -> Result { self.to_time_0_3() diff --git a/src/de.rs b/src/de.rs index 09ea1f0d..c1e1312a 100644 --- a/src/de.rs +++ b/src/de.rs @@ -137,40 +137,12 @@ where from_slice(bytes.as_slice()) } -/// Deserialize an instance of type `T` from an I/O stream of BSON, replacing any invalid UTF-8 -/// sequences with the Unicode replacement character. -/// -/// This is mainly useful when reading raw BSON returned from a MongoDB server, which -/// in rare cases can contain invalidly truncated strings (). -/// For most use cases, [`crate::from_reader`] can be used instead. -pub fn from_reader_utf8_lossy(reader: R) -> Result -where - T: DeserializeOwned, - R: Read, -{ - let bytes = reader_to_vec(reader)?; - from_slice_utf8_lossy(bytes.as_slice()) -} - /// Deserialize an instance of type `T` from a slice of BSON bytes. pub fn from_slice<'de, T>(bytes: &'de [u8]) -> Result where T: Deserialize<'de>, { - from_raw(raw::Deserializer::new(bytes, false)?) -} - -/// Deserialize an instance of type `T` from a slice of BSON bytes, replacing any invalid UTF-8 -/// sequences with the Unicode replacement character. -/// -/// This is mainly useful when reading raw BSON returned from a MongoDB server, which -/// in rare cases can contain invalidly truncated strings (). -/// For most use cases, [`crate::from_slice`] can be used instead. -pub fn from_slice_utf8_lossy<'de, T>(bytes: &'de [u8]) -> Result -where - T: Deserialize<'de>, -{ - from_raw(raw::Deserializer::new(bytes, true)?) + from_raw(raw::Deserializer::new(bytes)?) } pub(crate) fn from_raw<'de, T: Deserialize<'de>>( diff --git a/src/de/raw.rs b/src/de/raw.rs index 6ec3e437..4f809df1 100644 --- a/src/de/raw.rs +++ b/src/de/raw.rs @@ -50,11 +50,11 @@ struct DeserializerOptions { } impl<'de> Deserializer<'de> { - pub(crate) fn new(buf: &'de [u8], utf8_lossy: bool) -> Result { + pub(crate) fn new(buf: &'de [u8]) -> Result { Ok(Self { element: RawElement::toplevel(buf)?, options: DeserializerOptions { - utf8_lossy, + utf8_lossy: false, human_readable: false, }, }) @@ -71,7 +71,7 @@ impl<'de> Deserializer<'de> { V: serde::de::Visitor<'de>, { if self.options.utf8_lossy { - if let Some(lossy) = self.element.value_utf8_lossy()? { + if let Some(lossy) = self.element.value_utf8_lossy_inner()? { return match lossy { Utf8LossyBson::String(s) => visitor.visit_string(s), Utf8LossyBson::RegularExpression(re) => { @@ -178,7 +178,7 @@ impl<'de> Deserializer<'de> { fn get_string(&self) -> Result> { if self.options.utf8_lossy { - let value = self.element.value_utf8_lossy()?; + let value = self.element.value_utf8_lossy_inner()?; let s = match value { Some(Utf8LossyBson::String(s)) => s, _ => { diff --git a/src/document.rs b/src/document.rs index 30e49283..eed8213c 100644 --- a/src/document.rs +++ b/src/document.rs @@ -695,9 +695,9 @@ impl Document { Ok(()) } - fn decode(reader: &mut R, utf_lossy: bool) -> crate::de::Result { + fn decode(reader: &mut R) -> crate::de::Result { let buf = crate::de::reader_to_vec(reader)?; - crate::de::from_raw(crate::de::RawDeserializer::new(&buf, utf_lossy)?) + crate::de::from_raw(crate::de::RawDeserializer::new(&buf)?) } /// Attempts to deserialize a [`Document`] from a byte stream. @@ -729,18 +729,7 @@ impl Document { /// # } /// ``` pub fn from_reader(mut reader: R) -> crate::de::Result { - Self::decode(&mut reader, false) - } - - /// Attempt to deserialize a [`Document`] that may contain invalid UTF-8 strings from a byte - /// stream. - /// - /// This is mainly useful when reading raw BSON returned from a MongoDB server, which - /// in rare cases can contain invalidly truncated strings (). - /// For most use cases, `Document::from_reader` can be used instead. - #[deprecated = "use bson::serde_helpers::Utf8LossyDeserialization"] - pub fn from_reader_utf8_lossy(mut reader: R) -> crate::de::Result { - Self::decode(&mut reader, true) + Self::decode(&mut reader) } } diff --git a/src/error.rs b/src/error.rs index 58f3e000..d1497760 100644 --- a/src/error.rs +++ b/src/error.rs @@ -50,6 +50,11 @@ pub enum ErrorKind { /// The kind of error that occurred. kind: ValueAccessErrorKind, }, + + /// A wrapped deserialization error. + /// TODO RUST-1406: collapse this + #[error("Deserialization error")] + DeError(crate::de::Error), } impl From for Error { @@ -62,6 +67,16 @@ impl From for Error { } } +impl From for Error { + fn from(value: crate::de::Error) -> Self { + Self { + kind: ErrorKind::DeError(value), + key: None, + index: None, + } + } +} + /// The types of errors that can occur when attempting to access a value in a document. #[derive(Debug, Error)] #[non_exhaustive] diff --git a/src/lib.rs b/src/lib.rs index d06f4601..3780be14 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -308,9 +308,6 @@ pub use self::{ uuid::{Uuid, UuidRepresentation}, }; -#[allow(deprecated)] -pub use self::de::{from_reader_utf8_lossy, from_slice_utf8_lossy}; - #[macro_use] mod macros; mod base64; diff --git a/src/raw/array.rs b/src/raw/array.rs index 4f2f47ba..a55daa03 100644 --- a/src/raw/array.rs +++ b/src/raw/array.rs @@ -212,6 +212,20 @@ impl RawArray { pub fn is_empty(&self) -> bool { self.doc.is_empty() } + + /// Gets an iterator over the elements in the [`RawArray`], + /// which yields `Result>` values. These hold a + /// reference to the underlying array but do not explicitly + /// resolve the values. + /// + /// This iterator, which underpins the implementation of the + /// default iterator, produces `RawElement` objects that hold a + /// view onto the array but do not parse out or construct + /// values until the `.value()` or `.try_into()` methods are + /// called. + pub fn iter_elements(&self) -> RawIter { + RawIter::new(&self.doc) + } } impl std::fmt::Debug for RawArray { diff --git a/src/raw/document.rs b/src/raw/document.rs index 468a3da0..e62eb62e 100644 --- a/src/raw/document.rs +++ b/src/raw/document.rs @@ -9,7 +9,11 @@ use crate::{ de::MIN_BSON_DOCUMENT_SIZE, error::{Error, Result}, raw::{serde::OwnedOrBorrowedRawDocument, RAW_DOCUMENT_NEWTYPE}, + Bson, DateTime, + JavaScriptCodeWithScope, + RawBson, + RawJavaScriptCodeWithScope, Timestamp, }; @@ -508,6 +512,55 @@ impl RawDocument { let bytes = self.cstring_bytes_at(start_at)?; try_to_str(bytes) } + + /// Copy this into a [`Document`], returning an error if invalid BSON is encountered. + pub fn to_document(&self) -> RawResult { + self.try_into() + } + + /// Copy this into a [`Document`], returning an error if invalid BSON is encountered. Any + /// invalid UTF-8 sequences will be replaced with the Unicode replacement character. + pub fn to_document_utf8_lossy(&self) -> RawResult { + let mut out = Document::new(); + for elem in self.iter_elements() { + let elem = elem?; + let value = deep_utf8_lossy(elem.value_utf8_lossy()?)?; + out.insert(elem.key(), value); + } + Ok(out) + } +} + +fn deep_utf8_lossy(src: RawBson) -> RawResult { + match src { + RawBson::Array(arr) => { + let mut tmp = vec![]; + for elem in arr.iter_elements() { + tmp.push(deep_utf8_lossy(elem?.value_utf8_lossy()?)?); + } + Ok(Bson::Array(tmp)) + } + RawBson::Document(doc) => { + let mut tmp = doc! {}; + for elem in doc.iter_elements() { + let elem = elem?; + tmp.insert(elem.key(), deep_utf8_lossy(elem.value_utf8_lossy()?)?); + } + Ok(Bson::Document(tmp)) + } + RawBson::JavaScriptCodeWithScope(RawJavaScriptCodeWithScope { code, scope }) => { + let mut tmp = doc! {}; + for elem in scope.iter_elements() { + let elem = elem?; + tmp.insert(elem.key(), deep_utf8_lossy(elem.value_utf8_lossy()?)?); + } + Ok(Bson::JavaScriptCodeWithScope(JavaScriptCodeWithScope { + code, + scope: tmp, + })) + } + v => v.try_into(), + } } impl<'de: 'a, 'a> Deserialize<'de> for &'a RawDocument { diff --git a/src/raw/document_buf.rs b/src/raw/document_buf.rs index 07b0fdae..e89ef704 100644 --- a/src/raw/document_buf.rs +++ b/src/raw/document_buf.rs @@ -1,6 +1,6 @@ use std::{ borrow::{Borrow, Cow}, - convert::{TryFrom, TryInto}, + convert::TryFrom, iter::FromIterator, ops::Deref, }; @@ -65,7 +65,7 @@ pub struct RawDocumentBuf { impl RawDocumentBuf { /// Creates a new, empty [`RawDocumentBuf`]. - pub fn new() -> RawDocumentBuf { + pub fn new() -> Self { let mut data = Vec::new(); data.extend(MIN_BSON_DOCUMENT_SIZE.to_le_bytes()); data.push(0); @@ -89,11 +89,16 @@ impl RawDocumentBuf { /// let doc = RawDocumentBuf::from_bytes(b"\x05\0\0\0\0".to_vec())?; /// # Ok::<(), bson::error::Error>(()) /// ``` - pub fn from_bytes(data: Vec) -> Result { + pub fn from_bytes(data: Vec) -> Result { let _ = RawDocument::from_bytes(data.as_slice())?; Ok(Self { data }) } + pub fn from_reader(reader: R) -> Result { + let buf = crate::de::reader_to_vec(reader)?; + Self::from_bytes(buf) + } + /// Create a [`RawDocumentBuf`] from a [`Document`]. /// /// ``` @@ -213,12 +218,6 @@ impl RawDocumentBuf { .expect("key should not contain interior null byte") }) } - - /// Convert this [`RawDocumentBuf`] to a [`Document`], returning an error - /// if invalid BSON is encountered. - pub fn to_document(&self) -> Result { - self.as_ref().try_into() - } } impl Default for RawDocumentBuf { diff --git a/src/raw/iter.rs b/src/raw/iter.rs index 9a4150dc..62fdc8e6 100644 --- a/src/raw/iter.rs +++ b/src/raw/iter.rs @@ -265,7 +265,14 @@ impl<'a> RawElement<'a> { }) } - pub(crate) fn value_utf8_lossy(&self) -> Result>> { + pub fn value_utf8_lossy(&self) -> Result { + match self.value_utf8_lossy_inner()? { + Some(v) => Ok(v.into()), + None => Ok(self.value()?.to_raw_bson()), + } + } + + pub(crate) fn value_utf8_lossy_inner(&self) -> Result>> { Ok(Some(match self.kind { ElementType::String => Utf8LossyBson::String(self.read_utf8_lossy()), ElementType::JavaScriptCode => Utf8LossyBson::JavaScriptCode(self.read_utf8_lossy()), @@ -452,3 +459,22 @@ pub(crate) struct Utf8LossyJavaScriptCodeWithScope<'a> { pub(crate) code: String, pub(crate) scope: &'a RawDocument, } + +impl<'a> From> for RawBson { + fn from(value: Utf8LossyBson<'a>) -> Self { + match value { + Utf8LossyBson::String(s) => RawBson::String(s), + Utf8LossyBson::JavaScriptCode(s) => RawBson::JavaScriptCode(s), + Utf8LossyBson::JavaScriptCodeWithScope(Utf8LossyJavaScriptCodeWithScope { + code, + scope, + }) => RawBson::JavaScriptCodeWithScope(super::RawJavaScriptCodeWithScope { + code, + scope: scope.to_raw_document_buf(), + }), + Utf8LossyBson::Symbol(s) => RawBson::Symbol(s), + Utf8LossyBson::DbPointer(p) => RawBson::DbPointer(p), + Utf8LossyBson::RegularExpression(r) => RawBson::RegularExpression(r), + } + } +} diff --git a/src/serde_helpers.rs b/src/serde_helpers.rs index a8877bdf..6cd1bb25 100644 --- a/src/serde_helpers.rs +++ b/src/serde_helpers.rs @@ -817,7 +817,8 @@ where /// /// This wrapper type has no impact on serialization. Serializing a `Utf8LossyDeserialization` /// will call the `serialize` method for the wrapped `T`. -#[derive(PartialEq, Eq, PartialOrd, Ord, Debug)] +#[derive(Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Debug, Hash, Default)] +#[repr(transparent)] pub struct Utf8LossyDeserialization(pub T); pub(crate) const UTF8_LOSSY_NEWTYPE: &str = "$__bson_private_utf8_lossy"; @@ -852,3 +853,48 @@ impl<'de, T: Deserialize<'de>> Deserialize<'de> for Utf8LossyDeserialization deserializer.deserialize_newtype_struct(UTF8_LOSSY_NEWTYPE, V(PhantomData)) } } + +impl std::fmt::Display for Utf8LossyDeserialization { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + self.0.fmt(f) + } +} + +impl From for Utf8LossyDeserialization { + fn from(value: T) -> Self { + Self(value) + } +} + +impl Deref for Utf8LossyDeserialization { + type Target = T; + + fn deref(&self) -> &Self::Target { + &self.0 + } +} + +impl DerefMut for Utf8LossyDeserialization { + fn deref_mut(&mut self) -> &mut Self::Target { + &mut self.0 + } +} + +impl AsRef for Utf8LossyDeserialization +where + R: ?Sized, + as Deref>::Target: AsRef, +{ + fn as_ref(&self) -> &R { + self.deref().as_ref() + } +} + +impl AsMut for Utf8LossyDeserialization +where + as Deref>::Target: AsMut, +{ + fn as_mut(&mut self) -> &mut R { + self.deref_mut().as_mut() + } +} diff --git a/src/tests/modules/serializer_deserializer.rs b/src/tests/modules/serializer_deserializer.rs index 1083e985..63182cde 100644 --- a/src/tests/modules/serializer_deserializer.rs +++ b/src/tests/modules/serializer_deserializer.rs @@ -18,6 +18,7 @@ use crate::{ Decimal128, Document, JavaScriptCodeWithScope, + RawDocumentBuf, Regex, Timestamp, }; @@ -67,14 +68,16 @@ fn test_encode_decode_utf8_string_invalid() { let bytes = b"\x80\xae".to_vec(); let src = unsafe { String::from_utf8_unchecked(bytes) }; - let doc = doc! { "key": src }; + let doc = doc! { "key": &src, "subdoc": { "subkey": &src } }; let mut buf = Vec::new(); doc.to_writer(&mut buf).unwrap(); - let expected = doc! { "key": "��" }; - #[allow(deprecated)] - let decoded = Document::from_reader_utf8_lossy(&mut Cursor::new(buf)).unwrap(); + let expected = doc! { "key": "��", "subdoc": { "subkey": "��" } }; + let decoded = RawDocumentBuf::from_reader(&mut Cursor::new(buf)) + .unwrap() + .to_document_utf8_lossy() + .unwrap(); assert_eq!(decoded, expected); } diff --git a/src/tests/spec/corpus.rs b/src/tests/spec/corpus.rs index 93bc3220..1ef1893e 100644 --- a/src/tests/spec/corpus.rs +++ b/src/tests/spec/corpus.rs @@ -182,7 +182,7 @@ fn run_test(test: TestFile) { if !description.contains("$regex query operator") { // deserialize the field from raw Bytes into a RawBson let deserializer_raw = - crate::de::RawDeserializer::new(canonical_bson.as_slice(), false).unwrap(); + crate::de::RawDeserializer::new(canonical_bson.as_slice()).unwrap(); let raw_bson_field = deserializer_raw .deserialize_any(FieldVisitor(test_key.as_str(), PhantomData::)) .expect(&description); @@ -194,7 +194,7 @@ fn run_test(test: TestFile) { // deserialize the field from raw Bytes into an OwnedRawBson let deserializer_raw = - crate::de::RawDeserializer::new(canonical_bson.as_slice(), false).unwrap(); + crate::de::RawDeserializer::new(canonical_bson.as_slice()).unwrap(); let owned_raw_bson_field = deserializer_raw .deserialize_any(FieldVisitor(test_key.as_str(), PhantomData::)) .expect(&description); @@ -203,7 +203,7 @@ fn run_test(test: TestFile) { // deserialize the field from raw Bytes into a Bson let deserializer_value = - crate::de::RawDeserializer::new(canonical_bson.as_slice(), false).unwrap(); + crate::de::RawDeserializer::new(canonical_bson.as_slice()).unwrap(); let bson_field = deserializer_value .deserialize_any(FieldVisitor(test_key.as_str(), PhantomData::)) .expect(&description); @@ -555,13 +555,15 @@ fn run_test(test: TestFile) { crate::from_reader::<_, Document>(bson.as_slice()).expect_err(description.as_str()); if decode_error.description.contains("invalid UTF-8") { - #[allow(deprecated)] - crate::from_reader_utf8_lossy::<_, Document>(bson.as_slice()).unwrap_or_else(|err| { - panic!( - "{}: utf8_lossy should not fail (failed with {:?})", - description, err - ) - }); + RawDocumentBuf::from_reader(bson.as_slice()) + .expect(&description) + .to_document_utf8_lossy() + .unwrap_or_else(|err| { + panic!( + "{}: utf8_lossy should not fail (failed with {:?})", + description, err + ) + }); crate::from_slice::>(bson.as_slice()) .expect(&description); }