Skip to content

Commit 8389d37

Browse files
authored
RUST-1998 Remove lossy utf8 as a decoder option (#550)
1 parent 328d540 commit 8389d37

File tree

13 files changed: 191 additions & 82 deletions (+191 -82 lines changed)

src/datetime.rs

Lines changed: 0 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -386,13 +386,6 @@ impl crate::DateTime {
386386
}
387387
}
388388

389-
#[deprecated(since = "2.3.0", note = "Use try_to_rfc3339_string instead.")]
390-
/// Convert this [`DateTime`] to an RFC 3339 formatted string. Panics if it could not be
391-
/// represented in that format.
392-
pub fn to_rfc3339_string(self) -> String {
393-
self.try_to_rfc3339_string().unwrap()
394-
}
395-
396389
/// Convert this [`DateTime`] to an RFC 3339 formatted string.
397390
pub fn try_to_rfc3339_string(self) -> Result<String> {
398391
self.to_time_0_3()

src/de.rs

Lines changed: 1 addition & 29 deletions
Original file line number | Diff line number | Diff line change
@@ -137,40 +137,12 @@ where
137137
from_slice(bytes.as_slice())
138138
}
139139

140-
/// Deserialize an instance of type `T` from an I/O stream of BSON, replacing any invalid UTF-8
141-
/// sequences with the Unicode replacement character.
142-
///
143-
/// This is mainly useful when reading raw BSON returned from a MongoDB server, which
144-
/// in rare cases can contain invalidly truncated strings (<https://jira.mongodb.org/browse/SERVER-24007>).
145-
/// For most use cases, [`crate::from_reader`] can be used instead.
146-
pub fn from_reader_utf8_lossy<R, T>(reader: R) -> Result<T>
147-
where
148-
T: DeserializeOwned,
149-
R: Read,
150-
{
151-
let bytes = reader_to_vec(reader)?;
152-
from_slice_utf8_lossy(bytes.as_slice())
153-
}
154-
155140
/// Deserialize an instance of type `T` from a slice of BSON bytes.
156141
pub fn from_slice<'de, T>(bytes: &'de [u8]) -> Result<T>
157142
where
158143
T: Deserialize<'de>,
159144
{
160-
from_raw(raw::Deserializer::new(bytes, false)?)
161-
}
162-
163-
/// Deserialize an instance of type `T` from a slice of BSON bytes, replacing any invalid UTF-8
164-
/// sequences with the Unicode replacement character.
165-
///
166-
/// This is mainly useful when reading raw BSON returned from a MongoDB server, which
167-
/// in rare cases can contain invalidly truncated strings (<https://jira.mongodb.org/browse/SERVER-24007>).
168-
/// For most use cases, [`crate::from_slice`] can be used instead.
169-
pub fn from_slice_utf8_lossy<'de, T>(bytes: &'de [u8]) -> Result<T>
170-
where
171-
T: Deserialize<'de>,
172-
{
173-
from_raw(raw::Deserializer::new(bytes, true)?)
145+
from_raw(raw::Deserializer::new(bytes)?)
174146
}
175147

176148
pub(crate) fn from_raw<'de, T: Deserialize<'de>>(

src/de/raw.rs

Lines changed: 4 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -50,11 +50,11 @@ struct DeserializerOptions {
5050
}
5151

5252
impl<'de> Deserializer<'de> {
53-
pub(crate) fn new(buf: &'de [u8], utf8_lossy: bool) -> Result<Self> {
53+
pub(crate) fn new(buf: &'de [u8]) -> Result<Self> {
5454
Ok(Self {
5555
element: RawElement::toplevel(buf)?,
5656
options: DeserializerOptions {
57-
utf8_lossy,
57+
utf8_lossy: false,
5858
human_readable: false,
5959
},
6060
})
@@ -71,7 +71,7 @@ impl<'de> Deserializer<'de> {
7171
V: serde::de::Visitor<'de>,
7272
{
7373
if self.options.utf8_lossy {
74-
if let Some(lossy) = self.element.value_utf8_lossy()? {
74+
if let Some(lossy) = self.element.value_utf8_lossy_inner()? {
7575
return match lossy {
7676
Utf8LossyBson::String(s) => visitor.visit_string(s),
7777
Utf8LossyBson::RegularExpression(re) => {
@@ -178,7 +178,7 @@ impl<'de> Deserializer<'de> {
178178

179179
fn get_string(&self) -> Result<Cow<'de, str>> {
180180
if self.options.utf8_lossy {
181-
let value = self.element.value_utf8_lossy()?;
181+
let value = self.element.value_utf8_lossy_inner()?;
182182
let s = match value {
183183
Some(Utf8LossyBson::String(s)) => s,
184184
_ => {

src/document.rs

Lines changed: 3 additions & 14 deletions
Original file line number | Diff line number | Diff line change
@@ -695,9 +695,9 @@ impl Document {
695695
Ok(())
696696
}
697697

698-
fn decode<R: Read + ?Sized>(reader: &mut R, utf_lossy: bool) -> crate::de::Result<Document> {
698+
fn decode<R: Read + ?Sized>(reader: &mut R) -> crate::de::Result<Document> {
699699
let buf = crate::de::reader_to_vec(reader)?;
700-
crate::de::from_raw(crate::de::RawDeserializer::new(&buf, utf_lossy)?)
700+
crate::de::from_raw(crate::de::RawDeserializer::new(&buf)?)
701701
}
702702

703703
/// Attempts to deserialize a [`Document`] from a byte stream.
@@ -729,18 +729,7 @@ impl Document {
729729
/// # }
730730
/// ```
731731
pub fn from_reader<R: Read>(mut reader: R) -> crate::de::Result<Document> {
732-
Self::decode(&mut reader, false)
733-
}
734-
735-
/// Attempt to deserialize a [`Document`] that may contain invalid UTF-8 strings from a byte
736-
/// stream.
737-
///
738-
/// This is mainly useful when reading raw BSON returned from a MongoDB server, which
739-
/// in rare cases can contain invalidly truncated strings (<https://jira.mongodb.org/browse/SERVER-24007>).
740-
/// For most use cases, `Document::from_reader` can be used instead.
741-
#[deprecated = "use bson::serde_helpers::Utf8LossyDeserialization"]
742-
pub fn from_reader_utf8_lossy<R: Read>(mut reader: R) -> crate::de::Result<Document> {
743-
Self::decode(&mut reader, true)
732+
Self::decode(&mut reader)
744733
}
745734
}
746735

src/error.rs

Lines changed: 15 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -50,6 +50,11 @@ pub enum ErrorKind {
5050
/// The kind of error that occurred.
5151
kind: ValueAccessErrorKind,
5252
},
53+
54+
/// A wrapped deserialization error.
55+
/// TODO RUST-1406: collapse this
56+
#[error("Deserialization error")]
57+
DeError(crate::de::Error),
5358
}
5459

5560
impl From<ErrorKind> for Error {
@@ -62,6 +67,16 @@ impl From<ErrorKind> for Error {
6267
}
6368
}
6469

70+
impl From<crate::de::Error> for Error {
71+
fn from(value: crate::de::Error) -> Self {
72+
Self {
73+
kind: ErrorKind::DeError(value),
74+
key: None,
75+
index: None,
76+
}
77+
}
78+
}
79+
6580
/// The types of errors that can occur when attempting to access a value in a document.
6681
#[derive(Debug, Error)]
6782
#[non_exhaustive]

src/lib.rs

Lines changed: 0 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -308,9 +308,6 @@ pub use self::{
308308
uuid::{Uuid, UuidRepresentation},
309309
};
310310

311-
#[allow(deprecated)]
312-
pub use self::de::{from_reader_utf8_lossy, from_slice_utf8_lossy};
313-
314311
#[macro_use]
315312
mod macros;
316313
mod base64;

src/raw/array.rs

Lines changed: 14 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -212,6 +212,20 @@ impl RawArray {
212212
pub fn is_empty(&self) -> bool {
213213
self.doc.is_empty()
214214
}
215+
216+
/// Gets an iterator over the elements in the [`RawArray`],
217+
/// which yields `Result<RawElement<'_>>` values. These hold a
218+
/// reference to the underlying array but do not explicitly
219+
/// resolve the values.
220+
///
221+
/// This iterator, which underpins the implementation of the
222+
/// default iterator, produces `RawElement` objects that hold a
223+
/// view onto the array but do not parse out or construct
224+
/// values until the `.value()` or `.try_into()` methods are
225+
/// called.
226+
pub fn iter_elements(&self) -> RawIter {
227+
RawIter::new(&self.doc)
228+
}
215229
}
216230

217231
impl std::fmt::Debug for RawArray {

src/raw/document.rs

Lines changed: 53 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -9,7 +9,11 @@ use crate::{
99
de::MIN_BSON_DOCUMENT_SIZE,
1010
error::{Error, Result},
1111
raw::{serde::OwnedOrBorrowedRawDocument, RAW_DOCUMENT_NEWTYPE},
12+
Bson,
1213
DateTime,
14+
JavaScriptCodeWithScope,
15+
RawBson,
16+
RawJavaScriptCodeWithScope,
1317
Timestamp,
1418
};
1519

@@ -508,6 +512,55 @@ impl RawDocument {
508512
let bytes = self.cstring_bytes_at(start_at)?;
509513
try_to_str(bytes)
510514
}
515+
516+
/// Copy this into a [`Document`], returning an error if invalid BSON is encountered.
517+
pub fn to_document(&self) -> RawResult<Document> {
518+
self.try_into()
519+
}
520+
521+
/// Copy this into a [`Document`], returning an error if invalid BSON is encountered. Any
522+
/// invalid UTF-8 sequences will be replaced with the Unicode replacement character.
523+
pub fn to_document_utf8_lossy(&self) -> RawResult<Document> {
524+
let mut out = Document::new();
525+
for elem in self.iter_elements() {
526+
let elem = elem?;
527+
let value = deep_utf8_lossy(elem.value_utf8_lossy()?)?;
528+
out.insert(elem.key(), value);
529+
}
530+
Ok(out)
531+
}
532+
}
533+
534+
fn deep_utf8_lossy(src: RawBson) -> RawResult<Bson> {
535+
match src {
536+
RawBson::Array(arr) => {
537+
let mut tmp = vec![];
538+
for elem in arr.iter_elements() {
539+
tmp.push(deep_utf8_lossy(elem?.value_utf8_lossy()?)?);
540+
}
541+
Ok(Bson::Array(tmp))
542+
}
543+
RawBson::Document(doc) => {
544+
let mut tmp = doc! {};
545+
for elem in doc.iter_elements() {
546+
let elem = elem?;
547+
tmp.insert(elem.key(), deep_utf8_lossy(elem.value_utf8_lossy()?)?);
548+
}
549+
Ok(Bson::Document(tmp))
550+
}
551+
RawBson::JavaScriptCodeWithScope(RawJavaScriptCodeWithScope { code, scope }) => {
552+
let mut tmp = doc! {};
553+
for elem in scope.iter_elements() {
554+
let elem = elem?;
555+
tmp.insert(elem.key(), deep_utf8_lossy(elem.value_utf8_lossy()?)?);
556+
}
557+
Ok(Bson::JavaScriptCodeWithScope(JavaScriptCodeWithScope {
558+
code,
559+
scope: tmp,
560+
}))
561+
}
562+
v => v.try_into(),
563+
}
511564
}
512565

513566
impl<'de: 'a, 'a> Deserialize<'de> for &'a RawDocument {

src/raw/document_buf.rs

Lines changed: 8 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,6 @@
11
use std::{
22
borrow::{Borrow, Cow},
3-
convert::{TryFrom, TryInto},
3+
convert::TryFrom,
44
iter::FromIterator,
55
ops::Deref,
66
};
@@ -65,7 +65,7 @@ pub struct RawDocumentBuf {
6565

6666
impl RawDocumentBuf {
6767
/// Creates a new, empty [`RawDocumentBuf`].
68-
pub fn new() -> RawDocumentBuf {
68+
pub fn new() -> Self {
6969
let mut data = Vec::new();
7070
data.extend(MIN_BSON_DOCUMENT_SIZE.to_le_bytes());
7171
data.push(0);
@@ -89,11 +89,16 @@ impl RawDocumentBuf {
8989
/// let doc = RawDocumentBuf::from_bytes(b"\x05\0\0\0\0".to_vec())?;
9090
/// # Ok::<(), bson::error::Error>(())
9191
/// ```
92-
pub fn from_bytes(data: Vec<u8>) -> Result<RawDocumentBuf> {
92+
pub fn from_bytes(data: Vec<u8>) -> Result<Self> {
9393
let _ = RawDocument::from_bytes(data.as_slice())?;
9494
Ok(Self { data })
9595
}
9696

97+
pub fn from_reader<R: std::io::Read>(reader: R) -> Result<Self> {
98+
let buf = crate::de::reader_to_vec(reader)?;
99+
Self::from_bytes(buf)
100+
}
101+
97102
/// Create a [`RawDocumentBuf`] from a [`Document`].
98103
///
99104
/// ```
@@ -213,12 +218,6 @@ impl RawDocumentBuf {
213218
.expect("key should not contain interior null byte")
214219
})
215220
}
216-
217-
/// Convert this [`RawDocumentBuf`] to a [`Document`], returning an error
218-
/// if invalid BSON is encountered.
219-
pub fn to_document(&self) -> Result<Document> {
220-
self.as_ref().try_into()
221-
}
222221
}
223222

224223
impl Default for RawDocumentBuf {

src/raw/iter.rs

Lines changed: 27 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -265,7 +265,14 @@ impl<'a> RawElement<'a> {
265265
})
266266
}
267267

268-
pub(crate) fn value_utf8_lossy(&self) -> Result<Option<Utf8LossyBson<'a>>> {
268+
pub fn value_utf8_lossy(&self) -> Result<RawBson> {
269+
match self.value_utf8_lossy_inner()? {
270+
Some(v) => Ok(v.into()),
271+
None => Ok(self.value()?.to_raw_bson()),
272+
}
273+
}
274+
275+
pub(crate) fn value_utf8_lossy_inner(&self) -> Result<Option<Utf8LossyBson<'a>>> {
269276
Ok(Some(match self.kind {
270277
ElementType::String => Utf8LossyBson::String(self.read_utf8_lossy()),
271278
ElementType::JavaScriptCode => Utf8LossyBson::JavaScriptCode(self.read_utf8_lossy()),
@@ -452,3 +459,22 @@ pub(crate) struct Utf8LossyJavaScriptCodeWithScope<'a> {
452459
pub(crate) code: String,
453460
pub(crate) scope: &'a RawDocument,
454461
}
462+
463+
impl<'a> From<Utf8LossyBson<'a>> for RawBson {
464+
fn from(value: Utf8LossyBson<'a>) -> Self {
465+
match value {
466+
Utf8LossyBson::String(s) => RawBson::String(s),
467+
Utf8LossyBson::JavaScriptCode(s) => RawBson::JavaScriptCode(s),
468+
Utf8LossyBson::JavaScriptCodeWithScope(Utf8LossyJavaScriptCodeWithScope {
469+
code,
470+
scope,
471+
}) => RawBson::JavaScriptCodeWithScope(super::RawJavaScriptCodeWithScope {
472+
code,
473+
scope: scope.to_raw_document_buf(),
474+
}),
475+
Utf8LossyBson::Symbol(s) => RawBson::Symbol(s),
476+
Utf8LossyBson::DbPointer(p) => RawBson::DbPointer(p),
477+
Utf8LossyBson::RegularExpression(r) => RawBson::RegularExpression(r),
478+
}
479+
}
480+
}

src/serde_helpers.rs

Lines changed: 47 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -817,7 +817,8 @@ where
817817
///
818818
/// This wrapper type has no impact on serialization. Serializing a `Utf8LossyDeserialization<T>`
819819
/// will call the `serialize` method for the wrapped `T`.
820-
#[derive(PartialEq, Eq, PartialOrd, Ord, Debug)]
820+
#[derive(Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Debug, Hash, Default)]
821+
#[repr(transparent)]
821822
pub struct Utf8LossyDeserialization<T>(pub T);
822823

823824
pub(crate) const UTF8_LOSSY_NEWTYPE: &str = "$__bson_private_utf8_lossy";
@@ -852,3 +853,48 @@ impl<'de, T: Deserialize<'de>> Deserialize<'de> for Utf8LossyDeserialization<T>
852853
deserializer.deserialize_newtype_struct(UTF8_LOSSY_NEWTYPE, V(PhantomData))
853854
}
854855
}
856+
857+
impl<T: std::fmt::Display> std::fmt::Display for Utf8LossyDeserialization<T> {
858+
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
859+
self.0.fmt(f)
860+
}
861+
}
862+
863+
impl<T> From<T> for Utf8LossyDeserialization<T> {
864+
fn from(value: T) -> Self {
865+
Self(value)
866+
}
867+
}
868+
869+
impl<T> Deref for Utf8LossyDeserialization<T> {
870+
type Target = T;
871+
872+
fn deref(&self) -> &Self::Target {
873+
&self.0
874+
}
875+
}
876+
877+
impl<T> DerefMut for Utf8LossyDeserialization<T> {
878+
fn deref_mut(&mut self) -> &mut Self::Target {
879+
&mut self.0
880+
}
881+
}
882+
883+
impl<T, R> AsRef<R> for Utf8LossyDeserialization<T>
884+
where
885+
R: ?Sized,
886+
<Utf8LossyDeserialization<T> as Deref>::Target: AsRef<R>,
887+
{
888+
fn as_ref(&self) -> &R {
889+
self.deref().as_ref()
890+
}
891+
}
892+
893+
impl<T, R: ?Sized> AsMut<R> for Utf8LossyDeserialization<T>
894+
where
895+
<Utf8LossyDeserialization<T> as Deref>::Target: AsMut<R>,
896+
{
897+
fn as_mut(&mut self) -> &mut R {
898+
self.deref_mut().as_mut()
899+
}
900+
}

0 commit comments

Comments (0)