Commit 9672f39

JsonValue: Deduplicate keys before populating Dict (#1865)
1 parent db94945

File tree: 2 files changed, +70 −9 lines


src/input/input_json.rs

Lines changed: 27 additions & 9 deletions

@@ -1,4 +1,5 @@
 use std::borrow::Cow;
+use std::collections::HashSet;
 
 use jiter::{JsonArray, JsonObject, JsonValue};
 use num_traits::cast::ToPrimitive;
@@ -59,18 +60,35 @@ impl<'py, 'data> Input<'py> for JsonValue<'data> {
     }

     fn as_kwargs(&self, py: Python<'py>) -> Option<Bound<'py, PyDict>> {
-        match self {
-            JsonValue::Object(object) => {
-                let dict = PyDict::new(py);
-                for (k, v) in object.as_slice() {
-                    // TODO: jiter doesn't deduplicate keys, so we should probably do that here to
-                    // avoid potential wasted work creating Python objects.
-                    dict.set_item(k, v).unwrap();
+        let JsonValue::Object(object) = self else {
+            return None;
+        };
+
+        // deduplicate keys before creating objects to avoid wasted work
+        // jiter doesn't deduplicate keys, so duplicate keys in JSON will appear multiple times
+        // in the slice. We iterate backwards to keep only the last value for each key while preserving order
+        let unique_indices_reversed = {
+            let mut seen = HashSet::with_capacity(object.len());
+            let mut unique = Vec::with_capacity(object.len());
+
+            for (i, (k, _)) in object.as_slice().iter().enumerate().rev() {
+                if seen.insert(k) {
+                    unique.push(i);
                 }
-                Some(dict)
             }
-            _ => None,
+
+            unique
+        };
+
+        let object = object.as_slice();
+
+        let dict = PyDict::new(py);
+        for &i in unique_indices_reversed.iter().rev() {
+            let (k, v) = &object[i];
+            dict.set_item(k, v).unwrap();
        }
+
+        Some(dict)
     }

     type Arguments<'a>
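The comment in the new as_kwargs body is the heart of the change: walk the key/value slice backwards, record the index of the first occurrence of each key seen (which is its last occurrence in the input), and only build Python objects for those entries. Below is a minimal standalone sketch of the same idea on plain Rust types, with the hypothetical helper name unique_indices_keep_last — it is not part of pydantic-core or jiter, just an illustration.

// Standalone sketch; unique_indices_keep_last is a made-up name for illustration.
use std::collections::HashSet;

/// Indices of the entries that survive deduplication: the *last* occurrence of
/// each key wins, and surviving entries keep their original relative order.
fn unique_indices_keep_last(pairs: &[(String, i64)]) -> Vec<usize> {
    let mut seen = HashSet::with_capacity(pairs.len());
    let mut unique_reversed = Vec::with_capacity(pairs.len());
    // Walking backwards, the first time we see a key is its last occurrence.
    for (i, (k, _)) in pairs.iter().enumerate().rev() {
        if seen.insert(k) {
            unique_reversed.push(i);
        }
    }
    // The indices were collected back-to-front; reverse to restore input order.
    // (as_kwargs above skips this step and instead iterates the reversed list
    // with .iter().rev() when building the dict.)
    unique_reversed.reverse();
    unique_reversed
}

fn main() {
    let pairs = vec![
        ("name".to_string(), 1),
        ("age".to_string(), 30),
        ("name".to_string(), 2),
        ("age".to_string(), 25),
    ];
    let kept: Vec<&(String, i64)> = unique_indices_keep_last(&pairs)
        .into_iter()
        .map(|i| &pairs[i])
        .collect();
    // Only these two entries would need to be converted into Python objects:
    // [("name", 2), ("age", 25)]
    println!("{kept:?}");
}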

tests/validators/test_dataclasses.py

Lines changed: 43 additions & 0 deletions

@@ -1845,3 +1845,46 @@ class MyDataclass:
     assert dataclasses.asdict(
         s.validate_python({'my_field': 1}, by_alias=runtime_by_alias, by_name=runtime_by_name)
     ) == {'my_field': 1}
+
+
+def test_dataclass_json_duplicate_keys():
+    """Test that duplicate keys in JSON are handled correctly (last value wins).
+
+    We want to ensure that:
+    1. The last value for a duplicate key is used (standard JSON behavior)
+    2. We don't waste work creating Python objects for values that get overwritten
+    """
+
+    @dataclasses.dataclass
+    class MyDataclass:
+        name: str
+        age: int
+
+    schema = core_schema.dataclass_schema(
+        MyDataclass,
+        core_schema.dataclass_args_schema(
+            'MyDataclass',
+            [
+                core_schema.dataclass_field(name='name', schema=core_schema.str_schema()),
+                core_schema.dataclass_field(name='age', schema=core_schema.int_schema()),
+            ],
+        ),
+        ['name', 'age'],
+    )
+    v = SchemaValidator(schema)
+
+    # json with duplicate keys - the last value should win
+    json_with_duplicates = '{"name": "Alice", "age": 30, "name": "Bob", "age": 25}'
+    result = v.validate_json(json_with_duplicates)
+
+    assert result.name == 'Bob', "Last value for 'name' should win"
+    assert result.age == 25, "Last value for 'age' should win"
+    assert dataclasses.asdict(result) == {'name': 'Bob', 'age': 25}
+
+    # test with multiple duplicates of the same key
+    json_multiple_duplicates = '{"name": "First", "age": 1, "name": "Second", "name": "Third", "age": 3}'
+    result2 = v.validate_json(json_multiple_duplicates)
+
+    assert result2.name == 'Third', 'Last value among multiple duplicates should win'
+    assert result2.age == 3
+    assert dataclasses.asdict(result2) == {'name': 'Third', 'age': 3}
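The docstring's second point — not wasting work on values that get overwritten — is the reason the deduplication happens before any Python objects are created. Here is a rough standalone sketch of that difference, in which the hypothetical expensive_convert function and its counter stand in for Python object creation; none of these names come from pydantic-core.

use std::collections::{HashMap, HashSet};

// Stand-in for creating a Python object from a JSON value; the counter just
// records how many conversions actually happen.
fn expensive_convert(v: &str, counter: &mut usize) -> String {
    *counter += 1;
    v.to_uppercase()
}

fn main() {
    let pairs = [("name", "alice"), ("age", "30"), ("name", "bob"), ("age", "25")];

    // Naive: convert every value; later duplicates overwrite earlier ones.
    let mut naive_conversions = 0;
    let mut naive: HashMap<&str, String> = HashMap::new();
    for (k, v) in pairs {
        naive.insert(k, expensive_convert(v, &mut naive_conversions));
    }

    // Dedup-first: keep only the last occurrence of each key, then convert.
    let mut dedup_conversions = 0;
    let mut seen = HashSet::new();
    let mut kept: Vec<(&str, &str)> = Vec::new();
    for &(k, v) in pairs.iter().rev() {
        if seen.insert(k) {
            kept.push((k, v));
        }
    }
    kept.reverse();
    let deduped: HashMap<&str, String> = kept
        .into_iter()
        .map(|(k, v)| (k, expensive_convert(v, &mut dedup_conversions)))
        .collect();

    // Same final mapping either way, but only half the conversions were needed.
    assert_eq!(naive, deduped);
    println!("naive: {naive_conversions} conversions, dedup-first: {dedup_conversions}");
    // prints: naive: 4 conversions, dedup-first: 2
}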
