diff --git a/src/input/input_json.rs b/src/input/input_json.rs index b66de5395..9d7763c6f 100644 --- a/src/input/input_json.rs +++ b/src/input/input_json.rs @@ -1,4 +1,5 @@ use std::borrow::Cow; +use std::collections::HashSet; use jiter::{JsonArray, JsonObject, JsonValue}; use num_traits::cast::ToPrimitive; @@ -59,18 +60,35 @@ impl<'py, 'data> Input<'py> for JsonValue<'data> { } fn as_kwargs(&self, py: Python<'py>) -> Option<Bound<'py, PyDict>> { - match self { - JsonValue::Object(object) => { - let dict = PyDict::new(py); - for (k, v) in object.as_slice() { - // TODO: jiter doesn't deduplicate keys, so we should probably do that here to - // avoid potential wasted work creating Python objects. - dict.set_item(k, v).unwrap(); + let JsonValue::Object(object) = self else { + return None; + }; + + // deduplicate keys before creating objects to avoid wasted work + // jiter doesn't deduplicate keys, so duplicate keys in JSON will appear multiple times + // in the slice. We iterate backwards so that only the last value for each key is kept; + // note the kept keys are then emitted in the order of their last occurrence in the JSON + let unique_indices_reversed = { + let mut seen = HashSet::with_capacity(object.len()); + let mut unique = Vec::with_capacity(object.len()); + + for (i, (k, _)) in object.as_slice().iter().enumerate().rev() { + if seen.insert(k) { + unique.push(i); } - Some(dict) } - _ => None, + + unique + }; + + let object = object.as_slice(); + + let dict = PyDict::new(py); + for &i in unique_indices_reversed.iter().rev() { + let (k, v) = &object[i]; + dict.set_item(k, v).unwrap(); } + + Some(dict) } type Arguments<'a> diff --git a/tests/validators/test_dataclasses.py b/tests/validators/test_dataclasses.py index e25642d1f..4dec17eda 100644 --- a/tests/validators/test_dataclasses.py +++ b/tests/validators/test_dataclasses.py @@ -1845,3 +1845,46 @@ class MyDataclass: assert dataclasses.asdict( s.validate_python({'my_field': 1}, by_alias=runtime_by_alias, by_name=runtime_by_name) ) == {'my_field': 1} + + +def test_dataclass_json_duplicate_keys(): 
"""Test that duplicate keys in JSON are handled correctly (last value wins). + + We want to ensure that: + 1. The last value for a duplicate key is used (standard JSON behavior) + 2. We don't waste work creating Python objects for values that get overwritten + """ + + @dataclasses.dataclass + class MyDataclass: + name: str + age: int + + schema = core_schema.dataclass_schema( + MyDataclass, + core_schema.dataclass_args_schema( + 'MyDataclass', + [ + core_schema.dataclass_field(name='name', schema=core_schema.str_schema()), + core_schema.dataclass_field(name='age', schema=core_schema.int_schema()), + ], + ), + ['name', 'age'], + ) + v = SchemaValidator(schema) + + # json with duplicate keys - the last value should win + json_with_duplicates = '{"name": "Alice", "age": 30, "name": "Bob", "age": 25}' + result = v.validate_json(json_with_duplicates) + + assert result.name == 'Bob', "Last value for 'name' should win" + assert result.age == 25, "Last value for 'age' should win" + assert dataclasses.asdict(result) == {'name': 'Bob', 'age': 25} + + # test with multiple duplicates of the same key + json_multiple_duplicates = '{"name": "First", "age": 1, "name": "Second", "name": "Third", "age": 3}' + result2 = v.validate_json(json_multiple_duplicates) + + assert result2.name == 'Third', 'Last value among multiple duplicates should win' + assert result2.age == 3 + assert dataclasses.asdict(result2) == {'name': 'Third', 'age': 3}