Skip to content

Commit d0f3ebc

Browse files
committed
Address PR comments
1 parent e7768b4 commit d0f3ebc

File tree

3 files changed

+133
-65
lines changed

3 files changed

+133
-65
lines changed

kernel/src/lib.rs

-1
Original file line numberDiff line numberDiff line change
@@ -87,7 +87,6 @@ pub mod table_properties;
8787
pub mod transaction;
8888

8989
pub(crate) mod predicates;
90-
pub(crate) mod schema_compat;
9190
pub(crate) mod utils;
9291

9392
#[cfg(feature = "developer-visibility")]

kernel/src/schema.rs kernel/src/schema/mod.rs

+2
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,8 @@ pub(crate) use crate::expressions::{column_name, ColumnName};
1414
use crate::utils::require;
1515
use crate::{DeltaResult, Error};
1616

17+
pub(crate) mod schema_compare;
18+
1719
pub type Schema = StructType;
1820
pub type SchemaRef = Arc<StructType>;
1921

kernel/src/schema_compat.rs kernel/src/schema/schema_compare.rs

+131-64
Original file line numberDiff line numberDiff line change
@@ -15,104 +15,138 @@
1515
//! StructField::new("value", DataType::STRING, true),
1616
//! StructField::new("year", DataType::INTEGER, true),
1717
//! ]);
18+
//! // Schemas are compatible since the `year` value is nullable
1819
//! assert!(schema.can_read_as(&read_schema).is_ok());
1920
//! ````
2021
//!
2122
//! [`Schema`]: crate::schema::Schema
2223
use std::collections::{HashMap, HashSet};
2324

2425
use crate::utils::require;
25-
use crate::{DeltaResult, Error};
26+
27+
use super::{DataType, StructField, StructType};
28+
29+
/// Represents the ways a schema comparison can fail.
30+
#[derive(Debug, thiserror::Error)]
31+
pub(crate) enum Error {
32+
#[error("The nullability was tightened for a schema field")]
33+
NullabilityTightening,
34+
#[error("Field names do not match")]
35+
FieldNameMismatch,
36+
#[error("Schema is invalid")]
37+
InvalidSchema,
38+
#[error("The read schema is missing a column present in the schema")]
39+
MissingColumn,
40+
#[error("Read schema has a non-nullable column that is not present in the schema")]
41+
NewNonNullableColumn,
42+
#[error("Types for two schema fields did not match")]
43+
TypeMismatch,
44+
}
45+
46+
/// A [`std::result::Result`] that has the schema comparison [`Error`] as the error variant.
47+
#[allow(unused)]
48+
pub(crate) type SchemaComparisonResult = Result<(), Error>;
49+
50+
/// Represents a schema compatibility check for the type. If `self` can be read as `read_type`,
51+
/// this function returns `Ok(())`. Otherwise, this function returns `Err`.
52+
#[allow(unused)]
53+
pub(crate) trait SchemaComparison {
54+
fn can_read_as(&self, read_type: &Self) -> SchemaComparisonResult;
55+
}
2656

2757
/// The nullability flag of a schema's field. This can be compared with a read schema field's
2858
/// nullability flag using [`NullabilityFlag::can_read_as`].
29-
struct NullabilityFlag(bool);
59+
#[allow(unused)]
60+
pub(crate) struct NullabilityFlag(bool);
3061

31-
impl NullabilityFlag {
62+
impl SchemaComparison for NullabilityFlag {
3263
/// Represents a nullability comparison between two schemas' fields. Returns true if the
3364
/// read nullability is the same or wider than the nullability of self.
34-
fn can_read_as(&self, read_nullable: NullabilityFlag) -> DeltaResult<()> {
65+
fn can_read_as(&self, read_nullable: &NullabilityFlag) -> SchemaComparisonResult {
3566
// The case to avoid is when the column is nullable, but the read schema specifies the
3667
// column as non-nullable. So we avoid the case where !read_nullable && nullable
3768
// Hence we check that !(!read_nullable && existing_nullable)
3869
// == read_nullable || !existing_nullable
39-
require!(
40-
read_nullable.0 || !self.0,
41-
Error::generic("Read field is non-nullable while this field is nullable")
42-
);
70+
require!(read_nullable.0 || !self.0, Error::NullabilityTightening);
4371
Ok(())
4472
}
4573
}
4674

47-
impl crate::schema::StructField {
48-
/// Returns `Ok` if this [`StructField`] can be read as `read_field` in the read schema.
49-
fn can_read_as(&self, read_field: &Self) -> DeltaResult<()> {
50-
NullabilityFlag(self.nullable).can_read_as(NullabilityFlag(read_field.nullable))?;
51-
require!(
52-
self.name() == read_field.name(),
53-
Error::generic(format!(
54-
"Struct field with name {} cannot be read with name {}",
55-
self.name(),
56-
read_field.name()
57-
))
58-
);
75+
impl SchemaComparison for StructField {
76+
/// Returns `Ok` if this [`StructField`] can be read as `read_field`. Three requirements must
77+
/// be satisfied:
78+
/// 1. The read schema field mustn't be non-nullable if this [`StructField`] is nullable.
79+
/// 2. The both this field and `read_field` must have the same name.
80+
/// 3. You can read this data type as the `read_field`'s data type.
81+
fn can_read_as(&self, read_field: &Self) -> SchemaComparisonResult {
82+
NullabilityFlag(self.nullable).can_read_as(&NullabilityFlag(read_field.nullable))?;
83+
require!(self.name() == read_field.name(), Error::FieldNameMismatch);
5984
self.data_type().can_read_as(read_field.data_type())?;
6085
Ok(())
6186
}
6287
}
63-
impl crate::schema::StructType {
64-
/// Returns `Ok` if this [`StructType`] can be read as `read_type` in the read schema.
65-
#[allow(unused)]
66-
pub(crate) fn can_read_as(&self, read_type: &Self) -> DeltaResult<()> {
67-
let field_map: HashMap<String, &crate::schema::StructField> = self
88+
impl SchemaComparison for StructType {
89+
/// Returns `Ok` if this [`StructType`] can be read as `read_type`. This is the case when:
90+
/// 1. The set of fields in this struct type are a subset of the `read_type`.
91+
/// 2. For each field in this struct, you can read it as the `read_type`'s field. See
92+
/// [`StructField::can_read_as`].
93+
/// 3. If a field in `read_type` is not present in this struct, then it must be nullable.
94+
/// 4. Both [`StructTypes`] must be valid schemas. No two fields of a structs may share a
95+
/// name that only differs by case. TODO: This check should be moved into the constructor
96+
/// for [`StructType`].
97+
fn can_read_as(&self, read_type: &Self) -> SchemaComparisonResult {
98+
let lowercase_field_map: HashMap<String, &StructField> = self
6899
.fields
69100
.iter()
70101
.map(|(name, field)| (name.to_lowercase(), field))
71102
.collect();
72103
require!(
73-
field_map.len() == self.fields.len(),
74-
Error::generic("Delta tables don't allow field names that only differ by case")
104+
lowercase_field_map.len() == self.fields.len(),
105+
Error::InvalidSchema
75106
);
76107

77-
let read_field_names: HashSet<String> =
108+
let lowercase_read_field_names: HashSet<String> =
78109
read_type.fields.keys().map(|x| x.to_lowercase()).collect();
79110
require!(
80-
read_field_names.len() == read_type.fields.len(),
81-
Error::generic("Delta tables don't allow field names that only differ by case")
111+
lowercase_read_field_names.len() == read_type.fields.len(),
112+
Error::InvalidSchema
82113
);
83114

84115
// Check that the field names are a subset of the read fields.
85-
if !field_map.keys().all(|name| read_field_names.contains(name)) {
86-
return Err(Error::generic(
87-
"Struct has column that does not exist in the read schema",
88-
));
116+
if !lowercase_field_map
117+
.keys()
118+
.all(|name| lowercase_read_field_names.contains(name))
119+
{
120+
return Err(Error::MissingColumn);
89121
}
90122
for read_field in read_type.fields() {
91-
match field_map.get(&read_field.name().to_lowercase()) {
123+
match lowercase_field_map.get(&read_field.name().to_lowercase()) {
92124
Some(existing_field) => existing_field.can_read_as(read_field)?,
93125
None => {
94126
// Note: Delta spark does not perform the following check. Hence it ignores fields
95127
// that exist in the read schema that aren't in this schema.
96-
require!(
97-
read_field.is_nullable(),
98-
Error::generic(
99-
"read type has non-nullable column that does not exist in this struct",
100-
)
101-
);
128+
require!(read_field.is_nullable(), Error::NewNonNullableColumn);
102129
}
103130
}
104131
}
105132
Ok(())
106133
}
107134
}
108135

109-
impl crate::schema::DataType {
110-
/// Returns `Ok` if this [`DataType`] can be read as `read_type` in the read schema.
111-
fn can_read_as(&self, read_type: &Self) -> DeltaResult<()> {
136+
impl SchemaComparison for DataType {
137+
/// Returns `Ok` if this [`DataType`] can be read as `read_type`. This is the case when:
138+
/// 1. The data types are the same. Note: This condition will be relaxed to include
139+
/// compatible data types with type widening. See issue [`#623`]
140+
/// 2. For complex data types, the nested types must be compatible as defined by [`SchemaComparison`]
141+
/// 3. For array data types, the nullability may not be tightened in the `read_type`. See
142+
/// [`NullabilityFlag::can_read_as`]
143+
///
144+
/// [`#623`]: <https://github.com/delta-io/delta-kernel-rs/issues/623>
145+
fn can_read_as(&self, read_type: &Self) -> SchemaComparisonResult {
112146
match (self, read_type) {
113147
(Self::Array(self_array), Self::Array(read_array)) => {
114148
NullabilityFlag(self_array.contains_null())
115-
.can_read_as(NullabilityFlag(read_array.contains_null()))?;
149+
.can_read_as(&NullabilityFlag(read_array.contains_null()))?;
116150
self_array
117151
.element_type()
118152
.can_read_as(read_array.element_type())?;
@@ -122,17 +156,14 @@ impl crate::schema::DataType {
122156
}
123157
(Self::Map(self_map), Self::Map(read_map)) => {
124158
NullabilityFlag(self_map.value_contains_null())
125-
.can_read_as(NullabilityFlag(read_map.value_contains_null()))?;
159+
.can_read_as(&NullabilityFlag(read_map.value_contains_null()))?;
126160
self_map.key_type().can_read_as(read_map.key_type())?;
127161
self_map.value_type().can_read_as(read_map.value_type())?;
128162
}
129163
(a, b) => {
130164
// TODO: In the future, we will change this to support type widening.
131-
// See: https://github.com/delta-io/delta-kernel-rs/issues/623
132-
require!(
133-
a == b,
134-
Error::generic(format!("Types {} and {} are not compatible", a, b))
135-
);
165+
// See: #623
166+
require!(a == b, Error::TypeMismatch);
136167
}
137168
};
138169
Ok(())
@@ -141,6 +172,7 @@ impl crate::schema::DataType {
141172

142173
#[cfg(test)]
143174
mod tests {
175+
use crate::schema::schema_compare::{Error, SchemaComparison};
144176
use crate::schema::{ArrayType, DataType, MapType, StructField, StructType};
145177

146178
#[test]
@@ -220,7 +252,10 @@ mod tests {
220252
false,
221253
)]);
222254

223-
assert!(existing_schema.can_read_as(&read_schema).is_err());
255+
assert!(matches!(
256+
existing_schema.can_read_as(&read_schema),
257+
Err(Error::NullabilityTightening)
258+
));
224259
}
225260
#[test]
226261
fn different_field_name_case_fails() {
@@ -235,7 +270,10 @@ mod tests {
235270
StructField::new("name", DataType::STRING, false),
236271
StructField::new("age", DataType::INTEGER, true),
237272
]);
238-
assert!(existing_schema.can_read_as(&read_schema).is_err());
273+
assert!(matches!(
274+
existing_schema.can_read_as(&read_schema),
275+
Err(Error::FieldNameMismatch)
276+
));
239277
}
240278
#[test]
241279
fn different_type_fails() {
@@ -249,7 +287,10 @@ mod tests {
249287
StructField::new("name", DataType::STRING, false),
250288
StructField::new("age", DataType::INTEGER, true),
251289
]);
252-
assert!(existing_schema.can_read_as(&read_schema).is_err());
290+
assert!(matches!(
291+
existing_schema.can_read_as(&read_schema),
292+
Err(Error::TypeMismatch)
293+
));
253294
}
254295
#[test]
255296
fn set_nullable_to_true() {
@@ -277,40 +318,57 @@ mod tests {
277318
StructField::new("name", DataType::STRING, false),
278319
StructField::new("age", DataType::INTEGER, false),
279320
]);
280-
assert!(existing_schema.can_read_as(&read_schema).is_err());
321+
assert!(matches!(
322+
existing_schema.can_read_as(&read_schema),
323+
Err(Error::NullabilityTightening)
324+
));
281325
}
282326
#[test]
283-
fn new_nullable_column() {
284-
let existing_schema = StructType::new([
327+
fn differ_by_nullable_column() {
328+
let a = StructType::new([
285329
StructField::new("id", DataType::LONG, false),
286330
StructField::new("name", DataType::STRING, false),
287331
StructField::new("age", DataType::INTEGER, true),
288332
]);
289333

290-
let read_schema = StructType::new([
334+
let b = StructType::new([
291335
StructField::new("id", DataType::LONG, false),
292336
StructField::new("name", DataType::STRING, false),
293337
StructField::new("age", DataType::INTEGER, true),
294338
StructField::new("location", DataType::STRING, true),
295339
]);
296-
assert!(existing_schema.can_read_as(&read_schema).is_ok());
340+
341+
// Read `a` as `b`. `b` adds a new nullable column. This is compatible with `a`'s schema.
342+
assert!(a.can_read_as(&b).is_ok());
343+
344+
// Read `b` as `a`. `a` is missing a column that is present in `b`.
345+
assert!(matches!(b.can_read_as(&a), Err(Error::MissingColumn)));
297346
}
298347
#[test]
299-
fn new_non_nullable_column_fails() {
300-
let existing_schema = StructType::new([
348+
fn differ_by_non_nullable_column() {
349+
let a = StructType::new([
301350
StructField::new("id", DataType::LONG, false),
302351
StructField::new("name", DataType::STRING, false),
303352
StructField::new("age", DataType::INTEGER, true),
304353
]);
305354

306-
let read_schema = StructType::new([
355+
let b = StructType::new([
307356
StructField::new("id", DataType::LONG, false),
308357
StructField::new("name", DataType::STRING, false),
309358
StructField::new("age", DataType::INTEGER, true),
310359
StructField::new("location", DataType::STRING, false),
311360
]);
312-
assert!(existing_schema.can_read_as(&read_schema).is_err());
361+
362+
// Read `a` as `b`. `b` has an extra non-nullable column.
363+
assert!(matches!(
364+
a.can_read_as(&b),
365+
Err(Error::NewNonNullableColumn)
366+
));
367+
368+
// Read `b` as `a`. `a` is missing a column that is present in `b`.
369+
assert!(matches!(b.can_read_as(&a), Err(Error::MissingColumn)));
313370
}
371+
314372
#[test]
315373
fn duplicate_field_modulo_case() {
316374
let existing_schema = StructType::new([
@@ -326,6 +384,15 @@ mod tests {
326384
StructField::new("name", DataType::STRING, false),
327385
StructField::new("age", DataType::INTEGER, true),
328386
]);
329-
assert!(existing_schema.can_read_as(&read_schema).is_err());
387+
assert!(matches!(
388+
existing_schema.can_read_as(&read_schema),
389+
Err(Error::InvalidSchema)
390+
));
391+
392+
// Checks in the inverse order
393+
assert!(matches!(
394+
read_schema.can_read_as(&existing_schema),
395+
Err(Error::InvalidSchema)
396+
));
330397
}
331398
}

0 commit comments

Comments
 (0)