//!     StructField::new("value", DataType::STRING, true),
//!     StructField::new("year", DataType::INTEGER, true),
//! ]);
+//! // Schemas are compatible since the `year` field is nullable
//! assert!(schema.can_read_as(&read_schema).is_ok());
//! ````
//!
//! [`Schema`]: crate::schema::Schema
use std::collections::{HashMap, HashSet};

use crate::utils::require;
-use crate::{DeltaResult, Error};
+
+use super::{DataType, StructField, StructType};
+
+/// Represents the ways a schema comparison can fail.
+#[derive(Debug, thiserror::Error)]
+pub(crate) enum Error {
+    #[error("The nullability was tightened for a schema field")]
+    NullabilityTightening,
+    #[error("Field names do not match")]
+    FieldNameMismatch,
+    #[error("Schema is invalid")]
+    InvalidSchema,
+    #[error("The read schema is missing a column present in the schema")]
+    MissingColumn,
+    #[error("Read schema has a non-nullable column that is not present in the schema")]
+    NewNonNullableColumn,
+    #[error("Types for two schema fields did not match")]
+    TypeMismatch,
+}
+
+/// A [`std::result::Result`] that has the schema comparison [`Error`] as its error variant.
+#[allow(unused)]
+pub(crate) type SchemaComparisonResult = Result<(), Error>;
+
+/// Represents a schema compatibility check for a type. If `self` can be read as `read_type`,
+/// the check returns `Ok(())`. Otherwise, it returns an [`Error`] describing the incompatibility.
+#[allow(unused)]
+pub(crate) trait SchemaComparison {
+    fn can_read_as(&self, read_type: &Self) -> SchemaComparisonResult;
+}
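For context, a minimal sketch of how this crate-internal trait is intended to be used elsewhere in the crate. The module path `crate::schema::schema_compare` is taken from the test imports below; the helper function itself is hypothetical and illustrative only.

use crate::schema::schema_compare::SchemaComparison;
use crate::schema::{DataType, StructField, StructType};

// Hypothetical helper: returns true when data written with `existing` can be read
// through `read`, per the rules implemented in this module.
fn is_read_compatible(existing: &StructType, read: &StructType) -> bool {
    existing.can_read_as(read).is_ok()
}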

/// The nullability flag of a schema's field. This can be compared with a read schema field's
/// nullability flag using [`NullabilityFlag::can_read_as`].
-struct NullabilityFlag(bool);
+#[allow(unused)]
+pub(crate) struct NullabilityFlag(bool);

-impl NullabilityFlag {
+impl SchemaComparison for NullabilityFlag {
    /// Represents a nullability comparison between two schemas' fields. Returns `Ok(())` if the
    /// read nullability is the same or wider than the nullability of `self`.
-    fn can_read_as(&self, read_nullable: NullabilityFlag) -> DeltaResult<()> {
+    fn can_read_as(&self, read_nullable: &NullabilityFlag) -> SchemaComparisonResult {
        // The case to avoid is when the column is nullable, but the read schema specifies the
        // column as non-nullable. So we avoid the case where !read_nullable && nullable.
        // Hence we check that !(!read_nullable && existing_nullable)
        //   == read_nullable || !existing_nullable
-        require!(
-            read_nullable.0 || !self.0,
-            Error::generic("Read field is non-nullable while this field is nullable")
-        );
+        require!(read_nullable.0 || !self.0, Error::NullabilityTightening);
        Ok(())
    }
}
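The boolean identity in the comment above can be sanity-checked directly. A minimal sketch, assuming it runs inside this module (the tuple field of `NullabilityFlag` is private to it), for example as a `#[test]` body:

// Enumerate the four (existing, read) nullability combinations; only existing = true
// with read = false (a nullability tightening) should be rejected.
for existing in [false, true] {
    for read in [false, true] {
        let ok = NullabilityFlag(existing)
            .can_read_as(&NullabilityFlag(read))
            .is_ok();
        assert_eq!(ok, read || !existing);
    }
}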

-impl crate::schema::StructField {
-    /// Returns `Ok` if this [`StructField`] can be read as `read_field` in the read schema.
-    fn can_read_as(&self, read_field: &Self) -> DeltaResult<()> {
-        NullabilityFlag(self.nullable).can_read_as(NullabilityFlag(read_field.nullable))?;
-        require!(
-            self.name() == read_field.name(),
-            Error::generic(format!(
-                "Struct field with name {} cannot be read with name {}",
-                self.name(),
-                read_field.name()
-            ))
-        );
+impl SchemaComparison for StructField {
+    /// Returns `Ok` if this [`StructField`] can be read as `read_field`. Three requirements must
+    /// be satisfied:
+    /// 1. The read schema field mustn't be non-nullable if this [`StructField`] is nullable.
+    /// 2. Both this field and `read_field` must have the same name.
+    /// 3. This field's data type must be readable as `read_field`'s data type.
+    fn can_read_as(&self, read_field: &Self) -> SchemaComparisonResult {
+        NullabilityFlag(self.nullable).can_read_as(&NullabilityFlag(read_field.nullable))?;
+        require!(self.name() == read_field.name(), Error::FieldNameMismatch);
        self.data_type().can_read_as(read_field.data_type())?;
        Ok(())
    }
}
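A short field-level sketch of the three rules above, in the style of the tests at the bottom of this file (crate-internal, since the trait is `pub(crate)`; intended as a `#[test]` body):

use crate::schema::schema_compare::{Error, SchemaComparison};
use crate::schema::{DataType, StructField};

let nullable = StructField::new("age", DataType::INTEGER, true);
let non_nullable = StructField::new("age", DataType::INTEGER, false);
let renamed = StructField::new("years", DataType::INTEGER, true);

// Widening nullability (non-nullable data read through a nullable field) is fine.
assert!(non_nullable.can_read_as(&nullable).is_ok());
// Tightening nullability or changing the field name is rejected.
assert!(matches!(
    nullable.can_read_as(&non_nullable),
    Err(Error::NullabilityTightening)
));
assert!(matches!(
    nullable.can_read_as(&renamed),
    Err(Error::FieldNameMismatch)
));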
-impl crate::schema::StructType {
-    /// Returns `Ok` if this [`StructType`] can be read as `read_type` in the read schema.
-    #[allow(unused)]
-    pub(crate) fn can_read_as(&self, read_type: &Self) -> DeltaResult<()> {
-        let field_map: HashMap<String, &crate::schema::StructField> = self
+impl SchemaComparison for StructType {
+    /// Returns `Ok` if this [`StructType`] can be read as `read_type`. This is the case when:
+    /// 1. The set of fields in this struct type is a subset of the fields in `read_type`.
+    /// 2. Each field in this struct can be read as the corresponding field in `read_type`. See
+    ///    [`StructField::can_read_as`].
+    /// 3. If a field in `read_type` is not present in this struct, then it must be nullable.
+    /// 4. Both [`StructType`]s must be valid schemas: no two fields of a struct may share a
+    ///    name that differs only by case. TODO: This check should be moved into the constructor
+    ///    for [`StructType`].
+    fn can_read_as(&self, read_type: &Self) -> SchemaComparisonResult {
+        let lowercase_field_map: HashMap<String, &StructField> = self
            .fields
            .iter()
            .map(|(name, field)| (name.to_lowercase(), field))
            .collect();
        require!(
-            field_map.len() == self.fields.len(),
-            Error::generic("Delta tables don't allow field names that only differ by case")
+            lowercase_field_map.len() == self.fields.len(),
+            Error::InvalidSchema
        );

-        let read_field_names: HashSet<String> =
+        let lowercase_read_field_names: HashSet<String> =
            read_type.fields.keys().map(|x| x.to_lowercase()).collect();
        require!(
-            read_field_names.len() == read_type.fields.len(),
-            Error::generic("Delta tables don't allow field names that only differ by case")
+            lowercase_read_field_names.len() == read_type.fields.len(),
+            Error::InvalidSchema
        );

        // Check that the field names are a subset of the read fields.
-        if !field_map.keys().all(|name| read_field_names.contains(name)) {
-            return Err(Error::generic(
-                "Struct has column that does not exist in the read schema",
-            ));
+        if !lowercase_field_map
+            .keys()
+            .all(|name| lowercase_read_field_names.contains(name))
+        {
+            return Err(Error::MissingColumn);
        }
        for read_field in read_type.fields() {
-            match field_map.get(&read_field.name().to_lowercase()) {
+            match lowercase_field_map.get(&read_field.name().to_lowercase()) {
                Some(existing_field) => existing_field.can_read_as(read_field)?,
                None => {
                    // Note: Delta Spark does not perform the following check, so it ignores
                    // fields that exist in the read schema but aren't in this schema.
-                    require!(
-                        read_field.is_nullable(),
-                        Error::generic(
-                            "read type has non-nullable column that does not exist in this struct",
-                        )
-                    );
+                    require!(read_field.is_nullable(), Error::NewNonNullableColumn);
                }
            }
        }
        Ok(())
    }
}
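In other words, a schema may only be read through a schema that keeps all of its columns and adds at most new nullable ones; the check is deliberately asymmetric. A sketch of that asymmetry, mirroring the `differ_by_nullable_column` test below (intended as a `#[test]` body in this crate):

use crate::schema::schema_compare::{Error, SchemaComparison};
use crate::schema::{DataType, StructField, StructType};

let existing = StructType::new([StructField::new("id", DataType::LONG, false)]);
let read = StructType::new([
    StructField::new("id", DataType::LONG, false),
    StructField::new("year", DataType::INTEGER, true), // new nullable column
]);

// Existing data can be read through the wider schema...
assert!(existing.can_read_as(&read).is_ok());
// ...but not the other way around: `existing` is missing the `year` column.
assert!(matches!(read.can_read_as(&existing), Err(Error::MissingColumn)));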

-impl crate::schema::DataType {
-    /// Returns `Ok` if this [`DataType`] can be read as `read_type` in the read schema.
-    fn can_read_as(&self, read_type: &Self) -> DeltaResult<()> {
+impl SchemaComparison for DataType {
+    /// Returns `Ok` if this [`DataType`] can be read as `read_type`. This is the case when:
+    /// 1. The data types are the same. Note: this condition will be relaxed to include
+    ///    compatible data types once type widening is supported. See issue [`#623`].
+    /// 2. For complex data types, the nested types must be compatible as defined by
+    ///    [`SchemaComparison`].
+    /// 3. For array and map data types, the nullability may not be tightened in `read_type`.
+    ///    See [`NullabilityFlag::can_read_as`].
+    ///
+    /// [`#623`]: <https://github.com/delta-io/delta-kernel-rs/issues/623>
+    fn can_read_as(&self, read_type: &Self) -> SchemaComparisonResult {
        match (self, read_type) {
            (Self::Array(self_array), Self::Array(read_array)) => {
                NullabilityFlag(self_array.contains_null())
-                    .can_read_as(NullabilityFlag(read_array.contains_null()))?;
+                    .can_read_as(&NullabilityFlag(read_array.contains_null()))?;
                self_array
                    .element_type()
                    .can_read_as(read_array.element_type())?;
@@ -122,17 +156,14 @@ impl crate::schema::DataType {
            }
            (Self::Map(self_map), Self::Map(read_map)) => {
                NullabilityFlag(self_map.value_contains_null())
-                    .can_read_as(NullabilityFlag(read_map.value_contains_null()))?;
+                    .can_read_as(&NullabilityFlag(read_map.value_contains_null()))?;
                self_map.key_type().can_read_as(read_map.key_type())?;
                self_map.value_type().can_read_as(read_map.value_type())?;
            }
            (a, b) => {
                // TODO: In the future, we will change this to support type widening.
-                // See: https://github.com/delta-io/delta-kernel-rs/issues/623
-                require!(
-                    a == b,
-                    Error::generic(format!("Types {} and {} are not compatible", a, b))
-                );
+                // See: #623
+                require!(a == b, Error::TypeMismatch);
            }
        };
        Ok(())
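At the leaves, the check currently demands exact type equality (until type widening lands). A small sketch of the primitive case, using only constants that appear in this file's tests (intended as a `#[test]` body in this crate):

use crate::schema::schema_compare::{Error, SchemaComparison};
use crate::schema::DataType;

// Identical primitive types are compatible; different ones are not (no widening yet).
assert!(DataType::STRING.can_read_as(&DataType::STRING).is_ok());
assert!(matches!(
    DataType::STRING.can_read_as(&DataType::INTEGER),
    Err(Error::TypeMismatch)
));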
@@ -141,6 +172,7 @@ impl crate::schema::DataType {

#[cfg(test)]
mod tests {
+    use crate::schema::schema_compare::{Error, SchemaComparison};
    use crate::schema::{ArrayType, DataType, MapType, StructField, StructType};

    #[test]
@@ -220,7 +252,10 @@ mod tests {
            false,
        )]);

-        assert!(existing_schema.can_read_as(&read_schema).is_err());
+        assert!(matches!(
+            existing_schema.can_read_as(&read_schema),
+            Err(Error::NullabilityTightening)
+        ));
    }
    #[test]
    fn different_field_name_case_fails() {
@@ -235,7 +270,10 @@ mod tests {
            StructField::new("name", DataType::STRING, false),
            StructField::new("age", DataType::INTEGER, true),
        ]);
-        assert!(existing_schema.can_read_as(&read_schema).is_err());
+        assert!(matches!(
+            existing_schema.can_read_as(&read_schema),
+            Err(Error::FieldNameMismatch)
+        ));
    }
    #[test]
    fn different_type_fails() {
@@ -249,7 +287,10 @@ mod tests {
            StructField::new("name", DataType::STRING, false),
            StructField::new("age", DataType::INTEGER, true),
        ]);
-        assert!(existing_schema.can_read_as(&read_schema).is_err());
+        assert!(matches!(
+            existing_schema.can_read_as(&read_schema),
+            Err(Error::TypeMismatch)
+        ));
    }
    #[test]
    fn set_nullable_to_true() {
@@ -277,40 +318,57 @@ mod tests {
            StructField::new("name", DataType::STRING, false),
            StructField::new("age", DataType::INTEGER, false),
        ]);
-        assert!(existing_schema.can_read_as(&read_schema).is_err());
+        assert!(matches!(
+            existing_schema.can_read_as(&read_schema),
+            Err(Error::NullabilityTightening)
+        ));
    }
    #[test]
-    fn new_nullable_column() {
-        let existing_schema = StructType::new([
+    fn differ_by_nullable_column() {
+        let a = StructType::new([
            StructField::new("id", DataType::LONG, false),
            StructField::new("name", DataType::STRING, false),
            StructField::new("age", DataType::INTEGER, true),
        ]);

-        let read_schema = StructType::new([
+        let b = StructType::new([
            StructField::new("id", DataType::LONG, false),
            StructField::new("name", DataType::STRING, false),
            StructField::new("age", DataType::INTEGER, true),
            StructField::new("location", DataType::STRING, true),
        ]);
-        assert!(existing_schema.can_read_as(&read_schema).is_ok());
+
+        // Read `a` as `b`. `b` adds a new nullable column. This is compatible with `a`'s schema.
+        assert!(a.can_read_as(&b).is_ok());
+
+        // Read `b` as `a`. `a` is missing a column that is present in `b`.
+        assert!(matches!(b.can_read_as(&a), Err(Error::MissingColumn)));
    }
    #[test]
-    fn new_non_nullable_column_fails() {
-        let existing_schema = StructType::new([
+    fn differ_by_non_nullable_column() {
+        let a = StructType::new([
            StructField::new("id", DataType::LONG, false),
            StructField::new("name", DataType::STRING, false),
            StructField::new("age", DataType::INTEGER, true),
        ]);

-        let read_schema = StructType::new([
+        let b = StructType::new([
            StructField::new("id", DataType::LONG, false),
            StructField::new("name", DataType::STRING, false),
            StructField::new("age", DataType::INTEGER, true),
            StructField::new("location", DataType::STRING, false),
        ]);
-        assert!(existing_schema.can_read_as(&read_schema).is_err());
+
+        // Read `a` as `b`. `b` has an extra non-nullable column.
+        assert!(matches!(
+            a.can_read_as(&b),
+            Err(Error::NewNonNullableColumn)
+        ));
+
+        // Read `b` as `a`. `a` is missing a column that is present in `b`.
+        assert!(matches!(b.can_read_as(&a), Err(Error::MissingColumn)));
    }
+
    #[test]
    fn duplicate_field_modulo_case() {
        let existing_schema = StructType::new([
@@ -326,6 +384,15 @@ mod tests {
            StructField::new("name", DataType::STRING, false),
            StructField::new("age", DataType::INTEGER, true),
        ]);
-        assert!(existing_schema.can_read_as(&read_schema).is_err());
+        assert!(matches!(
+            existing_schema.can_read_as(&read_schema),
+            Err(Error::InvalidSchema)
+        ));
+
+        // Checks in the inverse order
+        assert!(matches!(
+            read_schema.can_read_as(&existing_schema),
+            Err(Error::InvalidSchema)
+        ));
    }
}