@@ -7,15 +7,17 @@ use std::sync::{Arc, LazyLock};
7
7
use crate :: actions:: schemas:: GetStructField ;
8
8
use crate :: actions:: visitors:: { visit_deletion_vector_at, ProtocolVisitor } ;
9
9
use crate :: actions:: {
10
- get_log_add_schema, Add , Cdc , Metadata , Protocol , Remove , ADD_NAME , CDC_NAME , METADATA_NAME ,
11
- PROTOCOL_NAME , REMOVE_NAME ,
10
+ get_log_add_schema, Add , Cdc , Metadata , Protocol , Remove , ADD_NAME , CDC_NAME , COMMIT_INFO_NAME ,
11
+ METADATA_NAME , PROTOCOL_NAME , REMOVE_NAME ,
12
12
} ;
13
13
use crate :: engine_data:: { GetData , TypedGetData } ;
14
14
use crate :: expressions:: { column_name, ColumnName } ;
15
15
use crate :: path:: ParsedLogPath ;
16
16
use crate :: scan:: data_skipping:: DataSkippingFilter ;
17
17
use crate :: scan:: state:: DvInfo ;
18
- use crate :: schema:: { ArrayType , ColumnNamesAndTypes , DataType , MapType , SchemaRef , StructType } ;
18
+ use crate :: schema:: {
19
+ ArrayType , ColumnNamesAndTypes , DataType , MapType , SchemaRef , StructField , StructType ,
20
+ } ;
19
21
use crate :: table_changes:: scan_file:: { cdf_scan_row_expression, cdf_scan_row_schema} ;
20
22
use crate :: table_changes:: { check_cdf_table_properties, ensure_cdf_read_supported} ;
21
23
use crate :: table_properties:: TableProperties ;
@@ -78,6 +80,12 @@ pub(crate) fn table_changes_action_iter(
78
80
/// Deletion vector resolution affects whether a remove action is selected in the second
79
81
/// phase, so we must perform it ahead of time in phase 1.
80
82
/// - Ensure that reading is supported on any protocol updates.
83
+ /// - Extract the in-commit timestamps from [`CommitInfo`] actions if they are present. These are
84
+ /// generated when the in-commit timestamps (ICT) table feature is enabled. This must be done in the
85
+ /// first phase because the second phase lazily transforms engine data with an extra timestamp
86
+ /// column, so the timestamp must be known ahead of time. Note that when ICT is enabled, CommitInfo
87
+ /// should be the first action in every commit.
88
+ /// See: https://github.com/delta-io/delta/blob/master/PROTOCOL.md#in-commit-timestamps
81
89
/// - Ensure that Change Data Feed is enabled for any metadata update. See [`TableProperties`]
82
90
/// - Ensure that any schema update is compatible with the provided `schema`. Currently, schema
83
91
/// compatibility is checked through schema equality. This will be expanded in the future to
@@ -93,12 +101,6 @@ pub(crate) fn table_changes_action_iter(
93
101
///
94
102
/// See https://github.com/delta-io/delta/blob/master/PROTOCOL.md#deletion-vectors
95
103
///
96
- /// TODO: When the kernel supports in-commit timestamps, we will also have to inspect CommitInfo
97
- /// actions to find the timestamp. These are generated when incommit timestamps is enabled.
98
- /// This must be done in the first phase because the second phase lazily transforms engine data with
99
- /// an extra timestamp column. Thus, the timestamp must be known ahead of time.
100
- /// See https://github.com/delta-io/delta-kernel-rs/issues/559
101
- ///
102
104
/// 2. Scan file generation phase [`LogReplayScanner::into_scan_batches`]: This iterates over every
103
105
/// action in the commit, and generates [`TableChangesScanData`]. It does so by transforming the
104
106
/// actions using [`add_transform_expr`], and generating selection vectors with the following rules:
@@ -118,14 +120,8 @@ struct LogReplayScanner {
118
120
// The commit file that this replay scanner will operate on.
119
121
commit_file : ParsedLogPath ,
120
122
// The timestamp associated with this commit. This is the file modification time
121
- // from the commit's [`FileMeta`].
122
- //
123
- //
124
- // TODO when incommit timestamps are supported: If there is a [`CommitInfo`] with a timestamp
125
- // generated by in-commit timestamps, that timestamp will be used instead.
126
- //
127
- // Note: This will be used once an expression is introduced to transform the engine data in
128
- // [`TableChangesScanData`]
123
+ // from the commit's [`FileMeta`]. If the in-commit timestamps feature is enabled, this will be the
124
+ // in-commit timestamp from the [`CommitInfo`] action.
129
125
timestamp : i64 ,
130
126
}
131
127
@@ -136,15 +132,14 @@ impl LogReplayScanner {
136
132
/// 2. Construct a map from path to deletion vector of remove actions that share the same path
137
133
/// as an add action.
138
134
/// 3. Perform validation on each protocol and metadata action in the commit.
135
+ /// 4. Extract the in-commit timestamp from [`CommitInfo`] if it is present.
139
136
///
140
137
/// For more details, see the documentation for [`LogReplayScanner`].
141
138
fn try_new (
142
139
engine : & dyn Engine ,
143
140
commit_file : ParsedLogPath ,
144
141
table_schema : & SchemaRef ,
145
142
) -> DeltaResult < Self > {
146
- let visitor_schema = PreparePhaseVisitor :: schema ( ) ;
147
-
148
143
// Note: We do not perform data skipping yet because we need to visit all add and
149
144
// remove actions for deletion vector resolution to be correct.
150
145
//
@@ -156,22 +151,25 @@ impl LogReplayScanner {
156
151
// vectors are resolved so that we can skip both actions in the pair.
157
152
let action_iter = engine. get_json_handler ( ) . read_json_files (
158
153
& [ commit_file. location . clone ( ) ] ,
159
- visitor_schema ,
154
+ PreparePhaseVisitor :: schema ( ) ,
160
155
None , // not safe to apply data skipping yet
161
156
) ?;
162
157
163
158
let mut remove_dvs = HashMap :: default ( ) ;
164
159
let mut add_paths = HashSet :: default ( ) ;
165
160
let mut has_cdc_action = false ;
166
- for actions in action_iter {
161
+ let mut timestamp = commit_file. location . last_modified ;
162
+ for ( i, actions) in action_iter. enumerate ( ) {
167
163
let actions = actions?;
168
164
169
165
let mut visitor = PreparePhaseVisitor {
170
166
add_paths : & mut add_paths,
171
167
remove_dvs : & mut remove_dvs,
172
168
has_cdc_action : & mut has_cdc_action,
169
+ commit_timestamp : & mut timestamp,
173
170
protocol : None ,
174
171
metadata_info : None ,
172
+ is_first_batch : i == 0 ,
175
173
} ;
176
174
visitor. visit_rows_of ( actions. as_ref ( ) ) ?;
177
175
@@ -202,7 +200,7 @@ impl LogReplayScanner {
202
200
remove_dvs. retain ( |rm_path, _| add_paths. contains ( rm_path) ) ;
203
201
}
204
202
Ok ( LogReplayScanner {
205
- timestamp : commit_file . location . last_modified ,
203
+ timestamp,
206
204
commit_file,
207
205
has_cdc_action,
208
206
remove_dvs,
@@ -220,7 +218,6 @@ impl LogReplayScanner {
220
218
has_cdc_action,
221
219
remove_dvs,
222
220
commit_file,
223
- // TODO: Add the timestamp as a column with an expression
224
221
timestamp,
225
222
} = self ;
226
223
let remove_dvs = Arc :: new ( remove_dvs) ;
@@ -274,15 +271,19 @@ struct PreparePhaseVisitor<'a> {
274
271
has_cdc_action : & ' a mut bool ,
275
272
add_paths : & ' a mut HashSet < String > ,
276
273
remove_dvs : & ' a mut HashMap < String , DvInfo > ,
274
+ commit_timestamp : & ' a mut i64 ,
275
+ is_first_batch : bool ,
277
276
}
278
277
impl PreparePhaseVisitor < ' _ > {
279
278
fn schema ( ) -> Arc < StructType > {
279
+ let ict_type = StructField :: new ( "inCommitTimestamp" , DataType :: LONG , true ) ;
280
280
Arc :: new ( StructType :: new ( vec ! [
281
281
Option :: <Add >:: get_struct_field( ADD_NAME ) ,
282
282
Option :: <Remove >:: get_struct_field( REMOVE_NAME ) ,
283
283
Option :: <Cdc >:: get_struct_field( CDC_NAME ) ,
284
284
Option :: <Metadata >:: get_struct_field( METADATA_NAME ) ,
285
285
Option :: <Protocol >:: get_struct_field( PROTOCOL_NAME ) ,
286
+ StructField :: new( COMMIT_INFO_NAME , StructType :: new( [ ict_type] ) , true ) ,
286
287
] ) )
287
288
}
288
289
}
@@ -314,6 +315,7 @@ impl RowVisitor for PreparePhaseVisitor<'_> {
314
315
( INTEGER , column_name!( "protocol.minWriterVersion" ) ) ,
315
316
( string_list. clone( ) , column_name!( "protocol.readerFeatures" ) ) ,
316
317
( string_list, column_name!( "protocol.writerFeatures" ) ) ,
318
+ ( LONG , column_name!( "commitInfo.inCommitTimestamp" ) ) ,
317
319
] ;
318
320
let ( types, names) = types_and_names. into_iter ( ) . unzip ( ) ;
319
321
( names, types) . into ( )
@@ -323,7 +325,7 @@ impl RowVisitor for PreparePhaseVisitor<'_> {
323
325
324
326
fn visit < ' b > ( & mut self , row_count : usize , getters : & [ & ' b dyn GetData < ' b > ] ) -> DeltaResult < ( ) > {
325
327
require ! (
326
- getters. len( ) == 16 ,
328
+ getters. len( ) == 17 ,
327
329
Error :: InternalError ( format!(
328
330
"Wrong number of PreparePhaseVisitor getters: {}" ,
329
331
getters. len( )
@@ -354,6 +356,12 @@ impl RowVisitor for PreparePhaseVisitor<'_> {
354
356
let protocol =
355
357
ProtocolVisitor :: visit_protocol ( i, min_reader_version, & getters[ 12 ..=15 ] ) ?;
356
358
self . protocol = Some ( protocol) ;
359
+ } else if let Some ( in_commit_timestamp) =
360
+ getters[ 16 ] . get_long ( i, "commitInfo.inCommitTimestamp" ) ?
361
+ {
362
+ if self . is_first_batch && i == 0 {
363
+ * self . commit_timestamp = in_commit_timestamp;
364
+ }
357
365
}
358
366
}
359
367
Ok ( ( ) )
0 commit comments