From 182654f2e084ebde383b4253fdd12d5cc87bd385 Mon Sep 17 00:00:00 2001 From: Sam Ansmink Date: Thu, 24 Oct 2024 18:40:43 +0200 Subject: [PATCH 1/8] bump kernel to v0.4.0 --- CMakeLists.txt | 2 +- src/include/delta_kernel_ffi.hpp | 250 ++++++++++++++++--------------- 2 files changed, 127 insertions(+), 125 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index f4f9267..bac8c32 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -99,7 +99,7 @@ ExternalProject_Add( GIT_REPOSITORY "https://github.com/delta-incubator/delta-kernel-rs" # WARNING: the FFI headers are currently pinned due to the C linkage issue of the c++ headers. Currently, when bumping # the kernel version, the produced header in ./src/include/delta_kernel_ffi.hpp should be also bumped, applying the fix - GIT_TAG v0.3.1 + GIT_TAG v0.4.0 # Prints the env variables passed to the cargo build to the terminal, useful in debugging because passing them # through CMake is an error-prone mess CONFIGURE_COMMAND ${CMAKE_COMMAND} -E env ${RUST_UNSET_ENV_VARS} ${RUST_ENV_VARS} env diff --git a/src/include/delta_kernel_ffi.hpp b/src/include/delta_kernel_ffi.hpp index 58be58c..3c46bde 100644 --- a/src/include/delta_kernel_ffi.hpp +++ b/src/include/delta_kernel_ffi.hpp @@ -51,6 +51,7 @@ enum class KernelError { InvalidStructDataError, InternalError, InvalidExpression, + InvalidLogPath, }; struct CStringMap; @@ -107,10 +108,10 @@ struct KernelRowIndexArray { /// An accompanying [`HandleDescriptor`] trait defines the behavior of each handle type: /// /// * The true underlying ("target") type the handle represents. For safety reasons, target type -/// must always be [`Send`]. +/// must always be [`Send`]. /// /// * Mutable (`Box`-like) vs. shared (`Arc`-like). For safety reasons, the target type of a -/// shared handle must always be [`Send`]+[`Sync`]. +/// shared handle must always be [`Send`]+[`Sync`]. /// /// * Sized vs. unsized. Sized types allow handle operations to be implemented more efficiently. /// @@ -204,87 +205,10 @@ using NullableCvoid = void*; /// function is that `kernel_str` is _only_ valid until the return from this function using AllocateStringFn = NullableCvoid(*)(KernelStringSlice kernel_str); -/// The `EngineSchemaVisitor` defines a visitor system to allow engines to build their own -/// representation of a schema from a particular schema within kernel. -/// -/// The model is list based. When the kernel needs a list, it will ask engine to allocate one of a -/// particular size. Once allocated the engine returns an `id`, which can be any integer identifier -/// ([`usize`]) the engine wants, and will be passed back to the engine to identify the list in the -/// future. -/// -/// Every schema element the kernel visits belongs to some list of "sibling" elements. The schema -/// itself is a list of schema elements, and every complex type (struct, map, array) contains a list -/// of "child" elements. -/// 1. Before visiting schema or any complex type, the kernel asks the engine to allocate a list to -/// hold its children -/// 2. When visiting any schema element, the kernel passes its parent's "child list" as the -/// "sibling list" the element should be appended to: -/// - For the top-level schema, visit each top-level column, passing the column's name and type -/// - For a struct, first visit each struct field, passing the field's name, type, nullability, -/// and metadata -/// - For a map, visit the key and value, passing its special name ("map_key" or "map_value"), -/// type, and value nullability (keys are never nullable) -/// - For a list, visit the element, passing its special name ("array_element"), type, and -/// nullability -/// 3. When visiting a complex schema element, the kernel also passes the "child list" containing -/// that element's (already-visited) children. -/// 4. The [`visit_schema`] method returns the id of the list of top-level columns -struct EngineSchemaVisitor { - /// opaque state pointer - void *data; - /// Creates a new field list, optionally reserving capacity up front - uintptr_t (*make_field_list)(void *data, uintptr_t reserve); - /// Indicate that the schema contains a `Struct` type. The top level of a Schema is always a - /// `Struct`. The fields of the `Struct` are in the list identified by `child_list_id`. - void (*visit_struct)(void *data, - uintptr_t sibling_list_id, - KernelStringSlice name, - uintptr_t child_list_id); - /// Indicate that the schema contains an Array type. `child_list_id` will be a _one_ item list - /// with the array's element type - void (*visit_array)(void *data, - uintptr_t sibling_list_id, - KernelStringSlice name, - bool contains_null, - uintptr_t child_list_id); - /// Indicate that the schema contains an Map type. `child_list_id` will be a _two_ item list - /// where the first element is the map's key type and the second element is the - /// map's value type - void (*visit_map)(void *data, - uintptr_t sibling_list_id, - KernelStringSlice name, - bool value_contains_null, - uintptr_t child_list_id); - /// visit a `decimal` with the specified `precision` and `scale` - void (*visit_decimal)(void *data, - uintptr_t sibling_list_id, - KernelStringSlice name, - uint8_t precision, - uint8_t scale); - /// Visit a `string` belonging to the list identified by `sibling_list_id`. - void (*visit_string)(void *data, uintptr_t sibling_list_id, KernelStringSlice name); - /// Visit a `long` belonging to the list identified by `sibling_list_id`. - void (*visit_long)(void *data, uintptr_t sibling_list_id, KernelStringSlice name); - /// Visit an `integer` belonging to the list identified by `sibling_list_id`. - void (*visit_integer)(void *data, uintptr_t sibling_list_id, KernelStringSlice name); - /// Visit a `short` belonging to the list identified by `sibling_list_id`. - void (*visit_short)(void *data, uintptr_t sibling_list_id, KernelStringSlice name); - /// Visit a `byte` belonging to the list identified by `sibling_list_id`. - void (*visit_byte)(void *data, uintptr_t sibling_list_id, KernelStringSlice name); - /// Visit a `float` belonging to the list identified by `sibling_list_id`. - void (*visit_float)(void *data, uintptr_t sibling_list_id, KernelStringSlice name); - /// Visit a `double` belonging to the list identified by `sibling_list_id`. - void (*visit_double)(void *data, uintptr_t sibling_list_id, KernelStringSlice name); - /// Visit a `boolean` belonging to the list identified by `sibling_list_id`. - void (*visit_boolean)(void *data, uintptr_t sibling_list_id, KernelStringSlice name); - /// Visit `binary` belonging to the list identified by `sibling_list_id`. - void (*visit_binary)(void *data, uintptr_t sibling_list_id, KernelStringSlice name); - /// Visit a `date` belonging to the list identified by `sibling_list_id`. - void (*visit_date)(void *data, uintptr_t sibling_list_id, KernelStringSlice name); - /// Visit a `timestamp` belonging to the list identified by `sibling_list_id`. - void (*visit_timestamp)(void *data, uintptr_t sibling_list_id, KernelStringSlice name); - /// Visit a `timestamp` with no timezone belonging to the list identified by `sibling_list_id`. - void (*visit_timestamp_ntz)(void *data, uintptr_t sibling_list_id, KernelStringSlice name); +struct FileMeta { + KernelStringSlice path; + int64_t last_modified; + uintptr_t size; }; /// Model iterators. This allows an engine to specify iteration however it likes, and we simply wrap @@ -297,12 +221,6 @@ struct EngineIterator { const void *(*get_next)(void *data); }; -struct FileMeta { - KernelStringSlice path; - int64_t last_modified; - uintptr_t size; -}; - /// ABI-compatible struct for ArrowArray from C Data Interface /// See /// @@ -341,6 +259,7 @@ struct FFI_ArrowSchema { const char *format; const char *name; const char *metadata; + /// Refer to [Arrow Flags](https://arrow.apache.org/docs/format/CDataInterface.html#c.ArrowSchema.flags) int64_t flags; int64_t n_children; FFI_ArrowSchema **children; @@ -390,7 +309,7 @@ using CScanCallback = void(*)(NullableCvoid engine_context, const DvInfo *dv_info, const CStringMap *partition_map); - // This trickery is from https://github.com/mozilla/cbindgen/issues/402#issuecomment-578680163 +// This trickery is from https://github.com/mozilla/cbindgen/issues/402#issuecomment-578680163 struct im_an_unused_struct_that_tricks_msvc_into_compilation { ExternResult field; ExternResult field2; @@ -405,6 +324,89 @@ struct im_an_unused_struct_that_tricks_msvc_into_compilation { ExternResult field11; }; +/// The `EngineSchemaVisitor` defines a visitor system to allow engines to build their own +/// representation of a schema from a particular schema within kernel. +/// +/// The model is list based. When the kernel needs a list, it will ask engine to allocate one of a +/// particular size. Once allocated the engine returns an `id`, which can be any integer identifier +/// ([`usize`]) the engine wants, and will be passed back to the engine to identify the list in the +/// future. +/// +/// Every schema element the kernel visits belongs to some list of "sibling" elements. The schema +/// itself is a list of schema elements, and every complex type (struct, map, array) contains a list +/// of "child" elements. +/// 1. Before visiting schema or any complex type, the kernel asks the engine to allocate a list to +/// hold its children +/// 2. When visiting any schema element, the kernel passes its parent's "child list" as the +/// "sibling list" the element should be appended to: +/// - For the top-level schema, visit each top-level column, passing the column's name and type +/// - For a struct, first visit each struct field, passing the field's name, type, nullability, +/// and metadata +/// - For a map, visit the key and value, passing its special name ("map_key" or "map_value"), +/// type, and value nullability (keys are never nullable) +/// - For a list, visit the element, passing its special name ("array_element"), type, and +/// nullability +/// 3. When visiting a complex schema element, the kernel also passes the "child list" containing +/// that element's (already-visited) children. +/// 4. The [`visit_schema`] method returns the id of the list of top-level columns +struct EngineSchemaVisitor { + /// opaque state pointer + void *data; + /// Creates a new field list, optionally reserving capacity up front + uintptr_t (*make_field_list)(void *data, uintptr_t reserve); + /// Indicate that the schema contains a `Struct` type. The top level of a Schema is always a + /// `Struct`. The fields of the `Struct` are in the list identified by `child_list_id`. + void (*visit_struct)(void *data, + uintptr_t sibling_list_id, + KernelStringSlice name, + uintptr_t child_list_id); + /// Indicate that the schema contains an Array type. `child_list_id` will be a _one_ item list + /// with the array's element type + void (*visit_array)(void *data, + uintptr_t sibling_list_id, + KernelStringSlice name, + bool contains_null, + uintptr_t child_list_id); + /// Indicate that the schema contains an Map type. `child_list_id` will be a _two_ item list + /// where the first element is the map's key type and the second element is the + /// map's value type + void (*visit_map)(void *data, + uintptr_t sibling_list_id, + KernelStringSlice name, + bool value_contains_null, + uintptr_t child_list_id); + /// visit a `decimal` with the specified `precision` and `scale` + void (*visit_decimal)(void *data, + uintptr_t sibling_list_id, + KernelStringSlice name, + uint8_t precision, + uint8_t scale); + /// Visit a `string` belonging to the list identified by `sibling_list_id`. + void (*visit_string)(void *data, uintptr_t sibling_list_id, KernelStringSlice name); + /// Visit a `long` belonging to the list identified by `sibling_list_id`. + void (*visit_long)(void *data, uintptr_t sibling_list_id, KernelStringSlice name); + /// Visit an `integer` belonging to the list identified by `sibling_list_id`. + void (*visit_integer)(void *data, uintptr_t sibling_list_id, KernelStringSlice name); + /// Visit a `short` belonging to the list identified by `sibling_list_id`. + void (*visit_short)(void *data, uintptr_t sibling_list_id, KernelStringSlice name); + /// Visit a `byte` belonging to the list identified by `sibling_list_id`. + void (*visit_byte)(void *data, uintptr_t sibling_list_id, KernelStringSlice name); + /// Visit a `float` belonging to the list identified by `sibling_list_id`. + void (*visit_float)(void *data, uintptr_t sibling_list_id, KernelStringSlice name); + /// Visit a `double` belonging to the list identified by `sibling_list_id`. + void (*visit_double)(void *data, uintptr_t sibling_list_id, KernelStringSlice name); + /// Visit a `boolean` belonging to the list identified by `sibling_list_id`. + void (*visit_boolean)(void *data, uintptr_t sibling_list_id, KernelStringSlice name); + /// Visit `binary` belonging to the list identified by `sibling_list_id`. + void (*visit_binary)(void *data, uintptr_t sibling_list_id, KernelStringSlice name); + /// Visit a `date` belonging to the list identified by `sibling_list_id`. + void (*visit_date)(void *data, uintptr_t sibling_list_id, KernelStringSlice name); + /// Visit a `timestamp` belonging to the list identified by `sibling_list_id`. + void (*visit_timestamp)(void *data, uintptr_t sibling_list_id, KernelStringSlice name); + /// Visit a `timestamp` with no timezone belonging to the list identified by `sibling_list_id`. + void (*visit_timestamp_ntz)(void *data, uintptr_t sibling_list_id, KernelStringSlice name); +}; + extern "C" { /// # Safety @@ -516,15 +518,34 @@ bool string_slice_next(Handle data, /// Caller is responsible for (at most once) passing a valid pointer to a [`StringSliceIterator`] void free_string_slice_data(Handle data); -/// Visit the schema of the passed `SnapshotHandle`, using the provided `visitor`. See the -/// documentation of [`EngineSchemaVisitor`] for a description of how this visitor works. +/// Call the engine back with the next `EngingeData` batch read by Parquet/Json handler. The +/// _engine_ "owns" the data that is passed into the `engine_visitor`, since it is allocated by the +/// `Engine` being used for log-replay. If the engine wants the kernel to free this data, it _must_ +/// call [`free_engine_data`] on it. /// -/// This method returns the id of the list allocated to hold the top level schema columns. +/// # Safety /// +/// The iterator must be valid (returned by [`read_parquet_file`]) and not yet freed by +/// [`free_read_result_iter`]. The visitor function pointer must be non-null. +ExternResult read_result_next(Handle data, + NullableCvoid engine_context, + void (*engine_visitor)(NullableCvoid engine_context, + Handle engine_data)); + +/// Free the memory from the passed read result iterator /// # Safety /// -/// Caller is responsible for passing a valid snapshot handle and schema visitor. -uintptr_t visit_schema(Handle snapshot, EngineSchemaVisitor *visitor); +/// Caller is responsible for (at most once) passing a valid pointer returned by a call to +/// [`read_parquet_file`]. +void free_read_result_iter(Handle data); + +/// Use the specified engine's [`delta_kernel::ParquetHandler`] to read the specified file. +/// +/// # Safety +/// Caller is responsible for calling with a valid `ExternEngineHandle` and `FileMeta` +ExternResult> read_parquet_file(Handle engine, + const FileMeta *file, + Handle physical_schema); uintptr_t visit_expression_and(KernelExpressionVisitorState *state, EngineIterator *children); @@ -568,35 +589,6 @@ uintptr_t visit_expression_literal_double(KernelExpressionVisitorState *state, d uintptr_t visit_expression_literal_bool(KernelExpressionVisitorState *state, bool value); -/// Call the engine back with the next `EngingeData` batch read by Parquet/Json handler. The -/// _engine_ "owns" the data that is passed into the `engine_visitor`, since it is allocated by the -/// `Engine` being used for log-replay. If the engine wants the kernel to free this data, it _must_ -/// call [`free_engine_data`] on it. -/// -/// # Safety -/// -/// The iterator must be valid (returned by [`read_parquet_file`]) and not yet freed by -/// [`free_read_result_iter`]. The visitor function pointer must be non-null. -ExternResult read_result_next(Handle data, - NullableCvoid engine_context, - void (*engine_visitor)(NullableCvoid engine_context, - Handle engine_data)); - -/// Free the memory from the passed read result iterator -/// # Safety -/// -/// Caller is responsible for (at most once) passing a valid pointer returned by a call to -/// [`read_parquet_file`]. -void free_read_result_iter(Handle data); - -/// Use the specified engine's [`delta_kernel::ParquetHandler`] to read the specified file. -/// -/// # Safety -/// Caller is responsible for calling with a valid `ExternEngineHandle` and `FileMeta` -ExternResult> read_parquet_file(Handle engine, - const FileMeta *file, - Handle physical_schema); - /// Get the number of rows in an engine data /// /// # Safety @@ -737,6 +729,16 @@ void visit_scan_data(Handle data, NullableCvoid engine_context, CScanCallback callback); +/// Visit the schema of the passed `SnapshotHandle`, using the provided `visitor`. See the +/// documentation of [`EngineSchemaVisitor`] for a description of how this visitor works. +/// +/// This method returns the id of the list allocated to hold the top level schema columns. +/// +/// # Safety +/// +/// Caller is responsible for passing a valid snapshot handle and schema visitor. +uintptr_t visit_schema(Handle snapshot, EngineSchemaVisitor *visitor); + } // extern "C" } // namespace ffi From d6a83afb707739d8bb26688ed9411386496444b6 Mon Sep 17 00:00:00 2001 From: Sam Ansmink Date: Mon, 4 Nov 2024 16:30:59 +0100 Subject: [PATCH 2/8] bump to v1.1.3 --- .github/workflows/MainDistributionPipeline.yml | 10 +++++----- duckdb | 2 +- extension-ci-tools | 2 +- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/.github/workflows/MainDistributionPipeline.yml b/.github/workflows/MainDistributionPipeline.yml index 496db97..0fd14ac 100644 --- a/.github/workflows/MainDistributionPipeline.yml +++ b/.github/workflows/MainDistributionPipeline.yml @@ -14,10 +14,10 @@ concurrency: jobs: duckdb-stable-build: name: Build extension binaries - uses: duckdb/extension-ci-tools/.github/workflows/_extension_distribution.yml@v1.1.2 + uses: duckdb/extension-ci-tools/.github/workflows/_extension_distribution.yml@v1.1.3 with: - duckdb_version: v1.1.2 - ci_tools_version: v1.1.2 + duckdb_version: v1.1.3 + ci_tools_version: v1.1.3 extension_name: delta enable_rust: true exclude_archs: 'wasm_mvp;wasm_eh;wasm_threads;windows_amd64_rtools' @@ -25,10 +25,10 @@ jobs: duckdb-stable-deploy: name: Deploy extension binaries needs: duckdb-stable-build - uses: duckdb/extension-ci-tools/.github/workflows/_extension_deploy.yml@v1.1.2 + uses: duckdb/extension-ci-tools/.github/workflows/_extension_deploy.yml@v1.1.3 secrets: inherit with: extension_name: delta - duckdb_version: v1.1.2 + duckdb_version: v1.1.3 exclude_archs: 'wasm_mvp;wasm_eh;wasm_threads;windows_amd64_rtools' deploy_latest: ${{ startsWith(github.ref, 'refs/tags/v') || github.ref == 'refs/heads/main' }} diff --git a/duckdb b/duckdb index f680b7d..1986445 160000 --- a/duckdb +++ b/duckdb @@ -1 +1 @@ -Subproject commit f680b7d08f56183391b581077d4baf589e1cc8bd +Subproject commit 19864453f7d0ed095256d848b46e7b8630989bac diff --git a/extension-ci-tools b/extension-ci-tools index f5594c6..83f847f 160000 --- a/extension-ci-tools +++ b/extension-ci-tools @@ -1 +1 @@ -Subproject commit f5594c61803daee122a5245afb817966e1a4545c +Subproject commit 83f847f8467a760f6c66dc7996c13300210220a8 From cb1ff5124eb491d61b2076c8204ea2366ce525a9 Mon Sep 17 00:00:00 2001 From: Emil Sadek Date: Fri, 8 Nov 2024 00:10:16 +0100 Subject: [PATCH 3/8] Add: .clang-format symlink --- .clang-format | 1 + 1 file changed, 1 insertion(+) create mode 120000 .clang-format diff --git a/.clang-format b/.clang-format new file mode 120000 index 0000000..9a13bb6 --- /dev/null +++ b/.clang-format @@ -0,0 +1 @@ +duckdb/.clang-format \ No newline at end of file From dd40939b9d06677dc606bd21e9319db87a639b1c Mon Sep 17 00:00:00 2001 From: Emil Sadek Date: Fri, 8 Nov 2024 00:31:20 +0100 Subject: [PATCH 4/8] Fix: formatting --- src/delta_extension.cpp | 20 +- src/delta_functions.cpp | 9 +- src/delta_utils.cpp | 493 ++++---- src/functions/delta_scan.cpp | 1541 +++++++++++++------------- src/include/delta_functions.hpp | 4 +- src/include/delta_kernel_ffi.hpp | 385 +++---- src/include/delta_utils.hpp | 198 ++-- src/include/functions/delta_scan.hpp | 208 ++-- 8 files changed, 1443 insertions(+), 1415 deletions(-) diff --git a/src/delta_extension.cpp b/src/delta_extension.cpp index 1a316d9..9fc95d0 100644 --- a/src/delta_extension.cpp +++ b/src/delta_extension.cpp @@ -1,8 +1,8 @@ #define DUCKDB_EXTENSION_MAIN #include "delta_extension.hpp" -#include "delta_functions.hpp" +#include "delta_functions.hpp" #include "duckdb.hpp" #include "duckdb/common/exception.hpp" #include "duckdb/main/extension_util.hpp" @@ -10,18 +10,18 @@ namespace duckdb { static void LoadInternal(DatabaseInstance &instance) { - // Load functions - for (const auto &function : DeltaFunctions::GetTableFunctions(instance)) { - ExtensionUtil::RegisterFunction(instance, function); - } + // Load functions + for (const auto &function : DeltaFunctions::GetTableFunctions(instance)) { + ExtensionUtil::RegisterFunction(instance, function); + } } void DeltaExtension::Load(DuckDB &db) { - LoadInternal(*db.instance); + LoadInternal(*db.instance); } std::string DeltaExtension::Name() { - return "delta"; + return "delta"; } } // namespace duckdb @@ -29,12 +29,12 @@ std::string DeltaExtension::Name() { extern "C" { DUCKDB_EXTENSION_API void delta_init(duckdb::DatabaseInstance &db) { - duckdb::DuckDB db_wrapper(db); - db_wrapper.LoadExtension(); + duckdb::DuckDB db_wrapper(db); + db_wrapper.LoadExtension(); } DUCKDB_EXTENSION_API const char *delta_version() { - return duckdb::DuckDB::LibraryVersion(); + return duckdb::DuckDB::LibraryVersion(); } } diff --git a/src/delta_functions.cpp b/src/delta_functions.cpp index f10602a..e79894b 100644 --- a/src/delta_functions.cpp +++ b/src/delta_functions.cpp @@ -2,16 +2,17 @@ #include "duckdb.hpp" #include "duckdb/main/extension_util.hpp" + #include namespace duckdb { vector DeltaFunctions::GetTableFunctions(DatabaseInstance &instance) { - vector functions; + vector functions; - functions.push_back(GetDeltaScanFunction(instance)); + functions.push_back(GetDeltaScanFunction(instance)); - return functions; + return functions; } -}; +}; // namespace duckdb diff --git a/src/delta_utils.cpp b/src/delta_utils.cpp index 32db255..035d300 100644 --- a/src/delta_utils.cpp +++ b/src/delta_utils.cpp @@ -2,312 +2,323 @@ #include "duckdb.hpp" #include "duckdb/main/extension_util.hpp" + #include #include namespace duckdb { -unique_ptr SchemaVisitor::VisitSnapshotSchema(ffi::SharedSnapshot* snapshot) { - SchemaVisitor state; - ffi::EngineSchemaVisitor visitor; - - visitor.data = &state; - visitor.make_field_list = (uintptr_t (*)(void*, uintptr_t)) &MakeFieldList; - visitor.visit_struct = (void (*)(void*, uintptr_t, ffi::KernelStringSlice, uintptr_t)) &VisitStruct; - visitor.visit_array = (void (*)(void*, uintptr_t, ffi::KernelStringSlice, bool, uintptr_t)) &VisitArray; - visitor.visit_map = (void (*)(void*, uintptr_t, ffi::KernelStringSlice, bool, uintptr_t)) &VisitMap; - visitor.visit_decimal = (void (*)(void*, uintptr_t, ffi::KernelStringSlice, uint8_t , uint8_t)) &VisitDecimal; - visitor.visit_string = VisitSimpleType(); - visitor.visit_long = VisitSimpleType(); - visitor.visit_integer = VisitSimpleType(); - visitor.visit_short = VisitSimpleType(); - visitor.visit_byte = VisitSimpleType(); - visitor.visit_float = VisitSimpleType(); - visitor.visit_double = VisitSimpleType(); - visitor.visit_boolean = VisitSimpleType(); - visitor.visit_binary = VisitSimpleType(); - visitor.visit_date = VisitSimpleType(); - visitor.visit_timestamp = VisitSimpleType(); - visitor.visit_timestamp_ntz = VisitSimpleType(); - - uintptr_t result = visit_schema(snapshot, &visitor); - return state.TakeFieldList(result); +unique_ptr SchemaVisitor::VisitSnapshotSchema(ffi::SharedSnapshot *snapshot) { + SchemaVisitor state; + ffi::EngineSchemaVisitor visitor; + + visitor.data = &state; + visitor.make_field_list = (uintptr_t (*)(void *, uintptr_t))&MakeFieldList; + visitor.visit_struct = (void (*)(void *, uintptr_t, ffi::KernelStringSlice, uintptr_t))&VisitStruct; + visitor.visit_array = (void (*)(void *, uintptr_t, ffi::KernelStringSlice, bool, uintptr_t))&VisitArray; + visitor.visit_map = (void (*)(void *, uintptr_t, ffi::KernelStringSlice, bool, uintptr_t))&VisitMap; + visitor.visit_decimal = (void (*)(void *, uintptr_t, ffi::KernelStringSlice, uint8_t, uint8_t))&VisitDecimal; + visitor.visit_string = VisitSimpleType(); + visitor.visit_long = VisitSimpleType(); + visitor.visit_integer = VisitSimpleType(); + visitor.visit_short = VisitSimpleType(); + visitor.visit_byte = VisitSimpleType(); + visitor.visit_float = VisitSimpleType(); + visitor.visit_double = VisitSimpleType(); + visitor.visit_boolean = VisitSimpleType(); + visitor.visit_binary = VisitSimpleType(); + visitor.visit_date = VisitSimpleType(); + visitor.visit_timestamp = VisitSimpleType(); + visitor.visit_timestamp_ntz = VisitSimpleType(); + + uintptr_t result = visit_schema(snapshot, &visitor); + return state.TakeFieldList(result); } -void SchemaVisitor::VisitDecimal(SchemaVisitor* state, uintptr_t sibling_list_id, ffi::KernelStringSlice name, uint8_t precision, uint8_t scale) { - state->AppendToList(sibling_list_id, name, LogicalType::DECIMAL(precision, scale)); +void SchemaVisitor::VisitDecimal(SchemaVisitor *state, uintptr_t sibling_list_id, ffi::KernelStringSlice name, + uint8_t precision, uint8_t scale) { + state->AppendToList(sibling_list_id, name, LogicalType::DECIMAL(precision, scale)); } -uintptr_t SchemaVisitor::MakeFieldList(SchemaVisitor* state, uintptr_t capacity_hint) { - return state->MakeFieldListImpl(capacity_hint); +uintptr_t SchemaVisitor::MakeFieldList(SchemaVisitor *state, uintptr_t capacity_hint) { + return state->MakeFieldListImpl(capacity_hint); } -void SchemaVisitor::VisitStruct(SchemaVisitor* state, uintptr_t sibling_list_id, ffi::KernelStringSlice name, uintptr_t child_list_id) { - auto children = state->TakeFieldList(child_list_id); - state->AppendToList(sibling_list_id, name, LogicalType::STRUCT(std::move(*children))); +void SchemaVisitor::VisitStruct(SchemaVisitor *state, uintptr_t sibling_list_id, ffi::KernelStringSlice name, + uintptr_t child_list_id) { + auto children = state->TakeFieldList(child_list_id); + state->AppendToList(sibling_list_id, name, LogicalType::STRUCT(std::move(*children))); } -void SchemaVisitor::VisitArray(SchemaVisitor* state, uintptr_t sibling_list_id, ffi::KernelStringSlice name, bool contains_null, uintptr_t child_list_id) { - auto children = state->TakeFieldList(child_list_id); +void SchemaVisitor::VisitArray(SchemaVisitor *state, uintptr_t sibling_list_id, ffi::KernelStringSlice name, + bool contains_null, uintptr_t child_list_id) { + auto children = state->TakeFieldList(child_list_id); - D_ASSERT(children->size() == 1); - state->AppendToList(sibling_list_id, name, LogicalType::LIST(children->front().second)); + D_ASSERT(children->size() == 1); + state->AppendToList(sibling_list_id, name, LogicalType::LIST(children->front().second)); } -void SchemaVisitor::VisitMap(SchemaVisitor* state, uintptr_t sibling_list_id, ffi::KernelStringSlice name, bool contains_null, uintptr_t child_list_id) { - auto children = state->TakeFieldList(child_list_id); +void SchemaVisitor::VisitMap(SchemaVisitor *state, uintptr_t sibling_list_id, ffi::KernelStringSlice name, + bool contains_null, uintptr_t child_list_id) { + auto children = state->TakeFieldList(child_list_id); - D_ASSERT(children->size() == 2); - state->AppendToList(sibling_list_id, name, LogicalType::MAP(LogicalType::STRUCT(std::move(*children)))); + D_ASSERT(children->size() == 2); + state->AppendToList(sibling_list_id, name, LogicalType::MAP(LogicalType::STRUCT(std::move(*children)))); } uintptr_t SchemaVisitor::MakeFieldListImpl(uintptr_t capacity_hint) { - uintptr_t id = next_id++; - auto list = make_uniq(); - if (capacity_hint > 0) { - list->reserve(capacity_hint); - } - inflight_lists.emplace(id, std::move(list)); - return id; + uintptr_t id = next_id++; + auto list = make_uniq(); + if (capacity_hint > 0) { + list->reserve(capacity_hint); + } + inflight_lists.emplace(id, std::move(list)); + return id; } -void SchemaVisitor::AppendToList(uintptr_t id, ffi::KernelStringSlice name, LogicalType&& child) { - auto it = inflight_lists.find(id); - if (it == inflight_lists.end()) { - // TODO... some error... - throw InternalException("WEIRD SHIT"); - } else { - it->second->emplace_back(std::make_pair(string(name.ptr, name.len), std::move(child))); - } +void SchemaVisitor::AppendToList(uintptr_t id, ffi::KernelStringSlice name, LogicalType &&child) { + auto it = inflight_lists.find(id); + if (it == inflight_lists.end()) { + // TODO... some error... + throw InternalException("WEIRD SHIT"); + } else { + it->second->emplace_back(std::make_pair(string(name.ptr, name.len), std::move(child))); + } } unique_ptr SchemaVisitor::TakeFieldList(uintptr_t id) { - auto it = inflight_lists.find(id); - if (it == inflight_lists.end()) { - // TODO: Raise some kind of error. - throw InternalException("WEIRD SHIT 2"); - } - auto rval = std::move(it->second); - inflight_lists.erase(it); - return rval; + auto it = inflight_lists.find(id); + if (it == inflight_lists.end()) { + // TODO: Raise some kind of error. + throw InternalException("WEIRD SHIT 2"); + } + auto rval = std::move(it->second); + inflight_lists.erase(it); + return rval; } - -ffi::EngineError* DuckDBEngineError::AllocateError(ffi::KernelError etype, ffi::KernelStringSlice msg) { - auto error = new DuckDBEngineError; - error->etype = etype; - error->error_message = string(msg.ptr, msg.len); - return error; +ffi::EngineError *DuckDBEngineError::AllocateError(ffi::KernelError etype, ffi::KernelStringSlice msg) { + auto error = new DuckDBEngineError; + error->etype = etype; + error->error_message = string(msg.ptr, msg.len); + return error; } string DuckDBEngineError::KernelErrorEnumToString(ffi::KernelError err) { - const char* KERNEL_ERROR_ENUM_STRINGS[] = { - "UnknownError", - "FFIError", - "ArrowError", - "EngineDataTypeError", - "ExtractError", - "GenericError", - "IOErrorError", - "ParquetError", - "ObjectStoreError", - "ObjectStorePathError", - "Reqwest", - "FileNotFoundError", - "MissingColumnError", - "UnexpectedColumnTypeError", - "MissingDataError", - "MissingVersionError", - "DeletionVectorError", - "InvalidUrlError", - "MalformedJsonError", - "MissingMetadataError", - "MissingProtocolError", - "MissingMetadataAndProtocolError", - "ParseError", - "JoinFailureError", - "Utf8Error", - "ParseIntError", - "InvalidColumnMappingMode", - "InvalidTableLocation", - "InvalidDecimalError", - }; - - static_assert(sizeof(KERNEL_ERROR_ENUM_STRINGS)/sizeof(char*)-1 == (int)ffi::KernelError::InvalidDecimalError, - "KernelErrorEnumStrings mismatched with kernel"); - - if ((int)err < sizeof(KERNEL_ERROR_ENUM_STRINGS)/sizeof(char*)) { - return KERNEL_ERROR_ENUM_STRINGS[(int)err]; - } - - return StringUtil::Format("EnumOutOfRange (enum val out of range: %d)", (int)err); + const char *KERNEL_ERROR_ENUM_STRINGS[] = { + "UnknownError", + "FFIError", + "ArrowError", + "EngineDataTypeError", + "ExtractError", + "GenericError", + "IOErrorError", + "ParquetError", + "ObjectStoreError", + "ObjectStorePathError", + "Reqwest", + "FileNotFoundError", + "MissingColumnError", + "UnexpectedColumnTypeError", + "MissingDataError", + "MissingVersionError", + "DeletionVectorError", + "InvalidUrlError", + "MalformedJsonError", + "MissingMetadataError", + "MissingProtocolError", + "MissingMetadataAndProtocolError", + "ParseError", + "JoinFailureError", + "Utf8Error", + "ParseIntError", + "InvalidColumnMappingMode", + "InvalidTableLocation", + "InvalidDecimalError", + }; + + static_assert(sizeof(KERNEL_ERROR_ENUM_STRINGS) / sizeof(char *) - 1 == (int)ffi::KernelError::InvalidDecimalError, + "KernelErrorEnumStrings mismatched with kernel"); + + if ((int)err < sizeof(KERNEL_ERROR_ENUM_STRINGS) / sizeof(char *)) { + return KERNEL_ERROR_ENUM_STRINGS[(int)err]; + } + + return StringUtil::Format("EnumOutOfRange (enum val out of range: %d)", (int)err); } void DuckDBEngineError::Throw(string from_where) { - // Make copies before calling delete this - auto etype_copy = etype; - auto message_copy = error_message; - - // Consume error by calling delete this (remember this error is created by kernel using AllocateError) - delete this; - throw IOException("Hit DeltaKernel FFI error (from: %s): Hit error: %u (%s) with message (%s)", - from_where.c_str(), etype_copy, KernelErrorEnumToString(etype_copy), message_copy); + // Make copies before calling delete this + auto etype_copy = etype; + auto message_copy = error_message; + + // Consume error by calling delete this (remember this error is created by + // kernel using AllocateError) + delete this; + throw IOException("Hit DeltaKernel FFI error (from: %s): Hit error: %u (%s) " + "with message (%s)", + from_where.c_str(), etype_copy, KernelErrorEnumToString(etype_copy), message_copy); } - - ffi::KernelStringSlice KernelUtils::ToDeltaString(const string &str) { - return {str.data(), str.size()}; + return {str.data(), str.size()}; } string KernelUtils::FromDeltaString(const struct ffi::KernelStringSlice slice) { - return {slice.ptr, slice.len}; + return {slice.ptr, slice.len}; } vector KernelUtils::FromDeltaBoolSlice(const struct ffi::KernelBoolSlice slice) { - vector result; - result.assign(slice.ptr, slice.ptr + slice.len); - return result; + vector result; + result.assign(slice.ptr, slice.ptr + slice.len); + return result; } PredicateVisitor::PredicateVisitor(const vector &column_names, optional_ptr filters) { - predicate = this; - visitor = (uintptr_t (*)(void*, ffi::KernelExpressionVisitorState*)) &VisitPredicate; - - if (filters) { - for (auto& filter : filters->filters) { - column_filters[column_names[filter.first]] = filter.second.get(); - } - } + predicate = this; + visitor = (uintptr_t (*)(void *, ffi::KernelExpressionVisitorState *))&VisitPredicate; + + if (filters) { + for (auto &filter : filters->filters) { + column_filters[column_names[filter.first]] = filter.second.get(); + } + } } -// Template wrapper function that implements get_next for EngineIteratorFromCallable. +// Template wrapper function that implements get_next for +// EngineIteratorFromCallable. template -static auto GetNextFromCallable(Callable* callable) -> decltype(std::declval()()) { - return callable->operator()(); +static auto GetNextFromCallable(Callable *callable) -> decltype(std::declval()()) { + return callable->operator()(); } // Wraps a callable object (e.g. C++11 lambda) as an EngineIterator. template -ffi::EngineIterator EngineIteratorFromCallable(Callable& callable) { - auto* get_next = &GetNextFromCallable; - return {&callable, (const void *(*)(void*)) get_next}; +ffi::EngineIterator EngineIteratorFromCallable(Callable &callable) { + auto *get_next = &GetNextFromCallable; + return {&callable, (const void *(*)(void *))get_next}; }; -uintptr_t PredicateVisitor::VisitPredicate(PredicateVisitor* predicate, ffi::KernelExpressionVisitorState* state) { - auto &filters = predicate->column_filters; - - auto it = filters.begin(); - auto end = filters.end(); - auto get_next = [predicate, state, &it, &end]() -> uintptr_t { - if (it == end) { - return 0; - } - auto &filter = *it++; - return predicate->VisitFilter(filter.first, *filter.second, state); - }; - auto eit = EngineIteratorFromCallable(get_next); - - return visit_expression_and(state, &eit); +uintptr_t PredicateVisitor::VisitPredicate(PredicateVisitor *predicate, ffi::KernelExpressionVisitorState *state) { + auto &filters = predicate->column_filters; + + auto it = filters.begin(); + auto end = filters.end(); + auto get_next = [predicate, state, &it, &end]() -> uintptr_t { + if (it == end) { + return 0; + } + auto &filter = *it++; + return predicate->VisitFilter(filter.first, *filter.second, state); + }; + auto eit = EngineIteratorFromCallable(get_next); + + return visit_expression_and(state, &eit); } -uintptr_t PredicateVisitor::VisitConstantFilter(const string &col_name, const ConstantFilter &filter, ffi::KernelExpressionVisitorState* state) { - auto maybe_left = ffi::visit_expression_column(state, KernelUtils::ToDeltaString(col_name), DuckDBEngineError::AllocateError); - uintptr_t left = KernelUtils::UnpackResult(maybe_left, "VisitConstantFilter failed to visit_expression_column"); - - uintptr_t right = ~0; - auto &value = filter.constant; - switch (value.type().id()) { - case LogicalType::BIGINT: - right = visit_expression_literal_long(state, BigIntValue::Get(value)); - break; - case LogicalType::INTEGER: - right = visit_expression_literal_int(state, IntegerValue::Get(value)); - break; - case LogicalType::SMALLINT: - right = visit_expression_literal_short(state, SmallIntValue::Get(value)); - break; - case LogicalType::TINYINT: - right = visit_expression_literal_byte(state, TinyIntValue::Get(value)); - break; - case LogicalType::FLOAT: - right = visit_expression_literal_float(state, FloatValue::Get(value)); - break; - case LogicalType::DOUBLE: - right = visit_expression_literal_double(state, DoubleValue::Get(value)); - break; - case LogicalType::BOOLEAN: - right = visit_expression_literal_bool(state, BooleanValue::Get(value)); - break; - case LogicalType::VARCHAR: { - // WARNING: C++ lifetime extension rules don't protect calls of the form foo(std::string(...).c_str()) - auto str = StringValue::Get(value); - auto maybe_right = ffi::visit_expression_literal_string(state, KernelUtils::ToDeltaString(str), DuckDBEngineError::AllocateError); - right = KernelUtils::UnpackResult(maybe_right, "VisitConstantFilter failed to visit_expression_literal_string"); - break; - } - default: - break; // unsupported type - } - - // TODO support other comparison types? - switch (filter.comparison_type) { - case ExpressionType::COMPARE_LESSTHAN: - return visit_expression_lt(state, left, right); - case ExpressionType::COMPARE_LESSTHANOREQUALTO: - return visit_expression_le(state, left, right); - case ExpressionType::COMPARE_GREATERTHAN: - return visit_expression_gt(state, left, right); - case ExpressionType::COMPARE_GREATERTHANOREQUALTO: - return visit_expression_ge(state, left, right); - case ExpressionType::COMPARE_EQUAL: - return visit_expression_eq(state, left, right); - - default: - std::cout << " Unsupported operation: " << (int) filter.comparison_type << std::endl; - return ~0; // Unsupported operation - } +uintptr_t PredicateVisitor::VisitConstantFilter(const string &col_name, const ConstantFilter &filter, + ffi::KernelExpressionVisitorState *state) { + auto maybe_left = + ffi::visit_expression_column(state, KernelUtils::ToDeltaString(col_name), DuckDBEngineError::AllocateError); + uintptr_t left = KernelUtils::UnpackResult(maybe_left, "VisitConstantFilter failed to visit_expression_column"); + + uintptr_t right = ~0; + auto &value = filter.constant; + switch (value.type().id()) { + case LogicalType::BIGINT: + right = visit_expression_literal_long(state, BigIntValue::Get(value)); + break; + case LogicalType::INTEGER: + right = visit_expression_literal_int(state, IntegerValue::Get(value)); + break; + case LogicalType::SMALLINT: + right = visit_expression_literal_short(state, SmallIntValue::Get(value)); + break; + case LogicalType::TINYINT: + right = visit_expression_literal_byte(state, TinyIntValue::Get(value)); + break; + case LogicalType::FLOAT: + right = visit_expression_literal_float(state, FloatValue::Get(value)); + break; + case LogicalType::DOUBLE: + right = visit_expression_literal_double(state, DoubleValue::Get(value)); + break; + case LogicalType::BOOLEAN: + right = visit_expression_literal_bool(state, BooleanValue::Get(value)); + break; + case LogicalType::VARCHAR: { + // WARNING: C++ lifetime extension rules don't protect calls of the form + // foo(std::string(...).c_str()) + auto str = StringValue::Get(value); + auto maybe_right = ffi::visit_expression_literal_string(state, KernelUtils::ToDeltaString(str), + DuckDBEngineError::AllocateError); + right = KernelUtils::UnpackResult(maybe_right, "VisitConstantFilter failed to visit_expression_literal_string"); + break; + } + default: + break; // unsupported type + } + + // TODO support other comparison types? + switch (filter.comparison_type) { + case ExpressionType::COMPARE_LESSTHAN: + return visit_expression_lt(state, left, right); + case ExpressionType::COMPARE_LESSTHANOREQUALTO: + return visit_expression_le(state, left, right); + case ExpressionType::COMPARE_GREATERTHAN: + return visit_expression_gt(state, left, right); + case ExpressionType::COMPARE_GREATERTHANOREQUALTO: + return visit_expression_ge(state, left, right); + case ExpressionType::COMPARE_EQUAL: + return visit_expression_eq(state, left, right); + + default: + std::cout << " Unsupported operation: " << (int)filter.comparison_type << std::endl; + return ~0; // Unsupported operation + } } - -uintptr_t PredicateVisitor::VisitAndFilter(const string &col_name, const ConjunctionAndFilter &filter, ffi::KernelExpressionVisitorState* state) { - auto it = filter.child_filters.begin(); - auto end = filter.child_filters.end(); - auto get_next = [this, col_name, state, &it, &end]() -> uintptr_t { - if (it == end) { - return 0; - } - auto &child_filter = *it++; - - return VisitFilter(col_name, *child_filter, state); - }; - auto eit = EngineIteratorFromCallable(get_next); - return visit_expression_and(state, &eit); +uintptr_t PredicateVisitor::VisitAndFilter(const string &col_name, const ConjunctionAndFilter &filter, + ffi::KernelExpressionVisitorState *state) { + auto it = filter.child_filters.begin(); + auto end = filter.child_filters.end(); + auto get_next = [this, col_name, state, &it, &end]() -> uintptr_t { + if (it == end) { + return 0; + } + auto &child_filter = *it++; + + return VisitFilter(col_name, *child_filter, state); + }; + auto eit = EngineIteratorFromCallable(get_next); + return visit_expression_and(state, &eit); } uintptr_t PredicateVisitor::VisitIsNull(const string &col_name, ffi::KernelExpressionVisitorState *state) { - auto maybe_inner = ffi::visit_expression_column(state, KernelUtils::ToDeltaString(col_name), DuckDBEngineError::AllocateError); - uintptr_t inner = KernelUtils::UnpackResult(maybe_inner, "VisitIsNull failed to visit_expression_column"); - return ffi::visit_expression_is_null(state, inner); + auto maybe_inner = + ffi::visit_expression_column(state, KernelUtils::ToDeltaString(col_name), DuckDBEngineError::AllocateError); + uintptr_t inner = KernelUtils::UnpackResult(maybe_inner, "VisitIsNull failed to visit_expression_column"); + return ffi::visit_expression_is_null(state, inner); } uintptr_t PredicateVisitor::VisitIsNotNull(const string &col_name, ffi::KernelExpressionVisitorState *state) { - return ffi::visit_expression_not(state, VisitIsNull(col_name, state)); + return ffi::visit_expression_not(state, VisitIsNull(col_name, state)); } -uintptr_t PredicateVisitor::VisitFilter(const string &col_name, const TableFilter &filter, ffi::KernelExpressionVisitorState* state) { - switch (filter.filter_type) { - case TableFilterType::CONSTANT_COMPARISON: - return VisitConstantFilter(col_name, static_cast(filter), state); - case TableFilterType::CONJUNCTION_AND: - return VisitAndFilter(col_name, static_cast(filter), state); - case TableFilterType::IS_NULL: - return VisitIsNull(col_name, state); - case TableFilterType::IS_NOT_NULL: - return VisitIsNotNull(col_name, state); - default: - return ~0; - } +uintptr_t PredicateVisitor::VisitFilter(const string &col_name, const TableFilter &filter, + ffi::KernelExpressionVisitorState *state) { + switch (filter.filter_type) { + case TableFilterType::CONSTANT_COMPARISON: + return VisitConstantFilter(col_name, static_cast(filter), state); + case TableFilterType::CONJUNCTION_AND: + return VisitAndFilter(col_name, static_cast(filter), state); + case TableFilterType::IS_NULL: + return VisitIsNull(col_name, state); + case TableFilterType::IS_NOT_NULL: + return VisitIsNotNull(col_name, state); + default: + return ~0; + } } -}; +}; // namespace duckdb diff --git a/src/functions/delta_scan.cpp b/src/functions/delta_scan.cpp index 65eb34f..3bf4105 100644 --- a/src/functions/delta_scan.cpp +++ b/src/functions/delta_scan.cpp @@ -1,745 +1,774 @@ -#include "duckdb/function/table_function.hpp" +#include "functions/delta_scan.hpp" #include "delta_functions.hpp" -#include "functions/delta_scan.hpp" -#include "duckdb/optimizer/filter_combiner.hpp" -#include "duckdb/planner/operator/logical_get.hpp" -#include "duckdb/main/extension_util.hpp" #include "duckdb/catalog/catalog_entry/table_function_catalog_entry.hpp" #include "duckdb/common/local_file_system.hpp" #include "duckdb/common/types/data_chunk.hpp" +#include "duckdb/execution/expression_executor.hpp" +#include "duckdb/function/table_function.hpp" +#include "duckdb/main/extension_util.hpp" +#include "duckdb/main/secret/secret_manager.hpp" +#include "duckdb/optimizer/filter_combiner.hpp" #include "duckdb/parser/expression/constant_expression.hpp" #include "duckdb/parser/expression/function_expression.hpp" #include "duckdb/parser/parsed_expression.hpp" -#include "duckdb/execution/expression_executor.hpp" #include "duckdb/planner/binder.hpp" -#include "duckdb/main/secret/secret_manager.hpp" - +#include "duckdb/planner/operator/logical_get.hpp" -#include +#include #include #include -#include +#include namespace duckdb { -static void* allocate_string(const struct ffi::KernelStringSlice slice) { - return new string(slice.ptr, slice.len); +static void *allocate_string(const struct ffi::KernelStringSlice slice) { + return new string(slice.ptr, slice.len); } string url_decode(string input) { - string result; - result.reserve(input.size()); - char ch; - replace(input.begin(), input.end(), '+', ' '); - for (idx_t i = 0; i < input.length(); i++) { - if (int(input[i]) == 37) { - unsigned int ii; - sscanf(input.substr(i + 1, 2).c_str(), "%x", &ii); - ch = static_cast(ii); - result += ch; - i += 2; - } else { - result += input[i]; - } - } - return result; + string result; + result.reserve(input.size()); + char ch; + replace(input.begin(), input.end(), '+', ' '); + for (idx_t i = 0; i < input.length(); i++) { + if (int(input[i]) == 37) { + unsigned int ii; + sscanf(input.substr(i + 1, 2).c_str(), "%x", &ii); + ch = static_cast(ii); + result += ch; + i += 2; + } else { + result += input[i]; + } + } + return result; } -static void visit_callback(ffi::NullableCvoid engine_context, struct ffi::KernelStringSlice path, int64_t size, const ffi::Stats * stats, const ffi::DvInfo *dv_info, const struct ffi::CStringMap *partition_values) { - auto context = (DeltaSnapshot *) engine_context; - auto path_string = context->GetPath(); - StringUtil::RTrim(path_string, "/"); - path_string += "/" + KernelUtils::FromDeltaString(path); - - path_string = url_decode(path_string); - - // First we append the file to our resolved files - context->resolved_files.push_back(DeltaSnapshot::ToDuckDBPath(path_string)); - context->metadata.emplace_back(make_uniq()); - - D_ASSERT(context->resolved_files.size() == context->metadata.size()); - - // Initialize the file metadata - context->metadata.back()->delta_snapshot_version = context->version; - context->metadata.back()->file_number = context->resolved_files.size() - 1; - if (stats) { - context->metadata.back()->cardinality = stats->num_records; - } - - // Fetch the deletion vector - auto selection_vector_res = ffi::selection_vector_from_dv(dv_info, context->extern_engine.get(), context->global_state.get()); - auto selection_vector = KernelUtils::UnpackResult(selection_vector_res, "selection_vector_from_dv for path " + context->GetPath()); - if (selection_vector.ptr) { - context->metadata.back()->selection_vector = selection_vector; - } - - // Lookup all columns for potential hits in the constant map - case_insensitive_map_t constant_map; - for (const auto &col: context->names) { - auto key = KernelUtils::ToDeltaString(col); - auto *partition_val = (string *) ffi::get_from_map(partition_values, key, allocate_string); - if (partition_val) { - constant_map[col] = *partition_val; - delete partition_val; - } - } - context->metadata.back()->partition_map = std::move(constant_map); +static void visit_callback(ffi::NullableCvoid engine_context, struct ffi::KernelStringSlice path, int64_t size, + const ffi::Stats *stats, const ffi::DvInfo *dv_info, + const struct ffi::CStringMap *partition_values) { + auto context = (DeltaSnapshot *)engine_context; + auto path_string = context->GetPath(); + StringUtil::RTrim(path_string, "/"); + path_string += "/" + KernelUtils::FromDeltaString(path); + + path_string = url_decode(path_string); + + // First we append the file to our resolved files + context->resolved_files.push_back(DeltaSnapshot::ToDuckDBPath(path_string)); + context->metadata.emplace_back(make_uniq()); + + D_ASSERT(context->resolved_files.size() == context->metadata.size()); + + // Initialize the file metadata + context->metadata.back()->delta_snapshot_version = context->version; + context->metadata.back()->file_number = context->resolved_files.size() - 1; + if (stats) { + context->metadata.back()->cardinality = stats->num_records; + } + + // Fetch the deletion vector + auto selection_vector_res = + ffi::selection_vector_from_dv(dv_info, context->extern_engine.get(), context->global_state.get()); + auto selection_vector = + KernelUtils::UnpackResult(selection_vector_res, "selection_vector_from_dv for path " + context->GetPath()); + if (selection_vector.ptr) { + context->metadata.back()->selection_vector = selection_vector; + } + + // Lookup all columns for potential hits in the constant map + case_insensitive_map_t constant_map; + for (const auto &col : context->names) { + auto key = KernelUtils::ToDeltaString(col); + auto *partition_val = (string *)ffi::get_from_map(partition_values, key, allocate_string); + if (partition_val) { + constant_map[col] = *partition_val; + delete partition_val; + } + } + context->metadata.back()->partition_map = std::move(constant_map); } - static void visit_data(void *engine_context, ffi::ExclusiveEngineData* engine_data, const struct ffi::KernelBoolSlice selection_vec) { - ffi::visit_scan_data(engine_data, selection_vec, engine_context, visit_callback); +static void visit_data(void *engine_context, ffi::ExclusiveEngineData *engine_data, + const struct ffi::KernelBoolSlice selection_vec) { + ffi::visit_scan_data(engine_data, selection_vec, engine_context, visit_callback); } -string ParseAccountNameFromEndpoint(const string& endpoint) { - if (!StringUtil::StartsWith(endpoint, "https://")) { - return ""; - } - auto result = endpoint.find('.', 8); - if (result == endpoint.npos) { - return ""; - } - return endpoint.substr(8,result-8); +string ParseAccountNameFromEndpoint(const string &endpoint) { + if (!StringUtil::StartsWith(endpoint, "https://")) { + return ""; + } + auto result = endpoint.find('.', 8); + if (result == endpoint.npos) { + return ""; + } + return endpoint.substr(8, result - 8); } -string parseFromConnectionString(const string& connectionString, const string& key) { - std::regex pattern(key + "=([^;]+)(?=;|$)"); - std::smatch matches; - if (std::regex_search(connectionString, matches, pattern) && matches.size() > 1) { - // The second match ([1]) contains the access key - return matches[1].str(); - } - return ""; +string parseFromConnectionString(const string &connectionString, const string &key) { + std::regex pattern(key + "=([^;]+)(?=;|$)"); + std::smatch matches; + if (std::regex_search(connectionString, matches, pattern) && matches.size() > 1) { + // The second match ([1]) contains the access key + return matches[1].str(); + } + return ""; } -static ffi::EngineBuilder* CreateBuilder(ClientContext &context, const string &path) { - ffi::EngineBuilder* builder; - - // For "regular" paths we early out with the default builder config - if (!StringUtil::StartsWith(path, "s3://") && - !StringUtil::StartsWith(path, "gcs://") && - !StringUtil::StartsWith(path, "gs://") && - !StringUtil::StartsWith(path, "r2://") && - !StringUtil::StartsWith(path, "azure://") && - !StringUtil::StartsWith(path, "az://") && - !StringUtil::StartsWith(path, "abfs://") && - !StringUtil::StartsWith(path, "abfss://")) { - auto interface_builder_res = ffi::get_engine_builder(KernelUtils::ToDeltaString(path), DuckDBEngineError::AllocateError); - return KernelUtils::UnpackResult(interface_builder_res, "get_engine_interface_builder for path " + path); - } - - string bucket; - string path_in_bucket; - string secret_type; - - if (StringUtil::StartsWith(path, "s3://")) { - auto end_of_container = path.find('/',5); - - if(end_of_container == string::npos) { - throw IOException("Invalid s3 url passed to delta scan: %s", path); - } - bucket = path.substr(5, end_of_container-5); - path_in_bucket = path.substr(end_of_container); - secret_type = "s3"; - } else if (StringUtil::StartsWith(path, "gcs://")) { - auto end_of_container = path.find('/',6); - - if(end_of_container == string::npos) { - throw IOException("Invalid gcs url passed to delta scan: %s", path); - } - bucket = path.substr(6, end_of_container-6); - path_in_bucket = path.substr(end_of_container); - secret_type = "gcs"; - } else if (StringUtil::StartsWith(path, "gs://")) { - auto end_of_container = path.find('/',5); - - if(end_of_container == string::npos) { - throw IOException("Invalid gcs url passed to delta scan: %s", path); - } - bucket = path.substr(5, end_of_container-5); - path_in_bucket = path.substr(end_of_container); - secret_type = "gcs"; - } else if (StringUtil::StartsWith(path, "r2://")) { - auto end_of_container = path.find('/',5); - - if(end_of_container == string::npos) { - throw IOException("Invalid gcs url passed to delta scan: %s", path); - } - bucket = path.substr(5, end_of_container-5); - path_in_bucket = path.substr(end_of_container); - secret_type = "r2"; - } else if ((StringUtil::StartsWith(path, "azure://")) || (StringUtil::StartsWith(path, "abfss://"))) { - auto end_of_container = path.find('/',8); - - if(end_of_container == string::npos) { - throw IOException("Invalid azure url passed to delta scan: %s", path); - } - bucket = path.substr(8, end_of_container-8); - path_in_bucket = path.substr(end_of_container); - secret_type = "azure"; - } else if (StringUtil::StartsWith(path, "az://")) { - auto end_of_container = path.find('/',5); - - if(end_of_container == string::npos) { - throw IOException("Invalid azure url passed to delta scan: %s", path); - } - bucket = path.substr(5, end_of_container-5); - path_in_bucket = path.substr(end_of_container); - secret_type = "azure"; - } else if (StringUtil::StartsWith(path, "abfs://")) { - auto end_of_container = path.find('/',7); - - if(end_of_container == string::npos) { - throw IOException("Invalid azure url passed to delta scan: %s", path); - } - bucket = path.substr(8, end_of_container-8); - path_in_bucket = path.substr(end_of_container); - secret_type = "azure"; - } - - // We need to substitute DuckDB's usage of s3 and r2 paths because delta kernel needs to just interpret them as s3 protocol servers. - string cleaned_path; - if (StringUtil::StartsWith(path, "r2://") || StringUtil::StartsWith(path, "gs://") ) { - cleaned_path = "s3://" + path.substr(5); - } else if (StringUtil::StartsWith(path, "gcs://")) { - cleaned_path = "s3://" + path.substr(6); - } else { - cleaned_path = path; - } - - auto interface_builder_res = ffi::get_engine_builder(KernelUtils::ToDeltaString(cleaned_path), DuckDBEngineError::AllocateError); - builder = KernelUtils::UnpackResult(interface_builder_res, "get_engine_interface_builder for path " + cleaned_path); - - // For S3 or Azure paths we need to trim the url, set the container, and fetch a potential secret - auto &secret_manager = SecretManager::Get(context); - auto transaction = CatalogTransaction::GetSystemCatalogTransaction(context); - - auto secret_match = secret_manager.LookupSecret(transaction, path, secret_type); - - // No secret: nothing left to do here! - if (!secret_match.HasMatch()) { - if (StringUtil::StartsWith(path, "r2://") || StringUtil::StartsWith(path, "gs://") || StringUtil::StartsWith(path, "gcs://")) { - throw NotImplementedException("Can not scan a gcs:// gs:// or r2:// url without a secret providing its endpoint currently. Please create an R2 or GCS secret containing the credentials for this endpoint and try again."); - } - - return builder; - } - const auto &kv_secret = dynamic_cast(*secret_match.secret_entry->secret); - - KeyValueSecretReader secret_reader(kv_secret, *context.client_data->file_opener); - - // Here you would need to add the logic for setting the builder options for Azure - // This is just a placeholder and will need to be replaced with the actual logic - if (secret_type == "s3" || secret_type == "gcs" || secret_type == "r2") { - - string key_id, secret, session_token, region, endpoint, url_style; - bool use_ssl = true; - secret_reader.TryGetSecretKey("key_id", key_id); - secret_reader.TryGetSecretKey("secret", secret); - secret_reader.TryGetSecretKey("session_token", session_token); - secret_reader.TryGetSecretKey("region", region); - secret_reader.TryGetSecretKey("endpoint", endpoint); - secret_reader.TryGetSecretKey("url_style", url_style); - secret_reader.TryGetSecretKey("use_ssl", use_ssl); - - if (key_id.empty() && secret.empty()) { - ffi::set_builder_option(builder, KernelUtils::ToDeltaString("skip_signature"), KernelUtils::ToDeltaString("true")); - } - - if (!key_id.empty()) { - ffi::set_builder_option(builder, KernelUtils::ToDeltaString("aws_access_key_id"), KernelUtils::ToDeltaString(key_id)); - } - if (!secret.empty()) { - ffi::set_builder_option(builder, KernelUtils::ToDeltaString("aws_secret_access_key"), KernelUtils::ToDeltaString(secret)); - } - if (!session_token.empty()) { - ffi::set_builder_option(builder, KernelUtils::ToDeltaString("aws_session_token"), KernelUtils::ToDeltaString(session_token)); - } - if (!endpoint.empty() && endpoint != "s3.amazonaws.com") { - if(!StringUtil::StartsWith(endpoint, "https://") && !StringUtil::StartsWith(endpoint, "http://")) { - if(use_ssl) { - endpoint = "https://" + endpoint; - } else { - endpoint = "http://" + endpoint; - } - } - - if (StringUtil::StartsWith(endpoint, "http://")) { - ffi::set_builder_option(builder, KernelUtils::ToDeltaString("allow_http"), KernelUtils::ToDeltaString("true")); - } - ffi::set_builder_option(builder, KernelUtils::ToDeltaString("aws_endpoint"), KernelUtils::ToDeltaString(endpoint)); - } - - ffi::set_builder_option(builder, KernelUtils::ToDeltaString("aws_region"), KernelUtils::ToDeltaString(region)); - - } else if (secret_type == "azure") { - // azure seems to be super complicated as we need to cover duckdb azure plugin and delta RS builder - // and both require different settings - string connection_string, account_name, endpoint, client_id, client_secret, tenant_id, chain; - secret_reader.TryGetSecretKey("connection_string", connection_string); - secret_reader.TryGetSecretKey("account_name", account_name); - secret_reader.TryGetSecretKey("endpoint", endpoint); - secret_reader.TryGetSecretKey("client_id", client_id); - secret_reader.TryGetSecretKey("client_secret", client_secret); - secret_reader.TryGetSecretKey("tenant_id", tenant_id); - secret_reader.TryGetSecretKey("chain", chain); - - if (!account_name.empty() && account_name == "onelake") { - ffi::set_builder_option(builder, KernelUtils::ToDeltaString("use_fabric_endpoint"), KernelUtils::ToDeltaString("true")); - } - - auto provider = kv_secret.GetProvider(); - if (provider == "access_token") { - // Authentication option 0: https://docs.rs/object_store/latest/object_store/azure/enum.AzureConfigKey.html#variant.Token - string access_token; - secret_reader.TryGetSecretKey("access_token", access_token); - if (access_token.empty()) { - throw InvalidInputException("No access_token value not found in secret provider!"); - } - ffi::set_builder_option(builder, KernelUtils::ToDeltaString("bearer_token"), KernelUtils::ToDeltaString(access_token)); - } else if (provider == "credential_chain") { - // Authentication option 1a: using the cli authentication - if (chain.find("cli") != std::string::npos) { - ffi::set_builder_option(builder, KernelUtils::ToDeltaString("use_azure_cli"), KernelUtils::ToDeltaString("true")); - } - // Authentication option 1b: non-cli credential chains will just "hope for the best" technically since we are using the default - // credential chain provider duckDB and delta-kernel-rs should find the same auth - } else if (!connection_string.empty() && connection_string != "NULL") { - - // Authentication option 2: a connection string based on account key - auto account_key = parseFromConnectionString(connection_string, "AccountKey"); - account_name = parseFromConnectionString(connection_string, "AccountName"); - // Authentication option 2: a connection string based on account key - if (!account_name.empty() && !account_key.empty()) { - ffi::set_builder_option(builder, KernelUtils::ToDeltaString("account_key"), - KernelUtils::ToDeltaString(account_key)); - } else { - // Authentication option 2b: a connection string based on SAS token - endpoint = parseFromConnectionString(connection_string, "BlobEndpoint"); - if (account_name.empty()) { - account_name = ParseAccountNameFromEndpoint(endpoint); - } - auto sas_token = parseFromConnectionString(connection_string, "SharedAccessSignature"); - if (!sas_token.empty()) { - ffi::set_builder_option(builder, KernelUtils::ToDeltaString("sas_token"), - KernelUtils::ToDeltaString(sas_token)); - } - } - } else if (provider == "service_principal") { - if (!client_id.empty()) { - ffi::set_builder_option(builder, KernelUtils::ToDeltaString("azure_client_id"), KernelUtils::ToDeltaString(client_id)); - } - if (!client_secret.empty()) { - ffi::set_builder_option(builder, KernelUtils::ToDeltaString("azure_client_secret"), KernelUtils::ToDeltaString(client_secret)); - } - if (!tenant_id.empty()) { - ffi::set_builder_option(builder, KernelUtils::ToDeltaString("azure_tenant_id"), KernelUtils::ToDeltaString(tenant_id)); - } - } else { - // Authentication option 3: no authentication, just an account name - ffi::set_builder_option(builder, KernelUtils::ToDeltaString("azure_skip_signature"), KernelUtils::ToDeltaString("true")); - } - // Set the use_emulator option for when the azurite test server is used - if (account_name == "devstoreaccount1" || connection_string.find("devstoreaccount1") != string::npos) { - ffi::set_builder_option(builder, KernelUtils::ToDeltaString("use_emulator"), KernelUtils::ToDeltaString("true")); - } - if (!account_name.empty()) { - ffi::set_builder_option(builder, KernelUtils::ToDeltaString("account_name"), KernelUtils::ToDeltaString(account_name)); //needed for delta RS builder - } - if (!endpoint.empty()) { - ffi::set_builder_option(builder, KernelUtils::ToDeltaString("azure_endpoint"), KernelUtils::ToDeltaString(endpoint)); - } - ffi::set_builder_option(builder, KernelUtils::ToDeltaString("container_name"), KernelUtils::ToDeltaString(bucket)); - } - return builder; +static ffi::EngineBuilder *CreateBuilder(ClientContext &context, const string &path) { + ffi::EngineBuilder *builder; + + // For "regular" paths we early out with the default builder config + if (!StringUtil::StartsWith(path, "s3://") && !StringUtil::StartsWith(path, "gcs://") && + !StringUtil::StartsWith(path, "gs://") && !StringUtil::StartsWith(path, "r2://") && + !StringUtil::StartsWith(path, "azure://") && !StringUtil::StartsWith(path, "az://") && + !StringUtil::StartsWith(path, "abfs://") && !StringUtil::StartsWith(path, "abfss://")) { + auto interface_builder_res = + ffi::get_engine_builder(KernelUtils::ToDeltaString(path), DuckDBEngineError::AllocateError); + return KernelUtils::UnpackResult(interface_builder_res, "get_engine_interface_builder for path " + path); + } + + string bucket; + string path_in_bucket; + string secret_type; + + if (StringUtil::StartsWith(path, "s3://")) { + auto end_of_container = path.find('/', 5); + + if (end_of_container == string::npos) { + throw IOException("Invalid s3 url passed to delta scan: %s", path); + } + bucket = path.substr(5, end_of_container - 5); + path_in_bucket = path.substr(end_of_container); + secret_type = "s3"; + } else if (StringUtil::StartsWith(path, "gcs://")) { + auto end_of_container = path.find('/', 6); + + if (end_of_container == string::npos) { + throw IOException("Invalid gcs url passed to delta scan: %s", path); + } + bucket = path.substr(6, end_of_container - 6); + path_in_bucket = path.substr(end_of_container); + secret_type = "gcs"; + } else if (StringUtil::StartsWith(path, "gs://")) { + auto end_of_container = path.find('/', 5); + + if (end_of_container == string::npos) { + throw IOException("Invalid gcs url passed to delta scan: %s", path); + } + bucket = path.substr(5, end_of_container - 5); + path_in_bucket = path.substr(end_of_container); + secret_type = "gcs"; + } else if (StringUtil::StartsWith(path, "r2://")) { + auto end_of_container = path.find('/', 5); + + if (end_of_container == string::npos) { + throw IOException("Invalid gcs url passed to delta scan: %s", path); + } + bucket = path.substr(5, end_of_container - 5); + path_in_bucket = path.substr(end_of_container); + secret_type = "r2"; + } else if ((StringUtil::StartsWith(path, "azure://")) || (StringUtil::StartsWith(path, "abfss://"))) { + auto end_of_container = path.find('/', 8); + + if (end_of_container == string::npos) { + throw IOException("Invalid azure url passed to delta scan: %s", path); + } + bucket = path.substr(8, end_of_container - 8); + path_in_bucket = path.substr(end_of_container); + secret_type = "azure"; + } else if (StringUtil::StartsWith(path, "az://")) { + auto end_of_container = path.find('/', 5); + + if (end_of_container == string::npos) { + throw IOException("Invalid azure url passed to delta scan: %s", path); + } + bucket = path.substr(5, end_of_container - 5); + path_in_bucket = path.substr(end_of_container); + secret_type = "azure"; + } else if (StringUtil::StartsWith(path, "abfs://")) { + auto end_of_container = path.find('/', 7); + + if (end_of_container == string::npos) { + throw IOException("Invalid azure url passed to delta scan: %s", path); + } + bucket = path.substr(8, end_of_container - 8); + path_in_bucket = path.substr(end_of_container); + secret_type = "azure"; + } + + // We need to substitute DuckDB's usage of s3 and r2 paths because delta kernel needs to just interpret them as s3 + // protocol servers. + string cleaned_path; + if (StringUtil::StartsWith(path, "r2://") || StringUtil::StartsWith(path, "gs://")) { + cleaned_path = "s3://" + path.substr(5); + } else if (StringUtil::StartsWith(path, "gcs://")) { + cleaned_path = "s3://" + path.substr(6); + } else { + cleaned_path = path; + } + + auto interface_builder_res = + ffi::get_engine_builder(KernelUtils::ToDeltaString(cleaned_path), DuckDBEngineError::AllocateError); + builder = KernelUtils::UnpackResult(interface_builder_res, "get_engine_interface_builder for path " + cleaned_path); + + // For S3 or Azure paths we need to trim the url, set the container, and fetch a potential secret + auto &secret_manager = SecretManager::Get(context); + auto transaction = CatalogTransaction::GetSystemCatalogTransaction(context); + + auto secret_match = secret_manager.LookupSecret(transaction, path, secret_type); + + // No secret: nothing left to do here! + if (!secret_match.HasMatch()) { + if (StringUtil::StartsWith(path, "r2://") || StringUtil::StartsWith(path, "gs://") || + StringUtil::StartsWith(path, "gcs://")) { + throw NotImplementedException( + "Can not scan a gcs:// gs:// or r2:// url without a secret providing its endpoint currently. Please " + "create an R2 or GCS secret containing the credentials for this endpoint and try again."); + } + + return builder; + } + const auto &kv_secret = dynamic_cast(*secret_match.secret_entry->secret); + + KeyValueSecretReader secret_reader(kv_secret, *context.client_data->file_opener); + + // Here you would need to add the logic for setting the builder options for Azure + // This is just a placeholder and will need to be replaced with the actual logic + if (secret_type == "s3" || secret_type == "gcs" || secret_type == "r2") { + + string key_id, secret, session_token, region, endpoint, url_style; + bool use_ssl = true; + secret_reader.TryGetSecretKey("key_id", key_id); + secret_reader.TryGetSecretKey("secret", secret); + secret_reader.TryGetSecretKey("session_token", session_token); + secret_reader.TryGetSecretKey("region", region); + secret_reader.TryGetSecretKey("endpoint", endpoint); + secret_reader.TryGetSecretKey("url_style", url_style); + secret_reader.TryGetSecretKey("use_ssl", use_ssl); + + if (key_id.empty() && secret.empty()) { + ffi::set_builder_option(builder, KernelUtils::ToDeltaString("skip_signature"), + KernelUtils::ToDeltaString("true")); + } + + if (!key_id.empty()) { + ffi::set_builder_option(builder, KernelUtils::ToDeltaString("aws_access_key_id"), + KernelUtils::ToDeltaString(key_id)); + } + if (!secret.empty()) { + ffi::set_builder_option(builder, KernelUtils::ToDeltaString("aws_secret_access_key"), + KernelUtils::ToDeltaString(secret)); + } + if (!session_token.empty()) { + ffi::set_builder_option(builder, KernelUtils::ToDeltaString("aws_session_token"), + KernelUtils::ToDeltaString(session_token)); + } + if (!endpoint.empty() && endpoint != "s3.amazonaws.com") { + if (!StringUtil::StartsWith(endpoint, "https://") && !StringUtil::StartsWith(endpoint, "http://")) { + if (use_ssl) { + endpoint = "https://" + endpoint; + } else { + endpoint = "http://" + endpoint; + } + } + + if (StringUtil::StartsWith(endpoint, "http://")) { + ffi::set_builder_option(builder, KernelUtils::ToDeltaString("allow_http"), KernelUtils::ToDeltaString("true")); + } + ffi::set_builder_option(builder, KernelUtils::ToDeltaString("aws_endpoint"), + KernelUtils::ToDeltaString(endpoint)); + } + + ffi::set_builder_option(builder, KernelUtils::ToDeltaString("aws_region"), KernelUtils::ToDeltaString(region)); + + } else if (secret_type == "azure") { + // azure seems to be super complicated as we need to cover duckdb azure plugin and delta RS builder + // and both require different settings + string connection_string, account_name, endpoint, client_id, client_secret, tenant_id, chain; + secret_reader.TryGetSecretKey("connection_string", connection_string); + secret_reader.TryGetSecretKey("account_name", account_name); + secret_reader.TryGetSecretKey("endpoint", endpoint); + secret_reader.TryGetSecretKey("client_id", client_id); + secret_reader.TryGetSecretKey("client_secret", client_secret); + secret_reader.TryGetSecretKey("tenant_id", tenant_id); + secret_reader.TryGetSecretKey("chain", chain); + + if (!account_name.empty() && account_name == "onelake") { + ffi::set_builder_option(builder, KernelUtils::ToDeltaString("use_fabric_endpoint"), + KernelUtils::ToDeltaString("true")); + } + + auto provider = kv_secret.GetProvider(); + if (provider == "access_token") { + // Authentication option 0: + // https://docs.rs/object_store/latest/object_store/azure/enum.AzureConfigKey.html#variant.Token + string access_token; + secret_reader.TryGetSecretKey("access_token", access_token); + if (access_token.empty()) { + throw InvalidInputException("No access_token value not found in secret provider!"); + } + ffi::set_builder_option(builder, KernelUtils::ToDeltaString("bearer_token"), + KernelUtils::ToDeltaString(access_token)); + } else if (provider == "credential_chain") { + // Authentication option 1a: using the cli authentication + if (chain.find("cli") != std::string::npos) { + ffi::set_builder_option(builder, KernelUtils::ToDeltaString("use_azure_cli"), + KernelUtils::ToDeltaString("true")); + } + // Authentication option 1b: non-cli credential chains will just "hope for the best" technically since we + // are using the default credential chain provider duckDB and delta-kernel-rs should find the same auth + } else if (!connection_string.empty() && connection_string != "NULL") { + + // Authentication option 2: a connection string based on account key + auto account_key = parseFromConnectionString(connection_string, "AccountKey"); + account_name = parseFromConnectionString(connection_string, "AccountName"); + // Authentication option 2: a connection string based on account key + if (!account_name.empty() && !account_key.empty()) { + ffi::set_builder_option(builder, KernelUtils::ToDeltaString("account_key"), + KernelUtils::ToDeltaString(account_key)); + } else { + // Authentication option 2b: a connection string based on SAS token + endpoint = parseFromConnectionString(connection_string, "BlobEndpoint"); + if (account_name.empty()) { + account_name = ParseAccountNameFromEndpoint(endpoint); + } + auto sas_token = parseFromConnectionString(connection_string, "SharedAccessSignature"); + if (!sas_token.empty()) { + ffi::set_builder_option(builder, KernelUtils::ToDeltaString("sas_token"), + KernelUtils::ToDeltaString(sas_token)); + } + } + } else if (provider == "service_principal") { + if (!client_id.empty()) { + ffi::set_builder_option(builder, KernelUtils::ToDeltaString("azure_client_id"), + KernelUtils::ToDeltaString(client_id)); + } + if (!client_secret.empty()) { + ffi::set_builder_option(builder, KernelUtils::ToDeltaString("azure_client_secret"), + KernelUtils::ToDeltaString(client_secret)); + } + if (!tenant_id.empty()) { + ffi::set_builder_option(builder, KernelUtils::ToDeltaString("azure_tenant_id"), + KernelUtils::ToDeltaString(tenant_id)); + } + } else { + // Authentication option 3: no authentication, just an account name + ffi::set_builder_option(builder, KernelUtils::ToDeltaString("azure_skip_signature"), + KernelUtils::ToDeltaString("true")); + } + // Set the use_emulator option for when the azurite test server is used + if (account_name == "devstoreaccount1" || connection_string.find("devstoreaccount1") != string::npos) { + ffi::set_builder_option(builder, KernelUtils::ToDeltaString("use_emulator"), KernelUtils::ToDeltaString("true")); + } + if (!account_name.empty()) { + ffi::set_builder_option(builder, KernelUtils::ToDeltaString("account_name"), + KernelUtils::ToDeltaString(account_name)); // needed for delta RS builder + } + if (!endpoint.empty()) { + ffi::set_builder_option(builder, KernelUtils::ToDeltaString("azure_endpoint"), + KernelUtils::ToDeltaString(endpoint)); + } + ffi::set_builder_option(builder, KernelUtils::ToDeltaString("container_name"), KernelUtils::ToDeltaString(bucket)); + } + return builder; } -DeltaSnapshot::DeltaSnapshot(ClientContext &context_p, const string &path) : MultiFileList({ToDeltaPath(path)}, FileGlobOptions::ALLOW_EMPTY), context(context_p) { +DeltaSnapshot::DeltaSnapshot(ClientContext &context_p, const string &path) + : MultiFileList({ToDeltaPath(path)}, FileGlobOptions::ALLOW_EMPTY), context(context_p) { } string DeltaSnapshot::GetPath() { - return GetPaths()[0]; + return GetPaths()[0]; } string DeltaSnapshot::ToDuckDBPath(const string &raw_path) { - if (StringUtil::StartsWith(raw_path, "file://")) { - return raw_path.substr(7); - } - return raw_path; + if (StringUtil::StartsWith(raw_path, "file://")) { + return raw_path.substr(7); + } + return raw_path; } string DeltaSnapshot::ToDeltaPath(const string &raw_path) { - string path; - if (StringUtil::StartsWith(raw_path, "./")) { - LocalFileSystem fs; - path = fs.JoinPath(fs.GetWorkingDirectory(), raw_path.substr(2)); - path = "file://" + path; - } else { - path = raw_path; - } - - // Paths always end in a slash (kernel likes it that way for now) - if (path[path.size()-1] != '/') { - path = path + '/'; - } - - return path; + string path; + if (StringUtil::StartsWith(raw_path, "./")) { + LocalFileSystem fs; + path = fs.JoinPath(fs.GetWorkingDirectory(), raw_path.substr(2)); + path = "file://" + path; + } else { + path = raw_path; + } + + // Paths always end in a slash (kernel likes it that way for now) + if (path[path.size() - 1] != '/') { + path = path + '/'; + } + + return path; } void DeltaSnapshot::Bind(vector &return_types, vector &names) { - if (!initialized) { - InitializeFiles(); - } - auto schema = SchemaVisitor::VisitSnapshotSchema(snapshot.get()); - for (const auto &field: *schema) { - names.push_back(field.first); - return_types.push_back(field.second); - } - // Store the bound names for resolving the complex filter pushdown later - this->names = names; + if (!initialized) { + InitializeFiles(); + } + auto schema = SchemaVisitor::VisitSnapshotSchema(snapshot.get()); + for (const auto &field : *schema) { + names.push_back(field.first); + return_types.push_back(field.second); + } + // Store the bound names for resolving the complex filter pushdown later + this->names = names; } string DeltaSnapshot::GetFile(idx_t i) { - if (!initialized) { - InitializeFiles(); - } - // We already have this file - if (i < resolved_files.size()) { - return resolved_files[i]; - } - - if (files_exhausted) { - return ""; - } - - while(i >= resolved_files.size()) { - auto have_scan_data_res = ffi::kernel_scan_data_next(scan_data_iterator.get(), this, visit_data); - - auto have_scan_data = TryUnpackKernelResult(have_scan_data_res); - - // kernel has indicated that we have no more data to scan - if (!have_scan_data) { - files_exhausted = true; - return ""; - } - } - - // The kernel scan visitor should have resolved a file OR returned - if(i >= resolved_files.size()) { - throw IOException("Delta Kernel seems to have failed to resolve a new file"); - } - - return resolved_files[i]; + if (!initialized) { + InitializeFiles(); + } + // We already have this file + if (i < resolved_files.size()) { + return resolved_files[i]; + } + + if (files_exhausted) { + return ""; + } + + while (i >= resolved_files.size()) { + auto have_scan_data_res = ffi::kernel_scan_data_next(scan_data_iterator.get(), this, visit_data); + + auto have_scan_data = TryUnpackKernelResult(have_scan_data_res); + + // kernel has indicated that we have no more data to scan + if (!have_scan_data) { + files_exhausted = true; + return ""; + } + } + + // The kernel scan visitor should have resolved a file OR returned + if (i >= resolved_files.size()) { + throw IOException("Delta Kernel seems to have failed to resolve a new file"); + } + + return resolved_files[i]; } void DeltaSnapshot::InitializeFiles() { - auto path_slice = KernelUtils::ToDeltaString(paths[0]); + auto path_slice = KernelUtils::ToDeltaString(paths[0]); - // Register engine - auto interface_builder = CreateBuilder(context, paths[0]); - extern_engine = TryUnpackKernelResult( ffi::builder_build(interface_builder)); + // Register engine + auto interface_builder = CreateBuilder(context, paths[0]); + extern_engine = TryUnpackKernelResult(ffi::builder_build(interface_builder)); - // Initialize Snapshot - snapshot = TryUnpackKernelResult(ffi::snapshot(path_slice, extern_engine.get())); + // Initialize Snapshot + snapshot = TryUnpackKernelResult(ffi::snapshot(path_slice, extern_engine.get())); - // Create Scan - PredicateVisitor visitor(names, &table_filters); - scan = TryUnpackKernelResult(ffi::scan(snapshot.get(), extern_engine.get(), &visitor)); + // Create Scan + PredicateVisitor visitor(names, &table_filters); + scan = TryUnpackKernelResult(ffi::scan(snapshot.get(), extern_engine.get(), &visitor)); - // Create GlobalState - global_state = ffi::get_global_scan_state(scan.get()); + // Create GlobalState + global_state = ffi::get_global_scan_state(scan.get()); - // Set version - this->version = ffi::version(snapshot.get()); + // Set version + this->version = ffi::version(snapshot.get()); - // Create scan data iterator - scan_data_iterator = TryUnpackKernelResult(ffi::kernel_scan_data_init(extern_engine.get(), scan.get())); + // Create scan data iterator + scan_data_iterator = TryUnpackKernelResult(ffi::kernel_scan_data_init(extern_engine.get(), scan.get())); - initialized = true; + initialized = true; } -unique_ptr DeltaSnapshot::ComplexFilterPushdown(ClientContext &context, const MultiFileReaderOptions &options, MultiFilePushdownInfo &info, - vector> &filters) { - FilterCombiner combiner(context); - for (const auto &filter : filters) { - combiner.AddFilter(filter->Copy()); - } - auto filterstmp = combiner.GenerateTableScanFilters(info.column_ids); +unique_ptr DeltaSnapshot::ComplexFilterPushdown(ClientContext &context, + const MultiFileReaderOptions &options, + MultiFilePushdownInfo &info, + vector> &filters) { + FilterCombiner combiner(context); + for (const auto &filter : filters) { + combiner.AddFilter(filter->Copy()); + } + auto filterstmp = combiner.GenerateTableScanFilters(info.column_ids); - // TODO: can/should we figure out if this filtered anything? - auto filtered_list = make_uniq(context, paths[0]); - filtered_list->table_filters = std::move(filterstmp); - filtered_list->names = names; + // TODO: can/should we figure out if this filtered anything? + auto filtered_list = make_uniq(context, paths[0]); + filtered_list->table_filters = std::move(filterstmp); + filtered_list->names = names; - return std::move(filtered_list); + return std::move(filtered_list); } vector DeltaSnapshot::GetAllFiles() { - idx_t i = resolved_files.size(); - // TODO: this can probably be improved - while(!GetFile(i).empty()) { - i++; - } - return resolved_files; + idx_t i = resolved_files.size(); + // TODO: this can probably be improved + while (!GetFile(i).empty()) { + i++; + } + return resolved_files; } FileExpandResult DeltaSnapshot::GetExpandResult() { - // GetFile(1) will ensure at least the first 2 files are expanded if they are available - GetFile(1); + // GetFile(1) will ensure at least the first 2 files are expanded if they are available + GetFile(1); - if (resolved_files.size() > 1) { - return FileExpandResult::MULTIPLE_FILES; - } else if (resolved_files.size() == 1) { - return FileExpandResult::SINGLE_FILE; - } + if (resolved_files.size() > 1) { + return FileExpandResult::MULTIPLE_FILES; + } else if (resolved_files.size() == 1) { + return FileExpandResult::SINGLE_FILE; + } - return FileExpandResult::NO_FILES; + return FileExpandResult::NO_FILES; } idx_t DeltaSnapshot::GetTotalFileCount() { - // TODO: this can probably be improved - idx_t i = resolved_files.size(); - while(!GetFile(i).empty()) { - i++; - } - return resolved_files.size(); + // TODO: this can probably be improved + idx_t i = resolved_files.size(); + while (!GetFile(i).empty()) { + i++; + } + return resolved_files.size(); } unique_ptr DeltaSnapshot::GetCardinality(ClientContext &context) { - // This also ensures all files are expanded - auto total_file_count = DeltaSnapshot::GetTotalFileCount(); - - if (total_file_count == 0) { - return make_uniq(0,0); - } - - idx_t total_tuple_count = 0; - bool have_any_stats = false; - for (auto &metadatum : metadata) { - if (metadatum->cardinality != DConstants::INVALID_INDEX) { - have_any_stats = true; - total_tuple_count += metadatum->cardinality; - } - } - - if (have_any_stats) { - return make_uniq(total_tuple_count,total_tuple_count); - } - - return nullptr; + // This also ensures all files are expanded + auto total_file_count = DeltaSnapshot::GetTotalFileCount(); + + if (total_file_count == 0) { + return make_uniq(0, 0); + } + + idx_t total_tuple_count = 0; + bool have_any_stats = false; + for (auto &metadatum : metadata) { + if (metadatum->cardinality != DConstants::INVALID_INDEX) { + have_any_stats = true; + total_tuple_count += metadatum->cardinality; + } + } + + if (have_any_stats) { + return make_uniq(total_tuple_count, total_tuple_count); + } + + return nullptr; } unique_ptr DeltaMultiFileReader::CreateInstance() { - return std::move(make_uniq()); + return std::move(make_uniq()); } bool DeltaMultiFileReader::Bind(MultiFileReaderOptions &options, MultiFileList &files, - vector &return_types, vector &names, MultiFileReaderBindData &bind_data) { - auto &delta_snapshot = dynamic_cast(files); - - delta_snapshot.Bind(return_types, names); - - // We need to parse this option - bool file_row_number_enabled = options.custom_options.find("file_row_number") != options.custom_options.end(); - if (file_row_number_enabled) { - bind_data.file_row_number_idx = names.size(); - return_types.emplace_back(LogicalType::BIGINT); - names.emplace_back("file_row_number"); - } else { - // TODO: this is a bogus ID? Change for flag indicating it should be enabled? - bind_data.file_row_number_idx = names.size(); - } - - return true; + vector &return_types, vector &names, + MultiFileReaderBindData &bind_data) { + auto &delta_snapshot = dynamic_cast(files); + + delta_snapshot.Bind(return_types, names); + + // We need to parse this option + bool file_row_number_enabled = options.custom_options.find("file_row_number") != options.custom_options.end(); + if (file_row_number_enabled) { + bind_data.file_row_number_idx = names.size(); + return_types.emplace_back(LogicalType::BIGINT); + names.emplace_back("file_row_number"); + } else { + // TODO: this is a bogus ID? Change for flag indicating it should be enabled? + bind_data.file_row_number_idx = names.size(); + } + + return true; }; void DeltaMultiFileReader::BindOptions(MultiFileReaderOptions &options, MultiFileList &files, - vector &return_types, vector &names, MultiFileReaderBindData& bind_data) { - - // Disable all other multifilereader options - options.auto_detect_hive_partitioning = false; - options.hive_partitioning = false; - options.union_by_name = false; - - MultiFileReader::BindOptions(options, files, return_types, names, bind_data); - - auto demo_gen_col_opt = options.custom_options.find("delta_file_number"); - if (demo_gen_col_opt != options.custom_options.end()) { - if (demo_gen_col_opt->second.GetValue()) { - names.push_back("delta_file_number"); - return_types.push_back(LogicalType::UBIGINT); - } - } + vector &return_types, vector &names, + MultiFileReaderBindData &bind_data) { + + // Disable all other multifilereader options + options.auto_detect_hive_partitioning = false; + options.hive_partitioning = false; + options.union_by_name = false; + + MultiFileReader::BindOptions(options, files, return_types, names, bind_data); + + auto demo_gen_col_opt = options.custom_options.find("delta_file_number"); + if (demo_gen_col_opt != options.custom_options.end()) { + if (demo_gen_col_opt->second.GetValue()) { + names.push_back("delta_file_number"); + return_types.push_back(LogicalType::UBIGINT); + } + } } -void DeltaMultiFileReader::FinalizeBind(const MultiFileReaderOptions &file_options, const MultiFileReaderBindData &options, - const string &filename, const vector &local_names, - const vector &global_types, const vector &global_names, - const vector &global_column_ids, MultiFileReaderData &reader_data, - ClientContext &context, optional_ptr global_state) { - MultiFileReader::FinalizeBind(file_options, options, filename, local_names, global_types, global_names, global_column_ids, reader_data, context, global_state); - - // Handle custom delta option set in MultiFileReaderOptions::custom_options - auto file_number_opt = file_options.custom_options.find("delta_file_number"); - if (file_number_opt != file_options.custom_options.end()) { - if (file_number_opt->second.GetValue()) { - D_ASSERT(global_state); - auto &delta_global_state = global_state->Cast(); - D_ASSERT(delta_global_state.delta_file_number_idx != DConstants::INVALID_INDEX); - - // We add the constant column for the delta_file_number option - // NOTE: we add a placeholder here, to demonstrate how we can also populate extra columns in the FinalizeChunk - reader_data.constant_map.emplace_back(delta_global_state.delta_file_number_idx, Value::UBIGINT(0)); - } - } - - // Get the metadata for this file - D_ASSERT(global_state->file_list); - const auto &snapshot = dynamic_cast(*global_state->file_list); - auto &file_metadata = snapshot.metadata[reader_data.file_list_idx.GetIndex()]; - - if (!file_metadata->partition_map.empty()) { - for (idx_t i = 0; i < global_column_ids.size(); i++) { - column_t col_id = global_column_ids[i]; - if (IsRowIdColumnId(col_id)) { - continue; - } - auto col_partition_entry = file_metadata->partition_map.find(global_names[col_id]); - if (col_partition_entry != file_metadata->partition_map.end()) { - auto ¤t_type = global_types[col_id]; - if (current_type == LogicalType::BLOB) { - reader_data.constant_map.emplace_back(i, Value::BLOB_RAW(col_partition_entry->second)); - } else { - auto maybe_value = Value(col_partition_entry->second).DefaultCastAs(current_type); - reader_data.constant_map.emplace_back(i, maybe_value); - } - } - } - } +void DeltaMultiFileReader::FinalizeBind(const MultiFileReaderOptions &file_options, + const MultiFileReaderBindData &options, const string &filename, + const vector &local_names, const vector &global_types, + const vector &global_names, const vector &global_column_ids, + MultiFileReaderData &reader_data, ClientContext &context, + optional_ptr global_state) { + MultiFileReader::FinalizeBind(file_options, options, filename, local_names, global_types, global_names, + global_column_ids, reader_data, context, global_state); + + // Handle custom delta option set in MultiFileReaderOptions::custom_options + auto file_number_opt = file_options.custom_options.find("delta_file_number"); + if (file_number_opt != file_options.custom_options.end()) { + if (file_number_opt->second.GetValue()) { + D_ASSERT(global_state); + auto &delta_global_state = global_state->Cast(); + D_ASSERT(delta_global_state.delta_file_number_idx != DConstants::INVALID_INDEX); + + // We add the constant column for the delta_file_number option + // NOTE: we add a placeholder here, to demonstrate how we can also populate extra columns in the + // FinalizeChunk + reader_data.constant_map.emplace_back(delta_global_state.delta_file_number_idx, Value::UBIGINT(0)); + } + } + + // Get the metadata for this file + D_ASSERT(global_state->file_list); + const auto &snapshot = dynamic_cast(*global_state->file_list); + auto &file_metadata = snapshot.metadata[reader_data.file_list_idx.GetIndex()]; + + if (!file_metadata->partition_map.empty()) { + for (idx_t i = 0; i < global_column_ids.size(); i++) { + column_t col_id = global_column_ids[i]; + if (IsRowIdColumnId(col_id)) { + continue; + } + auto col_partition_entry = file_metadata->partition_map.find(global_names[col_id]); + if (col_partition_entry != file_metadata->partition_map.end()) { + auto ¤t_type = global_types[col_id]; + if (current_type == LogicalType::BLOB) { + reader_data.constant_map.emplace_back(i, Value::BLOB_RAW(col_partition_entry->second)); + } else { + auto maybe_value = Value(col_partition_entry->second).DefaultCastAs(current_type); + reader_data.constant_map.emplace_back(i, maybe_value); + } + } + } + } } -unique_ptr DeltaMultiFileReader::CreateFileList(ClientContext &context, const vector& paths, FileGlobOptions options) { - if (paths.size() != 1) { - throw BinderException("'delta_scan' only supports single path as input"); - } +unique_ptr DeltaMultiFileReader::CreateFileList(ClientContext &context, const vector &paths, + FileGlobOptions options) { + if (paths.size() != 1) { + throw BinderException("'delta_scan' only supports single path as input"); + } - return make_uniq(context, paths[0]); + return make_uniq(context, paths[0]); } // Generate the correct Selection Vector Based on the Raw delta KernelBoolSlice dv and the row_id_column // TODO: this probably is slower than needed (we can do with less branches in the for loop for most cases) -static SelectionVector DuckSVFromDeltaSV(const ffi::KernelBoolSlice &dv, Vector row_id_column, idx_t count, idx_t &select_count) { - D_ASSERT(row_id_column.GetType() == LogicalType::BIGINT); - - UnifiedVectorFormat data; - row_id_column.ToUnifiedFormat(count, data); - auto row_ids = UnifiedVectorFormat::GetData(data); - - SelectionVector result {count}; - idx_t current_select = 0; - for (idx_t i = 0; i < count; i++) { - auto row_id = row_ids[data.sel->get_index(i)]; - - // TODO: why are deletion vectors not spanning whole data? - if (row_id >= dv.len || dv.ptr[row_id]) { - result.data()[current_select] = i; - current_select++; - } - } +static SelectionVector DuckSVFromDeltaSV(const ffi::KernelBoolSlice &dv, Vector row_id_column, idx_t count, + idx_t &select_count) { + D_ASSERT(row_id_column.GetType() == LogicalType::BIGINT); + + UnifiedVectorFormat data; + row_id_column.ToUnifiedFormat(count, data); + auto row_ids = UnifiedVectorFormat::GetData(data); + + SelectionVector result {count}; + idx_t current_select = 0; + for (idx_t i = 0; i < count; i++) { + auto row_id = row_ids[data.sel->get_index(i)]; + + // TODO: why are deletion vectors not spanning whole data? + if (row_id >= dv.len || dv.ptr[row_id]) { + result.data()[current_select] = i; + current_select++; + } + } - select_count = current_select; + select_count = current_select; - return result; + return result; } // Parses the columns that are used by the delta extension into void DeltaMultiFileReaderGlobalState::SetColumnIdx(const string &column, idx_t idx) { - if (column == "file_row_number") { - file_row_number_idx = idx; - return; - } else if (column == "delta_file_number") { - delta_file_number_idx = idx; - return; - } - throw IOException("Unknown column '%s' found as required by the DeltaMultiFileReader"); + if (column == "file_row_number") { + file_row_number_idx = idx; + return; + } else if (column == "delta_file_number") { + delta_file_number_idx = idx; + return; + } + throw IOException("Unknown column '%s' found as required by the DeltaMultiFileReader"); } -unique_ptr DeltaMultiFileReader::InitializeGlobalState(duckdb::ClientContext &context, - const duckdb::MultiFileReaderOptions &file_options, - const duckdb::MultiFileReaderBindData &bind_data, - const duckdb::MultiFileList &file_list, - const vector &global_types, - const vector &global_names, - const vector &global_column_ids) { - vector extra_columns; - vector> mapped_columns; - - // Create a map of the columns that are in the projection - case_insensitive_map_t selected_columns; - for (idx_t i = 0; i < global_column_ids.size(); i++) { - auto global_id = global_column_ids[i]; - if (IsRowIdColumnId(global_id)) { - continue; - } - - auto &global_name = global_names[global_id]; - selected_columns.insert({global_name, i}); - } - - // TODO: only add file_row_number column if there are deletes - case_insensitive_map_t columns_to_map = { - {"file_row_number", LogicalType::BIGINT}, - }; - - // Add the delta_file_number column to the columns to map - auto demo_gen_col_opt = file_options.custom_options.find("delta_file_number"); - if (demo_gen_col_opt != file_options.custom_options.end()) { - if (demo_gen_col_opt->second.GetValue()) { - columns_to_map.insert({"delta_file_number", LogicalType::UBIGINT}); - } - } - - // Map every column to either a column in the projection, or add it to the extra columns if it doesn't exist - idx_t col_offset = 0; - for (const auto &required_column : columns_to_map) { - // First check if the column is in the projection - auto res = selected_columns.find(required_column.first); - if (res != selected_columns.end()) { - // The column is in the projection, no special handling is required; we simply store the index - mapped_columns.push_back({required_column.first, res->second}); - continue; - } - - // The column is NOT in the projection: it needs to be added as an extra_column - - // Calculate the index of the added column (extra columns are added after all other columns) - idx_t current_col_idx = global_column_ids.size() + col_offset++; - - // Add column to the map, to ensure the MultiFileReader can find it when processing the Chunk - mapped_columns.push_back({required_column.first, current_col_idx}); - - // Ensure the result DataChunk has a vector of the correct type to store this column - extra_columns.push_back(required_column.second); - } - - auto res = make_uniq(extra_columns, &file_list); - - // Parse all the mapped columns into the DeltaMultiFileReaderGlobalState for easy use; - for (const auto& mapped_column : mapped_columns) { - res->SetColumnIdx(mapped_column.first, mapped_column.second); - } - - return std::move(res); +unique_ptr DeltaMultiFileReader::InitializeGlobalState( + duckdb::ClientContext &context, const duckdb::MultiFileReaderOptions &file_options, + const duckdb::MultiFileReaderBindData &bind_data, const duckdb::MultiFileList &file_list, + const vector &global_types, const vector &global_names, + const vector &global_column_ids) { + vector extra_columns; + vector> mapped_columns; + + // Create a map of the columns that are in the projection + case_insensitive_map_t selected_columns; + for (idx_t i = 0; i < global_column_ids.size(); i++) { + auto global_id = global_column_ids[i]; + if (IsRowIdColumnId(global_id)) { + continue; + } + + auto &global_name = global_names[global_id]; + selected_columns.insert({global_name, i}); + } + + // TODO: only add file_row_number column if there are deletes + case_insensitive_map_t columns_to_map = { + {"file_row_number", LogicalType::BIGINT}, + }; + + // Add the delta_file_number column to the columns to map + auto demo_gen_col_opt = file_options.custom_options.find("delta_file_number"); + if (demo_gen_col_opt != file_options.custom_options.end()) { + if (demo_gen_col_opt->second.GetValue()) { + columns_to_map.insert({"delta_file_number", LogicalType::UBIGINT}); + } + } + + // Map every column to either a column in the projection, or add it to the extra columns if it doesn't exist + idx_t col_offset = 0; + for (const auto &required_column : columns_to_map) { + // First check if the column is in the projection + auto res = selected_columns.find(required_column.first); + if (res != selected_columns.end()) { + // The column is in the projection, no special handling is required; we simply store the index + mapped_columns.push_back({required_column.first, res->second}); + continue; + } + + // The column is NOT in the projection: it needs to be added as an extra_column + + // Calculate the index of the added column (extra columns are added after all other columns) + idx_t current_col_idx = global_column_ids.size() + col_offset++; + + // Add column to the map, to ensure the MultiFileReader can find it when processing the Chunk + mapped_columns.push_back({required_column.first, current_col_idx}); + + // Ensure the result DataChunk has a vector of the correct type to store this column + extra_columns.push_back(required_column.second); + } + + auto res = make_uniq(extra_columns, &file_list); + + // Parse all the mapped columns into the DeltaMultiFileReaderGlobalState for easy use; + for (const auto &mapped_column : mapped_columns) { + res->SetColumnIdx(mapped_column.first, mapped_column.second); + } + + return std::move(res); } // This code is duplicated from MultiFileReader::CreateNameMapping the difference is that for columns that are not found // in the parquet files, we just add null constant columns static void CustomMulfiFileNameMapping(const string &file_name, const vector &local_types, - const vector &local_names, const vector &global_types, - const vector &global_names, const vector &global_column_ids, - MultiFileReaderData &reader_data, const string &initial_file, - optional_ptr global_state) { - D_ASSERT(global_types.size() == global_names.size()); + const vector &local_names, const vector &global_types, + const vector &global_names, const vector &global_column_ids, + MultiFileReaderData &reader_data, const string &initial_file, + optional_ptr global_state) { + D_ASSERT(global_types.size() == global_names.size()); D_ASSERT(local_types.size() == local_names.size()); // we have expected types: create a map of name -> column index case_insensitive_map_t name_map; @@ -776,10 +805,10 @@ static void CustomMulfiFileNameMapping(const string &file_name, const vectorsecond; @@ -799,138 +828,144 @@ static void CustomMulfiFileNameMapping(const string &file_name, const vector &local_types, - const vector &local_names, const vector &global_types, - const vector &global_names, const vector &global_column_ids, - MultiFileReaderData &reader_data, const string &initial_file, - optional_ptr global_state) { - // First call the base implementation to do most mapping - CustomMulfiFileNameMapping(file_name, local_types, local_names, global_types, global_names, global_column_ids, reader_data, initial_file, global_state); - - // Then we handle delta specific mapping - D_ASSERT(global_state); - auto &delta_global_state = global_state->Cast(); - - // Check if the file_row_number column is an "extra_column" which is not part of the projection - if (delta_global_state.file_row_number_idx >= global_column_ids.size()) { - D_ASSERT(delta_global_state.file_row_number_idx != DConstants::INVALID_INDEX); - - // Build the name map - case_insensitive_map_t name_map; - for (idx_t col_idx = 0; col_idx < local_names.size(); col_idx++) { - name_map[local_names[col_idx]] = col_idx; - } - - // Lookup the required column in the local map - auto entry = name_map.find("file_row_number"); - if (entry == name_map.end()) { - throw IOException("Failed to find the file_row_number column"); - } - - // Register the column to be scanned from this file - reader_data.column_ids.push_back(entry->second); - reader_data.column_mapping.push_back(delta_global_state.file_row_number_idx); - } - - // This may have changed: update it - reader_data.empty_columns = reader_data.column_ids.empty(); + const vector &local_names, const vector &global_types, + const vector &global_names, + const vector &global_column_ids, + MultiFileReaderData &reader_data, const string &initial_file, + optional_ptr global_state) { + // First call the base implementation to do most mapping + CustomMulfiFileNameMapping(file_name, local_types, local_names, global_types, global_names, global_column_ids, + reader_data, initial_file, global_state); + + // Then we handle delta specific mapping + D_ASSERT(global_state); + auto &delta_global_state = global_state->Cast(); + + // Check if the file_row_number column is an "extra_column" which is not part of the projection + if (delta_global_state.file_row_number_idx >= global_column_ids.size()) { + D_ASSERT(delta_global_state.file_row_number_idx != DConstants::INVALID_INDEX); + + // Build the name map + case_insensitive_map_t name_map; + for (idx_t col_idx = 0; col_idx < local_names.size(); col_idx++) { + name_map[local_names[col_idx]] = col_idx; + } + + // Lookup the required column in the local map + auto entry = name_map.find("file_row_number"); + if (entry == name_map.end()) { + throw IOException("Failed to find the file_row_number column"); + } + + // Register the column to be scanned from this file + reader_data.column_ids.push_back(entry->second); + reader_data.column_mapping.push_back(delta_global_state.file_row_number_idx); + } + + // This may have changed: update it + reader_data.empty_columns = reader_data.column_ids.empty(); } void DeltaMultiFileReader::FinalizeChunk(ClientContext &context, const MultiFileReaderBindData &bind_data, - const MultiFileReaderData &reader_data, DataChunk &chunk, optional_ptr global_state) { - // Base class finalization first - MultiFileReader::FinalizeChunk(context, bind_data, reader_data, chunk, global_state); - - D_ASSERT(global_state); - auto &delta_global_state = global_state->Cast(); - D_ASSERT(delta_global_state.file_list); - - // Get the metadata for this file - const auto &snapshot = dynamic_cast(*global_state->file_list); - auto &metadata = snapshot.metadata[reader_data.file_list_idx.GetIndex()]; - - if (metadata->selection_vector.ptr && chunk.size() != 0) { - D_ASSERT(delta_global_state.file_row_number_idx != DConstants::INVALID_INDEX); - auto &file_row_number_column = chunk.data[delta_global_state.file_row_number_idx]; - - // Construct the selection vector using the file_row_number column and the raw selection vector from delta - idx_t select_count; - auto sv = DuckSVFromDeltaSV(metadata->selection_vector, file_row_number_column, chunk.size(), select_count); - chunk.Slice(sv, select_count); - } - - // Note: this demo function shows how we can use DuckDB's Binder create expression-based generated columns - if (delta_global_state.delta_file_number_idx != DConstants::INVALID_INDEX) { - //! Create Dummy expression (0 + file_number) - vector> child_expr; - child_expr.push_back(make_uniq(Value::UBIGINT(0))); - child_expr.push_back(make_uniq(Value::UBIGINT(7))); - unique_ptr expr = make_uniq("+", std::move(child_expr), nullptr, nullptr, false, true); - - //! s dummy expression - auto binder = Binder::CreateBinder(context); - ExpressionBinder expr_binder(*binder, context); - auto bound_expr = expr_binder.Bind(expr, nullptr); - - //! Execute dummy expression into result column - ExpressionExecutor expr_executor(context); - expr_executor.AddExpression(*bound_expr); - - //! Execute the expression directly into the output Chunk - expr_executor.ExecuteExpression(chunk.data[delta_global_state.delta_file_number_idx]); - } + const MultiFileReaderData &reader_data, DataChunk &chunk, + optional_ptr global_state) { + // Base class finalization first + MultiFileReader::FinalizeChunk(context, bind_data, reader_data, chunk, global_state); + + D_ASSERT(global_state); + auto &delta_global_state = global_state->Cast(); + D_ASSERT(delta_global_state.file_list); + + // Get the metadata for this file + const auto &snapshot = dynamic_cast(*global_state->file_list); + auto &metadata = snapshot.metadata[reader_data.file_list_idx.GetIndex()]; + + if (metadata->selection_vector.ptr && chunk.size() != 0) { + D_ASSERT(delta_global_state.file_row_number_idx != DConstants::INVALID_INDEX); + auto &file_row_number_column = chunk.data[delta_global_state.file_row_number_idx]; + + // Construct the selection vector using the file_row_number column and the raw selection vector from delta + idx_t select_count; + auto sv = DuckSVFromDeltaSV(metadata->selection_vector, file_row_number_column, chunk.size(), select_count); + chunk.Slice(sv, select_count); + } + + // Note: this demo function shows how we can use DuckDB's Binder create expression-based generated columns + if (delta_global_state.delta_file_number_idx != DConstants::INVALID_INDEX) { + //! Create Dummy expression (0 + file_number) + vector> child_expr; + child_expr.push_back(make_uniq(Value::UBIGINT(0))); + child_expr.push_back(make_uniq(Value::UBIGINT(7))); + unique_ptr expr = + make_uniq("+", std::move(child_expr), nullptr, nullptr, false, true); + + //! s dummy expression + auto binder = Binder::CreateBinder(context); + ExpressionBinder expr_binder(*binder, context); + auto bound_expr = expr_binder.Bind(expr, nullptr); + + //! Execute dummy expression into result column + ExpressionExecutor expr_executor(context); + expr_executor.AddExpression(*bound_expr); + + //! Execute the expression directly into the output Chunk + expr_executor.ExecuteExpression(chunk.data[delta_global_state.delta_file_number_idx]); + } }; -bool DeltaMultiFileReader::ParseOption(const string &key, const Value &val, MultiFileReaderOptions &options, ClientContext &context) { - auto loption = StringUtil::Lower(key); +bool DeltaMultiFileReader::ParseOption(const string &key, const Value &val, MultiFileReaderOptions &options, + ClientContext &context) { + auto loption = StringUtil::Lower(key); - if (loption == "delta_file_number") { - options.custom_options[loption] = val; - return true; - } + if (loption == "delta_file_number") { + options.custom_options[loption] = val; + return true; + } - // We need to capture this one to know whether to emit - if (loption == "file_row_number") { - options.custom_options[loption] = val; - return true; - } + // We need to capture this one to know whether to emit + if (loption == "file_row_number") { + options.custom_options[loption] = val; + return true; + } - return MultiFileReader::ParseOption(key, val, options, context); + return MultiFileReader::ParseOption(key, val, options, context); } // -//DeltaMultiFileReaderBindData::DeltaMultiFileReaderBindData(DeltaSnapshot & delta_snapshot): current_snapshot(delta_snapshot){ +// DeltaMultiFileReaderBindData::DeltaMultiFileReaderBindData(DeltaSnapshot & delta_snapshot): +// current_snapshot(delta_snapshot){ // //} TableFunctionSet DeltaFunctions::GetDeltaScanFunction(DatabaseInstance &instance) { - // The delta_scan function is constructed by grabbing the parquet scan from the Catalog, then injecting the - // DeltaMultiFileReader into it to create a Delta-based multi file read + // The delta_scan function is constructed by grabbing the parquet scan from the Catalog, then injecting the + // DeltaMultiFileReader into it to create a Delta-based multi file read - auto &parquet_scan = ExtensionUtil::GetTableFunction(instance, "parquet_scan"); - auto parquet_scan_copy = parquet_scan.functions; + auto &parquet_scan = ExtensionUtil::GetTableFunction(instance, "parquet_scan"); + auto parquet_scan_copy = parquet_scan.functions; - for (auto &function : parquet_scan_copy.functions) { - // Register the MultiFileReader as the driver for reads - function.get_multi_file_reader = DeltaMultiFileReader::CreateInstance; + for (auto &function : parquet_scan_copy.functions) { + // Register the MultiFileReader as the driver for reads + function.get_multi_file_reader = DeltaMultiFileReader::CreateInstance; - // Unset all of these: they are either broken, very inefficient. - // TODO: implement/fix these - function.serialize = nullptr; - function.deserialize = nullptr; - function.statistics = nullptr; - function.table_scan_progress = nullptr; - function.get_bind_info = nullptr; + // Unset all of these: they are either broken, very inefficient. + // TODO: implement/fix these + function.serialize = nullptr; + function.deserialize = nullptr; + function.statistics = nullptr; + function.table_scan_progress = nullptr; + function.get_bind_info = nullptr; - // Schema param is just confusing here - function.named_parameters.erase("schema"); + // Schema param is just confusing here + function.named_parameters.erase("schema"); - // Demonstration of a generated column based on information from DeltaSnapshot - function.named_parameters["delta_file_number"] = LogicalType::BOOLEAN; + // Demonstration of a generated column based on information from DeltaSnapshot + function.named_parameters["delta_file_number"] = LogicalType::BOOLEAN; - function.name = "delta_scan"; - } + function.name = "delta_scan"; + } - parquet_scan_copy.name = "delta_scan"; - return parquet_scan_copy; + parquet_scan_copy.name = "delta_scan"; + return parquet_scan_copy; } } // namespace duckdb diff --git a/src/include/delta_functions.hpp b/src/include/delta_functions.hpp index 390c593..4f819cb 100644 --- a/src/include/delta_functions.hpp +++ b/src/include/delta_functions.hpp @@ -14,9 +14,9 @@ namespace duckdb { class DeltaFunctions { public: - static vector GetTableFunctions(DatabaseInstance &instance); + static vector GetTableFunctions(DatabaseInstance &instance); private: - static TableFunctionSet GetDeltaScanFunction(DatabaseInstance &instance); + static TableFunctionSet GetDeltaScanFunction(DatabaseInstance &instance); }; } // namespace duckdb diff --git a/src/include/delta_kernel_ffi.hpp b/src/include/delta_kernel_ffi.hpp index 3c46bde..6f1401e 100644 --- a/src/include/delta_kernel_ffi.hpp +++ b/src/include/delta_kernel_ffi.hpp @@ -3,55 +3,55 @@ #include #include #include -#include #include +#include namespace ffi { enum class KernelError { - UnknownError, - FFIError, + UnknownError, + FFIError, #if (defined(DEFINE_DEFAULT_ENGINE) || defined(DEFINE_SYNC_ENGINE)) - ArrowError, + ArrowError, #endif - EngineDataTypeError, - ExtractError, - GenericError, - IOErrorError, + EngineDataTypeError, + ExtractError, + GenericError, + IOErrorError, #if (defined(DEFINE_DEFAULT_ENGINE) || defined(DEFINE_SYNC_ENGINE)) - ParquetError, + ParquetError, #endif #if defined(DEFINE_DEFAULT_ENGINE) - ObjectStoreError, + ObjectStoreError, #endif #if defined(DEFINE_DEFAULT_ENGINE) - ObjectStorePathError, + ObjectStorePathError, #endif #if defined(DEFINE_DEFAULT_ENGINE) - ReqwestError, + ReqwestError, #endif - FileNotFoundError, - MissingColumnError, - UnexpectedColumnTypeError, - MissingDataError, - MissingVersionError, - DeletionVectorError, - InvalidUrlError, - MalformedJsonError, - MissingMetadataError, - MissingProtocolError, - MissingMetadataAndProtocolError, - ParseError, - JoinFailureError, - Utf8Error, - ParseIntError, - InvalidColumnMappingModeError, - InvalidTableLocationError, - InvalidDecimalError, - InvalidStructDataError, - InternalError, - InvalidExpression, - InvalidLogPath, + FileNotFoundError, + MissingColumnError, + UnexpectedColumnTypeError, + MissingDataError, + MissingVersionError, + DeletionVectorError, + InvalidUrlError, + MalformedJsonError, + MissingMetadataError, + MissingProtocolError, + MissingMetadataAndProtocolError, + ParseError, + JoinFailureError, + Utf8Error, + ParseIntError, + InvalidColumnMappingModeError, + InvalidTableLocationError, + InvalidDecimalError, + InvalidStructDataError, + InternalError, + InvalidExpression, + InvalidLogPath, }; struct CStringMap; @@ -91,15 +91,15 @@ struct StringSliceIterator; /// receives a `KernelBoolSlice` as a return value from a kernel method, engine is responsible /// to free that slice, by calling [super::free_bool_slice] exactly once. struct KernelBoolSlice { - bool *ptr; - uintptr_t len; + bool *ptr; + uintptr_t len; }; /// An owned slice of u64 row indexes allocated by the kernel. The engine is responsible for /// freeing this slice by calling [super::free_row_indexes] once. struct KernelRowIndexArray { - uint64_t *ptr; - uintptr_t len; + uint64_t *ptr; + uintptr_t len; }; /// Represents an object that crosses the FFI boundary and which outlives the scope that created @@ -134,8 +134,8 @@ struct KernelRowIndexArray { /// NOTE: Because the underlying type is always [`Sync`], multi-threaded external code can /// freely access shared (non-mutable) handles. /// -template -using Handle = H*; +template +using Handle = H *; /// An error that can be returned to the engine. Engines that wish to associate additional /// information can define and use any type that is [pointer @@ -144,31 +144,31 @@ using Handle = H*; /// of a [standard layout](https://en.cppreference.com/w/cpp/language/data_members#Standard-layout) /// class. struct EngineError { - KernelError etype; + KernelError etype; }; /// Semantics: Kernel will always immediately return the leaked engine error to the engine (if it /// allocated one at all), and engine is responsible for freeing it. -template +template struct ExternResult { - enum class Tag { - Ok, - Err, - }; - - struct Ok_Body { - T _0; - }; - - struct Err_Body { - EngineError *_0; - }; - - Tag tag; - union { - Ok_Body ok; - Err_Body err; - }; + enum class Tag { + Ok, + Err, + }; + + struct Ok_Body { + T _0; + }; + + struct Err_Body { + EngineError *_0; + }; + + Tag tag; + union { + Ok_Body ok; + Err_Body err; + }; }; /// A non-owned slice of a UTF8 string, intended for arg-passing between kernel and engine. The @@ -193,32 +193,32 @@ struct ExternResult { /// wants_slice(msg.into()); /// ``` struct KernelStringSlice { - const char *ptr; - uintptr_t len; + const char *ptr; + uintptr_t len; }; -using AllocateErrorFn = EngineError*(*)(KernelError etype, KernelStringSlice msg); +using AllocateErrorFn = EngineError *(*)(KernelError etype, KernelStringSlice msg); -using NullableCvoid = void*; +using NullableCvoid = void *; /// Allow engines to allocate strings of their own type. the contract of calling a passed allocate /// function is that `kernel_str` is _only_ valid until the return from this function -using AllocateStringFn = NullableCvoid(*)(KernelStringSlice kernel_str); +using AllocateStringFn = NullableCvoid (*)(KernelStringSlice kernel_str); struct FileMeta { - KernelStringSlice path; - int64_t last_modified; - uintptr_t size; + KernelStringSlice path; + int64_t last_modified; + uintptr_t size; }; /// Model iterators. This allows an engine to specify iteration however it likes, and we simply wrap /// the engine functions. The engine retains ownership of the iterator. struct EngineIterator { - void *data; - /// A function that should advance the iterator and return the next time from the data - /// If the iterator is complete, it should return null. It should be safe to - /// call `get_next()` multiple times if it returns null. - const void *(*get_next)(void *data); + void *data; + /// A function that should advance the iterator and return the next time from the data + /// If the iterator is complete, it should return null. It should be safe to + /// call `get_next()` multiple times if it returns null. + const void *(*get_next)(void *data); }; /// ABI-compatible struct for ArrowArray from C Data Interface @@ -232,16 +232,16 @@ struct EngineIterator { /// } /// ``` struct FFI_ArrowArray { - int64_t length; - int64_t null_count; - int64_t offset; - int64_t n_buffers; - int64_t n_children; - const void **buffers; - FFI_ArrowArray **children; - FFI_ArrowArray *dictionary; - void (*release)(FFI_ArrowArray *arg1); - void *private_data; + int64_t length; + int64_t null_count; + int64_t offset; + int64_t n_buffers; + int64_t n_children; + const void **buffers; + FFI_ArrowArray **children; + FFI_ArrowArray *dictionary; + void (*release)(FFI_ArrowArray *arg1); + void *private_data; }; /// ABI-compatible struct for `ArrowSchema` from C Data Interface @@ -256,16 +256,16 @@ struct FFI_ArrowArray { /// ``` /// struct FFI_ArrowSchema { - const char *format; - const char *name; - const char *metadata; - /// Refer to [Arrow Flags](https://arrow.apache.org/docs/format/CDataInterface.html#c.ArrowSchema.flags) - int64_t flags; - int64_t n_children; - FFI_ArrowSchema **children; - FFI_ArrowSchema *dictionary; - void (*release)(FFI_ArrowSchema *arg1); - void *private_data; + const char *format; + const char *name; + const char *metadata; + /// Refer to [Arrow Flags](https://arrow.apache.org/docs/format/CDataInterface.html#c.ArrowSchema.flags) + int64_t flags; + int64_t n_children; + FFI_ArrowSchema **children; + FFI_ArrowSchema *dictionary; + void (*release)(FFI_ArrowSchema *arg1); + void *private_data; }; #if defined(DEFINE_DEFAULT_ENGINE) @@ -273,8 +273,8 @@ struct FFI_ArrowSchema { /// Interface](https://arrow.apache.org/docs/format/CDataInterface.html). This includes the data and /// the schema. struct ArrowFFIData { - FFI_ArrowArray array; - FFI_ArrowSchema schema; + FFI_ArrowArray array; + FFI_ArrowSchema schema; }; #endif @@ -289,39 +289,35 @@ struct ArrowFFIData { /// kernel each retain ownership of their respective objects, with no need to coordinate memory /// lifetimes with the other. struct EnginePredicate { - void *predicate; - uintptr_t (*visitor)(void *predicate, KernelExpressionVisitorState *state); + void *predicate; + uintptr_t (*visitor)(void *predicate, KernelExpressionVisitorState *state); }; /// Give engines an easy way to consume stats struct Stats { - /// For any file where the deletion vector is not present (see [`DvInfo::has_vector`]), the - /// `num_records` statistic must be present and accurate, and must equal the number of records - /// in the data file. In the presence of Deletion Vectors the statistics may be somewhat - /// outdated, i.e. not reflecting deleted rows yet. - uint64_t num_records; + /// For any file where the deletion vector is not present (see [`DvInfo::has_vector`]), the + /// `num_records` statistic must be present and accurate, and must equal the number of records + /// in the data file. In the presence of Deletion Vectors the statistics may be somewhat + /// outdated, i.e. not reflecting deleted rows yet. + uint64_t num_records; }; -using CScanCallback = void(*)(NullableCvoid engine_context, - KernelStringSlice path, - int64_t size, - const Stats *stats, - const DvInfo *dv_info, - const CStringMap *partition_map); +using CScanCallback = void (*)(NullableCvoid engine_context, KernelStringSlice path, int64_t size, const Stats *stats, + const DvInfo *dv_info, const CStringMap *partition_map); // This trickery is from https://github.com/mozilla/cbindgen/issues/402#issuecomment-578680163 struct im_an_unused_struct_that_tricks_msvc_into_compilation { - ExternResult field; - ExternResult field2; - ExternResult field3; - ExternResult> field4; - ExternResult> field5; - ExternResult field6; - ExternResult field7; - ExternResult> field8; - ExternResult> field9; - ExternResult> field10; - ExternResult field11; + ExternResult field; + ExternResult field2; + ExternResult field3; + ExternResult> field4; + ExternResult> field5; + ExternResult field6; + ExternResult field7; + ExternResult> field8; + ExternResult> field9; + ExternResult> field10; + ExternResult field11; }; /// The `EngineSchemaVisitor` defines a visitor system to allow engines to build their own @@ -350,61 +346,49 @@ struct im_an_unused_struct_that_tricks_msvc_into_compilation { /// that element's (already-visited) children. /// 4. The [`visit_schema`] method returns the id of the list of top-level columns struct EngineSchemaVisitor { - /// opaque state pointer - void *data; - /// Creates a new field list, optionally reserving capacity up front - uintptr_t (*make_field_list)(void *data, uintptr_t reserve); - /// Indicate that the schema contains a `Struct` type. The top level of a Schema is always a - /// `Struct`. The fields of the `Struct` are in the list identified by `child_list_id`. - void (*visit_struct)(void *data, - uintptr_t sibling_list_id, - KernelStringSlice name, - uintptr_t child_list_id); - /// Indicate that the schema contains an Array type. `child_list_id` will be a _one_ item list - /// with the array's element type - void (*visit_array)(void *data, - uintptr_t sibling_list_id, - KernelStringSlice name, - bool contains_null, - uintptr_t child_list_id); - /// Indicate that the schema contains an Map type. `child_list_id` will be a _two_ item list - /// where the first element is the map's key type and the second element is the - /// map's value type - void (*visit_map)(void *data, - uintptr_t sibling_list_id, - KernelStringSlice name, - bool value_contains_null, - uintptr_t child_list_id); - /// visit a `decimal` with the specified `precision` and `scale` - void (*visit_decimal)(void *data, - uintptr_t sibling_list_id, - KernelStringSlice name, - uint8_t precision, - uint8_t scale); - /// Visit a `string` belonging to the list identified by `sibling_list_id`. - void (*visit_string)(void *data, uintptr_t sibling_list_id, KernelStringSlice name); - /// Visit a `long` belonging to the list identified by `sibling_list_id`. - void (*visit_long)(void *data, uintptr_t sibling_list_id, KernelStringSlice name); - /// Visit an `integer` belonging to the list identified by `sibling_list_id`. - void (*visit_integer)(void *data, uintptr_t sibling_list_id, KernelStringSlice name); - /// Visit a `short` belonging to the list identified by `sibling_list_id`. - void (*visit_short)(void *data, uintptr_t sibling_list_id, KernelStringSlice name); - /// Visit a `byte` belonging to the list identified by `sibling_list_id`. - void (*visit_byte)(void *data, uintptr_t sibling_list_id, KernelStringSlice name); - /// Visit a `float` belonging to the list identified by `sibling_list_id`. - void (*visit_float)(void *data, uintptr_t sibling_list_id, KernelStringSlice name); - /// Visit a `double` belonging to the list identified by `sibling_list_id`. - void (*visit_double)(void *data, uintptr_t sibling_list_id, KernelStringSlice name); - /// Visit a `boolean` belonging to the list identified by `sibling_list_id`. - void (*visit_boolean)(void *data, uintptr_t sibling_list_id, KernelStringSlice name); - /// Visit `binary` belonging to the list identified by `sibling_list_id`. - void (*visit_binary)(void *data, uintptr_t sibling_list_id, KernelStringSlice name); - /// Visit a `date` belonging to the list identified by `sibling_list_id`. - void (*visit_date)(void *data, uintptr_t sibling_list_id, KernelStringSlice name); - /// Visit a `timestamp` belonging to the list identified by `sibling_list_id`. - void (*visit_timestamp)(void *data, uintptr_t sibling_list_id, KernelStringSlice name); - /// Visit a `timestamp` with no timezone belonging to the list identified by `sibling_list_id`. - void (*visit_timestamp_ntz)(void *data, uintptr_t sibling_list_id, KernelStringSlice name); + /// opaque state pointer + void *data; + /// Creates a new field list, optionally reserving capacity up front + uintptr_t (*make_field_list)(void *data, uintptr_t reserve); + /// Indicate that the schema contains a `Struct` type. The top level of a Schema is always a + /// `Struct`. The fields of the `Struct` are in the list identified by `child_list_id`. + void (*visit_struct)(void *data, uintptr_t sibling_list_id, KernelStringSlice name, uintptr_t child_list_id); + /// Indicate that the schema contains an Array type. `child_list_id` will be a _one_ item list + /// with the array's element type + void (*visit_array)(void *data, uintptr_t sibling_list_id, KernelStringSlice name, bool contains_null, + uintptr_t child_list_id); + /// Indicate that the schema contains an Map type. `child_list_id` will be a _two_ item list + /// where the first element is the map's key type and the second element is the + /// map's value type + void (*visit_map)(void *data, uintptr_t sibling_list_id, KernelStringSlice name, bool value_contains_null, + uintptr_t child_list_id); + /// visit a `decimal` with the specified `precision` and `scale` + void (*visit_decimal)(void *data, uintptr_t sibling_list_id, KernelStringSlice name, uint8_t precision, + uint8_t scale); + /// Visit a `string` belonging to the list identified by `sibling_list_id`. + void (*visit_string)(void *data, uintptr_t sibling_list_id, KernelStringSlice name); + /// Visit a `long` belonging to the list identified by `sibling_list_id`. + void (*visit_long)(void *data, uintptr_t sibling_list_id, KernelStringSlice name); + /// Visit an `integer` belonging to the list identified by `sibling_list_id`. + void (*visit_integer)(void *data, uintptr_t sibling_list_id, KernelStringSlice name); + /// Visit a `short` belonging to the list identified by `sibling_list_id`. + void (*visit_short)(void *data, uintptr_t sibling_list_id, KernelStringSlice name); + /// Visit a `byte` belonging to the list identified by `sibling_list_id`. + void (*visit_byte)(void *data, uintptr_t sibling_list_id, KernelStringSlice name); + /// Visit a `float` belonging to the list identified by `sibling_list_id`. + void (*visit_float)(void *data, uintptr_t sibling_list_id, KernelStringSlice name); + /// Visit a `double` belonging to the list identified by `sibling_list_id`. + void (*visit_double)(void *data, uintptr_t sibling_list_id, KernelStringSlice name); + /// Visit a `boolean` belonging to the list identified by `sibling_list_id`. + void (*visit_boolean)(void *data, uintptr_t sibling_list_id, KernelStringSlice name); + /// Visit `binary` belonging to the list identified by `sibling_list_id`. + void (*visit_binary)(void *data, uintptr_t sibling_list_id, KernelStringSlice name); + /// Visit a `date` belonging to the list identified by `sibling_list_id`. + void (*visit_date)(void *data, uintptr_t sibling_list_id, KernelStringSlice name); + /// Visit a `timestamp` belonging to the list identified by `sibling_list_id`. + void (*visit_timestamp)(void *data, uintptr_t sibling_list_id, KernelStringSlice name); + /// Visit a `timestamp` with no timezone belonging to the list identified by `sibling_list_id`. + void (*visit_timestamp_ntz)(void *data, uintptr_t sibling_list_id, KernelStringSlice name); }; extern "C" { @@ -433,8 +417,7 @@ void free_engine_data(Handle engine_data); /// /// # Safety /// Caller is responsible for passing a valid path pointer. -ExternResult get_engine_builder(KernelStringSlice path, - AllocateErrorFn allocate_error); +ExternResult get_engine_builder(KernelStringSlice path, AllocateErrorFn allocate_error); #endif #if defined(DEFINE_DEFAULT_ENGINE) @@ -461,8 +444,7 @@ ExternResult> builder_build(EngineBuilder *builder); /// # Safety /// /// Caller is responsible for passing a valid path pointer. -ExternResult> get_default_engine(KernelStringSlice path, - AllocateErrorFn allocate_error); +ExternResult> get_default_engine(KernelStringSlice path, AllocateErrorFn allocate_error); #endif #if defined(DEFINE_SYNC_ENGINE) @@ -482,8 +464,7 @@ void free_engine(Handle engine); /// # Safety /// /// Caller is responsible for passing valid handles and path pointer. -ExternResult> snapshot(KernelStringSlice path, - Handle engine); +ExternResult> snapshot(KernelStringSlice path, Handle engine); /// # Safety /// @@ -509,8 +490,7 @@ NullableCvoid snapshot_table_root(Handle snapshot, AllocateStrin /// /// The iterator must be valid (returned by [kernel_scan_data_init]) and not yet freed by /// [kernel_scan_data_free]. The visitor function pointer must be non-null. -bool string_slice_next(Handle data, - NullableCvoid engine_context, +bool string_slice_next(Handle data, NullableCvoid engine_context, void (*engine_visitor)(NullableCvoid engine_context, KernelStringSlice slice)); /// # Safety @@ -527,8 +507,7 @@ void free_string_slice_data(Handle data); /// /// The iterator must be valid (returned by [`read_parquet_file`]) and not yet freed by /// [`free_read_result_iter`]. The visitor function pointer must be non-null. -ExternResult read_result_next(Handle data, - NullableCvoid engine_context, +ExternResult read_result_next(Handle data, NullableCvoid engine_context, void (*engine_visitor)(NullableCvoid engine_context, Handle engine_data)); @@ -543,9 +522,8 @@ void free_read_result_iter(Handle data); /// /// # Safety /// Caller is responsible for calling with a valid `ExternEngineHandle` and `FileMeta` -ExternResult> read_parquet_file(Handle engine, - const FileMeta *file, - Handle physical_schema); +ExternResult> +read_parquet_file(Handle engine, const FileMeta *file, Handle physical_schema); uintptr_t visit_expression_and(KernelExpressionVisitorState *state, EngineIterator *children); @@ -561,8 +539,7 @@ uintptr_t visit_expression_eq(KernelExpressionVisitorState *state, uintptr_t a, /// # Safety /// The string slice must be valid -ExternResult visit_expression_column(KernelExpressionVisitorState *state, - KernelStringSlice name, +ExternResult visit_expression_column(KernelExpressionVisitorState *state, KernelStringSlice name, AllocateErrorFn allocate_error); uintptr_t visit_expression_not(KernelExpressionVisitorState *state, uintptr_t inner_expr); @@ -571,8 +548,7 @@ uintptr_t visit_expression_is_null(KernelExpressionVisitorState *state, uintptr_ /// # Safety /// The string slice must be valid -ExternResult visit_expression_literal_string(KernelExpressionVisitorState *state, - KernelStringSlice value, +ExternResult visit_expression_literal_string(KernelExpressionVisitorState *state, KernelStringSlice value, AllocateErrorFn allocate_error); uintptr_t visit_expression_literal_int(KernelExpressionVisitorState *state, int32_t value); @@ -612,8 +588,7 @@ void *get_raw_engine_data(Handle data); /// # Safety /// data_handle must be a valid ExclusiveEngineData as read by the /// [`delta_kernel::engine::default::DefaultEngine`] obtained from `get_default_engine`. -ExternResult get_raw_arrow_data(Handle data, - Handle engine); +ExternResult get_raw_arrow_data(Handle data, Handle engine); #endif /// Drops a scan. @@ -625,8 +600,7 @@ void free_scan(Handle scan); /// # Safety /// /// Caller is responsible for passing a valid snapshot pointer, and engine pointer -ExternResult> scan(Handle snapshot, - Handle engine, +ExternResult> scan(Handle snapshot, Handle engine, EnginePredicate *predicate); /// Get the global state for a scan. See the docs for [`delta_kernel::scan::state::GlobalScanState`] @@ -680,8 +654,7 @@ ExternResult> kernel_scan_data_init(Handle kernel_scan_data_next(Handle data, - NullableCvoid engine_context, +ExternResult kernel_scan_data_next(Handle data, NullableCvoid engine_context, void (*engine_visitor)(NullableCvoid engine_context, Handle engine_data, KernelBoolSlice selection_vector)); @@ -699,24 +672,20 @@ void free_kernel_scan_data(Handle data); /// # Safety /// /// The engine is responsible for providing a valid [`CStringMap`] pointer and [`KernelStringSlice`] -NullableCvoid get_from_map(const CStringMap *map, - KernelStringSlice key, - AllocateStringFn allocate_fn); +NullableCvoid get_from_map(const CStringMap *map, KernelStringSlice key, AllocateStringFn allocate_fn); /// Get a selection vector out of a [`DvInfo`] struct /// /// # Safety /// Engine is responsible for providing valid pointers for each argument -ExternResult selection_vector_from_dv(const DvInfo *dv_info, - Handle engine, +ExternResult selection_vector_from_dv(const DvInfo *dv_info, Handle engine, Handle state); /// Get a vector of row indexes out of a [`DvInfo`] struct /// /// # Safety /// Engine is responsible for providing valid pointers for each argument -ExternResult row_indexes_from_dv(const DvInfo *dv_info, - Handle engine, +ExternResult row_indexes_from_dv(const DvInfo *dv_info, Handle engine, Handle state); /// Shim for ffi to call visit_scan_data. This will generally be called when iterating through scan @@ -724,9 +693,7 @@ ExternResult row_indexes_from_dv(const DvInfo *dv_info, /// /// # Safety /// engine is responsbile for passing a valid [`ExclusiveEngineData`] and selection vector. -void visit_scan_data(Handle data, - KernelBoolSlice selection_vec, - NullableCvoid engine_context, +void visit_scan_data(Handle data, KernelBoolSlice selection_vec, NullableCvoid engine_context, CScanCallback callback); /// Visit the schema of the passed `SnapshotHandle`, using the provided `visitor`. See the @@ -739,6 +706,6 @@ void visit_scan_data(Handle data, /// Caller is responsible for passing a valid snapshot handle and schema visitor. uintptr_t visit_schema(Handle snapshot, EngineSchemaVisitor *visitor); -} // extern "C" +} // extern "C" -} // namespace ffi +} // namespace ffi diff --git a/src/include/delta_utils.hpp b/src/include/delta_utils.hpp index 9b33c5c..23b87f4 100644 --- a/src/include/delta_utils.hpp +++ b/src/include/delta_utils.hpp @@ -1,11 +1,12 @@ #pragma once #include "delta_kernel_ffi.hpp" -#include "duckdb/planner/filter/constant_filter.hpp" -#include "duckdb/planner/filter/conjunction_filter.hpp" #include "duckdb/common/enum_util.hpp" -#include +#include "duckdb/planner/filter/conjunction_filter.hpp" +#include "duckdb/planner/filter/constant_filter.hpp" + #include +#include // TODO: clean up this file as we go @@ -14,48 +15,52 @@ namespace duckdb { // SchemaVisitor is used to parse the schema of a Delta table from the Kernel class SchemaVisitor { public: - using FieldList = child_list_t; + using FieldList = child_list_t; - static unique_ptr VisitSnapshotSchema(ffi::SharedSnapshot* snapshot); + static unique_ptr VisitSnapshotSchema(ffi::SharedSnapshot *snapshot); private: - unordered_map> inflight_lists; - uintptr_t next_id = 1; - - typedef void (SimpleTypeVisitorFunction)(void*, uintptr_t, ffi::KernelStringSlice); - - template - static SimpleTypeVisitorFunction* VisitSimpleType() { - return (SimpleTypeVisitorFunction*) &VisitSimpleTypeImpl; - } - template - static void VisitSimpleTypeImpl(SchemaVisitor* state, uintptr_t sibling_list_id, ffi::KernelStringSlice name) { - state->AppendToList(sibling_list_id, name, TypeId); - } - - static void VisitDecimal(SchemaVisitor* state, uintptr_t sibling_list_id, ffi::KernelStringSlice name, uint8_t precision, uint8_t scale); - static uintptr_t MakeFieldList(SchemaVisitor* state, uintptr_t capacity_hint); - static void VisitStruct(SchemaVisitor* state, uintptr_t sibling_list_id, ffi::KernelStringSlice name, uintptr_t child_list_id); - static void VisitArray(SchemaVisitor* state, uintptr_t sibling_list_id, ffi::KernelStringSlice name, bool contains_null, uintptr_t child_list_id); - static void VisitMap(SchemaVisitor* state, uintptr_t sibling_list_id, ffi::KernelStringSlice name, bool contains_null, uintptr_t child_list_id); - - uintptr_t MakeFieldListImpl(uintptr_t capacity_hint); - void AppendToList(uintptr_t id, ffi::KernelStringSlice name, LogicalType&& child); - unique_ptr TakeFieldList(uintptr_t id); + unordered_map> inflight_lists; + uintptr_t next_id = 1; + + typedef void(SimpleTypeVisitorFunction)(void *, uintptr_t, ffi::KernelStringSlice); + + template + static SimpleTypeVisitorFunction *VisitSimpleType() { + return (SimpleTypeVisitorFunction *)&VisitSimpleTypeImpl; + } + template + static void VisitSimpleTypeImpl(SchemaVisitor *state, uintptr_t sibling_list_id, ffi::KernelStringSlice name) { + state->AppendToList(sibling_list_id, name, TypeId); + } + + static void VisitDecimal(SchemaVisitor *state, uintptr_t sibling_list_id, ffi::KernelStringSlice name, + uint8_t precision, uint8_t scale); + static uintptr_t MakeFieldList(SchemaVisitor *state, uintptr_t capacity_hint); + static void VisitStruct(SchemaVisitor *state, uintptr_t sibling_list_id, ffi::KernelStringSlice name, + uintptr_t child_list_id); + static void VisitArray(SchemaVisitor *state, uintptr_t sibling_list_id, ffi::KernelStringSlice name, + bool contains_null, uintptr_t child_list_id); + static void VisitMap(SchemaVisitor *state, uintptr_t sibling_list_id, ffi::KernelStringSlice name, + bool contains_null, uintptr_t child_list_id); + + uintptr_t MakeFieldListImpl(uintptr_t capacity_hint); + void AppendToList(uintptr_t id, ffi::KernelStringSlice name, LogicalType &&child); + unique_ptr TakeFieldList(uintptr_t id); }; // Allocator for errors that the kernel might throw struct DuckDBEngineError : ffi::EngineError { - // Allocate a DuckDBEngineError, function ptr passed to kernel for error allocation - static ffi::EngineError* AllocateError(ffi::KernelError etype, ffi::KernelStringSlice msg); - // Convert a kernel error enum to a string - static string KernelErrorEnumToString(ffi::KernelError err); + // Allocate a DuckDBEngineError, function ptr passed to kernel for error allocation + static ffi::EngineError *AllocateError(ffi::KernelError etype, ffi::KernelStringSlice msg); + // Convert a kernel error enum to a string + static string KernelErrorEnumToString(ffi::KernelError err); - // Throw the error as an IOException - [[noreturn]] void Throw(string from_info); + // Throw the error as an IOException + [[noreturn]] void Throw(string from_info); - // The error message from Kernel - string error_message; + // The error message from Kernel + string error_message; }; // RAII wrapper that returns ownership of a kernel pointer to kernel when it goes out of @@ -63,43 +68,45 @@ struct DuckDBEngineError : ffi::EngineError { // kernel type to be complete. template struct UniqueKernelPointer { - UniqueKernelPointer() : ptr(nullptr), free(nullptr) {} - - // Takes ownership of a pointer with associated deleter. - UniqueKernelPointer(KernelType* ptr, void (*free)(KernelType*)) : ptr(ptr), free(free) {} - - // movable but not copyable - UniqueKernelPointer(UniqueKernelPointer&& other) : ptr(other.ptr) { - other.ptr = nullptr; - } - UniqueKernelPointer& operator=(UniqueKernelPointer&& other) { - std::swap(ptr, other.ptr); - std::swap(free, other.free); - return *this; - } - UniqueKernelPointer(const UniqueKernelPointer&) = delete; - UniqueKernelPointer& operator=(const UniqueKernelPointer&) = delete; - - ~UniqueKernelPointer() { - if (ptr && free) { - free(ptr); - } - } - - KernelType* get() const { return ptr; } + UniqueKernelPointer() : ptr(nullptr), free(nullptr) { + } + + // Takes ownership of a pointer with associated deleter. + UniqueKernelPointer(KernelType *ptr, void (*free)(KernelType *)) : ptr(ptr), free(free) { + } + + // movable but not copyable + UniqueKernelPointer(UniqueKernelPointer &&other) : ptr(other.ptr) { + other.ptr = nullptr; + } + UniqueKernelPointer &operator=(UniqueKernelPointer &&other) { + std::swap(ptr, other.ptr); + std::swap(free, other.free); + return *this; + } + UniqueKernelPointer(const UniqueKernelPointer &) = delete; + UniqueKernelPointer &operator=(const UniqueKernelPointer &) = delete; + + ~UniqueKernelPointer() { + if (ptr && free) { + free(ptr); + } + } + + KernelType *get() const { + return ptr; + } private: - KernelType* ptr; - void (*free)(KernelType*) = nullptr; + KernelType *ptr; + void (*free)(KernelType *) = nullptr; }; // Syntactic sugar around the different kernel types -template +template struct TemplatedUniqueKernelPointer : public UniqueKernelPointer { - TemplatedUniqueKernelPointer() : UniqueKernelPointer() { - }; - TemplatedUniqueKernelPointer(KernelType* ptr) : UniqueKernelPointer(ptr, DeleteFunction) { - }; + TemplatedUniqueKernelPointer() : UniqueKernelPointer() {}; + TemplatedUniqueKernelPointer(KernelType *ptr) : UniqueKernelPointer(ptr, DeleteFunction) {}; }; typedef TemplatedUniqueKernelPointer KernelSnapshot; @@ -109,43 +116,46 @@ typedef TemplatedUniqueKernelPointer KernelScanDataIterator; struct KernelUtils { - static ffi::KernelStringSlice ToDeltaString(const string &str); - static string FromDeltaString(const struct ffi::KernelStringSlice slice); - static vector FromDeltaBoolSlice(const struct ffi::KernelBoolSlice slice); - - // TODO: all kernel results need to be unpacked, not doing so will result in an error. This should be cleaned up - template - static T UnpackResult(ffi::ExternResult result, const string &from_where) { - if (result.tag == ffi::ExternResult::Tag::Err) { - if (result.err._0){ - auto error_cast = static_cast(result.err._0); - error_cast->Throw(from_where); - } else { - throw IOException("Hit DeltaKernel FFI error (from: %s): Hit error, but error was nullptr", from_where.c_str()); - } - } else if (result.tag == ffi::ExternResult::Tag::Ok) { - return result.ok._0; - } - throw IOException("Invalid error ExternResult tag found!"); - } + static ffi::KernelStringSlice ToDeltaString(const string &str); + static string FromDeltaString(const struct ffi::KernelStringSlice slice); + static vector FromDeltaBoolSlice(const struct ffi::KernelBoolSlice slice); + + // TODO: all kernel results need to be unpacked, not doing so will result in an error. This should be cleaned up + template + static T UnpackResult(ffi::ExternResult result, const string &from_where) { + if (result.tag == ffi::ExternResult::Tag::Err) { + if (result.err._0) { + auto error_cast = static_cast(result.err._0); + error_cast->Throw(from_where); + } else { + throw IOException("Hit DeltaKernel FFI error (from: %s): Hit error, but error was nullptr", + from_where.c_str()); + } + } else if (result.tag == ffi::ExternResult::Tag::Ok) { + return result.ok._0; + } + throw IOException("Invalid error ExternResult tag found!"); + } }; class PredicateVisitor : public ffi::EnginePredicate { public: - PredicateVisitor(const vector &column_names, optional_ptr filters); + PredicateVisitor(const vector &column_names, optional_ptr filters); private: - unordered_map column_filters; + unordered_map column_filters; - static uintptr_t VisitPredicate(PredicateVisitor* predicate, ffi::KernelExpressionVisitorState* state); + static uintptr_t VisitPredicate(PredicateVisitor *predicate, ffi::KernelExpressionVisitorState *state); - uintptr_t VisitConstantFilter(const string &col_name, const ConstantFilter &filter, ffi::KernelExpressionVisitorState* state); - uintptr_t VisitAndFilter(const string &col_name, const ConjunctionAndFilter &filter, ffi::KernelExpressionVisitorState* state); + uintptr_t VisitConstantFilter(const string &col_name, const ConstantFilter &filter, + ffi::KernelExpressionVisitorState *state); + uintptr_t VisitAndFilter(const string &col_name, const ConjunctionAndFilter &filter, + ffi::KernelExpressionVisitorState *state); - uintptr_t VisitIsNull(const string &col_name, ffi::KernelExpressionVisitorState* state); - uintptr_t VisitIsNotNull(const string &col_name, ffi::KernelExpressionVisitorState* state); + uintptr_t VisitIsNull(const string &col_name, ffi::KernelExpressionVisitorState *state); + uintptr_t VisitIsNotNull(const string &col_name, ffi::KernelExpressionVisitorState *state); - uintptr_t VisitFilter(const string &col_name, const TableFilter &filter, ffi::KernelExpressionVisitorState* state); + uintptr_t VisitFilter(const string &col_name, const TableFilter &filter, ffi::KernelExpressionVisitorState *state); }; } // namespace duckdb diff --git a/src/include/functions/delta_scan.hpp b/src/include/functions/delta_scan.hpp index aac35cc..936de5a 100644 --- a/src/include/functions/delta_scan.hpp +++ b/src/include/functions/delta_scan.hpp @@ -14,133 +14,137 @@ namespace duckdb { struct DeltaFileMetaData { - DeltaFileMetaData() {}; - - // No copying pls - DeltaFileMetaData (const DeltaFileMetaData&) = delete; - DeltaFileMetaData& operator= (const DeltaFileMetaData&) = delete; - - ~DeltaFileMetaData() { - if (selection_vector.ptr) { - ffi::free_bool_slice(selection_vector); - } - } - - idx_t delta_snapshot_version = DConstants::INVALID_INDEX; - idx_t file_number = DConstants::INVALID_INDEX; - idx_t cardinality = DConstants::INVALID_INDEX; - ffi::KernelBoolSlice selection_vector = {nullptr, 0}; - case_insensitive_map_t partition_map; + DeltaFileMetaData() {}; + + // No copying pls + DeltaFileMetaData(const DeltaFileMetaData &) = delete; + DeltaFileMetaData &operator=(const DeltaFileMetaData &) = delete; + + ~DeltaFileMetaData() { + if (selection_vector.ptr) { + ffi::free_bool_slice(selection_vector); + } + } + + idx_t delta_snapshot_version = DConstants::INVALID_INDEX; + idx_t file_number = DConstants::INVALID_INDEX; + idx_t cardinality = DConstants::INVALID_INDEX; + ffi::KernelBoolSlice selection_vector = {nullptr, 0}; + case_insensitive_map_t partition_map; }; //! The DeltaSnapshot implements the MultiFileList API to allow injecting it into the regular DuckDB parquet scan struct DeltaSnapshot : public MultiFileList { - DeltaSnapshot(ClientContext &context, const string &path); - string GetPath(); - static string ToDuckDBPath(const string &raw_path); - static string ToDeltaPath(const string &raw_path); + DeltaSnapshot(ClientContext &context, const string &path); + string GetPath(); + static string ToDuckDBPath(const string &raw_path); + static string ToDeltaPath(const string &raw_path); - //! MultiFileList API + //! MultiFileList API public: - void Bind(vector &return_types, vector &names); - unique_ptr ComplexFilterPushdown(ClientContext &context, - const MultiFileReaderOptions &options, MultiFilePushdownInfo &info, - vector> &filters) override; - vector GetAllFiles() override; - FileExpandResult GetExpandResult() override; - idx_t GetTotalFileCount() override; + void Bind(vector &return_types, vector &names); + unique_ptr ComplexFilterPushdown(ClientContext &context, const MultiFileReaderOptions &options, + MultiFilePushdownInfo &info, + vector> &filters) override; + vector GetAllFiles() override; + FileExpandResult GetExpandResult() override; + idx_t GetTotalFileCount() override; - unique_ptr GetCardinality(ClientContext &context) override; + unique_ptr GetCardinality(ClientContext &context) override; protected: - //! Get the i-th expanded file - string GetFile(idx_t i) override; + //! Get the i-th expanded file + string GetFile(idx_t i) override; protected: - // TODO: How to guarantee we only call this after the filter pushdown? - void InitializeFiles(); + // TODO: How to guarantee we only call this after the filter pushdown? + void InitializeFiles(); - template - T TryUnpackKernelResult(ffi::ExternResult result) { - return KernelUtils::UnpackResult(result, StringUtil::Format("While trying to read from delta table: '%s'", paths[0])); - } + template + T TryUnpackKernelResult(ffi::ExternResult result) { + return KernelUtils::UnpackResult( + result, StringUtil::Format("While trying to read from delta table: '%s'", paths[0])); + } -// TODO: change back to protected + // TODO: change back to protected public: - idx_t version; + idx_t version; - //! Delta Kernel Structures - KernelSnapshot snapshot; - KernelExternEngine extern_engine; - KernelScan scan; - KernelGlobalScanState global_state; - KernelScanDataIterator scan_data_iterator; + //! Delta Kernel Structures + KernelSnapshot snapshot; + KernelExternEngine extern_engine; + KernelScan scan; + KernelGlobalScanState global_state; + KernelScanDataIterator scan_data_iterator; - //! Names - vector names; + //! Names + vector names; - //! Metadata map for files - vector> metadata; + //! Metadata map for files + vector> metadata; - //! Current file list resolution state - bool initialized = false; - bool files_exhausted = false; - vector resolved_files; - TableFilterSet table_filters; + //! Current file list resolution state + bool initialized = false; + bool files_exhausted = false; + vector resolved_files; + TableFilterSet table_filters; - ClientContext &context; + ClientContext &context; }; struct DeltaMultiFileReaderGlobalState : public MultiFileReaderGlobalState { - DeltaMultiFileReaderGlobalState(vector extra_columns_p, optional_ptr file_list_p) : MultiFileReaderGlobalState(extra_columns_p, file_list_p) { - } - //! The idx of the file number column in the result chunk - idx_t delta_file_number_idx = DConstants::INVALID_INDEX; - //! The idx of the file_row_number column in the result chunk - idx_t file_row_number_idx = DConstants::INVALID_INDEX; - - void SetColumnIdx(const string &column, idx_t idx); + DeltaMultiFileReaderGlobalState(vector extra_columns_p, optional_ptr file_list_p) + : MultiFileReaderGlobalState(extra_columns_p, file_list_p) { + } + //! The idx of the file number column in the result chunk + idx_t delta_file_number_idx = DConstants::INVALID_INDEX; + //! The idx of the file_row_number column in the result chunk + idx_t file_row_number_idx = DConstants::INVALID_INDEX; + + void SetColumnIdx(const string &column, idx_t idx); }; struct DeltaMultiFileReader : public MultiFileReader { - static unique_ptr CreateInstance(); - //! Return a DeltaSnapshot - unique_ptr CreateFileList(ClientContext &context, const vector &paths, - FileGlobOptions options) override; - - //! Override the regular parquet bind using the MultiFileReader Bind. The bind from these are what DuckDB's file - //! readers will try read - bool Bind(MultiFileReaderOptions &options, MultiFileList &files, - vector &return_types, vector &names, MultiFileReaderBindData &bind_data) override; - - //! Override the Options bind - void BindOptions(MultiFileReaderOptions &options, MultiFileList &files, - vector &return_types, vector &names, MultiFileReaderBindData& bind_data) override; - - void CreateNameMapping(const string &file_name, const vector &local_types, - const vector &local_names, const vector &global_types, - const vector &global_names, const vector &global_column_ids, - MultiFileReaderData &reader_data, const string &initial_file, - optional_ptr global_state) override; - - unique_ptr InitializeGlobalState(ClientContext &context, const MultiFileReaderOptions &file_options, - const MultiFileReaderBindData &bind_data, const MultiFileList &file_list, - const vector &global_types, const vector &global_names, - const vector &global_column_ids) override; - - void FinalizeBind(const MultiFileReaderOptions &file_options, const MultiFileReaderBindData &options, - const string &filename, const vector &local_names, - const vector &global_types, const vector &global_names, - const vector &global_column_ids, MultiFileReaderData &reader_data, - ClientContext &context, optional_ptr global_state) override; - - //! Override the FinalizeChunk method - void FinalizeChunk(ClientContext &context, const MultiFileReaderBindData &bind_data, - const MultiFileReaderData &reader_data, DataChunk &chunk, optional_ptr global_state) override; - - //! Override the ParseOption call to parse delta_scan specific options - bool ParseOption(const string &key, const Value &val, MultiFileReaderOptions &options, - ClientContext &context) override; + static unique_ptr CreateInstance(); + //! Return a DeltaSnapshot + unique_ptr CreateFileList(ClientContext &context, const vector &paths, + FileGlobOptions options) override; + + //! Override the regular parquet bind using the MultiFileReader Bind. The bind from these are what DuckDB's file + //! readers will try read + bool Bind(MultiFileReaderOptions &options, MultiFileList &files, vector &return_types, + vector &names, MultiFileReaderBindData &bind_data) override; + + //! Override the Options bind + void BindOptions(MultiFileReaderOptions &options, MultiFileList &files, vector &return_types, + vector &names, MultiFileReaderBindData &bind_data) override; + + void CreateNameMapping(const string &file_name, const vector &local_types, + const vector &local_names, const vector &global_types, + const vector &global_names, const vector &global_column_ids, + MultiFileReaderData &reader_data, const string &initial_file, + optional_ptr global_state) override; + + unique_ptr + InitializeGlobalState(ClientContext &context, const MultiFileReaderOptions &file_options, + const MultiFileReaderBindData &bind_data, const MultiFileList &file_list, + const vector &global_types, const vector &global_names, + const vector &global_column_ids) override; + + void FinalizeBind(const MultiFileReaderOptions &file_options, const MultiFileReaderBindData &options, + const string &filename, const vector &local_names, + const vector &global_types, const vector &global_names, + const vector &global_column_ids, MultiFileReaderData &reader_data, + ClientContext &context, optional_ptr global_state) override; + + //! Override the FinalizeChunk method + void FinalizeChunk(ClientContext &context, const MultiFileReaderBindData &bind_data, + const MultiFileReaderData &reader_data, DataChunk &chunk, + optional_ptr global_state) override; + + //! Override the ParseOption call to parse delta_scan specific options + bool ParseOption(const string &key, const Value &val, MultiFileReaderOptions &options, + ClientContext &context) override; }; } // namespace duckdb From 3cfc758eec2a088d581ff39928f7cc4936646ca2 Mon Sep 17 00:00:00 2001 From: Carlo Piovesan Date: Mon, 11 Nov 2024 15:37:46 +0100 Subject: [PATCH 5/8] autoload('parquet') since it's needed for scan_parquet --- src/functions/delta_scan.cpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/functions/delta_scan.cpp b/src/functions/delta_scan.cpp index 3bf4105..098ed23 100644 --- a/src/functions/delta_scan.cpp +++ b/src/functions/delta_scan.cpp @@ -6,6 +6,7 @@ #include "duckdb/common/types/data_chunk.hpp" #include "duckdb/execution/expression_executor.hpp" #include "duckdb/function/table_function.hpp" +#include "duckdb/main/extension_helper.hpp" #include "duckdb/main/extension_util.hpp" #include "duckdb/main/secret/secret_manager.hpp" #include "duckdb/optimizer/filter_combiner.hpp" @@ -937,9 +938,11 @@ bool DeltaMultiFileReader::ParseOption(const string &key, const Value &val, Mult //} TableFunctionSet DeltaFunctions::GetDeltaScanFunction(DatabaseInstance &instance) { + // Parquet extension needs to be loaded for this to make sense + ExtensionHelper::AutoLoadExtension(instance, "parquet"); + // The delta_scan function is constructed by grabbing the parquet scan from the Catalog, then injecting the // DeltaMultiFileReader into it to create a Delta-based multi file read - auto &parquet_scan = ExtensionUtil::GetTableFunction(instance, "parquet_scan"); auto parquet_scan_copy = parquet_scan.functions; From 296f51494d7a40041ae4ccd79fe12795066161ee Mon Sep 17 00:00:00 2001 From: Sam Ansmink Date: Tue, 12 Nov 2024 12:22:56 +0100 Subject: [PATCH 6/8] format --- CMakeLists.txt | 226 ++++++++++-------- src/delta_extension.cpp | 34 +-- src/delta_utils.cpp | 12 +- src/functions/delta_scan.cpp | 182 +++++++------- src/functions/expression_functions.cpp | 46 ++++ src/include/delta_utils.hpp | 77 +++--- src/include/functions/delta_scan.hpp | 60 ++--- .../functions/expression_functions.hpp | 0 src/include/storage/delta_catalog.hpp | 14 +- src/include/storage/delta_schema_entry.hpp | 10 +- src/include/storage/delta_table_entry.hpp | 4 +- src/include/storage/delta_transaction.hpp | 9 +- src/storage/delta_catalog.cpp | 63 ++--- src/storage/delta_schema_entry.cpp | 125 +++++----- src/storage/delta_table_entry.cpp | 11 +- src/storage/delta_transaction.cpp | 4 +- 16 files changed, 487 insertions(+), 390 deletions(-) create mode 100644 src/functions/expression_functions.cpp create mode 100644 src/include/functions/expression_functions.hpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 351f307..50df657 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,7 +1,7 @@ cmake_minimum_required(VERSION 2.8.12) include(ExternalProject) -### Core config +# Core config set(TARGET_NAME delta) set(EXTENSION_NAME ${TARGET_NAME}_extension) @@ -12,30 +12,43 @@ project(${TARGET_NAME}) include_directories(src/include) set(EXTENSION_SOURCES - src/delta_extension.cpp - src/delta_functions.cpp - src/delta_utils.cpp - src/functions/delta_scan.cpp - src/storage/delta_catalog.cpp - src/storage/delta_schema_entry.cpp - src/storage/delta_table_entry.cpp - src/storage/delta_transaction.cpp - src/storage/delta_transaction_manager.cpp -) - -### Custom config -# TODO: figure out if we really need this? + src/delta_extension.cpp + src/delta_functions.cpp + src/delta_utils.cpp + src/functions/delta_scan.cpp + src/storage/delta_catalog.cpp + src/storage/delta_schema_entry.cpp + src/storage/delta_table_entry.cpp + src/storage/delta_transaction.cpp + src/storage/delta_transaction_manager.cpp) + +# Custom config TODO: figure out if we really need this? if(APPLE) - set(PLATFORM_LIBS m c System resolv "-framework Corefoundation -framework SystemConfiguration -framework Security") + set(PLATFORM_LIBS + m + c + System + resolv + "-framework Corefoundation -framework SystemConfiguration -framework Security" + ) elseif(UNIX) - set(PLATFORM_LIBS m c resolv) + set(PLATFORM_LIBS m c resolv) elseif(WIN32) - set(PLATFORM_LIBS ntdll ncrypt secur32 ws2_32 userenv bcrypt msvcrt advapi32 RuntimeObject) + set(PLATFORM_LIBS + ntdll + ncrypt + secur32 + ws2_32 + userenv + bcrypt + msvcrt + advapi32 + RuntimeObject) else() - message(STATUS "UNKNOWN OS") + message(STATUS "UNKNOWN OS") endif() -### Setup delta-kernel-rs dependency +# Setup delta-kernel-rs dependency set(KERNEL_NAME delta_kernel) # Set default ExternalProject root directory @@ -46,40 +59,50 @@ set(RUST_ENV_VARS "") # Propagate arch to rust build for CI set(RUST_PLATFORM_TARGET "") if("${OS_NAME}" STREQUAL "linux") - if ("${OS_ARCH}" STREQUAL "arm64") - set(RUST_PLATFORM_TARGET "aarch64-unknown-linux-gnu") - elseif("${CMAKE_CXX_COMPILER}" MATCHES "aarch64") - set(RUST_ENV_VARS ${RUST_ENV_VARS} CFLAGS_aarch64_unknown_linux_gnu=--sysroot=/usr/aarch64-linux-gnu) - set(RUST_ENV_VARS ${RUST_ENV_VARS} CARGO_TARGET_AARCH64_UNKNOWN_LINUX_GNU_LINKER=aarch64-linux-gnu-gcc) - set(RUST_ENV_VARS ${RUST_ENV_VARS} OPENSSL_LIB_DIR=${CMAKE_BINARY_DIR}/vcpkg_installed/${VCPKG_TARGET_TRIPLET}/lib) - set(RUST_ENV_VARS ${RUST_ENV_VARS} OPENSSL_INCLUDE_DIR=${CMAKE_BINARY_DIR}/vcpkg_installed/${VCPKG_TARGET_TRIPLET}/include) - set(RUST_PLATFORM_TARGET "aarch64-unknown-linux-gnu") - else() - set(RUST_PLATFORM_TARGET "x86_64-unknown-linux-gnu") - endif() + if("${OS_ARCH}" STREQUAL "arm64") + set(RUST_PLATFORM_TARGET "aarch64-unknown-linux-gnu") + elseif("${CMAKE_CXX_COMPILER}" MATCHES "aarch64") + set(RUST_ENV_VARS + ${RUST_ENV_VARS} + CFLAGS_aarch64_unknown_linux_gnu=--sysroot=/usr/aarch64-linux-gnu) + set(RUST_ENV_VARS + ${RUST_ENV_VARS} + CARGO_TARGET_AARCH64_UNKNOWN_LINUX_GNU_LINKER=aarch64-linux-gnu-gcc) + set(RUST_ENV_VARS + ${RUST_ENV_VARS} + OPENSSL_LIB_DIR=${CMAKE_BINARY_DIR}/vcpkg_installed/${VCPKG_TARGET_TRIPLET}/lib + ) + set(RUST_ENV_VARS + ${RUST_ENV_VARS} + OPENSSL_INCLUDE_DIR=${CMAKE_BINARY_DIR}/vcpkg_installed/${VCPKG_TARGET_TRIPLET}/include + ) + set(RUST_PLATFORM_TARGET "aarch64-unknown-linux-gnu") + else() + set(RUST_PLATFORM_TARGET "x86_64-unknown-linux-gnu") + endif() elseif("${OS_NAME}" STREQUAL "osx") - if ("${OSX_BUILD_ARCH}" STREQUAL "arm64") - set(RUST_PLATFORM_TARGET "aarch64-apple-darwin") - elseif ("${OSX_BUILD_ARCH}" STREQUAL "x86_64") - set(RUST_PLATFORM_TARGET "x86_64-apple-darwin") - elseif ("${OS_ARCH}" STREQUAL "arm64") - set(RUST_PLATFORM_TARGET "aarch64-apple-darwin") - endif() + if("${OSX_BUILD_ARCH}" STREQUAL "arm64") + set(RUST_PLATFORM_TARGET "aarch64-apple-darwin") + elseif("${OSX_BUILD_ARCH}" STREQUAL "x86_64") + set(RUST_PLATFORM_TARGET "x86_64-apple-darwin") + elseif("${OS_ARCH}" STREQUAL "arm64") + set(RUST_PLATFORM_TARGET "aarch64-apple-darwin") + endif() elseif(WIN32) - if (MINGW AND "${OS_ARCH}" STREQUAL "arm64") - set(RUST_PLATFORM_TARGET "aarch64-pc-windows-gnu") - elseif (MINGW AND "${OS_ARCH}" STREQUAL "amd64") - set(RUST_PLATFORM_TARGET "x86_64-pc-windows-gnu") - elseif (MSVC AND "${OS_ARCH}" STREQUAL "arm64") - set(RUST_PLATFORM_TARGET "aarch64-pc-windows-msvc") - elseif (MSVC AND "${OS_ARCH}" STREQUAL "amd64") - set(RUST_PLATFORM_TARGET "x86_64-pc-windows-msvc") - endif() + if(MINGW AND "${OS_ARCH}" STREQUAL "arm64") + set(RUST_PLATFORM_TARGET "aarch64-pc-windows-gnu") + elseif(MINGW AND "${OS_ARCH}" STREQUAL "amd64") + set(RUST_PLATFORM_TARGET "x86_64-pc-windows-gnu") + elseif(MSVC AND "${OS_ARCH}" STREQUAL "arm64") + set(RUST_PLATFORM_TARGET "aarch64-pc-windows-msvc") + elseif(MSVC AND "${OS_ARCH}" STREQUAL "amd64") + set(RUST_PLATFORM_TARGET "x86_64-pc-windows-msvc") + endif() endif() # We currently only support the predefined targets. -if ("${RUST_PLATFORM_TARGET}" STREQUAL "") - message(FATAL_ERROR "Failed to detect the correct platform") +if("${RUST_PLATFORM_TARGET}" STREQUAL "") + message(FATAL_ERROR "Failed to detect the correct platform") endif() set(RUST_PLATFORM_PARAM "--target=${RUST_PLATFORM_TARGET}") @@ -92,69 +115,84 @@ string(STRIP "${RUST_ENV_VARS}" RUST_ENV_VARS) set(RUST_UNSET_ENV_VARS --unset=CC --unset=CXX --unset=LD) # Define all the relevant delta-kernel-rs paths/names -set(DELTA_KERNEL_LIBNAME "${CMAKE_STATIC_LIBRARY_PREFIX}delta_kernel_ffi${CMAKE_STATIC_LIBRARY_SUFFIX}") -set(DELTA_KERNEL_LIBPATH_DEBUG "${CMAKE_BINARY_DIR}/rust/src/delta_kernel/target/${RUST_PLATFORM_TARGET}/debug/${DELTA_KERNEL_LIBNAME}") -set(DELTA_KERNEL_LIBPATH_RELEASE "${CMAKE_BINARY_DIR}/rust/src/delta_kernel/target/${RUST_PLATFORM_TARGET}/release/${DELTA_KERNEL_LIBNAME}") -set(DELTA_KERNEL_FFI_HEADER_PATH "${CMAKE_BINARY_DIR}/rust/src/delta_kernel/target/ffi-headers") -set(DELTA_KERNEL_FFI_HEADER_C "${CMAKE_BINARY_DIR}/rust/src/delta_kernel/target/ffi-headers/delta_kernel_ffi.h") -set(DELTA_KERNEL_FFI_HEADER_CXX "${CMAKE_BINARY_DIR}/rust/src/delta_kernel/target/ffi-headers/delta_kernel_ffi.hpp") +set(DELTA_KERNEL_LIBNAME + "${CMAKE_STATIC_LIBRARY_PREFIX}delta_kernel_ffi${CMAKE_STATIC_LIBRARY_SUFFIX}" +) +set(DELTA_KERNEL_LIBPATH_DEBUG + "${CMAKE_BINARY_DIR}/rust/src/delta_kernel/target/${RUST_PLATFORM_TARGET}/debug/${DELTA_KERNEL_LIBNAME}" +) +set(DELTA_KERNEL_LIBPATH_RELEASE + "${CMAKE_BINARY_DIR}/rust/src/delta_kernel/target/${RUST_PLATFORM_TARGET}/release/${DELTA_KERNEL_LIBNAME}" +) +set(DELTA_KERNEL_FFI_HEADER_PATH + "${CMAKE_BINARY_DIR}/rust/src/delta_kernel/target/ffi-headers") +set(DELTA_KERNEL_FFI_HEADER_C + "${CMAKE_BINARY_DIR}/rust/src/delta_kernel/target/ffi-headers/delta_kernel_ffi.h" +) +set(DELTA_KERNEL_FFI_HEADER_CXX + "${CMAKE_BINARY_DIR}/rust/src/delta_kernel/target/ffi-headers/delta_kernel_ffi.hpp" +) # Add rust_example as a CMake target ExternalProject_Add( - ${KERNEL_NAME} - GIT_REPOSITORY "https://github.com/delta-incubator/delta-kernel-rs" - # WARNING: the FFI headers are currently pinned due to the C linkage issue of the c++ headers. Currently, when bumping - # the kernel version, the produced header in ./src/include/delta_kernel_ffi.hpp should be also bumped, applying the fix - GIT_TAG v0.4.0 - # Prints the env variables passed to the cargo build to the terminal, useful in debugging because passing them - # through CMake is an error-prone mess - CONFIGURE_COMMAND ${CMAKE_COMMAND} -E env ${RUST_UNSET_ENV_VARS} ${RUST_ENV_VARS} env - UPDATE_COMMAND "" - BUILD_IN_SOURCE 1 - # Build debug build - BUILD_COMMAND - ${CMAKE_COMMAND} -E env ${RUST_UNSET_ENV_VARS} ${RUST_ENV_VARS} - cargo build --package delta_kernel_ffi --workspace --all-features ${RUST_PLATFORM_PARAM} - # Build release build - COMMAND - ${CMAKE_COMMAND} -E env ${RUST_UNSET_ENV_VARS} ${RUST_ENV_VARS} - cargo build --package delta_kernel_ffi --workspace --all-features --release ${RUST_PLATFORM_PARAM} - # Build DATs - COMMAND - ${CMAKE_COMMAND} -E env ${RUST_UNSET_ENV_VARS} ${RUST_ENV_VARS} - cargo build --manifest-path=${CMAKE_BINARY_DIR}/rust/src/delta_kernel/acceptance/Cargo.toml - # Define the byproducts, required for building with Ninja - BUILD_BYPRODUCTS "${DELTA_KERNEL_LIBPATH_DEBUG}" - BUILD_BYPRODUCTS "${DELTA_KERNEL_LIBPATH_RELEASE}" - BUILD_BYPRODUCTS "${DELTA_KERNEL_FFI_HEADER_C}" - BUILD_BYPRODUCTS "${DELTA_KERNEL_FFI_HEADER_CXX}" - INSTALL_COMMAND "" - LOG_BUILD ON) + ${KERNEL_NAME} + GIT_REPOSITORY "https://github.com/delta-incubator/delta-kernel-rs" + # WARNING: the FFI headers are currently pinned due to the C linkage issue of + # the c++ headers. Currently, when bumping the kernel version, the produced + # header in ./src/include/delta_kernel_ffi.hpp should be also bumped, applying + # the fix + GIT_TAG v0.4.0 + # Prints the env variables passed to the cargo build to the terminal, useful + # in debugging because passing them through CMake is an error-prone mess + CONFIGURE_COMMAND ${CMAKE_COMMAND} -E env ${RUST_UNSET_ENV_VARS} + ${RUST_ENV_VARS} env + UPDATE_COMMAND "" + BUILD_IN_SOURCE 1 + # Build debug build + BUILD_COMMAND + ${CMAKE_COMMAND} -E env ${RUST_UNSET_ENV_VARS} ${RUST_ENV_VARS} cargo build + --package delta_kernel_ffi --workspace --all-features ${RUST_PLATFORM_PARAM} + # Build release build + COMMAND + ${CMAKE_COMMAND} -E env ${RUST_UNSET_ENV_VARS} ${RUST_ENV_VARS} cargo build + --package delta_kernel_ffi --workspace --all-features --release + ${RUST_PLATFORM_PARAM} + # Build DATs + COMMAND + ${CMAKE_COMMAND} -E env ${RUST_UNSET_ENV_VARS} ${RUST_ENV_VARS} cargo build + --manifest-path=${CMAKE_BINARY_DIR}/rust/src/delta_kernel/acceptance/Cargo.toml + # Define the byproducts, required for building with Ninja + BUILD_BYPRODUCTS "${DELTA_KERNEL_LIBPATH_DEBUG}" + BUILD_BYPRODUCTS "${DELTA_KERNEL_LIBPATH_RELEASE}" + BUILD_BYPRODUCTS "${DELTA_KERNEL_FFI_HEADER_C}" + BUILD_BYPRODUCTS "${DELTA_KERNEL_FFI_HEADER_CXX}" + INSTALL_COMMAND "" + LOG_BUILD ON) build_static_extension(${TARGET_NAME} ${EXTENSION_SOURCES}) build_loadable_extension(${TARGET_NAME} " " ${EXTENSION_SOURCES}) -# TODO: when C linkage issue is resolved, we should switch back to using the generated headers -#include_directories(${DELTA_KERNEL_FFI_HEADER_PATH}) +# TODO: when C linkage issue is resolved, we should switch back to using the +# generated headers include_directories(${DELTA_KERNEL_FFI_HEADER_PATH}) # Hides annoying linker warnings -set(CMAKE_OSX_DEPLOYMENT_TARGET 13.3 CACHE STRING "Minimum OS X deployment version" FORCE) +set(CMAKE_OSX_DEPLOYMENT_TARGET + 13.3 + CACHE STRING "Minimum OS X deployment version" FORCE) # Add the default client add_compile_definitions(DEFINE_DEFAULT_ENGINE) # Link delta-kernal-rs to static lib -target_link_libraries(${EXTENSION_NAME} - debug ${DELTA_KERNEL_LIBPATH_DEBUG} - optimized ${DELTA_KERNEL_LIBPATH_RELEASE} - ${PLATFORM_LIBS}) +target_link_libraries( + ${EXTENSION_NAME} debug ${DELTA_KERNEL_LIBPATH_DEBUG} optimized + ${DELTA_KERNEL_LIBPATH_RELEASE} ${PLATFORM_LIBS}) add_dependencies(${EXTENSION_NAME} delta_kernel) # Link delta-kernal-rs to dynamic lib -target_link_libraries(${LOADABLE_EXTENSION_NAME} - debug ${DELTA_KERNEL_LIBPATH_DEBUG} - optimized ${DELTA_KERNEL_LIBPATH_RELEASE} - ${PLATFORM_LIBS}) +target_link_libraries( + ${LOADABLE_EXTENSION_NAME} debug ${DELTA_KERNEL_LIBPATH_DEBUG} optimized + ${DELTA_KERNEL_LIBPATH_RELEASE} ${PLATFORM_LIBS}) add_dependencies(${LOADABLE_EXTENSION_NAME} delta_kernel) install( diff --git a/src/delta_extension.cpp b/src/delta_extension.cpp index 97d1b53..36003a3 100644 --- a/src/delta_extension.cpp +++ b/src/delta_extension.cpp @@ -13,18 +13,18 @@ namespace duckdb { static unique_ptr DeltaCatalogAttach(StorageExtensionInfo *storage_info, ClientContext &context, - AttachedDatabase &db, const string &name, AttachInfo &info, - AccessMode access_mode) { + AttachedDatabase &db, const string &name, AttachInfo &info, + AccessMode access_mode) { - auto res = make_uniq(db, info.path, access_mode); + auto res = make_uniq(db, info.path, access_mode); - for (const auto& option : info.options) { - if (StringUtil::Lower(option.first) == "pin_snapshot") { - res->use_cache = option.second.GetValue(); - } - } + for (const auto &option : info.options) { + if (StringUtil::Lower(option.first) == "pin_snapshot") { + res->use_cache = option.second.GetValue(); + } + } - res->SetDefaultTable(DEFAULT_SCHEMA, DEFAULT_DELTA_TABLE); + res->SetDefaultTable(DEFAULT_SCHEMA, DEFAULT_DELTA_TABLE); return std::move(res); } @@ -44,14 +44,14 @@ class DeltaStorageExtension : public StorageExtension { }; static void LoadInternal(DatabaseInstance &instance) { - // Load functions - for (const auto &function : DeltaFunctions::GetTableFunctions(instance)) { - ExtensionUtil::RegisterFunction(instance, function); - } - - // Register the "single table" delta catalog (to ATTACH a single delta table) - auto &config = DBConfig::GetConfig(instance); - config.storage_extensions["delta"] = make_uniq(); + // Load functions + for (const auto &function : DeltaFunctions::GetTableFunctions(instance)) { + ExtensionUtil::RegisterFunction(instance, function); + } + + // Register the "single table" delta catalog (to ATTACH a single delta table) + auto &config = DBConfig::GetConfig(instance); + config.storage_extensions["delta"] = make_uniq(); } void DeltaExtension::Load(DuckDB &db) { diff --git a/src/delta_utils.cpp b/src/delta_utils.cpp index 035d300..1a8ff04 100644 --- a/src/delta_utils.cpp +++ b/src/delta_utils.cpp @@ -13,11 +13,11 @@ unique_ptr SchemaVisitor::VisitSnapshotSchema(ffi::Sha ffi::EngineSchemaVisitor visitor; visitor.data = &state; - visitor.make_field_list = (uintptr_t (*)(void *, uintptr_t))&MakeFieldList; - visitor.visit_struct = (void (*)(void *, uintptr_t, ffi::KernelStringSlice, uintptr_t))&VisitStruct; - visitor.visit_array = (void (*)(void *, uintptr_t, ffi::KernelStringSlice, bool, uintptr_t))&VisitArray; - visitor.visit_map = (void (*)(void *, uintptr_t, ffi::KernelStringSlice, bool, uintptr_t))&VisitMap; - visitor.visit_decimal = (void (*)(void *, uintptr_t, ffi::KernelStringSlice, uint8_t, uint8_t))&VisitDecimal; + visitor.make_field_list = (uintptr_t(*)(void *, uintptr_t)) & MakeFieldList; + visitor.visit_struct = (void (*)(void *, uintptr_t, ffi::KernelStringSlice, uintptr_t)) & VisitStruct; + visitor.visit_array = (void (*)(void *, uintptr_t, ffi::KernelStringSlice, bool, uintptr_t)) & VisitArray; + visitor.visit_map = (void (*)(void *, uintptr_t, ffi::KernelStringSlice, bool, uintptr_t)) & VisitMap; + visitor.visit_decimal = (void (*)(void *, uintptr_t, ffi::KernelStringSlice, uint8_t, uint8_t)) & VisitDecimal; visitor.visit_string = VisitSimpleType(); visitor.visit_long = VisitSimpleType(); visitor.visit_integer = VisitSimpleType(); @@ -176,7 +176,7 @@ vector KernelUtils::FromDeltaBoolSlice(const struct ffi::KernelBoolSlice s PredicateVisitor::PredicateVisitor(const vector &column_names, optional_ptr filters) { predicate = this; - visitor = (uintptr_t (*)(void *, ffi::KernelExpressionVisitorState *))&VisitPredicate; + visitor = (uintptr_t(*)(void *, ffi::KernelExpressionVisitorState *)) & VisitPredicate; if (filters) { for (auto &filter : filters->filters) { diff --git a/src/functions/delta_scan.cpp b/src/functions/delta_scan.cpp index d88d597..4e35b17 100644 --- a/src/functions/delta_scan.cpp +++ b/src/functions/delta_scan.cpp @@ -278,7 +278,8 @@ static ffi::EngineBuilder *CreateBuilder(ClientContext &context, const string &p } if (StringUtil::StartsWith(endpoint, "http://")) { - ffi::set_builder_option(builder, KernelUtils::ToDeltaString("allow_http"), KernelUtils::ToDeltaString("true")); + ffi::set_builder_option(builder, KernelUtils::ToDeltaString("allow_http"), + KernelUtils::ToDeltaString("true")); } ffi::set_builder_option(builder, KernelUtils::ToDeltaString("aws_endpoint"), KernelUtils::ToDeltaString(endpoint)); @@ -363,7 +364,8 @@ static ffi::EngineBuilder *CreateBuilder(ClientContext &context, const string &p } // Set the use_emulator option for when the azurite test server is used if (account_name == "devstoreaccount1" || connection_string.find("devstoreaccount1") != string::npos) { - ffi::set_builder_option(builder, KernelUtils::ToDeltaString("use_emulator"), KernelUtils::ToDeltaString("true")); + ffi::set_builder_option(builder, KernelUtils::ToDeltaString("use_emulator"), + KernelUtils::ToDeltaString("true")); } if (!account_name.empty()) { ffi::set_builder_option(builder, KernelUtils::ToDeltaString("account_name"), @@ -373,7 +375,8 @@ static ffi::EngineBuilder *CreateBuilder(ClientContext &context, const string &p ffi::set_builder_option(builder, KernelUtils::ToDeltaString("azure_endpoint"), KernelUtils::ToDeltaString(endpoint)); } - ffi::set_builder_option(builder, KernelUtils::ToDeltaString("container_name"), KernelUtils::ToDeltaString(bucket)); + ffi::set_builder_option(builder, KernelUtils::ToDeltaString("container_name"), + KernelUtils::ToDeltaString(bucket)); } return builder; } @@ -412,46 +415,46 @@ string DeltaSnapshot::ToDeltaPath(const string &raw_path) { } void DeltaSnapshot::Bind(vector &return_types, vector &names) { - if (have_bound) { - names = this->names; - return_types = this->types; - return; - } - - if (!initialized_snapshot) { - InitializeSnapshot(); - } - - unique_ptr schema; - - { - auto snapshot_ref = snapshot->GetLockingRef(); - schema = SchemaVisitor::VisitSnapshotSchema(snapshot_ref.GetPtr()); - } - - for (const auto &field: *schema) { - names.push_back(field.first); - return_types.push_back(field.second); - } - // Store the bound names for resolving the complex filter pushdown later - have_bound = true; - this->names = names; - this->types = return_types; + if (have_bound) { + names = this->names; + return_types = this->types; + return; + } + + if (!initialized_snapshot) { + InitializeSnapshot(); + } + + unique_ptr schema; + + { + auto snapshot_ref = snapshot->GetLockingRef(); + schema = SchemaVisitor::VisitSnapshotSchema(snapshot_ref.GetPtr()); + } + + for (const auto &field : *schema) { + names.push_back(field.first); + return_types.push_back(field.second); + } + // Store the bound names for resolving the complex filter pushdown later + have_bound = true; + this->names = names; + this->types = return_types; } string DeltaSnapshot::GetFile(idx_t i) { - if (!initialized_snapshot) { - InitializeSnapshot(); - } + if (!initialized_snapshot) { + InitializeSnapshot(); + } - if(!initialized_scan) { - InitializeScan(); - } + if (!initialized_scan) { + InitializeScan(); + } - // We already have this file - if (i < resolved_files.size()) { - return resolved_files[i]; - } + // We already have this file + if (i < resolved_files.size()) { + return resolved_files[i]; + } if (files_exhausted) { return ""; @@ -478,59 +481,62 @@ string DeltaSnapshot::GetFile(idx_t i) { } void DeltaSnapshot::InitializeSnapshot() { - auto path_slice = KernelUtils::ToDeltaString(paths[0]); + auto path_slice = KernelUtils::ToDeltaString(paths[0]); - auto interface_builder = CreateBuilder(context, paths[0]); - extern_engine = TryUnpackKernelResult( ffi::builder_build(interface_builder)); + auto interface_builder = CreateBuilder(context, paths[0]); + extern_engine = TryUnpackKernelResult(ffi::builder_build(interface_builder)); - if (!snapshot) { - snapshot = make_shared_ptr(TryUnpackKernelResult(ffi::snapshot(path_slice, extern_engine.get()))); - } + if (!snapshot) { + snapshot = make_shared_ptr( + TryUnpackKernelResult(ffi::snapshot(path_slice, extern_engine.get()))); + } - initialized_snapshot = true; + initialized_snapshot = true; } void DeltaSnapshot::InitializeScan() { - auto snapshot_ref = snapshot->GetLockingRef(); + auto snapshot_ref = snapshot->GetLockingRef(); - // Create Scan - PredicateVisitor visitor(names, &table_filters); - scan = TryUnpackKernelResult(ffi::scan(snapshot_ref.GetPtr(), extern_engine.get(), &visitor)); + // Create Scan + PredicateVisitor visitor(names, &table_filters); + scan = TryUnpackKernelResult(ffi::scan(snapshot_ref.GetPtr(), extern_engine.get(), &visitor)); // Create GlobalState global_state = ffi::get_global_scan_state(scan.get()); - // Set version - this->version = ffi::version(snapshot_ref.GetPtr()); + // Set version + this->version = ffi::version(snapshot_ref.GetPtr()); // Create scan data iterator scan_data_iterator = TryUnpackKernelResult(ffi::kernel_scan_data_init(extern_engine.get(), scan.get())); - initialized_scan = true; + initialized_scan = true; } -unique_ptr DeltaSnapshot::ComplexFilterPushdown(ClientContext &context, const MultiFileReaderOptions &options, MultiFilePushdownInfo &info, - vector> &filters) { - FilterCombiner combiner(context); +unique_ptr DeltaSnapshot::ComplexFilterPushdown(ClientContext &context, + const MultiFileReaderOptions &options, + MultiFilePushdownInfo &info, + vector> &filters) { + FilterCombiner combiner(context); - if (filters.empty()) { - return nullptr; - } + if (filters.empty()) { + return nullptr; + } - for (const auto &filter : filters) { - combiner.AddFilter(filter->Copy()); - } - auto filterstmp = combiner.GenerateTableScanFilters(info.column_ids); + for (const auto &filter : filters) { + combiner.AddFilter(filter->Copy()); + } + auto filterstmp = combiner.GenerateTableScanFilters(info.column_ids); // TODO: can/should we figure out if this filtered anything? auto filtered_list = make_uniq(context, paths[0]); filtered_list->table_filters = std::move(filterstmp); filtered_list->names = names; - // Copy over the snapshot, this avoids reparsing metadata - filtered_list->snapshot = snapshot; + // Copy over the snapshot, this avoids reparsing metadata + filtered_list->snapshot = snapshot; - return std::move(filtered_list); + return std::move(filtered_list); } vector DeltaSnapshot::GetAllFiles() { @@ -543,11 +549,11 @@ vector DeltaSnapshot::GetAllFiles() { } FileExpandResult DeltaSnapshot::GetExpandResult() { - // We avoid exposing the ExpandResult to DuckDB here because we want to materialize the Snapshot as late as possible: - // materializing too early (GetExpandResult is called *before* filter pushdown by the Parquet scanner), will lead into - // needing to create 2 scans of the snapshot TODO: we need to investigate if this is actually a sensible decision with - // some benchmarking, its currently based on intuition. - return FileExpandResult::MULTIPLE_FILES; + // We avoid exposing the ExpandResult to DuckDB here because we want to materialize the Snapshot as late as + // possible: materializing too early (GetExpandResult is called *before* filter pushdown by the Parquet scanner), + // will lead into needing to create 2 scans of the snapshot TODO: we need to investigate if this is actually a + // sensible decision with some benchmarking, its currently based on intuition. + return FileExpandResult::MULTIPLE_FILES; } idx_t DeltaSnapshot::GetTotalFileCount() { @@ -584,13 +590,13 @@ unique_ptr DeltaSnapshot::GetCardinality(ClientContext &context) } unique_ptr DeltaMultiFileReader::CreateInstance(const TableFunction &table_function) { - auto result = make_uniq(); + auto result = make_uniq(); - if (table_function.function_info) { - result->snapshot = table_function.function_info->Cast().snapshot; - } + if (table_function.function_info) { + result->snapshot = table_function.function_info->Cast().snapshot; + } - return std::move(result); + return std::move(result); } bool DeltaMultiFileReader::Bind(MultiFileReaderOptions &options, MultiFileList &files, @@ -683,21 +689,21 @@ void DeltaMultiFileReader::FinalizeBind(const MultiFileReaderOptions &file_optio } } -shared_ptr DeltaMultiFileReader::CreateFileList(ClientContext &context, const vector& paths, FileGlobOptions options) { - if (paths.size() != 1) { - throw BinderException("'delta_scan' only supports single path as input"); - } - +shared_ptr DeltaMultiFileReader::CreateFileList(ClientContext &context, const vector &paths, + FileGlobOptions options) { + if (paths.size() != 1) { + throw BinderException("'delta_scan' only supports single path as input"); + } - if (snapshot) { - // TODO: assert that we are querying the same path as this injected snapshot - // This takes the kernel snapshot from the delta snapshot and ensures we use that snapshot for reading - if (snapshot) { - return snapshot; - } - } + if (snapshot) { + // TODO: assert that we are querying the same path as this injected snapshot + // This takes the kernel snapshot from the delta snapshot and ensures we use that snapshot for reading + if (snapshot) { + return snapshot; + } + } - return make_uniq(context, paths[0]); + return make_uniq(context, paths[0]); } // Generate the correct Selection Vector Based on the Raw delta KernelBoolSlice dv and the row_id_column diff --git a/src/functions/expression_functions.cpp b/src/functions/expression_functions.cpp new file mode 100644 index 0000000..373e42f --- /dev/null +++ b/src/functions/expression_functions.cpp @@ -0,0 +1,46 @@ +#include +#include + +#include "duckdb/function/scalar_function.hpp" +#include "duckdb/planner/expression/bound_constant_expression.hpp" + +#include "delta_utils.hpp" +#include "delta_functions.hpp" + +namespace duckdb { + +static void GetDeltaTestExpression(DataChunk &input, ExpressionState &state, Vector &output) { + output.SetVectorType(VectorType::CONSTANT_VECTOR); + + auto test_expression = ffi::get_testing_kernel_expression(); + ExpressionVisitor visitor; + + auto result = visitor.VisitKernelExpression(&test_expression); + if (result->size() != 1) { + throw InternalException("Unexpected result: expected single expression"); + } + + auto &expr = result->back(); + if (expr->GetExpressionType() != ExpressionType::CONJUNCTION_AND) { + throw InternalException("Unexpected result: expected single top level Conjuntion"); + } + + vector result_to_string; + for (auto &expr : expr->Cast().children) { + result_to_string.push_back(expr->ToString()); + } + + output.SetValue(0, Value::LIST(result_to_string)); +}; + +ScalarFunctionSet DeltaFunctions::GetExpressionFunction(DatabaseInstance &instance) { + ScalarFunctionSet result; + result.name = "get_delta_test_expression"; + + ScalarFunction getvar({}, LogicalType::LIST(LogicalType::VARCHAR), GetDeltaTestExpression, nullptr, nullptr); + result.AddFunction(getvar); + + return result; +} + +} // namespace duckdb \ No newline at end of file diff --git a/src/include/delta_utils.hpp b/src/include/delta_utils.hpp index 24806a4..8760862 100644 --- a/src/include/delta_utils.hpp +++ b/src/include/delta_utils.hpp @@ -115,57 +115,64 @@ typedef TemplatedUniqueKernelPointer KernelScan typedef TemplatedUniqueKernelPointer KernelGlobalScanState; typedef TemplatedUniqueKernelPointer KernelScanDataIterator; -template +template struct SharedKernelPointer; // A reference to a SharedKernelPointer, only 1 can be handed out at the same time -template +template struct SharedKernelRef { - friend struct SharedKernelPointer; + friend struct SharedKernelPointer; + public: - KernelType* GetPtr() { - return owning_pointer.kernel_ptr.get(); - } - ~SharedKernelRef() { - owning_pointer.lock.unlock(); - } + KernelType *GetPtr() { + return owning_pointer.kernel_ptr.get(); + } + ~SharedKernelRef() { + owning_pointer.lock.unlock(); + } protected: - SharedKernelRef(SharedKernelPointer& owning_pointer_p) : owning_pointer(owning_pointer_p) { - owning_pointer.lock.lock(); - } + SharedKernelRef(SharedKernelPointer &owning_pointer_p) + : owning_pointer(owning_pointer_p) { + owning_pointer.lock.lock(); + } protected: - // The pointer that owns this ref - SharedKernelPointer& owning_pointer; + // The pointer that owns this ref + SharedKernelPointer &owning_pointer; }; // Wrapper around ffi objects to share between threads -template +template struct SharedKernelPointer { - friend struct SharedKernelRef; + friend struct SharedKernelRef; + public: - SharedKernelPointer(TemplatedUniqueKernelPointer unique_kernel_ptr) : kernel_ptr(unique_kernel_ptr) {} - SharedKernelPointer(KernelType* ptr) : kernel_ptr(ptr){} - SharedKernelPointer(){} - - SharedKernelPointer(SharedKernelPointer&& other) : SharedKernelPointer() { - other.lock.lock(); - lock.lock(); - kernel_ptr = std::move(other.kernel_ptr); - lock.lock(); - other.lock.lock(); - } - - // Returns a reference to the underlying kernel object. The SharedKernelPointer to this object will be locked for the - // lifetime of this reference - SharedKernelRef GetLockingRef() { - return SharedKernelRef(*this); - } + SharedKernelPointer(TemplatedUniqueKernelPointer unique_kernel_ptr) + : kernel_ptr(unique_kernel_ptr) { + } + SharedKernelPointer(KernelType *ptr) : kernel_ptr(ptr) { + } + SharedKernelPointer() { + } + + SharedKernelPointer(SharedKernelPointer &&other) : SharedKernelPointer() { + other.lock.lock(); + lock.lock(); + kernel_ptr = std::move(other.kernel_ptr); + lock.lock(); + other.lock.lock(); + } + + // Returns a reference to the underlying kernel object. The SharedKernelPointer to this object will be locked for + // the lifetime of this reference + SharedKernelRef GetLockingRef() { + return SharedKernelRef(*this); + } protected: - TemplatedUniqueKernelPointer kernel_ptr; - mutex lock; + TemplatedUniqueKernelPointer kernel_ptr; + mutex lock; }; typedef SharedKernelPointer SharedKernelSnapshot; diff --git a/src/include/functions/delta_scan.hpp b/src/include/functions/delta_scan.hpp index eb2de6e..32662a2 100644 --- a/src/include/functions/delta_scan.hpp +++ b/src/include/functions/delta_scan.hpp @@ -15,8 +15,8 @@ namespace duckdb { struct DeltaSnapshot; struct DeltaFunctionInfo : public TableFunctionInfo { - shared_ptr snapshot; - string expected_path; + shared_ptr snapshot; + string expected_path; }; struct DeltaFileMetaData { @@ -63,8 +63,8 @@ struct DeltaSnapshot : public MultiFileList { string GetFile(idx_t i) override; protected: - void InitializeSnapshot(); - void InitializeScan(); + void InitializeSnapshot(); + void InitializeScan(); template T TryUnpackKernelResult(ffi::ExternResult result) { @@ -76,29 +76,29 @@ struct DeltaSnapshot : public MultiFileList { public: idx_t version; - //! Delta Kernel Structures - shared_ptr snapshot; + //! Delta Kernel Structures + shared_ptr snapshot; - KernelExternEngine extern_engine; - KernelScan scan; - KernelGlobalScanState global_state; - KernelScanDataIterator scan_data_iterator; + KernelExternEngine extern_engine; + KernelScan scan; + KernelGlobalScanState global_state; + KernelScanDataIterator scan_data_iterator; - //! Names - vector names; - vector types; - bool have_bound = false; + //! Names + vector names; + vector types; + bool have_bound = false; //! Metadata map for files vector> metadata; - //! Current file list resolution state - bool initialized_snapshot = false; - bool initialized_scan = false; + //! Current file list resolution state + bool initialized_snapshot = false; + bool initialized_scan = false; - bool files_exhausted = false; - vector resolved_files; - TableFilterSet table_filters; + bool files_exhausted = false; + vector resolved_files; + TableFilterSet table_filters; ClientContext &context; }; @@ -116,10 +116,10 @@ struct DeltaMultiFileReaderGlobalState : public MultiFileReaderGlobalState { }; struct DeltaMultiFileReader : public MultiFileReader { - static unique_ptr CreateInstance(const TableFunction &table_function); - //! Return a DeltaSnapshot - shared_ptr CreateFileList(ClientContext &context, const vector &paths, - FileGlobOptions options) override; + static unique_ptr CreateInstance(const TableFunction &table_function); + //! Return a DeltaSnapshot + shared_ptr CreateFileList(ClientContext &context, const vector &paths, + FileGlobOptions options) override; //! Override the regular parquet bind using the MultiFileReader Bind. The bind from these are what DuckDB's file //! readers will try read @@ -153,13 +153,13 @@ struct DeltaMultiFileReader : public MultiFileReader { const MultiFileReaderData &reader_data, DataChunk &chunk, optional_ptr global_state) override; - //! Override the ParseOption call to parse delta_scan specific options - bool ParseOption(const string &key, const Value &val, MultiFileReaderOptions &options, - ClientContext &context) override; + //! Override the ParseOption call to parse delta_scan specific options + bool ParseOption(const string &key, const Value &val, MultiFileReaderOptions &options, + ClientContext &context) override; - // A snapshot can be injected into the multifilereader, this ensures the GetMultiFileList can return this snapshot - // (note that the path should match the one passed to CreateFileList) - shared_ptr snapshot; + // A snapshot can be injected into the multifilereader, this ensures the GetMultiFileList can return this snapshot + // (note that the path should match the one passed to CreateFileList) + shared_ptr snapshot; }; } // namespace duckdb diff --git a/src/include/functions/expression_functions.hpp b/src/include/functions/expression_functions.hpp new file mode 100644 index 0000000..e69de29 diff --git a/src/include/storage/delta_catalog.hpp b/src/include/storage/delta_catalog.hpp index faeb00c..17549dd 100644 --- a/src/include/storage/delta_catalog.hpp +++ b/src/include/storage/delta_catalog.hpp @@ -30,7 +30,7 @@ class DeltaCatalog : public Catalog { string path; AccessMode access_mode; - bool use_cache; + bool use_cache; public: void Initialize(bool load_builtin) override; @@ -59,22 +59,22 @@ class DeltaCatalog : public Catalog { DatabaseSize GetDatabaseSize(ClientContext &context) override; - optional_idx GetCatalogVersion(ClientContext &context) override; + optional_idx GetCatalogVersion(ClientContext &context) override; bool InMemory() override; string GetDBPath() override; - bool UseCachedSnapshot(); + bool UseCachedSnapshot(); - DeltaSchemaEntry& GetMainSchema() { - return *main_schema; - } + DeltaSchemaEntry &GetMainSchema() { + return *main_schema; + } private: void DropSchema(ClientContext &context, DropInfo &info) override; private: - unique_ptr main_schema; + unique_ptr main_schema; string default_schema; }; diff --git a/src/include/storage/delta_schema_entry.hpp b/src/include/storage/delta_schema_entry.hpp index c8a8d09..dc41a4c 100644 --- a/src/include/storage/delta_schema_entry.hpp +++ b/src/include/storage/delta_schema_entry.hpp @@ -40,13 +40,13 @@ class DeltaSchemaEntry : public SchemaCatalogEntry { void DropEntry(ClientContext &context, DropInfo &info) override; optional_ptr GetEntry(CatalogTransaction transaction, CatalogType type, const string &name) override; - optional_ptr GetCachedTable(); + optional_ptr GetCachedTable(); private: - //! Delta tables may be cached in the SchemaEntry. Since the TableEntry holds the snapshot, this allows sharing a snapshot - //! between different scans. - unique_ptr cached_table; - mutex lock; + //! Delta tables may be cached in the SchemaEntry. Since the TableEntry holds the snapshot, this allows sharing a + //! snapshot between different scans. + unique_ptr cached_table; + mutex lock; }; } // namespace duckdb diff --git a/src/include/storage/delta_table_entry.hpp b/src/include/storage/delta_table_entry.hpp index c131694..5263e88 100644 --- a/src/include/storage/delta_table_entry.hpp +++ b/src/include/storage/delta_table_entry.hpp @@ -17,7 +17,7 @@ struct DeltaSnapshot; class DeltaTableEntry : public TableCatalogEntry { public: DeltaTableEntry(Catalog &catalog, SchemaCatalogEntry &schema, CreateTableInfo &info); - ~DeltaTableEntry(); + ~DeltaTableEntry(); public: unique_ptr GetStatistics(ClientContext &context, column_t column_id) override; @@ -30,7 +30,7 @@ class DeltaTableEntry : public TableCatalogEntry { ClientContext &context) override; public: - shared_ptr snapshot; + shared_ptr snapshot; }; } // namespace duckdb diff --git a/src/include/storage/delta_transaction.hpp b/src/include/storage/delta_transaction.hpp index 3a004ef..b9d369c 100644 --- a/src/include/storage/delta_transaction.hpp +++ b/src/include/storage/delta_transaction.hpp @@ -30,11 +30,12 @@ class DeltaTransaction : public Transaction { static DeltaTransaction &Get(ClientContext &context, Catalog &catalog); AccessMode GetAccessMode() const; - void SetReadWrite() override { - throw NotImplementedException("Can not start read-write transaction"); - }; + void SetReadWrite() override { + throw NotImplementedException("Can not start read-write transaction"); + }; + public: - unique_ptr table_entry; + unique_ptr table_entry; private: // DeltaConnection connection; diff --git a/src/storage/delta_catalog.cpp b/src/storage/delta_catalog.cpp index 1e8ac4e..53b1195 100644 --- a/src/storage/delta_catalog.cpp +++ b/src/storage/delta_catalog.cpp @@ -18,31 +18,32 @@ DeltaCatalog::DeltaCatalog(AttachedDatabase &db_p, const string &path, AccessMod DeltaCatalog::~DeltaCatalog() = default; void DeltaCatalog::Initialize(bool load_builtin) { - CreateSchemaInfo info; - main_schema = make_uniq(*this, info); + CreateSchemaInfo info; + main_schema = make_uniq(*this, info); } optional_ptr DeltaCatalog::CreateSchema(CatalogTransaction transaction, CreateSchemaInfo &info) { - throw BinderException("Delta tables do not support creating new schemas"); + throw BinderException("Delta tables do not support creating new schemas"); } void DeltaCatalog::DropSchema(ClientContext &context, DropInfo &info) { - throw BinderException("Delta tables do not support dropping schemas"); + throw BinderException("Delta tables do not support dropping schemas"); } void DeltaCatalog::ScanSchemas(ClientContext &context, std::function callback) { - callback(*main_schema); + callback(*main_schema); } optional_ptr DeltaCatalog::GetSchema(CatalogTransaction transaction, const string &schema_name, - OnEntryNotFound if_not_found, QueryErrorContext error_context) { - if (schema_name == DEFAULT_SCHEMA || schema_name == INVALID_SCHEMA) { - return main_schema.get(); - } - if (if_not_found == OnEntryNotFound::RETURN_NULL) { - return nullptr; - } - return nullptr; + OnEntryNotFound if_not_found, + QueryErrorContext error_context) { + if (schema_name == DEFAULT_SCHEMA || schema_name == INVALID_SCHEMA) { + return main_schema.get(); + } + if (if_not_found == OnEntryNotFound::RETURN_NULL) { + return nullptr; + } + return nullptr; } bool DeltaCatalog::InMemory() { @@ -54,24 +55,24 @@ string DeltaCatalog::GetDBPath() { } bool DeltaCatalog::UseCachedSnapshot() { - return use_cache; + return use_cache; } optional_idx DeltaCatalog::GetCatalogVersion(ClientContext &context) { - auto &delta_transaction = DeltaTransaction::Get(context, *this); + auto &delta_transaction = DeltaTransaction::Get(context, *this); - // Option 1: snapshot is cached table-wide - auto cached_snapshot = main_schema->GetCachedTable(); - if (cached_snapshot) { - return cached_snapshot->snapshot->version; - } + // Option 1: snapshot is cached table-wide + auto cached_snapshot = main_schema->GetCachedTable(); + if (cached_snapshot) { + return cached_snapshot->snapshot->version; + } - // Option 2: snapshot is cached in transaction - if (delta_transaction.table_entry) { - return delta_transaction.table_entry->snapshot->version; - } + // Option 2: snapshot is cached in transaction + if (delta_transaction.table_entry) { + return delta_transaction.table_entry->snapshot->version; + } - return {}; + return {}; } DatabaseSize DeltaCatalog::GetDatabaseSize(ClientContext &context) { @@ -84,23 +85,23 @@ DatabaseSize DeltaCatalog::GetDatabaseSize(ClientContext &context) { } unique_ptr DeltaCatalog::PlanInsert(ClientContext &context, LogicalInsert &op, - unique_ptr plan) { + unique_ptr plan) { throw NotImplementedException("DeltaCatalog does not support inserts"); } unique_ptr DeltaCatalog::PlanCreateTableAs(ClientContext &context, LogicalCreateTable &op, - unique_ptr plan) { + unique_ptr plan) { throw NotImplementedException("DeltaCatalog does not support creating new tables"); } unique_ptr DeltaCatalog::PlanDelete(ClientContext &context, LogicalDelete &op, - unique_ptr plan) { + unique_ptr plan) { throw NotImplementedException("DeltaCatalog does not support deletes"); } unique_ptr DeltaCatalog::PlanUpdate(ClientContext &context, LogicalUpdate &op, - unique_ptr plan) { + unique_ptr plan) { throw NotImplementedException("DeltaCatalog does not support updates"); } -unique_ptr DeltaCatalog::BindCreateIndex(Binder &binder, CreateStatement &stmt, TableCatalogEntry &table, - unique_ptr plan) { +unique_ptr DeltaCatalog::BindCreateIndex(Binder &binder, CreateStatement &stmt, + TableCatalogEntry &table, unique_ptr plan) { throw NotImplementedException("DeltaCatalog does not support creating indices"); } diff --git a/src/storage/delta_schema_entry.cpp b/src/storage/delta_schema_entry.cpp index 7e15c5b..61348d4 100644 --- a/src/storage/delta_schema_entry.cpp +++ b/src/storage/delta_schema_entry.cpp @@ -17,11 +17,9 @@ #include "duckdb/parser/parsed_data/alter_table_info.hpp" #include "duckdb/parser/parsed_expression_iterator.hpp" - namespace duckdb { -DeltaSchemaEntry::DeltaSchemaEntry(Catalog &catalog, CreateSchemaInfo &info) - : SchemaCatalogEntry(catalog, info) { +DeltaSchemaEntry::DeltaSchemaEntry(Catalog &catalog, CreateSchemaInfo &info) : SchemaCatalogEntry(catalog, info) { } DeltaSchemaEntry::~DeltaSchemaEntry() { @@ -35,7 +33,7 @@ DeltaTransaction &GetDeltaTransaction(CatalogTransaction transaction) { } optional_ptr DeltaSchemaEntry::CreateTable(CatalogTransaction transaction, BoundCreateTableInfo &info) { - throw BinderException("Delta tables do not support creating tables"); + throw BinderException("Delta tables do not support creating tables"); } optional_ptr DeltaSchemaEntry::CreateFunction(CatalogTransaction transaction, CreateFunctionInfo &info) { @@ -53,7 +51,7 @@ void DeltaUnqualifyColumnRef(ParsedExpression &expr) { } optional_ptr DeltaSchemaEntry::CreateIndex(CatalogTransaction transaction, CreateIndexInfo &info, - TableCatalogEntry &table) { + TableCatalogEntry &table) { throw NotImplementedException("CreateIndex"); } @@ -62,7 +60,7 @@ string GetDeltaCreateView(CreateViewInfo &info) { } optional_ptr DeltaSchemaEntry::CreateView(CatalogTransaction transaction, CreateViewInfo &info) { - throw BinderException("Delta tables do not support creating views"); + throw BinderException("Delta tables do not support creating views"); } optional_ptr DeltaSchemaEntry::CreateType(CatalogTransaction transaction, CreateTypeInfo &info) { @@ -74,26 +72,27 @@ optional_ptr DeltaSchemaEntry::CreateSequence(CatalogTransaction t } optional_ptr DeltaSchemaEntry::CreateTableFunction(CatalogTransaction transaction, - CreateTableFunctionInfo &info) { + CreateTableFunctionInfo &info) { throw BinderException("Delta databases do not support creating table functions"); } optional_ptr DeltaSchemaEntry::CreateCopyFunction(CatalogTransaction transaction, - CreateCopyFunctionInfo &info) { + CreateCopyFunctionInfo &info) { throw BinderException("Delta databases do not support creating copy functions"); } optional_ptr DeltaSchemaEntry::CreatePragmaFunction(CatalogTransaction transaction, - CreatePragmaFunctionInfo &info) { + CreatePragmaFunctionInfo &info) { throw BinderException("Delta databases do not support creating pragma functions"); } -optional_ptr DeltaSchemaEntry::CreateCollation(CatalogTransaction transaction, CreateCollationInfo &info) { +optional_ptr DeltaSchemaEntry::CreateCollation(CatalogTransaction transaction, + CreateCollationInfo &info) { throw BinderException("Delta databases do not support creating collations"); } void DeltaSchemaEntry::Alter(CatalogTransaction transaction, AlterInfo &info) { - throw NotImplementedException("Delta tables do not support altering"); + throw NotImplementedException("Delta tables do not support altering"); } bool CatalogTypeIsSupported(CatalogType type) { @@ -105,80 +104,80 @@ bool CatalogTypeIsSupported(CatalogType type) { } } -static unique_ptr CreateTableEntry(ClientContext &context, DeltaCatalog &delta_catalog, DeltaSchemaEntry &schema_entry) { - auto snapshot = make_shared_ptr(context, delta_catalog.GetDBPath()); +static unique_ptr CreateTableEntry(ClientContext &context, DeltaCatalog &delta_catalog, + DeltaSchemaEntry &schema_entry) { + auto snapshot = make_shared_ptr(context, delta_catalog.GetDBPath()); - // Get the names and types from the delta snapshot - vector return_types; - vector names; - snapshot->Bind(return_types, names); + // Get the names and types from the delta snapshot + vector return_types; + vector names; + snapshot->Bind(return_types, names); - CreateTableInfo table_info; - for (idx_t i = 0; i < return_types.size(); i++) { - table_info.columns.AddColumn(ColumnDefinition(names[i], return_types[i])); - } - table_info.table = DEFAULT_DELTA_TABLE; - auto table_entry = make_uniq(delta_catalog, schema_entry, table_info); - table_entry->snapshot = std::move(snapshot); + CreateTableInfo table_info; + for (idx_t i = 0; i < return_types.size(); i++) { + table_info.columns.AddColumn(ColumnDefinition(names[i], return_types[i])); + } + table_info.table = DEFAULT_DELTA_TABLE; + auto table_entry = make_uniq(delta_catalog, schema_entry, table_info); + table_entry->snapshot = std::move(snapshot); - return table_entry; + return table_entry; } void DeltaSchemaEntry::Scan(ClientContext &context, CatalogType type, - const std::function &callback) { + const std::function &callback) { if (!CatalogTypeIsSupported(type)) { - auto transaction = catalog.GetCatalogTransaction(context); + auto transaction = catalog.GetCatalogTransaction(context); auto default_table = GetEntry(transaction, type, DEFAULT_DELTA_TABLE); - if (default_table) { - callback(*default_table); - } + if (default_table) { + callback(*default_table); + } } - } void DeltaSchemaEntry::Scan(CatalogType type, const std::function &callback) { throw NotImplementedException("Scan without context not supported"); } void DeltaSchemaEntry::DropEntry(ClientContext &context, DropInfo &info) { - throw NotImplementedException("Delta tables do not support dropping"); + throw NotImplementedException("Delta tables do not support dropping"); } optional_ptr DeltaSchemaEntry::GetEntry(CatalogTransaction transaction, CatalogType type, - const string &name) { - if (!transaction.HasContext()) { - throw NotImplementedException("Can not DeltaSchemaEntry::GetEntry without context"); - } - auto &context = transaction.GetContext(); - - if (type == CatalogType::TABLE_ENTRY && name == DEFAULT_DELTA_TABLE) { - auto &delta_transaction = GetDeltaTransaction(transaction); - auto &delta_catalog = catalog.Cast(); - - if (delta_transaction.table_entry) { - return *delta_transaction.table_entry; - } - - if (delta_catalog.UseCachedSnapshot()) { - unique_lock l(lock); - if (!cached_table) { - cached_table = CreateTableEntry(context, delta_catalog, *this); - } - return *cached_table; - } - - delta_transaction.table_entry = CreateTableEntry(context, delta_catalog, *this); - return *delta_transaction.table_entry; - } + const string &name) { + if (!transaction.HasContext()) { + throw NotImplementedException("Can not DeltaSchemaEntry::GetEntry without context"); + } + auto &context = transaction.GetContext(); + + if (type == CatalogType::TABLE_ENTRY && name == DEFAULT_DELTA_TABLE) { + auto &delta_transaction = GetDeltaTransaction(transaction); + auto &delta_catalog = catalog.Cast(); + + if (delta_transaction.table_entry) { + return *delta_transaction.table_entry; + } + + if (delta_catalog.UseCachedSnapshot()) { + unique_lock l(lock); + if (!cached_table) { + cached_table = CreateTableEntry(context, delta_catalog, *this); + } + return *cached_table; + } + + delta_transaction.table_entry = CreateTableEntry(context, delta_catalog, *this); + return *delta_transaction.table_entry; + } - return nullptr; + return nullptr; } optional_ptr DeltaSchemaEntry::GetCachedTable() { - lock_guard lck(lock); - if (cached_table) { - return *cached_table; - } - return nullptr; + lock_guard lck(lock); + if (cached_table) { + return *cached_table; + } + return nullptr; } } // namespace duckdb diff --git a/src/storage/delta_table_entry.cpp b/src/storage/delta_table_entry.cpp index f82caa4..6f7f829 100644 --- a/src/storage/delta_table_entry.cpp +++ b/src/storage/delta_table_entry.cpp @@ -32,7 +32,7 @@ unique_ptr DeltaTableEntry::GetStatistics(ClientContext &context } void DeltaTableEntry::BindUpdateConstraints(Binder &binder, LogicalGet &, LogicalProjection &, LogicalUpdate &, - ClientContext &) { + ClientContext &) { throw NotImplementedException("BindUpdateConstraints for delta table"); } @@ -43,11 +43,11 @@ TableFunction DeltaTableEntry::GetScanFunction(ClientContext &context, unique_pt auto delta_scan_function = delta_function_set.functions.GetFunctionByArguments(context, {LogicalType::VARCHAR}); auto &delta_catalog = catalog.Cast(); - // Copy over the internal kernel snapshot - auto function_info = make_shared_ptr(); + // Copy over the internal kernel snapshot + auto function_info = make_shared_ptr(); - function_info->snapshot = this->snapshot; - delta_scan_function.function_info = std::move(function_info); + function_info->snapshot = this->snapshot; + delta_scan_function.function_info = std::move(function_info); vector inputs = {delta_catalog.GetDBPath()}; named_parameter_map_t param_map; @@ -55,7 +55,6 @@ TableFunction DeltaTableEntry::GetScanFunction(ClientContext &context, unique_pt vector names; TableFunctionRef empty_ref; - TableFunctionBindInput bind_input(inputs, param_map, return_types, names, nullptr, nullptr, delta_scan_function, empty_ref); diff --git a/src/storage/delta_transaction.cpp b/src/storage/delta_transaction.cpp index 3846c47..2af1a46 100644 --- a/src/storage/delta_transaction.cpp +++ b/src/storage/delta_transaction.cpp @@ -27,7 +27,7 @@ void DeltaTransaction::Commit() { void DeltaTransaction::Rollback() { if (transaction_state == DeltaTransactionState::TRANSACTION_STARTED) { transaction_state = DeltaTransactionState::TRANSACTION_FINISHED; - // NOP: we only support read-only transactions currently + // NOP: we only support read-only transactions currently } } @@ -36,7 +36,7 @@ DeltaTransaction &DeltaTransaction::Get(ClientContext &context, Catalog &catalog } AccessMode DeltaTransaction::GetAccessMode() const { - return access_mode; + return access_mode; } } // namespace duckdb From f240e459aacc141ef9043b877baa56df3945c106 Mon Sep 17 00:00:00 2001 From: Sam Ansmink Date: Tue, 12 Nov 2024 13:03:02 +0100 Subject: [PATCH 7/8] add ci tools version --- .github/workflows/MainDistributionPipeline.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/MainDistributionPipeline.yml b/.github/workflows/MainDistributionPipeline.yml index f43fd4e..0be63d2 100644 --- a/.github/workflows/MainDistributionPipeline.yml +++ b/.github/workflows/MainDistributionPipeline.yml @@ -32,6 +32,7 @@ jobs: secrets: inherit with: extension_name: delta + ci_tools_version: main duckdb_version: 0ccf3c25cc exclude_archs: 'wasm_mvp;wasm_eh;wasm_threads;windows_amd64_mingw' deploy_latest: ${{ startsWith(github.ref, 'refs/tags/v') || github.ref == 'refs/heads/main' }} From febbb798e3b0e7d8b60b6c48956655a041e15f6a Mon Sep 17 00:00:00 2001 From: Sam Ansmink Date: Tue, 12 Nov 2024 15:08:15 +0100 Subject: [PATCH 8/8] also skip old arch label --- .github/workflows/MainDistributionPipeline.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/MainDistributionPipeline.yml b/.github/workflows/MainDistributionPipeline.yml index 0be63d2..fb259e8 100644 --- a/.github/workflows/MainDistributionPipeline.yml +++ b/.github/workflows/MainDistributionPipeline.yml @@ -21,7 +21,7 @@ jobs: ci_tools_version: main extension_name: delta enable_rust: true - exclude_archs: 'wasm_mvp;wasm_eh;wasm_threads;windows_amd64_mingw' + exclude_archs: 'wasm_mvp;wasm_eh;wasm_threads;windows_amd64_rtools;windows_amd64_mingw' extra_toolchains: 'python3' vcpkg_commit: c82f74667287d3dc386bce81e44964370c91a289 @@ -34,5 +34,5 @@ jobs: extension_name: delta ci_tools_version: main duckdb_version: 0ccf3c25cc - exclude_archs: 'wasm_mvp;wasm_eh;wasm_threads;windows_amd64_mingw' + exclude_archs: 'wasm_mvp;wasm_eh;wasm_threads;windows_amd64_rtools;windows_amd64_mingw' deploy_latest: ${{ startsWith(github.ref, 'refs/tags/v') || github.ref == 'refs/heads/main' }}