diff --git a/CMakeLists.txt b/CMakeLists.txt index 9d9c251..72e85bc 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -66,10 +66,15 @@ ExternalProject_Add( GIT_REPOSITORY "https://github.com/delta-incubator/delta-kernel-rs" GIT_TAG main CONFIGURE_COMMAND "" - PATCH_COMMAND git apply < ${CMAKE_SOURCE_DIR}/../patches/delta_kernel_rs/enable_s3.patch + UPDATE_COMMAND "" + PATCH_COMMAND python3 ${CMAKE_SOURCE_DIR}/../scripts/apply_patches.py ${CMAKE_SOURCE_DIR}/../patches/delta_kernel_rs BUILD_IN_SOURCE 1 + # Build debug build BUILD_COMMAND cargo build --package delta_kernel_ffi --all-features --target=${RUST_PLATFORM_TARGET} + # Build release build COMMAND cargo build --package delta_kernel_ffi --all-features --release --target=${RUST_PLATFORM_TARGET} + # Build DATs + COMMAND cargo build --manifest-path=${CMAKE_BINARY_DIR}/rust/src/delta_kernel/acceptance/Cargo.toml BUILD_BYPRODUCTS "${CMAKE_BINARY_DIR}/rust/src/delta_kernel/target/${RUST_PLATFORM_TARGET}/debug/libdelta_kernel_ffi.a" BUILD_BYPRODUCTS "${CMAKE_BINARY_DIR}/rust/src/delta_kernel/target/${RUST_PLATFORM_TARGET}/release/libdelta_kernel_ffi.a" BUILD_BYPRODUCTS "${CMAKE_BINARY_DIR}/rust/src/delta_kernel/target/ffi-headers/delta_kernel_ffi.h" diff --git a/Makefile b/Makefile index 397b07d..05db957 100644 --- a/Makefile +++ b/Makefile @@ -4,10 +4,22 @@ PROJ_DIR := $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) EXT_NAME=deltatable EXT_CONFIG=${PROJ_DIR}extension_config.cmake +# Set test paths +test_release: export DELTA_KERNEL_TESTS_PATH=./build/release/rust/src/delta_kernel/kernel/tests/data +test_release: export DAT_PATH=./build/release/rust/src/delta_kernel/acceptance/tests/dat + +test_debug: export DELTA_KERNEL_TESTS_PATH=./build/debug/rust/src/delta_kernel/kernel/tests/data +test_debug: export DAT_PATH=./build/debug/rust/src/delta_kernel/acceptance/tests/dat + # Include the Makefile from extension-ci-tools include extension-ci-tools/makefiles/duckdb_extension.Makefile reldebug: mkdir -p build/reldebug && \ cmake $(GENERATOR) $(BUILD_FLAGS) $(EXT_RELEASE_FLAGS) -DCMAKE_BUILD_TYPE=RelWithDebInfo -S ./duckdb/ -B build/reldebug && \ - cmake --build build/reldebug --config RelWithDebInfo \ No newline at end of file + cmake --build build/reldebug --config RelWithDebInfo + +# Generate some test data to test with +generate-data: + python3 -m pip install delta-spark duckdb pandas deltalake pyspark delta + python3 scripts/generate_test_data.py diff --git a/README.md b/README.md index d54c2e8..e9aa0e2 100644 --- a/README.md +++ b/README.md @@ -49,3 +49,24 @@ More features coming soon! # Building See the [Extension Template](https://github.com/duckdb/extension-template) for generic build instructions + +# Running tests +There are various tests available for the delta extension: +1. Delta Acceptence Test (DAT) based tests in `/test/sql/dat` +2. delta-kernel-rs based tests in `/test/sql/delta_kernel_rs` +3. Generated data based tests in `tests/sql/generated` (generated using [delta-rs](https://delta-io.github.io/delta-rs/), [PySpark](https://spark.apache.org/docs/latest/api/python/index.html), and DuckDB) + +To run the first 2 sets of tests: +```shell +make test_debug +``` +or in release mode +```shell +make test +``` + +To also run the tests on generated data: +```shell +make generate-data +GENERATED_DATA_AVAILABLE=1 make test +``` \ No newline at end of file diff --git a/scripts/apply_patches.py b/scripts/apply_patches.py new file mode 100644 index 0000000..eb49729 --- /dev/null +++ b/scripts/apply_patches.py @@ -0,0 +1,43 @@ +#!/usr/bin/env python3 +import sys +import glob +import subprocess +import os + +# Get the directory and construct the patch file pattern +directory = sys.argv[1] +patch_pattern = f"{directory}*.patch" + +# Early out if the patches were applied here before +DUCKDB_APPLIED_PATCHES_FILE = './applied_duckdb_patches' +if os.path.isfile(DUCKDB_APPLIED_PATCHES_FILE): + sys.exit(0) + +# Find patch files matching the pattern +patches = glob.glob(patch_pattern) + +import os +def touch(path): + with open(path, 'a'): + os.utime(path, None) + +def raise_error(error_msg): + sys.stderr.write(error_message + '\n') + sys.exit(1) + +patches = os.listdir(directory) +for patch in patches: + if not patch.endswith('.patch'): + raise_error( + f'Patch file {patch} found in directory {directory} does not end in ".patch" - rename the patch file' + ) + +# Apply each patch file using git apply +for patch in patches: + print(f"Applying patch: {patch}\n") + subprocess.run( + ["git", "apply", "--ignore-space-change", "--ignore-whitespace", os.path.join(directory, patch)], check=True + ) + +# Mark patches applied because CMake is wonky +touch(DUCKDB_APPLIED_PATCHES_FILE) \ No newline at end of file diff --git a/src/functions/delta_scan.cpp b/src/functions/delta_scan.cpp index 0f2cb30..73c0fbd 100644 --- a/src/functions/delta_scan.cpp +++ b/src/functions/delta_scan.cpp @@ -115,8 +115,12 @@ static ffi::EngineBuilder* CreateBuilder(ClientContext &context, const string &p auto endpoint = kv_secret.TryGetValue("endpoint"); auto session_token = kv_secret.TryGetValue("session_token"); - ffi::set_builder_option(builder, to_delta_string_slice("aws_access_key_id"), to_delta_string_slice(key_id.ToString())); - ffi::set_builder_option(builder, to_delta_string_slice("aws_secret_access_key"), to_delta_string_slice(secret.ToString())); + if (!key_id.ToString().empty()) { + ffi::set_builder_option(builder, to_delta_string_slice("aws_access_key_id"), to_delta_string_slice(key_id.ToString())); + } + if (!secret.ToString().empty()) { + ffi::set_builder_option(builder, to_delta_string_slice("aws_secret_access_key"), to_delta_string_slice(secret.ToString())); + } ffi::set_builder_option(builder, to_delta_string_slice("aws_region"), to_delta_string_slice(region.ToString())); return builder; @@ -420,6 +424,10 @@ unique_ptr DeltaMultiFileReader::InitializeGlobalSta case_insensitive_map_t selected_columns; for (idx_t i = 0; i < global_column_ids.size(); i++) { auto global_id = global_column_ids[i]; + if (IsRowIdColumnId(global_id)) { + continue; + } + auto &global_name = global_names[global_id]; selected_columns.insert({global_name, i}); } diff --git a/test/sql/dat/basic_append.test b/test/sql/dat/basic_append.test index 9574ad6..013f91d 100644 --- a/test/sql/dat/basic_append.test +++ b/test/sql/dat/basic_append.test @@ -6,7 +6,7 @@ require parquet require delta -require-env DAT_AVAILABLE +require-env DAT_PATH # Note: this table has 2 parquet files: # - part-00000-ef42f28f-e8e8-4d54-b51f-c3af96c72a44-c000.snappy.parquet @@ -16,7 +16,7 @@ require-env DAT_AVAILABLE query III SELECT * -FROM delta_scan('./delta-kernel-rs/acceptance/tests/dat/out/reader_tests/generated/basic_append/delta') +FROM delta_scan('${DAT_PATH}/out/reader_tests/generated/basic_append/delta') ---- d 4 4.4 e 5 5.5 @@ -26,7 +26,7 @@ c 3 3.3 query I SELECT letter -FROM delta_scan('./delta-kernel-rs/acceptance/tests/dat/out/reader_tests/generated/basic_append/delta') +FROM delta_scan('${DAT_PATH}/out/reader_tests/generated/basic_append/delta') ---- d e @@ -36,7 +36,7 @@ c query I SELECT number -FROM delta_scan('./delta-kernel-rs/acceptance/tests/dat/out/reader_tests/generated/basic_append/delta') +FROM delta_scan('${DAT_PATH}/out/reader_tests/generated/basic_append/delta') ---- 4 5 @@ -50,7 +50,7 @@ mode skip # Now we add a filter that filters out one of the files query II SELECT letter, number -FROM delta_scan('./delta-kernel-rs/acceptance/tests/dat/out/reader_tests/generated/basic_append/delta') +FROM delta_scan('${DAT_PATH}/out/reader_tests/generated/basic_append/delta') WHERE number < 2 ---- a 1 @@ -60,7 +60,7 @@ mode unskip # Now we add a filter that filters out the other file query III SELECT a_float, letter, number, -FROM delta_scan('./delta-kernel-rs/acceptance/tests/dat/out/reader_tests/generated/basic_append/delta') +FROM delta_scan('${DAT_PATH}/out/reader_tests/generated/basic_append/delta') WHERE number > 4 ---- 5.5 e 5 @@ -70,6 +70,6 @@ mode skip # Now we add a filter that filters out all columns query III SELECT a_float, number, letter -FROM delta_scan('./delta-kernel-rs/acceptance/tests/dat/out/reader_tests/generated/basic_append/delta') +FROM delta_scan('${DAT_PATH}/out/reader_tests/generated/basic_append/delta') WHERE number > 6 ---- diff --git a/test/sql/dat/custom_parameters.test b/test/sql/dat/custom_parameters.test index 2f4509b..a942ae6 100644 --- a/test/sql/dat/custom_parameters.test +++ b/test/sql/dat/custom_parameters.test @@ -6,7 +6,7 @@ require parquet require delta -require-env DAT_AVAILABLE +require-env DAT_PATH # Note: this table has 2 parquet files: # - part-00000-ef42f28f-e8e8-4d54-b51f-c3af96c72a44-c000.snappy.parquet @@ -17,7 +17,7 @@ require-env DAT_AVAILABLE # Test with appends and several custom options query IIIII SELECT parse_filename(filename), file_row_number, letter, delta_file_number, number -FROM delta_scan('./delta-kernel-rs/acceptance/tests/dat/out/reader_tests/generated/basic_append/delta', delta_file_number=1, file_row_number=1, filename=1) +FROM delta_scan('${DAT_PATH}/out/reader_tests/generated/basic_append/delta', delta_file_number=1, file_row_number=1, filename=1) ---- part-00000-c156ac8b-f738-4479-803d-750072dd4c51-c000.snappy.parquet 0 d 0 4 part-00000-c156ac8b-f738-4479-803d-750072dd4c51-c000.snappy.parquet 1 e 0 5 diff --git a/test/sql/dat/primitive_types.test b/test/sql/dat/primitive_types.test index 5f0f551..2c2fd03 100644 --- a/test/sql/dat/primitive_types.test +++ b/test/sql/dat/primitive_types.test @@ -6,11 +6,11 @@ require parquet require delta -require-env DAT_AVAILABLE +require-env DAT_PATH query IIIIIIIIIIII SELECT * -FROM delta_scan('./delta-kernel-rs/acceptance/tests/dat/out/reader_tests/generated/all_primitive_types/delta') +FROM delta_scan('${DAT_PATH}/out/reader_tests/generated/all_primitive_types/delta') ---- 0 0 0 0 0 0.0 0.0 true (empty) 10.000 1970-01-01 1970-01-01 00:00:00 1 1 1 1 1 1.0 1.0 false \x00 11.000 1970-01-02 1970-01-01 01:00:00 diff --git a/test/sql/delta_kernel_rs/basic_partitioned.test b/test/sql/delta_kernel_rs/basic_partitioned.test index 79024e6..79804d1 100644 --- a/test/sql/delta_kernel_rs/basic_partitioned.test +++ b/test/sql/delta_kernel_rs/basic_partitioned.test @@ -6,10 +6,12 @@ require parquet require delta +require-env DELTA_KERNEL_TESTS_PATH + # FIXME: this fails due some weird error mode skip statement error -SELECT * FROM delta_scan('./delta-kernel-rs/kernel/tests/data/basic_partitioned') +SELECT * FROM delta_scan('${DELTA_KERNEL_TESTS_PATH}/basic_partitioned') ---- Failed to read file "/Users/sam/Development/delta-kernel-testing/delta-kernel-rs/kernel/tests/data/basic_partitioned/letter=__HIVE_DEFAULT_PARTITION__ diff --git a/test/sql/delta_kernel_rs/simple_with_dv.test b/test/sql/delta_kernel_rs/simple_with_dv.test index 13615d1..605cd85 100644 --- a/test/sql/delta_kernel_rs/simple_with_dv.test +++ b/test/sql/delta_kernel_rs/simple_with_dv.test @@ -6,9 +6,11 @@ require parquet require delta +require-env DELTA_KERNEL_TESTS_PATH + # Simplest example query I -FROM delta_scan('./delta-kernel-rs/kernel/tests/data/table-with-dv-small/') +FROM delta_scan('${DELTA_KERNEL_TESTS_PATH}/table-with-dv-small/') ---- 1 2 @@ -21,7 +23,7 @@ FROM delta_scan('./delta-kernel-rs/kernel/tests/data/table-with-dv-small/') # With filter: ensures the deletion vector is applied properly on top of pushed down filters query I -FROM delta_scan('./delta-kernel-rs/kernel/tests/data/table-with-dv-small/') +FROM delta_scan('${DELTA_KERNEL_TESTS_PATH}/table-with-dv-small/') WHERE value > 3 ---- 4 @@ -32,7 +34,7 @@ WHERE value > 3 # With filter: ensures the deletion vector is applied properly on top of pushed down filters with the file_row_number column query II -FROM delta_scan('./delta-kernel-rs/kernel/tests/data/table-with-dv-small/', file_row_number=1) +FROM delta_scan('${DELTA_KERNEL_TESTS_PATH}/table-with-dv-small/', file_row_number=1) WHERE value > 3 ---- 4 4 @@ -43,7 +45,7 @@ WHERE value > 3 # With filter and a delta scan based extra constant column query II -FROM delta_scan('./delta-kernel-rs/kernel/tests/data/table-with-dv-small/', delta_file_number=1) +FROM delta_scan('${DELTA_KERNEL_TESTS_PATH}/table-with-dv-small/', delta_file_number=1) WHERE value > 3 ---- 4 0 @@ -55,7 +57,7 @@ WHERE value > 3 # With filter, delta-extension-originated const column, and parquet-originated const column query III SELECT value, parse_filename(filename), delta_file_number -FROM delta_scan('./delta-kernel-rs/kernel/tests/data/table-with-dv-small/', delta_file_number=1, filename=1) +FROM delta_scan('${DELTA_KERNEL_TESTS_PATH}/table-with-dv-small/', delta_file_number=1, filename=1) WHERE value > 3 ---- 4 part-00000-fae5310a-a37d-4e51-827b-c3d5516560ca-c000.snappy.parquet 0 @@ -67,7 +69,7 @@ WHERE value > 3 # With PRUNED filter, delta-extension-originated const column, and parquet-originated const column query II SELECT parse_filename(filename), delta_file_number -FROM delta_scan('./delta-kernel-rs/kernel/tests/data/table-with-dv-small/', delta_file_number=1, filename=1) +FROM delta_scan('${DELTA_KERNEL_TESTS_PATH}/table-with-dv-small/', delta_file_number=1, filename=1) WHERE value > 3 ---- part-00000-fae5310a-a37d-4e51-827b-c3d5516560ca-c000.snappy.parquet 0 @@ -79,7 +81,7 @@ part-00000-fae5310a-a37d-4e51-827b-c3d5516560ca-c000.snappy.parquet 0 # With PRUNED filters, delta-extension-originated const column, and parquet-originated const column query I SELECT delta_file_number -FROM delta_scan('./delta-kernel-rs/kernel/tests/data/table-with-dv-small/', delta_file_number=1, filename=1) +FROM delta_scan('${DELTA_KERNEL_TESTS_PATH}/table-with-dv-small/', delta_file_number=1, filename=1) WHERE value > 3 and filename is not null ---- 0 @@ -91,7 +93,7 @@ WHERE value > 3 and filename is not null # Enabling the file_row_number option, but projecting it out query I SELECT value -FROM delta_scan('./delta-kernel-rs/kernel/tests/data/table-with-dv-small/', file_row_number=1) +FROM delta_scan('${DELTA_KERNEL_TESTS_PATH}/table-with-dv-small/', file_row_number=1) ---- 1 2 diff --git a/test/sql/delta_kernel_rs/simple_without_dv.test b/test/sql/delta_kernel_rs/simple_without_dv.test index 01d5a24..8f61ae4 100644 --- a/test/sql/delta_kernel_rs/simple_without_dv.test +++ b/test/sql/delta_kernel_rs/simple_without_dv.test @@ -6,9 +6,11 @@ require parquet require delta +require-env DELTA_KERNEL_TESTS_PATH + # Filename param (i.e. MultiFileReader provided) query II -SELECT value, parse_filename(filename) FROM delta_scan('./delta-kernel-rs/kernel/tests/data/table-without-dv-small', filename=1) +SELECT value, parse_filename(filename) FROM delta_scan('${DELTA_KERNEL_TESTS_PATH}/table-without-dv-small', filename=1) ---- 0 part-00000-517f5d32-9c95-48e8-82b4-0229cc194867-c000.snappy.parquet 1 part-00000-517f5d32-9c95-48e8-82b4-0229cc194867-c000.snappy.parquet @@ -23,7 +25,7 @@ SELECT value, parse_filename(filename) FROM delta_scan('./delta-kernel-rs/kernel # FileRowNumer param (i.e. ParquetReader provided) query II -SELECT * FROM delta_scan('./delta-kernel-rs/kernel/tests/data/table-without-dv-small', file_row_number=1) +SELECT * FROM delta_scan('${DELTA_KERNEL_TESTS_PATH}/table-without-dv-small', file_row_number=1) ---- 0 0 1 1