Skip to content

Commit

Permalink
clean up testing, clean up tests, fix small bug
Browse files Browse the repository at this point in the history
  • Loading branch information
samansmink committed May 15, 2024
1 parent 84930fa commit 7236aa4
Show file tree
Hide file tree
Showing 11 changed files with 121 additions and 26 deletions.
7 changes: 6 additions & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -66,10 +66,15 @@ ExternalProject_Add(
GIT_REPOSITORY "https://github.com/delta-incubator/delta-kernel-rs"
GIT_TAG main
CONFIGURE_COMMAND ""
PATCH_COMMAND git apply < ${CMAKE_SOURCE_DIR}/../patches/delta_kernel_rs/enable_s3.patch
UPDATE_COMMAND ""
PATCH_COMMAND python3 ${CMAKE_SOURCE_DIR}/../scripts/apply_patches.py ${CMAKE_SOURCE_DIR}/../patches/delta_kernel_rs
BUILD_IN_SOURCE 1
# Build debug build
BUILD_COMMAND cargo build --package delta_kernel_ffi --all-features --target=${RUST_PLATFORM_TARGET}
# Build release build
COMMAND cargo build --package delta_kernel_ffi --all-features --release --target=${RUST_PLATFORM_TARGET}
# Build DATs
COMMAND cargo build --manifest-path=${CMAKE_BINARY_DIR}/rust/src/delta_kernel/acceptance/Cargo.toml
BUILD_BYPRODUCTS "${CMAKE_BINARY_DIR}/rust/src/delta_kernel/target/${RUST_PLATFORM_TARGET}/debug/libdelta_kernel_ffi.a"
BUILD_BYPRODUCTS "${CMAKE_BINARY_DIR}/rust/src/delta_kernel/target/${RUST_PLATFORM_TARGET}/release/libdelta_kernel_ffi.a"
BUILD_BYPRODUCTS "${CMAKE_BINARY_DIR}/rust/src/delta_kernel/target/ffi-headers/delta_kernel_ffi.h"
Expand Down
14 changes: 13 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,22 @@ PROJ_DIR := $(dir $(abspath $(lastword $(MAKEFILE_LIST))))
EXT_NAME=deltatable
EXT_CONFIG=${PROJ_DIR}extension_config.cmake

# Set test paths
test_release: export DELTA_KERNEL_TESTS_PATH=./build/release/rust/src/delta_kernel/kernel/tests/data
test_release: export DAT_PATH=./build/release/rust/src/delta_kernel/acceptance/tests/dat

test_debug: export DELTA_KERNEL_TESTS_PATH=./build/debug/rust/src/delta_kernel/kernel/tests/data
test_debug: export DAT_PATH=./build/debug/rust/src/delta_kernel/acceptance/tests/dat

# Include the Makefile from extension-ci-tools
include extension-ci-tools/makefiles/duckdb_extension.Makefile

reldebug:
mkdir -p build/reldebug && \
cmake $(GENERATOR) $(BUILD_FLAGS) $(EXT_RELEASE_FLAGS) -DCMAKE_BUILD_TYPE=RelWithDebInfo -S ./duckdb/ -B build/reldebug && \
cmake --build build/reldebug --config RelWithDebInfo
cmake --build build/reldebug --config RelWithDebInfo

# Generate some test data to test with
generate-data:
python3 -m pip install delta-spark duckdb pandas deltalake pyspark delta
python3 scripts/generate_test_data.py
21 changes: 21 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -49,3 +49,24 @@ More features coming soon!

# Building
See the [Extension Template](https://github.com/duckdb/extension-template) for generic build instructions

# Running tests
There are various tests available for the delta extension:
1. Delta Acceptence Test (DAT) based tests in `/test/sql/dat`
2. delta-kernel-rs based tests in `/test/sql/delta_kernel_rs`
3. Generated data based tests in `tests/sql/generated` (generated using [delta-rs](https://delta-io.github.io/delta-rs/), [PySpark](https://spark.apache.org/docs/latest/api/python/index.html), and DuckDB)

To run the first 2 sets of tests:
```shell
make test_debug
```
or in release mode
```shell
make test
```

To also run the tests on generated data:
```shell
make generate-data
GENERATED_DATA_AVAILABLE=1 make test
```
43 changes: 43 additions & 0 deletions scripts/apply_patches.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
#!/usr/bin/env python3
import sys
import glob
import subprocess
import os

# Get the directory and construct the patch file pattern
directory = sys.argv[1]
patch_pattern = f"{directory}*.patch"

# Early out if the patches were applied here before
DUCKDB_APPLIED_PATCHES_FILE = './applied_duckdb_patches'
if os.path.isfile(DUCKDB_APPLIED_PATCHES_FILE):
sys.exit(0)

# Find patch files matching the pattern
patches = glob.glob(patch_pattern)

import os
def touch(path):
with open(path, 'a'):
os.utime(path, None)

def raise_error(error_msg):
sys.stderr.write(error_message + '\n')
sys.exit(1)

patches = os.listdir(directory)
for patch in patches:
if not patch.endswith('.patch'):
raise_error(
f'Patch file {patch} found in directory {directory} does not end in ".patch" - rename the patch file'
)

# Apply each patch file using git apply
for patch in patches:
print(f"Applying patch: {patch}\n")
subprocess.run(
["git", "apply", "--ignore-space-change", "--ignore-whitespace", os.path.join(directory, patch)], check=True
)

# Mark patches applied because CMake is wonky
touch(DUCKDB_APPLIED_PATCHES_FILE)
12 changes: 10 additions & 2 deletions src/functions/delta_scan.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -115,8 +115,12 @@ static ffi::EngineBuilder* CreateBuilder(ClientContext &context, const string &p
auto endpoint = kv_secret.TryGetValue("endpoint");
auto session_token = kv_secret.TryGetValue("session_token");

ffi::set_builder_option(builder, to_delta_string_slice("aws_access_key_id"), to_delta_string_slice(key_id.ToString()));
ffi::set_builder_option(builder, to_delta_string_slice("aws_secret_access_key"), to_delta_string_slice(secret.ToString()));
if (!key_id.ToString().empty()) {
ffi::set_builder_option(builder, to_delta_string_slice("aws_access_key_id"), to_delta_string_slice(key_id.ToString()));
}
if (!secret.ToString().empty()) {
ffi::set_builder_option(builder, to_delta_string_slice("aws_secret_access_key"), to_delta_string_slice(secret.ToString()));
}
ffi::set_builder_option(builder, to_delta_string_slice("aws_region"), to_delta_string_slice(region.ToString()));

return builder;
Expand Down Expand Up @@ -420,6 +424,10 @@ unique_ptr<MultiFileReaderGlobalState> DeltaMultiFileReader::InitializeGlobalSta
case_insensitive_map_t<idx_t> selected_columns;
for (idx_t i = 0; i < global_column_ids.size(); i++) {
auto global_id = global_column_ids[i];
if (IsRowIdColumnId(global_id)) {
continue;
}

auto &global_name = global_names[global_id];
selected_columns.insert({global_name, i});
}
Expand Down
14 changes: 7 additions & 7 deletions test/sql/dat/basic_append.test
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ require parquet

require delta

require-env DAT_AVAILABLE
require-env DAT_PATH

# Note: this table has 2 parquet files:
# - part-00000-ef42f28f-e8e8-4d54-b51f-c3af96c72a44-c000.snappy.parquet
Expand All @@ -16,7 +16,7 @@ require-env DAT_AVAILABLE

query III
SELECT *
FROM delta_scan('./delta-kernel-rs/acceptance/tests/dat/out/reader_tests/generated/basic_append/delta')
FROM delta_scan('${DAT_PATH}/out/reader_tests/generated/basic_append/delta')
----
d 4 4.4
e 5 5.5
Expand All @@ -26,7 +26,7 @@ c 3 3.3

query I
SELECT letter
FROM delta_scan('./delta-kernel-rs/acceptance/tests/dat/out/reader_tests/generated/basic_append/delta')
FROM delta_scan('${DAT_PATH}/out/reader_tests/generated/basic_append/delta')
----
d
e
Expand All @@ -36,7 +36,7 @@ c

query I
SELECT number
FROM delta_scan('./delta-kernel-rs/acceptance/tests/dat/out/reader_tests/generated/basic_append/delta')
FROM delta_scan('${DAT_PATH}/out/reader_tests/generated/basic_append/delta')
----
4
5
Expand All @@ -50,7 +50,7 @@ mode skip
# Now we add a filter that filters out one of the files
query II
SELECT letter, number
FROM delta_scan('./delta-kernel-rs/acceptance/tests/dat/out/reader_tests/generated/basic_append/delta')
FROM delta_scan('${DAT_PATH}/out/reader_tests/generated/basic_append/delta')
WHERE number < 2
----
a 1
Expand All @@ -60,7 +60,7 @@ mode unskip
# Now we add a filter that filters out the other file
query III
SELECT a_float, letter, number,
FROM delta_scan('./delta-kernel-rs/acceptance/tests/dat/out/reader_tests/generated/basic_append/delta')
FROM delta_scan('${DAT_PATH}/out/reader_tests/generated/basic_append/delta')
WHERE number > 4
----
5.5 e 5
Expand All @@ -70,6 +70,6 @@ mode skip
# Now we add a filter that filters out all columns
query III
SELECT a_float, number, letter
FROM delta_scan('./delta-kernel-rs/acceptance/tests/dat/out/reader_tests/generated/basic_append/delta')
FROM delta_scan('${DAT_PATH}/out/reader_tests/generated/basic_append/delta')
WHERE number > 6
----
4 changes: 2 additions & 2 deletions test/sql/dat/custom_parameters.test
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ require parquet

require delta

require-env DAT_AVAILABLE
require-env DAT_PATH

# Note: this table has 2 parquet files:
# - part-00000-ef42f28f-e8e8-4d54-b51f-c3af96c72a44-c000.snappy.parquet
Expand All @@ -17,7 +17,7 @@ require-env DAT_AVAILABLE
# Test with appends and several custom options
query IIIII
SELECT parse_filename(filename), file_row_number, letter, delta_file_number, number
FROM delta_scan('./delta-kernel-rs/acceptance/tests/dat/out/reader_tests/generated/basic_append/delta', delta_file_number=1, file_row_number=1, filename=1)
FROM delta_scan('${DAT_PATH}/out/reader_tests/generated/basic_append/delta', delta_file_number=1, file_row_number=1, filename=1)
----
part-00000-c156ac8b-f738-4479-803d-750072dd4c51-c000.snappy.parquet 0 d 0 4
part-00000-c156ac8b-f738-4479-803d-750072dd4c51-c000.snappy.parquet 1 e 0 5
Expand Down
4 changes: 2 additions & 2 deletions test/sql/dat/primitive_types.test
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,11 @@ require parquet

require delta

require-env DAT_AVAILABLE
require-env DAT_PATH

query IIIIIIIIIIII
SELECT *
FROM delta_scan('./delta-kernel-rs/acceptance/tests/dat/out/reader_tests/generated/all_primitive_types/delta')
FROM delta_scan('${DAT_PATH}/out/reader_tests/generated/all_primitive_types/delta')
----
0 0 0 0 0 0.0 0.0 true (empty) 10.000 1970-01-01 1970-01-01 00:00:00
1 1 1 1 1 1.0 1.0 false \x00 11.000 1970-01-02 1970-01-01 01:00:00
Expand Down
4 changes: 3 additions & 1 deletion test/sql/delta_kernel_rs/basic_partitioned.test
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,12 @@ require parquet

require delta

require-env DELTA_KERNEL_TESTS_PATH

# FIXME: this fails due some weird error
mode skip

statement error
SELECT * FROM delta_scan('./delta-kernel-rs/kernel/tests/data/basic_partitioned')
SELECT * FROM delta_scan('${DELTA_KERNEL_TESTS_PATH}/basic_partitioned')
----
Failed to read file "/Users/sam/Development/delta-kernel-testing/delta-kernel-rs/kernel/tests/data/basic_partitioned/letter=__HIVE_DEFAULT_PARTITION__
18 changes: 10 additions & 8 deletions test/sql/delta_kernel_rs/simple_with_dv.test
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,11 @@ require parquet

require delta

require-env DELTA_KERNEL_TESTS_PATH

# Simplest example
query I
FROM delta_scan('./delta-kernel-rs/kernel/tests/data/table-with-dv-small/')
FROM delta_scan('${DELTA_KERNEL_TESTS_PATH}/table-with-dv-small/')
----
1
2
Expand All @@ -21,7 +23,7 @@ FROM delta_scan('./delta-kernel-rs/kernel/tests/data/table-with-dv-small/')

# With filter: ensures the deletion vector is applied properly on top of pushed down filters
query I
FROM delta_scan('./delta-kernel-rs/kernel/tests/data/table-with-dv-small/')
FROM delta_scan('${DELTA_KERNEL_TESTS_PATH}/table-with-dv-small/')
WHERE value > 3
----
4
Expand All @@ -32,7 +34,7 @@ WHERE value > 3

# With filter: ensures the deletion vector is applied properly on top of pushed down filters with the file_row_number column
query II
FROM delta_scan('./delta-kernel-rs/kernel/tests/data/table-with-dv-small/', file_row_number=1)
FROM delta_scan('${DELTA_KERNEL_TESTS_PATH}/table-with-dv-small/', file_row_number=1)
WHERE value > 3
----
4 4
Expand All @@ -43,7 +45,7 @@ WHERE value > 3

# With filter and a delta scan based extra constant column
query II
FROM delta_scan('./delta-kernel-rs/kernel/tests/data/table-with-dv-small/', delta_file_number=1)
FROM delta_scan('${DELTA_KERNEL_TESTS_PATH}/table-with-dv-small/', delta_file_number=1)
WHERE value > 3
----
4 0
Expand All @@ -55,7 +57,7 @@ WHERE value > 3
# With filter, delta-extension-originated const column, and parquet-originated const column
query III
SELECT value, parse_filename(filename), delta_file_number
FROM delta_scan('./delta-kernel-rs/kernel/tests/data/table-with-dv-small/', delta_file_number=1, filename=1)
FROM delta_scan('${DELTA_KERNEL_TESTS_PATH}/table-with-dv-small/', delta_file_number=1, filename=1)
WHERE value > 3
----
4 part-00000-fae5310a-a37d-4e51-827b-c3d5516560ca-c000.snappy.parquet 0
Expand All @@ -67,7 +69,7 @@ WHERE value > 3
# With PRUNED filter, delta-extension-originated const column, and parquet-originated const column
query II
SELECT parse_filename(filename), delta_file_number
FROM delta_scan('./delta-kernel-rs/kernel/tests/data/table-with-dv-small/', delta_file_number=1, filename=1)
FROM delta_scan('${DELTA_KERNEL_TESTS_PATH}/table-with-dv-small/', delta_file_number=1, filename=1)
WHERE value > 3
----
part-00000-fae5310a-a37d-4e51-827b-c3d5516560ca-c000.snappy.parquet 0
Expand All @@ -79,7 +81,7 @@ part-00000-fae5310a-a37d-4e51-827b-c3d5516560ca-c000.snappy.parquet 0
# With PRUNED filters, delta-extension-originated const column, and parquet-originated const column
query I
SELECT delta_file_number
FROM delta_scan('./delta-kernel-rs/kernel/tests/data/table-with-dv-small/', delta_file_number=1, filename=1)
FROM delta_scan('${DELTA_KERNEL_TESTS_PATH}/table-with-dv-small/', delta_file_number=1, filename=1)
WHERE value > 3 and filename is not null
----
0
Expand All @@ -91,7 +93,7 @@ WHERE value > 3 and filename is not null
# Enabling the file_row_number option, but projecting it out
query I
SELECT value
FROM delta_scan('./delta-kernel-rs/kernel/tests/data/table-with-dv-small/', file_row_number=1)
FROM delta_scan('${DELTA_KERNEL_TESTS_PATH}/table-with-dv-small/', file_row_number=1)
----
1
2
Expand Down
6 changes: 4 additions & 2 deletions test/sql/delta_kernel_rs/simple_without_dv.test
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,11 @@ require parquet

require delta

require-env DELTA_KERNEL_TESTS_PATH

# Filename param (i.e. MultiFileReader provided)
query II
SELECT value, parse_filename(filename) FROM delta_scan('./delta-kernel-rs/kernel/tests/data/table-without-dv-small', filename=1)
SELECT value, parse_filename(filename) FROM delta_scan('${DELTA_KERNEL_TESTS_PATH}/table-without-dv-small', filename=1)
----
0 part-00000-517f5d32-9c95-48e8-82b4-0229cc194867-c000.snappy.parquet
1 part-00000-517f5d32-9c95-48e8-82b4-0229cc194867-c000.snappy.parquet
Expand All @@ -23,7 +25,7 @@ SELECT value, parse_filename(filename) FROM delta_scan('./delta-kernel-rs/kernel

# FileRowNumer param (i.e. ParquetReader provided)
query II
SELECT * FROM delta_scan('./delta-kernel-rs/kernel/tests/data/table-without-dv-small', file_row_number=1)
SELECT * FROM delta_scan('${DELTA_KERNEL_TESTS_PATH}/table-without-dv-small', file_row_number=1)
----
0 0
1 1
Expand Down

0 comments on commit 7236aa4

Please sign in to comment.