Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support attaching delta tables as catalogs #110

Merged
merged 9 commits into from
Nov 8, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions .github/regression/micro.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
benchmark/micro/snapshot_performance/delta_scan.benchmark
benchmark/micro/snapshot_performance/snapshot_no_pin.benchmark
benchmark/micro/snapshot_performance/snapshot_no_pin_filter.benchmark
benchmark/micro/snapshot_performance/snapshot_pin.benchmark
benchmark/micro/snapshot_performance/snapshot_pin_filter.benchmark
16 changes: 12 additions & 4 deletions .github/workflows/LocalTesting.yml
Original file line number Diff line number Diff line change
Expand Up @@ -210,12 +210,14 @@ jobs:

- name: Build
shell: bash
run: make generate-data
run: |
make generate-data
make release

- name: Test
shell: bash
run: |
GENERATED_DATA_AVAILABLE=1 make test
GENERATED_DATA_AVAILABLE=1 make test_release

regression-test-benchmark-runner:
name: Performance Regression Tests
Expand Down Expand Up @@ -280,13 +282,19 @@ jobs:
if: always()
shell: bash
run: |
python3 ./duckdb/scripts/regression_test_runner.py --old=duckdb_delta/build/release/benchmark/benchmark_runner --new=build/release/benchmark/benchmark_runner --benchmarks=.github/regression/tpch_sf1_local.csv --verbose --threads=2 --root-dir=.
python3 ./duckdb/scripts/regression/test_runner.py --old=duckdb_delta/build/release/benchmark/benchmark_runner --new=build/release/benchmark/benchmark_runner --benchmarks=.github/regression/tpch_sf1_local.csv --verbose --threads=2 --root-dir=.

- name: Regression Test TPC-DS
if: always()
shell: bash
run: |
python ./duckdb/scripts/regression_test_runner.py --old=duckdb_delta/build/release/benchmark/benchmark_runner --new=build/release/benchmark/benchmark_runner --benchmarks=.github/regression/tpcds_sf1_local.csv --verbose --threads=2 --root-dir=.
python ./duckdb/scripts/regression/test_runner.py --old=duckdb_delta/build/release/benchmark/benchmark_runner --new=build/release/benchmark/benchmark_runner --benchmarks=.github/regression/tpcds_sf1_local.csv --verbose --threads=2 --root-dir=.

- name: Regression Test Micro
if: always()
shell: bash
run: |
python ./duckdb/scripts/regression/test_runner.py --old=duckdb_delta/build/release/benchmark/benchmark_runner --new=build/release/benchmark/benchmark_runner --benchmarks=.github/regression/micro.csv --verbose --threads=2 --root-dir=.

- name: Test benchmark makefile
shell: bash
Expand Down
17 changes: 10 additions & 7 deletions .github/workflows/MainDistributionPipeline.yml
Original file line number Diff line number Diff line change
Expand Up @@ -14,21 +14,24 @@ concurrency:
jobs:
duckdb-stable-build:
name: Build extension binaries
uses: duckdb/extension-ci-tools/.github/workflows/_extension_distribution.yml@v1.1.2
uses: duckdb/extension-ci-tools/.github/workflows/_extension_distribution.yml@main
with:
duckdb_version: v1.1.2
ci_tools_version: v1.1.2
# pip install duckdb==1.1.4.dev1594
duckdb_version: 0ccf3c25cc
ci_tools_version: main
extension_name: delta
enable_rust: true
exclude_archs: 'wasm_mvp;wasm_eh;wasm_threads;windows_amd64_rtools'
exclude_archs: 'wasm_mvp;wasm_eh;wasm_threads;windows_amd64_mingw'
extra_toolchains: 'python3'
vcpkg_commit: c82f74667287d3dc386bce81e44964370c91a289

duckdb-stable-deploy:
name: Deploy extension binaries
needs: duckdb-stable-build
uses: duckdb/extension-ci-tools/.github/workflows/_extension_deploy.yml@v1.1.2
uses: duckdb/extension-ci-tools/.github/workflows/_extension_deploy.yml@main
secrets: inherit
with:
extension_name: delta
duckdb_version: v1.1.2
exclude_archs: 'wasm_mvp;wasm_eh;wasm_threads;windows_amd64_rtools'
duckdb_version: 0ccf3c25cc
exclude_archs: 'wasm_mvp;wasm_eh;wasm_threads;windows_amd64_mingw'
deploy_latest: ${{ startsWith(github.ref, 'refs/tags/v') || github.ref == 'refs/heads/main' }}
2 changes: 1 addition & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -11,5 +11,5 @@ test/python/__pycache__/
data/generated
__azurite*__.json
__blobstorage__
.venv
venv
.vscode
8 changes: 7 additions & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,13 @@
src/delta_extension.cpp
src/delta_functions.cpp
src/delta_utils.cpp
src/functions/delta_scan.cpp)
src/functions/delta_scan.cpp
src/storage/delta_catalog.cpp
src/storage/delta_schema_entry.cpp
src/storage/delta_table_entry.cpp
src/storage/delta_transaction.cpp
src/storage/delta_transaction_manager.cpp
)

### Custom config
# TODO: figure out if we really need this?
Expand Down
17 changes: 15 additions & 2 deletions benchmark/benchmark.Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -26,14 +26,16 @@ plot:
# TPCH SF1 on delta table
bench-run-tpch-sf1-delta: bench-output-dir
./build/release/benchmark/benchmark_runner --root-dir './' 'benchmark/tpch/sf1/local/delta/$(BENCHMARK_PATTERN)' 2>&1 | tee benchmark_results/tpch-sf1-delta.csv
bench-run-tpch-sf1-delta-attach: bench-output-dir
./build/release/benchmark/benchmark_runner --root-dir './' 'benchmark/tpch/sf1/local/delta_attach/$(BENCHMARK_PATTERN)' 2>&1 | tee benchmark_results/tpch-sf1-delta-attach.csv
# TPCH SF1 on parquet files
bench-run-tpch-sf1-parquet: bench-output-dir
./build/release/benchmark/benchmark_runner 'benchmark/tpch/sf1-parquet/$(BENCHMARK_PATTERN)' 2>&1 | tee benchmark_results/tpch-sf1-parquet.csv
# TPCH SF1 on duckdb file
bench-run-tpch-sf1-duckdb: bench-output-dir
./build/release/benchmark/benchmark_runner --root-dir './' 'benchmark/tpch/sf1/local/duckdb/$(BENCHMARK_PATTERN)' 2>&1 | tee benchmark_results/tpch-sf1-duckdb.csv
# COMPARES TPCH SF1 on parquet file vs on delta files vs on duckdb files
bench-run-tpch-sf1: bench-run-tpch-sf1-delta bench-run-tpch-sf1-parquet
bench-run-tpch-sf1: bench-run-tpch-sf1-delta bench-run-tpch-sf1-parquet bench-run-tpch-sf1-delta-attach

###
# TPCDS
Expand All @@ -42,6 +44,10 @@ bench-run-tpch-sf1: bench-run-tpch-sf1-delta bench-run-tpch-sf1-parquet
# TPCDS SF1 on delta table
bench-run-tpcds-sf1-delta: bench-output-dir
./build/release/benchmark/benchmark_runner --root-dir './' 'benchmark/tpcds/sf1/$(IO_MODE)/delta/$(BENCHMARK_PATTERN)' 2>&1 | tee benchmark_results/tpcds-sf1-delta-$(IO_MODE).csv
bench-run-tpcds-sf1-delta-attach: bench-output-dir
./build/release/benchmark/benchmark_runner --root-dir './' 'benchmark/tpcds/sf1/$(IO_MODE)/delta_attach/$(BENCHMARK_PATTERN)' 2>&1 | tee benchmark_results/tpcds-sf1-delta-attach-$(IO_MODE).csv
bench-run-tpcds-sf1-delta-attach-pin: bench-output-dir
./build/release/benchmark/benchmark_runner --root-dir './' 'benchmark/tpcds/sf1/$(IO_MODE)/delta_attach_pin/$(BENCHMARK_PATTERN)' 2>&1 | tee benchmark_results/tpcds-sf1-delta-attach-pin-$(IO_MODE).csv
# TPCDS SF1 on parquet files
bench-run-tpcds-sf1-parquet: bench-output-dir
./build/release/benchmark/benchmark_runner --root-dir './' 'benchmark/tpcds/sf1/$(IO_MODE)/parquet/$(BENCHMARK_PATTERN)' 2>&1 | tee benchmark_results/tpcds-sf1-parquet-$(IO_MODE).csv
Expand All @@ -50,4 +56,11 @@ bench-run-tpcds-sf1-duckdb: bench-output-dir
./build/release/benchmark/benchmark_runner --root-dir './' 'benchmark/tpcds/sf1/$(IO_MODE)/duckdb/$(BENCHMARK_PATTERN)' 2>&1 | tee benchmark_results/tpcds-sf1-duckdb-$(IO_MODE).csv

# COMPARES TPCDS SF1 on parquet files vs on delta files (delta_scan, attach, attach with pin) vs on duckdb files
bench-run-tpcds-sf1: bench-run-tpcds-sf1-delta bench-run-tpcds-sf1-parquet bench-run-tpcds-sf1-duckdb
bench-run-tpcds-sf1: bench-run-tpcds-sf1-delta bench-run-tpcds-sf1-parquet bench-run-tpcds-sf1-duckdb bench-run-tpcds-sf1-delta-attach bench-run-tpcds-sf1-delta-attach-pin

###
# MICRO
###

bench-run-snapshot-performance: bench-output-dir
./build/release/benchmark/benchmark_runner --root-dir './' 'benchmark/micro/snapshot_performance/.*' 2>&1 | tee benchmark_results/snapshot-performance.csv
16 changes: 16 additions & 0 deletions benchmark/micro/snapshot_performance/delta_scan.benchmark
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
# name: benchmark/micro/snapshot_performance/delta_scan.benchmark
# description: Reference result to compare attach functions to
# group: [aggregate]

name delta_scan reference
group snapshot_performance

require delta

require parquet

run
SELECT COUNT(*) FROM delta_scan('./data/generated/delta_rs_tpch_sf1_100_splits/lineitem/delta_lake')

result I
6001215
16 changes: 16 additions & 0 deletions benchmark/micro/snapshot_performance/delta_scan_filter.benchmark
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
# name: benchmark/micro/snapshot_performance/delta_scan_filter.benchmark
# description: Reference result (scan with a filter) to compare attach functions to
# group: [aggregate]

name delta_scan reference
group snapshot_performance

require delta

require parquet

run
SELECT COUNT(*) FROM delta_scan('./data/generated/delta_rs_tpch_sf1_100_splits/lineitem/delta_lake') where l_orderkey is not null

result I
6001215
19 changes: 19 additions & 0 deletions benchmark/micro/snapshot_performance/snapshot_no_pin.benchmark
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
# name: benchmark/micro/snapshot_performance/snapshot_no_pin.benchmark
# description: Performance of reading from a table with many log entries
# group: [aggregate]

name Snapshot no pin
group snapshot_performance

require delta

require parquet

load
ATTACH './data/generated/delta_rs_tpch_sf1_100_splits/lineitem/delta_lake' as lineitem_no_pin (TYPE delta);

run
SELECT COUNT(*) FROM lineitem_no_pin

result I
6001215
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
# name: benchmark/micro/snapshot_performance/snapshot_no_pin_filter.benchmark
# description: Performance of reading from a table with many log entries
# group: [aggregate]

name Snapshot no pin filter
group snapshot_performance

require delta

require parquet

load
ATTACH './data/generated/delta_rs_tpch_sf1_100_splits/lineitem/delta_lake' as lineitem_no_pin (TYPE delta);

run
SELECT COUNT(*) FROM lineitem_no_pin where l_orderkey is not null

result I
6001215
19 changes: 19 additions & 0 deletions benchmark/micro/snapshot_performance/snapshot_pin.benchmark
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
# name: benchmark/micro/snapshot_performance/snapshot_pin.benchmark
# description: Performance of reading from a table with many log entries
# group: [aggregate]

name Snapshot pin
group snapshot_performance

require delta

require parquet

load
ATTACH './data/generated/delta_rs_tpch_sf1_100_splits/lineitem/delta_lake' as lineitem_pin (TYPE delta, PIN_SNAPSHOT);

run
SELECT COUNT(*) FROM lineitem_pin

result I
6001215
19 changes: 19 additions & 0 deletions benchmark/micro/snapshot_performance/snapshot_pin_filter.benchmark
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
# name: benchmark/micro/snapshot_performance/snapshot_pin_filter.benchmark
# description: Performance of reading from a table with many log entries
# group: [aggregate]

name Snapshot pin filter
group snapshot_performance

require delta

require parquet

load
ATTACH './data/generated/delta_rs_tpch_sf1_100_splits/lineitem/delta_lake' as lineitem_pin (TYPE delta, PIN_SNAPSHOT);

run
SELECT COUNT(*) FROM lineitem_pin where l_orderkey is not null

result I
6001215
24 changes: 24 additions & 0 deletions benchmark/tpcds/sf1/local/delta_attach/load.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
ATTACH './data/generated/tpcds_sf1/call_center/delta_lake' as call_center (TYPE delta);
ATTACH './data/generated/tpcds_sf1/catalog_page/delta_lake' as catalog_page (TYPE delta);
ATTACH './data/generated/tpcds_sf1/catalog_returns/delta_lake' as catalog_returns (TYPE delta);
ATTACH './data/generated/tpcds_sf1/catalog_sales/delta_lake' as catalog_sales (TYPE delta);
ATTACH './data/generated/tpcds_sf1/customer/delta_lake' as customer (TYPE delta);
ATTACH './data/generated/tpcds_sf1/customer_demographics/delta_lake' as customer_demographics (TYPE delta);
ATTACH './data/generated/tpcds_sf1/customer_address/delta_lake' as customer_address (TYPE delta);
ATTACH './data/generated/tpcds_sf1/date_dim/delta_lake' as date_dim (TYPE delta);
ATTACH './data/generated/tpcds_sf1/household_demographics/delta_lake' as household_demographics (TYPE delta);
ATTACH './data/generated/tpcds_sf1/inventory/delta_lake' as inventory (TYPE delta);
ATTACH './data/generated/tpcds_sf1/income_band/delta_lake' as income_band (TYPE delta);
ATTACH './data/generated/tpcds_sf1/item/delta_lake' as item (TYPE delta);
ATTACH './data/generated/tpcds_sf1/promotion/delta_lake' as promotion (TYPE delta);
ATTACH './data/generated/tpcds_sf1/reason/delta_lake' as reason (TYPE delta);
ATTACH './data/generated/tpcds_sf1/ship_mode/delta_lake' as ship_mode (TYPE delta);
ATTACH './data/generated/tpcds_sf1/store/delta_lake' as store (TYPE delta);
ATTACH './data/generated/tpcds_sf1/store_returns/delta_lake' as store_returns (TYPE delta);
ATTACH './data/generated/tpcds_sf1/store_sales/delta_lake' as store_sales (TYPE delta);
ATTACH './data/generated/tpcds_sf1/time_dim/delta_lake' as time_dim (TYPE delta);
ATTACH './data/generated/tpcds_sf1/warehouse/delta_lake' as warehouse (TYPE delta);
ATTACH './data/generated/tpcds_sf1/web_page/delta_lake' as web_page (TYPE delta);
ATTACH './data/generated/tpcds_sf1/web_returns/delta_lake' as web_returns (TYPE delta);
ATTACH './data/generated/tpcds_sf1/web_sales/delta_lake' as web_sales (TYPE delta);
ATTACH './data/generated/tpcds_sf1/web_site/delta_lake' as web_site (TYPE delta);
7 changes: 7 additions & 0 deletions benchmark/tpcds/sf1/local/delta_attach/q01.benchmark
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# name: benchmark/tpcds/sf1/local/delta_attach/q01.benchmark
# description: Run query 01 from the TPC-DS benchmark
# group: [sf1]

template benchmark/tpcds/sf1/local/delta_attach/tpcds_sf1.benchmark.in
QUERY_NUMBER=1
QUERY_NUMBER_PADDED=01
7 changes: 7 additions & 0 deletions benchmark/tpcds/sf1/local/delta_attach/q02.benchmark
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# name: benchmark/tpcds/sf1/local/delta_attach/q02.benchmark
# description: Run query 02 from the TPC-DS benchmark
# group: [sf1]

template benchmark/tpcds/sf1/local/delta_attach/tpcds_sf1.benchmark.in
QUERY_NUMBER=2
QUERY_NUMBER_PADDED=02
7 changes: 7 additions & 0 deletions benchmark/tpcds/sf1/local/delta_attach/q03.benchmark
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# name: benchmark/tpcds/sf1/local/delta_attach/q03.benchmark
# description: Run query 03 from the TPC-DS benchmark
# group: [sf1]

template benchmark/tpcds/sf1/local/delta_attach/tpcds_sf1.benchmark.in
QUERY_NUMBER=3
QUERY_NUMBER_PADDED=03
7 changes: 7 additions & 0 deletions benchmark/tpcds/sf1/local/delta_attach/q04.benchmark
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# name: benchmark/tpcds/sf1/local/delta_attach/q04.benchmark
# description: Run query 04 from the TPC-DS benchmark
# group: [sf1]

template benchmark/tpcds/sf1/local/delta_attach/tpcds_sf1.benchmark.in
QUERY_NUMBER=4
QUERY_NUMBER_PADDED=04
7 changes: 7 additions & 0 deletions benchmark/tpcds/sf1/local/delta_attach/q05.benchmark
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# name: benchmark/tpcds/sf1/local/delta_attach/q05.benchmark
# description: Run query 05 from the TPC-DS benchmark
# group: [sf1]

template benchmark/tpcds/sf1/local/delta_attach/tpcds_sf1.benchmark.in
QUERY_NUMBER=5
QUERY_NUMBER_PADDED=05
7 changes: 7 additions & 0 deletions benchmark/tpcds/sf1/local/delta_attach/q06.benchmark
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# name: benchmark/tpcds/sf1/local/delta_attach/q06.benchmark
# description: Run query 06 from the TPC-DS benchmark
# group: [sf1]

template benchmark/tpcds/sf1/local/delta_attach/tpcds_sf1.benchmark.in
QUERY_NUMBER=6
QUERY_NUMBER_PADDED=06
7 changes: 7 additions & 0 deletions benchmark/tpcds/sf1/local/delta_attach/q07.benchmark
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# name: benchmark/tpcds/sf1/local/delta_attach/q07.benchmark
# description: Run query 07 from the TPC-DS benchmark
# group: [sf1]

template benchmark/tpcds/sf1/local/delta_attach/tpcds_sf1.benchmark.in
QUERY_NUMBER=7
QUERY_NUMBER_PADDED=07
7 changes: 7 additions & 0 deletions benchmark/tpcds/sf1/local/delta_attach/q08.benchmark
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# name: benchmark/tpcds/sf1/local/delta_attach/q08.benchmark
# description: Run query 08 from the TPC-DS benchmark
# group: [sf1]

template benchmark/tpcds/sf1/local/delta_attach/tpcds_sf1.benchmark.in
QUERY_NUMBER=8
QUERY_NUMBER_PADDED=08
7 changes: 7 additions & 0 deletions benchmark/tpcds/sf1/local/delta_attach/q09.benchmark
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# name: benchmark/tpcds/sf1/local/delta_attach/q09.benchmark
# description: Run query 09 from the TPC-DS benchmark
# group: [sf1]

template benchmark/tpcds/sf1/local/delta_attach/tpcds_sf1.benchmark.in
QUERY_NUMBER=9
QUERY_NUMBER_PADDED=09
7 changes: 7 additions & 0 deletions benchmark/tpcds/sf1/local/delta_attach/q10.benchmark
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# name: benchmark/tpcds/sf1/local/delta_attach/q10.benchmark
# description: Run query 10 from the TPC-DS benchmark
# group: [sf1]

template benchmark/tpcds/sf1/local/delta_attach/tpcds_sf1.benchmark.in
QUERY_NUMBER=10
QUERY_NUMBER_PADDED=10
7 changes: 7 additions & 0 deletions benchmark/tpcds/sf1/local/delta_attach/q11.benchmark
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# name: benchmark/tpcds/sf1/local/delta_attach/q11.benchmark
# description: Run query 11 from the TPC-DS benchmark
# group: [sf1]

template benchmark/tpcds/sf1/local/delta_attach/tpcds_sf1.benchmark.in
QUERY_NUMBER=11
QUERY_NUMBER_PADDED=11
7 changes: 7 additions & 0 deletions benchmark/tpcds/sf1/local/delta_attach/q12.benchmark
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# name: benchmark/tpcds/sf1/local/delta_attach/q12.benchmark
# description: Run query 12 from the TPC-DS benchmark
# group: [sf1]

template benchmark/tpcds/sf1/local/delta_attach/tpcds_sf1.benchmark.in
QUERY_NUMBER=12
QUERY_NUMBER_PADDED=12
7 changes: 7 additions & 0 deletions benchmark/tpcds/sf1/local/delta_attach/q13.benchmark
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# name: benchmark/tpcds/sf1/local/delta_attach/q13.benchmark
# description: Run query 13 from the TPC-DS benchmark
# group: [sf1]

template benchmark/tpcds/sf1/local/delta_attach/tpcds_sf1.benchmark.in
QUERY_NUMBER=13
QUERY_NUMBER_PADDED=13
Loading
Loading