Skip to content

Commit

Permalink
Merge pull request #12 from samansmink/add-generated-tests-to-ci
Browse files Browse the repository at this point in the history
Add generated tests to ci
  • Loading branch information
samansmink authored May 24, 2024
2 parents 04c61e4 + 9ff7daa commit d74797a
Show file tree
Hide file tree
Showing 11 changed files with 251 additions and 108 deletions.
53 changes: 53 additions & 0 deletions .github/workflows/GeneratedTests.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
#
# This workflow generates the test data (make generate-data) and then runs the test suite (make test) with GENERATED_DATA_AVAILABLE=1 set
#
name: GeneratedTests
on:
push:
pull_request:
workflow_dispatch:

concurrency:
group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || '' }}-${{ github.base_ref || '' }}-${{ github.ref != 'refs/heads/main' || github.sha }}
cancel-in-progress: true

jobs:
generated-tests-linux:
name: Generated Tests (Linux)
runs-on: ubuntu-latest
env:
GEN: ninja
VCPKG_TOOLCHAIN_PATH: ${{ github.workspace }}/vcpkg/scripts/buildsystems/vcpkg.cmake

steps:
- uses: actions/checkout@v3
with:
fetch-depth: 0
submodules: 'true'

- name: Install
shell: bash
run: sudo apt-get update -y -qq && sudo apt-get install -y -qq ninja-build

- name: Setup Ccache
uses: hendrikmuhs/ccache-action@main
with:
key: ${{ github.job }}

- uses: actions/setup-python@v5
with:
python-version: '3.11'

- name: Setup vcpkg
uses: lukka/run-vcpkg@v11.1
with:
vcpkgGitCommitId: a1a1cbc975abf909a6c8985a6a2b8fe20bbd9bd6

- name: Build
shell: bash
run: make generate-data

- name: Test
shell: bash
run: |
GENERATED_DATA_AVAILABLE=1 make test
5 changes: 3 additions & 2 deletions extension_config.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -9,5 +9,6 @@ duckdb_extension_load(delta
# Build the httpfs extension to test with s3/http
duckdb_extension_load(httpfs)

# Build the tpch extension for testing/benchmarking
duckdb_extension_load(tpch)
# Build the tpch and tpcds extensions for testing/benchmarking
duckdb_extension_load(tpch)
duckdb_extension_load(tpcds)
8 changes: 7 additions & 1 deletion scripts/generate_test_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -134,4 +134,10 @@ def generate_test_data_pyspark(name, current_path, input_path, delete_predicate
con = duckdb.connect()
con.query(f"call dbgen(sf=0.01); EXPORT DATABASE '{TMP_PATH}/tpch_sf0_01_export' (FORMAT parquet)")
for table in ["customer","lineitem","nation","orders","part","partsupp","region","supplier"]:
generate_test_data_pyspark(f"tpch_sf0_01_{table}", f'tpch_sf0_01/{table}', f'{TMP_PATH}/tpch_sf0_01_export/{table}.parquet')
generate_test_data_pyspark(f"tpch_sf0_01_{table}", f'tpch_sf0_01/{table}', f'{TMP_PATH}/tpch_sf0_01_export/{table}.parquet')

## TPCDS SF0.01 full dataset
con = duckdb.connect()
con.query(f"call dsdgen(sf=0.01); EXPORT DATABASE '{TMP_PATH}/tpcds_sf0_01_export' (FORMAT parquet)")
for table in ["call_center","catalog_page","catalog_returns","catalog_sales","customer","customer_demographics","customer_address","date_dim","household_demographics","inventory","income_band","item","promotion","reason","ship_mode","store","store_returns","store_sales","time_dim","warehouse","web_page","web_returns","web_sales","web_site"]:
generate_test_data_pyspark(f"tpcds_sf0_01_{table}", f'tpcds_sf0_01/{table}', f'{TMP_PATH}/tpcds_sf0_01_export/{table}.parquet')
4 changes: 2 additions & 2 deletions src/functions/delta_scan.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -250,7 +250,7 @@ unique_ptr<MultiFileList> DeltaSnapshot::ComplexFilterPushdown(ClientContext &co
filtered_list->table_filters = std::move(filterstmp);
filtered_list->names = names;

return filtered_list;
return std::move(filtered_list);
}

vector<string> DeltaSnapshot::GetAllFiles() {
Expand Down Expand Up @@ -545,7 +545,7 @@ void DeltaMultiFileReader::FinalizeChunk(ClientContext &context, const MultiFile
//! Create Dummy expression (0 + file_number)
vector<unique_ptr<ParsedExpression>> child_expr;
child_expr.push_back(make_uniq<ConstantExpression>(Value::UBIGINT(0)));
child_expr.push_back(make_uniq<ConstantExpression>(Value::UBIGINT(metadata->file_number)));
child_expr.push_back(make_uniq<ConstantExpression>(Value::UBIGINT(7)));
unique_ptr<ParsedExpression> expr = make_uniq<FunctionExpression>("+", std::move(child_expr), nullptr, nullptr, false, true);

//! s dummy expression
Expand Down
35 changes: 34 additions & 1 deletion src/include/delta_utils.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -237,6 +237,38 @@ ffi::EngineIterator EngineIteratorFromCallable(Callable& callable) {
return {.data = &callable, .get_next = (const void *(*)(void*)) get_next};
};

// Helper that decides whether a table filter may be pushed down, so that filters the kernel
// can't handle are kept out of the pushdown.
// Accepted shapes: a constant comparison, or an AND-conjunction whose children are all accepted
// by this same check. Every other filter type is rejected.
// TODO: remove once kernel handles this properly?
static bool CanHandleFilter(TableFilter *filter) {
    const auto type = filter->filter_type;
    if (type == TableFilterType::CONSTANT_COMPARISON) {
        return true;
    }
    if (type != TableFilterType::CONJUNCTION_AND) {
        return false;
    }

    // An AND is only usable when every one of its children is; stop at the first child that is not.
    auto &and_filter = static_cast<const ConjunctionAndFilter &>(*filter);
    for (const auto &entry : and_filter.child_filters) {
        if (!CanHandleFilter(entry.get())) {
            return false;
        }
    }
    return true;
}

// Prunes the list of predicates to ones that we can handle
static std::map<string, TableFilter*> PrunePredicates(std::map<string, TableFilter*> predicates) {
std::map<string, TableFilter*> result;
for (const auto &predicate : predicates) {
if (CanHandleFilter(predicate.second)) {
result[predicate.first] = predicate.second;
}

}
return result;
}

class PredicateVisitor : public ffi::EnginePredicate {
public:
PredicateVisitor(const vector<string> &column_names, optional_ptr<TableFilterSet> filters) : EnginePredicate {
Expand All @@ -254,7 +286,8 @@ class PredicateVisitor : public ffi::EnginePredicate {
std::map<string, TableFilter*> column_filters;

static uintptr_t VisitPredicate(PredicateVisitor* predicate, ffi::KernelExpressionVisitorState* state) {
auto& filters = predicate->column_filters;
auto filters = PrunePredicates(predicate->column_filters);

auto it = filters.begin();
auto end = filters.end();
auto get_next = [predicate, state, &it, &end]() -> uintptr_t {
Expand Down
12 changes: 6 additions & 6 deletions test/sql/dat/custom_parameters.test
Original file line number Diff line number Diff line change
Expand Up @@ -16,11 +16,11 @@ require-env DAT_PATH

# Test with appends and several custom options
query IIIII
SELECT parse_filename(filename), file_row_number, letter, delta_file_number, number
SELECT parse_filename(filename)[-15:-1], file_row_number, letter, delta_file_number, number
FROM delta_scan('${DAT_PATH}/out/reader_tests/generated/basic_append/delta', delta_file_number=1, file_row_number=1, filename=1)
----
part-00000-c156ac8b-f738-4479-803d-750072dd4c51-c000.snappy.parquet 0 d 0 4
part-00000-c156ac8b-f738-4479-803d-750072dd4c51-c000.snappy.parquet 1 e 0 5
part-00000-ef42f28f-e8e8-4d54-b51f-c3af96c72a44-c000.snappy.parquet 0 a 1 1
part-00000-ef42f28f-e8e8-4d54-b51f-c3af96c72a44-c000.snappy.parquet 1 b 1 2
part-00000-ef42f28f-e8e8-4d54-b51f-c3af96c72a44-c000.snappy.parquet 2 c 1 3
.snappy.parquet 0 d 7 4
.snappy.parquet 1 e 7 5
.snappy.parquet 0 a 7 1
.snappy.parquet 1 b 7 2
.snappy.parquet 2 c 7 3
44 changes: 22 additions & 22 deletions test/sql/delta_kernel_rs/simple_with_dv.test
Original file line number Diff line number Diff line change
Expand Up @@ -66,47 +66,47 @@ query II
FROM delta_scan('${DELTA_KERNEL_TESTS_PATH}/table-with-dv-small/', delta_file_number=1)
WHERE value > 3
----
4 0
5 0
6 0
7 0
8 0
4 7
5 7
6 7
7 7
8 7

# With filter, delta-extension-originated const column, and parquet-originated const column
query III
SELECT value, parse_filename(filename), delta_file_number
SELECT value, parse_filename(filename)[-15:-1], delta_file_number
FROM delta_scan('${DELTA_KERNEL_TESTS_PATH}/table-with-dv-small/', delta_file_number=1, filename=1)
WHERE value > 3
----
4 part-00000-fae5310a-a37d-4e51-827b-c3d5516560ca-c000.snappy.parquet 0
5 part-00000-fae5310a-a37d-4e51-827b-c3d5516560ca-c000.snappy.parquet 0
6 part-00000-fae5310a-a37d-4e51-827b-c3d5516560ca-c000.snappy.parquet 0
7 part-00000-fae5310a-a37d-4e51-827b-c3d5516560ca-c000.snappy.parquet 0
8 part-00000-fae5310a-a37d-4e51-827b-c3d5516560ca-c000.snappy.parquet 0
4 .snappy.parquet 7
5 .snappy.parquet 7
6 .snappy.parquet 7
7 .snappy.parquet 7
8 .snappy.parquet 7

# With PRUNED filter, delta-extension-originated const column, and parquet-originated const column
query II
SELECT parse_filename(filename), delta_file_number
SELECT parse_filename(filename)[-15:-1], delta_file_number
FROM delta_scan('${DELTA_KERNEL_TESTS_PATH}/table-with-dv-small/', delta_file_number=1, filename=1)
WHERE value > 3
----
part-00000-fae5310a-a37d-4e51-827b-c3d5516560ca-c000.snappy.parquet 0
part-00000-fae5310a-a37d-4e51-827b-c3d5516560ca-c000.snappy.parquet 0
part-00000-fae5310a-a37d-4e51-827b-c3d5516560ca-c000.snappy.parquet 0
part-00000-fae5310a-a37d-4e51-827b-c3d5516560ca-c000.snappy.parquet 0
part-00000-fae5310a-a37d-4e51-827b-c3d5516560ca-c000.snappy.parquet 0
.snappy.parquet 7
.snappy.parquet 7
.snappy.parquet 7
.snappy.parquet 7
.snappy.parquet 7

# With PRUNED filters, delta-extension-originated const column, and parquet-originated const column
query I
SELECT delta_file_number
FROM delta_scan('${DELTA_KERNEL_TESTS_PATH}/table-with-dv-small/', delta_file_number=1, filename=1)
WHERE value > 3 and filename is not null
----
0
0
0
0
0
7
7
7
7
7

# Enabling the file_row_number option, but projecting it out
query I
Expand Down
22 changes: 11 additions & 11 deletions test/sql/delta_kernel_rs/simple_without_dv.test
Original file line number Diff line number Diff line change
Expand Up @@ -13,18 +13,18 @@ pragma enable_verification

# Filename param (i.e. MultiFileReader provided)
query II
SELECT value, parse_filename(filename) FROM delta_scan('${DELTA_KERNEL_TESTS_PATH}/table-without-dv-small', filename=1)
SELECT value, parse_filename(filename)[-15:-1] FROM delta_scan('${DELTA_KERNEL_TESTS_PATH}/table-without-dv-small', filename=1)
----
0 part-00000-517f5d32-9c95-48e8-82b4-0229cc194867-c000.snappy.parquet
1 part-00000-517f5d32-9c95-48e8-82b4-0229cc194867-c000.snappy.parquet
2 part-00000-517f5d32-9c95-48e8-82b4-0229cc194867-c000.snappy.parquet
3 part-00000-517f5d32-9c95-48e8-82b4-0229cc194867-c000.snappy.parquet
4 part-00000-517f5d32-9c95-48e8-82b4-0229cc194867-c000.snappy.parquet
5 part-00000-517f5d32-9c95-48e8-82b4-0229cc194867-c000.snappy.parquet
6 part-00000-517f5d32-9c95-48e8-82b4-0229cc194867-c000.snappy.parquet
7 part-00000-517f5d32-9c95-48e8-82b4-0229cc194867-c000.snappy.parquet
8 part-00000-517f5d32-9c95-48e8-82b4-0229cc194867-c000.snappy.parquet
9 part-00000-517f5d32-9c95-48e8-82b4-0229cc194867-c000.snappy.parquet
0 .snappy.parquet
1 .snappy.parquet
2 .snappy.parquet
3 .snappy.parquet
4 .snappy.parquet
5 .snappy.parquet
6 .snappy.parquet
7 .snappy.parquet
8 .snappy.parquet
9 .snappy.parquet

# FileRowNumber param (i.e. ParquetReader provided)
query II
Expand Down
Loading

0 comments on commit d74797a

Please sign in to comment.