diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index edaa49ec6e7e..4527d047e4c0 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -521,7 +521,7 @@ jobs: run: taplo format --check config-docs-check: - name: check configs.md is up-to-date + name: check configs.md and ***_functions.md is up-to-date needs: [ linux-build-lib ] runs-on: ubuntu-latest container: @@ -542,6 +542,11 @@ jobs: # If you encounter an error, run './dev/update_config_docs.sh' and commit ./dev/update_config_docs.sh git diff --exit-code + - name: Check if any of the ***_functions.md has been modified + run: | + # If you encounter an error, run './dev/update_function_docs.sh' and commit + ./dev/update_function_docs.sh + git diff --exit-code # Verify MSRV for the crates which are directly used by other projects: # - datafusion diff --git a/Cargo.toml b/Cargo.toml index b8bf83a5ab53..448607257ca1 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -70,22 +70,22 @@ version = "42.0.0" ahash = { version = "0.8", default-features = false, features = [ "runtime-rng", ] } -arrow = { version = "53.0.0", features = [ +arrow = { version = "53.1.0", features = [ "prettyprint", ] } -arrow-array = { version = "53.0.0", default-features = false, features = [ +arrow-array = { version = "53.1.0", default-features = false, features = [ "chrono-tz", ] } -arrow-buffer = { version = "53.0.0", default-features = false } -arrow-flight = { version = "53.0.0", features = [ +arrow-buffer = { version = "53.1.0", default-features = false } +arrow-flight = { version = "53.1.0", features = [ "flight-sql-experimental", ] } -arrow-ipc = { version = "53.0.0", default-features = false, features = [ +arrow-ipc = { version = "53.1.0", default-features = false, features = [ "lz4", ] } -arrow-ord = { version = "53.0.0", default-features = false } -arrow-schema = { version = "53.0.0", default-features = false } -arrow-string = { version = "53.0.0", default-features = false } +arrow-ord = { version = "53.1.0", default-features = false } +arrow-schema = { version = "53.1.0", default-features = false } +arrow-string = { version = "53.1.0", default-features = false } async-trait = "0.1.73" bigdecimal = "=0.4.1" bytes = "1.4" @@ -126,7 +126,7 @@ log = "^0.4" num_cpus = "1.13.0" object_store = { version = "0.11.0", default-features = false } parking_lot = "0.12" -parquet = { version = "53.0.0", default-features = false, features = [ +parquet = { version = "53.1.0", default-features = false, features = [ "arrow", "async", "object_store", diff --git a/README.md b/README.md index bb8526c24e2c..5d0b096c1de1 100644 --- a/README.md +++ b/README.md @@ -42,14 +42,25 @@ DataFusion is an extensible query engine written in [Rust] that -uses [Apache Arrow] as its in-memory format. DataFusion's target users are +uses [Apache Arrow] as its in-memory format. + +The DataFusion libraries in this repository are used to build data-centric system software. DataFusion also provides the +following subprojects, which are packaged versions of DataFusion intended for end users. + +- [DataFusion Python](https://github.com/apache/datafusion-python/) offers a Python interface for SQL and DataFrame + queries. +- [DataFusion Ray](https://github.com/apache/datafusion-ray/) provides a distributed version of DataFusion that scales + out on Ray clusters. +- [DataFusion Comet](https://github.com/apache/datafusion-comet/) is an accelerator for Apache Spark based on + DataFusion. + +The target audience for the DataFusion crates in this repository are developers building fast and feature rich database and analytic systems, customized to particular workloads. See [use cases] for examples. -"Out of the box," DataFusion offers [SQL] and [`Dataframe`] APIs, +DataFusion offers [SQL] and [`Dataframe`] APIs, excellent [performance], built-in support for CSV, Parquet, JSON, and Avro, extensive customization, and a great community. -[Python Bindings] are also available. DataFusion features a full query planner, a columnar, streaming, multi-threaded, vectorized execution engine, and partitioned data sources. You can diff --git a/benchmarks/bench.sh b/benchmarks/bench.sh index 24efab6c6ca5..70faa9ef2b73 100755 --- a/benchmarks/bench.sh +++ b/benchmarks/bench.sh @@ -211,6 +211,7 @@ main() { run_clickbench_1 run_clickbench_partitioned run_clickbench_extended + run_imdb ;; tpch) run_tpch "1" @@ -239,6 +240,9 @@ main() { clickbench_extended) run_clickbench_extended ;; + imdb) + run_imdb + ;; *) echo "Error: unknown benchmark '$BENCHMARK' for run" usage @@ -510,6 +514,16 @@ data_imdb() { fi } +# Runs the imdb benchmark +run_imdb() { + IMDB_DIR="${DATA_DIR}/imdb" + + RESULTS_FILE="${RESULTS_DIR}/imdb.json" + echo "RESULTS_FILE: ${RESULTS_FILE}" + echo "Running imdb benchmark..." + $CARGO_COMMAND --bin imdb -- benchmark datafusion --iterations 5 --path "${IMDB_DIR}" --prefer_hash_join "${PREFER_HASH_JOIN}" --format parquet -o "${RESULTS_FILE}" +} + diff --git a/benchmarks/queries/imdb/10a.sql b/benchmarks/queries/imdb/10a.sql new file mode 100644 index 000000000000..95b049b77479 --- /dev/null +++ b/benchmarks/queries/imdb/10a.sql @@ -0,0 +1 @@ +SELECT MIN(chn.name) AS uncredited_voiced_character, MIN(t.title) AS russian_movie FROM char_name AS chn, cast_info AS ci, company_name AS cn, company_type AS ct, movie_companies AS mc, role_type AS rt, title AS t WHERE ci.note like '%(voice)%' and ci.note like '%(uncredited)%' AND cn.country_code = '[ru]' AND rt.role = 'actor' AND t.production_year > 2005 AND t.id = mc.movie_id AND t.id = ci.movie_id AND ci.movie_id = mc.movie_id AND chn.id = ci.person_role_id AND rt.id = ci.role_id AND cn.id = mc.company_id AND ct.id = mc.company_type_id; diff --git a/benchmarks/queries/imdb/10b.sql b/benchmarks/queries/imdb/10b.sql new file mode 100644 index 000000000000..c32153631412 --- /dev/null +++ b/benchmarks/queries/imdb/10b.sql @@ -0,0 +1 @@ +SELECT MIN(chn.name) AS character, MIN(t.title) AS russian_mov_with_actor_producer FROM char_name AS chn, cast_info AS ci, company_name AS cn, company_type AS ct, movie_companies AS mc, role_type AS rt, title AS t WHERE ci.note like '%(producer)%' AND cn.country_code = '[ru]' AND rt.role = 'actor' AND t.production_year > 2010 AND t.id = mc.movie_id AND t.id = ci.movie_id AND ci.movie_id = mc.movie_id AND chn.id = ci.person_role_id AND rt.id = ci.role_id AND cn.id = mc.company_id AND ct.id = mc.company_type_id; diff --git a/benchmarks/queries/imdb/10c.sql b/benchmarks/queries/imdb/10c.sql new file mode 100644 index 000000000000..b862cf4fa7ac --- /dev/null +++ b/benchmarks/queries/imdb/10c.sql @@ -0,0 +1 @@ +SELECT MIN(chn.name) AS character, MIN(t.title) AS movie_with_american_producer FROM char_name AS chn, cast_info AS ci, company_name AS cn, company_type AS ct, movie_companies AS mc, role_type AS rt, title AS t WHERE ci.note like '%(producer)%' AND cn.country_code = '[us]' AND t.production_year > 1990 AND t.id = mc.movie_id AND t.id = ci.movie_id AND ci.movie_id = mc.movie_id AND chn.id = ci.person_role_id AND rt.id = ci.role_id AND cn.id = mc.company_id AND ct.id = mc.company_type_id; diff --git a/benchmarks/queries/imdb/11a.sql b/benchmarks/queries/imdb/11a.sql new file mode 100644 index 000000000000..f835968e900b --- /dev/null +++ b/benchmarks/queries/imdb/11a.sql @@ -0,0 +1 @@ +SELECT MIN(cn.name) AS from_company, MIN(lt.link) AS movie_link_type, MIN(t.title) AS non_polish_sequel_movie FROM company_name AS cn, company_type AS ct, keyword AS k, link_type AS lt, movie_companies AS mc, movie_keyword AS mk, movie_link AS ml, title AS t WHERE cn.country_code !='[pl]' AND (cn.name LIKE '%Film%' OR cn.name LIKE '%Warner%') AND ct.kind ='production companies' AND k.keyword ='sequel' AND lt.link LIKE '%follow%' AND mc.note IS NULL AND t.production_year BETWEEN 1950 AND 2000 AND lt.id = ml.link_type_id AND ml.movie_id = t.id AND t.id = mk.movie_id AND mk.keyword_id = k.id AND t.id = mc.movie_id AND mc.company_type_id = ct.id AND mc.company_id = cn.id AND ml.movie_id = mk.movie_id AND ml.movie_id = mc.movie_id AND mk.movie_id = mc.movie_id; diff --git a/benchmarks/queries/imdb/11b.sql b/benchmarks/queries/imdb/11b.sql new file mode 100644 index 000000000000..2411e19ea608 --- /dev/null +++ b/benchmarks/queries/imdb/11b.sql @@ -0,0 +1 @@ +SELECT MIN(cn.name) AS from_company, MIN(lt.link) AS movie_link_type, MIN(t.title) AS sequel_movie FROM company_name AS cn, company_type AS ct, keyword AS k, link_type AS lt, movie_companies AS mc, movie_keyword AS mk, movie_link AS ml, title AS t WHERE cn.country_code !='[pl]' AND (cn.name LIKE '%Film%' OR cn.name LIKE '%Warner%') AND ct.kind ='production companies' AND k.keyword ='sequel' AND lt.link LIKE '%follows%' AND mc.note IS NULL AND t.production_year = 1998 and t.title like '%Money%' AND lt.id = ml.link_type_id AND ml.movie_id = t.id AND t.id = mk.movie_id AND mk.keyword_id = k.id AND t.id = mc.movie_id AND mc.company_type_id = ct.id AND mc.company_id = cn.id AND ml.movie_id = mk.movie_id AND ml.movie_id = mc.movie_id AND mk.movie_id = mc.movie_id; diff --git a/benchmarks/queries/imdb/11c.sql b/benchmarks/queries/imdb/11c.sql new file mode 100644 index 000000000000..3bf794678918 --- /dev/null +++ b/benchmarks/queries/imdb/11c.sql @@ -0,0 +1 @@ +SELECT MIN(cn.name) AS from_company, MIN(mc.note) AS production_note, MIN(t.title) AS movie_based_on_book FROM company_name AS cn, company_type AS ct, keyword AS k, link_type AS lt, movie_companies AS mc, movie_keyword AS mk, movie_link AS ml, title AS t WHERE cn.country_code !='[pl]' and (cn.name like '20th Century Fox%' or cn.name like 'Twentieth Century Fox%') AND ct.kind != 'production companies' and ct.kind is not NULL AND k.keyword in ('sequel', 'revenge', 'based-on-novel') AND mc.note is not NULL AND t.production_year > 1950 AND lt.id = ml.link_type_id AND ml.movie_id = t.id AND t.id = mk.movie_id AND mk.keyword_id = k.id AND t.id = mc.movie_id AND mc.company_type_id = ct.id AND mc.company_id = cn.id AND ml.movie_id = mk.movie_id AND ml.movie_id = mc.movie_id AND mk.movie_id = mc.movie_id; diff --git a/benchmarks/queries/imdb/11d.sql b/benchmarks/queries/imdb/11d.sql new file mode 100644 index 000000000000..0bc33e1d6e88 --- /dev/null +++ b/benchmarks/queries/imdb/11d.sql @@ -0,0 +1 @@ +SELECT MIN(cn.name) AS from_company, MIN(mc.note) AS production_note, MIN(t.title) AS movie_based_on_book FROM company_name AS cn, company_type AS ct, keyword AS k, link_type AS lt, movie_companies AS mc, movie_keyword AS mk, movie_link AS ml, title AS t WHERE cn.country_code !='[pl]' AND ct.kind != 'production companies' and ct.kind is not NULL AND k.keyword in ('sequel', 'revenge', 'based-on-novel') AND mc.note is not NULL AND t.production_year > 1950 AND lt.id = ml.link_type_id AND ml.movie_id = t.id AND t.id = mk.movie_id AND mk.keyword_id = k.id AND t.id = mc.movie_id AND mc.company_type_id = ct.id AND mc.company_id = cn.id AND ml.movie_id = mk.movie_id AND ml.movie_id = mc.movie_id AND mk.movie_id = mc.movie_id; diff --git a/benchmarks/queries/imdb/12a.sql b/benchmarks/queries/imdb/12a.sql new file mode 100644 index 000000000000..22add74bd55d --- /dev/null +++ b/benchmarks/queries/imdb/12a.sql @@ -0,0 +1 @@ +SELECT MIN(cn.name) AS movie_company, MIN(mi_idx.info) AS rating, MIN(t.title) AS drama_horror_movie FROM company_name AS cn, company_type AS ct, info_type AS it1, info_type AS it2, movie_companies AS mc, movie_info AS mi, movie_info_idx AS mi_idx, title AS t WHERE cn.country_code = '[us]' AND ct.kind = 'production companies' AND it1.info = 'genres' AND it2.info = 'rating' AND mi.info in ('Drama', 'Horror') AND mi_idx.info > '8.0' AND t.production_year between 2005 and 2008 AND t.id = mi.movie_id AND t.id = mi_idx.movie_id AND mi.info_type_id = it1.id AND mi_idx.info_type_id = it2.id AND t.id = mc.movie_id AND ct.id = mc.company_type_id AND cn.id = mc.company_id AND mc.movie_id = mi.movie_id AND mc.movie_id = mi_idx.movie_id AND mi.movie_id = mi_idx.movie_id; diff --git a/benchmarks/queries/imdb/12b.sql b/benchmarks/queries/imdb/12b.sql new file mode 100644 index 000000000000..fc30ad550d10 --- /dev/null +++ b/benchmarks/queries/imdb/12b.sql @@ -0,0 +1 @@ +SELECT MIN(mi.info) AS budget, MIN(t.title) AS unsuccsessful_movie FROM company_name AS cn, company_type AS ct, info_type AS it1, info_type AS it2, movie_companies AS mc, movie_info AS mi, movie_info_idx AS mi_idx, title AS t WHERE cn.country_code ='[us]' AND ct.kind is not NULL and (ct.kind ='production companies' or ct.kind = 'distributors') AND it1.info ='budget' AND it2.info ='bottom 10 rank' AND t.production_year >2000 AND (t.title LIKE 'Birdemic%' OR t.title LIKE '%Movie%') AND t.id = mi.movie_id AND t.id = mi_idx.movie_id AND mi.info_type_id = it1.id AND mi_idx.info_type_id = it2.id AND t.id = mc.movie_id AND ct.id = mc.company_type_id AND cn.id = mc.company_id AND mc.movie_id = mi.movie_id AND mc.movie_id = mi_idx.movie_id AND mi.movie_id = mi_idx.movie_id; diff --git a/benchmarks/queries/imdb/12c.sql b/benchmarks/queries/imdb/12c.sql new file mode 100644 index 000000000000..64a340b2381e --- /dev/null +++ b/benchmarks/queries/imdb/12c.sql @@ -0,0 +1 @@ +SELECT MIN(cn.name) AS movie_company, MIN(mi_idx.info) AS rating, MIN(t.title) AS mainstream_movie FROM company_name AS cn, company_type AS ct, info_type AS it1, info_type AS it2, movie_companies AS mc, movie_info AS mi, movie_info_idx AS mi_idx, title AS t WHERE cn.country_code = '[us]' AND ct.kind = 'production companies' AND it1.info = 'genres' AND it2.info = 'rating' AND mi.info in ('Drama', 'Horror', 'Western', 'Family') AND mi_idx.info > '7.0' AND t.production_year between 2000 and 2010 AND t.id = mi.movie_id AND t.id = mi_idx.movie_id AND mi.info_type_id = it1.id AND mi_idx.info_type_id = it2.id AND t.id = mc.movie_id AND ct.id = mc.company_type_id AND cn.id = mc.company_id AND mc.movie_id = mi.movie_id AND mc.movie_id = mi_idx.movie_id AND mi.movie_id = mi_idx.movie_id; diff --git a/benchmarks/queries/imdb/13a.sql b/benchmarks/queries/imdb/13a.sql new file mode 100644 index 000000000000..95eb439d1e22 --- /dev/null +++ b/benchmarks/queries/imdb/13a.sql @@ -0,0 +1 @@ +SELECT MIN(mi.info) AS release_date, MIN(miidx.info) AS rating, MIN(t.title) AS german_movie FROM company_name AS cn, company_type AS ct, info_type AS it, info_type AS it2, kind_type AS kt, movie_companies AS mc, movie_info AS mi, movie_info_idx AS miidx, title AS t WHERE cn.country_code ='[de]' AND ct.kind ='production companies' AND it.info ='rating' AND it2.info ='release dates' AND kt.kind ='movie' AND mi.movie_id = t.id AND it2.id = mi.info_type_id AND kt.id = t.kind_id AND mc.movie_id = t.id AND cn.id = mc.company_id AND ct.id = mc.company_type_id AND miidx.movie_id = t.id AND it.id = miidx.info_type_id AND mi.movie_id = miidx.movie_id AND mi.movie_id = mc.movie_id AND miidx.movie_id = mc.movie_id; diff --git a/benchmarks/queries/imdb/13b.sql b/benchmarks/queries/imdb/13b.sql new file mode 100644 index 000000000000..4b6f75ab0ae6 --- /dev/null +++ b/benchmarks/queries/imdb/13b.sql @@ -0,0 +1 @@ +SELECT MIN(cn.name) AS producing_company, MIN(miidx.info) AS rating, MIN(t.title) AS movie_about_winning FROM company_name AS cn, company_type AS ct, info_type AS it, info_type AS it2, kind_type AS kt, movie_companies AS mc, movie_info AS mi, movie_info_idx AS miidx, title AS t WHERE cn.country_code ='[us]' AND ct.kind ='production companies' AND it.info ='rating' AND it2.info ='release dates' AND kt.kind ='movie' AND t.title != '' AND (t.title LIKE '%Champion%' OR t.title LIKE '%Loser%') AND mi.movie_id = t.id AND it2.id = mi.info_type_id AND kt.id = t.kind_id AND mc.movie_id = t.id AND cn.id = mc.company_id AND ct.id = mc.company_type_id AND miidx.movie_id = t.id AND it.id = miidx.info_type_id AND mi.movie_id = miidx.movie_id AND mi.movie_id = mc.movie_id AND miidx.movie_id = mc.movie_id; diff --git a/benchmarks/queries/imdb/13c.sql b/benchmarks/queries/imdb/13c.sql new file mode 100644 index 000000000000..9e8c92327bd5 --- /dev/null +++ b/benchmarks/queries/imdb/13c.sql @@ -0,0 +1 @@ +SELECT MIN(cn.name) AS producing_company, MIN(miidx.info) AS rating, MIN(t.title) AS movie_about_winning FROM company_name AS cn, company_type AS ct, info_type AS it, info_type AS it2, kind_type AS kt, movie_companies AS mc, movie_info AS mi, movie_info_idx AS miidx, title AS t WHERE cn.country_code ='[us]' AND ct.kind ='production companies' AND it.info ='rating' AND it2.info ='release dates' AND kt.kind ='movie' AND t.title != '' AND (t.title LIKE 'Champion%' OR t.title LIKE 'Loser%') AND mi.movie_id = t.id AND it2.id = mi.info_type_id AND kt.id = t.kind_id AND mc.movie_id = t.id AND cn.id = mc.company_id AND ct.id = mc.company_type_id AND miidx.movie_id = t.id AND it.id = miidx.info_type_id AND mi.movie_id = miidx.movie_id AND mi.movie_id = mc.movie_id AND miidx.movie_id = mc.movie_id; diff --git a/benchmarks/queries/imdb/13d.sql b/benchmarks/queries/imdb/13d.sql new file mode 100644 index 000000000000..a8bc567cabe1 --- /dev/null +++ b/benchmarks/queries/imdb/13d.sql @@ -0,0 +1 @@ +SELECT MIN(cn.name) AS producing_company, MIN(miidx.info) AS rating, MIN(t.title) AS movie FROM company_name AS cn, company_type AS ct, info_type AS it, info_type AS it2, kind_type AS kt, movie_companies AS mc, movie_info AS mi, movie_info_idx AS miidx, title AS t WHERE cn.country_code ='[us]' AND ct.kind ='production companies' AND it.info ='rating' AND it2.info ='release dates' AND kt.kind ='movie' AND mi.movie_id = t.id AND it2.id = mi.info_type_id AND kt.id = t.kind_id AND mc.movie_id = t.id AND cn.id = mc.company_id AND ct.id = mc.company_type_id AND miidx.movie_id = t.id AND it.id = miidx.info_type_id AND mi.movie_id = miidx.movie_id AND mi.movie_id = mc.movie_id AND miidx.movie_id = mc.movie_id; diff --git a/benchmarks/queries/imdb/14a.sql b/benchmarks/queries/imdb/14a.sql new file mode 100644 index 000000000000..af1a7c8983a6 --- /dev/null +++ b/benchmarks/queries/imdb/14a.sql @@ -0,0 +1 @@ +SELECT MIN(mi_idx.info) AS rating, MIN(t.title) AS northern_dark_movie FROM info_type AS it1, info_type AS it2, keyword AS k, kind_type AS kt, movie_info AS mi, movie_info_idx AS mi_idx, movie_keyword AS mk, title AS t WHERE it1.info = 'countries' AND it2.info = 'rating' AND k.keyword in ('murder', 'murder-in-title', 'blood', 'violence') AND kt.kind = 'movie' AND mi.info IN ('Sweden', 'Norway', 'Germany', 'Denmark', 'Swedish', 'Denish', 'Norwegian', 'German', 'USA', 'American') AND mi_idx.info < '8.5' AND t.production_year > 2010 AND kt.id = t.kind_id AND t.id = mi.movie_id AND t.id = mk.movie_id AND t.id = mi_idx.movie_id AND mk.movie_id = mi.movie_id AND mk.movie_id = mi_idx.movie_id AND mi.movie_id = mi_idx.movie_id AND k.id = mk.keyword_id AND it1.id = mi.info_type_id AND it2.id = mi_idx.info_type_id; diff --git a/benchmarks/queries/imdb/14b.sql b/benchmarks/queries/imdb/14b.sql new file mode 100644 index 000000000000..c606ebc73dd4 --- /dev/null +++ b/benchmarks/queries/imdb/14b.sql @@ -0,0 +1 @@ +SELECT MIN(mi_idx.info) AS rating, MIN(t.title) AS western_dark_production FROM info_type AS it1, info_type AS it2, keyword AS k, kind_type AS kt, movie_info AS mi, movie_info_idx AS mi_idx, movie_keyword AS mk, title AS t WHERE it1.info = 'countries' AND it2.info = 'rating' AND k.keyword in ('murder', 'murder-in-title') AND kt.kind = 'movie' AND mi.info IN ('Sweden', 'Norway', 'Germany', 'Denmark', 'Swedish', 'Denish', 'Norwegian', 'German', 'USA', 'American') AND mi_idx.info > '6.0' AND t.production_year > 2010 and (t.title like '%murder%' or t.title like '%Murder%' or t.title like '%Mord%') AND kt.id = t.kind_id AND t.id = mi.movie_id AND t.id = mk.movie_id AND t.id = mi_idx.movie_id AND mk.movie_id = mi.movie_id AND mk.movie_id = mi_idx.movie_id AND mi.movie_id = mi_idx.movie_id AND k.id = mk.keyword_id AND it1.id = mi.info_type_id AND it2.id = mi_idx.info_type_id; diff --git a/benchmarks/queries/imdb/14c.sql b/benchmarks/queries/imdb/14c.sql new file mode 100644 index 000000000000..2a6dffde2639 --- /dev/null +++ b/benchmarks/queries/imdb/14c.sql @@ -0,0 +1 @@ +SELECT MIN(mi_idx.info) AS rating, MIN(t.title) AS north_european_dark_production FROM info_type AS it1, info_type AS it2, keyword AS k, kind_type AS kt, movie_info AS mi, movie_info_idx AS mi_idx, movie_keyword AS mk, title AS t WHERE it1.info = 'countries' AND it2.info = 'rating' AND k.keyword is not null and k.keyword in ('murder', 'murder-in-title', 'blood', 'violence') AND kt.kind in ('movie', 'episode') AND mi.info IN ('Sweden', 'Norway', 'Germany', 'Denmark', 'Swedish', 'Danish', 'Norwegian', 'German', 'USA', 'American') AND mi_idx.info < '8.5' AND t.production_year > 2005 AND kt.id = t.kind_id AND t.id = mi.movie_id AND t.id = mk.movie_id AND t.id = mi_idx.movie_id AND mk.movie_id = mi.movie_id AND mk.movie_id = mi_idx.movie_id AND mi.movie_id = mi_idx.movie_id AND k.id = mk.keyword_id AND it1.id = mi.info_type_id AND it2.id = mi_idx.info_type_id; diff --git a/benchmarks/queries/imdb/15a.sql b/benchmarks/queries/imdb/15a.sql new file mode 100644 index 000000000000..1d052f004426 --- /dev/null +++ b/benchmarks/queries/imdb/15a.sql @@ -0,0 +1 @@ +SELECT MIN(mi.info) AS release_date, MIN(t.title) AS internet_movie FROM aka_title AS at, company_name AS cn, company_type AS ct, info_type AS it1, keyword AS k, movie_companies AS mc, movie_info AS mi, movie_keyword AS mk, title AS t WHERE cn.country_code = '[us]' AND it1.info = 'release dates' AND mc.note like '%(200%)%' and mc.note like '%(worldwide)%' AND mi.note like '%internet%' AND mi.info like 'USA:% 200%' AND t.production_year > 2000 AND t.id = at.movie_id AND t.id = mi.movie_id AND t.id = mk.movie_id AND t.id = mc.movie_id AND mk.movie_id = mi.movie_id AND mk.movie_id = mc.movie_id AND mk.movie_id = at.movie_id AND mi.movie_id = mc.movie_id AND mi.movie_id = at.movie_id AND mc.movie_id = at.movie_id AND k.id = mk.keyword_id AND it1.id = mi.info_type_id AND cn.id = mc.company_id AND ct.id = mc.company_type_id; diff --git a/benchmarks/queries/imdb/15b.sql b/benchmarks/queries/imdb/15b.sql new file mode 100644 index 000000000000..21c81358fa7a --- /dev/null +++ b/benchmarks/queries/imdb/15b.sql @@ -0,0 +1 @@ +SELECT MIN(mi.info) AS release_date, MIN(t.title) AS youtube_movie FROM aka_title AS at, company_name AS cn, company_type AS ct, info_type AS it1, keyword AS k, movie_companies AS mc, movie_info AS mi, movie_keyword AS mk, title AS t WHERE cn.country_code = '[us]' and cn.name = 'YouTube' AND it1.info = 'release dates' AND mc.note like '%(200%)%' and mc.note like '%(worldwide)%' AND mi.note like '%internet%' AND mi.info like 'USA:% 200%' AND t.production_year between 2005 and 2010 AND t.id = at.movie_id AND t.id = mi.movie_id AND t.id = mk.movie_id AND t.id = mc.movie_id AND mk.movie_id = mi.movie_id AND mk.movie_id = mc.movie_id AND mk.movie_id = at.movie_id AND mi.movie_id = mc.movie_id AND mi.movie_id = at.movie_id AND mc.movie_id = at.movie_id AND k.id = mk.keyword_id AND it1.id = mi.info_type_id AND cn.id = mc.company_id AND ct.id = mc.company_type_id; diff --git a/benchmarks/queries/imdb/15c.sql b/benchmarks/queries/imdb/15c.sql new file mode 100644 index 000000000000..2d08c5203974 --- /dev/null +++ b/benchmarks/queries/imdb/15c.sql @@ -0,0 +1 @@ +SELECT MIN(mi.info) AS release_date, MIN(t.title) AS modern_american_internet_movie FROM aka_title AS at, company_name AS cn, company_type AS ct, info_type AS it1, keyword AS k, movie_companies AS mc, movie_info AS mi, movie_keyword AS mk, title AS t WHERE cn.country_code = '[us]' AND it1.info = 'release dates' AND mi.note like '%internet%' AND mi.info is not NULL and (mi.info like 'USA:% 199%' or mi.info like 'USA:% 200%') AND t.production_year > 1990 AND t.id = at.movie_id AND t.id = mi.movie_id AND t.id = mk.movie_id AND t.id = mc.movie_id AND mk.movie_id = mi.movie_id AND mk.movie_id = mc.movie_id AND mk.movie_id = at.movie_id AND mi.movie_id = mc.movie_id AND mi.movie_id = at.movie_id AND mc.movie_id = at.movie_id AND k.id = mk.keyword_id AND it1.id = mi.info_type_id AND cn.id = mc.company_id AND ct.id = mc.company_type_id; diff --git a/benchmarks/queries/imdb/15d.sql b/benchmarks/queries/imdb/15d.sql new file mode 100644 index 000000000000..040e9815d86c --- /dev/null +++ b/benchmarks/queries/imdb/15d.sql @@ -0,0 +1 @@ +SELECT MIN(at.title) AS aka_title, MIN(t.title) AS internet_movie_title FROM aka_title AS at, company_name AS cn, company_type AS ct, info_type AS it1, keyword AS k, movie_companies AS mc, movie_info AS mi, movie_keyword AS mk, title AS t WHERE cn.country_code = '[us]' AND it1.info = 'release dates' AND mi.note like '%internet%' AND t.production_year > 1990 AND t.id = at.movie_id AND t.id = mi.movie_id AND t.id = mk.movie_id AND t.id = mc.movie_id AND mk.movie_id = mi.movie_id AND mk.movie_id = mc.movie_id AND mk.movie_id = at.movie_id AND mi.movie_id = mc.movie_id AND mi.movie_id = at.movie_id AND mc.movie_id = at.movie_id AND k.id = mk.keyword_id AND it1.id = mi.info_type_id AND cn.id = mc.company_id AND ct.id = mc.company_type_id; diff --git a/benchmarks/queries/imdb/16a.sql b/benchmarks/queries/imdb/16a.sql new file mode 100644 index 000000000000..aaa0020269d2 --- /dev/null +++ b/benchmarks/queries/imdb/16a.sql @@ -0,0 +1 @@ +SELECT MIN(an.name) AS cool_actor_pseudonym, MIN(t.title) AS series_named_after_char FROM aka_name AS an, cast_info AS ci, company_name AS cn, keyword AS k, movie_companies AS mc, movie_keyword AS mk, name AS n, title AS t WHERE cn.country_code ='[us]' AND k.keyword ='character-name-in-title' AND t.episode_nr >= 50 AND t.episode_nr < 100 AND an.person_id = n.id AND n.id = ci.person_id AND ci.movie_id = t.id AND t.id = mk.movie_id AND mk.keyword_id = k.id AND t.id = mc.movie_id AND mc.company_id = cn.id AND an.person_id = ci.person_id AND ci.movie_id = mc.movie_id AND ci.movie_id = mk.movie_id AND mc.movie_id = mk.movie_id; diff --git a/benchmarks/queries/imdb/16b.sql b/benchmarks/queries/imdb/16b.sql new file mode 100644 index 000000000000..c6c0bef319de --- /dev/null +++ b/benchmarks/queries/imdb/16b.sql @@ -0,0 +1 @@ +SELECT MIN(an.name) AS cool_actor_pseudonym, MIN(t.title) AS series_named_after_char FROM aka_name AS an, cast_info AS ci, company_name AS cn, keyword AS k, movie_companies AS mc, movie_keyword AS mk, name AS n, title AS t WHERE cn.country_code ='[us]' AND k.keyword ='character-name-in-title' AND an.person_id = n.id AND n.id = ci.person_id AND ci.movie_id = t.id AND t.id = mk.movie_id AND mk.keyword_id = k.id AND t.id = mc.movie_id AND mc.company_id = cn.id AND an.person_id = ci.person_id AND ci.movie_id = mc.movie_id AND ci.movie_id = mk.movie_id AND mc.movie_id = mk.movie_id; diff --git a/benchmarks/queries/imdb/16c.sql b/benchmarks/queries/imdb/16c.sql new file mode 100644 index 000000000000..5c3b35752195 --- /dev/null +++ b/benchmarks/queries/imdb/16c.sql @@ -0,0 +1 @@ +SELECT MIN(an.name) AS cool_actor_pseudonym, MIN(t.title) AS series_named_after_char FROM aka_name AS an, cast_info AS ci, company_name AS cn, keyword AS k, movie_companies AS mc, movie_keyword AS mk, name AS n, title AS t WHERE cn.country_code ='[us]' AND k.keyword ='character-name-in-title' AND t.episode_nr < 100 AND an.person_id = n.id AND n.id = ci.person_id AND ci.movie_id = t.id AND t.id = mk.movie_id AND mk.keyword_id = k.id AND t.id = mc.movie_id AND mc.company_id = cn.id AND an.person_id = ci.person_id AND ci.movie_id = mc.movie_id AND ci.movie_id = mk.movie_id AND mc.movie_id = mk.movie_id; diff --git a/benchmarks/queries/imdb/16d.sql b/benchmarks/queries/imdb/16d.sql new file mode 100644 index 000000000000..c9e1b5f25ce5 --- /dev/null +++ b/benchmarks/queries/imdb/16d.sql @@ -0,0 +1 @@ +SELECT MIN(an.name) AS cool_actor_pseudonym, MIN(t.title) AS series_named_after_char FROM aka_name AS an, cast_info AS ci, company_name AS cn, keyword AS k, movie_companies AS mc, movie_keyword AS mk, name AS n, title AS t WHERE cn.country_code ='[us]' AND k.keyword ='character-name-in-title' AND t.episode_nr >= 5 AND t.episode_nr < 100 AND an.person_id = n.id AND n.id = ci.person_id AND ci.movie_id = t.id AND t.id = mk.movie_id AND mk.keyword_id = k.id AND t.id = mc.movie_id AND mc.company_id = cn.id AND an.person_id = ci.person_id AND ci.movie_id = mc.movie_id AND ci.movie_id = mk.movie_id AND mc.movie_id = mk.movie_id; diff --git a/benchmarks/queries/imdb/17a.sql b/benchmarks/queries/imdb/17a.sql new file mode 100644 index 000000000000..e854a957e429 --- /dev/null +++ b/benchmarks/queries/imdb/17a.sql @@ -0,0 +1 @@ +SELECT MIN(n.name) AS member_in_charnamed_american_movie, MIN(n.name) AS a1 FROM cast_info AS ci, company_name AS cn, keyword AS k, movie_companies AS mc, movie_keyword AS mk, name AS n, title AS t WHERE cn.country_code ='[us]' AND k.keyword ='character-name-in-title' AND n.name LIKE 'B%' AND n.id = ci.person_id AND ci.movie_id = t.id AND t.id = mk.movie_id AND mk.keyword_id = k.id AND t.id = mc.movie_id AND mc.company_id = cn.id AND ci.movie_id = mc.movie_id AND ci.movie_id = mk.movie_id AND mc.movie_id = mk.movie_id; diff --git a/benchmarks/queries/imdb/17b.sql b/benchmarks/queries/imdb/17b.sql new file mode 100644 index 000000000000..903f2196b278 --- /dev/null +++ b/benchmarks/queries/imdb/17b.sql @@ -0,0 +1 @@ +SELECT MIN(n.name) AS member_in_charnamed_movie, MIN(n.name) AS a1 FROM cast_info AS ci, company_name AS cn, keyword AS k, movie_companies AS mc, movie_keyword AS mk, name AS n, title AS t WHERE k.keyword ='character-name-in-title' AND n.name LIKE 'Z%' AND n.id = ci.person_id AND ci.movie_id = t.id AND t.id = mk.movie_id AND mk.keyword_id = k.id AND t.id = mc.movie_id AND mc.company_id = cn.id AND ci.movie_id = mc.movie_id AND ci.movie_id = mk.movie_id AND mc.movie_id = mk.movie_id; diff --git a/benchmarks/queries/imdb/17c.sql b/benchmarks/queries/imdb/17c.sql new file mode 100644 index 000000000000..a96faa0b4339 --- /dev/null +++ b/benchmarks/queries/imdb/17c.sql @@ -0,0 +1 @@ +SELECT MIN(n.name) AS member_in_charnamed_movie, MIN(n.name) AS a1 FROM cast_info AS ci, company_name AS cn, keyword AS k, movie_companies AS mc, movie_keyword AS mk, name AS n, title AS t WHERE k.keyword ='character-name-in-title' AND n.name LIKE 'X%' AND n.id = ci.person_id AND ci.movie_id = t.id AND t.id = mk.movie_id AND mk.keyword_id = k.id AND t.id = mc.movie_id AND mc.company_id = cn.id AND ci.movie_id = mc.movie_id AND ci.movie_id = mk.movie_id AND mc.movie_id = mk.movie_id; diff --git a/benchmarks/queries/imdb/17d.sql b/benchmarks/queries/imdb/17d.sql new file mode 100644 index 000000000000..73e1f2c30976 --- /dev/null +++ b/benchmarks/queries/imdb/17d.sql @@ -0,0 +1 @@ +SELECT MIN(n.name) AS member_in_charnamed_movie FROM cast_info AS ci, company_name AS cn, keyword AS k, movie_companies AS mc, movie_keyword AS mk, name AS n, title AS t WHERE k.keyword ='character-name-in-title' AND n.name LIKE '%Bert%' AND n.id = ci.person_id AND ci.movie_id = t.id AND t.id = mk.movie_id AND mk.keyword_id = k.id AND t.id = mc.movie_id AND mc.company_id = cn.id AND ci.movie_id = mc.movie_id AND ci.movie_id = mk.movie_id AND mc.movie_id = mk.movie_id; diff --git a/benchmarks/queries/imdb/17e.sql b/benchmarks/queries/imdb/17e.sql new file mode 100644 index 000000000000..65ea73ed0510 --- /dev/null +++ b/benchmarks/queries/imdb/17e.sql @@ -0,0 +1 @@ +SELECT MIN(n.name) AS member_in_charnamed_movie FROM cast_info AS ci, company_name AS cn, keyword AS k, movie_companies AS mc, movie_keyword AS mk, name AS n, title AS t WHERE cn.country_code ='[us]' AND k.keyword ='character-name-in-title' AND n.id = ci.person_id AND ci.movie_id = t.id AND t.id = mk.movie_id AND mk.keyword_id = k.id AND t.id = mc.movie_id AND mc.company_id = cn.id AND ci.movie_id = mc.movie_id AND ci.movie_id = mk.movie_id AND mc.movie_id = mk.movie_id; diff --git a/benchmarks/queries/imdb/17f.sql b/benchmarks/queries/imdb/17f.sql new file mode 100644 index 000000000000..542233d63e9d --- /dev/null +++ b/benchmarks/queries/imdb/17f.sql @@ -0,0 +1 @@ +SELECT MIN(n.name) AS member_in_charnamed_movie FROM cast_info AS ci, company_name AS cn, keyword AS k, movie_companies AS mc, movie_keyword AS mk, name AS n, title AS t WHERE k.keyword ='character-name-in-title' AND n.name LIKE '%B%' AND n.id = ci.person_id AND ci.movie_id = t.id AND t.id = mk.movie_id AND mk.keyword_id = k.id AND t.id = mc.movie_id AND mc.company_id = cn.id AND ci.movie_id = mc.movie_id AND ci.movie_id = mk.movie_id AND mc.movie_id = mk.movie_id; diff --git a/benchmarks/queries/imdb/18a.sql b/benchmarks/queries/imdb/18a.sql new file mode 100644 index 000000000000..275e04bdb184 --- /dev/null +++ b/benchmarks/queries/imdb/18a.sql @@ -0,0 +1 @@ +SELECT MIN(mi.info) AS movie_budget, MIN(mi_idx.info) AS movie_votes, MIN(t.title) AS movie_title FROM cast_info AS ci, info_type AS it1, info_type AS it2, movie_info AS mi, movie_info_idx AS mi_idx, name AS n, title AS t WHERE ci.note in ('(producer)', '(executive producer)') AND it1.info = 'budget' AND it2.info = 'votes' AND n.gender = 'm' and n.name like '%Tim%' AND t.id = mi.movie_id AND t.id = mi_idx.movie_id AND t.id = ci.movie_id AND ci.movie_id = mi.movie_id AND ci.movie_id = mi_idx.movie_id AND mi.movie_id = mi_idx.movie_id AND n.id = ci.person_id AND it1.id = mi.info_type_id AND it2.id = mi_idx.info_type_id; diff --git a/benchmarks/queries/imdb/18b.sql b/benchmarks/queries/imdb/18b.sql new file mode 100644 index 000000000000..3ae40ed93d2f --- /dev/null +++ b/benchmarks/queries/imdb/18b.sql @@ -0,0 +1 @@ +SELECT MIN(mi.info) AS movie_budget, MIN(mi_idx.info) AS movie_votes, MIN(t.title) AS movie_title FROM cast_info AS ci, info_type AS it1, info_type AS it2, movie_info AS mi, movie_info_idx AS mi_idx, name AS n, title AS t WHERE ci.note in ('(writer)', '(head writer)', '(written by)', '(story)', '(story editor)') AND it1.info = 'genres' AND it2.info = 'rating' AND mi.info in ('Horror', 'Thriller') and mi.note is NULL AND mi_idx.info > '8.0' AND n.gender is not null and n.gender = 'f' AND t.production_year between 2008 and 2014 AND t.id = mi.movie_id AND t.id = mi_idx.movie_id AND t.id = ci.movie_id AND ci.movie_id = mi.movie_id AND ci.movie_id = mi_idx.movie_id AND mi.movie_id = mi_idx.movie_id AND n.id = ci.person_id AND it1.id = mi.info_type_id AND it2.id = mi_idx.info_type_id; diff --git a/benchmarks/queries/imdb/18c.sql b/benchmarks/queries/imdb/18c.sql new file mode 100644 index 000000000000..01f28ea527fe --- /dev/null +++ b/benchmarks/queries/imdb/18c.sql @@ -0,0 +1 @@ +SELECT MIN(mi.info) AS movie_budget, MIN(mi_idx.info) AS movie_votes, MIN(t.title) AS movie_title FROM cast_info AS ci, info_type AS it1, info_type AS it2, movie_info AS mi, movie_info_idx AS mi_idx, name AS n, title AS t WHERE ci.note in ('(writer)', '(head writer)', '(written by)', '(story)', '(story editor)') AND it1.info = 'genres' AND it2.info = 'votes' AND mi.info in ('Horror', 'Action', 'Sci-Fi', 'Thriller', 'Crime', 'War') AND n.gender = 'm' AND t.id = mi.movie_id AND t.id = mi_idx.movie_id AND t.id = ci.movie_id AND ci.movie_id = mi.movie_id AND ci.movie_id = mi_idx.movie_id AND mi.movie_id = mi_idx.movie_id AND n.id = ci.person_id AND it1.id = mi.info_type_id AND it2.id = mi_idx.info_type_id; diff --git a/benchmarks/queries/imdb/19a.sql b/benchmarks/queries/imdb/19a.sql new file mode 100644 index 000000000000..ceaae671fd20 --- /dev/null +++ b/benchmarks/queries/imdb/19a.sql @@ -0,0 +1 @@ +SELECT MIN(n.name) AS voicing_actress, MIN(t.title) AS voiced_movie FROM aka_name AS an, char_name AS chn, cast_info AS ci, company_name AS cn, info_type AS it, movie_companies AS mc, movie_info AS mi, name AS n, role_type AS rt, title AS t WHERE ci.note in ('(voice)', '(voice: Japanese version)', '(voice) (uncredited)', '(voice: English version)') AND cn.country_code ='[us]' AND it.info = 'release dates' AND mc.note is not NULL and (mc.note like '%(USA)%' or mc.note like '%(worldwide)%') AND mi.info is not null and (mi.info like 'Japan:%200%' or mi.info like 'USA:%200%') AND n.gender ='f' and n.name like '%Ang%' AND rt.role ='actress' AND t.production_year between 2005 and 2009 AND t.id = mi.movie_id AND t.id = mc.movie_id AND t.id = ci.movie_id AND mc.movie_id = ci.movie_id AND mc.movie_id = mi.movie_id AND mi.movie_id = ci.movie_id AND cn.id = mc.company_id AND it.id = mi.info_type_id AND n.id = ci.person_id AND rt.id = ci.role_id AND n.id = an.person_id AND ci.person_id = an.person_id AND chn.id = ci.person_role_id; diff --git a/benchmarks/queries/imdb/19b.sql b/benchmarks/queries/imdb/19b.sql new file mode 100644 index 000000000000..62e852ba3ec6 --- /dev/null +++ b/benchmarks/queries/imdb/19b.sql @@ -0,0 +1 @@ +SELECT MIN(n.name) AS voicing_actress, MIN(t.title) AS kung_fu_panda FROM aka_name AS an, char_name AS chn, cast_info AS ci, company_name AS cn, info_type AS it, movie_companies AS mc, movie_info AS mi, name AS n, role_type AS rt, title AS t WHERE ci.note = '(voice)' AND cn.country_code ='[us]' AND it.info = 'release dates' AND mc.note like '%(200%)%' and (mc.note like '%(USA)%' or mc.note like '%(worldwide)%') AND mi.info is not null and (mi.info like 'Japan:%2007%' or mi.info like 'USA:%2008%') AND n.gender ='f' and n.name like '%Angel%' AND rt.role ='actress' AND t.production_year between 2007 and 2008 and t.title like '%Kung%Fu%Panda%' AND t.id = mi.movie_id AND t.id = mc.movie_id AND t.id = ci.movie_id AND mc.movie_id = ci.movie_id AND mc.movie_id = mi.movie_id AND mi.movie_id = ci.movie_id AND cn.id = mc.company_id AND it.id = mi.info_type_id AND n.id = ci.person_id AND rt.id = ci.role_id AND n.id = an.person_id AND ci.person_id = an.person_id AND chn.id = ci.person_role_id; diff --git a/benchmarks/queries/imdb/19c.sql b/benchmarks/queries/imdb/19c.sql new file mode 100644 index 000000000000..6885af5012fc --- /dev/null +++ b/benchmarks/queries/imdb/19c.sql @@ -0,0 +1 @@ +SELECT MIN(n.name) AS voicing_actress, MIN(t.title) AS jap_engl_voiced_movie FROM aka_name AS an, char_name AS chn, cast_info AS ci, company_name AS cn, info_type AS it, movie_companies AS mc, movie_info AS mi, name AS n, role_type AS rt, title AS t WHERE ci.note in ('(voice)', '(voice: Japanese version)', '(voice) (uncredited)', '(voice: English version)') AND cn.country_code ='[us]' AND it.info = 'release dates' AND mi.info is not null and (mi.info like 'Japan:%200%' or mi.info like 'USA:%200%') AND n.gender ='f' and n.name like '%An%' AND rt.role ='actress' AND t.production_year > 2000 AND t.id = mi.movie_id AND t.id = mc.movie_id AND t.id = ci.movie_id AND mc.movie_id = ci.movie_id AND mc.movie_id = mi.movie_id AND mi.movie_id = ci.movie_id AND cn.id = mc.company_id AND it.id = mi.info_type_id AND n.id = ci.person_id AND rt.id = ci.role_id AND n.id = an.person_id AND ci.person_id = an.person_id AND chn.id = ci.person_role_id; diff --git a/benchmarks/queries/imdb/19d.sql b/benchmarks/queries/imdb/19d.sql new file mode 100644 index 000000000000..06fcc76ba7ad --- /dev/null +++ b/benchmarks/queries/imdb/19d.sql @@ -0,0 +1 @@ +SELECT MIN(n.name) AS voicing_actress, MIN(t.title) AS jap_engl_voiced_movie FROM aka_name AS an, char_name AS chn, cast_info AS ci, company_name AS cn, info_type AS it, movie_companies AS mc, movie_info AS mi, name AS n, role_type AS rt, title AS t WHERE ci.note in ('(voice)', '(voice: Japanese version)', '(voice) (uncredited)', '(voice: English version)') AND cn.country_code ='[us]' AND it.info = 'release dates' AND n.gender ='f' AND rt.role ='actress' AND t.production_year > 2000 AND t.id = mi.movie_id AND t.id = mc.movie_id AND t.id = ci.movie_id AND mc.movie_id = ci.movie_id AND mc.movie_id = mi.movie_id AND mi.movie_id = ci.movie_id AND cn.id = mc.company_id AND it.id = mi.info_type_id AND n.id = ci.person_id AND rt.id = ci.role_id AND n.id = an.person_id AND ci.person_id = an.person_id AND chn.id = ci.person_role_id; diff --git a/benchmarks/queries/imdb/1a.sql b/benchmarks/queries/imdb/1a.sql new file mode 100644 index 000000000000..07b351638857 --- /dev/null +++ b/benchmarks/queries/imdb/1a.sql @@ -0,0 +1 @@ +SELECT MIN(mc.note) AS production_note, MIN(t.title) AS movie_title, MIN(t.production_year) AS movie_year FROM company_type AS ct, info_type AS it, movie_companies AS mc, movie_info_idx AS mi_idx, title AS t WHERE ct.kind = 'production companies' AND it.info = 'top 250 rank' AND mc.note not like '%(as Metro-Goldwyn-Mayer Pictures)%' and (mc.note like '%(co-production)%' or mc.note like '%(presents)%') AND ct.id = mc.company_type_id AND t.id = mc.movie_id AND t.id = mi_idx.movie_id AND mc.movie_id = mi_idx.movie_id AND it.id = mi_idx.info_type_id; diff --git a/benchmarks/queries/imdb/1b.sql b/benchmarks/queries/imdb/1b.sql new file mode 100644 index 000000000000..f2901e8b5262 --- /dev/null +++ b/benchmarks/queries/imdb/1b.sql @@ -0,0 +1 @@ +SELECT MIN(mc.note) AS production_note, MIN(t.title) AS movie_title, MIN(t.production_year) AS movie_year FROM company_type AS ct, info_type AS it, movie_companies AS mc, movie_info_idx AS mi_idx, title AS t WHERE ct.kind = 'production companies' AND it.info = 'bottom 10 rank' AND mc.note not like '%(as Metro-Goldwyn-Mayer Pictures)%' AND t.production_year between 2005 and 2010 AND ct.id = mc.company_type_id AND t.id = mc.movie_id AND t.id = mi_idx.movie_id AND mc.movie_id = mi_idx.movie_id AND it.id = mi_idx.info_type_id; diff --git a/benchmarks/queries/imdb/1c.sql b/benchmarks/queries/imdb/1c.sql new file mode 100644 index 000000000000..94e66c30aa14 --- /dev/null +++ b/benchmarks/queries/imdb/1c.sql @@ -0,0 +1 @@ +SELECT MIN(mc.note) AS production_note, MIN(t.title) AS movie_title, MIN(t.production_year) AS movie_year FROM company_type AS ct, info_type AS it, movie_companies AS mc, movie_info_idx AS mi_idx, title AS t WHERE ct.kind = 'production companies' AND it.info = 'top 250 rank' AND mc.note not like '%(as Metro-Goldwyn-Mayer Pictures)%' and (mc.note like '%(co-production)%') AND t.production_year >2010 AND ct.id = mc.company_type_id AND t.id = mc.movie_id AND t.id = mi_idx.movie_id AND mc.movie_id = mi_idx.movie_id AND it.id = mi_idx.info_type_id; diff --git a/benchmarks/queries/imdb/1d.sql b/benchmarks/queries/imdb/1d.sql new file mode 100644 index 000000000000..52f58e80c811 --- /dev/null +++ b/benchmarks/queries/imdb/1d.sql @@ -0,0 +1 @@ +SELECT MIN(mc.note) AS production_note, MIN(t.title) AS movie_title, MIN(t.production_year) AS movie_year FROM company_type AS ct, info_type AS it, movie_companies AS mc, movie_info_idx AS mi_idx, title AS t WHERE ct.kind = 'production companies' AND it.info = 'bottom 10 rank' AND mc.note not like '%(as Metro-Goldwyn-Mayer Pictures)%' AND t.production_year >2000 AND ct.id = mc.company_type_id AND t.id = mc.movie_id AND t.id = mi_idx.movie_id AND mc.movie_id = mi_idx.movie_id AND it.id = mi_idx.info_type_id; diff --git a/benchmarks/queries/imdb/20a.sql b/benchmarks/queries/imdb/20a.sql new file mode 100644 index 000000000000..2a1c269d6a51 --- /dev/null +++ b/benchmarks/queries/imdb/20a.sql @@ -0,0 +1 @@ +SELECT MIN(t.title) AS complete_downey_ironman_movie FROM complete_cast AS cc, comp_cast_type AS cct1, comp_cast_type AS cct2, char_name AS chn, cast_info AS ci, keyword AS k, kind_type AS kt, movie_keyword AS mk, name AS n, title AS t WHERE cct1.kind = 'cast' AND cct2.kind like '%complete%' AND chn.name not like '%Sherlock%' and (chn.name like '%Tony%Stark%' or chn.name like '%Iron%Man%') AND k.keyword in ('superhero', 'sequel', 'second-part', 'marvel-comics', 'based-on-comic', 'tv-special', 'fight', 'violence') AND kt.kind = 'movie' AND t.production_year > 1950 AND kt.id = t.kind_id AND t.id = mk.movie_id AND t.id = ci.movie_id AND t.id = cc.movie_id AND mk.movie_id = ci.movie_id AND mk.movie_id = cc.movie_id AND ci.movie_id = cc.movie_id AND chn.id = ci.person_role_id AND n.id = ci.person_id AND k.id = mk.keyword_id AND cct1.id = cc.subject_id AND cct2.id = cc.status_id; diff --git a/benchmarks/queries/imdb/20b.sql b/benchmarks/queries/imdb/20b.sql new file mode 100644 index 000000000000..4c2455a52eb1 --- /dev/null +++ b/benchmarks/queries/imdb/20b.sql @@ -0,0 +1 @@ +SELECT MIN(t.title) AS complete_downey_ironman_movie FROM complete_cast AS cc, comp_cast_type AS cct1, comp_cast_type AS cct2, char_name AS chn, cast_info AS ci, keyword AS k, kind_type AS kt, movie_keyword AS mk, name AS n, title AS t WHERE cct1.kind = 'cast' AND cct2.kind like '%complete%' AND chn.name not like '%Sherlock%' and (chn.name like '%Tony%Stark%' or chn.name like '%Iron%Man%') AND k.keyword in ('superhero', 'sequel', 'second-part', 'marvel-comics', 'based-on-comic', 'tv-special', 'fight', 'violence') AND kt.kind = 'movie' AND n.name LIKE '%Downey%Robert%' AND t.production_year > 2000 AND kt.id = t.kind_id AND t.id = mk.movie_id AND t.id = ci.movie_id AND t.id = cc.movie_id AND mk.movie_id = ci.movie_id AND mk.movie_id = cc.movie_id AND ci.movie_id = cc.movie_id AND chn.id = ci.person_role_id AND n.id = ci.person_id AND k.id = mk.keyword_id AND cct1.id = cc.subject_id AND cct2.id = cc.status_id; diff --git a/benchmarks/queries/imdb/20c.sql b/benchmarks/queries/imdb/20c.sql new file mode 100644 index 000000000000..b85b22f6b4f2 --- /dev/null +++ b/benchmarks/queries/imdb/20c.sql @@ -0,0 +1 @@ +SELECT MIN(n.name) AS cast_member, MIN(t.title) AS complete_dynamic_hero_movie FROM complete_cast AS cc, comp_cast_type AS cct1, comp_cast_type AS cct2, char_name AS chn, cast_info AS ci, keyword AS k, kind_type AS kt, movie_keyword AS mk, name AS n, title AS t WHERE cct1.kind = 'cast' AND cct2.kind like '%complete%' AND chn.name is not NULL and (chn.name like '%man%' or chn.name like '%Man%') AND k.keyword in ('superhero', 'marvel-comics', 'based-on-comic', 'tv-special', 'fight', 'violence', 'magnet', 'web', 'claw', 'laser') AND kt.kind = 'movie' AND t.production_year > 2000 AND kt.id = t.kind_id AND t.id = mk.movie_id AND t.id = ci.movie_id AND t.id = cc.movie_id AND mk.movie_id = ci.movie_id AND mk.movie_id = cc.movie_id AND ci.movie_id = cc.movie_id AND chn.id = ci.person_role_id AND n.id = ci.person_id AND k.id = mk.keyword_id AND cct1.id = cc.subject_id AND cct2.id = cc.status_id; diff --git a/benchmarks/queries/imdb/21a.sql b/benchmarks/queries/imdb/21a.sql new file mode 100644 index 000000000000..8a66a00be6cb --- /dev/null +++ b/benchmarks/queries/imdb/21a.sql @@ -0,0 +1 @@ +SELECT MIN(cn.name) AS company_name, MIN(lt.link) AS link_type, MIN(t.title) AS western_follow_up FROM company_name AS cn, company_type AS ct, keyword AS k, link_type AS lt, movie_companies AS mc, movie_info AS mi, movie_keyword AS mk, movie_link AS ml, title AS t WHERE cn.country_code !='[pl]' AND (cn.name LIKE '%Film%' OR cn.name LIKE '%Warner%') AND ct.kind ='production companies' AND k.keyword ='sequel' AND lt.link LIKE '%follow%' AND mc.note IS NULL AND mi.info IN ('Sweden', 'Norway', 'Germany', 'Denmark', 'Swedish', 'Denish', 'Norwegian', 'German') AND t.production_year BETWEEN 1950 AND 2000 AND lt.id = ml.link_type_id AND ml.movie_id = t.id AND t.id = mk.movie_id AND mk.keyword_id = k.id AND t.id = mc.movie_id AND mc.company_type_id = ct.id AND mc.company_id = cn.id AND mi.movie_id = t.id AND ml.movie_id = mk.movie_id AND ml.movie_id = mc.movie_id AND mk.movie_id = mc.movie_id AND ml.movie_id = mi.movie_id AND mk.movie_id = mi.movie_id AND mc.movie_id = mi.movie_id; diff --git a/benchmarks/queries/imdb/21b.sql b/benchmarks/queries/imdb/21b.sql new file mode 100644 index 000000000000..90d3a5a4c078 --- /dev/null +++ b/benchmarks/queries/imdb/21b.sql @@ -0,0 +1 @@ +SELECT MIN(cn.name) AS company_name, MIN(lt.link) AS link_type, MIN(t.title) AS german_follow_up FROM company_name AS cn, company_type AS ct, keyword AS k, link_type AS lt, movie_companies AS mc, movie_info AS mi, movie_keyword AS mk, movie_link AS ml, title AS t WHERE cn.country_code !='[pl]' AND (cn.name LIKE '%Film%' OR cn.name LIKE '%Warner%') AND ct.kind ='production companies' AND k.keyword ='sequel' AND lt.link LIKE '%follow%' AND mc.note IS NULL AND mi.info IN ('Germany', 'German') AND t.production_year BETWEEN 2000 AND 2010 AND lt.id = ml.link_type_id AND ml.movie_id = t.id AND t.id = mk.movie_id AND mk.keyword_id = k.id AND t.id = mc.movie_id AND mc.company_type_id = ct.id AND mc.company_id = cn.id AND mi.movie_id = t.id AND ml.movie_id = mk.movie_id AND ml.movie_id = mc.movie_id AND mk.movie_id = mc.movie_id AND ml.movie_id = mi.movie_id AND mk.movie_id = mi.movie_id AND mc.movie_id = mi.movie_id; diff --git a/benchmarks/queries/imdb/21c.sql b/benchmarks/queries/imdb/21c.sql new file mode 100644 index 000000000000..16a42ae6f426 --- /dev/null +++ b/benchmarks/queries/imdb/21c.sql @@ -0,0 +1 @@ +SELECT MIN(cn.name) AS company_name, MIN(lt.link) AS link_type, MIN(t.title) AS western_follow_up FROM company_name AS cn, company_type AS ct, keyword AS k, link_type AS lt, movie_companies AS mc, movie_info AS mi, movie_keyword AS mk, movie_link AS ml, title AS t WHERE cn.country_code !='[pl]' AND (cn.name LIKE '%Film%' OR cn.name LIKE '%Warner%') AND ct.kind ='production companies' AND k.keyword ='sequel' AND lt.link LIKE '%follow%' AND mc.note IS NULL AND mi.info IN ('Sweden', 'Norway', 'Germany', 'Denmark', 'Swedish', 'Denish', 'Norwegian', 'German', 'English') AND t.production_year BETWEEN 1950 AND 2010 AND lt.id = ml.link_type_id AND ml.movie_id = t.id AND t.id = mk.movie_id AND mk.keyword_id = k.id AND t.id = mc.movie_id AND mc.company_type_id = ct.id AND mc.company_id = cn.id AND mi.movie_id = t.id AND ml.movie_id = mk.movie_id AND ml.movie_id = mc.movie_id AND mk.movie_id = mc.movie_id AND ml.movie_id = mi.movie_id AND mk.movie_id = mi.movie_id AND mc.movie_id = mi.movie_id; diff --git a/benchmarks/queries/imdb/22a.sql b/benchmarks/queries/imdb/22a.sql new file mode 100644 index 000000000000..e513799698c5 --- /dev/null +++ b/benchmarks/queries/imdb/22a.sql @@ -0,0 +1 @@ +SELECT MIN(cn.name) AS movie_company, MIN(mi_idx.info) AS rating, MIN(t.title) AS western_violent_movie FROM company_name AS cn, company_type AS ct, info_type AS it1, info_type AS it2, keyword AS k, kind_type AS kt, movie_companies AS mc, movie_info AS mi, movie_info_idx AS mi_idx, movie_keyword AS mk, title AS t WHERE cn.country_code != '[us]' AND it1.info = 'countries' AND it2.info = 'rating' AND k.keyword in ('murder', 'murder-in-title', 'blood', 'violence') AND kt.kind in ('movie', 'episode') AND mc.note not like '%(USA)%' and mc.note like '%(200%)%' AND mi.info IN ('Germany', 'German', 'USA', 'American') AND mi_idx.info < '7.0' AND t.production_year > 2008 AND kt.id = t.kind_id AND t.id = mi.movie_id AND t.id = mk.movie_id AND t.id = mi_idx.movie_id AND t.id = mc.movie_id AND mk.movie_id = mi.movie_id AND mk.movie_id = mi_idx.movie_id AND mk.movie_id = mc.movie_id AND mi.movie_id = mi_idx.movie_id AND mi.movie_id = mc.movie_id AND mc.movie_id = mi_idx.movie_id AND k.id = mk.keyword_id AND it1.id = mi.info_type_id AND it2.id = mi_idx.info_type_id AND ct.id = mc.company_type_id AND cn.id = mc.company_id; diff --git a/benchmarks/queries/imdb/22b.sql b/benchmarks/queries/imdb/22b.sql new file mode 100644 index 000000000000..f98d0ea8099d --- /dev/null +++ b/benchmarks/queries/imdb/22b.sql @@ -0,0 +1 @@ +SELECT MIN(cn.name) AS movie_company, MIN(mi_idx.info) AS rating, MIN(t.title) AS western_violent_movie FROM company_name AS cn, company_type AS ct, info_type AS it1, info_type AS it2, keyword AS k, kind_type AS kt, movie_companies AS mc, movie_info AS mi, movie_info_idx AS mi_idx, movie_keyword AS mk, title AS t WHERE cn.country_code != '[us]' AND it1.info = 'countries' AND it2.info = 'rating' AND k.keyword in ('murder', 'murder-in-title', 'blood', 'violence') AND kt.kind in ('movie', 'episode') AND mc.note not like '%(USA)%' and mc.note like '%(200%)%' AND mi.info IN ('Germany', 'German', 'USA', 'American') AND mi_idx.info < '7.0' AND t.production_year > 2009 AND kt.id = t.kind_id AND t.id = mi.movie_id AND t.id = mk.movie_id AND t.id = mi_idx.movie_id AND t.id = mc.movie_id AND mk.movie_id = mi.movie_id AND mk.movie_id = mi_idx.movie_id AND mk.movie_id = mc.movie_id AND mi.movie_id = mi_idx.movie_id AND mi.movie_id = mc.movie_id AND mc.movie_id = mi_idx.movie_id AND k.id = mk.keyword_id AND it1.id = mi.info_type_id AND it2.id = mi_idx.info_type_id AND ct.id = mc.company_type_id AND cn.id = mc.company_id; diff --git a/benchmarks/queries/imdb/22c.sql b/benchmarks/queries/imdb/22c.sql new file mode 100644 index 000000000000..cf757956e0de --- /dev/null +++ b/benchmarks/queries/imdb/22c.sql @@ -0,0 +1 @@ +SELECT MIN(cn.name) AS movie_company, MIN(mi_idx.info) AS rating, MIN(t.title) AS western_violent_movie FROM company_name AS cn, company_type AS ct, info_type AS it1, info_type AS it2, keyword AS k, kind_type AS kt, movie_companies AS mc, movie_info AS mi, movie_info_idx AS mi_idx, movie_keyword AS mk, title AS t WHERE cn.country_code != '[us]' AND it1.info = 'countries' AND it2.info = 'rating' AND k.keyword in ('murder', 'murder-in-title', 'blood', 'violence') AND kt.kind in ('movie', 'episode') AND mc.note not like '%(USA)%' and mc.note like '%(200%)%' AND mi.info IN ('Sweden', 'Norway', 'Germany', 'Denmark', 'Swedish', 'Danish', 'Norwegian', 'German', 'USA', 'American') AND mi_idx.info < '8.5' AND t.production_year > 2005 AND kt.id = t.kind_id AND t.id = mi.movie_id AND t.id = mk.movie_id AND t.id = mi_idx.movie_id AND t.id = mc.movie_id AND mk.movie_id = mi.movie_id AND mk.movie_id = mi_idx.movie_id AND mk.movie_id = mc.movie_id AND mi.movie_id = mi_idx.movie_id AND mi.movie_id = mc.movie_id AND mc.movie_id = mi_idx.movie_id AND k.id = mk.keyword_id AND it1.id = mi.info_type_id AND it2.id = mi_idx.info_type_id AND ct.id = mc.company_type_id AND cn.id = mc.company_id; diff --git a/benchmarks/queries/imdb/22d.sql b/benchmarks/queries/imdb/22d.sql new file mode 100644 index 000000000000..a47feeb05157 --- /dev/null +++ b/benchmarks/queries/imdb/22d.sql @@ -0,0 +1 @@ +SELECT MIN(cn.name) AS movie_company, MIN(mi_idx.info) AS rating, MIN(t.title) AS western_violent_movie FROM company_name AS cn, company_type AS ct, info_type AS it1, info_type AS it2, keyword AS k, kind_type AS kt, movie_companies AS mc, movie_info AS mi, movie_info_idx AS mi_idx, movie_keyword AS mk, title AS t WHERE cn.country_code != '[us]' AND it1.info = 'countries' AND it2.info = 'rating' AND k.keyword in ('murder', 'murder-in-title', 'blood', 'violence') AND kt.kind in ('movie', 'episode') AND mi.info IN ('Sweden', 'Norway', 'Germany', 'Denmark', 'Swedish', 'Danish', 'Norwegian', 'German', 'USA', 'American') AND mi_idx.info < '8.5' AND t.production_year > 2005 AND kt.id = t.kind_id AND t.id = mi.movie_id AND t.id = mk.movie_id AND t.id = mi_idx.movie_id AND t.id = mc.movie_id AND mk.movie_id = mi.movie_id AND mk.movie_id = mi_idx.movie_id AND mk.movie_id = mc.movie_id AND mi.movie_id = mi_idx.movie_id AND mi.movie_id = mc.movie_id AND mc.movie_id = mi_idx.movie_id AND k.id = mk.keyword_id AND it1.id = mi.info_type_id AND it2.id = mi_idx.info_type_id AND ct.id = mc.company_type_id AND cn.id = mc.company_id; diff --git a/benchmarks/queries/imdb/23a.sql b/benchmarks/queries/imdb/23a.sql new file mode 100644 index 000000000000..724da913b51a --- /dev/null +++ b/benchmarks/queries/imdb/23a.sql @@ -0,0 +1 @@ +SELECT MIN(kt.kind) AS movie_kind, MIN(t.title) AS complete_us_internet_movie FROM complete_cast AS cc, comp_cast_type AS cct1, company_name AS cn, company_type AS ct, info_type AS it1, keyword AS k, kind_type AS kt, movie_companies AS mc, movie_info AS mi, movie_keyword AS mk, title AS t WHERE cct1.kind = 'complete+verified' AND cn.country_code = '[us]' AND it1.info = 'release dates' AND kt.kind in ('movie') AND mi.note like '%internet%' AND mi.info is not NULL and (mi.info like 'USA:% 199%' or mi.info like 'USA:% 200%') AND t.production_year > 2000 AND kt.id = t.kind_id AND t.id = mi.movie_id AND t.id = mk.movie_id AND t.id = mc.movie_id AND t.id = cc.movie_id AND mk.movie_id = mi.movie_id AND mk.movie_id = mc.movie_id AND mk.movie_id = cc.movie_id AND mi.movie_id = mc.movie_id AND mi.movie_id = cc.movie_id AND mc.movie_id = cc.movie_id AND k.id = mk.keyword_id AND it1.id = mi.info_type_id AND cn.id = mc.company_id AND ct.id = mc.company_type_id AND cct1.id = cc.status_id; diff --git a/benchmarks/queries/imdb/23b.sql b/benchmarks/queries/imdb/23b.sql new file mode 100644 index 000000000000..e39f0ecc28a2 --- /dev/null +++ b/benchmarks/queries/imdb/23b.sql @@ -0,0 +1 @@ +SELECT MIN(kt.kind) AS movie_kind, MIN(t.title) AS complete_nerdy_internet_movie FROM complete_cast AS cc, comp_cast_type AS cct1, company_name AS cn, company_type AS ct, info_type AS it1, keyword AS k, kind_type AS kt, movie_companies AS mc, movie_info AS mi, movie_keyword AS mk, title AS t WHERE cct1.kind = 'complete+verified' AND cn.country_code = '[us]' AND it1.info = 'release dates' AND k.keyword in ('nerd', 'loner', 'alienation', 'dignity') AND kt.kind in ('movie') AND mi.note like '%internet%' AND mi.info like 'USA:% 200%' AND t.production_year > 2000 AND kt.id = t.kind_id AND t.id = mi.movie_id AND t.id = mk.movie_id AND t.id = mc.movie_id AND t.id = cc.movie_id AND mk.movie_id = mi.movie_id AND mk.movie_id = mc.movie_id AND mk.movie_id = cc.movie_id AND mi.movie_id = mc.movie_id AND mi.movie_id = cc.movie_id AND mc.movie_id = cc.movie_id AND k.id = mk.keyword_id AND it1.id = mi.info_type_id AND cn.id = mc.company_id AND ct.id = mc.company_type_id AND cct1.id = cc.status_id; diff --git a/benchmarks/queries/imdb/23c.sql b/benchmarks/queries/imdb/23c.sql new file mode 100644 index 000000000000..839d762d0533 --- /dev/null +++ b/benchmarks/queries/imdb/23c.sql @@ -0,0 +1 @@ +SELECT MIN(kt.kind) AS movie_kind, MIN(t.title) AS complete_us_internet_movie FROM complete_cast AS cc, comp_cast_type AS cct1, company_name AS cn, company_type AS ct, info_type AS it1, keyword AS k, kind_type AS kt, movie_companies AS mc, movie_info AS mi, movie_keyword AS mk, title AS t WHERE cct1.kind = 'complete+verified' AND cn.country_code = '[us]' AND it1.info = 'release dates' AND kt.kind in ('movie', 'tv movie', 'video movie', 'video game') AND mi.note like '%internet%' AND mi.info is not NULL and (mi.info like 'USA:% 199%' or mi.info like 'USA:% 200%') AND t.production_year > 1990 AND kt.id = t.kind_id AND t.id = mi.movie_id AND t.id = mk.movie_id AND t.id = mc.movie_id AND t.id = cc.movie_id AND mk.movie_id = mi.movie_id AND mk.movie_id = mc.movie_id AND mk.movie_id = cc.movie_id AND mi.movie_id = mc.movie_id AND mi.movie_id = cc.movie_id AND mc.movie_id = cc.movie_id AND k.id = mk.keyword_id AND it1.id = mi.info_type_id AND cn.id = mc.company_id AND ct.id = mc.company_type_id AND cct1.id = cc.status_id; diff --git a/benchmarks/queries/imdb/24a.sql b/benchmarks/queries/imdb/24a.sql new file mode 100644 index 000000000000..8f10621e0209 --- /dev/null +++ b/benchmarks/queries/imdb/24a.sql @@ -0,0 +1 @@ +SELECT MIN(chn.name) AS voiced_char_name, MIN(n.name) AS voicing_actress_name, MIN(t.title) AS voiced_action_movie_jap_eng FROM aka_name AS an, char_name AS chn, cast_info AS ci, company_name AS cn, info_type AS it, keyword AS k, movie_companies AS mc, movie_info AS mi, movie_keyword AS mk, name AS n, role_type AS rt, title AS t WHERE ci.note in ('(voice)', '(voice: Japanese version)', '(voice) (uncredited)', '(voice: English version)') AND cn.country_code ='[us]' AND it.info = 'release dates' AND k.keyword in ('hero', 'martial-arts', 'hand-to-hand-combat') AND mi.info is not null and (mi.info like 'Japan:%201%' or mi.info like 'USA:%201%') AND n.gender ='f' and n.name like '%An%' AND rt.role ='actress' AND t.production_year > 2010 AND t.id = mi.movie_id AND t.id = mc.movie_id AND t.id = ci.movie_id AND t.id = mk.movie_id AND mc.movie_id = ci.movie_id AND mc.movie_id = mi.movie_id AND mc.movie_id = mk.movie_id AND mi.movie_id = ci.movie_id AND mi.movie_id = mk.movie_id AND ci.movie_id = mk.movie_id AND cn.id = mc.company_id AND it.id = mi.info_type_id AND n.id = ci.person_id AND rt.id = ci.role_id AND n.id = an.person_id AND ci.person_id = an.person_id AND chn.id = ci.person_role_id AND k.id = mk.keyword_id; diff --git a/benchmarks/queries/imdb/24b.sql b/benchmarks/queries/imdb/24b.sql new file mode 100644 index 000000000000..d8a2836000b2 --- /dev/null +++ b/benchmarks/queries/imdb/24b.sql @@ -0,0 +1 @@ +SELECT MIN(chn.name) AS voiced_char_name, MIN(n.name) AS voicing_actress_name, MIN(t.title) AS kung_fu_panda FROM aka_name AS an, char_name AS chn, cast_info AS ci, company_name AS cn, info_type AS it, keyword AS k, movie_companies AS mc, movie_info AS mi, movie_keyword AS mk, name AS n, role_type AS rt, title AS t WHERE ci.note in ('(voice)', '(voice: Japanese version)', '(voice) (uncredited)', '(voice: English version)') AND cn.country_code ='[us]' AND cn.name = 'DreamWorks Animation' AND it.info = 'release dates' AND k.keyword in ('hero', 'martial-arts', 'hand-to-hand-combat', 'computer-animated-movie') AND mi.info is not null and (mi.info like 'Japan:%201%' or mi.info like 'USA:%201%') AND n.gender ='f' and n.name like '%An%' AND rt.role ='actress' AND t.production_year > 2010 AND t.title like 'Kung Fu Panda%' AND t.id = mi.movie_id AND t.id = mc.movie_id AND t.id = ci.movie_id AND t.id = mk.movie_id AND mc.movie_id = ci.movie_id AND mc.movie_id = mi.movie_id AND mc.movie_id = mk.movie_id AND mi.movie_id = ci.movie_id AND mi.movie_id = mk.movie_id AND ci.movie_id = mk.movie_id AND cn.id = mc.company_id AND it.id = mi.info_type_id AND n.id = ci.person_id AND rt.id = ci.role_id AND n.id = an.person_id AND ci.person_id = an.person_id AND chn.id = ci.person_role_id AND k.id = mk.keyword_id; diff --git a/benchmarks/queries/imdb/25a.sql b/benchmarks/queries/imdb/25a.sql new file mode 100644 index 000000000000..bc55cc01d26b --- /dev/null +++ b/benchmarks/queries/imdb/25a.sql @@ -0,0 +1 @@ +SELECT MIN(mi.info) AS movie_budget, MIN(mi_idx.info) AS movie_votes, MIN(n.name) AS male_writer, MIN(t.title) AS violent_movie_title FROM cast_info AS ci, info_type AS it1, info_type AS it2, keyword AS k, movie_info AS mi, movie_info_idx AS mi_idx, movie_keyword AS mk, name AS n, title AS t WHERE ci.note in ('(writer)', '(head writer)', '(written by)', '(story)', '(story editor)') AND it1.info = 'genres' AND it2.info = 'votes' AND k.keyword in ('murder', 'blood', 'gore', 'death', 'female-nudity') AND mi.info = 'Horror' AND n.gender = 'm' AND t.id = mi.movie_id AND t.id = mi_idx.movie_id AND t.id = ci.movie_id AND t.id = mk.movie_id AND ci.movie_id = mi.movie_id AND ci.movie_id = mi_idx.movie_id AND ci.movie_id = mk.movie_id AND mi.movie_id = mi_idx.movie_id AND mi.movie_id = mk.movie_id AND mi_idx.movie_id = mk.movie_id AND n.id = ci.person_id AND it1.id = mi.info_type_id AND it2.id = mi_idx.info_type_id AND k.id = mk.keyword_id; diff --git a/benchmarks/queries/imdb/25b.sql b/benchmarks/queries/imdb/25b.sql new file mode 100644 index 000000000000..3457655bb9eb --- /dev/null +++ b/benchmarks/queries/imdb/25b.sql @@ -0,0 +1 @@ +SELECT MIN(mi.info) AS movie_budget, MIN(mi_idx.info) AS movie_votes, MIN(n.name) AS male_writer, MIN(t.title) AS violent_movie_title FROM cast_info AS ci, info_type AS it1, info_type AS it2, keyword AS k, movie_info AS mi, movie_info_idx AS mi_idx, movie_keyword AS mk, name AS n, title AS t WHERE ci.note in ('(writer)', '(head writer)', '(written by)', '(story)', '(story editor)') AND it1.info = 'genres' AND it2.info = 'votes' AND k.keyword in ('murder', 'blood', 'gore', 'death', 'female-nudity') AND mi.info = 'Horror' AND n.gender = 'm' AND t.production_year > 2010 AND t.title like 'Vampire%' AND t.id = mi.movie_id AND t.id = mi_idx.movie_id AND t.id = ci.movie_id AND t.id = mk.movie_id AND ci.movie_id = mi.movie_id AND ci.movie_id = mi_idx.movie_id AND ci.movie_id = mk.movie_id AND mi.movie_id = mi_idx.movie_id AND mi.movie_id = mk.movie_id AND mi_idx.movie_id = mk.movie_id AND n.id = ci.person_id AND it1.id = mi.info_type_id AND it2.id = mi_idx.info_type_id AND k.id = mk.keyword_id; diff --git a/benchmarks/queries/imdb/25c.sql b/benchmarks/queries/imdb/25c.sql new file mode 100644 index 000000000000..cf56a313d861 --- /dev/null +++ b/benchmarks/queries/imdb/25c.sql @@ -0,0 +1 @@ +SELECT MIN(mi.info) AS movie_budget, MIN(mi_idx.info) AS movie_votes, MIN(n.name) AS male_writer, MIN(t.title) AS violent_movie_title FROM cast_info AS ci, info_type AS it1, info_type AS it2, keyword AS k, movie_info AS mi, movie_info_idx AS mi_idx, movie_keyword AS mk, name AS n, title AS t WHERE ci.note in ('(writer)', '(head writer)', '(written by)', '(story)', '(story editor)') AND it1.info = 'genres' AND it2.info = 'votes' AND k.keyword in ('murder', 'violence', 'blood', 'gore', 'death', 'female-nudity', 'hospital') AND mi.info in ('Horror', 'Action', 'Sci-Fi', 'Thriller', 'Crime', 'War') AND n.gender = 'm' AND t.id = mi.movie_id AND t.id = mi_idx.movie_id AND t.id = ci.movie_id AND t.id = mk.movie_id AND ci.movie_id = mi.movie_id AND ci.movie_id = mi_idx.movie_id AND ci.movie_id = mk.movie_id AND mi.movie_id = mi_idx.movie_id AND mi.movie_id = mk.movie_id AND mi_idx.movie_id = mk.movie_id AND n.id = ci.person_id AND it1.id = mi.info_type_id AND it2.id = mi_idx.info_type_id AND k.id = mk.keyword_id; diff --git a/benchmarks/queries/imdb/26a.sql b/benchmarks/queries/imdb/26a.sql new file mode 100644 index 000000000000..b431f204c6dc --- /dev/null +++ b/benchmarks/queries/imdb/26a.sql @@ -0,0 +1 @@ +SELECT MIN(chn.name) AS character_name, MIN(mi_idx.info) AS rating, MIN(n.name) AS playing_actor, MIN(t.title) AS complete_hero_movie FROM complete_cast AS cc, comp_cast_type AS cct1, comp_cast_type AS cct2, char_name AS chn, cast_info AS ci, info_type AS it2, keyword AS k, kind_type AS kt, movie_info_idx AS mi_idx, movie_keyword AS mk, name AS n, title AS t WHERE cct1.kind = 'cast' AND cct2.kind like '%complete%' AND chn.name is not NULL and (chn.name like '%man%' or chn.name like '%Man%') AND it2.info = 'rating' AND k.keyword in ('superhero', 'marvel-comics', 'based-on-comic', 'tv-special', 'fight', 'violence', 'magnet', 'web', 'claw', 'laser') AND kt.kind = 'movie' AND mi_idx.info > '7.0' AND t.production_year > 2000 AND kt.id = t.kind_id AND t.id = mk.movie_id AND t.id = ci.movie_id AND t.id = cc.movie_id AND t.id = mi_idx.movie_id AND mk.movie_id = ci.movie_id AND mk.movie_id = cc.movie_id AND mk.movie_id = mi_idx.movie_id AND ci.movie_id = cc.movie_id AND ci.movie_id = mi_idx.movie_id AND cc.movie_id = mi_idx.movie_id AND chn.id = ci.person_role_id AND n.id = ci.person_id AND k.id = mk.keyword_id AND cct1.id = cc.subject_id AND cct2.id = cc.status_id AND it2.id = mi_idx.info_type_id; diff --git a/benchmarks/queries/imdb/26b.sql b/benchmarks/queries/imdb/26b.sql new file mode 100644 index 000000000000..882d234d77e0 --- /dev/null +++ b/benchmarks/queries/imdb/26b.sql @@ -0,0 +1 @@ +SELECT MIN(chn.name) AS character_name, MIN(mi_idx.info) AS rating, MIN(t.title) AS complete_hero_movie FROM complete_cast AS cc, comp_cast_type AS cct1, comp_cast_type AS cct2, char_name AS chn, cast_info AS ci, info_type AS it2, keyword AS k, kind_type AS kt, movie_info_idx AS mi_idx, movie_keyword AS mk, name AS n, title AS t WHERE cct1.kind = 'cast' AND cct2.kind like '%complete%' AND chn.name is not NULL and (chn.name like '%man%' or chn.name like '%Man%') AND it2.info = 'rating' AND k.keyword in ('superhero', 'marvel-comics', 'based-on-comic', 'fight') AND kt.kind = 'movie' AND mi_idx.info > '8.0' AND t.production_year > 2005 AND kt.id = t.kind_id AND t.id = mk.movie_id AND t.id = ci.movie_id AND t.id = cc.movie_id AND t.id = mi_idx.movie_id AND mk.movie_id = ci.movie_id AND mk.movie_id = cc.movie_id AND mk.movie_id = mi_idx.movie_id AND ci.movie_id = cc.movie_id AND ci.movie_id = mi_idx.movie_id AND cc.movie_id = mi_idx.movie_id AND chn.id = ci.person_role_id AND n.id = ci.person_id AND k.id = mk.keyword_id AND cct1.id = cc.subject_id AND cct2.id = cc.status_id AND it2.id = mi_idx.info_type_id; diff --git a/benchmarks/queries/imdb/26c.sql b/benchmarks/queries/imdb/26c.sql new file mode 100644 index 000000000000..4b9eae0b7633 --- /dev/null +++ b/benchmarks/queries/imdb/26c.sql @@ -0,0 +1 @@ +SELECT MIN(chn.name) AS character_name, MIN(mi_idx.info) AS rating, MIN(t.title) AS complete_hero_movie FROM complete_cast AS cc, comp_cast_type AS cct1, comp_cast_type AS cct2, char_name AS chn, cast_info AS ci, info_type AS it2, keyword AS k, kind_type AS kt, movie_info_idx AS mi_idx, movie_keyword AS mk, name AS n, title AS t WHERE cct1.kind = 'cast' AND cct2.kind like '%complete%' AND chn.name is not NULL and (chn.name like '%man%' or chn.name like '%Man%') AND it2.info = 'rating' AND k.keyword in ('superhero', 'marvel-comics', 'based-on-comic', 'tv-special', 'fight', 'violence', 'magnet', 'web', 'claw', 'laser') AND kt.kind = 'movie' AND t.production_year > 2000 AND kt.id = t.kind_id AND t.id = mk.movie_id AND t.id = ci.movie_id AND t.id = cc.movie_id AND t.id = mi_idx.movie_id AND mk.movie_id = ci.movie_id AND mk.movie_id = cc.movie_id AND mk.movie_id = mi_idx.movie_id AND ci.movie_id = cc.movie_id AND ci.movie_id = mi_idx.movie_id AND cc.movie_id = mi_idx.movie_id AND chn.id = ci.person_role_id AND n.id = ci.person_id AND k.id = mk.keyword_id AND cct1.id = cc.subject_id AND cct2.id = cc.status_id AND it2.id = mi_idx.info_type_id; diff --git a/benchmarks/queries/imdb/27a.sql b/benchmarks/queries/imdb/27a.sql new file mode 100644 index 000000000000..239673cd8147 --- /dev/null +++ b/benchmarks/queries/imdb/27a.sql @@ -0,0 +1 @@ +SELECT MIN(cn.name) AS producing_company, MIN(lt.link) AS link_type, MIN(t.title) AS complete_western_sequel FROM complete_cast AS cc, comp_cast_type AS cct1, comp_cast_type AS cct2, company_name AS cn, company_type AS ct, keyword AS k, link_type AS lt, movie_companies AS mc, movie_info AS mi, movie_keyword AS mk, movie_link AS ml, title AS t WHERE cct1.kind in ('cast', 'crew') AND cct2.kind = 'complete' AND cn.country_code !='[pl]' AND (cn.name LIKE '%Film%' OR cn.name LIKE '%Warner%') AND ct.kind ='production companies' AND k.keyword ='sequel' AND lt.link LIKE '%follow%' AND mc.note IS NULL AND mi.info IN ('Sweden', 'Germany','Swedish', 'German') AND t.production_year BETWEEN 1950 AND 2000 AND lt.id = ml.link_type_id AND ml.movie_id = t.id AND t.id = mk.movie_id AND mk.keyword_id = k.id AND t.id = mc.movie_id AND mc.company_type_id = ct.id AND mc.company_id = cn.id AND mi.movie_id = t.id AND t.id = cc.movie_id AND cct1.id = cc.subject_id AND cct2.id = cc.status_id AND ml.movie_id = mk.movie_id AND ml.movie_id = mc.movie_id AND mk.movie_id = mc.movie_id AND ml.movie_id = mi.movie_id AND mk.movie_id = mi.movie_id AND mc.movie_id = mi.movie_id AND ml.movie_id = cc.movie_id AND mk.movie_id = cc.movie_id AND mc.movie_id = cc.movie_id AND mi.movie_id = cc.movie_id; diff --git a/benchmarks/queries/imdb/27b.sql b/benchmarks/queries/imdb/27b.sql new file mode 100644 index 000000000000..4bf85260f22d --- /dev/null +++ b/benchmarks/queries/imdb/27b.sql @@ -0,0 +1 @@ +SELECT MIN(cn.name) AS producing_company, MIN(lt.link) AS link_type, MIN(t.title) AS complete_western_sequel FROM complete_cast AS cc, comp_cast_type AS cct1, comp_cast_type AS cct2, company_name AS cn, company_type AS ct, keyword AS k, link_type AS lt, movie_companies AS mc, movie_info AS mi, movie_keyword AS mk, movie_link AS ml, title AS t WHERE cct1.kind in ('cast', 'crew') AND cct2.kind = 'complete' AND cn.country_code !='[pl]' AND (cn.name LIKE '%Film%' OR cn.name LIKE '%Warner%') AND ct.kind ='production companies' AND k.keyword ='sequel' AND lt.link LIKE '%follow%' AND mc.note IS NULL AND mi.info IN ('Sweden', 'Germany','Swedish', 'German') AND t.production_year = 1998 AND lt.id = ml.link_type_id AND ml.movie_id = t.id AND t.id = mk.movie_id AND mk.keyword_id = k.id AND t.id = mc.movie_id AND mc.company_type_id = ct.id AND mc.company_id = cn.id AND mi.movie_id = t.id AND t.id = cc.movie_id AND cct1.id = cc.subject_id AND cct2.id = cc.status_id AND ml.movie_id = mk.movie_id AND ml.movie_id = mc.movie_id AND mk.movie_id = mc.movie_id AND ml.movie_id = mi.movie_id AND mk.movie_id = mi.movie_id AND mc.movie_id = mi.movie_id AND ml.movie_id = cc.movie_id AND mk.movie_id = cc.movie_id AND mc.movie_id = cc.movie_id AND mi.movie_id = cc.movie_id; diff --git a/benchmarks/queries/imdb/27c.sql b/benchmarks/queries/imdb/27c.sql new file mode 100644 index 000000000000..dc26ebff6851 --- /dev/null +++ b/benchmarks/queries/imdb/27c.sql @@ -0,0 +1 @@ +SELECT MIN(cn.name) AS producing_company, MIN(lt.link) AS link_type, MIN(t.title) AS complete_western_sequel FROM complete_cast AS cc, comp_cast_type AS cct1, comp_cast_type AS cct2, company_name AS cn, company_type AS ct, keyword AS k, link_type AS lt, movie_companies AS mc, movie_info AS mi, movie_keyword AS mk, movie_link AS ml, title AS t WHERE cct1.kind = 'cast' AND cct2.kind like 'complete%' AND cn.country_code !='[pl]' AND (cn.name LIKE '%Film%' OR cn.name LIKE '%Warner%') AND ct.kind ='production companies' AND k.keyword ='sequel' AND lt.link LIKE '%follow%' AND mc.note IS NULL AND mi.info IN ('Sweden', 'Norway', 'Germany', 'Denmark', 'Swedish', 'Denish', 'Norwegian', 'German', 'English') AND t.production_year BETWEEN 1950 AND 2010 AND lt.id = ml.link_type_id AND ml.movie_id = t.id AND t.id = mk.movie_id AND mk.keyword_id = k.id AND t.id = mc.movie_id AND mc.company_type_id = ct.id AND mc.company_id = cn.id AND mi.movie_id = t.id AND t.id = cc.movie_id AND cct1.id = cc.subject_id AND cct2.id = cc.status_id AND ml.movie_id = mk.movie_id AND ml.movie_id = mc.movie_id AND mk.movie_id = mc.movie_id AND ml.movie_id = mi.movie_id AND mk.movie_id = mi.movie_id AND mc.movie_id = mi.movie_id AND ml.movie_id = cc.movie_id AND mk.movie_id = cc.movie_id AND mc.movie_id = cc.movie_id AND mi.movie_id = cc.movie_id; diff --git a/benchmarks/queries/imdb/28a.sql b/benchmarks/queries/imdb/28a.sql new file mode 100644 index 000000000000..8cb1177386da --- /dev/null +++ b/benchmarks/queries/imdb/28a.sql @@ -0,0 +1 @@ +SELECT MIN(cn.name) AS movie_company, MIN(mi_idx.info) AS rating, MIN(t.title) AS complete_euro_dark_movie FROM complete_cast AS cc, comp_cast_type AS cct1, comp_cast_type AS cct2, company_name AS cn, company_type AS ct, info_type AS it1, info_type AS it2, keyword AS k, kind_type AS kt, movie_companies AS mc, movie_info AS mi, movie_info_idx AS mi_idx, movie_keyword AS mk, title AS t WHERE cct1.kind = 'crew' AND cct2.kind != 'complete+verified' AND cn.country_code != '[us]' AND it1.info = 'countries' AND it2.info = 'rating' AND k.keyword in ('murder', 'murder-in-title', 'blood', 'violence') AND kt.kind in ('movie', 'episode') AND mc.note not like '%(USA)%' and mc.note like '%(200%)%' AND mi.info IN ('Sweden', 'Norway', 'Germany', 'Denmark', 'Swedish', 'Danish', 'Norwegian', 'German', 'USA', 'American') AND mi_idx.info < '8.5' AND t.production_year > 2000 AND kt.id = t.kind_id AND t.id = mi.movie_id AND t.id = mk.movie_id AND t.id = mi_idx.movie_id AND t.id = mc.movie_id AND t.id = cc.movie_id AND mk.movie_id = mi.movie_id AND mk.movie_id = mi_idx.movie_id AND mk.movie_id = mc.movie_id AND mk.movie_id = cc.movie_id AND mi.movie_id = mi_idx.movie_id AND mi.movie_id = mc.movie_id AND mi.movie_id = cc.movie_id AND mc.movie_id = mi_idx.movie_id AND mc.movie_id = cc.movie_id AND mi_idx.movie_id = cc.movie_id AND k.id = mk.keyword_id AND it1.id = mi.info_type_id AND it2.id = mi_idx.info_type_id AND ct.id = mc.company_type_id AND cn.id = mc.company_id AND cct1.id = cc.subject_id AND cct2.id = cc.status_id; diff --git a/benchmarks/queries/imdb/28b.sql b/benchmarks/queries/imdb/28b.sql new file mode 100644 index 000000000000..10f43c898226 --- /dev/null +++ b/benchmarks/queries/imdb/28b.sql @@ -0,0 +1 @@ +SELECT MIN(cn.name) AS movie_company, MIN(mi_idx.info) AS rating, MIN(t.title) AS complete_euro_dark_movie FROM complete_cast AS cc, comp_cast_type AS cct1, comp_cast_type AS cct2, company_name AS cn, company_type AS ct, info_type AS it1, info_type AS it2, keyword AS k, kind_type AS kt, movie_companies AS mc, movie_info AS mi, movie_info_idx AS mi_idx, movie_keyword AS mk, title AS t WHERE cct1.kind = 'crew' AND cct2.kind != 'complete+verified' AND cn.country_code != '[us]' AND it1.info = 'countries' AND it2.info = 'rating' AND k.keyword in ('murder', 'murder-in-title', 'blood', 'violence') AND kt.kind in ('movie', 'episode') AND mc.note not like '%(USA)%' and mc.note like '%(200%)%' AND mi.info IN ('Sweden', 'Germany', 'Swedish', 'German') AND mi_idx.info > '6.5' AND t.production_year > 2005 AND kt.id = t.kind_id AND t.id = mi.movie_id AND t.id = mk.movie_id AND t.id = mi_idx.movie_id AND t.id = mc.movie_id AND t.id = cc.movie_id AND mk.movie_id = mi.movie_id AND mk.movie_id = mi_idx.movie_id AND mk.movie_id = mc.movie_id AND mk.movie_id = cc.movie_id AND mi.movie_id = mi_idx.movie_id AND mi.movie_id = mc.movie_id AND mi.movie_id = cc.movie_id AND mc.movie_id = mi_idx.movie_id AND mc.movie_id = cc.movie_id AND mi_idx.movie_id = cc.movie_id AND k.id = mk.keyword_id AND it1.id = mi.info_type_id AND it2.id = mi_idx.info_type_id AND ct.id = mc.company_type_id AND cn.id = mc.company_id AND cct1.id = cc.subject_id AND cct2.id = cc.status_id; diff --git a/benchmarks/queries/imdb/28c.sql b/benchmarks/queries/imdb/28c.sql new file mode 100644 index 000000000000..6b2e4047ae8a --- /dev/null +++ b/benchmarks/queries/imdb/28c.sql @@ -0,0 +1 @@ +SELECT MIN(cn.name) AS movie_company, MIN(mi_idx.info) AS rating, MIN(t.title) AS complete_euro_dark_movie FROM complete_cast AS cc, comp_cast_type AS cct1, comp_cast_type AS cct2, company_name AS cn, company_type AS ct, info_type AS it1, info_type AS it2, keyword AS k, kind_type AS kt, movie_companies AS mc, movie_info AS mi, movie_info_idx AS mi_idx, movie_keyword AS mk, title AS t WHERE cct1.kind = 'cast' AND cct2.kind = 'complete' AND cn.country_code != '[us]' AND it1.info = 'countries' AND it2.info = 'rating' AND k.keyword in ('murder', 'murder-in-title', 'blood', 'violence') AND kt.kind in ('movie', 'episode') AND mc.note not like '%(USA)%' and mc.note like '%(200%)%' AND mi.info IN ('Sweden', 'Norway', 'Germany', 'Denmark', 'Swedish', 'Danish', 'Norwegian', 'German', 'USA', 'American') AND mi_idx.info < '8.5' AND t.production_year > 2005 AND kt.id = t.kind_id AND t.id = mi.movie_id AND t.id = mk.movie_id AND t.id = mi_idx.movie_id AND t.id = mc.movie_id AND t.id = cc.movie_id AND mk.movie_id = mi.movie_id AND mk.movie_id = mi_idx.movie_id AND mk.movie_id = mc.movie_id AND mk.movie_id = cc.movie_id AND mi.movie_id = mi_idx.movie_id AND mi.movie_id = mc.movie_id AND mi.movie_id = cc.movie_id AND mc.movie_id = mi_idx.movie_id AND mc.movie_id = cc.movie_id AND mi_idx.movie_id = cc.movie_id AND k.id = mk.keyword_id AND it1.id = mi.info_type_id AND it2.id = mi_idx.info_type_id AND ct.id = mc.company_type_id AND cn.id = mc.company_id AND cct1.id = cc.subject_id AND cct2.id = cc.status_id; diff --git a/benchmarks/queries/imdb/29a.sql b/benchmarks/queries/imdb/29a.sql new file mode 100644 index 000000000000..3033acbe6cf3 --- /dev/null +++ b/benchmarks/queries/imdb/29a.sql @@ -0,0 +1 @@ +SELECT MIN(chn.name) AS voiced_char, MIN(n.name) AS voicing_actress, MIN(t.title) AS voiced_animation FROM aka_name AS an, complete_cast AS cc, comp_cast_type AS cct1, comp_cast_type AS cct2, char_name AS chn, cast_info AS ci, company_name AS cn, info_type AS it, info_type AS it3, keyword AS k, movie_companies AS mc, movie_info AS mi, movie_keyword AS mk, name AS n, person_info AS pi, role_type AS rt, title AS t WHERE cct1.kind ='cast' AND cct2.kind ='complete+verified' AND chn.name = 'Queen' AND ci.note in ('(voice)', '(voice) (uncredited)', '(voice: English version)') AND cn.country_code ='[us]' AND it.info = 'release dates' AND it3.info = 'trivia' AND k.keyword = 'computer-animation' AND mi.info is not null and (mi.info like 'Japan:%200%' or mi.info like 'USA:%200%') AND n.gender ='f' and n.name like '%An%' AND rt.role ='actress' AND t.title = 'Shrek 2' AND t.production_year between 2000 and 2010 AND t.id = mi.movie_id AND t.id = mc.movie_id AND t.id = ci.movie_id AND t.id = mk.movie_id AND t.id = cc.movie_id AND mc.movie_id = ci.movie_id AND mc.movie_id = mi.movie_id AND mc.movie_id = mk.movie_id AND mc.movie_id = cc.movie_id AND mi.movie_id = ci.movie_id AND mi.movie_id = mk.movie_id AND mi.movie_id = cc.movie_id AND ci.movie_id = mk.movie_id AND ci.movie_id = cc.movie_id AND mk.movie_id = cc.movie_id AND cn.id = mc.company_id AND it.id = mi.info_type_id AND n.id = ci.person_id AND rt.id = ci.role_id AND n.id = an.person_id AND ci.person_id = an.person_id AND chn.id = ci.person_role_id AND n.id = pi.person_id AND ci.person_id = pi.person_id AND it3.id = pi.info_type_id AND k.id = mk.keyword_id AND cct1.id = cc.subject_id AND cct2.id = cc.status_id; diff --git a/benchmarks/queries/imdb/29b.sql b/benchmarks/queries/imdb/29b.sql new file mode 100644 index 000000000000..88d50fc7b783 --- /dev/null +++ b/benchmarks/queries/imdb/29b.sql @@ -0,0 +1 @@ +SELECT MIN(chn.name) AS voiced_char, MIN(n.name) AS voicing_actress, MIN(t.title) AS voiced_animation FROM aka_name AS an, complete_cast AS cc, comp_cast_type AS cct1, comp_cast_type AS cct2, char_name AS chn, cast_info AS ci, company_name AS cn, info_type AS it, info_type AS it3, keyword AS k, movie_companies AS mc, movie_info AS mi, movie_keyword AS mk, name AS n, person_info AS pi, role_type AS rt, title AS t WHERE cct1.kind ='cast' AND cct2.kind ='complete+verified' AND chn.name = 'Queen' AND ci.note in ('(voice)', '(voice) (uncredited)', '(voice: English version)') AND cn.country_code ='[us]' AND it.info = 'release dates' AND it3.info = 'height' AND k.keyword = 'computer-animation' AND mi.info like 'USA:%200%' AND n.gender ='f' and n.name like '%An%' AND rt.role ='actress' AND t.title = 'Shrek 2' AND t.production_year between 2000 and 2005 AND t.id = mi.movie_id AND t.id = mc.movie_id AND t.id = ci.movie_id AND t.id = mk.movie_id AND t.id = cc.movie_id AND mc.movie_id = ci.movie_id AND mc.movie_id = mi.movie_id AND mc.movie_id = mk.movie_id AND mc.movie_id = cc.movie_id AND mi.movie_id = ci.movie_id AND mi.movie_id = mk.movie_id AND mi.movie_id = cc.movie_id AND ci.movie_id = mk.movie_id AND ci.movie_id = cc.movie_id AND mk.movie_id = cc.movie_id AND cn.id = mc.company_id AND it.id = mi.info_type_id AND n.id = ci.person_id AND rt.id = ci.role_id AND n.id = an.person_id AND ci.person_id = an.person_id AND chn.id = ci.person_role_id AND n.id = pi.person_id AND ci.person_id = pi.person_id AND it3.id = pi.info_type_id AND k.id = mk.keyword_id AND cct1.id = cc.subject_id AND cct2.id = cc.status_id; diff --git a/benchmarks/queries/imdb/29c.sql b/benchmarks/queries/imdb/29c.sql new file mode 100644 index 000000000000..cb951781827c --- /dev/null +++ b/benchmarks/queries/imdb/29c.sql @@ -0,0 +1 @@ +SELECT MIN(chn.name) AS voiced_char, MIN(n.name) AS voicing_actress, MIN(t.title) AS voiced_animation FROM aka_name AS an, complete_cast AS cc, comp_cast_type AS cct1, comp_cast_type AS cct2, char_name AS chn, cast_info AS ci, company_name AS cn, info_type AS it, info_type AS it3, keyword AS k, movie_companies AS mc, movie_info AS mi, movie_keyword AS mk, name AS n, person_info AS pi, role_type AS rt, title AS t WHERE cct1.kind ='cast' AND cct2.kind ='complete+verified' AND ci.note in ('(voice)', '(voice: Japanese version)', '(voice) (uncredited)', '(voice: English version)') AND cn.country_code ='[us]' AND it.info = 'release dates' AND it3.info = 'trivia' AND k.keyword = 'computer-animation' AND mi.info is not null and (mi.info like 'Japan:%200%' or mi.info like 'USA:%200%') AND n.gender ='f' and n.name like '%An%' AND rt.role ='actress' AND t.production_year between 2000 and 2010 AND t.id = mi.movie_id AND t.id = mc.movie_id AND t.id = ci.movie_id AND t.id = mk.movie_id AND t.id = cc.movie_id AND mc.movie_id = ci.movie_id AND mc.movie_id = mi.movie_id AND mc.movie_id = mk.movie_id AND mc.movie_id = cc.movie_id AND mi.movie_id = ci.movie_id AND mi.movie_id = mk.movie_id AND mi.movie_id = cc.movie_id AND ci.movie_id = mk.movie_id AND ci.movie_id = cc.movie_id AND mk.movie_id = cc.movie_id AND cn.id = mc.company_id AND it.id = mi.info_type_id AND n.id = ci.person_id AND rt.id = ci.role_id AND n.id = an.person_id AND ci.person_id = an.person_id AND chn.id = ci.person_role_id AND n.id = pi.person_id AND ci.person_id = pi.person_id AND it3.id = pi.info_type_id AND k.id = mk.keyword_id AND cct1.id = cc.subject_id AND cct2.id = cc.status_id; diff --git a/benchmarks/queries/imdb/2a.sql b/benchmarks/queries/imdb/2a.sql new file mode 100644 index 000000000000..f3ef4db75fea --- /dev/null +++ b/benchmarks/queries/imdb/2a.sql @@ -0,0 +1 @@ +SELECT MIN(t.title) AS movie_title FROM company_name AS cn, keyword AS k, movie_companies AS mc, movie_keyword AS mk, title AS t WHERE cn.country_code ='[de]' AND k.keyword ='character-name-in-title' AND cn.id = mc.company_id AND mc.movie_id = t.id AND t.id = mk.movie_id AND mk.keyword_id = k.id AND mc.movie_id = mk.movie_id; diff --git a/benchmarks/queries/imdb/2b.sql b/benchmarks/queries/imdb/2b.sql new file mode 100644 index 000000000000..82b2123fbccd --- /dev/null +++ b/benchmarks/queries/imdb/2b.sql @@ -0,0 +1 @@ +SELECT MIN(t.title) AS movie_title FROM company_name AS cn, keyword AS k, movie_companies AS mc, movie_keyword AS mk, title AS t WHERE cn.country_code ='[nl]' AND k.keyword ='character-name-in-title' AND cn.id = mc.company_id AND mc.movie_id = t.id AND t.id = mk.movie_id AND mk.keyword_id = k.id AND mc.movie_id = mk.movie_id; diff --git a/benchmarks/queries/imdb/2c.sql b/benchmarks/queries/imdb/2c.sql new file mode 100644 index 000000000000..b5f9b75dd68b --- /dev/null +++ b/benchmarks/queries/imdb/2c.sql @@ -0,0 +1 @@ +SELECT MIN(t.title) AS movie_title FROM company_name AS cn, keyword AS k, movie_companies AS mc, movie_keyword AS mk, title AS t WHERE cn.country_code ='[sm]' AND k.keyword ='character-name-in-title' AND cn.id = mc.company_id AND mc.movie_id = t.id AND t.id = mk.movie_id AND mk.keyword_id = k.id AND mc.movie_id = mk.movie_id; diff --git a/benchmarks/queries/imdb/2d.sql b/benchmarks/queries/imdb/2d.sql new file mode 100644 index 000000000000..4a2791946548 --- /dev/null +++ b/benchmarks/queries/imdb/2d.sql @@ -0,0 +1 @@ +SELECT MIN(t.title) AS movie_title FROM company_name AS cn, keyword AS k, movie_companies AS mc, movie_keyword AS mk, title AS t WHERE cn.country_code ='[us]' AND k.keyword ='character-name-in-title' AND cn.id = mc.company_id AND mc.movie_id = t.id AND t.id = mk.movie_id AND mk.keyword_id = k.id AND mc.movie_id = mk.movie_id; diff --git a/benchmarks/queries/imdb/30a.sql b/benchmarks/queries/imdb/30a.sql new file mode 100644 index 000000000000..698872fa8337 --- /dev/null +++ b/benchmarks/queries/imdb/30a.sql @@ -0,0 +1 @@ +SELECT MIN(mi.info) AS movie_budget, MIN(mi_idx.info) AS movie_votes, MIN(n.name) AS writer, MIN(t.title) AS complete_violent_movie FROM complete_cast AS cc, comp_cast_type AS cct1, comp_cast_type AS cct2, cast_info AS ci, info_type AS it1, info_type AS it2, keyword AS k, movie_info AS mi, movie_info_idx AS mi_idx, movie_keyword AS mk, name AS n, title AS t WHERE cct1.kind in ('cast', 'crew') AND cct2.kind ='complete+verified' AND ci.note in ('(writer)', '(head writer)', '(written by)', '(story)', '(story editor)') AND it1.info = 'genres' AND it2.info = 'votes' AND k.keyword in ('murder', 'violence', 'blood', 'gore', 'death', 'female-nudity', 'hospital') AND mi.info in ('Horror', 'Thriller') AND n.gender = 'm' AND t.production_year > 2000 AND t.id = mi.movie_id AND t.id = mi_idx.movie_id AND t.id = ci.movie_id AND t.id = mk.movie_id AND t.id = cc.movie_id AND ci.movie_id = mi.movie_id AND ci.movie_id = mi_idx.movie_id AND ci.movie_id = mk.movie_id AND ci.movie_id = cc.movie_id AND mi.movie_id = mi_idx.movie_id AND mi.movie_id = mk.movie_id AND mi.movie_id = cc.movie_id AND mi_idx.movie_id = mk.movie_id AND mi_idx.movie_id = cc.movie_id AND mk.movie_id = cc.movie_id AND n.id = ci.person_id AND it1.id = mi.info_type_id AND it2.id = mi_idx.info_type_id AND k.id = mk.keyword_id AND cct1.id = cc.subject_id AND cct2.id = cc.status_id; diff --git a/benchmarks/queries/imdb/30b.sql b/benchmarks/queries/imdb/30b.sql new file mode 100644 index 000000000000..5fdb8493496c --- /dev/null +++ b/benchmarks/queries/imdb/30b.sql @@ -0,0 +1 @@ +SELECT MIN(mi.info) AS movie_budget, MIN(mi_idx.info) AS movie_votes, MIN(n.name) AS writer, MIN(t.title) AS complete_gore_movie FROM complete_cast AS cc, comp_cast_type AS cct1, comp_cast_type AS cct2, cast_info AS ci, info_type AS it1, info_type AS it2, keyword AS k, movie_info AS mi, movie_info_idx AS mi_idx, movie_keyword AS mk, name AS n, title AS t WHERE cct1.kind in ('cast', 'crew') AND cct2.kind ='complete+verified' AND ci.note in ('(writer)', '(head writer)', '(written by)', '(story)', '(story editor)') AND it1.info = 'genres' AND it2.info = 'votes' AND k.keyword in ('murder', 'violence', 'blood', 'gore', 'death', 'female-nudity', 'hospital') AND mi.info in ('Horror', 'Thriller') AND n.gender = 'm' AND t.production_year > 2000 and (t.title like '%Freddy%' or t.title like '%Jason%' or t.title like 'Saw%') AND t.id = mi.movie_id AND t.id = mi_idx.movie_id AND t.id = ci.movie_id AND t.id = mk.movie_id AND t.id = cc.movie_id AND ci.movie_id = mi.movie_id AND ci.movie_id = mi_idx.movie_id AND ci.movie_id = mk.movie_id AND ci.movie_id = cc.movie_id AND mi.movie_id = mi_idx.movie_id AND mi.movie_id = mk.movie_id AND mi.movie_id = cc.movie_id AND mi_idx.movie_id = mk.movie_id AND mi_idx.movie_id = cc.movie_id AND mk.movie_id = cc.movie_id AND n.id = ci.person_id AND it1.id = mi.info_type_id AND it2.id = mi_idx.info_type_id AND k.id = mk.keyword_id AND cct1.id = cc.subject_id AND cct2.id = cc.status_id; diff --git a/benchmarks/queries/imdb/30c.sql b/benchmarks/queries/imdb/30c.sql new file mode 100644 index 000000000000..a18087e39222 --- /dev/null +++ b/benchmarks/queries/imdb/30c.sql @@ -0,0 +1 @@ +SELECT MIN(mi.info) AS movie_budget, MIN(mi_idx.info) AS movie_votes, MIN(n.name) AS writer, MIN(t.title) AS complete_violent_movie FROM complete_cast AS cc, comp_cast_type AS cct1, comp_cast_type AS cct2, cast_info AS ci, info_type AS it1, info_type AS it2, keyword AS k, movie_info AS mi, movie_info_idx AS mi_idx, movie_keyword AS mk, name AS n, title AS t WHERE cct1.kind = 'cast' AND cct2.kind ='complete+verified' AND ci.note in ('(writer)', '(head writer)', '(written by)', '(story)', '(story editor)') AND it1.info = 'genres' AND it2.info = 'votes' AND k.keyword in ('murder', 'violence', 'blood', 'gore', 'death', 'female-nudity', 'hospital') AND mi.info in ('Horror', 'Action', 'Sci-Fi', 'Thriller', 'Crime', 'War') AND n.gender = 'm' AND t.id = mi.movie_id AND t.id = mi_idx.movie_id AND t.id = ci.movie_id AND t.id = mk.movie_id AND t.id = cc.movie_id AND ci.movie_id = mi.movie_id AND ci.movie_id = mi_idx.movie_id AND ci.movie_id = mk.movie_id AND ci.movie_id = cc.movie_id AND mi.movie_id = mi_idx.movie_id AND mi.movie_id = mk.movie_id AND mi.movie_id = cc.movie_id AND mi_idx.movie_id = mk.movie_id AND mi_idx.movie_id = cc.movie_id AND mk.movie_id = cc.movie_id AND n.id = ci.person_id AND it1.id = mi.info_type_id AND it2.id = mi_idx.info_type_id AND k.id = mk.keyword_id AND cct1.id = cc.subject_id AND cct2.id = cc.status_id; diff --git a/benchmarks/queries/imdb/31a.sql b/benchmarks/queries/imdb/31a.sql new file mode 100644 index 000000000000..7dd855011f2a --- /dev/null +++ b/benchmarks/queries/imdb/31a.sql @@ -0,0 +1 @@ +SELECT MIN(mi.info) AS movie_budget, MIN(mi_idx.info) AS movie_votes, MIN(n.name) AS writer, MIN(t.title) AS violent_liongate_movie FROM cast_info AS ci, company_name AS cn, info_type AS it1, info_type AS it2, keyword AS k, movie_companies AS mc, movie_info AS mi, movie_info_idx AS mi_idx, movie_keyword AS mk, name AS n, title AS t WHERE ci.note in ('(writer)', '(head writer)', '(written by)', '(story)', '(story editor)') AND cn.name like 'Lionsgate%' AND it1.info = 'genres' AND it2.info = 'votes' AND k.keyword in ('murder', 'violence', 'blood', 'gore', 'death', 'female-nudity', 'hospital') AND mi.info in ('Horror', 'Thriller') AND n.gender = 'm' AND t.id = mi.movie_id AND t.id = mi_idx.movie_id AND t.id = ci.movie_id AND t.id = mk.movie_id AND t.id = mc.movie_id AND ci.movie_id = mi.movie_id AND ci.movie_id = mi_idx.movie_id AND ci.movie_id = mk.movie_id AND ci.movie_id = mc.movie_id AND mi.movie_id = mi_idx.movie_id AND mi.movie_id = mk.movie_id AND mi.movie_id = mc.movie_id AND mi_idx.movie_id = mk.movie_id AND mi_idx.movie_id = mc.movie_id AND mk.movie_id = mc.movie_id AND n.id = ci.person_id AND it1.id = mi.info_type_id AND it2.id = mi_idx.info_type_id AND k.id = mk.keyword_id AND cn.id = mc.company_id; diff --git a/benchmarks/queries/imdb/31b.sql b/benchmarks/queries/imdb/31b.sql new file mode 100644 index 000000000000..3be5680f7d00 --- /dev/null +++ b/benchmarks/queries/imdb/31b.sql @@ -0,0 +1 @@ +SELECT MIN(mi.info) AS movie_budget, MIN(mi_idx.info) AS movie_votes, MIN(n.name) AS writer, MIN(t.title) AS violent_liongate_movie FROM cast_info AS ci, company_name AS cn, info_type AS it1, info_type AS it2, keyword AS k, movie_companies AS mc, movie_info AS mi, movie_info_idx AS mi_idx, movie_keyword AS mk, name AS n, title AS t WHERE ci.note in ('(writer)', '(head writer)', '(written by)', '(story)', '(story editor)') AND cn.name like 'Lionsgate%' AND it1.info = 'genres' AND it2.info = 'votes' AND k.keyword in ('murder', 'violence', 'blood', 'gore', 'death', 'female-nudity', 'hospital') AND mc.note like '%(Blu-ray)%' AND mi.info in ('Horror', 'Thriller') AND n.gender = 'm' AND t.production_year > 2000 and (t.title like '%Freddy%' or t.title like '%Jason%' or t.title like 'Saw%') AND t.id = mi.movie_id AND t.id = mi_idx.movie_id AND t.id = ci.movie_id AND t.id = mk.movie_id AND t.id = mc.movie_id AND ci.movie_id = mi.movie_id AND ci.movie_id = mi_idx.movie_id AND ci.movie_id = mk.movie_id AND ci.movie_id = mc.movie_id AND mi.movie_id = mi_idx.movie_id AND mi.movie_id = mk.movie_id AND mi.movie_id = mc.movie_id AND mi_idx.movie_id = mk.movie_id AND mi_idx.movie_id = mc.movie_id AND mk.movie_id = mc.movie_id AND n.id = ci.person_id AND it1.id = mi.info_type_id AND it2.id = mi_idx.info_type_id AND k.id = mk.keyword_id AND cn.id = mc.company_id; diff --git a/benchmarks/queries/imdb/31c.sql b/benchmarks/queries/imdb/31c.sql new file mode 100644 index 000000000000..156ea2d5eee2 --- /dev/null +++ b/benchmarks/queries/imdb/31c.sql @@ -0,0 +1 @@ +SELECT MIN(mi.info) AS movie_budget, MIN(mi_idx.info) AS movie_votes, MIN(n.name) AS writer, MIN(t.title) AS violent_liongate_movie FROM cast_info AS ci, company_name AS cn, info_type AS it1, info_type AS it2, keyword AS k, movie_companies AS mc, movie_info AS mi, movie_info_idx AS mi_idx, movie_keyword AS mk, name AS n, title AS t WHERE ci.note in ('(writer)', '(head writer)', '(written by)', '(story)', '(story editor)') AND cn.name like 'Lionsgate%' AND it1.info = 'genres' AND it2.info = 'votes' AND k.keyword in ('murder', 'violence', 'blood', 'gore', 'death', 'female-nudity', 'hospital') AND mi.info in ('Horror', 'Action', 'Sci-Fi', 'Thriller', 'Crime', 'War') AND t.id = mi.movie_id AND t.id = mi_idx.movie_id AND t.id = ci.movie_id AND t.id = mk.movie_id AND t.id = mc.movie_id AND ci.movie_id = mi.movie_id AND ci.movie_id = mi_idx.movie_id AND ci.movie_id = mk.movie_id AND ci.movie_id = mc.movie_id AND mi.movie_id = mi_idx.movie_id AND mi.movie_id = mk.movie_id AND mi.movie_id = mc.movie_id AND mi_idx.movie_id = mk.movie_id AND mi_idx.movie_id = mc.movie_id AND mk.movie_id = mc.movie_id AND n.id = ci.person_id AND it1.id = mi.info_type_id AND it2.id = mi_idx.info_type_id AND k.id = mk.keyword_id AND cn.id = mc.company_id; diff --git a/benchmarks/queries/imdb/32a.sql b/benchmarks/queries/imdb/32a.sql new file mode 100644 index 000000000000..9647fb71065d --- /dev/null +++ b/benchmarks/queries/imdb/32a.sql @@ -0,0 +1 @@ +SELECT MIN(lt.link) AS link_type, MIN(t1.title) AS first_movie, MIN(t2.title) AS second_movie FROM keyword AS k, link_type AS lt, movie_keyword AS mk, movie_link AS ml, title AS t1, title AS t2 WHERE k.keyword ='10,000-mile-club' AND mk.keyword_id = k.id AND t1.id = mk.movie_id AND ml.movie_id = t1.id AND ml.linked_movie_id = t2.id AND lt.id = ml.link_type_id AND mk.movie_id = t1.id; diff --git a/benchmarks/queries/imdb/32b.sql b/benchmarks/queries/imdb/32b.sql new file mode 100644 index 000000000000..6d096ab43405 --- /dev/null +++ b/benchmarks/queries/imdb/32b.sql @@ -0,0 +1 @@ +SELECT MIN(lt.link) AS link_type, MIN(t1.title) AS first_movie, MIN(t2.title) AS second_movie FROM keyword AS k, link_type AS lt, movie_keyword AS mk, movie_link AS ml, title AS t1, title AS t2 WHERE k.keyword ='character-name-in-title' AND mk.keyword_id = k.id AND t1.id = mk.movie_id AND ml.movie_id = t1.id AND ml.linked_movie_id = t2.id AND lt.id = ml.link_type_id AND mk.movie_id = t1.id; diff --git a/benchmarks/queries/imdb/33a.sql b/benchmarks/queries/imdb/33a.sql new file mode 100644 index 000000000000..24aac4e20797 --- /dev/null +++ b/benchmarks/queries/imdb/33a.sql @@ -0,0 +1 @@ +SELECT MIN(cn1.name) AS first_company, MIN(cn2.name) AS second_company, MIN(mi_idx1.info) AS first_rating, MIN(mi_idx2.info) AS second_rating, MIN(t1.title) AS first_movie, MIN(t2.title) AS second_movie FROM company_name AS cn1, company_name AS cn2, info_type AS it1, info_type AS it2, kind_type AS kt1, kind_type AS kt2, link_type AS lt, movie_companies AS mc1, movie_companies AS mc2, movie_info_idx AS mi_idx1, movie_info_idx AS mi_idx2, movie_link AS ml, title AS t1, title AS t2 WHERE cn1.country_code = '[us]' AND it1.info = 'rating' AND it2.info = 'rating' AND kt1.kind in ('tv series') AND kt2.kind in ('tv series') AND lt.link in ('sequel', 'follows', 'followed by') AND mi_idx2.info < '3.0' AND t2.production_year between 2005 and 2008 AND lt.id = ml.link_type_id AND t1.id = ml.movie_id AND t2.id = ml.linked_movie_id AND it1.id = mi_idx1.info_type_id AND t1.id = mi_idx1.movie_id AND kt1.id = t1.kind_id AND cn1.id = mc1.company_id AND t1.id = mc1.movie_id AND ml.movie_id = mi_idx1.movie_id AND ml.movie_id = mc1.movie_id AND mi_idx1.movie_id = mc1.movie_id AND it2.id = mi_idx2.info_type_id AND t2.id = mi_idx2.movie_id AND kt2.id = t2.kind_id AND cn2.id = mc2.company_id AND t2.id = mc2.movie_id AND ml.linked_movie_id = mi_idx2.movie_id AND ml.linked_movie_id = mc2.movie_id AND mi_idx2.movie_id = mc2.movie_id; diff --git a/benchmarks/queries/imdb/33b.sql b/benchmarks/queries/imdb/33b.sql new file mode 100644 index 000000000000..fe6fd75a6948 --- /dev/null +++ b/benchmarks/queries/imdb/33b.sql @@ -0,0 +1 @@ +SELECT MIN(cn1.name) AS first_company, MIN(cn2.name) AS second_company, MIN(mi_idx1.info) AS first_rating, MIN(mi_idx2.info) AS second_rating, MIN(t1.title) AS first_movie, MIN(t2.title) AS second_movie FROM company_name AS cn1, company_name AS cn2, info_type AS it1, info_type AS it2, kind_type AS kt1, kind_type AS kt2, link_type AS lt, movie_companies AS mc1, movie_companies AS mc2, movie_info_idx AS mi_idx1, movie_info_idx AS mi_idx2, movie_link AS ml, title AS t1, title AS t2 WHERE cn1.country_code = '[nl]' AND it1.info = 'rating' AND it2.info = 'rating' AND kt1.kind in ('tv series') AND kt2.kind in ('tv series') AND lt.link LIKE '%follow%' AND mi_idx2.info < '3.0' AND t2.production_year = 2007 AND lt.id = ml.link_type_id AND t1.id = ml.movie_id AND t2.id = ml.linked_movie_id AND it1.id = mi_idx1.info_type_id AND t1.id = mi_idx1.movie_id AND kt1.id = t1.kind_id AND cn1.id = mc1.company_id AND t1.id = mc1.movie_id AND ml.movie_id = mi_idx1.movie_id AND ml.movie_id = mc1.movie_id AND mi_idx1.movie_id = mc1.movie_id AND it2.id = mi_idx2.info_type_id AND t2.id = mi_idx2.movie_id AND kt2.id = t2.kind_id AND cn2.id = mc2.company_id AND t2.id = mc2.movie_id AND ml.linked_movie_id = mi_idx2.movie_id AND ml.linked_movie_id = mc2.movie_id AND mi_idx2.movie_id = mc2.movie_id; diff --git a/benchmarks/queries/imdb/33c.sql b/benchmarks/queries/imdb/33c.sql new file mode 100644 index 000000000000..c9f0907d3f90 --- /dev/null +++ b/benchmarks/queries/imdb/33c.sql @@ -0,0 +1 @@ +SELECT MIN(cn1.name) AS first_company, MIN(cn2.name) AS second_company, MIN(mi_idx1.info) AS first_rating, MIN(mi_idx2.info) AS second_rating, MIN(t1.title) AS first_movie, MIN(t2.title) AS second_movie FROM company_name AS cn1, company_name AS cn2, info_type AS it1, info_type AS it2, kind_type AS kt1, kind_type AS kt2, link_type AS lt, movie_companies AS mc1, movie_companies AS mc2, movie_info_idx AS mi_idx1, movie_info_idx AS mi_idx2, movie_link AS ml, title AS t1, title AS t2 WHERE cn1.country_code != '[us]' AND it1.info = 'rating' AND it2.info = 'rating' AND kt1.kind in ('tv series', 'episode') AND kt2.kind in ('tv series', 'episode') AND lt.link in ('sequel', 'follows', 'followed by') AND mi_idx2.info < '3.5' AND t2.production_year between 2000 and 2010 AND lt.id = ml.link_type_id AND t1.id = ml.movie_id AND t2.id = ml.linked_movie_id AND it1.id = mi_idx1.info_type_id AND t1.id = mi_idx1.movie_id AND kt1.id = t1.kind_id AND cn1.id = mc1.company_id AND t1.id = mc1.movie_id AND ml.movie_id = mi_idx1.movie_id AND ml.movie_id = mc1.movie_id AND mi_idx1.movie_id = mc1.movie_id AND it2.id = mi_idx2.info_type_id AND t2.id = mi_idx2.movie_id AND kt2.id = t2.kind_id AND cn2.id = mc2.company_id AND t2.id = mc2.movie_id AND ml.linked_movie_id = mi_idx2.movie_id AND ml.linked_movie_id = mc2.movie_id AND mi_idx2.movie_id = mc2.movie_id; diff --git a/benchmarks/queries/imdb/3a.sql b/benchmarks/queries/imdb/3a.sql new file mode 100644 index 000000000000..231c957be207 --- /dev/null +++ b/benchmarks/queries/imdb/3a.sql @@ -0,0 +1 @@ +SELECT MIN(t.title) AS movie_title FROM keyword AS k, movie_info AS mi, movie_keyword AS mk, title AS t WHERE k.keyword like '%sequel%' AND mi.info IN ('Sweden', 'Norway', 'Germany', 'Denmark', 'Swedish', 'Denish', 'Norwegian', 'German') AND t.production_year > 2005 AND t.id = mi.movie_id AND t.id = mk.movie_id AND mk.movie_id = mi.movie_id AND k.id = mk.keyword_id; diff --git a/benchmarks/queries/imdb/3b.sql b/benchmarks/queries/imdb/3b.sql new file mode 100644 index 000000000000..fd21efc81014 --- /dev/null +++ b/benchmarks/queries/imdb/3b.sql @@ -0,0 +1 @@ +SELECT MIN(t.title) AS movie_title FROM keyword AS k, movie_info AS mi, movie_keyword AS mk, title AS t WHERE k.keyword like '%sequel%' AND mi.info IN ('Bulgaria') AND t.production_year > 2010 AND t.id = mi.movie_id AND t.id = mk.movie_id AND mk.movie_id = mi.movie_id AND k.id = mk.keyword_id; diff --git a/benchmarks/queries/imdb/3c.sql b/benchmarks/queries/imdb/3c.sql new file mode 100644 index 000000000000..5f34232a2e61 --- /dev/null +++ b/benchmarks/queries/imdb/3c.sql @@ -0,0 +1 @@ +SELECT MIN(t.title) AS movie_title FROM keyword AS k, movie_info AS mi, movie_keyword AS mk, title AS t WHERE k.keyword like '%sequel%' AND mi.info IN ('Sweden', 'Norway', 'Germany', 'Denmark', 'Swedish', 'Denish', 'Norwegian', 'German', 'USA', 'American') AND t.production_year > 1990 AND t.id = mi.movie_id AND t.id = mk.movie_id AND mk.movie_id = mi.movie_id AND k.id = mk.keyword_id; diff --git a/benchmarks/queries/imdb/4a.sql b/benchmarks/queries/imdb/4a.sql new file mode 100644 index 000000000000..636afab02c8a --- /dev/null +++ b/benchmarks/queries/imdb/4a.sql @@ -0,0 +1 @@ +SELECT MIN(mi_idx.info) AS rating, MIN(t.title) AS movie_title FROM info_type AS it, keyword AS k, movie_info_idx AS mi_idx, movie_keyword AS mk, title AS t WHERE it.info ='rating' AND k.keyword like '%sequel%' AND mi_idx.info > '5.0' AND t.production_year > 2005 AND t.id = mi_idx.movie_id AND t.id = mk.movie_id AND mk.movie_id = mi_idx.movie_id AND k.id = mk.keyword_id AND it.id = mi_idx.info_type_id; diff --git a/benchmarks/queries/imdb/4b.sql b/benchmarks/queries/imdb/4b.sql new file mode 100644 index 000000000000..ebd3e8992060 --- /dev/null +++ b/benchmarks/queries/imdb/4b.sql @@ -0,0 +1 @@ +SELECT MIN(mi_idx.info) AS rating, MIN(t.title) AS movie_title FROM info_type AS it, keyword AS k, movie_info_idx AS mi_idx, movie_keyword AS mk, title AS t WHERE it.info ='rating' AND k.keyword like '%sequel%' AND mi_idx.info > '9.0' AND t.production_year > 2010 AND t.id = mi_idx.movie_id AND t.id = mk.movie_id AND mk.movie_id = mi_idx.movie_id AND k.id = mk.keyword_id AND it.id = mi_idx.info_type_id; diff --git a/benchmarks/queries/imdb/4c.sql b/benchmarks/queries/imdb/4c.sql new file mode 100644 index 000000000000..309281200f98 --- /dev/null +++ b/benchmarks/queries/imdb/4c.sql @@ -0,0 +1 @@ +SELECT MIN(mi_idx.info) AS rating, MIN(t.title) AS movie_title FROM info_type AS it, keyword AS k, movie_info_idx AS mi_idx, movie_keyword AS mk, title AS t WHERE it.info ='rating' AND k.keyword like '%sequel%' AND mi_idx.info > '2.0' AND t.production_year > 1990 AND t.id = mi_idx.movie_id AND t.id = mk.movie_id AND mk.movie_id = mi_idx.movie_id AND k.id = mk.keyword_id AND it.id = mi_idx.info_type_id; diff --git a/benchmarks/queries/imdb/5a.sql b/benchmarks/queries/imdb/5a.sql new file mode 100644 index 000000000000..04aae9881f7e --- /dev/null +++ b/benchmarks/queries/imdb/5a.sql @@ -0,0 +1 @@ +SELECT MIN(t.title) AS typical_european_movie FROM company_type AS ct, info_type AS it, movie_companies AS mc, movie_info AS mi, title AS t WHERE ct.kind = 'production companies' AND mc.note like '%(theatrical)%' and mc.note like '%(France)%' AND mi.info IN ('Sweden', 'Norway', 'Germany', 'Denmark', 'Swedish', 'Denish', 'Norwegian', 'German') AND t.production_year > 2005 AND t.id = mi.movie_id AND t.id = mc.movie_id AND mc.movie_id = mi.movie_id AND ct.id = mc.company_type_id AND it.id = mi.info_type_id; diff --git a/benchmarks/queries/imdb/5b.sql b/benchmarks/queries/imdb/5b.sql new file mode 100644 index 000000000000..f03a519d61b3 --- /dev/null +++ b/benchmarks/queries/imdb/5b.sql @@ -0,0 +1 @@ +SELECT MIN(t.title) AS american_vhs_movie FROM company_type AS ct, info_type AS it, movie_companies AS mc, movie_info AS mi, title AS t WHERE ct.kind = 'production companies' AND mc.note like '%(VHS)%' and mc.note like '%(USA)%' and mc.note like '%(1994)%' AND mi.info IN ('USA', 'America') AND t.production_year > 2010 AND t.id = mi.movie_id AND t.id = mc.movie_id AND mc.movie_id = mi.movie_id AND ct.id = mc.company_type_id AND it.id = mi.info_type_id; diff --git a/benchmarks/queries/imdb/5c.sql b/benchmarks/queries/imdb/5c.sql new file mode 100644 index 000000000000..2705e7e2c7a0 --- /dev/null +++ b/benchmarks/queries/imdb/5c.sql @@ -0,0 +1 @@ +SELECT MIN(t.title) AS american_movie FROM company_type AS ct, info_type AS it, movie_companies AS mc, movie_info AS mi, title AS t WHERE ct.kind = 'production companies' AND mc.note not like '%(TV)%' and mc.note like '%(USA)%' AND mi.info IN ('Sweden', 'Norway', 'Germany', 'Denmark', 'Swedish', 'Denish', 'Norwegian', 'German', 'USA', 'American') AND t.production_year > 1990 AND t.id = mi.movie_id AND t.id = mc.movie_id AND mc.movie_id = mi.movie_id AND ct.id = mc.company_type_id AND it.id = mi.info_type_id; diff --git a/benchmarks/queries/imdb/6a.sql b/benchmarks/queries/imdb/6a.sql new file mode 100644 index 000000000000..34b3a6da5fd2 --- /dev/null +++ b/benchmarks/queries/imdb/6a.sql @@ -0,0 +1 @@ +SELECT MIN(k.keyword) AS movie_keyword, MIN(n.name) AS actor_name, MIN(t.title) AS marvel_movie FROM cast_info AS ci, keyword AS k, movie_keyword AS mk, name AS n, title AS t WHERE k.keyword = 'marvel-cinematic-universe' AND n.name LIKE '%Downey%Robert%' AND t.production_year > 2010 AND k.id = mk.keyword_id AND t.id = mk.movie_id AND t.id = ci.movie_id AND ci.movie_id = mk.movie_id AND n.id = ci.person_id; diff --git a/benchmarks/queries/imdb/6b.sql b/benchmarks/queries/imdb/6b.sql new file mode 100644 index 000000000000..1233c41e66b0 --- /dev/null +++ b/benchmarks/queries/imdb/6b.sql @@ -0,0 +1 @@ +SELECT MIN(k.keyword) AS movie_keyword, MIN(n.name) AS actor_name, MIN(t.title) AS hero_movie FROM cast_info AS ci, keyword AS k, movie_keyword AS mk, name AS n, title AS t WHERE k.keyword in ('superhero', 'sequel', 'second-part', 'marvel-comics', 'based-on-comic', 'tv-special', 'fight', 'violence') AND n.name LIKE '%Downey%Robert%' AND t.production_year > 2014 AND k.id = mk.keyword_id AND t.id = mk.movie_id AND t.id = ci.movie_id AND ci.movie_id = mk.movie_id AND n.id = ci.person_id; diff --git a/benchmarks/queries/imdb/6c.sql b/benchmarks/queries/imdb/6c.sql new file mode 100644 index 000000000000..d1f97746e15e --- /dev/null +++ b/benchmarks/queries/imdb/6c.sql @@ -0,0 +1 @@ +SELECT MIN(k.keyword) AS movie_keyword, MIN(n.name) AS actor_name, MIN(t.title) AS marvel_movie FROM cast_info AS ci, keyword AS k, movie_keyword AS mk, name AS n, title AS t WHERE k.keyword = 'marvel-cinematic-universe' AND n.name LIKE '%Downey%Robert%' AND t.production_year > 2014 AND k.id = mk.keyword_id AND t.id = mk.movie_id AND t.id = ci.movie_id AND ci.movie_id = mk.movie_id AND n.id = ci.person_id; diff --git a/benchmarks/queries/imdb/6d.sql b/benchmarks/queries/imdb/6d.sql new file mode 100644 index 000000000000..07729510a454 --- /dev/null +++ b/benchmarks/queries/imdb/6d.sql @@ -0,0 +1 @@ +SELECT MIN(k.keyword) AS movie_keyword, MIN(n.name) AS actor_name, MIN(t.title) AS hero_movie FROM cast_info AS ci, keyword AS k, movie_keyword AS mk, name AS n, title AS t WHERE k.keyword in ('superhero', 'sequel', 'second-part', 'marvel-comics', 'based-on-comic', 'tv-special', 'fight', 'violence') AND n.name LIKE '%Downey%Robert%' AND t.production_year > 2000 AND k.id = mk.keyword_id AND t.id = mk.movie_id AND t.id = ci.movie_id AND ci.movie_id = mk.movie_id AND n.id = ci.person_id; diff --git a/benchmarks/queries/imdb/6e.sql b/benchmarks/queries/imdb/6e.sql new file mode 100644 index 000000000000..2e77873fd81d --- /dev/null +++ b/benchmarks/queries/imdb/6e.sql @@ -0,0 +1 @@ +SELECT MIN(k.keyword) AS movie_keyword, MIN(n.name) AS actor_name, MIN(t.title) AS marvel_movie FROM cast_info AS ci, keyword AS k, movie_keyword AS mk, name AS n, title AS t WHERE k.keyword = 'marvel-cinematic-universe' AND n.name LIKE '%Downey%Robert%' AND t.production_year > 2000 AND k.id = mk.keyword_id AND t.id = mk.movie_id AND t.id = ci.movie_id AND ci.movie_id = mk.movie_id AND n.id = ci.person_id; diff --git a/benchmarks/queries/imdb/6f.sql b/benchmarks/queries/imdb/6f.sql new file mode 100644 index 000000000000..603901129107 --- /dev/null +++ b/benchmarks/queries/imdb/6f.sql @@ -0,0 +1 @@ +SELECT MIN(k.keyword) AS movie_keyword, MIN(n.name) AS actor_name, MIN(t.title) AS hero_movie FROM cast_info AS ci, keyword AS k, movie_keyword AS mk, name AS n, title AS t WHERE k.keyword in ('superhero', 'sequel', 'second-part', 'marvel-comics', 'based-on-comic', 'tv-special', 'fight', 'violence') AND t.production_year > 2000 AND k.id = mk.keyword_id AND t.id = mk.movie_id AND t.id = ci.movie_id AND ci.movie_id = mk.movie_id AND n.id = ci.person_id; diff --git a/benchmarks/queries/imdb/7a.sql b/benchmarks/queries/imdb/7a.sql new file mode 100644 index 000000000000..c6b26ce36f11 --- /dev/null +++ b/benchmarks/queries/imdb/7a.sql @@ -0,0 +1 @@ +SELECT MIN(n.name) AS of_person, MIN(t.title) AS biography_movie FROM aka_name AS an, cast_info AS ci, info_type AS it, link_type AS lt, movie_link AS ml, name AS n, person_info AS pi, title AS t WHERE an.name LIKE '%a%' AND it.info ='mini biography' AND lt.link ='features' AND n.name_pcode_cf BETWEEN 'A' AND 'F' AND (n.gender='m' OR (n.gender = 'f' AND n.name LIKE 'B%')) AND pi.note ='Volker Boehm' AND t.production_year BETWEEN 1980 AND 1995 AND n.id = an.person_id AND n.id = pi.person_id AND ci.person_id = n.id AND t.id = ci.movie_id AND ml.linked_movie_id = t.id AND lt.id = ml.link_type_id AND it.id = pi.info_type_id AND pi.person_id = an.person_id AND pi.person_id = ci.person_id AND an.person_id = ci.person_id AND ci.movie_id = ml.linked_movie_id; diff --git a/benchmarks/queries/imdb/7b.sql b/benchmarks/queries/imdb/7b.sql new file mode 100644 index 000000000000..4e4f6e7615cb --- /dev/null +++ b/benchmarks/queries/imdb/7b.sql @@ -0,0 +1 @@ +SELECT MIN(n.name) AS of_person, MIN(t.title) AS biography_movie FROM aka_name AS an, cast_info AS ci, info_type AS it, link_type AS lt, movie_link AS ml, name AS n, person_info AS pi, title AS t WHERE an.name LIKE '%a%' AND it.info ='mini biography' AND lt.link ='features' AND n.name_pcode_cf LIKE 'D%' AND n.gender='m' AND pi.note ='Volker Boehm' AND t.production_year BETWEEN 1980 AND 1984 AND n.id = an.person_id AND n.id = pi.person_id AND ci.person_id = n.id AND t.id = ci.movie_id AND ml.linked_movie_id = t.id AND lt.id = ml.link_type_id AND it.id = pi.info_type_id AND pi.person_id = an.person_id AND pi.person_id = ci.person_id AND an.person_id = ci.person_id AND ci.movie_id = ml.linked_movie_id; diff --git a/benchmarks/queries/imdb/7c.sql b/benchmarks/queries/imdb/7c.sql new file mode 100644 index 000000000000..a399342fae02 --- /dev/null +++ b/benchmarks/queries/imdb/7c.sql @@ -0,0 +1 @@ +SELECT MIN(n.name) AS cast_member_name, MIN(pi.info) AS cast_member_info FROM aka_name AS an, cast_info AS ci, info_type AS it, link_type AS lt, movie_link AS ml, name AS n, person_info AS pi, title AS t WHERE an.name is not NULL and (an.name LIKE '%a%' or an.name LIKE 'A%') AND it.info ='mini biography' AND lt.link in ('references', 'referenced in', 'features', 'featured in') AND n.name_pcode_cf BETWEEN 'A' AND 'F' AND (n.gender='m' OR (n.gender = 'f' AND n.name LIKE 'A%')) AND pi.note is not NULL AND t.production_year BETWEEN 1980 AND 2010 AND n.id = an.person_id AND n.id = pi.person_id AND ci.person_id = n.id AND t.id = ci.movie_id AND ml.linked_movie_id = t.id AND lt.id = ml.link_type_id AND it.id = pi.info_type_id AND pi.person_id = an.person_id AND pi.person_id = ci.person_id AND an.person_id = ci.person_id AND ci.movie_id = ml.linked_movie_id; diff --git a/benchmarks/queries/imdb/8a.sql b/benchmarks/queries/imdb/8a.sql new file mode 100644 index 000000000000..66ed05880d5f --- /dev/null +++ b/benchmarks/queries/imdb/8a.sql @@ -0,0 +1 @@ +SELECT MIN(an1.name) AS actress_pseudonym, MIN(t.title) AS japanese_movie_dubbed FROM aka_name AS an1, cast_info AS ci, company_name AS cn, movie_companies AS mc, name AS n1, role_type AS rt, title AS t WHERE ci.note ='(voice: English version)' AND cn.country_code ='[jp]' AND mc.note like '%(Japan)%' and mc.note not like '%(USA)%' AND n1.name like '%Yo%' and n1.name not like '%Yu%' AND rt.role ='actress' AND an1.person_id = n1.id AND n1.id = ci.person_id AND ci.movie_id = t.id AND t.id = mc.movie_id AND mc.company_id = cn.id AND ci.role_id = rt.id AND an1.person_id = ci.person_id AND ci.movie_id = mc.movie_id; diff --git a/benchmarks/queries/imdb/8b.sql b/benchmarks/queries/imdb/8b.sql new file mode 100644 index 000000000000..044b5f8e8649 --- /dev/null +++ b/benchmarks/queries/imdb/8b.sql @@ -0,0 +1 @@ +SELECT MIN(an.name) AS acress_pseudonym, MIN(t.title) AS japanese_anime_movie FROM aka_name AS an, cast_info AS ci, company_name AS cn, movie_companies AS mc, name AS n, role_type AS rt, title AS t WHERE ci.note ='(voice: English version)' AND cn.country_code ='[jp]' AND mc.note like '%(Japan)%' and mc.note not like '%(USA)%' and (mc.note like '%(2006)%' or mc.note like '%(2007)%') AND n.name like '%Yo%' and n.name not like '%Yu%' AND rt.role ='actress' AND t.production_year between 2006 and 2007 and (t.title like 'One Piece%' or t.title like 'Dragon Ball Z%') AND an.person_id = n.id AND n.id = ci.person_id AND ci.movie_id = t.id AND t.id = mc.movie_id AND mc.company_id = cn.id AND ci.role_id = rt.id AND an.person_id = ci.person_id AND ci.movie_id = mc.movie_id; diff --git a/benchmarks/queries/imdb/8c.sql b/benchmarks/queries/imdb/8c.sql new file mode 100644 index 000000000000..d02b74c02c5e --- /dev/null +++ b/benchmarks/queries/imdb/8c.sql @@ -0,0 +1 @@ +SELECT MIN(a1.name) AS writer_pseudo_name, MIN(t.title) AS movie_title FROM aka_name AS a1, cast_info AS ci, company_name AS cn, movie_companies AS mc, name AS n1, role_type AS rt, title AS t WHERE cn.country_code ='[us]' AND rt.role ='writer' AND a1.person_id = n1.id AND n1.id = ci.person_id AND ci.movie_id = t.id AND t.id = mc.movie_id AND mc.company_id = cn.id AND ci.role_id = rt.id AND a1.person_id = ci.person_id AND ci.movie_id = mc.movie_id; diff --git a/benchmarks/queries/imdb/8d.sql b/benchmarks/queries/imdb/8d.sql new file mode 100644 index 000000000000..0834c0ff5cb7 --- /dev/null +++ b/benchmarks/queries/imdb/8d.sql @@ -0,0 +1 @@ +SELECT MIN(an1.name) AS costume_designer_pseudo, MIN(t.title) AS movie_with_costumes FROM aka_name AS an1, cast_info AS ci, company_name AS cn, movie_companies AS mc, name AS n1, role_type AS rt, title AS t WHERE cn.country_code ='[us]' AND rt.role ='costume designer' AND an1.person_id = n1.id AND n1.id = ci.person_id AND ci.movie_id = t.id AND t.id = mc.movie_id AND mc.company_id = cn.id AND ci.role_id = rt.id AND an1.person_id = ci.person_id AND ci.movie_id = mc.movie_id; diff --git a/benchmarks/queries/imdb/9a.sql b/benchmarks/queries/imdb/9a.sql new file mode 100644 index 000000000000..593b16213b06 --- /dev/null +++ b/benchmarks/queries/imdb/9a.sql @@ -0,0 +1 @@ +SELECT MIN(an.name) AS alternative_name, MIN(chn.name) AS character_name, MIN(t.title) AS movie FROM aka_name AS an, char_name AS chn, cast_info AS ci, company_name AS cn, movie_companies AS mc, name AS n, role_type AS rt, title AS t WHERE ci.note in ('(voice)', '(voice: Japanese version)', '(voice) (uncredited)', '(voice: English version)') AND cn.country_code ='[us]' AND mc.note is not NULL and (mc.note like '%(USA)%' or mc.note like '%(worldwide)%') AND n.gender ='f' and n.name like '%Ang%' AND rt.role ='actress' AND t.production_year between 2005 and 2015 AND ci.movie_id = t.id AND t.id = mc.movie_id AND ci.movie_id = mc.movie_id AND mc.company_id = cn.id AND ci.role_id = rt.id AND n.id = ci.person_id AND chn.id = ci.person_role_id AND an.person_id = n.id AND an.person_id = ci.person_id; diff --git a/benchmarks/queries/imdb/9b.sql b/benchmarks/queries/imdb/9b.sql new file mode 100644 index 000000000000..a4933fd6856e --- /dev/null +++ b/benchmarks/queries/imdb/9b.sql @@ -0,0 +1 @@ +SELECT MIN(an.name) AS alternative_name, MIN(chn.name) AS voiced_character, MIN(n.name) AS voicing_actress, MIN(t.title) AS american_movie FROM aka_name AS an, char_name AS chn, cast_info AS ci, company_name AS cn, movie_companies AS mc, name AS n, role_type AS rt, title AS t WHERE ci.note = '(voice)' AND cn.country_code ='[us]' AND mc.note like '%(200%)%' and (mc.note like '%(USA)%' or mc.note like '%(worldwide)%') AND n.gender ='f' and n.name like '%Angel%' AND rt.role ='actress' AND t.production_year between 2007 and 2010 AND ci.movie_id = t.id AND t.id = mc.movie_id AND ci.movie_id = mc.movie_id AND mc.company_id = cn.id AND ci.role_id = rt.id AND n.id = ci.person_id AND chn.id = ci.person_role_id AND an.person_id = n.id AND an.person_id = ci.person_id; diff --git a/benchmarks/queries/imdb/9c.sql b/benchmarks/queries/imdb/9c.sql new file mode 100644 index 000000000000..0be511810cf6 --- /dev/null +++ b/benchmarks/queries/imdb/9c.sql @@ -0,0 +1 @@ +SELECT MIN(an.name) AS alternative_name, MIN(chn.name) AS voiced_character_name, MIN(n.name) AS voicing_actress, MIN(t.title) AS american_movie FROM aka_name AS an, char_name AS chn, cast_info AS ci, company_name AS cn, movie_companies AS mc, name AS n, role_type AS rt, title AS t WHERE ci.note in ('(voice)', '(voice: Japanese version)', '(voice) (uncredited)', '(voice: English version)') AND cn.country_code ='[us]' AND n.gender ='f' and n.name like '%An%' AND rt.role ='actress' AND ci.movie_id = t.id AND t.id = mc.movie_id AND ci.movie_id = mc.movie_id AND mc.company_id = cn.id AND ci.role_id = rt.id AND n.id = ci.person_id AND chn.id = ci.person_role_id AND an.person_id = n.id AND an.person_id = ci.person_id; diff --git a/benchmarks/queries/imdb/9d.sql b/benchmarks/queries/imdb/9d.sql new file mode 100644 index 000000000000..51262ca5ebae --- /dev/null +++ b/benchmarks/queries/imdb/9d.sql @@ -0,0 +1 @@ +SELECT MIN(an.name) AS alternative_name, MIN(chn.name) AS voiced_char_name, MIN(n.name) AS voicing_actress, MIN(t.title) AS american_movie FROM aka_name AS an, char_name AS chn, cast_info AS ci, company_name AS cn, movie_companies AS mc, name AS n, role_type AS rt, title AS t WHERE ci.note in ('(voice)', '(voice: Japanese version)', '(voice) (uncredited)', '(voice: English version)') AND cn.country_code ='[us]' AND n.gender ='f' AND rt.role ='actress' AND ci.movie_id = t.id AND t.id = mc.movie_id AND ci.movie_id = mc.movie_id AND mc.company_id = cn.id AND ci.role_id = rt.id AND n.id = ci.person_id AND chn.id = ci.person_role_id AND an.person_id = n.id AND an.person_id = ci.person_id; diff --git a/benchmarks/src/bin/dfbench.rs b/benchmarks/src/bin/dfbench.rs index 9ce6848a063a..f7b84116e793 100644 --- a/benchmarks/src/bin/dfbench.rs +++ b/benchmarks/src/bin/dfbench.rs @@ -33,7 +33,7 @@ static ALLOC: snmalloc_rs::SnMalloc = snmalloc_rs::SnMalloc; #[global_allocator] static ALLOC: mimalloc::MiMalloc = mimalloc::MiMalloc; -use datafusion_benchmarks::{clickbench, parquet_filter, sort, tpch}; +use datafusion_benchmarks::{clickbench, imdb, parquet_filter, sort, tpch}; #[derive(Debug, StructOpt)] #[structopt(about = "benchmark command")] @@ -43,6 +43,7 @@ enum Options { Clickbench(clickbench::RunOpt), ParquetFilter(parquet_filter::RunOpt), Sort(sort::RunOpt), + Imdb(imdb::RunOpt), } // Main benchmark runner entrypoint @@ -56,5 +57,6 @@ pub async fn main() -> Result<()> { Options::Clickbench(opt) => opt.run().await, Options::ParquetFilter(opt) => opt.run().await, Options::Sort(opt) => opt.run().await, + Options::Imdb(opt) => opt.run().await, } } diff --git a/benchmarks/src/bin/imdb.rs b/benchmarks/src/bin/imdb.rs index 40efb84b0501..13421f8a89a9 100644 --- a/benchmarks/src/bin/imdb.rs +++ b/benchmarks/src/bin/imdb.rs @@ -34,9 +34,17 @@ static ALLOC: snmalloc_rs::SnMalloc = snmalloc_rs::SnMalloc; #[global_allocator] static ALLOC: mimalloc::MiMalloc = mimalloc::MiMalloc; +#[derive(Debug, StructOpt)] +#[structopt(about = "benchmark command")] +enum BenchmarkSubCommandOpt { + #[structopt(name = "datafusion")] + DataFusionBenchmark(imdb::RunOpt), +} + #[derive(Debug, StructOpt)] #[structopt(name = "IMDB", about = "IMDB Dataset Processing.")] enum ImdbOpt { + Benchmark(BenchmarkSubCommandOpt), Convert(imdb::ConvertOpt), } @@ -44,6 +52,9 @@ enum ImdbOpt { pub async fn main() -> Result<()> { env_logger::init(); match ImdbOpt::from_args() { + ImdbOpt::Benchmark(BenchmarkSubCommandOpt::DataFusionBenchmark(opt)) => { + opt.run().await + } ImdbOpt::Convert(opt) => opt.run().await, } } diff --git a/benchmarks/src/imdb/convert.rs b/benchmarks/src/imdb/convert.rs index c95f7f8bf564..4e470d711da5 100644 --- a/benchmarks/src/imdb/convert.rs +++ b/benchmarks/src/imdb/convert.rs @@ -51,11 +51,12 @@ impl ConvertOpt { pub async fn run(self) -> Result<()> { let input_path = self.input_path.to_str().unwrap(); let output_path = self.output_path.to_str().unwrap(); + let config = SessionConfig::new().with_batch_size(self.batch_size); + let ctx = SessionContext::new_with_config(config); for table in IMDB_TABLES { let start = Instant::now(); let schema = get_imdb_table_schema(table); - let input_path = format!("{input_path}/{table}.csv"); let output_path = format!("{output_path}/{table}.parquet"); let options = CsvReadOptions::new() @@ -65,9 +66,6 @@ impl ConvertOpt { .escape(b'\\') .file_extension(".csv"); - let config = SessionConfig::new().with_batch_size(self.batch_size); - let ctx = SessionContext::new_with_config(config); - let mut csv = ctx.read_csv(&input_path, options).await?; // Select all apart from the padding column diff --git a/benchmarks/src/imdb/mod.rs b/benchmarks/src/imdb/mod.rs index 8e2977c0384e..6a45242e6ff4 100644 --- a/benchmarks/src/imdb/mod.rs +++ b/benchmarks/src/imdb/mod.rs @@ -17,10 +17,18 @@ //! Benchmark derived from IMDB dataset. -use datafusion::arrow::datatypes::{DataType, Field, Schema}; +use datafusion::{ + arrow::datatypes::{DataType, Field, Schema}, + common::plan_err, + error::Result, +}; mod convert; pub use convert::ConvertOpt; +use std::fs; +mod run; +pub use run::RunOpt; + // we have 21 tables in the IMDB dataset pub const IMDB_TABLES: &[&str] = &[ "aka_name", @@ -51,7 +59,7 @@ pub const IMDB_TABLES: &[&str] = &[ pub fn get_imdb_table_schema(table: &str) -> Schema { match table { "aka_name" => Schema::new(vec![ - Field::new("id", DataType::Int32, false), + Field::new("id", DataType::UInt32, false), Field::new("person_id", DataType::Int32, false), Field::new("name", DataType::Utf8, true), Field::new("imdb_index", DataType::Utf8, true), @@ -61,7 +69,7 @@ pub fn get_imdb_table_schema(table: &str) -> Schema { Field::new("md5sum", DataType::Utf8, true), ]), "aka_title" => Schema::new(vec![ - Field::new("id", DataType::Int32, false), + Field::new("id", DataType::UInt32, false), Field::new("movie_id", DataType::Int32, false), Field::new("title", DataType::Utf8, true), Field::new("imdb_index", DataType::Utf8, true), @@ -75,7 +83,7 @@ pub fn get_imdb_table_schema(table: &str) -> Schema { Field::new("md5sum", DataType::Utf8, true), ]), "cast_info" => Schema::new(vec![ - Field::new("id", DataType::Int32, false), + Field::new("id", DataType::UInt32, false), Field::new("person_id", DataType::Int32, false), Field::new("movie_id", DataType::Int32, false), Field::new("person_role_id", DataType::Int32, true), @@ -84,7 +92,7 @@ pub fn get_imdb_table_schema(table: &str) -> Schema { Field::new("role_id", DataType::Int32, false), ]), "char_name" => Schema::new(vec![ - Field::new("id", DataType::Int32, false), + Field::new("id", DataType::UInt32, false), Field::new("name", DataType::Utf8, false), Field::new("imdb_index", DataType::Utf8, true), Field::new("imdb_id", DataType::Int32, true), @@ -93,11 +101,11 @@ pub fn get_imdb_table_schema(table: &str) -> Schema { Field::new("md5sum", DataType::Utf8, true), ]), "comp_cast_type" => Schema::new(vec![ - Field::new("id", DataType::Int32, false), + Field::new("id", DataType::UInt32, false), Field::new("kind", DataType::Utf8, false), ]), "company_name" => Schema::new(vec![ - Field::new("id", DataType::Int32, false), + Field::new("id", DataType::UInt32, false), Field::new("name", DataType::Utf8, false), Field::new("country_code", DataType::Utf8, true), Field::new("imdb_id", DataType::Int32, true), @@ -106,59 +114,59 @@ pub fn get_imdb_table_schema(table: &str) -> Schema { Field::new("md5sum", DataType::Utf8, true), ]), "company_type" => Schema::new(vec![ - Field::new("id", DataType::Int32, false), + Field::new("id", DataType::UInt32, false), Field::new("kind", DataType::Utf8, true), ]), "complete_cast" => Schema::new(vec![ - Field::new("id", DataType::Int32, false), + Field::new("id", DataType::UInt32, false), Field::new("movie_id", DataType::Int32, true), Field::new("subject_id", DataType::Int32, false), Field::new("status_id", DataType::Int32, false), ]), "info_type" => Schema::new(vec![ - Field::new("id", DataType::Int32, false), + Field::new("id", DataType::UInt32, false), Field::new("info", DataType::Utf8, false), ]), "keyword" => Schema::new(vec![ - Field::new("id", DataType::Int32, false), + Field::new("id", DataType::UInt32, false), Field::new("keyword", DataType::Utf8, false), Field::new("phonetic_code", DataType::Utf8, true), ]), "kind_type" => Schema::new(vec![ - Field::new("id", DataType::Int32, false), + Field::new("id", DataType::UInt32, false), Field::new("kind", DataType::Utf8, true), ]), "link_type" => Schema::new(vec![ - Field::new("id", DataType::Int32, false), + Field::new("id", DataType::UInt32, false), Field::new("link", DataType::Utf8, false), ]), "movie_companies" => Schema::new(vec![ - Field::new("id", DataType::Int32, false), + Field::new("id", DataType::UInt32, false), Field::new("movie_id", DataType::Int32, false), Field::new("company_id", DataType::Int32, false), Field::new("company_type_id", DataType::Int32, false), Field::new("note", DataType::Utf8, true), ]), "movie_info_idx" => Schema::new(vec![ - Field::new("id", DataType::Int32, false), + Field::new("id", DataType::UInt32, false), Field::new("movie_id", DataType::Int32, false), Field::new("info_type_id", DataType::Int32, false), Field::new("info", DataType::Utf8, false), Field::new("note", DataType::Utf8, true), ]), "movie_keyword" => Schema::new(vec![ - Field::new("id", DataType::Int32, false), + Field::new("id", DataType::UInt32, false), Field::new("movie_id", DataType::Int32, false), Field::new("keyword_id", DataType::Int32, false), ]), "movie_link" => Schema::new(vec![ - Field::new("id", DataType::Int32, false), + Field::new("id", DataType::UInt32, false), Field::new("movie_id", DataType::Int32, false), Field::new("linked_movie_id", DataType::Int32, false), Field::new("link_type_id", DataType::Int32, false), ]), "name" => Schema::new(vec![ - Field::new("id", DataType::Int32, false), + Field::new("id", DataType::UInt32, false), Field::new("name", DataType::Utf8, false), Field::new("imdb_index", DataType::Utf8, true), Field::new("imdb_id", DataType::Int32, true), @@ -169,11 +177,11 @@ pub fn get_imdb_table_schema(table: &str) -> Schema { Field::new("md5sum", DataType::Utf8, true), ]), "role_type" => Schema::new(vec![ - Field::new("id", DataType::Int32, false), + Field::new("id", DataType::UInt32, false), Field::new("role", DataType::Utf8, false), ]), "title" => Schema::new(vec![ - Field::new("id", DataType::Int32, false), + Field::new("id", DataType::UInt32, false), Field::new("title", DataType::Utf8, false), Field::new("imdb_index", DataType::Utf8, true), Field::new("kind_id", DataType::Int32, false), @@ -187,14 +195,14 @@ pub fn get_imdb_table_schema(table: &str) -> Schema { Field::new("md5sum", DataType::Utf8, true), ]), "movie_info" => Schema::new(vec![ - Field::new("id", DataType::Int32, false), + Field::new("id", DataType::UInt32, false), Field::new("movie_id", DataType::Int32, false), Field::new("info_type_id", DataType::Int32, false), Field::new("info", DataType::Utf8, false), Field::new("note", DataType::Utf8, true), ]), "person_info" => Schema::new(vec![ - Field::new("id", DataType::Int32, false), + Field::new("id", DataType::UInt32, false), Field::new("person_id", DataType::Int32, false), Field::new("info_type_id", DataType::Int32, false), Field::new("info", DataType::Utf8, false), @@ -203,3 +211,26 @@ pub fn get_imdb_table_schema(table: &str) -> Schema { _ => unimplemented!("Schema for table {} is not implemented", table), } } + +/// Get the SQL statements from the specified query file +pub fn get_query_sql(query: &str) -> Result> { + let possibilities = vec![ + format!("queries/imdb/{query}.sql"), + format!("benchmarks/queries/imdb/{query}.sql"), + ]; + let mut errors = vec![]; + for filename in possibilities { + match fs::read_to_string(&filename) { + Ok(contents) => { + return Ok(contents + .split(';') + .map(|s| s.trim()) + .filter(|s| !s.is_empty()) + .map(|s| s.to_string()) + .collect()); + } + Err(e) => errors.push(format!("{filename}: {e}")), + }; + } + plan_err!("invalid query. Could not find query: {:?}", errors) +} diff --git a/benchmarks/src/imdb/run.rs b/benchmarks/src/imdb/run.rs new file mode 100644 index 000000000000..697c79dc894a --- /dev/null +++ b/benchmarks/src/imdb/run.rs @@ -0,0 +1,827 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use std::path::PathBuf; +use std::sync::Arc; + +use super::{get_imdb_table_schema, get_query_sql, IMDB_TABLES}; +use crate::{BenchmarkRun, CommonOpt}; + +use arrow::record_batch::RecordBatch; +use arrow::util::pretty::{self, pretty_format_batches}; +use datafusion::datasource::file_format::csv::CsvFormat; +use datafusion::datasource::file_format::parquet::ParquetFormat; +use datafusion::datasource::file_format::FileFormat; +use datafusion::datasource::listing::{ + ListingOptions, ListingTable, ListingTableConfig, ListingTableUrl, +}; +use datafusion::datasource::{MemTable, TableProvider}; +use datafusion::error::Result; +use datafusion::physical_plan::display::DisplayableExecutionPlan; +use datafusion::physical_plan::{collect, displayable}; +use datafusion::prelude::*; +use datafusion_common::instant::Instant; +use datafusion_common::{DEFAULT_CSV_EXTENSION, DEFAULT_PARQUET_EXTENSION}; + +use log::info; +use structopt::StructOpt; + +// hack to avoid `default_value is meaningless for bool` errors +type BoolDefaultTrue = bool; + +/// Run the imdb benchmark (a.k.a. JOB). +/// +/// This benchmarks is derived from the [Join Order Benchmark / JOB] proposed in paper [How Good Are Query Optimizers, Really?][1]. +/// The data and answers are downloaded from +/// [2] and [3]. +/// +/// [1]: https://www.vldb.org/pvldb/vol9/p204-leis.pdf +/// [2]: http://homepages.cwi.nl/~boncz/job/imdb.tgz +/// [3]: https://db.in.tum.de/~leis/qo/job.tgz + +#[derive(Debug, StructOpt, Clone)] +#[structopt(verbatim_doc_comment)] +pub struct RunOpt { + /// Query number. If not specified, runs all queries + #[structopt(short, long)] + query: Option, + + /// Common options + #[structopt(flatten)] + common: CommonOpt, + + /// Path to data files + #[structopt(parse(from_os_str), required = true, short = "p", long = "path")] + path: PathBuf, + + /// File format: `csv` or `parquet` + #[structopt(short = "f", long = "format", default_value = "csv")] + file_format: String, + + /// Load the data into a MemTable before executing the query + #[structopt(short = "m", long = "mem-table")] + mem_table: bool, + + /// Path to machine readable output file + #[structopt(parse(from_os_str), short = "o", long = "output")] + output_path: Option, + + /// Whether to disable collection of statistics (and cost based optimizations) or not. + #[structopt(short = "S", long = "disable-statistics")] + disable_statistics: bool, + + /// If true then hash join used, if false then sort merge join + /// True by default. + #[structopt(short = "j", long = "prefer_hash_join", default_value = "true")] + prefer_hash_join: BoolDefaultTrue, +} + +const IMDB_QUERY_START_ID: usize = 1; +const IMDB_QUERY_END_ID: usize = 113; + +fn map_query_id_to_str(query_id: usize) -> &'static str { + match query_id { + // 1 + 1 => "1a", + 2 => "1b", + 3 => "1c", + 4 => "1d", + + // 2 + 5 => "2a", + 6 => "2b", + 7 => "2c", + 8 => "2d", + + // 3 + 9 => "3a", + 10 => "3b", + 11 => "3c", + + // 4 + 12 => "4a", + 13 => "4b", + 14 => "4c", + + // 5 + 15 => "5a", + 16 => "5b", + 17 => "5c", + + // 6 + 18 => "6a", + 19 => "6b", + 20 => "6c", + 21 => "6d", + 22 => "6e", + 23 => "6f", + + // 7 + 24 => "7a", + 25 => "7b", + 26 => "7c", + + // 8 + 27 => "8a", + 28 => "8b", + 29 => "8c", + 30 => "8d", + + // 9 + 31 => "9a", + 32 => "9b", + 33 => "9c", + 34 => "9d", + + // 10 + 35 => "10a", + 36 => "10b", + 37 => "10c", + + // 11 + 38 => "11a", + 39 => "11b", + 40 => "11c", + 41 => "11d", + + // 12 + 42 => "12a", + 43 => "12b", + 44 => "12c", + + // 13 + 45 => "13a", + 46 => "13b", + 47 => "13c", + 48 => "13d", + + // 14 + 49 => "14a", + 50 => "14b", + 51 => "14c", + + // 15 + 52 => "15a", + 53 => "15b", + 54 => "15c", + 55 => "15d", + + // 16 + 56 => "16a", + 57 => "16b", + 58 => "16c", + 59 => "16d", + + // 17 + 60 => "17a", + 61 => "17b", + 62 => "17c", + 63 => "17d", + 64 => "17e", + 65 => "17f", + + // 18 + 66 => "18a", + 67 => "18b", + 68 => "18c", + + // 19 + 69 => "19a", + 70 => "19b", + 71 => "19c", + 72 => "19d", + + // 20 + 73 => "20a", + 74 => "20b", + 75 => "20c", + + // 21 + 76 => "21a", + 77 => "21b", + 78 => "21c", + + // 22 + 79 => "22a", + 80 => "22b", + 81 => "22c", + 82 => "22d", + + // 23 + 83 => "23a", + 84 => "23b", + 85 => "23c", + + // 24 + 86 => "24a", + 87 => "24b", + + // 25 + 88 => "25a", + 89 => "25b", + 90 => "25c", + + // 26 + 91 => "26a", + 92 => "26b", + 93 => "26c", + + // 27 + 94 => "27a", + 95 => "27b", + 96 => "27c", + + // 28 + 97 => "28a", + 98 => "28b", + 99 => "28c", + + // 29 + 100 => "29a", + 101 => "29b", + 102 => "29c", + + // 30 + 103 => "30a", + 104 => "30b", + 105 => "30c", + + // 31 + 106 => "31a", + 107 => "31b", + 108 => "31c", + + // 32 + 109 => "32a", + 110 => "32b", + + // 33 + 111 => "33a", + 112 => "33b", + 113 => "33c", + + // Fallback for unknown query_id + _ => "unknown", + } +} + +impl RunOpt { + pub async fn run(self) -> Result<()> { + println!("Running benchmarks with the following options: {self:?}"); + let query_range = match self.query { + Some(query_id) => query_id..=query_id, + None => IMDB_QUERY_START_ID..=IMDB_QUERY_END_ID, + }; + + let mut benchmark_run = BenchmarkRun::new(); + for query_id in query_range { + benchmark_run.start_new_case(&format!("Query {query_id}")); + let query_run = self.benchmark_query(query_id).await?; + for iter in query_run { + benchmark_run.write_iter(iter.elapsed, iter.row_count); + } + } + benchmark_run.maybe_write_json(self.output_path.as_ref())?; + Ok(()) + } + + async fn benchmark_query(&self, query_id: usize) -> Result> { + let mut config = self + .common + .config() + .with_collect_statistics(!self.disable_statistics); + config.options_mut().optimizer.prefer_hash_join = self.prefer_hash_join; + config + .options_mut() + .execution + .parquet + .schema_force_view_types = self.common.force_view_types; + let ctx = SessionContext::new_with_config(config); + + // register tables + self.register_tables(&ctx).await?; + + let mut millis = vec![]; + // run benchmark + let mut query_results = vec![]; + for i in 0..self.iterations() { + let start = Instant::now(); + + let query_id_str = map_query_id_to_str(query_id); + let sql = &get_query_sql(query_id_str)?; + + let mut result = vec![]; + + for query in sql { + result = self.execute_query(&ctx, query).await?; + } + + let elapsed = start.elapsed(); //.as_secs_f64() * 1000.0; + let ms = elapsed.as_secs_f64() * 1000.0; + millis.push(ms); + info!("output:\n\n{}\n\n", pretty_format_batches(&result)?); + let row_count = result.iter().map(|b| b.num_rows()).sum(); + println!( + "Query {query_id} iteration {i} took {ms:.1} ms and returned {row_count} rows" + ); + query_results.push(QueryResult { elapsed, row_count }); + } + + let avg = millis.iter().sum::() / millis.len() as f64; + println!("Query {query_id} avg time: {avg:.2} ms"); + + Ok(query_results) + } + + async fn register_tables(&self, ctx: &SessionContext) -> Result<()> { + for table in IMDB_TABLES { + let table_provider = { self.get_table(ctx, table).await? }; + + if self.mem_table { + println!("Loading table '{table}' into memory"); + let start = Instant::now(); + let memtable = + MemTable::load(table_provider, Some(self.partitions()), &ctx.state()) + .await?; + println!( + "Loaded table '{}' into memory in {} ms", + table, + start.elapsed().as_millis() + ); + ctx.register_table(*table, Arc::new(memtable))?; + } else { + ctx.register_table(*table, table_provider)?; + } + } + Ok(()) + } + + async fn execute_query( + &self, + ctx: &SessionContext, + sql: &str, + ) -> Result> { + let debug = self.common.debug; + let plan = ctx.sql(sql).await?; + let (state, plan) = plan.into_parts(); + + if debug { + println!("=== Logical plan ===\n{plan}\n"); + } + + let plan = state.optimize(&plan)?; + if debug { + println!("=== Optimized logical plan ===\n{plan}\n"); + } + let physical_plan = state.create_physical_plan(&plan).await?; + if debug { + println!( + "=== Physical plan ===\n{}\n", + displayable(physical_plan.as_ref()).indent(true) + ); + } + let result = collect(physical_plan.clone(), state.task_ctx()).await?; + if debug { + println!( + "=== Physical plan with metrics ===\n{}\n", + DisplayableExecutionPlan::with_metrics(physical_plan.as_ref()) + .indent(true) + ); + if !result.is_empty() { + // do not call print_batches if there are no batches as the result is confusing + // and makes it look like there is a batch with no columns + pretty::print_batches(&result)?; + } + } + Ok(result) + } + + async fn get_table( + &self, + ctx: &SessionContext, + table: &str, + ) -> Result> { + let path = self.path.to_str().unwrap(); + let table_format = self.file_format.as_str(); + + // Obtain a snapshot of the SessionState + let state = ctx.state(); + let (format, path, extension): (Arc, String, &'static str) = + match table_format { + // dbgen creates .tbl ('|' delimited) files without header + "tbl" => { + let path = format!("{path}/{table}.tbl"); + + let format = CsvFormat::default() + .with_delimiter(b'|') + .with_has_header(false); + + (Arc::new(format), path, ".tbl") + } + "csv" => { + let path = format!("{path}/{table}.csv"); + let format = CsvFormat::default() + .with_delimiter(b',') + .with_escape(Some(b'\\')) + .with_has_header(false); + + (Arc::new(format), path, DEFAULT_CSV_EXTENSION) + } + "parquet" => { + let path = format!("{path}/{table}.parquet"); + let format = ParquetFormat::default() + .with_options(ctx.state().table_options().parquet.clone()); + (Arc::new(format), path, DEFAULT_PARQUET_EXTENSION) + } + other => { + unimplemented!("Invalid file format '{}'", other); + } + }; + + let options = ListingOptions::new(format) + .with_file_extension(extension) + .with_collect_stat(state.config().collect_statistics()); + + let table_path = ListingTableUrl::parse(path)?; + let config = ListingTableConfig::new(table_path).with_listing_options(options); + let config = match table_format { + "parquet" => config.with_schema(Arc::new(get_imdb_table_schema(table))), + "csv" => config.with_schema(Arc::new(get_imdb_table_schema(table))), + _ => unreachable!(), + }; + + Ok(Arc::new(ListingTable::try_new(config)?)) + } + + fn iterations(&self) -> usize { + self.common.iterations + } + + fn partitions(&self) -> usize { + self.common.partitions.unwrap_or(num_cpus::get()) + } +} + +struct QueryResult { + elapsed: std::time::Duration, + row_count: usize, +} + +#[cfg(test)] +// Only run with "ci" mode when we have the data +#[cfg(feature = "ci")] +mod tests { + use std::path::Path; + + use super::*; + + use datafusion::common::exec_err; + use datafusion::error::Result; + use datafusion_proto::bytes::{ + logical_plan_from_bytes, logical_plan_to_bytes, physical_plan_from_bytes, + physical_plan_to_bytes, + }; + + fn get_imdb_data_path() -> Result { + let path = + std::env::var("IMDB_DATA").unwrap_or_else(|_| "benchmarks/data".to_string()); + if !Path::new(&path).exists() { + return exec_err!( + "Benchmark data not found (set IMDB_DATA env var to override): {}", + path + ); + } + Ok(path) + } + + async fn round_trip_logical_plan(query: usize) -> Result<()> { + let ctx = SessionContext::default(); + let path = get_imdb_data_path()?; + let common = CommonOpt { + iterations: 1, + partitions: Some(2), + batch_size: 8192, + debug: false, + force_view_types: false, + }; + let opt = RunOpt { + query: Some(query), + common, + path: PathBuf::from(path.to_string()), + file_format: "parquet".to_string(), + mem_table: false, + output_path: None, + disable_statistics: false, + prefer_hash_join: true, + }; + opt.register_tables(&ctx).await?; + let queries = get_query_sql(map_query_id_to_str(query))?; + for query in queries { + let plan = ctx.sql(&query).await?; + let plan = plan.into_optimized_plan()?; + let bytes = logical_plan_to_bytes(&plan)?; + let plan2 = logical_plan_from_bytes(&bytes, &ctx)?; + let plan_formatted = format!("{}", plan.display_indent()); + let plan2_formatted = format!("{}", plan2.display_indent()); + assert_eq!(plan_formatted, plan2_formatted); + } + Ok(()) + } + + async fn round_trip_physical_plan(query: usize) -> Result<()> { + let ctx = SessionContext::default(); + let path = get_imdb_data_path()?; + let common = CommonOpt { + iterations: 1, + partitions: Some(2), + batch_size: 8192, + debug: false, + force_view_types: false, + }; + let opt = RunOpt { + query: Some(query), + common, + path: PathBuf::from(path.to_string()), + file_format: "parquet".to_string(), + mem_table: false, + output_path: None, + disable_statistics: false, + prefer_hash_join: true, + }; + opt.register_tables(&ctx).await?; + let queries = get_query_sql(map_query_id_to_str(query))?; + for query in queries { + let plan = ctx.sql(&query).await?; + let plan = plan.create_physical_plan().await?; + let bytes = physical_plan_to_bytes(plan.clone())?; + let plan2 = physical_plan_from_bytes(&bytes, &ctx)?; + let plan_formatted = format!("{}", displayable(plan.as_ref()).indent(false)); + let plan2_formatted = + format!("{}", displayable(plan2.as_ref()).indent(false)); + assert_eq!(plan_formatted, plan2_formatted); + } + Ok(()) + } + + macro_rules! test_round_trip_logical { + ($tn:ident, $query:expr) => { + #[tokio::test] + async fn $tn() -> Result<()> { + round_trip_logical_plan($query).await + } + }; + } + + macro_rules! test_round_trip_physical { + ($tn:ident, $query:expr) => { + #[tokio::test] + async fn $tn() -> Result<()> { + round_trip_physical_plan($query).await + } + }; + } + + // logical plan tests + test_round_trip_logical!(round_trip_logical_plan_1a, 1); + test_round_trip_logical!(round_trip_logical_plan_1b, 2); + test_round_trip_logical!(round_trip_logical_plan_1c, 3); + test_round_trip_logical!(round_trip_logical_plan_1d, 4); + test_round_trip_logical!(round_trip_logical_plan_2a, 5); + test_round_trip_logical!(round_trip_logical_plan_2b, 6); + test_round_trip_logical!(round_trip_logical_plan_2c, 7); + test_round_trip_logical!(round_trip_logical_plan_2d, 8); + test_round_trip_logical!(round_trip_logical_plan_3a, 9); + test_round_trip_logical!(round_trip_logical_plan_3b, 10); + test_round_trip_logical!(round_trip_logical_plan_3c, 11); + test_round_trip_logical!(round_trip_logical_plan_4a, 12); + test_round_trip_logical!(round_trip_logical_plan_4b, 13); + test_round_trip_logical!(round_trip_logical_plan_4c, 14); + test_round_trip_logical!(round_trip_logical_plan_5a, 15); + test_round_trip_logical!(round_trip_logical_plan_5b, 16); + test_round_trip_logical!(round_trip_logical_plan_5c, 17); + test_round_trip_logical!(round_trip_logical_plan_6a, 18); + test_round_trip_logical!(round_trip_logical_plan_6b, 19); + test_round_trip_logical!(round_trip_logical_plan_6c, 20); + test_round_trip_logical!(round_trip_logical_plan_6d, 21); + test_round_trip_logical!(round_trip_logical_plan_6e, 22); + test_round_trip_logical!(round_trip_logical_plan_6f, 23); + test_round_trip_logical!(round_trip_logical_plan_7a, 24); + test_round_trip_logical!(round_trip_logical_plan_7b, 25); + test_round_trip_logical!(round_trip_logical_plan_7c, 26); + test_round_trip_logical!(round_trip_logical_plan_8a, 27); + test_round_trip_logical!(round_trip_logical_plan_8b, 28); + test_round_trip_logical!(round_trip_logical_plan_8c, 29); + test_round_trip_logical!(round_trip_logical_plan_8d, 30); + test_round_trip_logical!(round_trip_logical_plan_9a, 31); + test_round_trip_logical!(round_trip_logical_plan_9b, 32); + test_round_trip_logical!(round_trip_logical_plan_9c, 33); + test_round_trip_logical!(round_trip_logical_plan_9d, 34); + test_round_trip_logical!(round_trip_logical_plan_10a, 35); + test_round_trip_logical!(round_trip_logical_plan_10b, 36); + test_round_trip_logical!(round_trip_logical_plan_10c, 37); + test_round_trip_logical!(round_trip_logical_plan_11a, 38); + test_round_trip_logical!(round_trip_logical_plan_11b, 39); + test_round_trip_logical!(round_trip_logical_plan_11c, 40); + test_round_trip_logical!(round_trip_logical_plan_11d, 41); + test_round_trip_logical!(round_trip_logical_plan_12a, 42); + test_round_trip_logical!(round_trip_logical_plan_12b, 43); + test_round_trip_logical!(round_trip_logical_plan_12c, 44); + test_round_trip_logical!(round_trip_logical_plan_13a, 45); + test_round_trip_logical!(round_trip_logical_plan_13b, 46); + test_round_trip_logical!(round_trip_logical_plan_13c, 47); + test_round_trip_logical!(round_trip_logical_plan_13d, 48); + test_round_trip_logical!(round_trip_logical_plan_14a, 49); + test_round_trip_logical!(round_trip_logical_plan_14b, 50); + test_round_trip_logical!(round_trip_logical_plan_14c, 51); + test_round_trip_logical!(round_trip_logical_plan_15a, 52); + test_round_trip_logical!(round_trip_logical_plan_15b, 53); + test_round_trip_logical!(round_trip_logical_plan_15c, 54); + test_round_trip_logical!(round_trip_logical_plan_15d, 55); + test_round_trip_logical!(round_trip_logical_plan_16a, 56); + test_round_trip_logical!(round_trip_logical_plan_16b, 57); + test_round_trip_logical!(round_trip_logical_plan_16c, 58); + test_round_trip_logical!(round_trip_logical_plan_16d, 59); + test_round_trip_logical!(round_trip_logical_plan_17a, 60); + test_round_trip_logical!(round_trip_logical_plan_17b, 61); + test_round_trip_logical!(round_trip_logical_plan_17c, 62); + test_round_trip_logical!(round_trip_logical_plan_17d, 63); + test_round_trip_logical!(round_trip_logical_plan_17e, 64); + test_round_trip_logical!(round_trip_logical_plan_17f, 65); + test_round_trip_logical!(round_trip_logical_plan_18a, 66); + test_round_trip_logical!(round_trip_logical_plan_18b, 67); + test_round_trip_logical!(round_trip_logical_plan_18c, 68); + test_round_trip_logical!(round_trip_logical_plan_19a, 69); + test_round_trip_logical!(round_trip_logical_plan_19b, 70); + test_round_trip_logical!(round_trip_logical_plan_19c, 71); + test_round_trip_logical!(round_trip_logical_plan_19d, 72); + test_round_trip_logical!(round_trip_logical_plan_20a, 73); + test_round_trip_logical!(round_trip_logical_plan_20b, 74); + test_round_trip_logical!(round_trip_logical_plan_20c, 75); + test_round_trip_logical!(round_trip_logical_plan_21a, 76); + test_round_trip_logical!(round_trip_logical_plan_21b, 77); + test_round_trip_logical!(round_trip_logical_plan_21c, 78); + test_round_trip_logical!(round_trip_logical_plan_22a, 79); + test_round_trip_logical!(round_trip_logical_plan_22b, 80); + test_round_trip_logical!(round_trip_logical_plan_22c, 81); + test_round_trip_logical!(round_trip_logical_plan_22d, 82); + test_round_trip_logical!(round_trip_logical_plan_23a, 83); + test_round_trip_logical!(round_trip_logical_plan_23b, 84); + test_round_trip_logical!(round_trip_logical_plan_23c, 85); + test_round_trip_logical!(round_trip_logical_plan_24a, 86); + test_round_trip_logical!(round_trip_logical_plan_24b, 87); + test_round_trip_logical!(round_trip_logical_plan_25a, 88); + test_round_trip_logical!(round_trip_logical_plan_25b, 89); + test_round_trip_logical!(round_trip_logical_plan_25c, 90); + test_round_trip_logical!(round_trip_logical_plan_26a, 91); + test_round_trip_logical!(round_trip_logical_plan_26b, 92); + test_round_trip_logical!(round_trip_logical_plan_26c, 93); + test_round_trip_logical!(round_trip_logical_plan_27a, 94); + test_round_trip_logical!(round_trip_logical_plan_27b, 95); + test_round_trip_logical!(round_trip_logical_plan_27c, 96); + test_round_trip_logical!(round_trip_logical_plan_28a, 97); + test_round_trip_logical!(round_trip_logical_plan_28b, 98); + test_round_trip_logical!(round_trip_logical_plan_28c, 99); + test_round_trip_logical!(round_trip_logical_plan_29a, 100); + test_round_trip_logical!(round_trip_logical_plan_29b, 101); + test_round_trip_logical!(round_trip_logical_plan_29c, 102); + test_round_trip_logical!(round_trip_logical_plan_30a, 103); + test_round_trip_logical!(round_trip_logical_plan_30b, 104); + test_round_trip_logical!(round_trip_logical_plan_30c, 105); + test_round_trip_logical!(round_trip_logical_plan_31a, 106); + test_round_trip_logical!(round_trip_logical_plan_31b, 107); + test_round_trip_logical!(round_trip_logical_plan_31c, 108); + test_round_trip_logical!(round_trip_logical_plan_32a, 109); + test_round_trip_logical!(round_trip_logical_plan_32b, 110); + test_round_trip_logical!(round_trip_logical_plan_33a, 111); + test_round_trip_logical!(round_trip_logical_plan_33b, 112); + test_round_trip_logical!(round_trip_logical_plan_33c, 113); + + // physical plan tests + test_round_trip_physical!(round_trip_physical_plan_1a, 1); + test_round_trip_physical!(round_trip_physical_plan_1b, 2); + test_round_trip_physical!(round_trip_physical_plan_1c, 3); + test_round_trip_physical!(round_trip_physical_plan_1d, 4); + test_round_trip_physical!(round_trip_physical_plan_2a, 5); + test_round_trip_physical!(round_trip_physical_plan_2b, 6); + test_round_trip_physical!(round_trip_physical_plan_2c, 7); + test_round_trip_physical!(round_trip_physical_plan_2d, 8); + test_round_trip_physical!(round_trip_physical_plan_3a, 9); + test_round_trip_physical!(round_trip_physical_plan_3b, 10); + test_round_trip_physical!(round_trip_physical_plan_3c, 11); + test_round_trip_physical!(round_trip_physical_plan_4a, 12); + test_round_trip_physical!(round_trip_physical_plan_4b, 13); + test_round_trip_physical!(round_trip_physical_plan_4c, 14); + test_round_trip_physical!(round_trip_physical_plan_5a, 15); + test_round_trip_physical!(round_trip_physical_plan_5b, 16); + test_round_trip_physical!(round_trip_physical_plan_5c, 17); + test_round_trip_physical!(round_trip_physical_plan_6a, 18); + test_round_trip_physical!(round_trip_physical_plan_6b, 19); + test_round_trip_physical!(round_trip_physical_plan_6c, 20); + test_round_trip_physical!(round_trip_physical_plan_6d, 21); + test_round_trip_physical!(round_trip_physical_plan_6e, 22); + test_round_trip_physical!(round_trip_physical_plan_6f, 23); + test_round_trip_physical!(round_trip_physical_plan_7a, 24); + test_round_trip_physical!(round_trip_physical_plan_7b, 25); + test_round_trip_physical!(round_trip_physical_plan_7c, 26); + test_round_trip_physical!(round_trip_physical_plan_8a, 27); + test_round_trip_physical!(round_trip_physical_plan_8b, 28); + test_round_trip_physical!(round_trip_physical_plan_8c, 29); + test_round_trip_physical!(round_trip_physical_plan_8d, 30); + test_round_trip_physical!(round_trip_physical_plan_9a, 31); + test_round_trip_physical!(round_trip_physical_plan_9b, 32); + test_round_trip_physical!(round_trip_physical_plan_9c, 33); + test_round_trip_physical!(round_trip_physical_plan_9d, 34); + test_round_trip_physical!(round_trip_physical_plan_10a, 35); + test_round_trip_physical!(round_trip_physical_plan_10b, 36); + test_round_trip_physical!(round_trip_physical_plan_10c, 37); + test_round_trip_physical!(round_trip_physical_plan_11a, 38); + test_round_trip_physical!(round_trip_physical_plan_11b, 39); + test_round_trip_physical!(round_trip_physical_plan_11c, 40); + test_round_trip_physical!(round_trip_physical_plan_11d, 41); + test_round_trip_physical!(round_trip_physical_plan_12a, 42); + test_round_trip_physical!(round_trip_physical_plan_12b, 43); + test_round_trip_physical!(round_trip_physical_plan_12c, 44); + test_round_trip_physical!(round_trip_physical_plan_13a, 45); + test_round_trip_physical!(round_trip_physical_plan_13b, 46); + test_round_trip_physical!(round_trip_physical_plan_13c, 47); + test_round_trip_physical!(round_trip_physical_plan_13d, 48); + test_round_trip_physical!(round_trip_physical_plan_14a, 49); + test_round_trip_physical!(round_trip_physical_plan_14b, 50); + test_round_trip_physical!(round_trip_physical_plan_14c, 51); + test_round_trip_physical!(round_trip_physical_plan_15a, 52); + test_round_trip_physical!(round_trip_physical_plan_15b, 53); + test_round_trip_physical!(round_trip_physical_plan_15c, 54); + test_round_trip_physical!(round_trip_physical_plan_15d, 55); + test_round_trip_physical!(round_trip_physical_plan_16a, 56); + test_round_trip_physical!(round_trip_physical_plan_16b, 57); + test_round_trip_physical!(round_trip_physical_plan_16c, 58); + test_round_trip_physical!(round_trip_physical_plan_16d, 59); + test_round_trip_physical!(round_trip_physical_plan_17a, 60); + test_round_trip_physical!(round_trip_physical_plan_17b, 61); + test_round_trip_physical!(round_trip_physical_plan_17c, 62); + test_round_trip_physical!(round_trip_physical_plan_17d, 63); + test_round_trip_physical!(round_trip_physical_plan_17e, 64); + test_round_trip_physical!(round_trip_physical_plan_17f, 65); + test_round_trip_physical!(round_trip_physical_plan_18a, 66); + test_round_trip_physical!(round_trip_physical_plan_18b, 67); + test_round_trip_physical!(round_trip_physical_plan_18c, 68); + test_round_trip_physical!(round_trip_physical_plan_19a, 69); + test_round_trip_physical!(round_trip_physical_plan_19b, 70); + test_round_trip_physical!(round_trip_physical_plan_19c, 71); + test_round_trip_physical!(round_trip_physical_plan_19d, 72); + test_round_trip_physical!(round_trip_physical_plan_20a, 73); + test_round_trip_physical!(round_trip_physical_plan_20b, 74); + test_round_trip_physical!(round_trip_physical_plan_20c, 75); + test_round_trip_physical!(round_trip_physical_plan_21a, 76); + test_round_trip_physical!(round_trip_physical_plan_21b, 77); + test_round_trip_physical!(round_trip_physical_plan_21c, 78); + test_round_trip_physical!(round_trip_physical_plan_22a, 79); + test_round_trip_physical!(round_trip_physical_plan_22b, 80); + test_round_trip_physical!(round_trip_physical_plan_22c, 81); + test_round_trip_physical!(round_trip_physical_plan_22d, 82); + test_round_trip_physical!(round_trip_physical_plan_23a, 83); + test_round_trip_physical!(round_trip_physical_plan_23b, 84); + test_round_trip_physical!(round_trip_physical_plan_23c, 85); + test_round_trip_physical!(round_trip_physical_plan_24a, 86); + test_round_trip_physical!(round_trip_physical_plan_24b, 87); + test_round_trip_physical!(round_trip_physical_plan_25a, 88); + test_round_trip_physical!(round_trip_physical_plan_25b, 89); + test_round_trip_physical!(round_trip_physical_plan_25c, 90); + test_round_trip_physical!(round_trip_physical_plan_26a, 91); + test_round_trip_physical!(round_trip_physical_plan_26b, 92); + test_round_trip_physical!(round_trip_physical_plan_26c, 93); + test_round_trip_physical!(round_trip_physical_plan_27a, 94); + test_round_trip_physical!(round_trip_physical_plan_27b, 95); + test_round_trip_physical!(round_trip_physical_plan_27c, 96); + test_round_trip_physical!(round_trip_physical_plan_28a, 97); + test_round_trip_physical!(round_trip_physical_plan_28b, 98); + test_round_trip_physical!(round_trip_physical_plan_28c, 99); + test_round_trip_physical!(round_trip_physical_plan_29a, 100); + test_round_trip_physical!(round_trip_physical_plan_29b, 101); + test_round_trip_physical!(round_trip_physical_plan_29c, 102); + test_round_trip_physical!(round_trip_physical_plan_30a, 103); + test_round_trip_physical!(round_trip_physical_plan_30b, 104); + test_round_trip_physical!(round_trip_physical_plan_30c, 105); + test_round_trip_physical!(round_trip_physical_plan_31a, 106); + test_round_trip_physical!(round_trip_physical_plan_31b, 107); + test_round_trip_physical!(round_trip_physical_plan_31c, 108); + test_round_trip_physical!(round_trip_physical_plan_32a, 109); + test_round_trip_physical!(round_trip_physical_plan_32b, 110); + test_round_trip_physical!(round_trip_physical_plan_33a, 111); + test_round_trip_physical!(round_trip_physical_plan_33b, 112); + test_round_trip_physical!(round_trip_physical_plan_33c, 113); +} diff --git a/datafusion-cli/Cargo.lock b/datafusion-cli/Cargo.lock index fbe7d5c04b9b..8a6ccacbb380 100644 --- a/datafusion-cli/Cargo.lock +++ b/datafusion-cli/Cargo.lock @@ -4,9 +4,9 @@ version = 3 [[package]] name = "addr2line" -version = "0.24.1" +version = "0.24.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f5fb1d8e4442bd405fdfd1dacb42792696b0cf9cb15882e5d097b742a676d375" +checksum = "dfbe277e56a376000877090da837660b4427aad530e3028d44e0bffe4f89a1c1" dependencies = [ "gimli", ] @@ -173,9 +173,9 @@ checksum = "7c02d123df017efcdfbd739ef81735b36c5ba83ec3c59c80a9d7ecc718f92e50" [[package]] name = "arrow" -version = "53.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "45aef0d9cf9a039bf6cd1acc451b137aca819977b0928dece52bd92811b640ba" +checksum = "a9ba0d7248932f4e2a12fb37f0a2e3ec82b3bdedbac2a1dce186e036843b8f8c" dependencies = [ "arrow-arith", "arrow-array", @@ -194,9 +194,9 @@ dependencies = [ [[package]] name = "arrow-arith" -version = "53.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "03675e42d1560790f3524800e41403b40d0da1c793fe9528929fde06d8c7649a" +checksum = "d60afcdc004841a5c8d8da4f4fa22d64eb19c0c01ef4bcedd77f175a7cf6e38f" dependencies = [ "arrow-array", "arrow-buffer", @@ -209,9 +209,9 @@ dependencies = [ [[package]] name = "arrow-array" -version = "53.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cd2bf348cf9f02a5975c5962c7fa6dee107a2009a7b41ac5fb1a027e12dc033f" +checksum = "7f16835e8599dbbb1659fd869d865254c4cf32c6c2bb60b6942ac9fc36bfa5da" dependencies = [ "ahash", "arrow-buffer", @@ -220,15 +220,15 @@ dependencies = [ "chrono", "chrono-tz", "half", - "hashbrown", + "hashbrown 0.14.5", "num", ] [[package]] name = "arrow-buffer" -version = "53.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3092e37715f168976012ce52273c3989b5793b0db5f06cbaa246be25e5f0924d" +checksum = "1a1f34f0faae77da6b142db61deba2cb6d60167592b178be317b341440acba80" dependencies = [ "bytes", "half", @@ -237,9 +237,9 @@ dependencies = [ [[package]] name = "arrow-cast" -version = "53.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7ce1018bb710d502f9db06af026ed3561552e493e989a79d0d0f5d9cf267a785" +checksum = "450e4abb5775bca0740bec0bcf1b1a5ae07eff43bd625661c4436d8e8e4540c4" dependencies = [ "arrow-array", "arrow-buffer", @@ -258,9 +258,9 @@ dependencies = [ [[package]] name = "arrow-csv" -version = "53.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fd178575f45624d045e4ebee714e246a05d9652e41363ee3f57ec18cca97f740" +checksum = "d3a4e4d63830a341713e35d9a42452fbc6241d5f42fa5cf6a4681b8ad91370c4" dependencies = [ "arrow-array", "arrow-buffer", @@ -277,9 +277,9 @@ dependencies = [ [[package]] name = "arrow-data" -version = "53.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4e4ac0c4ee79150afe067dc4857154b3ee9c1cd52b5f40d59a77306d0ed18d65" +checksum = "2b1e618bbf714c7a9e8d97203c806734f012ff71ae3adc8ad1b075689f540634" dependencies = [ "arrow-buffer", "arrow-schema", @@ -289,9 +289,9 @@ dependencies = [ [[package]] name = "arrow-ipc" -version = "53.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bb307482348a1267f91b0912e962cd53440e5de0f7fb24c5f7b10da70b38c94a" +checksum = "f98e983549259a2b97049af7edfb8f28b8911682040e99a94e4ceb1196bd65c2" dependencies = [ "arrow-array", "arrow-buffer", @@ -304,9 +304,9 @@ dependencies = [ [[package]] name = "arrow-json" -version = "53.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d24805ba326758effdd6f2cbdd482fcfab749544f21b134701add25b33f474e6" +checksum = "b198b9c6fcf086501730efbbcb483317b39330a116125af7bb06467d04b352a3" dependencies = [ "arrow-array", "arrow-buffer", @@ -324,9 +324,9 @@ dependencies = [ [[package]] name = "arrow-ord" -version = "53.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "644046c479d80ae8ed02a7f1e1399072ea344ca6a7b0e293ab2d5d9ed924aa3b" +checksum = "2427f37b4459a4b9e533045abe87a5183a5e0995a3fc2c2fd45027ae2cc4ef3f" dependencies = [ "arrow-array", "arrow-buffer", @@ -339,9 +339,9 @@ dependencies = [ [[package]] name = "arrow-row" -version = "53.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a29791f8eb13b340ce35525b723f5f0df17ecb955599e11f65c2a94ab34e2efb" +checksum = "15959657d92e2261a7a323517640af87f5afd9fd8a6492e424ebee2203c567f6" dependencies = [ "ahash", "arrow-array", @@ -353,15 +353,15 @@ dependencies = [ [[package]] name = "arrow-schema" -version = "53.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c85320a3a2facf2b2822b57aa9d6d9d55edb8aee0b6b5d3b8df158e503d10858" +checksum = "fbf0388a18fd7f7f3fe3de01852d30f54ed5182f9004db700fbe3ba843ed2794" [[package]] name = "arrow-select" -version = "53.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9cc7e6b582e23855fd1625ce46e51647aa440c20ea2e71b1d748e0839dd73cba" +checksum = "b83e5723d307a38bf00ecd2972cd078d1339c7fd3eb044f609958a9a24463f3a" dependencies = [ "ahash", "arrow-array", @@ -373,9 +373,9 @@ dependencies = [ [[package]] name = "arrow-string" -version = "53.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0775b6567c66e56ded19b87a954b6b1beffbdd784ef95a3a2b03f59570c1d230" +checksum = "7ab3db7c09dd826e74079661d84ed01ed06547cf75d52c2818ef776d0d852305" dependencies = [ "arrow-array", "arrow-buffer", @@ -406,9 +406,9 @@ dependencies = [ [[package]] name = "async-compression" -version = "0.4.12" +version = "0.4.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fec134f64e2bc57411226dfc4e52dec859ddfc7e711fc5e07b612584f000e4aa" +checksum = "7e614738943d3f68c628ae3dbce7c3daffb196665f82f8c8ea6b65de73c79429" dependencies = [ "bzip2", "flate2", @@ -424,9 +424,9 @@ dependencies = [ [[package]] name = "async-trait" -version = "0.1.82" +version = "0.1.83" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a27b8a3a6e1a44fa4c8baf1f653e4172e81486d4941f2237e20dc2d0cf4ddff1" +checksum = "721cae7de5c34fbb2acd27e21e6d2cf7b886dce0c27388d46c4e6c47ea4318dd" dependencies = [ "proc-macro2", "quote", @@ -450,15 +450,15 @@ checksum = "1505bd5d3d116872e7271a6d4e16d81d0c8570876c8de68093a09ac269d8aac0" [[package]] name = "autocfg" -version = "1.3.0" +version = "1.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0c4b4d0bd25bd0b74681c0ad21497610ce1b7c91b1022cd21c80c6fbdd9476b0" +checksum = "ace50bade8e6234aa140d9a2f552bbee1db4d353f69b8217bc503490fc1a9f26" [[package]] name = "aws-config" -version = "1.5.6" +version = "1.5.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "848d7b9b605720989929279fa644ce8f244d0ce3146fcca5b70e4eb7b3c020fc" +checksum = "7198e6f03240fdceba36656d8be440297b6b82270325908c7381f37d826a74f6" dependencies = [ "aws-credential-types", "aws-runtime", @@ -523,9 +523,9 @@ dependencies = [ [[package]] name = "aws-sdk-sso" -version = "1.43.0" +version = "1.45.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "70a9d27ed1c12b1140c47daf1bc541606c43fdafd918c4797d520db0043ceef2" +checksum = "e33ae899566f3d395cbf42858e433930682cc9c1889fa89318896082fef45efb" dependencies = [ "aws-credential-types", "aws-runtime", @@ -545,9 +545,9 @@ dependencies = [ [[package]] name = "aws-sdk-ssooidc" -version = "1.44.0" +version = "1.46.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "44514a6ca967686cde1e2a1b81df6ef1883d0e3e570da8d8bc5c491dcb6fc29b" +checksum = "f39c09e199ebd96b9f860b0fce4b6625f211e064ad7c8693b72ecf7ef03881e0" dependencies = [ "aws-credential-types", "aws-runtime", @@ -567,9 +567,9 @@ dependencies = [ [[package]] name = "aws-sdk-sts" -version = "1.43.0" +version = "1.45.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cd7a4d279762a35b9df97209f6808b95d4fe78547fe2316b4d200a0283960c5a" +checksum = "3d95f93a98130389eb6233b9d615249e543f6c24a68ca1f109af9ca5164a8765" dependencies = [ "aws-credential-types", "aws-runtime", @@ -707,9 +707,9 @@ dependencies = [ [[package]] name = "aws-smithy-types" -version = "1.2.6" +version = "1.2.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "03701449087215b5369c7ea17fef0dd5d24cb93439ec5af0c7615f58c3f22605" +checksum = "147100a7bea70fa20ef224a6bad700358305f5dc0f84649c53769761395b355b" dependencies = [ "base64-simd", "bytes", @@ -917,9 +917,9 @@ dependencies = [ [[package]] name = "cc" -version = "1.1.21" +version = "1.1.28" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "07b1695e2c7e8fc85310cde85aeaab7e3097f593c91d209d3f9df76c928100f0" +checksum = "2e80e3b6a3ab07840e1cae9b0666a63970dc28e8ed5ffbcdacbfc760c281bfc1" dependencies = [ "jobserver", "libc", @@ -953,9 +953,9 @@ dependencies = [ [[package]] name = "chrono-tz" -version = "0.9.0" +version = "0.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "93698b29de5e97ad0ae26447b344c482a7284c737d9ddc5f9e52b74a336671bb" +checksum = "cd6dd8046d00723a59a2f8c5f295c515b9bb9a331ee4f8f3d4dd49e428acd3b6" dependencies = [ "chrono", "chrono-tz-build", @@ -964,20 +964,19 @@ dependencies = [ [[package]] name = "chrono-tz-build" -version = "0.3.0" +version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0c088aee841df9c3041febbb73934cfc39708749bf96dc827e3359cd39ef11b1" +checksum = "e94fea34d77a245229e7746bd2beb786cd2a896f306ff491fb8cecb3074b10a7" dependencies = [ "parse-zoneinfo", - "phf", "phf_codegen", ] [[package]] name = "clap" -version = "4.5.17" +version = "4.5.19" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3e5a21b8495e732f1b3c364c9949b201ca7bae518c502c80256c96ad79eaf6ac" +checksum = "7be5744db7978a28d9df86a214130d106a89ce49644cbc4e3f0c22c3fba30615" dependencies = [ "clap_builder", "clap_derive", @@ -985,9 +984,9 @@ dependencies = [ [[package]] name = "clap_builder" -version = "4.5.17" +version = "4.5.19" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8cf2dd12af7a047ad9d6da2b6b249759a22a7abc0f474c1dae1777afa4b21a73" +checksum = "a5fbc17d3ef8278f55b282b2a2e75ae6f6c7d4bb70ed3d0382375104bfafdb4b" dependencies = [ "anstream", "anstyle", @@ -997,9 +996,9 @@ dependencies = [ [[package]] name = "clap_derive" -version = "4.5.13" +version = "4.5.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "501d359d5f3dcaf6ecdeee48833ae73ec6e42723a1e52419c79abf9507eec0a0" +checksum = "4ac6a0c7b1a9e9a5186361f67dfa1b88213572f427fb9ab038efb2bd8c582dab" dependencies = [ "heck 0.5.0", "proc-macro2", @@ -1175,7 +1174,7 @@ checksum = "5041cc499144891f3790297212f32a74fb938e5136a14943f338ef9e0ae276cf" dependencies = [ "cfg-if", "crossbeam-utils", - "hashbrown", + "hashbrown 0.14.5", "lock_api", "once_cell", "parking_lot_core", @@ -1216,7 +1215,7 @@ dependencies = [ "futures", "glob", "half", - "hashbrown", + "hashbrown 0.14.5", "indexmap", "itertools", "log", @@ -1293,7 +1292,7 @@ dependencies = [ "arrow-schema", "chrono", "half", - "hashbrown", + "hashbrown 0.14.5", "instant", "libc", "num_cpus", @@ -1322,7 +1321,7 @@ dependencies = [ "datafusion-common", "datafusion-expr", "futures", - "hashbrown", + "hashbrown 0.14.5", "log", "object_store", "parking_lot", @@ -1345,6 +1344,7 @@ dependencies = [ "datafusion-functions-aggregate-common", "datafusion-functions-window-common", "datafusion-physical-expr-common", + "indexmap", "paste", "serde_json", "sqlparser", @@ -1374,7 +1374,7 @@ dependencies = [ "datafusion-common", "datafusion-execution", "datafusion-expr", - "hashbrown", + "hashbrown 0.14.5", "hex", "itertools", "log", @@ -1400,9 +1400,9 @@ dependencies = [ "datafusion-physical-expr", "datafusion-physical-expr-common", "half", + "indexmap", "log", "paste", - "sqlparser", ] [[package]] @@ -1447,6 +1447,7 @@ dependencies = [ "datafusion-functions-window-common", "datafusion-physical-expr-common", "log", + "paste", ] [[package]] @@ -1466,7 +1467,7 @@ dependencies = [ "datafusion-common", "datafusion-expr", "datafusion-physical-expr", - "hashbrown", + "hashbrown 0.14.5", "indexmap", "itertools", "log", @@ -1494,7 +1495,7 @@ dependencies = [ "datafusion-functions-aggregate-common", "datafusion-physical-expr-common", "half", - "hashbrown", + "hashbrown 0.14.5", "hex", "indexmap", "itertools", @@ -1512,7 +1513,7 @@ dependencies = [ "arrow", "datafusion-common", "datafusion-expr-common", - "hashbrown", + "hashbrown 0.14.5", "rand", ] @@ -1551,7 +1552,7 @@ dependencies = [ "datafusion-physical-expr-common", "futures", "half", - "hashbrown", + "hashbrown 0.14.5", "indexmap", "itertools", "log", @@ -1722,9 +1723,9 @@ dependencies = [ [[package]] name = "flate2" -version = "1.0.33" +version = "1.0.34" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "324a1be68054ef05ad64b861cc9eaf1d623d2d8cb25b4bf2cb9cdd902b4bf253" +checksum = "a1b589b4dc103969ad3cf85c950899926ec64300a1a46d76c03a6072957036f0" dependencies = [ "crc32fast", "miniz_oxide", @@ -1756,9 +1757,9 @@ dependencies = [ [[package]] name = "futures" -version = "0.3.30" +version = "0.3.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "645c6916888f6cb6350d2550b80fb63e734897a8498abe35cfb732b6487804b0" +checksum = "65bc07b1a8bc7c85c5f2e110c476c7389b4554ba72af57d8445ea63a576b0876" dependencies = [ "futures-channel", "futures-core", @@ -1771,9 +1772,9 @@ dependencies = [ [[package]] name = "futures-channel" -version = "0.3.30" +version = "0.3.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "eac8f7d7865dcb88bd4373ab671c8cf4508703796caa2b1985a9ca867b3fcb78" +checksum = "2dff15bf788c671c1934e366d07e30c1814a8ef514e1af724a602e8a2fbe1b10" dependencies = [ "futures-core", "futures-sink", @@ -1781,15 +1782,15 @@ dependencies = [ [[package]] name = "futures-core" -version = "0.3.30" +version = "0.3.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dfc6580bb841c5a68e9ef15c77ccc837b40a7504914d52e47b8b0e9bbda25a1d" +checksum = "05f29059c0c2090612e8d742178b0580d2dc940c837851ad723096f87af6663e" [[package]] name = "futures-executor" -version = "0.3.30" +version = "0.3.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a576fc72ae164fca6b9db127eaa9a9dda0d61316034f33a0a0d4eda41f02b01d" +checksum = "1e28d1d997f585e54aebc3f97d39e72338912123a67330d723fdbb564d646c9f" dependencies = [ "futures-core", "futures-task", @@ -1798,15 +1799,15 @@ dependencies = [ [[package]] name = "futures-io" -version = "0.3.30" +version = "0.3.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a44623e20b9681a318efdd71c299b6b222ed6f231972bfe2f224ebad6311f0c1" +checksum = "9e5c1b78ca4aae1ac06c48a526a655760685149f0d465d21f37abfe57ce075c6" [[package]] name = "futures-macro" -version = "0.3.30" +version = "0.3.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "87750cf4b7a4c0625b1529e4c543c2182106e4dedc60a2a6455e00d212c489ac" +checksum = "162ee34ebcb7c64a8abebc059ce0fee27c2262618d7b60ed8faf72fef13c3650" dependencies = [ "proc-macro2", "quote", @@ -1815,15 +1816,15 @@ dependencies = [ [[package]] name = "futures-sink" -version = "0.3.30" +version = "0.3.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9fb8e00e87438d937621c1c6269e53f536c14d3fbd6a042bb24879e57d474fb5" +checksum = "e575fab7d1e0dcb8d0c7bcf9a63ee213816ab51902e6d244a95819acacf1d4f7" [[package]] name = "futures-task" -version = "0.3.30" +version = "0.3.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "38d84fa142264698cdce1a9f9172cf383a0c82de1bddcf3092901442c4097004" +checksum = "f90f7dce0722e95104fcb095585910c0977252f286e354b5e3bd38902cd99988" [[package]] name = "futures-timer" @@ -1833,9 +1834,9 @@ checksum = "f288b0a4f20f9a56b5d1da57e2227c661b7b16168e2f72365f57b63326e29b24" [[package]] name = "futures-util" -version = "0.3.30" +version = "0.3.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3d6401deb83407ab3da39eba7e33987a73c3df0c82b4bb5813ee871c19c41d48" +checksum = "9fa08315bb612088cc391249efdc3bc77536f16c91f6cf495e6fbe85b20a4a81" dependencies = [ "futures-channel", "futures-core", @@ -1872,9 +1873,9 @@ dependencies = [ [[package]] name = "gimli" -version = "0.31.0" +version = "0.31.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "32085ea23f3234fc7846555e85283ba4de91e21016dc0455a16286d87a292d64" +checksum = "07e28edb80900c19c28f1072f2e8aeca7fa06b23cd4169cefe1af5aa3260783f" [[package]] name = "glob" @@ -1941,6 +1942,12 @@ dependencies = [ "allocator-api2", ] +[[package]] +name = "hashbrown" +version = "0.15.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e087f84d4f86bf4b218b927129862374b72199ae7d8657835f1e89000eea4fb" + [[package]] name = "heck" version = "0.4.1" @@ -2041,9 +2048,9 @@ dependencies = [ [[package]] name = "httparse" -version = "1.9.4" +version = "1.9.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0fcc0b4a115bf80b728eb8ea024ad5bd707b615bfed49e0665b6e0f86fd082d9" +checksum = "7d71d3574edd2771538b901e6549113b4006ece66150fb69c0fb6d9a2adae946" [[package]] name = "httpdate" @@ -2127,7 +2134,7 @@ dependencies = [ "http 1.1.0", "hyper 1.4.1", "hyper-util", - "rustls 0.23.13", + "rustls 0.23.14", "rustls-native-certs 0.8.0", "rustls-pki-types", "tokio", @@ -2137,9 +2144,9 @@ dependencies = [ [[package]] name = "hyper-util" -version = "0.1.8" +version = "0.1.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "da62f120a8a37763efb0cf8fdf264b884c7b8b9ac8660b900c8661030c00e6ba" +checksum = "41296eb09f183ac68eec06e03cdbea2e759633d4067b2f6552fc2e009bcad08b" dependencies = [ "bytes", "futures-channel", @@ -2150,7 +2157,6 @@ dependencies = [ "pin-project-lite", "socket2", "tokio", - "tower", "tower-service", "tracing", ] @@ -2190,12 +2196,12 @@ dependencies = [ [[package]] name = "indexmap" -version = "2.5.0" +version = "2.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "68b900aa2f7301e21c36462b170ee99994de34dff39a4a6a528e80e7376d07e5" +checksum = "707907fe3c25f5424cce2cb7e1cbcafee6bdbe735ca90ef77c29e84591e5b9da" dependencies = [ "equivalent", - "hashbrown", + "hashbrown 0.15.0", ] [[package]] @@ -2218,9 +2224,9 @@ checksum = "8bb03732005da905c88227371639bf1ad885cc712789c011c31c5fb3ab3ccf02" [[package]] name = "ipnet" -version = "2.10.0" +version = "2.10.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "187674a687eed5fe42285b40c6291f9a01517d415fad1c3cbc6a9f778af7fcd4" +checksum = "ddc24109865250148c2e0f3d25d4f0f479571723792d3802153c60922a4fb708" [[package]] name = "is_terminal_polyfill" @@ -2269,9 +2275,9 @@ checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe" [[package]] name = "lexical-core" -version = "0.8.5" +version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2cde5de06e8d4c2faabc400238f9ae1c74d5412d03a7bd067645ccbc47070e46" +checksum = "0431c65b318a590c1de6b8fd6e72798c92291d27762d94c9e6c37ed7a73d8458" dependencies = [ "lexical-parse-float", "lexical-parse-integer", @@ -2282,9 +2288,9 @@ dependencies = [ [[package]] name = "lexical-parse-float" -version = "0.8.5" +version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "683b3a5ebd0130b8fb52ba0bdc718cc56815b6a097e28ae5a6997d0ad17dc05f" +checksum = "eb17a4bdb9b418051aa59d41d65b1c9be5affab314a872e5ad7f06231fb3b4e0" dependencies = [ "lexical-parse-integer", "lexical-util", @@ -2293,9 +2299,9 @@ dependencies = [ [[package]] name = "lexical-parse-integer" -version = "0.8.6" +version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6d0994485ed0c312f6d965766754ea177d07f9c00c9b82a5ee62ed5b47945ee9" +checksum = "5df98f4a4ab53bf8b175b363a34c7af608fe31f93cc1fb1bf07130622ca4ef61" dependencies = [ "lexical-util", "static_assertions", @@ -2303,18 +2309,18 @@ dependencies = [ [[package]] name = "lexical-util" -version = "0.8.5" +version = "1.0.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5255b9ff16ff898710eb9eb63cb39248ea8a5bb036bea8085b1a767ff6c4e3fc" +checksum = "85314db53332e5c192b6bca611fb10c114a80d1b831ddac0af1e9be1b9232ca0" dependencies = [ "static_assertions", ] [[package]] name = "lexical-write-float" -version = "0.8.5" +version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "accabaa1c4581f05a3923d1b4cfd124c329352288b7b9da09e766b0668116862" +checksum = "6e7c3ad4e37db81c1cbe7cf34610340adc09c322871972f74877a712abc6c809" dependencies = [ "lexical-util", "lexical-write-integer", @@ -2323,9 +2329,9 @@ dependencies = [ [[package]] name = "lexical-write-integer" -version = "0.8.5" +version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e1b6f3d1f4422866b68192d62f77bc5c700bee84f3069f2469d7bc8c77852446" +checksum = "eb89e9f6958b83258afa3deed90b5de9ef68eef090ad5086c791cd2345610162" dependencies = [ "lexical-util", "static_assertions", @@ -2333,9 +2339,9 @@ dependencies = [ [[package]] name = "libc" -version = "0.2.158" +version = "0.2.159" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d8adc4bb1803a324070e64a98ae98f38934d91957a99cfb3a43dcbc01bc56439" +checksum = "561d97a539a36e26a9a5fad1ea11a3039a67714694aaa379433e580854bc3dc5" [[package]] name = "libflate" @@ -2357,7 +2363,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e6e0d73b369f386f1c44abd9c570d5318f55ccde816ff4b562fa452e5182863d" dependencies = [ "core2", - "hashbrown", + "hashbrown 0.14.5", "rle-decode-fast", ] @@ -2600,9 +2606,9 @@ dependencies = [ [[package]] name = "object" -version = "0.36.4" +version = "0.36.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "084f1a5821ac4c651660a94a7153d27ac9d8a53736203f58b31945ded098070a" +checksum = "aedf0a2d09c573ed1d8d85b30c119153926a2b36dce0ab28322c09a117a4683e" dependencies = [ "memchr", ] @@ -2628,7 +2634,7 @@ dependencies = [ "rand", "reqwest", "ring", - "rustls-pemfile 2.1.3", + "rustls-pemfile 2.2.0", "serde", "serde_json", "snafu", @@ -2640,9 +2646,9 @@ dependencies = [ [[package]] name = "once_cell" -version = "1.19.0" +version = "1.20.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3fdb12b2476b595f9358c5161aa467c2438859caa136dec86c26fdd2efe17b92" +checksum = "1261fe7e33c73b354eab43b1273a57c8f967d0391e80353e51f764ac02cf6775" [[package]] name = "openssl-probe" @@ -2696,9 +2702,9 @@ dependencies = [ [[package]] name = "parquet" -version = "53.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f0fbf928021131daaa57d334ca8e3904fe9ae22f73c56244fc7db9b04eedc3d8" +checksum = "310c46a70a3ba90d98fec39fa2da6d9d731e544191da6fb56c9d199484d0dd3e" dependencies = [ "ahash", "arrow-array", @@ -2715,7 +2721,7 @@ dependencies = [ "flate2", "futures", "half", - "hashbrown", + "hashbrown 0.14.5", "lz4_flex", "num", "num-bigint", @@ -2799,26 +2805,6 @@ dependencies = [ "siphasher", ] -[[package]] -name = "pin-project" -version = "1.1.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b6bf43b791c5b9e34c3d182969b4abb522f9343702850a2e57f460d00d09b4b3" -dependencies = [ - "pin-project-internal", -] - -[[package]] -name = "pin-project-internal" -version = "1.1.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2f38a4412a78282e09a2cf38d195ea5420d15ba0602cb375210efbc877243965" -dependencies = [ - "proc-macro2", - "quote", - "syn", -] - [[package]] name = "pin-project-lite" version = "0.2.14" @@ -2833,9 +2819,9 @@ checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" [[package]] name = "pkg-config" -version = "0.3.30" +version = "0.3.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d231b230927b5e4ad203db57bbcbee2802f6bce620b1e4a9024a07d94e2907ec" +checksum = "953ec861398dccce10c670dfeaf3ec4911ca479e9c02154b3a215178c5f566f2" [[package]] name = "powerfmt" @@ -2908,9 +2894,9 @@ checksum = "b76f1009795ca44bb5aaae8fd3f18953e209259c33d9b059b1f53d58ab7511db" [[package]] name = "quick-xml" -version = "0.36.1" +version = "0.36.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "96a05e2e8efddfa51a84ca47cec303fac86c8541b686d37cac5efc0e094417bc" +checksum = "f7649a7b4df05aed9ea7ec6f628c67c9953a43869b8bc50929569b2999d443fe" dependencies = [ "memchr", "serde", @@ -2927,7 +2913,7 @@ dependencies = [ "quinn-proto", "quinn-udp", "rustc-hash", - "rustls 0.23.13", + "rustls 0.23.14", "socket2", "thiserror", "tokio", @@ -2944,7 +2930,7 @@ dependencies = [ "rand", "ring", "rustc-hash", - "rustls 0.23.13", + "rustls 0.23.14", "slab", "thiserror", "tinyvec", @@ -3015,9 +3001,9 @@ dependencies = [ [[package]] name = "redox_syscall" -version = "0.5.4" +version = "0.5.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0884ad60e090bf1345b93da0a5de8923c93884cd03f40dfcfddd3b4bee661853" +checksum = "9b6dfecf2c74bce2466cabf93f6664d6998a69eb21e39f4207930065b27b771f" dependencies = [ "bitflags 2.6.0", ] @@ -3035,9 +3021,9 @@ dependencies = [ [[package]] name = "regex" -version = "1.10.6" +version = "1.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4219d74c6b67a3654a9fbebc4b419e22126d13d2f3c4a07ee0cb61ff79a79619" +checksum = "38200e5ee88914975b69f657f0801b6f6dccafd44fd9326302a4aaeecfacb1d8" dependencies = [ "aho-corasick", "memchr", @@ -3047,9 +3033,9 @@ dependencies = [ [[package]] name = "regex-automata" -version = "0.4.7" +version = "0.4.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "38caf58cc5ef2fed281f89292ef23f6365465ed9a41b7a7754eb4e26496c92df" +checksum = "368758f23274712b504848e9d5a6f010445cc8b87a7cdb4d7cbee666c1288da3" dependencies = [ "aho-corasick", "memchr", @@ -3064,9 +3050,9 @@ checksum = "53a49587ad06b26609c52e423de037e7f57f20d53535d66e08c695f347df952a" [[package]] name = "regex-syntax" -version = "0.8.4" +version = "0.8.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7a66a03ae7c801facd77a29370b4faec201768915ac14a721ba36f20bc9c209b" +checksum = "2b15c43186be67a4fd63bee50d0303afffcef381492ebe2c5d87f324e1b8815c" [[package]] name = "relative-path" @@ -3076,9 +3062,9 @@ checksum = "ba39f3699c378cd8970968dcbff9c43159ea4cfbd88d43c00b22f2ef10a435d2" [[package]] name = "reqwest" -version = "0.12.7" +version = "0.12.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f8f4955649ef5c38cc7f9e8aa41761d48fb9677197daea9984dc54f56aad5e63" +checksum = "f713147fbe92361e52392c73b8c9e48c04c6625bce969ef54dc901e58e042a7b" dependencies = [ "base64 0.22.1", "bytes", @@ -3099,9 +3085,9 @@ dependencies = [ "percent-encoding", "pin-project-lite", "quinn", - "rustls 0.23.13", - "rustls-native-certs 0.7.3", - "rustls-pemfile 2.1.3", + "rustls 0.23.14", + "rustls-native-certs 0.8.0", + "rustls-pemfile 2.2.0", "rustls-pki-types", "serde", "serde_json", @@ -3218,9 +3204,9 @@ dependencies = [ [[package]] name = "rustls" -version = "0.23.13" +version = "0.23.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f2dabaac7466917e566adb06783a81ca48944c6898a1b08b9374106dd671f4c8" +checksum = "415d9944693cb90382053259f89fbb077ea730ad7273047ec63b19bc9b160ba8" dependencies = [ "once_cell", "ring", @@ -3242,19 +3228,6 @@ dependencies = [ "security-framework", ] -[[package]] -name = "rustls-native-certs" -version = "0.7.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e5bfb394eeed242e909609f56089eecfe5fda225042e8b171791b9c95f5931e5" -dependencies = [ - "openssl-probe", - "rustls-pemfile 2.1.3", - "rustls-pki-types", - "schannel", - "security-framework", -] - [[package]] name = "rustls-native-certs" version = "0.8.0" @@ -3262,7 +3235,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fcaf18a4f2be7326cd874a5fa579fae794320a0f388d365dca7e480e55f83f8a" dependencies = [ "openssl-probe", - "rustls-pemfile 2.1.3", + "rustls-pemfile 2.2.0", "rustls-pki-types", "schannel", "security-framework", @@ -3279,19 +3252,18 @@ dependencies = [ [[package]] name = "rustls-pemfile" -version = "2.1.3" +version = "2.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "196fe16b00e106300d3e45ecfcb764fa292a535d7326a29a5875c579c7417425" +checksum = "dce314e5fee3f39953d46bb63bb8a46d40c2f8fb7cc5a3b6cab2bde9721d6e50" dependencies = [ - "base64 0.22.1", "rustls-pki-types", ] [[package]] name = "rustls-pki-types" -version = "1.8.0" +version = "1.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fc0a2ce646f8655401bb81e7927b812614bd5d91dbc968696be50603510fcaf0" +checksum = "0e696e35370c65c9c541198af4543ccd580cf17fc25d8e05c5a242b202488c55" [[package]] name = "rustls-webpki" @@ -3359,9 +3331,9 @@ dependencies = [ [[package]] name = "schannel" -version = "0.1.24" +version = "0.1.26" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e9aaafd5a2b6e3d657ff009d82fbd630b6bd54dd4eb06f21693925cdf80f9b8b" +checksum = "01227be5826fa0690321a2ba6c5cd57a19cf3f6a09e76973b58e61de6ab9d1c1" dependencies = [ "windows-sys 0.59.0", ] @@ -3397,9 +3369,9 @@ dependencies = [ [[package]] name = "security-framework-sys" -version = "2.11.1" +version = "2.12.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "75da29fe9b9b08fe9d6b22b5b4bcbc75d8db3aa31e639aa56bb62e9d46bfceaf" +checksum = "ea4a292869320c0272d7bc55a5a6aafaff59b4f63404a003887b679a2e05b4b6" dependencies = [ "core-foundation-sys", "libc", @@ -3510,18 +3482,18 @@ checksum = "3c5e1a9a646d36c3599cd173a41282daf47c44583ad367b8e6837255952e5c67" [[package]] name = "snafu" -version = "0.8.4" +version = "0.8.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2b835cb902660db3415a672d862905e791e54d306c6e8189168c7f3d9ae1c79d" +checksum = "223891c85e2a29c3fe8fb900c1fae5e69c2e42415e3177752e8718475efa5019" dependencies = [ "snafu-derive", ] [[package]] name = "snafu-derive" -version = "0.8.4" +version = "0.8.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "38d1e02fca405f6280643174a50c942219f0bbf4dbf7d480f1dd864d6f211ae5" +checksum = "03c3c6b7927ffe7ecaa769ee0e3994da3b8cafc8f444578982c83ecb161af917" dependencies = [ "heck 0.5.0", "proc-macro2", @@ -3633,9 +3605,9 @@ checksum = "13c2bddecc57b384dee18652358fb23172facb8a2c51ccc10d74c157bdea3292" [[package]] name = "syn" -version = "2.0.77" +version = "2.0.79" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9f35bcdf61fd8e7be6caf75f429fdca8beb3ed76584befb503b1569faee373ed" +checksum = "89132cd0bf050864e1d38dc3bbc07a0eb8e7530af26344d3d2bbbef83499f590" dependencies = [ "proc-macro2", "quote", @@ -3653,9 +3625,9 @@ dependencies = [ [[package]] name = "tempfile" -version = "3.12.0" +version = "3.13.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "04cbcdd0c794ebb0d4cf35e88edd2f7d2c4c3e9a5a6dab322839b321c6a87a64" +checksum = "f0f2c9fc62d0beef6951ccffd757e241266a2c833136efbe35af6cd2567dca5b" dependencies = [ "cfg-if", "fastrand", @@ -3672,18 +3644,18 @@ checksum = "3369f5ac52d5eb6ab48c6b4ffdc8efbcad6b89c765749064ba298f2c68a16a76" [[package]] name = "thiserror" -version = "1.0.63" +version = "1.0.64" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c0342370b38b6a11b6cc11d6a805569958d54cfa061a29969c3b5ce2ea405724" +checksum = "d50af8abc119fb8bb6dbabcfa89656f46f84aa0ac7688088608076ad2b459a84" dependencies = [ "thiserror-impl", ] [[package]] name = "thiserror-impl" -version = "1.0.63" +version = "1.0.64" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a4558b58466b9ad7ca0f102865eccc95938dca1a74a856f2b57b6629050da261" +checksum = "08904e7672f5eb876eaaf87e0ce17857500934f4981c4a0ab2b4aa98baac7fc3" dependencies = [ "proc-macro2", "quote", @@ -3800,7 +3772,7 @@ version = "0.26.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0c7bc40d0e5a97695bb96e27995cd3a08538541b0a846f65bba7a359f36700d4" dependencies = [ - "rustls 0.23.13", + "rustls 0.23.14", "rustls-pki-types", "tokio", ] @@ -3826,36 +3798,15 @@ checksum = "0dd7358ecb8fc2f8d014bf86f6f638ce72ba252a2c3a2572f2a795f1d23efb41" [[package]] name = "toml_edit" -version = "0.22.21" +version = "0.22.22" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3b072cee73c449a636ffd6f32bd8de3a9f7119139aff882f44943ce2986dc5cf" +checksum = "4ae48d6208a266e853d946088ed816055e556cc6028c5e8e2b84d9fa5dd7c7f5" dependencies = [ "indexmap", "toml_datetime", "winnow", ] -[[package]] -name = "tower" -version = "0.4.13" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b8fa9be0de6cf49e536ce1851f987bd21a43b771b09473c3549a6c853db37c1c" -dependencies = [ - "futures-core", - "futures-util", - "pin-project", - "pin-project-lite", - "tokio", - "tower-layer", - "tower-service", -] - -[[package]] -name = "tower-layer" -version = "0.3.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "121c2a6cda46980bb0fcd1647ffaf6cd3fc79a013de288782836f6df9c48780e" - [[package]] name = "tower-service" version = "0.3.3" @@ -3937,9 +3888,9 @@ checksum = "42ff0bf0c66b8238c6f3b578df37d0b7848e55df8577b3f74f92a69acceeb825" [[package]] name = "unicode-bidi" -version = "0.3.15" +version = "0.3.17" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "08f95100a766bf4f8f28f90d77e0a5461bbdb219042e7679bebe79004fed8d75" +checksum = "5ab17db44d7388991a428b2ee655ce0c212e862eff1768a455c58f9aad6e7893" [[package]] name = "unicode-ident" @@ -3964,9 +3915,9 @@ checksum = "f6ccf251212114b54433ec949fd6a7841275f9ada20dddd2f29e9ceea4501493" [[package]] name = "unicode-width" -version = "0.1.13" +version = "0.1.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0336d538f7abc86d282a4189614dfaa90810dfc2c6f6427eaf88e16311dd225d" +checksum = "7dd6e30e90baa6f72411720665d41d89b9a3d039dc45b8faea1ddd07f617f6af" [[package]] name = "untrusted" @@ -4122,9 +4073,9 @@ checksum = "c62a0a307cb4a311d3a07867860911ca130c3494e8c2719593806c08bc5d0484" [[package]] name = "wasm-streams" -version = "0.4.0" +version = "0.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b65dc4c90b63b118468cf747d8bf3566c1913ef60be765b5730ead9e0a3ba129" +checksum = "4e072d4e72f700fb3443d8fe94a39315df013eef1104903cdb0a2abd322bbecd" dependencies = [ "futures-util", "js-sys", @@ -4341,9 +4292,9 @@ checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" [[package]] name = "winnow" -version = "0.6.18" +version = "0.6.20" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "68a9bda4691f099d435ad181000724da8e5899daa10713c2d432552b9ccd3a6f" +checksum = "36c1fec1a2bb5866f07c25f68c26e565c4c200aebb96d7e55710c19d3e8ac49b" dependencies = [ "memchr", ] diff --git a/datafusion/common/src/dfschema.rs b/datafusion/common/src/dfschema.rs index 0dec14e9178a..69cdf866cf98 100644 --- a/datafusion/common/src/dfschema.rs +++ b/datafusion/common/src/dfschema.rs @@ -226,7 +226,12 @@ impl DFSchema { for (field, qualifier) in self.inner.fields().iter().zip(&self.field_qualifiers) { if let Some(qualifier) = qualifier { - qualified_names.insert((qualifier, field.name())); + if !qualified_names.insert((qualifier, field.name())) { + return _schema_err!(SchemaError::DuplicateQualifiedField { + qualifier: Box::new(qualifier.clone()), + name: field.name().to_string(), + }); + } } else if !unqualified_names.insert(field.name()) { return _schema_err!(SchemaError::DuplicateUnqualifiedField { name: field.name().to_string() @@ -1165,7 +1170,10 @@ mod tests { let left = DFSchema::try_from_qualified_schema("t1", &test_schema_1())?; let right = DFSchema::try_from_qualified_schema("t1", &test_schema_1())?; let join = left.join(&right); - assert!(join.err().is_none()); + assert_eq!( + join.unwrap_err().strip_backtrace(), + "Schema error: Schema contains duplicate qualified field name t1.c0", + ); Ok(()) } diff --git a/datafusion/common/src/tree_node.rs b/datafusion/common/src/tree_node.rs index 88300e3edd0e..b4d3251fd263 100644 --- a/datafusion/common/src/tree_node.rs +++ b/datafusion/common/src/tree_node.rs @@ -681,7 +681,7 @@ impl Transformed { } } - /// Create a `Transformed` with `transformed and [`TreeNodeRecursion::Continue`]. + /// Create a `Transformed` with `transformed` and [`TreeNodeRecursion::Continue`]. pub fn new_transformed(data: T, transformed: bool) -> Self { Self::new(data, transformed, TreeNodeRecursion::Continue) } diff --git a/datafusion/core/src/bin/print_functions_docs.rs b/datafusion/core/src/bin/print_functions_docs.rs new file mode 100644 index 000000000000..53cfe94ecab3 --- /dev/null +++ b/datafusion/core/src/bin/print_functions_docs.rs @@ -0,0 +1,266 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use datafusion::execution::SessionStateDefaults; +use datafusion_expr::{ + aggregate_doc_sections, scalar_doc_sections, window_doc_sections, AggregateUDF, + DocSection, Documentation, ScalarUDF, WindowUDF, +}; +use itertools::Itertools; +use std::env::args; +use std::fmt::Write as _; + +fn main() { + let args: Vec = args().collect(); + + if args.len() != 2 { + panic!( + "Usage: {} type (one of 'aggregate', 'scalar', 'window')", + args[0] + ); + } + + let function_type = args[1].trim().to_lowercase(); + let docs = match function_type.as_str() { + "aggregate" => print_aggregate_docs(), + "scalar" => print_scalar_docs(), + "window" => print_window_docs(), + _ => { + panic!("Unknown function type: {}", function_type) + } + }; + + println!("{docs}"); +} + +fn print_aggregate_docs() -> String { + let mut providers: Vec> = vec![]; + + for f in SessionStateDefaults::default_aggregate_functions() { + providers.push(Box::new(f.as_ref().clone())); + } + + print_docs(providers, aggregate_doc_sections::doc_sections()) +} + +fn print_scalar_docs() -> String { + let mut providers: Vec> = vec![]; + + for f in SessionStateDefaults::default_scalar_functions() { + providers.push(Box::new(f.as_ref().clone())); + } + + print_docs(providers, scalar_doc_sections::doc_sections()) +} + +fn print_window_docs() -> String { + let mut providers: Vec> = vec![]; + + for f in SessionStateDefaults::default_window_functions() { + providers.push(Box::new(f.as_ref().clone())); + } + + print_docs(providers, window_doc_sections::doc_sections()) +} + +fn print_docs( + providers: Vec>, + doc_sections: Vec, +) -> String { + let mut docs = "".to_string(); + + // doc sections only includes sections that have 'include' == true + for doc_section in doc_sections { + // make sure there is a function that is in this doc section + if !&providers.iter().any(|f| { + if let Some(documentation) = f.get_documentation() { + documentation.doc_section == doc_section + } else { + false + } + }) { + continue; + } + + let providers: Vec<&Box> = providers + .iter() + .filter(|&f| { + if let Some(documentation) = f.get_documentation() { + documentation.doc_section == doc_section + } else { + false + } + }) + .collect::>(); + + // write out section header + let _ = writeln!(docs, "## {} ", doc_section.label); + + if let Some(description) = doc_section.description { + let _ = writeln!(docs, "{description}"); + } + + // names is a sorted list of function names and aliases since we display + // both in the documentation + let names = get_names_and_aliases(&providers); + + // write out the list of function names and aliases + names.iter().for_each(|name| { + let _ = writeln!(docs, "- [{name}](#{name})"); + }); + + // write out each function and alias in the order of the sorted name list + for name in names { + let f = providers + .iter() + .find(|f| f.get_name() == name || f.get_aliases().contains(&name)) + .unwrap(); + + let aliases = f.get_aliases(); + let documentation = f.get_documentation(); + + // if this name is an alias we need to display what it's an alias of + if aliases.contains(&name) { + let fname = f.get_name(); + let _ = writeln!(docs, r#"### `{name}`"#); + let _ = writeln!(docs, "_Alias of [{fname}](#{fname})._"); + continue; + } + + // otherwise display the documentation for the function + let Some(documentation) = documentation else { + unreachable!() + }; + + // first, the name, description and syntax example + let _ = write!( + docs, + r#" +### `{}` + +{} + +``` +{} +``` +"#, + name, documentation.description, documentation.syntax_example + ); + + // next, arguments + if let Some(args) = &documentation.arguments { + let _ = writeln!(docs, "#### Arguments\n"); + for (arg_name, arg_desc) in args { + let _ = writeln!(docs, "- **{arg_name}**: {arg_desc}"); + } + } + + // next, sql example if provided + if let Some(example) = &documentation.sql_example { + let _ = writeln!( + docs, + r#" +#### Example + +{} +"#, + example + ); + } + + // next, aliases + if !f.get_aliases().is_empty() { + let _ = writeln!(docs, "#### Aliases"); + + for alias in f.get_aliases() { + let _ = writeln!(docs, "- {}", alias.replace("_", r#"\_"#)); + } + } + + // finally, any related udfs + if let Some(related_udfs) = &documentation.related_udfs { + let _ = writeln!(docs, "\n**Related functions**:"); + + for related in related_udfs { + let _ = writeln!(docs, "- [{related}](#{related})"); + } + } + } + } + + docs +} + +trait DocProvider { + fn get_name(&self) -> String; + fn get_aliases(&self) -> Vec; + fn get_documentation(&self) -> Option<&Documentation>; +} + +impl DocProvider for AggregateUDF { + fn get_name(&self) -> String { + self.name().to_string() + } + fn get_aliases(&self) -> Vec { + self.aliases().iter().map(|a| a.to_string()).collect() + } + fn get_documentation(&self) -> Option<&Documentation> { + self.documentation() + } +} + +impl DocProvider for ScalarUDF { + fn get_name(&self) -> String { + self.name().to_string() + } + fn get_aliases(&self) -> Vec { + self.aliases().iter().map(|a| a.to_string()).collect() + } + fn get_documentation(&self) -> Option<&Documentation> { + self.documentation() + } +} + +impl DocProvider for WindowUDF { + fn get_name(&self) -> String { + self.name().to_string() + } + fn get_aliases(&self) -> Vec { + self.aliases().iter().map(|a| a.to_string()).collect() + } + fn get_documentation(&self) -> Option<&Documentation> { + self.documentation() + } +} + +#[allow(clippy::borrowed_box)] +#[allow(clippy::ptr_arg)] +fn get_names_and_aliases(functions: &Vec<&Box>) -> Vec { + functions + .iter() + .flat_map(|f| { + if f.get_aliases().is_empty() { + vec![f.get_name().to_string()] + } else { + let mut names = vec![f.get_name().to_string()]; + names.extend(f.get_aliases().iter().cloned()); + names + } + }) + .sorted() + .collect_vec() +} diff --git a/datafusion/core/src/catalog_common/mod.rs b/datafusion/core/src/catalog_common/mod.rs index b8414378862e..85207845a005 100644 --- a/datafusion/core/src/catalog_common/mod.rs +++ b/datafusion/core/src/catalog_common/mod.rs @@ -185,9 +185,7 @@ pub fn resolve_table_references( let _ = s.as_ref().visit(visitor); } DFStatement::CreateExternalTable(table) => { - visitor - .relations - .insert(ObjectName(vec![Ident::from(table.name.as_str())])); + visitor.relations.insert(table.name.clone()); } DFStatement::CopyTo(CopyToStatement { source, .. }) => match source { CopyToSource::Relation(table_name) => { diff --git a/datafusion/core/src/dataframe/mod.rs b/datafusion/core/src/dataframe/mod.rs index 70c507511453..67e2a4780d06 100644 --- a/datafusion/core/src/dataframe/mod.rs +++ b/datafusion/core/src/dataframe/mod.rs @@ -535,9 +535,26 @@ impl DataFrame { group_expr: Vec, aggr_expr: Vec, ) -> Result { + let is_grouping_set = matches!(group_expr.as_slice(), [Expr::GroupingSet(_)]); + let aggr_expr_len = aggr_expr.len(); let plan = LogicalPlanBuilder::from(self.plan) .aggregate(group_expr, aggr_expr)? .build()?; + let plan = if is_grouping_set { + let grouping_id_pos = plan.schema().fields().len() - 1 - aggr_expr_len; + // For grouping sets we do a project to not expose the internal grouping id + let exprs = plan + .schema() + .columns() + .into_iter() + .enumerate() + .filter(|(idx, _)| *idx != grouping_id_pos) + .map(|(_, column)| Expr::Column(column)) + .collect::>(); + LogicalPlanBuilder::from(plan).project(exprs)?.build()? + } else { + plan + }; Ok(DataFrame { session_state: self.session_state, plan, @@ -3380,52 +3397,6 @@ mod tests { Ok(()) } - // Table 't1' self join - // Supplementary test of issue: https://github.com/apache/datafusion/issues/7790 - #[tokio::test] - async fn with_column_self_join() -> Result<()> { - let df = test_table().await?.select_columns(&["c1"])?; - let ctx = SessionContext::new(); - - ctx.register_table("t1", df.into_view())?; - - let df = ctx - .table("t1") - .await? - .join( - ctx.table("t1").await?, - JoinType::Inner, - &["c1"], - &["c1"], - None, - )? - .sort(vec![ - // make the test deterministic - col("t1.c1").sort(true, true), - ])? - .limit(0, Some(1))?; - - let df_results = df.clone().collect().await?; - assert_batches_sorted_eq!( - [ - "+----+----+", - "| c1 | c1 |", - "+----+----+", - "| a | a |", - "+----+----+", - ], - &df_results - ); - - let actual_err = df.clone().with_column("new_column", lit(true)).unwrap_err(); - let expected_err = "Error during planning: Projections require unique expression names \ - but the expression \"t1.c1\" at position 0 and \"t1.c1\" at position 1 have the same name. \ - Consider aliasing (\"AS\") one of them."; - assert_eq!(actual_err.strip_backtrace(), expected_err); - - Ok(()) - } - #[tokio::test] async fn with_column_renamed() -> Result<()> { let df = test_table() diff --git a/datafusion/core/src/datasource/avro_to_arrow/arrow_array_reader.rs b/datafusion/core/src/datasource/avro_to_arrow/arrow_array_reader.rs index 3a5d50bba07f..98b6702bc383 100644 --- a/datafusion/core/src/datasource/avro_to_arrow/arrow_array_reader.rs +++ b/datafusion/core/src/datasource/avro_to_arrow/arrow_array_reader.rs @@ -573,7 +573,7 @@ impl<'a, R: Read> AvroArrowArrayReader<'a, R> { // extract list values, with non-lists converted to Value::Null let array_item_count = rows .iter() - .map(|row| match row { + .map(|row| match maybe_resolve_union(row) { Value::Array(values) => values.len(), _ => 1, }) @@ -1643,6 +1643,93 @@ mod test { assert_batches_eq!(expected, &[batch]); } + #[test] + fn test_avro_nullable_struct_array() { + let schema = apache_avro::Schema::parse_str( + r#" + { + "type": "record", + "name": "r1", + "fields": [ + { + "name": "col1", + "type": [ + "null", + { + "type": "array", + "items": { + "type": [ + "null", + { + "type": "record", + "name": "Item", + "fields": [ + { + "name": "id", + "type": "long" + } + ] + } + ] + } + } + ], + "default": null + } + ] + }"#, + ) + .unwrap(); + let jv1 = serde_json::json!({ + "col1": [ + { + "id": 234 + }, + { + "id": 345 + } + ] + }); + let r1 = apache_avro::to_value(jv1) + .unwrap() + .resolve(&schema) + .unwrap(); + let r2 = apache_avro::to_value(serde_json::json!({ "col1": null })) + .unwrap() + .resolve(&schema) + .unwrap(); + + let mut w = apache_avro::Writer::new(&schema, vec![]); + for _i in 0..5 { + w.append(r1.clone()).unwrap(); + } + w.append(r2).unwrap(); + let bytes = w.into_inner().unwrap(); + + let mut reader = ReaderBuilder::new() + .read_schema() + .with_batch_size(20) + .build(std::io::Cursor::new(bytes)) + .unwrap(); + let batch = reader.next().unwrap().unwrap(); + assert_eq!(batch.num_rows(), 6); + assert_eq!(batch.num_columns(), 1); + + let expected = [ + "+------------------------+", + "| col1 |", + "+------------------------+", + "| [{id: 234}, {id: 345}] |", + "| [{id: 234}, {id: 345}] |", + "| [{id: 234}, {id: 345}] |", + "| [{id: 234}, {id: 345}] |", + "| [{id: 234}, {id: 345}] |", + "| |", + "+------------------------+", + ]; + assert_batches_eq!(expected, &[batch]); + } + #[test] fn test_avro_iterator() { let reader = build_reader("alltypes_plain.avro", 5); diff --git a/datafusion/core/src/datasource/dynamic_file.rs b/datafusion/core/src/datasource/dynamic_file.rs index 3c409af29703..6654d0871c3f 100644 --- a/datafusion/core/src/datasource/dynamic_file.rs +++ b/datafusion/core/src/datasource/dynamic_file.rs @@ -69,11 +69,18 @@ impl UrlTableFactory for DynamicListTableFactory { .ok_or_else(|| plan_datafusion_err!("get current SessionStore error"))?; match ListingTableConfig::new(table_url.clone()) - .infer(state) + .infer_options(state) .await { - Ok(cfg) => ListingTable::try_new(cfg) - .map(|table| Some(Arc::new(table) as Arc)), + Ok(cfg) => { + let cfg = cfg + .infer_partitions_from_path(state) + .await? + .infer_schema(state) + .await?; + ListingTable::try_new(cfg) + .map(|table| Some(Arc::new(table) as Arc)) + } Err(_) => Ok(None), } } diff --git a/datafusion/core/src/datasource/file_format/csv.rs b/datafusion/core/src/datasource/file_format/csv.rs index e821fa806fce..f235c3b628a0 100644 --- a/datafusion/core/src/datasource/file_format/csv.rs +++ b/datafusion/core/src/datasource/file_format/csv.rs @@ -771,7 +771,7 @@ mod tests { "c7: Int64", "c8: Int64", "c9: Int64", - "c10: Int64", + "c10: Utf8", "c11: Float64", "c12: Float64", "c13: Utf8" @@ -907,7 +907,7 @@ mod tests { Field::new("c7", DataType::Int64, true), Field::new("c8", DataType::Int64, true), Field::new("c9", DataType::Int64, true), - Field::new("c10", DataType::Int64, true), + Field::new("c10", DataType::Utf8, true), Field::new("c11", DataType::Float64, true), Field::new("c12", DataType::Float64, true), Field::new("c13", DataType::Utf8, true), diff --git a/datafusion/core/src/datasource/file_format/mod.rs b/datafusion/core/src/datasource/file_format/mod.rs index 60f2b2dcefa9..e16986c660ad 100644 --- a/datafusion/core/src/datasource/file_format/mod.rs +++ b/datafusion/core/src/datasource/file_format/mod.rs @@ -241,16 +241,14 @@ pub fn transform_schema_to_view(schema: &Schema) -> Schema { .fields .iter() .map(|field| match field.data_type() { - DataType::Utf8 | DataType::LargeUtf8 => Arc::new(Field::new( - field.name(), - DataType::Utf8View, - field.is_nullable(), - )), - DataType::Binary | DataType::LargeBinary => Arc::new(Field::new( - field.name(), - DataType::BinaryView, - field.is_nullable(), - )), + DataType::Utf8 | DataType::LargeUtf8 => Arc::new( + Field::new(field.name(), DataType::Utf8View, field.is_nullable()) + .with_metadata(field.metadata().to_owned()), + ), + DataType::Binary | DataType::LargeBinary => Arc::new( + Field::new(field.name(), DataType::BinaryView, field.is_nullable()) + .with_metadata(field.metadata().to_owned()), + ), _ => field.clone(), }) .collect(); diff --git a/datafusion/core/src/datasource/file_format/parquet.rs b/datafusion/core/src/datasource/file_format/parquet.rs index 98ae0ce14bd7..8647b5df90be 100644 --- a/datafusion/core/src/datasource/file_format/parquet.rs +++ b/datafusion/core/src/datasource/file_format/parquet.rs @@ -20,6 +20,7 @@ use std::any::Any; use std::fmt; use std::fmt::Debug; +use std::ops::Range; use std::sync::Arc; use super::write::demux::start_demuxer_task; @@ -47,7 +48,7 @@ use datafusion_common::file_options::parquet_writer::ParquetWriterOptions; use datafusion_common::parsers::CompressionTypeVariant; use datafusion_common::stats::Precision; use datafusion_common::{ - exec_err, internal_datafusion_err, not_impl_err, DataFusionError, GetExt, + internal_datafusion_err, not_impl_err, DataFusionError, GetExt, DEFAULT_PARQUET_EXTENSION, }; use datafusion_common_runtime::SpawnedTask; @@ -60,7 +61,7 @@ use datafusion_physical_expr::PhysicalExpr; use datafusion_physical_plan::metrics::MetricsSet; use async_trait::async_trait; -use bytes::{BufMut, BytesMut}; +use bytes::Bytes; use hashbrown::HashMap; use log::debug; use object_store::buffered::BufWriter; @@ -71,8 +72,7 @@ use parquet::arrow::arrow_writer::{ use parquet::arrow::{ arrow_to_parquet_schema, parquet_to_arrow_schema, AsyncArrowWriter, }; -use parquet::file::footer::{decode_footer, decode_metadata}; -use parquet::file::metadata::{ParquetMetaData, RowGroupMetaData}; +use parquet::file::metadata::{ParquetMetaData, ParquetMetaDataReader, RowGroupMetaData}; use parquet::file::properties::WriterProperties; use parquet::file::writer::SerializedFileWriter; use parquet::format::FileMetaData; @@ -84,10 +84,13 @@ use crate::datasource::physical_plan::parquet::{ can_expr_be_pushed_down_with_schemas, ParquetExecBuilder, }; use datafusion_physical_expr_common::sort_expr::LexRequirement; -use futures::{StreamExt, TryStreamExt}; +use futures::future::BoxFuture; +use futures::{FutureExt, StreamExt, TryStreamExt}; use object_store::path::Path; use object_store::{ObjectMeta, ObjectStore}; use parquet::arrow::arrow_reader::statistics::StatisticsConverter; +use parquet::arrow::async_reader::MetadataFetch; +use parquet::errors::ParquetError; /// Initial writing buffer size. Note this is just a size hint for efficiency. It /// will grow beyond the set value if needed. @@ -441,6 +444,33 @@ impl FileFormat for ParquetFormat { } } +/// [`MetadataFetch`] adapter for reading bytes from an [`ObjectStore`] +struct ObjectStoreFetch<'a> { + store: &'a dyn ObjectStore, + meta: &'a ObjectMeta, +} + +impl<'a> ObjectStoreFetch<'a> { + fn new(store: &'a dyn ObjectStore, meta: &'a ObjectMeta) -> Self { + Self { store, meta } + } +} + +impl<'a> MetadataFetch for ObjectStoreFetch<'a> { + fn fetch( + &mut self, + range: Range, + ) -> BoxFuture<'_, Result> { + async { + self.store + .get_range(&self.meta.location, range) + .await + .map_err(ParquetError::from) + } + .boxed() + } +} + /// Fetches parquet metadata from ObjectStore for given object /// /// This component is a subject to **change** in near future and is exposed for low level integrations @@ -452,57 +482,14 @@ pub async fn fetch_parquet_metadata( meta: &ObjectMeta, size_hint: Option, ) -> Result { - if meta.size < 8 { - return exec_err!("file size of {} is less than footer", meta.size); - } - - // If a size hint is provided, read more than the minimum size - // to try and avoid a second fetch. - let footer_start = if let Some(size_hint) = size_hint { - meta.size.saturating_sub(size_hint) - } else { - meta.size - 8 - }; - - let suffix = store - .get_range(&meta.location, footer_start..meta.size) - .await?; - - let suffix_len = suffix.len(); - - let mut footer = [0; 8]; - footer.copy_from_slice(&suffix[suffix_len - 8..suffix_len]); - - let length = decode_footer(&footer)?; + let file_size = meta.size; + let fetch = ObjectStoreFetch::new(store, meta); - if meta.size < length + 8 { - return exec_err!( - "file size of {} is less than footer + metadata {}", - meta.size, - length + 8 - ); - } - - // Did not fetch the entire file metadata in the initial read, need to make a second request - if length > suffix_len - 8 { - let metadata_start = meta.size - length - 8; - let remaining_metadata = store - .get_range(&meta.location, metadata_start..footer_start) - .await?; - - let mut metadata = BytesMut::with_capacity(length); - - metadata.put(remaining_metadata.as_ref()); - metadata.put(&suffix[..suffix_len - 8]); - - Ok(decode_metadata(metadata.as_ref())?) - } else { - let metadata_start = meta.size - length - 8; - - Ok(decode_metadata( - &suffix[metadata_start - footer_start..suffix_len - 8], - )?) - } + ParquetMetaDataReader::new() + .with_prefetch_hint(size_hint) + .load_and_finish(fetch, file_size) + .await + .map_err(DataFusionError::from) } /// Read and parse the schema of the Parquet file at location `path` diff --git a/datafusion/core/src/datasource/listing/table.rs b/datafusion/core/src/datasource/listing/table.rs index 3eb8eed9de36..a9c6aec17537 100644 --- a/datafusion/core/src/datasource/listing/table.rs +++ b/datafusion/core/src/datasource/listing/table.rs @@ -33,7 +33,7 @@ use crate::datasource::{ }; use crate::execution::context::SessionState; use datafusion_catalog::TableProvider; -use datafusion_common::{DataFusionError, Result}; +use datafusion_common::{config_err, DataFusionError, Result}; use datafusion_expr::dml::InsertOp; use datafusion_expr::{utils::conjunction, Expr, TableProviderFilterPushDown}; use datafusion_expr::{SortExpr, TableType}; @@ -192,6 +192,38 @@ impl ListingTableConfig { pub async fn infer(self, state: &SessionState) -> Result { self.infer_options(state).await?.infer_schema(state).await } + + /// Infer the partition columns from the path. Requires `self.options` to be set prior to using. + pub async fn infer_partitions_from_path(self, state: &SessionState) -> Result { + match self.options { + Some(options) => { + let Some(url) = self.table_paths.first() else { + return config_err!("No table path found"); + }; + let partitions = options + .infer_partitions(state, url) + .await? + .into_iter() + .map(|col_name| { + ( + col_name, + DataType::Dictionary( + Box::new(DataType::UInt16), + Box::new(DataType::Utf8), + ), + ) + }) + .collect::>(); + let options = options.with_table_partition_cols(partitions); + Ok(Self { + table_paths: self.table_paths, + file_schema: self.file_schema, + options: Some(options), + }) + } + None => config_err!("No `ListingOptions` set for inferring schema"), + } + } } /// Options for creating a [`ListingTable`] @@ -505,7 +537,7 @@ impl ListingOptions { /// Infer the partitioning at the given path on the provided object store. /// For performance reasons, it doesn't read all the files on disk /// and therefore may fail to detect invalid partitioning. - async fn infer_partitions( + pub(crate) async fn infer_partitions( &self, state: &SessionState, table_path: &ListingTableUrl, diff --git a/datafusion/core/src/physical_planner.rs b/datafusion/core/src/physical_planner.rs index 520392c9f075..cf2a157b04b6 100644 --- a/datafusion/core/src/physical_planner.rs +++ b/datafusion/core/src/physical_planner.rs @@ -692,10 +692,6 @@ impl DefaultPhysicalPlanner { physical_input_schema.clone(), )?); - // update group column indices based on partial aggregate plan evaluation - let final_group: Vec> = - initial_aggr.output_group_expr(); - let can_repartition = !groups.is_empty() && session_state.config().target_partitions() > 1 && session_state.config().repartition_aggregations(); @@ -716,13 +712,7 @@ impl DefaultPhysicalPlanner { AggregateMode::Final }; - let final_grouping_set = PhysicalGroupBy::new_single( - final_group - .iter() - .enumerate() - .map(|(i, expr)| (expr.clone(), groups.expr()[i].1.clone())) - .collect(), - ); + let final_grouping_set = initial_aggr.group_expr().as_final(); Arc::new(AggregateExec::try_new( next_partition_mode, @@ -2345,7 +2335,7 @@ mod tests { .expect("hash aggregate"); assert_eq!( "sum(aggregate_test_100.c3)", - final_hash_agg.schema().field(2).name() + final_hash_agg.schema().field(3).name() ); // we need access to the input to the partial aggregate so that other projects can // implement serde @@ -2557,6 +2547,10 @@ mod tests { ) -> Result { unimplemented!("NoOp"); } + + fn supports_limit_pushdown(&self) -> bool { + false // Disallow limit push-down by default + } } #[derive(Debug)] diff --git a/datafusion/core/tests/expr_api/mod.rs b/datafusion/core/tests/expr_api/mod.rs index cbd892672152..81a33361008f 100644 --- a/datafusion/core/tests/expr_api/mod.rs +++ b/datafusion/core/tests/expr_api/mod.rs @@ -37,14 +37,14 @@ mod simplification; fn test_octet_length() { #[rustfmt::skip] evaluate_expr_test( - octet_length(col("list")), + octet_length(col("id")), vec![ "+------+", "| expr |", "+------+", - "| 5 |", - "| 18 |", - "| 6 |", + "| 1 |", + "| 1 |", + "| 1 |", "+------+", ], ); diff --git a/datafusion/core/tests/fuzz_cases/sort_preserving_repartition_fuzz.rs b/datafusion/core/tests/fuzz_cases/sort_preserving_repartition_fuzz.rs index 408cadc35f48..0cd702372f7c 100644 --- a/datafusion/core/tests/fuzz_cases/sort_preserving_repartition_fuzz.rs +++ b/datafusion/core/tests/fuzz_cases/sort_preserving_repartition_fuzz.rs @@ -29,7 +29,7 @@ mod sp_repartition_fuzz_tests { metrics::{BaselineMetrics, ExecutionPlanMetricsSet}, repartition::RepartitionExec, sorts::sort_preserving_merge::SortPreservingMergeExec, - sorts::streaming_merge::streaming_merge, + sorts::streaming_merge::StreamingMergeBuilder, stream::RecordBatchStreamAdapter, ExecutionPlan, Partitioning, }; @@ -246,15 +246,14 @@ mod sp_repartition_fuzz_tests { MemoryConsumer::new("test".to_string()).register(context.memory_pool()); // Internally SortPreservingMergeExec uses this function for merging. - let res = streaming_merge( - streams, - schema, - &exprs, - BaselineMetrics::new(&ExecutionPlanMetricsSet::new(), 0), - 1, - None, - mem_reservation, - )?; + let res = StreamingMergeBuilder::new() + .with_streams(streams) + .with_schema(schema) + .with_expressions(&exprs) + .with_metrics(BaselineMetrics::new(&ExecutionPlanMetricsSet::new(), 0)) + .with_batch_size(1) + .with_reservation(mem_reservation) + .build()?; let res = collect(res).await?; // Contains the merged result. let res = concat_batches(&res[0].schema(), &res)?; diff --git a/datafusion/core/tests/user_defined/user_defined_plan.rs b/datafusion/core/tests/user_defined/user_defined_plan.rs index e51adbc4ddc1..2b45d0ed600b 100644 --- a/datafusion/core/tests/user_defined/user_defined_plan.rs +++ b/datafusion/core/tests/user_defined/user_defined_plan.rs @@ -443,6 +443,10 @@ impl UserDefinedLogicalNodeCore for TopKPlanNode { expr: replace_sort_expression(self.expr.clone(), exprs.swap_remove(0)), }) } + + fn supports_limit_pushdown(&self) -> bool { + false // Disallow limit push-down by default + } } /// Physical planner for TopK nodes diff --git a/datafusion/expr-common/src/signature.rs b/datafusion/expr-common/src/signature.rs index d1553b3315e7..320e1303a21b 100644 --- a/datafusion/expr-common/src/signature.rs +++ b/datafusion/expr-common/src/signature.rs @@ -125,6 +125,11 @@ pub enum TypeSignature { /// Fixed number of arguments of numeric types. /// See to know which type is considered numeric Numeric(usize), + /// Fixed number of arguments of all the same string types. + /// The precedence of type from high to low is Utf8View, LargeUtf8 and Utf8. + /// Null is considerd as Utf8 by default + /// Dictionary with string value type is also handled. + String(usize), } #[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Hash)] @@ -190,8 +195,11 @@ impl TypeSignature { .collect::>() .join(", ")] } + TypeSignature::String(num) => { + vec![format!("String({num})")] + } TypeSignature::Numeric(num) => { - vec![format!("Numeric({})", num)] + vec![format!("Numeric({num})")] } TypeSignature::Exact(types) | TypeSignature::Coercible(types) => { vec![Self::join_types(types, ", ")] @@ -280,6 +288,14 @@ impl Signature { } } + /// A specified number of numeric arguments + pub fn string(arg_count: usize, volatility: Volatility) -> Self { + Self { + type_signature: TypeSignature::String(arg_count), + volatility, + } + } + /// An arbitrary number of arguments of any type. pub fn variadic_any(volatility: Volatility) -> Self { Self { diff --git a/datafusion/expr-common/src/type_coercion/binary.rs b/datafusion/expr-common/src/type_coercion/binary.rs index e66a9ae1ea98..6d66b8b4df44 100644 --- a/datafusion/expr-common/src/type_coercion/binary.rs +++ b/datafusion/expr-common/src/type_coercion/binary.rs @@ -471,10 +471,16 @@ fn type_union_resolution_coercion( let new_value_type = type_union_resolution_coercion(value_type, other_type); new_value_type.map(|t| DataType::Dictionary(index_type.clone(), Box::new(t))) } + (DataType::List(lhs), DataType::List(rhs)) => { + let new_item_type = + type_union_resolution_coercion(lhs.data_type(), rhs.data_type()); + new_item_type.map(|t| DataType::List(Arc::new(Field::new("item", t, true)))) + } _ => { // numeric coercion is the same as comparison coercion, both find the narrowest type // that can accommodate both types binary_numeric_coercion(lhs_type, rhs_type) + .or_else(|| temporal_coercion_nonstrict_timezone(lhs_type, rhs_type)) .or_else(|| string_coercion(lhs_type, rhs_type)) .or_else(|| numeric_string_coercion(lhs_type, rhs_type)) } @@ -507,22 +513,6 @@ pub fn comparison_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option Option { - if lhs_type == rhs_type { - // same type => equality is possible - return Some(lhs_type.clone()); - } - binary_numeric_coercion(lhs_type, rhs_type) - .or_else(|| temporal_coercion_nonstrict_timezone(lhs_type, rhs_type)) - .or_else(|| string_coercion(lhs_type, rhs_type)) - .or_else(|| binary_coercion(lhs_type, rhs_type)) -} - /// Coerce `lhs_type` and `rhs_type` to a common type for the purposes of a comparison operation /// where one is numeric and one is `Utf8`/`LargeUtf8`. fn string_numeric_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option { @@ -969,7 +959,7 @@ fn string_concat_internal_coercion( /// based on the observation that StringArray to StringViewArray is cheap but not vice versa. /// /// Between Utf8 and LargeUtf8, we coerce to LargeUtf8. -fn string_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option { +pub fn string_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option { use arrow::datatypes::DataType::*; match (lhs_type, rhs_type) { // If Utf8View is in any side, we coerce to Utf8View. diff --git a/datafusion/expr/Cargo.toml b/datafusion/expr/Cargo.toml index 55387fea22ee..d7dc1afe4d50 100644 --- a/datafusion/expr/Cargo.toml +++ b/datafusion/expr/Cargo.toml @@ -48,6 +48,7 @@ datafusion-expr-common = { workspace = true } datafusion-functions-aggregate-common = { workspace = true } datafusion-functions-window-common = { workspace = true } datafusion-physical-expr-common = { workspace = true } +indexmap = { workspace = true } paste = "^1.0" serde_json = { workspace = true } sqlparser = { workspace = true } diff --git a/datafusion/expr/src/lib.rs b/datafusion/expr/src/lib.rs index ff9c37559dc4..014b271453ed 100644 --- a/datafusion/expr/src/lib.rs +++ b/datafusion/expr/src/lib.rs @@ -34,6 +34,7 @@ mod partition_evaluator; mod table_source; mod udaf; mod udf; +mod udf_docs; mod udwf; pub mod conditional_expressions; @@ -91,9 +92,12 @@ pub use logical_plan::*; pub use partition_evaluator::PartitionEvaluator; pub use sqlparser; pub use table_source::{TableProviderFilterPushDown, TableSource, TableType}; -pub use udaf::{AggregateUDF, AggregateUDFImpl, ReversedUDAF, StatisticsArgs}; -pub use udf::{ScalarUDF, ScalarUDFImpl}; -pub use udwf::{ReversedUDWF, WindowUDF, WindowUDFImpl}; +pub use udaf::{ + aggregate_doc_sections, AggregateUDF, AggregateUDFImpl, ReversedUDAF, StatisticsArgs, +}; +pub use udf::{scalar_doc_sections, ScalarUDF, ScalarUDFImpl}; +pub use udf_docs::{DocSection, Documentation, DocumentationBuilder}; +pub use udwf::{window_doc_sections, ReversedUDWF, WindowUDF, WindowUDFImpl}; pub use window_frame::{WindowFrame, WindowFrameBound, WindowFrameUnits}; #[cfg(test)] diff --git a/datafusion/expr/src/logical_plan/builder.rs b/datafusion/expr/src/logical_plan/builder.rs index cc8ddf8ec8e8..da2a96327ce5 100644 --- a/datafusion/expr/src/logical_plan/builder.rs +++ b/datafusion/expr/src/logical_plan/builder.rs @@ -35,7 +35,6 @@ use crate::logical_plan::{ Projection, Repartition, Sort, SubqueryAlias, TableScan, Union, Unnest, Values, Window, }; -use crate::type_coercion::binary::values_coercion; use crate::utils::{ can_hash, columnize_expr, compare_sort_expr, expr_to_columns, find_valid_equijoin_key_pair, group_window_expr_by_sort_keys, @@ -53,6 +52,7 @@ use datafusion_common::{ plan_err, Column, DFSchema, DFSchemaRef, DataFusionError, Result, ScalarValue, TableReference, ToDFSchema, UnnestOptions, }; +use datafusion_expr_common::type_coercion::binary::type_union_resolution; use super::dml::InsertOp; use super::plan::{ColumnUnnestList, ColumnUnnestType}; @@ -209,7 +209,8 @@ impl LogicalPlanBuilder { } if let Some(prev_type) = common_type { // get common type of each column values. - let Some(new_type) = values_coercion(&data_type, &prev_type) else { + let data_types = vec![prev_type.clone(), data_type.clone()]; + let Some(new_type) = type_union_resolution(&data_types) else { return plan_err!("Inconsistent data type across values list at row {i} column {j}. Was {prev_type} but found {data_type}"); }; common_type = Some(new_type); diff --git a/datafusion/expr/src/logical_plan/extension.rs b/datafusion/expr/src/logical_plan/extension.rs index d49c85fb6fd6..19d4cb3db9ce 100644 --- a/datafusion/expr/src/logical_plan/extension.rs +++ b/datafusion/expr/src/logical_plan/extension.rs @@ -195,6 +195,16 @@ pub trait UserDefinedLogicalNode: fmt::Debug + Send + Sync { /// directly because it must remain object safe. fn dyn_eq(&self, other: &dyn UserDefinedLogicalNode) -> bool; fn dyn_ord(&self, other: &dyn UserDefinedLogicalNode) -> Option; + + /// Returns `true` if a limit can be safely pushed down through this + /// `UserDefinedLogicalNode` node. + /// + /// If this method returns `true`, and the query plan contains a limit at + /// the output of this node, DataFusion will push the limit to the input + /// of this node. + fn supports_limit_pushdown(&self) -> bool { + false + } } impl Hash for dyn UserDefinedLogicalNode { @@ -295,6 +305,16 @@ pub trait UserDefinedLogicalNodeCore: ) -> Option>> { None } + + /// Returns `true` if a limit can be safely pushed down through this + /// `UserDefinedLogicalNode` node. + /// + /// If this method returns `true`, and the query plan contains a limit at + /// the output of this node, DataFusion will push the limit to the input + /// of this node. + fn supports_limit_pushdown(&self) -> bool { + false // Disallow limit push-down by default + } } /// Automatically derive UserDefinedLogicalNode to `UserDefinedLogicalNode` @@ -361,6 +381,10 @@ impl UserDefinedLogicalNode for T { .downcast_ref::() .and_then(|other| self.partial_cmp(other)) } + + fn supports_limit_pushdown(&self) -> bool { + self.supports_limit_pushdown() + } } fn get_all_columns_from_schema(schema: &DFSchema) -> HashSet { diff --git a/datafusion/expr/src/logical_plan/plan.rs b/datafusion/expr/src/logical_plan/plan.rs index 443d23804adb..0292274e57ee 100644 --- a/datafusion/expr/src/logical_plan/plan.rs +++ b/datafusion/expr/src/logical_plan/plan.rs @@ -21,7 +21,7 @@ use std::cmp::Ordering; use std::collections::{HashMap, HashSet}; use std::fmt::{self, Debug, Display, Formatter}; use std::hash::{Hash, Hasher}; -use std::sync::Arc; +use std::sync::{Arc, OnceLock}; use super::dml::CopyTo; use super::DdlStatement; @@ -51,6 +51,7 @@ use datafusion_common::{ DFSchema, DFSchemaRef, DataFusionError, Dependency, FunctionalDependence, FunctionalDependencies, ParamValues, Result, TableReference, UnnestOptions, }; +use indexmap::IndexSet; // backwards compatibility use crate::display::PgJsonVisitor; @@ -2964,6 +2965,15 @@ impl Aggregate { .into_iter() .map(|(q, f)| (q, f.as_ref().clone().with_nullable(true).into())) .collect::>(); + qualified_fields.push(( + None, + Field::new( + Self::INTERNAL_GROUPING_ID, + Self::grouping_id_type(qualified_fields.len()), + false, + ) + .into(), + )); } qualified_fields.extend(exprlist_to_fields(aggr_expr.as_slice(), &input)?); @@ -3015,9 +3025,19 @@ impl Aggregate { }) } + fn is_grouping_set(&self) -> bool { + matches!(self.group_expr.as_slice(), [Expr::GroupingSet(_)]) + } + /// Get the output expressions. fn output_expressions(&self) -> Result> { + static INTERNAL_ID_EXPR: OnceLock = OnceLock::new(); let mut exprs = grouping_set_to_exprlist(self.group_expr.as_slice())?; + if self.is_grouping_set() { + exprs.push(INTERNAL_ID_EXPR.get_or_init(|| { + Expr::Column(Column::from_name(Self::INTERNAL_GROUPING_ID)) + })); + } exprs.extend(self.aggr_expr.iter()); debug_assert!(exprs.len() == self.schema.fields().len()); Ok(exprs) @@ -3029,6 +3049,41 @@ impl Aggregate { pub fn group_expr_len(&self) -> Result { grouping_set_expr_count(&self.group_expr) } + + /// Returns the data type of the grouping id. + /// The grouping ID value is a bitmask where each set bit + /// indicates that the corresponding grouping expression is + /// null + pub fn grouping_id_type(group_exprs: usize) -> DataType { + if group_exprs <= 8 { + DataType::UInt8 + } else if group_exprs <= 16 { + DataType::UInt16 + } else if group_exprs <= 32 { + DataType::UInt32 + } else { + DataType::UInt64 + } + } + + /// Internal column used when the aggregation is a grouping set. + /// + /// This column contains a bitmask where each bit represents a grouping + /// expression. The least significant bit corresponds to the rightmost + /// grouping expression. A bit value of 0 indicates that the corresponding + /// column is included in the grouping set, while a value of 1 means it is excluded. + /// + /// For example, for the grouping expressions CUBE(a, b), the grouping ID + /// column will have the following values: + /// 0b00: Both `a` and `b` are included + /// 0b01: `b` is excluded + /// 0b10: `a` is excluded + /// 0b11: Both `a` and `b` are excluded + /// + /// This internal column is necessary because excluded columns are replaced + /// with `NULL` values. To handle these cases correctly, we must distinguish + /// between an actual `NULL` value in a column and a column being excluded from the set. + pub const INTERNAL_GROUPING_ID: &'static str = "__grouping_id"; } // Manual implementation needed because of `schema` field. Comparison excludes this field. @@ -3071,6 +3126,8 @@ fn calc_func_dependencies_for_aggregate( let group_by_expr_names = group_expr .iter() .map(|item| item.schema_name().to_string()) + .collect::>() + .into_iter() .collect::>(); let aggregate_func_dependencies = aggregate_functional_dependencies( input.schema(), diff --git a/datafusion/expr/src/type_coercion/functions.rs b/datafusion/expr/src/type_coercion/functions.rs index d30d202df050..143e00fa409e 100644 --- a/datafusion/expr/src/type_coercion/functions.rs +++ b/datafusion/expr/src/type_coercion/functions.rs @@ -26,8 +26,9 @@ use datafusion_common::{ utils::{coerced_fixed_size_list_to_list, list_ndims}, Result, }; -use datafusion_expr_common::signature::{ - ArrayFunctionSignature, FIXED_SIZE_LIST_WILDCARD, TIMEZONE_WILDCARD, +use datafusion_expr_common::{ + signature::{ArrayFunctionSignature, FIXED_SIZE_LIST_WILDCARD, TIMEZONE_WILDCARD}, + type_coercion::binary::string_coercion, }; use std::sync::Arc; @@ -167,6 +168,21 @@ pub fn data_types( try_coerce_types(valid_types, current_types, &signature.type_signature) } +fn is_well_supported_signature(type_signature: &TypeSignature) -> bool { + if let TypeSignature::OneOf(signatures) = type_signature { + return signatures.iter().all(is_well_supported_signature); + } + + matches!( + type_signature, + TypeSignature::UserDefined + | TypeSignature::Numeric(_) + | TypeSignature::String(_) + | TypeSignature::Coercible(_) + | TypeSignature::Any(_) + ) +} + fn try_coerce_types( valid_types: Vec>, current_types: &[DataType], @@ -175,14 +191,7 @@ fn try_coerce_types( let mut valid_types = valid_types; // Well-supported signature that returns exact valid types. - if !valid_types.is_empty() - && matches!( - type_signature, - TypeSignature::UserDefined - | TypeSignature::Numeric(_) - | TypeSignature::Coercible(_) - ) - { + if !valid_types.is_empty() && is_well_supported_signature(type_signature) { // exact valid types assert_eq!(valid_types.len(), 1); let valid_types = valid_types.swap_remove(0); @@ -374,6 +383,67 @@ fn get_valid_types( .iter() .map(|valid_type| current_types.iter().map(|_| valid_type.clone()).collect()) .collect(), + TypeSignature::String(number) => { + if *number < 1 { + return plan_err!( + "The signature expected at least one argument but received {}", + current_types.len() + ); + } + if *number != current_types.len() { + return plan_err!( + "The signature expected {} arguments but received {}", + number, + current_types.len() + ); + } + + fn coercion_rule( + lhs_type: &DataType, + rhs_type: &DataType, + ) -> Result { + match (lhs_type, rhs_type) { + (DataType::Null, DataType::Null) => Ok(DataType::Utf8), + (DataType::Null, data_type) | (data_type, DataType::Null) => { + coercion_rule(data_type, &DataType::Utf8) + } + (DataType::Dictionary(_, lhs), DataType::Dictionary(_, rhs)) => { + coercion_rule(lhs, rhs) + } + (DataType::Dictionary(_, v), other) + | (other, DataType::Dictionary(_, v)) => coercion_rule(v, other), + _ => { + if let Some(coerced_type) = string_coercion(lhs_type, rhs_type) { + Ok(coerced_type) + } else { + plan_err!( + "{} and {} are not coercible to a common string type", + lhs_type, + rhs_type + ) + } + } + } + } + + // Length checked above, safe to unwrap + let mut coerced_type = current_types.first().unwrap().to_owned(); + for t in current_types.iter().skip(1) { + coerced_type = coercion_rule(&coerced_type, t)?; + } + + fn base_type_or_default_type(data_type: &DataType) -> DataType { + if data_type.is_null() { + DataType::Utf8 + } else if let DataType::Dictionary(_, v) = data_type { + base_type_or_default_type(v) + } else { + data_type.to_owned() + } + } + + vec![vec![base_type_or_default_type(&coerced_type); *number]] + } TypeSignature::Numeric(number) => { if *number < 1 { return plan_err!( @@ -602,89 +672,48 @@ fn coerced_from<'a>( Some(type_into.clone()) } // coerced into type_into - (Int8, _) if matches!(type_from, Null | Int8) => Some(type_into.clone()), - (Int16, _) if matches!(type_from, Null | Int8 | Int16 | UInt8) => { - Some(type_into.clone()) - } - (Int32, _) - if matches!(type_from, Null | Int8 | Int16 | Int32 | UInt8 | UInt16) => - { - Some(type_into.clone()) - } - (Int64, _) - if matches!( - type_from, - Null | Int8 | Int16 | Int32 | Int64 | UInt8 | UInt16 | UInt32 - ) => - { - Some(type_into.clone()) - } - (UInt8, _) if matches!(type_from, Null | UInt8) => Some(type_into.clone()), - (UInt16, _) if matches!(type_from, Null | UInt8 | UInt16) => { - Some(type_into.clone()) - } - (UInt32, _) if matches!(type_from, Null | UInt8 | UInt16 | UInt32) => { - Some(type_into.clone()) - } - (UInt64, _) if matches!(type_from, Null | UInt8 | UInt16 | UInt32 | UInt64) => { - Some(type_into.clone()) - } - (Float32, _) - if matches!( - type_from, - Null | Int8 - | Int16 - | Int32 - | Int64 - | UInt8 - | UInt16 - | UInt32 - | UInt64 - | Float32 - ) => - { - Some(type_into.clone()) - } - (Float64, _) - if matches!( - type_from, - Null | Int8 - | Int16 - | Int32 - | Int64 - | UInt8 - | UInt16 - | UInt32 - | UInt64 - | Float32 - | Float64 - | Decimal128(_, _) - ) => - { - Some(type_into.clone()) - } - (Timestamp(TimeUnit::Nanosecond, None), _) - if matches!( - type_from, - Null | Timestamp(_, None) | Date32 | Utf8 | LargeUtf8 - ) => - { - Some(type_into.clone()) - } - (Interval(_), _) if matches!(type_from, Utf8 | LargeUtf8) => { + (Int8, Null | Int8) => Some(type_into.clone()), + (Int16, Null | Int8 | Int16 | UInt8) => Some(type_into.clone()), + (Int32, Null | Int8 | Int16 | Int32 | UInt8 | UInt16) => Some(type_into.clone()), + (Int64, Null | Int8 | Int16 | Int32 | Int64 | UInt8 | UInt16 | UInt32) => { Some(type_into.clone()) } + (UInt8, Null | UInt8) => Some(type_into.clone()), + (UInt16, Null | UInt8 | UInt16) => Some(type_into.clone()), + (UInt32, Null | UInt8 | UInt16 | UInt32) => Some(type_into.clone()), + (UInt64, Null | UInt8 | UInt16 | UInt32 | UInt64) => Some(type_into.clone()), + ( + Float32, + Null | Int8 | Int16 | Int32 | Int64 | UInt8 | UInt16 | UInt32 | UInt64 + | Float32, + ) => Some(type_into.clone()), + ( + Float64, + Null + | Int8 + | Int16 + | Int32 + | Int64 + | UInt8 + | UInt16 + | UInt32 + | UInt64 + | Float32 + | Float64 + | Decimal128(_, _), + ) => Some(type_into.clone()), + ( + Timestamp(TimeUnit::Nanosecond, None), + Null | Timestamp(_, None) | Date32 | Utf8 | LargeUtf8, + ) => Some(type_into.clone()), + (Interval(_), Utf8 | LargeUtf8) => Some(type_into.clone()), // We can go into a Utf8View from a Utf8 or LargeUtf8 - (Utf8View, _) if matches!(type_from, Utf8 | LargeUtf8 | Null) => { - Some(type_into.clone()) - } + (Utf8View, Utf8 | LargeUtf8 | Null) => Some(type_into.clone()), // Any type can be coerced into strings (Utf8 | LargeUtf8, _) => Some(type_into.clone()), (Null, _) if can_cast_types(type_from, type_into) => Some(type_into.clone()), - (List(_), _) if matches!(type_from, FixedSizeList(_, _)) => { - Some(type_into.clone()) - } + (List(_), FixedSizeList(_, _)) => Some(type_into.clone()), // Only accept list and largelist with the same number of dimensions unless the type is Null. // List or LargeList with different dimensions should be handled in TypeSignature or other places before this @@ -695,18 +724,16 @@ fn coerced_from<'a>( Some(type_into.clone()) } // should be able to coerce wildcard fixed size list to non wildcard fixed size list - (FixedSizeList(f_into, FIXED_SIZE_LIST_WILDCARD), _) => match type_from { - FixedSizeList(f_from, size_from) => { - match coerced_from(f_into.data_type(), f_from.data_type()) { - Some(data_type) if &data_type != f_into.data_type() => { - let new_field = - Arc::new(f_into.as_ref().clone().with_data_type(data_type)); - Some(FixedSizeList(new_field, *size_from)) - } - Some(_) => Some(FixedSizeList(Arc::clone(f_into), *size_from)), - _ => None, - } + ( + FixedSizeList(f_into, FIXED_SIZE_LIST_WILDCARD), + FixedSizeList(f_from, size_from), + ) => match coerced_from(f_into.data_type(), f_from.data_type()) { + Some(data_type) if &data_type != f_into.data_type() => { + let new_field = + Arc::new(f_into.as_ref().clone().with_data_type(data_type)); + Some(FixedSizeList(new_field, *size_from)) } + Some(_) => Some(FixedSizeList(Arc::clone(f_into), *size_from)), _ => None, }, (Timestamp(unit, Some(tz)), _) if tz.as_ref() == TIMEZONE_WILDCARD => { @@ -721,12 +748,7 @@ fn coerced_from<'a>( _ => None, } } - (Timestamp(_, Some(_)), _) - if matches!( - type_from, - Null | Timestamp(_, _) | Date32 | Utf8 | LargeUtf8 - ) => - { + (Timestamp(_, Some(_)), Null | Timestamp(_, _) | Date32 | Utf8 | LargeUtf8) => { Some(type_into.clone()) } _ => None, diff --git a/datafusion/expr/src/udaf.rs b/datafusion/expr/src/udaf.rs index 780ea36910a4..6e48054bcf3d 100644 --- a/datafusion/expr/src/udaf.rs +++ b/datafusion/expr/src/udaf.rs @@ -36,8 +36,8 @@ use crate::function::{ use crate::groups_accumulator::GroupsAccumulator; use crate::utils::format_state_name; use crate::utils::AggregateOrderSensitivity; -use crate::Signature; use crate::{Accumulator, Expr}; +use crate::{Documentation, Signature}; /// Logical representation of a user-defined [aggregate function] (UDAF). /// @@ -275,6 +275,14 @@ impl AggregateUDF { pub fn default_value(&self, data_type: &DataType) -> Result { self.inner.default_value(data_type) } + + /// Returns the documentation for this Aggregate UDF. + /// + /// Documentation can be accessed programmatically as well as + /// generating publicly facing documentation. + pub fn documentation(&self) -> Option<&Documentation> { + self.inner.documentation() + } } impl From for AggregateUDF @@ -299,25 +307,42 @@ where /// # Basic Example /// ``` /// # use std::any::Any; +/// # use std::sync::OnceLock; /// # use arrow::datatypes::DataType; /// # use datafusion_common::{DataFusionError, plan_err, Result}; -/// # use datafusion_expr::{col, ColumnarValue, Signature, Volatility, Expr}; +/// # use datafusion_expr::{col, ColumnarValue, Signature, Volatility, Expr, Documentation}; /// # use datafusion_expr::{AggregateUDFImpl, AggregateUDF, Accumulator, function::{AccumulatorArgs, StateFieldsArgs}}; +/// # use datafusion_expr::window_doc_sections::DOC_SECTION_AGGREGATE; /// # use arrow::datatypes::Schema; /// # use arrow::datatypes::Field; +/// /// #[derive(Debug, Clone)] /// struct GeoMeanUdf { -/// signature: Signature +/// signature: Signature, /// } /// /// impl GeoMeanUdf { /// fn new() -> Self { /// Self { -/// signature: Signature::uniform(1, vec![DataType::Float64], Volatility::Immutable) +/// signature: Signature::uniform(1, vec![DataType::Float64], Volatility::Immutable), /// } /// } /// } /// +/// static DOCUMENTATION: OnceLock = OnceLock::new(); +/// +/// fn get_doc() -> &'static Documentation { +/// DOCUMENTATION.get_or_init(|| { +/// Documentation::builder() +/// .with_doc_section(DOC_SECTION_AGGREGATE) +/// .with_description("calculates a geometric mean") +/// .with_syntax_example("geo_mean(2.0)") +/// .with_argument("arg1", "The Float64 number for the geometric mean") +/// .build() +/// .unwrap() +/// }) +/// } +/// /// /// Implement the AggregateUDFImpl trait for GeoMeanUdf /// impl AggregateUDFImpl for GeoMeanUdf { /// fn as_any(&self) -> &dyn Any { self } @@ -325,7 +350,7 @@ where /// fn signature(&self) -> &Signature { &self.signature } /// fn return_type(&self, args: &[DataType]) -> Result { /// if !matches!(args.get(0), Some(&DataType::Float64)) { -/// return plan_err!("add_one only accepts Float64 arguments"); +/// return plan_err!("geo_mean only accepts Float64 arguments"); /// } /// Ok(DataType::Float64) /// } @@ -337,6 +362,9 @@ where /// Field::new("ordering", DataType::UInt32, true) /// ]) /// } +/// fn documentation(&self) -> Option<&Documentation> { +/// Some(get_doc()) +/// } /// } /// /// // Create a new AggregateUDF from the implementation @@ -603,6 +631,14 @@ pub trait AggregateUDFImpl: Debug + Send + Sync { fn default_value(&self, data_type: &DataType) -> Result { ScalarValue::try_from(data_type) } + + /// Returns the documentation for this Aggregate UDF. + /// + /// Documentation can be accessed programmatically as well as + /// generating publicly facing documentation. + fn documentation(&self) -> Option<&Documentation> { + None + } } impl PartialEq for dyn AggregateUDFImpl { @@ -749,6 +785,41 @@ impl AggregateUDFImpl for AliasedAggregateUDFImpl { fn is_descending(&self) -> Option { self.inner.is_descending() } + + fn documentation(&self) -> Option<&Documentation> { + self.inner.documentation() + } +} + +// Aggregate UDF doc sections for use in public documentation +pub mod aggregate_doc_sections { + use crate::DocSection; + + pub fn doc_sections() -> Vec { + vec![ + DOC_SECTION_GENERAL, + DOC_SECTION_STATISTICAL, + DOC_SECTION_APPROXIMATE, + ] + } + + pub const DOC_SECTION_GENERAL: DocSection = DocSection { + include: true, + label: "General Functions", + description: None, + }; + + pub const DOC_SECTION_STATISTICAL: DocSection = DocSection { + include: true, + label: "Statistical Functions", + description: None, + }; + + pub const DOC_SECTION_APPROXIMATE: DocSection = DocSection { + include: true, + label: "Approximate Functions", + description: None, + }; } #[cfg(test)] diff --git a/datafusion/expr/src/udf.rs b/datafusion/expr/src/udf.rs index 938e1181d85d..3759fb18f56d 100644 --- a/datafusion/expr/src/udf.rs +++ b/datafusion/expr/src/udf.rs @@ -20,7 +20,9 @@ use crate::expr::schema_name_from_exprs_comma_seperated_without_space; use crate::simplify::{ExprSimplifyResult, SimplifyInfo}; use crate::sort_properties::{ExprProperties, SortProperties}; -use crate::{ColumnarValue, Expr, ScalarFunctionImplementation, Signature}; +use crate::{ + ColumnarValue, Documentation, Expr, ScalarFunctionImplementation, Signature, +}; use arrow::datatypes::DataType; use datafusion_common::{not_impl_err, ExprSchema, Result}; use datafusion_expr_common::interval_arithmetic::Interval; @@ -274,6 +276,14 @@ impl ScalarUDF { pub fn coerce_types(&self, arg_types: &[DataType]) -> Result> { self.inner.coerce_types(arg_types) } + + /// Returns the documentation for this Scalar UDF. + /// + /// Documentation can be accessed programmatically as well as + /// generating publicly facing documentation. + pub fn documentation(&self) -> Option<&Documentation> { + self.inner.documentation() + } } impl From for ScalarUDF @@ -298,22 +308,39 @@ where /// # Basic Example /// ``` /// # use std::any::Any; +/// # use std::sync::OnceLock; /// # use arrow::datatypes::DataType; /// # use datafusion_common::{DataFusionError, plan_err, Result}; -/// # use datafusion_expr::{col, ColumnarValue, Signature, Volatility}; +/// # use datafusion_expr::{col, ColumnarValue, Documentation, Signature, Volatility}; /// # use datafusion_expr::{ScalarUDFImpl, ScalarUDF}; +/// # use datafusion_expr::scalar_doc_sections::DOC_SECTION_MATH; +/// /// #[derive(Debug)] /// struct AddOne { -/// signature: Signature +/// signature: Signature, /// } /// /// impl AddOne { /// fn new() -> Self { /// Self { -/// signature: Signature::uniform(1, vec![DataType::Int32], Volatility::Immutable) +/// signature: Signature::uniform(1, vec![DataType::Int32], Volatility::Immutable), /// } /// } /// } +/// +/// static DOCUMENTATION: OnceLock = OnceLock::new(); +/// +/// fn get_doc() -> &'static Documentation { +/// DOCUMENTATION.get_or_init(|| { +/// Documentation::builder() +/// .with_doc_section(DOC_SECTION_MATH) +/// .with_description("Add one to an int32") +/// .with_syntax_example("add_one(2)") +/// .with_argument("arg1", "The int32 number to add one to") +/// .build() +/// .unwrap() +/// }) +/// } /// /// /// Implement the ScalarUDFImpl trait for AddOne /// impl ScalarUDFImpl for AddOne { @@ -328,6 +355,9 @@ where /// } /// // The actual implementation would add one to the argument /// fn invoke(&self, args: &[ColumnarValue]) -> Result { unimplemented!() } +/// fn documentation(&self) -> Option<&Documentation> { +/// Some(get_doc()) +/// } /// } /// /// // Create a new ScalarUDF from the implementation @@ -596,6 +626,14 @@ pub trait ScalarUDFImpl: Debug + Send + Sync { self.signature().hash(hasher); hasher.finish() } + + /// Returns the documentation for this Scalar UDF. + /// + /// Documentation can be accessed programmatically as well as + /// generating publicly facing documentation. + fn documentation(&self) -> Option<&Documentation> { + None + } } /// ScalarUDF that adds an alias to the underlying function. It is better to @@ -709,4 +747,100 @@ impl ScalarUDFImpl for AliasedScalarUDFImpl { self.aliases.hash(hasher); hasher.finish() } + + fn documentation(&self) -> Option<&Documentation> { + self.inner.documentation() + } +} + +// Scalar UDF doc sections for use in public documentation +pub mod scalar_doc_sections { + use crate::DocSection; + + pub fn doc_sections() -> Vec { + vec![ + DOC_SECTION_MATH, + DOC_SECTION_CONDITIONAL, + DOC_SECTION_STRING, + DOC_SECTION_BINARY_STRING, + DOC_SECTION_REGEX, + DOC_SECTION_DATETIME, + DOC_SECTION_ARRAY, + DOC_SECTION_STRUCT, + DOC_SECTION_MAP, + DOC_SECTION_HASHING, + DOC_SECTION_OTHER, + ] + } + + pub const DOC_SECTION_MATH: DocSection = DocSection { + include: true, + label: "Math Functions", + description: None, + }; + + pub const DOC_SECTION_CONDITIONAL: DocSection = DocSection { + include: true, + label: "Conditional Functions", + description: None, + }; + + pub const DOC_SECTION_STRING: DocSection = DocSection { + include: true, + label: "String Functions", + description: None, + }; + + pub const DOC_SECTION_BINARY_STRING: DocSection = DocSection { + include: true, + label: "Binary String Functions", + description: None, + }; + + pub const DOC_SECTION_REGEX: DocSection = DocSection { + include: true, + label: "Regular Expression Functions", + description: Some( + r#"Apache DataFusion uses a [PCRE-like](https://en.wikibooks.org/wiki/Regular_Expressions/Perl-Compatible_Regular_Expressions) +regular expression [syntax](https://docs.rs/regex/latest/regex/#syntax) +(minus support for several features including look-around and backreferences). +The following regular expression functions are supported:"#, + ), + }; + + pub const DOC_SECTION_DATETIME: DocSection = DocSection { + include: true, + label: "Time and Date Functions", + description: None, + }; + + pub const DOC_SECTION_ARRAY: DocSection = DocSection { + include: true, + label: "Array Functions", + description: None, + }; + + pub const DOC_SECTION_STRUCT: DocSection = DocSection { + include: true, + label: "Struct Functions", + description: None, + }; + + pub const DOC_SECTION_MAP: DocSection = DocSection { + include: true, + label: "Map Functions", + description: None, + }; + + pub const DOC_SECTION_HASHING: DocSection = DocSection { + include: true, + label: "Hashing Functions", + description: None, + }; + + pub const DOC_SECTION_OTHER: DocSection = DocSection { + include: true, + label: "Other Functions", + description: None, + }; } diff --git a/datafusion/expr/src/udf_docs.rs b/datafusion/expr/src/udf_docs.rs new file mode 100644 index 000000000000..e8245588d945 --- /dev/null +++ b/datafusion/expr/src/udf_docs.rs @@ -0,0 +1,211 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use datafusion_common::exec_err; +use datafusion_common::Result; + +/// Documentation for use by [`ScalarUDFImpl`](crate::ScalarUDFImpl), +/// [`AggregateUDFImpl`](crate::AggregateUDFImpl) and [`WindowUDFImpl`](crate::WindowUDFImpl) functions +/// that will be used to generate public documentation. +/// +/// The name of the udf will be pulled from the [`ScalarUDFImpl::name`](crate::ScalarUDFImpl::name), +/// [`AggregateUDFImpl::name`](crate::AggregateUDFImpl::name) or [`WindowUDFImpl::name`](crate::WindowUDFImpl::name) +/// function as appropriate. +/// +/// All strings in the documentation are required to be +/// in [markdown format](https://www.markdownguide.org/basic-syntax/). +/// +/// Currently, documentation only supports a single language +/// thus all text should be in English. +#[derive(Debug, Clone)] +pub struct Documentation { + /// the section in the documentation where the UDF will be documented + pub doc_section: DocSection, + /// the description for the UDF + pub description: String, + /// a brief example of the syntax. For example "ascii(str)" + pub syntax_example: String, + /// a sql example for the UDF, usually in the form of a sql prompt + /// query and output. It is strongly recommended to provide an + /// example for anything but the most basic UDF's + pub sql_example: Option, + /// arguments for the UDF which will be displayed in array order. + /// Left member of a pair is the argument name, right is a + /// description for the argument + pub arguments: Option>, + /// related functions if any. Values should match the related + /// udf's name exactly. Related udf's must be of the same + /// UDF type (scalar, aggregate or window) for proper linking to + /// occur + pub related_udfs: Option>, +} + +impl Documentation { + /// Returns a new [`DocumentationBuilder`] with no options set. + pub fn builder() -> DocumentationBuilder { + DocumentationBuilder::new() + } +} + +#[derive(Debug, Clone, PartialEq)] +pub struct DocSection { + /// true to include this doc section in the public + /// documentation, false otherwise + pub include: bool, + /// a display label for the doc section. For example: "Math Expressions" + pub label: &'static str, + /// an optional description for the doc section + pub description: Option<&'static str>, +} + +/// A builder to be used for building [`Documentation`]'s. +/// +/// Example: +/// +/// ```rust +/// # use datafusion_expr::Documentation; +/// # use datafusion_expr::scalar_doc_sections::DOC_SECTION_MATH; +/// # use datafusion_common::Result; +/// # +/// # fn main() -> Result<()> { +/// let documentation = Documentation::builder() +/// .with_doc_section(DOC_SECTION_MATH) +/// .with_description("Add one to an int32") +/// .with_syntax_example("add_one(2)") +/// .with_argument("arg_1", "The int32 number to add one to") +/// .build()?; +/// Ok(()) +/// # } +pub struct DocumentationBuilder { + pub doc_section: Option, + pub description: Option, + pub syntax_example: Option, + pub sql_example: Option, + pub arguments: Option>, + pub related_udfs: Option>, +} + +impl DocumentationBuilder { + pub fn new() -> Self { + Self { + doc_section: None, + description: None, + syntax_example: None, + sql_example: None, + arguments: None, + related_udfs: None, + } + } + + pub fn with_doc_section(mut self, doc_section: DocSection) -> Self { + self.doc_section = Some(doc_section); + self + } + + pub fn with_description(mut self, description: impl Into) -> Self { + self.description = Some(description.into()); + self + } + + pub fn with_syntax_example(mut self, syntax_example: impl Into) -> Self { + self.syntax_example = Some(syntax_example.into()); + self + } + + pub fn with_sql_example(mut self, sql_example: impl Into) -> Self { + self.sql_example = Some(sql_example.into()); + self + } + + /// Adds documentation for a specific argument to the documentation. + /// + /// Arguments are displayed in the order they are added. + pub fn with_argument( + mut self, + arg_name: impl Into, + arg_description: impl Into, + ) -> Self { + let mut args = self.arguments.unwrap_or_default(); + args.push((arg_name.into(), arg_description.into())); + self.arguments = Some(args); + self + } + + /// Add a standard "expression" argument to the documentation + /// + /// This is similar to [`Self::with_argument`] except that a standard + /// description is appended to the end: `"Can be a constant, column, or + /// function, and any combination of arithmetic operators."` + /// + /// The argument is rendered like + /// + /// ```text + /// : + /// expression to operate on. Can be a constant, column, or function, and any combination of arithmetic operators. + /// ``` + pub fn with_standard_argument( + self, + arg_name: impl Into, + expression_type: impl AsRef, + ) -> Self { + let expression_type = expression_type.as_ref(); + self.with_argument(arg_name, format!("{expression_type} expression to operate on. Can be a constant, column, or function, and any combination of operators.")) + } + + pub fn with_related_udf(mut self, related_udf: impl Into) -> Self { + let mut related = self.related_udfs.unwrap_or_default(); + related.push(related_udf.into()); + self.related_udfs = Some(related); + self + } + + pub fn build(self) -> Result { + let Self { + doc_section, + description, + syntax_example, + sql_example, + arguments, + related_udfs, + } = self; + + if doc_section.is_none() { + return exec_err!("Documentation must have a doc section"); + } + if description.is_none() { + return exec_err!("Documentation must have a description"); + } + if syntax_example.is_none() { + return exec_err!("Documentation must have a syntax_example"); + } + + Ok(Documentation { + doc_section: doc_section.unwrap(), + description: description.unwrap(), + syntax_example: syntax_example.unwrap(), + sql_example, + arguments, + related_udfs, + }) + } +} + +impl Default for DocumentationBuilder { + fn default() -> Self { + Self::new() + } +} diff --git a/datafusion/expr/src/udwf.rs b/datafusion/expr/src/udwf.rs index 678a0b62cd9a..69f357d48f8c 100644 --- a/datafusion/expr/src/udwf.rs +++ b/datafusion/expr/src/udwf.rs @@ -33,7 +33,8 @@ use datafusion_functions_window_common::field::WindowUDFFieldArgs; use crate::expr::WindowFunction; use crate::{ - function::WindowFunctionSimplification, Expr, PartitionEvaluator, Signature, + function::WindowFunctionSimplification, Documentation, Expr, PartitionEvaluator, + Signature, }; /// Logical representation of a user-defined window function (UDWF) @@ -180,6 +181,14 @@ impl WindowUDF { pub fn reverse_expr(&self) -> ReversedUDWF { self.inner.reverse_expr() } + + /// Returns the documentation for this Window UDF. + /// + /// Documentation can be accessed programmatically as well as + /// generating publicly facing documentation. + pub fn documentation(&self) -> Option<&Documentation> { + self.inner.documentation() + } } impl From for WindowUDF @@ -204,30 +213,47 @@ where /// # Basic Example /// ``` /// # use std::any::Any; +/// # use std::sync::OnceLock; /// # use arrow::datatypes::{DataType, Field}; /// # use datafusion_common::{DataFusionError, plan_err, Result}; -/// # use datafusion_expr::{col, Signature, Volatility, PartitionEvaluator, WindowFrame, ExprFunctionExt}; +/// # use datafusion_expr::{col, Signature, Volatility, PartitionEvaluator, WindowFrame, ExprFunctionExt, Documentation}; /// # use datafusion_expr::{WindowUDFImpl, WindowUDF}; -/// use datafusion_functions_window_common::field::WindowUDFFieldArgs; +/// # use datafusion_expr::window_doc_sections::DOC_SECTION_ANALYTICAL; +/// # use datafusion_functions_window_common::field::WindowUDFFieldArgs; +/// /// #[derive(Debug, Clone)] /// struct SmoothIt { -/// signature: Signature +/// signature: Signature, /// } /// /// impl SmoothIt { /// fn new() -> Self { /// Self { -/// signature: Signature::uniform(1, vec![DataType::Int32], Volatility::Immutable) +/// signature: Signature::uniform(1, vec![DataType::Int32], Volatility::Immutable), /// } /// } /// } /// -/// /// Implement the WindowUDFImpl trait for AddOne +/// static DOCUMENTATION: OnceLock = OnceLock::new(); +/// +/// fn get_doc() -> &'static Documentation { +/// DOCUMENTATION.get_or_init(|| { +/// Documentation::builder() +/// .with_doc_section(DOC_SECTION_ANALYTICAL) +/// .with_description("smooths the windows") +/// .with_syntax_example("smooth_it(2)") +/// .with_argument("arg1", "The int32 number to smooth by") +/// .build() +/// .unwrap() +/// }) +/// } +/// +/// /// Implement the WindowUDFImpl trait for SmoothIt /// impl WindowUDFImpl for SmoothIt { /// fn as_any(&self) -> &dyn Any { self } /// fn name(&self) -> &str { "smooth_it" } /// fn signature(&self) -> &Signature { &self.signature } -/// // The actual implementation would add one to the argument +/// // The actual implementation would smooth the window /// fn partition_evaluator(&self) -> Result> { unimplemented!() } /// fn field(&self, field_args: WindowUDFFieldArgs) -> Result { /// if let Some(DataType::Int32) = field_args.get_input_type(0) { @@ -236,6 +262,9 @@ where /// plan_err!("smooth_it only accepts Int32 arguments") /// } /// } +/// fn documentation(&self) -> Option<&Documentation> { +/// Some(get_doc()) +/// } /// } /// /// // Create a new WindowUDF from the implementation @@ -326,6 +355,10 @@ pub trait WindowUDFImpl: Debug + Send + Sync { } /// The [`Field`] of the final result of evaluating this window function. + /// + /// Call `field_args.name()` to get the fully qualified name for defining + /// the [`Field`]. For a complete example see the implementation in the + /// [Basic Example](WindowUDFImpl#basic-example) section. fn field(&self, field_args: WindowUDFFieldArgs) -> Result; /// Allows the window UDF to define a custom result ordering. @@ -365,6 +398,14 @@ pub trait WindowUDFImpl: Debug + Send + Sync { fn reverse_expr(&self) -> ReversedUDWF { ReversedUDWF::NotSupported } + + /// Returns the documentation for this Window UDF. + /// + /// Documentation can be accessed programmatically as well as + /// generating publicly facing documentation. + fn documentation(&self) -> Option<&Documentation> { + None + } } pub enum ReversedUDWF { @@ -465,6 +506,41 @@ impl WindowUDFImpl for AliasedWindowUDFImpl { fn coerce_types(&self, arg_types: &[DataType]) -> Result> { self.inner.coerce_types(arg_types) } + + fn documentation(&self) -> Option<&Documentation> { + self.inner.documentation() + } +} + +// Window UDF doc sections for use in public documentation +pub mod window_doc_sections { + use crate::DocSection; + + pub fn doc_sections() -> Vec { + vec![ + DOC_SECTION_AGGREGATE, + DOC_SECTION_RANKING, + DOC_SECTION_ANALYTICAL, + ] + } + + pub const DOC_SECTION_AGGREGATE: DocSection = DocSection { + include: true, + label: "Aggregate Functions", + description: Some("All aggregate functions can be used as window functions."), + }; + + pub const DOC_SECTION_RANKING: DocSection = DocSection { + include: true, + label: "Ranking Functions", + description: None, + }; + + pub const DOC_SECTION_ANALYTICAL: DocSection = DocSection { + include: true, + label: "Analytical Functions", + description: None, + }; } #[cfg(test)] diff --git a/datafusion/expr/src/utils.rs b/datafusion/expr/src/utils.rs index 1d8eb9445eda..02b36d0feab9 100644 --- a/datafusion/expr/src/utils.rs +++ b/datafusion/expr/src/utils.rs @@ -19,6 +19,7 @@ use std::cmp::Ordering; use std::collections::{HashMap, HashSet}; +use std::ops::Deref; use std::sync::Arc; use crate::expr::{Alias, Sort, WildcardOptions, WindowFunction}; @@ -38,6 +39,7 @@ use datafusion_common::{ DataFusionError, Result, TableReference, }; +use indexmap::IndexSet; use sqlparser::ast::{ExceptSelectItem, ExcludeSelectItem}; pub use datafusion_functions_aggregate_common::order::AggregateOrderSensitivity; @@ -65,9 +67,10 @@ pub fn grouping_set_expr_count(group_expr: &[Expr]) -> Result { "Invalid group by expressions, GroupingSet must be the only expression" ); } - Ok(grouping_set.distinct_expr().len()) + // Groupings sets have an additional interal column for the grouping id + Ok(grouping_set.distinct_expr().len() + 1) } else { - Ok(group_expr.len()) + grouping_set_to_exprlist(group_expr).map(|exprs| exprs.len()) } } @@ -260,7 +263,11 @@ pub fn grouping_set_to_exprlist(group_expr: &[Expr]) -> Result> { } Ok(grouping_set.distinct_expr()) } else { - Ok(group_expr.iter().collect()) + Ok(group_expr + .iter() + .collect::>() + .into_iter() + .collect()) } } @@ -758,6 +765,15 @@ pub fn find_base_plan(input: &LogicalPlan) -> &LogicalPlan { match input { LogicalPlan::Window(window) => find_base_plan(&window.input), LogicalPlan::Aggregate(agg) => find_base_plan(&agg.input), + // [SqlToRel::try_process_unnest] will convert Expr(Unnest(Expr)) to Projection/Unnest/Projection + // We should expand the wildcard expression based on the input plan of the inner Projection. + LogicalPlan::Unnest(unnest) => { + if let LogicalPlan::Projection(projection) = unnest.input.deref() { + find_base_plan(&projection.input) + } else { + input + } + } LogicalPlan::Filter(filter) => { if filter.having { // If a filter is used for a having clause, its input plan is an aggregation. diff --git a/datafusion/functions-aggregate/Cargo.toml b/datafusion/functions-aggregate/Cargo.toml index d78f68a2604e..37e4c7f4a5ad 100644 --- a/datafusion/functions-aggregate/Cargo.toml +++ b/datafusion/functions-aggregate/Cargo.toml @@ -48,9 +48,9 @@ datafusion-functions-aggregate-common = { workspace = true } datafusion-physical-expr = { workspace = true } datafusion-physical-expr-common = { workspace = true } half = { workspace = true } +indexmap = { workspace = true } log = { workspace = true } paste = "1.0.14" -sqlparser = { workspace = true } [dev-dependencies] arrow = { workspace = true, features = ["test_utils"] } diff --git a/datafusion/functions-aggregate/src/bit_and_or_xor.rs b/datafusion/functions-aggregate/src/bit_and_or_xor.rs index aa65062e3330..c5382c168f17 100644 --- a/datafusion/functions-aggregate/src/bit_and_or_xor.rs +++ b/datafusion/functions-aggregate/src/bit_and_or_xor.rs @@ -35,11 +35,14 @@ use datafusion_expr::function::{AccumulatorArgs, StateFieldsArgs}; use datafusion_expr::type_coercion::aggregates::INTEGERS; use datafusion_expr::utils::format_state_name; use datafusion_expr::{ - Accumulator, AggregateUDFImpl, GroupsAccumulator, ReversedUDAF, Signature, Volatility, + Accumulator, AggregateUDFImpl, Documentation, GroupsAccumulator, ReversedUDAF, + Signature, Volatility, }; +use datafusion_expr::aggregate_doc_sections::DOC_SECTION_GENERAL; use datafusion_functions_aggregate_common::aggregate::groups_accumulator::prim_op::PrimitiveGroupsAccumulator; use std::ops::{BitAndAssign, BitOrAssign, BitXorAssign}; +use std::sync::OnceLock; /// This macro helps create group accumulators based on bitwise operations typically used internally /// and might not be necessary for users to call directly. @@ -110,8 +113,9 @@ macro_rules! downcast_bitwise_accumulator { /// `EXPR_FN` identifier used to name the generated expression function. /// `AGGREGATE_UDF_FN` is an identifier used to name the underlying UDAF function. /// `OPR_TYPE` is an expression that evaluates to the type of bitwise operation to be performed. +/// `DOCUMENTATION` documentation for the UDAF macro_rules! make_bitwise_udaf_expr_and_func { - ($EXPR_FN:ident, $AGGREGATE_UDF_FN:ident, $OPR_TYPE:expr) => { + ($EXPR_FN:ident, $AGGREGATE_UDF_FN:ident, $OPR_TYPE:expr, $DOCUMENTATION:expr) => { make_udaf_expr!( $EXPR_FN, expr_x, @@ -125,14 +129,73 @@ macro_rules! make_bitwise_udaf_expr_and_func { create_func!( $EXPR_FN, $AGGREGATE_UDF_FN, - BitwiseOperation::new($OPR_TYPE, stringify!($EXPR_FN)) + BitwiseOperation::new($OPR_TYPE, stringify!($EXPR_FN), $DOCUMENTATION) ); }; } -make_bitwise_udaf_expr_and_func!(bit_and, bit_and_udaf, BitwiseOperationType::And); -make_bitwise_udaf_expr_and_func!(bit_or, bit_or_udaf, BitwiseOperationType::Or); -make_bitwise_udaf_expr_and_func!(bit_xor, bit_xor_udaf, BitwiseOperationType::Xor); +static BIT_AND_DOC: OnceLock = OnceLock::new(); + +fn get_bit_and_doc() -> &'static Documentation { + BIT_AND_DOC.get_or_init(|| { + Documentation::builder() + .with_doc_section(DOC_SECTION_GENERAL) + .with_description("Computes the bitwise AND of all non-null input values.") + .with_syntax_example("bit_and(expression)") + .with_standard_argument("expression", "Integer") + .build() + .unwrap() + }) +} + +static BIT_OR_DOC: OnceLock = OnceLock::new(); + +fn get_bit_or_doc() -> &'static Documentation { + BIT_OR_DOC.get_or_init(|| { + Documentation::builder() + .with_doc_section(DOC_SECTION_GENERAL) + .with_description("Computes the bitwise OR of all non-null input values.") + .with_syntax_example("bit_or(expression)") + .with_standard_argument("expression", "Integer") + .build() + .unwrap() + }) +} + +static BIT_XOR_DOC: OnceLock = OnceLock::new(); + +fn get_bit_xor_doc() -> &'static Documentation { + BIT_XOR_DOC.get_or_init(|| { + Documentation::builder() + .with_doc_section(DOC_SECTION_GENERAL) + .with_description( + "Computes the bitwise exclusive OR of all non-null input values.", + ) + .with_syntax_example("bit_xor(expression)") + .with_standard_argument("expression", "Integer") + .build() + .unwrap() + }) +} + +make_bitwise_udaf_expr_and_func!( + bit_and, + bit_and_udaf, + BitwiseOperationType::And, + get_bit_and_doc() +); +make_bitwise_udaf_expr_and_func!( + bit_or, + bit_or_udaf, + BitwiseOperationType::Or, + get_bit_or_doc() +); +make_bitwise_udaf_expr_and_func!( + bit_xor, + bit_xor_udaf, + BitwiseOperationType::Xor, + get_bit_xor_doc() +); /// The different types of bitwise operations that can be performed. #[derive(Debug, Clone, Eq, PartialEq)] @@ -155,14 +218,20 @@ struct BitwiseOperation { /// `operation` indicates the type of bitwise operation to be performed. operation: BitwiseOperationType, func_name: &'static str, + documentation: &'static Documentation, } impl BitwiseOperation { - pub fn new(operator: BitwiseOperationType, func_name: &'static str) -> Self { + pub fn new( + operator: BitwiseOperationType, + func_name: &'static str, + documentation: &'static Documentation, + ) -> Self { Self { operation: operator, signature: Signature::uniform(1, INTEGERS.to_vec(), Volatility::Immutable), func_name, + documentation, } } } @@ -239,6 +308,10 @@ impl AggregateUDFImpl for BitwiseOperation { fn reverse_expr(&self) -> ReversedUDAF { ReversedUDAF::Identical } + + fn documentation(&self) -> Option<&Documentation> { + Some(self.documentation) + } } struct BitAndAccumulator { diff --git a/datafusion/functions-aggregate/src/macros.rs b/datafusion/functions-aggregate/src/macros.rs index 573b9fd5bdb2..ffb5183278e6 100644 --- a/datafusion/functions-aggregate/src/macros.rs +++ b/datafusion/functions-aggregate/src/macros.rs @@ -15,23 +15,6 @@ // specific language governing permissions and limitations // under the License. -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - macro_rules! make_udaf_expr { ($EXPR_FN:ident, $($arg:ident)*, $DOC:expr, $AGGREGATE_UDF_FN:ident) => { // "fluent expr_fn" style function diff --git a/datafusion/functions-aggregate/src/min_max.rs b/datafusion/functions-aggregate/src/min_max.rs index 1ce1abe09ea8..e0b029f0909d 100644 --- a/datafusion/functions-aggregate/src/min_max.rs +++ b/datafusion/functions-aggregate/src/min_max.rs @@ -1240,26 +1240,24 @@ impl Accumulator for SlidingMinAccumulator { } } -// -// Moving min and moving max -// The implementation is taken from https://github.com/spebern/moving_min_max/blob/master/src/lib.rs. - -// Keep track of the minimum or maximum value in a sliding window. -// -// `moving min max` provides one data structure for keeping track of the -// minimum value and one for keeping track of the maximum value in a sliding -// window. -// -// Each element is stored with the current min/max. One stack to push and another one for pop. If pop stack is empty, -// push to this stack all elements popped from first stack while updating their current min/max. Now pop from -// the second stack (MovingMin/Max struct works as a queue). To find the minimum element of the queue, -// look at the smallest/largest two elements of the individual stacks, then take the minimum of those two values. -// -// The complexity of the operations are -// - O(1) for getting the minimum/maximum -// - O(1) for push -// - amortized O(1) for pop - +/// Keep track of the minimum value in a sliding window. +/// +/// The implementation is taken from +/// +/// `moving min max` provides one data structure for keeping track of the +/// minimum value and one for keeping track of the maximum value in a sliding +/// window. +/// +/// Each element is stored with the current min/max. One stack to push and another one for pop. If pop stack is empty, +/// push to this stack all elements popped from first stack while updating their current min/max. Now pop from +/// the second stack (MovingMin/Max struct works as a queue). To find the minimum element of the queue, +/// look at the smallest/largest two elements of the individual stacks, then take the minimum of those two values. +/// +/// The complexity of the operations are +/// - O(1) for getting the minimum/maximum +/// - O(1) for push +/// - amortized O(1) for pop +/// /// ``` /// # use datafusion_functions_aggregate::min_max::MovingMin; /// let mut moving_min = MovingMin::::new(); @@ -1375,6 +1373,11 @@ impl MovingMin { self.len() == 0 } } + +/// Keep track of the maximum value in a sliding window. +/// +/// See [`MovingMin`] for more details. +/// /// ``` /// # use datafusion_functions_aggregate::min_max::MovingMax; /// let mut moving_max = MovingMax::::new(); diff --git a/datafusion/functions-aggregate/src/variance.rs b/datafusion/functions-aggregate/src/variance.rs index 3648ec0d1312..49a30344c212 100644 --- a/datafusion/functions-aggregate/src/variance.rs +++ b/datafusion/functions-aggregate/src/variance.rs @@ -18,22 +18,24 @@ //! [`VarianceSample`]: variance sample aggregations. //! [`VariancePopulation`]: variance population aggregations. -use std::{fmt::Debug, sync::Arc}; - use arrow::{ array::{Array, ArrayRef, BooleanArray, Float64Array, UInt64Array}, buffer::NullBuffer, compute::kernels::cast, datatypes::{DataType, Field}, }; +use std::sync::OnceLock; +use std::{fmt::Debug, sync::Arc}; use datafusion_common::{ downcast_value, not_impl_err, plan_err, DataFusionError, Result, ScalarValue, }; +use datafusion_expr::aggregate_doc_sections::DOC_SECTION_GENERAL; use datafusion_expr::{ function::{AccumulatorArgs, StateFieldsArgs}, utils::format_state_name, - Accumulator, AggregateUDFImpl, GroupsAccumulator, Signature, Volatility, + Accumulator, AggregateUDFImpl, Documentation, GroupsAccumulator, Signature, + Volatility, }; use datafusion_functions_aggregate_common::{ aggregate::groups_accumulator::accumulate::accumulate, stats::StatsType, @@ -135,6 +137,26 @@ impl AggregateUDFImpl for VarianceSample { ) -> Result> { Ok(Box::new(VarianceGroupsAccumulator::new(StatsType::Sample))) } + + fn documentation(&self) -> Option<&Documentation> { + Some(get_variance_sample_doc()) + } +} + +static VARIANCE_SAMPLE_DOC: OnceLock = OnceLock::new(); + +fn get_variance_sample_doc() -> &'static Documentation { + VARIANCE_SAMPLE_DOC.get_or_init(|| { + Documentation::builder() + .with_doc_section(DOC_SECTION_GENERAL) + .with_description( + "Returns the statistical sample variance of a set of numbers.", + ) + .with_syntax_example("var(expression)") + .with_standard_argument("expression", "Numeric") + .build() + .unwrap() + }) } pub struct VariancePopulation { @@ -222,6 +244,25 @@ impl AggregateUDFImpl for VariancePopulation { StatsType::Population, ))) } + fn documentation(&self) -> Option<&Documentation> { + Some(get_variance_population_doc()) + } +} + +static VARIANCE_POPULATION_DOC: OnceLock = OnceLock::new(); + +fn get_variance_population_doc() -> &'static Documentation { + VARIANCE_POPULATION_DOC.get_or_init(|| { + Documentation::builder() + .with_doc_section(DOC_SECTION_GENERAL) + .with_description( + "Returns the statistical population variance of a set of numbers.", + ) + .with_syntax_example("var_pop(expression)") + .with_standard_argument("expression", "Numeric") + .build() + .unwrap() + }) } /// An accumulator to compute variance diff --git a/datafusion/functions-nested/src/make_array.rs b/datafusion/functions-nested/src/make_array.rs index 79858041d3ca..51fc71e6b09d 100644 --- a/datafusion/functions-nested/src/make_array.rs +++ b/datafusion/functions-nested/src/make_array.rs @@ -17,6 +17,7 @@ //! [`ScalarUDFImpl`] definitions for `make_array` function. +use std::vec; use std::{any::Any, sync::Arc}; use arrow::array::{ArrayData, Capacities, MutableArrayData}; @@ -26,9 +27,8 @@ use arrow_array::{ use arrow_buffer::OffsetBuffer; use arrow_schema::DataType::{LargeList, List, Null}; use arrow_schema::{DataType, Field}; -use datafusion_common::internal_err; use datafusion_common::{plan_err, utils::array_into_list_array_nullable, Result}; -use datafusion_expr::type_coercion::binary::comparison_coercion; +use datafusion_expr::binary::type_union_resolution; use datafusion_expr::TypeSignature; use datafusion_expr::{ColumnarValue, ScalarUDFImpl, Signature, Volatility}; @@ -82,19 +82,12 @@ impl ScalarUDFImpl for MakeArray { match arg_types.len() { 0 => Ok(empty_array_type()), _ => { - let mut expr_type = DataType::Null; - for arg_type in arg_types { - if !arg_type.equals_datatype(&DataType::Null) { - expr_type = arg_type.clone(); - break; - } - } - - if expr_type.is_null() { - expr_type = DataType::Int64; - } - - Ok(List(Arc::new(Field::new("item", expr_type, true)))) + // At this point, all the type in array should be coerced to the same one + Ok(List(Arc::new(Field::new( + "item", + arg_types[0].to_owned(), + true, + )))) } } } @@ -112,22 +105,21 @@ impl ScalarUDFImpl for MakeArray { } fn coerce_types(&self, arg_types: &[DataType]) -> Result> { - let new_type = arg_types.iter().skip(1).try_fold( - arg_types.first().unwrap().clone(), - |acc, x| { - // The coerced types found by `comparison_coercion` are not guaranteed to be - // coercible for the arguments. `comparison_coercion` returns more loose - // types that can be coerced to both `acc` and `x` for comparison purpose. - // See `maybe_data_types` for the actual coercion. - let coerced_type = comparison_coercion(&acc, x); - if let Some(coerced_type) = coerced_type { - Ok(coerced_type) - } else { - internal_err!("Coercion from {acc:?} to {x:?} failed.") - } - }, - )?; - Ok(vec![new_type; arg_types.len()]) + if let Some(new_type) = type_union_resolution(arg_types) { + if let DataType::FixedSizeList(field, _) = new_type { + Ok(vec![DataType::List(field); arg_types.len()]) + } else if new_type.is_null() { + Ok(vec![DataType::Int64; arg_types.len()]) + } else { + Ok(vec![new_type; arg_types.len()]) + } + } else { + plan_err!( + "Fail to find the valid type between {:?} for {}", + arg_types, + self.name() + ) + } } } diff --git a/datafusion/functions-window/Cargo.toml b/datafusion/functions-window/Cargo.toml index 8dcec6bc964b..952e5720c77c 100644 --- a/datafusion/functions-window/Cargo.toml +++ b/datafusion/functions-window/Cargo.toml @@ -43,6 +43,7 @@ datafusion-expr = { workspace = true } datafusion-functions-window-common = { workspace = true } datafusion-physical-expr-common = { workspace = true } log = { workspace = true } +paste = "1.0.15" [dev-dependencies] arrow = { workspace = true } diff --git a/datafusion/functions-window/src/lib.rs b/datafusion/functions-window/src/lib.rs index 790a500f1f3f..6e98bb091446 100644 --- a/datafusion/functions-window/src/lib.rs +++ b/datafusion/functions-window/src/lib.rs @@ -29,6 +29,8 @@ use log::debug; use datafusion_expr::registry::FunctionRegistry; use datafusion_expr::WindowUDF; +#[macro_use] +pub mod macros; pub mod row_number; /// Fluent-style API for creating `Expr`s diff --git a/datafusion/functions-window/src/macros.rs b/datafusion/functions-window/src/macros.rs new file mode 100644 index 000000000000..843d8ecb38cc --- /dev/null +++ b/datafusion/functions-window/src/macros.rs @@ -0,0 +1,674 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Convenience macros for defining a user-defined window function +//! and associated expression API (fluent style). +//! +//! See [`define_udwf_and_expr!`] for usage examples. +//! +//! [`define_udwf_and_expr!`]: crate::define_udwf_and_expr! + +/// Lazily initializes a user-defined window function exactly once +/// when called concurrently. Repeated calls return a reference to the +/// same instance. +/// +/// # Parameters +/// +/// * `$UDWF`: The struct which defines the [`Signature`](datafusion_expr::Signature) +/// of the user-defined window function. +/// * `$OUT_FN_NAME`: The basename to generate a unique function name like +/// `$OUT_FN_NAME_udwf`. +/// * `$DOC`: Doc comments for UDWF. +/// * (optional) `$CTOR`: Pass a custom constructor. When omitted it +/// automatically resolves to `$UDWF::default()`. +/// +/// # Example +/// +/// ``` +/// # use std::any::Any; +/// # use datafusion_common::arrow::datatypes::{DataType, Field}; +/// # use datafusion_expr::{PartitionEvaluator, Signature, Volatility, WindowUDFImpl}; +/// # +/// # use datafusion_functions_window_common::field::WindowUDFFieldArgs; +/// # use datafusion_functions_window::get_or_init_udwf; +/// # +/// /// Defines the `simple_udwf()` user-defined window function. +/// get_or_init_udwf!( +/// SimpleUDWF, +/// simple, +/// "Simple user-defined window function doc comment." +/// ); +/// # +/// # assert_eq!(simple_udwf().name(), "simple_user_defined_window_function"); +/// # +/// # #[derive(Debug)] +/// # struct SimpleUDWF { +/// # signature: Signature, +/// # } +/// # +/// # impl Default for SimpleUDWF { +/// # fn default() -> Self { +/// # Self { +/// # signature: Signature::any(0, Volatility::Immutable), +/// # } +/// # } +/// # } +/// # +/// # impl WindowUDFImpl for SimpleUDWF { +/// # fn as_any(&self) -> &dyn Any { +/// # self +/// # } +/// # fn name(&self) -> &str { +/// # "simple_user_defined_window_function" +/// # } +/// # fn signature(&self) -> &Signature { +/// # &self.signature +/// # } +/// # fn partition_evaluator( +/// # &self, +/// # ) -> datafusion_common::Result> { +/// # unimplemented!() +/// # } +/// # fn field(&self, field_args: WindowUDFFieldArgs) -> datafusion_common::Result { +/// # Ok(Field::new(field_args.name(), DataType::Int64, false)) +/// # } +/// # } +/// # +/// ``` +#[macro_export] +macro_rules! get_or_init_udwf { + ($UDWF:ident, $OUT_FN_NAME:ident, $DOC:expr) => { + get_or_init_udwf!($UDWF, $OUT_FN_NAME, $DOC, $UDWF::default); + }; + + ($UDWF:ident, $OUT_FN_NAME:ident, $DOC:expr, $CTOR:path) => { + paste::paste! { + #[doc = concat!(" Singleton instance of [`", stringify!($OUT_FN_NAME), "`], ensures the user-defined")] + #[doc = concat!(" window function is only created once.")] + #[allow(non_upper_case_globals)] + static []: std::sync::OnceLock> = + std::sync::OnceLock::new(); + + #[doc = concat!(" Returns a [`WindowUDF`](datafusion_expr::WindowUDF) for [`", stringify!($OUT_FN_NAME), "`].")] + #[doc = ""] + #[doc = concat!(" ", $DOC)] + pub fn [<$OUT_FN_NAME _udwf>]() -> std::sync::Arc { + [] + .get_or_init(|| { + std::sync::Arc::new(datafusion_expr::WindowUDF::from($CTOR())) + }) + .clone() + } + } + }; +} + +/// Create a [`WindowFunction`] expression that exposes a fluent API +/// which you can use to build more complex expressions. +/// +/// [`WindowFunction`]: datafusion_expr::Expr::WindowFunction +/// +/// # Parameters +/// +/// * `$UDWF`: The struct which defines the [`Signature`] of the +/// user-defined window function. +/// * `$OUT_FN_NAME`: The basename to generate a unique function name like +/// `$OUT_FN_NAME_udwf`. +/// * `$DOC`: Doc comments for UDWF. +/// * (optional) `[$($PARAM:ident),+]`: An array of 1 or more parameters +/// for the generated function. The type of parameters is [`Expr`]. +/// When omitted this creates a function with zero parameters. +/// +/// [`Signature`]: datafusion_expr::Signature +/// [`Expr`]: datafusion_expr::Expr +/// +/// # Example +/// +/// 1. With Zero Parameters +/// ``` +/// # use std::any::Any; +/// # use datafusion_common::arrow::datatypes::{DataType, Field}; +/// # use datafusion_expr::{PartitionEvaluator, Signature, Volatility, WindowUDFImpl}; +/// # use datafusion_functions_window::{create_udwf_expr, get_or_init_udwf}; +/// # use datafusion_functions_window_common::field::WindowUDFFieldArgs; +/// # get_or_init_udwf!( +/// # RowNumber, +/// # row_number, +/// # "Returns a unique row number for each row in window partition beginning at 1." +/// # ); +/// /// Creates `row_number()` API which has zero parameters: +/// /// +/// /// ``` +/// /// /// Returns a unique row number for each row in window partition +/// /// /// beginning at 1. +/// /// pub fn row_number() -> datafusion_expr::Expr { +/// /// row_number_udwf().call(vec![]) +/// /// } +/// /// ``` +/// create_udwf_expr!( +/// RowNumber, +/// row_number, +/// "Returns a unique row number for each row in window partition beginning at 1." +/// ); +/// # +/// # assert_eq!( +/// # row_number().name_for_alias().unwrap(), +/// # "row_number() ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING" +/// # ); +/// # +/// # #[derive(Debug)] +/// # struct RowNumber { +/// # signature: Signature, +/// # } +/// # impl Default for RowNumber { +/// # fn default() -> Self { +/// # Self { +/// # signature: Signature::any(0, Volatility::Immutable), +/// # } +/// # } +/// # } +/// # impl WindowUDFImpl for RowNumber { +/// # fn as_any(&self) -> &dyn Any { +/// # self +/// # } +/// # fn name(&self) -> &str { +/// # "row_number" +/// # } +/// # fn signature(&self) -> &Signature { +/// # &self.signature +/// # } +/// # fn partition_evaluator( +/// # &self, +/// # ) -> datafusion_common::Result> { +/// # unimplemented!() +/// # } +/// # fn field(&self, field_args: WindowUDFFieldArgs) -> datafusion_common::Result { +/// # Ok(Field::new(field_args.name(), DataType::UInt64, false)) +/// # } +/// # } +/// ``` +/// +/// 2. With Multiple Parameters +/// ``` +/// # use std::any::Any; +/// # +/// # use datafusion_expr::{ +/// # PartitionEvaluator, Signature, TypeSignature, Volatility, WindowUDFImpl, +/// # }; +/// # +/// # use datafusion_functions_window::{create_udwf_expr, get_or_init_udwf}; +/// # use datafusion_functions_window_common::field::WindowUDFFieldArgs; +/// # +/// # use datafusion_common::arrow::datatypes::Field; +/// # use datafusion_common::ScalarValue; +/// # use datafusion_expr::{col, lit}; +/// # +/// # get_or_init_udwf!(Lead, lead, "user-defined window function"); +/// # +/// /// Creates `lead(expr, offset, default)` with 3 parameters: +/// /// +/// /// ``` +/// /// /// Returns a value evaluated at the row that is offset rows +/// /// /// after the current row within the partition. +/// /// pub fn lead( +/// /// expr: datafusion_expr::Expr, +/// /// offset: datafusion_expr::Expr, +/// /// default: datafusion_expr::Expr, +/// /// ) -> datafusion_expr::Expr { +/// /// lead_udwf().call(vec![expr, offset, default]) +/// /// } +/// /// ``` +/// create_udwf_expr!( +/// Lead, +/// lead, +/// [expr, offset, default], +/// "Returns a value evaluated at the row that is offset rows after the current row within the partition." +/// ); +/// # +/// # assert_eq!( +/// # lead(col("a"), lit(1i64), lit(ScalarValue::Null)) +/// # .name_for_alias() +/// # .unwrap(), +/// # "lead(a,Int64(1),NULL) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING" +/// # ); +/// # +/// # #[derive(Debug)] +/// # struct Lead { +/// # signature: Signature, +/// # } +/// # +/// # impl Default for Lead { +/// # fn default() -> Self { +/// # Self { +/// # signature: Signature::one_of( +/// # vec![ +/// # TypeSignature::Any(1), +/// # TypeSignature::Any(2), +/// # TypeSignature::Any(3), +/// # ], +/// # Volatility::Immutable, +/// # ), +/// # } +/// # } +/// # } +/// # +/// # impl WindowUDFImpl for Lead { +/// # fn as_any(&self) -> &dyn Any { +/// # self +/// # } +/// # fn name(&self) -> &str { +/// # "lead" +/// # } +/// # fn signature(&self) -> &Signature { +/// # &self.signature +/// # } +/// # fn partition_evaluator( +/// # &self, +/// # ) -> datafusion_common::Result> { +/// # unimplemented!() +/// # } +/// # fn field(&self, field_args: WindowUDFFieldArgs) -> datafusion_common::Result { +/// # Ok(Field::new( +/// # field_args.name(), +/// # field_args.get_input_type(0).unwrap(), +/// # false, +/// # )) +/// # } +/// # } +/// ``` +#[macro_export] +macro_rules! create_udwf_expr { + // zero arguments + ($UDWF:ident, $OUT_FN_NAME:ident, $DOC:expr) => { + paste::paste! { + #[doc = " Create a [`WindowFunction`](datafusion_expr::Expr::WindowFunction) expression for"] + #[doc = concat!(" [`", stringify!($UDWF), "`] user-defined window function.")] + #[doc = ""] + #[doc = concat!(" ", $DOC)] + pub fn $OUT_FN_NAME() -> datafusion_expr::Expr { + [<$OUT_FN_NAME _udwf>]().call(vec![]) + } + } + }; + + // 1 or more arguments + ($UDWF:ident, $OUT_FN_NAME:ident, [$($PARAM:ident),+], $DOC:expr) => { + paste::paste! { + #[doc = " Create a [`WindowFunction`](datafusion_expr::Expr::WindowFunction) expression for"] + #[doc = concat!(" [`", stringify!($UDWF), "`] user-defined window function.")] + #[doc = ""] + #[doc = concat!(" ", $DOC)] + pub fn $OUT_FN_NAME( + $($PARAM: datafusion_expr::Expr),+ + ) -> datafusion_expr::Expr { + [<$OUT_FN_NAME _udwf>]() + .call(vec![$($PARAM),+]) + } + } + }; +} + +/// Defines a user-defined window function. +/// +/// Combines [`get_or_init_udwf!`] and [`create_udwf_expr!`] into a +/// single macro for convenience. +/// +/// # Arguments +/// +/// * `$UDWF`: The struct which defines the [`Signature`] of the +/// user-defined window function. +/// * `$OUT_FN_NAME`: The basename to generate a unique function name like +/// `$OUT_FN_NAME_udwf`. +/// * (optional) `[$($PARAM:ident),+]`: An array of 1 or more parameters +/// for the generated function. The type of parameters is [`Expr`]. +/// When omitted this creates a function with zero parameters. +/// * `$DOC`: Doc comments for UDWF. +/// * (optional) `$CTOR`: Pass a custom constructor. When omitted it +/// automatically resolves to `$UDWF::default()`. +/// +/// [`Signature`]: datafusion_expr::Signature +/// [`Expr`]: datafusion_expr::Expr +/// +/// # Usage +/// +/// ## Expression API With Zero parameters +/// 1. Uses default constructor for UDWF. +/// +/// ``` +/// # use std::any::Any; +/// # use datafusion_common::arrow::datatypes::{DataType, Field}; +/// # use datafusion_expr::{PartitionEvaluator, Signature, Volatility, WindowUDFImpl}; +/// # +/// # use datafusion_functions_window_common::field::WindowUDFFieldArgs; +/// # use datafusion_functions_window::{define_udwf_and_expr, get_or_init_udwf, create_udwf_expr}; +/// # +/// /// 1. Defines the `simple_udwf()` user-defined window function. +/// /// +/// /// 2. Defines the expression API: +/// /// ``` +/// /// pub fn simple() -> datafusion_expr::Expr { +/// /// simple_udwf().call(vec![]) +/// /// } +/// /// ``` +/// define_udwf_and_expr!( +/// SimpleUDWF, +/// simple, +/// "a simple user-defined window function" +/// ); +/// # +/// # assert_eq!(simple_udwf().name(), "simple_user_defined_window_function"); +/// # +/// # #[derive(Debug)] +/// # struct SimpleUDWF { +/// # signature: Signature, +/// # } +/// # +/// # impl Default for SimpleUDWF { +/// # fn default() -> Self { +/// # Self { +/// # signature: Signature::any(0, Volatility::Immutable), +/// # } +/// # } +/// # } +/// # +/// # impl WindowUDFImpl for SimpleUDWF { +/// # fn as_any(&self) -> &dyn Any { +/// # self +/// # } +/// # fn name(&self) -> &str { +/// # "simple_user_defined_window_function" +/// # } +/// # fn signature(&self) -> &Signature { +/// # &self.signature +/// # } +/// # fn partition_evaluator( +/// # &self, +/// # ) -> datafusion_common::Result> { +/// # unimplemented!() +/// # } +/// # fn field(&self, field_args: WindowUDFFieldArgs) -> datafusion_common::Result { +/// # Ok(Field::new(field_args.name(), DataType::Int64, false)) +/// # } +/// # } +/// # +/// ``` +/// +/// 2. Uses a custom constructor for UDWF. +/// +/// ``` +/// # use std::any::Any; +/// # use datafusion_common::arrow::datatypes::{DataType, Field}; +/// # use datafusion_expr::{PartitionEvaluator, Signature, Volatility, WindowUDFImpl}; +/// # use datafusion_functions_window::{create_udwf_expr, define_udwf_and_expr, get_or_init_udwf}; +/// # use datafusion_functions_window_common::field::WindowUDFFieldArgs; +/// # +/// /// 1. Defines the `row_number_udwf()` user-defined window function. +/// /// +/// /// 2. Defines the expression API: +/// /// ``` +/// /// pub fn row_number() -> datafusion_expr::Expr { +/// /// row_number_udwf().call(vec![]) +/// /// } +/// /// ``` +/// define_udwf_and_expr!( +/// RowNumber, +/// row_number, +/// "Returns a unique row number for each row in window partition beginning at 1.", +/// RowNumber::new // <-- custom constructor +/// ); +/// # +/// # assert_eq!( +/// # row_number().name_for_alias().unwrap(), +/// # "row_number() ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING" +/// # ); +/// # +/// # #[derive(Debug)] +/// # struct RowNumber { +/// # signature: Signature, +/// # } +/// # impl RowNumber { +/// # fn new() -> Self { +/// # Self { +/// # signature: Signature::any(0, Volatility::Immutable), +/// # } +/// # } +/// # } +/// # impl WindowUDFImpl for RowNumber { +/// # fn as_any(&self) -> &dyn Any { +/// # self +/// # } +/// # fn name(&self) -> &str { +/// # "row_number" +/// # } +/// # fn signature(&self) -> &Signature { +/// # &self.signature +/// # } +/// # fn partition_evaluator( +/// # &self, +/// # ) -> datafusion_common::Result> { +/// # unimplemented!() +/// # } +/// # fn field(&self, field_args: WindowUDFFieldArgs) -> datafusion_common::Result { +/// # Ok(Field::new(field_args.name(), DataType::UInt64, false)) +/// # } +/// # } +/// ``` +/// +/// ## Expression API With Multiple Parameters +/// 3. Uses default constructor for UDWF +/// +/// ``` +/// # use std::any::Any; +/// # +/// # use datafusion_expr::{ +/// # PartitionEvaluator, Signature, TypeSignature, Volatility, WindowUDFImpl, +/// # }; +/// # +/// # use datafusion_functions_window::{create_udwf_expr, define_udwf_and_expr, get_or_init_udwf}; +/// # use datafusion_functions_window_common::field::WindowUDFFieldArgs; +/// # +/// # use datafusion_common::arrow::datatypes::Field; +/// # use datafusion_common::ScalarValue; +/// # use datafusion_expr::{col, lit}; +/// # +/// /// 1. Defines the `lead_udwf()` user-defined window function. +/// /// +/// /// 2. Defines the expression API: +/// /// ``` +/// /// pub fn lead( +/// /// expr: datafusion_expr::Expr, +/// /// offset: datafusion_expr::Expr, +/// /// default: datafusion_expr::Expr, +/// /// ) -> datafusion_expr::Expr { +/// /// lead_udwf().call(vec![expr, offset, default]) +/// /// } +/// /// ``` +/// define_udwf_and_expr!( +/// Lead, +/// lead, +/// [expr, offset, default], // <- 3 parameters +/// "user-defined window function" +/// ); +/// # +/// # assert_eq!( +/// # lead(col("a"), lit(1i64), lit(ScalarValue::Null)) +/// # .name_for_alias() +/// # .unwrap(), +/// # "lead(a,Int64(1),NULL) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING" +/// # ); +/// # +/// # #[derive(Debug)] +/// # struct Lead { +/// # signature: Signature, +/// # } +/// # +/// # impl Default for Lead { +/// # fn default() -> Self { +/// # Self { +/// # signature: Signature::one_of( +/// # vec![ +/// # TypeSignature::Any(1), +/// # TypeSignature::Any(2), +/// # TypeSignature::Any(3), +/// # ], +/// # Volatility::Immutable, +/// # ), +/// # } +/// # } +/// # } +/// # +/// # impl WindowUDFImpl for Lead { +/// # fn as_any(&self) -> &dyn Any { +/// # self +/// # } +/// # fn name(&self) -> &str { +/// # "lead" +/// # } +/// # fn signature(&self) -> &Signature { +/// # &self.signature +/// # } +/// # fn partition_evaluator( +/// # &self, +/// # ) -> datafusion_common::Result> { +/// # unimplemented!() +/// # } +/// # fn field(&self, field_args: WindowUDFFieldArgs) -> datafusion_common::Result { +/// # Ok(Field::new( +/// # field_args.name(), +/// # field_args.get_input_type(0).unwrap(), +/// # false, +/// # )) +/// # } +/// # } +/// ``` +/// 4. Uses custom constructor for UDWF +/// +/// ``` +/// # use std::any::Any; +/// # +/// # use datafusion_expr::{ +/// # PartitionEvaluator, Signature, TypeSignature, Volatility, WindowUDFImpl, +/// # }; +/// # +/// # use datafusion_functions_window::{create_udwf_expr, define_udwf_and_expr, get_or_init_udwf}; +/// # use datafusion_functions_window_common::field::WindowUDFFieldArgs; +/// # +/// # use datafusion_common::arrow::datatypes::Field; +/// # use datafusion_common::ScalarValue; +/// # use datafusion_expr::{col, lit}; +/// # +/// /// 1. Defines the `lead_udwf()` user-defined window function. +/// /// +/// /// 2. Defines the expression API: +/// /// ``` +/// /// pub fn lead( +/// /// expr: datafusion_expr::Expr, +/// /// offset: datafusion_expr::Expr, +/// /// default: datafusion_expr::Expr, +/// /// ) -> datafusion_expr::Expr { +/// /// lead_udwf().call(vec![expr, offset, default]) +/// /// } +/// /// ``` +/// define_udwf_and_expr!( +/// Lead, +/// lead, +/// [expr, offset, default], // <- 3 parameters +/// "user-defined window function", +/// Lead::new // <- Custom constructor +/// ); +/// # +/// # assert_eq!( +/// # lead(col("a"), lit(1i64), lit(ScalarValue::Null)) +/// # .name_for_alias() +/// # .unwrap(), +/// # "lead(a,Int64(1),NULL) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING" +/// # ); +/// # +/// # #[derive(Debug)] +/// # struct Lead { +/// # signature: Signature, +/// # } +/// # +/// # impl Lead { +/// # fn new() -> Self { +/// # Self { +/// # signature: Signature::one_of( +/// # vec![ +/// # TypeSignature::Any(1), +/// # TypeSignature::Any(2), +/// # TypeSignature::Any(3), +/// # ], +/// # Volatility::Immutable, +/// # ), +/// # } +/// # } +/// # } +/// # +/// # impl WindowUDFImpl for Lead { +/// # fn as_any(&self) -> &dyn Any { +/// # self +/// # } +/// # fn name(&self) -> &str { +/// # "lead" +/// # } +/// # fn signature(&self) -> &Signature { +/// # &self.signature +/// # } +/// # fn partition_evaluator( +/// # &self, +/// # ) -> datafusion_common::Result> { +/// # unimplemented!() +/// # } +/// # fn field(&self, field_args: WindowUDFFieldArgs) -> datafusion_common::Result { +/// # Ok(Field::new( +/// # field_args.name(), +/// # field_args.get_input_type(0).unwrap(), +/// # false, +/// # )) +/// # } +/// # } +/// ``` +#[macro_export] +macro_rules! define_udwf_and_expr { + // Defines UDWF with default constructor + // Defines expression API with zero parameters + ($UDWF:ident, $OUT_FN_NAME:ident, $DOC:expr) => { + get_or_init_udwf!($UDWF, $OUT_FN_NAME, $DOC); + create_udwf_expr!($UDWF, $OUT_FN_NAME, $DOC); + }; + + // Defines UDWF by passing a custom constructor + // Defines expression API with zero parameters + ($UDWF:ident, $OUT_FN_NAME:ident, $DOC:expr, $CTOR:path) => { + get_or_init_udwf!($UDWF, $OUT_FN_NAME, $DOC, $CTOR); + create_udwf_expr!($UDWF, $OUT_FN_NAME, $DOC); + }; + + // Defines UDWF with default constructor + // Defines expression API with multiple parameters + ($UDWF:ident, $OUT_FN_NAME:ident, [$($PARAM:ident),+], $DOC:expr) => { + get_or_init_udwf!($UDWF, $OUT_FN_NAME, $DOC); + create_udwf_expr!($UDWF, $OUT_FN_NAME, [$($PARAM),+], $DOC); + }; + + // Defines UDWF by passing a custom constructor + // Defines expression API with multiple parameters + ($UDWF:ident, $OUT_FN_NAME:ident, [$($PARAM:ident),+], $DOC:expr, $CTOR:path) => { + get_or_init_udwf!($UDWF, $OUT_FN_NAME, $DOC, $CTOR); + create_udwf_expr!($UDWF, $OUT_FN_NAME, [$($PARAM),+], $DOC); + }; +} diff --git a/datafusion/functions-window/src/row_number.rs b/datafusion/functions-window/src/row_number.rs index 7f348bf9d2a0..c903f6778ae8 100644 --- a/datafusion/functions-window/src/row_number.rs +++ b/datafusion/functions-window/src/row_number.rs @@ -17,41 +17,28 @@ //! Defines physical expression for `row_number` that can evaluated at runtime during query execution -use std::any::Any; -use std::fmt::Debug; -use std::ops::Range; - use datafusion_common::arrow::array::ArrayRef; use datafusion_common::arrow::array::UInt64Array; use datafusion_common::arrow::compute::SortOptions; use datafusion_common::arrow::datatypes::DataType; use datafusion_common::arrow::datatypes::Field; use datafusion_common::{Result, ScalarValue}; -use datafusion_expr::expr::WindowFunction; -use datafusion_expr::{Expr, PartitionEvaluator, Signature, Volatility, WindowUDFImpl}; +use datafusion_expr::window_doc_sections::DOC_SECTION_RANKING; +use datafusion_expr::{ + Documentation, PartitionEvaluator, Signature, Volatility, WindowUDFImpl, +}; use datafusion_functions_window_common::field; use field::WindowUDFFieldArgs; +use std::any::Any; +use std::fmt::Debug; +use std::ops::Range; +use std::sync::OnceLock; -/// Create a [`WindowFunction`](Expr::WindowFunction) expression for -/// `row_number` user-defined window function. -pub fn row_number() -> Expr { - Expr::WindowFunction(WindowFunction::new(row_number_udwf(), vec![])) -} - -/// Singleton instance of `row_number`, ensures the UDWF is only created once. -#[allow(non_upper_case_globals)] -static STATIC_RowNumber: std::sync::OnceLock> = - std::sync::OnceLock::new(); - -/// Returns a [`WindowUDF`](datafusion_expr::WindowUDF) for `row_number` -/// user-defined window function. -pub fn row_number_udwf() -> std::sync::Arc { - STATIC_RowNumber - .get_or_init(|| { - std::sync::Arc::new(datafusion_expr::WindowUDF::from(RowNumber::default())) - }) - .clone() -} +define_udwf_and_expr!( + RowNumber, + row_number, + "Returns a unique row number for each row in window partition beginning at 1." +); /// row_number expression #[derive(Debug)] @@ -74,6 +61,21 @@ impl Default for RowNumber { } } +static DOCUMENTATION: OnceLock = OnceLock::new(); + +fn get_row_number_doc() -> &'static Documentation { + DOCUMENTATION.get_or_init(|| { + Documentation::builder() + .with_doc_section(DOC_SECTION_RANKING) + .with_description( + "Number of the current row within its partition, counting from 1.", + ) + .with_syntax_example("row_number()") + .build() + .unwrap() + }) +} + impl WindowUDFImpl for RowNumber { fn as_any(&self) -> &dyn Any { self @@ -101,6 +103,10 @@ impl WindowUDFImpl for RowNumber { nulls_first: false, }) } + + fn documentation(&self) -> Option<&Documentation> { + Some(get_row_number_doc()) + } } /// State for the `row_number` built-in window function. diff --git a/datafusion/functions/Cargo.toml b/datafusion/functions/Cargo.toml index ff1b926a9b82..a3d114221d3f 100644 --- a/datafusion/functions/Cargo.toml +++ b/datafusion/functions/Cargo.toml @@ -102,6 +102,11 @@ harness = false name = "to_timestamp" required-features = ["datetime_expressions"] +[[bench]] +harness = false +name = "encoding" +required-features = ["encoding_expressions"] + [[bench]] harness = false name = "regx" diff --git a/datafusion/functions/benches/encoding.rs b/datafusion/functions/benches/encoding.rs new file mode 100644 index 000000000000..d49235aac938 --- /dev/null +++ b/datafusion/functions/benches/encoding.rs @@ -0,0 +1,53 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +extern crate criterion; + +use arrow::util::bench_util::create_string_array_with_len; +use criterion::{black_box, criterion_group, criterion_main, Criterion}; +use datafusion_expr::ColumnarValue; +use datafusion_functions::encoding; +use std::sync::Arc; + +fn criterion_benchmark(c: &mut Criterion) { + let decode = encoding::decode(); + for size in [1024, 4096, 8192] { + let str_array = Arc::new(create_string_array_with_len::(size, 0.2, 32)); + c.bench_function(&format!("base64_decode/{size}"), |b| { + let method = ColumnarValue::Scalar("base64".into()); + let encoded = encoding::encode() + .invoke(&[ColumnarValue::Array(str_array.clone()), method.clone()]) + .unwrap(); + + let args = vec![encoded, method]; + b.iter(|| black_box(decode.invoke(&args).unwrap())) + }); + + c.bench_function(&format!("hex_decode/{size}"), |b| { + let method = ColumnarValue::Scalar("hex".into()); + let encoded = encoding::encode() + .invoke(&[ColumnarValue::Array(str_array.clone()), method.clone()]) + .unwrap(); + + let args = vec![encoded, method]; + b.iter(|| black_box(decode.invoke(&args).unwrap())) + }); + } +} + +criterion_group!(benches, criterion_benchmark); +criterion_main!(benches); diff --git a/datafusion/functions/src/core/coalesce.rs b/datafusion/functions/src/core/coalesce.rs index 4eccb5180e73..8155c04da626 100644 --- a/datafusion/functions/src/core/coalesce.rs +++ b/datafusion/functions/src/core/coalesce.rs @@ -15,17 +15,18 @@ // specific language governing permissions and limitations // under the License. -use std::any::Any; - use arrow::array::{new_null_array, BooleanArray}; use arrow::compute::kernels::zip::zip; use arrow::compute::{and, is_not_null, is_null}; use arrow::datatypes::DataType; use datafusion_common::{exec_err, ExprSchema, Result}; +use datafusion_expr::scalar_doc_sections::DOC_SECTION_CONDITIONAL; use datafusion_expr::type_coercion::binary::type_union_resolution; -use datafusion_expr::{ColumnarValue, Expr, ExprSchemable}; +use datafusion_expr::{ColumnarValue, Documentation, Expr, ExprSchemable}; use datafusion_expr::{ScalarUDFImpl, Signature, Volatility}; use itertools::Itertools; +use std::any::Any; +use std::sync::OnceLock; #[derive(Debug)] pub struct CoalesceFunc { @@ -46,6 +47,23 @@ impl CoalesceFunc { } } +static DOCUMENTATION: OnceLock = OnceLock::new(); + +fn get_coalesce_doc() -> &'static Documentation { + DOCUMENTATION.get_or_init(|| { + Documentation::builder() + .with_doc_section(DOC_SECTION_CONDITIONAL) + .with_description("Returns the first of its arguments that is not _null_. Returns _null_ if all arguments are _null_. This function is often used to substitute a default value for _null_ values.") + .with_syntax_example("coalesce(expression1[, ..., expression_n])") + .with_argument( + "expression1, expression_n", + "Expression to use if previous expressions are _null_. Can be a constant, column, or function, and any combination of arithmetic operators. Pass as many expression arguments as necessary." + ) + .build() + .unwrap() + }) +} + impl ScalarUDFImpl for CoalesceFunc { fn as_any(&self) -> &dyn Any { self @@ -140,6 +158,10 @@ impl ScalarUDFImpl for CoalesceFunc { .unwrap_or(arg_types.first().unwrap().clone()); Ok(vec![new_type; arg_types.len()]) } + + fn documentation(&self) -> Option<&Documentation> { + Some(get_coalesce_doc()) + } } #[cfg(test)] diff --git a/datafusion/functions/src/crypto/sha224.rs b/datafusion/functions/src/crypto/sha224.rs index 2795c4a25004..d603e5bcf295 100644 --- a/datafusion/functions/src/crypto/sha224.rs +++ b/datafusion/functions/src/crypto/sha224.rs @@ -19,13 +19,18 @@ use super::basic::{sha224, utf8_or_binary_to_binary_type}; use arrow::datatypes::DataType; use datafusion_common::Result; -use datafusion_expr::{ColumnarValue, ScalarUDFImpl, Signature, Volatility}; +use datafusion_expr::scalar_doc_sections::DOC_SECTION_HASHING; +use datafusion_expr::{ + ColumnarValue, Documentation, ScalarUDFImpl, Signature, Volatility, +}; use std::any::Any; +use std::sync::OnceLock; #[derive(Debug)] pub struct SHA224Func { signature: Signature, } + impl Default for SHA224Func { fn default() -> Self { Self::new() @@ -44,6 +49,21 @@ impl SHA224Func { } } } + +static DOCUMENTATION: OnceLock = OnceLock::new(); + +fn get_sha224_doc() -> &'static Documentation { + DOCUMENTATION.get_or_init(|| { + Documentation::builder() + .with_doc_section(DOC_SECTION_HASHING) + .with_description("Computes the SHA-224 hash of a binary string.") + .with_syntax_example("sha224(expression)") + .with_standard_argument("expression", "String") + .build() + .unwrap() + }) +} + impl ScalarUDFImpl for SHA224Func { fn as_any(&self) -> &dyn Any { self @@ -60,7 +80,12 @@ impl ScalarUDFImpl for SHA224Func { fn return_type(&self, arg_types: &[DataType]) -> Result { utf8_or_binary_to_binary_type(&arg_types[0], self.name()) } + fn invoke(&self, args: &[ColumnarValue]) -> Result { sha224(args) } + + fn documentation(&self) -> Option<&Documentation> { + Some(get_sha224_doc()) + } } diff --git a/datafusion/functions/src/datetime/to_date.rs b/datafusion/functions/src/datetime/to_date.rs index ae2ef58899f2..fc3f062e12e0 100644 --- a/datafusion/functions/src/datetime/to_date.rs +++ b/datafusion/functions/src/datetime/to_date.rs @@ -15,17 +15,19 @@ // specific language governing permissions and limitations // under the License. -use std::any::Any; - +use crate::datetime::common::*; use arrow::datatypes::DataType; use arrow::datatypes::DataType::Date32; use arrow::error::ArrowError::ParseError; use arrow::{array::types::Date32Type, compute::kernels::cast_utils::Parser}; - -use crate::datetime::common::*; use datafusion_common::error::DataFusionError; use datafusion_common::{arrow_err, exec_err, internal_datafusion_err, Result}; -use datafusion_expr::{ColumnarValue, ScalarUDFImpl, Signature, Volatility}; +use datafusion_expr::scalar_doc_sections::DOC_SECTION_DATETIME; +use datafusion_expr::{ + ColumnarValue, Documentation, ScalarUDFImpl, Signature, Volatility, +}; +use std::any::Any; +use std::sync::OnceLock; #[derive(Debug)] pub struct ToDateFunc { @@ -77,6 +79,50 @@ impl ToDateFunc { } } +static DOCUMENTATION: OnceLock = OnceLock::new(); + +fn get_to_date_doc() -> &'static Documentation { + DOCUMENTATION.get_or_init(|| { + Documentation::builder() + .with_doc_section(DOC_SECTION_DATETIME) + .with_description(r#"Converts a value to a date (`YYYY-MM-DD`). +Supports strings, integer and double types as input. +Strings are parsed as YYYY-MM-DD (e.g. '2023-07-20') if no [Chrono format](https://docs.rs/chrono/latest/chrono/format/strftime/index.html)s are provided. +Integers and doubles are interpreted as days since the unix epoch (`1970-01-01T00:00:00Z`). +Returns the corresponding date. + +Note: `to_date` returns Date32, which represents its values as the number of days since unix epoch(`1970-01-01`) stored as signed 32 bit value. The largest supported date value is `9999-12-31`. +"#) + .with_syntax_example("to_date('2017-05-31', '%Y-%m-%d')") + .with_sql_example(r#"```sql +> select to_date('2023-01-31'); ++-----------------------------+ +| to_date(Utf8("2023-01-31")) | ++-----------------------------+ +| 2023-01-31 | ++-----------------------------+ +> select to_date('2023/01/31', '%Y-%m-%d', '%Y/%m/%d'); ++---------------------------------------------------------------+ +| to_date(Utf8("2023/01/31"),Utf8("%Y-%m-%d"),Utf8("%Y/%m/%d")) | ++---------------------------------------------------------------+ +| 2023-01-31 | ++---------------------------------------------------------------+ +``` + +Additional examples can be found [here](https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/to_date.rs) +"#) + .with_standard_argument("expression", "String") + .with_argument( + "format_n", + "Optional [Chrono format](https://docs.rs/chrono/latest/chrono/format/strftime/index.html) strings to use to parse the expression. Formats will be tried in the order + they appear with the first successful one being returned. If none of the formats successfully parse the expression + an error will be returned.", + ) + .build() + .unwrap() + }) +} + impl ScalarUDFImpl for ToDateFunc { fn as_any(&self) -> &dyn Any { self @@ -117,6 +163,10 @@ impl ScalarUDFImpl for ToDateFunc { } } } + + fn documentation(&self) -> Option<&Documentation> { + Some(get_to_date_doc()) + } } #[cfg(test)] diff --git a/datafusion/functions/src/encoding/inner.rs b/datafusion/functions/src/encoding/inner.rs index 19dfca107a42..a791d77697ab 100644 --- a/datafusion/functions/src/encoding/inner.rs +++ b/datafusion/functions/src/encoding/inner.rs @@ -18,9 +18,12 @@ //! Encoding expressions use arrow::{ - array::{Array, ArrayRef, BinaryArray, OffsetSizeTrait, StringArray}, - datatypes::DataType, + array::{ + Array, ArrayRef, BinaryArray, GenericByteArray, OffsetSizeTrait, StringArray, + }, + datatypes::{ByteArrayType, DataType}, }; +use arrow_buffer::{Buffer, OffsetBufferBuilder}; use base64::{engine::general_purpose, Engine as _}; use datafusion_common::{ cast::{as_generic_binary_array, as_generic_string_array}, @@ -28,10 +31,11 @@ use datafusion_common::{ }; use datafusion_common::{exec_err, ScalarValue}; use datafusion_common::{DataFusionError, Result}; -use datafusion_expr::ColumnarValue; -use std::sync::Arc; +use datafusion_expr::{ColumnarValue, Documentation}; +use std::sync::{Arc, OnceLock}; use std::{fmt, str::FromStr}; +use datafusion_expr::scalar_doc_sections::DOC_SECTION_BINARY_STRING; use datafusion_expr::{ScalarUDFImpl, Signature, Volatility}; use std::any::Any; @@ -54,6 +58,22 @@ impl EncodeFunc { } } +static ENCODE_DOCUMENTATION: OnceLock = OnceLock::new(); + +fn get_encode_doc() -> &'static Documentation { + ENCODE_DOCUMENTATION.get_or_init(|| { + Documentation::builder() + .with_doc_section(DOC_SECTION_BINARY_STRING) + .with_description("Encode binary data into a textual representation.") + .with_syntax_example("encode(expression, format)") + .with_argument("expression", "Expression containing string or binary data") + .with_argument("format", "Supported formats are: `base64`, `hex`") + .with_related_udf("decode") + .build() + .unwrap() + }) +} + impl ScalarUDFImpl for EncodeFunc { fn as_any(&self) -> &dyn Any { self @@ -100,6 +120,10 @@ impl ScalarUDFImpl for EncodeFunc { ), } } + + fn documentation(&self) -> Option<&Documentation> { + Some(get_encode_doc()) + } } #[derive(Debug)] @@ -121,6 +145,22 @@ impl DecodeFunc { } } +static DECODE_DOCUMENTATION: OnceLock = OnceLock::new(); + +fn get_decode_doc() -> &'static Documentation { + DECODE_DOCUMENTATION.get_or_init(|| { + Documentation::builder() + .with_doc_section(DOC_SECTION_BINARY_STRING) + .with_description("Decode binary data from textual representation in string.") + .with_syntax_example("decode(expression, format)") + .with_argument("expression", "Expression containing encoded string data") + .with_argument("format", "Same arguments as [encode](#encode)") + .with_related_udf("encode") + .build() + .unwrap() + }) +} + impl ScalarUDFImpl for DecodeFunc { fn as_any(&self) -> &dyn Any { self @@ -167,6 +207,10 @@ impl ScalarUDFImpl for DecodeFunc { ), } } + + fn documentation(&self) -> Option<&Documentation> { + Some(get_decode_doc()) + } } #[derive(Debug, Copy, Clone)] @@ -245,16 +289,22 @@ fn base64_encode(input: &[u8]) -> String { general_purpose::STANDARD_NO_PAD.encode(input) } -fn hex_decode(input: &[u8]) -> Result> { - hex::decode(input).map_err(|e| { +fn hex_decode(input: &[u8], buf: &mut [u8]) -> Result { + // only write input / 2 bytes to buf + let out_len = input.len() / 2; + let buf = &mut buf[..out_len]; + hex::decode_to_slice(input, buf).map_err(|e| { DataFusionError::Internal(format!("Failed to decode from hex: {}", e)) - }) + })?; + Ok(out_len) } -fn base64_decode(input: &[u8]) -> Result> { - general_purpose::STANDARD_NO_PAD.decode(input).map_err(|e| { - DataFusionError::Internal(format!("Failed to decode from base64: {}", e)) - }) +fn base64_decode(input: &[u8], buf: &mut [u8]) -> Result { + general_purpose::STANDARD_NO_PAD + .decode_slice(input, buf) + .map_err(|e| { + DataFusionError::Internal(format!("Failed to decode from base64: {}", e)) + }) } macro_rules! encode_to_array { @@ -267,14 +317,35 @@ macro_rules! encode_to_array { }}; } -macro_rules! decode_to_array { - ($METHOD: ident, $INPUT:expr) => {{ - let binary_array: BinaryArray = $INPUT - .iter() - .map(|x| x.map(|x| $METHOD(x.as_ref())).transpose()) - .collect::>()?; - Arc::new(binary_array) - }}; +fn decode_to_array( + method: F, + input: &GenericByteArray, + conservative_upper_bound_size: usize, +) -> Result +where + F: Fn(&[u8], &mut [u8]) -> Result, +{ + let mut values = vec![0; conservative_upper_bound_size]; + let mut offsets = OffsetBufferBuilder::new(input.len()); + let mut total_bytes_decoded = 0; + for v in input { + if let Some(v) = v { + let cursor = &mut values[total_bytes_decoded..]; + let decoded = method(v.as_ref(), cursor)?; + total_bytes_decoded += decoded; + offsets.push_length(decoded); + } else { + offsets.push_length(0); + } + } + // We reserved an upper bound size for the values buffer, but we only use the actual size + values.truncate(total_bytes_decoded); + let binary_array = BinaryArray::try_new( + offsets.finish(), + Buffer::from_vec(values), + input.nulls().cloned(), + )?; + Ok(Arc::new(binary_array)) } impl Encoding { @@ -381,10 +452,7 @@ impl Encoding { T: OffsetSizeTrait, { let input_value = as_generic_binary_array::(value)?; - let array: ArrayRef = match self { - Self::Base64 => decode_to_array!(base64_decode, input_value), - Self::Hex => decode_to_array!(hex_decode, input_value), - }; + let array = self.decode_byte_array(input_value)?; Ok(ColumnarValue::Array(array)) } @@ -393,12 +461,29 @@ impl Encoding { T: OffsetSizeTrait, { let input_value = as_generic_string_array::(value)?; - let array: ArrayRef = match self { - Self::Base64 => decode_to_array!(base64_decode, input_value), - Self::Hex => decode_to_array!(hex_decode, input_value), - }; + let array = self.decode_byte_array(input_value)?; Ok(ColumnarValue::Array(array)) } + + fn decode_byte_array( + &self, + input_value: &GenericByteArray, + ) -> Result { + match self { + Self::Base64 => { + let upper_bound = + base64::decoded_len_estimate(input_value.values().len()); + decode_to_array(base64_decode, input_value, upper_bound) + } + Self::Hex => { + // Calculate the upper bound for decoded byte size + // For hex encoding, each pair of hex characters (2 bytes) represents 1 byte when decoded + // So the upper bound is half the length of the input values. + let upper_bound = input_value.values().len() / 2; + decode_to_array(hex_decode, input_value, upper_bound) + } + } + } } impl fmt::Display for Encoding { diff --git a/datafusion/functions/src/lib.rs b/datafusion/functions/src/lib.rs index bb680f3c67de..81be5552666d 100644 --- a/datafusion/functions/src/lib.rs +++ b/datafusion/functions/src/lib.rs @@ -92,9 +92,6 @@ pub mod macros; pub mod string; make_stub_package!(string, "string_expressions"); -#[cfg(feature = "string_expressions")] -mod regexp_common; - /// Core datafusion expressions /// Enabled via feature flag `core_expressions` #[cfg(feature = "core_expressions")] diff --git a/datafusion/functions/src/macros.rs b/datafusion/functions/src/macros.rs index e47818bc86a4..e850673ef8af 100644 --- a/datafusion/functions/src/macros.rs +++ b/datafusion/functions/src/macros.rs @@ -284,7 +284,7 @@ macro_rules! make_math_binary_udf { use arrow::datatypes::DataType; use datafusion_common::{exec_err, DataFusionError, Result}; use datafusion_expr::sort_properties::{ExprProperties, SortProperties}; - use datafusion_expr::TypeSignature::*; + use datafusion_expr::TypeSignature; use datafusion_expr::{ColumnarValue, ScalarUDFImpl, Signature, Volatility}; #[derive(Debug)] @@ -298,8 +298,8 @@ macro_rules! make_math_binary_udf { Self { signature: Signature::one_of( vec![ - Exact(vec![Float32, Float32]), - Exact(vec![Float64, Float64]), + TypeSignature::Exact(vec![Float32, Float32]), + TypeSignature::Exact(vec![Float64, Float64]), ], Volatility::Immutable, ), diff --git a/datafusion/functions/src/math/log.rs b/datafusion/functions/src/math/log.rs index b6f01f5e4659..1ffda8759d7b 100644 --- a/datafusion/functions/src/math/log.rs +++ b/datafusion/functions/src/math/log.rs @@ -18,7 +18,7 @@ //! Math function: `log()`. use std::any::Any; -use std::sync::Arc; +use std::sync::{Arc, OnceLock}; use super::power::PowerFunc; @@ -29,9 +29,12 @@ use datafusion_common::{ ScalarValue, }; use datafusion_expr::expr::ScalarFunction; +use datafusion_expr::scalar_doc_sections::DOC_SECTION_MATH; use datafusion_expr::simplify::{ExprSimplifyResult, SimplifyInfo}; use datafusion_expr::sort_properties::{ExprProperties, SortProperties}; -use datafusion_expr::{lit, ColumnarValue, Expr, ScalarUDF, TypeSignature::*}; +use datafusion_expr::{ + lit, ColumnarValue, Documentation, Expr, ScalarUDF, TypeSignature::*, +}; use datafusion_expr::{ScalarUDFImpl, Signature, Volatility}; #[derive(Debug)] @@ -45,6 +48,22 @@ impl Default for LogFunc { } } +static DOCUMENTATION: OnceLock = OnceLock::new(); + +fn get_log_doc() -> &'static Documentation { + DOCUMENTATION.get_or_init(|| { + Documentation::builder() + .with_doc_section(DOC_SECTION_MATH) + .with_description("Returns the base-x logarithm of a number. Can either provide a specified base, or if omitted then takes the base-10 of a number.") + .with_syntax_example(r#"log(base, numeric_expression) +log(numeric_expression)"#) + .with_standard_argument("base", "Base numeric") + .with_standard_argument("numeric_expression", "Numeric") + .build() + .unwrap() + }) +} + impl LogFunc { pub fn new() -> Self { use DataType::*; @@ -168,6 +187,10 @@ impl ScalarUDFImpl for LogFunc { Ok(ColumnarValue::Array(arr)) } + fn documentation(&self) -> Option<&Documentation> { + Some(get_log_doc()) + } + /// Simplify the `log` function by the relevant rules: /// 1. Log(a, 1) ===> 0 /// 2. Log(a, Power(a, b)) ===> b diff --git a/datafusion/functions/src/math/nans.rs b/datafusion/functions/src/math/nans.rs index 2bd704a7de2e..b02839b40bd9 100644 --- a/datafusion/functions/src/math/nans.rs +++ b/datafusion/functions/src/math/nans.rs @@ -19,10 +19,9 @@ use arrow::datatypes::DataType; use datafusion_common::{exec_err, DataFusionError, Result}; -use datafusion_expr::ColumnarValue; +use datafusion_expr::{ColumnarValue, TypeSignature}; use arrow::array::{ArrayRef, BooleanArray, Float32Array, Float64Array}; -use datafusion_expr::TypeSignature::*; use datafusion_expr::{ScalarUDFImpl, Signature, Volatility}; use std::any::Any; use std::sync::Arc; @@ -43,7 +42,10 @@ impl IsNanFunc { use DataType::*; Self { signature: Signature::one_of( - vec![Exact(vec![Float32]), Exact(vec![Float64])], + vec![ + TypeSignature::Exact(vec![Float32]), + TypeSignature::Exact(vec![Float64]), + ], Volatility::Immutable, ), } diff --git a/datafusion/functions/src/math/power.rs b/datafusion/functions/src/math/power.rs index 5b790fb56ddf..831f983d5916 100644 --- a/datafusion/functions/src/math/power.rs +++ b/datafusion/functions/src/math/power.rs @@ -25,10 +25,9 @@ use datafusion_common::{ }; use datafusion_expr::expr::ScalarFunction; use datafusion_expr::simplify::{ExprSimplifyResult, SimplifyInfo}; -use datafusion_expr::{ColumnarValue, Expr, ScalarUDF}; +use datafusion_expr::{ColumnarValue, Expr, ScalarUDF, TypeSignature}; use arrow::array::{ArrayRef, Float64Array, Int64Array}; -use datafusion_expr::TypeSignature::*; use datafusion_expr::{ScalarUDFImpl, Signature, Volatility}; use std::any::Any; use std::sync::Arc; @@ -52,7 +51,10 @@ impl PowerFunc { use DataType::*; Self { signature: Signature::one_of( - vec![Exact(vec![Int64, Int64]), Exact(vec![Float64, Float64])], + vec![ + TypeSignature::Exact(vec![Int64, Int64]), + TypeSignature::Exact(vec![Float64, Float64]), + ], Volatility::Immutable, ), aliases: vec![String::from("pow")], diff --git a/datafusion/functions/src/regex/regexplike.rs b/datafusion/functions/src/regex/regexplike.rs index 4cff633a1772..61ef35bb6e5a 100644 --- a/datafusion/functions/src/regex/regexplike.rs +++ b/datafusion/functions/src/regex/regexplike.rs @@ -16,7 +16,7 @@ // under the License. //! Regx expressions -use arrow::array::{Array, ArrayRef, OffsetSizeTrait}; +use arrow::array::{Array, ArrayRef, GenericStringArray, OffsetSizeTrait}; use arrow::compute::kernels::regexp; use arrow::datatypes::DataType; use datafusion_common::exec_err; @@ -25,32 +25,71 @@ use datafusion_common::{arrow_datafusion_err, plan_err}; use datafusion_common::{ cast::as_generic_string_array, internal_err, DataFusionError, Result, }; -use datafusion_expr::ColumnarValue; -use datafusion_expr::TypeSignature::*; +use datafusion_expr::scalar_doc_sections::DOC_SECTION_REGEX; +use datafusion_expr::{ColumnarValue, Documentation, TypeSignature}; use datafusion_expr::{ScalarUDFImpl, Signature, Volatility}; use std::any::Any; -use std::sync::Arc; +use std::sync::{Arc, OnceLock}; #[derive(Debug)] pub struct RegexpLikeFunc { signature: Signature, } + impl Default for RegexpLikeFunc { fn default() -> Self { Self::new() } } +static DOCUMENTATION: OnceLock = OnceLock::new(); + +fn get_regexp_like_doc() -> &'static Documentation { + DOCUMENTATION.get_or_init(|| { + Documentation::builder() + .with_doc_section(DOC_SECTION_REGEX) + .with_description("Returns true if a [regular expression](https://docs.rs/regex/latest/regex/#syntax) has at least one match in a string, false otherwise.") + .with_syntax_example("regexp_like(str, regexp[, flags])") + .with_sql_example(r#"```sql +select regexp_like('Köln', '[a-zA-Z]ö[a-zA-Z]{2}'); ++--------------------------------------------------------+ +| regexp_like(Utf8("Köln"),Utf8("[a-zA-Z]ö[a-zA-Z]{2}")) | ++--------------------------------------------------------+ +| true | ++--------------------------------------------------------+ +SELECT regexp_like('aBc', '(b|d)', 'i'); ++--------------------------------------------------+ +| regexp_like(Utf8("aBc"),Utf8("(b|d)"),Utf8("i")) | ++--------------------------------------------------+ +| true | ++--------------------------------------------------+ +``` +Additional examples can be found [here](https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/regexp.rs) +"#) + .with_standard_argument("str", "String") + .with_standard_argument("regexp","Regular") + .with_argument("flags", + r#"Optional regular expression flags that control the behavior of the regular expression. The following flags are supported: + - **i**: case-insensitive: letters match both upper and lower case + - **m**: multi-line mode: ^ and $ match begin/end of line + - **s**: allow . to match \n + - **R**: enables CRLF mode: when multi-line mode is enabled, \r\n is used + - **U**: swap the meaning of x* and x*?"#) + .build() + .unwrap() + }) +} + impl RegexpLikeFunc { pub fn new() -> Self { use DataType::*; Self { signature: Signature::one_of( vec![ - Exact(vec![Utf8, Utf8]), - Exact(vec![LargeUtf8, Utf8]), - Exact(vec![Utf8, Utf8, Utf8]), - Exact(vec![LargeUtf8, Utf8, Utf8]), + TypeSignature::Exact(vec![Utf8, Utf8]), + TypeSignature::Exact(vec![LargeUtf8, LargeUtf8]), + TypeSignature::Exact(vec![Utf8, Utf8, Utf8]), + TypeSignature::Exact(vec![LargeUtf8, LargeUtf8, LargeUtf8]), ], Volatility::Immutable, ), @@ -105,6 +144,10 @@ impl ScalarUDFImpl for RegexpLikeFunc { result.map(ColumnarValue::Array) } } + + fn documentation(&self) -> Option<&Documentation> { + Some(get_regexp_like_doc()) + } } fn regexp_like_func(args: &[ArrayRef]) -> Result { match args[0].data_type() { @@ -162,7 +205,8 @@ pub fn regexp_like(args: &[ArrayRef]) -> Result { 2 => { let values = as_generic_string_array::(&args[0])?; let regex = as_generic_string_array::(&args[1])?; - let array = regexp::regexp_is_match_utf8(values, regex, None) + let flags: Option<&GenericStringArray> = None; + let array = regexp::regexp_is_match(values, regex, flags) .map_err(|e| arrow_datafusion_err!(e))?; Ok(Arc::new(array) as ArrayRef) @@ -176,7 +220,7 @@ pub fn regexp_like(args: &[ArrayRef]) -> Result { return plan_err!("regexp_like() does not support the \"global\" option"); } - let array = regexp::regexp_is_match_utf8(values, regex, Some(flags)) + let array = regexp::regexp_is_match(values, regex, Some(flags)) .map_err(|e| arrow_datafusion_err!(e))?; Ok(Arc::new(array) as ArrayRef) diff --git a/datafusion/functions/src/regex/regexpmatch.rs b/datafusion/functions/src/regex/regexpmatch.rs index a2c314e3761b..714e2c6a5339 100644 --- a/datafusion/functions/src/regex/regexpmatch.rs +++ b/datafusion/functions/src/regex/regexpmatch.rs @@ -26,8 +26,7 @@ use datafusion_common::{arrow_datafusion_err, plan_err}; use datafusion_common::{ cast::as_generic_string_array, internal_err, DataFusionError, Result, }; -use datafusion_expr::ColumnarValue; -use datafusion_expr::TypeSignature::*; +use datafusion_expr::{ColumnarValue, TypeSignature}; use datafusion_expr::{ScalarUDFImpl, Signature, Volatility}; use std::any::Any; use std::sync::Arc; @@ -53,10 +52,10 @@ impl RegexpMatchFunc { // For example, given input `(Utf8View, Utf8)`, it first tries coercing to `(Utf8, Utf8)`. // If that fails, it proceeds to `(LargeUtf8, Utf8)`. // TODO: Native support Utf8View for regexp_match. - Exact(vec![Utf8, Utf8]), - Exact(vec![LargeUtf8, Utf8]), - Exact(vec![Utf8, Utf8, Utf8]), - Exact(vec![LargeUtf8, Utf8, Utf8]), + TypeSignature::Exact(vec![Utf8, Utf8]), + TypeSignature::Exact(vec![LargeUtf8, LargeUtf8]), + TypeSignature::Exact(vec![Utf8, Utf8, Utf8]), + TypeSignature::Exact(vec![LargeUtf8, LargeUtf8, LargeUtf8]), ], Volatility::Immutable, ), @@ -131,7 +130,7 @@ pub fn regexp_match(args: &[ArrayRef]) -> Result { let flags = as_generic_string_array::(&args[2])?; if flags.iter().any(|s| s == Some("g")) { - return plan_err!("regexp_match() does not support the \"global\" option") + return plan_err!("regexp_match() does not support the \"global\" option"); } regexp::regexp_match(values, regex, Some(flags)) diff --git a/datafusion/functions/src/regex/regexpreplace.rs b/datafusion/functions/src/regex/regexpreplace.rs index 5c885de202d0..27800af347ea 100644 --- a/datafusion/functions/src/regex/regexpreplace.rs +++ b/datafusion/functions/src/regex/regexpreplace.rs @@ -33,7 +33,7 @@ use datafusion_common::{ }; use datafusion_expr::function::Hint; use datafusion_expr::ColumnarValue; -use datafusion_expr::TypeSignature::*; +use datafusion_expr::TypeSignature; use datafusion_expr::{ScalarUDFImpl, Signature, Volatility}; use regex::Regex; use std::any::Any; @@ -56,10 +56,10 @@ impl RegexpReplaceFunc { Self { signature: Signature::one_of( vec![ - Exact(vec![Utf8, Utf8, Utf8]), - Exact(vec![Utf8View, Utf8, Utf8]), - Exact(vec![Utf8, Utf8, Utf8, Utf8]), - Exact(vec![Utf8View, Utf8, Utf8, Utf8]), + TypeSignature::Exact(vec![Utf8, Utf8, Utf8]), + TypeSignature::Exact(vec![Utf8View, Utf8, Utf8]), + TypeSignature::Exact(vec![Utf8, Utf8, Utf8, Utf8]), + TypeSignature::Exact(vec![Utf8View, Utf8, Utf8, Utf8]), ], Volatility::Immutable, ), diff --git a/datafusion/functions/src/regexp_common.rs b/datafusion/functions/src/regexp_common.rs deleted file mode 100644 index 748c1a294f97..000000000000 --- a/datafusion/functions/src/regexp_common.rs +++ /dev/null @@ -1,123 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -//! Common utilities for implementing regex functions - -use crate::string::common::StringArrayType; - -use arrow::array::{Array, ArrayDataBuilder, BooleanArray}; -use arrow::datatypes::DataType; -use arrow_buffer::{BooleanBufferBuilder, NullBuffer}; -use datafusion_common::DataFusionError; -use regex::Regex; - -use std::collections::HashMap; - -#[cfg(doc)] -use arrow::array::{LargeStringArray, StringArray, StringViewArray}; -/// Perform SQL `array ~ regex_array` operation on -/// [`StringArray`] / [`LargeStringArray`] / [`StringViewArray`]. -/// -/// If `regex_array` element has an empty value, the corresponding result value is always true. -/// -/// `flags_array` are optional [`StringArray`] / [`LargeStringArray`] / [`StringViewArray`] flag, -/// which allow special search modes, such as case-insensitive and multi-line mode. -/// See the documentation [here](https://docs.rs/regex/1.5.4/regex/#grouping-and-flags) -/// for more information. -/// -/// It is inspired / copied from `regexp_is_match_utf8` [arrow-rs]. -/// -/// Can remove when is implemented upstream -/// -/// [arrow-rs]: https://github.com/apache/arrow-rs/blob/8c956a9f9ab26c14072740cce64c2b99cb039b13/arrow-string/src/regexp.rs#L31-L37 -pub fn regexp_is_match_utf8<'a, S1, S2, S3>( - array: &'a S1, - regex_array: &'a S2, - flags_array: Option<&'a S3>, -) -> datafusion_common::Result -where - &'a S1: StringArrayType<'a>, - &'a S2: StringArrayType<'a>, - &'a S3: StringArrayType<'a>, -{ - if array.len() != regex_array.len() { - return Err(DataFusionError::Execution( - "Cannot perform comparison operation on arrays of different length" - .to_string(), - )); - } - - let nulls = NullBuffer::union(array.nulls(), regex_array.nulls()); - - let mut patterns: HashMap = HashMap::new(); - let mut result = BooleanBufferBuilder::new(array.len()); - - let complete_pattern = match flags_array { - Some(flags) => Box::new(regex_array.iter().zip(flags.iter()).map( - |(pattern, flags)| { - pattern.map(|pattern| match flags { - Some(flag) => format!("(?{flag}){pattern}"), - None => pattern.to_string(), - }) - }, - )) as Box>>, - None => Box::new( - regex_array - .iter() - .map(|pattern| pattern.map(|pattern| pattern.to_string())), - ), - }; - - array - .iter() - .zip(complete_pattern) - .map(|(value, pattern)| { - match (value, pattern) { - (Some(_), Some(pattern)) if pattern == *"" => { - result.append(true); - } - (Some(value), Some(pattern)) => { - let existing_pattern = patterns.get(&pattern); - let re = match existing_pattern { - Some(re) => re, - None => { - let re = Regex::new(pattern.as_str()).map_err(|e| { - DataFusionError::Execution(format!( - "Regular expression did not compile: {e:?}" - )) - })?; - patterns.entry(pattern).or_insert(re) - } - }; - result.append(re.is_match(value)); - } - _ => result.append(false), - } - Ok(()) - }) - .collect::, DataFusionError>>()?; - - let data = unsafe { - ArrayDataBuilder::new(DataType::Boolean) - .len(array.len()) - .buffers(vec![result.into()]) - .nulls(nulls) - .build_unchecked() - }; - - Ok(BooleanArray::from(data)) -} diff --git a/datafusion/functions/src/string/ascii.rs b/datafusion/functions/src/string/ascii.rs index 526803eb36e4..610366b6e6d9 100644 --- a/datafusion/functions/src/string/ascii.rs +++ b/datafusion/functions/src/string/ascii.rs @@ -20,10 +20,11 @@ use arrow::array::{ArrayAccessor, ArrayIter, ArrayRef, AsArray, Int32Array}; use arrow::datatypes::DataType; use arrow::error::ArrowError; use datafusion_common::{internal_err, Result}; -use datafusion_expr::ColumnarValue; +use datafusion_expr::scalar_doc_sections::DOC_SECTION_STRING; +use datafusion_expr::{ColumnarValue, Documentation}; use datafusion_expr::{ScalarUDFImpl, Signature, Volatility}; use std::any::Any; -use std::sync::Arc; +use std::sync::{Arc, OnceLock}; #[derive(Debug)] pub struct AsciiFunc { @@ -38,13 +39,8 @@ impl Default for AsciiFunc { impl AsciiFunc { pub fn new() -> Self { - use DataType::*; Self { - signature: Signature::uniform( - 1, - vec![Utf8, LargeUtf8, Utf8View], - Volatility::Immutable, - ), + signature: Signature::string(1, Volatility::Immutable), } } } @@ -71,6 +67,43 @@ impl ScalarUDFImpl for AsciiFunc { fn invoke(&self, args: &[ColumnarValue]) -> Result { make_scalar_function(ascii, vec![])(args) } + + fn documentation(&self) -> Option<&Documentation> { + Some(get_ascii_doc()) + } +} + +static DOCUMENTATION: OnceLock = OnceLock::new(); + +fn get_ascii_doc() -> &'static Documentation { + DOCUMENTATION.get_or_init(|| { + Documentation::builder() + .with_doc_section(DOC_SECTION_STRING) + .with_description( + "Returns the Unicode character code of the first character in a string.", + ) + .with_syntax_example("ascii(str)") + .with_sql_example( + r#"```sql +> select ascii('abc'); ++--------------------+ +| ascii(Utf8("abc")) | ++--------------------+ +| 97 | ++--------------------+ +> select ascii('🚀'); ++-------------------+ +| ascii(Utf8("🚀")) | ++-------------------+ +| 128640 | ++-------------------+ +```"#, + ) + .with_standard_argument("str", "String") + .with_related_udf("chr") + .build() + .unwrap() + }) } fn calculate_ascii<'a, V>(array: V) -> Result diff --git a/datafusion/functions/src/string/bit_length.rs b/datafusion/functions/src/string/bit_length.rs index c37851443c37..93d4fa25ae81 100644 --- a/datafusion/functions/src/string/bit_length.rs +++ b/datafusion/functions/src/string/bit_length.rs @@ -15,17 +15,17 @@ // specific language governing permissions and limitations // under the License. -use std::any::Any; - use arrow::compute::kernels::length::bit_length; use arrow::datatypes::DataType; +use std::any::Any; +use std::sync::OnceLock; +use crate::utils::utf8_to_int_type; use datafusion_common::{exec_err, Result, ScalarValue}; -use datafusion_expr::{ColumnarValue, Volatility}; +use datafusion_expr::scalar_doc_sections::DOC_SECTION_STRING; +use datafusion_expr::{ColumnarValue, Documentation, Volatility}; use datafusion_expr::{ScalarUDFImpl, Signature}; -use crate::utils::utf8_to_int_type; - #[derive(Debug)] pub struct BitLengthFunc { signature: Signature, @@ -39,13 +39,8 @@ impl Default for BitLengthFunc { impl BitLengthFunc { pub fn new() -> Self { - use DataType::*; Self { - signature: Signature::uniform( - 1, - vec![Utf8, LargeUtf8], - Volatility::Immutable, - ), + signature: Signature::string(1, Volatility::Immutable), } } } @@ -88,4 +83,34 @@ impl ScalarUDFImpl for BitLengthFunc { }, } } + + fn documentation(&self) -> Option<&Documentation> { + Some(get_bit_length_doc()) + } +} + +static DOCUMENTATION: OnceLock = OnceLock::new(); + +fn get_bit_length_doc() -> &'static Documentation { + DOCUMENTATION.get_or_init(|| { + Documentation::builder() + .with_doc_section(DOC_SECTION_STRING) + .with_description("Returns the bit length of a string.") + .with_syntax_example("bit_length(str)") + .with_sql_example( + r#"```sql +> select bit_length('datafusion'); ++--------------------------------+ +| bit_length(Utf8("datafusion")) | ++--------------------------------+ +| 80 | ++--------------------------------+ +```"#, + ) + .with_standard_argument("str", "String") + .with_related_udf("length") + .with_related_udf("octet_length") + .build() + .unwrap() + }) } diff --git a/datafusion/functions/src/string/btrim.rs b/datafusion/functions/src/string/btrim.rs index 0e992ff27fd3..82b7599f0735 100644 --- a/datafusion/functions/src/string/btrim.rs +++ b/datafusion/functions/src/string/btrim.rs @@ -15,18 +15,18 @@ // specific language governing permissions and limitations // under the License. +use crate::string::common::*; +use crate::utils::{make_scalar_function, utf8_to_str_type}; use arrow::array::{ArrayRef, OffsetSizeTrait}; use arrow::datatypes::DataType; -use std::any::Any; - use datafusion_common::{exec_err, Result}; use datafusion_expr::function::Hint; -use datafusion_expr::TypeSignature::*; -use datafusion_expr::{ColumnarValue, Volatility}; -use datafusion_expr::{ScalarUDFImpl, Signature}; - -use crate::string::common::*; -use crate::utils::{make_scalar_function, utf8_to_str_type}; +use datafusion_expr::scalar_doc_sections::DOC_SECTION_STRING; +use datafusion_expr::{ + ColumnarValue, Documentation, ScalarUDFImpl, Signature, TypeSignature, Volatility, +}; +use std::any::Any; +use std::sync::OnceLock; /// Returns the longest string with leading and trailing characters removed. If the characters are not specified, whitespace is removed. /// btrim('xyxtrimyyx', 'xyz') = 'trim' @@ -49,18 +49,9 @@ impl Default for BTrimFunc { impl BTrimFunc { pub fn new() -> Self { - use DataType::*; Self { signature: Signature::one_of( - vec![ - // Planner attempts coercion to the target type starting with the most preferred candidate. - // For example, given input `(Utf8View, Utf8)`, it first tries coercing to `(Utf8View, Utf8View)`. - // If that fails, it proceeds to `(Utf8, Utf8)`. - Exact(vec![Utf8View, Utf8View]), - Exact(vec![Utf8, Utf8]), - Exact(vec![Utf8View]), - Exact(vec![Utf8]), - ], + vec![TypeSignature::String(2), TypeSignature::String(1)], Volatility::Immutable, ), aliases: vec![String::from("trim")], @@ -109,6 +100,35 @@ impl ScalarUDFImpl for BTrimFunc { fn aliases(&self) -> &[String] { &self.aliases } + + fn documentation(&self) -> Option<&Documentation> { + Some(get_btrim_doc()) + } +} + +static DOCUMENTATION: OnceLock = OnceLock::new(); + +fn get_btrim_doc() -> &'static Documentation { + DOCUMENTATION.get_or_init(|| { + Documentation::builder() + .with_doc_section(DOC_SECTION_STRING) + .with_description("Trims the specified trim string from the start and end of a string. If no trim string is provided, all whitespace is removed from the start and end of the input string.") + .with_syntax_example("btrim(str[, trim_str])") + .with_sql_example(r#"```sql +> select btrim('__datafusion____', '_'); ++-------------------------------------------+ +| btrim(Utf8("__datafusion____"),Utf8("_")) | ++-------------------------------------------+ +| datafusion | ++-------------------------------------------+ +```"#) + .with_standard_argument("str", "String") + .with_argument("trim_str", "String expression to operate on. Can be a constant, column, or function, and any combination of operators. _Default is whitespace characters._") + .with_related_udf("ltrim") + .with_related_udf("rtrim") + .build() + .unwrap() + }) } #[cfg(test)] diff --git a/datafusion/functions/src/string/chr.rs b/datafusion/functions/src/string/chr.rs index 4da7dc01594d..ae0900af37d3 100644 --- a/datafusion/functions/src/string/chr.rs +++ b/datafusion/functions/src/string/chr.rs @@ -16,7 +16,7 @@ // under the License. use std::any::Any; -use std::sync::Arc; +use std::sync::{Arc, OnceLock}; use arrow::array::ArrayRef; use arrow::array::StringArray; @@ -24,13 +24,13 @@ use arrow::datatypes::DataType; use arrow::datatypes::DataType::Int64; use arrow::datatypes::DataType::Utf8; +use crate::utils::make_scalar_function; use datafusion_common::cast::as_int64_array; use datafusion_common::{exec_err, Result}; -use datafusion_expr::{ColumnarValue, Volatility}; +use datafusion_expr::scalar_doc_sections::DOC_SECTION_STRING; +use datafusion_expr::{ColumnarValue, Documentation, Volatility}; use datafusion_expr::{ScalarUDFImpl, Signature}; -use crate::utils::make_scalar_function; - /// Returns the character with the given code. chr(0) is disallowed because text data types cannot store that character. /// chr(65) = 'A' pub fn chr(args: &[ArrayRef]) -> Result { @@ -99,4 +99,35 @@ impl ScalarUDFImpl for ChrFunc { fn invoke(&self, args: &[ColumnarValue]) -> Result { make_scalar_function(chr, vec![])(args) } + + fn documentation(&self) -> Option<&Documentation> { + Some(get_chr_doc()) + } +} + +static DOCUMENTATION: OnceLock = OnceLock::new(); + +fn get_chr_doc() -> &'static Documentation { + DOCUMENTATION.get_or_init(|| { + Documentation::builder() + .with_doc_section(DOC_SECTION_STRING) + .with_description( + "Returns the character with the specified ASCII or Unicode code value.", + ) + .with_syntax_example("chr(expression)") + .with_sql_example( + r#"```sql +> select chr(128640); ++--------------------+ +| chr(Int64(128640)) | ++--------------------+ +| 🚀 | ++--------------------+ +```"#, + ) + .with_standard_argument("expression", "String") + .with_related_udf("ascii") + .build() + .unwrap() + }) } diff --git a/datafusion/functions/src/string/concat.rs b/datafusion/functions/src/string/concat.rs index 8a9fae7e9f79..e854ff375503 100644 --- a/datafusion/functions/src/string/concat.rs +++ b/datafusion/functions/src/string/concat.rs @@ -18,18 +18,18 @@ use arrow::array::{as_largestring_array, Array}; use arrow::datatypes::DataType; use std::any::Any; -use std::sync::Arc; +use std::sync::{Arc, OnceLock}; +use crate::string::common::*; +use crate::string::concat; use datafusion_common::cast::{as_string_array, as_string_view_array}; use datafusion_common::{internal_err, plan_err, Result, ScalarValue}; use datafusion_expr::expr::ScalarFunction; +use datafusion_expr::scalar_doc_sections::DOC_SECTION_STRING; use datafusion_expr::simplify::{ExprSimplifyResult, SimplifyInfo}; -use datafusion_expr::{lit, ColumnarValue, Expr, Volatility}; +use datafusion_expr::{lit, ColumnarValue, Documentation, Expr, Volatility}; use datafusion_expr::{ScalarUDFImpl, Signature}; -use crate::string::common::*; -use crate::string::concat; - #[derive(Debug)] pub struct ConcatFunc { signature: Signature, @@ -248,6 +248,36 @@ impl ScalarUDFImpl for ConcatFunc { ) -> Result { simplify_concat(args) } + + fn documentation(&self) -> Option<&Documentation> { + Some(get_concat_doc()) + } +} + +static DOCUMENTATION: OnceLock = OnceLock::new(); + +fn get_concat_doc() -> &'static Documentation { + DOCUMENTATION.get_or_init(|| { + Documentation::builder() + .with_doc_section(DOC_SECTION_STRING) + .with_description("Concatenates multiple strings together.") + .with_syntax_example("concat(str[, ..., str_n])") + .with_sql_example( + r#"```sql +> select concat('data', 'f', 'us', 'ion'); ++-------------------------------------------------------+ +| concat(Utf8("data"),Utf8("f"),Utf8("us"),Utf8("ion")) | ++-------------------------------------------------------+ +| datafusion | ++-------------------------------------------------------+ +```"#, + ) + .with_standard_argument("str", "String") + .with_argument("str_n", "Subsequent string expressions to concatenate.") + .with_related_udf("concat_ws") + .build() + .unwrap() + }) } pub fn simplify_concat(args: Vec) -> Result { diff --git a/datafusion/functions/src/string/concat_ws.rs b/datafusion/functions/src/string/concat_ws.rs index 6503a54e65c1..761dddd1047b 100644 --- a/datafusion/functions/src/string/concat_ws.rs +++ b/datafusion/functions/src/string/concat_ws.rs @@ -17,7 +17,7 @@ use arrow::array::{as_largestring_array, Array, StringArray}; use std::any::Any; -use std::sync::Arc; +use std::sync::{Arc, OnceLock}; use arrow::datatypes::DataType; @@ -27,8 +27,9 @@ use crate::string::concat_ws; use datafusion_common::cast::{as_string_array, as_string_view_array}; use datafusion_common::{exec_err, internal_err, plan_err, Result, ScalarValue}; use datafusion_expr::expr::ScalarFunction; +use datafusion_expr::scalar_doc_sections::DOC_SECTION_STRING; use datafusion_expr::simplify::{ExprSimplifyResult, SimplifyInfo}; -use datafusion_expr::{lit, ColumnarValue, Expr, Volatility}; +use datafusion_expr::{lit, ColumnarValue, Documentation, Expr, Volatility}; use datafusion_expr::{ScalarUDFImpl, Signature}; #[derive(Debug)] @@ -279,6 +280,45 @@ impl ScalarUDFImpl for ConcatWsFunc { _ => Ok(ExprSimplifyResult::Original(args)), } } + + fn documentation(&self) -> Option<&Documentation> { + Some(get_concat_ws_doc()) + } +} + +static DOCUMENTATION: OnceLock = OnceLock::new(); + +fn get_concat_ws_doc() -> &'static Documentation { + DOCUMENTATION.get_or_init(|| { + Documentation::builder() + .with_doc_section(DOC_SECTION_STRING) + .with_description( + "Concatenates multiple strings together with a specified separator.", + ) + .with_syntax_example("concat_ws(separator, str[, ..., str_n])") + .with_sql_example( + r#"```sql +> select concat_ws('_', 'data', 'fusion'); ++--------------------------------------------------+ +| concat_ws(Utf8("_"),Utf8("data"),Utf8("fusion")) | ++--------------------------------------------------+ +| data_fusion | ++--------------------------------------------------+ +```"#, + ) + .with_argument( + "separator", + "Separator to insert between concatenated strings.", + ) + .with_standard_argument("str", "String") + .with_standard_argument( + "str_n", + "Subsequent string expressions to concatenate.", + ) + .with_related_udf("concat") + .build() + .unwrap() + }) } fn simplify_concat_ws(delimiter: &Expr, args: &[Expr]) -> Result { diff --git a/datafusion/functions/src/string/contains.rs b/datafusion/functions/src/string/contains.rs index 4133552160d4..0f75731aa1c3 100644 --- a/datafusion/functions/src/string/contains.rs +++ b/datafusion/functions/src/string/contains.rs @@ -15,21 +15,20 @@ // specific language governing permissions and limitations // under the License. -use crate::regexp_common::regexp_is_match_utf8; use crate::utils::make_scalar_function; - use arrow::array::{Array, ArrayRef, AsArray, GenericStringArray, StringViewArray}; +use arrow::compute::regexp_is_match; use arrow::datatypes::DataType; use arrow::datatypes::DataType::{Boolean, LargeUtf8, Utf8, Utf8View}; use datafusion_common::exec_err; use datafusion_common::DataFusionError; use datafusion_common::Result; -use datafusion_expr::ScalarUDFImpl; -use datafusion_expr::TypeSignature::Exact; -use datafusion_expr::{ColumnarValue, Signature, Volatility}; - +use datafusion_expr::scalar_doc_sections::DOC_SECTION_STRING; +use datafusion_expr::{ + ColumnarValue, Documentation, ScalarUDFImpl, Signature, Volatility, +}; use std::any::Any; -use std::sync::Arc; +use std::sync::{Arc, OnceLock}; #[derive(Debug)] pub struct ContainsFunc { @@ -44,22 +43,8 @@ impl Default for ContainsFunc { impl ContainsFunc { pub fn new() -> Self { - use DataType::*; Self { - signature: Signature::one_of( - vec![ - Exact(vec![Utf8View, Utf8View]), - Exact(vec![Utf8View, Utf8]), - Exact(vec![Utf8View, LargeUtf8]), - Exact(vec![Utf8, Utf8View]), - Exact(vec![Utf8, Utf8]), - Exact(vec![Utf8, LargeUtf8]), - Exact(vec![LargeUtf8, Utf8View]), - Exact(vec![LargeUtf8, Utf8]), - Exact(vec![LargeUtf8, LargeUtf8]), - ], - Volatility::Immutable, - ), + signature: Signature::string(2, Volatility::Immutable), } } } @@ -84,6 +69,37 @@ impl ScalarUDFImpl for ContainsFunc { fn invoke(&self, args: &[ColumnarValue]) -> Result { make_scalar_function(contains, vec![])(args) } + + fn documentation(&self) -> Option<&Documentation> { + Some(get_contains_doc()) + } +} + +static DOCUMENTATION: OnceLock = OnceLock::new(); + +fn get_contains_doc() -> &'static Documentation { + DOCUMENTATION.get_or_init(|| { + Documentation::builder() + .with_doc_section(DOC_SECTION_STRING) + .with_description( + "Return true if search_str is found within string (case-sensitive).", + ) + .with_syntax_example("contains(str, search_str)") + .with_sql_example( + r#"```sql +> select contains('the quick brown fox', 'row'); ++---------------------------------------------------+ +| contains(Utf8("the quick brown fox"),Utf8("row")) | ++---------------------------------------------------+ +| true | ++---------------------------------------------------+ +```"#, + ) + .with_standard_argument("str", "String") + .with_argument("search_str", "The string to search for in str.") + .build() + .unwrap() + }) } /// use regexp_is_match_utf8_scalar to do the calculation for contains @@ -92,41 +108,8 @@ pub fn contains(args: &[ArrayRef]) -> Result { (Utf8View, Utf8View) => { let mod_str = args[0].as_string_view(); let match_str = args[1].as_string_view(); - let res = regexp_is_match_utf8::< - StringViewArray, + let res = regexp_is_match::< StringViewArray, - GenericStringArray, - >(mod_str, match_str, None)?; - - Ok(Arc::new(res) as ArrayRef) - } - (Utf8View, Utf8) => { - let mod_str = args[0].as_string_view(); - let match_str = args[1].as_string::(); - let res = regexp_is_match_utf8::< - StringViewArray, - GenericStringArray, - GenericStringArray, - >(mod_str, match_str, None)?; - - Ok(Arc::new(res) as ArrayRef) - } - (Utf8View, LargeUtf8) => { - let mod_str = args[0].as_string_view(); - let match_str = args[1].as_string::(); - let res = regexp_is_match_utf8::< - StringViewArray, - GenericStringArray, - GenericStringArray, - >(mod_str, match_str, None)?; - - Ok(Arc::new(res) as ArrayRef) - } - (Utf8, Utf8View) => { - let mod_str = args[0].as_string::(); - let match_str = args[1].as_string_view(); - let res = regexp_is_match_utf8::< - GenericStringArray, StringViewArray, GenericStringArray, >(mod_str, match_str, None)?; @@ -136,7 +119,7 @@ pub fn contains(args: &[ArrayRef]) -> Result { (Utf8, Utf8) => { let mod_str = args[0].as_string::(); let match_str = args[1].as_string::(); - let res = regexp_is_match_utf8::< + let res = regexp_is_match::< GenericStringArray, GenericStringArray, GenericStringArray, @@ -144,43 +127,10 @@ pub fn contains(args: &[ArrayRef]) -> Result { Ok(Arc::new(res) as ArrayRef) } - (Utf8, LargeUtf8) => { - let mod_str = args[0].as_string::(); - let match_str = args[1].as_string::(); - let res = regexp_is_match_utf8::< - GenericStringArray, - GenericStringArray, - GenericStringArray, - >(mod_str, match_str, None)?; - - Ok(Arc::new(res) as ArrayRef) - } - (LargeUtf8, Utf8View) => { - let mod_str = args[0].as_string::(); - let match_str = args[1].as_string_view(); - let res = regexp_is_match_utf8::< - GenericStringArray, - StringViewArray, - GenericStringArray, - >(mod_str, match_str, None)?; - - Ok(Arc::new(res) as ArrayRef) - } - (LargeUtf8, Utf8) => { - let mod_str = args[0].as_string::(); - let match_str = args[1].as_string::(); - let res = regexp_is_match_utf8::< - GenericStringArray, - GenericStringArray, - GenericStringArray, - >(mod_str, match_str, None)?; - - Ok(Arc::new(res) as ArrayRef) - } (LargeUtf8, LargeUtf8) => { let mod_str = args[0].as_string::(); let match_str = args[1].as_string::(); - let res = regexp_is_match_utf8::< + let res = regexp_is_match::< GenericStringArray, GenericStringArray, GenericStringArray, @@ -193,89 +143,3 @@ pub fn contains(args: &[ArrayRef]) -> Result { } } } - -#[cfg(test)] -mod tests { - use crate::string::contains::ContainsFunc; - use crate::utils::test::test_function; - use arrow::array::Array; - use arrow::{array::BooleanArray, datatypes::DataType::Boolean}; - use datafusion_common::Result; - use datafusion_common::ScalarValue; - use datafusion_expr::ColumnarValue; - use datafusion_expr::ScalarUDFImpl; - #[test] - fn test_functions() -> Result<()> { - test_function!( - ContainsFunc::new(), - &[ - ColumnarValue::from(ScalarValue::from("alphabet")), - ColumnarValue::from(ScalarValue::from("alph")), - ], - Ok(Some(true)), - bool, - Boolean, - BooleanArray - ); - test_function!( - ContainsFunc::new(), - &[ - ColumnarValue::from(ScalarValue::from("alphabet")), - ColumnarValue::from(ScalarValue::from("dddddd")), - ], - Ok(Some(false)), - bool, - Boolean, - BooleanArray - ); - test_function!( - ContainsFunc::new(), - &[ - ColumnarValue::from(ScalarValue::from("alphabet")), - ColumnarValue::from(ScalarValue::from("pha")), - ], - Ok(Some(true)), - bool, - Boolean, - BooleanArray - ); - - test_function!( - ContainsFunc::new(), - &[ - ColumnarValue::from(ScalarValue::Utf8View(Some(String::from("Apache")))), - ColumnarValue::from(ScalarValue::Utf8View(Some(String::from("pac")))), - ], - Ok(Some(true)), - bool, - Boolean, - BooleanArray - ); - test_function!( - ContainsFunc::new(), - &[ - ColumnarValue::from(ScalarValue::Utf8View(Some(String::from("Apache")))), - ColumnarValue::from(ScalarValue::Utf8(Some(String::from("ap")))), - ], - Ok(Some(false)), - bool, - Boolean, - BooleanArray - ); - test_function!( - ContainsFunc::new(), - &[ - ColumnarValue::from(ScalarValue::Utf8View(Some(String::from("Apache")))), - ColumnarValue::from(ScalarValue::LargeUtf8(Some(String::from( - "DataFusion" - )))), - ], - Ok(Some(false)), - bool, - Boolean, - BooleanArray - ); - - Ok(()) - } -} diff --git a/datafusion/functions/src/string/ends_with.rs b/datafusion/functions/src/string/ends_with.rs index 82fde772c282..42d673cf7b39 100644 --- a/datafusion/functions/src/string/ends_with.rs +++ b/datafusion/functions/src/string/ends_with.rs @@ -16,18 +16,17 @@ // under the License. use std::any::Any; -use std::sync::Arc; +use std::sync::{Arc, OnceLock}; use arrow::array::ArrayRef; use arrow::datatypes::DataType; +use crate::utils::make_scalar_function; use datafusion_common::{internal_err, Result}; -use datafusion_expr::TypeSignature::*; -use datafusion_expr::{ColumnarValue, Volatility}; +use datafusion_expr::scalar_doc_sections::DOC_SECTION_STRING; +use datafusion_expr::{ColumnarValue, Documentation, Volatility}; use datafusion_expr::{ScalarUDFImpl, Signature}; -use crate::utils::make_scalar_function; - #[derive(Debug)] pub struct EndsWithFunc { signature: Signature, @@ -42,17 +41,7 @@ impl Default for EndsWithFunc { impl EndsWithFunc { pub fn new() -> Self { Self { - signature: Signature::one_of( - vec![ - // Planner attempts coercion to the target type starting with the most preferred candidate. - // For example, given input `(Utf8View, Utf8)`, it first tries coercing to `(Utf8View, Utf8View)`. - // If that fails, it proceeds to `(Utf8, Utf8)`. - Exact(vec![DataType::Utf8View, DataType::Utf8View]), - Exact(vec![DataType::Utf8, DataType::Utf8]), - Exact(vec![DataType::LargeUtf8, DataType::LargeUtf8]), - ], - Volatility::Immutable, - ), + signature: Signature::string(2, Volatility::Immutable), } } } @@ -84,6 +73,41 @@ impl ScalarUDFImpl for EndsWithFunc { } } } + + fn documentation(&self) -> Option<&Documentation> { + Some(get_ends_with_doc()) + } +} + +static DOCUMENTATION: OnceLock = OnceLock::new(); + +fn get_ends_with_doc() -> &'static Documentation { + DOCUMENTATION.get_or_init(|| { + Documentation::builder() + .with_doc_section(DOC_SECTION_STRING) + .with_description("Tests if a string ends with a substring.") + .with_syntax_example("ends_with(str, substr)") + .with_sql_example( + r#"```sql +> select ends_with('datafusion', 'soin'); ++--------------------------------------------+ +| ends_with(Utf8("datafusion"),Utf8("soin")) | ++--------------------------------------------+ +| false | ++--------------------------------------------+ +> select ends_with('datafusion', 'sion'); ++--------------------------------------------+ +| ends_with(Utf8("datafusion"),Utf8("sion")) | ++--------------------------------------------+ +| true | ++--------------------------------------------+ +```"#, + ) + .with_standard_argument("str", "String") + .with_argument("substr", "Substring to test for.") + .build() + .unwrap() + }) } /// Returns true if string ends with suffix. diff --git a/datafusion/functions/src/string/initcap.rs b/datafusion/functions/src/string/initcap.rs index 081fa94af8d9..cfac6f423f4a 100644 --- a/datafusion/functions/src/string/initcap.rs +++ b/datafusion/functions/src/string/initcap.rs @@ -16,18 +16,18 @@ // under the License. use std::any::Any; -use std::sync::Arc; +use std::sync::{Arc, OnceLock}; use arrow::array::{ArrayRef, GenericStringArray, OffsetSizeTrait, StringArray}; use arrow::datatypes::DataType; +use crate::utils::{make_scalar_function, utf8_to_str_type}; use datafusion_common::cast::{as_generic_string_array, as_string_view_array}; use datafusion_common::{exec_err, Result}; -use datafusion_expr::{ColumnarValue, Volatility}; +use datafusion_expr::scalar_doc_sections::DOC_SECTION_STRING; +use datafusion_expr::{ColumnarValue, Documentation, Volatility}; use datafusion_expr::{ScalarUDFImpl, Signature}; -use crate::utils::{make_scalar_function, utf8_to_str_type}; - #[derive(Debug)] pub struct InitcapFunc { signature: Signature, @@ -41,13 +41,8 @@ impl Default for InitcapFunc { impl InitcapFunc { pub fn new() -> Self { - use DataType::*; Self { - signature: Signature::uniform( - 1, - vec![Utf8, LargeUtf8, Utf8View], - Volatility::Immutable, - ), + signature: Signature::string(1, Volatility::Immutable), } } } @@ -79,6 +74,34 @@ impl ScalarUDFImpl for InitcapFunc { } } } + + fn documentation(&self) -> Option<&Documentation> { + Some(get_initcap_doc()) + } +} + +static DOCUMENTATION: OnceLock = OnceLock::new(); + +fn get_initcap_doc() -> &'static Documentation { + DOCUMENTATION.get_or_init(|| { + Documentation::builder() + .with_doc_section(DOC_SECTION_STRING) + .with_description("Capitalizes the first character in each word in the input string. Words are delimited by non-alphanumeric characters.") + .with_syntax_example("initcap(str)") + .with_sql_example(r#"```sql +> select initcap('apache datafusion'); ++------------------------------------+ +| initcap(Utf8("apache datafusion")) | ++------------------------------------+ +| Apache Datafusion | ++------------------------------------+ +```"#) + .with_standard_argument("str", "String") + .with_related_udf("lower") + .with_related_udf("upper") + .build() + .unwrap() + }) } /// Converts the first letter of each word to upper case and the rest to lower case. Words are sequences of alphanumeric characters separated by non-alphanumeric characters. diff --git a/datafusion/functions/src/string/levenshtein.rs b/datafusion/functions/src/string/levenshtein.rs index 430c402a50c5..558e71239f84 100644 --- a/datafusion/functions/src/string/levenshtein.rs +++ b/datafusion/functions/src/string/levenshtein.rs @@ -16,7 +16,7 @@ // under the License. use std::any::Any; -use std::sync::Arc; +use std::sync::{Arc, OnceLock}; use arrow::array::{ArrayRef, Int32Array, Int64Array, OffsetSizeTrait}; use arrow::datatypes::DataType; @@ -25,8 +25,8 @@ use crate::utils::{make_scalar_function, utf8_to_int_type}; use datafusion_common::cast::{as_generic_string_array, as_string_view_array}; use datafusion_common::utils::datafusion_strsim; use datafusion_common::{exec_err, Result}; -use datafusion_expr::ColumnarValue; -use datafusion_expr::TypeSignature::*; +use datafusion_expr::scalar_doc_sections::DOC_SECTION_STRING; +use datafusion_expr::{ColumnarValue, Documentation}; use datafusion_expr::{ScalarUDFImpl, Signature, Volatility}; #[derive(Debug)] @@ -43,14 +43,7 @@ impl Default for LevenshteinFunc { impl LevenshteinFunc { pub fn new() -> Self { Self { - signature: Signature::one_of( - vec![ - Exact(vec![DataType::Utf8View, DataType::Utf8View]), - Exact(vec![DataType::Utf8, DataType::Utf8]), - Exact(vec![DataType::LargeUtf8, DataType::LargeUtf8]), - ], - Volatility::Immutable, - ), + signature: Signature::string(2, Volatility::Immutable), } } } @@ -83,6 +76,33 @@ impl ScalarUDFImpl for LevenshteinFunc { } } } + + fn documentation(&self) -> Option<&Documentation> { + Some(get_levenshtein_doc()) + } +} + +static DOCUMENTATION: OnceLock = OnceLock::new(); + +fn get_levenshtein_doc() -> &'static Documentation { + DOCUMENTATION.get_or_init(|| { + Documentation::builder() + .with_doc_section(DOC_SECTION_STRING) + .with_description("Returns the [`Levenshtein distance`](https://en.wikipedia.org/wiki/Levenshtein_distance) between the two given strings.") + .with_syntax_example("levenshtein(str1, str2)") + .with_sql_example(r#"```sql +> select levenshtein('kitten', 'sitting'); ++---------------------------------------------+ +| levenshtein(Utf8("kitten"),Utf8("sitting")) | ++---------------------------------------------+ +| 3 | ++---------------------------------------------+ +```"#) + .with_argument("str1", "String expression to compute Levenshtein distance with str2.") + .with_argument("str2", "String expression to compute Levenshtein distance with str1.") + .build() + .unwrap() + }) } ///Returns the Levenshtein distance between the two given strings. diff --git a/datafusion/functions/src/string/lower.rs b/datafusion/functions/src/string/lower.rs index ca324e69c0d2..f82b11ca9051 100644 --- a/datafusion/functions/src/string/lower.rs +++ b/datafusion/functions/src/string/lower.rs @@ -15,16 +15,16 @@ // specific language governing permissions and limitations // under the License. -use std::any::Any; - use arrow::datatypes::DataType; - -use datafusion_common::Result; -use datafusion_expr::ColumnarValue; -use datafusion_expr::{ScalarUDFImpl, Signature, Volatility}; +use std::any::Any; +use std::sync::OnceLock; use crate::string::common::to_lower; use crate::utils::utf8_to_str_type; +use datafusion_common::Result; +use datafusion_expr::scalar_doc_sections::DOC_SECTION_STRING; +use datafusion_expr::{ColumnarValue, Documentation}; +use datafusion_expr::{ScalarUDFImpl, Signature, Volatility}; #[derive(Debug)] pub struct LowerFunc { @@ -39,13 +39,8 @@ impl Default for LowerFunc { impl LowerFunc { pub fn new() -> Self { - use DataType::*; Self { - signature: Signature::uniform( - 1, - vec![Utf8, LargeUtf8, Utf8View], - Volatility::Immutable, - ), + signature: Signature::string(1, Volatility::Immutable), } } } @@ -70,8 +65,37 @@ impl ScalarUDFImpl for LowerFunc { fn invoke(&self, args: &[ColumnarValue]) -> Result { to_lower(args, "lower") } + + fn documentation(&self) -> Option<&Documentation> { + Some(get_lower_doc()) + } } +static DOCUMENTATION: OnceLock = OnceLock::new(); + +fn get_lower_doc() -> &'static Documentation { + DOCUMENTATION.get_or_init(|| { + Documentation::builder() + .with_doc_section(DOC_SECTION_STRING) + .with_description("Converts a string to lower-case.") + .with_syntax_example("lower(str)") + .with_sql_example( + r#"```sql +> select lower('Ångström'); ++-------------------------+ +| lower(Utf8("Ångström")) | ++-------------------------+ +| ångström | ++-------------------------+ +```"#, + ) + .with_standard_argument("str", "String") + .with_related_udf("initcap") + .with_related_udf("upper") + .build() + .unwrap() + }) +} #[cfg(test)] mod tests { use super::*; diff --git a/datafusion/functions/src/string/ltrim.rs b/datafusion/functions/src/string/ltrim.rs index 0ddb5a205bac..b64dcda7218e 100644 --- a/datafusion/functions/src/string/ltrim.rs +++ b/datafusion/functions/src/string/ltrim.rs @@ -15,20 +15,19 @@ // specific language governing permissions and limitations // under the License. -use std::any::Any; - use arrow::array::{ArrayRef, OffsetSizeTrait}; use arrow::datatypes::DataType; +use std::any::Any; +use std::sync::OnceLock; +use crate::string::common::*; +use crate::utils::{make_scalar_function, utf8_to_str_type}; use datafusion_common::{exec_err, Result}; use datafusion_expr::function::Hint; -use datafusion_expr::TypeSignature::*; -use datafusion_expr::{ColumnarValue, Volatility}; +use datafusion_expr::scalar_doc_sections::DOC_SECTION_STRING; +use datafusion_expr::{ColumnarValue, Documentation, TypeSignature, Volatility}; use datafusion_expr::{ScalarUDFImpl, Signature}; -use crate::string::common::*; -use crate::utils::{make_scalar_function, utf8_to_str_type}; - /// Returns the longest string with leading characters removed. If the characters are not specified, whitespace is removed. /// ltrim('zzzytest', 'xyz') = 'test' fn ltrim(args: &[ArrayRef]) -> Result { @@ -49,18 +48,9 @@ impl Default for LtrimFunc { impl LtrimFunc { pub fn new() -> Self { - use DataType::*; Self { signature: Signature::one_of( - vec![ - // Planner attempts coercion to the target type starting with the most preferred candidate. - // For example, given input `(Utf8View, Utf8)`, it first tries coercing to `(Utf8View, Utf8View)`. - // If that fails, it proceeds to `(Utf8, Utf8)`. - Exact(vec![Utf8View, Utf8View]), - Exact(vec![Utf8, Utf8]), - Exact(vec![Utf8View]), - Exact(vec![Utf8]), - ], + vec![TypeSignature::String(2), TypeSignature::String(1)], Volatility::Immutable, ), } @@ -104,6 +94,41 @@ impl ScalarUDFImpl for LtrimFunc { ), } } + + fn documentation(&self) -> Option<&Documentation> { + Some(get_ltrim_doc()) + } +} + +static DOCUMENTATION: OnceLock = OnceLock::new(); + +fn get_ltrim_doc() -> &'static Documentation { + DOCUMENTATION.get_or_init(|| { + Documentation::builder() + .with_doc_section(DOC_SECTION_STRING) + .with_description("Trims the specified trim string from the beginning of a string. If no trim string is provided, all whitespace is removed from the start of the input string.") + .with_syntax_example("ltrim(str[, trim_str])") + .with_sql_example(r#"```sql +> select ltrim(' datafusion '); ++-------------------------------+ +| ltrim(Utf8(" datafusion ")) | ++-------------------------------+ +| datafusion | ++-------------------------------+ +> select ltrim('___datafusion___', '_'); ++-------------------------------------------+ +| ltrim(Utf8("___datafusion___"),Utf8("_")) | ++-------------------------------------------+ +| datafusion___ | ++-------------------------------------------+ +```"#) + .with_standard_argument("str", "String") + .with_argument("trim_str", "String expression to trim from the beginning of the input string. Can be a constant, column, or function, and any combination of arithmetic operators. _Default is whitespace characters._") + .with_related_udf("btrim") + .with_related_udf("rtrim") + .build() + .unwrap() + }) } #[cfg(test)] diff --git a/datafusion/functions/src/string/octet_length.rs b/datafusion/functions/src/string/octet_length.rs index 93e62961bca8..26ce70a416d0 100644 --- a/datafusion/functions/src/string/octet_length.rs +++ b/datafusion/functions/src/string/octet_length.rs @@ -15,17 +15,17 @@ // specific language governing permissions and limitations // under the License. -use std::any::Any; - use arrow::compute::kernels::length::length; use arrow::datatypes::DataType; +use std::any::Any; +use std::sync::OnceLock; +use crate::utils::utf8_to_int_type; use datafusion_common::{exec_err, Result, ScalarValue}; -use datafusion_expr::{ColumnarValue, Volatility}; +use datafusion_expr::scalar_doc_sections::DOC_SECTION_STRING; +use datafusion_expr::{ColumnarValue, Documentation, Volatility}; use datafusion_expr::{ScalarUDFImpl, Signature}; -use crate::utils::utf8_to_int_type; - #[derive(Debug)] pub struct OctetLengthFunc { signature: Signature, @@ -39,13 +39,8 @@ impl Default for OctetLengthFunc { impl OctetLengthFunc { pub fn new() -> Self { - use DataType::*; Self { - signature: Signature::uniform( - 1, - vec![Utf8, LargeUtf8, Utf8View], - Volatility::Immutable, - ), + signature: Signature::string(1, Volatility::Immutable), } } } @@ -91,6 +86,36 @@ impl ScalarUDFImpl for OctetLengthFunc { }, } } + + fn documentation(&self) -> Option<&Documentation> { + Some(get_octet_length_doc()) + } +} + +static DOCUMENTATION: OnceLock = OnceLock::new(); + +fn get_octet_length_doc() -> &'static Documentation { + DOCUMENTATION.get_or_init(|| { + Documentation::builder() + .with_doc_section(DOC_SECTION_STRING) + .with_description("Returns the length of a string in bytes.") + .with_syntax_example("octet_length(str)") + .with_sql_example( + r#"```sql +> select octet_length('Ångström'); ++--------------------------------+ +| octet_length(Utf8("Ångström")) | ++--------------------------------+ +| 10 | ++--------------------------------+ +```"#, + ) + .with_standard_argument("str", "String") + .with_related_udf("bit_length") + .with_related_udf("length") + .build() + .unwrap() + }) } #[cfg(test)] diff --git a/datafusion/functions/src/string/overlay.rs b/datafusion/functions/src/string/overlay.rs index e285bd85b197..3b31bc360851 100644 --- a/datafusion/functions/src/string/overlay.rs +++ b/datafusion/functions/src/string/overlay.rs @@ -16,21 +16,20 @@ // under the License. use std::any::Any; -use std::sync::Arc; +use std::sync::{Arc, OnceLock}; use arrow::array::{ArrayRef, GenericStringArray, OffsetSizeTrait}; use arrow::datatypes::DataType; +use crate::utils::{make_scalar_function, utf8_to_str_type}; use datafusion_common::cast::{ as_generic_string_array, as_int64_array, as_string_view_array, }; use datafusion_common::{exec_err, Result}; -use datafusion_expr::TypeSignature::*; -use datafusion_expr::{ColumnarValue, Volatility}; +use datafusion_expr::scalar_doc_sections::DOC_SECTION_STRING; +use datafusion_expr::{ColumnarValue, Documentation, TypeSignature, Volatility}; use datafusion_expr::{ScalarUDFImpl, Signature}; -use crate::utils::{make_scalar_function, utf8_to_str_type}; - #[derive(Debug)] pub struct OverlayFunc { signature: Signature, @@ -48,12 +47,12 @@ impl OverlayFunc { Self { signature: Signature::one_of( vec![ - Exact(vec![Utf8View, Utf8View, Int64, Int64]), - Exact(vec![Utf8, Utf8, Int64, Int64]), - Exact(vec![LargeUtf8, LargeUtf8, Int64, Int64]), - Exact(vec![Utf8View, Utf8View, Int64]), - Exact(vec![Utf8, Utf8, Int64]), - Exact(vec![LargeUtf8, LargeUtf8, Int64]), + TypeSignature::Exact(vec![Utf8View, Utf8View, Int64, Int64]), + TypeSignature::Exact(vec![Utf8, Utf8, Int64, Int64]), + TypeSignature::Exact(vec![LargeUtf8, LargeUtf8, Int64, Int64]), + TypeSignature::Exact(vec![Utf8View, Utf8View, Int64]), + TypeSignature::Exact(vec![Utf8, Utf8, Int64]), + TypeSignature::Exact(vec![LargeUtf8, LargeUtf8, Int64]), ], Volatility::Immutable, ), @@ -87,6 +86,35 @@ impl ScalarUDFImpl for OverlayFunc { other => exec_err!("Unsupported data type {other:?} for function overlay"), } } + + fn documentation(&self) -> Option<&Documentation> { + Some(get_overlay_doc()) + } +} + +static DOCUMENTATION: OnceLock = OnceLock::new(); + +fn get_overlay_doc() -> &'static Documentation { + DOCUMENTATION.get_or_init(|| { + Documentation::builder() + .with_doc_section(DOC_SECTION_STRING) + .with_description("Returns the string which is replaced by another string from the specified position and specified count length.") + .with_syntax_example("overlay(str PLACING substr FROM pos [FOR count])") + .with_sql_example(r#"```sql +> select overlay('Txxxxas' placing 'hom' from 2 for 4); ++--------------------------------------------------------+ +| overlay(Utf8("Txxxxas"),Utf8("hom"),Int64(2),Int64(4)) | ++--------------------------------------------------------+ +| Thomas | ++--------------------------------------------------------+ +```"#) + .with_standard_argument("str", "String") + .with_argument("substr", "Substring to replace in str.") + .with_argument("pos", "The start position to start the replace in str.") + .with_argument("count", "The count of characters to be replaced from start position of str. If not specified, will use substr length instead.") + .build() + .unwrap() + }) } macro_rules! process_overlay { diff --git a/datafusion/functions/src/string/repeat.rs b/datafusion/functions/src/string/repeat.rs index 0788e0428d7d..50ca641c12d7 100644 --- a/datafusion/functions/src/string/repeat.rs +++ b/datafusion/functions/src/string/repeat.rs @@ -16,24 +16,22 @@ // under the License. use std::any::Any; -use std::sync::Arc; +use std::sync::{Arc, OnceLock}; +use crate::string::common::StringArrayType; +use crate::utils::{make_scalar_function, utf8_to_str_type}; use arrow::array::{ ArrayRef, AsArray, GenericStringArray, GenericStringBuilder, Int64Array, OffsetSizeTrait, StringViewArray, }; use arrow::datatypes::DataType; use arrow::datatypes::DataType::{Int64, LargeUtf8, Utf8, Utf8View}; - use datafusion_common::cast::as_int64_array; use datafusion_common::{exec_err, Result}; -use datafusion_expr::TypeSignature::*; -use datafusion_expr::{ColumnarValue, Volatility}; +use datafusion_expr::scalar_doc_sections::DOC_SECTION_STRING; +use datafusion_expr::{ColumnarValue, Documentation, TypeSignature, Volatility}; use datafusion_expr::{ScalarUDFImpl, Signature}; -use crate::string::common::StringArrayType; -use crate::utils::{make_scalar_function, utf8_to_str_type}; - #[derive(Debug)] pub struct RepeatFunc { signature: Signature, @@ -53,9 +51,9 @@ impl RepeatFunc { // Planner attempts coercion to the target type starting with the most preferred candidate. // For example, given input `(Utf8View, Int64)`, it first tries coercing to `(Utf8View, Int64)`. // If that fails, it proceeds to `(Utf8, Int64)`. - Exact(vec![Utf8View, Int64]), - Exact(vec![Utf8, Int64]), - Exact(vec![LargeUtf8, Int64]), + TypeSignature::Exact(vec![Utf8View, Int64]), + TypeSignature::Exact(vec![Utf8, Int64]), + TypeSignature::Exact(vec![LargeUtf8, Int64]), ], Volatility::Immutable, ), @@ -83,6 +81,37 @@ impl ScalarUDFImpl for RepeatFunc { fn invoke(&self, args: &[ColumnarValue]) -> Result { make_scalar_function(repeat, vec![])(args) } + + fn documentation(&self) -> Option<&Documentation> { + Some(get_repeat_doc()) + } +} + +static DOCUMENTATION: OnceLock = OnceLock::new(); + +fn get_repeat_doc() -> &'static Documentation { + DOCUMENTATION.get_or_init(|| { + Documentation::builder() + .with_doc_section(DOC_SECTION_STRING) + .with_description( + "Returns a string with an input string repeated a specified number.", + ) + .with_syntax_example("repeat(str, n)") + .with_sql_example( + r#"```sql +> select repeat('data', 3); ++-------------------------------+ +| repeat(Utf8("data"),Int64(3)) | ++-------------------------------+ +| datadatadata | ++-------------------------------+ +```"#, + ) + .with_standard_argument("str", "String") + .with_argument("n", "Number of times to repeat the input string.") + .build() + .unwrap() + }) } /// Repeats string the specified number of times. diff --git a/datafusion/functions/src/string/replace.rs b/datafusion/functions/src/string/replace.rs index 7d3109ab63b9..e2ece6cb384f 100644 --- a/datafusion/functions/src/string/replace.rs +++ b/datafusion/functions/src/string/replace.rs @@ -16,19 +16,18 @@ // under the License. use std::any::Any; -use std::sync::Arc; +use std::sync::{Arc, OnceLock}; use arrow::array::{ArrayRef, GenericStringArray, OffsetSizeTrait, StringArray}; use arrow::datatypes::DataType; +use crate::utils::{make_scalar_function, utf8_to_str_type}; use datafusion_common::cast::{as_generic_string_array, as_string_view_array}; use datafusion_common::{exec_err, Result}; -use datafusion_expr::TypeSignature::*; -use datafusion_expr::{ColumnarValue, Volatility}; +use datafusion_expr::scalar_doc_sections::DOC_SECTION_STRING; +use datafusion_expr::{ColumnarValue, Documentation, Volatility}; use datafusion_expr::{ScalarUDFImpl, Signature}; -use crate::utils::{make_scalar_function, utf8_to_str_type}; - #[derive(Debug)] pub struct ReplaceFunc { signature: Signature, @@ -42,16 +41,8 @@ impl Default for ReplaceFunc { impl ReplaceFunc { pub fn new() -> Self { - use DataType::*; Self { - signature: Signature::one_of( - vec![ - Exact(vec![Utf8View, Utf8View, Utf8View]), - Exact(vec![Utf8, Utf8, Utf8]), - Exact(vec![LargeUtf8, LargeUtf8, LargeUtf8]), - ], - Volatility::Immutable, - ), + signature: Signature::string(3, Volatility::Immutable), } } } @@ -83,6 +74,34 @@ impl ScalarUDFImpl for ReplaceFunc { } } } + + fn documentation(&self) -> Option<&Documentation> { + Some(get_replace_doc()) + } +} + +static DOCUMENTATION: OnceLock = OnceLock::new(); + +fn get_replace_doc() -> &'static Documentation { + DOCUMENTATION.get_or_init(|| { + Documentation::builder() + .with_doc_section(DOC_SECTION_STRING) + .with_description("Replaces all occurrences of a specified substring in a string with a new substring.") + .with_syntax_example("replace(str, substr, replacement)") + .with_sql_example(r#"```sql +> select replace('ABabbaBA', 'ab', 'cd'); ++-------------------------------------------------+ +| replace(Utf8("ABabbaBA"),Utf8("ab"),Utf8("cd")) | ++-------------------------------------------------+ +| ABcdbaBA | ++-------------------------------------------------+ +```"#) + .with_standard_argument("str", "String") + .with_standard_argument("substr", "Substring expression to replace in the input string. Substring expression") + .with_standard_argument("replacement", "Replacement substring") + .build() + .unwrap() + }) } fn replace_view(args: &[ArrayRef]) -> Result { diff --git a/datafusion/functions/src/string/rtrim.rs b/datafusion/functions/src/string/rtrim.rs index a1aa5568babb..1a27502a2082 100644 --- a/datafusion/functions/src/string/rtrim.rs +++ b/datafusion/functions/src/string/rtrim.rs @@ -16,19 +16,18 @@ // under the License. use arrow::array::{ArrayRef, OffsetSizeTrait}; -use std::any::Any; - use arrow::datatypes::DataType; +use std::any::Any; +use std::sync::OnceLock; +use crate::string::common::*; +use crate::utils::{make_scalar_function, utf8_to_str_type}; use datafusion_common::{exec_err, Result}; use datafusion_expr::function::Hint; -use datafusion_expr::TypeSignature::*; -use datafusion_expr::{ColumnarValue, Volatility}; +use datafusion_expr::scalar_doc_sections::DOC_SECTION_STRING; +use datafusion_expr::{ColumnarValue, Documentation, TypeSignature, Volatility}; use datafusion_expr::{ScalarUDFImpl, Signature}; -use crate::string::common::*; -use crate::utils::{make_scalar_function, utf8_to_str_type}; - /// Returns the longest string with trailing characters removed. If the characters are not specified, whitespace is removed. /// rtrim('testxxzx', 'xyz') = 'test' fn rtrim(args: &[ArrayRef]) -> Result { @@ -49,18 +48,9 @@ impl Default for RtrimFunc { impl RtrimFunc { pub fn new() -> Self { - use DataType::*; Self { signature: Signature::one_of( - vec![ - // Planner attempts coercion to the target type starting with the most preferred candidate. - // For example, given input `(Utf8View, Utf8)`, it first tries coercing to `(Utf8View, Utf8View)`. - // If that fails, it proceeds to `(Utf8, Utf8)`. - Exact(vec![Utf8View, Utf8View]), - Exact(vec![Utf8, Utf8]), - Exact(vec![Utf8View]), - Exact(vec![Utf8]), - ], + vec![TypeSignature::String(2), TypeSignature::String(1)], Volatility::Immutable, ), } @@ -104,6 +94,41 @@ impl ScalarUDFImpl for RtrimFunc { ), } } + + fn documentation(&self) -> Option<&Documentation> { + Some(get_rtrim_doc()) + } +} + +static DOCUMENTATION: OnceLock = OnceLock::new(); + +fn get_rtrim_doc() -> &'static Documentation { + DOCUMENTATION.get_or_init(|| { + Documentation::builder() + .with_doc_section(DOC_SECTION_STRING) + .with_description("Trims the specified trim string from the end of a string. If no trim string is provided, all whitespace is removed from the end of the input string.") + .with_syntax_example("rtrim(str[, trim_str])") + .with_sql_example(r#"```sql +> select rtrim(' datafusion '); ++-------------------------------+ +| rtrim(Utf8(" datafusion ")) | ++-------------------------------+ +| datafusion | ++-------------------------------+ +> select rtrim('___datafusion___', '_'); ++-------------------------------------------+ +| rtrim(Utf8("___datafusion___"),Utf8("_")) | ++-------------------------------------------+ +| ___datafusion | ++-------------------------------------------+ +```"#) + .with_standard_argument("str", "String") + .with_argument("trim_str", "String expression to trim from the end of the input string. Can be a constant, column, or function, and any combination of arithmetic operators. _Default is whitespace characters._") + .with_related_udf("btrim") + .with_related_udf("ltrim") + .build() + .unwrap() + }) } #[cfg(test)] diff --git a/datafusion/functions/src/string/split_part.rs b/datafusion/functions/src/string/split_part.rs index 438e2e611359..5fedc5b17237 100644 --- a/datafusion/functions/src/string/split_part.rs +++ b/datafusion/functions/src/string/split_part.rs @@ -15,6 +15,7 @@ // specific language governing permissions and limitations // under the License. +use crate::utils::utf8_to_str_type; use arrow::array::{ ArrayRef, GenericStringArray, Int64Array, OffsetSizeTrait, StringViewArray, }; @@ -23,13 +24,11 @@ use arrow::datatypes::DataType; use datafusion_common::cast::as_int64_array; use datafusion_common::ScalarValue; use datafusion_common::{exec_err, DataFusionError, Result}; -use datafusion_expr::TypeSignature::*; -use datafusion_expr::{ColumnarValue, Volatility}; +use datafusion_expr::scalar_doc_sections::DOC_SECTION_STRING; +use datafusion_expr::{ColumnarValue, Documentation, TypeSignature, Volatility}; use datafusion_expr::{ScalarUDFImpl, Signature}; use std::any::Any; -use std::sync::Arc; - -use crate::utils::utf8_to_str_type; +use std::sync::{Arc, OnceLock}; use super::common::StringArrayType; @@ -50,15 +49,15 @@ impl SplitPartFunc { Self { signature: Signature::one_of( vec![ - Exact(vec![Utf8View, Utf8View, Int64]), - Exact(vec![Utf8View, Utf8, Int64]), - Exact(vec![Utf8View, LargeUtf8, Int64]), - Exact(vec![Utf8, Utf8View, Int64]), - Exact(vec![Utf8, Utf8, Int64]), - Exact(vec![LargeUtf8, Utf8View, Int64]), - Exact(vec![LargeUtf8, Utf8, Int64]), - Exact(vec![Utf8, LargeUtf8, Int64]), - Exact(vec![LargeUtf8, LargeUtf8, Int64]), + TypeSignature::Exact(vec![Utf8View, Utf8View, Int64]), + TypeSignature::Exact(vec![Utf8View, Utf8, Int64]), + TypeSignature::Exact(vec![Utf8View, LargeUtf8, Int64]), + TypeSignature::Exact(vec![Utf8, Utf8View, Int64]), + TypeSignature::Exact(vec![Utf8, Utf8, Int64]), + TypeSignature::Exact(vec![LargeUtf8, Utf8View, Int64]), + TypeSignature::Exact(vec![LargeUtf8, Utf8, Int64]), + TypeSignature::Exact(vec![Utf8, LargeUtf8, Int64]), + TypeSignature::Exact(vec![LargeUtf8, LargeUtf8, Int64]), ], Volatility::Immutable, ), @@ -178,6 +177,34 @@ impl ScalarUDFImpl for SplitPartFunc { result.map(ColumnarValue::Array) } } + + fn documentation(&self) -> Option<&Documentation> { + Some(get_split_part_doc()) + } +} + +static DOCUMENTATION: OnceLock = OnceLock::new(); + +fn get_split_part_doc() -> &'static Documentation { + DOCUMENTATION.get_or_init(|| { + Documentation::builder() + .with_doc_section(DOC_SECTION_STRING) + .with_description("Splits a string based on a specified delimiter and returns the substring in the specified position.") + .with_syntax_example("split_part(str, delimiter, pos)") + .with_sql_example(r#"```sql +> select split_part('1.2.3.4.5', '.', 3); ++--------------------------------------------------+ +| split_part(Utf8("1.2.3.4.5"),Utf8("."),Int64(3)) | ++--------------------------------------------------+ +| 3 | ++--------------------------------------------------+ +```"#) + .with_standard_argument("str", "String") + .with_argument("delimiter", "String or character to split on.") + .with_argument("pos", "Position of the part to return.") + .build() + .unwrap() + }) } /// impl diff --git a/datafusion/functions/src/string/starts_with.rs b/datafusion/functions/src/string/starts_with.rs index e0c0fbdb3df7..a7917036eb28 100644 --- a/datafusion/functions/src/string/starts_with.rs +++ b/datafusion/functions/src/string/starts_with.rs @@ -16,18 +16,17 @@ // under the License. use std::any::Any; -use std::sync::Arc; +use std::sync::{Arc, OnceLock}; use arrow::array::ArrayRef; use arrow::datatypes::DataType; +use crate::utils::make_scalar_function; use datafusion_common::{internal_err, Result}; -use datafusion_expr::ColumnarValue; -use datafusion_expr::TypeSignature::*; +use datafusion_expr::scalar_doc_sections::DOC_SECTION_STRING; +use datafusion_expr::{ColumnarValue, Documentation}; use datafusion_expr::{ScalarUDFImpl, Signature, Volatility}; -use crate::utils::make_scalar_function; - /// Returns true if string starts with prefix. /// starts_with('alphabet', 'alph') = 't' pub fn starts_with(args: &[ArrayRef]) -> Result { @@ -49,17 +48,7 @@ impl Default for StartsWithFunc { impl StartsWithFunc { pub fn new() -> Self { Self { - signature: Signature::one_of( - vec![ - // Planner attempts coercion to the target type starting with the most preferred candidate. - // For example, given input `(Utf8View, Utf8)`, it first tries coercing to `(Utf8View, Utf8View)`. - // If that fails, it proceeds to `(Utf8, Utf8)`. - Exact(vec![DataType::Utf8View, DataType::Utf8View]), - Exact(vec![DataType::Utf8, DataType::Utf8]), - Exact(vec![DataType::LargeUtf8, DataType::LargeUtf8]), - ], - Volatility::Immutable, - ), + signature: Signature::string(2, Volatility::Immutable), } } } @@ -89,6 +78,35 @@ impl ScalarUDFImpl for StartsWithFunc { _ => internal_err!("Unsupported data types for starts_with. Expected Utf8, LargeUtf8 or Utf8View")?, } } + + fn documentation(&self) -> Option<&Documentation> { + Some(get_starts_with_doc()) + } +} + +static DOCUMENTATION: OnceLock = OnceLock::new(); + +fn get_starts_with_doc() -> &'static Documentation { + DOCUMENTATION.get_or_init(|| { + Documentation::builder() + .with_doc_section(DOC_SECTION_STRING) + .with_description("Tests if a string starts with a substring.") + .with_syntax_example("starts_with(str, substr)") + .with_sql_example( + r#"```sql +> select starts_with('datafusion','data'); ++----------------------------------------------+ +| starts_with(Utf8("datafusion"),Utf8("data")) | ++----------------------------------------------+ +| true | ++----------------------------------------------+ +```"#, + ) + .with_standard_argument("str", "String") + .with_argument("substr", "Substring to test for.") + .build() + .unwrap() + }) } #[cfg(test)] diff --git a/datafusion/functions/src/string/to_hex.rs b/datafusion/functions/src/string/to_hex.rs index 79aa9254f9b1..72cd4fbffa33 100644 --- a/datafusion/functions/src/string/to_hex.rs +++ b/datafusion/functions/src/string/to_hex.rs @@ -16,21 +16,21 @@ // under the License. use std::any::Any; -use std::sync::Arc; +use std::sync::{Arc, OnceLock}; use arrow::array::{ArrayRef, GenericStringArray, OffsetSizeTrait}; use arrow::datatypes::{ ArrowNativeType, ArrowPrimitiveType, DataType, Int32Type, Int64Type, }; +use crate::utils::make_scalar_function; use datafusion_common::cast::as_primitive_array; use datafusion_common::Result; use datafusion_common::{exec_err, plan_err}; -use datafusion_expr::ColumnarValue; +use datafusion_expr::scalar_doc_sections::DOC_SECTION_STRING; +use datafusion_expr::{ColumnarValue, Documentation}; use datafusion_expr::{ScalarUDFImpl, Signature, Volatility}; -use crate::utils::make_scalar_function; - /// Converts the number to its equivalent hexadecimal representation. /// to_hex(2147483647) = '7fffffff' pub fn to_hex(args: &[ArrayRef]) -> Result @@ -110,6 +110,34 @@ impl ScalarUDFImpl for ToHexFunc { other => exec_err!("Unsupported data type {other:?} for function to_hex"), } } + + fn documentation(&self) -> Option<&Documentation> { + Some(get_to_hex_doc()) + } +} + +static DOCUMENTATION: OnceLock = OnceLock::new(); + +fn get_to_hex_doc() -> &'static Documentation { + DOCUMENTATION.get_or_init(|| { + Documentation::builder() + .with_doc_section(DOC_SECTION_STRING) + .with_description("Converts an integer to a hexadecimal string.") + .with_syntax_example("to_hex(int)") + .with_sql_example( + r#"```sql +> select to_hex(12345689); ++-------------------------+ +| to_hex(Int64(12345689)) | ++-------------------------+ +| bc6159 | ++-------------------------+ +```"#, + ) + .with_standard_argument("int", "Integer") + .build() + .unwrap() + }) } #[cfg(test)] diff --git a/datafusion/functions/src/string/upper.rs b/datafusion/functions/src/string/upper.rs index 593e33ab6bb4..bfcb2a86994d 100644 --- a/datafusion/functions/src/string/upper.rs +++ b/datafusion/functions/src/string/upper.rs @@ -19,9 +19,11 @@ use crate::string::common::to_upper; use crate::utils::utf8_to_str_type; use arrow::datatypes::DataType; use datafusion_common::Result; -use datafusion_expr::ColumnarValue; +use datafusion_expr::scalar_doc_sections::DOC_SECTION_STRING; +use datafusion_expr::{ColumnarValue, Documentation}; use datafusion_expr::{ScalarUDFImpl, Signature, Volatility}; use std::any::Any; +use std::sync::OnceLock; #[derive(Debug)] pub struct UpperFunc { @@ -36,13 +38,8 @@ impl Default for UpperFunc { impl UpperFunc { pub fn new() -> Self { - use DataType::*; Self { - signature: Signature::uniform( - 1, - vec![Utf8, LargeUtf8, Utf8View], - Volatility::Immutable, - ), + signature: Signature::string(1, Volatility::Immutable), } } } @@ -67,6 +64,36 @@ impl ScalarUDFImpl for UpperFunc { fn invoke(&self, args: &[ColumnarValue]) -> Result { to_upper(args, "upper") } + + fn documentation(&self) -> Option<&Documentation> { + Some(get_upper_doc()) + } +} + +static DOCUMENTATION: OnceLock = OnceLock::new(); + +fn get_upper_doc() -> &'static Documentation { + DOCUMENTATION.get_or_init(|| { + Documentation::builder() + .with_doc_section(DOC_SECTION_STRING) + .with_description("Converts a string to upper-case.") + .with_syntax_example("upper(str)") + .with_sql_example( + r#"```sql +> select upper('dataFusion'); ++---------------------------+ +| upper(Utf8("dataFusion")) | ++---------------------------+ +| DATAFUSION | ++---------------------------+ +```"#, + ) + .with_standard_argument("str", "String") + .with_related_udf("initcap") + .with_related_udf("lower") + .build() + .unwrap() + }) } #[cfg(test)] diff --git a/datafusion/functions/src/string/uuid.rs b/datafusion/functions/src/string/uuid.rs index 3ddc320fcec1..0fbdce16ccd1 100644 --- a/datafusion/functions/src/string/uuid.rs +++ b/datafusion/functions/src/string/uuid.rs @@ -16,7 +16,7 @@ // under the License. use std::any::Any; -use std::sync::Arc; +use std::sync::{Arc, OnceLock}; use arrow::array::GenericStringArray; use arrow::datatypes::DataType; @@ -24,7 +24,8 @@ use arrow::datatypes::DataType::Utf8; use uuid::Uuid; use datafusion_common::{not_impl_err, Result}; -use datafusion_expr::{ColumnarValue, Volatility}; +use datafusion_expr::scalar_doc_sections::DOC_SECTION_STRING; +use datafusion_expr::{ColumnarValue, Documentation, Volatility}; use datafusion_expr::{ScalarUDFImpl, Signature}; #[derive(Debug)] @@ -74,4 +75,29 @@ impl ScalarUDFImpl for UuidFunc { let array = GenericStringArray::::from_iter_values(values); Ok(ColumnarValue::Array(Arc::new(array))) } + + fn documentation(&self) -> Option<&Documentation> { + Some(get_uuid_doc()) + } +} + +static DOCUMENTATION: OnceLock = OnceLock::new(); + +fn get_uuid_doc() -> &'static Documentation { + DOCUMENTATION.get_or_init(|| { + Documentation::builder() + .with_doc_section(DOC_SECTION_STRING) + .with_description("Returns [`UUID v4`](https://en.wikipedia.org/wiki/Universally_unique_identifier#Version_4_(random)) string value which is unique per row.") + .with_syntax_example("uuid()") + .with_sql_example(r#"```sql +> select uuid(); ++--------------------------------------+ +| uuid() | ++--------------------------------------+ +| 6ec17ef8-1934-41cc-8d59-d0c8f9eea1f0 | ++--------------------------------------+ +```"#) + .build() + .unwrap() + }) } diff --git a/datafusion/functions/src/unicode/character_length.rs b/datafusion/functions/src/unicode/character_length.rs index 768e88c019a6..2e108f127a73 100644 --- a/datafusion/functions/src/unicode/character_length.rs +++ b/datafusion/functions/src/unicode/character_length.rs @@ -22,9 +22,12 @@ use arrow::array::{ }; use arrow::datatypes::{ArrowNativeType, DataType, Int32Type, Int64Type}; use datafusion_common::Result; -use datafusion_expr::{ColumnarValue, ScalarUDFImpl, Signature, Volatility}; +use datafusion_expr::scalar_doc_sections::DOC_SECTION_STRING; +use datafusion_expr::{ + ColumnarValue, Documentation, ScalarUDFImpl, Signature, Volatility, +}; use std::any::Any; -use std::sync::Arc; +use std::sync::{Arc, OnceLock}; #[derive(Debug)] pub struct CharacterLengthFunc { @@ -76,6 +79,36 @@ impl ScalarUDFImpl for CharacterLengthFunc { fn aliases(&self) -> &[String] { &self.aliases } + + fn documentation(&self) -> Option<&Documentation> { + Some(get_character_length_doc()) + } +} + +static DOCUMENTATION: OnceLock = OnceLock::new(); + +fn get_character_length_doc() -> &'static Documentation { + DOCUMENTATION.get_or_init(|| { + Documentation::builder() + .with_doc_section(DOC_SECTION_STRING) + .with_description("Returns the number of characters in a string.") + .with_syntax_example("character_length(str)") + .with_sql_example( + r#"```sql +> select character_length('Ångström'); ++------------------------------------+ +| character_length(Utf8("Ångström")) | ++------------------------------------+ +| 8 | ++------------------------------------+ +```"#, + ) + .with_standard_argument("str", "String") + .with_related_udf("bit_length") + .with_related_udf("octet_length") + .build() + .unwrap() + }) } /// Returns number of characters in the string. diff --git a/datafusion/functions/src/unicode/find_in_set.rs b/datafusion/functions/src/unicode/find_in_set.rs index 41a2b9d9e72d..cad860e41088 100644 --- a/datafusion/functions/src/unicode/find_in_set.rs +++ b/datafusion/functions/src/unicode/find_in_set.rs @@ -16,7 +16,7 @@ // under the License. use std::any::Any; -use std::sync::Arc; +use std::sync::{Arc, OnceLock}; use arrow::array::{ ArrayAccessor, ArrayIter, ArrayRef, ArrowPrimitiveType, AsArray, OffsetSizeTrait, @@ -24,11 +24,13 @@ use arrow::array::{ }; use arrow::datatypes::{ArrowNativeType, DataType, Int32Type, Int64Type}; +use crate::utils::{make_scalar_function, utf8_to_int_type}; use datafusion_common::{exec_err, Result}; +use datafusion_expr::scalar_doc_sections::DOC_SECTION_STRING; use datafusion_expr::TypeSignature::Exact; -use datafusion_expr::{ColumnarValue, ScalarUDFImpl, Signature, Volatility}; - -use crate::utils::{make_scalar_function, utf8_to_int_type}; +use datafusion_expr::{ + ColumnarValue, Documentation, ScalarUDFImpl, Signature, Volatility, +}; #[derive(Debug)] pub struct FindInSetFunc { @@ -77,6 +79,33 @@ impl ScalarUDFImpl for FindInSetFunc { fn invoke(&self, args: &[ColumnarValue]) -> Result { make_scalar_function(find_in_set, vec![])(args) } + + fn documentation(&self) -> Option<&Documentation> { + Some(get_find_in_set_doc()) + } +} + +static DOCUMENTATION: OnceLock = OnceLock::new(); + +fn get_find_in_set_doc() -> &'static Documentation { + DOCUMENTATION.get_or_init(|| { + Documentation::builder() + .with_doc_section(DOC_SECTION_STRING) + .with_description("Returns a value in the range of 1 to N if the string str is in the string list strlist consisting of N substrings.") + .with_syntax_example("find_in_set(str, strlist)") + .with_sql_example(r#"```sql +> select find_in_set('b', 'a,b,c,d'); ++----------------------------------------+ +| find_in_set(Utf8("b"),Utf8("a,b,c,d")) | ++----------------------------------------+ +| 2 | ++----------------------------------------+ +```"#) + .with_argument("str", "String expression to find in strlist.") + .with_argument("strlist", "A string list is a string composed of substrings separated by , characters.") + .build() + .unwrap() + }) } ///Returns a value in the range of 1 to N if the string str is in the string list strlist consisting of N substrings diff --git a/datafusion/functions/src/unicode/left.rs b/datafusion/functions/src/unicode/left.rs index f1f84c98ef5e..f8a507d51526 100644 --- a/datafusion/functions/src/unicode/left.rs +++ b/datafusion/functions/src/unicode/left.rs @@ -17,7 +17,7 @@ use std::any::Any; use std::cmp::Ordering; -use std::sync::Arc; +use std::sync::{Arc, OnceLock}; use arrow::array::{ Array, ArrayAccessor, ArrayIter, ArrayRef, GenericStringArray, Int64Array, @@ -25,15 +25,17 @@ use arrow::array::{ }; use arrow::datatypes::DataType; +use crate::utils::{make_scalar_function, utf8_to_str_type}; use datafusion_common::cast::{ as_generic_string_array, as_int64_array, as_string_view_array, }; use datafusion_common::exec_err; use datafusion_common::Result; +use datafusion_expr::scalar_doc_sections::DOC_SECTION_STRING; use datafusion_expr::TypeSignature::Exact; -use datafusion_expr::{ColumnarValue, ScalarUDFImpl, Signature, Volatility}; - -use crate::utils::{make_scalar_function, utf8_to_str_type}; +use datafusion_expr::{ + ColumnarValue, Documentation, ScalarUDFImpl, Signature, Volatility, +}; #[derive(Debug)] pub struct LeftFunc { @@ -91,6 +93,34 @@ impl ScalarUDFImpl for LeftFunc { ), } } + + fn documentation(&self) -> Option<&Documentation> { + Some(get_left_doc()) + } +} + +static DOCUMENTATION: OnceLock = OnceLock::new(); + +fn get_left_doc() -> &'static Documentation { + DOCUMENTATION.get_or_init(|| { + Documentation::builder() + .with_doc_section(DOC_SECTION_STRING) + .with_description("Returns a specified number of characters from the left side of a string.") + .with_syntax_example("left(str, n)") + .with_sql_example(r#"```sql +> select left('datafusion', 4); ++-----------------------------------+ +| left(Utf8("datafusion"),Int64(4)) | ++-----------------------------------+ +| data | ++-----------------------------------+ +```"#) + .with_standard_argument("str", "String") + .with_argument("n", "Number of characters to return.") + .with_related_udf("right") + .build() + .unwrap() + }) } /// Returns first n characters in the string, or when n is negative, returns all but last |n| characters. diff --git a/datafusion/functions/src/unicode/lpad.rs b/datafusion/functions/src/unicode/lpad.rs index 5dfe4e0d82aa..f1c3646fd168 100644 --- a/datafusion/functions/src/unicode/lpad.rs +++ b/datafusion/functions/src/unicode/lpad.rs @@ -17,7 +17,7 @@ use std::any::Any; use std::fmt::Write; -use std::sync::Arc; +use std::sync::{Arc, OnceLock}; use arrow::array::{ Array, ArrayRef, AsArray, GenericStringArray, GenericStringBuilder, Int64Array, @@ -27,13 +27,15 @@ use arrow::datatypes::DataType; use unicode_segmentation::UnicodeSegmentation; use DataType::{LargeUtf8, Utf8, Utf8View}; +use crate::string::common::StringArrayType; +use crate::utils::{make_scalar_function, utf8_to_str_type}; use datafusion_common::cast::as_int64_array; use datafusion_common::{exec_err, Result}; +use datafusion_expr::scalar_doc_sections::DOC_SECTION_STRING; use datafusion_expr::TypeSignature::Exact; -use datafusion_expr::{ColumnarValue, ScalarUDFImpl, Signature, Volatility}; - -use crate::string::common::StringArrayType; -use crate::utils::{make_scalar_function, utf8_to_str_type}; +use datafusion_expr::{ + ColumnarValue, Documentation, ScalarUDFImpl, Signature, Volatility, +}; #[derive(Debug)] pub struct LPadFunc { @@ -95,6 +97,35 @@ impl ScalarUDFImpl for LPadFunc { other => exec_err!("Unsupported data type {other:?} for function lpad"), } } + + fn documentation(&self) -> Option<&Documentation> { + Some(get_lpad_doc()) + } +} + +static DOCUMENTATION: OnceLock = OnceLock::new(); + +fn get_lpad_doc() -> &'static Documentation { + DOCUMENTATION.get_or_init(|| { + Documentation::builder() + .with_doc_section(DOC_SECTION_STRING) + .with_description("Pads the left side of a string with another string to a specified string length.") + .with_syntax_example("lpad(str, n[, padding_str])") + .with_sql_example(r#"```sql +> select lpad('Dolly', 10, 'hello'); ++---------------------------------------------+ +| lpad(Utf8("Dolly"),Int64(10),Utf8("hello")) | ++---------------------------------------------+ +| helloDolly | ++---------------------------------------------+ +```"#) + .with_standard_argument("str", "String") + .with_argument("n", "String length to pad to.") + .with_argument("padding_str", "Optional string expression to pad with. Can be a constant, column, or function, and any combination of string operators. _Default is a space._") + .with_related_udf("rpad") + .build() + .unwrap() + }) } /// Extends the string to length 'length' by prepending the characters fill (a space by default). diff --git a/datafusion/functions/src/unicode/reverse.rs b/datafusion/functions/src/unicode/reverse.rs index b7af2d31e711..0190705966fb 100644 --- a/datafusion/functions/src/unicode/reverse.rs +++ b/datafusion/functions/src/unicode/reverse.rs @@ -16,19 +16,21 @@ // under the License. use std::any::Any; -use std::sync::Arc; +use std::sync::{Arc, OnceLock}; +use crate::utils::{make_scalar_function, utf8_to_str_type}; use arrow::array::{ Array, ArrayAccessor, ArrayIter, ArrayRef, AsArray, GenericStringArray, OffsetSizeTrait, }; use arrow::datatypes::DataType; use datafusion_common::{exec_err, Result}; -use datafusion_expr::{ColumnarValue, ScalarUDFImpl, Signature, Volatility}; +use datafusion_expr::scalar_doc_sections::DOC_SECTION_STRING; +use datafusion_expr::{ + ColumnarValue, Documentation, ScalarUDFImpl, Signature, Volatility, +}; use DataType::{LargeUtf8, Utf8, Utf8View}; -use crate::utils::{make_scalar_function, utf8_to_str_type}; - #[derive(Debug)] pub struct ReverseFunc { signature: Signature, @@ -79,6 +81,34 @@ impl ScalarUDFImpl for ReverseFunc { } } } + + fn documentation(&self) -> Option<&Documentation> { + Some(get_reverse_doc()) + } +} + +static DOCUMENTATION: OnceLock = OnceLock::new(); + +fn get_reverse_doc() -> &'static Documentation { + DOCUMENTATION.get_or_init(|| { + Documentation::builder() + .with_doc_section(DOC_SECTION_STRING) + .with_description("Reverses the character order of a string.") + .with_syntax_example("reverse(str)") + .with_sql_example( + r#"```sql +> select reverse('datafusion'); ++-----------------------------+ +| reverse(Utf8("datafusion")) | ++-----------------------------+ +| noisufatad | ++-----------------------------+ +```"#, + ) + .with_standard_argument("str", "String") + .build() + .unwrap() + }) } /// Reverses the order of the characters in the string. diff --git a/datafusion/functions/src/unicode/right.rs b/datafusion/functions/src/unicode/right.rs index 7fadb058c19b..1ec08cb87eee 100644 --- a/datafusion/functions/src/unicode/right.rs +++ b/datafusion/functions/src/unicode/right.rs @@ -17,7 +17,7 @@ use std::any::Any; use std::cmp::{max, Ordering}; -use std::sync::Arc; +use std::sync::{Arc, OnceLock}; use arrow::array::{ Array, ArrayAccessor, ArrayIter, ArrayRef, GenericStringArray, Int64Array, @@ -31,8 +31,11 @@ use datafusion_common::cast::{ }; use datafusion_common::exec_err; use datafusion_common::Result; +use datafusion_expr::scalar_doc_sections::DOC_SECTION_STRING; use datafusion_expr::TypeSignature::Exact; -use datafusion_expr::{ColumnarValue, ScalarUDFImpl, Signature, Volatility}; +use datafusion_expr::{ + ColumnarValue, Documentation, ScalarUDFImpl, Signature, Volatility, +}; #[derive(Debug)] pub struct RightFunc { @@ -90,6 +93,34 @@ impl ScalarUDFImpl for RightFunc { ), } } + + fn documentation(&self) -> Option<&Documentation> { + Some(get_right_doc()) + } +} + +static DOCUMENTATION: OnceLock = OnceLock::new(); + +fn get_right_doc() -> &'static Documentation { + DOCUMENTATION.get_or_init(|| { + Documentation::builder() + .with_doc_section(DOC_SECTION_STRING) + .with_description("Returns a specified number of characters from the right side of a string.") + .with_syntax_example("right(str, n)") + .with_sql_example(r#"```sql +> select right('datafusion', 6); ++------------------------------------+ +| right(Utf8("datafusion"),Int64(6)) | ++------------------------------------+ +| fusion | ++------------------------------------+ +```"#) + .with_standard_argument("str", "String") + .with_argument("n", "Number of characters to return") + .with_related_udf("left") + .build() + .unwrap() + }) } /// Returns last n characters in the string, or when n is negative, returns all but first |n| characters. diff --git a/datafusion/functions/src/unicode/rpad.rs b/datafusion/functions/src/unicode/rpad.rs index 4c8c2c2ca5f5..9d1d6c989eab 100644 --- a/datafusion/functions/src/unicode/rpad.rs +++ b/datafusion/functions/src/unicode/rpad.rs @@ -25,11 +25,14 @@ use arrow::datatypes::DataType; use datafusion_common::cast::as_int64_array; use datafusion_common::DataFusionError; use datafusion_common::{exec_err, Result}; +use datafusion_expr::scalar_doc_sections::DOC_SECTION_STRING; use datafusion_expr::TypeSignature::Exact; -use datafusion_expr::{ColumnarValue, ScalarUDFImpl, Signature, Volatility}; +use datafusion_expr::{ + ColumnarValue, Documentation, ScalarUDFImpl, Signature, Volatility, +}; use std::any::Any; use std::fmt::Write; -use std::sync::Arc; +use std::sync::{Arc, OnceLock}; use unicode_segmentation::UnicodeSegmentation; use DataType::{LargeUtf8, Utf8, Utf8View}; @@ -113,6 +116,39 @@ impl ScalarUDFImpl for RPadFunc { } } } + + fn documentation(&self) -> Option<&Documentation> { + Some(get_rpad_doc()) + } +} + +static DOCUMENTATION: OnceLock = OnceLock::new(); + +fn get_rpad_doc() -> &'static Documentation { + DOCUMENTATION.get_or_init(|| { + Documentation::builder() + .with_doc_section(DOC_SECTION_STRING) + .with_description("Pads the right side of a string with another string to a specified string length.") + .with_syntax_example("rpad(str, n[, padding_str])") + .with_sql_example(r#"```sql +> select rpad('datafusion', 20, '_-'); ++-----------------------------------------------+ +| rpad(Utf8("datafusion"),Int64(20),Utf8("_-")) | ++-----------------------------------------------+ +| datafusion_-_-_-_-_- | ++-----------------------------------------------+ +```"#) + .with_standard_argument( + "str", + "String", + ) + .with_argument("n", "String length to pad to.") + .with_argument("padding_str", + "String expression to pad with. Can be a constant, column, or function, and any combination of string operators. _Default is a space._") + .with_related_udf("lpad") + .build() + .unwrap() + }) } pub fn rpad( diff --git a/datafusion/functions/src/unicode/strpos.rs b/datafusion/functions/src/unicode/strpos.rs index e5ffd20bbcd8..5c90d1923996 100644 --- a/datafusion/functions/src/unicode/strpos.rs +++ b/datafusion/functions/src/unicode/strpos.rs @@ -16,15 +16,17 @@ // under the License. use std::any::Any; -use std::sync::Arc; - -use arrow::array::{ArrayRef, ArrowPrimitiveType, AsArray, PrimitiveArray}; -use arrow::datatypes::{ArrowNativeType, DataType, Int32Type, Int64Type}; +use std::sync::{Arc, OnceLock}; use crate::string::common::StringArrayType; use crate::utils::{make_scalar_function, utf8_to_int_type}; -use datafusion_common::{exec_err, plan_err, Result}; -use datafusion_expr::{ColumnarValue, ScalarUDFImpl, Signature, Volatility}; +use arrow::array::{ArrayRef, ArrowPrimitiveType, AsArray, PrimitiveArray}; +use arrow::datatypes::{ArrowNativeType, DataType, Int32Type, Int64Type}; +use datafusion_common::{exec_err, Result}; +use datafusion_expr::scalar_doc_sections::DOC_SECTION_STRING; +use datafusion_expr::{ + ColumnarValue, Documentation, ScalarUDFImpl, Signature, Volatility, +}; #[derive(Debug)] pub struct StrposFunc { @@ -41,7 +43,7 @@ impl Default for StrposFunc { impl StrposFunc { pub fn new() -> Self { Self { - signature: Signature::user_defined(Volatility::Immutable), + signature: Signature::string(2, Volatility::Immutable), aliases: vec![String::from("instr"), String::from("position")], } } @@ -72,26 +74,34 @@ impl ScalarUDFImpl for StrposFunc { &self.aliases } - fn coerce_types(&self, arg_types: &[DataType]) -> Result> { - match arg_types { - [first, second ] => { - match (first, second) { - (DataType::LargeUtf8 | DataType::Utf8View | DataType::Utf8, DataType::LargeUtf8 | DataType::Utf8View | DataType::Utf8) => Ok(arg_types.to_vec()), - (DataType::Null, DataType::Null) => Ok(vec![DataType::Utf8, DataType::Utf8]), - (DataType::Null, _) => Ok(vec![DataType::Utf8, second.to_owned()]), - (_, DataType::Null) => Ok(vec![first.to_owned(), DataType::Utf8]), - (DataType::Dictionary(_, value_type), DataType::LargeUtf8 | DataType::Utf8View | DataType::Utf8) => match **value_type { - DataType::LargeUtf8 | DataType::Utf8View | DataType::Utf8 | DataType::Null | DataType::Binary => Ok(vec![*value_type.clone(), second.to_owned()]), - _ => plan_err!("The STRPOS/INSTR/POSITION function can only accept strings, but got {:?}.", **value_type), - }, - _ => plan_err!("The STRPOS/INSTR/POSITION function can only accept strings, but got {:?}.", arg_types) - } - }, - _ => plan_err!("The STRPOS/INSTR/POSITION function can only accept strings, but got {:?}", arg_types) - } + fn documentation(&self) -> Option<&Documentation> { + Some(get_strpos_doc()) } } +static DOCUMENTATION: OnceLock = OnceLock::new(); + +fn get_strpos_doc() -> &'static Documentation { + DOCUMENTATION.get_or_init(|| { + Documentation::builder() + .with_doc_section(DOC_SECTION_STRING) + .with_description("Returns the starting position of a specified substring in a string. Positions begin at 1. If the substring does not exist in the string, the function returns 0.") + .with_syntax_example("strpos(str, substr)") + .with_sql_example(r#"```sql +> select strpos('datafusion', 'fus'); ++----------------------------------------+ +| strpos(Utf8("datafusion"),Utf8("fus")) | ++----------------------------------------+ +| 5 | ++----------------------------------------+ +```"#) + .with_standard_argument("str", "String") + .with_argument("substr", "Substring expression to search for.") + .build() + .unwrap() + }) +} + fn strpos(args: &[ArrayRef]) -> Result { match (args[0].data_type(), args[1].data_type()) { (DataType::Utf8, DataType::Utf8) => { diff --git a/datafusion/functions/src/unicode/substr.rs b/datafusion/functions/src/unicode/substr.rs index 565454d838b6..d4e03edebb09 100644 --- a/datafusion/functions/src/unicode/substr.rs +++ b/datafusion/functions/src/unicode/substr.rs @@ -16,7 +16,7 @@ // under the License. use std::any::Any; -use std::sync::Arc; +use std::sync::{Arc, OnceLock}; use crate::string::common::{make_and_append_view, StringArrayType}; use crate::utils::{make_scalar_function, utf8_to_str_type}; @@ -28,7 +28,10 @@ use arrow::datatypes::DataType; use arrow_buffer::{NullBufferBuilder, ScalarBuffer}; use datafusion_common::cast::as_int64_array; use datafusion_common::{exec_err, plan_err, Result}; -use datafusion_expr::{ColumnarValue, ScalarUDFImpl, Signature, Volatility}; +use datafusion_expr::scalar_doc_sections::DOC_SECTION_STRING; +use datafusion_expr::{ + ColumnarValue, Documentation, ScalarUDFImpl, Signature, Volatility, +}; #[derive(Debug)] pub struct SubstrFunc { @@ -138,6 +141,34 @@ impl ScalarUDFImpl for SubstrFunc { ]) } } + + fn documentation(&self) -> Option<&Documentation> { + Some(get_substr_doc()) + } +} + +static DOCUMENTATION: OnceLock = OnceLock::new(); + +fn get_substr_doc() -> &'static Documentation { + DOCUMENTATION.get_or_init(|| { + Documentation::builder() + .with_doc_section(DOC_SECTION_STRING) + .with_description("Extracts a substring of a specified number of characters from a specific starting position in a string.") + .with_syntax_example("substr(str, start_pos[, length])") + .with_sql_example(r#"```sql +> select substr('datafusion', 5, 3); ++----------------------------------------------+ +| substr(Utf8("datafusion"),Int64(5),Int64(3)) | ++----------------------------------------------+ +| fus | ++----------------------------------------------+ +```"#) + .with_standard_argument("str", "String") + .with_argument("start_pos", "Character position to start the substring at. The first character in the string has a position of 1.") + .with_argument("length", "Number of characters to extract. If not specified, returns the rest of the string after the start position.") + .build() + .unwrap() + }) } /// Extracts the substring of string starting at the start'th character, and extending for count characters if that is specified. (Same as substring(string from start for count).) diff --git a/datafusion/functions/src/unicode/substrindex.rs b/datafusion/functions/src/unicode/substrindex.rs index 9ca3d018d884..c628367cf355 100644 --- a/datafusion/functions/src/unicode/substrindex.rs +++ b/datafusion/functions/src/unicode/substrindex.rs @@ -16,7 +16,7 @@ // under the License. use std::any::Any; -use std::sync::Arc; +use std::sync::{Arc, OnceLock}; use arrow::array::{ ArrayAccessor, ArrayIter, ArrayRef, ArrowPrimitiveType, AsArray, OffsetSizeTrait, @@ -24,11 +24,13 @@ use arrow::array::{ }; use arrow::datatypes::{DataType, Int32Type, Int64Type}; +use crate::utils::{make_scalar_function, utf8_to_str_type}; use datafusion_common::{exec_err, Result}; +use datafusion_expr::scalar_doc_sections::DOC_SECTION_STRING; use datafusion_expr::TypeSignature::Exact; -use datafusion_expr::{ColumnarValue, ScalarUDFImpl, Signature, Volatility}; - -use crate::utils::{make_scalar_function, utf8_to_str_type}; +use datafusion_expr::{ + ColumnarValue, Documentation, ScalarUDFImpl, Signature, Volatility, +}; #[derive(Debug)] pub struct SubstrIndexFunc { @@ -83,6 +85,42 @@ impl ScalarUDFImpl for SubstrIndexFunc { fn aliases(&self) -> &[String] { &self.aliases } + + fn documentation(&self) -> Option<&Documentation> { + Some(get_substr_index_doc()) + } +} + +static DOCUMENTATION: OnceLock = OnceLock::new(); + +fn get_substr_index_doc() -> &'static Documentation { + DOCUMENTATION.get_or_init(|| { + Documentation::builder() + .with_doc_section(DOC_SECTION_STRING) + .with_description(r#"Returns the substring from str before count occurrences of the delimiter delim. +If count is positive, everything to the left of the final delimiter (counting from the left) is returned. +If count is negative, everything to the right of the final delimiter (counting from the right) is returned."#) + .with_syntax_example("substr_index(str, delim, count)") + .with_sql_example(r#"```sql +> select substr_index('www.apache.org', '.', 1); ++---------------------------------------------------------+ +| substr_index(Utf8("www.apache.org"),Utf8("."),Int64(1)) | ++---------------------------------------------------------+ +| www | ++---------------------------------------------------------+ +> select substr_index('www.apache.org', '.', -1); ++----------------------------------------------------------+ +| substr_index(Utf8("www.apache.org"),Utf8("."),Int64(-1)) | ++----------------------------------------------------------+ +| org | ++----------------------------------------------------------+ +```"#) + .with_standard_argument("str", "String") + .with_argument("delim", "The string to find in str to split str.") + .with_argument("count", "The number of times to search for the delimiter. Can be either a positive or negative number.") + .build() + .unwrap() + }) } /// Returns the substring from str before count occurrences of the delimiter delim. If count is positive, everything to the left of the final delimiter (counting from the left) is returned. If count is negative, everything to the right of the final delimiter (counting from the right) is returned. diff --git a/datafusion/functions/src/unicode/translate.rs b/datafusion/functions/src/unicode/translate.rs index d49559d452c8..ac5a6a2117b8 100644 --- a/datafusion/functions/src/unicode/translate.rs +++ b/datafusion/functions/src/unicode/translate.rs @@ -16,7 +16,7 @@ // under the License. use std::any::Any; -use std::sync::Arc; +use std::sync::{Arc, OnceLock}; use arrow::array::{ ArrayAccessor, ArrayIter, ArrayRef, AsArray, GenericStringArray, OffsetSizeTrait, @@ -27,8 +27,11 @@ use unicode_segmentation::UnicodeSegmentation; use crate::utils::{make_scalar_function, utf8_to_str_type}; use datafusion_common::{exec_err, Result}; +use datafusion_expr::scalar_doc_sections::DOC_SECTION_STRING; use datafusion_expr::TypeSignature::Exact; -use datafusion_expr::{ColumnarValue, ScalarUDFImpl, Signature, Volatility}; +use datafusion_expr::{ + ColumnarValue, Documentation, ScalarUDFImpl, Signature, Volatility, +}; #[derive(Debug)] pub struct TranslateFunc { @@ -76,6 +79,34 @@ impl ScalarUDFImpl for TranslateFunc { fn invoke(&self, args: &[ColumnarValue]) -> Result { make_scalar_function(invoke_translate, vec![])(args) } + + fn documentation(&self) -> Option<&Documentation> { + Some(get_translate_doc()) + } +} + +static DOCUMENTATION: OnceLock = OnceLock::new(); + +fn get_translate_doc() -> &'static Documentation { + DOCUMENTATION.get_or_init(|| { + Documentation::builder() + .with_doc_section(DOC_SECTION_STRING) + .with_description("Translates characters in a string to specified translation characters.") + .with_syntax_example("translate(str, chars, translation)") + .with_sql_example(r#"```sql +> select translate('twice', 'wic', 'her'); ++--------------------------------------------------+ +| translate(Utf8("twice"),Utf8("wic"),Utf8("her")) | ++--------------------------------------------------+ +| there | ++--------------------------------------------------+ +```"#) + .with_standard_argument("str", "String") + .with_argument("chars", "Characters to translate.") + .with_argument("translation", "Translation characters. Translation characters replace only characters at the same position in the **chars** string.") + .build() + .unwrap() + }) } fn invoke_translate(args: &[ArrayRef]) -> Result { diff --git a/datafusion/optimizer/src/analyzer/count_wildcard_rule.rs b/datafusion/optimizer/src/analyzer/count_wildcard_rule.rs index 86520b3587cd..b3b24724552a 100644 --- a/datafusion/optimizer/src/analyzer/count_wildcard_rule.rs +++ b/datafusion/optimizer/src/analyzer/count_wildcard_rule.rs @@ -48,13 +48,7 @@ impl AnalyzerRule for CountWildcardRule { } fn is_wildcard(expr: &Expr) -> bool { - matches!( - expr, - Expr::Wildcard { - qualifier: None, - .. - } - ) + matches!(expr, Expr::Wildcard { .. }) } fn is_count_star_aggregate(aggregate_function: &AggregateFunction) -> bool { diff --git a/datafusion/optimizer/src/analyzer/subquery.rs b/datafusion/optimizer/src/analyzer/subquery.rs index c771f31a58b2..aabc549de583 100644 --- a/datafusion/optimizer/src/analyzer/subquery.rs +++ b/datafusion/optimizer/src/analyzer/subquery.rs @@ -385,6 +385,10 @@ mod test { empty_schema: Arc::clone(&self.empty_schema), }) } + + fn supports_limit_pushdown(&self) -> bool { + false // Disallow limit push-down by default + } } #[test] diff --git a/datafusion/optimizer/src/analyzer/type_coercion.rs b/datafusion/optimizer/src/analyzer/type_coercion.rs index 12440f1128f8..1d9aea25274f 100644 --- a/datafusion/optimizer/src/analyzer/type_coercion.rs +++ b/datafusion/optimizer/src/analyzer/type_coercion.rs @@ -456,7 +456,6 @@ impl<'a> TreeNodeRewriter for TypeCoercionRewriter<'a> { self.schema, &func, )?; - let new_expr = coerce_arguments_for_fun(new_expr, self.schema, &func)?; Ok(Transformed::yes(Expr::ScalarFunction( ScalarFunction::new_udf(func, new_expr), ))) @@ -756,30 +755,6 @@ fn coerce_arguments_for_signature_with_aggregate_udf( .collect() } -fn coerce_arguments_for_fun( - expressions: Vec, - schema: &DFSchema, - fun: &Arc, -) -> Result> { - // Cast Fixedsizelist to List for array functions - if fun.name() == "make_array" { - expressions - .into_iter() - .map(|expr| { - let data_type = expr.get_type(schema).unwrap(); - if let DataType::FixedSizeList(field, _) = data_type { - let to_type = DataType::List(Arc::clone(&field)); - expr.cast_to(&to_type, schema) - } else { - Ok(expr) - } - }) - .collect() - } else { - Ok(expressions) - } -} - fn coerce_case_expression(case: Case, schema: &DFSchema) -> Result { // Given expressions like: // diff --git a/datafusion/optimizer/src/optimize_projections/mod.rs b/datafusion/optimizer/src/optimize_projections/mod.rs index 5ab427a31699..b5d581f3919f 100644 --- a/datafusion/optimizer/src/optimize_projections/mod.rs +++ b/datafusion/optimizer/src/optimize_projections/mod.rs @@ -895,6 +895,10 @@ mod tests { // Since schema is same. Output columns requires their corresponding version in the input columns. Some(vec![output_columns.to_vec()]) } + + fn supports_limit_pushdown(&self) -> bool { + false // Disallow limit push-down by default + } } #[derive(Debug, Hash, PartialEq, Eq)] @@ -991,6 +995,10 @@ mod tests { } Some(vec![left_reqs, right_reqs]) } + + fn supports_limit_pushdown(&self) -> bool { + false // Disallow limit push-down by default + } } #[test] diff --git a/datafusion/optimizer/src/push_down_filter.rs b/datafusion/optimizer/src/push_down_filter.rs index f5304c511b35..cdca86505dfa 100644 --- a/datafusion/optimizer/src/push_down_filter.rs +++ b/datafusion/optimizer/src/push_down_filter.rs @@ -1499,6 +1499,10 @@ mod tests { schema: Arc::clone(&self.schema), }) } + + fn supports_limit_pushdown(&self) -> bool { + false // Disallow limit push-down by default + } } #[test] diff --git a/datafusion/optimizer/src/push_down_limit.rs b/datafusion/optimizer/src/push_down_limit.rs index 158c7592df51..8b5e483001b3 100644 --- a/datafusion/optimizer/src/push_down_limit.rs +++ b/datafusion/optimizer/src/push_down_limit.rs @@ -153,6 +153,29 @@ impl OptimizerRule for PushDownLimit { subquery_alias.input = Arc::new(new_limit); Ok(Transformed::yes(LogicalPlan::SubqueryAlias(subquery_alias))) } + LogicalPlan::Extension(extension_plan) + if extension_plan.node.supports_limit_pushdown() => + { + let new_children = extension_plan + .node + .inputs() + .into_iter() + .map(|child| { + LogicalPlan::Limit(Limit { + skip: 0, + fetch: Some(fetch + skip), + input: Arc::new(child.clone()), + }) + }) + .collect::>(); + + // Create a new extension node with updated inputs + let child_plan = LogicalPlan::Extension(extension_plan); + let new_extension = + child_plan.with_new_exprs(child_plan.expressions(), new_children)?; + + transformed_limit(skip, fetch, new_extension) + } input => original_limit(skip, fetch, input), } } @@ -258,17 +281,241 @@ fn push_down_join(mut join: Join, limit: usize) -> Transformed { #[cfg(test)] mod test { + use std::cmp::Ordering; + use std::fmt::{Debug, Formatter}; use std::vec; use super::*; use crate::test::*; - use datafusion_expr::{col, exists, logical_plan::builder::LogicalPlanBuilder}; + + use datafusion_common::DFSchemaRef; + use datafusion_expr::{ + col, exists, logical_plan::builder::LogicalPlanBuilder, Expr, Extension, + UserDefinedLogicalNodeCore, + }; use datafusion_functions_aggregate::expr_fn::max; fn assert_optimized_plan_equal(plan: LogicalPlan, expected: &str) -> Result<()> { assert_optimized_plan_eq(Arc::new(PushDownLimit::new()), plan, expected) } + #[derive(Debug, PartialEq, Eq, Hash)] + pub struct NoopPlan { + input: Vec, + schema: DFSchemaRef, + } + + // Manual implementation needed because of `schema` field. Comparison excludes this field. + impl PartialOrd for NoopPlan { + fn partial_cmp(&self, other: &Self) -> Option { + self.input.partial_cmp(&other.input) + } + } + + impl UserDefinedLogicalNodeCore for NoopPlan { + fn name(&self) -> &str { + "NoopPlan" + } + + fn inputs(&self) -> Vec<&LogicalPlan> { + self.input.iter().collect() + } + + fn schema(&self) -> &DFSchemaRef { + &self.schema + } + + fn expressions(&self) -> Vec { + self.input + .iter() + .flat_map(|child| child.expressions()) + .collect() + } + + fn fmt_for_explain(&self, f: &mut Formatter) -> std::fmt::Result { + write!(f, "NoopPlan") + } + + fn with_exprs_and_inputs( + &self, + _exprs: Vec, + inputs: Vec, + ) -> Result { + Ok(Self { + input: inputs, + schema: Arc::clone(&self.schema), + }) + } + + fn supports_limit_pushdown(&self) -> bool { + true // Allow limit push-down + } + } + + #[derive(Debug, PartialEq, Eq, Hash)] + struct NoLimitNoopPlan { + input: Vec, + schema: DFSchemaRef, + } + + // Manual implementation needed because of `schema` field. Comparison excludes this field. + impl PartialOrd for NoLimitNoopPlan { + fn partial_cmp(&self, other: &Self) -> Option { + self.input.partial_cmp(&other.input) + } + } + + impl UserDefinedLogicalNodeCore for NoLimitNoopPlan { + fn name(&self) -> &str { + "NoLimitNoopPlan" + } + + fn inputs(&self) -> Vec<&LogicalPlan> { + self.input.iter().collect() + } + + fn schema(&self) -> &DFSchemaRef { + &self.schema + } + + fn expressions(&self) -> Vec { + self.input + .iter() + .flat_map(|child| child.expressions()) + .collect() + } + + fn fmt_for_explain(&self, f: &mut Formatter) -> std::fmt::Result { + write!(f, "NoLimitNoopPlan") + } + + fn with_exprs_and_inputs( + &self, + _exprs: Vec, + inputs: Vec, + ) -> Result { + Ok(Self { + input: inputs, + schema: Arc::clone(&self.schema), + }) + } + + fn supports_limit_pushdown(&self) -> bool { + false // Disallow limit push-down by default + } + } + #[test] + fn limit_pushdown_basic() -> Result<()> { + let table_scan = test_table_scan()?; + let noop_plan = LogicalPlan::Extension(Extension { + node: Arc::new(NoopPlan { + input: vec![table_scan.clone()], + schema: Arc::clone(table_scan.schema()), + }), + }); + + let plan = LogicalPlanBuilder::from(noop_plan) + .limit(0, Some(1000))? + .build()?; + + let expected = "Limit: skip=0, fetch=1000\ + \n NoopPlan\ + \n Limit: skip=0, fetch=1000\ + \n TableScan: test, fetch=1000"; + + assert_optimized_plan_equal(plan, expected) + } + + #[test] + fn limit_pushdown_with_skip() -> Result<()> { + let table_scan = test_table_scan()?; + let noop_plan = LogicalPlan::Extension(Extension { + node: Arc::new(NoopPlan { + input: vec![table_scan.clone()], + schema: Arc::clone(table_scan.schema()), + }), + }); + + let plan = LogicalPlanBuilder::from(noop_plan) + .limit(10, Some(1000))? + .build()?; + + let expected = "Limit: skip=10, fetch=1000\ + \n NoopPlan\ + \n Limit: skip=0, fetch=1010\ + \n TableScan: test, fetch=1010"; + + assert_optimized_plan_equal(plan, expected) + } + + #[test] + fn limit_pushdown_multiple_limits() -> Result<()> { + let table_scan = test_table_scan()?; + let noop_plan = LogicalPlan::Extension(Extension { + node: Arc::new(NoopPlan { + input: vec![table_scan.clone()], + schema: Arc::clone(table_scan.schema()), + }), + }); + + let plan = LogicalPlanBuilder::from(noop_plan) + .limit(10, Some(1000))? + .limit(20, Some(500))? + .build()?; + + let expected = "Limit: skip=30, fetch=500\ + \n NoopPlan\ + \n Limit: skip=0, fetch=530\ + \n TableScan: test, fetch=530"; + + assert_optimized_plan_equal(plan, expected) + } + + #[test] + fn limit_pushdown_multiple_inputs() -> Result<()> { + let table_scan = test_table_scan()?; + let noop_plan = LogicalPlan::Extension(Extension { + node: Arc::new(NoopPlan { + input: vec![table_scan.clone(), table_scan.clone()], + schema: Arc::clone(table_scan.schema()), + }), + }); + + let plan = LogicalPlanBuilder::from(noop_plan) + .limit(0, Some(1000))? + .build()?; + + let expected = "Limit: skip=0, fetch=1000\ + \n NoopPlan\ + \n Limit: skip=0, fetch=1000\ + \n TableScan: test, fetch=1000\ + \n Limit: skip=0, fetch=1000\ + \n TableScan: test, fetch=1000"; + + assert_optimized_plan_equal(plan, expected) + } + + #[test] + fn limit_pushdown_disallowed_noop_plan() -> Result<()> { + let table_scan = test_table_scan()?; + let no_limit_noop_plan = LogicalPlan::Extension(Extension { + node: Arc::new(NoLimitNoopPlan { + input: vec![table_scan.clone()], + schema: Arc::clone(table_scan.schema()), + }), + }); + + let plan = LogicalPlanBuilder::from(no_limit_noop_plan) + .limit(0, Some(1000))? + .build()?; + + let expected = "Limit: skip=0, fetch=1000\ + \n NoLimitNoopPlan\ + \n TableScan: test"; + + assert_optimized_plan_equal(plan, expected) + } + #[test] fn limit_pushdown_projection_table_provider() -> Result<()> { let table_scan = test_table_scan()?; diff --git a/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs b/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs index 580aa3c582de..67d6bf8977a4 100644 --- a/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs +++ b/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs @@ -838,22 +838,18 @@ impl<'a, S: SimplifyInfo> TreeNodeRewriter for Simplifier<'a, S> { op: Or, right, }) if expr_contains(&right, &left, Or) => Transformed::yes(*right), - // A OR (A AND B) --> A (if B not null) + // A OR (A AND B) --> A Expr::BinaryExpr(BinaryExpr { left, op: Or, right, - }) if !info.nullable(&right)? && is_op_with(And, &right, &left) => { - Transformed::yes(*left) - } - // (A AND B) OR A --> A (if B not null) + }) if is_op_with(And, &right, &left) => Transformed::yes(*left), + // (A AND B) OR A --> A Expr::BinaryExpr(BinaryExpr { left, op: Or, right, - }) if !info.nullable(&left)? && is_op_with(And, &left, &right) => { - Transformed::yes(*right) - } + }) if is_op_with(And, &left, &right) => Transformed::yes(*right), // // Rules for AND @@ -911,22 +907,18 @@ impl<'a, S: SimplifyInfo> TreeNodeRewriter for Simplifier<'a, S> { op: And, right, }) if expr_contains(&right, &left, And) => Transformed::yes(*right), - // A AND (A OR B) --> A (if B not null) + // A AND (A OR B) --> A Expr::BinaryExpr(BinaryExpr { left, op: And, right, - }) if !info.nullable(&right)? && is_op_with(Or, &right, &left) => { - Transformed::yes(*left) - } - // (A OR B) AND A --> A (if B not null) + }) if is_op_with(Or, &right, &left) => Transformed::yes(*left), + // (A OR B) AND A --> A Expr::BinaryExpr(BinaryExpr { left, op: And, right, - }) if !info.nullable(&left)? && is_op_with(Or, &left, &right) => { - Transformed::yes(*right) - } + }) if is_op_with(Or, &left, &right) => Transformed::yes(*right), // // Rules for Multiply @@ -2609,15 +2601,11 @@ mod tests { // (c2 > 5) OR ((c1 < 6) AND (c2 > 5)) let expr = or(l.clone(), r.clone()); - // no rewrites if c1 can be null - let expected = expr.clone(); + let expected = l.clone(); assert_eq!(simplify(expr), expected); // ((c1 < 6) AND (c2 > 5)) OR (c2 > 5) - let expr = or(l, r); - - // no rewrites if c1 can be null - let expected = expr.clone(); + let expr = or(r, l); assert_eq!(simplify(expr), expected); } @@ -2648,13 +2636,11 @@ mod tests { // (c2 > 5) AND ((c1 < 6) OR (c2 > 5)) --> c2 > 5 let expr = and(l.clone(), r.clone()); - // no rewrites if c1 can be null - let expected = expr.clone(); + let expected = l.clone(); assert_eq!(simplify(expr), expected); // ((c1 < 6) OR (c2 > 5)) AND (c2 > 5) --> c2 > 5 - let expr = and(l, r); - let expected = expr.clone(); + let expr = and(r, l); assert_eq!(simplify(expr), expected); } @@ -3223,7 +3209,7 @@ mod tests { )], Some(Box::new(col("c2").eq(lit(true)))), )))), - col("c2").or(col("c2").not().and(col("c2"))) // #1716 + col("c2") ); // CASE WHEN ISNULL(c2) THEN true ELSE c2 diff --git a/datafusion/optimizer/src/single_distinct_to_groupby.rs b/datafusion/optimizer/src/single_distinct_to_groupby.rs index 1c22c2a4375a..74251e5caad2 100644 --- a/datafusion/optimizer/src/single_distinct_to_groupby.rs +++ b/datafusion/optimizer/src/single_distinct_to_groupby.rs @@ -355,7 +355,7 @@ mod tests { .build()?; // Should not be optimized - let expected = "Aggregate: groupBy=[[GROUPING SETS ((test.a), (test.b))]], aggr=[[count(DISTINCT test.c)]] [a:UInt32;N, b:UInt32;N, count(DISTINCT test.c):Int64]\ + let expected = "Aggregate: groupBy=[[GROUPING SETS ((test.a), (test.b))]], aggr=[[count(DISTINCT test.c)]] [a:UInt32;N, b:UInt32;N, __grouping_id:UInt8, count(DISTINCT test.c):Int64]\ \n TableScan: test [a:UInt32, b:UInt32, c:UInt32]"; assert_optimized_plan_equal(plan, expected) @@ -373,7 +373,7 @@ mod tests { .build()?; // Should not be optimized - let expected = "Aggregate: groupBy=[[CUBE (test.a, test.b)]], aggr=[[count(DISTINCT test.c)]] [a:UInt32;N, b:UInt32;N, count(DISTINCT test.c):Int64]\ + let expected = "Aggregate: groupBy=[[CUBE (test.a, test.b)]], aggr=[[count(DISTINCT test.c)]] [a:UInt32;N, b:UInt32;N, __grouping_id:UInt8, count(DISTINCT test.c):Int64]\ \n TableScan: test [a:UInt32, b:UInt32, c:UInt32]"; assert_optimized_plan_equal(plan, expected) @@ -392,7 +392,7 @@ mod tests { .build()?; // Should not be optimized - let expected = "Aggregate: groupBy=[[ROLLUP (test.a, test.b)]], aggr=[[count(DISTINCT test.c)]] [a:UInt32;N, b:UInt32;N, count(DISTINCT test.c):Int64]\ + let expected = "Aggregate: groupBy=[[ROLLUP (test.a, test.b)]], aggr=[[count(DISTINCT test.c)]] [a:UInt32;N, b:UInt32;N, __grouping_id:UInt8, count(DISTINCT test.c):Int64]\ \n TableScan: test [a:UInt32, b:UInt32, c:UInt32]"; assert_optimized_plan_equal(plan, expected) diff --git a/datafusion/optimizer/src/test/user_defined.rs b/datafusion/optimizer/src/test/user_defined.rs index 814cd0c0cd0a..a39f90b5da5d 100644 --- a/datafusion/optimizer/src/test/user_defined.rs +++ b/datafusion/optimizer/src/test/user_defined.rs @@ -76,4 +76,8 @@ impl UserDefinedLogicalNodeCore for TestUserDefinedPlanNode { input: inputs.swap_remove(0), }) } + + fn supports_limit_pushdown(&self) -> bool { + false // Disallow limit push-down by default + } } diff --git a/datafusion/optimizer/tests/optimizer_integration.rs b/datafusion/optimizer/tests/optimizer_integration.rs index 470bd947c7fb..236167985790 100644 --- a/datafusion/optimizer/tests/optimizer_integration.rs +++ b/datafusion/optimizer/tests/optimizer_integration.rs @@ -345,7 +345,7 @@ fn select_wildcard_with_repeated_column() { let sql = "SELECT *, col_int32 FROM test"; let err = test_sql(sql).expect_err("query should have failed"); assert_eq!( - "expand_wildcard_rule\ncaused by\nError during planning: Projections require unique expression names but the expression \"test.col_int32\" at position 0 and \"test.col_int32\" at position 7 have the same name. Consider aliasing (\"AS\") one of them.", + "Schema error: Schema contains duplicate qualified field name test.col_int32", err.strip_backtrace() ); } @@ -396,7 +396,7 @@ fn test_sql(sql: &str) -> Result { .with_udaf(count_udaf()) .with_udaf(avg_udaf()); let sql_to_rel = SqlToRel::new(&context_provider); - let plan = sql_to_rel.sql_statement_to_plan(statement.clone()).unwrap(); + let plan = sql_to_rel.sql_statement_to_plan(statement.clone())?; let config = OptimizerContext::new().with_skip_failing_rules(false); let analyzer = Analyzer::new(); diff --git a/datafusion/physical-expr-common/src/sort_expr.rs b/datafusion/physical-expr-common/src/sort_expr.rs index 704cb291335f..6c4bf156ce56 100644 --- a/datafusion/physical-expr-common/src/sort_expr.rs +++ b/datafusion/physical-expr-common/src/sort_expr.rs @@ -120,6 +120,13 @@ impl PhysicalSortExpr { } } +/// Access the PhysicalSortExpr as a PhysicalExpr +impl AsRef for PhysicalSortExpr { + fn as_ref(&self) -> &(dyn PhysicalExpr + 'static) { + self.expr.as_ref() + } +} + impl PartialEq for PhysicalSortExpr { fn eq(&self, other: &PhysicalSortExpr) -> bool { self.options == other.options && self.expr.eq(&other.expr) diff --git a/datafusion/physical-expr/src/equivalence/class.rs b/datafusion/physical-expr/src/equivalence/class.rs index 00708b4540aa..c1851ddb22b5 100644 --- a/datafusion/physical-expr/src/equivalence/class.rs +++ b/datafusion/physical-expr/src/equivalence/class.rs @@ -30,7 +30,6 @@ use datafusion_common::tree_node::{Transformed, TransformedResult, TreeNode}; use datafusion_common::JoinType; use datafusion_physical_expr_common::physical_expr::format_physical_expr_list; -#[derive(Debug, Clone)] /// A structure representing a expression known to be constant in a physical execution plan. /// /// The `ConstExpr` struct encapsulates an expression that is constant during the execution @@ -41,9 +40,10 @@ use datafusion_physical_expr_common::physical_expr::format_physical_expr_list; /// /// - `expr`: Constant expression for a node in the physical plan. /// -/// - `across_partitions`: A boolean flag indicating whether the constant expression is -/// valid across partitions. If set to `true`, the constant expression has same value for all partitions. -/// If set to `false`, the constant expression may have different values for different partitions. +/// - `across_partitions`: A boolean flag indicating whether the constant +/// expression is the same across partitions. If set to `true`, the constant +/// expression has same value for all partitions. If set to `false`, the +/// constant expression may have different values for different partitions. /// /// # Example /// @@ -56,11 +56,22 @@ use datafusion_physical_expr_common::physical_expr::format_physical_expr_list; /// // create a constant expression from a physical expression /// let const_expr = ConstExpr::from(col); /// ``` +#[derive(Debug, Clone)] pub struct ConstExpr { + /// The expression that is known to be constant (e.g. a `Column`) expr: Arc, + /// Does the constant have the same value across all partitions? See + /// struct docs for more details across_partitions: bool, } +impl PartialEq for ConstExpr { + fn eq(&self, other: &Self) -> bool { + self.across_partitions == other.across_partitions + && self.expr.eq(other.expr.as_any()) + } +} + impl ConstExpr { /// Create a new constant expression from a physical expression. /// @@ -74,11 +85,17 @@ impl ConstExpr { } } + /// Set the `across_partitions` flag + /// + /// See struct docs for more details pub fn with_across_partitions(mut self, across_partitions: bool) -> Self { self.across_partitions = across_partitions; self } + /// Is the expression the same across all partitions? + /// + /// See struct docs for more details pub fn across_partitions(&self) -> bool { self.across_partitions } @@ -101,6 +118,31 @@ impl ConstExpr { across_partitions: self.across_partitions, }) } + + /// Returns true if this constant expression is equal to the given expression + pub fn eq_expr(&self, other: impl AsRef) -> bool { + self.expr.eq(other.as_ref().as_any()) + } + + /// Returns a [`Display`]able list of `ConstExpr`. + pub fn format_list(input: &[ConstExpr]) -> impl Display + '_ { + struct DisplayableList<'a>(&'a [ConstExpr]); + impl<'a> Display for DisplayableList<'a> { + fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { + let mut first = true; + for const_expr in self.0 { + if first { + first = false; + } else { + write!(f, ",")?; + } + write!(f, "{}", const_expr)?; + } + Ok(()) + } + } + DisplayableList(input) + } } /// Display implementation for `ConstExpr` diff --git a/datafusion/physical-expr/src/equivalence/ordering.rs b/datafusion/physical-expr/src/equivalence/ordering.rs index 65423033d5e0..bb3e9218bc41 100644 --- a/datafusion/physical-expr/src/equivalence/ordering.rs +++ b/datafusion/physical-expr/src/equivalence/ordering.rs @@ -18,6 +18,7 @@ use std::fmt::Display; use std::hash::Hash; use std::sync::Arc; +use std::vec::IntoIter; use crate::equivalence::add_offset_to_expr; use crate::{LexOrdering, PhysicalExpr, PhysicalSortExpr}; @@ -36,7 +37,7 @@ use arrow_schema::SortOptions; /// /// Here, both `vec![a ASC, b ASC]` and `vec![c DESC, d ASC]` describe the table /// ordering. In this case, we say that these orderings are equivalent. -#[derive(Debug, Clone, Eq, PartialEq, Hash)] +#[derive(Debug, Clone, Eq, PartialEq, Hash, Default)] pub struct OrderingEquivalenceClass { pub orderings: Vec, } @@ -44,7 +45,7 @@ pub struct OrderingEquivalenceClass { impl OrderingEquivalenceClass { /// Creates new empty ordering equivalence class. pub fn empty() -> Self { - Self { orderings: vec![] } + Default::default() } /// Clears (empties) this ordering equivalence class. @@ -197,6 +198,15 @@ impl OrderingEquivalenceClass { } } +impl IntoIterator for OrderingEquivalenceClass { + type Item = LexOrdering; + type IntoIter = IntoIter; + + fn into_iter(self) -> Self::IntoIter { + self.orderings.into_iter() + } +} + /// This function constructs a duplicate-free `LexOrdering` by filtering out /// duplicate entries that have same physical expression inside. For example, /// `vec![a ASC, a DESC]` collapses to `vec![a ASC]`. @@ -229,10 +239,10 @@ impl Display for OrderingEquivalenceClass { write!(f, "[")?; let mut iter = self.orderings.iter(); if let Some(ordering) = iter.next() { - write!(f, "{}", PhysicalSortExpr::format_list(ordering))?; + write!(f, "[{}]", PhysicalSortExpr::format_list(ordering))?; } for ordering in iter { - write!(f, "{}", PhysicalSortExpr::format_list(ordering))?; + write!(f, ", [{}]", PhysicalSortExpr::format_list(ordering))?; } write!(f, "]")?; Ok(()) diff --git a/datafusion/physical-expr/src/equivalence/properties.rs b/datafusion/physical-expr/src/equivalence/properties.rs index dc59a1eb835b..6a1268ef8cdb 100644 --- a/datafusion/physical-expr/src/equivalence/properties.rs +++ b/datafusion/physical-expr/src/equivalence/properties.rs @@ -17,6 +17,8 @@ use std::fmt::Display; use std::hash::{Hash, Hasher}; +use std::iter::Peekable; +use std::slice::Iter; use std::sync::Arc; use super::ordering::collapse_lex_ordering; @@ -34,7 +36,7 @@ use crate::{ use arrow_schema::{SchemaRef, SortOptions}; use datafusion_common::tree_node::{Transformed, TransformedResult, TreeNode}; -use datafusion_common::{plan_err, JoinSide, JoinType, Result}; +use datafusion_common::{internal_err, plan_err, JoinSide, JoinType, Result}; use datafusion_expr::interval_arithmetic::Interval; use datafusion_expr::sort_properties::{ExprProperties, SortProperties}; use datafusion_physical_expr_common::utils::ExprPropertiesNode; @@ -118,7 +120,7 @@ use itertools::Itertools; /// PhysicalSortExpr::new_default(col_c).desc(), /// ]); /// -/// assert_eq!(eq_properties.to_string(), "order: [a@0 ASC,c@2 DESC], const: [b@1]") +/// assert_eq!(eq_properties.to_string(), "order: [[a@0 ASC,c@2 DESC]], const: [b@1]") /// ``` #[derive(Debug, Clone)] pub struct EquivalenceProperties { @@ -279,6 +281,12 @@ impl EquivalenceProperties { self.with_constants(constants) } + /// Remove the specified constant + pub fn remove_constant(mut self, c: &ConstExpr) -> Self { + self.constants.retain(|existing| existing != c); + self + } + /// Track/register physical expressions with constant values. pub fn with_constants( mut self, @@ -1120,15 +1128,7 @@ impl Display for EquivalenceProperties { write!(f, ", eq: {}", self.eq_group)?; } if !self.constants.is_empty() { - write!(f, ", const: [")?; - let mut iter = self.constants.iter(); - if let Some(c) = iter.next() { - write!(f, "{}", c)?; - } - for c in iter { - write!(f, ", {}", c)?; - } - write!(f, "]")?; + write!(f, ", const: [{}]", ConstExpr::format_list(&self.constants))?; } Ok(()) } @@ -1295,21 +1295,30 @@ fn construct_prefix_orderings( relevant_sort_expr: &PhysicalSortExpr, dependency_map: &DependencyMap, ) -> Vec { + let mut dep_enumerator = DependencyEnumerator::new(); dependency_map[relevant_sort_expr] .dependencies .iter() - .flat_map(|dep| construct_orderings(dep, dependency_map)) + .flat_map(|dep| dep_enumerator.construct_orderings(dep, dependency_map)) .collect() } -/// Given a set of relevant dependencies (`relevant_deps`) and a map of dependencies -/// (`dependency_map`), this function generates all possible prefix orderings -/// based on the given dependencies. +/// Generates all possible orderings where dependencies are satisfied for the +/// current projection expression. +/// +/// # Examaple +/// If `dependences` is `a + b ASC` and the dependency map holds dependencies +/// * `a ASC` --> `[c ASC]` +/// * `b ASC` --> `[d DESC]`, +/// +/// This function generates these two sort orders +/// * `[c ASC, d DESC, a + b ASC]` +/// * `[d DESC, c ASC, a + b ASC]` /// /// # Parameters /// -/// * `dependencies` - A reference to the dependencies. -/// * `dependency_map` - A reference to the map of dependencies for expressions. +/// * `dependencies` - Set of relevant expressions. +/// * `dependency_map` - Map of dependencies for expressions that may appear in `dependencies` /// /// # Returns /// @@ -1335,11 +1344,6 @@ fn generate_dependency_orderings( return vec![vec![]]; } - // Generate all possible orderings where dependencies are satisfied for the - // current projection expression. For example, if expression is `a + b ASC`, - // and the dependency for `a ASC` is `[c ASC]`, the dependency for `b ASC` - // is `[d DESC]`, then we generate `[c ASC, d DESC, a + b ASC]` and - // `[d DESC, c ASC, a + b ASC]`. relevant_prefixes .into_iter() .multi_cartesian_product() @@ -1421,7 +1425,7 @@ struct DependencyNode { } impl DependencyNode { - // Insert dependency to the state (if exists). + /// Insert dependency to the state (if exists). fn insert_dependency(&mut self, dependency: Option<&PhysicalSortExpr>) { if let Some(dep) = dependency { self.dependencies.insert(dep.clone()); @@ -1437,38 +1441,71 @@ impl DependencyNode { type DependencyMap = IndexMap; type Dependencies = IndexSet; -/// This function recursively analyzes the dependencies of the given sort -/// expression within the given dependency map to construct lexicographical -/// orderings that include the sort expression and its dependencies. -/// -/// # Parameters -/// -/// - `referred_sort_expr`: A reference to the sort expression (`PhysicalSortExpr`) -/// for which lexicographical orderings satisfying its dependencies are to be -/// constructed. -/// - `dependency_map`: A reference to the `DependencyMap` that contains -/// dependencies for different `PhysicalSortExpr`s. -/// -/// # Returns -/// -/// A vector of lexicographical orderings (`Vec`) based on the given -/// sort expression and its dependencies. -fn construct_orderings( - referred_sort_expr: &PhysicalSortExpr, - dependency_map: &DependencyMap, -) -> Vec { - // We are sure that `referred_sort_expr` is inside `dependency_map`. - let node = &dependency_map[referred_sort_expr]; - // Since we work on intermediate nodes, we are sure `val.target_sort_expr` - // exists. - let target_sort_expr = node.target_sort_expr.clone().unwrap(); - if node.dependencies.is_empty() { - vec![vec![target_sort_expr]] - } else { +/// Contains a mapping of all dependencies we have processed for each sort expr +struct DependencyEnumerator<'a> { + /// Maps `expr` --> `[exprs]` that have previously been processed + seen: IndexMap<&'a PhysicalSortExpr, IndexSet<&'a PhysicalSortExpr>>, +} + +impl<'a> DependencyEnumerator<'a> { + fn new() -> Self { + Self { + seen: IndexMap::new(), + } + } + + /// Insert a new dependency, + /// + /// returns false if the dependency was already in the map + /// returns true if the dependency was newly inserted + fn insert( + &mut self, + target: &'a PhysicalSortExpr, + dep: &'a PhysicalSortExpr, + ) -> bool { + self.seen.entry(target).or_default().insert(dep) + } + + /// This function recursively analyzes the dependencies of the given sort + /// expression within the given dependency map to construct lexicographical + /// orderings that include the sort expression and its dependencies. + /// + /// # Parameters + /// + /// - `referred_sort_expr`: A reference to the sort expression (`PhysicalSortExpr`) + /// for which lexicographical orderings satisfying its dependencies are to be + /// constructed. + /// - `dependency_map`: A reference to the `DependencyMap` that contains + /// dependencies for different `PhysicalSortExpr`s. + /// + /// # Returns + /// + /// A vector of lexicographical orderings (`Vec`) based on the given + /// sort expression and its dependencies. + fn construct_orderings( + &mut self, + referred_sort_expr: &'a PhysicalSortExpr, + dependency_map: &'a DependencyMap, + ) -> Vec { + // We are sure that `referred_sort_expr` is inside `dependency_map`. + let node = &dependency_map[referred_sort_expr]; + // Since we work on intermediate nodes, we are sure `val.target_sort_expr` + // exists. + let target_sort_expr = node.target_sort_expr.as_ref().unwrap(); + // An empty dependency means the referred_sort_expr represents a global ordering. + // Return its projected version, which is the target_expression. + if node.dependencies.is_empty() { + return vec![vec![target_sort_expr.clone()]]; + }; + node.dependencies .iter() .flat_map(|dep| { - let mut orderings = construct_orderings(dep, dependency_map); + let mut orderings = if self.insert(target_sort_expr, dep) { + self.construct_orderings(dep, dependency_map) + } else { + vec![] + }; for ordering in orderings.iter_mut() { ordering.push(target_sort_expr.clone()) } @@ -1611,58 +1648,62 @@ impl Hash for ExprWrapper { /// Calculates the union (in the sense of `UnionExec`) `EquivalenceProperties` /// of `lhs` and `rhs` according to the schema of `lhs`. +/// +/// Rules: The UnionExec does not interleave its inputs: instead it passes each +/// input partition from the children as its own output. +/// +/// Since the output equivalence properties are properties that are true for +/// *all* output partitions, that is the same as being true for all *input* +/// partitions fn calculate_union_binary( - lhs: EquivalenceProperties, + mut lhs: EquivalenceProperties, mut rhs: EquivalenceProperties, ) -> Result { - // TODO: In some cases, we should be able to preserve some equivalence - // classes. Add support for such cases. - // Harmonize the schema of the rhs with the schema of the lhs (which is the accumulator schema): if !rhs.schema.eq(&lhs.schema) { rhs = rhs.with_new_schema(Arc::clone(&lhs.schema))?; } - // First, calculate valid constants for the union. A quantity is constant - // after the union if it is constant in both sides. - let constants = lhs + // First, calculate valid constants for the union. An expression is constant + // at the output of the union if it is constant in both sides. + let constants: Vec<_> = lhs .constants() .iter() .filter(|const_expr| const_exprs_contains(rhs.constants(), const_expr.expr())) .map(|const_expr| { - // TODO: When both sides' constants are valid across partitions, - // the union's constant should also be valid if values are - // the same. However, we do not have the capability to - // check this yet. + // TODO: When both sides have a constant column, and the actual + // constant value is the same, then the output properties could + // reflect the constant is valid across all partitions. However we + // don't track the actual value that the ConstExpr takes on, so we + // can't determine that yet ConstExpr::new(Arc::clone(const_expr.expr())).with_across_partitions(false) }) .collect(); + // remove any constants that are shared in both outputs (avoid double counting them) + for c in &constants { + lhs = lhs.remove_constant(c); + rhs = rhs.remove_constant(c); + } + // Next, calculate valid orderings for the union by searching for prefixes // in both sides. - let mut orderings = vec![]; - for mut ordering in lhs.normalized_oeq_class().orderings { - // Progressively shorten the ordering to search for a satisfied prefix: - while !rhs.ordering_satisfy(&ordering) { - ordering.pop(); - } - // There is a non-trivial satisfied prefix, add it as a valid ordering: - if !ordering.is_empty() { - orderings.push(ordering); - } - } - for mut ordering in rhs.normalized_oeq_class().orderings { - // Progressively shorten the ordering to search for a satisfied prefix: - while !lhs.ordering_satisfy(&ordering) { - ordering.pop(); - } - // There is a non-trivial satisfied prefix, add it as a valid ordering: - if !ordering.is_empty() { - orderings.push(ordering); - } - } - let mut eq_properties = EquivalenceProperties::new(lhs.schema); - eq_properties.constants = constants; + let mut orderings = UnionEquivalentOrderingBuilder::new(); + orderings.add_satisfied_orderings( + lhs.normalized_oeq_class().orderings, + lhs.constants(), + &rhs, + ); + orderings.add_satisfied_orderings( + rhs.normalized_oeq_class().orderings, + rhs.constants(), + &lhs, + ); + let orderings = orderings.build(); + + let mut eq_properties = + EquivalenceProperties::new(lhs.schema).with_constants(constants); + eq_properties.add_new_orderings(orderings); Ok(eq_properties) } @@ -1677,14 +1718,222 @@ pub fn calculate_union( ) -> Result { // TODO: In some cases, we should be able to preserve some equivalence // classes. Add support for such cases. - let mut init = eqps[0].clone(); + let mut iter = eqps.into_iter(); + let Some(mut acc) = iter.next() else { + return internal_err!( + "Cannot calculate EquivalenceProperties for a union with no inputs" + ); + }; + // Harmonize the schema of the init with the schema of the union: - if !init.schema.eq(&schema) { - init = init.with_new_schema(schema)?; + if !acc.schema.eq(&schema) { + acc = acc.with_new_schema(schema)?; } - eqps.into_iter() - .skip(1) - .try_fold(init, calculate_union_binary) + // Fold in the rest of the EquivalenceProperties: + for props in iter { + acc = calculate_union_binary(acc, props)?; + } + Ok(acc) +} + +#[derive(Debug)] +enum AddedOrdering { + /// The ordering was added to the in progress result + Yes, + /// The ordering was not added + No(LexOrdering), +} + +/// Builds valid output orderings of a `UnionExec` +#[derive(Debug)] +struct UnionEquivalentOrderingBuilder { + orderings: Vec, +} + +impl UnionEquivalentOrderingBuilder { + fn new() -> Self { + Self { orderings: vec![] } + } + + /// Add all orderings from `orderings` that satisfy `properties`, + /// potentially augmented with`constants`. + /// + /// Note: any column that is known to be constant can be inserted into the + /// ordering without changing its meaning + /// + /// For example: + /// * `orderings` contains `[a ASC, c ASC]` and `constants` contains `b` + /// * `properties` has required ordering `[a ASC, b ASC]` + /// + /// Then this will add `[a ASC, b ASC]` to the `orderings` list (as `a` was + /// in the sort order and `b` was a constant). + fn add_satisfied_orderings( + &mut self, + orderings: impl IntoIterator, + constants: &[ConstExpr], + properties: &EquivalenceProperties, + ) { + for mut ordering in orderings.into_iter() { + // Progressively shorten the ordering to search for a satisfied prefix: + loop { + match self.try_add_ordering(ordering, constants, properties) { + AddedOrdering::Yes => break, + AddedOrdering::No(o) => { + ordering = o; + ordering.pop(); + } + } + } + } + } + + /// Adds `ordering`, potentially augmented with constants, if it satisfies + /// the target `properties` properties. + /// + /// Returns + /// + /// * [`AddedOrdering::Yes`] if the ordering was added (either directly or + /// augmented), or was empty. + /// + /// * [`AddedOrdering::No`] if the ordering was not added + fn try_add_ordering( + &mut self, + ordering: LexOrdering, + constants: &[ConstExpr], + properties: &EquivalenceProperties, + ) -> AddedOrdering { + if ordering.is_empty() { + AddedOrdering::Yes + } else if constants.is_empty() && properties.ordering_satisfy(&ordering) { + // If the ordering satisfies the target properties, no need to + // augment it with constants. + self.orderings.push(ordering); + AddedOrdering::Yes + } else { + // Did not satisfy target properties, try and augment with constants + // to match the properties + if self.try_find_augmented_ordering(&ordering, constants, properties) { + AddedOrdering::Yes + } else { + AddedOrdering::No(ordering) + } + } + } + + /// Attempts to add `constants` to `ordering` to satisfy the properties. + /// + /// returns true if any orderings were added, false otherwise + fn try_find_augmented_ordering( + &mut self, + ordering: &LexOrdering, + constants: &[ConstExpr], + properties: &EquivalenceProperties, + ) -> bool { + // can't augment if there is nothing to augment with + if constants.is_empty() { + return false; + } + let start_num_orderings = self.orderings.len(); + + // for each equivalent ordering in properties, try and augment + // `ordering` it with the constants to match + for existing_ordering in &properties.oeq_class.orderings { + if let Some(augmented_ordering) = self.augment_ordering( + ordering, + constants, + existing_ordering, + &properties.constants, + ) { + if !augmented_ordering.is_empty() { + assert!(properties.ordering_satisfy(&augmented_ordering)); + self.orderings.push(augmented_ordering); + } + } + } + + self.orderings.len() > start_num_orderings + } + + /// Attempts to augment the ordering with constants to match the + /// `existing_ordering` + /// + /// Returns Some(ordering) if an augmented ordering was found, None otherwise + fn augment_ordering( + &mut self, + ordering: &LexOrdering, + constants: &[ConstExpr], + existing_ordering: &LexOrdering, + existing_constants: &[ConstExpr], + ) -> Option { + let mut augmented_ordering = vec![]; + let mut sort_expr_iter = ordering.iter().peekable(); + let mut existing_sort_expr_iter = existing_ordering.iter().peekable(); + + // walk in parallel down the two orderings, trying to match them up + while sort_expr_iter.peek().is_some() || existing_sort_expr_iter.peek().is_some() + { + // If the next expressions are equal, add the next match + // otherwise try and match with a constant + if let Some(expr) = + advance_if_match(&mut sort_expr_iter, &mut existing_sort_expr_iter) + { + augmented_ordering.push(expr); + } else if let Some(expr) = + advance_if_matches_constant(&mut sort_expr_iter, existing_constants) + { + augmented_ordering.push(expr); + } else if let Some(expr) = + advance_if_matches_constant(&mut existing_sort_expr_iter, constants) + { + augmented_ordering.push(expr); + } else { + // no match, can't continue the ordering, return what we have + break; + } + } + + Some(augmented_ordering) + } + + fn build(self) -> Vec { + self.orderings + } +} + +/// Advances two iterators in parallel +/// +/// If the next expressions are equal, the iterators are advanced and returns +/// the matched expression . +/// +/// Otherwise, the iterators are left unchanged and return `None` +fn advance_if_match( + iter1: &mut Peekable>, + iter2: &mut Peekable>, +) -> Option { + if matches!((iter1.peek(), iter2.peek()), (Some(expr1), Some(expr2)) if expr1.eq(expr2)) + { + iter1.next().unwrap(); + iter2.next().cloned() + } else { + None + } +} + +/// Advances the iterator with a constant +/// +/// If the next expression matches one of the constants, advances the iterator +/// returning the matched expression +/// +/// Otherwise, the iterator is left unchanged and returns `None` +fn advance_if_matches_constant( + iter: &mut Peekable>, + constants: &[ConstExpr], +) -> Option { + let expr = iter.peek()?; + let const_expr = constants.iter().find(|c| c.eq_expr(expr))?; + let found_expr = PhysicalSortExpr::new(Arc::clone(const_expr.expr()), expr.options); + iter.next(); + Some(found_expr) } #[cfg(test)] @@ -1755,6 +2004,51 @@ mod tests { Ok(()) } + #[test] + fn project_equivalence_properties_test_multi() -> Result<()> { + // test multiple input orderings with equivalence properties + let input_schema = Arc::new(Schema::new(vec![ + Field::new("a", DataType::Int64, true), + Field::new("b", DataType::Int64, true), + Field::new("c", DataType::Int64, true), + Field::new("d", DataType::Int64, true), + ])); + + let mut input_properties = EquivalenceProperties::new(Arc::clone(&input_schema)); + // add equivalent ordering [a, b, c, d] + input_properties.add_new_ordering(vec![ + parse_sort_expr("a", &input_schema), + parse_sort_expr("b", &input_schema), + parse_sort_expr("c", &input_schema), + parse_sort_expr("d", &input_schema), + ]); + + // add equivalent ordering [a, c, b, d] + input_properties.add_new_ordering(vec![ + parse_sort_expr("a", &input_schema), + parse_sort_expr("c", &input_schema), + parse_sort_expr("b", &input_schema), // NB b and c are swapped + parse_sort_expr("d", &input_schema), + ]); + + // simply project all the columns in order + let proj_exprs = vec![ + (col("a", &input_schema)?, "a".to_string()), + (col("b", &input_schema)?, "b".to_string()), + (col("c", &input_schema)?, "c".to_string()), + (col("d", &input_schema)?, "d".to_string()), + ]; + let projection_mapping = ProjectionMapping::try_new(&proj_exprs, &input_schema)?; + let out_properties = input_properties.project(&projection_mapping, input_schema); + + assert_eq!( + out_properties.to_string(), + "order: [[a@0 ASC,c@2 ASC,b@1 ASC,d@3 ASC], [a@0 ASC,b@1 ASC,c@2 ASC,d@3 ASC]]" + ); + + Ok(()) + } + #[test] fn test_join_equivalence_properties() -> Result<()> { let schema = create_test_schema()?; @@ -2708,379 +3002,503 @@ mod tests { )) } - #[tokio::test] - async fn test_union_equivalence_properties_multi_children() -> Result<()> { - let schema = create_test_schema()?; + #[test] + fn test_union_equivalence_properties_multi_children_1() { + let schema = create_test_schema().unwrap(); let schema2 = append_fields(&schema, "1"); let schema3 = append_fields(&schema, "2"); - let test_cases = vec![ - // --------- TEST CASE 1 ---------- - ( - vec![ - // Children 1 - ( - // Orderings - vec![vec!["a", "b", "c"]], - Arc::clone(&schema), - ), - // Children 2 - ( - // Orderings - vec![vec!["a1", "b1", "c1"]], - Arc::clone(&schema2), - ), - // Children 3 - ( - // Orderings - vec![vec!["a2", "b2"]], - Arc::clone(&schema3), - ), - ], - // Expected - vec![vec!["a", "b"]], - ), - // --------- TEST CASE 2 ---------- - ( - vec![ - // Children 1 - ( - // Orderings - vec![vec!["a", "b", "c"]], - Arc::clone(&schema), - ), - // Children 2 - ( - // Orderings - vec![vec!["a1", "b1", "c1"]], - Arc::clone(&schema2), - ), - // Children 3 - ( - // Orderings - vec![vec!["a2", "b2", "c2"]], - Arc::clone(&schema3), - ), - ], - // Expected - vec![vec!["a", "b", "c"]], - ), - // --------- TEST CASE 3 ---------- - ( - vec![ - // Children 1 - ( - // Orderings - vec![vec!["a", "b"]], - Arc::clone(&schema), - ), - // Children 2 - ( - // Orderings - vec![vec!["a1", "b1", "c1"]], - Arc::clone(&schema2), - ), - // Children 3 - ( - // Orderings - vec![vec!["a2", "b2", "c2"]], - Arc::clone(&schema3), - ), - ], - // Expected + UnionEquivalenceTest::new(&schema) + // Children 1 + .with_child_sort(vec![vec!["a", "b", "c"]], &schema) + // Children 2 + .with_child_sort(vec![vec!["a1", "b1", "c1"]], &schema2) + // Children 3 + .with_child_sort(vec![vec!["a2", "b2"]], &schema3) + .with_expected_sort(vec![vec!["a", "b"]]) + .run() + } + + #[test] + fn test_union_equivalence_properties_multi_children_2() { + let schema = create_test_schema().unwrap(); + let schema2 = append_fields(&schema, "1"); + let schema3 = append_fields(&schema, "2"); + UnionEquivalenceTest::new(&schema) + // Children 1 + .with_child_sort(vec![vec!["a", "b", "c"]], &schema) + // Children 2 + .with_child_sort(vec![vec!["a1", "b1", "c1"]], &schema2) + // Children 3 + .with_child_sort(vec![vec!["a2", "b2", "c2"]], &schema3) + .with_expected_sort(vec![vec!["a", "b", "c"]]) + .run() + } + + #[test] + fn test_union_equivalence_properties_multi_children_3() { + let schema = create_test_schema().unwrap(); + let schema2 = append_fields(&schema, "1"); + let schema3 = append_fields(&schema, "2"); + UnionEquivalenceTest::new(&schema) + // Children 1 + .with_child_sort(vec![vec!["a", "b"]], &schema) + // Children 2 + .with_child_sort(vec![vec!["a1", "b1", "c1"]], &schema2) + // Children 3 + .with_child_sort(vec![vec!["a2", "b2", "c2"]], &schema3) + .with_expected_sort(vec![vec!["a", "b"]]) + .run() + } + + #[test] + fn test_union_equivalence_properties_multi_children_4() { + let schema = create_test_schema().unwrap(); + let schema2 = append_fields(&schema, "1"); + let schema3 = append_fields(&schema, "2"); + UnionEquivalenceTest::new(&schema) + // Children 1 + .with_child_sort(vec![vec!["a", "b"]], &schema) + // Children 2 + .with_child_sort(vec![vec!["a1", "b1"]], &schema2) + // Children 3 + .with_child_sort(vec![vec!["b2", "c2"]], &schema3) + .with_expected_sort(vec![]) + .run() + } + + #[test] + fn test_union_equivalence_properties_multi_children_5() { + let schema = create_test_schema().unwrap(); + let schema2 = append_fields(&schema, "1"); + UnionEquivalenceTest::new(&schema) + // Children 1 + .with_child_sort(vec![vec!["a", "b"], vec!["c"]], &schema) + // Children 2 + .with_child_sort(vec![vec!["a1", "b1"], vec!["c1"]], &schema2) + .with_expected_sort(vec![vec!["a", "b"], vec!["c"]]) + .run() + } + + #[test] + fn test_union_equivalence_properties_constants_common_constants() { + let schema = create_test_schema().unwrap(); + UnionEquivalenceTest::new(&schema) + .with_child_sort_and_const_exprs( + // First child: [a ASC], const [b, c] + vec![vec!["a"]], + vec!["b", "c"], + &schema, + ) + .with_child_sort_and_const_exprs( + // Second child: [b ASC], const [a, c] + vec![vec!["b"]], + vec!["a", "c"], + &schema, + ) + .with_expected_sort_and_const_exprs( + // Union expected orderings: [[a ASC], [b ASC]], const [c] + vec![vec!["a"], vec!["b"]], + vec!["c"], + ) + .run() + } + + #[test] + fn test_union_equivalence_properties_constants_prefix() { + let schema = create_test_schema().unwrap(); + UnionEquivalenceTest::new(&schema) + .with_child_sort_and_const_exprs( + // First child: [a ASC], const [] + vec![vec!["a"]], + vec![], + &schema, + ) + .with_child_sort_and_const_exprs( + // Second child: [a ASC, b ASC], const [] vec![vec!["a", "b"]], - ), - // --------- TEST CASE 4 ---------- - ( - vec![ - // Children 1 - ( - // Orderings - vec![vec!["a", "b"]], - Arc::clone(&schema), - ), - // Children 2 - ( - // Orderings - vec![vec!["a1", "b1"]], - Arc::clone(&schema2), - ), - // Children 3 - ( - // Orderings - vec![vec!["b2", "c2"]], - Arc::clone(&schema3), - ), - ], - // Expected vec![], - ), - // --------- TEST CASE 5 ---------- - ( - vec![ - // Children 1 - ( - // Orderings - vec![vec!["a", "b"], vec!["c"]], - Arc::clone(&schema), - ), - // Children 2 - ( - // Orderings - vec![vec!["a1", "b1"], vec!["c1"]], - Arc::clone(&schema2), - ), - ], - // Expected - vec![vec!["a", "b"], vec!["c"]], - ), - ]; - for (children, expected) in test_cases { - let children_eqs = children - .iter() - .map(|(orderings, schema)| { - let orderings = orderings - .iter() - .map(|ordering| { - ordering - .iter() - .map(|name| PhysicalSortExpr { - expr: col(name, schema).unwrap(), - options: SortOptions::default(), - }) - .collect::>() - }) - .collect::>(); - EquivalenceProperties::new_with_orderings( - Arc::clone(schema), - &orderings, - ) - }) - .collect::>(); - let actual = calculate_union(children_eqs, Arc::clone(&schema))?; + &schema, + ) + .with_expected_sort_and_const_exprs( + // Union orderings: [a ASC], const [] + vec![vec!["a"]], + vec![], + ) + .run() + } - let expected_ordering = expected - .into_iter() - .map(|ordering| { - ordering - .into_iter() - .map(|name| PhysicalSortExpr { - expr: col(name, &schema).unwrap(), - options: SortOptions::default(), - }) - .collect::>() - }) - .collect::>(); - let expected = EquivalenceProperties::new_with_orderings( - Arc::clone(&schema), - &expected_ordering, - ); - assert_eq_properties_same( - &actual, - &expected, - format!("expected: {:?}, actual: {:?}", expected, actual), - ); - } - Ok(()) + #[test] + fn test_union_equivalence_properties_constants_asc_desc_mismatch() { + let schema = create_test_schema().unwrap(); + UnionEquivalenceTest::new(&schema) + .with_child_sort_and_const_exprs( + // First child: [a ASC], const [] + vec![vec!["a"]], + vec![], + &schema, + ) + .with_child_sort_and_const_exprs( + // Second child orderings: [a DESC], const [] + vec![vec!["a DESC"]], + vec![], + &schema, + ) + .with_expected_sort_and_const_exprs( + // Union doesn't have any ordering or constant + vec![], + vec![], + ) + .run() } - #[tokio::test] - async fn test_union_equivalence_properties_binary() -> Result<()> { - let schema = create_test_schema()?; + #[test] + fn test_union_equivalence_properties_constants_different_schemas() { + let schema = create_test_schema().unwrap(); let schema2 = append_fields(&schema, "1"); - let col_a = &col("a", &schema)?; - let col_b = &col("b", &schema)?; - let col_c = &col("c", &schema)?; - let col_a1 = &col("a1", &schema2)?; - let col_b1 = &col("b1", &schema2)?; - let options = SortOptions::default(); - let options_desc = !SortOptions::default(); - let test_cases = [ - //-----------TEST CASE 1----------// - ( - ( - // First child orderings - vec![ - // [a ASC] - (vec![(col_a, options)]), - ], - // First child constants - vec![col_b, col_c], - Arc::clone(&schema), - ), - ( - // Second child orderings - vec![ - // [b ASC] - (vec![(col_b, options)]), - ], - // Second child constants - vec![col_a, col_c], - Arc::clone(&schema), - ), - ( - // Union expected orderings - vec![ - // [a ASC] - vec![(col_a, options)], - // [b ASC] - vec![(col_b, options)], - ], - // Union - vec![col_c], - ), - ), - //-----------TEST CASE 2----------// - // Meet ordering between [a ASC], [a ASC, b ASC] should be [a ASC] - ( - ( - // First child orderings - vec![ - // [a ASC] - vec![(col_a, options)], - ], - // No constant - vec![], - Arc::clone(&schema), - ), - ( - // Second child orderings - vec![ - // [a ASC, b ASC] - vec![(col_a, options), (col_b, options)], - ], - // No constant - vec![], - Arc::clone(&schema), - ), - ( - // Union orderings - vec![ - // [a ASC] - vec![(col_a, options)], - ], - // No constant - vec![], - ), - ), - //-----------TEST CASE 3----------// - // Meet ordering between [a ASC], [a DESC] should be [] - ( - ( - // First child orderings - vec![ - // [a ASC] - vec![(col_a, options)], - ], - // No constant - vec![], - Arc::clone(&schema), - ), - ( - // Second child orderings - vec![ - // [a DESC] - vec![(col_a, options_desc)], - ], - // No constant - vec![], - Arc::clone(&schema), - ), - ( - // Union doesn't have any ordering - vec![], - // No constant - vec![], - ), - ), - //-----------TEST CASE 4----------// - // Meet ordering between [a ASC], [a1 ASC, b1 ASC] should be [a ASC] - // Where a, and a1 ath the same index for their corresponding schemas. - ( - ( - // First child orderings - vec![ - // [a ASC] - vec![(col_a, options)], - ], - // No constant - vec![], - Arc::clone(&schema), - ), - ( - // Second child orderings - vec![ - // [a1 ASC, b1 ASC] - vec![(col_a1, options), (col_b1, options)], - ], - // No constant - vec![], - Arc::clone(&schema2), - ), - ( - // Union orderings - vec![ - // [a ASC] - vec![(col_a, options)], - ], - // No constant - vec![], - ), - ), - ]; + UnionEquivalenceTest::new(&schema) + .with_child_sort_and_const_exprs( + // First child orderings: [a ASC], const [] + vec![vec!["a"]], + vec![], + &schema, + ) + .with_child_sort_and_const_exprs( + // Second child orderings: [a1 ASC, b1 ASC], const [] + vec![vec!["a1", "b1"]], + vec![], + &schema2, + ) + .with_expected_sort_and_const_exprs( + // Union orderings: [a ASC] + // + // Note that a, and a1 are at the same index for their + // corresponding schemas. + vec![vec!["a"]], + vec![], + ) + .run() + } - for ( - test_idx, - ( - (first_child_orderings, first_child_constants, first_schema), - (second_child_orderings, second_child_constants, second_schema), - (union_orderings, union_constants), - ), - ) in test_cases.iter().enumerate() - { - let first_orderings = first_child_orderings - .iter() - .map(|ordering| convert_to_sort_exprs(ordering)) - .collect::>(); - let first_constants = first_child_constants - .iter() - .map(|expr| ConstExpr::new(Arc::clone(expr))) - .collect::>(); - let mut lhs = EquivalenceProperties::new(Arc::clone(first_schema)); - lhs = lhs.with_constants(first_constants); - lhs.add_new_orderings(first_orderings); + #[test] + fn test_union_equivalence_properties_constants_fill_gaps() { + let schema = create_test_schema().unwrap(); + UnionEquivalenceTest::new(&schema) + .with_child_sort_and_const_exprs( + // First child orderings: [a ASC, c ASC], const [b] + vec![vec!["a", "c"]], + vec!["b"], + &schema, + ) + .with_child_sort_and_const_exprs( + // Second child orderings: [b ASC, c ASC], const [a] + vec![vec!["b", "c"]], + vec!["a"], + &schema, + ) + .with_expected_sort_and_const_exprs( + // Union orderings: [ + // [a ASC, b ASC, c ASC], + // [b ASC, a ASC, c ASC] + // ], const [] + vec![vec!["a", "b", "c"], vec!["b", "a", "c"]], + vec![], + ) + .run() + } - let second_orderings = second_child_orderings - .iter() - .map(|ordering| convert_to_sort_exprs(ordering)) - .collect::>(); - let second_constants = second_child_constants + #[test] + fn test_union_equivalence_properties_constants_no_fill_gaps() { + let schema = create_test_schema().unwrap(); + UnionEquivalenceTest::new(&schema) + .with_child_sort_and_const_exprs( + // First child orderings: [a ASC, c ASC], const [d] // some other constant + vec![vec!["a", "c"]], + vec!["d"], + &schema, + ) + .with_child_sort_and_const_exprs( + // Second child orderings: [b ASC, c ASC], const [a] + vec![vec!["b", "c"]], + vec!["a"], + &schema, + ) + .with_expected_sort_and_const_exprs( + // Union orderings: [[a]] (only a is constant) + vec![vec!["a"]], + vec![], + ) + .run() + } + + #[test] + fn test_union_equivalence_properties_constants_fill_some_gaps() { + let schema = create_test_schema().unwrap(); + UnionEquivalenceTest::new(&schema) + .with_child_sort_and_const_exprs( + // First child orderings: [c ASC], const [a, b] // some other constant + vec![vec!["c"]], + vec!["a", "b"], + &schema, + ) + .with_child_sort_and_const_exprs( + // Second child orderings: [a DESC, b], const [] + vec![vec!["a DESC", "b"]], + vec![], + &schema, + ) + .with_expected_sort_and_const_exprs( + // Union orderings: [[a, b]] (can fill in the a/b with constants) + vec![vec!["a DESC", "b"]], + vec![], + ) + .run() + } + + #[test] + fn test_union_equivalence_properties_constants_fill_gaps_non_symmetric() { + let schema = create_test_schema().unwrap(); + UnionEquivalenceTest::new(&schema) + .with_child_sort_and_const_exprs( + // First child orderings: [a ASC, c ASC], const [b] + vec![vec!["a", "c"]], + vec!["b"], + &schema, + ) + .with_child_sort_and_const_exprs( + // Second child orderings: [b ASC, c ASC], const [a] + vec![vec!["b DESC", "c"]], + vec!["a"], + &schema, + ) + .with_expected_sort_and_const_exprs( + // Union orderings: [ + // [a ASC, b ASC, c ASC], + // [b ASC, a ASC, c ASC] + // ], const [] + vec![vec!["a", "b DESC", "c"], vec!["b DESC", "a", "c"]], + vec![], + ) + .run() + } + + #[test] + fn test_union_equivalence_properties_constants_gap_fill_symmetric() { + let schema = create_test_schema().unwrap(); + UnionEquivalenceTest::new(&schema) + .with_child_sort_and_const_exprs( + // First child: [a ASC, b ASC, d ASC], const [c] + vec![vec!["a", "b", "d"]], + vec!["c"], + &schema, + ) + .with_child_sort_and_const_exprs( + // Second child: [a ASC, c ASC, d ASC], const [b] + vec![vec!["a", "c", "d"]], + vec!["b"], + &schema, + ) + .with_expected_sort_and_const_exprs( + // Union orderings: + // [a, b, c, d] + // [a, c, b, d] + vec![vec!["a", "c", "b", "d"], vec!["a", "b", "c", "d"]], + vec![], + ) + .run() + } + + #[test] + fn test_union_equivalence_properties_constants_gap_fill_and_common() { + let schema = create_test_schema().unwrap(); + UnionEquivalenceTest::new(&schema) + .with_child_sort_and_const_exprs( + // First child: [a DESC, d ASC], const [b, c] + vec![vec!["a DESC", "d"]], + vec!["b", "c"], + &schema, + ) + .with_child_sort_and_const_exprs( + // Second child: [a DESC, c ASC, d ASC], const [b] + vec![vec!["a DESC", "c", "d"]], + vec!["b"], + &schema, + ) + .with_expected_sort_and_const_exprs( + // Union orderings: + // [a DESC, c, d] [b] + vec![vec!["a DESC", "c", "d"]], + vec!["b"], + ) + .run() + } + + #[test] + fn test_union_equivalence_properties_constants_middle_desc() { + let schema = create_test_schema().unwrap(); + UnionEquivalenceTest::new(&schema) + .with_child_sort_and_const_exprs( + // NB `b DESC` in the first child + // + // First child: [a ASC, b DESC, d ASC], const [c] + vec![vec!["a", "b DESC", "d"]], + vec!["c"], + &schema, + ) + .with_child_sort_and_const_exprs( + // Second child: [a ASC, c ASC, d ASC], const [b] + vec![vec!["a", "c", "d"]], + vec!["b"], + &schema, + ) + .with_expected_sort_and_const_exprs( + // Union orderings: + // [a, b, d] (c constant) + // [a, c, d] (b constant) + vec![vec!["a", "c", "b DESC", "d"], vec!["a", "b DESC", "c", "d"]], + vec![], + ) + .run() + } + + // TODO tests with multiple constants + + #[derive(Debug)] + struct UnionEquivalenceTest { + /// The schema of the output of the Union + output_schema: SchemaRef, + /// The equivalence properties of each child to the union + child_properties: Vec, + /// The expected output properties of the union. Must be set before + /// running `build` + expected_properties: Option, + } + + impl UnionEquivalenceTest { + fn new(output_schema: &SchemaRef) -> Self { + Self { + output_schema: Arc::clone(output_schema), + child_properties: vec![], + expected_properties: None, + } + } + + /// Add a union input with the specified orderings + /// + /// See [`Self::make_props`] for the format of the strings in `orderings` + fn with_child_sort( + mut self, + orderings: Vec>, + schema: &SchemaRef, + ) -> Self { + let properties = self.make_props(orderings, vec![], schema); + self.child_properties.push(properties); + self + } + + /// Add a union input with the specified orderings and constant + /// equivalences + /// + /// See [`Self::make_props`] for the format of the strings in + /// `orderings` and `constants` + fn with_child_sort_and_const_exprs( + mut self, + orderings: Vec>, + constants: Vec<&str>, + schema: &SchemaRef, + ) -> Self { + let properties = self.make_props(orderings, constants, schema); + self.child_properties.push(properties); + self + } + + /// Set the expected output sort order for the union of the children + /// + /// See [`Self::make_props`] for the format of the strings in `orderings` + fn with_expected_sort(mut self, orderings: Vec>) -> Self { + let properties = self.make_props(orderings, vec![], &self.output_schema); + self.expected_properties = Some(properties); + self + } + + /// Set the expected output sort order and constant expressions for the + /// union of the children + /// + /// See [`Self::make_props`] for the format of the strings in + /// `orderings` and `constants`. + fn with_expected_sort_and_const_exprs( + mut self, + orderings: Vec>, + constants: Vec<&str>, + ) -> Self { + let properties = self.make_props(orderings, constants, &self.output_schema); + self.expected_properties = Some(properties); + self + } + + /// compute the union's output equivalence properties from the child + /// properties, and compare them to the expected properties + fn run(self) { + let Self { + output_schema, + child_properties, + expected_properties, + } = self; + + let expected_properties = + expected_properties.expect("expected_properties not set"); + + // try all permutations of the children + // as the code treats lhs and rhs differently + for child_properties in child_properties .iter() - .map(|expr| ConstExpr::new(Arc::clone(expr))) - .collect::>(); - let mut rhs = EquivalenceProperties::new(Arc::clone(second_schema)); - rhs = rhs.with_constants(second_constants); - rhs.add_new_orderings(second_orderings); + .cloned() + .permutations(child_properties.len()) + { + println!("--- permutation ---"); + for c in &child_properties { + println!("{c}"); + } + let actual_properties = + calculate_union(child_properties, Arc::clone(&output_schema)) + .expect("failed to calculate union equivalence properties"); + assert_eq_properties_same( + &actual_properties, + &expected_properties, + format!( + "expected: {expected_properties:?}\nactual: {actual_properties:?}" + ), + ); + } + } - let union_expected_orderings = union_orderings + /// Make equivalence properties for the specified columns named in orderings and constants + /// + /// orderings: strings formatted like `"a"` or `"a DESC"`. See [`parse_sort_expr`] + /// constants: strings formatted like `"a"`. + fn make_props( + &self, + orderings: Vec>, + constants: Vec<&str>, + schema: &SchemaRef, + ) -> EquivalenceProperties { + let orderings = orderings .iter() - .map(|ordering| convert_to_sort_exprs(ordering)) + .map(|ordering| { + ordering + .iter() + .map(|name| parse_sort_expr(name, schema)) + .collect::>() + }) .collect::>(); - let union_constants = union_constants + + let constants = constants .iter() - .map(|expr| ConstExpr::new(Arc::clone(expr))) + .map(|col_name| ConstExpr::new(col(col_name, schema).unwrap())) .collect::>(); - let mut union_expected_eq = EquivalenceProperties::new(Arc::clone(&schema)); - union_expected_eq = union_expected_eq.with_constants(union_constants); - union_expected_eq.add_new_orderings(union_expected_orderings); - let actual_union_eq = calculate_union_binary(lhs, rhs)?; - let err_msg = format!( - "Error in test id: {:?}, test case: {:?}", - test_idx, test_cases[test_idx] - ); - assert_eq_properties_same(&actual_union_eq, &union_expected_eq, err_msg); + EquivalenceProperties::new_with_orderings(Arc::clone(schema), &orderings) + .with_constants(constants) } - Ok(()) } fn assert_eq_properties_same( @@ -3091,21 +3509,63 @@ mod tests { // Check whether constants are same let lhs_constants = lhs.constants(); let rhs_constants = rhs.constants(); - assert_eq!(lhs_constants.len(), rhs_constants.len(), "{}", err_msg); for rhs_constant in rhs_constants { assert!( const_exprs_contains(lhs_constants, rhs_constant.expr()), - "{}", - err_msg + "{err_msg}\nlhs: {lhs}\nrhs: {rhs}" ); } + assert_eq!( + lhs_constants.len(), + rhs_constants.len(), + "{err_msg}\nlhs: {lhs}\nrhs: {rhs}" + ); // Check whether orderings are same. let lhs_orderings = lhs.oeq_class(); let rhs_orderings = &rhs.oeq_class.orderings; - assert_eq!(lhs_orderings.len(), rhs_orderings.len(), "{}", err_msg); for rhs_ordering in rhs_orderings { - assert!(lhs_orderings.contains(rhs_ordering), "{}", err_msg); + assert!( + lhs_orderings.contains(rhs_ordering), + "{err_msg}\nlhs: {lhs}\nrhs: {rhs}" + ); + } + assert_eq!( + lhs_orderings.len(), + rhs_orderings.len(), + "{err_msg}\nlhs: {lhs}\nrhs: {rhs}" + ); + } + + /// Converts a string to a physical sort expression + /// + /// # Example + /// * `"a"` -> (`"a"`, `SortOptions::default()`) + /// * `"a ASC"` -> (`"a"`, `SortOptions { descending: false, nulls_first: false }`) + fn parse_sort_expr(name: &str, schema: &SchemaRef) -> PhysicalSortExpr { + let mut parts = name.split_whitespace(); + let name = parts.next().expect("empty sort expression"); + let mut sort_expr = PhysicalSortExpr::new( + col(name, schema).expect("invalid column name"), + SortOptions::default(), + ); + + if let Some(options) = parts.next() { + sort_expr = match options { + "ASC" => sort_expr.asc(), + "DESC" => sort_expr.desc(), + _ => panic!( + "unknown sort options. Expected 'ASC' or 'DESC', got {}", + options + ), + } } + + assert!( + parts.next().is_none(), + "unexpected tokens in column name. Expected 'name' / 'name ASC' / 'name DESC' but got '{name}'" + ); + + sort_expr } } diff --git a/datafusion/physical-expr/src/expressions/binary.rs b/datafusion/physical-expr/src/expressions/binary.rs index 8c4a0fd3c989..dbae695abb97 100644 --- a/datafusion/physical-expr/src/expressions/binary.rs +++ b/datafusion/physical-expr/src/expressions/binary.rs @@ -27,9 +27,7 @@ use crate::PhysicalExpr; use arrow::array::*; use arrow::compute::kernels::boolean::{and_kleene, not, or_kleene}; use arrow::compute::kernels::cmp::*; -use arrow::compute::kernels::comparison::{ - regexp_is_match_utf8, regexp_is_match_utf8_scalar, -}; +use arrow::compute::kernels::comparison::{regexp_is_match, regexp_is_match_scalar}; use arrow::compute::kernels::concat_elements::concat_elements_utf8; use arrow::compute::{cast, ilike, like, nilike, nlike}; use arrow::datatypes::*; @@ -179,7 +177,7 @@ macro_rules! compute_utf8_flag_op { } else { None }; - let mut array = paste::expr! {[<$OP _utf8>]}(&ll, &rr, flag.as_ref())?; + let mut array = $OP(ll, rr, flag.as_ref())?; if $NOT { array = not(&array).unwrap(); } @@ -216,7 +214,7 @@ macro_rules! compute_utf8_flag_op_scalar { if let ScalarValue::Utf8(Some(string_value)) | ScalarValue::LargeUtf8(Some(string_value)) = $RIGHT { let flag = $FLAG.then_some("i"); let mut array = - paste::expr! {[<$OP _utf8_scalar>]}(&ll, &string_value, flag)?; + paste::expr! {[<$OP _scalar>]}(ll, &string_value, flag)?; if $NOT { array = not(&array).unwrap(); } diff --git a/datafusion/physical-expr/src/expressions/is_not_null.rs b/datafusion/physical-expr/src/expressions/is_not_null.rs index 50c3cbab9baf..c16db7e8d456 100644 --- a/datafusion/physical-expr/src/expressions/is_not_null.rs +++ b/datafusion/physical-expr/src/expressions/is_not_null.rs @@ -73,7 +73,7 @@ impl PhysicalExpr for IsNotNullExpr { let arg = self.arg.evaluate(batch)?; match arg { ColumnarValue::Array(array) => { - let is_not_null = super::is_null::compute_is_not_null(array)?; + let is_not_null = arrow::compute::is_not_null(&array)?; Ok(ColumnarValue::Array(Arc::new(is_not_null))) } ColumnarValue::Scalar(scalar) => Ok(ColumnarValue::from( diff --git a/datafusion/physical-expr/src/expressions/is_null.rs b/datafusion/physical-expr/src/expressions/is_null.rs index cdc5f101002e..65dc04fdfab0 100644 --- a/datafusion/physical-expr/src/expressions/is_null.rs +++ b/datafusion/physical-expr/src/expressions/is_null.rs @@ -20,14 +20,10 @@ use std::hash::{Hash, Hasher}; use std::{any::Any, sync::Arc}; -use arrow::compute; use arrow::{ datatypes::{DataType, Schema}, record_batch::RecordBatch, }; -use arrow_array::{Array, ArrayRef, BooleanArray, Int8Array, UnionArray}; -use arrow_buffer::{BooleanBuffer, ScalarBuffer}; -use arrow_ord::cmp; use crate::physical_expr::down_cast_any_ref; use crate::PhysicalExpr; @@ -77,9 +73,9 @@ impl PhysicalExpr for IsNullExpr { fn evaluate(&self, batch: &RecordBatch) -> Result { let arg = self.arg.evaluate(batch)?; match arg { - ColumnarValue::Array(array) => { - Ok(ColumnarValue::Array(Arc::new(compute_is_null(array)?))) - } + ColumnarValue::Array(array) => Ok(ColumnarValue::Array(Arc::new( + arrow::compute::is_null(&array)?, + ))), ColumnarValue::Scalar(scalar) => Ok(ColumnarValue::from( ScalarValue::Boolean(Some(scalar.value().is_null())), )), @@ -103,65 +99,6 @@ impl PhysicalExpr for IsNullExpr { } } -/// workaround , -/// this can be replaced with a direct call to `arrow::compute::is_null` once it's fixed. -pub(crate) fn compute_is_null(array: ArrayRef) -> Result { - if let Some(union_array) = array.as_any().downcast_ref::() { - if let Some(offsets) = union_array.offsets() { - dense_union_is_null(union_array, offsets) - } else { - sparse_union_is_null(union_array) - } - } else { - compute::is_null(array.as_ref()).map_err(Into::into) - } -} - -/// workaround , -/// this can be replaced with a direct call to `arrow::compute::is_not_null` once it's fixed. -pub(crate) fn compute_is_not_null(array: ArrayRef) -> Result { - if array.as_any().is::() { - compute::not(&compute_is_null(array)?).map_err(Into::into) - } else { - compute::is_not_null(array.as_ref()).map_err(Into::into) - } -} - -fn dense_union_is_null( - union_array: &UnionArray, - offsets: &ScalarBuffer, -) -> Result { - let child_arrays = (0..union_array.type_names().len()) - .map(|type_id| { - compute::is_null(&union_array.child(type_id as i8)).map_err(Into::into) - }) - .collect::>>()?; - - let buffer: BooleanBuffer = offsets - .iter() - .zip(union_array.type_ids()) - .map(|(offset, type_id)| child_arrays[*type_id as usize].value(*offset as usize)) - .collect(); - - Ok(BooleanArray::new(buffer, None)) -} - -fn sparse_union_is_null(union_array: &UnionArray) -> Result { - let type_ids = Int8Array::new(union_array.type_ids().clone(), None); - - let mut union_is_null = - BooleanArray::new(BooleanBuffer::new_unset(union_array.len()), None); - for type_id in 0..union_array.type_names().len() { - let type_id = type_id as i8; - let union_is_child = cmp::eq(&type_ids, &Int8Array::new_scalar(type_id))?; - let child = union_array.child(type_id); - let child_array_is_null = compute::is_null(&child)?; - let child_is_null = compute::and(&union_is_child, &child_array_is_null)?; - union_is_null = compute::or(&union_is_null, &child_is_null)?; - } - Ok(union_is_null) -} - impl PartialEq for IsNullExpr { fn eq(&self, other: &dyn Any) -> bool { down_cast_any_ref(other) @@ -184,7 +121,7 @@ mod tests { array::{BooleanArray, StringArray}, datatypes::*, }; - use arrow_array::{Float64Array, Int32Array}; + use arrow_array::{Array, Float64Array, Int32Array, UnionArray}; use arrow_buffer::ScalarBuffer; use datafusion_common::cast::as_boolean_array; @@ -243,8 +180,7 @@ mod tests { let array = UnionArray::try_new(union_fields(), type_ids, None, children).unwrap(); - let array_ref = Arc::new(array) as ArrayRef; - let result = compute_is_null(array_ref).unwrap(); + let result = arrow::compute::is_null(&array).unwrap(); let expected = &BooleanArray::from(vec![false, true, false, false, true, true, false]); @@ -272,8 +208,7 @@ mod tests { UnionArray::try_new(union_fields(), type_ids, Some(offsets), children) .unwrap(); - let array_ref = Arc::new(array) as ArrayRef; - let result = compute_is_null(array_ref).unwrap(); + let result = arrow::compute::is_null(&array).unwrap(); let expected = &BooleanArray::from(vec![false, true, false, true, false, true]); assert_eq!(expected, &result); diff --git a/datafusion/physical-plan/src/aggregates/group_values/column.rs b/datafusion/physical-plan/src/aggregates/group_values/column.rs index 91d87302ce99..28f35b2bded2 100644 --- a/datafusion/physical-plan/src/aggregates/group_values/column.rs +++ b/datafusion/physical-plan/src/aggregates/group_values/column.rs @@ -16,8 +16,7 @@ // under the License. use crate::aggregates::group_values::group_column::{ - ByteGroupValueBuilder, GroupColumn, NonNullPrimitiveGroupValueBuilder, - PrimitiveGroupValueBuilder, + ByteGroupValueBuilder, GroupColumn, PrimitiveGroupValueBuilder, }; use crate::aggregates::group_values::GroupValues; use ahash::RandomState; @@ -124,8 +123,7 @@ impl GroupValuesColumn { } } -/// instantiates a [`PrimitiveGroupValueBuilder`] or -/// [`NonNullPrimitiveGroupValueBuilder`] and pushes it into $v +/// instantiates a [`PrimitiveGroupValueBuilder`] and pushes it into $v /// /// Arguments: /// `$v`: the vector to push the new builder into @@ -135,10 +133,10 @@ impl GroupValuesColumn { macro_rules! instantiate_primitive { ($v:expr, $nullable:expr, $t:ty) => { if $nullable { - let b = PrimitiveGroupValueBuilder::<$t>::new(); + let b = PrimitiveGroupValueBuilder::<$t, true>::new(); $v.push(Box::new(b) as _) } else { - let b = NonNullPrimitiveGroupValueBuilder::<$t>::new(); + let b = PrimitiveGroupValueBuilder::<$t, false>::new(); $v.push(Box::new(b) as _) } }; diff --git a/datafusion/physical-plan/src/aggregates/group_values/group_column.rs b/datafusion/physical-plan/src/aggregates/group_values/group_column.rs index 7409f5c214b9..5d00f300e960 100644 --- a/datafusion/physical-plan/src/aggregates/group_values/group_column.rs +++ b/datafusion/physical-plan/src/aggregates/group_values/group_column.rs @@ -15,24 +15,21 @@ // specific language governing permissions and limitations // under the License. -use arrow::array::BooleanBufferBuilder; use arrow::array::BufferBuilder; use arrow::array::GenericBinaryArray; use arrow::array::GenericStringArray; use arrow::array::OffsetSizeTrait; use arrow::array::PrimitiveArray; use arrow::array::{Array, ArrayRef, ArrowPrimitiveType, AsArray}; -use arrow::buffer::NullBuffer; use arrow::buffer::OffsetBuffer; use arrow::buffer::ScalarBuffer; -use arrow::datatypes::ArrowNativeType; use arrow::datatypes::ByteArrayType; use arrow::datatypes::DataType; use arrow::datatypes::GenericBinaryType; -use arrow::datatypes::GenericStringType; use datafusion_common::utils::proxy::VecAllocExt; use crate::aggregates::group_values::null_builder::MaybeNullBufferBuilder; +use arrow_array::types::GenericStringType; use datafusion_physical_expr_common::binary_map::{OutputType, INITIAL_BUFFER_CAPACITY}; use std::sync::Arc; use std::vec; @@ -63,75 +60,25 @@ pub trait GroupColumn: Send + Sync { fn take_n(&mut self, n: usize) -> ArrayRef; } -/// An implementation of [`GroupColumn`] for primitive values which are known to have no nulls -#[derive(Debug)] -pub struct NonNullPrimitiveGroupValueBuilder { - group_values: Vec, -} - -impl NonNullPrimitiveGroupValueBuilder -where - T: ArrowPrimitiveType, -{ - pub fn new() -> Self { - Self { - group_values: vec![], - } - } -} - -impl GroupColumn for NonNullPrimitiveGroupValueBuilder { - fn equal_to(&self, lhs_row: usize, array: &ArrayRef, rhs_row: usize) -> bool { - // know input has no nulls - self.group_values[lhs_row] == array.as_primitive::().value(rhs_row) - } - - fn append_val(&mut self, array: &ArrayRef, row: usize) { - // input can't possibly have nulls, so don't worry about them - self.group_values.push(array.as_primitive::().value(row)) - } - - fn len(&self) -> usize { - self.group_values.len() - } - - fn size(&self) -> usize { - self.group_values.allocated_size() - } - - fn build(self: Box) -> ArrayRef { - let Self { group_values } = *self; - - let nulls = None; - - Arc::new(PrimitiveArray::::new( - ScalarBuffer::from(group_values), - nulls, - )) - } - - fn take_n(&mut self, n: usize) -> ArrayRef { - let first_n = self.group_values.drain(0..n).collect::>(); - let first_n_nulls = None; - - Arc::new(PrimitiveArray::::new( - ScalarBuffer::from(first_n), - first_n_nulls, - )) - } -} - -/// An implementation of [`GroupColumn`] for primitive values which may have nulls +/// An implementation of [`GroupColumn`] for primitive values +/// +/// Optimized to skip null buffer construction if the input is known to be non nullable +/// +/// # Template parameters +/// +/// `T`: the native Rust type that stores the data +/// `NULLABLE`: if the data can contain any nulls #[derive(Debug)] -pub struct PrimitiveGroupValueBuilder { +pub struct PrimitiveGroupValueBuilder { group_values: Vec, nulls: MaybeNullBufferBuilder, } -impl PrimitiveGroupValueBuilder +impl PrimitiveGroupValueBuilder where T: ArrowPrimitiveType, { + /// Create a new `PrimitiveGroupValueBuilder` pub fn new() -> Self { Self { group_values: vec![], @@ -140,18 +87,34 @@ where } } -impl GroupColumn for PrimitiveGroupValueBuilder { +impl GroupColumn + for PrimitiveGroupValueBuilder +{ fn equal_to(&self, lhs_row: usize, array: &ArrayRef, rhs_row: usize) -> bool { - self.nulls.is_null(lhs_row) == array.is_null(rhs_row) - && self.group_values[lhs_row] == array.as_primitive::().value(rhs_row) + // Perf: skip null check (by short circuit) if input is not nullable + if NULLABLE { + let exist_null = self.nulls.is_null(lhs_row); + let input_null = array.is_null(rhs_row); + if let Some(result) = nulls_equal_to(exist_null, input_null) { + return result; + } + // Otherwise, we need to check their values + } + + self.group_values[lhs_row] == array.as_primitive::().value(rhs_row) } fn append_val(&mut self, array: &ArrayRef, row: usize) { - if array.is_null(row) { - self.nulls.append(true); - self.group_values.push(T::default_value()); + // Perf: skip null check if input can't have nulls + if NULLABLE { + if array.is_null(row) { + self.nulls.append(true); + self.group_values.push(T::default_value()); + } else { + self.nulls.append(false); + self.group_values.push(array.as_primitive::().value(row)); + } } else { - self.nulls.append(false); self.group_values.push(array.as_primitive::().value(row)); } } @@ -171,6 +134,9 @@ impl GroupColumn for PrimitiveGroupValueBuilder { } = *self; let nulls = nulls.build(); + if !NULLABLE { + assert!(nulls.is_none(), "unexpected nulls in non nullable input"); + } Arc::new(PrimitiveArray::::new( ScalarBuffer::from(group_values), @@ -180,7 +146,8 @@ impl GroupColumn for PrimitiveGroupValueBuilder { fn take_n(&mut self, n: usize) -> ArrayRef { let first_n = self.group_values.drain(0..n).collect::>(); - let first_n_nulls = self.nulls.take_n(n); + + let first_n_nulls = if NULLABLE { self.nulls.take_n(n) } else { None }; Arc::new(PrimitiveArray::::new( ScalarBuffer::from(first_n), @@ -190,6 +157,12 @@ impl GroupColumn for PrimitiveGroupValueBuilder { } /// An implementation of [`GroupColumn`] for binary and utf8 types. +/// +/// Stores a collection of binary or utf8 group values in a single buffer +/// in a way that allows: +/// +/// 1. Efficient comparison of incoming rows to existing rows +/// 2. Efficient construction of the final output array pub struct ByteGroupValueBuilder where O: OffsetSizeTrait, @@ -201,8 +174,8 @@ where /// stored in the range `offsets[i]..offsets[i+1]` in `buffer`. Null values /// are stored as a zero length string. offsets: Vec, - /// Null indexes in offsets, if `i` is in nulls, `offsets[i]` should be equals to `offsets[i+1]` - nulls: Vec, + /// Nulls + nulls: MaybeNullBufferBuilder, } impl ByteGroupValueBuilder @@ -214,7 +187,7 @@ where output_type, buffer: BufferBuilder::new(INITIAL_BUFFER_CAPACITY), offsets: vec![O::default()], - nulls: vec![], + nulls: MaybeNullBufferBuilder::new(), } } @@ -224,40 +197,38 @@ where { let arr = array.as_bytes::(); if arr.is_null(row) { - self.nulls.push(self.len()); + self.nulls.append(true); // nulls need a zero length in the offset buffer let offset = self.buffer.len(); - self.offsets.push(O::usize_as(offset)); - return; + } else { + self.nulls.append(false); + let value: &[u8] = arr.value(row).as_ref(); + self.buffer.append_slice(value); + self.offsets.push(O::usize_as(self.buffer.len())); } - - let value: &[u8] = arr.value(row).as_ref(); - self.buffer.append_slice(value); - self.offsets.push(O::usize_as(self.buffer.len())); } fn equal_to_inner(&self, lhs_row: usize, array: &ArrayRef, rhs_row: usize) -> bool where B: ByteArrayType, { - // Handle nulls - let is_lhs_null = self.nulls.iter().any(|null_idx| *null_idx == lhs_row); - let arr = array.as_bytes::(); - if is_lhs_null { - return arr.is_null(rhs_row); - } else if arr.is_null(rhs_row) { - return false; + let array = array.as_bytes::(); + let exist_null = self.nulls.is_null(lhs_row); + let input_null = array.is_null(rhs_row); + if let Some(result) = nulls_equal_to(exist_null, input_null) { + return result; } + // Otherwise, we need to check their values + self.value(lhs_row) == (array.value(rhs_row).as_ref() as &[u8]) + } - let arr = array.as_bytes::(); - let rhs_elem: &[u8] = arr.value(rhs_row).as_ref(); - let rhs_elem_len = arr.value_length(rhs_row).as_usize(); - debug_assert_eq!(rhs_elem_len, rhs_elem.len()); - let l = self.offsets[lhs_row].as_usize(); - let r = self.offsets[lhs_row + 1].as_usize(); - let existing_elem = unsafe { self.buffer.as_slice().get_unchecked(l..r) }; - rhs_elem == existing_elem + /// return the current value of the specified row irrespective of null + pub fn value(&self, row: usize) -> &[u8] { + let l = self.offsets[row].as_usize(); + let r = self.offsets[row + 1].as_usize(); + // Safety: the offsets are constructed correctly and never decrease + unsafe { self.buffer.as_slice().get_unchecked(l..r) } } } @@ -325,18 +296,7 @@ where nulls, } = *self; - let null_buffer = if nulls.is_empty() { - None - } else { - // Only make a `NullBuffer` if there was a null value - let num_values = offsets.len() - 1; - let mut bool_builder = BooleanBufferBuilder::new(num_values); - bool_builder.append_n(num_values, true); - nulls.into_iter().for_each(|null_index| { - bool_builder.set_bit(null_index, false); - }); - Some(NullBuffer::from(bool_builder.finish())) - }; + let null_buffer = nulls.build(); // SAFETY: the offsets were constructed correctly in `insert_if_new` -- // monotonically increasing, overflows were checked. @@ -353,9 +313,9 @@ where // SAFETY: // 1. the offsets were constructed safely // - // 2. we asserted the input arrays were all the correct type and - // thus since all the values that went in were valid (e.g. utf8) - // so are all the values that come out + // 2. the input arrays were all the correct type and thus since + // all the values that went in were valid (e.g. utf8) so are all + // the values that come out Arc::new(unsafe { GenericStringArray::new_unchecked(offsets, values, null_buffer) }) @@ -366,27 +326,7 @@ where fn take_n(&mut self, n: usize) -> ArrayRef { debug_assert!(self.len() >= n); - - let null_buffer = if self.nulls.is_empty() { - None - } else { - // Only make a `NullBuffer` if there was a null value - let mut bool_builder = BooleanBufferBuilder::new(n); - bool_builder.append_n(n, true); - - let mut new_nulls = vec![]; - self.nulls.iter().for_each(|null_index| { - if *null_index < n { - bool_builder.set_bit(*null_index, false); - } else { - new_nulls.push(null_index - n); - } - }); - - self.nulls = new_nulls; - Some(NullBuffer::from(bool_builder.finish())) - }; - + let null_buffer = self.nulls.take_n(n); let first_remaining_offset = O::as_usize(self.offsets[n]); // Given offests like [0, 2, 4, 5] and n = 1, we expect to get @@ -436,13 +376,31 @@ where } } +/// Determines if the nullability of the existing and new input array can be used +/// to short-circuit the comparison of the two values. +/// +/// Returns `Some(result)` if the result of the comparison can be determined +/// from the nullness of the two values, and `None` if the comparison must be +/// done on the values themselves. +fn nulls_equal_to(lhs_null: bool, rhs_null: bool) -> Option { + match (lhs_null, rhs_null) { + (true, true) => Some(true), + (false, true) | (true, false) => Some(false), + _ => None, + } +} + #[cfg(test)] mod tests { use std::sync::Arc; - use arrow_array::{ArrayRef, StringArray}; + use arrow::datatypes::Int64Type; + use arrow_array::{ArrayRef, Int64Array, StringArray}; + use arrow_buffer::{BooleanBufferBuilder, NullBuffer}; use datafusion_physical_expr::binary_map::OutputType; + use crate::aggregates::group_values::group_column::PrimitiveGroupValueBuilder; + use super::{ByteGroupValueBuilder, GroupColumn}; #[test] @@ -489,4 +447,136 @@ mod tests { ])) as ArrayRef; assert_eq!(&output, &array); } + + #[test] + fn test_nullable_primitive_equal_to() { + // Will cover such cases: + // - exist null, input not null + // - exist null, input null; values not equal + // - exist null, input null; values equal + // - exist not null, input null + // - exist not null, input not null; values not equal + // - exist not null, input not null; values equal + + // Define PrimitiveGroupValueBuilder + let mut builder = PrimitiveGroupValueBuilder::::new(); + let builder_array = Arc::new(Int64Array::from(vec![ + None, + None, + None, + Some(1), + Some(2), + Some(3), + ])) as ArrayRef; + builder.append_val(&builder_array, 0); + builder.append_val(&builder_array, 1); + builder.append_val(&builder_array, 2); + builder.append_val(&builder_array, 3); + builder.append_val(&builder_array, 4); + builder.append_val(&builder_array, 5); + + // Define input array + let (_nulls, values, _) = + Int64Array::from(vec![Some(1), Some(2), None, None, Some(1), Some(3)]) + .into_parts(); + + // explicitly build a boolean buffer where one of the null values also happens to match + let mut boolean_buffer_builder = BooleanBufferBuilder::new(6); + boolean_buffer_builder.append(true); + boolean_buffer_builder.append(false); // this sets Some(2) to null above + boolean_buffer_builder.append(false); + boolean_buffer_builder.append(false); + boolean_buffer_builder.append(true); + boolean_buffer_builder.append(true); + let nulls = NullBuffer::new(boolean_buffer_builder.finish()); + let input_array = Arc::new(Int64Array::new(values, Some(nulls))) as ArrayRef; + + // Check + assert!(!builder.equal_to(0, &input_array, 0)); + assert!(builder.equal_to(1, &input_array, 1)); + assert!(builder.equal_to(2, &input_array, 2)); + assert!(!builder.equal_to(3, &input_array, 3)); + assert!(!builder.equal_to(4, &input_array, 4)); + assert!(builder.equal_to(5, &input_array, 5)); + } + + #[test] + fn test_not_nullable_primitive_equal_to() { + // Will cover such cases: + // - values equal + // - values not equal + + // Define PrimitiveGroupValueBuilder + let mut builder = PrimitiveGroupValueBuilder::::new(); + let builder_array = + Arc::new(Int64Array::from(vec![Some(0), Some(1)])) as ArrayRef; + builder.append_val(&builder_array, 0); + builder.append_val(&builder_array, 1); + + // Define input array + let input_array = Arc::new(Int64Array::from(vec![Some(0), Some(2)])) as ArrayRef; + + // Check + assert!(builder.equal_to(0, &input_array, 0)); + assert!(!builder.equal_to(1, &input_array, 1)); + } + + #[test] + fn test_byte_array_equal_to() { + // Will cover such cases: + // - exist null, input not null + // - exist null, input null; values not equal + // - exist null, input null; values equal + // - exist not null, input null + // - exist not null, input not null; values not equal + // - exist not null, input not null; values equal + + // Define PrimitiveGroupValueBuilder + let mut builder = ByteGroupValueBuilder::::new(OutputType::Utf8); + let builder_array = Arc::new(StringArray::from(vec![ + None, + None, + None, + Some("foo"), + Some("bar"), + Some("baz"), + ])) as ArrayRef; + builder.append_val(&builder_array, 0); + builder.append_val(&builder_array, 1); + builder.append_val(&builder_array, 2); + builder.append_val(&builder_array, 3); + builder.append_val(&builder_array, 4); + builder.append_val(&builder_array, 5); + + // Define input array + let (offsets, buffer, _nulls) = StringArray::from(vec![ + Some("foo"), + Some("bar"), + None, + None, + Some("foo"), + Some("baz"), + ]) + .into_parts(); + + // explicitly build a boolean buffer where one of the null values also happens to match + let mut boolean_buffer_builder = BooleanBufferBuilder::new(6); + boolean_buffer_builder.append(true); + boolean_buffer_builder.append(false); // this sets Some("bar") to null above + boolean_buffer_builder.append(false); + boolean_buffer_builder.append(false); + boolean_buffer_builder.append(true); + boolean_buffer_builder.append(true); + let nulls = NullBuffer::new(boolean_buffer_builder.finish()); + let input_array = + Arc::new(StringArray::new(offsets, buffer, Some(nulls))) as ArrayRef; + + // Check + assert!(!builder.equal_to(0, &input_array, 0)); + assert!(builder.equal_to(1, &input_array, 1)); + assert!(builder.equal_to(2, &input_array, 2)); + assert!(!builder.equal_to(3, &input_array, 3)); + assert!(!builder.equal_to(4, &input_array, 4)); + assert!(builder.equal_to(5, &input_array, 5)); + } } diff --git a/datafusion/physical-plan/src/aggregates/mod.rs b/datafusion/physical-plan/src/aggregates/mod.rs index 9466ff6dd459..f9dd973c814e 100644 --- a/datafusion/physical-plan/src/aggregates/mod.rs +++ b/datafusion/physical-plan/src/aggregates/mod.rs @@ -36,10 +36,11 @@ use crate::{ use arrow::array::ArrayRef; use arrow::datatypes::{Field, Schema, SchemaRef}; use arrow::record_batch::RecordBatch; +use arrow_array::{UInt16Array, UInt32Array, UInt64Array, UInt8Array}; use datafusion_common::stats::Precision; use datafusion_common::{internal_err, not_impl_err, Result}; use datafusion_execution::TaskContext; -use datafusion_expr::Accumulator; +use datafusion_expr::{Accumulator, Aggregate}; use datafusion_physical_expr::{ equivalence::{collapse_lex_req, ProjectionMapping}, expressions::Column, @@ -211,13 +212,99 @@ impl PhysicalGroupBy { .collect() } + /// The number of expressions in the output schema. + fn num_output_exprs(&self) -> usize { + let mut num_exprs = self.expr.len(); + if !self.is_single() { + num_exprs += 1 + } + num_exprs + } + /// Return grouping expressions as they occur in the output schema. pub fn output_exprs(&self) -> Vec> { - self.expr - .iter() - .enumerate() - .map(|(index, (_, name))| Arc::new(Column::new(name, index)) as _) - .collect() + let num_output_exprs = self.num_output_exprs(); + let mut output_exprs = Vec::with_capacity(num_output_exprs); + output_exprs.extend( + self.expr + .iter() + .enumerate() + .take(num_output_exprs) + .map(|(index, (_, name))| Arc::new(Column::new(name, index)) as _), + ); + if !self.is_single() { + output_exprs.push(Arc::new(Column::new( + Aggregate::INTERNAL_GROUPING_ID, + self.expr.len(), + )) as _); + } + output_exprs + } + + /// Returns the number expression as grouping keys. + fn num_group_exprs(&self) -> usize { + if self.is_single() { + self.expr.len() + } else { + self.expr.len() + 1 + } + } + + /// Returns the fields that are used as the grouping keys. + fn group_fields(&self, input_schema: &Schema) -> Result> { + let mut fields = Vec::with_capacity(self.num_group_exprs()); + for ((expr, name), group_expr_nullable) in + self.expr.iter().zip(self.exprs_nullable().into_iter()) + { + fields.push( + Field::new( + name, + expr.data_type(input_schema)?, + group_expr_nullable || expr.nullable(input_schema)?, + ) + .with_metadata( + get_field_metadata(expr, input_schema).unwrap_or_default(), + ), + ); + } + if !self.is_single() { + fields.push(Field::new( + Aggregate::INTERNAL_GROUPING_ID, + Aggregate::grouping_id_type(self.expr.len()), + false, + )); + } + Ok(fields) + } + + /// Returns the output fields of the group by. + /// + /// This might be different from the `group_fields` that might contain internal expressions that + /// should not be part of the output schema. + fn output_fields(&self, input_schema: &Schema) -> Result> { + let mut fields = self.group_fields(input_schema)?; + fields.truncate(self.num_output_exprs()); + Ok(fields) + } + + /// Returns the `PhysicalGroupBy` for a final aggregation if `self` is used for a partial + /// aggregation. + pub fn as_final(&self) -> PhysicalGroupBy { + let expr: Vec<_> = + self.output_exprs() + .into_iter() + .zip( + self.expr.iter().map(|t| t.1.clone()).chain(std::iter::once( + Aggregate::INTERNAL_GROUPING_ID.to_owned(), + )), + ) + .collect(); + let num_exprs = expr.len(); + Self { + expr, + null_expr: vec![], + groups: vec![vec![false; num_exprs]], + } } } @@ -321,13 +408,7 @@ impl AggregateExec { input: Arc, input_schema: SchemaRef, ) -> Result { - let schema = create_schema( - &input.schema(), - &group_by.expr, - &aggr_expr, - group_by.exprs_nullable(), - mode, - )?; + let schema = create_schema(&input.schema(), &group_by, &aggr_expr, mode)?; let schema = Arc::new(schema); AggregateExec::try_new_with_schema( @@ -789,25 +870,12 @@ impl ExecutionPlan for AggregateExec { fn create_schema( input_schema: &Schema, - group_expr: &[(Arc, String)], + group_by: &PhysicalGroupBy, aggr_expr: &[AggregateFunctionExpr], - group_expr_nullable: Vec, mode: AggregateMode, ) -> Result { - let mut fields = Vec::with_capacity(group_expr.len() + aggr_expr.len()); - for (index, (expr, name)) in group_expr.iter().enumerate() { - fields.push( - Field::new( - name, - expr.data_type(input_schema)?, - // In cases where we have multiple grouping sets, we will use NULL expressions in - // order to align the grouping sets. So the field must be nullable even if the underlying - // schema field is not. - group_expr_nullable[index] || expr.nullable(input_schema)?, - ) - .with_metadata(get_field_metadata(expr, input_schema).unwrap_or_default()), - ) - } + let mut fields = Vec::with_capacity(group_by.num_output_exprs() + aggr_expr.len()); + fields.extend(group_by.output_fields(input_schema)?); match mode { AggregateMode::Partial => { @@ -833,9 +901,8 @@ fn create_schema( )) } -fn group_schema(schema: &Schema, group_count: usize) -> SchemaRef { - let group_fields = schema.fields()[0..group_count].to_vec(); - Arc::new(Schema::new(group_fields)) +fn group_schema(input_schema: &Schema, group_by: &PhysicalGroupBy) -> Result { + Ok(Arc::new(Schema::new(group_by.group_fields(input_schema)?))) } /// Determines the lexical ordering requirement for an aggregate expression. @@ -1142,6 +1209,27 @@ fn evaluate_optional( .collect() } +fn group_id_array(group: &[bool], batch: &RecordBatch) -> Result { + if group.len() > 64 { + return not_impl_err!( + "Grouping sets with more than 64 columns are not supported" + ); + } + let group_id = group.iter().fold(0u64, |acc, &is_null| { + (acc << 1) | if is_null { 1 } else { 0 } + }); + let num_rows = batch.num_rows(); + if group.len() <= 8 { + Ok(Arc::new(UInt8Array::from(vec![group_id as u8; num_rows]))) + } else if group.len() <= 16 { + Ok(Arc::new(UInt16Array::from(vec![group_id as u16; num_rows]))) + } else if group.len() <= 32 { + Ok(Arc::new(UInt32Array::from(vec![group_id as u32; num_rows]))) + } else { + Ok(Arc::new(UInt64Array::from(vec![group_id; num_rows]))) + } +} + /// Evaluate a group by expression against a `RecordBatch` /// /// Arguments: @@ -1174,23 +1262,24 @@ pub(crate) fn evaluate_group_by( }) .collect::>>()?; - Ok(group_by + group_by .groups .iter() .map(|group| { - group - .iter() - .enumerate() - .map(|(idx, is_null)| { - if *is_null { - Arc::clone(&null_exprs[idx]) - } else { - Arc::clone(&exprs[idx]) - } - }) - .collect() + let mut group_values = Vec::with_capacity(group_by.num_group_exprs()); + group_values.extend(group.iter().enumerate().map(|(idx, is_null)| { + if *is_null { + Arc::clone(&null_exprs[idx]) + } else { + Arc::clone(&exprs[idx]) + } + })); + if !group_by.is_single() { + group_values.push(group_id_array(group, batch)?); + } + Ok(group_values) }) - .collect()) + .collect() } #[cfg(test)] @@ -1348,21 +1437,21 @@ mod tests { ) -> Result<()> { let input_schema = input.schema(); - let grouping_set = PhysicalGroupBy { - expr: vec![ + let grouping_set = PhysicalGroupBy::new( + vec![ (col("a", &input_schema)?, "a".to_string()), (col("b", &input_schema)?, "b".to_string()), ], - null_expr: vec![ + vec![ (lit(ScalarValue::UInt32(None)), "a".to_string()), (lit(ScalarValue::Float64(None)), "b".to_string()), ], - groups: vec![ + vec![ vec![false, true], // (a, NULL) vec![true, false], // (NULL, b) vec![false, false], // (a,b) ], - }; + ); let aggregates = vec![AggregateExprBuilder::new(count_udaf(), vec![lit(1i8)]) .schema(Arc::clone(&input_schema)) @@ -1392,63 +1481,56 @@ mod tests { // In spill mode, we test with the limited memory, if the mem usage exceeds, // we trigger the early emit rule, which turns out the partial aggregate result. vec![ - "+---+-----+-----------------+", - "| a | b | COUNT(1)[count] |", - "+---+-----+-----------------+", - "| | 1.0 | 1 |", - "| | 1.0 | 1 |", - "| | 2.0 | 1 |", - "| | 2.0 | 1 |", - "| | 3.0 | 1 |", - "| | 3.0 | 1 |", - "| | 4.0 | 1 |", - "| | 4.0 | 1 |", - "| 2 | | 1 |", - "| 2 | | 1 |", - "| 2 | 1.0 | 1 |", - "| 2 | 1.0 | 1 |", - "| 3 | | 1 |", - "| 3 | | 2 |", - "| 3 | 2.0 | 2 |", - "| 3 | 3.0 | 1 |", - "| 4 | | 1 |", - "| 4 | | 2 |", - "| 4 | 3.0 | 1 |", - "| 4 | 4.0 | 2 |", - "+---+-----+-----------------+", + "+---+-----+---------------+-----------------+", + "| a | b | __grouping_id | COUNT(1)[count] |", + "+---+-----+---------------+-----------------+", + "| | 1.0 | 2 | 1 |", + "| | 1.0 | 2 | 1 |", + "| | 2.0 | 2 | 1 |", + "| | 2.0 | 2 | 1 |", + "| | 3.0 | 2 | 1 |", + "| | 3.0 | 2 | 1 |", + "| | 4.0 | 2 | 1 |", + "| | 4.0 | 2 | 1 |", + "| 2 | | 1 | 1 |", + "| 2 | | 1 | 1 |", + "| 2 | 1.0 | 0 | 1 |", + "| 2 | 1.0 | 0 | 1 |", + "| 3 | | 1 | 1 |", + "| 3 | | 1 | 2 |", + "| 3 | 2.0 | 0 | 2 |", + "| 3 | 3.0 | 0 | 1 |", + "| 4 | | 1 | 1 |", + "| 4 | | 1 | 2 |", + "| 4 | 3.0 | 0 | 1 |", + "| 4 | 4.0 | 0 | 2 |", + "+---+-----+---------------+-----------------+", ] } else { vec![ - "+---+-----+-----------------+", - "| a | b | COUNT(1)[count] |", - "+---+-----+-----------------+", - "| | 1.0 | 2 |", - "| | 2.0 | 2 |", - "| | 3.0 | 2 |", - "| | 4.0 | 2 |", - "| 2 | | 2 |", - "| 2 | 1.0 | 2 |", - "| 3 | | 3 |", - "| 3 | 2.0 | 2 |", - "| 3 | 3.0 | 1 |", - "| 4 | | 3 |", - "| 4 | 3.0 | 1 |", - "| 4 | 4.0 | 2 |", - "+---+-----+-----------------+", + "+---+-----+---------------+-----------------+", + "| a | b | __grouping_id | COUNT(1)[count] |", + "+---+-----+---------------+-----------------+", + "| | 1.0 | 2 | 2 |", + "| | 2.0 | 2 | 2 |", + "| | 3.0 | 2 | 2 |", + "| | 4.0 | 2 | 2 |", + "| 2 | | 1 | 2 |", + "| 2 | 1.0 | 0 | 2 |", + "| 3 | | 1 | 3 |", + "| 3 | 2.0 | 0 | 2 |", + "| 3 | 3.0 | 0 | 1 |", + "| 4 | | 1 | 3 |", + "| 4 | 3.0 | 0 | 1 |", + "| 4 | 4.0 | 0 | 2 |", + "+---+-----+---------------+-----------------+", ] }; assert_batches_sorted_eq!(expected, &result); - let groups = partial_aggregate.group_expr().expr().to_vec(); - let merge = Arc::new(CoalescePartitionsExec::new(partial_aggregate)); - let final_group: Vec<(Arc, String)> = groups - .iter() - .map(|(_expr, name)| Ok((col(name, &input_schema)?, name.clone()))) - .collect::>()?; - - let final_grouping_set = PhysicalGroupBy::new_single(final_group); + let final_grouping_set = grouping_set.as_final(); let task_ctx = if spill { new_spill_ctx(4, 3160) @@ -1468,26 +1550,26 @@ mod tests { let result = common::collect(merged_aggregate.execute(0, Arc::clone(&task_ctx))?).await?; let batch = concat_batches(&result[0].schema(), &result)?; - assert_eq!(batch.num_columns(), 3); + assert_eq!(batch.num_columns(), 4); assert_eq!(batch.num_rows(), 12); let expected = vec![ - "+---+-----+----------+", - "| a | b | COUNT(1) |", - "+---+-----+----------+", - "| | 1.0 | 2 |", - "| | 2.0 | 2 |", - "| | 3.0 | 2 |", - "| | 4.0 | 2 |", - "| 2 | | 2 |", - "| 2 | 1.0 | 2 |", - "| 3 | | 3 |", - "| 3 | 2.0 | 2 |", - "| 3 | 3.0 | 1 |", - "| 4 | | 3 |", - "| 4 | 3.0 | 1 |", - "| 4 | 4.0 | 2 |", - "+---+-----+----------+", + "+---+-----+---------------+----------+", + "| a | b | __grouping_id | COUNT(1) |", + "+---+-----+---------------+----------+", + "| | 1.0 | 2 | 2 |", + "| | 2.0 | 2 | 2 |", + "| | 3.0 | 2 | 2 |", + "| | 4.0 | 2 | 2 |", + "| 2 | | 1 | 2 |", + "| 2 | 1.0 | 0 | 2 |", + "| 3 | | 1 | 3 |", + "| 3 | 2.0 | 0 | 2 |", + "| 3 | 3.0 | 0 | 1 |", + "| 4 | | 1 | 3 |", + "| 4 | 3.0 | 0 | 1 |", + "| 4 | 4.0 | 0 | 2 |", + "+---+-----+---------------+----------+", ]; assert_batches_sorted_eq!(&expected, &result); @@ -1503,11 +1585,11 @@ mod tests { async fn check_aggregates(input: Arc, spill: bool) -> Result<()> { let input_schema = input.schema(); - let grouping_set = PhysicalGroupBy { - expr: vec![(col("a", &input_schema)?, "a".to_string())], - null_expr: vec![], - groups: vec![vec![false]], - }; + let grouping_set = PhysicalGroupBy::new( + vec![(col("a", &input_schema)?, "a".to_string())], + vec![], + vec![vec![false]], + ); let aggregates: Vec = vec![ @@ -1563,13 +1645,7 @@ mod tests { let merge = Arc::new(CoalescePartitionsExec::new(partial_aggregate)); - let final_group: Vec<(Arc, String)> = grouping_set - .expr - .iter() - .map(|(_expr, name)| Ok((col(name, &input_schema)?, name.clone()))) - .collect::>()?; - - let final_grouping_set = PhysicalGroupBy::new_single(final_group); + let final_grouping_set = grouping_set.as_final(); let merged_aggregate = Arc::new(AggregateExec::try_new( AggregateMode::Final, @@ -1825,11 +1901,11 @@ mod tests { let task_ctx = Arc::new(task_ctx); let groups_none = PhysicalGroupBy::default(); - let groups_some = PhysicalGroupBy { - expr: vec![(col("a", &input_schema)?, "a".to_string())], - null_expr: vec![], - groups: vec![vec![false]], - }; + let groups_some = PhysicalGroupBy::new( + vec![(col("a", &input_schema)?, "a".to_string())], + vec![], + vec![vec![false]], + ); // something that allocates within the aggregator let aggregates_v0: Vec = @@ -2306,7 +2382,7 @@ mod tests { )?); let aggregate_exec = Arc::new(AggregateExec::try_new( - AggregateMode::Partial, + AggregateMode::Single, groups, aggregates.clone(), vec![None], @@ -2318,13 +2394,13 @@ mod tests { collect(aggregate_exec.execute(0, Arc::new(TaskContext::default()))?).await?; let expected = [ - "+-----+-----+-------+----------+", - "| a | b | const | 1[count] |", - "+-----+-----+-------+----------+", - "| | 0.0 | | 32768 |", - "| 0.0 | | | 32768 |", - "| | | 1 | 32768 |", - "+-----+-----+-------+----------+", + "+-----+-----+-------+---------------+-------+", + "| a | b | const | __grouping_id | 1 |", + "+-----+-----+-------+---------------+-------+", + "| | | 1 | 6 | 32768 |", + "| | 0.0 | | 5 | 32768 |", + "| 0.0 | | | 3 | 32768 |", + "+-----+-----+-------+---------------+-------+", ]; assert_batches_sorted_eq!(expected, &output); @@ -2638,30 +2714,30 @@ mod tests { .build()?, ]; - let grouping_set = PhysicalGroupBy { - expr: vec![ + let grouping_set = PhysicalGroupBy::new( + vec![ (col("a", &input_schema)?, "a".to_string()), (col("b", &input_schema)?, "b".to_string()), ], - null_expr: vec![ + vec![ (lit(ScalarValue::Float32(None)), "a".to_string()), (lit(ScalarValue::Float32(None)), "b".to_string()), ], - groups: vec![ + vec![ vec![false, true], // (a, NULL) vec![false, false], // (a,b) ], - }; + ); let aggr_schema = create_schema( &input_schema, - &grouping_set.expr, + &grouping_set, &aggr_expr, - grouping_set.exprs_nullable(), AggregateMode::Final, )?; let expected_schema = Schema::new(vec![ Field::new("a", DataType::Float32, false), Field::new("b", DataType::Float32, true), + Field::new("__grouping_id", DataType::UInt8, false), Field::new("COUNT(a)", DataType::Int64, false), ]); assert_eq!(aggr_schema, expected_schema); diff --git a/datafusion/physical-plan/src/aggregates/row_hash.rs b/datafusion/physical-plan/src/aggregates/row_hash.rs index a043905765ec..5121e6cc3b35 100644 --- a/datafusion/physical-plan/src/aggregates/row_hash.rs +++ b/datafusion/physical-plan/src/aggregates/row_hash.rs @@ -29,7 +29,7 @@ use crate::aggregates::{ }; use crate::metrics::{BaselineMetrics, MetricBuilder, RecordOutput}; use crate::sorts::sort::sort_batch; -use crate::sorts::streaming_merge; +use crate::sorts::streaming_merge::StreamingMergeBuilder; use crate::spill::{read_spill_as_stream, spill_record_batch_by_size}; use crate::stream::RecordBatchStreamAdapter; use crate::{aggregates, metrics, ExecutionPlan, PhysicalExpr}; @@ -38,7 +38,7 @@ use crate::{RecordBatchStream, SendableRecordBatchStream}; use arrow::array::*; use arrow::datatypes::SchemaRef; use arrow_schema::SortOptions; -use datafusion_common::{internal_datafusion_err, DataFusionError, Result}; +use datafusion_common::{internal_err, DataFusionError, Result}; use datafusion_execution::disk_manager::RefCountedTempFile; use datafusion_execution::memory_pool::proxy::VecAllocExt; use datafusion_execution::memory_pool::{MemoryConsumer, MemoryReservation}; @@ -449,13 +449,13 @@ impl GroupedHashAggregateStream { let aggregate_arguments = aggregates::aggregate_expressions( &agg.aggr_expr, &agg.mode, - agg_group_by.expr.len(), + agg_group_by.num_group_exprs(), )?; // arguments for aggregating spilled data is the same as the one for final aggregation let merging_aggregate_arguments = aggregates::aggregate_expressions( &agg.aggr_expr, &AggregateMode::Final, - agg_group_by.expr.len(), + agg_group_by.num_group_exprs(), )?; let filter_expressions = match agg.mode { @@ -473,7 +473,7 @@ impl GroupedHashAggregateStream { .map(create_group_accumulator) .collect::>()?; - let group_schema = group_schema(&agg_schema, agg_group_by.expr.len()); + let group_schema = group_schema(&agg.input().schema(), &agg_group_by)?; let spill_expr = group_schema .fields .into_iter() @@ -1001,15 +1001,14 @@ impl GroupedHashAggregateStream { streams.push(stream); } self.spill_state.is_stream_merging = true; - self.input = streaming_merge( - streams, - schema, - &self.spill_state.spill_expr, - self.baseline_metrics.clone(), - self.batch_size, - None, - self.reservation.new_empty(), - )?; + self.input = StreamingMergeBuilder::new() + .with_streams(streams) + .with_schema(schema) + .with_expressions(&self.spill_state.spill_expr) + .with_metrics(self.baseline_metrics.clone()) + .with_batch_size(self.batch_size) + .with_reservation(self.reservation.new_empty()) + .build()?; self.input_done = false; self.group_ordering = GroupOrdering::Full(GroupOrderingFull::new()); Ok(()) @@ -1081,13 +1080,14 @@ impl GroupedHashAggregateStream { /// Transforms input batch to intermediate aggregate state, without grouping it fn transform_to_states(&self, batch: RecordBatch) -> Result { - let group_values = evaluate_group_by(&self.group_by, &batch)?; + let mut group_values = evaluate_group_by(&self.group_by, &batch)?; let input_values = evaluate_many(&self.aggregate_arguments, &batch)?; let filter_values = evaluate_optional(&self.filter_expressions, &batch)?; - let mut output = group_values.first().cloned().ok_or_else(|| { - internal_datafusion_err!("group_values expected to have at least one element") - })?; + if group_values.len() != 1 { + return internal_err!("group_values expected to have single element"); + } + let mut output = group_values.swap_remove(0); let iter = self .accumulators diff --git a/datafusion/physical-plan/src/joins/cross_join.rs b/datafusion/physical-plan/src/joins/cross_join.rs index 11153556f253..a70645f3d6c0 100644 --- a/datafusion/physical-plan/src/joins/cross_join.rs +++ b/datafusion/physical-plan/src/joins/cross_join.rs @@ -69,15 +69,22 @@ impl CrossJoinExec { /// Create a new [CrossJoinExec]. pub fn new(left: Arc, right: Arc) -> Self { // left then right - let all_columns: Fields = { + let (all_columns, metadata) = { let left_schema = left.schema(); let right_schema = right.schema(); let left_fields = left_schema.fields().iter(); let right_fields = right_schema.fields().iter(); - left_fields.chain(right_fields).cloned().collect() + + let mut metadata = left_schema.metadata().clone(); + metadata.extend(right_schema.metadata().clone()); + + ( + left_fields.chain(right_fields).cloned().collect::(), + metadata, + ) }; - let schema = Arc::new(Schema::new(all_columns)); + let schema = Arc::new(Schema::new(all_columns).with_metadata(metadata)); let cache = Self::compute_properties(&left, &right, Arc::clone(&schema)); CrossJoinExec { left, diff --git a/datafusion/physical-plan/src/repartition/mod.rs b/datafusion/physical-plan/src/repartition/mod.rs index 4fd364cca4d0..f0f198319ee3 100644 --- a/datafusion/physical-plan/src/repartition/mod.rs +++ b/datafusion/physical-plan/src/repartition/mod.rs @@ -34,7 +34,7 @@ use crate::metrics::BaselineMetrics; use crate::repartition::distributor_channels::{ channels, partition_aware_channels, DistributionReceiver, DistributionSender, }; -use crate::sorts::streaming_merge; +use crate::sorts::streaming_merge::StreamingMergeBuilder; use crate::stream::RecordBatchStreamAdapter; use crate::{DisplayFormatType, ExecutionPlan, Partitioning, PlanProperties, Statistics}; @@ -637,15 +637,15 @@ impl ExecutionPlan for RepartitionExec { let merge_reservation = MemoryConsumer::new(format!("{}[Merge {partition}]", name)) .register(context.memory_pool()); - streaming_merge( - input_streams, - schema_captured, - &sort_exprs, - BaselineMetrics::new(&metrics, partition), - context.session_config().batch_size(), - fetch, - merge_reservation, - ) + StreamingMergeBuilder::new() + .with_streams(input_streams) + .with_schema(schema_captured) + .with_expressions(&sort_exprs) + .with_metrics(BaselineMetrics::new(&metrics, partition)) + .with_batch_size(context.session_config().batch_size()) + .with_fetch(fetch) + .with_reservation(merge_reservation) + .build() } else { Ok(Box::pin(RepartitionStream { num_input_partitions, diff --git a/datafusion/physical-plan/src/sorts/mod.rs b/datafusion/physical-plan/src/sorts/mod.rs index 7c084761fdc3..ab5df37ed327 100644 --- a/datafusion/physical-plan/src/sorts/mod.rs +++ b/datafusion/physical-plan/src/sorts/mod.rs @@ -28,4 +28,3 @@ mod stream; pub mod streaming_merge; pub use index::RowIndex; -pub(crate) use streaming_merge::streaming_merge; diff --git a/datafusion/physical-plan/src/sorts/sort.rs b/datafusion/physical-plan/src/sorts/sort.rs index 91816713c6c3..50f6f4a93097 100644 --- a/datafusion/physical-plan/src/sorts/sort.rs +++ b/datafusion/physical-plan/src/sorts/sort.rs @@ -30,7 +30,7 @@ use crate::limit::LimitStream; use crate::metrics::{ BaselineMetrics, Count, ExecutionPlanMetricsSet, MetricBuilder, MetricsSet, }; -use crate::sorts::streaming_merge::streaming_merge; +use crate::sorts::streaming_merge::StreamingMergeBuilder; use crate::spill::{read_spill_as_stream, spill_record_batches}; use crate::stream::RecordBatchStreamAdapter; use crate::topk::TopK; @@ -342,15 +342,15 @@ impl ExternalSorter { streams.push(stream); } - streaming_merge( - streams, - Arc::clone(&self.schema), - &self.expr, - self.metrics.baseline.clone(), - self.batch_size, - self.fetch, - self.reservation.new_empty(), - ) + StreamingMergeBuilder::new() + .with_streams(streams) + .with_schema(Arc::clone(&self.schema)) + .with_expressions(&self.expr) + .with_metrics(self.metrics.baseline.clone()) + .with_batch_size(self.batch_size) + .with_fetch(self.fetch) + .with_reservation(self.reservation.new_empty()) + .build() } else { self.in_mem_sort_stream(self.metrics.baseline.clone()) } @@ -534,15 +534,15 @@ impl ExternalSorter { }) .collect::>()?; - streaming_merge( - streams, - Arc::clone(&self.schema), - &self.expr, - metrics, - self.batch_size, - self.fetch, - self.merge_reservation.new_empty(), - ) + StreamingMergeBuilder::new() + .with_streams(streams) + .with_schema(Arc::clone(&self.schema)) + .with_expressions(&self.expr) + .with_metrics(metrics) + .with_batch_size(self.batch_size) + .with_fetch(self.fetch) + .with_reservation(self.merge_reservation.new_empty()) + .build() } /// Sorts a single `RecordBatch` into a single stream. diff --git a/datafusion/physical-plan/src/sorts/sort_preserving_merge.rs b/datafusion/physical-plan/src/sorts/sort_preserving_merge.rs index b00a11a5355f..3d3f9dcb98ee 100644 --- a/datafusion/physical-plan/src/sorts/sort_preserving_merge.rs +++ b/datafusion/physical-plan/src/sorts/sort_preserving_merge.rs @@ -24,7 +24,7 @@ use crate::common::spawn_buffered; use crate::expressions::PhysicalSortExpr; use crate::limit::LimitStream; use crate::metrics::{BaselineMetrics, ExecutionPlanMetricsSet, MetricsSet}; -use crate::sorts::streaming_merge; +use crate::sorts::streaming_merge::StreamingMergeBuilder; use crate::{ DisplayAs, DisplayFormatType, Distribution, ExecutionPlan, ExecutionPlanProperties, Partitioning, PlanProperties, SendableRecordBatchStream, Statistics, @@ -273,15 +273,15 @@ impl ExecutionPlan for SortPreservingMergeExec { debug!("Done setting up sender-receiver for SortPreservingMergeExec::execute"); - let result = streaming_merge( - receivers, - schema, - &self.expr, - BaselineMetrics::new(&self.metrics, partition), - context.session_config().batch_size(), - self.fetch, - reservation, - )?; + let result = StreamingMergeBuilder::new() + .with_streams(receivers) + .with_schema(schema) + .with_expressions(&self.expr) + .with_metrics(BaselineMetrics::new(&self.metrics, partition)) + .with_batch_size(context.session_config().batch_size()) + .with_fetch(self.fetch) + .with_reservation(reservation) + .build()?; debug!("Got stream result from SortPreservingMergeStream::new_from_receivers"); @@ -960,16 +960,15 @@ mod tests { MemoryConsumer::new("test").register(&task_ctx.runtime_env().memory_pool); let fetch = None; - let merge_stream = streaming_merge( - streams, - batches.schema(), - sort.as_slice(), - BaselineMetrics::new(&metrics, 0), - task_ctx.session_config().batch_size(), - fetch, - reservation, - ) - .unwrap(); + let merge_stream = StreamingMergeBuilder::new() + .with_streams(streams) + .with_schema(batches.schema()) + .with_expressions(sort.as_slice()) + .with_metrics(BaselineMetrics::new(&metrics, 0)) + .with_batch_size(task_ctx.session_config().batch_size()) + .with_fetch(fetch) + .with_reservation(reservation) + .build()?; let mut merged = common::collect(merge_stream).await.unwrap(); diff --git a/datafusion/physical-plan/src/sorts/streaming_merge.rs b/datafusion/physical-plan/src/sorts/streaming_merge.rs index 9e6618dd1af5..ad640d8e8470 100644 --- a/datafusion/physical-plan/src/sorts/streaming_merge.rs +++ b/datafusion/physical-plan/src/sorts/streaming_merge.rs @@ -49,49 +49,120 @@ macro_rules! merge_helper { }}; } -/// Perform a streaming merge of [`SendableRecordBatchStream`] based on provided sort expressions -/// while preserving order. -pub fn streaming_merge( +#[derive(Default)] +pub struct StreamingMergeBuilder<'a> { streams: Vec, - schema: SchemaRef, - expressions: &[PhysicalSortExpr], - metrics: BaselineMetrics, - batch_size: usize, + schema: Option, + expressions: &'a [PhysicalSortExpr], + metrics: Option, + batch_size: Option, fetch: Option, - reservation: MemoryReservation, -) -> Result { - // If there are no sort expressions, preserving the order - // doesn't mean anything (and result in infinite loops) - if expressions.is_empty() { - return internal_err!("Sort expressions cannot be empty for streaming merge"); + reservation: Option, +} + +impl<'a> StreamingMergeBuilder<'a> { + pub fn new() -> Self { + Self::default() } - // Special case single column comparisons with optimized cursor implementations - if expressions.len() == 1 { - let sort = expressions[0].clone(); - let data_type = sort.expr.data_type(schema.as_ref())?; - downcast_primitive! { - data_type => (primitive_merge_helper, sort, streams, schema, metrics, batch_size, fetch, reservation), - DataType::Utf8 => merge_helper!(StringArray, sort, streams, schema, metrics, batch_size, fetch, reservation) - DataType::LargeUtf8 => merge_helper!(LargeStringArray, sort, streams, schema, metrics, batch_size, fetch, reservation) - DataType::Binary => merge_helper!(BinaryArray, sort, streams, schema, metrics, batch_size, fetch, reservation) - DataType::LargeBinary => merge_helper!(LargeBinaryArray, sort, streams, schema, metrics, batch_size, fetch, reservation) - _ => {} - } + + pub fn with_streams(mut self, streams: Vec) -> Self { + self.streams = streams; + self } - let streams = RowCursorStream::try_new( - schema.as_ref(), - expressions, - streams, - reservation.new_empty(), - )?; - - Ok(Box::pin(SortPreservingMergeStream::new( - Box::new(streams), - schema, - metrics, - batch_size, - fetch, - reservation, - ))) + pub fn with_schema(mut self, schema: SchemaRef) -> Self { + self.schema = Some(schema); + self + } + + pub fn with_expressions(mut self, expressions: &'a [PhysicalSortExpr]) -> Self { + self.expressions = expressions; + self + } + + pub fn with_metrics(mut self, metrics: BaselineMetrics) -> Self { + self.metrics = Some(metrics); + self + } + + pub fn with_batch_size(mut self, batch_size: usize) -> Self { + self.batch_size = Some(batch_size); + self + } + + pub fn with_fetch(mut self, fetch: Option) -> Self { + self.fetch = fetch; + self + } + + pub fn with_reservation(mut self, reservation: MemoryReservation) -> Self { + self.reservation = Some(reservation); + self + } + + pub fn build(self) -> Result { + let Self { + streams, + schema, + metrics, + batch_size, + reservation, + fetch, + expressions, + } = self; + + // Early return if streams or expressions are empty + let checks = [ + ( + streams.is_empty(), + "Streams cannot be empty for streaming merge", + ), + ( + expressions.is_empty(), + "Sort expressions cannot be empty for streaming merge", + ), + ]; + + if let Some((_, error_message)) = checks.iter().find(|(condition, _)| *condition) + { + return internal_err!("{}", error_message); + } + + // Unwrapping mandatory fields + let schema = schema.expect("Schema cannot be empty for streaming merge"); + let metrics = metrics.expect("Metrics cannot be empty for streaming merge"); + let batch_size = + batch_size.expect("Batch size cannot be empty for streaming merge"); + let reservation = + reservation.expect("Reservation cannot be empty for streaming merge"); + + // Special case single column comparisons with optimized cursor implementations + if expressions.len() == 1 { + let sort = expressions[0].clone(); + let data_type = sort.expr.data_type(schema.as_ref())?; + downcast_primitive! { + data_type => (primitive_merge_helper, sort, streams, schema, metrics, batch_size, fetch, reservation), + DataType::Utf8 => merge_helper!(StringArray, sort, streams, schema, metrics, batch_size, fetch, reservation) + DataType::LargeUtf8 => merge_helper!(LargeStringArray, sort, streams, schema, metrics, batch_size, fetch, reservation) + DataType::Binary => merge_helper!(BinaryArray, sort, streams, schema, metrics, batch_size, fetch, reservation) + DataType::LargeBinary => merge_helper!(LargeBinaryArray, sort, streams, schema, metrics, batch_size, fetch, reservation) + _ => {} + } + } + + let streams = RowCursorStream::try_new( + schema.as_ref(), + expressions, + streams, + reservation.new_empty(), + )?; + Ok(Box::pin(SortPreservingMergeStream::new( + Box::new(streams), + schema, + metrics, + batch_size, + fetch, + reservation, + ))) + } } diff --git a/datafusion/physical-plan/src/union.rs b/datafusion/physical-plan/src/union.rs index 78b25686054d..1cf22060b62a 100644 --- a/datafusion/physical-plan/src/union.rs +++ b/datafusion/physical-plan/src/union.rs @@ -474,7 +474,16 @@ fn union_schema(inputs: &[Arc]) -> SchemaRef { .iter() .filter_map(|input| { if input.schema().fields().len() > i { - Some(input.schema().field(i).clone()) + let field = input.schema().field(i).clone(); + let right_hand_metdata = inputs + .get(1) + .map(|right_input| { + right_input.schema().field(i).metadata().clone() + }) + .unwrap_or_default(); + let mut metadata = field.metadata().clone(); + metadata.extend(right_hand_metdata); + Some(field.with_metadata(metadata)) } else { None } diff --git a/datafusion/proto/tests/cases/roundtrip_logical_plan.rs b/datafusion/proto/tests/cases/roundtrip_logical_plan.rs index 8a94f905812c..cd789e06dc3b 100644 --- a/datafusion/proto/tests/cases/roundtrip_logical_plan.rs +++ b/datafusion/proto/tests/cases/roundtrip_logical_plan.rs @@ -1060,6 +1060,10 @@ impl UserDefinedLogicalNodeCore for TopKPlanNode { expr: exprs.swap_remove(0), }) } + + fn supports_limit_pushdown(&self) -> bool { + false // Disallow limit push-down by default + } } #[derive(Debug)] diff --git a/datafusion/sql/src/expr/function.rs b/datafusion/sql/src/expr/function.rs index ddafc4e3a03a..20a772cdd088 100644 --- a/datafusion/sql/src/expr/function.rs +++ b/datafusion/sql/src/expr/function.rs @@ -432,6 +432,18 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { qualifier: None, options: WildcardOptions::default(), }), + FunctionArg::Unnamed(FunctionArgExpr::QualifiedWildcard(object_name)) => { + let qualifier = self.object_name_to_table_reference(object_name)?; + // sanity check on qualifier with schema + let qualified_indices = schema.fields_indices_with_qualified(&qualifier); + if qualified_indices.is_empty() { + return plan_err!("Invalid qualifier {qualifier}"); + } + Ok(Expr::Wildcard { + qualifier: Some(qualifier), + options: WildcardOptions::default(), + }) + } _ => not_impl_err!("Unsupported qualified wildcard argument: {sql:?}"), } } diff --git a/datafusion/sql/src/parser.rs b/datafusion/sql/src/parser.rs index 2df8d89c59bc..6d130647a49f 100644 --- a/datafusion/sql/src/parser.rs +++ b/datafusion/sql/src/parser.rs @@ -181,7 +181,7 @@ pub(crate) type LexOrdering = Vec; #[derive(Debug, Clone, PartialEq, Eq)] pub struct CreateExternalTable { /// Table name - pub name: String, + pub name: ObjectName, /// Optional schema pub columns: Vec, /// File type (Parquet, NDJSON, CSV, etc) @@ -813,7 +813,7 @@ impl<'a> DFParser<'a> { } let create = CreateExternalTable { - name: table_name.to_string(), + name: table_name, columns, file_type: builder.file_type.unwrap(), location: builder.location.unwrap(), @@ -915,8 +915,9 @@ mod tests { // positive case let sql = "CREATE EXTERNAL TABLE t(c1 int) STORED AS CSV LOCATION 'foo.csv'"; let display = None; + let name = ObjectName(vec![Ident::from("t")]); let expected = Statement::CreateExternalTable(CreateExternalTable { - name: "t".into(), + name: name.clone(), columns: vec![make_column_def("c1", DataType::Int(display))], file_type: "CSV".to_string(), location: "foo.csv".into(), @@ -932,7 +933,7 @@ mod tests { // positive case: leading space let sql = "CREATE EXTERNAL TABLE t(c1 int) STORED AS CSV LOCATION 'foo.csv' "; let expected = Statement::CreateExternalTable(CreateExternalTable { - name: "t".into(), + name: name.clone(), columns: vec![make_column_def("c1", DataType::Int(None))], file_type: "CSV".to_string(), location: "foo.csv".into(), @@ -949,7 +950,7 @@ mod tests { let sql = "CREATE EXTERNAL TABLE t(c1 int) STORED AS CSV LOCATION 'foo.csv' ;"; let expected = Statement::CreateExternalTable(CreateExternalTable { - name: "t".into(), + name: name.clone(), columns: vec![make_column_def("c1", DataType::Int(None))], file_type: "CSV".to_string(), location: "foo.csv".into(), @@ -966,7 +967,7 @@ mod tests { let sql = "CREATE EXTERNAL TABLE t(c1 int) STORED AS CSV LOCATION 'foo.csv' OPTIONS (format.delimiter '|')"; let display = None; let expected = Statement::CreateExternalTable(CreateExternalTable { - name: "t".into(), + name: name.clone(), columns: vec![make_column_def("c1", DataType::Int(display))], file_type: "CSV".to_string(), location: "foo.csv".into(), @@ -986,7 +987,7 @@ mod tests { let sql = "CREATE EXTERNAL TABLE t(c1 int) STORED AS CSV PARTITIONED BY (p1, p2) LOCATION 'foo.csv'"; let display = None; let expected = Statement::CreateExternalTable(CreateExternalTable { - name: "t".into(), + name: name.clone(), columns: vec![make_column_def("c1", DataType::Int(display))], file_type: "CSV".to_string(), location: "foo.csv".into(), @@ -1013,7 +1014,7 @@ mod tests { ]; for (sql, compression) in sqls { let expected = Statement::CreateExternalTable(CreateExternalTable { - name: "t".into(), + name: name.clone(), columns: vec![make_column_def("c1", DataType::Int(display))], file_type: "CSV".to_string(), location: "foo.csv".into(), @@ -1033,7 +1034,7 @@ mod tests { // positive case: it is ok for parquet files not to have columns specified let sql = "CREATE EXTERNAL TABLE t STORED AS PARQUET LOCATION 'foo.parquet'"; let expected = Statement::CreateExternalTable(CreateExternalTable { - name: "t".into(), + name: name.clone(), columns: vec![], file_type: "PARQUET".to_string(), location: "foo.parquet".into(), @@ -1049,7 +1050,7 @@ mod tests { // positive case: it is ok for parquet files to be other than upper case let sql = "CREATE EXTERNAL TABLE t STORED AS parqueT LOCATION 'foo.parquet'"; let expected = Statement::CreateExternalTable(CreateExternalTable { - name: "t".into(), + name: name.clone(), columns: vec![], file_type: "PARQUET".to_string(), location: "foo.parquet".into(), @@ -1065,7 +1066,7 @@ mod tests { // positive case: it is ok for avro files not to have columns specified let sql = "CREATE EXTERNAL TABLE t STORED AS AVRO LOCATION 'foo.avro'"; let expected = Statement::CreateExternalTable(CreateExternalTable { - name: "t".into(), + name: name.clone(), columns: vec![], file_type: "AVRO".to_string(), location: "foo.avro".into(), @@ -1082,7 +1083,7 @@ mod tests { let sql = "CREATE EXTERNAL TABLE IF NOT EXISTS t STORED AS PARQUET LOCATION 'foo.parquet'"; let expected = Statement::CreateExternalTable(CreateExternalTable { - name: "t".into(), + name: name.clone(), columns: vec![], file_type: "PARQUET".to_string(), location: "foo.parquet".into(), @@ -1099,7 +1100,7 @@ mod tests { let sql = "CREATE EXTERNAL TABLE t(c1 int) STORED AS CSV PARTITIONED BY (p1 int) LOCATION 'foo.csv'"; let expected = Statement::CreateExternalTable(CreateExternalTable { - name: "t".into(), + name: name.clone(), columns: vec![ make_column_def("c1", DataType::Int(None)), make_column_def("p1", DataType::Int(None)), @@ -1132,7 +1133,7 @@ mod tests { let sql = "CREATE EXTERNAL TABLE t STORED AS x OPTIONS ('k1' 'v1') LOCATION 'blahblah'"; let expected = Statement::CreateExternalTable(CreateExternalTable { - name: "t".into(), + name: name.clone(), columns: vec![], file_type: "X".to_string(), location: "blahblah".into(), @@ -1149,7 +1150,7 @@ mod tests { let sql = "CREATE EXTERNAL TABLE t STORED AS x OPTIONS ('k1' 'v1', k2 v2) LOCATION 'blahblah'"; let expected = Statement::CreateExternalTable(CreateExternalTable { - name: "t".into(), + name: name.clone(), columns: vec![], file_type: "X".to_string(), location: "blahblah".into(), @@ -1188,7 +1189,7 @@ mod tests { ]; for (sql, (asc, nulls_first)) in sqls.iter().zip(expected.into_iter()) { let expected = Statement::CreateExternalTable(CreateExternalTable { - name: "t".into(), + name: name.clone(), columns: vec![make_column_def("c1", DataType::Int(None))], file_type: "CSV".to_string(), location: "foo.csv".into(), @@ -1214,7 +1215,7 @@ mod tests { let sql = "CREATE EXTERNAL TABLE t(c1 int, c2 int) STORED AS CSV WITH ORDER (c1 ASC, c2 DESC NULLS FIRST) LOCATION 'foo.csv'"; let display = None; let expected = Statement::CreateExternalTable(CreateExternalTable { - name: "t".into(), + name: name.clone(), columns: vec![ make_column_def("c1", DataType::Int(display)), make_column_def("c2", DataType::Int(display)), @@ -1253,7 +1254,7 @@ mod tests { let sql = "CREATE EXTERNAL TABLE t(c1 int, c2 int) STORED AS CSV WITH ORDER (c1 - c2 ASC) LOCATION 'foo.csv'"; let display = None; let expected = Statement::CreateExternalTable(CreateExternalTable { - name: "t".into(), + name: name.clone(), columns: vec![ make_column_def("c1", DataType::Int(display)), make_column_def("c2", DataType::Int(display)), @@ -1297,7 +1298,7 @@ mod tests { 'TRUNCATE' 'NO', 'format.has_header' 'true')"; let expected = Statement::CreateExternalTable(CreateExternalTable { - name: "t".into(), + name: name.clone(), columns: vec![ make_column_def("c1", DataType::Int(None)), make_column_def("c2", DataType::Float(None)), diff --git a/datafusion/sql/src/planner.rs b/datafusion/sql/src/planner.rs index 5cbe1d7c014a..e8defedddf2c 100644 --- a/datafusion/sql/src/planner.rs +++ b/datafusion/sql/src/planner.rs @@ -197,9 +197,9 @@ impl PlannerContext { /// extends the FROM schema, returning the existing one, if any pub fn extend_outer_from_schema(&mut self, schema: &DFSchemaRef) -> Result<()> { - self.outer_from_schema = match self.outer_from_schema.as_ref() { - Some(from_schema) => Some(Arc::new(from_schema.join(schema)?)), - None => Some(Arc::clone(schema)), + match self.outer_from_schema.as_mut() { + Some(from_schema) => Arc::make_mut(from_schema).merge(schema), + None => self.outer_from_schema = Some(Arc::clone(schema)), }; Ok(()) } diff --git a/datafusion/sql/src/statement.rs b/datafusion/sql/src/statement.rs index 895285c59737..656d72d07ba2 100644 --- a/datafusion/sql/src/statement.rs +++ b/datafusion/sql/src/statement.rs @@ -1239,8 +1239,7 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { let ordered_exprs = self.build_order_by(order_exprs, &df_schema, &mut planner_context)?; - // External tables do not support schemas at the moment, so the name is just a table name - let name = TableReference::bare(name); + let name = self.object_name_to_table_reference(name)?; let constraints = Constraints::new_from_table_constraints(&all_constraints, &df_schema)?; Ok(LogicalPlan::Ddl(DdlStatement::CreateExternalTable( diff --git a/datafusion/sql/src/unparser/dialect.rs b/datafusion/sql/src/unparser/dialect.rs index d8a4fb254264..609e6f2240e1 100644 --- a/datafusion/sql/src/unparser/dialect.rs +++ b/datafusion/sql/src/unparser/dialect.rs @@ -18,12 +18,17 @@ use std::sync::Arc; use arrow_schema::TimeUnit; +use datafusion_expr::Expr; use regex::Regex; use sqlparser::{ - ast::{self, Ident, ObjectName, TimezoneInfo}, + ast::{self, Function, Ident, ObjectName, TimezoneInfo}, keywords::ALL_KEYWORDS, }; +use datafusion_common::Result; + +use super::{utils::date_part_to_sql, Unparser}; + /// `Dialect` to use for Unparsing /// /// The default dialect tries to avoid quoting identifiers unless necessary (e.g. `a` instead of `"a"`) @@ -108,6 +113,18 @@ pub trait Dialect: Send + Sync { fn supports_column_alias_in_table_alias(&self) -> bool { true } + + /// Allows the dialect to override scalar function unparsing if the dialect has specific rules. + /// Returns None if the default unparsing should be used, or Some(ast::Expr) if there is + /// a custom implementation for the function. + fn scalar_function_to_sql_overrides( + &self, + _unparser: &Unparser, + _func_name: &str, + _args: &[Expr], + ) -> Result> { + Ok(None) + } } /// `IntervalStyle` to use for unparsing @@ -171,6 +188,67 @@ impl Dialect for PostgreSqlDialect { fn float64_ast_dtype(&self) -> sqlparser::ast::DataType { sqlparser::ast::DataType::DoublePrecision } + + fn scalar_function_to_sql_overrides( + &self, + unparser: &Unparser, + func_name: &str, + args: &[Expr], + ) -> Result> { + if func_name == "round" { + return Ok(Some( + self.round_to_sql_enforce_numeric(unparser, func_name, args)?, + )); + } + + Ok(None) + } +} + +impl PostgreSqlDialect { + fn round_to_sql_enforce_numeric( + &self, + unparser: &Unparser, + func_name: &str, + args: &[Expr], + ) -> Result { + let mut args = unparser.function_args_to_sql(args)?; + + // Enforce the first argument to be Numeric + if let Some(ast::FunctionArg::Unnamed(ast::FunctionArgExpr::Expr(expr))) = + args.first_mut() + { + if let ast::Expr::Cast { data_type, .. } = expr { + // Don't create an additional cast wrapper if we can update the existing one + *data_type = ast::DataType::Numeric(ast::ExactNumberInfo::None); + } else { + // Wrap the expression in a new cast + *expr = ast::Expr::Cast { + kind: ast::CastKind::Cast, + expr: Box::new(expr.clone()), + data_type: ast::DataType::Numeric(ast::ExactNumberInfo::None), + format: None, + }; + } + } + + Ok(ast::Expr::Function(Function { + name: ast::ObjectName(vec![Ident { + value: func_name.to_string(), + quote_style: None, + }]), + args: ast::FunctionArguments::List(ast::FunctionArgumentList { + duplicate_treatment: None, + args, + clauses: vec![], + }), + filter: None, + null_treatment: None, + over: None, + within_group: vec![], + parameters: ast::FunctionArguments::None, + })) + } } pub struct MySqlDialect {} @@ -211,6 +289,19 @@ impl Dialect for MySqlDialect { ) -> ast::DataType { ast::DataType::Datetime(None) } + + fn scalar_function_to_sql_overrides( + &self, + unparser: &Unparser, + func_name: &str, + args: &[Expr], + ) -> Result> { + if func_name == "date_part" { + return date_part_to_sql(unparser, self.date_field_extract_style(), args); + } + + Ok(None) + } } pub struct SqliteDialect {} @@ -231,6 +322,19 @@ impl Dialect for SqliteDialect { fn supports_column_alias_in_table_alias(&self) -> bool { false } + + fn scalar_function_to_sql_overrides( + &self, + unparser: &Unparser, + func_name: &str, + args: &[Expr], + ) -> Result> { + if func_name == "date_part" { + return date_part_to_sql(unparser, self.date_field_extract_style(), args); + } + + Ok(None) + } } pub struct CustomDialect { @@ -339,6 +443,19 @@ impl Dialect for CustomDialect { fn supports_column_alias_in_table_alias(&self) -> bool { self.supports_column_alias_in_table_alias } + + fn scalar_function_to_sql_overrides( + &self, + unparser: &Unparser, + func_name: &str, + args: &[Expr], + ) -> Result> { + if func_name == "date_part" { + return date_part_to_sql(unparser, self.date_field_extract_style(), args); + } + + Ok(None) + } } /// `CustomDialectBuilder` to build `CustomDialect` using builder pattern diff --git a/datafusion/sql/src/unparser/expr.rs b/datafusion/sql/src/unparser/expr.rs index b924268a7657..537ac2274424 100644 --- a/datafusion/sql/src/unparser/expr.rs +++ b/datafusion/sql/src/unparser/expr.rs @@ -15,16 +15,15 @@ // specific language governing permissions and limitations // under the License. -use datafusion_expr::ScalarUDF; use sqlparser::ast::Value::SingleQuotedString; use sqlparser::ast::{ - self, BinaryOperator, Expr as AstExpr, Function, FunctionArg, Ident, Interval, - ObjectName, TimezoneInfo, UnaryOperator, + self, BinaryOperator, Expr as AstExpr, Function, Ident, Interval, ObjectName, + TimezoneInfo, UnaryOperator, }; use std::sync::Arc; use std::vec; -use super::dialect::{DateFieldExtractStyle, IntervalStyle}; +use super::dialect::IntervalStyle; use super::Unparser; use arrow::datatypes::{Decimal128Type, Decimal256Type, DecimalType}; use arrow::util::display::array_value_to_string; @@ -116,47 +115,14 @@ impl Unparser<'_> { Expr::ScalarFunction(ScalarFunction { func, args }) => { let func_name = func.name(); - if let Some(expr) = - self.scalar_function_to_sql_overrides(func_name, func, args) + if let Some(expr) = self + .dialect + .scalar_function_to_sql_overrides(self, func_name, args)? { return Ok(expr); } - let args = args - .iter() - .map(|e| { - if matches!( - e, - Expr::Wildcard { - qualifier: None, - .. - } - ) { - Ok(FunctionArg::Unnamed(ast::FunctionArgExpr::Wildcard)) - } else { - self.expr_to_sql_inner(e).map(|e| { - FunctionArg::Unnamed(ast::FunctionArgExpr::Expr(e)) - }) - } - }) - .collect::>>()?; - - Ok(ast::Expr::Function(Function { - name: ast::ObjectName(vec![Ident { - value: func_name.to_string(), - quote_style: None, - }]), - args: ast::FunctionArguments::List(ast::FunctionArgumentList { - duplicate_treatment: None, - args, - clauses: vec![], - }), - filter: None, - null_treatment: None, - over: None, - within_group: vec![], - parameters: ast::FunctionArguments::None, - })) + self.scalar_function_to_sql(func_name, args) } Expr::Between(Between { expr, @@ -508,6 +474,30 @@ impl Unparser<'_> { } } + pub fn scalar_function_to_sql( + &self, + func_name: &str, + args: &[Expr], + ) -> Result { + let args = self.function_args_to_sql(args)?; + Ok(ast::Expr::Function(Function { + name: ast::ObjectName(vec![Ident { + value: func_name.to_string(), + quote_style: None, + }]), + args: ast::FunctionArguments::List(ast::FunctionArgumentList { + duplicate_treatment: None, + args, + clauses: vec![], + }), + filter: None, + null_treatment: None, + over: None, + within_group: vec![], + parameters: ast::FunctionArguments::None, + })) + } + pub fn sort_to_sql(&self, sort: &Sort) -> Result { let Sort { expr, @@ -530,87 +520,6 @@ impl Unparser<'_> { }) } - fn scalar_function_to_sql_overrides( - &self, - func_name: &str, - _func: &Arc, - args: &[Expr], - ) -> Option { - if func_name.to_lowercase() == "date_part" { - match (self.dialect.date_field_extract_style(), args.len()) { - (DateFieldExtractStyle::Extract, 2) => { - let date_expr = self.expr_to_sql(&args[1]).ok()?; - - if let Expr::Literal(ScalarValue::Utf8(Some(field))) = &args[0] { - let field = match field.to_lowercase().as_str() { - "year" => ast::DateTimeField::Year, - "month" => ast::DateTimeField::Month, - "day" => ast::DateTimeField::Day, - "hour" => ast::DateTimeField::Hour, - "minute" => ast::DateTimeField::Minute, - "second" => ast::DateTimeField::Second, - _ => return None, - }; - - return Some(ast::Expr::Extract { - field, - expr: Box::new(date_expr), - syntax: ast::ExtractSyntax::From, - }); - } - } - (DateFieldExtractStyle::Strftime, 2) => { - let column = self.expr_to_sql(&args[1]).ok()?; - - if let Expr::Literal(ScalarValue::Utf8(Some(field))) = &args[0] { - let field = match field.to_lowercase().as_str() { - "year" => "%Y", - "month" => "%m", - "day" => "%d", - "hour" => "%H", - "minute" => "%M", - "second" => "%S", - _ => return None, - }; - - return Some(ast::Expr::Function(ast::Function { - name: ast::ObjectName(vec![ast::Ident { - value: "strftime".to_string(), - quote_style: None, - }]), - args: ast::FunctionArguments::List( - ast::FunctionArgumentList { - duplicate_treatment: None, - args: vec![ - ast::FunctionArg::Unnamed( - ast::FunctionArgExpr::Expr(ast::Expr::Value( - ast::Value::SingleQuotedString( - field.to_string(), - ), - )), - ), - ast::FunctionArg::Unnamed( - ast::FunctionArgExpr::Expr(column), - ), - ], - clauses: vec![], - }, - ), - filter: None, - null_treatment: None, - over: None, - within_group: vec![], - parameters: ast::FunctionArguments::None, - })); - } - } - _ => {} // no overrides for DateFieldExtractStyle::DatePart, because it's already a date_part - } - } - - None - } - fn ast_type_for_date64_in_cast(&self) -> ast::DataType { if self.dialect.use_timestamp_for_date64() { ast::DataType::Timestamp(None, ast::TimezoneInfo::None) @@ -665,7 +574,10 @@ impl Unparser<'_> { } } - fn function_args_to_sql(&self, args: &[Expr]) -> Result> { + pub(crate) fn function_args_to_sql( + &self, + args: &[Expr], + ) -> Result> { args.iter() .map(|e| { if matches!( @@ -1554,7 +1466,10 @@ mod tests { use datafusion_functions_aggregate::expr_fn::sum; use datafusion_functions_window::row_number::row_number_udwf; - use crate::unparser::dialect::{CustomDialect, CustomDialectBuilder}; + use crate::unparser::dialect::{ + CustomDialect, CustomDialectBuilder, DateFieldExtractStyle, Dialect, + PostgreSqlDialect, + }; use super::*; @@ -2428,4 +2343,39 @@ mod tests { assert_eq!(actual, expected); } } + + #[test] + fn test_round_scalar_fn_to_expr() -> Result<()> { + let default_dialect: Arc = Arc::new( + CustomDialectBuilder::new() + .with_identifier_quote_style('"') + .build(), + ); + let postgres_dialect: Arc = Arc::new(PostgreSqlDialect {}); + + for (dialect, identifier) in + [(default_dialect, "DOUBLE"), (postgres_dialect, "NUMERIC")] + { + let unparser = Unparser::new(dialect.as_ref()); + let expr = Expr::ScalarFunction(ScalarFunction { + func: Arc::new(ScalarUDF::from( + datafusion_functions::math::round::RoundFunc::new(), + )), + args: vec![ + Expr::Cast(Cast { + expr: Box::new(col("a")), + data_type: DataType::Float64, + }), + Expr::Literal(ScalarValue::Int64(Some(2))), + ], + }); + let ast = unparser.expr_to_sql(&expr)?; + + let actual = format!("{}", ast); + let expected = format!(r#"round(CAST("a" AS {identifier}), 2)"#); + + assert_eq!(actual, expected); + } + Ok(()) + } } diff --git a/datafusion/sql/src/unparser/plan.rs b/datafusion/sql/src/unparser/plan.rs index a76e26aa7d98..c4fcbb2d6458 100644 --- a/datafusion/sql/src/unparser/plan.rs +++ b/datafusion/sql/src/unparser/plan.rs @@ -38,7 +38,10 @@ use super::{ rewrite_plan_for_sort_on_non_projected_fields, subquery_alias_inner_query_and_columns, TableAliasRewriter, }, - utils::{find_agg_node_within_select, unproject_window_exprs, AggVariant}, + utils::{ + find_agg_node_within_select, find_window_nodes_within_select, + unproject_window_exprs, + }, Unparser, }; @@ -172,13 +175,17 @@ impl Unparser<'_> { p: &Projection, select: &mut SelectBuilder, ) -> Result<()> { - match find_agg_node_within_select(plan, None, true) { - Some(AggVariant::Aggregate(agg)) => { + match ( + find_agg_node_within_select(plan, true), + find_window_nodes_within_select(plan, None, true), + ) { + (Some(agg), window) => { + let window_option = window.as_deref(); let items = p .expr .iter() .map(|proj_expr| { - let unproj = unproject_agg_exprs(proj_expr, agg)?; + let unproj = unproject_agg_exprs(proj_expr, agg, window_option)?; self.select_item_to_sql(&unproj) }) .collect::>>()?; @@ -192,7 +199,7 @@ impl Unparser<'_> { vec![], )); } - Some(AggVariant::Window(window)) => { + (None, Some(window)) => { let items = p .expr .iter() @@ -204,7 +211,7 @@ impl Unparser<'_> { select.projection(items); } - None => { + _ => { let items = p .expr .iter() @@ -287,10 +294,10 @@ impl Unparser<'_> { self.select_to_sql_recursively(p.input.as_ref(), query, select, relation) } LogicalPlan::Filter(filter) => { - if let Some(AggVariant::Aggregate(agg)) = - find_agg_node_within_select(plan, None, select.already_projected()) + if let Some(agg) = + find_agg_node_within_select(plan, select.already_projected()) { - let unprojected = unproject_agg_exprs(&filter.predicate, agg)?; + let unprojected = unproject_agg_exprs(&filter.predicate, agg, None)?; let filter_expr = self.expr_to_sql(&unprojected)?; select.having(Some(filter_expr)); } else { diff --git a/datafusion/sql/src/unparser/utils.rs b/datafusion/sql/src/unparser/utils.rs index c1b3fe18f7e7..e05df8ba77fc 100644 --- a/datafusion/sql/src/unparser/utils.rs +++ b/datafusion/sql/src/unparser/utils.rs @@ -15,61 +15,91 @@ // specific language governing permissions and limitations // under the License. +use std::cmp::Ordering; + use datafusion_common::{ internal_err, tree_node::{Transformed, TreeNode}, - Result, + Column, DataFusionError, Result, ScalarValue, +}; +use datafusion_expr::{ + utils::grouping_set_to_exprlist, Aggregate, Expr, LogicalPlan, Window, }; -use datafusion_expr::{Aggregate, Expr, LogicalPlan, Window}; +use sqlparser::ast; + +use super::{dialect::DateFieldExtractStyle, Unparser}; -/// One of the possible aggregation plans which can be found within a single select query. -pub(crate) enum AggVariant<'a> { - Aggregate(&'a Aggregate), - Window(Vec<&'a Window>), +/// Recursively searches children of [LogicalPlan] to find an Aggregate node if exists +/// prior to encountering a Join, TableScan, or a nested subquery (derived table factor). +/// If an Aggregate or node is not found prior to this or at all before reaching the end +/// of the tree, None is returned. +pub(crate) fn find_agg_node_within_select( + plan: &LogicalPlan, + already_projected: bool, +) -> Option<&Aggregate> { + // Note that none of the nodes that have a corresponding node can have more + // than 1 input node. E.g. Projection / Filter always have 1 input node. + let input = plan.inputs(); + let input = if input.len() > 1 { + return None; + } else { + input.first()? + }; + // Agg nodes explicitly return immediately with a single node + if let LogicalPlan::Aggregate(agg) = input { + Some(agg) + } else if let LogicalPlan::TableScan(_) = input { + None + } else if let LogicalPlan::Projection(_) = input { + if already_projected { + None + } else { + find_agg_node_within_select(input, true) + } + } else { + find_agg_node_within_select(input, already_projected) + } } -/// Recursively searches children of [LogicalPlan] to find an Aggregate or window node if one exists +/// Recursively searches children of [LogicalPlan] to find Window nodes if exist /// prior to encountering a Join, TableScan, or a nested subquery (derived table factor). -/// If an Aggregate or window node is not found prior to this or at all before reaching the end -/// of the tree, None is returned. It is assumed that a Window and Aggregate node cannot both -/// be found in a single select query. -pub(crate) fn find_agg_node_within_select<'a>( +/// If Window node is not found prior to this or at all before reaching the end +/// of the tree, None is returned. +pub(crate) fn find_window_nodes_within_select<'a>( plan: &'a LogicalPlan, - mut prev_windows: Option>, + mut prev_windows: Option>, already_projected: bool, -) -> Option> { - // Note that none of the nodes that have a corresponding agg node can have more +) -> Option> { + // Note that none of the nodes that have a corresponding node can have more // than 1 input node. E.g. Projection / Filter always have 1 input node. let input = plan.inputs(); let input = if input.len() > 1 { - return None; + return prev_windows; } else { input.first()? }; - // Agg nodes explicitly return immediately with a single node // Window nodes accumulate in a vec until encountering a TableScan or 2nd projection match input { - LogicalPlan::Aggregate(agg) => Some(AggVariant::Aggregate(agg)), LogicalPlan::Window(window) => { prev_windows = match &mut prev_windows { - Some(AggVariant::Window(windows)) => { + Some(windows) => { windows.push(window); prev_windows } - _ => Some(AggVariant::Window(vec![window])), + _ => Some(vec![window]), }; - find_agg_node_within_select(input, prev_windows, already_projected) + find_window_nodes_within_select(input, prev_windows, already_projected) } LogicalPlan::Projection(_) => { if already_projected { prev_windows } else { - find_agg_node_within_select(input, prev_windows, true) + find_window_nodes_within_select(input, prev_windows, true) } } LogicalPlan::TableScan(_) => prev_windows, - _ => find_agg_node_within_select(input, prev_windows, already_projected), + _ => find_window_nodes_within_select(input, prev_windows, already_projected), } } @@ -78,22 +108,34 @@ pub(crate) fn find_agg_node_within_select<'a>( /// /// For example, if expr contains the column expr "COUNT(*)" it will be transformed /// into an actual aggregate expression COUNT(*) as identified in the aggregate node. -pub(crate) fn unproject_agg_exprs(expr: &Expr, agg: &Aggregate) -> Result { +pub(crate) fn unproject_agg_exprs( + expr: &Expr, + agg: &Aggregate, + windows: Option<&[&Window]>, +) -> Result { expr.clone() .transform(|sub_expr| { if let Expr::Column(c) = sub_expr { - // find the column in the agg schema - if let Ok(n) = agg.schema.index_of_column(&c) { - let unprojected_expr = agg - .group_expr - .iter() - .chain(agg.aggr_expr.iter()) - .nth(n) - .unwrap(); + if let Some(unprojected_expr) = find_agg_expr(agg, &c)? { Ok(Transformed::yes(unprojected_expr.clone())) + } else if let Some(mut unprojected_expr) = + windows.and_then(|w| find_window_expr(w, &c.name).cloned()) + { + if let Expr::WindowFunction(func) = &mut unprojected_expr { + // Window function can contain an aggregation column, e.g., 'avg(sum(ss_sales_price)) over ...' that needs to be unprojected + func.args.iter_mut().try_for_each(|arg| { + if let Expr::Column(c) = arg { + if let Some(expr) = find_agg_expr(agg, c)? { + *arg = expr.clone(); + } + } + Ok::<(), DataFusionError>(()) + })?; + } + Ok(Transformed::yes(unprojected_expr)) } else { internal_err!( - "Tried to unproject agg expr not found in provided Aggregate!" + "Tried to unproject agg expr for column '{}' that was not found in the provided Aggregate!", &c.name ) } } else { @@ -112,11 +154,7 @@ pub(crate) fn unproject_window_exprs(expr: &Expr, windows: &[&Window]) -> Result expr.clone() .transform(|sub_expr| { if let Expr::Column(c) = sub_expr { - if let Some(unproj) = windows - .iter() - .flat_map(|w| w.window_expr.iter()) - .find(|window_expr| window_expr.schema_name().to_string() == c.name) - { + if let Some(unproj) = find_window_expr(windows, &c.name) { Ok(Transformed::yes(unproj.clone())) } else { Ok(Transformed::no(Expr::Column(c))) @@ -127,3 +165,114 @@ pub(crate) fn unproject_window_exprs(expr: &Expr, windows: &[&Window]) -> Result }) .map(|e| e.data) } + +fn find_agg_expr<'a>(agg: &'a Aggregate, column: &Column) -> Result> { + if let Ok(index) = agg.schema.index_of_column(column) { + if matches!(agg.group_expr.as_slice(), [Expr::GroupingSet(_)]) { + // For grouping set expr, we must operate by expression list from the grouping set + let grouping_expr = grouping_set_to_exprlist(agg.group_expr.as_slice())?; + match index.cmp(&grouping_expr.len()) { + Ordering::Less => Ok(grouping_expr.into_iter().nth(index)), + Ordering::Equal => { + internal_err!( + "Tried to unproject column refereing to internal grouping id" + ) + } + Ordering::Greater => { + Ok(agg.aggr_expr.get(index - grouping_expr.len() - 1)) + } + } + } else { + Ok(agg.group_expr.iter().chain(agg.aggr_expr.iter()).nth(index)) + } + } else { + Ok(None) + } +} + +fn find_window_expr<'a>( + windows: &'a [&'a Window], + column_name: &'a str, +) -> Option<&'a Expr> { + windows + .iter() + .flat_map(|w| w.window_expr.iter()) + .find(|expr| expr.schema_name().to_string() == column_name) +} + +/// Converts a date_part function to SQL, tailoring it to the supported date field extraction style. +pub(crate) fn date_part_to_sql( + unparser: &Unparser, + style: DateFieldExtractStyle, + date_part_args: &[Expr], +) -> Result> { + match (style, date_part_args.len()) { + (DateFieldExtractStyle::Extract, 2) => { + let date_expr = unparser.expr_to_sql(&date_part_args[1])?; + if let Expr::Literal(ScalarValue::Utf8(Some(field))) = &date_part_args[0] { + let field = match field.to_lowercase().as_str() { + "year" => ast::DateTimeField::Year, + "month" => ast::DateTimeField::Month, + "day" => ast::DateTimeField::Day, + "hour" => ast::DateTimeField::Hour, + "minute" => ast::DateTimeField::Minute, + "second" => ast::DateTimeField::Second, + _ => return Ok(None), + }; + + return Ok(Some(ast::Expr::Extract { + field, + expr: Box::new(date_expr), + syntax: ast::ExtractSyntax::From, + })); + } + } + (DateFieldExtractStyle::Strftime, 2) => { + let column = unparser.expr_to_sql(&date_part_args[1])?; + + if let Expr::Literal(ScalarValue::Utf8(Some(field))) = &date_part_args[0] { + let field = match field.to_lowercase().as_str() { + "year" => "%Y", + "month" => "%m", + "day" => "%d", + "hour" => "%H", + "minute" => "%M", + "second" => "%S", + _ => return Ok(None), + }; + + return Ok(Some(ast::Expr::Function(ast::Function { + name: ast::ObjectName(vec![ast::Ident { + value: "strftime".to_string(), + quote_style: None, + }]), + args: ast::FunctionArguments::List(ast::FunctionArgumentList { + duplicate_treatment: None, + args: vec![ + ast::FunctionArg::Unnamed(ast::FunctionArgExpr::Expr( + ast::Expr::Value(ast::Value::SingleQuotedString( + field.to_string(), + )), + )), + ast::FunctionArg::Unnamed(ast::FunctionArgExpr::Expr(column)), + ], + clauses: vec![], + }), + filter: None, + null_treatment: None, + over: None, + within_group: vec![], + parameters: ast::FunctionArguments::None, + }))); + } + } + (DateFieldExtractStyle::DatePart, _) => { + return Ok(Some( + unparser.scalar_function_to_sql("date_part", date_part_args)?, + )); + } + _ => {} + }; + + Ok(None) +} diff --git a/datafusion/sql/src/utils.rs b/datafusion/sql/src/utils.rs index 656e4b851aa8..d8ad964be213 100644 --- a/datafusion/sql/src/utils.rs +++ b/datafusion/sql/src/utils.rs @@ -619,7 +619,9 @@ pub(crate) fn rewrite_recursive_unnest_bottom_up( } = original_expr.clone().rewrite(&mut rewriter)?; if !transformed { - if matches!(&transformed_expr, Expr::Column(_)) { + if matches!(&transformed_expr, Expr::Column(_)) + || matches!(&transformed_expr, Expr::Wildcard { .. }) + { push_projection_dedupl(inner_projection_exprs, transformed_expr.clone()); Ok(vec![transformed_expr]) } else { diff --git a/datafusion/sql/tests/cases/plan_to_sql.rs b/datafusion/sql/tests/cases/plan_to_sql.rs index 49f4720ed137..903d4e28520b 100644 --- a/datafusion/sql/tests/cases/plan_to_sql.rs +++ b/datafusion/sql/tests/cases/plan_to_sql.rs @@ -149,6 +149,26 @@ fn roundtrip_statement() -> Result<()> { "WITH w1 AS (SELECT 'a' as col), w2 AS (SELECT 'b' as col), w3 as (SELECT 'c' as col) SELECT * FROM w1 UNION ALL SELECT * FROM w2 UNION ALL SELECT * FROM w3", "WITH w1 AS (SELECT 'a' as col), w2 AS (SELECT 'b' as col), w3 as (SELECT 'c' as col), w4 as (SELECT 'd' as col) SELECT * FROM w1 UNION ALL SELECT * FROM w2 UNION ALL SELECT * FROM w3 UNION ALL SELECT * FROM w4", "WITH w1 AS (SELECT 'a' as col), w2 AS (SELECT 'b' as col) SELECT * FROM w1 JOIN w2 ON w1.col = w2.col UNION ALL SELECT * FROM w1 JOIN w2 ON w1.col = w2.col UNION ALL SELECT * FROM w1 JOIN w2 ON w1.col = w2.col", + r#"SELECT id, first_name, + SUM(id) AS total_sum, + SUM(id) OVER (PARTITION BY first_name ROWS BETWEEN 5 PRECEDING AND 2 FOLLOWING) AS moving_sum, + MAX(SUM(id)) OVER (PARTITION BY first_name ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) AS max_total + FROM person JOIN orders ON person.id = orders.customer_id GROUP BY id, first_name"#, + r#"SELECT id, first_name, + SUM(id) AS total_sum, + SUM(id) OVER (PARTITION BY first_name ROWS BETWEEN 5 PRECEDING AND 2 FOLLOWING) AS moving_sum, + MAX(SUM(id)) OVER (PARTITION BY first_name ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) AS max_total + FROM (SELECT id, first_name from person) person JOIN (SELECT customer_id FROM orders) orders ON person.id = orders.customer_id GROUP BY id, first_name"#, + r#"SELECT id, first_name, last_name, customer_id, SUM(id) AS total_sum + FROM person + JOIN orders ON person.id = orders.customer_id + GROUP BY ROLLUP(id, first_name, last_name, customer_id)"#, + r#"SELECT id, first_name, last_name, + SUM(id) AS total_sum, + COUNT(*) AS total_count, + SUM(id) OVER (ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS running_total + FROM person + GROUP BY GROUPING SETS ((id, first_name, last_name), (first_name, last_name), (last_name))"#, ]; // For each test sql string, we transform as follows: @@ -164,6 +184,7 @@ fn roundtrip_statement() -> Result<()> { let state = MockSessionState::default() .with_aggregate_function(sum_udaf()) .with_aggregate_function(count_udaf()) + .with_aggregate_function(max_udaf()) .with_expr_planner(Arc::new(CoreFunctionPlanner::default())); let context = MockContextProvider { state }; let sql_to_rel = SqlToRel::new(&context); diff --git a/datafusion/sql/tests/sql_integration.rs b/datafusion/sql/tests/sql_integration.rs index 5c9655a55606..44b591fedef8 100644 --- a/datafusion/sql/tests/sql_integration.rs +++ b/datafusion/sql/tests/sql_integration.rs @@ -1913,6 +1913,13 @@ fn create_external_table_with_pk() { quick_test(sql, expected); } +#[test] +fn create_external_table_wih_schema() { + let sql = "CREATE EXTERNAL TABLE staging.foo STORED AS CSV LOCATION 'foo.csv'"; + let expected = "CreateExternalTable: Partial { schema: \"staging\", table: \"foo\" }"; + quick_test(sql, expected); +} + #[test] fn create_schema_with_quoted_name() { let sql = "CREATE SCHEMA \"quoted_schema_name\""; diff --git a/datafusion/sqllogictest/src/test_context.rs b/datafusion/sqllogictest/src/test_context.rs index d3ee720467b6..9a0db1c41c71 100644 --- a/datafusion/sqllogictest/src/test_context.rs +++ b/datafusion/sqllogictest/src/test_context.rs @@ -314,8 +314,13 @@ pub async fn register_metadata_tables(ctx: &SessionContext) { String::from("metadata_key"), String::from("the name field"), )])); + let l_name = + Field::new("l_name", DataType::Utf8, true).with_metadata(HashMap::from([( + String::from("metadata_key"), + String::from("the l_name field"), + )])); - let schema = Schema::new(vec![id, name]).with_metadata(HashMap::from([( + let schema = Schema::new(vec![id, name, l_name]).with_metadata(HashMap::from([( String::from("metadata_key"), String::from("the entire schema"), )])); @@ -325,6 +330,7 @@ pub async fn register_metadata_tables(ctx: &SessionContext) { vec![ Arc::new(Int32Array::from(vec![Some(1), None, Some(3)])) as _, Arc::new(StringArray::from(vec![None, Some("bar"), Some("baz")])) as _, + Arc::new(StringArray::from(vec![None, Some("l_bar"), Some("l_baz")])) as _, ], ) .unwrap(); diff --git a/datafusion/sqllogictest/test_files/aggregate.slt b/datafusion/sqllogictest/test_files/aggregate.slt index 46327534e7de..250fa85cddef 100644 --- a/datafusion/sqllogictest/test_files/aggregate.slt +++ b/datafusion/sqllogictest/test_files/aggregate.slt @@ -1124,6 +1124,14 @@ SELECT COUNT(*) FROM aggregate_test_100 ---- 100 +query I +SELECT COUNT(aggregate_test_100.*) FROM aggregate_test_100 +---- +100 + +query error Error during planning: Invalid qualifier foo +SELECT COUNT(foo.*) FROM aggregate_test_100 + # csv_query_count_literal query I SELECT COUNT(2) FROM aggregate_test_100 @@ -3512,6 +3520,18 @@ SELECT MIN(value), MAX(value) FROM integers_with_nulls ---- 1 5 +# grouping_sets with null values +query II rowsort +SELECT value, min(value) FROM integers_with_nulls GROUP BY CUBE(value) +---- +1 1 +3 3 +4 4 +5 5 +NULL 1 +NULL NULL + + statement ok DROP TABLE integers_with_nulls; @@ -4871,16 +4891,18 @@ query TT EXPLAIN SELECT c2, c3 FROM aggregate_test_100 group by rollup(c2, c3) limit 3; ---- logical_plan -01)Limit: skip=0, fetch=3 -02)--Aggregate: groupBy=[[ROLLUP (aggregate_test_100.c2, aggregate_test_100.c3)]], aggr=[[]] -03)----TableScan: aggregate_test_100 projection=[c2, c3] +01)Projection: aggregate_test_100.c2, aggregate_test_100.c3 +02)--Limit: skip=0, fetch=3 +03)----Aggregate: groupBy=[[ROLLUP (aggregate_test_100.c2, aggregate_test_100.c3)]], aggr=[[]] +04)------TableScan: aggregate_test_100 projection=[c2, c3] physical_plan -01)GlobalLimitExec: skip=0, fetch=3 -02)--AggregateExec: mode=Final, gby=[c2@0 as c2, c3@1 as c3], aggr=[], lim=[3] -03)----CoalescePartitionsExec -04)------AggregateExec: mode=Partial, gby=[(NULL as c2, NULL as c3), (c2@0 as c2, NULL as c3), (c2@0 as c2, c3@1 as c3)], aggr=[] -05)--------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 -06)----------CsvExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, projection=[c2, c3], has_header=true +01)ProjectionExec: expr=[c2@0 as c2, c3@1 as c3] +02)--GlobalLimitExec: skip=0, fetch=3 +03)----AggregateExec: mode=Final, gby=[c2@0 as c2, c3@1 as c3, __grouping_id@2 as __grouping_id], aggr=[], lim=[3] +04)------CoalescePartitionsExec +05)--------AggregateExec: mode=Partial, gby=[(NULL as c2, NULL as c3), (c2@0 as c2, NULL as c3), (c2@0 as c2, c3@1 as c3)], aggr=[] +06)----------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 +07)------------CsvExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, projection=[c2, c3], has_header=true query II SELECT c2, c3 FROM aggregate_test_100 group by rollup(c2, c3) limit 3; diff --git a/datafusion/sqllogictest/test_files/array.slt b/datafusion/sqllogictest/test_files/array.slt index b7d60b50586d..bcd80cbe9b24 100644 --- a/datafusion/sqllogictest/test_files/array.slt +++ b/datafusion/sqllogictest/test_files/array.slt @@ -6595,7 +6595,7 @@ select make_array(1, 2.0, null, 3) query ? select make_array(1.0, '2', null) ---- -[1.0, 2, ] +[1.0, 2.0, ] ### FixedSizeListArray diff --git a/datafusion/sqllogictest/test_files/create_external_table.slt b/datafusion/sqllogictest/test_files/create_external_table.slt index 12b097c3d5d1..9ac2ecdce7cc 100644 --- a/datafusion/sqllogictest/test_files/create_external_table.slt +++ b/datafusion/sqllogictest/test_files/create_external_table.slt @@ -275,3 +275,15 @@ DROP TABLE t; # query should fail with bad column statement error DataFusion error: Error during planning: Column foo is not in schema CREATE EXTERNAL TABLE t STORED AS parquet LOCATION '../../parquet-testing/data/alltypes_plain.parquet' WITH ORDER (foo); + +# Create external table with qualified name should belong to the schema +statement ok +CREATE SCHEMA staging; + +statement ok +CREATE EXTERNAL TABLE staging.foo STORED AS parquet LOCATION '../../parquet-testing/data/alltypes_plain.parquet'; + +# Create external table with qualified name, but no schema should error +statement error DataFusion error: Error during planning: failed to resolve schema: release +CREATE EXTERNAL TABLE release.bar STORED AS parquet LOCATION '../../parquet-testing/data/alltypes_plain.parquet'; + diff --git a/datafusion/sqllogictest/test_files/cse.slt b/datafusion/sqllogictest/test_files/cse.slt index 19b47fa50e41..9f0f654179e9 100644 --- a/datafusion/sqllogictest/test_files/cse.slt +++ b/datafusion/sqllogictest/test_files/cse.slt @@ -179,8 +179,8 @@ physical_plan # Surely only once but also conditionally evaluated expressions query TT EXPLAIN SELECT - (a = 1 OR random() = 0) AND a = 1 AS c1, - (a = 2 AND random() = 0) OR a = 2 AS c2, + (a = 1 OR random() = 0) AND a = 2 AS c1, + (a = 2 AND random() = 0) OR a = 1 AS c2, CASE WHEN a + 3 = 0 THEN a + 3 ELSE 0 END AS c3, CASE WHEN a + 4 = 0 THEN 0 WHEN a + 4 THEN 0 ELSE 0 END AS c4, CASE WHEN a + 5 = 0 THEN 0 WHEN random() = 0 THEN a + 5 ELSE 0 END AS c5, @@ -188,11 +188,11 @@ EXPLAIN SELECT FROM t1 ---- logical_plan -01)Projection: (__common_expr_1 OR random() = Float64(0)) AND __common_expr_1 AS c1, __common_expr_2 AND random() = Float64(0) OR __common_expr_2 AS c2, CASE WHEN __common_expr_3 = Float64(0) THEN __common_expr_3 ELSE Float64(0) END AS c3, CASE WHEN __common_expr_4 = Float64(0) THEN Int64(0) WHEN CAST(__common_expr_4 AS Boolean) THEN Int64(0) ELSE Int64(0) END AS c4, CASE WHEN __common_expr_5 = Float64(0) THEN Float64(0) WHEN random() = Float64(0) THEN __common_expr_5 ELSE Float64(0) END AS c5, CASE WHEN __common_expr_6 = Float64(0) THEN Float64(0) ELSE __common_expr_6 END AS c6 +01)Projection: (__common_expr_1 OR random() = Float64(0)) AND __common_expr_2 AS c1, __common_expr_2 AND random() = Float64(0) OR __common_expr_1 AS c2, CASE WHEN __common_expr_3 = Float64(0) THEN __common_expr_3 ELSE Float64(0) END AS c3, CASE WHEN __common_expr_4 = Float64(0) THEN Int64(0) WHEN CAST(__common_expr_4 AS Boolean) THEN Int64(0) ELSE Int64(0) END AS c4, CASE WHEN __common_expr_5 = Float64(0) THEN Float64(0) WHEN random() = Float64(0) THEN __common_expr_5 ELSE Float64(0) END AS c5, CASE WHEN __common_expr_6 = Float64(0) THEN Float64(0) ELSE __common_expr_6 END AS c6 02)--Projection: t1.a = Float64(1) AS __common_expr_1, t1.a = Float64(2) AS __common_expr_2, t1.a + Float64(3) AS __common_expr_3, t1.a + Float64(4) AS __common_expr_4, t1.a + Float64(5) AS __common_expr_5, t1.a + Float64(6) AS __common_expr_6 03)----TableScan: t1 projection=[a] physical_plan -01)ProjectionExec: expr=[(__common_expr_1@0 OR random() = 0) AND __common_expr_1@0 as c1, __common_expr_2@1 AND random() = 0 OR __common_expr_2@1 as c2, CASE WHEN __common_expr_3@2 = 0 THEN __common_expr_3@2 ELSE 0 END as c3, CASE WHEN __common_expr_4@3 = 0 THEN 0 WHEN CAST(__common_expr_4@3 AS Boolean) THEN 0 ELSE 0 END as c4, CASE WHEN __common_expr_5@4 = 0 THEN 0 WHEN random() = 0 THEN __common_expr_5@4 ELSE 0 END as c5, CASE WHEN __common_expr_6@5 = 0 THEN 0 ELSE __common_expr_6@5 END as c6] +01)ProjectionExec: expr=[(__common_expr_1@0 OR random() = 0) AND __common_expr_2@1 as c1, __common_expr_2@1 AND random() = 0 OR __common_expr_1@0 as c2, CASE WHEN __common_expr_3@2 = 0 THEN __common_expr_3@2 ELSE 0 END as c3, CASE WHEN __common_expr_4@3 = 0 THEN 0 WHEN CAST(__common_expr_4@3 AS Boolean) THEN 0 ELSE 0 END as c4, CASE WHEN __common_expr_5@4 = 0 THEN 0 WHEN random() = 0 THEN __common_expr_5@4 ELSE 0 END as c5, CASE WHEN __common_expr_6@5 = 0 THEN 0 ELSE __common_expr_6@5 END as c6] 02)--ProjectionExec: expr=[a@0 = 1 as __common_expr_1, a@0 = 2 as __common_expr_2, a@0 + 3 as __common_expr_3, a@0 + 4 as __common_expr_4, a@0 + 5 as __common_expr_5, a@0 + 6 as __common_expr_6] 03)----MemoryExec: partitions=1, partition_sizes=[0] @@ -217,8 +217,8 @@ physical_plan # Only conditionally evaluated expressions query TT EXPLAIN SELECT - (random() = 0 OR a = 1) AND a = 1 AS c1, - (random() = 0 AND a = 2) OR a = 2 AS c2, + (random() = 0 OR a = 1) AND a = 2 AS c1, + (random() = 0 AND a = 2) OR a = 1 AS c2, CASE WHEN random() = 0 THEN a + 3 ELSE a + 3 END AS c3, CASE WHEN random() = 0 THEN 0 WHEN a + 4 = 0 THEN a + 4 ELSE 0 END AS c4, CASE WHEN random() = 0 THEN 0 WHEN a + 5 = 0 THEN 0 ELSE a + 5 END AS c5, @@ -226,8 +226,8 @@ EXPLAIN SELECT FROM t1 ---- logical_plan -01)Projection: (random() = Float64(0) OR t1.a = Float64(1)) AND t1.a = Float64(1) AS c1, random() = Float64(0) AND t1.a = Float64(2) OR t1.a = Float64(2) AS c2, CASE WHEN random() = Float64(0) THEN t1.a + Float64(3) ELSE t1.a + Float64(3) END AS c3, CASE WHEN random() = Float64(0) THEN Float64(0) WHEN t1.a + Float64(4) = Float64(0) THEN t1.a + Float64(4) ELSE Float64(0) END AS c4, CASE WHEN random() = Float64(0) THEN Float64(0) WHEN t1.a + Float64(5) = Float64(0) THEN Float64(0) ELSE t1.a + Float64(5) END AS c5, CASE WHEN random() = Float64(0) THEN Float64(0) WHEN random() = Float64(0) THEN t1.a + Float64(6) ELSE t1.a + Float64(6) END AS c6 +01)Projection: (random() = Float64(0) OR t1.a = Float64(1)) AND t1.a = Float64(2) AS c1, random() = Float64(0) AND t1.a = Float64(2) OR t1.a = Float64(1) AS c2, CASE WHEN random() = Float64(0) THEN t1.a + Float64(3) ELSE t1.a + Float64(3) END AS c3, CASE WHEN random() = Float64(0) THEN Float64(0) WHEN t1.a + Float64(4) = Float64(0) THEN t1.a + Float64(4) ELSE Float64(0) END AS c4, CASE WHEN random() = Float64(0) THEN Float64(0) WHEN t1.a + Float64(5) = Float64(0) THEN Float64(0) ELSE t1.a + Float64(5) END AS c5, CASE WHEN random() = Float64(0) THEN Float64(0) WHEN random() = Float64(0) THEN t1.a + Float64(6) ELSE t1.a + Float64(6) END AS c6 02)--TableScan: t1 projection=[a] physical_plan -01)ProjectionExec: expr=[(random() = 0 OR a@0 = 1) AND a@0 = 1 as c1, random() = 0 AND a@0 = 2 OR a@0 = 2 as c2, CASE WHEN random() = 0 THEN a@0 + 3 ELSE a@0 + 3 END as c3, CASE WHEN random() = 0 THEN 0 WHEN a@0 + 4 = 0 THEN a@0 + 4 ELSE 0 END as c4, CASE WHEN random() = 0 THEN 0 WHEN a@0 + 5 = 0 THEN 0 ELSE a@0 + 5 END as c5, CASE WHEN random() = 0 THEN 0 WHEN random() = 0 THEN a@0 + 6 ELSE a@0 + 6 END as c6] +01)ProjectionExec: expr=[(random() = 0 OR a@0 = 1) AND a@0 = 2 as c1, random() = 0 AND a@0 = 2 OR a@0 = 1 as c2, CASE WHEN random() = 0 THEN a@0 + 3 ELSE a@0 + 3 END as c3, CASE WHEN random() = 0 THEN 0 WHEN a@0 + 4 = 0 THEN a@0 + 4 ELSE 0 END as c4, CASE WHEN random() = 0 THEN 0 WHEN a@0 + 5 = 0 THEN 0 ELSE a@0 + 5 END as c5, CASE WHEN random() = 0 THEN 0 WHEN random() = 0 THEN a@0 + 6 ELSE a@0 + 6 END as c6] 02)--MemoryExec: partitions=1, partition_sizes=[0] diff --git a/datafusion/sqllogictest/test_files/dynamic_file.slt b/datafusion/sqllogictest/test_files/dynamic_file.slt index e177fd3de243..69f9a43ad407 100644 --- a/datafusion/sqllogictest/test_files/dynamic_file.slt +++ b/datafusion/sqllogictest/test_files/dynamic_file.slt @@ -25,9 +25,170 @@ SELECT * FROM '../core/tests/data/partitioned_table_arrow/part=123' ORDER BY f0; 1 foo true 2 bar false -# dynamic file query doesn't support partitioned table -statement error DataFusion error: Error during planning: table 'datafusion.public.../core/tests/data/partitioned_table_arrow' not found -SELECT * FROM '../core/tests/data/partitioned_table_arrow' ORDER BY f0; +# Read partitioned file +statement ok +CREATE TABLE src_table_1 ( + int_col INT, + string_col TEXT, + bigint_col BIGINT, + partition_col INT +) AS VALUES +(1, 'aaa', 100, 1), +(2, 'bbb', 200, 1), +(3, 'ccc', 300, 1), +(4, 'ddd', 400, 1); + +statement ok +CREATE TABLE src_table_2 ( + int_col INT, + string_col TEXT, + bigint_col BIGINT, + partition_col INT +) AS VALUES +(5, 'eee', 500, 2), +(6, 'fff', 600, 2), +(7, 'ggg', 700, 2), +(8, 'hhh', 800, 2); + +# Read partitioned csv file + +query I +COPY src_table_1 TO 'test_files/scratch/dynamic_file/csv_partitions' +STORED AS CSV +PARTITIONED BY (partition_col); +---- +4 + +query I +COPY src_table_2 TO 'test_files/scratch/dynamic_file/csv_partitions' +STORED AS CSV +PARTITIONED BY (partition_col); +---- +4 + +query ITIT rowsort +SELECT int_col, string_col, bigint_col, partition_col FROM 'test_files/scratch/dynamic_file/csv_partitions'; +---- +1 aaa 100 1 +2 bbb 200 1 +3 ccc 300 1 +4 ddd 400 1 +5 eee 500 2 +6 fff 600 2 +7 ggg 700 2 +8 hhh 800 2 + +# Read partitioned json file + +query I +COPY src_table_1 TO 'test_files/scratch/dynamic_file/json_partitions' +STORED AS JSON +PARTITIONED BY (partition_col); +---- +4 + +query I +COPY src_table_2 TO 'test_files/scratch/dynamic_file/json_partitions' +STORED AS JSON +PARTITIONED BY (partition_col); +---- +4 + +query ITIT rowsort +SELECT int_col, string_col, bigint_col, partition_col FROM 'test_files/scratch/dynamic_file/json_partitions'; +---- +1 aaa 100 1 +2 bbb 200 1 +3 ccc 300 1 +4 ddd 400 1 +5 eee 500 2 +6 fff 600 2 +7 ggg 700 2 +8 hhh 800 2 + +# Read partitioned arrow file + +query I +COPY src_table_1 TO 'test_files/scratch/dynamic_file/arrow_partitions' +STORED AS ARROW +PARTITIONED BY (partition_col); +---- +4 + +query I +COPY src_table_2 TO 'test_files/scratch/dynamic_file/arrow_partitions' +STORED AS ARROW +PARTITIONED BY (partition_col); +---- +4 + +query ITIT rowsort +SELECT int_col, string_col, bigint_col, partition_col FROM 'test_files/scratch/dynamic_file/arrow_partitions'; +---- +1 aaa 100 1 +2 bbb 200 1 +3 ccc 300 1 +4 ddd 400 1 +5 eee 500 2 +6 fff 600 2 +7 ggg 700 2 +8 hhh 800 2 + +# Read partitioned parquet file + +query I +COPY src_table_1 TO 'test_files/scratch/dynamic_file/parquet_partitions' +STORED AS PARQUET +PARTITIONED BY (partition_col); +---- +4 + +query I +COPY src_table_2 TO 'test_files/scratch/dynamic_file/parquet_partitions' +STORED AS PARQUET +PARTITIONED BY (partition_col); +---- +4 + +query ITIT rowsort +select * from 'test_files/scratch/dynamic_file/parquet_partitions'; +---- +1 aaa 100 1 +2 bbb 200 1 +3 ccc 300 1 +4 ddd 400 1 +5 eee 500 2 +6 fff 600 2 +7 ggg 700 2 +8 hhh 800 2 + +# Read partitioned parquet file with multiple partition columns + +query I +COPY src_table_1 TO 'test_files/scratch/dynamic_file/nested_partition' +STORED AS PARQUET +PARTITIONED BY (partition_col, string_col); +---- +4 + +query I +COPY src_table_2 TO 'test_files/scratch/dynamic_file/nested_partition' +STORED AS PARQUET +PARTITIONED BY (partition_col, string_col); +---- +4 + +query IITT rowsort +select * from 'test_files/scratch/dynamic_file/nested_partition'; +---- +1 100 1 aaa +2 200 1 bbb +3 300 1 ccc +4 400 1 ddd +5 500 2 eee +6 600 2 fff +7 700 2 ggg +8 800 2 hhh # read avro file query IT diff --git a/datafusion/sqllogictest/test_files/errors.slt b/datafusion/sqllogictest/test_files/errors.slt index be7fdac71b57..7abf94932c71 100644 --- a/datafusion/sqllogictest/test_files/errors.slt +++ b/datafusion/sqllogictest/test_files/errors.slt @@ -128,5 +128,5 @@ from aggregate_test_100 order by c9 -statement error Inconsistent data type across values list at row 1 column 0. Was Int64 but found Utf8 +query error DataFusion error: Arrow error: Cast error: Cannot cast string 'foo' to value of Int64 type create table foo as values (1), ('foo'); diff --git a/datafusion/sqllogictest/test_files/functions.slt b/datafusion/sqllogictest/test_files/functions.slt index 7d41c26ba012..5b6017b08a00 100644 --- a/datafusion/sqllogictest/test_files/functions.slt +++ b/datafusion/sqllogictest/test_files/functions.slt @@ -553,6 +553,16 @@ SELECT strpos(arrow_cast('helloworld', 'Dictionary(Int32, Utf8)'), 'world') ---- 6 +query I +SELECT strpos('helloworld', NULL) +---- +NULL + +query I +SELECT strpos(arrow_cast('helloworld', 'Dictionary(Int32, Utf8)'), NULL) +---- +NULL + statement ok CREATE TABLE products ( product_id INT PRIMARY KEY, diff --git a/datafusion/sqllogictest/test_files/group_by.slt b/datafusion/sqllogictest/test_files/group_by.slt index f561fa9e9ac8..a80a0891e977 100644 --- a/datafusion/sqllogictest/test_files/group_by.slt +++ b/datafusion/sqllogictest/test_files/group_by.slt @@ -5152,8 +5152,6 @@ drop table test_case_expr statement ok drop table t; -# TODO: Current grouping set result is not align with Postgres and DuckDB, we might want to change the result -# See https://github.com/apache/datafusion/issues/12570 # test multi group by for binary type with nulls statement ok create table t(a int, b bytea) as values (1, 0xa), (1, 0xa), (2, null), (null, 0xb), (null, 0xb); @@ -5162,11 +5160,14 @@ query I?I select a, b, count(*) from t group by grouping sets ((a, b), (a), (b)); ---- 1 0a 2 -2 NULL 2 -NULL 0b 4 +2 NULL 1 +NULL 0b 2 1 NULL 2 -NULL NULL 3 +2 NULL 1 +NULL NULL 2 NULL 0a 2 +NULL NULL 1 +NULL 0b 2 statement ok drop table t; diff --git a/datafusion/sqllogictest/test_files/join.slt b/datafusion/sqllogictest/test_files/join.slt index 8d801b92c393..519fbb887c7e 100644 --- a/datafusion/sqllogictest/test_files/join.slt +++ b/datafusion/sqllogictest/test_files/join.slt @@ -1215,14 +1215,14 @@ statement ok create table t1(v1 int) as values(100); ## Query with Ambiguous column reference -query error DataFusion error: Schema error: Ambiguous reference to unqualified field v1 +query error DataFusion error: Schema error: Schema contains duplicate qualified field name t1\.v1 select count(*) from t1 right outer join t1 on t1.v1 > 0; -query error DataFusion error: Schema error: Ambiguous reference to unqualified field v1 +query error DataFusion error: Schema error: Schema contains duplicate qualified field name t1\.v1 select t1.v1 from t1 join t1 using(v1) cross join (select struct('foo' as v1) as t1); statement ok -drop table t1; \ No newline at end of file +drop table t1; diff --git a/datafusion/sqllogictest/test_files/join_disable_repartition_joins.slt.temp b/datafusion/sqllogictest/test_files/join_disable_repartition_joins.slt.temp deleted file mode 100644 index 00e74a207b33..000000000000 --- a/datafusion/sqllogictest/test_files/join_disable_repartition_joins.slt.temp +++ /dev/null @@ -1,26 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at - -# http://www.apache.org/licenses/LICENSE-2.0 - -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -########## -## Join Tests -########## - -# turn off repartition_joins -statement ok -set datafusion.optimizer.repartition_joins = false; - -include ./join.slt diff --git a/datafusion/sqllogictest/test_files/map.slt b/datafusion/sqllogictest/test_files/map.slt index 45e1b51a09b4..726de75b5141 100644 --- a/datafusion/sqllogictest/test_files/map.slt +++ b/datafusion/sqllogictest/test_files/map.slt @@ -148,18 +148,17 @@ SELECT MAKE_MAP([1,2], ['a', 'b'], [3,4], ['b']); {[1, 2]: [a, b], [3, 4]: [b]} query ? -SELECT MAKE_MAP('POST', 41, 'HEAD', 'ab', 'PATCH', 30); +SELECT MAKE_MAP('POST', 41, 'HEAD', 53, 'PATCH', 30); ---- -{POST: 41, HEAD: ab, PATCH: 30} +{POST: 41, HEAD: 53, PATCH: 30} + +query error DataFusion error: Arrow error: Cast error: Cannot cast string 'ab' to value of Int64 type +SELECT MAKE_MAP('POST', 41, 'HEAD', 'ab', 'PATCH', 30); +# Map keys can not be NULL query error SELECT MAKE_MAP('POST', 41, 'HEAD', 33, null, 30); -query ? -SELECT MAKE_MAP('POST', 41, 'HEAD', 'ab', 'PATCH', 30); ----- -{POST: 41, HEAD: ab, PATCH: 30} - query ? SELECT MAKE_MAP() ---- @@ -517,9 +516,12 @@ query error SELECT MAP {'a': MAP {1:'a', 2:'b', 3:'c'}, 'b': MAP {2:'c', 4:'d'} }[NULL]; query ? -SELECT MAP { 'a': 1, 2: 3 }; +SELECT MAP { 'a': 1, 'b': 3 }; ---- -{a: 1, 2: 3} +{a: 1, b: 3} + +query error DataFusion error: Arrow error: Cast error: Cannot cast string 'a' to value of Int64 type +SELECT MAP { 'a': 1, 2: 3 }; # TODO(https://github.com/apache/datafusion/issues/11785): fix accessing map with non-string key # query ? @@ -610,9 +612,12 @@ select map_extract(column1, 1), map_extract(column1, 5), map_extract(column1, 7) # Tests for map_keys query ? -SELECT map_keys(MAP { 'a': 1, 2: 3 }); +SELECT map_keys(MAP { 'a': 1, 'b': 3 }); ---- -[a, 2] +[a, b] + +query error DataFusion error: Arrow error: Cast error: Cannot cast string 'a' to value of Int64 type +SELECT map_keys(MAP { 'a': 1, 2: 3 }); query ? SELECT map_keys(MAP {'a':1, 'b':2, 'c':3 }) FROM t; @@ -657,8 +662,11 @@ SELECT map_keys(column1) from map_array_table_1; # Tests for map_values -query ? +query error DataFusion error: Arrow error: Cast error: Cannot cast string 'a' to value of Int64 type SELECT map_values(MAP { 'a': 1, 2: 3 }); + +query ? +SELECT map_values(MAP { 'a': 1, 'b': 3 }); ---- [1, 3] diff --git a/datafusion/sqllogictest/test_files/metadata.slt b/datafusion/sqllogictest/test_files/metadata.slt index f38281abc5ab..d0853b9e4983 100644 --- a/datafusion/sqllogictest/test_files/metadata.slt +++ b/datafusion/sqllogictest/test_files/metadata.slt @@ -25,7 +25,7 @@ ## with metadata in SQL. query IT -select * from table_with_metadata; +select id, name from table_with_metadata; ---- 1 NULL NULL bar @@ -96,5 +96,34 @@ select count(id) cnt from table_with_metadata group by name order by cnt; 1 + +# Regression test: missing schema metadata, when aggregate on cross join +query I +SELECT count("data"."id") +FROM + ( + SELECT "id" FROM "table_with_metadata" + ) as "data", + ( + SELECT "id" FROM "table_with_metadata" + ) as "samples"; +---- +6 + +# Regression test: missing field metadata, from the NULL field on the left side of the union +query ITT +(SELECT id, NULL::string as name, l_name FROM "table_with_metadata") + UNION +(SELECT id, name, NULL::string as l_name FROM "table_with_metadata") +ORDER BY id, name, l_name; +---- +1 NULL NULL +3 baz NULL +3 NULL l_baz +NULL bar NULL +NULL NULL l_bar + + + statement ok drop table table_with_metadata; diff --git a/datafusion/sqllogictest/test_files/order.slt b/datafusion/sqllogictest/test_files/order.slt index f53363b6eb38..6cc7ee0403f2 100644 --- a/datafusion/sqllogictest/test_files/order.slt +++ b/datafusion/sqllogictest/test_files/order.slt @@ -958,6 +958,24 @@ drop table foo; statement ok drop table ambiguity_test; +## reproducer for https://github.com/apache/datafusion/issues/12446 +# Ensure union ordering calculations with constants can be optimized + +statement ok +create table t(a0 int, a int, b int, c int) as values (1, 2, 3, 4), (5, 6, 7, 8); + +# expect this query to run successfully, not error +query III +select * from (select c, a, NULL::int as a0 from t order by a, c) t1 +union all +select * from (select c, NULL::int as a, a0 from t order by a0, c) t2 +order by c, a, a0, b +limit 2; +---- +4 2 NULL +4 NULL 1 + + # Casting from numeric to string types breaks the ordering statement ok CREATE EXTERNAL TABLE ordered_table ( @@ -1189,3 +1207,48 @@ physical_plan 02)--RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 03)----SortExec: TopK(fetch=1), expr=[a@0 ASC NULLS LAST], preserve_partitioning=[false] 04)------CsvExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[a, b], has_header=true + + +# Test: inputs into union with different orderings +query TT +explain select * from (select b, c, a, NULL::int as a0 from ordered_table order by a, c) t1 +union all +select * from (select b, c, NULL::int as a, a0 from ordered_table order by a0, c) t2 +order by d, c, a, a0, b +limit 2; +---- +logical_plan +01)Projection: t1.b, t1.c, t1.a, t1.a0 +02)--Sort: t1.d ASC NULLS LAST, t1.c ASC NULLS LAST, t1.a ASC NULLS LAST, t1.a0 ASC NULLS LAST, t1.b ASC NULLS LAST, fetch=2 +03)----Union +04)------SubqueryAlias: t1 +05)--------Projection: ordered_table.b, ordered_table.c, ordered_table.a, Int32(NULL) AS a0, ordered_table.d +06)----------TableScan: ordered_table projection=[a, b, c, d] +07)------SubqueryAlias: t2 +08)--------Projection: ordered_table.b, ordered_table.c, Int32(NULL) AS a, ordered_table.a0, ordered_table.d +09)----------TableScan: ordered_table projection=[a0, b, c, d] +physical_plan +01)ProjectionExec: expr=[b@0 as b, c@1 as c, a@2 as a, a0@3 as a0] +02)--SortPreservingMergeExec: [d@4 ASC NULLS LAST,c@1 ASC NULLS LAST,a@2 ASC NULLS LAST,a0@3 ASC NULLS LAST,b@0 ASC NULLS LAST], fetch=2 +03)----UnionExec +04)------SortExec: TopK(fetch=2), expr=[d@4 ASC NULLS LAST,c@1 ASC NULLS LAST,a@2 ASC NULLS LAST,b@0 ASC NULLS LAST], preserve_partitioning=[false] +05)--------ProjectionExec: expr=[b@1 as b, c@2 as c, a@0 as a, NULL as a0, d@3 as d] +06)----------CsvExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[a, b, c, d], output_ordering=[c@2 ASC NULLS LAST], has_header=true +07)------SortExec: TopK(fetch=2), expr=[d@4 ASC NULLS LAST,c@1 ASC NULLS LAST,a0@3 ASC NULLS LAST,b@0 ASC NULLS LAST], preserve_partitioning=[false] +08)--------ProjectionExec: expr=[b@1 as b, c@2 as c, NULL as a, a0@0 as a0, d@3 as d] +09)----------CsvExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[a0, b, c, d], output_ordering=[c@2 ASC NULLS LAST], has_header=true + +# Test: run the query from above +query IIII +select * from (select b, c, a, NULL::int as a0 from ordered_table order by a, c) t1 +union all +select * from (select b, c, NULL::int as a, a0 from ordered_table order by a0, c) t2 +order by d, c, a, a0, b +limit 2; +---- +0 0 0 NULL +0 0 NULL 1 + + +statement ok +drop table ordered_table; diff --git a/datafusion/sqllogictest/test_files/repartition_scan.slt b/datafusion/sqllogictest/test_files/repartition_scan.slt index 4c86312f9e51..858e42106221 100644 --- a/datafusion/sqllogictest/test_files/repartition_scan.slt +++ b/datafusion/sqllogictest/test_files/repartition_scan.slt @@ -61,7 +61,7 @@ logical_plan physical_plan 01)CoalesceBatchesExec: target_batch_size=8192 02)--FilterExec: column1@0 != 42 -03)----ParquetExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:0..87], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:87..174], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:174..261], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:261..347]]}, projection=[column1], predicate=column1@0 != 42, pruning_predicate=CASE WHEN column1_null_count@2 = column1_row_count@3 THEN false ELSE column1_min@0 != 42 OR 42 != column1_max@1 END, required_guarantees=[column1 not in (42)] +03)----ParquetExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:0..88], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:88..176], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:176..264], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:264..351]]}, projection=[column1], predicate=column1@0 != 42, pruning_predicate=CASE WHEN column1_null_count@2 = column1_row_count@3 THEN false ELSE column1_min@0 != 42 OR 42 != column1_max@1 END, required_guarantees=[column1 not in (42)] # disable round robin repartitioning statement ok @@ -77,7 +77,7 @@ logical_plan physical_plan 01)CoalesceBatchesExec: target_batch_size=8192 02)--FilterExec: column1@0 != 42 -03)----ParquetExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:0..87], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:87..174], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:174..261], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:261..347]]}, projection=[column1], predicate=column1@0 != 42, pruning_predicate=CASE WHEN column1_null_count@2 = column1_row_count@3 THEN false ELSE column1_min@0 != 42 OR 42 != column1_max@1 END, required_guarantees=[column1 not in (42)] +03)----ParquetExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:0..88], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:88..176], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:176..264], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:264..351]]}, projection=[column1], predicate=column1@0 != 42, pruning_predicate=CASE WHEN column1_null_count@2 = column1_row_count@3 THEN false ELSE column1_min@0 != 42 OR 42 != column1_max@1 END, required_guarantees=[column1 not in (42)] # enable round robin repartitioning again statement ok @@ -102,7 +102,7 @@ physical_plan 02)--SortExec: expr=[column1@0 ASC NULLS LAST], preserve_partitioning=[true] 03)----CoalesceBatchesExec: target_batch_size=8192 04)------FilterExec: column1@0 != 42 -05)--------ParquetExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/1.parquet:0..172], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/1.parquet:172..338, WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:0..6], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:6..178], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:178..347]]}, projection=[column1], predicate=column1@0 != 42, pruning_predicate=CASE WHEN column1_null_count@2 = column1_row_count@3 THEN false ELSE column1_min@0 != 42 OR 42 != column1_max@1 END, required_guarantees=[column1 not in (42)] +05)--------ParquetExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/1.parquet:0..174], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/1.parquet:174..342, WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:0..6], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:6..180], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:180..351]]}, projection=[column1], predicate=column1@0 != 42, pruning_predicate=CASE WHEN column1_null_count@2 = column1_row_count@3 THEN false ELSE column1_min@0 != 42 OR 42 != column1_max@1 END, required_guarantees=[column1 not in (42)] ## Read the files as though they are ordered @@ -138,7 +138,7 @@ physical_plan 01)SortPreservingMergeExec: [column1@0 ASC NULLS LAST] 02)--CoalesceBatchesExec: target_batch_size=8192 03)----FilterExec: column1@0 != 42 -04)------ParquetExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/1.parquet:0..169], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:0..173], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:173..347], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/1.parquet:169..338]]}, projection=[column1], output_ordering=[column1@0 ASC NULLS LAST], predicate=column1@0 != 42, pruning_predicate=CASE WHEN column1_null_count@2 = column1_row_count@3 THEN false ELSE column1_min@0 != 42 OR 42 != column1_max@1 END, required_guarantees=[column1 not in (42)] +04)------ParquetExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/1.parquet:0..171], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:0..175], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:175..351], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/1.parquet:171..342]]}, projection=[column1], output_ordering=[column1@0 ASC NULLS LAST], predicate=column1@0 != 42, pruning_predicate=CASE WHEN column1_null_count@2 = column1_row_count@3 THEN false ELSE column1_min@0 != 42 OR 42 != column1_max@1 END, required_guarantees=[column1 not in (42)] # Cleanup statement ok diff --git a/datafusion/sqllogictest/test_files/scalar.slt b/datafusion/sqllogictest/test_files/scalar.slt index 3b9c9a16042c..0c2fa41e5bf8 100644 --- a/datafusion/sqllogictest/test_files/scalar.slt +++ b/datafusion/sqllogictest/test_files/scalar.slt @@ -1906,11 +1906,9 @@ select position('' in '') ---- 1 - -query error POSITION function can only accept strings +query error DataFusion error: Error during planning: Error during planning: Int64 and Int64 are not coercible to a common string select position(1 in 1) - query I select strpos('abc', 'c'); ---- diff --git a/datafusion/sqllogictest/test_files/select.slt b/datafusion/sqllogictest/test_files/select.slt index 5df5f313af3c..0fef56aeea5c 100644 --- a/datafusion/sqllogictest/test_files/select.slt +++ b/datafusion/sqllogictest/test_files/select.slt @@ -348,8 +348,11 @@ VALUES (1),() statement error DataFusion error: Error during planning: Inconsistent data length across values list: got 2 values in row 1 but expected 1 VALUES (1),(1,2) -statement error DataFusion error: Error during planning: Inconsistent data type across values list at row 1 column 0 +query I VALUES (1),('2') +---- +1 +2 query R VALUES (1),(2.0) @@ -357,8 +360,11 @@ VALUES (1),(2.0) 1 2 -statement error DataFusion error: Error during planning: Inconsistent data type across values list at row 1 column 1 +query II VALUES (1,2), (1,'2') +---- +1 2 +1 2 query IT VALUES (1,'a'),(NULL,'b'),(3,'c') diff --git a/datafusion/sqllogictest/test_files/string/dictionary_utf8.slt b/datafusion/sqllogictest/test_files/string/dictionary_utf8.slt index ea3c9b8eb6ca..c181f613ee9a 100644 --- a/datafusion/sqllogictest/test_files/string/dictionary_utf8.slt +++ b/datafusion/sqllogictest/test_files/string/dictionary_utf8.slt @@ -53,36 +53,6 @@ Xiangpeng datafusion数据融合 false true false true Raphael datafusionДатаФусион false false false false NULL NULL NULL NULL NULL NULL -# TODO: move it back to `string_query.slt.part` after fixing the issue -# see detail: https://github.com/apache/datafusion/issues/12664 -query BBBB -SELECT - REGEXP_LIKE(ascii_1, 'an'), - REGEXP_LIKE(unicode_1, 'таФ'), - REGEXP_LIKE(ascii_1, NULL), - REGEXP_LIKE(unicode_1, NULL) -FROM test_basic_operator; ----- -false false NULL NULL -true false NULL NULL -false true NULL NULL -NULL NULL NULL NULL - -# TODO: move it back to `string_query.slt.part` after fixing the issue -# see detail: https://github.com/apache/datafusion/issues/12664 -query ???? -SELECT - REGEXP_MATCH(ascii_1, 'an'), - REGEXP_MATCH(unicode_1, 'таФ'), - REGEXP_MATCH(ascii_1, NULL), - REGEXP_MATCH(unicode_1, NULL) -FROM test_basic_operator; ----- -NULL NULL NULL NULL -[an] NULL NULL NULL -NULL [таФ] NULL NULL -NULL NULL NULL NULL - # # common test for string-like functions and operators # diff --git a/datafusion/sqllogictest/test_files/string/init_data.slt.part b/datafusion/sqllogictest/test_files/string/init_data.slt.part index d99401f10d20..096e3bb3b330 100644 --- a/datafusion/sqllogictest/test_files/string/init_data.slt.part +++ b/datafusion/sqllogictest/test_files/string/init_data.slt.part @@ -30,4 +30,3 @@ statement ok create table test_substr_base ( col1 VARCHAR ) as values ('foo'), ('hello🌏世界'), ('💩'), ('ThisIsAVeryLongASCIIString'), (''), (NULL); - diff --git a/datafusion/sqllogictest/test_files/string/large_string.slt b/datafusion/sqllogictest/test_files/string/large_string.slt index 169c658e5ac1..af6d104e57ac 100644 --- a/datafusion/sqllogictest/test_files/string/large_string.slt +++ b/datafusion/sqllogictest/test_files/string/large_string.slt @@ -72,23 +72,6 @@ false false false true NULL NULL -# TODO: move it back to `string_query.slt.part` after fixing the issue -# see detail: https://github.com/apache/datafusion/issues/12670 -query IIIIII -SELECT - STRPOS(ascii_1, 'e'), - STRPOS(ascii_1, 'ang'), - STRPOS(ascii_1, NULL), - STRPOS(unicode_1, 'и'), - STRPOS(unicode_1, 'ион'), - STRPOS(unicode_1, NULL) -FROM test_basic_operator; ----- -5 0 NULL 0 0 NULL -7 3 NULL 0 0 NULL -6 0 NULL 18 18 NULL -NULL NULL NULL NULL NULL NULL - # # common test for string-like functions and operators # diff --git a/datafusion/sqllogictest/test_files/string/string.slt b/datafusion/sqllogictest/test_files/string/string.slt index 6b89147c5c4f..f003e01ecda0 100644 --- a/datafusion/sqllogictest/test_files/string/string.slt +++ b/datafusion/sqllogictest/test_files/string/string.slt @@ -63,53 +63,6 @@ Xiangpeng datafusion数据融合 false true false true Raphael datafusionДатаФусион false false false false NULL NULL NULL NULL NULL NULL -# TODO: move it back to `string_query.slt.part` after fixing the issue -# see detail: https://github.com/apache/datafusion/issues/12664 -query BBBB -SELECT - REGEXP_LIKE(ascii_1, 'an'), - REGEXP_LIKE(unicode_1, 'таФ'), - REGEXP_LIKE(ascii_1, NULL), - REGEXP_LIKE(unicode_1, NULL) -FROM test_basic_operator; ----- -false false NULL NULL -true false NULL NULL -false true NULL NULL -NULL NULL NULL NULL - -# TODO: move it back to `string_query.slt.part` after fixing the issue -# see detail: https://github.com/apache/datafusion/issues/12664 -query ???? -SELECT - REGEXP_MATCH(ascii_1, 'an'), - REGEXP_MATCH(unicode_1, 'таФ'), - REGEXP_MATCH(ascii_1, NULL), - REGEXP_MATCH(unicode_1, NULL) -FROM test_basic_operator; ----- -NULL NULL NULL NULL -[an] NULL NULL NULL -NULL [таФ] NULL NULL -NULL NULL NULL NULL - -# TODO: move it back to `string_query.slt.part` after fixing the issue -# see detail: https://github.com/apache/datafusion/issues/12670 -query IIIIII -SELECT - STRPOS(ascii_1, 'e'), - STRPOS(ascii_1, 'ang'), - STRPOS(ascii_1, NULL), - STRPOS(unicode_1, 'и'), - STRPOS(unicode_1, 'ион'), - STRPOS(unicode_1, NULL) -FROM test_basic_operator; ----- -5 0 NULL 0 0 NULL -7 3 NULL 0 0 NULL -6 0 NULL 18 18 NULL -NULL NULL NULL NULL NULL NULL - # # common test for string-like functions and operators # diff --git a/datafusion/sqllogictest/test_files/string/string_query.slt.part b/datafusion/sqllogictest/test_files/string/string_query.slt.part index 0af0a6a642b2..6a02296f5e6c 100644 --- a/datafusion/sqllogictest/test_files/string/string_query.slt.part +++ b/datafusion/sqllogictest/test_files/string/string_query.slt.part @@ -856,39 +856,47 @@ NULL NULL # Test REGEXP_LIKE # -------------------------------------- -# TODO: LargeString does not support REGEXP_LIKE. Enable this after fixing the issue -# see issue: https://github.com/apache/datafusion/issues/12664 -#query BBBB -#SELECT -# REGEXP_LIKE(ascii_1, 'an'), -# REGEXP_LIKE(unicode_1, 'таФ'), -# REGEXP_LIKE(ascii_1, NULL), -# REGEXP_LIKE(unicode_1, NULL) -#FROM test_basic_operator; -#---- -#false false NULL NULL -#true false NULL NULL -#false true NULL NULL -#NULL NULL NULL NULL +query BBBBBBBB +SELECT + -- without flags + REGEXP_LIKE(ascii_1, 'an'), + REGEXP_LIKE(unicode_1, 'таФ'), + REGEXP_LIKE(ascii_1, NULL), + REGEXP_LIKE(unicode_1, NULL), + -- with flags + REGEXP_LIKE(ascii_1, 'AN', 'i'), + REGEXP_LIKE(unicode_1, 'ТаФ', 'i'), + REGEXP_LIKE(ascii_1, NULL, 'i'), + REGEXP_LIKE(unicode_1, NULL, 'i') + FROM test_basic_operator; +---- +false false NULL NULL true false NULL NULL +true false NULL NULL true false NULL NULL +false true NULL NULL false true NULL NULL +NULL NULL NULL NULL NULL NULL NULL NULL # -------------------------------------- # Test REGEXP_MATCH # -------------------------------------- -# TODO: LargeString does not support REGEXP_MATCH. Enable this after fixing the issue -# see issue: https://github.com/apache/datafusion/issues/12664 -#query ???? -#SELECT -# REGEXP_MATCH(ascii_1, 'an'), -# REGEXP_MATCH(unicode_1, 'таФ'), -# REGEXP_MATCH(ascii_1, NULL), -# REGEXP_MATCH(unicode_1, NULL) -#FROM test_basic_operator; -#---- -#NULL NULL NULL NULL -#[an] NULL NULL NULL -#NULL [таФ] NULL NULL -#NULL NULL NULL NULL +query ???????? +SELECT + -- without flags + REGEXP_MATCH(ascii_1, 'an'), + REGEXP_MATCH(unicode_1, 'ТаФ'), + REGEXP_MATCH(ascii_1, NULL), + REGEXP_MATCH(unicode_1, NULL), + -- with flags + REGEXP_MATCH(ascii_1, 'AN', 'i'), + REGEXP_MATCH(unicode_1, 'таФ', 'i'), + REGEXP_MATCH(ascii_1, NULL, 'i'), + REGEXP_MATCH(unicode_1, NULL, 'i') +FROM test_basic_operator; +---- +NULL NULL NULL NULL [An] NULL NULL NULL +[an] NULL NULL NULL [an] NULL NULL NULL +NULL NULL NULL NULL NULL [таФ] NULL NULL +NULL NULL NULL NULL NULL NULL NULL NULL # -------------------------------------- # Test REPEAT @@ -943,22 +951,20 @@ NULL NULL # Test STRPOS # -------------------------------------- -# TODO: DictionaryString does not support STRPOS. Enable this after fixing the issue -# see issue: https://github.com/apache/datafusion/issues/12670 -#query IIIIII -#SELECT -# STRPOS(ascii_1, 'e'), -# STRPOS(ascii_1, 'ang'), -# STRPOS(ascii_1, NULL), -# STRPOS(unicode_1, 'и'), -# STRPOS(unicode_1, 'ион'), -# STRPOS(unicode_1, NULL) -#FROM test_basic_operator; -#---- -#5 0 NULL 0 0 NULL -#7 3 NULL 0 0 NULL -#6 0 NULL 18 18 NULL -#NULL NULL NULL NULL NULL NULL +query IIIIII +SELECT + STRPOS(ascii_1, 'e'), + STRPOS(ascii_1, 'ang'), + STRPOS(ascii_1, NULL), + STRPOS(unicode_1, 'и'), + STRPOS(unicode_1, 'ион'), + STRPOS(unicode_1, NULL) +FROM test_basic_operator; +---- +5 0 NULL 0 0 NULL +7 3 NULL 0 0 NULL +6 0 NULL 18 18 NULL +NULL NULL NULL NULL NULL NULL # -------------------------------------- # Test SUBSTR_INDEX diff --git a/datafusion/sqllogictest/test_files/string/string_view.slt b/datafusion/sqllogictest/test_files/string/string_view.slt index fb82726e3a9d..e01a40586fe0 100644 --- a/datafusion/sqllogictest/test_files/string/string_view.slt +++ b/datafusion/sqllogictest/test_files/string/string_view.slt @@ -50,53 +50,6 @@ false false false true NULL NULL -# TODO: move it back to `string_query.slt.part` after fixing the issue -# see detail: https://github.com/apache/datafusion/issues/12664 -query BBBB -SELECT - REGEXP_LIKE(ascii_1, 'an'), - REGEXP_LIKE(unicode_1, 'таФ'), - REGEXP_LIKE(ascii_1, NULL), - REGEXP_LIKE(unicode_1, NULL) -FROM test_basic_operator; ----- -false false NULL NULL -true false NULL NULL -false true NULL NULL -NULL NULL NULL NULL - -# TODO: move it back to `string_query.slt.part` after fixing the issue -# see detail: https://github.com/apache/datafusion/issues/12664 -query ???? -SELECT - REGEXP_MATCH(ascii_1, 'an'), - REGEXP_MATCH(unicode_1, 'таФ'), - REGEXP_MATCH(ascii_1, NULL), - REGEXP_MATCH(unicode_1, NULL) -FROM test_basic_operator; ----- -NULL NULL NULL NULL -[an] NULL NULL NULL -NULL [таФ] NULL NULL -NULL NULL NULL NULL - -# TODO: move it back to `string_query.slt.part` after fixing the issue -# see detail: https://github.com/apache/datafusion/issues/12670 -query IIIIII -SELECT - STRPOS(ascii_1, 'e'), - STRPOS(ascii_1, 'ang'), - STRPOS(ascii_1, NULL), - STRPOS(unicode_1, 'и'), - STRPOS(unicode_1, 'ион'), - STRPOS(unicode_1, NULL) -FROM test_basic_operator; ----- -5 0 NULL 0 0 NULL -7 3 NULL 0 0 NULL -6 0 NULL 18 18 NULL -NULL NULL NULL NULL NULL NULL - # # common test for string-like functions and operators # @@ -139,6 +92,21 @@ FROM test_source; statement ok drop table test_source +######## +## StringView Function test +######## + +query error DataFusion error: Arrow error: Compute error: bit_length not supported for Utf8View +select bit_length(column1_utf8view) from test; + +query T +select btrim(column1_large_utf8) from test; +---- +Andrew +Xiangpeng +Raphael +NULL + ######## ## StringView to Other Types column ######## @@ -346,9 +314,8 @@ EXPLAIN SELECT FROM test; ---- logical_plan -01)Projection: starts_with(__common_expr_1, test.column2_utf8view) AS c1, starts_with(test.column1_utf8, test.column2_utf8) AS c3, starts_with(__common_expr_1, CAST(test.column2_large_utf8 AS Utf8View)) AS c4 -02)--Projection: CAST(test.column1_utf8 AS Utf8View) AS __common_expr_1, test.column1_utf8, test.column2_utf8, test.column2_large_utf8, test.column2_utf8view -03)----TableScan: test projection=[column1_utf8, column2_utf8, column2_large_utf8, column2_utf8view] +01)Projection: starts_with(CAST(test.column1_utf8 AS Utf8View), test.column2_utf8view) AS c1, starts_with(test.column1_utf8, test.column2_utf8) AS c3, starts_with(CAST(test.column1_utf8 AS LargeUtf8), test.column2_large_utf8) AS c4 +02)--TableScan: test projection=[column1_utf8, column2_utf8, column2_large_utf8, column2_utf8view] query BBB SELECT @@ -638,7 +605,7 @@ EXPLAIN SELECT FROM test; ---- logical_plan -01)Projection: contains(test.column1_utf8view, Utf8("foo")) AS c1, contains(test.column1_utf8view, test.column2_utf8view) AS c2, contains(test.column1_utf8view, test.column2_large_utf8) AS c3, contains(test.column1_utf8, test.column2_utf8view) AS c4, contains(test.column1_utf8, test.column2_utf8) AS c5, contains(test.column1_utf8, test.column2_large_utf8) AS c6, contains(test.column1_large_utf8, test.column1_utf8view) AS c7, contains(test.column1_large_utf8, test.column2_utf8) AS c8, contains(test.column1_large_utf8, test.column2_large_utf8) AS c9 +01)Projection: contains(test.column1_utf8view, Utf8View("foo")) AS c1, contains(test.column1_utf8view, test.column2_utf8view) AS c2, contains(test.column1_utf8view, CAST(test.column2_large_utf8 AS Utf8View)) AS c3, contains(CAST(test.column1_utf8 AS Utf8View), test.column2_utf8view) AS c4, contains(test.column1_utf8, test.column2_utf8) AS c5, contains(CAST(test.column1_utf8 AS LargeUtf8), test.column2_large_utf8) AS c6, contains(CAST(test.column1_large_utf8 AS Utf8View), test.column1_utf8view) AS c7, contains(test.column1_large_utf8, CAST(test.column2_utf8 AS LargeUtf8)) AS c8, contains(test.column1_large_utf8, test.column2_large_utf8) AS c9 02)--TableScan: test projection=[column1_utf8, column2_utf8, column1_large_utf8, column2_large_utf8, column1_utf8view, column2_utf8view] ## Ensure no casts for ENDS_WITH @@ -882,7 +849,7 @@ EXPLAIN SELECT FROM test; ---- logical_plan -01)Projection: strpos(test.column1_utf8view, Utf8("f")) AS c, strpos(test.column1_utf8view, test.column2_utf8view) AS c2 +01)Projection: strpos(test.column1_utf8view, Utf8View("f")) AS c, strpos(test.column1_utf8view, test.column2_utf8view) AS c2 02)--TableScan: test projection=[column1_utf8view, column2_utf8view] ## Ensure no casts for SUBSTR diff --git a/datafusion/sqllogictest/test_files/unnest.slt b/datafusion/sqllogictest/test_files/unnest.slt index 63ca74e9714c..9e79805d52fb 100644 --- a/datafusion/sqllogictest/test_files/unnest.slt +++ b/datafusion/sqllogictest/test_files/unnest.slt @@ -33,7 +33,7 @@ AS VALUES statement ok CREATE TABLE nested_unnest_table AS VALUES - (struct('a', 'b', struct('c')), (struct('a', 'b', [10,20])), [struct('a', 'b')]), + (struct('a', 'b', struct('c')), (struct('a', 'b', [10,20])), [struct('a', 'b')]), (struct('d', 'e', struct('f')), (struct('x', 'y', [30,40, 50])), null) ; @@ -780,3 +780,55 @@ NULL 1 ### TODO: group by unnest struct query error DataFusion error: Error during planning: Projection references non\-aggregate values select unnest(column1) c1 from nested_unnest_table group by c1.c0; + +query II??I?? +select unnest(column5), * from unnest_table; +---- +1 2 [1, 2, 3] [7] 1 [13, 14] {c0: 1, c1: 2} +3 4 [4, 5] [8, 9, 10] 2 [15, 16] {c0: 3, c1: 4} +NULL NULL [6] [11, 12] 3 NULL NULL +7 8 [12] [, 42, ] NULL NULL {c0: 7, c1: 8} +NULL NULL NULL NULL 4 [17, 18] NULL + +query TT???? +select unnest(column1), * from nested_unnest_table +---- +a b {c0: c} {c0: a, c1: b, c2: {c0: c}} {c0: a, c1: b, c2: [10, 20]} [{c0: a, c1: b}] +d e {c0: f} {c0: d, c1: e, c2: {c0: f}} {c0: x, c1: y, c2: [30, 40, 50]} NULL + +query ????? +select unnest(unnest(column3)), * from recursive_unnest_table +---- +[1] [[1, 2]] {c0: [1], c1: a} [[[1], [2]], [[1, 1]]] [{c0: [1], c1: [[1, 2]]}] +[2] [[3], [4]] {c0: [2], c1: b} [[[3, 4], [5]], [[, 6], , [7, 8]]] [{c0: [2], c1: [[3], [4]]}] + +statement ok +CREATE TABLE join_table +AS VALUES + (1, 2, 3), + (2, 3, 4), + (4, 5, 6) +; + +query IIIII +select unnest(u.column5), j.* from unnest_table u join join_table j on u.column3 = j.column1 +---- +1 2 1 2 3 +3 4 2 3 4 +NULL NULL 4 5 6 + +query II?I? +select unnest(column5), * except (column5, column1) from unnest_table; +---- +1 2 [7] 1 [13, 14] +3 4 [8, 9, 10] 2 [15, 16] +NULL NULL [11, 12] 3 NULL +7 8 [, 42, ] NULL NULL +NULL NULL NULL 4 [17, 18] + +query III +select unnest(u.column5), j.* except(column2, column3) from unnest_table u join join_table j on u.column3 = j.column1 +---- +1 2 1 +3 4 2 +NULL NULL 4 diff --git a/datafusion/substrait/src/logical_plan/consumer.rs b/datafusion/substrait/src/logical_plan/consumer.rs index e6bfc67eda81..030536f9f830 100644 --- a/datafusion/substrait/src/logical_plan/consumer.rs +++ b/datafusion/substrait/src/logical_plan/consumer.rs @@ -34,6 +34,7 @@ use datafusion::logical_expr::{ ExprSchemable, LogicalPlan, Operator, Projection, SortExpr, Values, }; use substrait::proto::expression::subquery::set_predicate::PredicateOp; +use substrait::proto::expression_reference::ExprType; use url::Url; use crate::extensions::Extensions; @@ -96,7 +97,7 @@ use substrait::proto::{ sort_field::{SortDirection, SortKind::*}, AggregateFunction, Expression, NamedStruct, Plan, Rel, Type, }; -use substrait::proto::{FunctionArgument, SortField}; +use substrait::proto::{ExtendedExpression, FunctionArgument, SortField}; // Substrait PrecisionTimestampTz indicates that the timestamp is relative to UTC, which // is the same as the expectation for any non-empty timezone in DF, so any non-empty timezone @@ -251,6 +252,81 @@ pub async fn from_substrait_plan( } } +/// An ExprContainer is a container for a collection of expressions with a common input schema +/// +/// In addition, each expression is associated with a field, which defines the +/// expression's output. The data type and nullability of the field are calculated from the +/// expression and the input schema. However the names of the field (and its nested fields) are +/// derived from the Substrait message. +pub struct ExprContainer { + /// The input schema for the expressions + pub input_schema: DFSchemaRef, + /// The expressions + /// + /// Each item contains an expression and the field that defines the expected nullability and name of the expr's output + pub exprs: Vec<(Expr, Field)>, +} + +/// Convert Substrait ExtendedExpression to ExprContainer +/// +/// A Substrait ExtendedExpression message contains one or more expressions, +/// with names for the outputs, and an input schema. These pieces are all included +/// in the ExprContainer. +/// +/// This is a top-level message and can be used to send expressions (not plans) +/// between systems. This is often useful for scenarios like pushdown where filter +/// expressions need to be sent to remote systems. +pub async fn from_substrait_extended_expr( + ctx: &SessionContext, + extended_expr: &ExtendedExpression, +) -> Result { + // Register function extension + let extensions = Extensions::try_from(&extended_expr.extensions)?; + if !extensions.type_variations.is_empty() { + return not_impl_err!("Type variation extensions are not supported"); + } + + let input_schema = DFSchemaRef::new(match &extended_expr.base_schema { + Some(base_schema) => from_substrait_named_struct(base_schema, &extensions), + None => { + plan_err!("required property `base_schema` missing from Substrait ExtendedExpression message") + } + }?); + + // Parse expressions + let mut exprs = Vec::with_capacity(extended_expr.referred_expr.len()); + for (expr_idx, substrait_expr) in extended_expr.referred_expr.iter().enumerate() { + let scalar_expr = match &substrait_expr.expr_type { + Some(ExprType::Expression(scalar_expr)) => Ok(scalar_expr), + Some(ExprType::Measure(_)) => { + not_impl_err!("Measure expressions are not yet supported") + } + None => { + plan_err!("required property `expr_type` missing from Substrait ExpressionReference message") + } + }?; + let expr = + from_substrait_rex(ctx, scalar_expr, &input_schema, &extensions).await?; + let (output_type, expected_nullability) = + expr.data_type_and_nullable(&input_schema)?; + let output_field = Field::new("", output_type, expected_nullability); + let mut names_idx = 0; + let output_field = rename_field( + &output_field, + &substrait_expr.output_names, + expr_idx, + &mut names_idx, + /*rename_self=*/ true, + )?; + exprs.push((expr, output_field)); + } + + Ok(ExprContainer { + input_schema, + exprs, + }) +} + /// parse projection pub fn extract_projection( t: LogicalPlan, @@ -334,6 +410,68 @@ fn rename_expressions( .collect() } +fn rename_field( + field: &Field, + dfs_names: &Vec, + unnamed_field_suffix: usize, // If Substrait doesn't provide a name, we'll use this "c{unnamed_field_suffix}" + name_idx: &mut usize, // Index into dfs_names + rename_self: bool, // Some fields (e.g. list items) don't have names in Substrait and this will be false to keep old name +) -> Result { + let name = if rename_self { + next_struct_field_name(unnamed_field_suffix, dfs_names, name_idx)? + } else { + field.name().to_string() + }; + match field.data_type() { + DataType::Struct(children) => { + let children = children + .iter() + .enumerate() + .map(|(child_idx, f)| { + rename_field( + f.as_ref(), + dfs_names, + child_idx, + name_idx, + /*rename_self=*/ true, + ) + }) + .collect::>()?; + Ok(field + .to_owned() + .with_name(name) + .with_data_type(DataType::Struct(children))) + } + DataType::List(inner) => { + let renamed_inner = rename_field( + inner.as_ref(), + dfs_names, + 0, + name_idx, + /*rename_self=*/ false, + )?; + Ok(field + .to_owned() + .with_data_type(DataType::List(FieldRef::new(renamed_inner))) + .with_name(name)) + } + DataType::LargeList(inner) => { + let renamed_inner = rename_field( + inner.as_ref(), + dfs_names, + 0, + name_idx, + /*rename_self= */ false, + )?; + Ok(field + .to_owned() + .with_data_type(DataType::LargeList(FieldRef::new(renamed_inner))) + .with_name(name)) + } + _ => Ok(field.to_owned().with_name(name)), + } +} + /// Produce a version of the given schema with names matching the given list of names. /// Substrait doesn't deal with column (incl. nested struct field) names within the schema, /// but it does give us the list of expected names at the end of the plan, so we use this @@ -342,59 +480,20 @@ fn make_renamed_schema( schema: &DFSchemaRef, dfs_names: &Vec, ) -> Result { - fn rename_inner_fields( - dtype: &DataType, - dfs_names: &Vec, - name_idx: &mut usize, - ) -> Result { - match dtype { - DataType::Struct(fields) => { - let fields = fields - .iter() - .map(|f| { - let name = next_struct_field_name(0, dfs_names, name_idx)?; - Ok((**f).to_owned().with_name(name).with_data_type( - rename_inner_fields(f.data_type(), dfs_names, name_idx)?, - )) - }) - .collect::>()?; - Ok(DataType::Struct(fields)) - } - DataType::List(inner) => Ok(DataType::List(FieldRef::new( - (**inner).to_owned().with_data_type(rename_inner_fields( - inner.data_type(), - dfs_names, - name_idx, - )?), - ))), - DataType::LargeList(inner) => Ok(DataType::LargeList(FieldRef::new( - (**inner).to_owned().with_data_type(rename_inner_fields( - inner.data_type(), - dfs_names, - name_idx, - )?), - ))), - _ => Ok(dtype.to_owned()), - } - } - let mut name_idx = 0; let (qualifiers, fields): (_, Vec) = schema .iter() - .map(|(q, f)| { - let name = next_struct_field_name(0, dfs_names, &mut name_idx)?; - Ok(( - q.cloned(), - (**f) - .to_owned() - .with_name(name) - .with_data_type(rename_inner_fields( - f.data_type(), - dfs_names, - &mut name_idx, - )?), - )) + .enumerate() + .map(|(field_idx, (q, f))| { + let renamed_f = rename_field( + f.as_ref(), + dfs_names, + field_idx, + &mut name_idx, + /*rename_self=*/ true, + )?; + Ok((q.cloned(), renamed_f)) }) .collect::>>()? .into_iter() @@ -1681,14 +1780,14 @@ fn from_substrait_struct_type( } fn next_struct_field_name( - i: usize, + column_idx: usize, dfs_names: &[String], name_idx: &mut usize, ) -> Result { if dfs_names.is_empty() { // If names are not given, create dummy names // c0, c1, ... align with e.g. SqlToRel::create_named_struct - Ok(format!("c{i}")) + Ok(format!("c{column_idx}")) } else { let name = dfs_names.get(*name_idx).cloned().ok_or_else(|| { substrait_datafusion_err!("Named schema must contain names for all fields") diff --git a/datafusion/substrait/src/logical_plan/producer.rs b/datafusion/substrait/src/logical_plan/producer.rs index fada827875b0..1165ce13d236 100644 --- a/datafusion/substrait/src/logical_plan/producer.rs +++ b/datafusion/substrait/src/logical_plan/producer.rs @@ -15,11 +15,11 @@ // specific language governing permissions and limitations // under the License. -use itertools::Itertools; use std::sync::Arc; +use substrait::proto::expression_reference::ExprType; use arrow_buffer::ToByteSlice; -use datafusion::arrow::datatypes::IntervalUnit; +use datafusion::arrow::datatypes::{Field, IntervalUnit}; use datafusion::logical_expr::{ CrossJoin, Distinct, Like, Partitioning, WindowFrameUnits, }; @@ -63,7 +63,9 @@ use substrait::proto::expression::window_function::BoundsType; use substrait::proto::read_rel::VirtualTable; use substrait::proto::rel_common::EmitKind; use substrait::proto::rel_common::EmitKind::Emit; -use substrait::proto::{rel_common, CrossRel, ExchangeRel, RelCommon}; +use substrait::proto::{ + rel_common, CrossRel, ExchangeRel, ExpressionReference, ExtendedExpression, RelCommon, +}; use substrait::{ proto::{ aggregate_function::AggregationInvocation, @@ -119,6 +121,56 @@ pub fn to_substrait_plan(plan: &LogicalPlan, ctx: &SessionContext) -> Result Result> { + let mut extensions = Extensions::default(); + + let substrait_exprs = exprs + .iter() + .map(|(expr, field)| { + let substrait_expr = to_substrait_rex( + ctx, + expr, + schema, + /*col_ref_offset=*/ 0, + &mut extensions, + )?; + let mut output_names = Vec::new(); + flatten_names(field, false, &mut output_names)?; + Ok(ExpressionReference { + output_names, + expr_type: Some(ExprType::Expression(substrait_expr)), + }) + }) + .collect::>>()?; + let substrait_schema = to_substrait_named_struct(schema, &mut extensions)?; + + Ok(Box::new(ExtendedExpression { + advanced_extensions: None, + expected_type_urls: vec![], + extension_uris: vec![], + extensions: extensions.into(), + version: Some(version::version_with_producer("datafusion")), + referred_expr: substrait_exprs, + base_schema: Some(substrait_schema), + })) +} + /// Convert DataFusion LogicalPlan to Substrait Rel pub fn to_substrait_rel( plan: &LogicalPlan, @@ -580,50 +632,43 @@ fn create_project_remapping(expr_count: usize, input_field_count: usize) -> Emit Emit(rel_common::Emit { output_mapping }) } +// Substrait wants a list of all field names, including nested fields from structs, +// also from within e.g. lists and maps. However, it does not want the list and map field names +// themselves - only proper structs fields are considered to have useful names. +fn flatten_names(field: &Field, skip_self: bool, names: &mut Vec) -> Result<()> { + if !skip_self { + names.push(field.name().to_string()); + } + match field.data_type() { + DataType::Struct(fields) => { + for field in fields { + flatten_names(field, false, names)?; + } + Ok(()) + } + DataType::List(l) => flatten_names(l, true, names), + DataType::LargeList(l) => flatten_names(l, true, names), + DataType::Map(m, _) => match m.data_type() { + DataType::Struct(key_and_value) if key_and_value.len() == 2 => { + flatten_names(&key_and_value[0], true, names)?; + flatten_names(&key_and_value[1], true, names) + } + _ => plan_err!("Map fields must contain a Struct with exactly 2 fields"), + }, + _ => Ok(()), + }?; + Ok(()) +} + fn to_substrait_named_struct( schema: &DFSchemaRef, extensions: &mut Extensions, ) -> Result { - // Substrait wants a list of all field names, including nested fields from structs, - // also from within e.g. lists and maps. However, it does not want the list and map field names - // themselves - only proper structs fields are considered to have useful names. - fn names_dfs(dtype: &DataType) -> Result> { - match dtype { - DataType::Struct(fields) => { - let mut names = Vec::new(); - for field in fields { - names.push(field.name().to_string()); - names.extend(names_dfs(field.data_type())?); - } - Ok(names) - } - DataType::List(l) => names_dfs(l.data_type()), - DataType::LargeList(l) => names_dfs(l.data_type()), - DataType::Map(m, _) => match m.data_type() { - DataType::Struct(key_and_value) if key_and_value.len() == 2 => { - let key_names = - names_dfs(key_and_value.first().unwrap().data_type())?; - let value_names = - names_dfs(key_and_value.last().unwrap().data_type())?; - Ok([key_names, value_names].concat()) - } - _ => plan_err!("Map fields must contain a Struct with exactly 2 fields"), - }, - _ => Ok(Vec::new()), - } + let mut names = Vec::with_capacity(schema.fields().len()); + for field in schema.fields() { + flatten_names(field, false, &mut names)?; } - let names = schema - .fields() - .iter() - .map(|f| { - let mut names = vec![f.name().to_string()]; - names.extend(names_dfs(f.data_type())?); - Ok(names) - }) - .flatten_ok() - .collect::>()?; - let field_types = r#type::Struct { types: schema .fields() @@ -2178,14 +2223,16 @@ fn substrait_field_ref(index: usize) -> Result { mod test { use super::*; use crate::logical_plan::consumer::{ - from_substrait_literal_without_names, from_substrait_type_without_names, + from_substrait_extended_expr, from_substrait_literal_without_names, + from_substrait_named_struct, from_substrait_type_without_names, }; use arrow_buffer::{IntervalDayTime, IntervalMonthDayNano}; use datafusion::arrow::array::{ GenericListArray, Int64Builder, MapBuilder, StringBuilder, }; - use datafusion::arrow::datatypes::Field; + use datafusion::arrow::datatypes::{Field, Fields, Schema}; use datafusion::common::scalar::ScalarStructBuilder; + use datafusion::common::DFSchema; use std::collections::HashMap; #[test] @@ -2461,4 +2508,101 @@ mod test { Ok(()) } + + #[test] + fn named_struct_names() -> Result<()> { + let mut extensions = Extensions::default(); + let schema = DFSchemaRef::new(DFSchema::try_from(Schema::new(vec![ + Field::new("int", DataType::Int32, true), + Field::new( + "struct", + DataType::Struct(Fields::from(vec![Field::new( + "inner", + DataType::List(Arc::new(Field::new("item", DataType::Utf8, true))), + true, + )])), + true, + ), + Field::new("trailer", DataType::Float64, true), + ]))?); + + let named_struct = to_substrait_named_struct(&schema, &mut extensions)?; + + // Struct field names should be flattened DFS style + // List field names should be omitted + assert_eq!( + named_struct.names, + vec!["int", "struct", "inner", "trailer"] + ); + + let roundtrip_schema = from_substrait_named_struct(&named_struct, &extensions)?; + assert_eq!(schema.as_ref(), &roundtrip_schema); + Ok(()) + } + + #[tokio::test] + async fn extended_expressions() -> Result<()> { + let ctx = SessionContext::new(); + + // One expression, empty input schema + let expr = Expr::Literal(ScalarValue::Int32(Some(42))); + let field = Field::new("out", DataType::Int32, false); + let empty_schema = DFSchemaRef::new(DFSchema::empty()); + let substrait = + to_substrait_extended_expr(&[(&expr, &field)], &empty_schema, &ctx)?; + let roundtrip_expr = from_substrait_extended_expr(&ctx, &substrait).await?; + + assert_eq!(roundtrip_expr.input_schema, empty_schema); + assert_eq!(roundtrip_expr.exprs.len(), 1); + + let (rt_expr, rt_field) = roundtrip_expr.exprs.first().unwrap(); + assert_eq!(rt_field, &field); + assert_eq!(rt_expr, &expr); + + // Multiple expressions, with column references + let expr1 = Expr::Column("c0".into()); + let expr2 = Expr::Column("c1".into()); + let out1 = Field::new("out1", DataType::Int32, true); + let out2 = Field::new("out2", DataType::Utf8, true); + let input_schema = DFSchemaRef::new(DFSchema::try_from(Schema::new(vec![ + Field::new("c0", DataType::Int32, true), + Field::new("c1", DataType::Utf8, true), + ]))?); + + let substrait = to_substrait_extended_expr( + &[(&expr1, &out1), (&expr2, &out2)], + &input_schema, + &ctx, + )?; + let roundtrip_expr = from_substrait_extended_expr(&ctx, &substrait).await?; + + assert_eq!(roundtrip_expr.input_schema, input_schema); + assert_eq!(roundtrip_expr.exprs.len(), 2); + + let mut exprs = roundtrip_expr.exprs.into_iter(); + + let (rt_expr, rt_field) = exprs.next().unwrap(); + assert_eq!(rt_field, out1); + assert_eq!(rt_expr, expr1); + + let (rt_expr, rt_field) = exprs.next().unwrap(); + assert_eq!(rt_field, out2); + assert_eq!(rt_expr, expr2); + + Ok(()) + } + + #[tokio::test] + async fn invalid_extended_expression() { + let ctx = SessionContext::new(); + + // Not ok if input schema is missing field referenced by expr + let expr = Expr::Column("missing".into()); + let field = Field::new("out", DataType::Int32, false); + let empty_schema = DFSchemaRef::new(DFSchema::empty()); + + let err = to_substrait_extended_expr(&[(&expr, &field)], &empty_schema, &ctx); + + assert!(matches!(err, Err(DataFusionError::SchemaError(_, _)))); + } } diff --git a/datafusion/substrait/tests/cases/roundtrip_logical_plan.rs b/datafusion/substrait/tests/cases/roundtrip_logical_plan.rs index f7686bec5435..ce6d1825cd25 100644 --- a/datafusion/substrait/tests/cases/roundtrip_logical_plan.rs +++ b/datafusion/substrait/tests/cases/roundtrip_logical_plan.rs @@ -149,6 +149,10 @@ impl UserDefinedLogicalNode for MockUserDefinedLogicalPlan { fn dyn_ord(&self, _: &dyn UserDefinedLogicalNode) -> Option { unimplemented!() } + + fn supports_limit_pushdown(&self) -> bool { + false // Disallow limit push-down by default + } } impl MockUserDefinedLogicalPlan { @@ -290,8 +294,9 @@ async fn aggregate_grouping_sets() -> Result<()> { async fn aggregate_grouping_rollup() -> Result<()> { assert_expected_plan( "SELECT a, c, e, avg(b) FROM data GROUP BY ROLLUP (a, c, e)", - "Aggregate: groupBy=[[GROUPING SETS ((data.a, data.c, data.e), (data.a, data.c), (data.a), ())]], aggr=[[avg(data.b)]]\ - \n TableScan: data projection=[a, b, c, e]", + "Projection: data.a, data.c, data.e, avg(data.b)\ + \n Aggregate: groupBy=[[GROUPING SETS ((data.a, data.c, data.e), (data.a, data.c), (data.a), ())]], aggr=[[avg(data.b)]]\ + \n TableScan: data projection=[a, b, c, e]", true ).await } diff --git a/dev/update_config_docs.sh b/dev/update_config_docs.sh index 836ba6772eac..585cb77839f9 100755 --- a/dev/update_config_docs.sh +++ b/dev/update_config_docs.sh @@ -24,7 +24,7 @@ SOURCE_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" cd "${SOURCE_DIR}/../" && pwd TARGET_FILE="docs/source/user-guide/configs.md" -PRINT_DOCS_COMMAND="cargo run --manifest-path datafusion/core/Cargo.toml --bin print_config_docs" +PRINT_CONFIG_DOCS_COMMAND="cargo run --manifest-path datafusion/core/Cargo.toml --bin print_config_docs" echo "Inserting header" cat <<'EOF' > "$TARGET_FILE" @@ -67,8 +67,8 @@ Environment variables are read during `SessionConfig` initialisation so they mus EOF -echo "Running CLI and inserting docs table" -$PRINT_DOCS_COMMAND >> "$TARGET_FILE" +echo "Running CLI and inserting config docs table" +$PRINT_CONFIG_DOCS_COMMAND >> "$TARGET_FILE" echo "Running prettier" npx prettier@2.3.2 --write "$TARGET_FILE" diff --git a/dev/update_function_docs.sh b/dev/update_function_docs.sh new file mode 100755 index 000000000000..a4236eefc8c8 --- /dev/null +++ b/dev/update_function_docs.sh @@ -0,0 +1,284 @@ +#!/bin/bash +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# + +set -e + +SOURCE_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +cd "${SOURCE_DIR}/../" && pwd + + +TARGET_FILE="docs/source/user-guide/sql/aggregate_functions_new.md" +PRINT_AGGREGATE_FUNCTION_DOCS_COMMAND="cargo run --manifest-path datafusion/core/Cargo.toml --bin print_functions_docs -- aggregate" + +echo "Inserting header" +cat <<'EOF' > "$TARGET_FILE" + + + + +# Aggregate Functions (NEW) + +This page is a WIP and will replace the Aggregate Functions page once completed. + +Aggregate functions operate on a set of values to compute a single result. +EOF + +echo "Running CLI and inserting aggregate function docs table" +$PRINT_AGGREGATE_FUNCTION_DOCS_COMMAND >> "$TARGET_FILE" + +echo "Running prettier" +npx prettier@2.3.2 --write "$TARGET_FILE" + +echo "'$TARGET_FILE' successfully updated!" + +TARGET_FILE="docs/source/user-guide/sql/scalar_functions_new.md" +PRINT_SCALAR_FUNCTION_DOCS_COMMAND="cargo run --manifest-path datafusion/core/Cargo.toml --bin print_functions_docs -- scalar" + +echo "Inserting header" +cat <<'EOF' > "$TARGET_FILE" + + + + +# Scalar Functions (NEW) + +This page is a WIP and will replace the Scalar Functions page once completed. +EOF + +echo "Running CLI and inserting scalar function docs table" +$PRINT_SCALAR_FUNCTION_DOCS_COMMAND >> "$TARGET_FILE" + +echo "Running prettier" +npx prettier@2.3.2 --write "$TARGET_FILE" + +echo "'$TARGET_FILE' successfully updated!" + +TARGET_FILE="docs/source/user-guide/sql/window_functions_new.md" +PRINT_WINDOW_FUNCTION_DOCS_COMMAND="cargo run --manifest-path datafusion/core/Cargo.toml --bin print_functions_docs -- window" + +echo "Inserting header" +cat <<'EOF' > "$TARGET_FILE" + + + + + +# Window Functions (NEW) + +This page is a WIP and will replace the Window Functions page once completed. + +A _window function_ performs a calculation across a set of table rows that are somehow related to the current row. This is comparable to the type of calculation that can be done with an aggregate function. However, window functions do not cause rows to become grouped into a single output row like non-window aggregate calls would. Instead, the rows retain their separate identities. Behind the scenes, the window function is able to access more than just the current row of the query result + +Here is an example that shows how to compare each employee's salary with the average salary in his or her department: + +```sql +SELECT depname, empno, salary, avg(salary) OVER (PARTITION BY depname) FROM empsalary; + ++-----------+-------+--------+-------------------+ +| depname | empno | salary | avg | ++-----------+-------+--------+-------------------+ +| personnel | 2 | 3900 | 3700.0 | +| personnel | 5 | 3500 | 3700.0 | +| develop | 8 | 6000 | 5020.0 | +| develop | 10 | 5200 | 5020.0 | +| develop | 11 | 5200 | 5020.0 | +| develop | 9 | 4500 | 5020.0 | +| develop | 7 | 4200 | 5020.0 | +| sales | 1 | 5000 | 4866.666666666667 | +| sales | 4 | 4800 | 4866.666666666667 | +| sales | 3 | 4800 | 4866.666666666667 | ++-----------+-------+--------+-------------------+ +``` + +A window function call always contains an OVER clause directly following the window function's name and argument(s). This is what syntactically distinguishes it from a normal function or non-window aggregate. The OVER clause determines exactly how the rows of the query are split up for processing by the window function. The PARTITION BY clause within OVER divides the rows into groups, or partitions, that share the same values of the PARTITION BY expression(s). For each row, the window function is computed across the rows that fall into the same partition as the current row. The previous example showed how to count the average of a column per partition. + +You can also control the order in which rows are processed by window functions using ORDER BY within OVER. (The window ORDER BY does not even have to match the order in which the rows are output.) Here is an example: + +```sql +SELECT depname, empno, salary, + rank() OVER (PARTITION BY depname ORDER BY salary DESC) +FROM empsalary; + ++-----------+-------+--------+--------+ +| depname | empno | salary | rank | ++-----------+-------+--------+--------+ +| personnel | 2 | 3900 | 1 | +| develop | 8 | 6000 | 1 | +| develop | 10 | 5200 | 2 | +| develop | 11 | 5200 | 2 | +| develop | 9 | 4500 | 4 | +| develop | 7 | 4200 | 5 | +| sales | 1 | 5000 | 1 | +| sales | 4 | 4800 | 2 | +| personnel | 5 | 3500 | 2 | +| sales | 3 | 4800 | 2 | ++-----------+-------+--------+--------+ +``` + +There is another important concept associated with window functions: for each row, there is a set of rows within its partition called its window frame. Some window functions act only on the rows of the window frame, rather than of the whole partition. Here is an example of using window frames in queries: + +```sql +SELECT depname, empno, salary, + avg(salary) OVER(ORDER BY salary ASC ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING) AS avg, + min(salary) OVER(ORDER BY empno ASC ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS cum_min +FROM empsalary +ORDER BY empno ASC; + ++-----------+-------+--------+--------------------+---------+ +| depname | empno | salary | avg | cum_min | ++-----------+-------+--------+--------------------+---------+ +| sales | 1 | 5000 | 5000.0 | 5000 | +| personnel | 2 | 3900 | 3866.6666666666665 | 3900 | +| sales | 3 | 4800 | 4700.0 | 3900 | +| sales | 4 | 4800 | 4866.666666666667 | 3900 | +| personnel | 5 | 3500 | 3700.0 | 3500 | +| develop | 7 | 4200 | 4200.0 | 3500 | +| develop | 8 | 6000 | 5600.0 | 3500 | +| develop | 9 | 4500 | 4500.0 | 3500 | +| develop | 10 | 5200 | 5133.333333333333 | 3500 | +| develop | 11 | 5200 | 5466.666666666667 | 3500 | ++-----------+-------+--------+--------------------+---------+ +``` + +When a query involves multiple window functions, it is possible to write out each one with a separate OVER clause, but this is duplicative and error-prone if the same windowing behavior is wanted for several functions. Instead, each windowing behavior can be named in a WINDOW clause and then referenced in OVER. For example: + +```sql +SELECT sum(salary) OVER w, avg(salary) OVER w +FROM empsalary +WINDOW w AS (PARTITION BY depname ORDER BY salary DESC); +``` + +## Syntax + +The syntax for the OVER-clause is + +``` +function([expr]) + OVER( + [PARTITION BY expr[, …]] + [ORDER BY expr [ ASC | DESC ][, …]] + [ frame_clause ] + ) +``` + +where **frame_clause** is one of: + +``` + { RANGE | ROWS | GROUPS } frame_start + { RANGE | ROWS | GROUPS } BETWEEN frame_start AND frame_end +``` + +and **frame_start** and **frame_end** can be one of + +```sql +UNBOUNDED PRECEDING +offset PRECEDING +CURRENT ROW +offset FOLLOWING +UNBOUNDED FOLLOWING +``` + +where **offset** is an non-negative integer. + +RANGE and GROUPS modes require an ORDER BY clause (with RANGE the ORDER BY must specify exactly one column). + +## Aggregate functions + +All [aggregate functions](aggregate_functions.md) can be used as window functions. + +EOF + +echo "Running CLI and inserting window function docs table" +$PRINT_WINDOW_FUNCTION_DOCS_COMMAND >> "$TARGET_FILE" + +echo "Running prettier" +npx prettier@2.3.2 --write "$TARGET_FILE" + +echo "'$TARGET_FILE' successfully updated!" + diff --git a/docs/source/index.rst b/docs/source/index.rst index 32a5dce323f2..f11670d259bf 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -34,7 +34,20 @@ Apache DataFusion DataFusion is an extensible query engine written in `Rust `_ that -uses `Apache Arrow `_ as its in-memory format. DataFusion's target users are +uses `Apache Arrow `_ as its in-memory format. + +This documentation is for the `core DataFusion project `_, which contains +libraries that are used to build data-centric system software. DataFusion also offers the following subprojects, which +provide packaged versions of DataFusion intended for end users, and these have separate documentation. + +- `DataFusion Python `_ offers a Python interface for SQL and DataFrame + queries. +- `DataFusion Ray `_ provides a distributed version of DataFusion + that scales out on `Ray `_ clusters. +- `DataFusion Comet `_ is an accelerator for Apache Spark based on + DataFusion. + +DataFusion's target users are developers building fast and feature rich database and analytic systems, customized to particular workloads. See `use cases `_ for examples. diff --git a/docs/source/user-guide/expressions.md b/docs/source/user-guide/expressions.md index c8f0ffbec701..ababb001f5c5 100644 --- a/docs/source/user-guide/expressions.md +++ b/docs/source/user-guide/expressions.md @@ -69,7 +69,7 @@ value ::: :::{note} -Since `&&` and `||` are existed as logical operators in Rust, but those are not overloadable and not works with expression API. +Since `&&` and `||` are logical operators in Rust and cannot be overloaded these are not available in the expression API. ::: ## Bitwise Expressions @@ -151,7 +151,7 @@ but these operators always return a `bool` which makes them not work with the ex | trunc(x) | truncate toward zero | :::{note} -Unlike to some databases the math functions in Datafusion works the same way as Rust math functions, avoiding failing on corner cases e.g +Unlike to some databases the math functions in Datafusion works the same way as Rust math functions, avoiding failing on corner cases e.g. ```sql select log(-1), log(0), sqrt(-1); diff --git a/docs/source/user-guide/sql/aggregate_functions.md b/docs/source/user-guide/sql/aggregate_functions.md index edb0e1d0c9f0..fe6a61e74e62 100644 --- a/docs/source/user-guide/sql/aggregate_functions.md +++ b/docs/source/user-guide/sql/aggregate_functions.md @@ -240,9 +240,6 @@ last_value(expression [ORDER BY expression]) - [stddev](#stddev) - [stddev_pop](#stddev_pop) - [stddev_samp](#stddev_samp) -- [var](#var) -- [var_pop](#var_pop) -- [var_samp](#var_samp) - [regr_avgx](#regr_avgx) - [regr_avgy](#regr_avgy) - [regr_count](#regr_count) @@ -349,45 +346,6 @@ stddev_samp(expression) #### Arguments -- **expression**: Expression to operate on. - Can be a constant, column, or function, and any combination of arithmetic operators. - -### `var` - -Returns the statistical variance of a set of numbers. - -``` -var(expression) -``` - -#### Arguments - -- **expression**: Expression to operate on. - Can be a constant, column, or function, and any combination of arithmetic operators. - -### `var_pop` - -Returns the statistical population variance of a set of numbers. - -``` -var_pop(expression) -``` - -#### Arguments - -- **expression**: Expression to operate on. - Can be a constant, column, or function, and any combination of arithmetic operators. - -### `var_samp` - -Returns the statistical sample variance of a set of numbers. - -``` -var_samp(expression) -``` - -#### Arguments - - **expression**: Expression to operate on. Can be a constant, column, or function, and any combination of arithmetic operators. diff --git a/docs/source/user-guide/sql/aggregate_functions_new.md b/docs/source/user-guide/sql/aggregate_functions_new.md new file mode 100644 index 000000000000..236ef57da502 --- /dev/null +++ b/docs/source/user-guide/sql/aggregate_functions_new.md @@ -0,0 +1,124 @@ + + + + +# Aggregate Functions (NEW) + +This page is a WIP and will replace the Aggregate Functions page once completed. + +Aggregate functions operate on a set of values to compute a single result. + +## General Functions + +- [bit_and](#bit_and) +- [bit_or](#bit_or) +- [bit_xor](#bit_xor) +- [var](#var) +- [var_pop](#var_pop) +- [var_population](#var_population) +- [var_samp](#var_samp) +- [var_sample](#var_sample) + +### `bit_and` + +Computes the bitwise AND of all non-null input values. + +``` +bit_and(expression) +``` + +#### Arguments + +- **expression**: Integer expression to operate on. Can be a constant, column, or function, and any combination of operators. + +### `bit_or` + +Computes the bitwise OR of all non-null input values. + +``` +bit_or(expression) +``` + +#### Arguments + +- **expression**: Integer expression to operate on. Can be a constant, column, or function, and any combination of operators. + +### `bit_xor` + +Computes the bitwise exclusive OR of all non-null input values. + +``` +bit_xor(expression) +``` + +#### Arguments + +- **expression**: Integer expression to operate on. Can be a constant, column, or function, and any combination of operators. + +### `var` + +Returns the statistical sample variance of a set of numbers. + +``` +var(expression) +``` + +#### Arguments + +- **expression**: Numeric expression to operate on. Can be a constant, column, or function, and any combination of operators. + +#### Aliases + +- var_sample +- var_samp + +### `var_pop` + +Returns the statistical population variance of a set of numbers. + +``` +var_pop(expression) +``` + +#### Arguments + +- **expression**: Numeric expression to operate on. Can be a constant, column, or function, and any combination of operators. + +#### Aliases + +- var_population + +### `var_population` + +_Alias of [var_pop](#var_pop)._ + +### `var_samp` + +_Alias of [var](#var)._ + +### `var_sample` + +_Alias of [var](#var)._ diff --git a/docs/source/user-guide/sql/data_types.md b/docs/source/user-guide/sql/data_types.md index 0e974550a84d..18c95cdea70e 100644 --- a/docs/source/user-guide/sql/data_types.md +++ b/docs/source/user-guide/sql/data_types.md @@ -97,7 +97,7 @@ select arrow_cast(now(), 'Timestamp(Second, None)'); | `BYTEA` | `Binary` | You can create binary literals using a hex string literal such as -`X'1234` to create a `Binary` value of two bytes, `0x12` and `0x34`. +`X'1234'` to create a `Binary` value of two bytes, `0x12` and `0x34`. ## Unsupported SQL Types diff --git a/docs/source/user-guide/sql/index.rst b/docs/source/user-guide/sql/index.rst index 04d1fc228f81..6eb451c83b96 100644 --- a/docs/source/user-guide/sql/index.rst +++ b/docs/source/user-guide/sql/index.rst @@ -30,7 +30,10 @@ SQL Reference information_schema operators aggregate_functions + aggregate_functions_new window_functions + window_functions_new scalar_functions + scalar_functions_new sql_status write_options diff --git a/docs/source/user-guide/sql/scalar_functions.md b/docs/source/user-guide/sql/scalar_functions.md index 469fb705b71f..3e481db90c22 100644 --- a/docs/source/user-guide/sql/scalar_functions.md +++ b/docs/source/user-guide/sql/scalar_functions.md @@ -644,618 +644,7 @@ _Alias of [nvl](#nvl)._ ## String Functions -- [ascii](#ascii) -- [bit_length](#bit_length) -- [btrim](#btrim) -- [char_length](#char_length) -- [character_length](#character_length) -- [concat](#concat) -- [concat_ws](#concat_ws) -- [chr](#chr) -- [ends_with](#ends_with) -- [initcap](#initcap) -- [instr](#instr) -- [left](#left) -- [length](#length) -- [lower](#lower) -- [lpad](#lpad) -- [ltrim](#ltrim) -- [octet_length](#octet_length) -- [repeat](#repeat) -- [replace](#replace) -- [reverse](#reverse) -- [right](#right) -- [rpad](#rpad) -- [rtrim](#rtrim) -- [split_part](#split_part) -- [starts_with](#starts_with) -- [strpos](#strpos) -- [substr](#substr) -- [to_hex](#to_hex) -- [translate](#translate) -- [trim](#trim) -- [upper](#upper) -- [uuid](#uuid) -- [overlay](#overlay) -- [levenshtein](#levenshtein) -- [substr_index](#substr_index) -- [find_in_set](#find_in_set) -- [position](#position) -- [contains](#contains) - -### `ascii` - -Returns the ASCII value of the first character in a string. - -``` -ascii(str) -``` - -#### Arguments - -- **str**: String expression to operate on. - Can be a constant, column, or function, and any combination of string operators. - -**Related functions**: -[chr](#chr) - -### `bit_length` - -Returns the bit length of a string. - -``` -bit_length(str) -``` - -#### Arguments - -- **str**: String expression to operate on. - Can be a constant, column, or function, and any combination of string operators. - -**Related functions**: -[length](#length), -[octet_length](#octet_length) - -### `btrim` - -Trims the specified trim string from the start and end of a string. -If no trim string is provided, all whitespace is removed from the start and end -of the input string. - -``` -btrim(str[, trim_str]) -``` - -#### Arguments - -- **str**: String expression to operate on. - Can be a constant, column, or function, and any combination of string operators. -- **trim_str**: String expression to trim from the beginning and end of the input string. - Can be a constant, column, or function, and any combination of arithmetic operators. - _Default is whitespace characters._ - -**Related functions**: -[ltrim](#ltrim), -[rtrim](#rtrim) - -#### Aliases - -- trim - -### `char_length` - -_Alias of [length](#length)._ - -### `character_length` - -_Alias of [length](#length)._ - -### `concat` - -Concatenates multiple strings together. - -``` -concat(str[, ..., str_n]) -``` - -#### Arguments - -- **str**: String expression to concatenate. - Can be a constant, column, or function, and any combination of string operators. -- **str_n**: Subsequent string column or literal string to concatenate. - -**Related functions**: -[concat_ws](#concat_ws) - -### `concat_ws` - -Concatenates multiple strings together with a specified separator. - -``` -concat_ws(separator, str[, ..., str_n]) -``` - -#### Arguments - -- **separator**: Separator to insert between concatenated strings. -- **str**: String expression to concatenate. - Can be a constant, column, or function, and any combination of string operators. -- **str_n**: Subsequent string column or literal string to concatenate. - -**Related functions**: -[concat](#concat) - -### `chr` - -Returns the character with the specified ASCII or Unicode code value. - -``` -chr(expression) -``` - -#### Arguments - -- **expression**: Expression containing the ASCII or Unicode code value to operate on. - Can be a constant, column, or function, and any combination of arithmetic or - string operators. - -**Related functions**: -[ascii](#ascii) - -### `ends_with` - -Tests if a string ends with a substring. - -``` -ends_with(str, substr) -``` - -#### Arguments - -- **str**: String expression to test. - Can be a constant, column, or function, and any combination of string operators. -- **substr**: Substring to test for. - -### `initcap` - -Capitalizes the first character in each word in the input string. -Words are delimited by non-alphanumeric characters. - -``` -initcap(str) -``` - -#### Arguments - -- **str**: String expression to operate on. - Can be a constant, column, or function, and any combination of string operators. - -**Related functions**: -[lower](#lower), -[upper](#upper) - -### `instr` - -_Alias of [strpos](#strpos)._ - -#### Arguments - -- **str**: String expression to operate on. - Can be a constant, column, or function, and any combination of string operators. -- **substr**: Substring expression to search for. - Can be a constant, column, or function, and any combination of string operators. - -### `left` - -Returns a specified number of characters from the left side of a string. - -``` -left(str, n) -``` - -#### Arguments - -- **str**: String expression to operate on. - Can be a constant, column, or function, and any combination of string operators. -- **n**: Number of characters to return. - -**Related functions**: -[right](#right) - -### `length` - -Returns the number of characters in a string. - -``` -length(str) -``` - -#### Arguments - -- **str**: String expression to operate on. - Can be a constant, column, or function, and any combination of string operators. - -#### Aliases - -- char_length -- character_length - -**Related functions**: -[bit_length](#bit_length), -[octet_length](#octet_length) - -### `lower` - -Converts a string to lower-case. - -``` -lower(str) -``` - -#### Arguments - -- **str**: String expression to operate on. - Can be a constant, column, or function, and any combination of string operators. - -**Related functions**: -[initcap](#initcap), -[upper](#upper) - -### `lpad` - -Pads the left side of a string with another string to a specified string length. - -``` -lpad(str, n[, padding_str]) -``` - -#### Arguments - -- **str**: String expression to operate on. - Can be a constant, column, or function, and any combination of string operators. -- **n**: String length to pad to. -- **padding_str**: String expression to pad with. - Can be a constant, column, or function, and any combination of string operators. - _Default is a space._ - -**Related functions**: -[rpad](#rpad) - -### `ltrim` - -Trims the specified trim string from the beginning of a string. -If no trim string is provided, all whitespace is removed from the start -of the input string. - -``` -ltrim(str[, trim_str]) -``` - -#### Arguments - -- **str**: String expression to operate on. - Can be a constant, column, or function, and any combination of string operators. -- **trim_str**: String expression to trim from the beginning of the input string. - Can be a constant, column, or function, and any combination of arithmetic operators. - _Default is whitespace characters._ - -**Related functions**: -[btrim](#btrim), -[rtrim](#rtrim) - -### `octet_length` - -Returns the length of a string in bytes. - -``` -octet_length(str) -``` - -#### Arguments - -- **str**: String expression to operate on. - Can be a constant, column, or function, and any combination of string operators. - -**Related functions**: -[bit_length](#bit_length), -[length](#length) - -### `repeat` - -Returns a string with an input string repeated a specified number. - -``` -repeat(str, n) -``` - -#### Arguments - -- **str**: String expression to repeat. - Can be a constant, column, or function, and any combination of string operators. -- **n**: Number of times to repeat the input string. - -### `replace` - -Replaces all occurrences of a specified substring in a string with a new substring. - -``` -replace(str, substr, replacement) -``` - -#### Arguments - -- **str**: String expression to repeat. - Can be a constant, column, or function, and any combination of string operators. -- **substr**: Substring expression to replace in the input string. - Can be a constant, column, or function, and any combination of string operators. -- **replacement**: Replacement substring expression. - Can be a constant, column, or function, and any combination of string operators. - -### `reverse` - -Reverses the character order of a string. - -``` -reverse(str) -``` - -#### Arguments - -- **str**: String expression to repeat. - Can be a constant, column, or function, and any combination of string operators. - -### `right` - -Returns a specified number of characters from the right side of a string. - -``` -right(str, n) -``` - -#### Arguments - -- **str**: String expression to operate on. - Can be a constant, column, or function, and any combination of string operators. -- **n**: Number of characters to return. - -**Related functions**: -[left](#left) - -### `rpad` - -Pads the right side of a string with another string to a specified string length. - -``` -rpad(str, n[, padding_str]) -``` - -#### Arguments - -- **str**: String expression to operate on. - Can be a constant, column, or function, and any combination of string operators. -- **n**: String length to pad to. -- **padding_str**: String expression to pad with. - Can be a constant, column, or function, and any combination of string operators. - _Default is a space._ - -**Related functions**: -[lpad](#lpad) - -### `rtrim` - -Trims the specified trim string from the end of a string. -If no trim string is provided, all whitespace is removed from the end -of the input string. - -``` -rtrim(str[, trim_str]) -``` - -#### Arguments - -- **str**: String expression to operate on. - Can be a constant, column, or function, and any combination of string operators. -- **trim_str**: String expression to trim from the end of the input string. - Can be a constant, column, or function, and any combination of arithmetic operators. - _Default is whitespace characters._ - -**Related functions**: -[btrim](#btrim), -[ltrim](#ltrim) - -### `split_part` - -Splits a string based on a specified delimiter and returns the substring in the -specified position. - -``` -split_part(str, delimiter, pos) -``` - -#### Arguments - -- **str**: String expression to spit. - Can be a constant, column, or function, and any combination of string operators. -- **delimiter**: String or character to split on. -- **pos**: Position of the part to return. - -### `starts_with` - -Tests if a string starts with a substring. - -``` -starts_with(str, substr) -``` - -#### Arguments - -- **str**: String expression to test. - Can be a constant, column, or function, and any combination of string operators. -- **substr**: Substring to test for. - -### `strpos` - -Returns the starting position of a specified substring in a string. -Positions begin at 1. -If the substring does not exist in the string, the function returns 0. - -``` -strpos(str, substr) -``` - -#### Arguments - -- **str**: String expression to operate on. - Can be a constant, column, or function, and any combination of string operators. -- **substr**: Substring expression to search for. - Can be a constant, column, or function, and any combination of string operators. - -#### Aliases - -- instr - -### `substr` - -Extracts a substring of a specified number of characters from a specific -starting position in a string. - -``` -substr(str, start_pos[, length]) -``` - -#### Arguments - -- **str**: String expression to operate on. - Can be a constant, column, or function, and any combination of string operators. -- **start_pos**: Character position to start the substring at. - The first character in the string has a position of 1. -- **length**: Number of characters to extract. - If not specified, returns the rest of the string after the start position. - -#### Aliases - -- substring - -### `substring` - -_Alias of [substr](#substr)._ - -### `translate` - -Translates characters in a string to specified translation characters. - -``` -translate(str, chars, translation) -``` - -- **str**: String expression to operate on. - Can be a constant, column, or function, and any combination of string operators. -- **chars**: Characters to translate. -- **translation**: Translation characters. Translation characters replace only - characters at the same position in the **chars** string. - -### `to_hex` - -Converts an integer to a hexadecimal string. - -``` -to_hex(int) -``` - -#### Arguments - -- **int**: Integer expression to convert. - Can be a constant, column, or function, and any combination of arithmetic operators. - -### `trim` - -_Alias of [btrim](#btrim)._ - -### `upper` - -Converts a string to upper-case. - -``` -upper(str) -``` - -#### Arguments - -- **str**: String expression to operate on. - Can be a constant, column, or function, and any combination of string operators. - -**Related functions**: -[initcap](#initcap), -[lower](#lower) - -### `uuid` - -Returns UUID v4 string value which is unique per row. - -``` -uuid() -``` - -### `overlay` - -Returns the string which is replaced by another string from the specified position and specified count length. -For example, `overlay('Txxxxas' placing 'hom' from 2 for 4) → Thomas` - -``` -overlay(str PLACING substr FROM pos [FOR count]) -``` - -#### Arguments - -- **str**: String expression to operate on. -- **substr**: the string to replace part of str. -- **pos**: the start position to replace of str. -- **count**: the count of characters to be replaced from start position of str. If not specified, will use substr length instead. - -### `levenshtein` - -Returns the Levenshtein distance between the two given strings. -For example, `levenshtein('kitten', 'sitting') = 3` - -``` -levenshtein(str1, str2) -``` - -#### Arguments - -- **str1**: String expression to compute Levenshtein distance with str2. -- **str2**: String expression to compute Levenshtein distance with str1. - -### `substr_index` - -Returns the substring from str before count occurrences of the delimiter delim. -If count is positive, everything to the left of the final delimiter (counting from the left) is returned. -If count is negative, everything to the right of the final delimiter (counting from the right) is returned. -For example, `substr_index('www.apache.org', '.', 1) = www`, `substr_index('www.apache.org', '.', -1) = org` - -``` -substr_index(str, delim, count) -``` - -#### Arguments - -- **str**: String expression to operate on. -- **delim**: the string to find in str to split str. -- **count**: The number of times to search for the delimiter. Can be both a positive or negative number. - -### `find_in_set` - -Returns a value in the range of 1 to N if the string str is in the string list strlist consisting of N substrings. -For example, `find_in_set('b', 'a,b,c,d') = 2` - -``` -find_in_set(str, strlist) -``` - -#### Arguments - -- **str**: String expression to find in strlist. -- **strlist**: A string list is a string composed of substrings separated by , characters. +See the new documentation [`here`](https://datafusion.apache.org/user-guide/sql/scalar_functions_new.html) ## Binary String Functions @@ -1452,19 +841,6 @@ position(substr in origstr) - **substr**: The pattern string. - **origstr**: The model string. -### `contains` - -Return true if search_string is found within string (case-sensitive). - -``` -contains(string, search_string) -``` - -#### Arguments - -- **string**: The pattern string. -- **search_string**: The model string. - ## Time and Date Functions - [now](#now) diff --git a/docs/source/user-guide/sql/scalar_functions_new.md b/docs/source/user-guide/sql/scalar_functions_new.md new file mode 100644 index 000000000000..2423f9c4757d --- /dev/null +++ b/docs/source/user-guide/sql/scalar_functions_new.md @@ -0,0 +1,1172 @@ + + + + +# Scalar Functions (NEW) + +This page is a WIP and will replace the Scalar Functions page once completed. + +## Math Functions + +- [log](#log) + +### `log` + +Returns the base-x logarithm of a number. Can either provide a specified base, or if omitted then takes the base-10 of a number. + +``` +log(base, numeric_expression) +log(numeric_expression) +``` + +#### Arguments + +- **base**: Base numeric expression to operate on. Can be a constant, column, or function, and any combination of operators. +- **numeric_expression**: Numeric expression to operate on. Can be a constant, column, or function, and any combination of operators. + +## Conditional Functions + +- [coalesce](#coalesce) + +### `coalesce` + +Returns the first of its arguments that is not _null_. Returns _null_ if all arguments are _null_. This function is often used to substitute a default value for _null_ values. + +``` +coalesce(expression1[, ..., expression_n]) +``` + +#### Arguments + +- **expression1, expression_n**: Expression to use if previous expressions are _null_. Can be a constant, column, or function, and any combination of arithmetic operators. Pass as many expression arguments as necessary. + +## String Functions + +- [ascii](#ascii) +- [bit_length](#bit_length) +- [btrim](#btrim) +- [char_length](#char_length) +- [character_length](#character_length) +- [chr](#chr) +- [concat](#concat) +- [concat_ws](#concat_ws) +- [contains](#contains) +- [ends_with](#ends_with) +- [find_in_set](#find_in_set) +- [initcap](#initcap) +- [instr](#instr) +- [left](#left) +- [length](#length) +- [levenshtein](#levenshtein) +- [lower](#lower) +- [lpad](#lpad) +- [ltrim](#ltrim) +- [octet_length](#octet_length) +- [position](#position) +- [repeat](#repeat) +- [replace](#replace) +- [reverse](#reverse) +- [right](#right) +- [rpad](#rpad) +- [rtrim](#rtrim) +- [split_part](#split_part) +- [starts_with](#starts_with) +- [strpos](#strpos) +- [substr](#substr) +- [substr_index](#substr_index) +- [substring](#substring) +- [substring_index](#substring_index) +- [to_hex](#to_hex) +- [translate](#translate) +- [trim](#trim) +- [upper](#upper) +- [uuid](#uuid) + +### `ascii` + +Returns the Unicode character code of the first character in a string. + +``` +ascii(str) +``` + +#### Arguments + +- **str**: String expression to operate on. Can be a constant, column, or function, and any combination of operators. + +#### Example + +```sql +> select ascii('abc'); ++--------------------+ +| ascii(Utf8("abc")) | ++--------------------+ +| 97 | ++--------------------+ +> select ascii('🚀'); ++-------------------+ +| ascii(Utf8("🚀")) | ++-------------------+ +| 128640 | ++-------------------+ +``` + +**Related functions**: + +- [chr](#chr) + +### `bit_length` + +Returns the bit length of a string. + +``` +bit_length(str) +``` + +#### Arguments + +- **str**: String expression to operate on. Can be a constant, column, or function, and any combination of operators. + +#### Example + +```sql +> select bit_length('datafusion'); ++--------------------------------+ +| bit_length(Utf8("datafusion")) | ++--------------------------------+ +| 80 | ++--------------------------------+ +``` + +**Related functions**: + +- [length](#length) +- [octet_length](#octet_length) + +### `btrim` + +Trims the specified trim string from the start and end of a string. If no trim string is provided, all whitespace is removed from the start and end of the input string. + +``` +btrim(str[, trim_str]) +``` + +#### Arguments + +- **str**: String expression to operate on. Can be a constant, column, or function, and any combination of operators. +- **trim_str**: String expression to operate on. Can be a constant, column, or function, and any combination of operators. _Default is whitespace characters._ + +#### Example + +```sql +> select btrim('__datafusion____', '_'); ++-------------------------------------------+ +| btrim(Utf8("__datafusion____"),Utf8("_")) | ++-------------------------------------------+ +| datafusion | ++-------------------------------------------+ +``` + +#### Aliases + +- trim + +**Related functions**: + +- [ltrim](#ltrim) +- [rtrim](#rtrim) + +### `char_length` + +_Alias of [character_length](#character_length)._ + +### `character_length` + +Returns the number of characters in a string. + +``` +character_length(str) +``` + +#### Arguments + +- **str**: String expression to operate on. Can be a constant, column, or function, and any combination of operators. + +#### Example + +```sql +> select character_length('Ångström'); ++------------------------------------+ +| character_length(Utf8("Ångström")) | ++------------------------------------+ +| 8 | ++------------------------------------+ +``` + +#### Aliases + +- length +- char_length + +**Related functions**: + +- [bit_length](#bit_length) +- [octet_length](#octet_length) + +### `chr` + +Returns the character with the specified ASCII or Unicode code value. + +``` +chr(expression) +``` + +#### Arguments + +- **expression**: String expression to operate on. Can be a constant, column, or function, and any combination of operators. + +#### Example + +```sql +> select chr(128640); ++--------------------+ +| chr(Int64(128640)) | ++--------------------+ +| 🚀 | ++--------------------+ +``` + +**Related functions**: + +- [ascii](#ascii) + +### `concat` + +Concatenates multiple strings together. + +``` +concat(str[, ..., str_n]) +``` + +#### Arguments + +- **str**: String expression to operate on. Can be a constant, column, or function, and any combination of operators. +- **str_n**: Subsequent string expressions to concatenate. + +#### Example + +```sql +> select concat('data', 'f', 'us', 'ion'); ++-------------------------------------------------------+ +| concat(Utf8("data"),Utf8("f"),Utf8("us"),Utf8("ion")) | ++-------------------------------------------------------+ +| datafusion | ++-------------------------------------------------------+ +``` + +**Related functions**: + +- [concat_ws](#concat_ws) + +### `concat_ws` + +Concatenates multiple strings together with a specified separator. + +``` +concat_ws(separator, str[, ..., str_n]) +``` + +#### Arguments + +- **separator**: Separator to insert between concatenated strings. +- **str**: String expression to operate on. Can be a constant, column, or function, and any combination of operators. +- **str_n**: Subsequent string expressions to concatenate. expression to operate on. Can be a constant, column, or function, and any combination of operators. + +#### Example + +```sql +> select concat_ws('_', 'data', 'fusion'); ++--------------------------------------------------+ +| concat_ws(Utf8("_"),Utf8("data"),Utf8("fusion")) | ++--------------------------------------------------+ +| data_fusion | ++--------------------------------------------------+ +``` + +**Related functions**: + +- [concat](#concat) + +### `contains` + +Return true if search_str is found within string (case-sensitive). + +``` +contains(str, search_str) +``` + +#### Arguments + +- **str**: String expression to operate on. Can be a constant, column, or function, and any combination of operators. +- **search_str**: The string to search for in str. + +#### Example + +```sql +> select contains('the quick brown fox', 'row'); ++---------------------------------------------------+ +| contains(Utf8("the quick brown fox"),Utf8("row")) | ++---------------------------------------------------+ +| true | ++---------------------------------------------------+ +``` + +### `ends_with` + +Tests if a string ends with a substring. + +``` +ends_with(str, substr) +``` + +#### Arguments + +- **str**: String expression to operate on. Can be a constant, column, or function, and any combination of operators. +- **substr**: Substring to test for. + +#### Example + +```sql +> select ends_with('datafusion', 'soin'); ++--------------------------------------------+ +| ends_with(Utf8("datafusion"),Utf8("soin")) | ++--------------------------------------------+ +| false | ++--------------------------------------------+ +> select ends_with('datafusion', 'sion'); ++--------------------------------------------+ +| ends_with(Utf8("datafusion"),Utf8("sion")) | ++--------------------------------------------+ +| true | ++--------------------------------------------+ +``` + +### `find_in_set` + +Returns a value in the range of 1 to N if the string str is in the string list strlist consisting of N substrings. + +``` +find_in_set(str, strlist) +``` + +#### Arguments + +- **str**: String expression to find in strlist. +- **strlist**: A string list is a string composed of substrings separated by , characters. + +#### Example + +```sql +> select find_in_set('b', 'a,b,c,d'); ++----------------------------------------+ +| find_in_set(Utf8("b"),Utf8("a,b,c,d")) | ++----------------------------------------+ +| 2 | ++----------------------------------------+ +``` + +### `initcap` + +Capitalizes the first character in each word in the input string. Words are delimited by non-alphanumeric characters. + +``` +initcap(str) +``` + +#### Arguments + +- **str**: String expression to operate on. Can be a constant, column, or function, and any combination of operators. + +#### Example + +```sql +> select initcap('apache datafusion'); ++------------------------------------+ +| initcap(Utf8("apache datafusion")) | ++------------------------------------+ +| Apache Datafusion | ++------------------------------------+ +``` + +**Related functions**: + +- [lower](#lower) +- [upper](#upper) + +### `instr` + +_Alias of [strpos](#strpos)._ + +### `left` + +Returns a specified number of characters from the left side of a string. + +``` +left(str, n) +``` + +#### Arguments + +- **str**: String expression to operate on. Can be a constant, column, or function, and any combination of operators. +- **n**: Number of characters to return. + +#### Example + +```sql +> select left('datafusion', 4); ++-----------------------------------+ +| left(Utf8("datafusion"),Int64(4)) | ++-----------------------------------+ +| data | ++-----------------------------------+ +``` + +**Related functions**: + +- [right](#right) + +### `length` + +_Alias of [character_length](#character_length)._ + +### `levenshtein` + +Returns the [`Levenshtein distance`](https://en.wikipedia.org/wiki/Levenshtein_distance) between the two given strings. + +``` +levenshtein(str1, str2) +``` + +#### Arguments + +- **str1**: String expression to compute Levenshtein distance with str2. +- **str2**: String expression to compute Levenshtein distance with str1. + +#### Example + +```sql +> select levenshtein('kitten', 'sitting'); ++---------------------------------------------+ +| levenshtein(Utf8("kitten"),Utf8("sitting")) | ++---------------------------------------------+ +| 3 | ++---------------------------------------------+ +``` + +### `lower` + +Converts a string to lower-case. + +``` +lower(str) +``` + +#### Arguments + +- **str**: String expression to operate on. Can be a constant, column, or function, and any combination of operators. + +#### Example + +```sql +> select lower('Ångström'); ++-------------------------+ +| lower(Utf8("Ångström")) | ++-------------------------+ +| ångström | ++-------------------------+ +``` + +**Related functions**: + +- [initcap](#initcap) +- [upper](#upper) + +### `lpad` + +Pads the left side of a string with another string to a specified string length. + +``` +lpad(str, n[, padding_str]) +``` + +#### Arguments + +- **str**: String expression to operate on. Can be a constant, column, or function, and any combination of operators. +- **n**: String length to pad to. +- **padding_str**: Optional string expression to pad with. Can be a constant, column, or function, and any combination of string operators. _Default is a space._ + +#### Example + +```sql +> select lpad('Dolly', 10, 'hello'); ++---------------------------------------------+ +| lpad(Utf8("Dolly"),Int64(10),Utf8("hello")) | ++---------------------------------------------+ +| helloDolly | ++---------------------------------------------+ +``` + +**Related functions**: + +- [rpad](#rpad) + +### `ltrim` + +Trims the specified trim string from the beginning of a string. If no trim string is provided, all whitespace is removed from the start of the input string. + +``` +ltrim(str[, trim_str]) +``` + +#### Arguments + +- **str**: String expression to operate on. Can be a constant, column, or function, and any combination of operators. +- **trim_str**: String expression to trim from the beginning of the input string. Can be a constant, column, or function, and any combination of arithmetic operators. _Default is whitespace characters._ + +#### Example + +```sql +> select ltrim(' datafusion '); ++-------------------------------+ +| ltrim(Utf8(" datafusion ")) | ++-------------------------------+ +| datafusion | ++-------------------------------+ +> select ltrim('___datafusion___', '_'); ++-------------------------------------------+ +| ltrim(Utf8("___datafusion___"),Utf8("_")) | ++-------------------------------------------+ +| datafusion___ | ++-------------------------------------------+ +``` + +**Related functions**: + +- [btrim](#btrim) +- [rtrim](#rtrim) + +### `octet_length` + +Returns the length of a string in bytes. + +``` +octet_length(str) +``` + +#### Arguments + +- **str**: String expression to operate on. Can be a constant, column, or function, and any combination of operators. + +#### Example + +```sql +> select octet_length('Ångström'); ++--------------------------------+ +| octet_length(Utf8("Ångström")) | ++--------------------------------+ +| 10 | ++--------------------------------+ +``` + +**Related functions**: + +- [bit_length](#bit_length) +- [length](#length) + +### `position` + +_Alias of [strpos](#strpos)._ + +### `repeat` + +Returns a string with an input string repeated a specified number. + +``` +repeat(str, n) +``` + +#### Arguments + +- **str**: String expression to operate on. Can be a constant, column, or function, and any combination of operators. +- **n**: Number of times to repeat the input string. + +#### Example + +```sql +> select repeat('data', 3); ++-------------------------------+ +| repeat(Utf8("data"),Int64(3)) | ++-------------------------------+ +| datadatadata | ++-------------------------------+ +``` + +### `replace` + +Replaces all occurrences of a specified substring in a string with a new substring. + +``` +replace(str, substr, replacement) +``` + +#### Arguments + +- **str**: String expression to operate on. Can be a constant, column, or function, and any combination of operators. +- **substr**: Substring expression to replace in the input string. Substring expression expression to operate on. Can be a constant, column, or function, and any combination of operators. +- **replacement**: Replacement substring expression to operate on. Can be a constant, column, or function, and any combination of operators. + +#### Example + +```sql +> select replace('ABabbaBA', 'ab', 'cd'); ++-------------------------------------------------+ +| replace(Utf8("ABabbaBA"),Utf8("ab"),Utf8("cd")) | ++-------------------------------------------------+ +| ABcdbaBA | ++-------------------------------------------------+ +``` + +### `reverse` + +Reverses the character order of a string. + +``` +reverse(str) +``` + +#### Arguments + +- **str**: String expression to operate on. Can be a constant, column, or function, and any combination of operators. + +#### Example + +```sql +> select reverse('datafusion'); ++-----------------------------+ +| reverse(Utf8("datafusion")) | ++-----------------------------+ +| noisufatad | ++-----------------------------+ +``` + +### `right` + +Returns a specified number of characters from the right side of a string. + +``` +right(str, n) +``` + +#### Arguments + +- **str**: String expression to operate on. Can be a constant, column, or function, and any combination of operators. +- **n**: Number of characters to return + +#### Example + +```sql +> select right('datafusion', 6); ++------------------------------------+ +| right(Utf8("datafusion"),Int64(6)) | ++------------------------------------+ +| fusion | ++------------------------------------+ +``` + +**Related functions**: + +- [left](#left) + +### `rpad` + +Pads the right side of a string with another string to a specified string length. + +``` +rpad(str, n[, padding_str]) +``` + +#### Arguments + +- **str**: String expression to operate on. Can be a constant, column, or function, and any combination of operators. +- **n**: String length to pad to. +- **padding_str**: String expression to pad with. Can be a constant, column, or function, and any combination of string operators. _Default is a space._ + +#### Example + +```sql +> select rpad('datafusion', 20, '_-'); ++-----------------------------------------------+ +| rpad(Utf8("datafusion"),Int64(20),Utf8("_-")) | ++-----------------------------------------------+ +| datafusion_-_-_-_-_- | ++-----------------------------------------------+ +``` + +**Related functions**: + +- [lpad](#lpad) + +### `rtrim` + +Trims the specified trim string from the end of a string. If no trim string is provided, all whitespace is removed from the end of the input string. + +``` +rtrim(str[, trim_str]) +``` + +#### Arguments + +- **str**: String expression to operate on. Can be a constant, column, or function, and any combination of operators. +- **trim_str**: String expression to trim from the end of the input string. Can be a constant, column, or function, and any combination of arithmetic operators. _Default is whitespace characters._ + +#### Example + +```sql +> select rtrim(' datafusion '); ++-------------------------------+ +| rtrim(Utf8(" datafusion ")) | ++-------------------------------+ +| datafusion | ++-------------------------------+ +> select rtrim('___datafusion___', '_'); ++-------------------------------------------+ +| rtrim(Utf8("___datafusion___"),Utf8("_")) | ++-------------------------------------------+ +| ___datafusion | ++-------------------------------------------+ +``` + +**Related functions**: + +- [btrim](#btrim) +- [ltrim](#ltrim) + +### `split_part` + +Splits a string based on a specified delimiter and returns the substring in the specified position. + +``` +split_part(str, delimiter, pos) +``` + +#### Arguments + +- **str**: String expression to operate on. Can be a constant, column, or function, and any combination of operators. +- **delimiter**: String or character to split on. +- **pos**: Position of the part to return. + +#### Example + +```sql +> select split_part('1.2.3.4.5', '.', 3); ++--------------------------------------------------+ +| split_part(Utf8("1.2.3.4.5"),Utf8("."),Int64(3)) | ++--------------------------------------------------+ +| 3 | ++--------------------------------------------------+ +``` + +### `starts_with` + +Tests if a string starts with a substring. + +``` +starts_with(str, substr) +``` + +#### Arguments + +- **str**: String expression to operate on. Can be a constant, column, or function, and any combination of operators. +- **substr**: Substring to test for. + +#### Example + +```sql +> select starts_with('datafusion','data'); ++----------------------------------------------+ +| starts_with(Utf8("datafusion"),Utf8("data")) | ++----------------------------------------------+ +| true | ++----------------------------------------------+ +``` + +### `strpos` + +Returns the starting position of a specified substring in a string. Positions begin at 1. If the substring does not exist in the string, the function returns 0. + +``` +strpos(str, substr) +``` + +#### Arguments + +- **str**: String expression to operate on. Can be a constant, column, or function, and any combination of operators. +- **substr**: Substring expression to search for. + +#### Example + +```sql +> select strpos('datafusion', 'fus'); ++----------------------------------------+ +| strpos(Utf8("datafusion"),Utf8("fus")) | ++----------------------------------------+ +| 5 | ++----------------------------------------+ +``` + +#### Aliases + +- instr +- position + +### `substr` + +Extracts a substring of a specified number of characters from a specific starting position in a string. + +``` +substr(str, start_pos[, length]) +``` + +#### Arguments + +- **str**: String expression to operate on. Can be a constant, column, or function, and any combination of operators. +- **start_pos**: Character position to start the substring at. The first character in the string has a position of 1. +- **length**: Number of characters to extract. If not specified, returns the rest of the string after the start position. + +#### Example + +```sql +> select substr('datafusion', 5, 3); ++----------------------------------------------+ +| substr(Utf8("datafusion"),Int64(5),Int64(3)) | ++----------------------------------------------+ +| fus | ++----------------------------------------------+ +``` + +#### Aliases + +- substring + +### `substr_index` + +Returns the substring from str before count occurrences of the delimiter delim. +If count is positive, everything to the left of the final delimiter (counting from the left) is returned. +If count is negative, everything to the right of the final delimiter (counting from the right) is returned. + +``` +substr_index(str, delim, count) +``` + +#### Arguments + +- **str**: String expression to operate on. Can be a constant, column, or function, and any combination of operators. +- **delim**: The string to find in str to split str. +- **count**: The number of times to search for the delimiter. Can be either a positive or negative number. + +#### Example + +```sql +> select substr_index('www.apache.org', '.', 1); ++---------------------------------------------------------+ +| substr_index(Utf8("www.apache.org"),Utf8("."),Int64(1)) | ++---------------------------------------------------------+ +| www | ++---------------------------------------------------------+ +> select substr_index('www.apache.org', '.', -1); ++----------------------------------------------------------+ +| substr_index(Utf8("www.apache.org"),Utf8("."),Int64(-1)) | ++----------------------------------------------------------+ +| org | ++----------------------------------------------------------+ +``` + +#### Aliases + +- substring_index + +### `substring` + +_Alias of [substr](#substr)._ + +### `substring_index` + +_Alias of [substr_index](#substr_index)._ + +### `to_hex` + +Converts an integer to a hexadecimal string. + +``` +to_hex(int) +``` + +#### Arguments + +- **int**: Integer expression to operate on. Can be a constant, column, or function, and any combination of operators. + +#### Example + +```sql +> select to_hex(12345689); ++-------------------------+ +| to_hex(Int64(12345689)) | ++-------------------------+ +| bc6159 | ++-------------------------+ +``` + +### `translate` + +Translates characters in a string to specified translation characters. + +``` +translate(str, chars, translation) +``` + +#### Arguments + +- **str**: String expression to operate on. Can be a constant, column, or function, and any combination of operators. +- **chars**: Characters to translate. +- **translation**: Translation characters. Translation characters replace only characters at the same position in the **chars** string. + +#### Example + +```sql +> select translate('twice', 'wic', 'her'); ++--------------------------------------------------+ +| translate(Utf8("twice"),Utf8("wic"),Utf8("her")) | ++--------------------------------------------------+ +| there | ++--------------------------------------------------+ +``` + +### `trim` + +_Alias of [btrim](#btrim)._ + +### `upper` + +Converts a string to upper-case. + +``` +upper(str) +``` + +#### Arguments + +- **str**: String expression to operate on. Can be a constant, column, or function, and any combination of operators. + +#### Example + +```sql +> select upper('dataFusion'); ++---------------------------+ +| upper(Utf8("dataFusion")) | ++---------------------------+ +| DATAFUSION | ++---------------------------+ +``` + +**Related functions**: + +- [initcap](#initcap) +- [lower](#lower) + +### `uuid` + +Returns [`UUID v4`]() string value which is unique per row. + +``` +uuid() +``` + +#### Example + +```sql +> select uuid(); ++--------------------------------------+ +| uuid() | ++--------------------------------------+ +| 6ec17ef8-1934-41cc-8d59-d0c8f9eea1f0 | ++--------------------------------------+ +``` + +## Binary String Functions + +- [decode](#decode) +- [encode](#encode) + +### `decode` + +Decode binary data from textual representation in string. + +``` +decode(expression, format) +``` + +#### Arguments + +- **expression**: Expression containing encoded string data +- **format**: Same arguments as [encode](#encode) + +**Related functions**: + +- [encode](#encode) + +### `encode` + +Encode binary data into a textual representation. + +``` +encode(expression, format) +``` + +#### Arguments + +- **expression**: Expression containing string or binary data +- **format**: Supported formats are: `base64`, `hex` + +**Related functions**: + +- [decode](#decode) + +## Regular Expression Functions + +Apache DataFusion uses a [PCRE-like](https://en.wikibooks.org/wiki/Regular_Expressions/Perl-Compatible_Regular_Expressions) +regular expression [syntax](https://docs.rs/regex/latest/regex/#syntax) +(minus support for several features including look-around and backreferences). +The following regular expression functions are supported: + +- [regexp_like](#regexp_like) + +### `regexp_like` + +Returns true if a [regular expression](https://docs.rs/regex/latest/regex/#syntax) has at least one match in a string, false otherwise. + +``` +regexp_like(str, regexp[, flags]) +``` + +#### Arguments + +- **str**: String expression to operate on. Can be a constant, column, or function, and any combination of operators. +- **regexp**: Regular expression to operate on. Can be a constant, column, or function, and any combination of operators. +- **flags**: Optional regular expression flags that control the behavior of the regular expression. The following flags are supported: + - **i**: case-insensitive: letters match both upper and lower case + - **m**: multi-line mode: ^ and $ match begin/end of line + - **s**: allow . to match \n + - **R**: enables CRLF mode: when multi-line mode is enabled, \r\n is used + - **U**: swap the meaning of x* and x*? + +#### Example + +```sql +select regexp_like('Köln', '[a-zA-Z]ö[a-zA-Z]{2}'); ++--------------------------------------------------------+ +| regexp_like(Utf8("Köln"),Utf8("[a-zA-Z]ö[a-zA-Z]{2}")) | ++--------------------------------------------------------+ +| true | ++--------------------------------------------------------+ +SELECT regexp_like('aBc', '(b|d)', 'i'); ++--------------------------------------------------+ +| regexp_like(Utf8("aBc"),Utf8("(b|d)"),Utf8("i")) | ++--------------------------------------------------+ +| true | ++--------------------------------------------------+ +``` + +Additional examples can be found [here](https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/regexp.rs) + +## Time and Date Functions + +- [to_date](#to_date) + +### `to_date` + +Converts a value to a date (`YYYY-MM-DD`). +Supports strings, integer and double types as input. +Strings are parsed as YYYY-MM-DD (e.g. '2023-07-20') if no [Chrono format](https://docs.rs/chrono/latest/chrono/format/strftime/index.html)s are provided. +Integers and doubles are interpreted as days since the unix epoch (`1970-01-01T00:00:00Z`). +Returns the corresponding date. + +Note: `to_date` returns Date32, which represents its values as the number of days since unix epoch(`1970-01-01`) stored as signed 32 bit value. The largest supported date value is `9999-12-31`. + +``` +to_date('2017-05-31', '%Y-%m-%d') +``` + +#### Arguments + +- **expression**: String expression to operate on. Can be a constant, column, or function, and any combination of operators. +- **format_n**: Optional [Chrono format](https://docs.rs/chrono/latest/chrono/format/strftime/index.html) strings to use to parse the expression. Formats will be tried in the order + they appear with the first successful one being returned. If none of the formats successfully parse the expression + an error will be returned. + +#### Example + +```sql +> select to_date('2023-01-31'); ++-----------------------------+ +| to_date(Utf8("2023-01-31")) | ++-----------------------------+ +| 2023-01-31 | ++-----------------------------+ +> select to_date('2023/01/31', '%Y-%m-%d', '%Y/%m/%d'); ++---------------------------------------------------------------+ +| to_date(Utf8("2023/01/31"),Utf8("%Y-%m-%d"),Utf8("%Y/%m/%d")) | ++---------------------------------------------------------------+ +| 2023-01-31 | ++---------------------------------------------------------------+ +``` + +Additional examples can be found [here](https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/to_date.rs) + +## Hashing Functions + +- [sha224](#sha224) + +### `sha224` + +Computes the SHA-224 hash of a binary string. + +``` +sha224(expression) +``` + +#### Arguments + +- **expression**: String expression to operate on. Can be a constant, column, or function, and any combination of operators. diff --git a/docs/source/user-guide/sql/window_functions_new.md b/docs/source/user-guide/sql/window_functions_new.md new file mode 100644 index 000000000000..1ab6740a6f87 --- /dev/null +++ b/docs/source/user-guide/sql/window_functions_new.md @@ -0,0 +1,161 @@ + + + + +# Window Functions (NEW) + +This page is a WIP and will replace the Window Functions page once completed. + +A _window function_ performs a calculation across a set of table rows that are somehow related to the current row. This is comparable to the type of calculation that can be done with an aggregate function. However, window functions do not cause rows to become grouped into a single output row like non-window aggregate calls would. Instead, the rows retain their separate identities. Behind the scenes, the window function is able to access more than just the current row of the query result + +Here is an example that shows how to compare each employee's salary with the average salary in his or her department: + +```sql +SELECT depname, empno, salary, avg(salary) OVER (PARTITION BY depname) FROM empsalary; + ++-----------+-------+--------+-------------------+ +| depname | empno | salary | avg | ++-----------+-------+--------+-------------------+ +| personnel | 2 | 3900 | 3700.0 | +| personnel | 5 | 3500 | 3700.0 | +| develop | 8 | 6000 | 5020.0 | +| develop | 10 | 5200 | 5020.0 | +| develop | 11 | 5200 | 5020.0 | +| develop | 9 | 4500 | 5020.0 | +| develop | 7 | 4200 | 5020.0 | +| sales | 1 | 5000 | 4866.666666666667 | +| sales | 4 | 4800 | 4866.666666666667 | +| sales | 3 | 4800 | 4866.666666666667 | ++-----------+-------+--------+-------------------+ +``` + +A window function call always contains an OVER clause directly following the window function's name and argument(s). This is what syntactically distinguishes it from a normal function or non-window aggregate. The OVER clause determines exactly how the rows of the query are split up for processing by the window function. The PARTITION BY clause within OVER divides the rows into groups, or partitions, that share the same values of the PARTITION BY expression(s). For each row, the window function is computed across the rows that fall into the same partition as the current row. The previous example showed how to count the average of a column per partition. + +You can also control the order in which rows are processed by window functions using ORDER BY within OVER. (The window ORDER BY does not even have to match the order in which the rows are output.) Here is an example: + +```sql +SELECT depname, empno, salary, + rank() OVER (PARTITION BY depname ORDER BY salary DESC) +FROM empsalary; + ++-----------+-------+--------+--------+ +| depname | empno | salary | rank | ++-----------+-------+--------+--------+ +| personnel | 2 | 3900 | 1 | +| develop | 8 | 6000 | 1 | +| develop | 10 | 5200 | 2 | +| develop | 11 | 5200 | 2 | +| develop | 9 | 4500 | 4 | +| develop | 7 | 4200 | 5 | +| sales | 1 | 5000 | 1 | +| sales | 4 | 4800 | 2 | +| personnel | 5 | 3500 | 2 | +| sales | 3 | 4800 | 2 | ++-----------+-------+--------+--------+ +``` + +There is another important concept associated with window functions: for each row, there is a set of rows within its partition called its window frame. Some window functions act only on the rows of the window frame, rather than of the whole partition. Here is an example of using window frames in queries: + +```sql +SELECT depname, empno, salary, + avg(salary) OVER(ORDER BY salary ASC ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING) AS avg, + min(salary) OVER(ORDER BY empno ASC ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS cum_min +FROM empsalary +ORDER BY empno ASC; + ++-----------+-------+--------+--------------------+---------+ +| depname | empno | salary | avg | cum_min | ++-----------+-------+--------+--------------------+---------+ +| sales | 1 | 5000 | 5000.0 | 5000 | +| personnel | 2 | 3900 | 3866.6666666666665 | 3900 | +| sales | 3 | 4800 | 4700.0 | 3900 | +| sales | 4 | 4800 | 4866.666666666667 | 3900 | +| personnel | 5 | 3500 | 3700.0 | 3500 | +| develop | 7 | 4200 | 4200.0 | 3500 | +| develop | 8 | 6000 | 5600.0 | 3500 | +| develop | 9 | 4500 | 4500.0 | 3500 | +| develop | 10 | 5200 | 5133.333333333333 | 3500 | +| develop | 11 | 5200 | 5466.666666666667 | 3500 | ++-----------+-------+--------+--------------------+---------+ +``` + +When a query involves multiple window functions, it is possible to write out each one with a separate OVER clause, but this is duplicative and error-prone if the same windowing behavior is wanted for several functions. Instead, each windowing behavior can be named in a WINDOW clause and then referenced in OVER. For example: + +```sql +SELECT sum(salary) OVER w, avg(salary) OVER w +FROM empsalary +WINDOW w AS (PARTITION BY depname ORDER BY salary DESC); +``` + +## Syntax + +The syntax for the OVER-clause is + +``` +function([expr]) + OVER( + [PARTITION BY expr[, …]] + [ORDER BY expr [ ASC | DESC ][, …]] + [ frame_clause ] + ) +``` + +where **frame_clause** is one of: + +``` + { RANGE | ROWS | GROUPS } frame_start + { RANGE | ROWS | GROUPS } BETWEEN frame_start AND frame_end +``` + +and **frame_start** and **frame_end** can be one of + +```sql +UNBOUNDED PRECEDING +offset PRECEDING +CURRENT ROW +offset FOLLOWING +UNBOUNDED FOLLOWING +``` + +where **offset** is an non-negative integer. + +RANGE and GROUPS modes require an ORDER BY clause (with RANGE the ORDER BY must specify exactly one column). + +## Aggregate functions + +All [aggregate functions](aggregate_functions.md) can be used as window functions. + +## Ranking Functions + +- [row_number](#row_number) + +### `row_number` + +Number of the current row within its partition, counting from 1. + +``` +row_number() +```