From 4c7b14c6e4da99a7e52fddaf6fed9ae51ca2c66e Mon Sep 17 00:00:00 2001 From: zhenxing jiang Date: Thu, 12 Oct 2023 10:22:27 -0500 Subject: [PATCH 1/3] add bit_and,bit_or,bit_xor,bool_add,bool_or (#496) * add bit_and,bit_or,bit_xor,bool_add,bool_or * Update datafusion/tests/test_aggregation.py Co-authored-by: Liang-Chi Hsieh --------- Co-authored-by: Liang-Chi Hsieh --- datafusion/tests/test_aggregation.py | 38 ++++++++++++++++++++++++++-- src/functions.rs | 10 ++++++++ 2 files changed, 46 insertions(+), 2 deletions(-) diff --git a/datafusion/tests/test_aggregation.py b/datafusion/tests/test_aggregation.py index 2c8c064b1..0a6c90c32 100644 --- a/datafusion/tests/test_aggregation.py +++ b/datafusion/tests/test_aggregation.py @@ -33,8 +33,9 @@ def df(): pa.array([1, 2, 3]), pa.array([4, 4, 6]), pa.array([9, 8, 5]), + pa.array([True, True, False]), ], - names=["a", "b", "c"], + names=["a", "b", "c", "d"], ) return ctx.create_dataframe([[batch]]) @@ -73,7 +74,7 @@ def test_built_in_aggregation(df): ], ) result = agg_df.collect()[0] - values_a, values_b, values_c = df.collect()[0] + values_a, values_b, values_c, values_d = df.collect()[0] assert result.column(0) == pa.array([2], type=pa.uint64()) assert result.column(1) == pa.array([4]) @@ -125,3 +126,36 @@ def test_built_in_aggregation(df): np.testing.assert_array_almost_equal( result.column(21), np.var(values_c, ddof=1) ) + + +def test_bit_add_or_xor(df): + + df = df.aggregate( + [], + [ + f.bit_and(column("a")), + f.bit_or(column("b")), + f.bit_xor(column("c")), + ], + ) + + result = df.collect() + result = result[0] + assert result.column(0) == pa.array([0]) + assert result.column(1) == pa.array([6]) + assert result.column(2) == pa.array([4]) + + +def test_bool_and_or(df): + + df = df.aggregate( + [], + [ + f.bool_and(column("d")), + f.bool_or(column("d")), + ], + ) + result = df.collect() + result = result[0] + assert result.column(0) == pa.array([False]) + assert result.column(1) == pa.array([True]) diff --git a/src/functions.rs b/src/functions.rs index ef26240fe..eed28154e 100644 --- a/src/functions.rs +++ b/src/functions.rs @@ -362,6 +362,11 @@ aggregate_function!(stddev_samp, Stddev); aggregate_function!(var, Variance); aggregate_function!(var_pop, VariancePop); aggregate_function!(var_samp, Variance); +aggregate_function!(bit_and, BitAnd); +aggregate_function!(bit_or, BitOr); +aggregate_function!(bit_xor, BitXor); +aggregate_function!(bool_and, BoolAnd); +aggregate_function!(bool_or, BoolOr); pub(crate) fn init_module(m: &PyModule) -> PyResult<()> { m.add_wrapped(wrap_pyfunction!(abs))?; @@ -489,6 +494,11 @@ pub(crate) fn init_module(m: &PyModule) -> PyResult<()> { m.add_wrapped(wrap_pyfunction!(var_pop))?; m.add_wrapped(wrap_pyfunction!(var_samp))?; m.add_wrapped(wrap_pyfunction!(window))?; + m.add_wrapped(wrap_pyfunction!(bit_and))?; + m.add_wrapped(wrap_pyfunction!(bit_or))?; + m.add_wrapped(wrap_pyfunction!(bit_xor))?; + m.add_wrapped(wrap_pyfunction!(bool_and))?; + m.add_wrapped(wrap_pyfunction!(bool_or))?; //Binary String Functions m.add_wrapped(wrap_pyfunction!(encode))?; From 804d0eb3160b0debe5beed77209ea497b0f026c7 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Thu, 12 Oct 2023 10:00:25 -0600 Subject: [PATCH 2/3] Use DataFusion 32 (#515) * use DataFusion 32 * update lock file --- Cargo.lock | 131 +++++++++++++++++++++++++++-------------------------- Cargo.toml | 14 +++--- 2 files changed, 74 insertions(+), 71 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index ee4eb3f64..f739947f0 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -38,9 +38,9 @@ dependencies = [ [[package]] name = "aho-corasick" -version = "1.1.1" +version = "1.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ea5d730647d4fadd988536d06fecce94b7b4f2a7efdae548f1cf4b63205518ab" +checksum = "b2969dcb958b36655471fc61f7e416fa76033bdd4bfed0678d8fee1e2d07a1f0" dependencies = [ "memchr", ] @@ -342,7 +342,7 @@ dependencies = [ "arrow-select", "num", "regex", - "regex-syntax", + "regex-syntax 0.7.5", ] [[package]] @@ -715,8 +715,9 @@ dependencies = [ [[package]] name = "datafusion" -version = "31.0.0" -source = "git+https://github.com/apache/arrow-datafusion?rev=e23d34bae60bb2f9c496241e218bab795af3af83#e23d34bae60bb2f9c496241e218bab795af3af83" +version = "32.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7014432223f4d721cb9786cd88bb89e7464e0ba984d4a7f49db7787f5f268674" dependencies = [ "ahash", "apache-avro", @@ -764,8 +765,9 @@ dependencies = [ [[package]] name = "datafusion-common" -version = "31.0.0" -source = "git+https://github.com/apache/arrow-datafusion?rev=e23d34bae60bb2f9c496241e218bab795af3af83#e23d34bae60bb2f9c496241e218bab795af3af83" +version = "32.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cb3903ed8f102892f17b48efa437f3542159241d41c564f0d1e78efdc5e663aa" dependencies = [ "ahash", "apache-avro", @@ -784,8 +786,9 @@ dependencies = [ [[package]] name = "datafusion-execution" -version = "31.0.0" -source = "git+https://github.com/apache/arrow-datafusion?rev=e23d34bae60bb2f9c496241e218bab795af3af83#e23d34bae60bb2f9c496241e218bab795af3af83" +version = "32.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "780b73b2407050e53f51a9781868593f694102c59e622de9a8aafc0343c4f237" dependencies = [ "arrow", "chrono", @@ -804,8 +807,9 @@ dependencies = [ [[package]] name = "datafusion-expr" -version = "31.0.0" -source = "git+https://github.com/apache/arrow-datafusion?rev=e23d34bae60bb2f9c496241e218bab795af3af83#e23d34bae60bb2f9c496241e218bab795af3af83" +version = "32.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "24c382676338d8caba6c027ba0da47260f65ffedab38fda78f6d8043f607557c" dependencies = [ "ahash", "arrow", @@ -818,8 +822,9 @@ dependencies = [ [[package]] name = "datafusion-optimizer" -version = "31.0.0" -source = "git+https://github.com/apache/arrow-datafusion?rev=e23d34bae60bb2f9c496241e218bab795af3af83#e23d34bae60bb2f9c496241e218bab795af3af83" +version = "32.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3f2904a432f795484fd45e29ded4537152adb60f636c05691db34fcd94c92c96" dependencies = [ "arrow", "async-trait", @@ -830,13 +835,14 @@ dependencies = [ "hashbrown 0.14.1", "itertools 0.11.0", "log", - "regex-syntax", + "regex-syntax 0.7.5", ] [[package]] name = "datafusion-physical-expr" -version = "31.0.0" -source = "git+https://github.com/apache/arrow-datafusion?rev=e23d34bae60bb2f9c496241e218bab795af3af83#e23d34bae60bb2f9c496241e218bab795af3af83" +version = "32.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "57b4968e9a998dc0476c4db7a82f280e2026b25f464e4aa0c3bb9807ee63ddfd" dependencies = [ "ahash", "arrow", @@ -868,8 +874,9 @@ dependencies = [ [[package]] name = "datafusion-physical-plan" -version = "31.0.0" -source = "git+https://github.com/apache/arrow-datafusion?rev=e23d34bae60bb2f9c496241e218bab795af3af83#e23d34bae60bb2f9c496241e218bab795af3af83" +version = "32.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "efd0d1fe54e37a47a2d58a1232c22786f2c28ad35805fdcd08f0253a8b0aaa90" dependencies = [ "ahash", "arrow", @@ -898,7 +905,7 @@ dependencies = [ [[package]] name = "datafusion-python" -version = "31.0.0" +version = "32.0.0" dependencies = [ "async-trait", "datafusion", @@ -916,7 +923,7 @@ dependencies = [ "pyo3", "pyo3-build-config", "rand", - "regex-syntax", + "regex-syntax 0.7.5", "syn 2.0.38", "tokio", "url", @@ -925,8 +932,9 @@ dependencies = [ [[package]] name = "datafusion-sql" -version = "31.0.0" -source = "git+https://github.com/apache/arrow-datafusion?rev=e23d34bae60bb2f9c496241e218bab795af3af83#e23d34bae60bb2f9c496241e218bab795af3af83" +version = "32.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b568d44c87ead99604d704f942e257c8a236ee1bbf890ee3e034ad659dcb2c21" dependencies = [ "arrow", "arrow-schema", @@ -938,8 +946,9 @@ dependencies = [ [[package]] name = "datafusion-substrait" -version = "31.0.0" -source = "git+https://github.com/apache/arrow-datafusion?rev=e23d34bae60bb2f9c496241e218bab795af3af83#e23d34bae60bb2f9c496241e218bab795af3af83" +version = "32.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b2884dff8207774c1ea5f7b008d44b20e9723bd0b2e4b7dd6627390d8b526b50" dependencies = [ "async-recursion", "chrono", @@ -998,25 +1007,14 @@ checksum = "5443807d6dff69373d433ab9ef5378ad8df50ca6298caf15de6e52e24aaf54d5" [[package]] name = "errno" -version = "0.3.4" +version = "0.3.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "add4f07d43996f76ef320709726a556a9d4f965d9410d8d0271132d2f8293480" +checksum = "ac3e13f66a2f95e32a39eaa81f6b95d42878ca0e1db0c7543723dfe12557e860" dependencies = [ - "errno-dragonfly", "libc", "windows-sys", ] -[[package]] -name = "errno-dragonfly" -version = "0.1.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "aa68f1b12764fab894d2755d2518754e71b4fd80ecfb822714a1206c2aab39bf" -dependencies = [ - "cc", - "libc", -] - [[package]] name = "fastrand" version = "2.0.1" @@ -1456,9 +1454,9 @@ checksum = "af150ab688ff2122fcef229be89cb50dd66af9e01a4ff320cc137eecc9bacc38" [[package]] name = "jobserver" -version = "0.1.26" +version = "0.1.27" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "936cfd212a0155903bcbc060e316fb6cc7cbf2e1907329391ebadc1fe0ce77c2" +checksum = "8c37f63953c4c63420ed5fd3d6d398c719489b9f872b9fa683262f8edd363c7d" dependencies = [ "libc", ] @@ -1614,9 +1612,9 @@ dependencies = [ [[package]] name = "linux-raw-sys" -version = "0.4.8" +version = "0.4.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3852614a3bd9ca9804678ba6be5e3b8ce76dfc902cae004e3e0c44051b6e88db" +checksum = "da2479e8c062e40bf0066ffa0bc823de0a9368974af99c9f6df941d2c231e03f" [[package]] name = "lock_api" @@ -1800,9 +1798,9 @@ dependencies = [ [[package]] name = "num-traits" -version = "0.2.16" +version = "0.2.17" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f30b0abd723be7e2ffca1272140fac1a2f084c77ec3e123c192b66af1ee9e6c2" +checksum = "39e3200413f237f41ab11ad6d161bc7239c84dcb631773ccd7de3dfe4b5c267c" dependencies = [ "autocfg", "libm", @@ -1865,9 +1863,9 @@ checksum = "dd8b5dd2ae5ed71462c540258bedcb51965123ad7e7ccf4b9a8cafaa4a63576d" [[package]] name = "ordered-float" -version = "2.10.0" +version = "2.10.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7940cf2ca942593318d07fcf2596cdca60a85c9e7fab408a5e21a4f9dcd40d87" +checksum = "68f19d67e5a2795c94e73e0bb1cc1a7edeb2e28efd39e2e1c9b7a40c1108b11c" dependencies = [ "num-traits", ] @@ -2040,9 +2038,9 @@ checksum = "dc375e1527247fe1a97d8b7156678dfe7c1af2fc075c9a4db3690ecd2a148068" [[package]] name = "proc-macro2" -version = "1.0.68" +version = "1.0.69" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5b1106fec09662ec6dd98ccac0f81cef56984d0b49f75c92d8cbad76e20c005c" +checksum = "134c189feb4956b20f6f547d2cf727d4c0fe06722b20a0eec87ed445a97f92da" dependencies = [ "unicode-ident", ] @@ -2234,32 +2232,32 @@ dependencies = [ [[package]] name = "regex" -version = "1.9.6" +version = "1.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ebee201405406dbf528b8b672104ae6d6d63e6d118cb10e4d51abbc7b58044ff" +checksum = "d119d7c7ca818f8a53c300863d4f87566aac09943aef5b355bb83969dae75d87" dependencies = [ "aho-corasick", "memchr", "regex-automata", - "regex-syntax", + "regex-syntax 0.8.1", ] [[package]] name = "regex-automata" -version = "0.3.9" +version = "0.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "59b23e92ee4318893fa3fe3e6fb365258efbfe6ac6ab30f090cdcbb7aa37efa9" +checksum = "465c6fc0621e4abc4187a2bda0937bfd4f722c2730b29562e19689ea796c9a4b" dependencies = [ "aho-corasick", "memchr", - "regex-syntax", + "regex-syntax 0.8.1", ] [[package]] name = "regex-lite" -version = "0.1.0" +version = "0.1.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f96ede7f386ba6e910092e7ccdc04176cface62abebea07ed6b46d870ed95ca2" +checksum = "9a6ebcd15653947e6140f59a9811a06ed061d18a5c35dfca2e2e4c5525696878" [[package]] name = "regex-syntax" @@ -2267,6 +2265,12 @@ version = "0.7.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "dbb5fb1acd8a1a18b3dd5be62d25485eb770e05afb408a9627d14d451bae12da" +[[package]] +name = "regex-syntax" +version = "0.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "56d84fdd47036b038fc80dd333d10b6aab10d5d31f4a366e20014def75328d33" + [[package]] name = "regress" version = "0.6.0" @@ -2357,9 +2361,9 @@ dependencies = [ [[package]] name = "rustix" -version = "0.38.17" +version = "0.38.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f25469e9ae0f3d0047ca8b93fc56843f38e6774f0914a107ff8b41be8be8e0b7" +checksum = "5a74ee2d7c2581cd139b42447d7d9389b889bdaad3a73f1ebb16f2a3237bb19c" dependencies = [ "bitflags 2.4.0", "errno", @@ -2462,9 +2466,9 @@ dependencies = [ [[package]] name = "semver" -version = "1.0.19" +version = "1.0.20" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ad977052201c6de01a8ef2aa3378c4bd23217a056337d1d6da40468d267a4fb0" +checksum = "836fa6a3e1e547f9a2c4040802ec865b5d85f4014efe00555d7090a3dcaa1090" [[package]] name = "seq-macro" @@ -2853,9 +2857,9 @@ checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" [[package]] name = "tokio" -version = "1.32.0" +version = "1.33.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "17ed6077ed6cd6c74735e21f37eb16dc3935f96878b1fe961074089cc80893f9" +checksum = "4f38200e3ef7995e5ef13baec2f432a6da0aa9ac495b2c0e8f3b7eec2c92d653" dependencies = [ "backtrace", "bytes", @@ -3390,11 +3394,10 @@ dependencies = [ [[package]] name = "zstd-sys" -version = "2.0.8+zstd.1.5.5" +version = "2.0.9+zstd.1.5.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5556e6ee25d32df2586c098bbfa278803692a20d0ab9565e049480d52707ec8c" +checksum = "9e16efa8a874a0481a574084d34cc26fdb3b99627480f785888deb6386506656" dependencies = [ "cc", - "libc", "pkg-config", ] diff --git a/Cargo.toml b/Cargo.toml index b1e432007..8c86b5658 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "datafusion-python" -version = "31.0.0" +version = "32.0.0" homepage = "https://github.com/apache/arrow-datafusion-python" repository = "https://github.com/apache/arrow-datafusion-python" authors = ["Apache Arrow "] @@ -36,12 +36,12 @@ protoc = [ "datafusion-substrait/protoc" ] tokio = { version = "1.24", features = ["macros", "rt", "rt-multi-thread", "sync"] } rand = "0.8" pyo3 = { version = "0.19", features = ["extension-module", "abi3", "abi3-py38"] } -datafusion = { git = "https://github.com/apache/arrow-datafusion", rev = "e23d34bae60bb2f9c496241e218bab795af3af83", features = ["pyarrow", "avro"] } -datafusion-common = { git = "https://github.com/apache/arrow-datafusion", rev = "e23d34bae60bb2f9c496241e218bab795af3af83", features = ["pyarrow"] } -datafusion-expr = { git = "https://github.com/apache/arrow-datafusion", rev = "e23d34bae60bb2f9c496241e218bab795af3af83" } -datafusion-optimizer = { git = "https://github.com/apache/arrow-datafusion", rev = "e23d34bae60bb2f9c496241e218bab795af3af83" } -datafusion-sql = { git = "https://github.com/apache/arrow-datafusion", rev = "e23d34bae60bb2f9c496241e218bab795af3af83" } -datafusion-substrait = { git = "https://github.com/apache/arrow-datafusion", rev = "e23d34bae60bb2f9c496241e218bab795af3af83" } +datafusion = { version = "32.0.0", features = ["pyarrow", "avro"] } +datafusion-common = { version = "32.0.0", features = ["pyarrow"] } +datafusion-expr = { version = "32.0.0" } +datafusion-optimizer = { version = "32.0.0" } +datafusion-sql = { version = "32.0.0" } +datafusion-substrait = { version = "32.0.0" } prost = "0.11" prost-types = "0.11" uuid = { version = "1.3", features = ["v4"] } From a91188c51019a1f9c190a6596970914763415965 Mon Sep 17 00:00:00 2001 From: zhenxing jiang Date: Fri, 13 Oct 2023 17:18:55 -0500 Subject: [PATCH 3/3] add first_value last_value (#498) Co-authored-by: Andy Grove --- datafusion/tests/test_functions.py | 23 +++++++++++++++++++++++ src/functions.rs | 4 ++++ 2 files changed, 27 insertions(+) diff --git a/datafusion/tests/test_functions.py b/datafusion/tests/test_functions.py index f1f64c30a..e504cc498 100644 --- a/datafusion/tests/test_functions.py +++ b/datafusion/tests/test_functions.py @@ -479,6 +479,29 @@ def test_case(df): assert result.column(2) == pa.array(["Hola", "Mundo", None]) +def test_first_last_value(df): + df = df.aggregate( + [], + [ + f.first_value(column("a")), + f.first_value(column("b")), + f.first_value(column("d")), + f.last_value(column("a")), + f.last_value(column("b")), + f.last_value(column("d")), + ], + ) + + result = df.collect() + result = result[0] + assert result.column(0) == pa.array(["Hello"]) + assert result.column(1) == pa.array([4]) + assert result.column(2) == pa.array([datetime(2022, 12, 31)]) + assert result.column(3) == pa.array(["!"]) + assert result.column(4) == pa.array([6]) + assert result.column(5) == pa.array([datetime(2020, 7, 2)]) + + def test_binary_string_functions(df): df = df.select( f.encode(column("a"), literal("base64")), diff --git a/src/functions.rs b/src/functions.rs index eed28154e..2f2f34ee0 100644 --- a/src/functions.rs +++ b/src/functions.rs @@ -362,6 +362,8 @@ aggregate_function!(stddev_samp, Stddev); aggregate_function!(var, Variance); aggregate_function!(var_pop, VariancePop); aggregate_function!(var_samp, Variance); +aggregate_function!(first_value, FirstValue); +aggregate_function!(last_value, LastValue); aggregate_function!(bit_and, BitAnd); aggregate_function!(bit_or, BitOr); aggregate_function!(bit_xor, BitXor); @@ -494,6 +496,8 @@ pub(crate) fn init_module(m: &PyModule) -> PyResult<()> { m.add_wrapped(wrap_pyfunction!(var_pop))?; m.add_wrapped(wrap_pyfunction!(var_samp))?; m.add_wrapped(wrap_pyfunction!(window))?; + m.add_wrapped(wrap_pyfunction!(first_value))?; + m.add_wrapped(wrap_pyfunction!(last_value))?; m.add_wrapped(wrap_pyfunction!(bit_and))?; m.add_wrapped(wrap_pyfunction!(bit_or))?; m.add_wrapped(wrap_pyfunction!(bit_xor))?;