diff --git a/Cargo.lock b/Cargo.lock index 97bffdb060ea..e332fe6e1a87 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -17,6 +17,12 @@ version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe" +[[package]] +name = "adler2" +version = "2.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "512761e0bb2578dd7380c6baaa0f4ce03e84f95e960231d1dec8bf4d7d6e2627" + [[package]] name = "adler32" version = "1.2.0" @@ -163,9 +169,9 @@ checksum = "9d151e35f61089500b617991b791fc8bfd237ae50cd5950803758a179b41e67a" [[package]] name = "arrayvec" -version = "0.7.4" +version = "0.7.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "96d30a06541fbafbc7f82ed10c06164cfbd2c401138f6addd8404629c4b16711" +checksum = "7c02d123df017efcdfbd739ef81735b36c5ba83ec3c59c80a9d7ecc718f92e50" [[package]] name = "arrow-array" @@ -251,7 +257,7 @@ checksum = "16e62a023e7c117e27523144c5d2459f4397fcc3cab0085af8e2224f643a0193" dependencies = [ "proc-macro2", "quote", - "syn 2.0.74", + "syn 2.0.76", ] [[package]] @@ -262,7 +268,7 @@ checksum = "6e0c28dcc82d7c8ead5cb13beb15405b57b8546e93215673ff8ca0349a028107" dependencies = [ "proc-macro2", "quote", - "syn 2.0.74", + "syn 2.0.76", ] [[package]] @@ -412,9 +418,9 @@ dependencies = [ [[package]] name = "aws-sdk-sso" -version = "1.37.0" +version = "1.39.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1074e818fbe4f9169242d78448b15be8916a79daa38ea1231f2e2e10d993fcd2" +checksum = "11822090cf501c316c6f75711d77b96fba30658e3867a7762e5e2f5d32d31e81" dependencies = [ "aws-credential-types", "aws-runtime", @@ -434,9 +440,9 @@ dependencies = [ [[package]] name = "aws-sdk-ssooidc" -version = "1.38.0" +version = "1.40.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "29755c51e33fa3f678598f64324a169cf4b7d3c4865d2709d4308f53366a92a4" +checksum = "78a2a06ff89176123945d1bbe865603c4d7101bea216a550bb4d2e4e9ba74d74" dependencies = [ "aws-credential-types", "aws-runtime", @@ -456,9 +462,9 @@ dependencies = [ [[package]] name = "aws-sdk-sts" -version = "1.37.0" +version = "1.39.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6e52dc3fd7dfa6c01a69cf3903e00aa467261639138a05b06cd92314d2c8fb07" +checksum = "a20a91795850826a6f456f4a48eff1dfa59a0e69bdbf5b8c50518fd372106574" dependencies = [ "aws-credential-types", "aws-runtime", @@ -591,9 +597,9 @@ dependencies = [ [[package]] name = "aws-smithy-runtime" -version = "1.6.2" +version = "1.6.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ce87155eba55e11768b8c1afa607f3e864ae82f03caf63258b37455b0ad02537" +checksum = "0abbf454960d0db2ad12684a1640120e7557294b0ff8e2f11236290a1b293225" dependencies = [ "aws-smithy-async", "aws-smithy-http", @@ -635,9 +641,9 @@ dependencies = [ [[package]] name = "aws-smithy-types" -version = "1.2.0" +version = "1.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cfe321a6b21f5d8eabd0ade9c55d3d0335f3c3157fc2b3e87f05f34b539e4df5" +checksum = "6cee7cadb433c781d3299b916fbf620fea813bf38f49db282fb6858141a05cc8" dependencies = [ "base64-simd", "bytes", @@ -692,7 +698,7 @@ dependencies = [ "cc", "cfg-if", "libc", - "miniz_oxide", + "miniz_oxide 0.7.4", "object", "rustc-demangle", ] @@ -751,9 +757,9 @@ dependencies = [ [[package]] name = "blake3" -version = "1.5.3" +version = "1.5.4" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "e9ec96fe9a81b5e365f9db71fe00edc4fe4ca2cc7dcb7861f0603012a7caa210" +checksum = "d82033247fd8e890df8f740e407ad4d038debb9eb1f40533fffb32e7d17dc6f7" dependencies = [ "arrayref", "arrayvec", @@ -820,22 +826,22 @@ checksum = "79296716171880943b8470b5f8d03aa55eb2e645a4874bdbb28adb49162e012c" [[package]] name = "bytemuck" -version = "1.16.3" +version = "1.17.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "102087e286b4677862ea56cf8fc58bb2cdfa8725c40ffb80fe3a008eb7f2fc83" +checksum = "6fd4c6dcc3b0aea2f5c0b4b82c2b15fe39ddbc76041a310848f4706edf76bb31" dependencies = [ "bytemuck_derive", ] [[package]] name = "bytemuck_derive" -version = "1.7.0" +version = "1.7.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1ee891b04274a59bd38b412188e24b849617b2e45a0fd8d057deb63e7403761b" +checksum = "0cc8b54b395f2fcfbb3d90c47b01c7f444d94d05bdeb775811dec868ac3bbc26" dependencies = [ "proc-macro2", "quote", - "syn 2.0.74", + "syn 2.0.76", ] [[package]] @@ -889,12 +895,13 @@ checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5" [[package]] name = "cc" -version = "1.1.10" +version = "1.1.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e9e8aabfac534be767c909e0690571677d49f41bd8465ae876fe043d52ba5292" +checksum = "50d2eb3cd3d1bf4529e31c215ee6f93ec5a3d536d9f578f93d9d33ee19562932" dependencies = [ "jobserver", "libc", + "shlex", ] [[package]] @@ -967,9 +974,9 @@ dependencies = [ [[package]] name = "clap" -version = "4.5.15" +version = "4.5.16" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "11d8838454fda655dafd3accb2b6e2bea645b9e4078abe84a22ceb947235c5cc" +checksum = "ed6719fffa43d0d87e5fd8caeab59be1554fb028cd30edc88fc4369b17971019" dependencies = [ "clap_builder", ] @@ -1001,9 +1008,9 @@ dependencies = [ [[package]] name = "cmake" -version = "0.1.50" +version = "0.1.51" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a31c789563b815f77f4250caee12365734369f942439b7defd71e18a48197130" +checksum = "fb1e43aa7fd152b1f968787f7dbcdeb306d1867ff373c69955211876c053f91a" dependencies = [ "cc", ] @@ -1356,7 +1363,7 @@ dependencies = [ "once_cell", "proc-macro2", "quote", - "syn 2.0.74", + "syn 2.0.76", ] [[package]] @@ -1411,9 +1418,9 @@ checksum = "95765f67b4b18863968b4a1bd5bb576f732b29a4a28c7cd84c09fa3e2875f33c" [[package]] name = "fastrand" -version = "2.1.0" +version = "2.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9fc0510504f03c51ada170672ac806f1f105a88aa97a5281117e1ddc3368e51a" +checksum = "e8c02a5121d4ea3eb16a80748c74f5549a5665e4c21333c6098f283870fbdea6" [[package]] name = "ff" @@ -1427,13 +1434,13 @@ dependencies = [ [[package]] name = "flate2" -version = "1.0.31" +version = "1.0.33" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7f211bbe8e69bbd0cfdea405084f128ae8b4aaa6b0b522fc8f2b009084797920" +checksum = "324a1be68054ef05ad64b861cc9eaf1d623d2d8cb25b4bf2cb9cdd902b4bf253" dependencies = [ "crc32fast", "libz-ng-sys", - "miniz_oxide", + "miniz_oxide 0.8.0", ] [[package]] @@ -1532,7 +1539,7 @@ checksum = "87750cf4b7a4c0625b1529e4c543c2182106e4dedc60a2a6455e00d212c489ac" dependencies = [ "proc-macro2", "quote", - "syn 2.0.74", + "syn 2.0.76", ] [[package]] @@ -1655,9 +1662,9 @@ dependencies = [ [[package]] name = "h2" -version = "0.4.5" +version = "0.4.6" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "fa82e28a107a8cc405f0839610bdc9b15f1e25ec7d696aa5cf173edbcb1486ab" +checksum = "524e8ac6999421f49a846c2d4411f337e53497d8ec55d67753beffa43c5d9205" dependencies = [ "atomic-waker", "bytes", @@ -1729,6 +1736,12 @@ version = "0.3.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d231dfb89cfffdbc30e7fc41579ed6066ad03abda9e567ccafae602b97ec5024" +[[package]] +name = "hermit-abi" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fbf6a919d6cf397374f7dfeeea91d974c7c0a7221d0d0f4f20d859d329e53fcc" + [[package]] name = "hex" version = "0.4.3" @@ -1860,7 +1873,7 @@ dependencies = [ "bytes", "futures-channel", "futures-util", - "h2 0.4.5", + "h2 0.4.6", "http 1.1.0", "http-body 1.0.1", "httparse", @@ -1898,7 +1911,7 @@ dependencies = [ "hyper 1.4.1", "hyper-util", "rustls 0.23.12", - "rustls-native-certs 0.7.1", + "rustls-native-certs 0.7.2", "rustls-pki-types", "tokio", "tokio-rustls 0.26.0", @@ -1960,9 +1973,9 @@ dependencies = [ [[package]] name = "indexmap" -version = "2.3.0" +version = "2.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "de3fc2e30ba82dd1b3911c8de1ffc143c74a914a14e99514d7637e3099df5ea0" +checksum = "93ead53efc7ea8ed3cfb0c79fc8023fbb782a5432b52830b6518941cebe6505c" dependencies = [ "equivalent", "hashbrown", @@ -1989,11 +2002,11 @@ checksum = "8f518f335dce6725a761382244631d86cf0ccb2863413590b31338feb467f9c3" [[package]] name = "is-terminal" -version = "0.4.12" +version = "0.4.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f23ff5ef2b80d608d61efee834934d862cd92461afc0560dedf493e4c033738b" +checksum = "261f68e344040fbd0edea105bef17c66edf46f984ddb1115b775ce31be948f4b" dependencies = [ - "hermit-abi", + "hermit-abi 0.4.0", "libc", "windows-sys 0.52.0", ] @@ -2149,9 +2162,9 @@ dependencies = [ [[package]] name = "libc" -version = "0.2.155" +version = "0.2.158" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "97b3888a4aecf77e811145cadf6eef5901f4782c53886191b2f693f24761847c" +checksum = "d8adc4bb1803a324070e64a98ae98f38934d91957a99cfb3a43dcbc01bc56439" [[package]] name = "libflate" @@ -2237,9 +2250,9 @@ dependencies = [ [[package]] name = "libz-ng-sys" -version = "1.1.15" +version = "1.1.16" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c6409efc61b12687963e602df8ecf70e8ddacf95bc6576bcf16e3ac6328083c5" +checksum = "4436751a01da56f1277f323c80d584ffad94a3d14aecd959dd0dff75aa73a438" dependencies = [ "cmake", "libc", @@ -2247,9 +2260,9 @@ dependencies = [ [[package]] name = "libz-sys" -version = "1.1.19" +version = "1.1.20" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fdc53a7799a7496ebc9fd29f31f7df80e83c9bda5299768af5f9e59eeea74647" +checksum = "d2d16453e800a8cf6dd2fc3eb4bc99b786a9b90c663b8559a5b1a041bf89e472" dependencies = [ "cc", "libc", @@ -2385,13 +2398,22 @@ dependencies = [ "adler", ] +[[package]] +name = "miniz_oxide" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e2d80299ef12ff69b16a84bb182e3b9df68b5a91574d3d4fa6e41b65deec4df1" +dependencies = [ + "adler2", +] + [[package]] name = "mio" version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "80e04d1dcff3aae0704555fe5fee3bcfaf3d1fdf8a7e521d5b9d2b42acb52cec" dependencies = [ - "hermit-abi", + "hermit-abi 0.3.9", "libc", "wasi", "windows-sys 0.52.0", 
@@ -2821,7 +2843,7 @@ checksum = "2f38a4412a78282e09a2cf38d195ea5420d15ba0602cb375210efbc877243965" dependencies = [ "proc-macro2", "quote", - "syn 2.0.74", + "syn 2.0.76", ] [[package]] @@ -3106,6 +3128,7 @@ dependencies = [ "fs4", "futures", "glob", + "hashbrown", "home", "itoa", "memchr", @@ -3405,6 +3428,8 @@ dependencies = [ "atomic-waker", "crossbeam-deque", "crossbeam-utils", + "futures", + "memmap2", "parking_lot", "pin-project-lite", "polars-core", @@ -3412,6 +3437,7 @@ dependencies = [ "polars-expr", "polars-io", "polars-mem-engine", + "polars-parquet", "polars-plan", "polars-utils", "rand", @@ -3451,6 +3477,7 @@ dependencies = [ "bytes", "hashbrown", "indexmap", + "libc", "memmap2", "num-traits", "once_cell", @@ -3458,6 +3485,7 @@ dependencies = [ "rand", "raw-cpuid", "rayon", + "serde", "smartstring", "stacker", "sysinfo", @@ -3603,7 +3631,7 @@ dependencies = [ "proc-macro2", "pyo3-macros-backend", "quote", - "syn 2.0.74", + "syn 2.0.76", ] [[package]] @@ -3616,7 +3644,7 @@ dependencies = [ "proc-macro2", "pyo3-build-config", "quote", - "syn 2.0.74", + "syn 2.0.76", ] [[package]] @@ -3696,9 +3724,9 @@ dependencies = [ [[package]] name = "quote" -version = "1.0.36" +version = "1.0.37" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0fa76aaf39101c457836aec0ce2316dbdc3ab723cdda1c6bd4e6ad4208acaca7" +checksum = "b5b9d34b8991d19d98081b46eacdd8eb58c6f2b201139f7c5f643cc155a633af" dependencies = [ "proc-macro2", ] @@ -3814,7 +3842,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "76009fbe0614077fc1a2ce255e3a1881a2e3a3527097d5dc6d8212c585e7e38b" dependencies = [ "quote", - "syn 2.0.74", + "syn 2.0.76", ] [[package]] @@ -3843,7 +3871,7 @@ checksum = "bcc303e793d3734489387d205e9b186fac9c6cfacedd98cbb2e8a5943595f3e6" dependencies = [ "proc-macro2", "quote", - "syn 2.0.74", + "syn 2.0.76", ] [[package]] @@ -3889,16 +3917,16 @@ checksum = "7a66a03ae7c801facd77a29370b4faec201768915ac14a721ba36f20bc9c209b" [[package]] name = "reqwest" -version = "0.12.5" +version = "0.12.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c7d6d2a27d57148378eb5e111173f4276ad26340ecc5c49a4a2152167a2d6a37" +checksum = "f8f4955649ef5c38cc7f9e8aa41761d48fb9677197daea9984dc54f56aad5e63" dependencies = [ "base64 0.22.1", "bytes", "futures-channel", "futures-core", "futures-util", - "h2 0.4.5", + "h2 0.4.6", "http 1.1.0", "http-body 1.0.1", "http-body-util", @@ -3914,7 +3942,7 @@ dependencies = [ "pin-project-lite", "quinn", "rustls 0.23.12", - "rustls-native-certs 0.7.1", + "rustls-native-certs 0.7.2", "rustls-pemfile 2.1.3", "rustls-pki-types", "serde", @@ -3930,7 +3958,7 @@ dependencies = [ "wasm-bindgen-futures", "wasm-streams", "web-sys", - "winreg", + "windows-registry", ] [[package]] @@ -4045,9 +4073,9 @@ dependencies = [ [[package]] name = "rustls-native-certs" -version = "0.7.1" +version = "0.7.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a88d6d420651b496bdd98684116959239430022a115c1240e6c3993be0b15fba" +checksum = "04182dffc9091a404e0fc069ea5cd60e5b866c3adf881eff99a32d048242dffa" dependencies = [ "openssl-probe", "rustls-pemfile 2.1.3", @@ -4242,29 +4270,29 @@ dependencies = [ [[package]] name = "serde" -version = "1.0.207" +version = "1.0.209" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5665e14a49a4ea1b91029ba7d3bca9f299e1f7cfa194388ccc20f14743e784f2" +checksum = "99fce0ffe7310761ca6bf9faf5115afbc19688edd00171d81b1bb1b116c63e09" dependencies = [ 
"serde_derive", ] [[package]] name = "serde_derive" -version = "1.0.207" +version = "1.0.209" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6aea2634c86b0e8ef2cfdc0c340baede54ec27b1e46febd7f80dffb2aa44a00e" +checksum = "a5831b979fd7b5439637af1752d535ff49f4860c0f341d1baeb6faf0f4242170" dependencies = [ "proc-macro2", "quote", - "syn 2.0.74", + "syn 2.0.76", ] [[package]] name = "serde_json" -version = "1.0.124" +version = "1.0.127" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "66ad62847a56b3dba58cc891acd13884b9c61138d330c0d7b6181713d4fce38d" +checksum = "8043c06d9f82bd7271361ed64f415fe5e12a77fdb52e573e7f06a516dea329ad" dependencies = [ "indexmap", "itoa", @@ -4316,6 +4344,12 @@ dependencies = [ "digest", ] +[[package]] +name = "shlex" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64" + [[package]] name = "signal-hook-registry" version = "1.4.2" @@ -4466,15 +4500,15 @@ dependencies = [ [[package]] name = "stacker" -version = "0.1.15" +version = "0.1.16" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c886bd4480155fd3ef527d45e9ac8dd7118a898a46530b7b94c3e21866259fce" +checksum = "95a5daa25ea337c85ed954c0496e3bdd2c7308cc3b24cf7b50d04876654c579f" dependencies = [ "cc", "cfg-if", "libc", "psm", - "winapi", + "windows-sys 0.36.1", ] [[package]] @@ -4526,7 +4560,7 @@ dependencies = [ "proc-macro2", "quote", "rustversion", - "syn 2.0.74", + "syn 2.0.76", ] [[package]] @@ -4539,7 +4573,7 @@ dependencies = [ "proc-macro2", "quote", "rustversion", - "syn 2.0.74", + "syn 2.0.76", ] [[package]] @@ -4561,9 +4595,9 @@ dependencies = [ [[package]] name = "syn" -version = "2.0.74" +version = "2.0.76" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1fceb41e3d546d0bd83421d3409b1460cc7444cd389341a4c880fe7a042cb3d7" +checksum = "578e081a14e0cefc3279b0472138c513f37b41a08d5a3cca9b6e4e8ceb6cd525" dependencies = [ "proc-macro2", "quote", @@ -4575,12 +4609,15 @@ name = "sync_wrapper" version = "1.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a7065abeca94b6a8a577f9bd45aa0867a2238b74e8eb67cf10d492bc39351394" +dependencies = [ + "futures-core", +] [[package]] name = "sysinfo" -version = "0.31.2" +version = "0.31.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d4115055da5f572fff541dd0c4e61b0262977f453cc9fe04be83aba25a89bdab" +checksum = "2b92e0bdf838cbc1c4c9ba14f9c97a7ec6cdcd1ae66b10e1e42775a25553f45d" dependencies = [ "core-foundation-sys", "libc", @@ -4631,7 +4668,7 @@ checksum = "a4558b58466b9ad7ca0f102865eccc95938dca1a74a856f2b57b6629050da261" dependencies = [ "proc-macro2", "quote", - "syn 2.0.74", + "syn 2.0.76", ] [[package]] @@ -4700,9 +4737,9 @@ checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" [[package]] name = "tokio" -version = "1.39.2" +version = "1.39.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "daa4fb1bc778bd6f04cbfc4bb2d06a7396a8f299dc33ea1900cedaa316f467b1" +checksum = "9babc99b9923bfa4804bd74722ff02c0381021eafa4db9949217e3be8e84fff5" dependencies = [ "backtrace", "bytes", @@ -4723,7 +4760,7 @@ checksum = "693d596312e88961bc67d7f1f97af8a70227d9f90c31bba5806eec004978d752" dependencies = [ "proc-macro2", "quote", - "syn 2.0.74", + "syn 2.0.76", ] [[package]] @@ -4812,15 +4849,15 @@ dependencies = [ [[package]] name = "tower-layer" -version = 
"0.3.2" +version = "0.3.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c20c8dbed6283a09604c3e69b4b7eeb54e298b8a600d4d5ecb5ad39de609f1d0" +checksum = "121c2a6cda46980bb0fcd1647ffaf6cd3fc79a013de288782836f6df9c48780e" [[package]] name = "tower-service" -version = "0.3.2" +version = "0.3.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b6bc1c9ce2b5135ac7f93c72918fc37feb872bdc6a5533a8b85eb4b86bfdae52" +checksum = "8df9b6e13f2d32c91b9bd719c00d1958837bc7dec474d94952798cc8e69eeec3" [[package]] name = "tracing" @@ -4841,7 +4878,7 @@ checksum = "34704c8d6ebcbc939824180af020566b01a7c01f80641264eba0999f6c2b6be7" dependencies = [ "proc-macro2", "quote", - "syn 2.0.74", + "syn 2.0.76", ] [[package]] @@ -4886,7 +4923,7 @@ checksum = "f03ca4cb38206e2bef0700092660bb74d696f808514dae47fa1467cbfe26e96e" dependencies = [ "proc-macro2", "quote", - "syn 2.0.74", + "syn 2.0.76", ] [[package]] @@ -5059,7 +5096,7 @@ dependencies = [ "once_cell", "proc-macro2", "quote", - "syn 2.0.74", + "syn 2.0.76", "wasm-bindgen-shared", ] @@ -5093,7 +5130,7 @@ checksum = "afc340c74d9005395cf9dd098506f7f44e38f2b4a21c6aaacf9a105ea5e1e836" dependencies = [ "proc-macro2", "quote", - "syn 2.0.74", + "syn 2.0.76", "wasm-bindgen-backend", "wasm-bindgen-shared", ] @@ -5185,7 +5222,7 @@ checksum = "d2ed2439a290666cd67ecce2b0ffaad89c2a56b976b736e6ece670297897832d" dependencies = [ "windows-implement", "windows-interface", - "windows-result", + "windows-result 0.1.2", "windows-targets 0.52.6", ] @@ -5197,7 +5234,7 @@ checksum = "9107ddc059d5b6fbfbffdfa7a7fe3e22a226def0b2608f72e9d552763d3e1ad7" dependencies = [ "proc-macro2", "quote", - "syn 2.0.74", + "syn 2.0.76", ] [[package]] @@ -5208,7 +5245,18 @@ checksum = "29bee4b38ea3cde66011baa44dba677c432a78593e202392d1e9070cf2a7fca7" dependencies = [ "proc-macro2", "quote", - "syn 2.0.74", + "syn 2.0.76", +] + +[[package]] +name = "windows-registry" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e400001bb720a623c1c69032f8e3e4cf09984deec740f007dd2b03ec864804b0" +dependencies = [ + "windows-result 0.2.0", + "windows-strings", + "windows-targets 0.52.6", ] [[package]] @@ -5220,13 +5268,36 @@ dependencies = [ "windows-targets 0.52.6", ] +[[package]] +name = "windows-result" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1d1043d8214f791817bab27572aaa8af63732e11bf84aa21a45a78d6c317ae0e" +dependencies = [ + "windows-targets 0.52.6", +] + +[[package]] +name = "windows-strings" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4cd9b125c486025df0eabcb585e62173c6c9eddcec5d117d3b6e8c30e2ee4d10" +dependencies = [ + "windows-result 0.2.0", + "windows-targets 0.52.6", +] + [[package]] name = "windows-sys" -version = "0.48.0" +version = "0.36.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "677d2418bec65e3338edb076e806bc1ec15693c5d0104683f2efe857f61056a9" +checksum = "ea04155a16a59f9eab786fe12a4a450e75cdb175f9e0d80da1e17db09f55b8d2" dependencies = [ - "windows-targets 0.48.5", + "windows_aarch64_msvc 0.36.1", + "windows_i686_gnu 0.36.1", + "windows_i686_msvc 0.36.1", + "windows_x86_64_gnu 0.36.1", + "windows_x86_64_msvc 0.36.1", ] [[package]] @@ -5290,6 +5361,12 @@ version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3" +[[package]] +name = "windows_aarch64_msvc" 
+version = "0.36.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9bb8c3fd39ade2d67e9874ac4f3db21f0d710bee00fe7cab16949ec184eeaa47" + [[package]] name = "windows_aarch64_msvc" version = "0.48.5" @@ -5302,6 +5379,12 @@ version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469" +[[package]] +name = "windows_i686_gnu" +version = "0.36.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "180e6ccf01daf4c426b846dfc66db1fc518f074baa793aa7d9b9aaeffad6a3b6" + [[package]] name = "windows_i686_gnu" version = "0.48.5" @@ -5320,6 +5403,12 @@ version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66" +[[package]] +name = "windows_i686_msvc" +version = "0.36.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e2e7917148b2812d1eeafaeb22a97e4813dfa60a3f8f78ebe204bcc88f12f024" + [[package]] name = "windows_i686_msvc" version = "0.48.5" @@ -5332,6 +5421,12 @@ version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66" +[[package]] +name = "windows_x86_64_gnu" +version = "0.36.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4dcd171b8776c41b97521e5da127a2d86ad280114807d0b2ab1e462bc764d9e1" + [[package]] name = "windows_x86_64_gnu" version = "0.48.5" @@ -5356,6 +5451,12 @@ version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d" +[[package]] +name = "windows_x86_64_msvc" +version = "0.36.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c811ca4a8c853ef420abd8592ba53ddbbac90410fab6903b3e79972a631f7680" + [[package]] name = "windows_x86_64_msvc" version = "0.48.5" @@ -5377,16 +5478,6 @@ dependencies = [ "memchr", ] -[[package]] -name = "winreg" -version = "0.52.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a277a57398d4bfa075df44f501a17cfdf8542d224f0d36095a2adc7aee4ef0a5" -dependencies = [ - "cfg-if", - "windows-sys 0.48.0", -] - [[package]] name = "x11rb" version = "0.13.1" @@ -5434,7 +5525,7 @@ checksum = "fa4f8080344d4671fb4e831a13ad1e68092748387dfc4f55e356242fae12ce3e" dependencies = [ "proc-macro2", "quote", - "syn 2.0.74", + "syn 2.0.76", ] [[package]] diff --git a/README.md b/README.md index 5df1673c0dfe..5d7e3c3bf203 100644 --- a/README.md +++ b/README.md @@ -217,7 +217,7 @@ improvements point to the `main` branch of this repo. polars = { git = "https://github.com/pola-rs/polars", rev = "" } ``` -Requires Rust version `>=1.79`. +Requires Rust version `>=1.80`. 
## Contributing diff --git a/crates/polars-arrow/src/array/binary/mutable.rs b/crates/polars-arrow/src/array/binary/mutable.rs index 53a8ed32bb6f..65d1ca928b75 100644 --- a/crates/polars-arrow/src/array/binary/mutable.rs +++ b/crates/polars-arrow/src/array/binary/mutable.rs @@ -442,9 +442,8 @@ impl> TryPush> for MutableBinaryArray { Some(value) => { self.values.try_push(value.as_ref())?; - match &mut self.validity { - Some(validity) => validity.push(true), - None => {}, + if let Some(validity) = &mut self.validity { + validity.push(true) } }, None => { diff --git a/crates/polars-arrow/src/array/binview/mutable.rs b/crates/polars-arrow/src/array/binview/mutable.rs index 3258f18052e3..b7b4aac24ef9 100644 --- a/crates/polars-arrow/src/array/binview/mutable.rs +++ b/crates/polars-arrow/src/array/binview/mutable.rs @@ -573,6 +573,128 @@ impl MutableBinaryViewArray<[u8]> { } Ok(()) } + + /// Extend from a `buffer` and `length` of items given some statistics about the lengths. + /// + /// This will attempt to dispatch to several optimized implementations. + /// + /// # Safety + /// + /// This is safe if the statistics are correct. + pub unsafe fn extend_from_lengths_with_stats( + &mut self, + buffer: &[u8], + lengths_iterator: impl Clone + ExactSizeIterator, + min_length: usize, + max_length: usize, + sum_length: usize, + ) { + let num_items = lengths_iterator.len(); + + if num_items == 0 { + return; + } + + #[cfg(debug_assertions)] + { + let (min, max, sum) = lengths_iterator.clone().map(|v| (v, v, v)).fold( + (usize::MAX, usize::MIN, 0usize), + |(cmin, cmax, csum), (emin, emax, esum)| { + (cmin.min(emin), cmax.max(emax), csum + esum) + }, + ); + + assert_eq!(min, min_length); + assert_eq!(max, max_length); + assert_eq!(sum, sum_length); + } + + assert!(sum_length <= buffer.len()); + + let mut buffer_offset = 0; + if min_length > View::MAX_INLINE_SIZE as usize + && (num_items == 1 || sum_length + self.in_progress_buffer.len() <= u32::MAX as usize) + { + let buffer_idx = self.completed_buffers().len() as u32; + let in_progress_buffer_offset = self.in_progress_buffer.len(); + + self.in_progress_buffer + .extend_from_slice(&buffer[..sum_length]); + self.views.extend(lengths_iterator.map(|length| { + // SAFETY: We asserted before that the sum of all lengths is smaller or equal to + // the buffer length. + let view_buffer = + unsafe { buffer.get_unchecked(buffer_offset..buffer_offset + length) }; + + // SAFETY: We know that the minimum length > View::MAX_INLINE_SIZE. Therefore, this + // length is > View::MAX_INLINE_SIZE. + let view = unsafe { + View::new_noninline_unchecked( + view_buffer, + buffer_idx, + (buffer_offset + in_progress_buffer_offset) as u32, + ) + }; + buffer_offset += length; + view + })); + } else if max_length <= View::MAX_INLINE_SIZE as usize { + // If the min and max are the same, we can dispatch to the optimized SIMD + // implementation. + if min_length == max_length { + let length = min_length; + if length == 0 { + self.views + .resize(self.views.len() + num_items, View::new_inline(&[])); + } else { + View::extend_with_inlinable_strided( + &mut self.views, + &buffer[..length * num_items], + length as u8, + ); + } + } else { + self.views.extend(lengths_iterator.map(|length| { + // SAFETY: We asserted before that the sum of all lengths is smaller or equal + // to the buffer length. 
+ let view_buffer = + unsafe { buffer.get_unchecked(buffer_offset..buffer_offset + length) }; + + // SAFETY: We know that each view has a length <= View::MAX_INLINE_SIZE because + // the maximum length is <= View::MAX_INLINE_SIZE + let view = unsafe { View::new_inline_unchecked(view_buffer) }; + buffer_offset += length; + view + })); + } + } else { + // If all fails, just fall back to a base implementation. + self.reserve(num_items); + for length in lengths_iterator { + let value = &buffer[buffer_offset..buffer_offset + length]; + buffer_offset += length; + self.push_value(value); + } + } + } + + /// Extend from a `buffer` and `length` of items. + /// + /// This will attempt to dispatch to several optimized implementations. + #[inline] + pub fn extend_from_lengths( + &mut self, + buffer: &[u8], + lengths_iterator: impl Clone + ExactSizeIterator, + ) { + let (min, max, sum) = lengths_iterator.clone().map(|v| (v, v, v)).fold( + (usize::MAX, 0usize, 0usize), + |(cmin, cmax, csum), (emin, emax, esum)| (cmin.min(emin), cmax.max(emax), csum + esum), + ); + + // SAFETY: We just collected the right stats. + unsafe { self.extend_from_lengths_with_stats(buffer, lengths_iterator, min, max, sum) } + } } impl> Extend> for MutableBinaryViewArray { @@ -646,3 +768,54 @@ impl> TryPush> for MutableBinaryView Ok(()) } } + +#[cfg(test)] +mod tests { + use super::*; + + fn roundtrip(values: &[&[u8]]) -> bool { + let buffer = values + .iter() + .flat_map(|v| v.iter().copied()) + .collect::>(); + let lengths = values.iter().map(|v| v.len()).collect::>(); + let mut bv = MutableBinaryViewArray::<[u8]>::with_capacity(values.len()); + + bv.extend_from_lengths(&buffer[..], lengths.into_iter()); + + &bv.values_iter().collect::>()[..] == values + } + + #[test] + fn extend_with_lengths_basic() { + assert!(roundtrip(&[])); + assert!(roundtrip(&[b"abc"])); + assert!(roundtrip(&[ + b"a_very_very_long_string_that_is_not_inlinable" + ])); + assert!(roundtrip(&[ + b"abc", + b"a_very_very_long_string_that_is_not_inlinable" + ])); + } + + #[test] + fn extend_with_inlinable_fastpath() { + assert!(roundtrip(&[b"abc", b"defg", b"hix"])); + assert!(roundtrip(&[b"abc", b"defg", b"hix", b"xyza1234abcd"])); + } + + #[test] + fn extend_with_inlinable_eq_len_fastpath() { + assert!(roundtrip(&[b"abc", b"def", b"hix"])); + assert!(roundtrip(&[b"abc", b"def", b"hix", b"xyz"])); + } + + #[test] + fn extend_with_not_inlinable_fastpath() { + assert!(roundtrip(&[ + b"a_very_long_string123", + b"a_longer_string_than_the_previous" + ])); + } +} diff --git a/crates/polars-arrow/src/array/binview/view.rs b/crates/polars-arrow/src/array/binview/view.rs index 6542e2c9761b..d0cd8cd36eda 100644 --- a/crates/polars-arrow/src/array/binview/view.rs +++ b/crates/polars-arrow/src/array/binview/view.rs @@ -157,12 +157,12 @@ impl View { /// Extend a `Vec` with inline views slices of `src` with `width`. /// /// This tries to use SIMD to optimize the copying and can be massively faster than doing a - /// `views.extend(src.chunks_exact(stride).map(View::new_inline))`. + /// `views.extend(src.chunks_exact(width).map(View::new_inline))`. /// /// # Panics /// - /// This function panics if `src.len()` is not divisible by `width` or if `width > - /// View::MAX_INLINE_SIZE`. + /// This function panics if `src.len()` is not divisible by `width`, `width > + /// View::MAX_INLINE_SIZE` or `width == 0`. pub fn extend_with_inlinable_strided(views: &mut Vec, src: &[u8], width: u8) { macro_rules! dispatch { ($n:ident = $match:ident in [$($v:literal),+ $(,)?] 
=> $block:block, otherwise = $otherwise:expr) => { @@ -180,17 +180,16 @@ impl View { } let width = width as usize; - assert_eq!(src.len() % width, 0); + + assert!(width > 0); assert!(width <= View::MAX_INLINE_SIZE as usize); + + assert_eq!(src.len() % width, 0); + let num_values = src.len() / width; views.reserve(num_values); - if width == 0 { - views.resize(views.len() + num_values, View::new_inline(&[])); - return; - } - #[allow(unused_mut)] let mut src = src; diff --git a/crates/polars-arrow/src/array/boolean/mutable.rs b/crates/polars-arrow/src/array/boolean/mutable.rs index 80d689806f1d..7f97f82762b0 100644 --- a/crates/polars-arrow/src/array/boolean/mutable.rs +++ b/crates/polars-arrow/src/array/boolean/mutable.rs @@ -101,9 +101,8 @@ impl MutableBooleanArray { #[inline] pub fn push_value(&mut self, value: bool) { self.values.push(value); - match &mut self.validity { - Some(validity) => validity.push(true), - None => {}, + if let Some(validity) = &mut self.validity { + validity.push(true) } } diff --git a/crates/polars-arrow/src/array/fixed_size_binary/mutable.rs b/crates/polars-arrow/src/array/fixed_size_binary/mutable.rs index 8f81ce86f6d8..1c744dbe88fd 100644 --- a/crates/polars-arrow/src/array/fixed_size_binary/mutable.rs +++ b/crates/polars-arrow/src/array/fixed_size_binary/mutable.rs @@ -114,9 +114,8 @@ impl MutableFixedSizeBinaryArray { } self.values.extend_from_slice(bytes); - match &mut self.validity { - Some(validity) => validity.push(true), - None => {}, + if let Some(validity) = &mut self.validity { + validity.push(true) } }, None => { diff --git a/crates/polars-arrow/src/array/mod.rs b/crates/polars-arrow/src/array/mod.rs index c2c0c958032d..49e086853a37 100644 --- a/crates/polars-arrow/src/array/mod.rs +++ b/crates/polars-arrow/src/array/mod.rs @@ -195,6 +195,7 @@ pub trait Array: Send + Sync + dyn_clone::DynClone + 'static { dyn_clone::clone_trait_object!(Array); /// A trait describing a mutable array; i.e. an array whose values can be changed. +/// /// Mutable arrays cannot be cloned but can be mutated in place, /// thereby making them useful to perform numeric operations without allocations. /// As in [`Array`], concrete arrays (such as [`MutablePrimitiveArray`]) implement how they are mutated. @@ -370,6 +371,7 @@ pub fn new_empty_array(data_type: ArrowDataType) -> Box { } /// Creates a new [`Array`] of [`ArrowDataType`] `data_type` and `length`. +/// /// The array is guaranteed to have [`Array::null_count`] equal to [`Array::len`] /// for all types except Union, which does not have a validity. 
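Stepping back to the `extend_from_lengths` / `extend_from_lengths_with_stats` helpers added to `MutableBinaryViewArray` above: the safe wrapper derives the min/max/sum statistics itself and then dispatches between the non-inlinable single-buffer path, the fixed-width inline SIMD path, the mixed inline path and the plain `push_value` fallback. A minimal usage sketch mirroring the `roundtrip` test in this diff; the import path and the `Item = usize` bound on the lengths iterator are assumed from context:

```rust
use polars_arrow::array::MutableBinaryViewArray;

fn main() {
    // Values packed back-to-back in one buffer, plus their lengths.
    let values: [&[u8]; 3] = [b"abc", b"defg", b"a_much_longer_value_that_is_not_inlinable"];
    let buffer: Vec<u8> = values.iter().flat_map(|v| v.iter().copied()).collect();
    let lengths: Vec<usize> = values.iter().map(|v| v.len()).collect();

    let mut bv = MutableBinaryViewArray::<[u8]>::with_capacity(values.len());
    // The safe entry point computes the length statistics internally before
    // picking one of the optimized extend paths described above.
    bv.extend_from_lengths(&buffer[..], lengths.iter().copied());

    assert_eq!(bv.values_iter().collect::<Vec<_>>(), values);
}
```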
pub fn new_null_array(data_type: ArrowDataType, length: usize) -> Box { diff --git a/crates/polars-arrow/src/array/primitive/mutable.rs b/crates/polars-arrow/src/array/primitive/mutable.rs index ae2025482f2c..53565dda831a 100644 --- a/crates/polars-arrow/src/array/primitive/mutable.rs +++ b/crates/polars-arrow/src/array/primitive/mutable.rs @@ -130,9 +130,8 @@ impl MutablePrimitiveArray { #[inline] pub fn push_value(&mut self, value: T) { self.values.push(value); - match &mut self.validity { - Some(validity) => validity.push(true), - None => {}, + if let Some(validity) = &mut self.validity { + validity.push(true) } } diff --git a/crates/polars-arrow/src/array/utf8/mutable.rs b/crates/polars-arrow/src/array/utf8/mutable.rs index ef9a5e8527b7..af4845680428 100644 --- a/crates/polars-arrow/src/array/utf8/mutable.rs +++ b/crates/polars-arrow/src/array/utf8/mutable.rs @@ -522,9 +522,8 @@ impl> TryPush> for MutableUtf8Array { Some(value) => { self.values.try_push(value.as_ref())?; - match &mut self.validity { - Some(validity) => validity.push(true), - None => {}, + if let Some(validity) = &mut self.validity { + validity.push(true) } }, None => { diff --git a/crates/polars-arrow/src/bitmap/utils/slice_iterator.rs b/crates/polars-arrow/src/bitmap/utils/slice_iterator.rs index dc388f1d41b5..f3083ad0b141 100644 --- a/crates/polars-arrow/src/bitmap/utils/slice_iterator.rs +++ b/crates/polars-arrow/src/bitmap/utils/slice_iterator.rs @@ -9,7 +9,8 @@ enum State { Finished, } -/// Iterator over a bitmap that returns slices of set regions +/// Iterator over a bitmap that returns slices of set regions. +/// /// This is the most efficient method to extract slices of values from arrays /// with a validity bitmap. /// For example, the bitmap `00101111` returns `[(0,4), (6,1)]` diff --git a/crates/polars-arrow/src/compute/arity.rs b/crates/polars-arrow/src/compute/arity.rs index e590e7b1974b..22ac733c2839 100644 --- a/crates/polars-arrow/src/compute/arity.rs +++ b/crates/polars-arrow/src/compute/arity.rs @@ -8,10 +8,10 @@ use crate::bitmap::{Bitmap, MutableBitmap}; use crate::datatypes::ArrowDataType; use crate::types::NativeType; -/// Applies an unary and infallible function to a [`PrimitiveArray`]. This is the -/// fastest way to perform an operation on a [`PrimitiveArray`] when the benefits -/// of a vectorized operation outweighs the cost of branching nulls and -/// non-nulls. +/// Applies an unary and infallible function to a [`PrimitiveArray`]. +/// +/// This is the /// fastest way to perform an operation on a [`PrimitiveArray`] when the benefits +/// of a vectorized operation outweighs the cost of branching nulls and non-nulls. /// /// # Implementation /// This will apply the function for all values, including those on null slots. @@ -131,11 +131,14 @@ where PrimitiveArray::::new(data_type, values, validity) } -/// Applies a binary operations to two primitive arrays. This is the fastest -/// way to perform an operation on two primitive array when the benefits of a +/// Applies a binary operations to two primitive arrays. +/// +/// This is the fastest way to perform an operation on two primitive array when the benefits of a /// vectorized operation outweighs the cost of branching nulls and non-nulls. +/// /// # Errors /// This function errors iff the arrays have a different length. +/// /// # Implementation /// This will apply the function for all values, including those on null slots. 
/// This implies that the operation must be infallible for any value of the diff --git a/crates/polars-arrow/src/compute/temporal.rs b/crates/polars-arrow/src/compute/temporal.rs index 1198c04bb152..437089b72891 100644 --- a/crates/polars-arrow/src/compute/temporal.rs +++ b/crates/polars-arrow/src/compute/temporal.rs @@ -75,12 +75,14 @@ macro_rules! date_like { } /// Extracts the years of a temporal array as [`PrimitiveArray`]. +/// /// Use [`can_year`] to check if this operation is supported for the target [`ArrowDataType`]. pub fn year(array: &dyn Array) -> PolarsResult> { date_like!(year, array, ArrowDataType::Int32) } /// Extracts the months of a temporal array as [`PrimitiveArray`]. +/// /// Value ranges from 1 to 12. /// Use [`can_month`] to check if this operation is supported for the target [`ArrowDataType`]. pub fn month(array: &dyn Array) -> PolarsResult> { @@ -88,6 +90,7 @@ pub fn month(array: &dyn Array) -> PolarsResult> { } /// Extracts the days of a temporal array as [`PrimitiveArray`]. +/// /// Value ranges from 1 to 32 (Last day depends on month). /// Use [`can_day`] to check if this operation is supported for the target [`ArrowDataType`]. pub fn day(array: &dyn Array) -> PolarsResult> { @@ -95,13 +98,15 @@ pub fn day(array: &dyn Array) -> PolarsResult> { } /// Extracts weekday of a temporal array as [`PrimitiveArray`]. +/// /// Monday is 1, Tuesday is 2, ..., Sunday is 7. /// Use [`can_weekday`] to check if this operation is supported for the target [`ArrowDataType`] pub fn weekday(array: &dyn Array) -> PolarsResult> { date_like!(i8_weekday, array, ArrowDataType::Int8) } -/// Extracts ISO week of a temporal array as [`PrimitiveArray`] +/// Extracts ISO week of a temporal array as [`PrimitiveArray`]. +/// /// Value ranges from 1 to 53 (Last week depends on the year). /// Use [`can_iso_week`] to check if this operation is supported for the target [`ArrowDataType`] pub fn iso_week(array: &dyn Array) -> PolarsResult> { @@ -161,6 +166,7 @@ pub fn second(array: &dyn Array) -> PolarsResult> { } /// Extracts the nanoseconds of a temporal array as [`PrimitiveArray`]. +/// /// Value ranges from 0 to 1_999_999_999. /// The range from 1_000_000_000 to 1_999_999_999 represents the leap second. /// Use [`can_nanosecond`] to check if this operation is supported for the target [`ArrowDataType`]. diff --git a/crates/polars-arrow/src/datatypes/physical_type.rs b/crates/polars-arrow/src/datatypes/physical_type.rs index 31693cefd4bd..174c0401ca3f 100644 --- a/crates/polars-arrow/src/datatypes/physical_type.rs +++ b/crates/polars-arrow/src/datatypes/physical_type.rs @@ -4,6 +4,7 @@ use serde::{Deserialize, Serialize}; pub use crate::types::PrimitiveType; /// The set of physical types: unique in-memory representations of an Arrow array. +/// /// A physical type has a one-to-many relationship with a [`crate::datatypes::ArrowDataType`] and /// a one-to-one mapping to each struct in this crate that implements [`crate::array::Array`]. #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] diff --git a/crates/polars-arrow/src/legacy/kernels/take_agg/var.rs b/crates/polars-arrow/src/legacy/kernels/take_agg/var.rs index 8fd54d712e94..62e2ba1353f2 100644 --- a/crates/polars-arrow/src/legacy/kernels/take_agg/var.rs +++ b/crates/polars-arrow/src/legacy/kernels/take_agg/var.rs @@ -1,6 +1,7 @@ use super::*; -/// Numerical stable online variance aggregation +/// Numerical stable online variance aggregation. +/// /// See: /// Welford, B. P. (1962). 
"Note on a method for calculating corrected sums of squares and products". /// Technometrics. 4 (3): 419–420. doi:10.2307/1266577. JSTOR 1266577. diff --git a/crates/polars-arrow/src/offset.rs b/crates/polars-arrow/src/offset.rs index 33b3058cbb78..ae4583dfe6f4 100644 --- a/crates/polars-arrow/src/offset.rs +++ b/crates/polars-arrow/src/offset.rs @@ -518,6 +518,14 @@ impl OffsetsBuffer { pub fn into_inner(self) -> Buffer { self.0 } + + /// Returns the offset difference between `start` and `end`. + #[inline] + pub fn delta(&self, start: usize, end: usize) -> usize { + assert!(start <= end); + + (self.0[end + 1] - self.0[start]).to_usize() + } } impl From<&OffsetsBuffer> for OffsetsBuffer { diff --git a/crates/polars-arrow/src/temporal_conversions.rs b/crates/polars-arrow/src/temporal_conversions.rs index b10eef9694c0..487996094f37 100644 --- a/crates/polars-arrow/src/temporal_conversions.rs +++ b/crates/polars-arrow/src/temporal_conversions.rs @@ -267,6 +267,7 @@ pub fn parse_offset(offset: &str) -> PolarsResult { } /// Parses `value` to `Option` consistent with the Arrow's definition of timestamp with timezone. +/// /// `tz` must be built from `timezone` (either via [`parse_offset`] or `chrono-tz`). /// Returns in scale `tz` of `TimeUnit`. #[inline] diff --git a/crates/polars-arrow/src/trusted_len.rs b/crates/polars-arrow/src/trusted_len.rs index 5f194770e7c4..359edfd1b88c 100644 --- a/crates/polars-arrow/src/trusted_len.rs +++ b/crates/polars-arrow/src/trusted_len.rs @@ -3,6 +3,7 @@ use std::iter::Scan; use std::slice::Iter; /// An iterator of known, fixed size. +/// /// A trait denoting Rusts' unstable [TrustedLen](https://doc.rust-lang.org/std/iter/trait.TrustedLen.html). /// This is re-defined here and implemented for some iterators until `std::iter::TrustedLen` /// is stabilized. @@ -98,6 +99,14 @@ where } } +impl TrustMyLength>, J> { + /// Create a new `TrustMyLength` iterator that repeats `value` `len` times. + pub fn new_repeat_n(value: J, len: usize) -> Self { + // SAFETY: This is always safe since repeat(..).take(n) always repeats exactly `n` times`. + unsafe { Self::new(std::iter::repeat(value).take(len), len) } + } +} + impl Iterator for TrustMyLength where I: Iterator, diff --git a/crates/polars-arrow/src/types/bit_chunk.rs b/crates/polars-arrow/src/types/bit_chunk.rs index c618c5458515..be4445a5d77a 100644 --- a/crates/polars-arrow/src/types/bit_chunk.rs +++ b/crates/polars-arrow/src/types/bit_chunk.rs @@ -48,8 +48,10 @@ bit_chunk!(u16); bit_chunk!(u32); bit_chunk!(u64); -/// An [`Iterator`] over a [`BitChunk`]. This iterator is often -/// compiled to SIMD. +/// An [`Iterator`] over a [`BitChunk`]. +/// +/// This iterator is often compiled to SIMD. +/// /// The [LSB](https://en.wikipedia.org/wiki/Bit_numbering#Least_significant_bit) corresponds /// to the first slot, as defined by the arrow specification. 
/// # Example diff --git a/crates/polars-core/Cargo.toml b/crates/polars-core/Cargo.toml index 882392f080cf..204f022ff3ae 100644 --- a/crates/polars-core/Cargo.toml +++ b/crates/polars-core/Cargo.toml @@ -64,7 +64,7 @@ performant = ["arrow/performant", "reinterpret"] # extra utilities for StringChunked strings = ["regex", "arrow/strings", "polars-error/regex"] # support for ObjectChunked (downcastable Series of any type) -object = ["serde_json"] +object = ["serde_json", "algorithm_group_by"] fmt = ["comfy-table/tty"] fmt_no_tty = ["comfy-table"] @@ -93,9 +93,9 @@ diagonal_concat = [] dataframe_arithmetic = [] product = [] unique_counts = [] -partition_by = [] +partition_by = ["algorithm_group_by"] describe = [] -timezones = ["chrono-tz", "arrow/chrono-tz", "arrow/timezones"] +timezones = ["temporal", "chrono", "chrono-tz", "arrow/chrono-tz", "arrow/timezones"] dynamic_group_by = ["dtype-datetime", "dtype-date"] arrow_rs = ["arrow-array", "arrow/arrow_rs"] diff --git a/crates/polars-core/src/chunked_array/object/registry.rs b/crates/polars-core/src/chunked_array/object/registry.rs index 5ebcad2a022a..ef5febddad76 100644 --- a/crates/polars-core/src/chunked_array/object/registry.rs +++ b/crates/polars-core/src/chunked_array/object/registry.rs @@ -1,4 +1,5 @@ //! This is a heap allocated utility that can be used to register an object type. +//! //! That object type will know its own generic type parameter `T` and callers can simply //! send `&Any` values and don't have to know the generic type themselves. use std::any::Any; diff --git a/crates/polars-core/src/chunked_array/ops/mod.rs b/crates/polars-core/src/chunked_array/ops/mod.rs index c3d030447794..f946fce715e6 100644 --- a/crates/polars-core/src/chunked_array/ops/mod.rs +++ b/crates/polars-core/src/chunked_array/ops/mod.rs @@ -121,6 +121,7 @@ pub trait ChunkTakeUnchecked { } /// Create a `ChunkedArray` with new values by index or by boolean mask. +/// /// Note that these operations clone data. This is however the only way we can modify at mask or /// index level as the underlying Arrow arrays are immutable. pub trait ChunkSet<'a, A, B> { @@ -461,7 +462,7 @@ pub trait ChunkFilter { /// Create a new ChunkedArray filled with values at that index. pub trait ChunkExpandAtIndex { /// Create a new ChunkedArray filled with values at that index. - fn new_from_index(&self, length: usize, index: usize) -> ChunkedArray; + fn new_from_index(&self, index: usize, length: usize) -> ChunkedArray; } macro_rules! impl_chunk_expand { @@ -536,7 +537,7 @@ impl ChunkExpandAtIndex for ListChunked { #[cfg(feature = "dtype-struct")] impl ChunkExpandAtIndex for StructChunked { - fn new_from_index(&self, length: usize, index: usize) -> ChunkedArray { + fn new_from_index(&self, index: usize, length: usize) -> ChunkedArray { let (chunk_idx, idx) = self.index_to_chunked_index(index); let chunk = self.downcast_chunks().get(chunk_idx).unwrap(); let chunk = if chunk.is_null(idx) { diff --git a/crates/polars-core/src/chunked_array/ops/search_sorted.rs b/crates/polars-core/src/chunked_array/ops/search_sorted.rs index e31599429aae..5e97f0818176 100644 --- a/crates/polars-core/src/chunked_array/ops/search_sorted.rs +++ b/crates/polars-core/src/chunked_array/ops/search_sorted.rs @@ -38,8 +38,10 @@ where } /// Search through a series of chunks for the first position where f(x) is true, -/// assuming it is first always false and then always true. It repeats this for -/// each value in search_values. If the search value is null null_idx is returned. 
+/// assuming it is first always false and then always true. +/// +/// It repeats this for each value in search_values. If the search value is null null_idx is +/// returned. /// /// Assumes the chunks are non-empty. pub fn lower_bound_chunks<'a, T, F>( diff --git a/crates/polars-core/src/datatypes/_serde.rs b/crates/polars-core/src/datatypes/_serde.rs index ee5839663ddf..fd79b5bf6566 100644 --- a/crates/polars-core/src/datatypes/_serde.rs +++ b/crates/polars-core/src/datatypes/_serde.rs @@ -4,6 +4,7 @@ //! We could use [serde_1712](https://github.com/serde-rs/serde/issues/1712), but that gave problems caused by //! [rust_96956](https://github.com/rust-lang/rust/issues/96956), so we make a dummy type without static +#[cfg(feature = "dtype-categorical")] use serde::de::SeqAccess; use serde::{Deserialize, Serialize}; diff --git a/crates/polars-core/src/datatypes/time_unit.rs b/crates/polars-core/src/datatypes/time_unit.rs index 481de22249b1..d3a9a61443fb 100644 --- a/crates/polars-core/src/datatypes/time_unit.rs +++ b/crates/polars-core/src/datatypes/time_unit.rs @@ -58,7 +58,7 @@ impl TimeUnit { } } -#[cfg(feature = "rows")] +#[cfg(any(feature = "rows", feature = "object"))] #[cfg(any(feature = "dtype-datetime", feature = "dtype-duration"))] #[inline] pub(crate) fn convert_time_units(v: i64, tu_l: TimeUnit, tu_r: TimeUnit) -> i64 { diff --git a/crates/polars-core/src/frame/explode.rs b/crates/polars-core/src/frame/explode.rs index c772031f01da..906b18dcedb7 100644 --- a/crates/polars-core/src/frame/explode.rs +++ b/crates/polars-core/src/frame/explode.rs @@ -1,5 +1,7 @@ use arrow::offset::OffsetsBuffer; use rayon::prelude::*; +#[cfg(feature = "serde")] +use serde::{Deserialize, Serialize}; use smartstring::alias::String as SmartString; use crate::chunked_array::ops::explode::offsets_to_indexes; @@ -18,6 +20,7 @@ fn get_exploded(series: &Series) -> PolarsResult<(Series, OffsetsBuffer)> { /// Arguments for `[DataFrame::unpivot]` function #[derive(Clone, Default, Debug, PartialEq, Eq, Hash)] +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] pub struct UnpivotArgsIR { pub on: Vec, pub index: Vec, diff --git a/crates/polars-core/src/frame/mod.rs b/crates/polars-core/src/frame/mod.rs index 8894e488695a..1923d370ebc1 100644 --- a/crates/polars-core/src/frame/mod.rs +++ b/crates/polars-core/src/frame/mod.rs @@ -61,6 +61,37 @@ pub enum UniqueKeepStrategy { Any, } +fn ensure_names_unique(items: &[T], mut get_name: F) -> PolarsResult<()> +where + F: FnMut(&T) -> &str, +{ + // Always unique. + if items.len() <= 1 { + return Ok(()); + } + + if items.len() <= 4 { + // Too small to be worth spawning a hashmap for, this is at most 6 comparisons. + for i in 0..items.len() - 1 { + let name = get_name(&items[i]); + for other in items.iter().skip(i + 1) { + if name == get_name(other) { + polars_bail!(duplicate = name); + } + } + } + } else { + let mut names = PlHashSet::with_capacity(items.len()); + for item in items { + let name = get_name(item); + if !names.insert(name) { + polars_bail!(duplicate = name); + } + } + } + Ok(()) +} + /// A contiguous growable collection of `Series` that have the same length. 
/// /// ## Use declarations @@ -221,89 +252,62 @@ impl DataFrame { /// let df = DataFrame::new(vec![s0, s1])?; /// # Ok::<(), PolarsError>(()) /// ``` - pub fn new(columns: Vec) -> PolarsResult { - let mut first_len = None; + pub fn new(columns: Vec) -> PolarsResult { + ensure_names_unique(&columns, |s| s.name())?; - let shape_err = |&first_name, &first_len, &name, &len| { - polars_bail!( - ShapeMismatch: "could not create a new DataFrame: series {:?} has length {} \ - while series {:?} has length {}", - first_name, first_len, name, len - ); - }; + if columns.len() > 1 { + let first_len = columns[0].len(); + for col in &columns { + polars_ensure!( + col.len() == first_len, + ShapeMismatch: "could not create a new DataFrame: series {:?} has length {} while series {:?} has length {}", + columns[0].len(), first_len, col.name(), col.len() + ); + } + } - let series_cols = if S::is_series() { - // SAFETY: - // we are guarded by the type system here. - #[allow(clippy::transmute_undefined_repr)] - let series_cols = unsafe { std::mem::transmute::, Vec>(columns) }; - let mut names = PlHashSet::with_capacity(series_cols.len()); - - for s in &series_cols { - let name = s.name(); - - match first_len { - Some(len) => { - if s.len() != len { - let first_series = &series_cols.first().unwrap(); - return shape_err( - &first_series.name(), - &first_series.len(), - &name, - &s.len(), - ); - } - }, - None => first_len = Some(s.len()), - } + Ok(DataFrame { columns }) + } - if !names.insert(name) { - polars_bail!(duplicate = name); - } - } - // we drop early as the brchk thinks the &str borrows are used when calling the drop - // of both `series_cols` and `names` - drop(names); - series_cols - } else { - let mut series_cols: Vec = Vec::with_capacity(columns.len()); - let mut names = PlHashSet::with_capacity(columns.len()); - - // check for series length equality and convert into series in one pass - for s in columns { - let series = s.into_series(); - // we have aliasing borrows so we must allocate a string - let name = series.name().to_string(); - - match first_len { - Some(len) => { - if series.len() != len { - let first_series = &series_cols.first().unwrap(); - return shape_err( - &first_series.name(), - &first_series.len(), - &name.as_str(), - &series.len(), - ); - } - }, - None => first_len = Some(series.len()), - } + /// Converts a sequence of columns into a DataFrame, broadcasting length-1 + /// columns to match the other columns. + pub fn new_with_broadcast(columns: Vec) -> PolarsResult { + ensure_names_unique(&columns, |s| s.name())?; + unsafe { Self::new_with_broadcast_no_checks(columns) } + } - if names.contains(&name) { - polars_bail!(duplicate = name); + /// Converts a sequence of columns into a DataFrame, broadcasting length-1 + /// columns to match the other columns. + /// + /// # Safety + /// Does not check that the column names are unique (which they must be). + pub unsafe fn new_with_broadcast_no_checks(mut columns: Vec) -> PolarsResult { + // The length of the longest non-unit length column determines the + // broadcast length. If all columns are unit-length the broadcast length + // is one. + let broadcast_len = columns + .iter() + .map(|s| s.len()) + .filter(|l| *l != 1) + .max() + .unwrap_or(1); + + for col in &mut columns { + // Length not equal to the broadcast len, needs broadcast or is an error. 
+ let len = col.len(); + if len != broadcast_len { + if len != 1 { + let name = col.name().to_owned(); + let longest_column = columns.iter().max_by_key(|c| c.len()).unwrap().name(); + polars_bail!( + ShapeMismatch: "could not create a new DataFrame: series {:?} has length {} while series {:?} has length {}", + name, len, longest_column, broadcast_len + ); } - - series_cols.push(series); - names.insert(name); + *col = col.new_from_index(0, broadcast_len); } - drop(names); - series_cols - }; - - Ok(DataFrame { - columns: series_cols, - }) + } + Ok(unsafe { DataFrame::new_no_checks(columns) }) } /// Creates an empty `DataFrame` usable in a compile time context (such as static initializers). @@ -442,16 +446,7 @@ impl DataFrame { /// It is the callers responsibility to uphold the contract of all `Series` /// having an equal length, if not this may panic down the line. pub unsafe fn new_no_length_checks(columns: Vec) -> PolarsResult { - let mut names = PlHashSet::with_capacity(columns.len()); - for column in &columns { - let name = column.name(); - if !names.insert(name) { - polars_bail!(duplicate = name); - } - } - // we drop early as the brchk thinks the &str borrows are used when calling the drop - // of both `columns` and `names` - drop(names); + ensure_names_unique(&columns, |s| s.name())?; Ok(DataFrame { columns }) } @@ -637,12 +632,7 @@ impl DataFrame { ShapeMismatch: "{} column names provided for a DataFrame of width {}", names.len(), self.width() ); - let unique_names: PlHashSet<&str> = - PlHashSet::from_iter(names.iter().map(|name| name.as_ref())); - polars_ensure!( - unique_names.len() == self.width(), - Duplicate: "duplicate column names found" - ); + ensure_names_unique(names, |s| s.as_ref())?; let columns = mem::take(&mut self.columns); self.columns = columns @@ -1171,8 +1161,15 @@ impl DataFrame { /// # Safety /// The caller must ensure `column.len() == self.height()` . pub unsafe fn with_column_unchecked(&mut self, column: Series) -> &mut Self { - self.get_columns_mut().push(column); - self + #[cfg(debug_assertions)] + { + return self.with_column(column).unwrap(); + } + #[cfg(not(debug_assertions))] + { + self.get_columns_mut().push(column); + self + } } fn add_column_by_schema(&mut self, s: Series, schema: &Schema) -> PolarsResult<()> { @@ -1440,7 +1437,7 @@ impl DataFrame { } pub fn _select_impl(&self, cols: &[SmartString]) -> PolarsResult { - self.select_check_duplicates(cols)?; + ensure_names_unique(cols, |s| s.as_str())?; self._select_impl_unchecked(cols) } @@ -1486,7 +1483,7 @@ impl DataFrame { check_duplicates: bool, ) -> PolarsResult { if check_duplicates { - self.select_check_duplicates(cols)?; + ensure_names_unique(cols, |s| s.as_str())?; } let selected = self.select_series_impl_with_schema(cols, schema)?; Ok(unsafe { DataFrame::new_no_checks(selected) }) @@ -1519,21 +1516,11 @@ impl DataFrame { } fn select_physical_impl(&self, cols: &[SmartString]) -> PolarsResult { - self.select_check_duplicates(cols)?; + ensure_names_unique(cols, |s| s.as_str())?; let selected = self.select_series_physical_impl(cols)?; Ok(unsafe { DataFrame::new_no_checks(selected) }) } - fn select_check_duplicates(&self, cols: &[SmartString]) -> PolarsResult<()> { - let mut names = PlHashSet::with_capacity(cols.len()); - for name in cols { - if !names.insert(name.as_str()) { - polars_bail!(duplicate = name); - } - } - Ok(()) - } - /// Select column(s) from this [`DataFrame`] and return them into a [`Vec`]. 
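The rewritten `DataFrame` constructors above funnel duplicate-name checking through `ensure_names_unique` and add broadcasting of unit-length columns via `new_from_index(index, length)` (whose argument order was swapped earlier in this diff). A hedged usage sketch with illustrative column names:

```rust
use polars_core::prelude::*;

fn main() -> PolarsResult<()> {
    let a = Series::new("a", &[1i32, 2, 3]);
    let b = Series::new("b", &[10i32]); // unit length, will be broadcast

    // Duplicate names are still rejected up front by `ensure_names_unique`.
    assert!(DataFrame::new(vec![a.clone(), a.clone()]).is_err());

    // Length-1 columns are repeated via `new_from_index(0, broadcast_len)`
    // to match the longest column.
    let df = DataFrame::new_with_broadcast(vec![a, b])?;
    assert_eq!(df.shape(), (3, 2));
    Ok(())
}
```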
/// /// # Example @@ -1705,16 +1692,16 @@ impl DataFrame { /// } /// ``` pub fn rename(&mut self, column: &str, name: &str) -> PolarsResult<&mut Self> { + if column == name { + return Ok(self); + } + polars_ensure!( + self.columns.iter().all(|c| c.name() != name), + Duplicate: "column rename attempted with already existing name \"{name}\"" + ); self.select_mut(column) .ok_or_else(|| polars_err!(col_not_found = column)) .map(|s| s.rename(name))?; - let unique_names: PlHashSet<&str> = - PlHashSet::from_iter(self.columns.iter().map(|s| s.name())); - polars_ensure!( - unique_names.len() == self.width(), - Duplicate: "duplicate column names found" - ); - drop(unique_names); Ok(self) } diff --git a/crates/polars-core/src/hashing/identity.rs b/crates/polars-core/src/hashing/identity.rs index 7554395ac50c..e917291f1586 100644 --- a/crates/polars-core/src/hashing/identity.rs +++ b/crates/polars-core/src/hashing/identity.rs @@ -36,6 +36,7 @@ pub type IdBuildHasher = BuildHasherDefault; #[derive(Debug)] /// Contains an idx of a row in a DataFrame and the precomputed hash of that row. +/// /// That hash still needs to be used to create another hash to be able to resize hashmaps without /// accidental quadratic behavior. So do not use an Identity function! pub struct IdxHash { diff --git a/crates/polars-core/src/hashing/mod.rs b/crates/polars-core/src/hashing/mod.rs index 1ac43c2888bf..8f966eb2f317 100644 --- a/crates/polars-core/src/hashing/mod.rs +++ b/crates/polars-core/src/hashing/mod.rs @@ -39,6 +39,7 @@ pub(crate) unsafe fn compare_df_rows(keys: &DataFrame, idx_a: usize, idx_b: usiz } /// Populate a multiple key hashmap with row indexes. +/// /// Instead of the keys (which could be very large), the row indexes are stored. /// To check if a row is equal the original DataFrame is also passed as ref. /// When a hash collision occurs the indexes are ptrs to the rows and the rows are compared diff --git a/crates/polars-core/src/schema.rs b/crates/polars-core/src/schema.rs index e6c121824893..8f04d1bb20be 100644 --- a/crates/polars-core/src/schema.rs +++ b/crates/polars-core/src/schema.rs @@ -5,6 +5,7 @@ use arrow::datatypes::ArrowSchemaRef; use indexmap::map::MutableKeys; use indexmap::IndexMap; use polars_utils::aliases::PlRandomState; +use polars_utils::itertools::Itertools; #[cfg(feature = "serde-lazy")] use serde::{Deserialize, Serialize}; use smartstring::alias::String as SmartString; @@ -66,12 +67,12 @@ where } impl Schema { - /// Create a new, empty schema + /// Create a new, empty schema. pub fn new() -> Self { Self::with_capacity(0) } - /// Create a new, empty schema with capacity + /// Create a new, empty schema with the given capacity. /// /// If you know the number of fields you have ahead of time, using this is more efficient than using /// [`new`][Self::new]. Also consider using [`Schema::from_iter`] if you have the collection of fields available @@ -87,7 +88,7 @@ impl Schema { self.inner.reserve(additional); } - /// The number of fields in the schema + /// The number of fields in the schema. #[inline] pub fn len(&self) -> usize { self.inner.len() @@ -98,7 +99,7 @@ impl Schema { self.inner.is_empty() } - /// Rename field `old` to `new`, and return the (owned) old name + /// Rename field `old` to `new`, and return the (owned) old name. /// /// If `old` is not present in the schema, the schema is not modified and `None` is returned. Otherwise the schema /// is updated and `Some(old_name)` is returned. 
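Returning to the `DataFrame::rename` change above: a same-name rename now short-circuits, and a clash with another existing column name is rejected before anything is mutated. A small sketch of the resulting behaviour:

```rust
use polars_core::prelude::*;

fn main() -> PolarsResult<()> {
    let mut df = DataFrame::new(vec![
        Series::new("a", &[1i32, 2]),
        Series::new("b", &[3i32, 4]),
    ])?;

    // Renaming a column to its current name is now an early no-op.
    df.rename("a", "a")?;

    // Renaming onto an existing *other* column is rejected before the
    // source column is touched, so the frame is left unchanged.
    assert!(df.rename("a", "b").is_err());
    assert_eq!(df.get_column_names(), &["a", "b"]);
    Ok(())
}
```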
@@ -114,7 +115,7 @@ impl Schema { Some(old_name) } - /// Create a new schema from this one, inserting a field with `name` and `dtype` at the given `index` + /// Create a new schema from this one, inserting a field with `name` and `dtype` at the given `index`. /// /// If a field named `name` already exists, it is updated with the new dtype. Regardless, the field named `name` is /// always moved to the given index. Valid indices range from `0` (front of the schema) to `self.len()` (after the @@ -150,7 +151,7 @@ impl Schema { Ok(new) } - /// Insert a field with `name` and `dtype` at the given `index` into this schema + /// Insert a field with `name` and `dtype` at the given `index` into this schema. /// /// If a field named `name` already exists, it is updated with the new dtype. Regardless, the field named `name` is /// always moved to the given index. Valid indices range from `0` (front of the schema) to `self.len()` (after the @@ -189,32 +190,32 @@ impl Schema { Ok(old_dtype) } - /// Get a reference to the dtype of the field named `name`, or `None` if the field doesn't exist + /// Get a reference to the dtype of the field named `name`, or `None` if the field doesn't exist. pub fn get(&self, name: &str) -> Option<&DataType> { self.inner.get(name) } - /// Get a reference to the dtype of the field named `name`, or `Err(PolarsErr)` if the field doesn't exist + /// Get a reference to the dtype of the field named `name`, or `Err(PolarsErr)` if the field doesn't exist. pub fn try_get(&self, name: &str) -> PolarsResult<&DataType> { self.get(name) .ok_or_else(|| polars_err!(SchemaFieldNotFound: "{}", name)) } - /// Get a mutable reference to the dtype of the field named `name`, or `Err(PolarsErr)` if the field doesn't exist + /// Get a mutable reference to the dtype of the field named `name`, or `Err(PolarsErr)` if the field doesn't exist. pub fn try_get_mut(&mut self, name: &str) -> PolarsResult<&mut DataType> { self.inner .get_mut(name) .ok_or_else(|| polars_err!(SchemaFieldNotFound: "{}", name)) } - /// Return all data about the field named `name`: its index in the schema, its name, and its dtype + /// Return all data about the field named `name`: its index in the schema, its name, and its dtype. /// /// Returns `Some((index, &name, &dtype))` if the field exists, `None` if it doesn't. pub fn get_full(&self, name: &str) -> Option<(usize, &SmartString, &DataType)> { self.inner.get_full(name) } - /// Return all data about the field named `name`: its index in the schema, its name, and its dtype + /// Return all data about the field named `name`: its index in the schema, its name, and its dtype. /// /// Returns `Ok((index, &name, &dtype))` if the field exists, `Err(PolarsErr)` if it doesn't. pub fn try_get_full(&self, name: &str) -> PolarsResult<(usize, &SmartString, &DataType)> { @@ -223,7 +224,7 @@ impl Schema { .ok_or_else(|| polars_err!(SchemaFieldNotFound: "{}", name)) } - /// Look up the name in the schema and return an owned [`Field`] by cloning the data + /// Look up the name in the schema and return an owned [`Field`] by cloning the data. /// /// Returns `None` if the field does not exist. /// @@ -235,7 +236,7 @@ impl Schema { .map(|dtype| Field::new(name, dtype.clone())) } - /// Look up the name in the schema and return an owned [`Field`] by cloning the data + /// Look up the name in the schema and return an owned [`Field`] by cloning the data. /// /// Returns `Err(PolarsErr)` if the field does not exist. 
/// @@ -248,7 +249,7 @@ impl Schema { .map(|dtype| Field::new(name, dtype.clone())) } - /// Get references to the name and dtype of the field at `index` + /// Get references to the name and dtype of the field at `index`. /// /// If `index` is inbounds, returns `Some((&name, &dtype))`, else `None`. See /// [`get_at_index_mut`][Self::get_at_index_mut] for a mutable version. @@ -260,7 +261,7 @@ impl Schema { self.inner.get_index(index).ok_or_else(|| polars_err!(ComputeError: "index {index} out of bounds with 'schema' of len: {}", self.len())) } - /// Get mutable references to the name and dtype of the field at `index` + /// Get mutable references to the name and dtype of the field at `index`. /// /// If `index` is inbounds, returns `Some((&mut name, &mut dtype))`, else `None`. See /// [`get_at_index`][Self::get_at_index] for an immutable version. @@ -268,7 +269,7 @@ impl Schema { self.inner.get_index_mut2(index) } - /// Swap-remove a field by name and, if the field existed, return its dtype + /// Swap-remove a field by name and, if the field existed, return its dtype. /// /// If the field does not exist, the schema is not modified and `None` is returned. /// @@ -279,7 +280,7 @@ impl Schema { self.inner.swap_remove(name) } - /// Remove a field by name, preserving order, and, if the field existed, return its dtype + /// Remove a field by name, preserving order, and, if the field existed, return its dtype. /// /// If the field does not exist, the schema is not modified and `None` is returned. /// @@ -289,7 +290,7 @@ impl Schema { self.inner.shift_remove(name) } - /// Remove a field by name, preserving order, and, if the field existed, return its dtype + /// Remove a field by name, preserving order, and, if the field existed, return its dtype. /// /// If the field does not exist, the schema is not modified and `None` is returned. /// @@ -299,12 +300,12 @@ impl Schema { self.inner.shift_remove_index(index) } - /// Whether the schema contains a field named `name` + /// Whether the schema contains a field named `name`. pub fn contains(&self, name: &str) -> bool { self.get(name).is_some() } - /// Change the field named `name` to the given `dtype` and return the previous dtype + /// Change the field named `name` to the given `dtype` and return the previous dtype. /// /// If `name` doesn't already exist in the schema, the schema is not modified and `None` is returned. Otherwise /// returns `Some(old_dtype)`. @@ -316,7 +317,7 @@ impl Schema { Some(std::mem::replace(old_dtype, dtype)) } - /// Change the field at the given index to the given `dtype` and return the previous dtype + /// Change the field at the given index to the given `dtype` and return the previous dtype. /// /// If the index is out of bounds, the schema is not modified and `None` is returned. Otherwise returns /// `Some(old_dtype)`. @@ -328,7 +329,7 @@ impl Schema { Some(std::mem::replace(old_dtype, dtype)) } - /// Insert a new column in the [`Schema`] + /// Insert a new column in the [`Schema`]. /// /// If an equivalent name already exists in the schema: the name remains and /// retains in its place in the order, its corresponding value is updated @@ -344,7 +345,7 @@ impl Schema { self.inner.insert(name, dtype) } - /// Merge `other` into `self` + /// Merge `other` into `self`. /// /// Merging logic: /// - Fields that occur in `self` but not `other` are unmodified @@ -355,7 +356,7 @@ impl Schema { self.inner.extend(other.inner) } - /// Merge borrowed `other` into `self` + /// Merge borrowed `other` into `self`. 
/// /// Merging logic: /// - Fields that occur in `self` but not `other` are unmodified @@ -370,7 +371,7 @@ impl Schema { ) } - /// Convert self to `ArrowSchema` by cloning the fields + /// Convert self to `ArrowSchema` by cloning the fields. pub fn to_arrow(&self, compat_level: CompatLevel) -> ArrowSchema { let fields: Vec<_> = self .inner @@ -380,7 +381,7 @@ impl Schema { ArrowSchema::from(fields) } - /// Iterates the [`Field`]s in this schema, constructing them anew by cloning each `(&name, &dtype)` pair + /// Iterates the [`Field`]s in this schema, constructing them anew by cloning each `(&name, &dtype)` pair. /// /// Note that this clones each name and dtype in order to form an owned [`Field`]. For a clone-free version, use /// [`iter`][Self::iter], which returns `(&name, &dtype)`. @@ -390,22 +391,22 @@ impl Schema { .map(|(name, dtype)| Field::new(name, dtype.clone())) } - /// Iterates over references to the dtypes in this schema + /// Iterates over references to the dtypes in this schema. pub fn iter_dtypes(&self) -> impl '_ + ExactSizeIterator { self.inner.iter().map(|(_name, dtype)| dtype) } - /// Iterates over mut references to the dtypes in this schema + /// Iterates over mut references to the dtypes in this schema. pub fn iter_dtypes_mut(&mut self) -> impl '_ + ExactSizeIterator { self.inner.iter_mut().map(|(_name, dtype)| dtype) } - /// Iterates over references to the names in this schema + /// Iterates over references to the names in this schema. pub fn iter_names(&self) -> impl '_ + ExactSizeIterator { self.inner.iter().map(|(name, _dtype)| name) } - /// Iterates over the `(&name, &dtype)` pairs in this schema + /// Iterates over the `(&name, &dtype)` pairs in this schema. /// /// For an owned version, use [`iter_fields`][Self::iter_fields], which clones the data to iterate owned `Field`s pub fn iter(&self) -> impl Iterator + '_ { @@ -426,6 +427,27 @@ impl Schema { } Ok(changed) } + + /// Generates another schema with just the specified columns selected from this one. + pub fn select(&self, columns: I) -> PolarsResult + where + I: IntoIterator, + I::Item: AsRef, + { + Ok(Self { + inner: columns + .into_iter() + .map(|c| { + let name = c.as_ref(); + let dtype = self + .inner + .get(name) + .ok_or_else(|| polars_err!(col_not_found = name))?; + PolarsResult::Ok((SmartString::from(name), dtype.clone())) + }) + .try_collect()?, + }) + } } pub type SchemaRef = Arc; @@ -439,7 +461,7 @@ impl IntoIterator for Schema { } } -/// This trait exists to be unify the API of polars Schema and arrows Schema +/// This trait exists to be unify the API of polars Schema and arrows Schema. pub trait IndexOfSchema: Debug { /// Get the index of a column by name. 
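// Usage sketch for the `Schema::select` method added above: it builds a new
// schema containing only the requested columns, in the requested order, and
// errors if a name is missing. The function name is illustrative; assumes the
// polars-core prelude.
use polars_core::prelude::*;

fn demo_schema_select() -> PolarsResult<()> {
    let schema = Schema::from_iter([
        Field::new("a", DataType::Int64),
        Field::new("b", DataType::String),
        Field::new("c", DataType::Float64),
    ]);
    let subset = schema.select(["c", "a"])?;
    assert_eq!(subset.len(), 2);
    assert_eq!(subset.get("c"), Some(&DataType::Float64));
    Ok(())
}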
fn index_of(&self, name: &str) -> Option; diff --git a/crates/polars-core/src/serde/series.rs b/crates/polars-core/src/serde/series.rs index 006edd96d604..49e9b6d004be 100644 --- a/crates/polars-core/src/serde/series.rs +++ b/crates/polars-core/src/serde/series.rs @@ -2,6 +2,7 @@ use std::borrow::Cow; use std::fmt::Formatter; use serde::de::{Error as DeError, MapAccess, Visitor}; +#[cfg(feature = "object")] use serde::ser::Error as SerError; use serde::{de, Deserialize, Deserializer, Serialize, Serializer}; diff --git a/crates/polars-core/src/series/implementations/struct__.rs b/crates/polars-core/src/series/implementations/struct__.rs index 767eec146195..07b35502dd6b 100644 --- a/crates/polars-core/src/series/implementations/struct__.rs +++ b/crates/polars-core/src/series/implementations/struct__.rs @@ -149,7 +149,7 @@ impl SeriesTrait for SeriesWrap { } fn new_from_index(&self, _index: usize, _length: usize) -> Series { - self.0.new_from_index(_length, _index).into_series() + self.0.new_from_index(_index, _length).into_series() } fn cast(&self, dtype: &DataType, cast_options: CastOptions) -> PolarsResult { diff --git a/crates/polars-core/src/testing.rs b/crates/polars-core/src/testing.rs index cb9f6e5389ab..91d6b998c671 100644 --- a/crates/polars-core/src/testing.rs +++ b/crates/polars-core/src/testing.rs @@ -162,7 +162,9 @@ impl PartialEq for DataFrame { } /// Asserts that two expressions of type [`DataFrame`] are equal according to [`DataFrame::equals`] -/// at runtime. If the expression are not equal, the program will panic with a message that displays +/// at runtime. +/// +/// If the expression are not equal, the program will panic with a message that displays /// both dataframes. #[macro_export] macro_rules! assert_df_eq { diff --git a/crates/polars-core/src/utils/mod.rs b/crates/polars-core/src/utils/mod.rs index d1a48a981cf8..4a078ae0f2c6 100644 --- a/crates/polars-core/src/utils/mod.rs +++ b/crates/polars-core/src/utils/mod.rs @@ -40,7 +40,8 @@ pub fn _set_partition_size() -> usize { POOL.current_num_threads() } -/// Just a wrapper structure. Useful for certain impl specializations +/// Just a wrapper structure which is useful for certain impl specializations. +/// /// This is for instance use to implement /// `impl FromIterator for NoNull>` /// as `Option` was already implemented: @@ -848,6 +849,16 @@ where pub(crate) fn align_chunks_binary_owned_series(left: Series, right: Series) -> (Series, Series) { match (left.chunks().len(), right.chunks().len()) { (1, 1) => (left, right), + // All chunks are equal length + (a, b) + if a == b + && left + .chunk_lengths() + .zip(right.chunk_lengths()) + .all(|(l, r)| l == r) => + { + (left, right) + }, (_, 1) => (left.rechunk(), right), (1, _) => (left, right.rechunk()), (_, _) => (left.rechunk(), right.rechunk()), @@ -864,6 +875,16 @@ where { match (left.chunks.len(), right.chunks.len()) { (1, 1) => (left, right), + // All chunks are equal length + (a, b) + if a == b + && left + .chunk_lengths() + .zip(right.chunk_lengths()) + .all(|(l, r)| l == r) => + { + (left, right) + }, (_, 1) => (left.rechunk(), right), (1, _) => (left, right.rechunk()), (_, _) => (left.rechunk(), right.rechunk()), @@ -1161,6 +1182,22 @@ pub fn coalesce_nulls_series(a: &Series, b: &Series) -> (Series, Series) { } } +pub fn operation_exceeded_idxsize_msg(operation: &str) -> String { + if core::mem::size_of::() == core::mem::size_of::() { + format!( + "{} exceeded the maximum supported limit of {} rows. 
Consider installing 'polars-u64-idx'.", + operation, + IdxSize::MAX, + ) + } else { + format!( + "{} exceeded the maximum supported limit of {} rows.", + operation, + IdxSize::MAX, + ) + } +} + #[cfg(test)] mod test { use super::*; diff --git a/crates/polars-error/src/lib.rs b/crates/polars-error/src/lib.rs index e0989141d2b6..f664c1acf3f3 100644 --- a/crates/polars-error/src/lib.rs +++ b/crates/polars-error/src/lib.rs @@ -6,11 +6,27 @@ use std::collections::TryReserveError; use std::error::Error; use std::fmt::{self, Display, Formatter, Write}; use std::ops::Deref; -use std::sync::Arc; +use std::sync::{Arc, LazyLock}; use std::{env, io}; pub use warning::*; +enum ErrorStrategy { + Panic, + WithBacktrace, + Normal, +} + +static ERROR_STRATEGY: LazyLock = LazyLock::new(|| { + if env::var("POLARS_PANIC_ON_ERR").as_deref() == Ok("1") { + ErrorStrategy::Panic + } else if env::var("POLARS_BACKTRACE_IN_ERR").as_deref() == Ok("1") { + ErrorStrategy::WithBacktrace + } else { + ErrorStrategy::Normal + } +}); + #[derive(Debug)] pub struct ErrString(Cow<'static, str>); @@ -25,10 +41,14 @@ where T: Into>, { fn from(msg: T) -> Self { - if env::var("POLARS_PANIC_ON_ERR").as_deref().unwrap_or("") == "1" { - panic!("{}", msg.into()) - } else { - ErrString(msg.into()) + match &*ERROR_STRATEGY { + ErrorStrategy::Panic => panic!("{}", msg.into()), + ErrorStrategy::WithBacktrace => ErrString(Cow::Owned(format!( + "{}\n\nRust backtrace:\n{}", + msg.into(), + std::backtrace::Backtrace::force_capture() + ))), + ErrorStrategy::Normal => ErrString(msg.into()), } } } @@ -184,7 +204,7 @@ impl PolarsError { } } - fn wrap_msg String>(&self, func: F) -> Self { + pub fn wrap_msg String>(&self, func: F) -> Self { use PolarsError::*; match self { ColumnNotFound(msg) => ColumnNotFound(func(msg).into()), diff --git a/crates/polars-expr/src/expressions/apply.rs b/crates/polars-expr/src/expressions/apply.rs index 4d13d784540e..802e130d15f2 100644 --- a/crates/polars-expr/src/expressions/apply.rs +++ b/crates/polars-expr/src/expressions/apply.rs @@ -240,10 +240,7 @@ impl ApplyExpr { // then unpack the lists and finally create iterators from this list chunked arrays. let mut iters = acs .iter_mut() - .map(|ac| { - // SAFETY: unstable series never lives longer than the iterator. - unsafe { ac.iter_groups(self.pass_name_to_apply) } - }) + .map(|ac| ac.iter_groups(self.pass_name_to_apply)) .collect::>(); // Length of the items to iterate over. diff --git a/crates/polars-expr/src/expressions/binary.rs b/crates/polars-expr/src/expressions/binary.rs index ce26c1a57c77..55caf00ad69a 100644 --- a/crates/polars-expr/src/expressions/binary.rs +++ b/crates/polars-expr/src/expressions/binary.rs @@ -151,15 +151,13 @@ impl BinaryExpr { mut ac_r: AggregationContext<'a>, ) -> PolarsResult> { let name = ac_l.series().name().to_string(); - // SAFETY: unstable series never lives longer than the iterator. - let ca = unsafe { - ac_l.iter_groups(false) - .zip(ac_r.iter_groups(false)) - .map(|(l, r)| Some(apply_operator(l?.as_ref(), r?.as_ref(), self.op))) - .map(|opt_res| opt_res.transpose()) - .collect::>()? - .with_name(&name) - }; + let ca = ac_l + .iter_groups(false) + .zip(ac_r.iter_groups(false)) + .map(|(l, r)| Some(apply_operator(l?.as_ref(), r?.as_ref(), self.op))) + .map(|opt_res| opt_res.transpose()) + .collect::>()? 
+ .with_name(&name); ac_l.with_update_groups(UpdateGroups::WithSeriesLen); ac_l.with_agg_state(AggState::AggregatedList(ca.into_series())); diff --git a/crates/polars-expr/src/expressions/filter.rs b/crates/polars-expr/src/expressions/filter.rs index d9df88419ae7..db9ee0cf120e 100644 --- a/crates/polars-expr/src/expressions/filter.rs +++ b/crates/polars-expr/src/expressions/filter.rs @@ -45,10 +45,15 @@ impl PhysicalExpr for FilterExpr { let (ac_s, ac_predicate) = POOL.install(|| rayon::join(ac_s_f, ac_predicate_f)); let (mut ac_s, mut ac_predicate) = (ac_s?, ac_predicate?); + // Check if the groups are still equal, otherwise aggregate. + // TODO! create a special group iters that don't materialize + if ac_s.groups.as_ref() as *const _ != ac_predicate.groups.as_ref() as *const _ { + let _ = ac_s.aggregated(); + let _ = ac_predicate.aggregated(); + } if ac_predicate.is_aggregated() || ac_s.is_aggregated() { - // SAFETY: unstable series never lives longer than the iterator. - let preds = unsafe { ac_predicate.iter_groups(false) }; + let preds = ac_predicate.iter_groups(false); let s = ac_s.aggregated(); let ca = s.list()?; let out = if ca.is_empty() { diff --git a/crates/polars-expr/src/expressions/gather.rs b/crates/polars-expr/src/expressions/gather.rs index c54f8b9e8262..951833717a33 100644 --- a/crates/polars-expr/src/expressions/gather.rs +++ b/crates/polars-expr/src/expressions/gather.rs @@ -253,21 +253,19 @@ impl GatherExpr { ac.series().name(), )?; - unsafe { - let iter = ac.iter_groups(false).zip(idx.iter_groups(false)); - for (s, idx) in iter { - match (s, idx) { - (Some(s), Some(idx)) => { - let idx = convert_to_unsigned_index(idx.as_ref(), s.as_ref().len())?; - let out = s.as_ref().take(&idx)?; - builder.append_series(&out)?; - }, - _ => builder.append_null(), - }; - } - let out = builder.finish().into_series(); - ac.with_agg_state(AggState::AggregatedList(out)); + let iter = ac.iter_groups(false).zip(idx.iter_groups(false)); + for (s, idx) in iter { + match (s, idx) { + (Some(s), Some(idx)) => { + let idx = convert_to_unsigned_index(idx.as_ref(), s.as_ref().len())?; + let out = s.as_ref().take(&idx)?; + builder.append_series(&out)?; + }, + _ => builder.append_null(), + }; } + let out = builder.finish().into_series(); + ac.with_agg_state(AggState::AggregatedList(out)); Ok(ac) } } diff --git a/crates/polars-expr/src/expressions/group_iter.rs b/crates/polars-expr/src/expressions/group_iter.rs index 8c921a519bd1..26c68fdae3d2 100644 --- a/crates/polars-expr/src/expressions/group_iter.rs +++ b/crates/polars-expr/src/expressions/group_iter.rs @@ -5,10 +5,7 @@ use polars_core::series::amortized_iter::AmortSeries; use super::*; impl<'a> AggregationContext<'a> { - /// # Safety - /// The lifetime of [AmortSeries] is bound to the iterator. Keeping it alive - /// longer than the iterator is UB. 
- pub(super) unsafe fn iter_groups( + pub(super) fn iter_groups( &mut self, keep_names: bool, ) -> Box> + '_> { diff --git a/crates/polars-expr/src/expressions/mod.rs b/crates/polars-expr/src/expressions/mod.rs index 17179f89cbdd..266d577b22ee 100644 --- a/crates/polars-expr/src/expressions/mod.rs +++ b/crates/polars-expr/src/expressions/mod.rs @@ -421,7 +421,9 @@ impl<'a> AggregationContext<'a> { self.groups(); let rows = self.groups.len(); let s = s.new_from_index(0, rows); - s.reshape_list(&[rows as i64, -1]).unwrap() + let out = s.reshape_list(&[rows as i64, -1]).unwrap(); + self.state = AggState::AggregatedList(out.clone()); + out }, } } diff --git a/crates/polars-expr/src/expressions/ternary.rs b/crates/polars-expr/src/expressions/ternary.rs index b84e868efd35..e3c2f9e833a2 100644 --- a/crates/polars-expr/src/expressions/ternary.rs +++ b/crates/polars-expr/src/expressions/ternary.rs @@ -37,26 +37,23 @@ fn finish_as_iters<'a>( mut ac_falsy: AggregationContext<'a>, mut ac_mask: AggregationContext<'a>, ) -> PolarsResult> { - // SAFETY: unstable series never lives longer than the iterator. - let ca = unsafe { - ac_truthy - .iter_groups(false) - .zip(ac_falsy.iter_groups(false)) - .zip(ac_mask.iter_groups(false)) - .map(|((truthy, falsy), mask)| { - match (truthy, falsy, mask) { - (Some(truthy), Some(falsy), Some(mask)) => Some( - truthy - .as_ref() - .zip_with(mask.as_ref().bool()?, falsy.as_ref()), - ), - _ => None, - } - .transpose() - }) - .collect::>()? - .with_name(ac_truthy.series().name()) - }; + let ca = ac_truthy + .iter_groups(false) + .zip(ac_falsy.iter_groups(false)) + .zip(ac_mask.iter_groups(false)) + .map(|((truthy, falsy), mask)| { + match (truthy, falsy, mask) { + (Some(truthy), Some(falsy), Some(mask)) => Some( + truthy + .as_ref() + .zip_with(mask.as_ref().bool()?, falsy.as_ref()), + ), + _ => None, + } + .transpose() + }) + .collect::>()? + .with_name(ac_truthy.series().name()); // Aggregation leaves only a single chunk. 
let arr = ca.downcast_iter().next().unwrap(); diff --git a/crates/polars-io/Cargo.toml b/crates/polars-io/Cargo.toml index ef569e61519f..4a5fd97bf689 100644 --- a/crates/polars-io/Cargo.toml +++ b/crates/polars-io/Cargo.toml @@ -28,6 +28,7 @@ fast-float = { workspace = true, optional = true } flate2 = { workspace = true, optional = true } futures = { workspace = true, optional = true } glob = { version = "0.3" } +hashbrown = { workspace = true } itoa = { workspace = true, optional = true } memchr = { workspace = true } memmap = { workspace = true } @@ -40,7 +41,7 @@ regex = { workspace = true } reqwest = { workspace = true, optional = true } ryu = { workspace = true, optional = true } serde = { workspace = true, features = ["rc"], optional = true } -serde_json = { version = "1", default-features = false, features = ["alloc"], optional = true } +serde_json = { version = "1", optional = true } simd-json = { workspace = true, optional = true } simdutf8 = { workspace = true, optional = true } smartstring = { workspace = true } @@ -100,7 +101,7 @@ dtype-struct = ["polars-core/dtype-struct"] dtype-decimal = ["polars-core/dtype-decimal", "polars-json?/dtype-decimal"] fmt = ["polars-core/fmt"] lazy = [] -parquet = ["polars-parquet", "polars-parquet/compression"] +parquet = ["polars-parquet", "polars-parquet/compression", "polars-core/partition_by"] async = [ "async-trait", "futures", @@ -121,12 +122,11 @@ cloud = [ "reqwest", "http", ] -file_cache = ["async", "dep:blake3", "dep:fs4"] +file_cache = ["async", "dep:blake3", "dep:fs4", "serde_json", "cloud"] aws = ["object_store/aws", "cloud", "reqwest"] azure = ["object_store/azure", "cloud"] gcp = ["object_store/gcp", "cloud"] http = ["object_store/http", "cloud"] -partition = ["polars-core/partition_by"] temporal = ["dtype-datetime", "dtype-date", "dtype-time"] simd = [] python = ["polars-error/python"] diff --git a/crates/polars-io/src/cloud/adaptors.rs b/crates/polars-io/src/cloud/adaptors.rs index 435d703f2d80..5e034b55a80c 100644 --- a/crates/polars-io/src/cloud/adaptors.rs +++ b/crates/polars-io/src/cloud/adaptors.rs @@ -11,11 +11,13 @@ use tokio::io::AsyncWriteExt; use super::CloudOptions; use crate::pl_async::get_runtime; -/// Adaptor which wraps the interface of [ObjectStore::BufWriter](https://docs.rs/object_store/latest/object_store/buffered/struct.BufWriter.html) -/// exposing a synchronous interface which implements `std::io::Write`. +/// Adaptor which wraps the interface of [ObjectStore::BufWriter] exposing a synchronous interface +/// which implements `std::io::Write`. /// /// This allows it to be used in sync code which would otherwise write to a simple File or byte stream, /// such as with `polars::prelude::CsvWriter`. 
+/// +/// [ObjectStore::BufWriter]: https://docs.rs/object_store/latest/object_store/buffered/struct.BufWriter.html pub struct CloudWriter { // Internal writer, constructed at creation writer: BufWriter, diff --git a/crates/polars-io/src/cloud/options.rs b/crates/polars-io/src/cloud/options.rs index de0968a80da0..ca9016d05a96 100644 --- a/crates/polars-io/src/cloud/options.rs +++ b/crates/polars-io/src/cloud/options.rs @@ -277,7 +277,7 @@ impl CloudOptions { &mut builder, &[( Path::new("~/.aws/config"), - &[("region = (.*)\n", AmazonS3ConfigKey::Region)], + &[("region\\s*=\\s*(.*)\n", AmazonS3ConfigKey::Region)], )], ); read_config( @@ -285,9 +285,12 @@ impl CloudOptions { &[( Path::new("~/.aws/credentials"), &[ - ("aws_access_key_id = (.*)\n", AmazonS3ConfigKey::AccessKeyId), ( - "aws_secret_access_key = (.*)\n", + "aws_access_key_id\\s*=\\s*(.*)\n", + AmazonS3ConfigKey::AccessKeyId, + ), + ( + "aws_secret_access_key\\s*=\\s*(.*)\n", AmazonS3ConfigKey::SecretAccessKey, ), ], diff --git a/crates/polars-io/src/cloud/polars_object_store.rs b/crates/polars-io/src/cloud/polars_object_store.rs index f2744432bfa0..cd72d568f2fb 100644 --- a/crates/polars-io/src/cloud/polars_object_store.rs +++ b/crates/polars-io/src/cloud/polars_object_store.rs @@ -16,6 +16,7 @@ use crate::pl_async::{ /// concurrent requests for the entire application. #[derive(Debug, Clone)] pub struct PolarsObjectStore(Arc); +pub type ObjectStorePath = object_store::path::Path; impl PolarsObjectStore { pub fn new(store: Arc) -> Self { @@ -82,8 +83,31 @@ impl PolarsObjectStore { /// Fetch the metadata of the parquet file, do not memoize it. pub async fn head(&self, path: &Path) -> PolarsResult { - with_concurrency_budget(1, || self.0.head(path)) - .await - .map_err(to_compute_err) + with_concurrency_budget(1, || async { + let head_result = self.0.head(path).await; + + if head_result.is_err() { + // Pre-signed URLs forbid the HEAD method, but we can still retrieve the header + // information with a range 0-0 request. 
+ let get_range_0_0_result = self + .0 + .get_opts( + path, + object_store::GetOptions { + range: Some((0..1).into()), + ..Default::default() + }, + ) + .await; + + if let Ok(v) = get_range_0_0_result { + return Ok(v.meta); + } + } + + head_result + }) + .await + .map_err(to_compute_err) } } diff --git a/crates/polars-io/src/csv/read/schema_inference.rs b/crates/polars-io/src/csv/read/schema_inference.rs index 189c54501c12..bdbd8296f7fe 100644 --- a/crates/polars-io/src/csv/read/schema_inference.rs +++ b/crates/polars-io/src/csv/read/schema_inference.rs @@ -502,7 +502,7 @@ fn infer_file_schema_inner( pub(super) fn check_decimal_comma(decimal_comma: bool, separator: u8) -> PolarsResult<()> { if decimal_comma { - polars_ensure!(b',' != separator, InvalidOperation: "'decimal_comma' argument cannot be combined with ',' quote char") + polars_ensure!(b',' != separator, InvalidOperation: "'decimal_comma' argument cannot be combined with ',' separator") } Ok(()) } diff --git a/crates/polars-io/src/json/mod.rs b/crates/polars-io/src/json/mod.rs index 99dbd53ffa5d..f4158abe69e7 100644 --- a/crates/polars-io/src/json/mod.rs +++ b/crates/polars-io/src/json/mod.rs @@ -71,6 +71,7 @@ use std::ops::Deref; use arrow::legacy::conversion::chunk_to_struct; use polars_core::error::to_compute_err; use polars_core::prelude::*; +use polars_error::{polars_bail, PolarsResult}; use polars_json::json::write::FallibleStreamingIterator; #[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; @@ -86,9 +87,11 @@ pub struct JsonWriterOptions { pub maintain_order: bool, } -/// The format to use to write the DataFrame to JSON: `Json` (a JSON array) or `JsonLines` (each row output on a -/// separate line). In either case, each row is serialized as a JSON object whose keys are the column names and whose -/// values are the row's corresponding values. +/// The format to use to write the DataFrame to JSON: `Json` (a JSON array) +/// or `JsonLines` (each row output on a separate line). +/// +/// In either case, each row is serialized as a JSON object whose keys are the column names and +/// whose values are the row's corresponding values. pub enum JsonFormat { /// A single JSON array containing each DataFrame row as an object. The length of the array is the number of rows in /// the DataFrame. @@ -222,6 +225,17 @@ where json_format: JsonFormat, } +pub fn remove_bom(bytes: &[u8]) -> PolarsResult<&[u8]> { + if bytes.starts_with(&[0xEF, 0xBB, 0xBF]) { + // UTF-8 BOM + Ok(&bytes[3..]) + } else if bytes.starts_with(&[0xFE, 0xFF]) || bytes.starts_with(&[0xFF, 0xFE]) { + // UTF-16 BOM + polars_bail!(ComputeError: "utf-16 not supported") + } else { + Ok(bytes) + } +} impl<'a, R> SerReader for JsonReader<'a, R> where R: MmapBytesReader, @@ -251,8 +265,9 @@ where /// incompatible types in the input. In the event that a column contains mixed dtypes, is it unspecified whether an /// error is returned or whether elements of incompatible dtypes are replaced with `null`. 
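// Quick illustrative check of the `remove_bom` helper added above: a UTF-8
// byte-order mark is stripped before the buffer reaches the JSON parser, while
// UTF-16 input (either endianness) is rejected with a ComputeError.
fn demo_remove_bom() {
    let with_bom = [0xEF_u8, 0xBB, 0xBF, b'{', b'}'];
    assert_eq!(remove_bom(&with_bom).unwrap(), b"{}".as_slice());

    let utf16_le = [0xFF_u8, 0xFE, b'a', 0x00];
    assert!(remove_bom(&utf16_le).is_err());
}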
fn finish(mut self) -> PolarsResult { - let rb: ReaderBytes = (&mut self.reader).into(); - + let pre_rb: ReaderBytes = (&mut self.reader).into(); + let bytes = remove_bom(pre_rb.deref())?; + let rb = ReaderBytes::Borrowed(bytes); let out = match self.json_format { JsonFormat::Json => { polars_ensure!(!self.ignore_errors, InvalidOperation: "'ignore_errors' only supported in ndjson"); diff --git a/crates/polars-io/src/lib.rs b/crates/polars-io/src/lib.rs index 5aa6e7fcebab..f3540f4e13fd 100644 --- a/crates/polars-io/src/lib.rs +++ b/crates/polars-io/src/lib.rs @@ -1,6 +1,7 @@ #![cfg_attr(docsrs, feature(doc_auto_cfg))] #![cfg_attr(feature = "simd", feature(portable_simd))] #![allow(ambiguous_glob_reexports)] +extern crate core; #[cfg(feature = "avro")] pub mod avro; diff --git a/crates/polars-io/src/ndjson/core.rs b/crates/polars-io/src/ndjson/core.rs index 3390d1004b9d..706e49c80f42 100644 --- a/crates/polars-io/src/ndjson/core.rs +++ b/crates/polars-io/src/ndjson/core.rs @@ -14,7 +14,7 @@ use crate::mmap::{MmapBytesReader, ReaderBytes}; use crate::ndjson::buffer::*; use crate::predicates::PhysicalIoExpr; use crate::prelude::*; -use crate::RowIndex; +use crate::{RowIndex, SerReader}; const NEWLINE: u8 = b'\n'; const CLOSING_BRACKET: u8 = b'}'; diff --git a/crates/polars-io/src/parquet/read/async_impl.rs b/crates/polars-io/src/parquet/read/async_impl.rs index 97e4829581bc..812011af48bf 100644 --- a/crates/polars-io/src/parquet/read/async_impl.rs +++ b/crates/polars-io/src/parquet/read/async_impl.rs @@ -18,6 +18,7 @@ use crate::cloud::{ build_object_store, object_path_from_str, CloudLocation, CloudOptions, PolarsObjectStore, }; use crate::parquet::metadata::FileMetaDataRef; +use crate::parquet::read::metadata::PartitionedColumnChunkMD; use crate::pl_async::get_runtime; use crate::predicates::PhysicalIoExpr; @@ -277,8 +278,19 @@ impl FetchRowGroupsFromObjectStore { row_group_range .filter_map(|i| { let rg = &row_groups[i]; + + // TODO! + // Optimize this. Now we partition the predicate columns twice. (later on reading as well) + // I think we must add metadata context where we can cache and amortize the partitioning. + let mut part_md = PartitionedColumnChunkMD::new(rg); + let live = pred.live_variables(); + part_md.set_partitions( + live.as_ref() + .map(|vars| vars.iter().map(|s| s.as_ref()).collect::>()) + .as_ref(), + ); let should_be_read = - matches!(read_this_row_group(Some(pred), rg, &schema), Ok(true)); + matches!(read_this_row_group(Some(pred), &part_md, &schema), Ok(true)); // Already add the row groups that will be skipped to the prefetched data. if !should_be_read { diff --git a/crates/polars-io/src/parquet/read/metadata.rs b/crates/polars-io/src/parquet/read/metadata.rs new file mode 100644 index 000000000000..8f1a2c1642c8 --- /dev/null +++ b/crates/polars-io/src/parquet/read/metadata.rs @@ -0,0 +1,57 @@ +use hashbrown::hash_map::RawEntryMut; +use polars_parquet::read::{ColumnChunkMetaData, RowGroupMetaData}; +use polars_utils::aliases::{PlHashMap, PlHashSet}; +use polars_utils::idx_vec::UnitVec; +use polars_utils::unitvec; + +/// This is a utility struct that Partitions the `ColumnChunkMetaData` by `field.name == descriptor.path_in_schema[0]` +/// This is required to fix quadratic behavior in wide parquet files. See #18319. 
+pub struct PartitionedColumnChunkMD<'a> { + partitions: Option>>, + metadata: &'a RowGroupMetaData, +} + +impl<'a> PartitionedColumnChunkMD<'a> { + pub fn new(metadata: &'a RowGroupMetaData) -> Self { + Self { + partitions: Default::default(), + metadata, + } + } + + pub(super) fn num_rows(&self) -> usize { + self.metadata.num_rows() + } + + pub fn set_partitions(&mut self, field_names: Option<&PlHashSet<&str>>) { + let mut partitions = PlHashMap::default(); + for (i, ccmd) in self.metadata.columns().iter().enumerate() { + let name = &ccmd.descriptor().path_in_schema[0]; + if field_names + .map(|field_names| field_names.contains(name.as_str())) + .unwrap_or(true) + { + let entry = partitions.raw_entry_mut().from_key(name.as_str()); + + match entry { + RawEntryMut::Vacant(slot) => { + slot.insert(name.to_string(), unitvec![i]); + }, + RawEntryMut::Occupied(mut slot) => { + slot.get_mut().push(i); + }, + }; + } + } + self.partitions = Some(partitions) + } + + pub fn get_partitions(&self, name: &str) -> Option> { + let columns = self.metadata.columns(); + self.partitions + .as_ref() + .expect("fields should be partitioned first") + .get(name) + .map(|idx| idx.iter().map(|i| &columns[*i]).collect::>()) + } +} diff --git a/crates/polars-io/src/parquet/read/mmap.rs b/crates/polars-io/src/parquet/read/mmap.rs index 4489247e1a6f..84725fd7a2e1 100644 --- a/crates/polars-io/src/parquet/read/mmap.rs +++ b/crates/polars-io/src/parquet/read/mmap.rs @@ -6,8 +6,7 @@ use bytes::Bytes; use polars_core::datatypes::PlHashMap; use polars_error::PolarsResult; use polars_parquet::read::{ - column_iter_to_arrays, get_field_columns, BasicDecompressor, ColumnChunkMetaData, Filter, - PageReader, + column_iter_to_arrays, BasicDecompressor, ColumnChunkMetaData, Filter, PageReader, }; use polars_utils::mmap::{MemReader, MemSlice}; @@ -32,11 +31,10 @@ pub enum ColumnStore { /// For cloud files the relevant memory regions should have been prefetched. pub(super) fn mmap_columns<'a>( store: &'a ColumnStore, - columns: &'a [ColumnChunkMetaData], - field_name: &str, + field_columns: &'a [&ColumnChunkMetaData], ) -> Vec<(&'a ColumnChunkMetaData, MemSlice)> { - get_field_columns(columns, field_name) - .into_iter() + field_columns + .iter() .map(|meta| _mmap_single_column(store, meta)) .collect() } @@ -63,7 +61,7 @@ fn _mmap_single_column<'a>( // similar to arrow2 serializer, except this accepts a slice instead of a vec. 
// this allows us to memory map -pub(super) fn to_deserializer( +pub fn to_deserializer( columns: Vec<(&ColumnChunkMetaData, MemSlice)>, field: Field, filter: Option, diff --git a/crates/polars-io/src/parquet/read/mod.rs b/crates/polars-io/src/parquet/read/mod.rs index 9b965172c375..b6b337c3ff6e 100644 --- a/crates/polars-io/src/parquet/read/mod.rs +++ b/crates/polars-io/src/parquet/read/mod.rs @@ -16,6 +16,7 @@ #[cfg(feature = "cloud")] mod async_impl; +mod metadata; mod mmap; mod options; mod predicates; @@ -37,3 +38,9 @@ use polars_error::{ErrString, PolarsError}; pub use reader::ParquetAsyncReader; pub use reader::{BatchedParquetReader, ParquetReader}; pub use utils::materialize_empty_df; + +pub mod _internal { + pub use super::metadata::PartitionedColumnChunkMD; + pub use super::mmap::to_deserializer; + pub use super::predicates::read_this_row_group; +} diff --git a/crates/polars-io/src/parquet/read/predicates.rs b/crates/polars-io/src/parquet/read/predicates.rs index d3775864e1a3..565ef53f4edd 100644 --- a/crates/polars-io/src/parquet/read/predicates.rs +++ b/crates/polars-io/src/parquet/read/predicates.rs @@ -1,8 +1,7 @@ -use arrow::datatypes::ArrowSchemaRef; use polars_core::prelude::*; use polars_parquet::read::statistics::{deserialize, Statistics}; -use polars_parquet::read::RowGroupMetaData; +use crate::parquet::read::metadata::PartitionedColumnChunkMD; use crate::predicates::{BatchStats, ColumnStats, PhysicalIoExpr}; impl ColumnStats { @@ -16,37 +15,43 @@ impl ColumnStats { } } -/// Collect the statistics in a column chunk. +/// Collect the statistics in a row-group pub(crate) fn collect_statistics( - md: &RowGroupMetaData, + part_md: &PartitionedColumnChunkMD, schema: &ArrowSchema, ) -> PolarsResult> { - let mut stats = vec![]; + // TODO! fix this performance. This is a full sequential scan. + let stats = schema + .fields + .iter() + .map(|field| match part_md.get_partitions(&field.name) { + Some(md) => { + let st = deserialize(field, &md)?; + Ok(ColumnStats::from_arrow_stats(st, field)) + }, + None => Ok(ColumnStats::new(field.into(), None, None, None)), + }) + .collect::>>()?; - for field in schema.fields.iter() { - let st = deserialize(field, md)?; - stats.push(ColumnStats::from_arrow_stats(st, field)); + if stats.is_empty() { + return Ok(None); } - Ok(if stats.is_empty() { - None - } else { - Some(BatchStats::new( - Arc::new(schema.into()), - stats, - Some(md.num_rows()), - )) - }) + Ok(Some(BatchStats::new( + Arc::new(schema.into()), + stats, + Some(part_md.num_rows()), + ))) } -pub(super) fn read_this_row_group( +pub fn read_this_row_group( predicate: Option<&dyn PhysicalIoExpr>, - md: &RowGroupMetaData, - schema: &ArrowSchemaRef, + part_md: &PartitionedColumnChunkMD, + schema: &ArrowSchema, ) -> PolarsResult { if let Some(pred) = predicate { if let Some(pred) = pred.as_stats_evaluator() { - if let Some(stats) = collect_statistics(md, schema)? { + if let Some(stats) = collect_statistics(part_md, schema)? 
{ let should_read = pred.should_read(&stats); // a parquet file may not have statistics of all columns if matches!(should_read, Ok(false)) { diff --git a/crates/polars-io/src/parquet/read/read_impl.rs b/crates/polars-io/src/parquet/read/read_impl.rs index 15d35fcd285b..d0b1845cc8c5 100644 --- a/crates/polars-io/src/parquet/read/read_impl.rs +++ b/crates/polars-io/src/parquet/read/read_impl.rs @@ -9,7 +9,9 @@ use polars_core::utils::{accumulate_dataframes_vertical, split_df}; use polars_core::POOL; use polars_parquet::parquet::error::ParquetResult; use polars_parquet::parquet::statistics::Statistics; -use polars_parquet::read::{self, FileMetaData, Filter, PhysicalType, RowGroupMetaData}; +use polars_parquet::read::{ + self, ColumnChunkMetaData, FileMetaData, Filter, PhysicalType, RowGroupMetaData, +}; use polars_utils::mmap::MemSlice; use polars_utils::vec::inplace_zip_filtermap; use rayon::prelude::*; @@ -24,6 +26,7 @@ use super::{mmap, ParallelStrategy}; use crate::hive::materialize_hive_partitions; use crate::mmap::{MmapBytesReader, ReaderBytes}; use crate::parquet::metadata::FileMetaDataRef; +use crate::parquet::read::metadata::PartitionedColumnChunkMD; use crate::parquet::read::ROW_COUNT_OVERFLOW_ERR; use crate::predicates::{apply_predicate, PhysicalIoExpr}; use crate::utils::get_reader_bytes; @@ -58,7 +61,8 @@ fn assert_dtypes(data_type: &ArrowDataType) { fn column_idx_to_series( column_i: usize, - md: &RowGroupMetaData, + // The metadata belonging to this column + field_md: &[&ColumnChunkMetaData], filter: Option, file_schema: &ArrowSchema, store: &mmap::ColumnStore, @@ -69,8 +73,7 @@ fn column_idx_to_series( { assert_dtypes(field.data_type()) } - - let columns = mmap_columns(store, md.columns(), &field.name); + let columns = mmap_columns(store, field_md); let stats = columns .iter() .map(|(col_md, _)| col_md.statistics().transpose()) @@ -203,6 +206,24 @@ fn rg_to_dfs( } } +/// Collect a HashSet of the projected columns. +/// Returns `None` if all columns are projected. 
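// A condensed sketch of the lookup pattern the readers below rely on: partition a
// row group's column-chunk metadata once for the projected fields, then fetch the
// chunks of a single field by name instead of re-scanning every column chunk
// (the quadratic behaviour described in #18319). Assumes the imports used by the
// new metadata module; the function name is illustrative.
fn count_field_chunks(
    rg: &RowGroupMetaData,
    projected: Option<&PlHashSet<&str>>,
    field_name: &str,
) -> usize {
    let mut part_md = PartitionedColumnChunkMD::new(rg);
    part_md.set_partitions(projected);
    part_md
        .get_partitions(field_name)
        .map(|chunks| chunks.len())
        .unwrap_or(0)
}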
+fn projected_columns_set<'a>( + schema: &'a ArrowSchema, + projection: &[usize], +) -> Option> { + if projection.len() == schema.len() { + None + } else { + Some( + projection + .iter() + .map(|i| schema.fields[*i].name.as_str()) + .collect::>(), + ) + } +} + #[allow(clippy::too_many_arguments)] fn rg_to_dfs_prefiltered( store: &mmap::ColumnStore, @@ -227,9 +248,24 @@ fn rg_to_dfs_prefiltered( polars_bail!(ComputeError: "Parquet file contains too many row groups (> {})", u32::MAX); } + let projected_columns = projected_columns_set(schema, projection); + + let part_mds = POOL.install(|| { + file_metadata + .row_groups + .par_iter() + .map(|rg| { + let mut part_md = PartitionedColumnChunkMD::new(rg); + part_md.set_partitions(projected_columns.as_ref()); + part_md + }) + .collect::>() + }); + let mut row_offset = *previous_row_count; let mut row_groups: Vec = (row_group_start..row_group_end) .filter_map(|index| { + let part_md = &part_mds[index]; let md = &file_metadata.row_groups[index]; let current_offset = row_offset; @@ -237,8 +273,7 @@ fn rg_to_dfs_prefiltered( row_offset += current_row_count; if use_statistics { - match read_this_row_group(Some(predicate), &file_metadata.row_groups[index], schema) - { + match read_this_row_group(Some(predicate), part_md, schema) { Ok(false) => return None, Ok(true) => {}, Err(e) => return Some(Err(e)), @@ -252,38 +287,46 @@ fn rg_to_dfs_prefiltered( }) .collect::>>()?; - let num_live_columns = live_variables.len(); - let num_dead_columns = projection.len() - num_live_columns; - + // Deduplicate the live variables let live_variables = live_variables .iter() .map(Deref::deref) .collect::>(); + // Get the number of live columns + let num_live_columns = live_variables.len(); + let num_dead_columns = projection.len() - num_live_columns; + // We create two look-up tables that map indexes offsets into the live- and dead-set onto // column indexes of the schema. let mut live_idx_to_col_idx = Vec::with_capacity(num_live_columns); let mut dead_idx_to_col_idx = Vec::with_capacity(num_dead_columns); - for (i, col) in file_metadata.schema().columns().iter().enumerate() { - if live_variables.contains(col.path_in_schema[0].deref()) { + for (i, field) in schema.fields.iter().enumerate() { + if live_variables.contains(&field.name[..]) { live_idx_to_col_idx.push(i); } else { dead_idx_to_col_idx.push(i); } } - debug_assert_eq!(live_variables.len(), num_live_columns); + + debug_assert_eq!(live_idx_to_col_idx.len(), num_live_columns); debug_assert_eq!(dead_idx_to_col_idx.len(), num_dead_columns); POOL.install(|| { + // Set partitioned fields to prevent quadratic behavior. + // Ensure all row groups are partitioned. 
+ // Collect the data for the live columns let mut live_columns = (0..row_groups.len() * num_live_columns) .into_par_iter() .map(|i| { let col_idx = live_idx_to_col_idx[i % num_live_columns]; - let rg_idx = row_groups[i / num_live_columns].index as usize; - let md = &file_metadata.row_groups[rg_idx]; - column_idx_to_series(col_idx, md, None, schema, store) + let name = &schema.fields[col_idx].name; + let rg_idx = row_groups[i / num_live_columns].index; + let field_md = part_mds[rg_idx as usize].get_partitions(name).unwrap(); + + column_idx_to_series(col_idx, field_md.as_slice(), None, schema, store) }) .collect::>>()?; @@ -316,8 +359,12 @@ fn rg_to_dfs_prefiltered( let mut bitmap = MutableBitmap::with_capacity(mask.len()); + // We need to account for the validity of the items for chunk in mask.downcast_iter() { - bitmap.extend_from_bitmap(chunk.values()); + match chunk.validity() { + None => bitmap.extend_from_bitmap(chunk.values()), + Some(validity) => bitmap.extend_from_bitmap(&(validity & chunk.values())), + } } let bitmap = bitmap.freeze(); @@ -341,6 +388,11 @@ fn rg_to_dfs_prefiltered( .ok_or(ROW_COUNT_OVERFLOW_ERR)?; } + // We don't need to do any further work if there are no dead columns + if num_dead_columns == 0 { + return Ok(dfs.into_iter().map(|(_, df)| df).collect()); + } + // @TODO: Incorporate this if we how we can properly use it. The problem here is that // different columns really have a different cost when it comes to collecting them. We // would need a cost model to properly estimate this. @@ -378,15 +430,22 @@ fn rg_to_dfs_prefiltered( .into_par_iter() .map(|i| { let col_idx = dead_idx_to_col_idx[i % num_dead_columns]; - let rg_idx = row_groups[i / num_dead_columns].index as usize; + let name = &schema.fields[col_idx].name; let (mask, _) = &dfs[i / num_dead_columns]; - let md = &file_metadata.row_groups[rg_idx]; - debug_assert_eq!(md.num_rows(), mask.len()); + let rg_idx = row_groups[i / num_dead_columns].index; + + #[cfg(debug_assertions)] + { + let md = &file_metadata.row_groups[rg_idx as usize]; + debug_assert_eq!(md.num_rows(), mask.len()); + } + let field_md = part_mds[rg_idx as usize].get_partitions(name).unwrap(); + column_idx_to_series( col_idx, - md, + field_md.as_slice(), Some(Filter::new_masked(mask.clone())), schema, store, @@ -394,22 +453,10 @@ fn rg_to_dfs_prefiltered( }) .collect::>>()?; - let mut rearranged_schema: Schema = Schema::new(); - if let Some(rc) = &row_index { - rearranged_schema.insert_at_index( - 0, - SmartString::from(rc.name.deref()), - IdxType::get_dtype(), - )?; - } - for i in live_idx_to_col_idx.iter().copied() { - rearranged_schema.insert_at_index( - rearranged_schema.len(), - schema.fields[i].name.clone().into(), - schema.fields[i].data_type().into(), - )?; - } - rearranged_schema.merge(Schema::from(schema.as_ref())); + let Some(df) = dfs.first().map(|(_, df)| df) else { + return Ok(Vec::new()); + }; + let rearranged_schema = df.schema(); rg_columns .par_chunks_exact_mut(num_dead_columns) @@ -454,13 +501,17 @@ fn rg_to_dfs_optionally_par_over_columns( for rg_idx in row_group_start..row_group_end { let md = &file_metadata.row_groups[rg_idx]; + + // Set partitioned fields to prevent quadratic behavior. 
+ let projected_columns = projected_columns_set(schema, projection); + let mut part_md = PartitionedColumnChunkMD::new(md); + part_md.set_partitions(projected_columns.as_ref()); + let rg_slice = split_slice_at_file(&mut n_rows_processed, md.num_rows(), slice.0, slice_end); let current_row_count = md.num_rows() as IdxSize; - if use_statistics - && !read_this_row_group(predicate, &file_metadata.row_groups[rg_idx], schema)? - { + if use_statistics && !read_this_row_group(predicate, &part_md, schema)? { *previous_row_count += rg_slice.1 as IdxSize; continue; } @@ -475,9 +526,12 @@ fn rg_to_dfs_optionally_par_over_columns( projection .par_iter() .map(|column_i| { + let name = &schema.fields[*column_i].name; + let part = part_md.get_partitions(name).unwrap(); + column_idx_to_series( *column_i, - md, + part.as_slice(), Some(Filter::new_ranged(rg_slice.0, rg_slice.0 + rg_slice.1)), schema, store, @@ -489,9 +543,12 @@ fn rg_to_dfs_optionally_par_over_columns( projection .iter() .map(|column_i| { + let name = &schema.fields[*column_i].name; + let part = part_md.get_partitions(name).unwrap(); + column_idx_to_series( *column_i, - md, + part.as_slice(), Some(Filter::new_ranged(rg_slice.0, rg_slice.0 + rg_slice.1)), schema, store, @@ -508,7 +565,7 @@ fn rg_to_dfs_optionally_par_over_columns( materialize_hive_partitions(&mut df, schema.as_ref(), hive_partition_columns, rg_slice.1); apply_predicate(&mut df, predicate, true)?; - *previous_row_count = previous_row_count.checked_add(current_row_count).ok_or( + *previous_row_count = previous_row_count.checked_add(current_row_count).ok_or_else(|| polars_err!( ComputeError: "Parquet file produces more than pow(2, 32) rows; \ consider compiling with polars-bigidx feature (polars-u64-idx package on python), \ @@ -566,16 +623,28 @@ fn rg_to_dfs_par_over_rg( } let dfs = POOL.install(|| { + // Set partitioned fields to prevent quadratic behavior. + // Ensure all row groups are partitioned. + let part_mds = { + let projected_columns = projected_columns_set(schema, projection); + row_groups + .par_iter() + .map(|(_, rg, _, _)| { + let mut ccmd = PartitionedColumnChunkMD::new(rg); + ccmd.set_partitions(projected_columns.as_ref()); + ccmd + }) + .collect::>() + }; + row_groups .into_par_iter() - .map(|(rg_idx, md, slice, row_count_start)| { + .enumerate() + .map(|(iter_idx, (_rg_idx, _md, slice, row_count_start))| { + let part_md = &part_mds[iter_idx]; + if slice.1 == 0 - || use_statistics - && !read_this_row_group( - predicate, - &file_metadata.row_groups[rg_idx], - schema, - )? + || use_statistics && !read_this_row_group(predicate, part_md, schema)? 
{ return Ok(None); } @@ -588,9 +657,12 @@ fn rg_to_dfs_par_over_rg( let columns = projection .iter() .map(|column_i| { + let name = &schema.fields[*column_i].name; + let field_md = part_md.get_partitions(name).unwrap(); + column_idx_to_series( *column_i, - md, + field_md.as_slice(), Some(Filter::new_ranged(slice.0, slice.0 + slice.1)), schema, store, @@ -1022,7 +1094,7 @@ impl BatchedParquetReader { // Re-use the same ChunkedArray if ca.len() < max_len { - *ca = ca.new_from_index(max_len, 0); + *ca = ca.new_from_index(0, max_len); } for df in &mut dfs { diff --git a/crates/polars-io/src/utils/byte_source.rs b/crates/polars-io/src/utils/byte_source.rs new file mode 100644 index 000000000000..fce7e795ce46 --- /dev/null +++ b/crates/polars-io/src/utils/byte_source.rs @@ -0,0 +1,176 @@ +use std::ops::Range; +use std::sync::Arc; + +use polars_error::{to_compute_err, PolarsResult}; +use polars_utils::_limit_path_len_io_err; +use polars_utils::mmap::MemSlice; + +use crate::cloud::{ + build_object_store, object_path_from_str, CloudLocation, CloudOptions, ObjectStorePath, + PolarsObjectStore, +}; + +#[allow(async_fn_in_trait)] +pub trait ByteSource: Send + Sync { + async fn get_size(&self) -> PolarsResult; + /// # Panics + /// Panics if `range` is not in bounds. + async fn get_range(&self, range: Range) -> PolarsResult; + async fn get_ranges(&self, ranges: &[Range]) -> PolarsResult>; +} + +/// Byte source backed by a `MemSlice`, which can potentially be memory-mapped. +pub struct MemSliceByteSource(pub MemSlice); + +impl MemSliceByteSource { + async fn try_new_mmap_from_path( + path: &str, + _cloud_options: Option<&CloudOptions>, + ) -> PolarsResult { + let file = Arc::new( + tokio::fs::File::open(path) + .await + .map_err(|err| _limit_path_len_io_err(path.as_ref(), err))? + .into_std() + .await, + ); + let mmap = Arc::new(unsafe { memmap::Mmap::map(file.as_ref()) }.map_err(to_compute_err)?); + + Ok(Self(MemSlice::from_mmap(mmap))) + } +} + +impl ByteSource for MemSliceByteSource { + async fn get_size(&self) -> PolarsResult { + Ok(self.0.as_ref().len()) + } + + async fn get_range(&self, range: Range) -> PolarsResult { + let out = self.0.slice(range); + Ok(out) + } + + async fn get_ranges(&self, ranges: &[Range]) -> PolarsResult> { + Ok(ranges + .iter() + .map(|x| self.0.slice(x.clone())) + .collect::>()) + } +} + +pub struct ObjectStoreByteSource { + store: PolarsObjectStore, + path: ObjectStorePath, +} + +impl ObjectStoreByteSource { + async fn try_new_from_path( + path: &str, + cloud_options: Option<&CloudOptions>, + ) -> PolarsResult { + let (CloudLocation { prefix, .. }, store) = + build_object_store(path, cloud_options, false).await?; + let path = object_path_from_str(&prefix)?; + let store = PolarsObjectStore::new(store); + + Ok(Self { store, path }) + } +} + +impl ByteSource for ObjectStoreByteSource { + async fn get_size(&self) -> PolarsResult { + Ok(self.store.head(&self.path).await?.size) + } + + async fn get_range(&self, range: Range) -> PolarsResult { + let bytes = self.store.get_range(&self.path, range).await?; + let mem_slice = MemSlice::from_bytes(bytes); + + Ok(mem_slice) + } + + async fn get_ranges(&self, ranges: &[Range]) -> PolarsResult> { + let ranges = self.store.get_ranges(&self.path, ranges).await?; + Ok(ranges.into_iter().map(MemSlice::from_bytes).collect()) + } +} + +/// Dynamic dispatch to async functions. 
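// Usage sketch for the `ByteSource` abstraction above: the same async calls serve
// a memory-mapped local file and an object store, and the `DynByteSource` enum
// defined next dispatches between the two at runtime. The function name is
// illustrative.
async fn demo_byte_source(src: &DynByteSource) -> PolarsResult<()> {
    let size = src.get_size().await?;
    // Read at most the first kilobyte of the source.
    let head = src.get_range(0..size.min(1024)).await?;
    assert!(head.len() <= 1024);
    Ok(())
}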
+pub enum DynByteSource { + MemSlice(MemSliceByteSource), + Cloud(ObjectStoreByteSource), +} + +impl DynByteSource { + pub fn variant_name(&self) -> &str { + match self { + Self::MemSlice(_) => "MemSlice", + Self::Cloud(_) => "Cloud", + } + } +} + +impl Default for DynByteSource { + fn default() -> Self { + Self::MemSlice(MemSliceByteSource(MemSlice::default())) + } +} + +impl ByteSource for DynByteSource { + async fn get_size(&self) -> PolarsResult { + match self { + Self::MemSlice(v) => v.get_size().await, + Self::Cloud(v) => v.get_size().await, + } + } + + async fn get_range(&self, range: Range) -> PolarsResult { + match self { + Self::MemSlice(v) => v.get_range(range).await, + Self::Cloud(v) => v.get_range(range).await, + } + } + + async fn get_ranges(&self, ranges: &[Range]) -> PolarsResult> { + match self { + Self::MemSlice(v) => v.get_ranges(ranges).await, + Self::Cloud(v) => v.get_ranges(ranges).await, + } + } +} + +impl From for DynByteSource { + fn from(value: MemSliceByteSource) -> Self { + Self::MemSlice(value) + } +} + +impl From for DynByteSource { + fn from(value: ObjectStoreByteSource) -> Self { + Self::Cloud(value) + } +} + +#[derive(Clone, Debug)] +pub enum DynByteSourceBuilder { + Mmap, + /// Supports both cloud and local files. + ObjectStore, +} + +impl DynByteSourceBuilder { + pub async fn try_build_from_path( + &self, + path: &str, + cloud_options: Option<&CloudOptions>, + ) -> PolarsResult { + Ok(match self { + Self::Mmap => MemSliceByteSource::try_new_mmap_from_path(path, cloud_options) + .await? + .into(), + Self::ObjectStore => ObjectStoreByteSource::try_new_from_path(path, cloud_options) + .await? + .into(), + }) + } +} diff --git a/crates/polars-io/src/utils/mod.rs b/crates/polars-io/src/utils/mod.rs index 5ed22c76561c..87c80b1b5c5a 100644 --- a/crates/polars-io/src/utils/mod.rs +++ b/crates/polars-io/src/utils/mod.rs @@ -3,6 +3,8 @@ mod other; pub use compression::is_compressed; pub use other::*; +#[cfg(feature = "cloud")] +pub mod byte_source; pub mod slice; pub const URL_ENCODE_CHAR_SET: &percent_encoding::AsciiSet = &percent_encoding::CONTROLS diff --git a/crates/polars-io/src/utils/slice.rs b/crates/polars-io/src/utils/slice.rs index 78ff29cf1b29..24a3b7dc1ab8 100644 --- a/crates/polars-io/src/utils/slice.rs +++ b/crates/polars-io/src/utils/slice.rs @@ -1,33 +1,58 @@ /// Given a `slice` that is relative to the start of a list of files, calculate the slice to apply /// at a file with a row offset of `current_row_offset`. pub fn split_slice_at_file( - current_row_offset: &mut usize, + current_row_offset_ref: &mut usize, n_rows_this_file: usize, global_slice_start: usize, global_slice_end: usize, ) -> (usize, usize) { - let next_file_offset = *current_row_offset + n_rows_this_file; - // e.g. - // slice: (start: 1, end: 2) - // files: - // 0: (1 row): current_offset: 0, next_file_offset: 1 - // 1: (1 row): current_offset: 1, next_file_offset: 2 - // 2: (1 row): current_offset: 2, next_file_offset: 3 - // in this example we want to include only file 1. 
- let has_overlap_with_slice = - *current_row_offset < global_slice_end && next_file_offset > global_slice_start; + let current_row_offset = *current_row_offset_ref; + *current_row_offset_ref += n_rows_this_file; + match SplitSlicePosition::split_slice_at_file( + current_row_offset, + n_rows_this_file, + global_slice_start..global_slice_end, + ) { + SplitSlicePosition::Overlapping(offset, len) => (offset, len), + SplitSlicePosition::Before | SplitSlicePosition::After => (0, 0), + } +} + +#[derive(Debug)] +pub enum SplitSlicePosition { + Before, + Overlapping(usize, usize), + After, +} + +impl SplitSlicePosition { + pub fn split_slice_at_file( + current_row_offset: usize, + n_rows_this_file: usize, + global_slice: std::ops::Range, + ) -> Self { + // e.g. + // slice: (start: 1, end: 2) + // files: + // 0: (1 row): current_offset: 0, next_file_offset: 1 + // 1: (1 row): current_offset: 1, next_file_offset: 2 + // 2: (1 row): current_offset: 2, next_file_offset: 3 + // in this example we want to include only file 1. + + let next_row_offset = current_row_offset + n_rows_this_file; - let (rel_start, slice_len) = if !has_overlap_with_slice { - (0, 0) - } else { - let n_rows_to_skip = global_slice_start.saturating_sub(*current_row_offset); - let n_excess_rows = next_file_offset.saturating_sub(global_slice_end); - ( - n_rows_to_skip, - n_rows_this_file - n_rows_to_skip - n_excess_rows, - ) - }; + if next_row_offset <= global_slice.start { + Self::Before + } else if current_row_offset >= global_slice.end { + Self::After + } else { + let n_rows_to_skip = global_slice.start.saturating_sub(current_row_offset); + let n_excess_rows = next_row_offset.saturating_sub(global_slice.end); - *current_row_offset = next_file_offset; - (rel_start, slice_len) + Self::Overlapping( + n_rows_to_skip, + n_rows_this_file - n_rows_to_skip - n_excess_rows, + ) + } + } } diff --git a/crates/polars-json/src/ndjson/file.rs b/crates/polars-json/src/ndjson/file.rs index 1f4af394b78c..08f059b685d2 100644 --- a/crates/polars-json/src/ndjson/file.rs +++ b/crates/polars-json/src/ndjson/file.rs @@ -90,6 +90,7 @@ fn parse_value<'a>(scratch: &'a mut Vec, val: &[u8]) -> PolarsResult PolarsResult { + pub fn collect_schema(&mut self) -> PolarsResult { let mut cached_arenas = self.cached_arena.lock().unwrap(); match &mut *cached_arenas { diff --git a/crates/polars-lazy/src/frame/mod.rs b/crates/polars-lazy/src/frame/mod.rs index 9f81d07a97d3..e97633f4433d 100644 --- a/crates/polars-lazy/src/frame/mod.rs +++ b/crates/polars-lazy/src/frame/mod.rs @@ -67,6 +67,7 @@ impl IntoLazy for LazyFrame { } /// Lazy abstraction over an eager `DataFrame`. +/// /// It really is an abstraction over a logical plan. The methods of this struct will incrementally /// modify a logical plan until output is requested (via [`collect`](crate::frame::LazyFrame::collect)). #[derive(Clone, Default)] @@ -582,10 +583,19 @@ impl LazyFrame { #[allow(unused_mut)] let mut opt_state = self.opt_state; let streaming = self.opt_state.contains(OptState::STREAMING); + let new_streaming = self.opt_state.contains(OptState::NEW_STREAMING); #[cfg(feature = "cse")] - if streaming && self.opt_state.contains(OptState::COMM_SUBPLAN_ELIM) { + if streaming && !new_streaming { opt_state &= !OptState::COMM_SUBPLAN_ELIM; } + + // The new streaming engine can't deal with the way the common + // subexpression elimination adds length-incorrect with_columns. 
+ #[cfg(feature = "cse")] + if new_streaming { + opt_state &= !OptState::COMM_SUBEXPR_ELIM; + } + let lp_top = optimize( self.logical_plan, opt_state, @@ -694,48 +704,45 @@ impl LazyFrame { pub fn collect(self) -> PolarsResult { #[cfg(feature = "new_streaming")] { - let force_new_streaming = self.opt_state.contains(OptState::NEW_STREAMING); - let mut alp_plan = self.to_alp_optimized()?; - let stream_lp_top = alp_plan.lp_arena.add(IR::Sink { - input: alp_plan.lp_top, - payload: SinkType::Memory, - }); - - if force_new_streaming { - return polars_stream::run_query( - stream_lp_top, - alp_plan.lp_arena, - &alp_plan.expr_arena, - ); - } + let auto_new_streaming = + std::env::var("POLARS_AUTO_NEW_STREAMING").as_deref() == Ok("1"); + if self.opt_state.contains(OptState::NEW_STREAMING) || auto_new_streaming { + // Try to run using the new streaming engine, falling back + // if it fails in a todo!() error if auto_new_streaming is set. + let mut new_stream_lazy = self.clone(); + new_stream_lazy.opt_state |= OptState::NEW_STREAMING; + let mut alp_plan = new_stream_lazy.to_alp_optimized()?; + let stream_lp_top = alp_plan.lp_arena.add(IR::Sink { + input: alp_plan.lp_top, + payload: SinkType::Memory, + }); - if std::env::var("POLARS_AUTO_NEW_STREAMING") - .as_deref() - .unwrap_or("") - == "1" - { let f = || { polars_stream::run_query( stream_lp_top, - alp_plan.lp_arena.clone(), - &alp_plan.expr_arena, + alp_plan.lp_arena, + &mut alp_plan.expr_arena, ) }; match std::panic::catch_unwind(std::panic::AssertUnwindSafe(f)) { Ok(r) => return r, Err(e) => { - // Fallback to normal engine if error is due to not being implemented, - // otherwise propagate error. - if e.downcast_ref::<&str>() != Some(&"not yet implemented") { + // Fallback to normal engine if error is due to not being implemented + // and auto_new_streaming is set, otherwise propagate error. + if auto_new_streaming + && e.downcast_ref::<&str>() == Some(&"not yet implemented") + { if polars_core::config::verbose() { eprintln!("caught unimplemented error in new streaming engine, falling back to normal engine"); } + } else { std::panic::resume_unwind(e); } }, } } + let mut alp_plan = self.to_alp_optimized()?; let mut physical_plan = create_physical_plan( alp_plan.lp_top, &mut alp_plan.lp_arena, @@ -1045,7 +1052,7 @@ impl LazyFrame { options.index_column = name.as_ref().into(); } else { let output_field = index_column - .to_field(&self.schema().unwrap(), Context::Default) + .to_field(&self.collect_schema().unwrap(), Context::Default) .unwrap(); return self.with_column(index_column).rolling( Expr::Column(Arc::from(output_field.name().as_str())), @@ -1090,7 +1097,7 @@ impl LazyFrame { options.index_column = name.as_ref().into(); } else { let output_field = index_column - .to_field(&self.schema().unwrap(), Context::Default) + .to_field(&self.collect_schema().unwrap(), Context::Default) .unwrap(); return self.with_column(index_column).group_by_dynamic( Expr::Column(Arc::from(output_field.name().as_str())), @@ -1513,13 +1520,25 @@ impl LazyFrame { /// Apply explode operation. [See eager explode](polars_core::frame::DataFrame::explode). pub fn explode, IE: Into + Clone>(self, columns: E) -> LazyFrame { + self.explode_impl(columns, false) + } + + /// Apply explode operation. [See eager explode](polars_core::frame::DataFrame::explode). 
+ fn explode_impl, IE: Into + Clone>( + self, + columns: E, + allow_empty: bool, + ) -> LazyFrame { let columns = columns .as_ref() .iter() .map(|e| e.clone().into()) .collect::>(); let opt_state = self.get_opt_state(); - let lp = self.get_plan_builder().explode(columns).build(); + let lp = self + .get_plan_builder() + .explode(columns, allow_empty) + .build(); Self::from_logical_plan(lp, opt_state) } @@ -1877,7 +1896,7 @@ impl LazyGroupBy { .collect::>(); self.agg([col("*").exclude(&keys).head(n)]) - .explode([col("*").exclude(&keys)]) + .explode_impl([col("*").exclude(&keys)], true) } /// Return last n rows of each group @@ -1889,7 +1908,7 @@ impl LazyGroupBy { .collect::>(); self.agg([col("*").exclude(&keys).tail(n)]) - .explode([col("*").exclude(&keys)]) + .explode_impl([col("*").exclude(&keys)], true) } /// Apply a function over the groups as a new DataFrame. diff --git a/crates/polars-lazy/src/frame/pivot.rs b/crates/polars-lazy/src/frame/pivot.rs index 759981c52f0e..eedcdc700e1e 100644 --- a/crates/polars-lazy/src/frame/pivot.rs +++ b/crates/polars-lazy/src/frame/pivot.rs @@ -1,3 +1,5 @@ +//! Module containing implementation of the pivot operation. +//! //! Polars lazy does not implement a pivot because it is impossible to know the schema without //! materializing the whole dataset. This makes a pivot quite a terrible operation for performant //! workflows. An optimization can never be pushed down passed a pivot. diff --git a/crates/polars-lazy/src/tests/optimization_checks.rs b/crates/polars-lazy/src/tests/optimization_checks.rs index ffdfedd9bfc5..ecaaba71056d 100644 --- a/crates/polars-lazy/src/tests/optimization_checks.rs +++ b/crates/polars-lazy/src/tests/optimization_checks.rs @@ -495,7 +495,7 @@ fn test_with_column_prune() -> PolarsResult<()> { matches!(lp, SimpleProjection { .. } | DataFrameScan { .. }) })); assert_eq!( - q.schema().unwrap().as_ref(), + q.collect_schema().unwrap().as_ref(), &Schema::from_iter([Field::new("c1", DataType::Int32)]) ); Ok(()) diff --git a/crates/polars-lazy/src/tests/queries.rs b/crates/polars-lazy/src/tests/queries.rs index fe777499812d..d32efc4b295e 100644 --- a/crates/polars-lazy/src/tests/queries.rs +++ b/crates/polars-lazy/src/tests/queries.rs @@ -1440,7 +1440,7 @@ fn test_when_then_schema() -> PolarsResult<()> { .select([when(col("A").gt(lit(1))) .then(Null {}.lit()) .otherwise(col("A"))]) - .schema(); + .collect_schema(); assert_ne!(schema?.get_at_index(0).unwrap().1, &DataType::Null); Ok(()) diff --git a/crates/polars-mem-engine/src/executors/scan/parquet.rs b/crates/polars-mem-engine/src/executors/scan/parquet.rs index a78dbf113151..bc3f69ac95ab 100644 --- a/crates/polars-mem-engine/src/executors/scan/parquet.rs +++ b/crates/polars-mem-engine/src/executors/scan/parquet.rs @@ -187,15 +187,14 @@ impl ParquetExec { readers_and_metadata .into_par_iter() .zip(row_statistics.into_par_iter()) - .enumerate() .map( - |(i, ((reader, _, predicate, projection), (cumulative_read, slice)))| { + |((reader, _, predicate, projection), (cumulative_read, slice))| { let row_index = base_row_index.as_ref().map(|rc| RowIndex { name: rc.name.clone(), offset: rc.offset + cumulative_read as IdxSize, }); - let mut df = reader + let df = reader .with_slice(Some(slice)) .with_row_index(row_index) .with_predicate(predicate.clone()) @@ -210,20 +209,6 @@ impl ParquetExec { )? 
.finish()?; - if let Some(col) = &self.file_options.include_file_paths { - let path = paths[i].to_str().unwrap(); - unsafe { - df.with_column_unchecked( - StringChunked::full( - col, - path, - std::cmp::max(df.height(), slice.1), - ) - .into_series(), - ) - }; - } - Ok(df) }, ) diff --git a/crates/polars-ops/Cargo.toml b/crates/polars-ops/Cargo.toml index 3bbdb10fcaf0..163b45726837 100644 --- a/crates/polars-ops/Cargo.toml +++ b/crates/polars-ops/Cargo.toml @@ -79,10 +79,11 @@ business = ["dtype-date", "chrono"] fused = [] cutqcut = ["dtype-categorical", "dtype-struct"] rle = ["dtype-struct"] -timezones = ["chrono-tz", "chrono"] +timezones = ["chrono", "chrono-tz", "polars-core/temporal", "polars-core/timezones", "polars-core/dtype-datetime"] random = ["rand", "rand_distr"] rank = ["rand"] find_many = ["aho-corasick"] +serde = ["dep:serde", "polars-core/serde"] # extra utilities for BinaryChunked binary_encoding = ["base64", "hex"] @@ -112,7 +113,7 @@ mode = [] search_sorted = [] merge_sorted = [] top_k = [] -pivot = ["polars-core/reinterpret"] +pivot = ["polars-core/reinterpret", "polars-core/dtype-struct"] cross_join = [] chunked_ids = [] asof_join = [] @@ -123,7 +124,7 @@ list_gather = [] list_sets = [] list_any_all = [] list_drop_nulls = [] -list_sample = [] +list_sample = ["polars-core/random"] extract_groups = ["dtype-struct", "polars-core/regex"] is_in = ["polars-core/reinterpret"] hist = ["dtype-categorical", "dtype-struct"] diff --git a/crates/polars-ops/src/chunked_array/hist.rs b/crates/polars-ops/src/chunked_array/hist.rs index d2a0acc76239..455a0c6cc921 100644 --- a/crates/polars-ops/src/chunked_array/hist.rs +++ b/crates/polars-ops/src/chunked_array/hist.rs @@ -3,7 +3,6 @@ use std::fmt::Write; use num_traits::ToPrimitive; use polars_core::prelude::*; use polars_core::with_match_physical_numeric_polars_type; -use polars_utils::float::IsFloat; use polars_utils::total_ord::ToTotalOrd; fn compute_hist( @@ -17,6 +16,7 @@ where T: PolarsNumericType, ChunkedArray: ChunkAgg, { + let mut lower_bound: f64; let (breaks, count) = if let Some(bins) = bins { let mut breaks = Vec::with_capacity(bins.len() + 1); breaks.extend_from_slice(bins); @@ -31,7 +31,7 @@ where // We start with the lower garbage bin. // (-inf, B0] - let mut lower_bound = f64::NEG_INFINITY; + lower_bound = f64::NEG_INFINITY; let mut upper_bound = *breaks_iter.next().unwrap(); for chunk in sorted.downcast_iter() { @@ -60,17 +60,17 @@ where while count.len() < breaks.len() { count.push(0) } + // Push lower bound to infinity + lower_bound = f64::NEG_INFINITY; (breaks, count) } else if ca.null_count() == ca.len() { + lower_bound = f64::NEG_INFINITY; let breaks: Vec = vec![f64::INFINITY]; let count: Vec = vec![0]; (breaks, count) } else { - let min = ChunkAgg::min(ca).unwrap().to_f64().unwrap(); - let max = ChunkAgg::max(ca).unwrap().to_f64().unwrap(); - - let start = min.floor() - 1.0; - let end = max.ceil() + 1.0; + let start = ChunkAgg::min(ca).unwrap().to_f64().unwrap(); + let end = ChunkAgg::max(ca).unwrap().to_f64().unwrap(); // If bin_count is omitted, default to the difference between start and stop (unit bins) let bin_count = if let Some(bin_count) = bin_count { @@ -79,37 +79,24 @@ where (end - start).round() as usize }; - // Calculate the breakpoints and make the array + // Calculate the breakpoints and make the array. The breakpoints form the RHS of the bins. 
let interval = (end - start) / (bin_count as f64); - - let breaks_iter = (0..(bin_count)).map(|b| start + (b as f64) * interval); - + let breaks_iter = (1..(bin_count)).map(|b| start + (b as f64) * interval); let mut breaks = Vec::with_capacity(breaks_iter.size_hint().0 + 1); breaks.extend(breaks_iter); - breaks.push(f64::INFINITY); - let mut count: Vec = vec![0; breaks.len()]; - let end_idx = count.len() - 1; + // Extend the left-most edge by 0.1% of the total range to include the minimum value. + let margin = (end - start) * 0.001; + lower_bound = start - margin; + breaks.push(end); - // start is the closed rhs of the interval, so we subtract the bucket width - let start_range = start - interval; + let mut count: Vec = vec![0; bin_count]; + let max_bin = breaks.len() - 1; for chunk in ca.downcast_iter() { for item in chunk.non_null_values_iter() { - let item = item.to_f64().unwrap() - start_range; - - // This is needed for numeric stability. - // Only for integers. - // we can fall directly on a boundary with an integer. - let item = item / interval; - let item = if !T::Native::is_float() && (item.round() - item).abs() < 0.0000001 { - item.round() - 1.0 - } else { - item.ceil() - 1.0 - }; - - let idx = item as usize; - let idx = std::cmp::min(idx, end_idx); - count[idx] += 1; + let item = item.to_f64().unwrap(); + let bin = ((((item - start) / interval).ceil() - 1.0) as usize).min(max_bin); + count[bin] += 1; } } (breaks, count) @@ -117,7 +104,7 @@ where let mut fields = Vec::with_capacity(3); if include_category { // Use AnyValue for formatting. - let mut lower = AnyValue::Float64(f64::NEG_INFINITY); + let mut lower = AnyValue::Float64(lower_bound); let mut categories = StringChunkedBuilder::new("category", breaks.len()); let mut buf = String::new(); diff --git a/crates/polars-ops/src/chunked_array/top_k.rs b/crates/polars-ops/src/chunked_array/top_k.rs index f5948d0c88a4..9772a5593be0 100644 --- a/crates/polars-ops/src/chunked_array/top_k.rs +++ b/crates/polars-ops/src/chunked_array/top_k.rs @@ -204,6 +204,7 @@ pub fn top_k(s: &[Series], descending: bool) -> PolarsResult { Ok(ca.into_series()) }, DataType::Binary => Ok(top_k_binary_impl(s.binary().unwrap(), k, descending).into_series()), + #[cfg(feature = "dtype-decimal")] DataType::Decimal(_, _) => { let src = src.decimal().unwrap(); let ca = top_k_num_impl(src, k, descending); @@ -212,6 +213,7 @@ pub fn top_k(s: &[Series], descending: bool) -> PolarsResult { Ok(lca.into_series()) }, DataType::Null => Ok(src.slice(0, k)), + #[cfg(feature = "dtype-struct")] DataType::Struct(_) => { // Fallback to more generic impl. 
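For clarity, a small standalone sketch (hypothetical helper name, not code from this PR) of the bin-index arithmetic introduced in the compute_hist hunk above: breakpoints form the right-hand, closed edge of each bin, an item is mapped with a ceiling division, and Rust's saturating float-to-usize cast places the minimum value into bin 0 while the clamp keeps the maximum value in the last bin.

/// Sketch of the bin assignment used above.
fn bin_index(item: f64, start: f64, interval: f64, max_bin: usize) -> usize {
    // `as usize` saturates at 0 for negative values, so `item == start` falls into bin 0.
    ((((item - start) / interval).ceil() - 1.0) as usize).min(max_bin)
}

fn main() {
    // 5 bins of width 2 over [0, 10]: right-closed edges at 2, 4, 6, 8, 10.
    let (start, interval, max_bin) = (0.0, 2.0, 4);
    assert_eq!(bin_index(0.0, start, interval, max_bin), 0); // minimum value
    assert_eq!(bin_index(2.0, start, interval, max_bin), 0); // value on an edge -> lower bin
    assert_eq!(bin_index(4.5, start, interval, max_bin), 2); // falls in (4, 6]
    assert_eq!(bin_index(10.0, start, interval, max_bin), 4); // maximum value, clamped
}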
top_k_by_impl(k, src, &[src.clone()], vec![descending]) diff --git a/crates/polars-ops/src/frame/pivot/unpivot.rs b/crates/polars-ops/src/frame/pivot/unpivot.rs index 3b45b1986fa5..289529d4b4f4 100644 --- a/crates/polars-ops/src/frame/pivot/unpivot.rs +++ b/crates/polars-ops/src/frame/pivot/unpivot.rs @@ -104,7 +104,7 @@ pub trait UnpivotDF: IntoDf { // return empty frame if there are no columns available to use as value vars if index.len() == self_.width() { let variable_col = Series::new_empty(variable_name, &DataType::String); - let value_col = Series::new_empty(variable_name, &DataType::Null); + let value_col = Series::new_empty(value_name, &DataType::Null); let mut out = self_.select(index).unwrap().clear().take_columns(); out.push(variable_col); @@ -193,6 +193,7 @@ impl UnpivotDF for DataFrame {} #[cfg(test)] mod test { use polars_core::df; + use polars_core::utils::Container; use super::*; @@ -205,12 +206,31 @@ mod test { ) .unwrap(); + // Specify on and index let unpivoted = df.unpivot(["C", "D"], ["A", "B"])?; + assert_eq!( + unpivoted.get_column_names(), + &["A", "B", "variable", "value"] + ); assert_eq!( Vec::from(unpivoted.column("value")?.i32()?), &[Some(10), Some(11), Some(12), Some(2), Some(4), Some(6)] ); + // Specify custom column names + let args = UnpivotArgsIR { + on: vec!["C".into(), "D".into()], + index: vec!["A".into(), "B".into()], + variable_name: Some("custom_variable".into()), + value_name: Some("custom_value".into()), + }; + let unpivoted = df.unpivot2(args).unwrap(); + assert_eq!( + unpivoted.get_column_names(), + &["A", "B", "custom_variable", "custom_value"] + ); + + // Specify neither on nor index let args = UnpivotArgsIR { on: vec![], index: vec![], @@ -218,6 +238,7 @@ mod test { }; let unpivoted = df.unpivot2(args).unwrap(); + assert_eq!(unpivoted.get_column_names(), &["variable", "value"]); let value = unpivoted.column("value")?; // String because of supertype let value = value.str()?; @@ -227,6 +248,7 @@ mod test { &["a", "b", "a", "1", "3", "5", "10", "11", "12", "2", "4", "6"] ); + // Specify index but not on let args = UnpivotArgsIR { on: vec![], index: vec!["A".into()], @@ -234,6 +256,7 @@ mod test { }; let unpivoted = df.unpivot2(args).unwrap(); + assert_eq!(unpivoted.get_column_names(), &["A", "variable", "value"]); let value = unpivoted.column("value")?; let value = value.i32()?; let value = value.into_no_null_iter().collect::>(); @@ -243,6 +266,20 @@ mod test { let variable = variable.into_no_null_iter().collect::>(); assert_eq!(variable, &["B", "B", "B", "C", "C", "C", "D", "D", "D"]); assert!(unpivoted.column("A").is_ok()); + + // Specify all columns in index + let args = UnpivotArgsIR { + on: vec![], + index: vec!["A".into(), "B".into(), "C".into(), "D".into()], + ..Default::default() + }; + let unpivoted = df.unpivot2(args).unwrap(); + assert_eq!( + unpivoted.get_column_names(), + &["A", "B", "C", "D", "variable", "value"] + ); + assert_eq!(unpivoted.len(), 0); + Ok(()) } } diff --git a/crates/polars-parquet/src/arrow/read/deserialize/binary/basic.rs b/crates/polars-parquet/src/arrow/read/deserialize/binary/basic.rs index 2d56f09d0af1..67e895fd173e 100644 --- a/crates/polars-parquet/src/arrow/read/deserialize/binary/basic.rs +++ b/crates/polars-parquet/src/arrow/read/deserialize/binary/basic.rs @@ -144,6 +144,10 @@ impl<'a, 'b, O: Offset> BatchableCollector<(), Binary> for DeltaCollector<'a, target.extend_constant(n); Ok(()) } + + fn skip_in_place(&mut self, n: usize) -> ParquetResult<()> { + self.decoder.skip_in_place(n) + } } impl<'a, 'b, 
O: Offset> BatchableCollector<(), Binary> for DeltaBytesCollector<'a, 'b, O> { @@ -159,6 +163,10 @@ impl<'a, 'b, O: Offset> BatchableCollector<(), Binary> for DeltaBytesCollecto target.extend_constant(n); Ok(()) } + + fn skip_in_place(&mut self, n: usize) -> ParquetResult<()> { + self.decoder.skip_in_place(n) + } } impl<'a, O: Offset> StateTranslation<'a, BinaryDecoder> for BinaryStateTranslation<'a> { @@ -210,7 +218,7 @@ impl<'a, O: Offset> StateTranslation<'a, BinaryDecoder> for BinaryStateTransl page.dict, additional, )?, - T::Delta(ref mut page) => { + T::DeltaLengthByteArray(ref mut page, ref mut _lengths) => { let (values, validity) = decoded; let mut collector = DeltaCollector { diff --git a/crates/polars-parquet/src/arrow/read/deserialize/binary/decoders.rs b/crates/polars-parquet/src/arrow/read/deserialize/binary/decoders.rs index 53c25d8050b9..fc98e039229e 100644 --- a/crates/polars-parquet/src/arrow/read/deserialize/binary/decoders.rs +++ b/crates/polars-parquet/src/arrow/read/deserialize/binary/decoders.rs @@ -39,7 +39,7 @@ impl<'a> ValuesDictionary<'a> { pub(crate) enum BinaryStateTranslation<'a> { Plain(BinaryIter<'a>), Dictionary(ValuesDictionary<'a>), - Delta(delta_length_byte_array::Decoder<'a>), + DeltaLengthByteArray(delta_length_byte_array::Decoder<'a>, Vec), DeltaBytes(delta_byte_array::Decoder<'a>), } @@ -67,8 +67,9 @@ impl<'a> BinaryStateTranslation<'a> { }, (Encoding::DeltaLengthByteArray, _) => { let values = split_buffer(page)?.values; - Ok(BinaryStateTranslation::Delta( + Ok(BinaryStateTranslation::DeltaLengthByteArray( delta_length_byte_array::Decoder::try_new(values)?, + Vec::new(), )) }, (Encoding::DeltaByteArray, _) => { @@ -84,7 +85,7 @@ impl<'a> BinaryStateTranslation<'a> { match self { Self::Plain(v) => v.len_when_not_nullable(), Self::Dictionary(v) => v.len(), - Self::Delta(v) => v.len(), + Self::DeltaLengthByteArray(v, _) => v.len(), Self::DeltaBytes(v) => v.len(), } } @@ -97,7 +98,7 @@ impl<'a> BinaryStateTranslation<'a> { match self { Self::Plain(t) => _ = t.by_ref().nth(n - 1), Self::Dictionary(t) => t.values.skip_in_place(n)?, - Self::Delta(t) => t.skip_in_place(n)?, + Self::DeltaLengthByteArray(t, _) => t.skip_in_place(n)?, Self::DeltaBytes(t) => t.skip_in_place(n)?, } diff --git a/crates/polars-parquet/src/arrow/read/deserialize/binview.rs b/crates/polars-parquet/src/arrow/read/deserialize/binview.rs index bf6f4bf97f1d..be615035addb 100644 --- a/crates/polars-parquet/src/arrow/read/deserialize/binview.rs +++ b/crates/polars-parquet/src/arrow/read/deserialize/binview.rs @@ -10,7 +10,7 @@ use arrow::datatypes::{ArrowDataType, PhysicalType}; use super::binary::decoders::*; use super::utils::{freeze_validity, BatchableCollector}; -use crate::parquet::encoding::delta_bitpacked::DeltaGatherer; +use crate::parquet::encoding::delta_bitpacked::{lin_natural_sum, DeltaGatherer}; use crate::parquet::encoding::hybrid_rle::{self, DictionaryTranslator}; use crate::parquet::encoding::{delta_byte_array, delta_length_byte_array}; use crate::parquet::error::{ParquetError, ParquetResult}; @@ -85,23 +85,27 @@ impl<'a> StateTranslation<'a, BinViewDecoder> for BinaryStateTranslation<'a> { // Already done in decode_plain_encoded validate_utf8 = false; }, - Self::Delta(ref mut page_values) => { + Self::DeltaLengthByteArray(ref mut page_values, ref mut lengths) => { let (values, validity) = decoded; let mut collector = DeltaCollector { + gatherer: &mut StatGatherer::default(), + pushed_lengths: lengths, decoder: page_values, }; match page_validity { - None => 
collector.push_n(values, additional)?, + None => (&mut collector).push_n(values, additional)?, Some(page_validity) => extend_from_decoder( validity, page_validity, Some(additional), values, - collector, + &mut collector, )?, } + + collector.flush(values); }, Self::DeltaBytes(ref mut page_values) => { let (values, validity) = decoded; @@ -147,6 +151,12 @@ impl utils::ExactSize for DecodedStateTuple { } pub(crate) struct DeltaCollector<'a, 'b> { + // We gatherer the decoded lengths into `pushed_lengths`. Then, we `flush` those to the + // `BinView` This allows us to group many memcopies into one and take better potential fast + // paths for inlineable views and such. + pub(crate) gatherer: &'b mut StatGatherer, + pub(crate) pushed_lengths: &'b mut Vec, + pub(crate) decoder: &'b mut delta_length_byte_array::Decoder<'a>, } @@ -154,44 +164,148 @@ pub(crate) struct DeltaBytesCollector<'a, 'b> { pub(crate) decoder: &'b mut delta_byte_array::Decoder<'a>, } -pub(crate) struct ViewGatherer<'a, 'b> { - values: &'a [u8], - offset: &'b mut usize, +/// A [`DeltaGatherer`] that gathers the minimum, maximum and summation of the values as `usize`s. +pub(crate) struct StatGatherer { + min: usize, + max: usize, + sum: usize, +} + +impl Default for StatGatherer { + fn default() -> Self { + Self { + min: usize::MAX, + max: usize::MIN, + sum: 0, + } + } } -impl<'a, 'b> DeltaGatherer for ViewGatherer<'a, 'b> { - type Target = MutableBinaryViewArray<[u8]>; +impl DeltaGatherer for StatGatherer { + type Target = Vec; fn target_len(&self, target: &Self::Target) -> usize { target.len() } fn target_reserve(&self, target: &mut Self::Target, n: usize) { - target.views_mut().reserve(n) + target.reserve(n); } fn gather_one(&mut self, target: &mut Self::Target, v: i64) -> ParquetResult<()> { + if v < 0 { + return Err(ParquetError::oos("DELTA_LENGTH_BYTE_ARRAY length < 0")); + } + + if v > i64::from(u32::MAX) { + return Err(ParquetError::not_supported( + "DELTA_LENGTH_BYTE_ARRAY length > u32::MAX", + )); + } + let v = v as usize; - let s = &self.values[*self.offset..*self.offset + v]; - *self.offset += v; - target.push(Some(s)); + + self.min = self.min.min(v); + self.max = self.max.max(v); + self.sum += v; + + target.push(v as u32); + + Ok(()) + } + + fn gather_slice(&mut self, target: &mut Self::Target, slice: &[i64]) -> ParquetResult<()> { + let mut is_invalid = false; + let mut is_too_large = false; + + target.extend(slice.iter().map(|&v| { + is_invalid |= v < 0; + is_too_large |= v > i64::from(u32::MAX); + + let v = v as usize; + + self.min = self.min.min(v); + self.max = self.max.max(v); + self.sum += v; + + v as u32 + })); + + if is_invalid { + target.truncate(target.len() - slice.len()); + return Err(ParquetError::oos("DELTA_LENGTH_BYTE_ARRAY length < 0")); + } + + if is_too_large { + return Err(ParquetError::not_supported( + "DELTA_LENGTH_BYTE_ARRAY length > u32::MAX", + )); + } + + Ok(()) + } + + fn gather_constant( + &mut self, + target: &mut Self::Target, + v: i64, + delta: i64, + num_repeats: usize, + ) -> ParquetResult<()> { + if v < 0 || (delta < 0 && num_repeats > 0 && (num_repeats - 1) as i64 * delta + v < 0) { + return Err(ParquetError::oos("DELTA_LENGTH_BYTE_ARRAY length < 0")); + } + + if v > i64::from(u32::MAX) || v + ((num_repeats - 1) as i64) * delta > i64::from(u32::MAX) { + return Err(ParquetError::not_supported( + "DELTA_LENGTH_BYTE_ARRAY length > u32::MAX", + )); + } + + target.extend((0..num_repeats).map(|i| (v + (i as i64) * delta) as u32)); + + let vstart = v; + let vend = v + (num_repeats - 1) 
as i64 * delta; + + let (min, max) = if delta < 0 { + (vend, vstart) + } else { + (vstart, vend) + }; + + let sum = lin_natural_sum(v, delta, num_repeats) as usize; + + #[cfg(debug_assertions)] + { + assert_eq!( + (0..num_repeats) + .map(|i| (v + (i as i64) * delta) as usize) + .sum::(), + sum + ); + } + + self.min = self.min.min(min as usize); + self.max = self.max.max(max as usize); + self.sum += sum; + Ok(()) } } -impl<'a, 'b> BatchableCollector<(), MutableBinaryViewArray<[u8]>> for DeltaCollector<'a, 'b> { +impl<'a, 'b> BatchableCollector<(), MutableBinaryViewArray<[u8]>> for &mut DeltaCollector<'a, 'b> { fn reserve(target: &mut MutableBinaryViewArray<[u8]>, n: usize) { target.views_mut().reserve(n); } - fn push_n(&mut self, target: &mut MutableBinaryViewArray<[u8]>, n: usize) -> ParquetResult<()> { - let mut gatherer = ViewGatherer { - values: self.decoder.values, - offset: &mut self.decoder.offset, - }; + fn push_n( + &mut self, + _target: &mut MutableBinaryViewArray<[u8]>, + n: usize, + ) -> ParquetResult<()> { self.decoder .lengths - .gather_n_into(target, n, &mut gatherer)?; + .gather_n_into(self.pushed_lengths, n, self.gatherer)?; Ok(()) } @@ -201,9 +315,34 @@ impl<'a, 'b> BatchableCollector<(), MutableBinaryViewArray<[u8]>> for DeltaColle target: &mut MutableBinaryViewArray<[u8]>, n: usize, ) -> ParquetResult<()> { + self.flush(target); target.extend_constant(n, >::None); Ok(()) } + + fn skip_in_place(&mut self, n: usize) -> ParquetResult<()> { + self.decoder.skip_in_place(n) + } +} + +impl<'a, 'b> DeltaCollector<'a, 'b> { + pub fn flush(&mut self, target: &mut MutableBinaryViewArray<[u8]>) { + if !self.pushed_lengths.is_empty() { + unsafe { + target.extend_from_lengths_with_stats( + &self.decoder.values[self.decoder.offset..], + self.pushed_lengths.iter().map(|&v| v as usize), + self.gatherer.min, + self.gatherer.max, + self.gatherer.sum, + ) + }; + + self.decoder.offset += self.gatherer.sum; + self.pushed_lengths.clear(); + *self.gatherer = StatGatherer::default(); + } + } } impl<'a, 'b> BatchableCollector<(), MutableBinaryViewArray<[u8]>> for DeltaBytesCollector<'a, 'b> { @@ -291,6 +430,10 @@ impl<'a, 'b> BatchableCollector<(), MutableBinaryViewArray<[u8]>> for DeltaBytes target.extend_constant(n, >::None); Ok(()) } + + fn skip_in_place(&mut self, n: usize) -> ParquetResult<()> { + self.decoder.skip_in_place(n) + } } impl utils::Decoder for BinViewDecoder { diff --git a/crates/polars-parquet/src/arrow/read/deserialize/boolean.rs b/crates/polars-parquet/src/arrow/read/deserialize/boolean.rs index 1f33da0678d6..154e46b41c18 100644 --- a/crates/polars-parquet/src/arrow/read/deserialize/boolean.rs +++ b/crates/polars-parquet/src/arrow/read/deserialize/boolean.rs @@ -165,6 +165,10 @@ impl<'a, 'b> BatchableCollector for BitmapCollector<'a, 'b> target.extend_constant(n, false); Ok(()) } + + fn skip_in_place(&mut self, n: usize) -> ParquetResult<()> { + self.0.skip_in_place(n) + } } impl ExactSize for (MutableBitmap, MutableBitmap) { diff --git a/crates/polars-parquet/src/arrow/read/deserialize/dictionary.rs b/crates/polars-parquet/src/arrow/read/deserialize/dictionary.rs index 4a7b8f740063..ab8098adbdd6 100644 --- a/crates/polars-parquet/src/arrow/read/deserialize/dictionary.rs +++ b/crates/polars-parquet/src/arrow/read/deserialize/dictionary.rs @@ -191,6 +191,10 @@ impl<'a, 'b, K: DictionaryKey> BatchableCollector<(), Vec> for DictArrayColle target.resize(target.len() + n, K::default()); Ok(()) } + + fn skip_in_place(&mut self, n: usize) -> ParquetResult<()> { + 
self.values.skip_in_place(n) + } } impl Translator for DictArrayTranslator { diff --git a/crates/polars-parquet/src/arrow/read/deserialize/fixed_size_binary.rs b/crates/polars-parquet/src/arrow/read/deserialize/fixed_size_binary.rs index 747243ce26ef..c423ab919091 100644 --- a/crates/polars-parquet/src/arrow/read/deserialize/fixed_size_binary.rs +++ b/crates/polars-parquet/src/arrow/read/deserialize/fixed_size_binary.rs @@ -16,6 +16,7 @@ pub(crate) enum StateTranslation<'a> { Dictionary(hybrid_rle::HybridRleDecoder<'a>, &'a Vec), } +#[derive(Debug)] pub struct FixedSizeBinary { pub values: Vec, pub size: usize, @@ -164,6 +165,12 @@ impl Decoder for BinaryDecoder { target.resize(target.len() + n * self.size, 0); Ok(()) } + + fn skip_in_place(&mut self, n: usize) -> ParquetResult<()> { + let n = usize::min(n, self.slice.len() / self.size); + *self.slice = &self.slice[n * self.size..]; + Ok(()) + } } let mut collector = FixedSizeBinaryCollector { diff --git a/crates/polars-parquet/src/arrow/read/deserialize/nested.rs b/crates/polars-parquet/src/arrow/read/deserialize/nested.rs index e200d3c1a8da..05360a08d7d7 100644 --- a/crates/polars-parquet/src/arrow/read/deserialize/nested.rs +++ b/crates/polars-parquet/src/arrow/read/deserialize/nested.rs @@ -202,6 +202,17 @@ pub fn columns_to_iter_recursive( )? .collect_n(filter)? }, + Binary | Utf8 => { + init.push(InitNested::Primitive(field.is_nullable)); + types.pop(); + PageNestedDecoder::new( + columns.pop().unwrap(), + field.data_type().clone(), + binary::BinaryDecoder::::default(), + init, + )? + .collect_n(filter)? + }, _ => match field.data_type().to_logical_type() { ArrowDataType::Dictionary(key_type, _, _) => { init.push(InitNested::Primitive(field.is_nullable)); diff --git a/crates/polars-parquet/src/arrow/read/deserialize/nested_utils.rs b/crates/polars-parquet/src/arrow/read/deserialize/nested_utils.rs index fd135a9b63ac..e9c122a6e4b4 100644 --- a/crates/polars-parquet/src/arrow/read/deserialize/nested_utils.rs +++ b/crates/polars-parquet/src/arrow/read/deserialize/nested_utils.rs @@ -140,7 +140,7 @@ impl Nested { fn invalid_num_values(&self) -> usize { match &self.content { - NestedContent::Primitive => 0, + NestedContent::Primitive => 1, NestedContent::List { .. } => 0, NestedContent::FixedSizeList { width } => *width, NestedContent::Struct => 1, @@ -204,6 +204,10 @@ impl<'a, 'b, 'c, D: utils::NestedDecoder> BatchableCollector<(), D::DecodedState self.decoder.push_n_nulls(self.state, target, n); Ok(()) } + + fn skip_in_place(&mut self, n: usize) -> ParquetResult<()> { + self.state.skip_in_place(n) + } } /// The initial info of nested data types. @@ -290,6 +294,67 @@ impl NestedState { } } +/// Calculate the number of leaf values that are covered by the first `limit` definition level +/// values. 
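The idea behind the limit_to_num_values helper added below, sketched standalone over a plain slice of levels (the real code gathers from a HybridRleDecoder): only positions whose definition level equals the leaf (maximum) level carry a physical leaf value, so counting them over the first `limit` levels tells the batched collector how many buffered values to skip. The names and level assignments in this sketch are illustrative, not taken from the PR.

/// Sketch: count leaf values among the first `limit` definition levels.
fn num_leaf_values(def_levels: &[u16], leaf_def_level: u16, limit: usize) -> usize {
    def_levels
        .iter()
        .take(limit)
        .filter(|&&lvl| lvl == leaf_def_level)
        .count()
}

fn main() {
    // Hypothetical levels for an optional list of optional ints:
    // 3 = present leaf, 2 = null leaf, 1 = empty list, 0 = null list.
    let defs = [3, 3, 1, 2, 3, 0, 3];
    // The first 5 definition levels cover 3 physical leaf values in the values buffer.
    assert_eq!(num_leaf_values(&defs, 3, 5), 3);
}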
+fn limit_to_num_values( + def_iter: &HybridRleDecoder<'_>, + def_levels: &[u16], + limit: usize, +) -> ParquetResult { + struct NumValuesGatherer { + leaf_def_level: u16, + } + struct NumValuesState { + num_values: usize, + length: usize, + } + + impl HybridRleGatherer for NumValuesGatherer { + type Target = NumValuesState; + + fn target_reserve(&self, _target: &mut Self::Target, _n: usize) {} + + fn target_num_elements(&self, target: &Self::Target) -> usize { + target.length + } + + fn hybridrle_to_target(&self, value: u32) -> ParquetResult { + Ok(value) + } + + fn gather_one(&self, target: &mut Self::Target, value: u32) -> ParquetResult<()> { + target.num_values += usize::from(value == self.leaf_def_level as u32); + target.length += 1; + Ok(()) + } + + fn gather_repeated( + &self, + target: &mut Self::Target, + value: u32, + n: usize, + ) -> ParquetResult<()> { + target.num_values += n * usize::from(value == self.leaf_def_level as u32); + target.length += n; + Ok(()) + } + } + + let mut state = NumValuesState { + num_values: 0, + length: 0, + }; + def_iter.clone().gather_n_into( + &mut state, + limit, + &NumValuesGatherer { + leaf_def_level: *def_levels.last().unwrap(), + }, + )?; + + Ok(state.num_values) +} + fn idx_to_limit(rep_iter: &HybridRleDecoder<'_>, idx: usize) -> ParquetResult { struct RowIdxOffsetGatherer; struct RowIdxOffsetState { @@ -384,7 +449,7 @@ fn extend_offsets2<'a, D: utils::NestedDecoder>( >, nested: &mut [Nested], filter: Option, - // Amortized allocations + def_levels: &[u16], rep_levels: &[u16], ) -> PolarsResult<()> { @@ -416,6 +481,9 @@ fn extend_offsets2<'a, D: utils::NestedDecoder>( if start > 0 { let start_cell = idx_to_limit(&rep_iter, start)?; + let num_skipped_values = limit_to_num_values(&def_iter, def_levels, start_cell)?; + batched_collector.skip_in_place(num_skipped_values)?; + rep_iter.skip_in_place(start_cell)?; def_iter.skip_in_place(start_cell)?; } @@ -436,6 +504,8 @@ fn extend_offsets2<'a, D: utils::NestedDecoder>( // @NOTE: This is kind of unused let last_skip = def_iter.len(); + let num_skipped_values = limit_to_num_values(&def_iter, def_levels, last_skip)?; + batched_collector.skip_in_place(num_skipped_values)?; rep_iter.skip_in_place(last_skip)?; def_iter.skip_in_place(last_skip)?; @@ -447,6 +517,8 @@ fn extend_offsets2<'a, D: utils::NestedDecoder>( let num_zeros = iter.take_leading_zeros(); if num_zeros > 0 { let offset = idx_to_limit(&rep_iter, num_zeros)?; + let num_skipped_values = limit_to_num_values(&def_iter, def_levels, offset)?; + batched_collector.skip_in_place(num_skipped_values)?; rep_iter.skip_in_place(offset)?; def_iter.skip_in_place(offset)?; } @@ -601,23 +673,16 @@ fn extend_offsets_limited<'a, D: utils::NestedDecoder>( } } - if embed_depth == max_depth - 1 { - for _ in 0..num_elements { - batched_collector.push_invalid(); - } - - break; - } - let embed_num_values = embed_nest.invalid_num_values(); + num_elements *= embed_num_values; if embed_num_values == 0 { break; } - - num_elements *= embed_num_values; } + batched_collector.push_n_invalids(num_elements); + break; } @@ -705,6 +770,7 @@ impl PageNestedDecoder { break; }; let page = page?; + let page = page.decompress(&mut self.iter)?; let mut state = utils::State::new_nested(&self.decoder, &page, self.dict.as_ref())?; @@ -743,9 +809,11 @@ impl PageNestedDecoder { break; }; let page = page?; + // We cannot lazily decompress because we don't have the number of leaf values + // at this point. This is encoded within the `definition level` values. *sign*. 
+ // In general, lazy decompression is quite difficult with nested values. + let page = page.decompress(&mut self.iter)?; - let mut state = - utils::State::new_nested(&self.decoder, &page, self.dict.as_ref())?; let (def_iter, rep_iter) = level_iters(&page)?; let mut count = ZeroCount::default(); @@ -762,6 +830,9 @@ impl PageNestedDecoder { None }; + let mut state = + utils::State::new_nested(&self.decoder, &page, self.dict.as_ref())?; + let start_length = nested_state.len(); // @TODO: move this to outside the loop. diff --git a/crates/polars-parquet/src/arrow/read/deserialize/null.rs b/crates/polars-parquet/src/arrow/read/deserialize/null.rs index 8c28a7fc66bb..b2ce451ced5d 100644 --- a/crates/polars-parquet/src/arrow/read/deserialize/null.rs +++ b/crates/polars-parquet/src/arrow/read/deserialize/null.rs @@ -12,6 +12,7 @@ use crate::parquet::error::ParquetResult; use crate::parquet::page::{DataPage, DictPage}; pub(crate) struct NullDecoder; +#[derive(Debug)] pub(crate) struct NullArrayLength { length: usize, } @@ -136,18 +137,20 @@ pub fn iter_to_arrays( }; let page = page?; - let rows = page.num_values(); - let page_filter; - (page_filter, filter) = Filter::opt_split_at(&filter, rows); + let state_filter; + (state_filter, filter) = Filter::opt_split_at(&filter, page.num_values()); - let num_rows = match page_filter { - None => rows, + // Skip the whole page if we don't need any rows from it + if state_filter.as_ref().is_some_and(|f| f.num_rows() == 0) { + continue; + } + + let num_rows = match state_filter { + None => page.num_values(), Some(filter) => filter.num_rows(), }; len = (len + num_rows).min(num_rows); - - iter.reuse_page_buffer(page); } Ok(Box::new(NullArray::new(data_type, len))) diff --git a/crates/polars-parquet/src/arrow/read/deserialize/primitive/basic.rs b/crates/polars-parquet/src/arrow/read/deserialize/primitive/basic.rs index ce658b764412..696463eefa39 100644 --- a/crates/polars-parquet/src/arrow/read/deserialize/primitive/basic.rs +++ b/crates/polars-parquet/src/arrow/read/deserialize/primitive/basic.rs @@ -147,6 +147,11 @@ where target.resize(target.len() + n, T::default()); Ok(()) } + + fn skip_in_place(&mut self, n: usize) -> ParquetResult<()> { + self.chunks.skip_in_place(n); + Ok(()) + } } #[allow(clippy::large_enum_variant)] @@ -206,7 +211,7 @@ where } match self { - Self::Plain(t) => _ = t.nth(n - 1), + Self::Plain(t) => t.skip_in_place(n), Self::Dictionary(t) => t.values.skip_in_place(n)?, Self::ByteStreamSplit(t) => _ = t.iter_converted(|_| ()).nth(n - 1), } diff --git a/crates/polars-parquet/src/arrow/read/deserialize/primitive/integer.rs b/crates/polars-parquet/src/arrow/read/deserialize/primitive/integer.rs index 45518947e0a1..1a767981d291 100644 --- a/crates/polars-parquet/src/arrow/read/deserialize/primitive/integer.rs +++ b/crates/polars-parquet/src/arrow/read/deserialize/primitive/integer.rs @@ -84,7 +84,7 @@ where } match self { - Self::Plain(v) => _ = v.nth(n - 1), + Self::Plain(v) => v.skip_in_place(n), Self::Dictionary(v) => v.values.skip_in_place(n)?, Self::ByteStreamSplit(v) => _ = v.iter_converted(|_| ()).nth(n - 1), Self::DeltaBinaryPacked(v) => v.skip_in_place(n)?, diff --git a/crates/polars-parquet/src/arrow/read/deserialize/primitive/mod.rs b/crates/polars-parquet/src/arrow/read/deserialize/primitive/mod.rs index 45c95a7d5ee1..22da6ff14895 100644 --- a/crates/polars-parquet/src/arrow/read/deserialize/primitive/mod.rs +++ b/crates/polars-parquet/src/arrow/read/deserialize/primitive/mod.rs @@ -109,4 +109,8 @@ where target.resize(target.len() + 
n, T::default()); Ok(()) } + + fn skip_in_place(&mut self, n: usize) -> ParquetResult<()> { + self.decoder.skip_in_place(n) + } } diff --git a/crates/polars-parquet/src/arrow/read/deserialize/utils/array_chunks.rs b/crates/polars-parquet/src/arrow/read/deserialize/utils/array_chunks.rs index f95be359631d..330ad77a7c44 100644 --- a/crates/polars-parquet/src/arrow/read/deserialize/utils/array_chunks.rs +++ b/crates/polars-parquet/src/arrow/read/deserialize/utils/array_chunks.rs @@ -24,6 +24,11 @@ impl<'a, P: ParquetNativeType> ArrayChunks<'a, P> { Some(Self { bytes }) } + + pub(crate) fn skip_in_place(&mut self, n: usize) { + let n = usize::min(self.bytes.len(), n); + self.bytes = &self.bytes[n..]; + } } impl<'a, P: ParquetNativeType> Iterator for ArrayChunks<'a, P> { @@ -36,13 +41,6 @@ impl<'a, P: ParquetNativeType> Iterator for ArrayChunks<'a, P> { Some(item) } - #[inline(always)] - fn nth(&mut self, n: usize) -> Option { - let item = self.bytes.get(n)?; - self.bytes = &self.bytes[n + 1..]; - Some(item) - } - #[inline(always)] fn size_hint(&self) -> (usize, Option) { (self.bytes.len(), Some(self.bytes.len())) diff --git a/crates/polars-parquet/src/arrow/read/deserialize/utils/mod.rs b/crates/polars-parquet/src/arrow/read/deserialize/utils/mod.rs index b96f7b6a429c..9c85c14edb0c 100644 --- a/crates/polars-parquet/src/arrow/read/deserialize/utils/mod.rs +++ b/crates/polars-parquet/src/arrow/read/deserialize/utils/mod.rs @@ -112,6 +112,11 @@ impl<'a, D: Decoder> State<'a, D> { match filter { None => { let num_rows = self.len(); + + if num_rows == 0 { + return Ok(()); + } + self.translation.extend_from_state( decoder, decoded, @@ -126,12 +131,16 @@ impl<'a, D: Decoder> State<'a, D> { self.skip_in_place(start)?; debug_assert!(end - start <= self.len()); - self.translation.extend_from_state( - decoder, - decoded, - &mut self.page_validity, - end - start, - )?; + + if end - start > 0 { + self.translation.extend_from_state( + decoder, + decoded, + &mut self.page_validity, + end - start, + )?; + } + Ok(()) }, Filter::Mask(bitmap) => { @@ -142,12 +151,15 @@ impl<'a, D: Decoder> State<'a, D> { let prev_state_len = self.len(); let num_ones = iter.take_leading_ones(); - self.translation.extend_from_state( - decoder, - decoded, - &mut self.page_validity, - num_ones, - )?; + + if num_ones > 0 { + self.translation.extend_from_state( + decoder, + decoded, + &mut self.page_validity, + num_ones, + )?; + } if iter.num_remaining() == 0 || self.len() == 0 { break; @@ -171,11 +183,9 @@ impl<'a, D: Decoder> State<'a, D> { pub fn not_implemented(page: &DataPage) -> ParquetError { let is_optional = page.descriptor.primitive_type.field_info.repetition == Repetition::Optional; - let is_filtered = page.selected_rows().is_some(); let required = if is_optional { "optional" } else { "required" }; - let is_filtered = if is_filtered { ", index-filtered" } else { "" }; ParquetError::not_supported(format!( - "Decoding {:?} \"{:?}\"-encoded {required}{is_filtered} parquet pages not yet supported", + "Decoding {:?} \"{:?}\"-encoded {required} parquet pages not yet supported", page.descriptor.primitive_type.physical_type, page.encoding(), )) @@ -185,14 +195,15 @@ pub trait BatchableCollector { fn reserve(target: &mut T, n: usize); fn push_n(&mut self, target: &mut T, n: usize) -> ParquetResult<()>; fn push_n_nulls(&mut self, target: &mut T, n: usize) -> ParquetResult<()>; + fn skip_in_place(&mut self, n: usize) -> ParquetResult<()>; } /// This batches sequential collect operations to try and prevent unnecessary buffering and 
/// `Iterator::next` polling. #[must_use] pub struct BatchedCollector<'a, I, T, C: BatchableCollector> { - num_waiting_valids: usize, - num_waiting_invalids: usize, + pub(crate) num_waiting_valids: usize, + pub(crate) num_waiting_invalids: usize, target: &'a mut T, collector: C, @@ -243,6 +254,24 @@ impl<'a, I, T, C: BatchableCollector> BatchedCollector<'a, I, T, C> { self.num_waiting_invalids += n; } + #[inline] + pub fn skip_in_place(&mut self, n: usize) -> ParquetResult<()> { + if self.num_waiting_valids > 0 { + self.collector + .push_n(self.target, self.num_waiting_valids)?; + self.num_waiting_valids = 0; + } + if self.num_waiting_invalids > 0 { + self.collector + .push_n_nulls(self.target, self.num_waiting_invalids)?; + self.num_waiting_invalids = 0; + } + + self.collector.skip_in_place(n)?; + + Ok(()) + } + #[inline] pub fn finalize(mut self) -> ParquetResult<()> { self.collector @@ -403,6 +432,11 @@ where target.resize(target.len() + n, O::default()); Ok(()) } + + #[inline] + fn skip_in_place(&mut self, n: usize) -> ParquetResult<()> { + self.decoder.skip_in_place(n) + } } pub struct GatheredHybridRle<'a, 'b, 'c, O, G> @@ -453,6 +487,11 @@ where .gather_repeated(target, self.null_value.clone(), n)?; Ok(()) } + + #[inline] + fn skip_in_place(&mut self, n: usize) -> ParquetResult<()> { + self.decoder.skip_in_place(n) + } } impl<'a, 'b, 'c, O, Out, G> BatchableCollector> @@ -480,6 +519,11 @@ where .gather_repeated(target, self.null_value.clone(), n)?; Ok(()) } + + #[inline] + fn skip_in_place(&mut self, n: usize) -> ParquetResult<()> { + self.decoder.skip_in_place(n) + } } impl<'a, 'b, 'c, T> BatchableCollector> @@ -513,6 +557,11 @@ where target.extend_null(n); Ok(()) } + + #[inline] + fn skip_in_place(&mut self, n: usize) -> ParquetResult<()> { + self.decoder.skip_in_place(n) + } } impl, I: Iterator> BatchableCollector for I { @@ -532,6 +581,14 @@ impl, I: Iterator> BatchableCollector for I { target.extend_null_constant(n); Ok(()) } + + #[inline] + fn skip_in_place(&mut self, n: usize) -> ParquetResult<()> { + if n > 0 { + _ = self.nth(n - 1); + } + Ok(()) + } } /// An item with a known size @@ -653,21 +710,22 @@ impl PageDecoder { while num_rows_remaining > 0 { let Some(page) = self.iter.next() else { - return self.decoder.finalize(self.data_type, self.dict, target); + break; }; let page = page?; - let mut state = State::new(&self.decoder, &page, self.dict.as_ref())?; - let state_len = state.len(); - let state_filter; - (state_filter, filter) = Filter::opt_split_at(&filter, state_len); + (state_filter, filter) = Filter::opt_split_at(&filter, page.num_values()); // Skip the whole page if we don't need any rows from it if state_filter.as_ref().is_some_and(|f| f.num_rows() == 0) { continue; } + let page = page.decompress(&mut self.iter)?; + + let mut state = State::new(&self.decoder, &page, self.dict.as_ref())?; + let start_length = target.len(); state.extend_from_state(&mut self.decoder, &mut target, state_filter)?; let end_length = target.len(); diff --git a/crates/polars-parquet/src/arrow/read/indexes/binary.rs b/crates/polars-parquet/src/arrow/read/indexes/binary.rs deleted file mode 100644 index b6e017644746..000000000000 --- a/crates/polars-parquet/src/arrow/read/indexes/binary.rs +++ /dev/null @@ -1,44 +0,0 @@ -use arrow::array::{Array, BinaryArray, PrimitiveArray, Utf8Array}; -use arrow::datatypes::{ArrowDataType, PhysicalType}; -use arrow::trusted_len::TrustedLen; -use polars_error::{to_compute_err, PolarsResult}; - -use super::ColumnPageStatistics; -use 
crate::parquet::indexes::PageIndex; - -pub fn deserialize( - indexes: &[PageIndex>], - data_type: &ArrowDataType, -) -> PolarsResult { - Ok(ColumnPageStatistics { - min: deserialize_binary_iter(indexes.iter().map(|index| index.min.as_ref()), data_type)?, - max: deserialize_binary_iter(indexes.iter().map(|index| index.max.as_ref()), data_type)?, - null_count: PrimitiveArray::from_trusted_len_iter( - indexes - .iter() - .map(|index| index.null_count.map(|x| x as u64)), - ), - }) -} - -fn deserialize_binary_iter<'a, I: TrustedLen>>>( - iter: I, - data_type: &ArrowDataType, -) -> PolarsResult> { - match data_type.to_physical_type() { - PhysicalType::LargeBinary => Ok(Box::new(BinaryArray::::from_iter(iter))), - PhysicalType::Utf8 => { - let iter = iter.map(|x| x.map(|x| std::str::from_utf8(x)).transpose()); - Ok(Box::new( - Utf8Array::::try_from_trusted_len_iter(iter).map_err(to_compute_err)?, - )) - }, - PhysicalType::LargeUtf8 => { - let iter = iter.map(|x| x.map(|x| std::str::from_utf8(x)).transpose()); - Ok(Box::new( - Utf8Array::::try_from_trusted_len_iter(iter).map_err(to_compute_err)?, - )) - }, - _ => Ok(Box::new(BinaryArray::::from_iter(iter))), - } -} diff --git a/crates/polars-parquet/src/arrow/read/indexes/boolean.rs b/crates/polars-parquet/src/arrow/read/indexes/boolean.rs deleted file mode 100644 index b6414e24a621..000000000000 --- a/crates/polars-parquet/src/arrow/read/indexes/boolean.rs +++ /dev/null @@ -1,20 +0,0 @@ -use arrow::array::{BooleanArray, PrimitiveArray}; - -use super::ColumnPageStatistics; -use crate::parquet::indexes::PageIndex; - -pub fn deserialize(indexes: &[PageIndex]) -> ColumnPageStatistics { - ColumnPageStatistics { - min: Box::new(BooleanArray::from_trusted_len_iter( - indexes.iter().map(|index| index.min), - )), - max: Box::new(BooleanArray::from_trusted_len_iter( - indexes.iter().map(|index| index.max), - )), - null_count: PrimitiveArray::from_trusted_len_iter( - indexes - .iter() - .map(|index| index.null_count.map(|x| x as u64)), - ), - } -} diff --git a/crates/polars-parquet/src/arrow/read/indexes/fixed_len_binary.rs b/crates/polars-parquet/src/arrow/read/indexes/fixed_len_binary.rs deleted file mode 100644 index 5b2785b22b06..000000000000 --- a/crates/polars-parquet/src/arrow/read/indexes/fixed_len_binary.rs +++ /dev/null @@ -1,70 +0,0 @@ -use arrow::array::{Array, FixedSizeBinaryArray, MutableFixedSizeBinaryArray, PrimitiveArray}; -use arrow::datatypes::{ArrowDataType, PhysicalType, PrimitiveType}; -use arrow::trusted_len::TrustedLen; -use arrow::types::{i256, NativeType}; - -use super::ColumnPageStatistics; -use crate::parquet::indexes::PageIndex; - -pub fn deserialize( - indexes: &[PageIndex>], - data_type: ArrowDataType, -) -> ColumnPageStatistics { - ColumnPageStatistics { - min: deserialize_binary_iter( - indexes.iter().map(|index| index.min.as_ref()), - data_type.clone(), - ), - max: deserialize_binary_iter(indexes.iter().map(|index| index.max.as_ref()), data_type), - null_count: PrimitiveArray::from_trusted_len_iter( - indexes - .iter() - .map(|index| index.null_count.map(|x| x as u64)), - ), - } -} - -fn deserialize_binary_iter<'a, I: TrustedLen>>>( - iter: I, - data_type: ArrowDataType, -) -> Box { - match data_type.to_physical_type() { - PhysicalType::Primitive(PrimitiveType::Int128) => { - Box::new(PrimitiveArray::from_trusted_len_iter(iter.map(|v| { - v.map(|x| { - // Copy the fixed-size byte value to the start of a 16 byte stack - // allocated buffer, then use an arithmetic right shift to fill in - // MSBs, which accounts for leading 
1's in negative (two's complement) - // values. - let n = x.len(); - let mut bytes = [0u8; 16]; - bytes[..n].copy_from_slice(x); - i128::from_be_bytes(bytes) >> (8 * (16 - n)) - }) - }))) - }, - PhysicalType::Primitive(PrimitiveType::Int256) => { - Box::new(PrimitiveArray::from_trusted_len_iter(iter.map(|v| { - v.map(|x| { - let n = x.len(); - let mut bytes = [0u8; 32]; - bytes[..n].copy_from_slice(x); - i256::from_be_bytes(bytes) - }) - }))) - }, - _ => { - let mut a = MutableFixedSizeBinaryArray::try_new( - data_type, - Vec::with_capacity(iter.size_hint().0), - None, - ) - .unwrap(); - for item in iter { - a.push(item); - } - let a: FixedSizeBinaryArray = a.into(); - Box::new(a) - }, - } -} diff --git a/crates/polars-parquet/src/arrow/read/indexes/mod.rs b/crates/polars-parquet/src/arrow/read/indexes/mod.rs deleted file mode 100644 index 9cf465c64206..000000000000 --- a/crates/polars-parquet/src/arrow/read/indexes/mod.rs +++ /dev/null @@ -1,377 +0,0 @@ -//! API to perform page-level filtering (also known as indexes) -use crate::parquet::error::ParquetError; -use crate::parquet::indexes::{ - select_pages, BooleanIndex, ByteIndex, FixedLenByteIndex, Index as ParquetIndex, NativeIndex, - PageLocation, -}; -use crate::parquet::metadata::{ColumnChunkMetaData, RowGroupMetaData}; -use crate::parquet::read::{read_columns_indexes as _read_columns_indexes, read_pages_locations}; -use crate::parquet::schema::types::PhysicalType as ParquetPhysicalType; - -mod binary; -mod boolean; -mod fixed_len_binary; -mod primitive; - -use std::collections::VecDeque; -use std::io::{Read, Seek}; - -use arrow::array::{Array, UInt64Array}; -use arrow::datatypes::{ArrowDataType, Field, PhysicalType, PrimitiveType}; -use polars_error::{polars_bail, PolarsResult}; - -use super::get_field_pages; -pub use crate::parquet::indexes::{FilteredPage, Interval}; - -/// Page statistics of an Arrow field. -#[derive(Debug, PartialEq)] -pub enum FieldPageStatistics { - /// Variant used for fields with a single parquet column (e.g. primitives, dictionaries, list) - Single(ColumnPageStatistics), - /// Variant used for fields with multiple parquet columns (e.g. Struct, Map) - Multiple(Vec), -} - -impl From for FieldPageStatistics { - fn from(column: ColumnPageStatistics) -> Self { - Self::Single(column) - } -} - -/// [`ColumnPageStatistics`] contains the minimum, maximum, and null_count -/// of each page of a parquet column, as an [`Array`]. -/// This struct has the following invariants: -/// * `min`, `max` and `null_count` have the same length (equal to the number of pages in the column) -/// * `min`, `max` and `null_count` are guaranteed to be non-null -/// * `min` and `max` have the same logical type -#[derive(Debug, PartialEq)] -pub struct ColumnPageStatistics { - /// The minimum values in the pages - pub min: Box, - /// The maximum values in the pages - pub max: Box, - /// The number of null values in the pages. - pub null_count: UInt64Array, -} - -/// Given a sequence of [`ParquetIndex`] representing the page indexes of each column in the -/// parquet file, returns the page-level statistics as a [`FieldPageStatistics`]. -/// -/// This function maps timestamps, decimal types, etc. accordingly. -/// # Implementation -/// This function is CPU-bounded `O(P)` where `P` is the total number of pages on all columns. -/// # Error -/// This function errors iff the value is not deserializable to arrow (e.g. 
invalid utf-8) -fn deserialize( - indexes: &mut VecDeque<&dyn ParquetIndex>, - data_type: ArrowDataType, -) -> PolarsResult { - match data_type.to_physical_type() { - PhysicalType::Boolean => { - let index = indexes - .pop_front() - .unwrap() - .as_any() - .downcast_ref::() - .unwrap(); - Ok(boolean::deserialize(&index.indexes).into()) - }, - PhysicalType::Primitive(PrimitiveType::Int128) => { - let index = indexes.pop_front().unwrap(); - match index.physical_type() { - ParquetPhysicalType::Int32 => { - let index = index.as_any().downcast_ref::>().unwrap(); - Ok(primitive::deserialize_i32(&index.indexes, data_type).into()) - }, - crate::parquet::schema::types::PhysicalType::Int64 => { - let index = index.as_any().downcast_ref::>().unwrap(); - Ok( - primitive::deserialize_i64( - &index.indexes, - &index.primitive_type, - data_type, - ) - .into(), - ) - }, - crate::parquet::schema::types::PhysicalType::FixedLenByteArray(_) => { - let index = index.as_any().downcast_ref::().unwrap(); - Ok(fixed_len_binary::deserialize(&index.indexes, data_type).into()) - }, - other => polars_bail!(nyi = "Deserialize {other:?} to arrow's int64"), - } - }, - PhysicalType::Primitive(PrimitiveType::Int256) => { - let index = indexes.pop_front().unwrap(); - match index.physical_type() { - ParquetPhysicalType::Int32 => { - let index = index.as_any().downcast_ref::>().unwrap(); - Ok(primitive::deserialize_i32(&index.indexes, data_type).into()) - }, - crate::parquet::schema::types::PhysicalType::Int64 => { - let index = index.as_any().downcast_ref::>().unwrap(); - Ok( - primitive::deserialize_i64( - &index.indexes, - &index.primitive_type, - data_type, - ) - .into(), - ) - }, - crate::parquet::schema::types::PhysicalType::FixedLenByteArray(_) => { - let index = index.as_any().downcast_ref::().unwrap(); - Ok(fixed_len_binary::deserialize(&index.indexes, data_type).into()) - }, - other => polars_bail!(nyi = "Deserialize {other:?} to arrow's int64"), - } - }, - PhysicalType::Primitive(PrimitiveType::UInt8) - | PhysicalType::Primitive(PrimitiveType::UInt16) - | PhysicalType::Primitive(PrimitiveType::UInt32) - | PhysicalType::Primitive(PrimitiveType::Int32) => { - let index = indexes - .pop_front() - .unwrap() - .as_any() - .downcast_ref::>() - .unwrap(); - Ok(primitive::deserialize_i32(&index.indexes, data_type).into()) - }, - PhysicalType::Primitive(PrimitiveType::UInt64) - | PhysicalType::Primitive(PrimitiveType::Int64) => { - let index = indexes.pop_front().unwrap(); - match index.physical_type() { - ParquetPhysicalType::Int64 => { - let index = index.as_any().downcast_ref::>().unwrap(); - Ok( - primitive::deserialize_i64( - &index.indexes, - &index.primitive_type, - data_type, - ) - .into(), - ) - }, - crate::parquet::schema::types::PhysicalType::Int96 => { - let index = index - .as_any() - .downcast_ref::>() - .unwrap(); - Ok(primitive::deserialize_i96(&index.indexes, data_type).into()) - }, - other => polars_bail!(nyi = "Deserialize {other:?} to arrow's int64"), - } - }, - PhysicalType::Primitive(PrimitiveType::Float32) => { - let index = indexes - .pop_front() - .unwrap() - .as_any() - .downcast_ref::>() - .unwrap(); - Ok(primitive::deserialize_id(&index.indexes, data_type).into()) - }, - PhysicalType::Primitive(PrimitiveType::Float64) => { - let index = indexes - .pop_front() - .unwrap() - .as_any() - .downcast_ref::>() - .unwrap(); - Ok(primitive::deserialize_id(&index.indexes, data_type).into()) - }, - PhysicalType::Binary - | PhysicalType::LargeBinary - | PhysicalType::Utf8 - | PhysicalType::LargeUtf8 - | 
PhysicalType::Utf8View - | PhysicalType::BinaryView => { - let index = indexes - .pop_front() - .unwrap() - .as_any() - .downcast_ref::() - .unwrap(); - binary::deserialize(&index.indexes, &data_type).map(|x| x.into()) - }, - PhysicalType::FixedSizeBinary => { - let index = indexes - .pop_front() - .unwrap() - .as_any() - .downcast_ref::() - .unwrap(); - Ok(fixed_len_binary::deserialize(&index.indexes, data_type).into()) - }, - PhysicalType::Dictionary(_) => { - if let ArrowDataType::Dictionary(_, inner, _) = data_type.to_logical_type() { - deserialize(indexes, (**inner).clone()) - } else { - unreachable!() - } - }, - PhysicalType::List => { - if let ArrowDataType::List(inner) = data_type.to_logical_type() { - deserialize(indexes, inner.data_type.clone()) - } else { - unreachable!() - } - }, - PhysicalType::LargeList => { - if let ArrowDataType::LargeList(inner) = data_type.to_logical_type() { - deserialize(indexes, inner.data_type.clone()) - } else { - unreachable!() - } - }, - PhysicalType::Map => { - if let ArrowDataType::Map(inner, _) = data_type.to_logical_type() { - deserialize(indexes, inner.data_type.clone()) - } else { - unreachable!() - } - }, - PhysicalType::Struct => { - let children_fields = - if let ArrowDataType::Struct(children) = data_type.to_logical_type() { - children - } else { - unreachable!() - }; - let children = children_fields - .iter() - .map(|child| deserialize(indexes, child.data_type.clone())) - .collect::>>()?; - - Ok(FieldPageStatistics::Multiple(children)) - }, - - other => polars_bail!(nyi = "Deserialize into arrow's {other:?} page index"), - } -} - -/// Checks whether the row group have page index information (page statistics) -pub fn has_indexes(row_group: &RowGroupMetaData) -> bool { - row_group - .columns() - .iter() - .all(|chunk| chunk.column_chunk().column_index_offset.is_some()) -} - -/// Reads the column indexes from the reader assuming a valid set of derived Arrow fields -/// for all parquet the columns in the file. -/// -/// It returns one [`FieldPageStatistics`] per field in `fields` -/// -/// This function is expected to be used to filter out parquet pages. -/// -/// # Implementation -/// This function is IO-bounded and calls `reader.read_exact` exactly once. -/// # Error -/// Errors iff the indexes can't be read or their deserialization to arrow is incorrect (e.g. invalid utf-8) -pub fn read_columns_indexes( - reader: &mut R, - chunks: &[ColumnChunkMetaData], - fields: &[Field], -) -> PolarsResult> { - let indexes = _read_columns_indexes(reader, chunks)?; - - fields - .iter() - .map(|field| { - let indexes = get_field_pages(chunks, &indexes, &field.name); - let mut indexes = indexes.into_iter().map(|boxed| boxed.as_ref()).collect(); - - deserialize(&mut indexes, field.data_type.clone()) - }) - .collect() -} - -/// Returns the set of (row) intervals of the pages. 
-pub fn compute_page_row_intervals( - locations: &[PageLocation], - num_rows: usize, -) -> Result, ParquetError> { - if locations.is_empty() { - return Ok(vec![]); - }; - - let last = (|| { - let start: usize = locations.last().unwrap().first_row_index.try_into()?; - let length = num_rows - start; - Result::<_, ParquetError>::Ok(Interval::new(start, length)) - })(); - - let pages_lengths = locations - .windows(2) - .map(|x| { - let start = usize::try_from(x[0].first_row_index)?; - let length = usize::try_from(x[1].first_row_index - x[0].first_row_index)?; - Ok(Interval::new(start, length)) - }) - .chain(std::iter::once(last)); - pages_lengths.collect() -} - -/// Reads all page locations and index locations (IO-bounded) and uses `predicate` to compute -/// the set of [`FilteredPage`] that fulfill the predicate. -/// -/// The non-trivial argument of this function is `predicate`, that controls which pages are selected. -/// Its signature contains 2 arguments: -/// * 0th argument (indexes): contains one [`ColumnPageStatistics`] (page statistics) per field. -/// Use it to evaluate the predicate against -/// * 1th argument (intervals): contains one [`Vec>`] (row positions) per field. -/// For each field, the outermost vector corresponds to each parquet column: -/// a primitive field contains 1 column, a struct field with 2 primitive fields contain 2 columns. -/// The inner `Vec` contains one [`Interval`] per page: its length equals the length of [`ColumnPageStatistics`]. -/// -/// It returns a single [`Vec`] denoting the set of intervals that the predicate selects (over all columns). -/// -/// This returns one item per `field`. For each field, there is one item per column (for non-nested types it returns one column) -/// and finally [`Vec`], that corresponds to the set of selected pages. 
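As a concrete illustration of the interval computation above (and of the per-page row intervals that the `read_filtered_pages` predicate receives), here is a minimal, self-contained sketch. `Span`, `page_row_spans` and the plain `usize` row indexes are hypothetical stand-ins for the crate's `Interval` and `PageLocation` types, not the actual API:

#[derive(Debug, PartialEq)]
struct Span {
    start: usize,
    length: usize,
}

fn page_row_spans(first_row_indexes: &[usize], num_rows: usize) -> Vec<Span> {
    if first_row_indexes.is_empty() {
        return vec![];
    }
    // Each page spans from its first row to the next page's first row...
    let mut spans: Vec<Span> = first_row_indexes
        .windows(2)
        .map(|w| Span { start: w[0], length: w[1] - w[0] })
        .collect();
    // ...and the last page extends to the end of the row group.
    let last_start = *first_row_indexes.last().unwrap();
    spans.push(Span { start: last_start, length: num_rows - last_start });
    spans
}

fn main() {
    // Three pages whose first rows are 0, 5 and 10, in a 100-row row group.
    let spans = page_row_spans(&[0, 5, 10], 100);
    assert_eq!(spans[0], Span { start: 0, length: 5 });
    assert_eq!(spans[1], Span { start: 5, length: 5 });
    assert_eq!(spans[2], Span { start: 10, length: 90 });
}

This mirrors the `windows(2)` plus final-interval logic in `compute_page_row_intervals` above, but with fallible `try_into` conversions and error handling omitted.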
-pub fn read_filtered_pages< - R: Read + Seek, - F: Fn(&[FieldPageStatistics], &[Vec>]) -> Vec, ->( - reader: &mut R, - row_group: &RowGroupMetaData, - fields: &[Field], - predicate: F, - //is_intersection: bool, -) -> PolarsResult>>> { - let num_rows = row_group.num_rows(); - - // one vec per column - let locations = read_pages_locations(reader, row_group.columns())?; - // one Vec> per field (non-nested contain a single entry on the first column) - let locations = fields - .iter() - .map(|field| get_field_pages(row_group.columns(), &locations, &field.name)) - .collect::>(); - - // one ColumnPageStatistics per field - let indexes = read_columns_indexes(reader, row_group.columns(), fields)?; - - let intervals = locations - .iter() - .map(|locations| { - locations - .iter() - .map(|locations| Ok(compute_page_row_intervals(locations, num_rows)?)) - .collect::>>() - }) - .collect::>>()?; - - let intervals = predicate(&indexes, &intervals); - - locations - .into_iter() - .map(|locations| { - locations - .into_iter() - .map(|locations| Ok(select_pages(&intervals, locations, num_rows)?)) - .collect::>>() - }) - .collect() -} diff --git a/crates/polars-parquet/src/arrow/read/indexes/primitive.rs b/crates/polars-parquet/src/arrow/read/indexes/primitive.rs deleted file mode 100644 index dfd72bc9c54e..000000000000 --- a/crates/polars-parquet/src/arrow/read/indexes/primitive.rs +++ /dev/null @@ -1,227 +0,0 @@ -use arrow::array::{Array, MutablePrimitiveArray, PrimitiveArray}; -use arrow::datatypes::{ArrowDataType, TimeUnit}; -use arrow::trusted_len::TrustedLen; -use arrow::types::{i256, NativeType}; -use ethnum::I256; - -use super::ColumnPageStatistics; -use crate::parquet::indexes::PageIndex; -use crate::parquet::schema::types::{ - PrimitiveLogicalType, PrimitiveType, TimeUnit as ParquetTimeUnit, -}; -use crate::parquet::types::int96_to_i64_ns; - -#[inline] -fn deserialize_int32>>( - iter: I, - data_type: ArrowDataType, -) -> Box { - use ArrowDataType::*; - match data_type.to_logical_type() { - UInt8 => Box::new( - PrimitiveArray::::from_trusted_len_iter(iter.map(|x| x.map(|x| x as u8))) - .to(data_type), - ) as _, - UInt16 => Box::new( - PrimitiveArray::::from_trusted_len_iter(iter.map(|x| x.map(|x| x as u16))) - .to(data_type), - ), - UInt32 => Box::new( - PrimitiveArray::::from_trusted_len_iter(iter.map(|x| x.map(|x| x as u32))) - .to(data_type), - ), - Decimal(_, _) => Box::new( - PrimitiveArray::::from_trusted_len_iter(iter.map(|x| x.map(|x| x as i128))) - .to(data_type), - ), - Decimal256(_, _) => Box::new( - PrimitiveArray::::from_trusted_len_iter( - iter.map(|x| x.map(|x| i256(I256::new(x.into())))), - ) - .to(data_type), - ) as _, - _ => Box::new(PrimitiveArray::::from_trusted_len_iter(iter).to(data_type)), - } -} - -#[inline] -fn timestamp( - array: &mut MutablePrimitiveArray, - time_unit: TimeUnit, - logical_type: Option, -) { - let unit = if let Some(PrimitiveLogicalType::Timestamp { unit, .. 
}) = logical_type { - unit - } else { - return; - }; - - match (unit, time_unit) { - (ParquetTimeUnit::Milliseconds, TimeUnit::Second) => array - .values_mut_slice() - .iter_mut() - .for_each(|x| *x /= 1_000), - (ParquetTimeUnit::Microseconds, TimeUnit::Second) => array - .values_mut_slice() - .iter_mut() - .for_each(|x| *x /= 1_000_000), - (ParquetTimeUnit::Nanoseconds, TimeUnit::Second) => array - .values_mut_slice() - .iter_mut() - .for_each(|x| *x /= 1_000_000_000), - - (ParquetTimeUnit::Milliseconds, TimeUnit::Millisecond) => {}, - (ParquetTimeUnit::Microseconds, TimeUnit::Millisecond) => array - .values_mut_slice() - .iter_mut() - .for_each(|x| *x /= 1_000), - (ParquetTimeUnit::Nanoseconds, TimeUnit::Millisecond) => array - .values_mut_slice() - .iter_mut() - .for_each(|x| *x /= 1_000_000), - - (ParquetTimeUnit::Milliseconds, TimeUnit::Microsecond) => array - .values_mut_slice() - .iter_mut() - .for_each(|x| *x *= 1_000), - (ParquetTimeUnit::Microseconds, TimeUnit::Microsecond) => {}, - (ParquetTimeUnit::Nanoseconds, TimeUnit::Microsecond) => array - .values_mut_slice() - .iter_mut() - .for_each(|x| *x /= 1_000), - - (ParquetTimeUnit::Milliseconds, TimeUnit::Nanosecond) => array - .values_mut_slice() - .iter_mut() - .for_each(|x| *x *= 1_000_000), - (ParquetTimeUnit::Microseconds, TimeUnit::Nanosecond) => array - .values_mut_slice() - .iter_mut() - .for_each(|x| *x /= 1_000), - (ParquetTimeUnit::Nanoseconds, TimeUnit::Nanosecond) => {}, - } -} - -#[inline] -fn deserialize_int64>>( - iter: I, - primitive_type: &PrimitiveType, - data_type: ArrowDataType, -) -> Box { - use ArrowDataType::*; - match data_type.to_logical_type() { - UInt64 => Box::new( - PrimitiveArray::::from_trusted_len_iter(iter.map(|x| x.map(|x| x as u64))) - .to(data_type), - ) as _, - Decimal(_, _) => Box::new( - PrimitiveArray::::from_trusted_len_iter(iter.map(|x| x.map(|x| x as i128))) - .to(data_type), - ) as _, - Decimal256(_, _) => Box::new( - PrimitiveArray::::from_trusted_len_iter( - iter.map(|x| x.map(|x| i256(I256::new(x.into())))), - ) - .to(data_type), - ) as _, - Timestamp(time_unit, _) => { - let mut array = - MutablePrimitiveArray::::from_trusted_len_iter(iter).to(data_type.clone()); - - timestamp(&mut array, *time_unit, primitive_type.logical_type); - - let array: PrimitiveArray = array.into(); - - Box::new(array) - }, - _ => Box::new(PrimitiveArray::::from_trusted_len_iter(iter).to(data_type)), - } -} - -#[inline] -fn deserialize_int96>>( - iter: I, - data_type: ArrowDataType, -) -> Box { - Box::new( - PrimitiveArray::::from_trusted_len_iter(iter.map(|x| x.map(int96_to_i64_ns))) - .to(data_type), - ) -} - -#[inline] -fn deserialize_id_s>>( - iter: I, - data_type: ArrowDataType, -) -> Box { - Box::new(PrimitiveArray::::from_trusted_len_iter(iter).to(data_type)) -} - -pub fn deserialize_i32( - indexes: &[PageIndex], - data_type: ArrowDataType, -) -> ColumnPageStatistics { - ColumnPageStatistics { - min: deserialize_int32(indexes.iter().map(|index| index.min), data_type.clone()), - max: deserialize_int32(indexes.iter().map(|index| index.max), data_type), - null_count: PrimitiveArray::from_trusted_len_iter( - indexes - .iter() - .map(|index| index.null_count.map(|x| x as u64)), - ), - } -} - -pub fn deserialize_i64( - indexes: &[PageIndex], - primitive_type: &PrimitiveType, - data_type: ArrowDataType, -) -> ColumnPageStatistics { - ColumnPageStatistics { - min: deserialize_int64( - indexes.iter().map(|index| index.min), - primitive_type, - data_type.clone(), - ), - max: deserialize_int64( - 
indexes.iter().map(|index| index.max), - primitive_type, - data_type, - ), - null_count: PrimitiveArray::from_trusted_len_iter( - indexes - .iter() - .map(|index| index.null_count.map(|x| x as u64)), - ), - } -} - -pub fn deserialize_i96( - indexes: &[PageIndex<[u32; 3]>], - data_type: ArrowDataType, -) -> ColumnPageStatistics { - ColumnPageStatistics { - min: deserialize_int96(indexes.iter().map(|index| index.min), data_type.clone()), - max: deserialize_int96(indexes.iter().map(|index| index.max), data_type), - null_count: PrimitiveArray::from_trusted_len_iter( - indexes - .iter() - .map(|index| index.null_count.map(|x| x as u64)), - ), - } -} - -pub fn deserialize_id( - indexes: &[PageIndex], - data_type: ArrowDataType, -) -> ColumnPageStatistics { - ColumnPageStatistics { - min: deserialize_id_s(indexes.iter().map(|index| index.min), data_type.clone()), - max: deserialize_id_s(indexes.iter().map(|index| index.max), data_type), - null_count: PrimitiveArray::from_trusted_len_iter( - indexes - .iter() - .map(|index| index.null_count.map(|x| x as u64)), - ), - } -} diff --git a/crates/polars-parquet/src/arrow/read/mod.rs b/crates/polars-parquet/src/arrow/read/mod.rs index b7fe702e8107..9c445d7a46ce 100644 --- a/crates/polars-parquet/src/arrow/read/mod.rs +++ b/crates/polars-parquet/src/arrow/read/mod.rs @@ -2,7 +2,6 @@ #![allow(clippy::type_complexity)] mod deserialize; -pub mod indexes; pub mod schema; pub mod statistics; @@ -28,8 +27,7 @@ pub use crate::parquet::{ metadata::{ColumnChunkMetaData, ColumnDescriptor, RowGroupMetaData}, page::{CompressedDataPage, DataPageHeader, Page}, read::{ - decompress, get_column_iterator, read_columns_indexes as _read_columns_indexes, - read_metadata as _read_metadata, read_pages_locations, BasicDecompressor, + decompress, get_column_iterator, read_metadata as _read_metadata, BasicDecompressor, MutStreamingIterator, PageReader, ReadColumnIterator, State, }, schema::types::{ @@ -40,18 +38,6 @@ pub use crate::parquet::{ FallibleStreamingIterator, }; -/// Returns all [`ColumnChunkMetaData`] associated to `field_name`. -/// For non-nested parquet types, this returns a single column -pub fn get_field_columns<'a>( - columns: &'a [ColumnChunkMetaData], - field_name: &str, -) -> Vec<&'a ColumnChunkMetaData> { - columns - .iter() - .filter(|x| x.descriptor().path_in_schema[0] == field_name) - .collect() -} - /// Returns all [`ColumnChunkMetaData`] associated to `field_name`. /// For non-nested parquet types, this returns a single column pub fn get_field_pages<'a, T>( diff --git a/crates/polars-parquet/src/arrow/read/schema/mod.rs b/crates/polars-parquet/src/arrow/read/schema/mod.rs index a52498557d9e..34fb195a4eaa 100644 --- a/crates/polars-parquet/src/arrow/read/schema/mod.rs +++ b/crates/polars-parquet/src/arrow/read/schema/mod.rs @@ -33,9 +33,11 @@ impl Default for SchemaInferenceOptions { } } -/// Infers a [`ArrowSchema`] from parquet's [`FileMetaData`]. This first looks for the metadata key -/// `"ARROW:schema"`; if it does not exist, it converts the parquet types declared in the -/// file's parquet schema to Arrow's equivalent. +/// Infers a [`ArrowSchema`] from parquet's [`FileMetaData`]. +/// +/// This first looks for the metadata key `"ARROW:schema"`; if it does not exist, it converts the +/// Parquet types declared in the file's Parquet schema to Arrow's equivalent. +/// /// # Error /// This function errors iff the key `"ARROW:schema"` exists but is not correctly encoded, /// indicating that that the file's arrow metadata was incorrectly written. 
diff --git a/crates/polars-parquet/src/arrow/read/statistics/mod.rs b/crates/polars-parquet/src/arrow/read/statistics/mod.rs index ea3b34fb8631..0face3c8b358 100644 --- a/crates/polars-parquet/src/arrow/read/statistics/mod.rs +++ b/crates/polars-parquet/src/arrow/read/statistics/mod.rs @@ -8,12 +8,12 @@ use arrow::with_match_primitive_type_full; use ethnum::I256; use polars_error::{polars_bail, PolarsResult}; -use crate::parquet::metadata::RowGroupMetaData; use crate::parquet::schema::types::{ PhysicalType as ParquetPhysicalType, PrimitiveType as ParquetPrimitiveType, }; use crate::parquet::statistics::{PrimitiveStatistics, Statistics as ParquetStatistics}; use crate::parquet::types::int96_to_i64_ns; +use crate::read::ColumnChunkMetaData; mod binary; mod binview; @@ -28,7 +28,6 @@ mod struct_; mod utf8; use self::list::DynMutableListArray; -use super::get_field_columns; /// Arrow-deserialized parquet Statistics of a file #[derive(Debug, PartialEq)] @@ -543,12 +542,11 @@ fn push( /// /// # Errors /// This function errors if the deserialization of the statistics fails (e.g. invalid utf8) -pub fn deserialize(field: &Field, row_group: &RowGroupMetaData) -> PolarsResult { +pub fn deserialize(field: &Field, field_md: &[&ColumnChunkMetaData]) -> PolarsResult { let mut statistics = MutableStatistics::try_new(field)?; - let columns = get_field_columns(row_group.columns(), field.name.as_ref()); - let mut stats = columns - .into_iter() + let mut stats = field_md + .iter() .map(|column| { Ok(( column.statistics().transpose()?, diff --git a/crates/polars-parquet/src/arrow/write/binary/basic.rs b/crates/polars-parquet/src/arrow/write/binary/basic.rs index c977a4e4939c..0e7cbdfb37b1 100644 --- a/crates/polars-parquet/src/arrow/write/binary/basic.rs +++ b/crates/polars-parquet/src/arrow/write/binary/basic.rs @@ -9,7 +9,7 @@ use crate::parquet::encoding::{delta_bitpacked, Encoding}; use crate::parquet::schema::types::PrimitiveType; use crate::parquet::statistics::{BinaryStatistics, ParquetStatistics}; use crate::write::utils::invalid_encoding; -use crate::write::{Page, StatisticsOptions}; +use crate::write::{EncodeNullability, Page, StatisticsOptions}; pub(crate) fn encode_non_null_values<'a, I: Iterator>( iter: I, @@ -23,14 +23,27 @@ pub(crate) fn encode_non_null_values<'a, I: Iterator>( }) } -pub(crate) fn encode_plain(array: &BinaryArray, buffer: &mut Vec) { - let len_before = buffer.len(); - let capacity = - array.get_values_size() + (array.len() - array.null_count()) * std::mem::size_of::(); - buffer.reserve(capacity); - encode_non_null_values(array.non_null_values_iter(), buffer); - // Ensure we allocated properly. - debug_assert_eq!(buffer.len() - len_before, capacity); +pub(crate) fn encode_plain( + array: &BinaryArray, + options: EncodeNullability, + buffer: &mut Vec, +) { + if options.is_optional() && array.validity().is_some() { + let len_before = buffer.len(); + let capacity = array.get_values_size() + + (array.len() - array.null_count()) * std::mem::size_of::(); + buffer.reserve(capacity); + encode_non_null_values(array.non_null_values_iter(), buffer); + // Ensure we allocated properly. + debug_assert_eq!(buffer.len() - len_before, capacity); + } else { + let len_before = buffer.len(); + let capacity = array.get_values_size() + array.len() * std::mem::size_of::(); + buffer.reserve(capacity); + encode_non_null_values(array.values_iter(), buffer); + // Ensure we allocated properly. 
+ debug_assert_eq!(buffer.len() - len_before, capacity); + } } pub fn array_to_page( @@ -41,6 +54,7 @@ pub fn array_to_page( ) -> PolarsResult { let validity = array.validity(); let is_optional = is_nullable(&type_.field_info); + let encode_options = EncodeNullability::new(is_optional); let mut buffer = vec![]; utils::write_def_levels( @@ -54,12 +68,12 @@ pub fn array_to_page( let definition_levels_byte_length = buffer.len(); match encoding { - Encoding::Plain => encode_plain(array, &mut buffer), + Encoding::Plain => encode_plain(array, encode_options, &mut buffer), Encoding::DeltaLengthByteArray => encode_delta( array.values(), array.offsets().buffer(), array.validity(), - is_optional, + encode_options, &mut buffer, ), _ => return Err(invalid_encoding(encoding, array.data_type())), @@ -113,10 +127,10 @@ pub(crate) fn encode_delta( values: &[u8], offsets: &[O], validity: Option<&Bitmap>, - is_optional: bool, + options: EncodeNullability, buffer: &mut Vec, ) { - if is_optional { + if options.is_optional() && validity.is_some() { if let Some(validity) = validity { let lengths = offsets .windows(2) diff --git a/crates/polars-parquet/src/arrow/write/binary/nested.rs b/crates/polars-parquet/src/arrow/write/binary/nested.rs index afc487f42333..afb87200da53 100644 --- a/crates/polars-parquet/src/arrow/write/binary/nested.rs +++ b/crates/polars-parquet/src/arrow/write/binary/nested.rs @@ -8,6 +8,8 @@ use crate::arrow::write::Nested; use crate::parquet::encoding::Encoding; use crate::parquet::page::DataPage; use crate::parquet::schema::types::PrimitiveType; +use crate::read::schema::is_nullable; +use crate::write::EncodeNullability; pub fn array_to_page( array: &BinaryArray, @@ -18,11 +20,14 @@ pub fn array_to_page( where O: Offset, { + let is_optional = is_nullable(&type_.field_info); + let encode_options = EncodeNullability::new(is_optional); + let mut buffer = vec![]; let (repetition_levels_byte_length, definition_levels_byte_length) = nested::write_rep_and_def(options.version, nested, &mut buffer)?; - encode_plain(array, &mut buffer); + encode_plain(array, encode_options, &mut buffer); let statistics = if options.has_statistics() { Some(build_statistics(array, type_.clone(), &options.statistics)) diff --git a/crates/polars-parquet/src/arrow/write/binview/basic.rs b/crates/polars-parquet/src/arrow/write/binview/basic.rs index c7059b63c99e..251a336b2177 100644 --- a/crates/polars-parquet/src/arrow/write/binview/basic.rs +++ b/crates/polars-parquet/src/arrow/write/binview/basic.rs @@ -8,29 +8,56 @@ use crate::parquet::statistics::{BinaryStatistics, ParquetStatistics}; use crate::read::schema::is_nullable; use crate::write::binary::encode_non_null_values; use crate::write::utils::invalid_encoding; -use crate::write::{utils, Encoding, Page, StatisticsOptions, WriteOptions}; +use crate::write::{utils, EncodeNullability, Encoding, Page, StatisticsOptions, WriteOptions}; -pub(crate) fn encode_plain(array: &BinaryViewArray, buffer: &mut Vec) { - let capacity = - array.total_bytes_len() + (array.len() - array.null_count()) * std::mem::size_of::(); +pub(crate) fn encode_plain( + array: &BinaryViewArray, + options: EncodeNullability, + buffer: &mut Vec, +) { + if options.is_optional() && array.validity().is_some() { + let capacity = array.total_bytes_len() + + (array.len() - array.null_count()) * std::mem::size_of::(); + + let len_before = buffer.len(); + buffer.reserve(capacity); + + encode_non_null_values(array.non_null_values_iter(), buffer); + // Append the non-null values. 
+ debug_assert_eq!(buffer.len() - len_before, capacity); + } else { + let capacity = array.total_bytes_len() + array.len() * std::mem::size_of::(); - let len_before = buffer.len(); - buffer.reserve(capacity); + let len_before = buffer.len(); + buffer.reserve(capacity); - encode_non_null_values(array.non_null_values_iter(), buffer); - // Append the non-null values. - debug_assert_eq!(buffer.len() - len_before, capacity); + encode_non_null_values(array.values_iter(), buffer); + // Append the non-null values. + debug_assert_eq!(buffer.len() - len_before, capacity); + } } -pub(crate) fn encode_delta(array: &BinaryViewArray, buffer: &mut Vec) { - let lengths = utils::ExactSizedIter::new( - array.non_null_views_iter().map(|v| v.length as i64), - array.len() - array.null_count(), - ); - delta_bitpacked::encode(lengths, buffer, 1); +pub(crate) fn encode_delta( + array: &BinaryViewArray, + options: EncodeNullability, + buffer: &mut Vec, +) { + if options.is_optional() && array.validity().is_some() { + let lengths = utils::ExactSizedIter::new( + array.non_null_views_iter().map(|v| v.length as i64), + array.len() - array.null_count(), + ); + delta_bitpacked::encode(lengths, buffer, 1); + + for slice in array.non_null_values_iter() { + buffer.extend_from_slice(slice) + } + } else { + let lengths = + utils::ExactSizedIter::new(array.views().iter().map(|v| v.length as i64), array.len()); + delta_bitpacked::encode(lengths, buffer, 1); - for slice in array.non_null_values_iter() { - buffer.extend_from_slice(slice) + buffer.extend(array.values_iter().flatten()); } } @@ -41,6 +68,7 @@ pub fn array_to_page( encoding: Encoding, ) -> PolarsResult { let is_optional = is_nullable(&type_.field_info); + let encode_options = EncodeNullability::new(is_optional); let mut buffer = vec![]; // TODO! 
reserve capacity @@ -55,8 +83,8 @@ pub fn array_to_page( let definition_levels_byte_length = buffer.len(); match encoding { - Encoding::Plain => encode_plain(array, &mut buffer), - Encoding::DeltaLengthByteArray => encode_delta(array, &mut buffer), + Encoding::Plain => encode_plain(array, encode_options, &mut buffer), + Encoding::DeltaLengthByteArray => encode_delta(array, encode_options, &mut buffer), _ => return Err(invalid_encoding(encoding, array.data_type())), } diff --git a/crates/polars-parquet/src/arrow/write/binview/nested.rs b/crates/polars-parquet/src/arrow/write/binview/nested.rs index 9e76b23e6b19..16165a7d4299 100644 --- a/crates/polars-parquet/src/arrow/write/binview/nested.rs +++ b/crates/polars-parquet/src/arrow/write/binview/nested.rs @@ -7,6 +7,8 @@ use crate::arrow::write::Nested; use crate::parquet::encoding::Encoding; use crate::parquet::page::DataPage; use crate::parquet::schema::types::PrimitiveType; +use crate::read::schema::is_nullable; +use crate::write::EncodeNullability; pub fn array_to_page( array: &BinaryViewArray, @@ -14,11 +16,14 @@ pub fn array_to_page( type_: PrimitiveType, nested: &[Nested], ) -> PolarsResult { + let is_optional = is_nullable(&type_.field_info); + let encode_options = EncodeNullability::new(is_optional); + let mut buffer = vec![]; let (repetition_levels_byte_length, definition_levels_byte_length) = nested::write_rep_and_def(options.version, nested, &mut buffer)?; - encode_plain(array, &mut buffer); + encode_plain(array, encode_options, &mut buffer); let statistics = if options.has_statistics() { Some(build_statistics(array, type_.clone(), &options.statistics)) diff --git a/crates/polars-parquet/src/arrow/write/boolean/basic.rs b/crates/polars-parquet/src/arrow/write/boolean/basic.rs index 0735ba2f4d6c..e338ca0c3d12 100644 --- a/crates/polars-parquet/src/arrow/write/boolean/basic.rs +++ b/crates/polars-parquet/src/arrow/write/boolean/basic.rs @@ -8,7 +8,7 @@ use crate::parquet::encoding::Encoding; use crate::parquet::page::DataPage; use crate::parquet::schema::types::PrimitiveType; use crate::parquet::statistics::{BooleanStatistics, ParquetStatistics}; -use crate::write::StatisticsOptions; +use crate::write::{EncodeNullability, StatisticsOptions}; fn encode(iterator: impl Iterator, buffer: &mut Vec) -> PolarsResult<()> { // encode values using bitpacking @@ -20,10 +20,10 @@ fn encode(iterator: impl Iterator, buffer: &mut Vec) -> PolarsR pub(super) fn encode_plain( array: &BooleanArray, - is_optional: bool, + encode_options: EncodeNullability, buffer: &mut Vec, ) -> PolarsResult<()> { - if is_optional && array.validity().is_some() { + if encode_options.is_optional() && array.validity().is_some() { encode(array.non_null_values_iter(), buffer) } else { encode(array.values().iter(), buffer) @@ -32,13 +32,13 @@ pub(super) fn encode_plain( pub(super) fn encode_hybrid_rle( array: &BooleanArray, - is_optional: bool, + encode_options: EncodeNullability, buffer: &mut Vec, ) -> PolarsResult<()> { buffer.extend_from_slice(&[0; 4]); let start = buffer.len(); - if is_optional && array.validity().is_some() { + if encode_options.is_optional() && array.validity().is_some() { hybrid_rle::encode(buffer, array.non_null_values_iter(), 1)?; } else { hybrid_rle::encode(buffer, array.values().iter(), 1)?; @@ -60,6 +60,7 @@ pub fn array_to_page( encoding: Encoding, ) -> PolarsResult { let is_optional = is_nullable(&type_.field_info); + let encode_nullability = EncodeNullability::new(is_optional); let validity = array.validity(); @@ -75,8 +76,8 @@ pub fn 
array_to_page( let definition_levels_byte_length = buffer.len(); match encoding { - Encoding::Plain => encode_plain(array, is_optional, &mut buffer)?, - Encoding::Rle => encode_hybrid_rle(array, is_optional, &mut buffer)?, + Encoding::Plain => encode_plain(array, encode_nullability, &mut buffer)?, + Encoding::Rle => encode_hybrid_rle(array, encode_nullability, &mut buffer)?, other => polars_bail!(nyi = "Encoding boolean as {other:?}"), } diff --git a/crates/polars-parquet/src/arrow/write/boolean/nested.rs b/crates/polars-parquet/src/arrow/write/boolean/nested.rs index 3560bc167369..082197202588 100644 --- a/crates/polars-parquet/src/arrow/write/boolean/nested.rs +++ b/crates/polars-parquet/src/arrow/write/boolean/nested.rs @@ -1,7 +1,7 @@ use arrow::array::{Array, BooleanArray}; use polars_error::PolarsResult; -use super::super::{nested, utils, WriteOptions}; +use super::super::{nested, utils, EncodeNullability, WriteOptions}; use super::basic::{build_statistics, encode_plain}; use crate::arrow::read::schema::is_nullable; use crate::arrow::write::Nested; @@ -16,12 +16,13 @@ pub fn array_to_page( nested: &[Nested], ) -> PolarsResult { let is_optional = is_nullable(&type_.field_info); + let encode_options = EncodeNullability::new(is_optional); let mut buffer = vec![]; let (repetition_levels_byte_length, definition_levels_byte_length) = nested::write_rep_and_def(options.version, nested, &mut buffer)?; - encode_plain(array, is_optional, &mut buffer)?; + encode_plain(array, encode_options, &mut buffer)?; let statistics = if options.has_statistics() { Some(build_statistics(array, &options.statistics)) diff --git a/crates/polars-parquet/src/arrow/write/dictionary.rs b/crates/polars-parquet/src/arrow/write/dictionary.rs index 4a507557d36b..096765446f88 100644 --- a/crates/polars-parquet/src/arrow/write/dictionary.rs +++ b/crates/polars-parquet/src/arrow/write/dictionary.rs @@ -19,7 +19,7 @@ use super::pages::PrimitiveNested; use super::primitive::{ build_statistics as primitive_build_statistics, encode_plain as primitive_encode_plain, }; -use super::{binview, nested, Nested, WriteOptions}; +use super::{binview, nested, EncodeNullability, Nested, WriteOptions}; use crate::arrow::read::schema::is_nullable; use crate::arrow::write::{slice_nested_leaf, utils}; use crate::parquet::encoding::hybrid_rle::encode; @@ -313,7 +313,8 @@ macro_rules! 
dyn_prim { ($from:ty, $to:ty, $array:expr, $options:expr, $type_:expr) => {{ let values = $array.values().as_any().downcast_ref().unwrap(); - let buffer = primitive_encode_plain::<$from, $to>(values, false, vec![]); + let buffer = + primitive_encode_plain::<$from, $to>(values, EncodeNullability::new(false), vec![]); let stats: Option = if !$options.statistics.is_empty() { let mut stats = primitive_build_statistics::<$from, $to>( @@ -343,140 +344,144 @@ pub fn array_to_pages( match encoding { Encoding::PlainDictionary | Encoding::RleDictionary => { // write DictPage - let (dict_page, mut statistics): (_, Option) = - match array.values().data_type().to_logical_type() { - ArrowDataType::Int8 => dyn_prim!(i8, i32, array, options, type_), - ArrowDataType::Int16 => dyn_prim!(i16, i32, array, options, type_), - ArrowDataType::Int32 | ArrowDataType::Date32 | ArrowDataType::Time32(_) => { - dyn_prim!(i32, i32, array, options, type_) - }, - ArrowDataType::Int64 - | ArrowDataType::Date64 - | ArrowDataType::Time64(_) - | ArrowDataType::Timestamp(_, _) - | ArrowDataType::Duration(_) => dyn_prim!(i64, i64, array, options, type_), - ArrowDataType::UInt8 => dyn_prim!(u8, i32, array, options, type_), - ArrowDataType::UInt16 => dyn_prim!(u16, i32, array, options, type_), - ArrowDataType::UInt32 => dyn_prim!(u32, i32, array, options, type_), - ArrowDataType::UInt64 => dyn_prim!(u64, i64, array, options, type_), - ArrowDataType::Float32 => dyn_prim!(f32, f32, array, options, type_), - ArrowDataType::Float64 => dyn_prim!(f64, f64, array, options, type_), - ArrowDataType::LargeUtf8 => { - let array = arrow::compute::cast::cast( - array.values().as_ref(), - &ArrowDataType::LargeBinary, - Default::default(), - ) + let (dict_page, mut statistics): (_, Option) = match array + .values() + .data_type() + .to_logical_type() + { + ArrowDataType::Int8 => dyn_prim!(i8, i32, array, options, type_), + ArrowDataType::Int16 => dyn_prim!(i16, i32, array, options, type_), + ArrowDataType::Int32 | ArrowDataType::Date32 | ArrowDataType::Time32(_) => { + dyn_prim!(i32, i32, array, options, type_) + }, + ArrowDataType::Int64 + | ArrowDataType::Date64 + | ArrowDataType::Time64(_) + | ArrowDataType::Timestamp(_, _) + | ArrowDataType::Duration(_) => dyn_prim!(i64, i64, array, options, type_), + ArrowDataType::UInt8 => dyn_prim!(u8, i32, array, options, type_), + ArrowDataType::UInt16 => dyn_prim!(u16, i32, array, options, type_), + ArrowDataType::UInt32 => dyn_prim!(u32, i32, array, options, type_), + ArrowDataType::UInt64 => dyn_prim!(u64, i64, array, options, type_), + ArrowDataType::Float32 => dyn_prim!(f32, f32, array, options, type_), + ArrowDataType::Float64 => dyn_prim!(f64, f64, array, options, type_), + ArrowDataType::LargeUtf8 => { + let array = arrow::compute::cast::cast( + array.values().as_ref(), + &ArrowDataType::LargeBinary, + Default::default(), + ) + .unwrap(); + let array = array.as_any().downcast_ref().unwrap(); + + let mut buffer = vec![]; + binary_encode_plain::(array, EncodeNullability::Required, &mut buffer); + let stats = if options.has_statistics() { + Some(binary_build_statistics( + array, + type_.clone(), + &options.statistics, + )) + } else { + None + }; + ( + DictPage::new(CowBuffer::Owned(buffer), array.len(), false), + stats, + ) + }, + ArrowDataType::BinaryView => { + let array = array + .values() + .as_any() + .downcast_ref::() .unwrap(); - let array = array.as_any().downcast_ref().unwrap(); - - let mut buffer = vec![]; - binary_encode_plain::(array, &mut buffer); - let stats = if 
options.has_statistics() { - Some(binary_build_statistics( - array, - type_.clone(), - &options.statistics, - )) - } else { - None - }; - ( - DictPage::new(CowBuffer::Owned(buffer), array.len(), false), - stats, - ) - }, - ArrowDataType::BinaryView => { - let array = array - .values() - .as_any() - .downcast_ref::() - .unwrap(); - let mut buffer = vec![]; - binview::encode_plain(array, &mut buffer); - - let stats = if options.has_statistics() { - Some(binview::build_statistics( - array, - type_.clone(), - &options.statistics, - )) - } else { - None - }; - ( - DictPage::new(CowBuffer::Owned(buffer), array.len(), false), - stats, - ) - }, - ArrowDataType::Utf8View => { - let array = array - .values() - .as_any() - .downcast_ref::() - .unwrap() - .to_binview(); - let mut buffer = vec![]; - binview::encode_plain(&array, &mut buffer); - - let stats = if options.has_statistics() { - Some(binview::build_statistics( - &array, - type_.clone(), - &options.statistics, - )) - } else { - None - }; - ( - DictPage::new(CowBuffer::Owned(buffer), array.len(), false), - stats, - ) - }, - ArrowDataType::LargeBinary => { - let values = array.values().as_any().downcast_ref().unwrap(); - - let mut buffer = vec![]; - binary_encode_plain::(values, &mut buffer); - let stats = if options.has_statistics() { - Some(binary_build_statistics( - values, - type_.clone(), - &options.statistics, - )) - } else { - None - }; - ( - DictPage::new(CowBuffer::Owned(buffer), values.len(), false), - stats, - ) - }, - ArrowDataType::FixedSizeBinary(_) => { - let mut buffer = vec![]; - let array = array.values().as_any().downcast_ref().unwrap(); - fixed_binary_encode_plain(array, false, &mut buffer); - let stats = if options.has_statistics() { - let stats = fixed_binary_build_statistics( - array, - type_.clone(), - &options.statistics, - ); - Some(stats.serialize()) - } else { - None - }; - ( - DictPage::new(CowBuffer::Owned(buffer), array.len(), false), - stats, - ) - }, - other => { - polars_bail!(nyi = + let mut buffer = vec![]; + binview::encode_plain(array, EncodeNullability::Required, &mut buffer); + + let stats = if options.has_statistics() { + Some(binview::build_statistics( + array, + type_.clone(), + &options.statistics, + )) + } else { + None + }; + ( + DictPage::new(CowBuffer::Owned(buffer), array.len(), false), + stats, + ) + }, + ArrowDataType::Utf8View => { + let array = array + .values() + .as_any() + .downcast_ref::() + .unwrap() + .to_binview(); + let mut buffer = vec![]; + binview::encode_plain(&array, EncodeNullability::Required, &mut buffer); + + let stats = if options.has_statistics() { + Some(binview::build_statistics( + &array, + type_.clone(), + &options.statistics, + )) + } else { + None + }; + ( + DictPage::new(CowBuffer::Owned(buffer), array.len(), false), + stats, + ) + }, + ArrowDataType::LargeBinary => { + let values = array.values().as_any().downcast_ref().unwrap(); + + let mut buffer = vec![]; + binary_encode_plain::(values, EncodeNullability::Required, &mut buffer); + let stats = if options.has_statistics() { + Some(binary_build_statistics( + values, + type_.clone(), + &options.statistics, + )) + } else { + None + }; + ( + DictPage::new(CowBuffer::Owned(buffer), values.len(), false), + stats, + ) + }, + ArrowDataType::FixedSizeBinary(_) => { + let mut buffer = vec![]; + let array = array.values().as_any().downcast_ref().unwrap(); + fixed_binary_encode_plain(array, EncodeNullability::Required, &mut buffer); + let stats = if options.has_statistics() { + let stats = fixed_binary_build_statistics( + 
array, + type_.clone(), + &options.statistics, + ); + Some(stats.serialize()) + } else { + None + }; + ( + DictPage::new(CowBuffer::Owned(buffer), array.len(), false), + stats, + ) + }, + other => { + polars_bail!( + nyi = "Writing dictionary arrays to parquet only support data type {other:?}" - ) - }, - }; + ) + }, + }; if let Some(stats) = &mut statistics { stats.null_count = Some(array.null_count() as i64) diff --git a/crates/polars-parquet/src/arrow/write/fixed_len_bytes.rs b/crates/polars-parquet/src/arrow/write/fixed_len_bytes.rs index bf15c0ab50cc..9277b9c78a98 100644 --- a/crates/polars-parquet/src/arrow/write/fixed_len_bytes.rs +++ b/crates/polars-parquet/src/arrow/write/fixed_len_bytes.rs @@ -3,16 +3,20 @@ use arrow::types::i256; use polars_error::PolarsResult; use super::binary::ord_binary; -use super::{utils, StatisticsOptions, WriteOptions}; +use super::{utils, EncodeNullability, StatisticsOptions, WriteOptions}; use crate::arrow::read::schema::is_nullable; use crate::parquet::encoding::Encoding; use crate::parquet::page::DataPage; use crate::parquet::schema::types::PrimitiveType; use crate::parquet::statistics::FixedLenStatistics; -pub(crate) fn encode_plain(array: &FixedSizeBinaryArray, is_optional: bool, buffer: &mut Vec) { +pub(crate) fn encode_plain( + array: &FixedSizeBinaryArray, + options: EncodeNullability, + buffer: &mut Vec, +) { // append the non-null values - if is_optional { + if options.is_optional() && array.validity().is_some() { array.iter().for_each(|x| { if let Some(x) = x { buffer.extend_from_slice(x); @@ -30,6 +34,8 @@ pub fn array_to_page( statistics: Option, ) -> PolarsResult { let is_optional = is_nullable(&type_.field_info); + let encode_options = EncodeNullability::new(is_optional); + let validity = array.validity(); let mut buffer = vec![]; @@ -43,7 +49,7 @@ pub fn array_to_page( let definition_levels_byte_length = buffer.len(); - encode_plain(array, is_optional, &mut buffer); + encode_plain(array, encode_options, &mut buffer); utils::build_plain_page( buffer, diff --git a/crates/polars-parquet/src/arrow/write/mod.rs b/crates/polars-parquet/src/arrow/write/mod.rs index 950e5fc16837..abdaab87bb3f 100644 --- a/crates/polars-parquet/src/arrow/write/mod.rs +++ b/crates/polars-parquet/src/arrow/write/mod.rs @@ -71,6 +71,13 @@ impl Default for StatisticsOptions { } } +/// Options to encode an array +#[derive(Clone, Copy)] +pub enum EncodeNullability { + Required, + Optional, +} + /// Currently supported options to write to parquet #[derive(Debug, Clone, Copy, PartialEq, Eq)] pub struct WriteOptions { @@ -131,6 +138,20 @@ impl WriteOptions { } } +impl EncodeNullability { + const fn new(is_optional: bool) -> Self { + if is_optional { + Self::Optional + } else { + Self::Required + } + } + + fn is_optional(self) -> bool { + matches!(self, Self::Optional) + } +} + /// returns offset and length to slice the leaf values pub fn slice_nested_leaf(nested: &[Nested]) -> (usize, usize) { // find the deepest recursive dremel structure as that one determines how many values we must @@ -1003,6 +1024,7 @@ fn transverse_recursive T + Clone>( /// Transverses the `data_type` up to its (parquet) columns and returns a vector of /// items based on `map`. 
+/// /// This is used to assign an [`Encoding`] to every parquet column based on the columns' type (see example) pub fn transverse T + Clone>( data_type: &ArrowDataType, diff --git a/crates/polars-parquet/src/arrow/write/primitive/basic.rs b/crates/polars-parquet/src/arrow/write/primitive/basic.rs index 2c6c137ce220..d970e3659dcb 100644 --- a/crates/polars-parquet/src/arrow/write/primitive/basic.rs +++ b/crates/polars-parquet/src/arrow/write/primitive/basic.rs @@ -13,11 +13,11 @@ use crate::parquet::schema::types::PrimitiveType; use crate::parquet::statistics::PrimitiveStatistics; use crate::parquet::types::NativeType as ParquetNativeType; use crate::read::Page; -use crate::write::StatisticsOptions; +use crate::write::{EncodeNullability, StatisticsOptions}; pub(crate) fn encode_plain( array: &PrimitiveArray, - is_optional: bool, + options: EncodeNullability, mut buffer: Vec, ) -> Vec where @@ -25,6 +25,8 @@ where P: ParquetNativeType, T: num_traits::AsPrimitive
<P>
, { + let is_optional = options.is_optional(); + if is_optional { // append the non-null values let validity = array.validity(); @@ -33,10 +35,10 @@ where let null_count = validity.unset_bits(); if null_count > 0 { - let values = array.values().as_slice(); let mut iter = validity.iter(); + let values = array.values().as_slice(); - buffer.reserve(std::mem::size_of::
<P>
() * (array.len() - null_count)); + buffer.reserve(std::mem::size_of::() * (array.len() - null_count)); let mut offset = 0; let mut remaining_valid = array.len() - null_count; @@ -72,7 +74,7 @@ where pub(crate) fn encode_delta( array: &PrimitiveArray, - is_optional: bool, + options: EncodeNullability, mut buffer: Vec, ) -> Vec where @@ -81,6 +83,8 @@ where T: num_traits::AsPrimitive
<P>
, P: num_traits::AsPrimitive, { + let is_optional = options.is_optional(); + if is_optional { // append the non-null values let iterator = array.non_null_values_iter().map(|x| { @@ -135,7 +139,7 @@ where .map(Page::Data) } -pub fn array_to_page, bool, Vec) -> Vec>( +pub fn array_to_page, EncodeNullability, Vec) -> Vec>( array: &PrimitiveArray, options: WriteOptions, type_: PrimitiveType, @@ -149,6 +153,7 @@ where T: num_traits::AsPrimitive
<P>
, { let is_optional = is_nullable(&type_.field_info); + let encode_options = EncodeNullability::new(is_optional); let validity = array.validity(); @@ -163,7 +168,7 @@ where let definition_levels_byte_length = buffer.len(); - let buffer = encode(array, is_optional, buffer); + let buffer = encode(array, encode_options, buffer); let statistics = if options.has_statistics() { Some(build_statistics(array, type_.clone(), &options.statistics).serialize()) diff --git a/crates/polars-parquet/src/arrow/write/primitive/nested.rs b/crates/polars-parquet/src/arrow/write/primitive/nested.rs index 918afa6a4dc6..b5391263025e 100644 --- a/crates/polars-parquet/src/arrow/write/primitive/nested.rs +++ b/crates/polars-parquet/src/arrow/write/primitive/nested.rs @@ -10,6 +10,7 @@ use crate::parquet::encoding::Encoding; use crate::parquet::page::DataPage; use crate::parquet::schema::types::PrimitiveType; use crate::parquet::types::NativeType; +use crate::write::EncodeNullability; pub fn array_to_page( array: &PrimitiveArray, @@ -23,13 +24,14 @@ where T: num_traits::AsPrimitive, { let is_optional = is_nullable(&type_.field_info); + let encode_options = EncodeNullability::new(is_optional); let mut buffer = vec![]; let (repetition_levels_byte_length, definition_levels_byte_length) = nested::write_rep_and_def(options.version, nested, &mut buffer)?; - let buffer = encode_plain(array, is_optional, buffer); + let buffer = encode_plain(array, encode_options, buffer); let statistics = if options.has_statistics() { Some(build_statistics(array, type_.clone(), &options.statistics).serialize()) diff --git a/crates/polars-parquet/src/arrow/write/utils.rs b/crates/polars-parquet/src/arrow/write/utils.rs index 7f7796b0fff2..6e3efee54be5 100644 --- a/crates/polars-parquet/src/arrow/write/utils.rs +++ b/crates/polars-parquet/src/arrow/write/utils.rs @@ -92,7 +92,7 @@ pub fn build_plain_page( max_def_level: 0, max_rep_level: 0, }, - Some(num_rows), + num_rows, )) } diff --git a/crates/polars-parquet/src/parquet/compression.rs b/crates/polars-parquet/src/parquet/compression.rs index 7798af585b7b..41bfb5f557bf 100644 --- a/crates/polars-parquet/src/parquet/compression.rs +++ b/crates/polars-parquet/src/parquet/compression.rs @@ -26,6 +26,7 @@ fn inner_compress< /// Compresses data stored in slice `input_buf` and writes the compressed result /// to `output_buf`. +/// /// Note that you'll need to call `clear()` before reusing the same `output_buf` /// across different `compress` calls. #[allow(unused_variables)] diff --git a/crates/polars-parquet/src/parquet/encoding/delta_bitpacked/decoder.rs b/crates/polars-parquet/src/parquet/encoding/delta_bitpacked/decoder.rs index fb8eb153cfb7..261e84ce2e23 100644 --- a/crates/polars-parquet/src/parquet/encoding/delta_bitpacked/decoder.rs +++ b/crates/polars-parquet/src/parquet/encoding/delta_bitpacked/decoder.rs @@ -28,6 +28,7 @@ //! Note that all these additions need to be wrapping. 
use super::super::{bitpacked, uleb128, zigzag_leb128}; +use super::lin_natural_sum; use crate::parquet::encoding::bitpacked::{Unpackable, Unpacked}; use crate::parquet::error::{ParquetError, ParquetResult}; @@ -166,16 +167,11 @@ impl DeltaGatherer for SumGatherer { delta: i64, num_repeats: usize, ) -> ParquetResult<()> { - if v < 0 || (delta < 0 && num_repeats as i64 * delta + v < 0) { + if v < 0 || (delta < 0 && num_repeats > 0 && (num_repeats - 1) as i64 * delta + v < 0) { return Err(ParquetError::oos("Invalid delta encoding length")); } - let base = v * num_repeats as i64; - let is_even = num_repeats & 1; - // SUM_i=0^n f * i = f * (n(n+1)/2) - let increment = (num_repeats >> is_even) * ((num_repeats + 1) >> (is_even ^ 1)); - - *target += base as usize + increment; + *target += lin_natural_sum(v, delta, num_repeats) as usize; Ok(()) } @@ -254,6 +250,13 @@ fn gather_miniblock( ) -> ParquetResult<()> { let bitwidth = bitwidth as usize; + if bitwidth == 0 { + let v = last_value.wrapping_add(min_delta); + gatherer.gather_constant(target, v, min_delta, values_per_miniblock)?; + *last_value = last_value.wrapping_add(min_delta * values_per_miniblock as i64); + return Ok(()); + } + debug_assert!(bitwidth <= 64); debug_assert_eq!((bitwidth * values_per_miniblock).div_ceil(8), values.len()); @@ -286,18 +289,14 @@ fn gather_block<'a, G: DeltaGatherer>( let bitwidths; (bitwidths, values) = values .split_at_checked(num_miniblocks) - .ok_or(ParquetError::oos( - "Not enough bitwidths available in delta encoding", - ))?; + .ok_or_else(|| ParquetError::oos("Not enough bitwidths available in delta encoding"))?; gatherer.target_reserve(target, num_miniblocks * values_per_miniblock); for &bitwidth in bitwidths { let miniblock; (miniblock, values) = values .split_at_checked((bitwidth as usize * values_per_miniblock).div_ceil(8)) - .ok_or(ParquetError::oos( - "Not enough bytes for miniblock in delta encoding", - ))?; + .ok_or_else(|| ParquetError::oos("Not enough bytes for miniblock in delta encoding"))?; gather_miniblock( target, min_delta, @@ -372,9 +371,9 @@ impl<'a> Decoder<'a> { // let (_, consumed) = zigzag_leb128::decode(rem); - rem = rem.get(consumed..).ok_or(ParquetError::oos( - "No min-delta value in delta encoding miniblock", - ))?; + rem = rem.get(consumed..).ok_or_else(|| { + ParquetError::oos("No min-delta value in delta encoding miniblock") + })?; if rem.len() < num_miniblocks_per_block { return Err(ParquetError::oos( @@ -401,9 +400,11 @@ impl<'a> Decoder<'a> { rem = rem .get(num_miniblocks_per_block + num_bitpacking_bytes..) - .ok_or(ParquetError::oos( - "Not enough bytes for all bitpacked values in delta encoding", - ))?; + .ok_or_else(|| { + ParquetError::oos( + "Not enough bytes for all bitpacked values in delta encoding", + ) + })?; num_values_left = num_values_left.saturating_sub(values_per_block); } diff --git a/crates/polars-parquet/src/parquet/encoding/delta_bitpacked/mod.rs b/crates/polars-parquet/src/parquet/encoding/delta_bitpacked/mod.rs index 23e67ee7fb4f..4a32610a302e 100644 --- a/crates/polars-parquet/src/parquet/encoding/delta_bitpacked/mod.rs +++ b/crates/polars-parquet/src/parquet/encoding/delta_bitpacked/mod.rs @@ -5,11 +5,39 @@ mod fuzz; pub(crate) use decoder::{Decoder, DeltaGatherer, SumGatherer}; pub(crate) use encoder::encode; +/// The sum of `start, start + delta, start + 2 * delta, ... len times`. 
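For reference, the quantity this helper computes has a simple closed form: sum over i = 0..len-1 of (start + i * delta) equals start * len + delta * len * (len - 1) / 2. For example, start = 2, delta = 2, len = 3 gives 2 * 3 + 2 * 3 = 12, matching the tests that follow the implementation below.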
+pub(crate) fn lin_natural_sum(start: i64, delta: i64, len: usize) -> i64 { + debug_assert!(len < i64::MAX as usize); + + let base = start * len as i64; + let sum = if len == 0 { + 0 + } else { + let is_odd = len & 1; + // SUM_i=0^n f * i = f * (n(n+1)/2) + let sum = (len >> (is_odd ^ 1)) * (len.wrapping_sub(1) >> is_odd); + delta * sum as i64 + }; + + base + sum +} + #[cfg(test)] mod tests { use super::*; use crate::parquet::error::{ParquetError, ParquetResult}; + #[test] + fn linear_natural_sum() { + assert_eq!(lin_natural_sum(0, 0, 0), 0); + assert_eq!(lin_natural_sum(10, 4, 0), 0); + assert_eq!(lin_natural_sum(0, 1, 1), 0); + assert_eq!(lin_natural_sum(0, 1, 3), 3); + assert_eq!(lin_natural_sum(0, 1, 4), 6); + assert_eq!(lin_natural_sum(0, 2, 3), 6); + assert_eq!(lin_natural_sum(2, 2, 3), 12); + } + #[test] fn basic() -> Result<(), ParquetError> { let data = vec![1, 3, 1, 2, 3]; diff --git a/crates/polars-parquet/src/parquet/indexes/index.rs b/crates/polars-parquet/src/parquet/indexes/index.rs deleted file mode 100644 index ecf11fe7f30e..000000000000 --- a/crates/polars-parquet/src/parquet/indexes/index.rs +++ /dev/null @@ -1,322 +0,0 @@ -use std::any::Any; - -use parquet_format_safe::ColumnIndex; - -use crate::parquet::error::ParquetError; -use crate::parquet::parquet_bridge::BoundaryOrder; -use crate::parquet::schema::types::{PhysicalType, PrimitiveType}; -use crate::parquet::types::NativeType; - -/// Trait object representing a [`ColumnIndex`] in Rust's native format. -/// -/// See [`NativeIndex`], [`ByteIndex`] and [`FixedLenByteIndex`] for concrete implementations. -pub trait Index: Send + Sync + std::fmt::Debug { - fn as_any(&self) -> &dyn Any; - - fn physical_type(&self) -> &PhysicalType; -} - -impl PartialEq for dyn Index + '_ { - fn eq(&self, that: &dyn Index) -> bool { - equal(self, that) - } -} - -impl Eq for dyn Index + '_ {} - -fn equal(lhs: &dyn Index, rhs: &dyn Index) -> bool { - if lhs.physical_type() != rhs.physical_type() { - return false; - } - - match lhs.physical_type() { - PhysicalType::Boolean => { - lhs.as_any().downcast_ref::().unwrap() - == rhs.as_any().downcast_ref::().unwrap() - }, - PhysicalType::Int32 => { - lhs.as_any().downcast_ref::>().unwrap() - == rhs.as_any().downcast_ref::>().unwrap() - }, - PhysicalType::Int64 => { - lhs.as_any().downcast_ref::>().unwrap() - == rhs.as_any().downcast_ref::>().unwrap() - }, - PhysicalType::Int96 => { - lhs.as_any() - .downcast_ref::>() - .unwrap() - == rhs - .as_any() - .downcast_ref::>() - .unwrap() - }, - PhysicalType::Float => { - lhs.as_any().downcast_ref::>().unwrap() - == rhs.as_any().downcast_ref::>().unwrap() - }, - PhysicalType::Double => { - lhs.as_any().downcast_ref::>().unwrap() - == rhs.as_any().downcast_ref::>().unwrap() - }, - PhysicalType::ByteArray => { - lhs.as_any().downcast_ref::().unwrap() - == rhs.as_any().downcast_ref::().unwrap() - }, - PhysicalType::FixedLenByteArray(_) => { - lhs.as_any().downcast_ref::().unwrap() - == rhs.as_any().downcast_ref::().unwrap() - }, - } -} - -/// An index of a column of [`NativeType`] physical representation -#[derive(Debug, Clone, PartialEq, Eq, Hash)] -pub struct NativeIndex { - /// The primitive type - pub primitive_type: PrimitiveType, - /// The indexes, one item per page - pub indexes: Vec>, - /// the order - pub boundary_order: BoundaryOrder, -} - -impl NativeIndex { - /// Creates a new [`NativeIndex`] - pub(crate) fn try_new( - index: ColumnIndex, - primitive_type: PrimitiveType, - ) -> Result { - let len = index.min_values.len(); - - let null_counts = 
index - .null_counts - .map(|x| x.into_iter().map(Some).collect::>()) - .unwrap_or_else(|| vec![None; len]); - - let indexes = index - .min_values - .iter() - .zip(index.max_values.into_iter()) - .zip(index.null_pages.into_iter()) - .zip(null_counts.into_iter()) - .map(|(((min, max), is_null), null_count)| { - let (min, max) = if is_null { - (None, None) - } else { - let min = min.as_slice().try_into()?; - let max = max.as_slice().try_into()?; - (Some(T::from_le_bytes(min)), Some(T::from_le_bytes(max))) - }; - Ok(PageIndex { - min, - max, - null_count, - }) - }) - .collect::, ParquetError>>()?; - - Ok(Self { - primitive_type, - indexes, - boundary_order: index.boundary_order.try_into()?, - }) - } -} - -/// The index of a page, containing the min and max values of the page. -#[derive(Debug, Clone, PartialEq, Eq, Hash)] -pub struct PageIndex { - /// The minimum value in the page. It is None when all values are null - pub min: Option, - /// The maximum value in the page. It is None when all values are null - pub max: Option, - /// The number of null values in the page - pub null_count: Option, -} - -impl Index for NativeIndex { - fn as_any(&self) -> &dyn Any { - self - } - - fn physical_type(&self) -> &PhysicalType { - &T::TYPE - } -} - -/// An index of a column of bytes physical type -#[derive(Debug, Clone, PartialEq, Eq, Hash)] -pub struct ByteIndex { - /// The [`PrimitiveType`]. - pub primitive_type: PrimitiveType, - /// The indexes, one item per page - pub indexes: Vec>>, - pub boundary_order: BoundaryOrder, -} - -impl ByteIndex { - pub(crate) fn try_new( - index: ColumnIndex, - primitive_type: PrimitiveType, - ) -> Result { - let len = index.min_values.len(); - - let null_counts = index - .null_counts - .map(|x| x.into_iter().map(Some).collect::>()) - .unwrap_or_else(|| vec![None; len]); - - let indexes = index - .min_values - .into_iter() - .zip(index.max_values.into_iter()) - .zip(index.null_pages.into_iter()) - .zip(null_counts.into_iter()) - .map(|(((min, max), is_null), null_count)| { - let (min, max) = if is_null { - (None, None) - } else { - (Some(min), Some(max)) - }; - Ok(PageIndex { - min, - max, - null_count, - }) - }) - .collect::, ParquetError>>()?; - - Ok(Self { - primitive_type, - indexes, - boundary_order: index.boundary_order.try_into()?, - }) - } -} - -impl Index for ByteIndex { - fn as_any(&self) -> &dyn Any { - self - } - - fn physical_type(&self) -> &PhysicalType { - &PhysicalType::ByteArray - } -} - -/// An index of a column of fixed len byte physical type -#[derive(Debug, Clone, PartialEq, Eq, Hash)] -pub struct FixedLenByteIndex { - /// The [`PrimitiveType`]. 
- pub primitive_type: PrimitiveType, - /// The indexes, one item per page - pub indexes: Vec>>, - pub boundary_order: BoundaryOrder, -} - -impl FixedLenByteIndex { - pub(crate) fn try_new( - index: ColumnIndex, - primitive_type: PrimitiveType, - ) -> Result { - let len = index.min_values.len(); - - let null_counts = index - .null_counts - .map(|x| x.into_iter().map(Some).collect::>()) - .unwrap_or_else(|| vec![None; len]); - - let indexes = index - .min_values - .into_iter() - .zip(index.max_values.into_iter()) - .zip(index.null_pages.into_iter()) - .zip(null_counts.into_iter()) - .map(|(((min, max), is_null), null_count)| { - let (min, max) = if is_null { - (None, None) - } else { - (Some(min), Some(max)) - }; - Ok(PageIndex { - min, - max, - null_count, - }) - }) - .collect::, ParquetError>>()?; - - Ok(Self { - primitive_type, - indexes, - boundary_order: index.boundary_order.try_into()?, - }) - } -} - -impl Index for FixedLenByteIndex { - fn as_any(&self) -> &dyn Any { - self - } - - fn physical_type(&self) -> &PhysicalType { - &self.primitive_type.physical_type - } -} - -/// An index of a column of boolean physical type -#[derive(Debug, Clone, PartialEq, Eq, Hash)] -pub struct BooleanIndex { - /// The indexes, one item per page - pub indexes: Vec>, - pub boundary_order: BoundaryOrder, -} - -impl BooleanIndex { - pub(crate) fn try_new(index: ColumnIndex) -> Result { - let len = index.min_values.len(); - - let null_counts = index - .null_counts - .map(|x| x.into_iter().map(Some).collect::>()) - .unwrap_or_else(|| vec![None; len]); - - let indexes = index - .min_values - .into_iter() - .zip(index.max_values.into_iter()) - .zip(index.null_pages.into_iter()) - .zip(null_counts.into_iter()) - .map(|(((min, max), is_null), null_count)| { - let (min, max) = if is_null { - (None, None) - } else { - let min = min[0] == 1; - let max = max[0] == 1; - (Some(min), Some(max)) - }; - Ok(PageIndex { - min, - max, - null_count, - }) - }) - .collect::, ParquetError>>()?; - - Ok(Self { - indexes, - boundary_order: index.boundary_order.try_into()?, - }) - } -} - -impl Index for BooleanIndex { - fn as_any(&self) -> &dyn Any { - self - } - - fn physical_type(&self) -> &PhysicalType { - &PhysicalType::Boolean - } -} diff --git a/crates/polars-parquet/src/parquet/indexes/intervals.rs b/crates/polars-parquet/src/parquet/indexes/intervals.rs deleted file mode 100644 index d04d3104a618..000000000000 --- a/crates/polars-parquet/src/parquet/indexes/intervals.rs +++ /dev/null @@ -1,139 +0,0 @@ -use parquet_format_safe::PageLocation; -#[cfg(feature = "serde_types")] -use serde::{Deserialize, Serialize}; - -use crate::parquet::error::ParquetError; - -/// An interval -#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] -#[cfg_attr(feature = "serde_types", derive(Deserialize, Serialize))] -pub struct Interval { - /// Its start - pub start: usize, - /// Its length - pub length: usize, -} - -impl Interval { - /// Create a new interval - pub fn new(start: usize, length: usize) -> Self { - Self { start, length } - } -} - -/// Returns the set of (row) intervals of the pages. -/// # Errors -/// This function errors if the locations are not castable to `usize` or such that -/// their ranges of row are larger than `num_rows`. 
-pub fn compute_page_row_intervals( - locations: &[PageLocation], - num_rows: usize, -) -> Result, ParquetError> { - if locations.is_empty() { - return Ok(vec![]); - }; - - let last = (|| { - let start: usize = locations.last().unwrap().first_row_index.try_into()?; - let length = num_rows.checked_sub(start).ok_or_else(|| { - ParquetError::oos("Page start cannot be smaller than the number of rows") - })?; - Result::<_, ParquetError>::Ok(Interval::new(start, length)) - })(); - - let pages_lengths = locations - .windows(2) - .map(|x| { - let start = x[0].first_row_index.try_into()?; - - let length = x[1] - .first_row_index - .checked_sub(x[0].first_row_index) - .ok_or_else(|| { - ParquetError::oos("Page start cannot be smaller than the number of rows") - })? - .try_into()?; - - Ok(Interval::new(start, length)) - }) - .chain(std::iter::once(last)); - pages_lengths.collect() -} - -/// Returns the set of intervals `(start, len)` containing all the -/// selected rows (for a given column) -pub fn compute_rows( - selected: &[bool], - locations: &[PageLocation], - num_rows: usize, -) -> Result, ParquetError> { - let page_intervals = compute_page_row_intervals(locations, num_rows)?; - - Ok(selected - .iter() - .zip(page_intervals.iter().copied()) - .filter_map( - |(&is_selected, page)| { - if is_selected { - Some(page) - } else { - None - } - }, - ) - .collect()) -} - -/// An enum describing a page that was either selected in a filter pushdown or skipped -#[derive(Debug, Clone, PartialEq, Eq, Hash)] -#[cfg_attr(feature = "serde_types", derive(Deserialize, Serialize))] -pub struct FilteredPage { - /// Location of the page in the file - pub start: u64, - pub length: usize, - /// rows to select from the page - pub selected_rows: Vec, - pub num_rows: usize, -} - -fn is_in(probe: Interval, intervals: &[Interval]) -> Vec { - intervals - .iter() - .filter_map(|interval| { - let interval_end = interval.start + interval.length; - let probe_end = probe.start + probe.length; - let overlaps = (probe.start < interval_end) && (probe_end > interval.start); - if overlaps { - let start = interval.start.max(probe.start); - let end = interval_end.min(probe_end); - Some(Interval::new(start - probe.start, end - start)) - } else { - None - } - }) - .collect() -} - -/// Given a set of selected [Interval]s of rows and the set of [`PageLocation`], returns the -/// a set of [`FilteredPage`] with the same number of items as `locations`. 
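As a concrete reading of the overlap computation in `is_in` above: if a page covers rows 5..15 and the selected interval is rows 5..10, the page is kept with `selected_rows` of `Interval::new(0, 5)` (expressed relative to the page's first row), while a page with no overlap is kept with an empty `selected_rows`. The deleted tests below exercise exactly these cases.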
-pub fn select_pages( - intervals: &[Interval], - locations: &[PageLocation], - num_rows: usize, -) -> Result, ParquetError> { - let page_intervals = compute_page_row_intervals(locations, num_rows)?; - - page_intervals - .into_iter() - .zip(locations.iter()) - .map(|(interval, location)| { - let selected_rows = is_in(interval, intervals); - Ok(FilteredPage { - start: location.offset.try_into()?, - length: location.compressed_page_size.try_into()?, - selected_rows, - num_rows: interval.length, - }) - }) - .collect() -} diff --git a/crates/polars-parquet/src/parquet/indexes/mod.rs b/crates/polars-parquet/src/parquet/indexes/mod.rs deleted file mode 100644 index f652f8bb4be3..000000000000 --- a/crates/polars-parquet/src/parquet/indexes/mod.rs +++ /dev/null @@ -1,234 +0,0 @@ -mod index; -mod intervals; - -pub use intervals::{compute_rows, select_pages, FilteredPage, Interval}; - -pub use self::index::{BooleanIndex, ByteIndex, FixedLenByteIndex, Index, NativeIndex, PageIndex}; -pub use crate::parquet::parquet_bridge::BoundaryOrder; -pub use crate::parquet::thrift_format::PageLocation; - -#[cfg(test)] -mod tests { - use super::*; - use crate::parquet::schema::types::{PhysicalType, PrimitiveType}; - - #[test] - fn test_basic() { - let locations = &[PageLocation { - offset: 100, - compressed_page_size: 10, - first_row_index: 0, - }]; - let num_rows = 10; - - let row_intervals = compute_rows(&[true; 1], locations, num_rows).unwrap(); - assert_eq!(row_intervals, vec![Interval::new(0, 10)]) - } - - #[test] - fn test_multiple() { - // two pages - let index = ByteIndex { - primitive_type: PrimitiveType::from_physical("c1".to_string(), PhysicalType::ByteArray), - indexes: vec![ - PageIndex { - min: Some(vec![0]), - max: Some(vec![8, 9]), - null_count: Some(0), - }, - PageIndex { - min: Some(vec![20]), - max: Some(vec![98, 99]), - null_count: Some(0), - }, - ], - boundary_order: Default::default(), - }; - let locations = &[ - PageLocation { - offset: 100, - compressed_page_size: 10, - first_row_index: 0, - }, - PageLocation { - offset: 110, - compressed_page_size: 20, - first_row_index: 5, - }, - ]; - let num_rows = 10; - - // filter of the form `x > "a"` - let selector = |page: &PageIndex>| { - page.max - .as_ref() - .map(|x| x.as_slice()[0] > 97) - .unwrap_or(false) // no max is present => all nulls => not selected - }; - let selected = index.indexes.iter().map(selector).collect::>(); - - let rows = compute_rows(&selected, locations, num_rows).unwrap(); - assert_eq!(rows, vec![Interval::new(5, 5)]); - - let pages = select_pages(&rows, locations, num_rows).unwrap(); - - assert_eq!( - pages, - vec![ - FilteredPage { - start: 100, - length: 10, - selected_rows: vec![], - num_rows: 5 - }, - FilteredPage { - start: 110, - length: 20, - selected_rows: vec![Interval::new(0, 5)], - num_rows: 5 - } - ] - ); - } - - #[test] - fn test_other_column() { - let locations = &[ - PageLocation { - offset: 100, - compressed_page_size: 20, - first_row_index: 0, - }, - PageLocation { - offset: 120, - compressed_page_size: 20, - first_row_index: 10, - }, - ]; - let num_rows = 100; - - let intervals = &[Interval::new(5, 5)]; - - let pages = select_pages(intervals, locations, num_rows).unwrap(); - - assert_eq!( - pages, - vec![ - FilteredPage { - start: 100, - length: 20, - selected_rows: vec![Interval::new(5, 5)], - num_rows: 10, - }, - FilteredPage { - start: 120, - length: 20, - selected_rows: vec![], - num_rows: 90 - }, - ] - ); - } - - #[test] - fn test_other_interval_in_middle() { - let locations = &[ - PageLocation { - 
offset: 100, - compressed_page_size: 20, - first_row_index: 0, - }, - PageLocation { - offset: 120, - compressed_page_size: 20, - first_row_index: 10, - }, - PageLocation { - offset: 140, - compressed_page_size: 20, - first_row_index: 100, - }, - ]; - let num_rows = 200; - - // interval partially intersects 2 pages (0 and 1) - let intervals = &[Interval::new(5, 6)]; - - let pages = select_pages(intervals, locations, num_rows).unwrap(); - - assert_eq!( - pages, - vec![ - FilteredPage { - start: 100, - length: 20, - selected_rows: vec![Interval::new(5, 5)], - num_rows: 10, - }, - FilteredPage { - start: 120, - length: 20, - selected_rows: vec![Interval::new(0, 1)], - num_rows: 90, - }, - FilteredPage { - start: 140, - length: 20, - selected_rows: vec![], - num_rows: 100 - }, - ] - ); - } - - #[test] - fn test_other_column2() { - let locations = &[ - PageLocation { - offset: 100, - compressed_page_size: 20, - first_row_index: 0, - }, - PageLocation { - offset: 120, - compressed_page_size: 20, - first_row_index: 10, - }, - PageLocation { - offset: 140, - compressed_page_size: 20, - first_row_index: 100, - }, - ]; - let num_rows = 200; - - // interval partially intersects 1 page (0) - let intervals = &[Interval::new(0, 1)]; - - let pages = select_pages(intervals, locations, num_rows).unwrap(); - - assert_eq!( - pages, - vec![ - FilteredPage { - start: 100, - length: 20, - selected_rows: vec![Interval::new(0, 1)], - num_rows: 10, - }, - FilteredPage { - start: 120, - length: 20, - selected_rows: vec![], - num_rows: 90 - }, - FilteredPage { - start: 140, - length: 20, - selected_rows: vec![], - num_rows: 100 - }, - ] - ); - } -} diff --git a/crates/polars-parquet/src/parquet/mod.rs b/crates/polars-parquet/src/parquet/mod.rs index f40b21ea0e04..ea6b5b2c8357 100644 --- a/crates/polars-parquet/src/parquet/mod.rs +++ b/crates/polars-parquet/src/parquet/mod.rs @@ -4,7 +4,6 @@ pub mod error; pub mod bloom_filter; pub mod compression; pub mod encoding; -pub mod indexes; pub mod metadata; pub mod page; mod parquet_bridge; diff --git a/crates/polars-parquet/src/parquet/page/mod.rs b/crates/polars-parquet/src/parquet/page/mod.rs index 62b3aa20163b..128f1af03c14 100644 --- a/crates/polars-parquet/src/parquet/page/mod.rs +++ b/crates/polars-parquet/src/parquet/page/mod.rs @@ -2,7 +2,6 @@ use super::CowBuffer; use crate::parquet::compression::Compression; use crate::parquet::encoding::{get_length, Encoding}; use crate::parquet::error::{ParquetError, ParquetResult}; -use crate::parquet::indexes::Interval; use crate::parquet::metadata::Descriptor; pub use crate::parquet::parquet_bridge::{DataPageHeaderExt, PageType}; use crate::parquet::statistics::Statistics; @@ -24,9 +23,7 @@ pub struct CompressedDataPage { pub(crate) compression: Compression, uncompressed_page_size: usize, pub(crate) descriptor: Descriptor, - - // The offset and length in rows - pub(crate) selected_rows: Option>, + pub num_rows: Option, } impl CompressedDataPage { @@ -37,16 +34,16 @@ impl CompressedDataPage { compression: Compression, uncompressed_page_size: usize, descriptor: Descriptor, - rows: Option, + num_rows: usize, ) -> Self { - Self::new_read( + Self { header, buffer, compression, uncompressed_page_size, descriptor, - rows.map(|x| vec![Interval::new(0, x)]), - ) + num_rows: Some(num_rows), + } } /// Returns a new [`CompressedDataPage`]. 
@@ -56,7 +53,6 @@ impl CompressedDataPage { compression: Compression, uncompressed_page_size: usize, descriptor: Descriptor, - selected_rows: Option>, ) -> Self { Self { header, @@ -64,7 +60,7 @@ impl CompressedDataPage { compression, uncompressed_page_size, descriptor, - selected_rows, + num_rows: None, } } @@ -87,16 +83,14 @@ impl CompressedDataPage { self.compression } - /// the rows to be selected by this page. - /// When `None`, all rows are to be considered. - pub fn selected_rows(&self) -> Option<&[Interval]> { - self.selected_rows.as_deref() - } - pub fn num_values(&self) -> usize { self.header.num_values() } + pub fn num_rows(&self) -> Option { + self.num_rows + } + /// Decodes the raw statistics into a statistics pub fn statistics(&self) -> Option> { match &self.header { @@ -111,11 +105,6 @@ impl CompressedDataPage { } } - #[inline] - pub fn select_rows(&mut self, selected_rows: Vec) { - self.selected_rows = Some(selected_rows); - } - pub fn slice_mut(&mut self) -> &mut CowBuffer { &mut self.buffer } @@ -143,7 +132,7 @@ pub struct DataPage { pub(super) header: DataPageHeader, pub(super) buffer: CowBuffer, pub descriptor: Descriptor, - pub selected_rows: Option>, + pub num_rows: Option, } impl DataPage { @@ -151,27 +140,26 @@ impl DataPage { header: DataPageHeader, buffer: CowBuffer, descriptor: Descriptor, - rows: Option, + num_rows: usize, ) -> Self { - Self::new_read( + Self { header, buffer, descriptor, - rows.map(|x| vec![Interval::new(0, x)]), - ) + num_rows: Some(num_rows), + } } pub(crate) fn new_read( header: DataPageHeader, buffer: CowBuffer, descriptor: Descriptor, - selected_rows: Option>, ) -> Self { Self { header, buffer, descriptor, - selected_rows, + num_rows: None, } } @@ -183,12 +171,6 @@ impl DataPage { &self.buffer } - /// the rows to be selected by this page. - /// When `None`, all rows are to be considered. - pub fn selected_rows(&self) -> Option<&[Interval]> { - self.selected_rows.as_deref() - } - /// Returns a mutable reference to the internal buffer. /// Useful to recover the buffer after the page has been decoded. pub fn buffer_mut(&mut self) -> &mut Vec { @@ -199,6 +181,10 @@ impl DataPage { self.header.num_values() } + pub fn num_rows(&self) -> Option { + self.num_rows + } + pub fn encoding(&self) -> Encoding { match &self.header { DataPageHeader::V1(d) => d.encoding(), @@ -300,10 +286,10 @@ impl CompressedPage { } } - pub(crate) fn selected_rows(&self) -> Option<&[Interval]> { + pub(crate) fn num_rows(&self) -> Option { match self { - CompressedPage::Data(page) => page.selected_rows(), - CompressedPage::Dict(_) => None, + CompressedPage::Data(page) => page.num_rows(), + CompressedPage::Dict(_) => Some(0), } } diff --git a/crates/polars-parquet/src/parquet/read/column/mod.rs b/crates/polars-parquet/src/parquet/read/column/mod.rs index 1a1277637f27..d6bcda08fe2d 100644 --- a/crates/polars-parquet/src/parquet/read/column/mod.rs +++ b/crates/polars-parquet/src/parquet/read/column/mod.rs @@ -1,4 +1,3 @@ -use std::io::{Read, Seek}; use std::vec::IntoIter; use super::{get_field_columns, get_page_iterator, MemReader, PageReader}; @@ -7,10 +6,6 @@ use crate::parquet::metadata::{ColumnChunkMetaData, RowGroupMetaData}; use crate::parquet::page::CompressedPage; use crate::parquet::schema::types::ParquetType; -#[cfg(feature = "async")] -#[cfg_attr(docsrs, doc(cfg(feature = "async")))] -mod stream; - /// Returns a [`ColumnIterator`] of column chunks corresponding to `field`. 
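// --- Illustrative sketch, not part of the diff above ---
// The page/mod.rs hunks replace the per-page `selected_rows: Option<Vec<Interval>>`
// with a plain `num_rows: Option<usize>`. A minimal, self-contained stand-in for that
// accounting, assuming (as the `CompressedPage::Dict(_) => Some(0)` arm above and the
// later row_group.rs hunk suggest) that dictionary pages contribute zero rows and that
// every written data page knows its row count; types here are simplified stand-ins,
// not the real polars-parquet structs.
enum Page {
    Data { num_rows: usize },
    Dict,
}

impl Page {
    fn num_rows(&self) -> Option<usize> {
        match self {
            Page::Data { num_rows } => Some(*num_rows),
            // Mirrors `CompressedPage::Dict(_) => Some(0)` in the hunk above.
            Page::Dict => Some(0),
        }
    }
}

// A writer can now total row-group rows by summing page counts instead of
// inspecting the last selected interval of each page.
fn row_group_rows(pages: &[Page]) -> usize {
    pages.iter().filter_map(Page::num_rows).sum()
}

fn main() {
    let pages = [Page::Dict, Page::Data { num_rows: 100 }, Page::Data { num_rows: 24 }];
    assert_eq!(row_group_rows(&pages), 124);
}
// --- end sketch ---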
/// /// Contrarily to [`get_page_iterator`] that returns a single iterator of pages, this iterator @@ -149,38 +144,3 @@ impl MutStreamingIterator for ReadColumnIterator { self.current.as_mut() } } - -/// Reads all columns that are part of the parquet field `field_name` -/// # Implementation -/// This operation is IO-bounded `O(C)` where C is the number of columns associated to -/// the field (one for non-nested types) -/// It reads the columns sequentially. Use [`read_column`] to fork this operation to multiple -/// readers. -pub fn read_columns<'a, R: Read + Seek>( - reader: &mut R, - columns: &'a [ColumnChunkMetaData], - field_name: &'a str, -) -> Result)>, ParquetError> { - get_field_columns(columns, field_name) - .map(|column| read_column(reader, column).map(|c| (column, c))) - .collect() -} - -/// Reads a column chunk into memory -/// This operation is IO-bounded and allocates the column's `compressed_size`. -pub fn read_column(reader: &mut R, column: &ColumnChunkMetaData) -> Result, ParquetError> -where - R: Read + Seek, -{ - let (start, length) = column.byte_range(); - reader.seek(std::io::SeekFrom::Start(start))?; - - let mut chunk = vec![]; - chunk.try_reserve(length as usize)?; - reader.by_ref().take(length).read_to_end(&mut chunk)?; - Ok(chunk) -} - -#[cfg(feature = "async")] -#[cfg_attr(docsrs, doc(cfg(feature = "async")))] -pub use stream::{read_column_async, read_columns_async}; diff --git a/crates/polars-parquet/src/parquet/read/column/stream.rs b/crates/polars-parquet/src/parquet/read/column/stream.rs deleted file mode 100644 index 63319d2260c6..000000000000 --- a/crates/polars-parquet/src/parquet/read/column/stream.rs +++ /dev/null @@ -1,51 +0,0 @@ -use futures::future::{try_join_all, BoxFuture}; -use futures::{AsyncRead, AsyncReadExt, AsyncSeek, AsyncSeekExt}; - -use crate::parquet::error::ParquetError; -use crate::parquet::metadata::ColumnChunkMetaData; -use crate::parquet::read::get_field_columns; - -/// Reads a single column chunk into memory asynchronously -pub async fn read_column_async<'b, R, F>( - factory: F, - meta: &ColumnChunkMetaData, -) -> Result, ParquetError> -where - R: AsyncRead + AsyncSeek + Send + Unpin, - F: Fn() -> BoxFuture<'b, std::io::Result>, -{ - let mut reader = factory().await?; - let (start, length) = meta.byte_range(); - reader.seek(std::io::SeekFrom::Start(start)).await?; - - let mut chunk = vec![]; - chunk.try_reserve(length as usize)?; - reader.take(length).read_to_end(&mut chunk).await?; - Result::Ok(chunk) -} - -/// Reads all columns that are part of the parquet field `field_name` -/// # Implementation -/// This operation is IO-bounded `O(C)` where C is the number of columns associated to -/// the field (one for non-nested types) -/// -/// It does so asynchronously via a single `join_all` over all the necessary columns for -/// `field_name`. 
-pub async fn read_columns_async< - 'a, - 'b, - R: AsyncRead + AsyncSeek + Send + Unpin, - F: Fn() -> BoxFuture<'b, std::io::Result> + Clone, ->( - factory: F, - columns: &'a [ColumnChunkMetaData], - field_name: &'a str, -) -> Result)>, ParquetError> { - let fields = get_field_columns(columns, field_name).collect::>(); - let futures = fields - .iter() - .map(|meta| async { read_column_async(factory.clone(), meta).await }); - - let columns = try_join_all(futures).await?; - Ok(fields.into_iter().zip(columns).collect()) -} diff --git a/crates/polars-parquet/src/parquet/read/compression.rs b/crates/polars-parquet/src/parquet/read/compression.rs index 0996093b31f0..a3d2db312ada 100644 --- a/crates/polars-parquet/src/parquet/read/compression.rs +++ b/crates/polars-parquet/src/parquet/read/compression.rs @@ -3,7 +3,9 @@ use parquet_format_safe::DataPageHeaderV2; use super::PageReader; use crate::parquet::compression::{self, Compression}; use crate::parquet::error::{ParquetError, ParquetResult}; -use crate::parquet::page::{CompressedPage, DataPage, DataPageHeader, DictPage, Page}; +use crate::parquet::page::{ + CompressedDataPage, CompressedPage, DataPage, DataPageHeader, DictPage, Page, +}; use crate::parquet::CowBuffer; fn decompress_v1( @@ -103,7 +105,6 @@ fn create_page(compressed_page: CompressedPage, buffer: Vec) -> Page { page.header, CowBuffer::Owned(buffer), page.descriptor, - page.selected_rows, )), CompressedPage::Dict(page) => Page::Dict(DictPage { buffer: CowBuffer::Owned(buffer), @@ -205,8 +206,27 @@ impl BasicDecompressor { } } +pub struct DataPageItem { + page: CompressedDataPage, +} + +impl DataPageItem { + pub fn num_values(&self) -> usize { + self.page.num_values() + } + + pub fn decompress(self, decompressor: &mut BasicDecompressor) -> ParquetResult { + let p = decompress(CompressedPage::Data(self.page), &mut decompressor.buffer)?; + let Page::Data(p) = p else { + panic!("Decompressing a data page should result in a data page"); + }; + + Ok(p) + } +} + impl Iterator for BasicDecompressor { - type Item = ParquetResult; + type Item = ParquetResult; fn next(&mut self) -> Option { let page = match self.reader.next() { @@ -215,15 +235,13 @@ impl Iterator for BasicDecompressor { Some(Ok(p)) => p, }; - Some(decompress(page, &mut self.buffer).and_then(|p| { - let Page::Data(p) = p else { - return Err(ParquetError::oos( - "Found dictionary page beyond the first page of a column chunk", - )); - }; + let CompressedPage::Data(page) = page else { + return Some(Err(ParquetError::oos( + "Found dictionary page beyond the first page of a column chunk", + ))); + }; - Ok(p) - })) + Some(Ok(DataPageItem { page })) } fn size_hint(&self) -> (usize, Option) { diff --git a/crates/polars-parquet/src/parquet/read/indexes/deserialize.rs b/crates/polars-parquet/src/parquet/read/indexes/deserialize.rs deleted file mode 100644 index d6bfb4de8a06..000000000000 --- a/crates/polars-parquet/src/parquet/read/indexes/deserialize.rs +++ /dev/null @@ -1,30 +0,0 @@ -use parquet_format_safe::thrift::protocol::TCompactInputProtocol; -use parquet_format_safe::ColumnIndex; - -use crate::parquet::error::ParquetError; -use crate::parquet::indexes::{BooleanIndex, ByteIndex, FixedLenByteIndex, Index, NativeIndex}; -use crate::parquet::schema::types::{PhysicalType, PrimitiveType}; - -pub fn deserialize( - data: &[u8], - primitive_type: PrimitiveType, -) -> Result, ParquetError> { - let mut prot = TCompactInputProtocol::new(data, data.len() * 2 + 1024); - - let index = ColumnIndex::read_from_in_protocol(&mut prot)?; - - let 
index = match primitive_type.physical_type { - PhysicalType::Boolean => Box::new(BooleanIndex::try_new(index)?) as Box, - PhysicalType::Int32 => Box::new(NativeIndex::::try_new(index, primitive_type)?), - PhysicalType::Int64 => Box::new(NativeIndex::::try_new(index, primitive_type)?), - PhysicalType::Int96 => Box::new(NativeIndex::<[u32; 3]>::try_new(index, primitive_type)?), - PhysicalType::Float => Box::new(NativeIndex::::try_new(index, primitive_type)?), - PhysicalType::Double => Box::new(NativeIndex::::try_new(index, primitive_type)?), - PhysicalType::ByteArray => Box::new(ByteIndex::try_new(index, primitive_type)?), - PhysicalType::FixedLenByteArray(_) => { - Box::new(FixedLenByteIndex::try_new(index, primitive_type)?) - }, - }; - - Ok(index) -} diff --git a/crates/polars-parquet/src/parquet/read/indexes/mod.rs b/crates/polars-parquet/src/parquet/read/indexes/mod.rs deleted file mode 100644 index 1e1919c84c75..000000000000 --- a/crates/polars-parquet/src/parquet/read/indexes/mod.rs +++ /dev/null @@ -1,4 +0,0 @@ -mod deserialize; -mod read; - -pub use read::*; diff --git a/crates/polars-parquet/src/parquet/read/indexes/read.rs b/crates/polars-parquet/src/parquet/read/indexes/read.rs deleted file mode 100644 index 1dbb5aa20fde..000000000000 --- a/crates/polars-parquet/src/parquet/read/indexes/read.rs +++ /dev/null @@ -1,134 +0,0 @@ -use std::io::{Cursor, Read, Seek, SeekFrom}; - -use parquet_format_safe::thrift::protocol::TCompactInputProtocol; -use parquet_format_safe::{ColumnChunk, OffsetIndex, PageLocation}; - -use super::deserialize::deserialize; -use crate::parquet::error::ParquetError; -use crate::parquet::indexes::Index; -use crate::parquet::metadata::ColumnChunkMetaData; - -fn prepare_read Option, G: Fn(&ColumnChunk) -> Option>( - chunks: &[ColumnChunkMetaData], - get_offset: F, - get_length: G, -) -> Result<(u64, Vec), ParquetError> { - // c1: [start, length] - // ... - // cN: [start, length] - - let first_chunk = if let Some(chunk) = chunks.first() { - chunk - } else { - return Ok((0, vec![])); - }; - let metadata = first_chunk.column_chunk(); - - let offset: u64 = if let Some(offset) = get_offset(metadata) { - offset.try_into()? - } else { - return Ok((0, vec![])); - }; - - let lengths = chunks - .iter() - .map(|x| get_length(x.column_chunk())) - .map(|maybe_length| { - let index_length = maybe_length.ok_or_else(|| { - ParquetError::oos("The column length must exist if column offset exists") - })?; - - Ok(index_length.try_into()?) - }) - .collect::, ParquetError>>()?; - - Ok((offset, lengths)) -} - -fn prepare_column_index_read( - chunks: &[ColumnChunkMetaData], -) -> Result<(u64, Vec), ParquetError> { - prepare_read(chunks, |x| x.column_index_offset, |x| x.column_index_length) -} - -fn prepare_offset_index_read( - chunks: &[ColumnChunkMetaData], -) -> Result<(u64, Vec), ParquetError> { - prepare_read(chunks, |x| x.offset_index_offset, |x| x.offset_index_length) -} - -fn deserialize_column_indexes( - chunks: &[ColumnChunkMetaData], - data: &[u8], - lengths: Vec, -) -> Result>, ParquetError> { - let mut start = 0; - let data = lengths.into_iter().map(|length| { - let r = &data[start..start + length]; - start += length; - r - }); - - chunks - .iter() - .zip(data) - .map(|(chunk, data)| { - let primitive_type = chunk.descriptor().descriptor.primitive_type.clone(); - deserialize(data, primitive_type) - }) - .collect() -} - -/// Reads the column indexes of all [`ColumnChunkMetaData`] and deserializes them into [`Index`]. 
-/// Returns an empty vector if indexes are not available -pub fn read_columns_indexes( - reader: &mut R, - chunks: &[ColumnChunkMetaData], -) -> Result>, ParquetError> { - let (offset, lengths) = prepare_column_index_read(chunks)?; - - let length = lengths.iter().sum::(); - - reader.seek(SeekFrom::Start(offset))?; - - let mut data = vec![]; - data.try_reserve(length)?; - reader.by_ref().take(length as u64).read_to_end(&mut data)?; - - deserialize_column_indexes(chunks, &data, lengths) -} - -fn deserialize_page_locations( - data: &[u8], - column_number: usize, -) -> Result>, ParquetError> { - let len = data.len() * 2 + 1024; - let mut reader = Cursor::new(data); - - (0..column_number) - .map(|_| { - let mut prot = TCompactInputProtocol::new(&mut reader, len); - let offset = OffsetIndex::read_from_in_protocol(&mut prot)?; - Ok(offset.page_locations) - }) - .collect() -} - -/// Read [`PageLocation`]s from the [`ColumnChunkMetaData`]s. -/// Returns an empty vector if indexes are not available -pub fn read_pages_locations( - reader: &mut R, - chunks: &[ColumnChunkMetaData], -) -> Result>, ParquetError> { - let (offset, lengths) = prepare_offset_index_read(chunks)?; - - let length = lengths.iter().sum::(); - - reader.seek(SeekFrom::Start(offset))?; - - let mut data = vec![]; - data.try_reserve(length)?; - reader.by_ref().take(length as u64).read_to_end(&mut data)?; - - deserialize_page_locations(&data, chunks.len()) -} diff --git a/crates/polars-parquet/src/parquet/read/mod.rs b/crates/polars-parquet/src/parquet/read/mod.rs index e3426a38dc3c..ffd1534f928c 100644 --- a/crates/polars-parquet/src/parquet/read/mod.rs +++ b/crates/polars-parquet/src/parquet/read/mod.rs @@ -1,6 +1,5 @@ mod column; mod compression; -mod indexes; pub mod levels; mod metadata; mod page; @@ -11,7 +10,6 @@ use std::io::{Seek, SeekFrom}; pub use column::*; pub use compression::{decompress, BasicDecompressor}; -pub use indexes::{read_columns_indexes, read_pages_locations}; pub use metadata::{deserialize_metadata, read_metadata, read_metadata_with_size}; #[cfg(feature = "async")] pub use page::{get_page_stream, get_page_stream_from_column_start}; diff --git a/crates/polars-parquet/src/parquet/read/page/reader.rs b/crates/polars-parquet/src/parquet/read/page/reader.rs index dcc94d51dec3..f01cf55c4a8e 100644 --- a/crates/polars-parquet/src/parquet/read/page/reader.rs +++ b/crates/polars-parquet/src/parquet/read/page/reader.rs @@ -7,7 +7,6 @@ use polars_utils::mmap::{MemReader, MemSlice}; use super::PageIterator; use crate::parquet::compression::Compression; use crate::parquet::error::{ParquetError, ParquetResult}; -use crate::parquet::indexes::Interval; use crate::parquet::metadata::{ColumnChunkMetaData, Descriptor}; use crate::parquet::page::{ CompressedDataPage, CompressedDictPage, CompressedPage, DataPageHeader, PageType, @@ -58,6 +57,7 @@ impl From<&ColumnChunkMetaData> for PageMetaData { /// A fallible [`Iterator`] of [`CompressedDataPage`]. This iterator reads pages back /// to back until all pages have been consumed. +/// /// The pages from this iterator always have [`None`] [`crate::parquet::page::CompressedDataPage::selected_rows()`] since /// filter pushdown is not supported without a /// pre-computed [page index](https://github.com/apache/parquet-format/blob/master/PageIndex.md). 
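// --- Illustrative sketch, not part of the diff above ---
// The doc comment above refers to the filter-pushdown path this PR removes. The deleted
// `is_in` helper from intervals.rs (earlier in this diff) clipped the globally selected
// row intervals against a page's own row range. A self-contained restatement of that
// logic, with `Interval` as a local stand-in for the removed type:
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
struct Interval {
    start: usize,
    length: usize,
}

fn is_in(probe: Interval, intervals: &[Interval]) -> Vec<Interval> {
    intervals
        .iter()
        .filter_map(|interval| {
            let interval_end = interval.start + interval.length;
            let probe_end = probe.start + probe.length;
            // Keep only the overlapping part, re-based to the start of the page (`probe`).
            if probe.start < interval_end && probe_end > interval.start {
                let start = interval.start.max(probe.start);
                let end = interval_end.min(probe_end);
                Some(Interval {
                    start: start - probe.start,
                    length: end - start,
                })
            } else {
                None
            }
        })
        .collect()
}

fn main() {
    // A page covering rows 10..20 with rows 5..17 selected overall keeps its first 7 rows.
    let page = Interval { start: 10, length: 10 };
    let selected = [Interval { start: 5, length: 12 }];
    assert_eq!(is_in(page, &selected), vec![Interval { start: 0, length: 7 }]);
}
// --- end sketch ---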
@@ -159,14 +159,7 @@ impl PageReader { )); } - finish_page( - page_header, - buffer, - self.compression, - &self.descriptor, - None, - ) - .map(|p| { + finish_page(page_header, buffer, self.compression, &self.descriptor).map(|p| { if let CompressedPage::Dict(d) = p { Some(d) } else { @@ -234,14 +227,7 @@ pub(super) fn build_page(reader: &mut PageReader) -> ParquetResult>, ) -> ParquetResult { let type_ = page_header.type_.try_into()?; let uncompressed_page_size = page_header.uncompressed_page_size.try_into()?; @@ -302,7 +287,6 @@ pub(super) fn finish_page( compression, uncompressed_page_size, descriptor.clone(), - selected_rows, ))) }, PageType::DataPageV2 => { @@ -325,7 +309,6 @@ pub(super) fn finish_page( compression, uncompressed_page_size, descriptor.clone(), - selected_rows, ))) }, } diff --git a/crates/polars-parquet/src/parquet/read/page/stream.rs b/crates/polars-parquet/src/parquet/read/page/stream.rs index bc1ccb32880e..0101196f3752 100644 --- a/crates/polars-parquet/src/parquet/read/page/stream.rs +++ b/crates/polars-parquet/src/parquet/read/page/stream.rs @@ -100,7 +100,6 @@ fn _get_page_stream( MemSlice::from_vec(std::mem::take(&mut scratch)), compression, &descriptor, - None, )?; } } diff --git a/crates/polars-parquet/src/parquet/schema/io_message/from_message.rs b/crates/polars-parquet/src/parquet/schema/io_message/from_message.rs index 36da3d5edcd1..3098241d8425 100644 --- a/crates/polars-parquet/src/parquet/schema/io_message/from_message.rs +++ b/crates/polars-parquet/src/parquet/schema/io_message/from_message.rs @@ -158,9 +158,11 @@ fn type_from_str(s: &str) -> ParquetResult { } } -/// Parses message type as string into a Parquet [`ParquetType`](crate::parquet::schema::types::ParquetType) -/// which, for example, could be used to extract individual columns. Returns Parquet -/// general error when parsing or validation fails. +/// Parses message type as string into a Parquet [`ParquetType`](crate::parquet::schema::types::ParquetType). +/// +/// This could, for example, be used to extract individual columns. +/// +/// Returns Parquet general error when parsing or validation fails. 
pub fn from_message(message_type: &str) -> ParquetResult { let mut parser = Parser { tokenizer: &mut Tokenizer::from_str(message_type), diff --git a/crates/polars-parquet/src/parquet/write/compression.rs b/crates/polars-parquet/src/parquet/write/compression.rs index 1c7d4d36a901..04d01a6e34bc 100644 --- a/crates/polars-parquet/src/parquet/write/compression.rs +++ b/crates/polars-parquet/src/parquet/write/compression.rs @@ -16,9 +16,10 @@ fn compress_data( mut buffer, header, descriptor, - selected_rows, + num_rows, } = page; let uncompressed_page_size = buffer.len(); + let num_rows = num_rows.expect("We should have num_rows when we are writing"); if compression != CompressionOptions::Uncompressed { match &header { DataPageHeader::V1(_) => { @@ -40,13 +41,13 @@ fn compress_data( std::mem::swap(buffer.to_mut(), &mut compressed_buffer); } - Ok(CompressedDataPage::new_read( + Ok(CompressedDataPage::new( header, CowBuffer::Owned(compressed_buffer), compression.into(), uncompressed_page_size, descriptor, - selected_rows, + num_rows, )) } diff --git a/crates/polars-parquet/src/parquet/write/indexes/serialize.rs b/crates/polars-parquet/src/parquet/write/indexes/serialize.rs index 8b3cebec1686..14594bc2b8c4 100644 --- a/crates/polars-parquet/src/parquet/write/indexes/serialize.rs +++ b/crates/polars-parquet/src/parquet/write/indexes/serialize.rs @@ -62,11 +62,7 @@ pub fn serialize_offset_index(pages: &[PageWriteSpec]) -> ParquetResult, + /// The number of actual rows. For non-nested values, this is equal to the number of values. + pub num_rows: usize, pub header_size: u64, pub offset: u64, pub bytes_written: u64, @@ -55,7 +56,9 @@ pub fn write_page( compressed_page: &CompressedPage, ) -> ParquetResult { let num_values = compressed_page.num_values(); - let selected_rows = compressed_page.selected_rows(); + let num_rows = compressed_page + .num_rows() + .expect("We should have num_rows when we are writing"); let header = match &compressed_page { CompressedPage::Data(compressed_page) => assemble_data_page_header(compressed_page), @@ -88,8 +91,8 @@ pub fn write_page( bytes_written, compression: compressed_page.compression(), statistics, - num_rows: selected_rows.map(|x| x.last().unwrap().length), num_values, + num_rows, }) } @@ -101,7 +104,9 @@ pub async fn write_page_async( compressed_page: &CompressedPage, ) -> ParquetResult { let num_values = compressed_page.num_values(); - let selected_rows = compressed_page.selected_rows(); + let num_rows = compressed_page + .num_rows() + .expect("We should have the num_rows when we are writing"); let header = match &compressed_page { CompressedPage::Data(compressed_page) => assemble_data_page_header(compressed_page), @@ -134,7 +139,7 @@ pub async fn write_page_async( bytes_written, compression: compressed_page.compression(), statistics, - num_rows: selected_rows.map(|x| x.last().unwrap().length), + num_rows, num_values, }) } diff --git a/crates/polars-parquet/src/parquet/write/row_group.rs b/crates/polars-parquet/src/parquet/write/row_group.rs index e5c535055ea6..68c25a9c40fb 100644 --- a/crates/polars-parquet/src/parquet/write/row_group.rs +++ b/crates/polars-parquet/src/parquet/write/row_group.rs @@ -58,9 +58,7 @@ fn compute_num_rows(columns: &[(ColumnChunk, Vec)]) -> ParquetRes .iter() .filter(|x| is_data_page(x)) .try_for_each(|spec| { - num_rows += spec.num_rows.ok_or_else(|| { - ParquetError::oos("All data pages must declare the number of rows on it") - })? 
as i64; + num_rows += spec.num_rows as i64; ParquetResult::Ok(()) })?; ParquetResult::Ok(num_rows) diff --git a/crates/polars-pipe/src/executors/sources/csv.rs b/crates/polars-pipe/src/executors/sources/csv.rs index 383803f816ea..5155d7bdfcff 100644 --- a/crates/polars-pipe/src/executors/sources/csv.rs +++ b/crates/polars-pipe/src/executors/sources/csv.rs @@ -211,7 +211,7 @@ impl Source for CsvSource { if let Some(ca) = &mut self.include_file_path { if ca.len() < max_height { - *ca = ca.new_from_index(max_height, 0); + *ca = ca.new_from_index(0, max_height); }; for data_chunk in &mut out { diff --git a/crates/polars-pipe/src/executors/sources/parquet.rs b/crates/polars-pipe/src/executors/sources/parquet.rs index a897df0c2478..cd0cb58f3574 100644 --- a/crates/polars-pipe/src/executors/sources/parquet.rs +++ b/crates/polars-pipe/src/executors/sources/parquet.rs @@ -1,8 +1,10 @@ use std::collections::VecDeque; use std::ops::Range; use std::path::PathBuf; +use std::sync::atomic::AtomicUsize; use std::sync::Arc; +use futures::{StreamExt, TryStreamExt}; use polars_core::config::{self, get_file_prefetch_size}; use polars_core::error::*; use polars_core::prelude::Series; @@ -32,7 +34,7 @@ pub struct ParquetSource { batched_readers: VecDeque, n_threads: usize, processed_paths: usize, - processed_rows: usize, + processed_rows: AtomicUsize, iter: Range, paths: Arc>, options: ParquetOptions, @@ -110,11 +112,13 @@ impl ParquetSource { } fn init_reader_sync(&mut self) -> PolarsResult<()> { + use std::sync::atomic::Ordering; + let Some(index) = self.iter.next() else { return Ok(()); }; if let Some(slice) = self.file_options.slice { - if self.processed_rows >= slice.0 as usize + slice.1 { + if self.processed_rows.load(Ordering::Relaxed) >= slice.0 as usize + slice.1 { return Ok(()); } } @@ -147,20 +151,22 @@ impl ParquetSource { ); let n_rows_this_file = reader.num_rows().unwrap(); + let current_row_offset = self + .processed_rows + .fetch_add(n_rows_this_file, Ordering::Relaxed); let slice = file_options.slice.map(|slice| { assert!(slice.0 >= 0); let slice_start = slice.0 as usize; let slice_end = slice_start + slice.1; split_slice_at_file( - &mut self.processed_rows.clone(), + &mut current_row_offset.clone(), n_rows_this_file, slice_start, slice_end, ) }); - self.processed_rows += n_rows_this_file; reader = reader.with_slice(slice); reader.batched(chunk_size)? }; @@ -174,42 +180,64 @@ impl ParquetSource { Ok(()) } + /// This function must NOT be run concurrently if there is a slice (or any operation that + /// requires `self.processed_rows` to be incremented in the correct order), as it does not + /// coordinate to increment the row offset in a properly ordered manner. #[cfg(feature = "async")] async fn init_reader_async(&self, index: usize) -> PolarsResult { + use std::sync::atomic::Ordering; + let metadata = self.metadata.clone(); let predicate = self.predicate.clone(); let cloud_options = self.cloud_options.clone(); let (path, options, file_options, projection, chunk_size, hive_partitions) = self.prepare_init_reader(index)?; - assert_eq!(file_options.slice, None); - let batched_reader = { let uri = path.to_string_lossy(); - ParquetAsyncReader::from_uri(&uri, cloud_options.as_ref(), metadata) - .await? - .with_row_index(file_options.row_index) - .with_projection(projection) - .check_schema( - self.file_info - .reader_schema - .as_ref() - .unwrap() - .as_ref() - .unwrap_left(), - ) - .await? 
- .with_predicate(predicate.clone()) - .use_statistics(options.use_statistics) - .with_hive_partition_columns(hive_partitions) - .with_include_file_path( - self.file_options - .include_file_paths - .as_ref() - .map(|x| (x.clone(), Arc::from(path.to_str().unwrap()))), + + let mut async_reader = + ParquetAsyncReader::from_uri(&uri, cloud_options.as_ref(), metadata) + .await? + .with_row_index(file_options.row_index) + .with_projection(projection) + .check_schema( + self.file_info + .reader_schema + .as_ref() + .unwrap() + .as_ref() + .unwrap_left(), + ) + .await? + .with_predicate(predicate.clone()) + .use_statistics(options.use_statistics) + .with_hive_partition_columns(hive_partitions) + .with_include_file_path( + self.file_options + .include_file_paths + .as_ref() + .map(|x| (x.clone(), Arc::from(path.to_str().unwrap()))), + ); + + let n_rows_this_file = async_reader.num_rows().await?; + let current_row_offset = self + .processed_rows + .fetch_add(n_rows_this_file, Ordering::Relaxed); + + let slice = file_options.slice.map(|slice| { + assert!(slice.0 >= 0); + let slice_start = slice.0 as usize; + let slice_end = slice_start + slice.1; + split_slice_at_file( + &mut current_row_offset.clone(), + n_rows_this_file, + slice_start, + slice_end, ) - .batched(chunk_size) - .await? + }); + + async_reader.with_slice(slice).batched(chunk_size).await? }; Ok(batched_reader) } @@ -241,7 +269,7 @@ impl ParquetSource { batched_readers: VecDeque::new(), n_threads, processed_paths: 0, - processed_rows: 0, + processed_rows: AtomicUsize::new(0), options, file_options, iter, @@ -269,29 +297,36 @@ impl ParquetSource { // // It is important we do this for a reasonable batch size, that's why we start this when we // have just 2 readers left. - if self.file_options.slice.is_none() - && self.run_async - && (self.batched_readers.len() <= 2 || self.batched_readers.is_empty()) - { + if self.run_async { #[cfg(not(feature = "async"))] panic!("activate 'async' feature"); #[cfg(feature = "async")] { - let range = 0..self.prefetch_size - self.batched_readers.len(); - let range = range - .zip(&mut self.iter) - .map(|(_, index)| index) - .collect::>(); - let init_iter = range.into_iter().map(|index| self.init_reader_async(index)); - - let batched_readers = - polars_io::pl_async::get_runtime().block_on_potential_spawn(async { - futures::future::try_join_all(init_iter).await - })?; - - for r in batched_readers { - self.finish_init_reader(r)?; + if self.batched_readers.len() <= 2 || self.batched_readers.is_empty() { + let range = 0..self.prefetch_size - self.batched_readers.len(); + let range = range + .zip(&mut self.iter) + .map(|(_, index)| index) + .collect::>(); + let init_iter = range.into_iter().map(|index| self.init_reader_async(index)); + + let batched_readers = if self.file_options.slice.is_some() { + polars_io::pl_async::get_runtime().block_on_potential_spawn(async { + futures::stream::iter(init_iter) + .then(|x| x) + .try_collect() + .await + })? + } else { + polars_io::pl_async::get_runtime().block_on_potential_spawn(async { + futures::future::try_join_all(init_iter).await + })? 
+ }; + + for r in batched_readers { + self.finish_init_reader(r)?; + } } } } else { diff --git a/crates/polars-plan/Cargo.toml b/crates/polars-plan/Cargo.toml index 28b867c8580c..5d1cdc79ab15 100644 --- a/crates/polars-plan/Cargo.toml +++ b/crates/polars-plan/Cargo.toml @@ -184,6 +184,7 @@ offset_by = ["polars-time/offset_by"] bigidx = ["polars-core/bigidx"] polars_cloud = ["serde", "ciborium"] +ir_serde = ["serde", "polars-utils/ir_serde"] panic_on_schema = [] diff --git a/crates/polars-plan/src/client/mod.rs b/crates/polars-plan/src/client/mod.rs index a815babcc6ad..f5a5cdb0f763 100644 --- a/crates/polars-plan/src/client/mod.rs +++ b/crates/polars-plan/src/client/mod.rs @@ -1,38 +1,18 @@ mod check; -use std::sync::Arc; +use arrow::legacy::error::to_compute_err; +use polars_core::error::PolarsResult; -use polars_core::error::{polars_ensure, polars_err, PolarsResult}; -use polars_io::parquet::write::ParquetWriteOptions; -use polars_io::path_utils::is_cloud_url; - -use crate::plans::options::{FileType, SinkType}; use crate::plans::DslPlan; /// Prepare the given [`DslPlan`] for execution on Polars Cloud. -pub fn prepare_cloud_plan(dsl: DslPlan, uri: String) -> PolarsResult> { +pub fn prepare_cloud_plan(dsl: DslPlan) -> PolarsResult> { // Check the plan for cloud eligibility. check::assert_cloud_eligible(&dsl)?; - // Add Sink node. - polars_ensure!( - is_cloud_url(&uri), - InvalidOperation: "non-cloud paths not supported: {uri}" - ); - let sink_type = SinkType::Cloud { - uri: Arc::new(uri), - file_type: FileType::Parquet(ParquetWriteOptions::default()), - cloud_options: None, - }; - let dsl = DslPlan::Sink { - input: Arc::new(dsl), - payload: sink_type, - }; - // Serialize the plan. let mut writer = Vec::new(); - ciborium::into_writer(&dsl, &mut writer) - .map_err(|err| polars_err!(ComputeError: err.to_string()))?; + ciborium::into_writer(&dsl, &mut writer).map_err(to_compute_err)?; Ok(writer) } diff --git a/crates/polars-plan/src/dsl/expr.rs b/crates/polars-plan/src/dsl/expr.rs index 089813e494a8..da6accf80c6f 100644 --- a/crates/polars-plan/src/dsl/expr.rs +++ b/crates/polars-plan/src/dsl/expr.rs @@ -61,9 +61,11 @@ impl AsRef for AggExpr { } } -/// Expressions that can be used in various contexts. Queries consist of multiple expressions. When using the polars -/// lazy API, don't construct an `Expr` directly; instead, create one using the functions in the `polars_lazy::dsl` -/// module. See that module's docs for more info. +/// Expressions that can be used in various contexts. +/// +/// Queries consist of multiple expressions. +/// When using the polars lazy API, don't construct an `Expr` directly; instead, create one using +/// the functions in the `polars_lazy::dsl` module. See that module's docs for more info. 
#[derive(Clone, PartialEq)] #[must_use] #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] diff --git a/crates/polars-plan/src/dsl/function_expr/datetime.rs b/crates/polars-plan/src/dsl/function_expr/datetime.rs index 8e2db0ec8134..604c915c817a 100644 --- a/crates/polars-plan/src/dsl/function_expr/datetime.rs +++ b/crates/polars-plan/src/dsl/function_expr/datetime.rs @@ -311,24 +311,31 @@ pub(super) fn microsecond(s: &Series) -> PolarsResult { pub(super) fn nanosecond(s: &Series) -> PolarsResult { s.nanosecond().map(|ca| ca.into_series()) } +#[cfg(feature = "dtype-duration")] pub(super) fn total_days(s: &Series) -> PolarsResult { s.duration().map(|ca| ca.days().into_series()) } +#[cfg(feature = "dtype-duration")] pub(super) fn total_hours(s: &Series) -> PolarsResult { s.duration().map(|ca| ca.hours().into_series()) } +#[cfg(feature = "dtype-duration")] pub(super) fn total_minutes(s: &Series) -> PolarsResult { s.duration().map(|ca| ca.minutes().into_series()) } +#[cfg(feature = "dtype-duration")] pub(super) fn total_seconds(s: &Series) -> PolarsResult { s.duration().map(|ca| ca.seconds().into_series()) } +#[cfg(feature = "dtype-duration")] pub(super) fn total_milliseconds(s: &Series) -> PolarsResult { s.duration().map(|ca| ca.milliseconds().into_series()) } +#[cfg(feature = "dtype-duration")] pub(super) fn total_microseconds(s: &Series) -> PolarsResult { s.duration().map(|ca| ca.microseconds().into_series()) } +#[cfg(feature = "dtype-duration")] pub(super) fn total_nanoseconds(s: &Series) -> PolarsResult { s.duration().map(|ca| ca.nanoseconds().into_series()) } diff --git a/crates/polars-plan/src/dsl/functions/correlation.rs b/crates/polars-plan/src/dsl/functions/correlation.rs index bb0fc5aa3cf1..dd7521ad20a9 100644 --- a/crates/polars-plan/src/dsl/functions/correlation.rs +++ b/crates/polars-plan/src/dsl/functions/correlation.rs @@ -79,11 +79,15 @@ pub fn rolling_corr(x: Expr, y: Expr, options: RollingCovOptions) -> Expr { ..Default::default() }; + let non_null_mask = when(x.clone().is_not_null().and(y.clone().is_not_null())) + .then(lit(1.0)) + .otherwise(lit(Null {})); + let mean_x_y = (x.clone() * y.clone()).rolling_mean(rolling_options.clone()); - let mean_x = x.clone().rolling_mean(rolling_options.clone()); - let mean_y = y.clone().rolling_mean(rolling_options.clone()); - let var_x = x.clone().rolling_var(rolling_options.clone()); - let var_y = y.clone().rolling_var(rolling_options); + let mean_x = (x.clone() * non_null_mask.clone()).rolling_mean(rolling_options.clone()); + let mean_y = (y.clone() * non_null_mask.clone()).rolling_mean(rolling_options.clone()); + let var_x = (x.clone() * non_null_mask.clone()).rolling_var(rolling_options.clone()); + let var_y = (y.clone() * non_null_mask.clone()).rolling_var(rolling_options); let rolling_options_count = RollingOptionsFixedWindow { window_size: options.window_size as usize, @@ -110,9 +114,13 @@ pub fn rolling_cov(x: Expr, y: Expr, options: RollingCovOptions) -> Expr { ..Default::default() }; + let non_null_mask = when(x.clone().is_not_null().and(y.clone().is_not_null())) + .then(lit(1.0)) + .otherwise(lit(Null {})); + let mean_x_y = (x.clone() * y.clone()).rolling_mean(rolling_options.clone()); - let mean_x = x.clone().rolling_mean(rolling_options.clone()); - let mean_y = y.clone().rolling_mean(rolling_options); + let mean_x = (x.clone() * non_null_mask.clone()).rolling_mean(rolling_options.clone()); + let mean_y = (y.clone() * non_null_mask.clone()).rolling_mean(rolling_options); let rolling_options_count = 
RollingOptionsFixedWindow { window_size: options.window_size as usize, min_periods: 0, diff --git a/crates/polars-plan/src/dsl/functions/index.rs b/crates/polars-plan/src/dsl/functions/index.rs index d125ce571307..7a452a033245 100644 --- a/crates/polars-plan/src/dsl/functions/index.rs +++ b/crates/polars-plan/src/dsl/functions/index.rs @@ -1,6 +1,7 @@ use super::*; /// Find the indexes that would sort these series in order of appearance. +/// /// That means that the first `Series` will be used to determine the ordering /// until duplicates are found. Once duplicates are found, the next `Series` will /// be used and so on. diff --git a/crates/polars-plan/src/dsl/functions/repeat.rs b/crates/polars-plan/src/dsl/functions/repeat.rs index 1b32abc97b5a..9da42c36242f 100644 --- a/crates/polars-plan/src/dsl/functions/repeat.rs +++ b/crates/polars-plan/src/dsl/functions/repeat.rs @@ -1,8 +1,9 @@ use super::*; -/// Create a column of length `n` containing `n` copies of the literal `value`. Generally you won't need this function, -/// as `lit(value)` already represents a column containing only `value` whose length is automatically set to the correct -/// number of rows. +/// Create a column of length `n` containing `n` copies of the literal `value`. +/// +/// Generally you won't need this function, as `lit(value)` already represents a column containing +/// only `value` whose length is automatically set to the correct number of rows. pub fn repeat>(value: E, n: Expr) -> Expr { let function = |s: Series, n: Series| { polars_ensure!( diff --git a/crates/polars-plan/src/dsl/mod.rs b/crates/polars-plan/src/dsl/mod.rs index 16c1fcb6fd85..166c7f5e7962 100644 --- a/crates/polars-plan/src/dsl/mod.rs +++ b/crates/polars-plan/src/dsl/mod.rs @@ -1707,12 +1707,20 @@ impl Expr { /// Get maximal value that could be hold by this dtype. pub fn upper_bound(self) -> Expr { - self.map_private(FunctionExpr::UpperBound) + self.apply_private(FunctionExpr::UpperBound) + .with_function_options(|mut options| { + options.flags |= FunctionFlags::RETURNS_SCALAR; + options + }) } /// Get minimal value that could be hold by this dtype. pub fn lower_bound(self) -> Expr { - self.map_private(FunctionExpr::LowerBound) + self.apply_private(FunctionExpr::LowerBound) + .with_function_options(|mut options| { + options.flags |= FunctionFlags::RETURNS_SCALAR; + options + }) } pub fn reshape(self, dimensions: &[i64], nested_type: NestedType) -> Self { diff --git a/crates/polars-plan/src/plans/aexpr/mod.rs b/crates/polars-plan/src/plans/aexpr/mod.rs index 9e14563303b7..49e4a94a62a0 100644 --- a/crates/polars-plan/src/plans/aexpr/mod.rs +++ b/crates/polars-plan/src/plans/aexpr/mod.rs @@ -11,6 +11,8 @@ use polars_core::chunked_array::cast::CastOptions; use polars_core::prelude::*; use polars_core::utils::{get_time_units, try_get_supertype}; use polars_utils::arena::{Arena, Node}; +#[cfg(feature = "ir_serde")] +use serde::{Deserialize, Serialize}; use strum_macros::IntoStaticStr; pub use utils::*; @@ -19,6 +21,7 @@ use crate::plans::Context; use crate::prelude::*; #[derive(Clone, Debug, IntoStaticStr)] +#[cfg_attr(feature = "ir_serde", derive(Serialize, Deserialize))] pub enum IRAggExpr { Min { input: Node, @@ -125,6 +128,7 @@ impl From for GroupByMethod { /// IR expression node that is allocated in an [`Arena`][polars_utils::arena::Arena]. 
#[derive(Clone, Debug, Default)] +#[cfg_attr(feature = "ir_serde", derive(Serialize, Deserialize))] pub enum AExpr { Explode(Node), Alias(Node, ColumnName), @@ -164,6 +168,7 @@ pub enum AExpr { truthy: Node, falsy: Node, }, + #[cfg_attr(feature = "ir_serde", serde(skip))] AnonymousFunction { input: Vec, function: SpecialEq>, diff --git a/crates/polars-plan/src/plans/builder_dsl.rs b/crates/polars-plan/src/plans/builder_dsl.rs index 82fa483643d7..849be4a6e4ea 100644 --- a/crates/polars-plan/src/plans/builder_dsl.rs +++ b/crates/polars-plan/src/plans/builder_dsl.rs @@ -346,10 +346,13 @@ impl DslBuilder { .into() } - pub fn explode(self, columns: Vec) -> Self { + pub fn explode(self, columns: Vec, allow_empty: bool) -> Self { DslPlan::MapFunction { input: Arc::new(self.0), - function: DslFunction::Explode { columns }, + function: DslFunction::Explode { + columns, + allow_empty, + }, } .into() } @@ -442,7 +445,7 @@ impl DslBuilder { function: F, optimizations: AllowedOptimizations, schema: Option>, - name: &'static str, + name: &str, ) -> Self where F: DataFrameUdf + 'static, @@ -457,7 +460,7 @@ impl DslBuilder { predicate_pd: optimizations.contains(OptState::PREDICATE_PUSHDOWN), projection_pd: optimizations.contains(OptState::PROJECTION_PUSHDOWN), streamable: optimizations.contains(OptState::STREAMING), - fmt_str: name, + fmt_str: name.into(), }), } .into() diff --git a/crates/polars-plan/src/plans/conversion/dsl_to_ir.rs b/crates/polars-plan/src/plans/conversion/dsl_to_ir.rs index 8a92d90ffa1c..08b3a8b66d1e 100644 --- a/crates/polars-plan/src/plans/conversion/dsl_to_ir.rs +++ b/crates/polars-plan/src/plans/conversion/dsl_to_ir.rs @@ -636,6 +636,23 @@ pub fn to_alp_impl( let input_schema = lp_arena.get(input).schema(lp_arena); match function { + DslFunction::Explode { + columns, + allow_empty, + } => { + let columns = expand_selectors(columns, &input_schema, &[])?; + validate_columns_in_input(&columns, &input_schema, "explode")?; + polars_ensure!(!columns.is_empty() || allow_empty, InvalidOperation: "no columns provided in explode"); + if columns.is_empty() { + return Ok(input); + } + let function = FunctionIR::Explode { + columns, + schema: Default::default(), + }; + let ir = IR::MapFunction { input, function }; + return Ok(lp_arena.add(ir)); + }, DslFunction::FillNan(fill_value) => { let exprs = input_schema .iter() @@ -794,8 +811,11 @@ pub fn to_alp_impl( IR::Sink { input, payload } }, DslPlan::IR { node, dsl, version } => { - return if let (true, Some(node)) = (version == lp_arena.version(), node) { - Ok(node) + return if node.is_some() + && version == lp_arena.version() + && convert.used_arenas.insert(version) + { + Ok(node.unwrap()) } else { to_alp_impl(owned(dsl), expr_arena, lp_arena, convert) } diff --git a/crates/polars-plan/src/plans/conversion/mod.rs b/crates/polars-plan/src/plans/conversion/mod.rs index 250a44d1f524..8d7e232c4cd7 100644 --- a/crates/polars-plan/src/plans/conversion/mod.rs +++ b/crates/polars-plan/src/plans/conversion/mod.rs @@ -3,7 +3,12 @@ mod dsl_to_ir; mod expr_expansion; mod expr_to_ir; mod ir_to_dsl; -#[cfg(any(feature = "ipc", feature = "parquet", feature = "csv"))] +#[cfg(any( + feature = "ipc", + feature = "parquet", + feature = "csv", + feature = "json" +))] mod scans; mod stack_opt; diff --git a/crates/polars-plan/src/plans/conversion/scans.rs b/crates/polars-plan/src/plans/conversion/scans.rs index 959327148f6c..308e9b9d0511 100644 --- a/crates/polars-plan/src/plans/conversion/scans.rs +++ b/crates/polars-plan/src/plans/conversion/scans.rs @@ -40,7 
+40,7 @@ fn prepare_schemas(mut schema: Schema, row_index: Option<&RowIndex>) -> (SchemaR pub(super) fn parquet_file_info( paths: &[PathBuf], file_options: &FileScanOptions, - cloud_options: Option<&polars_io::cloud::CloudOptions>, + #[allow(unused)] cloud_options: Option<&polars_io::cloud::CloudOptions>, ) -> PolarsResult<(FileInfo, Option)> { let path = get_first_path(paths)?; diff --git a/crates/polars-plan/src/plans/conversion/stack_opt.rs b/crates/polars-plan/src/plans/conversion/stack_opt.rs index 6e05a872a8cf..8db4e82659d5 100644 --- a/crates/polars-plan/src/plans/conversion/stack_opt.rs +++ b/crates/polars-plan/src/plans/conversion/stack_opt.rs @@ -7,6 +7,12 @@ pub(super) struct ConversionOptimizer { scratch: Vec, simplify: Option, coerce: Option, + // IR's can be cached in the DSL. + // But if they are used multiple times in DSL (e.g. concat/join) + // then it can occur that we take a slot multiple times. + // So we keep track of the arena versions used and allow only + // one unique IR cache to be reused. + pub(super) used_arenas: PlHashSet, } impl ConversionOptimizer { @@ -27,6 +33,7 @@ impl ConversionOptimizer { scratch: Vec::with_capacity(8), simplify, coerce, + used_arenas: Default::default(), } } diff --git a/crates/polars-plan/src/plans/conversion/type_coercion/mod.rs b/crates/polars-plan/src/plans/conversion/type_coercion/mod.rs index 9bbc614ca088..a8c2122f9b65 100644 --- a/crates/polars-plan/src/plans/conversion/type_coercion/mod.rs +++ b/crates/polars-plan/src/plans/conversion/type_coercion/mod.rs @@ -513,7 +513,7 @@ fn inline_or_prune_cast( }, // We generate casted literal datetimes, so ensure we cast upon conversion // to create simpler expr trees. - #[cfg(feature = "temporal")] + #[cfg(feature = "dtype-datetime")] LiteralValue::DateTime(ts, tu, None) if dtype.is_date() => { let from_size = time_unit_multiple(tu.to_arrow()) * SECONDS_IN_DAY; LiteralValue::Date((*ts / from_size) as i32) diff --git a/crates/polars-plan/src/plans/expr_ir.rs b/crates/polars-plan/src/plans/expr_ir.rs index 1161406a44b9..d9c0886c201c 100644 --- a/crates/polars-plan/src/plans/expr_ir.rs +++ b/crates/polars-plan/src/plans/expr_ir.rs @@ -3,10 +3,14 @@ use std::hash::Hash; #[cfg(feature = "cse")] use std::hash::Hasher; +#[cfg(feature = "ir_serde")] +use serde::{Deserialize, Serialize}; + use super::*; use crate::constants::{get_len_name, LITERAL_NAME}; #[derive(Default, Debug, Clone, Hash, PartialEq, Eq)] +#[cfg_attr(feature = "ir_serde", derive(Serialize, Deserialize))] pub enum OutputName { /// No not yet set. #[default] @@ -23,7 +27,7 @@ pub enum OutputName { } impl OutputName { - fn unwrap(&self) -> &ColumnName { + pub fn unwrap(&self) -> &ColumnName { match self { OutputName::Alias(name) => name, OutputName::ColumnLhs(name) => name, @@ -40,6 +44,7 @@ impl OutputName { } #[derive(Clone, Debug, PartialEq, Eq)] +#[cfg_attr(feature = "ir_serde", derive(Serialize, Deserialize))] pub struct ExprIR { /// Output name of this expression. 
output_name: OutputName, @@ -146,7 +151,7 @@ impl ExprIR { self.output_name = OutputName::Alias(name) } - pub(crate) fn output_name_inner(&self) -> &OutputName { + pub fn output_name_inner(&self) -> &OutputName { &self.output_name } diff --git a/crates/polars-plan/src/plans/functions/count.rs b/crates/polars-plan/src/plans/functions/count.rs index fd92fdd9fc9d..d00f19e36f8a 100644 --- a/crates/polars-plan/src/plans/functions/count.rs +++ b/crates/polars-plan/src/plans/functions/count.rs @@ -1,17 +1,19 @@ #[cfg(feature = "ipc")] use arrow::io::ipc::read::get_row_count as count_rows_ipc_sync; -#[cfg(feature = "parquet")] +#[cfg(any(feature = "parquet", feature = "json"))] use polars_io::cloud::CloudOptions; #[cfg(feature = "csv")] use polars_io::csv::read::count_rows as count_rows_csv; +#[cfg(any(feature = "parquet", feature = "ipc", feature = "json"))] +use polars_io::is_cloud_url; #[cfg(all(feature = "parquet", feature = "cloud"))] use polars_io::parquet::read::ParquetAsyncReader; #[cfg(feature = "parquet")] use polars_io::parquet::read::ParquetReader; #[cfg(all(feature = "parquet", feature = "async"))] use polars_io::pl_async::{get_runtime, with_concurrency_budget}; -#[cfg(any(feature = "parquet", feature = "ipc"))] -use polars_io::{path_utils::is_cloud_url, SerReader}; +#[cfg(any(feature = "json", feature = "parquet"))] +use polars_io::SerReader; use super::*; @@ -90,7 +92,7 @@ pub fn count_rows(paths: &Arc>, scan_type: &FileScan) -> PolarsResu #[cfg(feature = "parquet")] pub(super) fn count_rows_parquet( paths: &Arc>, - cloud_options: Option<&CloudOptions>, + #[allow(unused)] cloud_options: Option<&CloudOptions>, ) -> PolarsResult { if paths.is_empty() { return Ok(0); @@ -189,6 +191,7 @@ pub(super) fn count_rows_ndjson( cloud_options: Option<&CloudOptions>, ) -> PolarsResult { use polars_core::config; + use polars_io::utils::maybe_decompress_bytes; let run_async = !paths.is_empty() && is_cloud_url(&paths[0]) || config::force_async(); @@ -233,7 +236,12 @@ pub(super) fn count_rows_ndjson( polars_utils::open_file(&paths[i])? 
}; - let reader = polars_io::ndjson::core::JsonLineReader::new(f); + let mmap = unsafe { memmap::Mmap::map(&f).unwrap() }; + let owned = &mut vec![]; + + let reader = polars_io::ndjson::core::JsonLineReader::new(std::io::Cursor::new( + maybe_decompress_bytes(mmap.as_ref(), owned)?, + )); reader.count() }) .sum() diff --git a/crates/polars-plan/src/plans/functions/dsl.rs b/crates/polars-plan/src/plans/functions/dsl.rs index 76c7dc9d3211..458c7c6d8e28 100644 --- a/crates/polars-plan/src/plans/functions/dsl.rs +++ b/crates/polars-plan/src/plans/functions/dsl.rs @@ -29,6 +29,7 @@ pub enum DslFunction { OpaquePython(OpaquePythonUdf), Explode { columns: Vec, + allow_empty: bool, }, #[cfg(feature = "pivot")] Unpivot { @@ -79,7 +80,7 @@ pub enum StatsFunction { Max, } -fn validate_columns>( +pub(crate) fn validate_columns_in_input>( columns: &[S], input_schema: &Schema, operation_name: &str, @@ -93,20 +94,12 @@ fn validate_columns>( impl DslFunction { pub(crate) fn into_function_ir(self, input_schema: &Schema) -> PolarsResult { let function = match self { - DslFunction::Explode { columns } => { - let columns = expand_selectors(columns, input_schema, &[])?; - validate_columns(columns.as_ref(), input_schema, "explode")?; - FunctionIR::Explode { - columns, - schema: Default::default(), - } - }, #[cfg(feature = "pivot")] DslFunction::Unpivot { args } => { let on = expand_selectors(args.on, input_schema, &[])?; let index = expand_selectors(args.index, input_schema, &[])?; - validate_columns(on.as_ref(), input_schema, "unpivot")?; - validate_columns(index.as_ref(), input_schema, "unpivot")?; + validate_columns_in_input(on.as_ref(), input_schema, "unpivot")?; + validate_columns_in_input(index.as_ref(), input_schema, "unpivot")?; let args = UnpivotArgsIR { on: on.iter().map(|s| s.as_ref().into()).collect(), @@ -128,7 +121,7 @@ impl DslFunction { }, DslFunction::Rename { existing, new } => { let swapping = new.iter().any(|name| input_schema.get(name).is_some()); - validate_columns(existing.as_ref(), input_schema, "rename")?; + validate_columns_in_input(existing.as_ref(), input_schema, "rename")?; FunctionIR::Rename { existing, @@ -139,12 +132,15 @@ impl DslFunction { }, DslFunction::Unnest(selectors) => { let columns = expand_selectors(selectors, input_schema, &[])?; - validate_columns(columns.as_ref(), input_schema, "explode")?; + validate_columns_in_input(columns.as_ref(), input_schema, "explode")?; FunctionIR::Unnest { columns } }, #[cfg(feature = "python")] DslFunction::OpaquePython(inner) => FunctionIR::OpaquePython(inner), - DslFunction::Stats(_) | DslFunction::FillNan(_) | DslFunction::Drop(_) => { + DslFunction::Stats(_) + | DslFunction::FillNan(_) + | DslFunction::Drop(_) + | DslFunction::Explode { .. } => { // We should not reach this. 
panic!("impl error") }, diff --git a/crates/polars-plan/src/plans/functions/mod.rs b/crates/polars-plan/src/plans/functions/mod.rs index fb3edbe12bd3..4e9f42f205ba 100644 --- a/crates/polars-plan/src/plans/functions/mod.rs +++ b/crates/polars-plan/src/plans/functions/mod.rs @@ -26,11 +26,13 @@ use crate::dsl::python_udf::PythonFunction; use crate::plans::functions::merge_sorted::merge_sorted; use crate::prelude::*; +#[cfg_attr(feature = "ir_serde", derive(Serialize, Deserialize))] #[derive(Clone, IntoStaticStr)] #[strum(serialize_all = "SCREAMING_SNAKE_CASE")] pub enum FunctionIR { #[cfg(feature = "python")] OpaquePython(OpaquePythonUdf), + #[cfg_attr(feature = "ir_serde", serde(skip))] Opaque { function: Arc, schema: Option>, @@ -40,7 +42,7 @@ pub enum FunctionIR { projection_pd: bool, streamable: bool, // used for formatting - fmt_str: &'static str, + fmt_str: String, }, FastCount { paths: Arc>, @@ -48,6 +50,7 @@ pub enum FunctionIR { alias: Option>, }, /// Streaming engine pipeline + #[cfg_attr(feature = "ir_serde", serde(skip))] Pipeline { function: Arc>, schema: SchemaRef, @@ -71,20 +74,24 @@ pub enum FunctionIR { new: Arc<[SmartString]>, // A column name gets swapped with an existing column swapping: bool, + #[cfg_attr(feature = "ir_serde", serde(skip))] schema: CachedSchema, }, Explode { columns: Arc<[ColumnName]>, + #[cfg_attr(feature = "ir_serde", serde(skip))] schema: CachedSchema, }, #[cfg(feature = "pivot")] Unpivot { args: Arc, + #[cfg_attr(feature = "ir_serde", serde(skip))] schema: CachedSchema, }, RowIndex { name: Arc, // Might be cached. + #[cfg_attr(feature = "ir_serde", serde(skip))] schema: CachedSchema, offset: Option, }, diff --git a/crates/polars-plan/src/plans/ir/dot.rs b/crates/polars-plan/src/plans/ir/dot.rs index 49e9bef1a3dc..8fb6dbe5444d 100644 --- a/crates/polars-plan/src/plans/ir/dot.rs +++ b/crates/polars-plan/src/plans/ir/dot.rs @@ -32,9 +32,9 @@ impl fmt::Display for DotNode { #[inline(always)] fn write_label<'a, 'b>( - f: &'b mut fmt::Formatter<'a>, + f: &'a mut fmt::Formatter<'b>, id: DotNode, - mut w: impl FnMut(&mut EscapeLabel<'a, 'b>) -> fmt::Result, + mut w: impl FnMut(&mut EscapeLabel<'a>) -> fmt::Result, ) -> fmt::Result { write!(f, "{INDENT}{id}[label=\"")?; @@ -341,7 +341,7 @@ impl<'a> IRDotDisplay<'a> { } // A few utility structures for formatting -pub(crate) struct PathsDisplay<'a>(pub &'a [PathBuf]); +pub struct PathsDisplay<'a>(pub &'a [PathBuf]); struct NumColumns<'a>(Option<&'a [String]>); struct NumColumnsSchema<'a>(Option<&'a Schema>); struct OptionExprIRDisplay<'a>(Option>); @@ -390,9 +390,9 @@ impl fmt::Display for OptionExprIRDisplay<'_> { } /// Utility structure to write to a [`fmt::Formatter`] whilst escaping the output as a label name -struct EscapeLabel<'a, 'b>(&'b mut fmt::Formatter<'a>); +pub struct EscapeLabel<'a>(pub &'a mut dyn fmt::Write); -impl<'a, 'b> fmt::Write for EscapeLabel<'a, 'b> { +impl<'a> fmt::Write for EscapeLabel<'a> { fn write_str(&mut self, mut s: &str) -> fmt::Result { loop { let mut char_indices = s.char_indices(); diff --git a/crates/polars-plan/src/plans/ir/mod.rs b/crates/polars-plan/src/plans/ir/mod.rs index 8d30639f1fe1..b8b0378419d6 100644 --- a/crates/polars-plan/src/plans/ir/mod.rs +++ b/crates/polars-plan/src/plans/ir/mod.rs @@ -8,12 +8,14 @@ use std::borrow::Cow; use std::fmt; use std::path::PathBuf; -pub use dot::IRDotDisplay; +pub use dot::{EscapeLabel, IRDotDisplay, PathsDisplay}; pub use format::{ExprIRDisplay, IRDisplay}; use hive::HivePartitions; use polars_core::prelude::*; use 
polars_utils::idx_vec::UnitVec; use polars_utils::unitvec; +#[cfg(feature = "ir_serde")] +use serde::{Deserialize, Serialize}; use crate::prelude::*; @@ -33,6 +35,7 @@ pub struct IRPlanRef<'a> { /// [`IR`] is a representation of [`DslPlan`] with [`Node`]s which are allocated in an [`Arena`] /// In this IR the logical plan has access to the full dataset. #[derive(Clone, Debug, Default)] +#[cfg_attr(feature = "ir_serde", derive(Serialize, Deserialize))] pub enum IR { #[cfg(feature = "python")] PythonScan { @@ -105,6 +108,7 @@ pub enum IR { keys: Vec, aggs: Vec, schema: SchemaRef, + #[cfg_attr(feature = "ir_serde", serde(skip))] apply: Option>, maintain_order: bool, options: Arc, diff --git a/crates/polars-plan/src/plans/ir/schema.rs b/crates/polars-plan/src/plans/ir/schema.rs index 5b5042e50377..1586463a8c0f 100644 --- a/crates/polars-plan/src/plans/ir/schema.rs +++ b/crates/polars-plan/src/plans/ir/schema.rs @@ -107,4 +107,60 @@ impl IR { }; Cow::Borrowed(schema) } + + /// Get the schema of the logical plan node, using caching. + #[recursive] + pub fn schema_with_cache<'a>( + node: Node, + arena: &'a Arena, + cache: &mut PlHashMap>, + ) -> Arc { + use IR::*; + if let Some(schema) = cache.get(&node) { + return schema.clone(); + } + + let schema = match arena.get(node) { + #[cfg(feature = "python")] + PythonScan { options } => options + .output_schema + .as_ref() + .unwrap_or(&options.schema) + .clone(), + Union { inputs, .. } => IR::schema_with_cache(inputs[0], arena, cache), + HConcat { schema, .. } => schema.clone(), + Cache { input, .. } + | Sort { input, .. } + | Filter { input, .. } + | Distinct { input, .. } + | Sink { input, .. } + | Slice { input, .. } => IR::schema_with_cache(*input, arena, cache), + Scan { + output_schema, + file_info, + .. + } => output_schema.as_ref().unwrap_or(&file_info.schema).clone(), + DataFrameScan { + schema, + output_schema, + .. + } => output_schema.as_ref().unwrap_or(schema).clone(), + Select { schema, .. } + | Reduce { schema, .. } + | GroupBy { schema, .. } + | Join { schema, .. } + | HStack { schema, .. } + | ExtContext { schema, .. } + | SimpleProjection { + columns: schema, .. + } => schema.clone(), + MapFunction { input, function } => { + let input_schema = IR::schema_with_cache(*input, arena, cache); + function.schema(&input_schema).unwrap().into_owned() + }, + Invalid => unreachable!(), + }; + cache.insert(node, schema.clone()); + schema + } } diff --git a/crates/polars-plan/src/plans/mod.rs b/crates/polars-plan/src/plans/mod.rs index ac8dff3e90fd..6967e743f5b3 100644 --- a/crates/polars-plan/src/plans/mod.rs +++ b/crates/polars-plan/src/plans/mod.rs @@ -213,7 +213,7 @@ impl Clone for DslPlan { impl Default for DslPlan { fn default() -> Self { - let df = DataFrame::new::(vec![]).unwrap(); + let df = DataFrame::empty(); let schema = df.schema(); DslPlan::DataFrameScan { df: Arc::new(df), diff --git a/crates/polars-plan/src/plans/optimizer/slice_pushdown_lp.rs b/crates/polars-plan/src/plans/optimizer/slice_pushdown_lp.rs index 33edd4b6ed8f..f62bd9ee197d 100644 --- a/crates/polars-plan/src/plans/optimizer/slice_pushdown_lp.rs +++ b/crates/polars-plan/src/plans/optimizer/slice_pushdown_lp.rs @@ -413,7 +413,8 @@ impl SlicePushDown { // [Pushdown] // these nodes will be pushed down. 
// State is None, we can continue - m @(Select {..}, None) + m @(Select {..}, None) | + m @ (SimpleProjection {..}, _) => { let (lp, state) = m; self.pushdown_and_continue(lp, state, lp_arena, expr_arena) @@ -431,14 +432,14 @@ impl SlicePushDown { } } (HStack {input, exprs, schema, options}, _) => { - let check = can_pushdown_slice_past_projections(&exprs, expr_arena); + let (can_pushdown, all_elementwise_and_any_expr_has_column) = can_pushdown_slice_past_projections(&exprs, expr_arena); if ( - // If the schema length is greater then an input column is being projected, so + // If the schema length is greater than an input column is being projected, so // the exprs in with_columns do not need to have an input column name. - schema.len() > exprs.len() && check.0 + schema.len() > exprs.len() && can_pushdown ) - || check.1 // e.g. select(c).with_columns(c = c + 1) + || all_elementwise_and_any_expr_has_column // e.g. select(c).with_columns(c = c + 1) { let lp = HStack {input, exprs, schema, options}; self.pushdown_and_continue(lp, state, lp_arena, expr_arena) diff --git a/crates/polars-plan/src/plans/options.rs b/crates/polars-plan/src/plans/options.rs index 0cff24124ff1..85506b7f6a15 100644 --- a/crates/polars-plan/src/plans/options.rs +++ b/crates/polars-plan/src/plans/options.rs @@ -85,6 +85,7 @@ pub struct DistinctOptionsDSL { } #[derive(Clone, Debug, Eq, PartialEq, Hash)] +#[cfg_attr(feature = "ir_serde", derive(Serialize, Deserialize))] pub struct DistinctOptionsIR { /// Subset of columns that will be taken into account. pub subset: Option>, @@ -213,6 +214,13 @@ impl FunctionOptions { pub fn check_lengths(&self) -> bool { self.check_lengths.0 } + + pub fn is_elementwise(&self) -> bool { + self.collect_groups == ApplyOptions::ElementWise + && !self + .flags + .contains(FunctionFlags::CHANGES_LENGTH | FunctionFlags::RETURNS_SCALAR) + } } impl Default for FunctionOptions { diff --git a/crates/polars-python/src/cloud.rs b/crates/polars-python/src/cloud.rs index 5c8a7d01eafe..dacca675c551 100644 --- a/crates/polars-python/src/cloud.rs +++ b/crates/polars-python/src/cloud.rs @@ -5,9 +5,9 @@ use crate::error::PyPolarsErr; use crate::PyLazyFrame; #[pyfunction] -pub fn prepare_cloud_plan(lf: PyLazyFrame, uri: String, py: Python) -> PyResult { +pub fn prepare_cloud_plan(lf: PyLazyFrame, py: Python) -> PyResult { let plan = lf.ldf.logical_plan; - let bytes = polars::prelude::prepare_cloud_plan(plan, uri).map_err(PyPolarsErr::from)?; + let bytes = polars::prelude::prepare_cloud_plan(plan).map_err(PyPolarsErr::from)?; Ok(PyBytes::new_bound(py, &bytes).to_object(py)) } diff --git a/crates/polars-python/src/functions/lazy.rs b/crates/polars-python/src/functions/lazy.rs index aa098aee2cb0..51800ed9d4e1 100644 --- a/crates/polars-python/src/functions/lazy.rs +++ b/crates/polars-python/src/functions/lazy.rs @@ -437,7 +437,7 @@ pub fn lit(value: &Bound<'_, PyAny>, allow_object: bool) -> PyResult { Ok(dsl::lit(value.as_bytes()).into()) } else if matches!( value.get_type().qualname().unwrap().as_str(), - "date" | "datetime" | "Decimal" + "date" | "datetime" | "time" | "timedelta" | "Decimal" ) { let av = py_object_to_any_value(value, true)?; Ok(Expr::Literal(LiteralValue::try_from(av).unwrap()).into()) diff --git a/crates/polars-python/src/lazyframe/general.rs b/crates/polars-python/src/lazyframe/general.rs index 7ec2c392cb6e..cc79d1102abc 100644 --- a/crates/polars-python/src/lazyframe/general.rs +++ b/crates/polars-python/src/lazyframe/general.rs @@ -890,12 +890,12 @@ impl PyLazyFrame { strategy: Wrap, 
tolerance: Option>>, tolerance_str: Option, - coalesce: Option, + coalesce: bool, ) -> PyResult { - let coalesce = match coalesce { - None => JoinCoalesce::JoinSpecific, - Some(true) => JoinCoalesce::CoalesceColumns, - Some(false) => JoinCoalesce::KeepColumns, + let coalesce = if coalesce { + JoinCoalesce::CoalesceColumns + } else { + JoinCoalesce::KeepColumns }; let ldf = self.ldf.clone(); let other = other.ldf; @@ -1170,7 +1170,7 @@ impl PyLazyFrame { fn collect_schema(&mut self, py: Python) -> PyResult { let schema = py - .allow_threads(|| self.ldf.schema()) + .allow_threads(|| self.ldf.collect_schema()) .map_err(PyPolarsErr::from)?; let schema_dict = PyDict::new_bound(py); diff --git a/crates/polars-python/src/lazyframe/visit.rs b/crates/polars-python/src/lazyframe/visit.rs index 32585c4cc887..36d8e6e4b793 100644 --- a/crates/polars-python/src/lazyframe/visit.rs +++ b/crates/polars-python/src/lazyframe/visit.rs @@ -57,7 +57,7 @@ impl NodeTraverser { // Increment major on breaking changes to the IR (e.g. renaming // fields, reordering tuples), minor on backwards compatible // changes (e.g. exposing a new expression node). - const VERSION: Version = (1, 0); + const VERSION: Version = (1, 1); pub(crate) fn new(root: Node, lp_arena: Arena, expr_arena: Arena) -> Self { Self { diff --git a/crates/polars-python/src/lazyframe/visitor/expr_nodes.rs b/crates/polars-python/src/lazyframe/visitor/expr_nodes.rs index fe85d23b6fb7..d282e6d528e3 100644 --- a/crates/polars-python/src/lazyframe/visitor/expr_nodes.rs +++ b/crates/polars-python/src/lazyframe/visitor/expr_nodes.rs @@ -1,7 +1,11 @@ use polars::datatypes::TimeUnit; +use polars::series::ops::NullBehavior; use polars_core::prelude::{NonExistent, QuantileInterpolOptions}; use polars_core::series::IsSorted; use polars_ops::prelude::ClosedInterval; +use polars_ops::series::InterpolationMethod; +#[cfg(feature = "search_sorted")] +use polars_ops::series::SearchSortedSide; use polars_plan::dsl::function_expr::rolling::RollingFunction; use polars_plan::dsl::function_expr::rolling_by::RollingFunctionBy; use polars_plan::dsl::{BooleanFunction, StringFunction, TemporalFunction}; @@ -1054,21 +1058,31 @@ pub(crate) fn into_py(py: Python<'_>, expr: &AExpr) -> PyResult { }, FunctionExpr::Abs => ("abs",).to_object(py), #[cfg(feature = "hist")] - FunctionExpr::Hist { .. 
} => return Err(PyNotImplementedError::new_err("hist")), + FunctionExpr::Hist { + bin_count, + include_category, + include_breakpoint, + } => ("hist", bin_count, include_category, include_breakpoint).to_object(py), FunctionExpr::NullCount => ("null_count",).to_object(py), FunctionExpr::Pow(f) => match f { PowFunction::Generic => ("pow",).to_object(py), PowFunction::Sqrt => ("sqrt",).to_object(py), PowFunction::Cbrt => ("cbrt",).to_object(py), }, - FunctionExpr::Hash(_, _, _, _) => { - return Err(PyNotImplementedError::new_err("hash")) + FunctionExpr::Hash(seed, seed_1, seed_2, seed_3) => { + ("hash", seed, seed_1, seed_2, seed_3).to_object(py) }, FunctionExpr::ArgWhere => ("argwhere",).to_object(py), #[cfg(feature = "search_sorted")] - FunctionExpr::SearchSorted(_) => { - return Err(PyNotImplementedError::new_err("search sorted")) - }, + FunctionExpr::SearchSorted(side) => ( + "search_sorted", + match side { + SearchSortedSide::Any => "any", + SearchSortedSide::Left => "left", + SearchSortedSide::Right => "right", + }, + ) + .to_object(py), FunctionExpr::Range(_) => return Err(PyNotImplementedError::new_err("range")), #[cfg(feature = "trigonometry")] FunctionExpr::Trigonometry(trigfun) => { @@ -1147,17 +1161,13 @@ pub(crate) fn into_py(py: Python<'_>, expr: &AExpr) -> PyResult { return Err(PyNotImplementedError::new_err("rolling std by")) }, }, - FunctionExpr::ShiftAndFill => { - return Err(PyNotImplementedError::new_err("shift and fill")) - }, + FunctionExpr::ShiftAndFill => ("shift_and_fill",).to_object(py), FunctionExpr::Shift => ("shift",).to_object(py), FunctionExpr::DropNans => ("drop_nans",).to_object(py), FunctionExpr::DropNulls => ("drop_nulls",).to_object(py), FunctionExpr::Mode => ("mode",).to_object(py), - FunctionExpr::Skew(_) => return Err(PyNotImplementedError::new_err("skew")), - FunctionExpr::Kurtosis(_, _) => { - return Err(PyNotImplementedError::new_err("kurtosis")) - }, + FunctionExpr::Skew(bias) => ("skew", bias).to_object(py), + FunctionExpr::Kurtosis(fisher, bias) => ("kurtosis", fisher, bias).to_object(py), FunctionExpr::Reshape(_, _) => { return Err(PyNotImplementedError::new_err("reshape")) }, @@ -1168,11 +1178,8 @@ pub(crate) fn into_py(py: Python<'_>, expr: &AExpr) -> PyResult { options: _, seed: _, } => return Err(PyNotImplementedError::new_err("rank")), - FunctionExpr::Clip { - has_min: _, - has_max: _, - } => return Err(PyNotImplementedError::new_err("clip")), - FunctionExpr::AsStruct => return Err(PyNotImplementedError::new_err("as struct")), + FunctionExpr::Clip { has_min, has_max } => ("clip", has_min, has_max).to_object(py), + FunctionExpr::AsStruct => ("as_struct",).to_object(py), #[cfg(feature = "top_k")] FunctionExpr::TopK { descending } => ("top_k", descending).to_object(py), FunctionExpr::CumCount { reverse } => ("cum_count", reverse).to_object(py), @@ -1182,37 +1189,41 @@ pub(crate) fn into_py(py: Python<'_>, expr: &AExpr) -> PyResult { FunctionExpr::CumMax { reverse } => ("cum_max", reverse).to_object(py), FunctionExpr::Reverse => ("reverse",).to_object(py), FunctionExpr::ValueCounts { - sort: _, - parallel: _, - name: _, - normalize: _, - } => return Err(PyNotImplementedError::new_err("value counts")), + sort, + parallel, + name, + normalize, + } => ("value_counts", sort, parallel, name, normalize).to_object(py), FunctionExpr::UniqueCounts => ("unique_counts",).to_object(py), - FunctionExpr::ApproxNUnique => { - return Err(PyNotImplementedError::new_err("approx nunique")) - }, + FunctionExpr::ApproxNUnique => ("approx_n_unique",).to_object(py), 
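Aside (not part of the patch): the arms above all follow the same convention of lowering an IR function node to a ("name", args...) Python tuple via pyo3's `to_object`, which the Python-side node visitor then matches on. Below is a minimal, self-contained sketch of that encoding pattern, assuming the pyo3 0.21-style `to_object`/`extract` API used elsewhere in this diff; `Fill` and `encode` are made-up names, not the real `FunctionExpr` machinery.

use pyo3::prelude::*;

// `Fill` is a hypothetical stand-in enum, not the real polars FunctionExpr.
enum Fill {
    Backward { limit: Option<u32> },
    Forward { limit: Option<u32> },
}

// Mirror the ("name", args...) tuple convention used by `into_py` above.
fn encode(py: Python<'_>, f: &Fill) -> PyObject {
    match f {
        Fill::Backward { limit } => ("backward_fill", *limit).to_object(py),
        Fill::Forward { limit } => ("forward_fill", *limit).to_object(py),
    }
}

fn main() -> PyResult<()> {
    Python::with_gil(|py| {
        let obj = encode(py, &Fill::Backward { limit: Some(2) });
        // Round-trip back into Rust to show the tuple shape the Python visitor sees.
        let (name, limit): (String, Option<u32>) = obj.extract(py)?;
        assert_eq!((name.as_str(), limit), ("backward_fill", Some(2)));
        let _ = encode(py, &Fill::Forward { limit: None });
        Ok(())
    })
}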
FunctionExpr::Coalesce => ("coalesce",).to_object(py), - FunctionExpr::ShrinkType => { - return Err(PyNotImplementedError::new_err("shrink type")) - }, - FunctionExpr::Diff(_, _) => return Err(PyNotImplementedError::new_err("diff")), + FunctionExpr::ShrinkType => ("shrink_dtype",).to_object(py), + FunctionExpr::Diff(n, null_behaviour) => ( + "diff", + n, + match null_behaviour { + NullBehavior::Drop => "drop", + NullBehavior::Ignore => "ignore", + }, + ) + .to_object(py), #[cfg(feature = "pct_change")] - FunctionExpr::PctChange => { - return Err(PyNotImplementedError::new_err("pct change")) - }, - FunctionExpr::Interpolate(_) => { - return Err(PyNotImplementedError::new_err("interpolate")) - }, - FunctionExpr::InterpolateBy => { - return Err(PyNotImplementedError::new_err("interpolate_by")) + FunctionExpr::PctChange => ("pct_change",).to_object(py), + FunctionExpr::Interpolate(method) => ( + "interpolate", + match method { + InterpolationMethod::Linear => "linear", + InterpolationMethod::Nearest => "nearest", + }, + ) + .to_object(py), + FunctionExpr::InterpolateBy => ("interpolate_by",).to_object(py), + FunctionExpr::Entropy { base, normalize } => { + ("entropy", base, normalize).to_object(py) }, - FunctionExpr::Entropy { - base: _, - normalize: _, - } => return Err(PyNotImplementedError::new_err("entropy")), - FunctionExpr::Log { base: _ } => return Err(PyNotImplementedError::new_err("log")), - FunctionExpr::Log1p => return Err(PyNotImplementedError::new_err("log1p")), - FunctionExpr::Exp => return Err(PyNotImplementedError::new_err("exp")), + FunctionExpr::Log { base } => ("log", base).to_object(py), + FunctionExpr::Log1p => ("log1p",).to_object(py), + FunctionExpr::Exp => ("exp",).to_object(py), FunctionExpr::Unique(maintain_order) => ("unique", maintain_order).to_object(py), FunctionExpr::Round { decimals } => ("round", decimals).to_object(py), FunctionExpr::RoundSF { digits } => ("round_sig_figs", digits).to_object(py), @@ -1228,20 +1239,18 @@ pub(crate) fn into_py(py: Python<'_>, expr: &AExpr) -> PyResult { return Err(PyNotImplementedError::new_err("corr")) }, #[cfg(feature = "peaks")] - FunctionExpr::PeakMin => return Err(PyNotImplementedError::new_err("peak min")), + FunctionExpr::PeakMin => ("peak_min",).to_object(py), #[cfg(feature = "peaks")] - FunctionExpr::PeakMax => return Err(PyNotImplementedError::new_err("peak max")), + FunctionExpr::PeakMax => ("peak_max",).to_object(py), #[cfg(feature = "cutqcut")] FunctionExpr::Cut { .. } => return Err(PyNotImplementedError::new_err("cut")), #[cfg(feature = "cutqcut")] FunctionExpr::QCut { .. } => return Err(PyNotImplementedError::new_err("qcut")), #[cfg(feature = "rle")] - FunctionExpr::RLE => return Err(PyNotImplementedError::new_err("rle")), + FunctionExpr::RLE => ("rle",).to_object(py), #[cfg(feature = "rle")] - FunctionExpr::RLEID => return Err(PyNotImplementedError::new_err("rleid")), - FunctionExpr::ToPhysical => { - return Err(PyNotImplementedError::new_err("to physical")) - }, + FunctionExpr::RLEID => ("rle_id",).to_object(py), + FunctionExpr::ToPhysical => ("to_physical",).to_object(py), FunctionExpr::Random { .. } => { return Err(PyNotImplementedError::new_err("random")) }, @@ -1258,24 +1267,12 @@ pub(crate) fn into_py(py: Python<'_>, expr: &AExpr) -> PyResult { FunctionExpr::FfiPlugin { ..
} => { return Err(PyNotImplementedError::new_err("ffi plugin")) }, - FunctionExpr::BackwardFill { limit: _ } => { - return Err(PyNotImplementedError::new_err("backward fill")) - }, - FunctionExpr::ForwardFill { limit: _ } => { - return Err(PyNotImplementedError::new_err("forward fill")) - }, - FunctionExpr::SumHorizontal => { - return Err(PyNotImplementedError::new_err("sum horizontal")) - }, - FunctionExpr::MaxHorizontal => { - return Err(PyNotImplementedError::new_err("max horizontal")) - }, - FunctionExpr::MeanHorizontal => { - return Err(PyNotImplementedError::new_err("mean horizontal")) - }, - FunctionExpr::MinHorizontal => { - return Err(PyNotImplementedError::new_err("min horizontal")) - }, + FunctionExpr::BackwardFill { limit } => ("backward_fill", limit).to_object(py), + FunctionExpr::ForwardFill { limit } => ("forward_fill", limit).to_object(py), + FunctionExpr::SumHorizontal => ("sum_horizontal",).to_object(py), + FunctionExpr::MaxHorizontal => ("max_horizontal",).to_object(py), + FunctionExpr::MeanHorizontal => ("mean_horizontal",).to_object(py), + FunctionExpr::MinHorizontal => ("min_horizontal",).to_object(py), FunctionExpr::EwmMean { options: _ } => { return Err(PyNotImplementedError::new_err("ewm mean")) }, @@ -1285,23 +1282,20 @@ pub(crate) fn into_py(py: Python<'_>, expr: &AExpr) -> PyResult { FunctionExpr::EwmVar { options: _ } => { return Err(PyNotImplementedError::new_err("ewm var")) }, - FunctionExpr::Replace => return Err(PyNotImplementedError::new_err("replace")), + FunctionExpr::Replace => ("replace",).to_object(py), FunctionExpr::ReplaceStrict { return_dtype: _ } => { - return Err(PyNotImplementedError::new_err("replace_strict")) + // Can ignore the return dtype because it is encoded in the schema. + ("replace_strict",).to_object(py) }, - FunctionExpr::Negate => return Err(PyNotImplementedError::new_err("negate")), + FunctionExpr::Negate => ("negate",).to_object(py), FunctionExpr::FillNullWithStrategy(_) => { return Err(PyNotImplementedError::new_err("fill null with strategy")) }, FunctionExpr::GatherEvery { n, offset } => { ("gather_every", offset, n).to_object(py) }, - FunctionExpr::Reinterpret(_) => { - return Err(PyNotImplementedError::new_err("reinterpret")) - }, - FunctionExpr::ExtendConstant => { - return Err(PyNotImplementedError::new_err("extend constant")) - }, + FunctionExpr::Reinterpret(signed) => ("reinterpret", signed).to_object(py), + FunctionExpr::ExtendConstant => ("extend_constant",).to_object(py), FunctionExpr::Business(_) => { return Err(PyNotImplementedError::new_err("business")) }, diff --git a/crates/polars-python/src/lazygroupby.rs b/crates/polars-python/src/lazygroupby.rs index 255bb34917f9..52df635efb53 100644 --- a/crates/polars-python/src/lazygroupby.rs +++ b/crates/polars-python/src/lazygroupby.rs @@ -43,7 +43,7 @@ impl PyLazyGroupBy { let schema = match schema { Some(schema) => Arc::new(schema.0), None => LazyFrame::from(lgb.logical_plan.clone()) - .schema() + .collect_schema() .map_err(PyPolarsErr::from)?, }; diff --git a/crates/polars-python/src/series/general.rs b/crates/polars-python/src/series/general.rs index a3ea8a6fc676..63c1caeb71ee 100644 --- a/crates/polars-python/src/series/general.rs +++ b/crates/polars-python/src/series/general.rs @@ -335,7 +335,7 @@ impl PySeries { if let Some(output_type) = output_type { return Ok(Series::full_null(series.name(), series.len(), &output_type.0).into()); } - let msg = "The output type of the 'apply' function cannot be determined.\n\ + let msg = "The output type of the 'map_elements' 
function cannot be determined.\n\ The function was never called because 'skip_nulls=True' and all values are null.\n\ Consider setting 'skip_nulls=False' or setting the 'return_dtype'."; raise_err!(msg, ComputeError) diff --git a/crates/polars-sql/Cargo.toml b/crates/polars-sql/Cargo.toml index 0c8f883daf50..29febbfc149c 100644 --- a/crates/polars-sql/Cargo.toml +++ b/crates/polars-sql/Cargo.toml @@ -37,7 +37,7 @@ csv = ["polars-lazy/csv"] diagonal_concat = ["polars-lazy/diagonal_concat"] dtype-decimal = ["polars-lazy/dtype-decimal"] ipc = ["polars-lazy/ipc"] -json = ["polars-lazy/json", "polars-plan/extract_jsonpath"] +json = ["polars-lazy/json", "polars-plan/json", "polars-plan/extract_jsonpath"] list_eval = ["polars-lazy/list_eval"] parquet = ["polars-lazy/parquet"] semi_anti_join = ["polars-lazy/semi_anti_join"] diff --git a/crates/polars-sql/src/context.rs b/crates/polars-sql/src/context.rs index ab1b9a53997c..b131ae805339 100644 --- a/crates/polars-sql/src/context.rs +++ b/crates/polars-sql/src/context.rs @@ -382,7 +382,7 @@ impl SQLContext { let lf_schema = self.get_frame_schema(&mut lf)?; let lf_cols: Vec<_> = lf_schema.iter_names().map(|nm| col(nm)).collect(); let joined_tbl = match quantifier { - SetQuantifier::ByName | SetQuantifier::AllByName => join.on(lf_cols).finish(), + SetQuantifier::ByName => join.on(lf_cols).finish(), SetQuantifier::Distinct | SetQuantifier::None => { let rf_schema = self.get_frame_schema(&mut rf)?; let rf_cols: Vec<_> = rf_schema.iter_names().map(|nm| col(nm)).collect(); diff --git a/crates/polars-stream/Cargo.toml b/crates/polars-stream/Cargo.toml index a8741189f7dd..e2a7d0c45649 100644 --- a/crates/polars-stream/Cargo.toml +++ b/crates/polars-stream/Cargo.toml @@ -12,9 +12,11 @@ description = "Private crate for the streaming execution engine for the Polars D atomic-waker = { workspace = true } crossbeam-deque = { workspace = true } crossbeam-utils = { workspace = true } +futures = { workspace = true } +memmap = { workspace = true } parking_lot = { workspace = true } pin-project-lite = { workspace = true } -polars-io = { workspace = true, features = ["async"] } +polars-io = { workspace = true, features = ["async", "cloud", "aws"] } polars-utils = { workspace = true } rand = { workspace = true } rayon = { workspace = true } @@ -25,8 +27,9 @@ tokio = { workspace = true } polars-core = { workspace = true } polars-error = { workspace = true } polars-expr = { workspace = true } -polars-mem-engine = { workspace = true } -polars-plan = { workspace = true } +polars-mem-engine = { workspace = true, features = ["parquet"] } +polars-parquet = { workspace = true } +polars-plan = { workspace = true, features = ["parquet"] } [build-dependencies] version_check = { workspace = true } diff --git a/crates/polars-stream/src/async_executor/mod.rs b/crates/polars-stream/src/async_executor/mod.rs index ea239628990f..dec560845b09 100644 --- a/crates/polars-stream/src/async_executor/mod.rs +++ b/crates/polars-stream/src/async_executor/mod.rs @@ -15,7 +15,7 @@ use parking_lot::Mutex; use rand::rngs::SmallRng; use rand::{Rng, SeedableRng}; use slotmap::SlotMap; -pub use task::JoinHandle; +pub use task::{AbortOnDropHandle, JoinHandle}; use task::{CancelHandle, Runnable}; static NUM_EXECUTOR_THREADS: AtomicUsize = AtomicUsize::new(0); @@ -345,7 +345,6 @@ where } } -#[allow(unused)] pub fn spawn(priority: TaskPriority, fut: F) -> JoinHandle where ::Output: Send + 'static, diff --git a/crates/polars-stream/src/async_executor/task.rs b/crates/polars-stream/src/async_executor/task.rs 
index b1f0dfcfbe69..9991377eb718 100644 --- a/crates/polars-stream/src/async_executor/task.rs +++ b/crates/polars-stream/src/async_executor/task.rs @@ -278,6 +278,10 @@ impl Runnable { pub struct JoinHandle(Option>>); pub struct CancelHandle(Weak); +pub struct AbortOnDropHandle { + join_handle: JoinHandle, + cancel_handle: CancelHandle, +} impl JoinHandle { pub fn cancel_handle(&self) -> CancelHandle { @@ -305,13 +309,37 @@ impl Future for JoinHandle { } impl CancelHandle { - pub fn cancel(self) { + pub fn cancel(&self) { if let Some(t) = self.0.upgrade() { t.cancel(); } } } +impl AbortOnDropHandle { + pub fn new(join_handle: JoinHandle) -> Self { + let cancel_handle = join_handle.cancel_handle(); + Self { + join_handle, + cancel_handle, + } + } +} + +impl Future for AbortOnDropHandle { + type Output = T; + + fn poll(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { + Pin::new(&mut self.join_handle).poll(cx) + } +} + +impl Drop for AbortOnDropHandle { + fn drop(&mut self) { + self.cancel_handle.cancel(); + } +} + pub fn spawn(future: F, schedule: S, metadata: M) -> (Runnable, JoinHandle) where F: Future + Send + 'static, diff --git a/crates/polars-stream/src/async_primitives/distributor_channel.rs b/crates/polars-stream/src/async_primitives/distributor_channel.rs index 5bdeb7e56866..21af7b53d7d1 100644 --- a/crates/polars-stream/src/async_primitives/distributor_channel.rs +++ b/crates/polars-stream/src/async_primitives/distributor_channel.rs @@ -198,6 +198,8 @@ impl Sender { } impl Receiver { + /// Note: This intentionally takes `&mut` to ensure it is only accessed in a single-threaded + /// manner. pub async fn recv(&mut self) -> Result { loop { // Fast-path. diff --git a/crates/polars-stream/src/execute.rs b/crates/polars-stream/src/execute.rs index 5f3bdebf7d36..d17bc89bd6ad 100644 --- a/crates/polars-stream/src/execute.rs +++ b/crates/polars-stream/src/execute.rs @@ -205,10 +205,11 @@ fn run_subgraph( for input in &node.inputs { let sender = graph.pipes[*input].sender; if let Some(count) = num_send_ports_not_yet_ready.get_mut(sender) { - assert!(*count > 0); - *count -= 1; - if *count == 0 { - ready.push(sender); + if *count > 0 { + *count -= 1; + if *count == 0 { + ready.push(sender); + } } } } @@ -247,7 +248,7 @@ pub fn execute_graph( if polars_core::config::verbose() { eprintln!("polars-stream: updating graph state"); } - graph.update_all_states(); + graph.update_all_states()?; let (nodes, pipes) = find_runnable_subgraph(graph); if polars_core::config::verbose() { for node in &nodes { diff --git a/crates/polars-stream/src/expression.rs b/crates/polars-stream/src/expression.rs index a6e41728d111..3c1b9445997c 100644 --- a/crates/polars-stream/src/expression.rs +++ b/crates/polars-stream/src/expression.rs @@ -6,7 +6,7 @@ use polars_error::PolarsResult; use polars_expr::prelude::{ExecutionState, PhysicalExpr}; #[derive(Clone)] -pub(crate) struct StreamExpr { +pub struct StreamExpr { inner: Arc, // Whether the expression can be re-entering the engine (e.g. 
a function use the lazy api // within that function) @@ -14,18 +14,14 @@ pub(crate) struct StreamExpr { } impl StreamExpr { - pub(crate) fn new(phys_expr: Arc, reentrant: bool) -> Self { + pub fn new(phys_expr: Arc, reentrant: bool) -> Self { Self { inner: phys_expr, reentrant, } } - pub(crate) async fn evaluate( - &self, - df: &DataFrame, - state: &ExecutionState, - ) -> PolarsResult { + pub async fn evaluate(&self, df: &DataFrame, state: &ExecutionState) -> PolarsResult { if self.reentrant { let state = state.clone(); let phys_expr = self.inner.clone(); diff --git a/crates/polars-stream/src/graph.rs b/crates/polars-stream/src/graph.rs index 055d8df4a5ae..572c1f1c306d 100644 --- a/crates/polars-stream/src/graph.rs +++ b/crates/polars-stream/src/graph.rs @@ -1,3 +1,4 @@ +use polars_error::PolarsResult; use slotmap::{SecondaryMap, SlotMap}; use crate::nodes::ComputeNode; @@ -64,11 +65,13 @@ impl Graph { } /// Updates all the nodes' states until a fixed point is reached. - pub fn update_all_states(&mut self) { + pub fn update_all_states(&mut self) -> PolarsResult<()> { let mut to_update: Vec<_> = self.nodes.keys().collect(); let mut scheduled_for_update: SecondaryMap = self.nodes.keys().map(|k| (k, ())).collect(); + let verbose = std::env::var("POLARS_VERBOSE_STATE_UPDATE").as_deref() == Ok("1"); + let mut recv_state = Vec::new(); let mut send_state = Vec::new(); while let Some(node_key) = to_update.pop() { @@ -82,15 +85,25 @@ impl Graph { send_state.extend(node.outputs.iter().map(|o| self.pipes[*o].recv_state)); // Compute the new state of this node given its environment. - // eprintln!("updating {}, before: {recv_state:?} {send_state:?}", node.compute.name()); - node.compute.update_state(&mut recv_state, &mut send_state); - // eprintln!("updating {}, after: {recv_state:?} {send_state:?}", node.compute.name()); + if verbose { + eprintln!( + "updating {}, before: {recv_state:?} {send_state:?}", + node.compute.name() + ); + } + node.compute + .update_state(&mut recv_state, &mut send_state)?; + if verbose { + eprintln!( + "updating {}, after: {recv_state:?} {send_state:?}", + node.compute.name() + ); + } // Propagate information. 
for (input, state) in node.inputs.iter().zip(recv_state.iter()) { let pipe = &mut self.pipes[*input]; if pipe.recv_state != *state { - // eprintln!("transitioning input pipe from {:?} to {state:?}", pipe.recv_state); assert!(pipe.recv_state != PortState::Done, "implementation error: state transition from Done to Blocked/Ready attempted"); pipe.recv_state = *state; if scheduled_for_update.insert(pipe.sender, ()).is_none() { @@ -102,7 +115,6 @@ impl Graph { for (output, state) in node.outputs.iter().zip(send_state.iter()) { let pipe = &mut self.pipes[*output]; if pipe.send_state != *state { - // eprintln!("transitioning output pipe from {:?} to {state:?}", pipe.send_state); assert!(pipe.send_state != PortState::Done, "implementation error: state transition from Done to Blocked/Ready attempted"); pipe.send_state = *state; if scheduled_for_update.insert(pipe.receiver, ()).is_none() { @@ -111,6 +123,7 @@ impl Graph { } } } + Ok(()) } } diff --git a/crates/polars-stream/src/nodes/filter.rs b/crates/polars-stream/src/nodes/filter.rs index 8a19b1a27986..9f0b0301ef91 100644 --- a/crates/polars-stream/src/nodes/filter.rs +++ b/crates/polars-stream/src/nodes/filter.rs @@ -18,9 +18,10 @@ impl ComputeNode for FilterNode { "filter" } - fn update_state(&mut self, recv: &mut [PortState], send: &mut [PortState]) { + fn update_state(&mut self, recv: &mut [PortState], send: &mut [PortState]) -> PolarsResult<()> { assert!(recv.len() == 1 && send.len() == 1); recv.swap_with_slice(send); + Ok(()) } fn spawn<'env, 's>( diff --git a/crates/polars-stream/src/nodes/in_memory_map.rs b/crates/polars-stream/src/nodes/in_memory_map.rs index 09769172c430..3a8bff496a18 100644 --- a/crates/polars-stream/src/nodes/in_memory_map.rs +++ b/crates/polars-stream/src/nodes/in_memory_map.rs @@ -39,7 +39,7 @@ impl ComputeNode for InMemoryMapNode { } } - fn update_state(&mut self, recv: &mut [PortState], send: &mut [PortState]) { + fn update_state(&mut self, recv: &mut [PortState], send: &mut [PortState]) -> PolarsResult<()> { assert!(recv.len() == 1 && send.len() == 1); // If the output doesn't want any more data, transition to being done. @@ -55,9 +55,8 @@ impl ComputeNode for InMemoryMapNode { } = self { if recv[0] == PortState::Done { - let df = sink_node.get_output().unwrap(); - let mut source_node = - InMemorySourceNode::new(Arc::new(map.call_udf(df.unwrap()).unwrap())); + let df = sink_node.get_output()?; + let mut source_node = InMemorySourceNode::new(Arc::new(map.call_udf(df.unwrap())?)); source_node.initialize(*num_pipelines); *self = Self::Source(source_node); } @@ -65,18 +64,19 @@ impl ComputeNode for InMemoryMapNode { match self { Self::Sink { sink_node, .. 
} => { - sink_node.update_state(recv, &mut []); + sink_node.update_state(recv, &mut [])?; send[0] = PortState::Blocked; }, Self::Source(source_node) => { recv[0] = PortState::Done; - source_node.update_state(&mut [], send); + source_node.update_state(&mut [], send)?; }, Self::Done => { recv[0] = PortState::Done; send[0] = PortState::Done; }, } + Ok(()) } fn is_memory_intensive_pipeline_blocker(&self) -> bool { diff --git a/crates/polars-stream/src/nodes/in_memory_sink.rs b/crates/polars-stream/src/nodes/in_memory_sink.rs index 0a4750d7b8b9..afd6ccfd95cc 100644 --- a/crates/polars-stream/src/nodes/in_memory_sink.rs +++ b/crates/polars-stream/src/nodes/in_memory_sink.rs @@ -26,7 +26,7 @@ impl ComputeNode for InMemorySinkNode { "in_memory_sink" } - fn update_state(&mut self, recv: &mut [PortState], send: &mut [PortState]) { + fn update_state(&mut self, recv: &mut [PortState], send: &mut [PortState]) -> PolarsResult<()> { assert!(send.is_empty()); assert!(recv.len() == 1); @@ -35,6 +35,7 @@ impl ComputeNode for InMemorySinkNode { if recv[0] != PortState::Done { recv[0] = PortState::Ready; } + Ok(()) } fn is_memory_intensive_pipeline_blocker(&self) -> bool { diff --git a/crates/polars-stream/src/nodes/in_memory_source.rs b/crates/polars-stream/src/nodes/in_memory_source.rs index 826f9e5e5c83..45630eb7aab0 100644 --- a/crates/polars-stream/src/nodes/in_memory_source.rs +++ b/crates/polars-stream/src/nodes/in_memory_source.rs @@ -34,7 +34,7 @@ impl ComputeNode for InMemorySourceNode { self.seq = AtomicU64::new(0); } - fn update_state(&mut self, recv: &mut [PortState], send: &mut [PortState]) { + fn update_state(&mut self, recv: &mut [PortState], send: &mut [PortState]) -> PolarsResult<()> { assert!(recv.is_empty()); assert!(send.len() == 1); @@ -52,6 +52,7 @@ impl ComputeNode for InMemorySourceNode { } else { send[0] = PortState::Ready; } + Ok(()) } fn spawn<'env, 's>( diff --git a/crates/polars-stream/src/nodes/map.rs b/crates/polars-stream/src/nodes/map.rs index 44587193f23d..007dfa921672 100644 --- a/crates/polars-stream/src/nodes/map.rs +++ b/crates/polars-stream/src/nodes/map.rs @@ -20,9 +20,10 @@ impl ComputeNode for MapNode { "map" } - fn update_state(&mut self, recv: &mut [PortState], send: &mut [PortState]) { + fn update_state(&mut self, recv: &mut [PortState], send: &mut [PortState]) -> PolarsResult<()> { assert!(recv.len() == 1 && send.len() == 1); recv.swap_with_slice(send); + Ok(()) } fn spawn<'env, 's>( diff --git a/crates/polars-stream/src/nodes/mod.rs b/crates/polars-stream/src/nodes/mod.rs index 839646a488b0..4c71380e0ad4 100644 --- a/crates/polars-stream/src/nodes/mod.rs +++ b/crates/polars-stream/src/nodes/mod.rs @@ -5,6 +5,7 @@ pub mod in_memory_source; pub mod map; pub mod multiplexer; pub mod ordered_union; +pub mod parquet_source; pub mod reduce; pub mod select; pub mod simple_projection; @@ -45,7 +46,7 @@ pub trait ComputeNode: Send { /// Similarly, for each output pipe `send` will contain the respective /// state of the input port that pipe is connected to when called, and you /// must update it to contain the desired state of your output port. - fn update_state(&mut self, recv: &mut [PortState], send: &mut [PortState]); + fn update_state(&mut self, recv: &mut [PortState], send: &mut [PortState]) -> PolarsResult<()>; /// If this node (in its current state) is a pipeline blocker, and whether /// this is memory intensive or not. 
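Aside (not part of the patch): the `update_state` signature change above, returning `PolarsResult<()>` instead of `()`, is what lets nodes such as `InMemoryMapNode` surface UDF errors rather than unwrapping. A minimal, self-contained sketch of the port-state handshake the trait doc describes follows; the names are simplified and this is not the actual polars-stream trait.

use polars_error::PolarsResult;

// Simplified port states for illustration only.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
enum PortState {
    Blocked,
    Ready,
    Done,
}

trait Node {
    // `recv` holds the state of each input port, `send` the demand of each output
    // port; the node overwrites both with the states it wants, and may now fail.
    fn update_state(&mut self, recv: &mut [PortState], send: &mut [PortState]) -> PolarsResult<()>;
}

// A pass-through node (cf. FilterNode/MapNode above) mirrors downstream demand
// back to its input and upstream readiness forward to its output.
struct PassThrough;

impl Node for PassThrough {
    fn update_state(&mut self, recv: &mut [PortState], send: &mut [PortState]) -> PolarsResult<()> {
        assert!(recv.len() == 1 && send.len() == 1);
        recv.swap_with_slice(send);
        Ok(())
    }
}

fn main() -> PolarsResult<()> {
    let mut node = PassThrough;
    let mut recv = [PortState::Ready];   // upstream has data ready
    let mut send = [PortState::Blocked]; // downstream is currently blocked
    node.update_state(&mut recv, &mut send)?;
    assert_eq!((recv[0], send[0]), (PortState::Blocked, PortState::Ready));
    Ok(())
}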
diff --git a/crates/polars-stream/src/nodes/multiplexer.rs b/crates/polars-stream/src/nodes/multiplexer.rs index a0238b94da2e..65f2e752d28d 100644 --- a/crates/polars-stream/src/nodes/multiplexer.rs +++ b/crates/polars-stream/src/nodes/multiplexer.rs @@ -34,7 +34,7 @@ impl ComputeNode for MultiplexerNode { "multiplexer" } - fn update_state(&mut self, recv: &mut [PortState], send: &mut [PortState]) { + fn update_state(&mut self, recv: &mut [PortState], send: &mut [PortState]) -> PolarsResult<()> { assert!(recv.len() == 1 && !send.is_empty()); // Initialize buffered streams, and mark those for which the receiver @@ -60,14 +60,24 @@ impl ComputeNode for MultiplexerNode { for s in send { *s = PortState::Done; } - return; + return Ok(()); } let all_blocked = send.iter().all(|p| *p == PortState::Blocked); // Pass along the input state to the output. - for s in send { - *s = recv[0]; + for (i, s) in send.iter_mut().enumerate() { + let buffer_empty = match &self.buffers[i] { + BufferedStream::Open(v) => v.is_empty(), + BufferedStream::Closed => true, + }; + *s = if buffer_empty && recv[0] == PortState::Done { + PortState::Done + } else if !buffer_empty || recv[0] == PortState::Ready { + PortState::Ready + } else { + PortState::Blocked + }; } // We say we are ready to receive unless all outputs are blocked. @@ -76,6 +86,7 @@ impl ComputeNode for MultiplexerNode { } else { PortState::Ready }; + Ok(()) } fn spawn<'env, 's>( diff --git a/crates/polars-stream/src/nodes/ordered_union.rs b/crates/polars-stream/src/nodes/ordered_union.rs index f38c306505b4..3c72d9cc6e15 100644 --- a/crates/polars-stream/src/nodes/ordered_union.rs +++ b/crates/polars-stream/src/nodes/ordered_union.rs @@ -23,7 +23,7 @@ impl ComputeNode for OrderedUnionNode { "ordered_union" } - fn update_state(&mut self, recv: &mut [PortState], send: &mut [PortState]) { + fn update_state(&mut self, recv: &mut [PortState], send: &mut [PortState]) -> PolarsResult<()> { assert!(self.cur_input_idx <= recv.len() && send.len() == 1); // Skip inputs that are done. @@ -46,6 +46,7 @@ impl ComputeNode for OrderedUnionNode { // Set the morsel offset one higher than any sent so far. 
self.morsel_offset = self.max_morsel_seq_sent.successor(); + Ok(()) } fn spawn<'env, 's>( diff --git a/crates/polars-stream/src/nodes/parquet_source.rs b/crates/polars-stream/src/nodes/parquet_source.rs new file mode 100644 index 000000000000..16184645da74 --- /dev/null +++ b/crates/polars-stream/src/nodes/parquet_source.rs @@ -0,0 +1,1920 @@ +use std::future::Future; +use std::path::PathBuf; +use std::sync::atomic::AtomicBool; +use std::sync::Arc; + +use futures::stream::FuturesUnordered; +use futures::StreamExt; +use polars_core::config; +use polars_core::frame::DataFrame; +use polars_core::prelude::{ + ArrowSchema, ChunkFull, DataType, IdxCa, InitHashMaps, PlHashMap, StringChunked, +}; +use polars_core::schema::IndexOfSchema; +use polars_core::series::{IntoSeries, IsSorted, Series}; +use polars_core::utils::operation_exceeded_idxsize_msg; +use polars_error::{polars_bail, polars_err, PolarsResult}; +use polars_expr::prelude::PhysicalExpr; +use polars_io::cloud::CloudOptions; +use polars_io::predicates::PhysicalIoExpr; +use polars_io::prelude::{FileMetaData, ParquetOptions}; +use polars_io::utils::byte_source::{ + ByteSource, DynByteSource, DynByteSourceBuilder, MemSliceByteSource, +}; +use polars_io::utils::slice::SplitSlicePosition; +use polars_io::{is_cloud_url, RowIndex}; +use polars_parquet::read::RowGroupMetaData; +use polars_plan::plans::hive::HivePartitions; +use polars_plan::plans::FileInfo; +use polars_plan::prelude::FileScanOptions; +use polars_utils::aliases::PlHashSet; +use polars_utils::mmap::MemSlice; +use polars_utils::slice::GetSaferUnchecked; +use polars_utils::IdxSize; + +use super::{MorselSeq, TaskPriority}; +use crate::async_executor::{self}; +use crate::async_primitives::connector::connector; +use crate::async_primitives::wait_group::{WaitGroup, WaitToken}; +use crate::morsel::get_ideal_morsel_size; +use crate::utils::notify_channel::{notify_channel, NotifyReceiver}; +use crate::utils::task_handles_ext; + +type AsyncTaskData = Option<( + Vec>, + async_executor::AbortOnDropHandle>, +)>; + +#[allow(clippy::type_complexity)] +pub struct ParquetSourceNode { + paths: Arc>, + file_info: FileInfo, + hive_parts: Option>>, + predicate: Option>, + options: ParquetOptions, + cloud_options: Option, + file_options: FileScanOptions, + // Run-time vars + config: Config, + verbose: bool, + physical_predicate: Option>, + projected_arrow_fields: Arc<[polars_core::prelude::ArrowField]>, + byte_source_builder: DynByteSourceBuilder, + memory_prefetch_func: fn(&[u8]) -> (), + // This permit blocks execution until the first morsel is requested. + morsel_stream_starter: Option>, + // This is behind a Mutex so that we can call `shutdown()` asynchronously. 
+ async_task_data: Arc>, + row_group_decoder: Option>, + is_finished: Arc, +} + +#[allow(clippy::too_many_arguments)] +impl ParquetSourceNode { + pub fn new( + paths: Arc>, + file_info: FileInfo, + hive_parts: Option>>, + predicate: Option>, + options: ParquetOptions, + cloud_options: Option, + file_options: FileScanOptions, + ) -> Self { + let verbose = config::verbose(); + + let byte_source_builder = + if is_cloud_url(paths[0].to_str().unwrap()) || config::force_async() { + DynByteSourceBuilder::ObjectStore + } else { + DynByteSourceBuilder::Mmap + }; + let memory_prefetch_func = get_memory_prefetch_func(verbose); + + Self { + paths, + file_info, + hive_parts, + predicate, + options, + cloud_options, + file_options, + + config: Config { + // Initialized later + num_pipelines: 0, + metadata_prefetch_size: 0, + metadata_decode_ahead_size: 0, + row_group_prefetch_size: 0, + }, + verbose, + physical_predicate: None, + projected_arrow_fields: Arc::new([]), + byte_source_builder, + memory_prefetch_func, + + morsel_stream_starter: None, + async_task_data: Arc::new(tokio::sync::Mutex::new(None)), + row_group_decoder: None, + is_finished: Arc::new(AtomicBool::new(false)), + } + } +} + +mod compute_node_impl { + + use std::sync::Arc; + + use polars_expr::prelude::phys_expr_to_io_expr; + + use super::super::compute_node_prelude::*; + use super::{Config, ParquetSourceNode}; + use crate::morsel::SourceToken; + + impl ComputeNode for ParquetSourceNode { + fn name(&self) -> &str { + "parquet_source" + } + + fn initialize(&mut self, num_pipelines: usize) { + self.config = { + let metadata_prefetch_size = polars_core::config::get_file_prefetch_size(); + // Limit metadata decode to the number of threads. + let metadata_decode_ahead_size = + (metadata_prefetch_size / 2).min(1 + num_pipelines).max(1); + let row_group_prefetch_size = polars_core::config::get_rg_prefetch_size(); + + Config { + num_pipelines, + metadata_prefetch_size, + metadata_decode_ahead_size, + row_group_prefetch_size, + } + }; + + if self.verbose { + eprintln!("[ParquetSource]: {:?}", &self.config); + } + + self.init_projected_arrow_fields(); + self.physical_predicate = self.predicate.clone().map(phys_expr_to_io_expr); + + let (raw_morsel_receivers, morsel_stream_task_handle) = self.init_raw_morsel_stream(); + + self.async_task_data + .try_lock() + .unwrap() + .replace((raw_morsel_receivers, morsel_stream_task_handle)); + + let row_group_decoder = self.init_row_group_decoder(); + self.row_group_decoder = Some(Arc::new(row_group_decoder)); + } + + fn update_state( + &mut self, + recv: &mut [PortState], + send: &mut [PortState], + ) -> PolarsResult<()> { + use std::sync::atomic::Ordering; + + assert!(recv.is_empty()); + assert_eq!(send.len(), 1); + + if self.is_finished.load(Ordering::Relaxed) { + send[0] = PortState::Done; + assert!( + self.async_task_data.try_lock().unwrap().is_none(), + "should have already been shut down" + ); + } else if send[0] == PortState::Done { + { + // Early shutdown - our port state was set to `Done` by the downstream nodes. 
+ self.shutdown_in_background(); + }; + self.is_finished.store(true, Ordering::Relaxed); + } else { + send[0] = PortState::Ready + } + + Ok(()) + } + + fn spawn<'env, 's>( + &'env mut self, + scope: &'s TaskScope<'s, 'env>, + recv: &mut [Option>], + send: &mut [Option>], + _state: &'s ExecutionState, + join_handles: &mut Vec>>, + ) { + use std::sync::atomic::Ordering; + + assert!(recv.is_empty()); + assert_eq!(send.len(), 1); + assert!(!self.is_finished.load(Ordering::Relaxed)); + + let morsel_senders = send[0].take().unwrap().parallel(); + + let mut async_task_data_guard = self.async_task_data.try_lock().unwrap(); + let (raw_morsel_receivers, _) = async_task_data_guard.as_mut().unwrap(); + + assert_eq!(raw_morsel_receivers.len(), morsel_senders.len()); + + if let Some(v) = self.morsel_stream_starter.take() { + v.send(()).unwrap(); + } + let is_finished = self.is_finished.clone(); + + let task_handles = raw_morsel_receivers + .drain(..) + .zip(morsel_senders) + .map(|(mut raw_morsel_rx, mut morsel_tx)| { + let is_finished = is_finished.clone(); + + scope.spawn_task(TaskPriority::Low, async move { + let source_token = SourceToken::new(); + loop { + let Ok((df, morsel_seq, wait_token)) = raw_morsel_rx.recv().await + else { + is_finished.store(true, Ordering::Relaxed); + break; + }; + + let mut morsel = Morsel::new(df, morsel_seq, source_token.clone()); + morsel.set_consume_token(wait_token); + + if morsel_tx.send(morsel).await.is_err() { + break; + } + + if source_token.stop_requested() { + break; + } + } + + raw_morsel_rx + }) + }) + .collect::>(); + + drop(async_task_data_guard); + + let async_task_data = self.async_task_data.clone(); + + join_handles.push(scope.spawn_task(TaskPriority::Low, async move { + { + let mut async_task_data_guard = async_task_data.try_lock().unwrap(); + let (raw_morsel_receivers, _) = async_task_data_guard.as_mut().unwrap(); + + for handle in task_handles { + raw_morsel_receivers.push(handle.await); + } + } + + if self.is_finished.load(Ordering::Relaxed) { + self.shutdown().await?; + } + + Ok(()) + })) + } + } +} + +impl ParquetSourceNode { + /// # Panics + /// Panics if called more than once. + async fn shutdown_impl( + async_task_data: Arc>, + verbose: bool, + ) -> PolarsResult<()> { + if verbose { + eprintln!("[ParquetSource]: Shutting down"); + } + + let (mut raw_morsel_receivers, morsel_stream_task_handle) = + async_task_data.try_lock().unwrap().take().unwrap(); + + raw_morsel_receivers.clear(); + // Join on the producer handle to catch errors/panics. + // Safety + // * We dropped the receivers on the line above + // * This function is only called once. + morsel_stream_task_handle.await + } + + fn shutdown(&self) -> impl Future> { + if self.verbose { + eprintln!("[ParquetSource]: Shutdown via `shutdown()`"); + } + Self::shutdown_impl(self.async_task_data.clone(), self.verbose) + } + + /// Spawns a task to shut down the source node to avoid blocking the current thread. This is + /// usually called when data is no longer needed from the source node, as such it does not + /// propagate any (non-critical) errors. If on the other hand the source node does not provide + /// more data when requested, then it is more suitable to call [`Self::shutdown`], as it returns + /// a result that can be used to distinguish between whether the data stream stopped due to an + /// error or EOF. 
+ fn shutdown_in_background(&self) { + if self.verbose { + eprintln!("[ParquetSource]: Shutdown via `shutdown_in_background()`"); + } + let async_task_data = self.async_task_data.clone(); + polars_io::pl_async::get_runtime() + .spawn(Self::shutdown_impl(async_task_data, self.verbose)); + } + + /// Constructs the task that provides a morsel stream. + #[allow(clippy::type_complexity)] + fn init_raw_morsel_stream( + &mut self, + ) -> ( + Vec>, + async_executor::AbortOnDropHandle>, + ) { + let verbose = self.verbose; + + let use_statistics = self.options.use_statistics; + + let (mut raw_morsel_senders, raw_morsel_receivers): (Vec<_>, Vec<_>) = + (0..self.config.num_pipelines).map(|_| connector()).unzip(); + + if let Some((_, 0)) = self.file_options.slice { + return ( + raw_morsel_receivers, + async_executor::AbortOnDropHandle::new(async_executor::spawn( + TaskPriority::Low, + std::future::ready(Ok(())), + )), + ); + } + + let reader_schema = self + .file_info + .reader_schema + .as_ref() + .unwrap() + .as_ref() + .unwrap_left() + .clone(); + + let (normalized_slice_oneshot_rx, metadata_rx, metadata_task_handle) = + self.init_metadata_fetcher(); + + let num_pipelines = self.config.num_pipelines; + let row_group_prefetch_size = self.config.row_group_prefetch_size; + let projection = self.file_options.with_columns.clone(); + assert_eq!(self.physical_predicate.is_some(), self.predicate.is_some()); + let predicate = self.physical_predicate.clone(); + let memory_prefetch_func = self.memory_prefetch_func; + let (start_tx, start_rx) = tokio::sync::oneshot::channel(); + self.morsel_stream_starter = Some(start_tx); + + let mut row_group_data_fetcher = RowGroupDataFetcher { + metadata_rx, + use_statistics, + verbose, + reader_schema, + projection, + predicate, + slice_range: None, // Initialized later + memory_prefetch_func, + current_path_index: 0, + current_byte_source: Default::default(), + current_row_groups: Default::default(), + current_row_group_idx: 0, + current_max_row_group_height: 0, + current_row_offset: 0, + current_shared_file_state: Default::default(), + }; + + let row_group_decoder = self.init_row_group_decoder(); + let row_group_decoder = Arc::new(row_group_decoder); + + // Processes row group metadata and spawns I/O tasks to fetch row group data. This is + // currently spawned onto the CPU runtime as it does not directly make any async I/O calls, + // but instead it potentially performs predicate/slice evaluation on metadata. If we observe + // that under heavy CPU load scenarios the I/O throughput drops due to this task not being + // scheduled we can change it to be a high priority task. + let morsel_stream_task_handle = async_executor::spawn(TaskPriority::Low, async move { + if start_rx.await.is_err() { + drop(row_group_data_fetcher); + return metadata_task_handle.await.unwrap(); + } + + if verbose { + eprintln!("[ParquetSource]: Starting row group data fetch") + } + + // We must `recv()` from the `NotifyReceiver` before awaiting on the + // `normalized_slice_oneshot_rx`, as in the negative offset case the slice resolution + // only runs after the first notify. + if !row_group_data_fetcher.init_next_file_state().await { + drop(row_group_data_fetcher); + return metadata_task_handle.await.unwrap(); + }; + + let slice_range = { + let Ok(slice) = normalized_slice_oneshot_rx.await else { + // If we are here then the producer probably errored. 
+ drop(row_group_data_fetcher); + return metadata_task_handle.await.unwrap(); + }; + + slice.map(|(offset, len)| offset..offset + len) + }; + + row_group_data_fetcher.slice_range = slice_range; + + // Pins a wait group to a channel index. + struct IndexedWaitGroup { + index: usize, + wait_group: WaitGroup, + } + + impl IndexedWaitGroup { + async fn wait(self) -> Self { + self.wait_group.wait().await; + self + } + } + + // Ensure proper backpressure by only polling the buffered iterator when a wait group + // is free. + let mut wait_groups = (0..num_pipelines) + .map(|index| { + let wait_group = WaitGroup::default(); + { + let _prime_this_wait_group = wait_group.token(); + } + IndexedWaitGroup { + index, + wait_group: WaitGroup::default(), + } + .wait() + }) + .collect::>(); + + let mut df_stream = row_group_data_fetcher + .into_stream() + .map(|x| async { + match x { + Ok(handle) => handle.await, + Err(e) => Err(e), + } + }) + .buffered(row_group_prefetch_size) + .map(|x| async { + let row_group_decoder = row_group_decoder.clone(); + + match x { + Ok(row_group_data) => { + async_executor::spawn(TaskPriority::Low, async move { + row_group_decoder.row_group_data_to_df(row_group_data).await + }) + .await + }, + Err(e) => Err(e), + } + }) + .buffered( + // Because we are using an ordered buffer, we may suffer from head-of-line blocking, + // so we add a small amount of buffer. + num_pipelines + 4, + ); + + let morsel_seq_ref = &mut MorselSeq::default(); + let mut dfs = vec![].into_iter(); + + 'main: loop { + let Some(mut indexed_wait_group) = wait_groups.next().await else { + break; + }; + + if dfs.len() == 0 { + let Some(v) = df_stream.next().await else { + break; + }; + + let v = v?; + assert!(!v.is_empty()); + + dfs = v.into_iter(); + } + + let mut df = dfs.next().unwrap(); + let morsel_seq = *morsel_seq_ref; + *morsel_seq_ref = morsel_seq.successor(); + + loop { + use crate::async_primitives::connector::SendError; + + let channel_index = indexed_wait_group.index; + let wait_token = indexed_wait_group.wait_group.token(); + + match raw_morsel_senders[channel_index].try_send((df, morsel_seq, wait_token)) { + Ok(_) => { + wait_groups.push(indexed_wait_group.wait()); + break; + }, + Err(SendError::Closed(v)) => { + // The port assigned to this wait group has been closed, so we will not + // add it back to the list of wait groups, and we will try to send this + // across another port. + df = v.0 + }, + Err(SendError::Full(_)) => unreachable!(), + } + + let Some(v) = wait_groups.next().await else { + // All ports have closed + break 'main; + }; + + indexed_wait_group = v; + } + } + + // Join on the producer handle to catch errors/panics. + drop(df_stream); + metadata_task_handle.await.unwrap() + }); + + let morsel_stream_task_handle = + async_executor::AbortOnDropHandle::new(morsel_stream_task_handle); + + (raw_morsel_receivers, morsel_stream_task_handle) + } + + /// Constructs the task that fetches file metadata. + /// Note: This must be called AFTER `self.projected_arrow_fields` has been initialized. + /// + /// TODO: During IR conversion the metadata of the first file is already downloaded - see if + /// we can find a way to re-use it. 
+ #[allow(clippy::type_complexity)] + fn init_metadata_fetcher( + &self, + ) -> ( + tokio::sync::oneshot::Receiver>, + NotifyReceiver<(usize, usize, Arc, FileMetaData, usize)>, + task_handles_ext::AbortOnDropHandle>, + ) { + let verbose = self.verbose; + let io_runtime = polars_io::pl_async::get_runtime(); + + assert!( + !self.projected_arrow_fields.is_empty() + || self.file_options.with_columns.as_deref() == Some(&[]) + ); + let projected_arrow_fields = self.projected_arrow_fields.clone(); + let needs_max_row_group_height_calc = + self.file_options.include_file_paths.is_some() || self.hive_parts.is_some(); + + let (normalized_slice_oneshot_tx, normalized_slice_oneshot_rx) = + tokio::sync::oneshot::channel(); + let (metadata_tx, mut metadata_notify_rx, metadata_rx) = notify_channel(); + + let byte_source_builder = self.byte_source_builder.clone(); + + if self.verbose { + eprintln!( + "[ParquetSource]: Byte source builder: {:?}", + &byte_source_builder + ); + } + + let fetch_metadata_bytes_for_path_index = { + let paths = &self.paths; + let cloud_options = Arc::new(self.cloud_options.clone()); + + let paths = paths.clone(); + let cloud_options = cloud_options.clone(); + let byte_source_builder = byte_source_builder.clone(); + + move |path_idx: usize| { + let paths = paths.clone(); + let cloud_options = cloud_options.clone(); + let byte_source_builder = byte_source_builder.clone(); + + let handle = io_runtime.spawn(async move { + let mut byte_source = Arc::new( + byte_source_builder + .try_build_from_path( + paths[path_idx].to_str().unwrap(), + cloud_options.as_ref().as_ref(), + ) + .await?, + ); + let (metadata_bytes, maybe_full_bytes) = + read_parquet_metadata_bytes(byte_source.as_ref(), verbose).await?; + + if let Some(v) = maybe_full_bytes { + if !matches!(byte_source.as_ref(), DynByteSource::MemSlice(_)) { + if verbose { + eprintln!( + "[ParquetSource]: Parquet file was fully fetched during \ + metadata read ({} bytes).", + v.len(), + ); + } + + byte_source = Arc::new(DynByteSource::from(MemSliceByteSource(v))) + } + } + + PolarsResult::Ok((path_idx, byte_source, metadata_bytes)) + }); + + let handle = task_handles_ext::AbortOnDropHandle(handle); + + std::future::ready(handle) + } + }; + + let process_metadata_bytes = { + move |handle: task_handles_ext::AbortOnDropHandle< + PolarsResult<(usize, Arc, MemSlice)>, + >| { + let projected_arrow_fields = projected_arrow_fields.clone(); + // Run on CPU runtime - metadata deserialization is expensive, especially + // for very wide tables. 
+ let handle = async_executor::spawn(TaskPriority::Low, async move { + let (path_index, byte_source, metadata_bytes) = handle.await.unwrap()?; + + let metadata = polars_parquet::parquet::read::deserialize_metadata( + metadata_bytes.as_ref(), + metadata_bytes.len() * 2 + 1024, + )?; + + ensure_metadata_has_projected_fields( + projected_arrow_fields.as_ref(), + &metadata, + )?; + + let file_max_row_group_height = if needs_max_row_group_height_calc { + metadata + .row_groups + .iter() + .map(|x| x.num_rows()) + .max() + .unwrap_or(0) + } else { + 0 + }; + + PolarsResult::Ok((path_index, byte_source, metadata, file_max_row_group_height)) + }); + + async_executor::AbortOnDropHandle::new(handle) + } + }; + + let metadata_prefetch_size = self.config.metadata_prefetch_size; + let metadata_decode_ahead_size = self.config.metadata_decode_ahead_size; + + let metadata_task_handle = if self + .file_options + .slice + .map(|(offset, _)| offset >= 0) + .unwrap_or(true) + { + normalized_slice_oneshot_tx + .send( + self.file_options + .slice + .map(|(offset, len)| (offset as usize, len)), + ) + .unwrap(); + + // Safety: `offset + len` does not overflow. + let slice_range = self + .file_options + .slice + .map(|(offset, len)| offset as usize..offset as usize + len); + + let mut metadata_stream = futures::stream::iter(0..self.paths.len()) + .map(fetch_metadata_bytes_for_path_index) + .buffered(metadata_prefetch_size) + .map(process_metadata_bytes) + .buffered(metadata_decode_ahead_size); + + let paths = self.paths.clone(); + + // We need to be able to both stop early as well as skip values, which is easier to do + // using a custom task instead of futures::stream + io_runtime.spawn(async move { + let current_row_offset_ref = &mut 0usize; + let current_path_index_ref = &mut 0usize; + + 'main: while metadata_notify_rx.recv().await.is_some() { + loop { + let current_path_index = *current_path_index_ref; + *current_path_index_ref += 1; + + let Some(v) = metadata_stream.next().await else { + break 'main; + }; + + let (path_index, byte_source, metadata, file_max_row_group_height) = v + .map_err(|err| { + err.wrap_msg(|msg| { + format!( + "error at path (index: {}, path: {}): {}", + current_path_index, + paths[current_path_index].to_str().unwrap(), + msg + ) + }) + })?; + + assert_eq!(path_index, current_path_index); + + let current_row_offset = *current_row_offset_ref; + *current_row_offset_ref = + current_row_offset.saturating_add(metadata.num_rows); + + if let Some(slice_range) = slice_range.clone() { + match SplitSlicePosition::split_slice_at_file( + current_row_offset, + metadata.num_rows, + slice_range, + ) { + SplitSlicePosition::Before => { + if verbose { + eprintln!( + "[ParquetSource]: Slice pushdown: \ + Skipped file at index {} ({} rows)", + current_path_index, metadata.num_rows + ); + } + continue; + }, + SplitSlicePosition::After => unreachable!(), + SplitSlicePosition::Overlapping(..) 
=> {}, + }; + }; + + { + use tokio::sync::mpsc::error::*; + match metadata_tx.try_send(( + path_index, + current_row_offset, + byte_source, + metadata, + file_max_row_group_height, + )) { + Err(TrySendError::Closed(_)) => break 'main, + Ok(_) => {}, + Err(TrySendError::Full(_)) => unreachable!(), + } + } + + if let Some(slice_range) = slice_range.as_ref() { + if *current_row_offset_ref >= slice_range.end { + if verbose { + eprintln!( + "[ParquetSource]: Slice pushdown: \ + Stopped reading at file at index {} \ + (remaining {} files will not be read)", + current_path_index, + paths.len() - current_path_index - 1, + ); + } + break 'main; + } + }; + + break; + } + } + + Ok(()) + }) + } else { + // Walk the files in reverse to translate the slice into a positive offset. + let slice = self.file_options.slice.unwrap(); + let slice_start_as_n_from_end = -slice.0 as usize; + + let mut metadata_stream = futures::stream::iter((0..self.paths.len()).rev()) + .map(fetch_metadata_bytes_for_path_index) + .buffered(metadata_prefetch_size) + .map(process_metadata_bytes) + .buffered(metadata_decode_ahead_size); + + // Note: + // * We want to wait until the first morsel is requested before starting this + let init_negative_slice_and_metadata = async move { + let mut processed_metadata_rev = vec![]; + let mut cum_rows = 0; + + while let Some(v) = metadata_stream.next().await { + let v = v?; + let (_, _, metadata, _) = &v; + cum_rows += metadata.num_rows; + processed_metadata_rev.push(v); + + if cum_rows >= slice_start_as_n_from_end { + break; + } + } + + let (start, len) = if slice_start_as_n_from_end > cum_rows { + // We need to trim the slice, e.g. SLICE[offset: -100, len: 75] on a file of 50 + // rows should only give the first 25 rows. + let first_file_position = slice_start_as_n_from_end - cum_rows; + (0, slice.1.saturating_sub(first_file_position)) + } else { + (cum_rows - slice_start_as_n_from_end, slice.1) + }; + + if len == 0 { + processed_metadata_rev.clear(); + } + + normalized_slice_oneshot_tx + .send(Some((start, len))) + .unwrap(); + + let slice_range = start..(start + len); + + PolarsResult::Ok((slice_range, processed_metadata_rev, cum_rows)) + }; + + let path_count = self.paths.len(); + + io_runtime.spawn(async move { + // Wait for the first morsel request before we call `init_negative_slice_and_metadata` + // This also means the receiver must `recv()` once before awaiting on the + // `normalized_slice_oneshot_rx` to avoid hanging. + if metadata_notify_rx.recv().await.is_none() { + return Ok(()); + } + + let (slice_range, processed_metadata_rev, cum_rows) = + async_executor::AbortOnDropHandle::new(async_executor::spawn( + TaskPriority::Low, + init_negative_slice_and_metadata, + )) + .await?; + + if verbose { + if let Some((path_index, ..)) = processed_metadata_rev.last() { + eprintln!( + "[ParquetSource]: Slice pushdown: Negatively-offsetted slice {:?} \ + begins at file index {}, translated to {:?}", + slice, path_index, slice_range + ); + } else { + eprintln!( + "[ParquetSource]: Slice pushdown: Negatively-offsetted slice {:?} \ + skipped all files ({} files containing {} rows)", + slice, path_count, cum_rows + ) + } + } + + let mut metadata_iter = processed_metadata_rev.into_iter().rev(); + let current_row_offset_ref = &mut 0usize; + + // do-while: We already consumed a notify above. 
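The do-while shape described above (the first notification is consumed by the caller, then each iteration emits one item and only continues once another notification arrives) can be sketched with a plain channel; the channel of `()` stands in for the morsel-request notifications and is purely illustrative:

use std::sync::mpsc;

fn main() {
    let (tx, rx) = mpsc::channel();
    for _ in 0..3 {
        tx.send(()).unwrap(); // three "morsel requested" notifications
    }
    drop(tx);

    // The caller already consumed the first notification before entering.
    let _ = rx.recv();

    let mut emitted = 0;
    loop {
        // Loop body: emit the next item.
        emitted += 1;

        // do-while: only continue once another notification arrives.
        if rx.recv().is_err() {
            break;
        }
    }
    assert_eq!(emitted, 3);
}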
+ loop { + let Some(( + current_path_index, + byte_source, + metadata, + file_max_row_group_height, + )) = metadata_iter.next() + else { + break; + }; + + let current_row_offset = *current_row_offset_ref; + *current_row_offset_ref = current_row_offset.saturating_add(metadata.num_rows); + + assert!(matches!( + SplitSlicePosition::split_slice_at_file( + current_row_offset, + metadata.num_rows, + slice_range.clone(), + ), + SplitSlicePosition::Overlapping(..) + )); + + { + use tokio::sync::mpsc::error::*; + match metadata_tx.try_send(( + current_path_index, + current_row_offset, + byte_source, + metadata, + file_max_row_group_height, + )) { + Err(TrySendError::Closed(_)) => break, + Ok(v) => v, + Err(TrySendError::Full(_)) => unreachable!(), + } + } + + if *current_row_offset_ref >= slice_range.end { + if verbose { + eprintln!( + "[ParquetSource]: Slice pushdown: \ + Stopped reading at file at index {} \ + (remaining {} files will not be read)", + current_path_index, + path_count - current_path_index - 1, + ); + } + break; + } + + if metadata_notify_rx.recv().await.is_none() { + break; + } + } + + Ok(()) + }) + }; + + let metadata_task_handle = task_handles_ext::AbortOnDropHandle(metadata_task_handle); + + ( + normalized_slice_oneshot_rx, + metadata_rx, + metadata_task_handle, + ) + } + + /// Creates a `RowGroupDecoder` that turns `RowGroupData` into DataFrames. + /// This must be called AFTER the following have been initialized: + /// * `self.projected_arrow_fields` + /// * `self.physical_predicate` + fn init_row_group_decoder(&self) -> RowGroupDecoder { + assert!( + !self.projected_arrow_fields.is_empty() + || self.file_options.with_columns.as_deref() == Some(&[]) + ); + assert_eq!(self.predicate.is_some(), self.physical_predicate.is_some()); + + let paths = self.paths.clone(); + let hive_partitions = self.hive_parts.clone(); + let hive_partitions_width = hive_partitions + .as_deref() + .map(|x| x[0].get_statistics().column_stats().len()) + .unwrap_or(0); + let include_file_paths = self.file_options.include_file_paths.clone(); + let projected_arrow_fields = self.projected_arrow_fields.clone(); + let row_index = self.file_options.row_index.clone(); + let physical_predicate = self.physical_predicate.clone(); + let ideal_morsel_size = get_ideal_morsel_size(); + + RowGroupDecoder { + paths, + hive_partitions, + hive_partitions_width, + include_file_paths, + projected_arrow_fields, + row_index, + physical_predicate, + ideal_morsel_size, + } + } + + fn init_projected_arrow_fields(&mut self) { + let reader_schema = self + .file_info + .reader_schema + .as_ref() + .unwrap() + .as_ref() + .unwrap_left() + .clone(); + + self.projected_arrow_fields = + if let Some(columns) = self.file_options.with_columns.as_deref() { + columns + .iter() + .map(|x| { + // `index_of` on ArrowSchema is slow, so we use the polars native Schema, + // but we need to remember to subtact the row index. 
+ let pos = self.file_info.schema.index_of(x.as_str()).unwrap() + - (self.file_options.row_index.is_some() as usize); + reader_schema.fields[pos].clone() + }) + .collect() + } else { + Arc::from(reader_schema.fields.as_slice()) + }; + + if self.verbose { + eprintln!( + "[ParquetSource]: {} columns to be projected from {} files", + self.projected_arrow_fields.len(), + self.paths.len(), + ); + } + } +} + +#[derive(Debug)] +struct Config { + num_pipelines: usize, + /// Number of files to pre-fetch metadata for concurrently + metadata_prefetch_size: usize, + /// Number of files to decode metadata for in parallel in advance + metadata_decode_ahead_size: usize, + /// Number of row groups to pre-fetch concurrently, this can be across files + row_group_prefetch_size: usize, +} + +/// Represents byte-data that can be transformed into a DataFrame after some computation. +struct RowGroupData { + byte_source: FetchedBytes, + path_index: usize, + row_offset: usize, + slice: Option<(usize, usize)>, + file_max_row_group_height: usize, + row_group_metadata: RowGroupMetaData, + shared_file_state: Arc>, +} + +struct RowGroupDataFetcher { + metadata_rx: NotifyReceiver<(usize, usize, Arc, FileMetaData, usize)>, + use_statistics: bool, + verbose: bool, + reader_schema: Arc, + projection: Option>, + predicate: Option>, + slice_range: Option>, + memory_prefetch_func: fn(&[u8]) -> (), + current_path_index: usize, + current_byte_source: Arc, + current_row_groups: std::vec::IntoIter, + current_row_group_idx: usize, + current_max_row_group_height: usize, + current_row_offset: usize, + current_shared_file_state: Arc>, +} + +fn read_this_row_group( + rg_md: &RowGroupMetaData, + predicate: Option<&dyn PhysicalIoExpr>, + reader_schema: &ArrowSchema, +) -> PolarsResult { + let Some(pred) = predicate else { + return Ok(true); + }; + use polars_io::prelude::_internal::*; + // TODO! + // Optimize this. Now we partition the predicate columns twice. (later on reading as well) + // I think we must add metadata context where we can cache and amortize the partitioning. + let mut part_md = PartitionedColumnChunkMD::new(rg_md); + let live = pred.live_variables(); + part_md.set_partitions( + live.as_ref() + .map(|vars| vars.iter().map(|s| s.as_ref()).collect::>()) + .as_ref(), + ); + read_this_row_group(Some(pred), &part_md, reader_schema) +} + +impl RowGroupDataFetcher { + fn into_stream(self) -> RowGroupDataStream { + RowGroupDataStream::new(self) + } + + async fn init_next_file_state(&mut self) -> bool { + let Some((path_index, row_offset, byte_source, metadata, file_max_row_group_height)) = + self.metadata_rx.recv().await + else { + return false; + }; + + self.current_path_index = path_index; + self.current_byte_source = byte_source; + self.current_max_row_group_height = file_max_row_group_height; + // The metadata task also sends a row offset to start counting from as it may skip files + // during slice pushdown. 
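A self-contained sketch of the slice-splitting logic relied on throughout this file: given a file's (or row group's) starting row offset and height, classify a global slice as lying before it, after it, or overlapping it. This is modelled on how `SplitSlicePosition::split_slice_at_file` is used here; the enum and function below are illustrative and the real helper may handle edge cases differently:

use std::ops::Range;

#[derive(Debug, PartialEq)]
enum SlicePos {
    Before,                    // the file lies entirely before the slice
    Overlapping(usize, usize), // (offset within the file, length)
    After,                     // the file lies entirely after the slice
}

fn split_slice_at_file(file_row_offset: usize, file_n_rows: usize, slice: Range<usize>) -> SlicePos {
    let file_end = file_row_offset + file_n_rows;
    if file_end <= slice.start {
        SlicePos::Before
    } else if file_row_offset >= slice.end {
        SlicePos::After
    } else {
        let start = slice.start.max(file_row_offset);
        let end = slice.end.min(file_end);
        SlicePos::Overlapping(start - file_row_offset, end - start)
    }
}

fn main() {
    // Three files of 100 rows each, global slice covering rows 150..250.
    assert_eq!(split_slice_at_file(0, 100, 150..250), SlicePos::Before);
    assert_eq!(split_slice_at_file(100, 100, 150..250), SlicePos::Overlapping(50, 50));
    assert_eq!(split_slice_at_file(300, 100, 150..250), SlicePos::After);
}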
+ self.current_row_offset = row_offset; + self.current_row_group_idx = 0; + self.current_row_groups = metadata.row_groups.into_iter(); + self.current_shared_file_state = Default::default(); + + true + } + + async fn next( + &mut self, + ) -> Option>>> { + 'main: loop { + for row_group_metadata in self.current_row_groups.by_ref() { + let current_row_offset = self.current_row_offset; + let current_row_group_idx = self.current_row_group_idx; + + let num_rows = row_group_metadata.num_rows(); + + self.current_row_offset = current_row_offset.saturating_add(num_rows); + self.current_row_group_idx += 1; + + if self.use_statistics + && !match read_this_row_group( + &row_group_metadata, + self.predicate.as_deref(), + self.reader_schema.as_ref(), + ) { + Ok(v) => v, + Err(e) => return Some(Err(e)), + } + { + if self.verbose { + eprintln!( + "[ParquetSource]: Predicate pushdown: \ + Skipped row group {} in file {} ({} rows)", + current_row_group_idx, self.current_path_index, num_rows + ); + } + continue; + } + + if num_rows > IdxSize::MAX as usize { + let msg = operation_exceeded_idxsize_msg( + format!("number of rows in row group ({})", num_rows).as_str(), + ); + return Some(Err(polars_err!(ComputeError: msg))); + } + + let slice = if let Some(slice_range) = self.slice_range.clone() { + let (offset, len) = match SplitSlicePosition::split_slice_at_file( + current_row_offset, + num_rows, + slice_range, + ) { + SplitSlicePosition::Before => { + if self.verbose { + eprintln!( + "[ParquetSource]: Slice pushdown: \ + Skipped row group {} in file {} ({} rows)", + current_row_group_idx, self.current_path_index, num_rows + ); + } + continue; + }, + SplitSlicePosition::After => { + if self.verbose { + eprintln!( + "[ParquetSource]: Slice pushdown: \ + Stop at row group {} in file {} \ + (remaining {} row groups will not be read)", + current_row_group_idx, + self.current_path_index, + self.current_row_groups.len(), + ); + }; + break 'main; + }, + SplitSlicePosition::Overlapping(offset, len) => (offset, len), + }; + + Some((offset, len)) + } else { + None + }; + + let current_byte_source = self.current_byte_source.clone(); + let projection = self.projection.clone(); + let current_shared_file_state = self.current_shared_file_state.clone(); + let memory_prefetch_func = self.memory_prefetch_func; + let io_runtime = polars_io::pl_async::get_runtime(); + let current_path_index = self.current_path_index; + let current_max_row_group_height = self.current_max_row_group_height; + + // Push calculation of byte ranges to a task to run in parallel, as it can be + // expensive for very wide tables and projections. + let handle = async_executor::spawn(TaskPriority::Low, async move { + let byte_source = if let DynByteSource::MemSlice(mem_slice) = + current_byte_source.as_ref() + { + // Skip byte range calculation for `no_prefetch`. 
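In the prefetch and full-projection paths below, the per-column byte ranges of a row group are folded into a single covering range so that one read (or one prefetch) spans the whole row group. A standalone version of that fold; `covering_range` is an illustrative name, not an existing helper:

use std::ops::Range;

// Collapse per-column byte ranges into one covering range, as done below
// before prefetching / fetching an entire row group in a single request.
fn covering_range(mut ranges: impl Iterator<Item = Range<usize>>) -> Option<Range<usize>> {
    let first = ranges.next()?;
    Some(ranges.fold(first, |l, r| l.start.min(r.start)..l.end.max(r.end)))
}

fn main() {
    let ranges = [4..100, 100..250, 250..300].into_iter();
    assert_eq!(covering_range(ranges), Some(4..300));
    assert_eq!(covering_range(std::iter::empty::<Range<usize>>()), None);
}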
+ if memory_prefetch_func as usize != mem_prefetch_funcs::no_prefetch as usize + { + let slice = mem_slice.0.as_ref(); + + if let Some(columns) = projection.as_ref() { + for range in get_row_group_byte_ranges_for_projection( + &row_group_metadata, + columns.as_ref(), + ) { + memory_prefetch_func(unsafe { + slice.get_unchecked_release(range) + }) + } + } else { + let mut iter = get_row_group_byte_ranges(&row_group_metadata); + let first = iter.next().unwrap(); + let range = + iter.fold(first, |l, r| l.start.min(r.start)..l.end.max(r.end)); + + memory_prefetch_func(unsafe { slice.get_unchecked_release(range) }) + }; + } + + // We have a mmapped or in-memory slice representing the entire + // file that can be sliced directly, so we can skip the byte-range + // calculations and HashMap allocation. + let mem_slice = mem_slice.0.clone(); + FetchedBytes::MemSlice { + offset: 0, + mem_slice, + } + } else if let Some(columns) = projection.as_ref() { + let ranges = get_row_group_byte_ranges_for_projection( + &row_group_metadata, + columns.as_ref(), + ) + .collect::>(); + + let bytes = { + let ranges_2 = ranges.clone(); + task_handles_ext::AbortOnDropHandle(io_runtime.spawn(async move { + current_byte_source.get_ranges(ranges_2.as_ref()).await + })) + .await + .unwrap()? + }; + + assert_eq!(bytes.len(), ranges.len()); + + let mut bytes_map = PlHashMap::with_capacity(ranges.len()); + + for (range, bytes) in ranges.iter().zip(bytes) { + memory_prefetch_func(bytes.as_ref()); + let v = bytes_map.insert(range.start, bytes); + debug_assert!(v.is_none(), "duplicate range start {}", range.start); + } + + FetchedBytes::BytesMap(bytes_map) + } else { + // We have a dedicated code-path for a full projection that performs a + // single range request for the entire row group. During testing this + // provided much higher throughput from cloud than making multiple range + // request with `get_ranges()`. + let mut iter = get_row_group_byte_ranges(&row_group_metadata); + let mut ranges = Vec::with_capacity(iter.len()); + let first = iter.next().unwrap(); + ranges.push(first.clone()); + let full_range = iter.fold(first, |l, r| { + ranges.push(r.clone()); + l.start.min(r.start)..l.end.max(r.end) + }); + + let mem_slice = { + let full_range_2 = full_range.clone(); + task_handles_ext::AbortOnDropHandle(io_runtime.spawn(async move { + current_byte_source.get_range(full_range_2).await + })) + .await + .unwrap()? + }; + + FetchedBytes::MemSlice { + offset: full_range.start, + mem_slice, + } + }; + + PolarsResult::Ok(RowGroupData { + byte_source, + path_index: current_path_index, + row_offset: current_row_offset, + slice, + file_max_row_group_height: current_max_row_group_height, + row_group_metadata, + shared_file_state: current_shared_file_state.clone(), + }) + }); + + let handle = async_executor::AbortOnDropHandle::new(handle); + return Some(Ok(handle)); + } + + // Initialize state to the next file. 
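The comment above refers to the overall shape of `next()`: an inner loop over the current file's row groups, with an outer step that advances to the next file when the inner iterator is exhausted. A toy version of that state machine; the `Fetcher` type and string row groups are purely illustrative:

// Each "file" is reduced to a list of named row groups.
struct Fetcher {
    files: std::vec::IntoIter<Vec<&'static str>>,
    current: std::vec::IntoIter<&'static str>,
}

impl Fetcher {
    fn next_row_group(&mut self) -> Option<&'static str> {
        loop {
            // Inner loop: drain the current file's row groups.
            if let Some(rg) = self.current.next() {
                return Some(rg);
            }
            // Initialize state to the next file, or stop when none are left.
            self.current = self.files.next()?.into_iter();
        }
    }
}

fn main() {
    let files = vec![vec!["f0/rg0", "f0/rg1"], vec!["f1/rg0"]];
    let mut fetcher = Fetcher {
        files: files.into_iter(),
        current: Vec::new().into_iter(),
    };
    let mut seen = Vec::new();
    while let Some(rg) = fetcher.next_row_group() {
        seen.push(rg);
    }
    assert_eq!(seen, ["f0/rg0", "f0/rg1", "f1/rg0"]);
}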
+ if !self.init_next_file_state().await { + break; + } + } + + None + } +} + +enum FetchedBytes { + MemSlice { mem_slice: MemSlice, offset: usize }, + BytesMap(PlHashMap), +} + +impl FetchedBytes { + fn get_range(&self, range: std::ops::Range) -> MemSlice { + match self { + Self::MemSlice { mem_slice, offset } => { + let offset = *offset; + debug_assert!(range.start >= offset); + mem_slice.slice(range.start - offset..range.end - offset) + }, + Self::BytesMap(v) => { + let v = v.get(&range.start).unwrap(); + debug_assert_eq!(v.len(), range.len()); + v.clone() + }, + } + } +} + +#[rustfmt::skip] +type RowGroupDataStreamFut = std::pin::Pin , + Option < + PolarsResult < + async_executor::AbortOnDropHandle < + PolarsResult < + RowGroupData > > > > + ) + > + Send +>>; + +struct RowGroupDataStream { + current_future: RowGroupDataStreamFut, +} + +impl RowGroupDataStream { + fn new(row_group_data_fetcher: RowGroupDataFetcher) -> Self { + // [`RowGroupDataFetcher`] is a big struct, so we Box it once here to avoid boxing it on + // every `next()` call. + let current_future = Self::call_next_owned(Box::new(row_group_data_fetcher)); + Self { current_future } + } + + fn call_next_owned( + mut row_group_data_fetcher: Box, + ) -> RowGroupDataStreamFut { + Box::pin(async move { + let out = row_group_data_fetcher.next().await; + (row_group_data_fetcher, out) + }) + } +} + +impl futures::stream::Stream for RowGroupDataStream { + type Item = PolarsResult>>; + + fn poll_next( + mut self: std::pin::Pin<&mut Self>, + cx: &mut std::task::Context<'_>, + ) -> std::task::Poll> { + use std::pin::Pin; + use std::task::Poll; + + match Pin::new(&mut self.current_future.as_mut()).poll(cx) { + Poll::Ready((row_group_data_fetcher, out)) => { + if out.is_some() { + self.current_future = Self::call_next_owned(row_group_data_fetcher); + } + + Poll::Ready(out) + }, + Poll::Pending => Poll::Pending, + } + } +} + +/// State shared across row groups for a single file. +struct SharedFileState { + path_index: usize, + hive_series: Vec, + file_path_series: Option, +} + +/// Turns row group data into DataFrames. +struct RowGroupDecoder { + paths: Arc>, + hive_partitions: Option>>, + hive_partitions_width: usize, + include_file_paths: Option>, + projected_arrow_fields: Arc<[polars_core::prelude::ArrowField]>, + row_index: Option, + physical_predicate: Option>, + ideal_morsel_size: usize, +} + +impl RowGroupDecoder { + async fn row_group_data_to_df( + &self, + row_group_data: RowGroupData, + ) -> PolarsResult> { + let row_group_data = Arc::new(row_group_data); + + let out_width = self.row_index.is_some() as usize + + self.projected_arrow_fields.len() + + self.hive_partitions_width + + self.include_file_paths.is_some() as usize; + + let mut out_columns = Vec::with_capacity(out_width); + + if self.row_index.is_some() { + // Add a placeholder so that we don't have to shift the entire vec + // later. + out_columns.push(Series::default()); + } + + let slice_range = row_group_data + .slice + .map(|(offset, len)| offset..offset + len) + .unwrap_or(0..row_group_data.row_group_metadata.num_rows()); + + let projected_arrow_fields = &self.projected_arrow_fields; + let projected_arrow_fields = projected_arrow_fields.clone(); + + let row_group_data_2 = row_group_data.clone(); + let slice_range_2 = slice_range.clone(); + + // Minimum number of values to amortize the overhead of spawning tasks. + // This value is arbitrarily chosen. 
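The constant defined just below sizes the per-task column chunks so that each spawned decode task touches roughly `VALUES_PER_THREAD` cells. A worked version of that arithmetic; the row counts are made up, and `n_rows` is assumed non-zero since it comes from row-group metadata:

const VALUES_PER_THREAD: usize = 16_777_216;

// Columns are decoded in chunks of `cols_per_task`, sized so each spawned
// task touches at least roughly VALUES_PER_THREAD cells.
fn cols_per_task(n_rows: usize) -> usize {
    1 + VALUES_PER_THREAD / n_rows
}

fn main() {
    // Tall row group: one column per task already amortizes the spawn cost.
    assert_eq!(cols_per_task(50_000_000), 1);
    // Short row group: many columns share a single task.
    assert_eq!(cols_per_task(1_000), 16_778);
}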
+ const VALUES_PER_THREAD: usize = 16_777_216; + let n_rows = row_group_data.row_group_metadata.num_rows(); + let cols_per_task = 1 + VALUES_PER_THREAD / n_rows; + + let decode_fut_iter = (0..self.projected_arrow_fields.len()) + .step_by(cols_per_task) + .map(move |offset| { + let row_group_data = row_group_data_2.clone(); + let slice_range = slice_range_2.clone(); + let projected_arrow_fields = projected_arrow_fields.clone(); + + async move { + (offset + ..offset + .saturating_add(cols_per_task) + .min(projected_arrow_fields.len())) + .map(|i| { + let arrow_field = projected_arrow_fields[i].clone(); + + let columns_to_deserialize = row_group_data + .row_group_metadata + .columns() + .iter() + .filter(|col_md| { + col_md.descriptor().path_in_schema[0] == arrow_field.name + }) + .map(|col_md| { + let (offset, len) = col_md.byte_range(); + let offset = offset as usize; + let len = len as usize; + + ( + col_md, + row_group_data.byte_source.get_range(offset..offset + len), + ) + }) + .collect::>(); + + assert!( + slice_range.end <= row_group_data.row_group_metadata.num_rows() + ); + + let array = polars_io::prelude::_internal::to_deserializer( + columns_to_deserialize, + arrow_field.clone(), + Some(polars_parquet::read::Filter::Range(slice_range.clone())), + )?; + + let series = Series::try_from((&arrow_field, array))?; + + // TODO: Also load in the metadata. + + PolarsResult::Ok(series) + }) + .collect::>>() + } + }); + + if decode_fut_iter.len() > 1 { + for handle in decode_fut_iter.map(|fut| { + async_executor::AbortOnDropHandle::new(async_executor::spawn( + TaskPriority::Low, + fut, + )) + }) { + out_columns.extend(handle.await?); + } + } else { + for fut in decode_fut_iter { + out_columns.extend(fut.await?); + } + } + + let projection_height = if self.projected_arrow_fields.is_empty() { + slice_range.len() + } else { + debug_assert!(out_columns.len() > self.row_index.is_some() as usize); + out_columns.last().unwrap().len() + }; + + if let Some(RowIndex { name, offset }) = self.row_index.as_ref() { + let Some(offset) = (|| { + let offset = offset + .checked_add((row_group_data.row_offset + slice_range.start) as IdxSize)?; + offset.checked_add(projection_height as IdxSize)?; + + Some(offset) + })() else { + let msg = format!( + "adding a row index column with offset {} overflows at {} rows", + offset, + row_group_data.row_offset + slice_range.end + ); + polars_bail!(ComputeError: msg) + }; + + // The DataFrame can be empty at this point if no columns were projected from the file, + // so we create the row index column manually instead of using `df.with_row_index` to + // ensure it has the correct number of rows. 
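A standalone version of the two steps around this point: validate with checked arithmetic that the row index stays representable, then materialize the ascending values directly so the column has the right length even when no file columns were projected. `IdxSize` is assumed to be `u32` here (the default build); the real code builds an `IdxCa` and sets its sorted flag rather than returning a `Vec`:

type IdxSize = u32; // default builds; bigidx builds use u64

// Validate that the row index fits in IdxSize, then materialize the ascending
// values directly so the column has `height` rows even with no file columns.
fn make_row_index(offset: IdxSize, row_offset: usize, height: usize) -> Option<Vec<IdxSize>> {
    let start = offset.checked_add(IdxSize::try_from(row_offset).ok()?)?;
    let end = start.checked_add(IdxSize::try_from(height).ok()?)?;
    Some((start..end).collect())
}

fn main() {
    assert_eq!(make_row_index(10, 5, 3), Some(vec![15, 16, 17]));
    assert_eq!(make_row_index(u32::MAX - 1, 5, 1), None); // would overflow IdxSize
}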
+ let mut ca = IdxCa::from_vec( + name, + (offset..offset + projection_height as IdxSize).collect(), + ); + ca.set_sorted_flag(IsSorted::Ascending); + + out_columns[0] = ca.into_series(); + } + + let shared_file_state = row_group_data + .shared_file_state + .get_or_init(|| async { + let path_index = row_group_data.path_index; + + let hive_series = if let Some(hp) = self.hive_partitions.as_deref() { + let mut v = hp[path_index].materialize_partition_columns(); + for s in v.iter_mut() { + *s = s.new_from_index(0, row_group_data.file_max_row_group_height); + } + v + } else { + vec![] + }; + + let file_path_series = self.include_file_paths.as_deref().map(|file_path_col| { + StringChunked::full( + file_path_col, + self.paths[path_index].to_str().unwrap(), + row_group_data.file_max_row_group_height, + ) + .into_series() + }); + + SharedFileState { + path_index, + hive_series, + file_path_series, + } + }) + .await; + + assert_eq!(shared_file_state.path_index, row_group_data.path_index); + + for s in &shared_file_state.hive_series { + debug_assert!(s.len() >= projection_height); + out_columns.push(s.slice(0, projection_height)); + } + + if let Some(file_path_series) = &shared_file_state.file_path_series { + debug_assert!(file_path_series.len() >= projection_height); + out_columns.push(file_path_series.slice(0, projection_height)); + } + + let df = unsafe { DataFrame::new_no_checks(out_columns) }; + + // Re-calculate: A slice may have been applied. + let cols_per_task = 1 + VALUES_PER_THREAD / df.height(); + + let df = if let Some(predicate) = self.physical_predicate.as_deref() { + let mask = predicate.evaluate_io(&df)?; + let mask = mask.bool().unwrap(); + + if cols_per_task <= df.width() { + df._filter_seq(mask)? + } else { + let mask = mask.clone(); + let cols = Arc::new(df.take_columns()); + let mut out_cols = Vec::with_capacity(cols.len()); + + for handle in (0..cols.len()) + .step_by(cols_per_task) + .map(move |offset| { + let cols = cols.clone(); + let mask = mask.clone(); + async move { + cols[offset..offset.saturating_add(cols_per_task).min(cols.len())] + .iter() + .map(|s| s.filter(&mask)) + .collect::>>() + } + }) + .map(|fut| { + async_executor::AbortOnDropHandle::new(async_executor::spawn( + TaskPriority::Low, + fut, + )) + }) + { + out_cols.extend(handle.await?); + } + + unsafe { DataFrame::new_no_checks(out_cols) } + } + } else { + df + }; + + assert_eq!(df.width(), out_width); + + let n_morsels = if df.height() > 3 * self.ideal_morsel_size / 2 { + // num_rows > (1.5 * ideal_morsel_size) + (df.height() / self.ideal_morsel_size).max(2) + } else { + 1 + } as u64; + + if n_morsels == 1 { + return Ok(vec![df]); + } + + let rows_per_morsel = 1 + df.height() / n_morsels as usize; + + let out = (0..i64::try_from(df.height()).unwrap()) + .step_by(rows_per_morsel) + .map(|offset| df.slice(offset, rows_per_morsel)) + .collect::>(); + + Ok(out) + } +} + +/// Read the metadata bytes of a parquet file, does not decode the bytes. If during metadata fetch +/// the bytes of the entire file are loaded, it is returned in the second return value. 
+async fn read_parquet_metadata_bytes( + byte_source: &DynByteSource, + verbose: bool, +) -> PolarsResult<(MemSlice, Option)> { + use polars_parquet::parquet::error::ParquetError; + use polars_parquet::parquet::PARQUET_MAGIC; + + const FOOTER_HEADER_SIZE: usize = polars_parquet::parquet::FOOTER_SIZE as usize; + + let file_size = byte_source.get_size().await?; + + if file_size < FOOTER_HEADER_SIZE { + return Err(ParquetError::OutOfSpec(format!( + "file size ({}) is less than minimum size required to store parquet footer ({})", + file_size, FOOTER_HEADER_SIZE + )) + .into()); + } + + let estimated_metadata_size = if let DynByteSource::MemSlice(_) = byte_source { + // Mmapped or in-memory, reads are free. + file_size + } else { + (file_size / 2048).clamp(16_384, 131_072).min(file_size) + }; + + let bytes = byte_source + .get_range((file_size - estimated_metadata_size)..file_size) + .await?; + + let footer_header_bytes = bytes.slice((bytes.len() - FOOTER_HEADER_SIZE)..bytes.len()); + + let (v, remaining) = footer_header_bytes.split_at(4); + let footer_size = i32::from_le_bytes(v.try_into().unwrap()); + + if remaining != PARQUET_MAGIC { + return Err(ParquetError::OutOfSpec(format!( + r#"expected parquet magic bytes "{}" in footer, got "{}" instead"#, + std::str::from_utf8(&PARQUET_MAGIC).unwrap(), + String::from_utf8_lossy(remaining) + )) + .into()); + } + + if footer_size < 0 { + return Err(ParquetError::OutOfSpec(format!( + "expected positive footer size, got {} instead", + footer_size + )) + .into()); + } + + let footer_size = footer_size as usize + FOOTER_HEADER_SIZE; + + if file_size < footer_size { + return Err(ParquetError::OutOfSpec(format!( + "file size ({}) is less than the indicated footer size ({})", + file_size, footer_size + )) + .into()); + } + + if bytes.len() < footer_size { + debug_assert!(!matches!(byte_source, DynByteSource::MemSlice(_))); + if verbose { + eprintln!( + "[ParquetSource]: Extra {} bytes need to be fetched for metadata \ + (initial estimate = {}, actual size = {})", + footer_size - estimated_metadata_size, + bytes.len(), + footer_size, + ); + } + + let mut out = Vec::with_capacity(footer_size); + let offset = file_size - footer_size; + let len = footer_size - bytes.len(); + let delta_bytes = byte_source.get_range(offset..(offset + len)).await?; + + debug_assert!(out.capacity() >= delta_bytes.len() + bytes.len()); + + out.extend_from_slice(&delta_bytes); + out.extend_from_slice(&bytes); + + Ok((MemSlice::from_vec(out), None)) + } else { + if verbose && !matches!(byte_source, DynByteSource::MemSlice(_)) { + eprintln!( + "[ParquetSource]: Fetched all bytes for metadata on first try \ + (initial estimate = {}, actual size = {}, excess = {})", + bytes.len(), + footer_size, + estimated_metadata_size - footer_size, + ); + } + + let metadata_bytes = bytes.slice((bytes.len() - footer_size)..bytes.len()); + + if bytes.len() == file_size { + Ok((metadata_bytes, Some(bytes))) + } else { + debug_assert!(!matches!(byte_source, DynByteSource::MemSlice(_))); + let metadata_bytes = if bytes.len() - footer_size >= bytes.len() { + // Re-allocate to drop the excess bytes + MemSlice::from_vec(metadata_bytes.to_vec()) + } else { + metadata_bytes + }; + + Ok((metadata_bytes, None)) + } + } +} + +fn get_row_group_byte_ranges( + row_group_metadata: &RowGroupMetaData, +) -> impl ExactSizeIterator> + '_ { + let row_group_columns = row_group_metadata.columns(); + + row_group_columns.iter().map(|rg_col_metadata| { + let (offset, len) = rg_col_metadata.byte_range(); + (offset as 
usize)..(offset + len) as usize + }) +} + +/// TODO: This is quadratic - incorporate https://github.com/pola-rs/polars/pull/18327 that is +/// merged. +fn get_row_group_byte_ranges_for_projection<'a>( + row_group_metadata: &'a RowGroupMetaData, + columns: &'a [String], +) -> impl Iterator> + 'a { + let row_group_columns = row_group_metadata.columns(); + + row_group_columns.iter().filter_map(move |rg_col_metadata| { + for col_name in columns { + if &rg_col_metadata.descriptor().path_in_schema[0] == col_name { + let (offset, len) = rg_col_metadata.byte_range(); + let range = (offset as usize)..((offset + len) as usize); + return Some(range); + } + } + None + }) +} + +/// Ensures that a parquet file has all the necessary columns for a projection with the correct +/// dtype. There are no ordering requirements and extra columns are permitted. +fn ensure_metadata_has_projected_fields( + projected_fields: &[polars_core::prelude::ArrowField], + metadata: &FileMetaData, +) -> PolarsResult<()> { + let schema = polars_parquet::arrow::read::infer_schema(metadata)?; + + // Note: We convert to Polars-native dtypes for timezone normalization. + let mut schema = schema + .fields + .into_iter() + .map(|x| { + let dtype = DataType::from_arrow(&x.data_type, true); + (x.name, dtype) + }) + .collect::>(); + + for field in projected_fields { + let Some(dtype) = schema.remove(&field.name) else { + polars_bail!(SchemaMismatch: "did not find column: {}", field.name) + }; + + let expected_dtype = DataType::from_arrow(&field.data_type, true); + + if dtype != expected_dtype { + polars_bail!(SchemaMismatch: "data type mismatch for column {}: found: {}, expected: {}", + &field.name, dtype, expected_dtype + ) + } + } + + Ok(()) +} + +fn get_memory_prefetch_func(verbose: bool) -> fn(&[u8]) -> () { + let memory_prefetch_func = match std::env::var("POLARS_MEMORY_PREFETCH").ok().as_deref() { + None => { + // Sequential advice was observed to provide speedups on Linux. 
+ // ref https://github.com/pola-rs/polars/pull/18152#discussion_r1721701965 + #[cfg(target_os = "linux")] + { + mem_prefetch_funcs::madvise_sequential + } + #[cfg(not(target_os = "linux"))] + { + mem_prefetch_funcs::no_prefetch + } + }, + Some("no_prefetch") => mem_prefetch_funcs::no_prefetch, + Some("prefetch_l2") => mem_prefetch_funcs::prefetch_l2, + Some("madvise_sequential") => { + #[cfg(target_family = "unix")] + { + mem_prefetch_funcs::madvise_sequential + } + #[cfg(not(target_family = "unix"))] + { + panic!("POLARS_MEMORY_PREFETCH=madvise_sequential is not supported by this system"); + } + }, + Some("madvise_willneed") => { + #[cfg(target_family = "unix")] + { + mem_prefetch_funcs::madvise_willneed + } + #[cfg(not(target_family = "unix"))] + { + panic!("POLARS_MEMORY_PREFETCH=madvise_willneed is not supported by this system"); + } + }, + Some("madvise_populate_read") => { + #[cfg(target_os = "linux")] + { + mem_prefetch_funcs::madvise_populate_read + } + #[cfg(not(target_os = "linux"))] + { + panic!( + "POLARS_MEMORY_PREFETCH=madvise_populate_read is not supported by this system" + ); + } + }, + Some(v) => panic!("invalid value for POLARS_MEMORY_PREFETCH: {}", v), + }; + + if verbose { + let func_name = match memory_prefetch_func as usize { + v if v == mem_prefetch_funcs::no_prefetch as usize => "no_prefetch", + v if v == mem_prefetch_funcs::prefetch_l2 as usize => "prefetch_l2", + v if v == mem_prefetch_funcs::madvise_sequential as usize => "madvise_sequential", + v if v == mem_prefetch_funcs::madvise_willneed as usize => "madvise_willneed", + v if v == mem_prefetch_funcs::madvise_populate_read as usize => "madvise_populate_read", + _ => unreachable!(), + }; + + eprintln!("[ParquetSource] Memory prefetch function: {}", func_name); + } + + memory_prefetch_func +} + +mod mem_prefetch_funcs { + pub use polars_utils::mem::{ + madvise_populate_read, madvise_sequential, madvise_willneed, prefetch_l2, + }; + + pub fn no_prefetch(_: &[u8]) {} +} diff --git a/crates/polars-stream/src/nodes/reduce.rs b/crates/polars-stream/src/nodes/reduce.rs index 4dc4d859ba62..3b6c7b2bea62 100644 --- a/crates/polars-stream/src/nodes/reduce.rs +++ b/crates/polars-stream/src/nodes/reduce.rs @@ -2,6 +2,7 @@ use std::sync::Arc; use polars_core::schema::Schema; use polars_expr::reduce::Reduction; +use polars_utils::itertools::Itertools; use super::compute_node_prelude::*; use crate::expression::StreamExpr; @@ -97,7 +98,7 @@ impl ComputeNode for ReduceNode { "reduce" } - fn update_state(&mut self, recv: &mut [PortState], send: &mut [PortState]) { + fn update_state(&mut self, recv: &mut [PortState], send: &mut [PortState]) -> PolarsResult<()> { assert!(recv.len() == 1 && send.len() == 1); // State transitions. @@ -108,7 +109,6 @@ impl ComputeNode for ReduceNode { }, // Input is done, transition to being a source. ReduceState::Sink { reductions, .. } if matches!(recv[0], PortState::Done) => { - // TODO! make `update_state` fallible. 
let columns = reductions .iter_mut() .zip(self.output_schema.iter_fields()) @@ -117,9 +117,8 @@ impl ComputeNode for ReduceNode { scalar.into_series(&field.name).cast(&field.dtype).unwrap() }) }) - .collect::>>() - .unwrap(); - let out = unsafe { DataFrame::new_no_checks(columns) }; + .try_collect_vec()?; + let out = DataFrame::new(columns).unwrap(); self.state = ReduceState::Source(Some(out)); }, @@ -146,6 +145,7 @@ impl ComputeNode for ReduceNode { send[0] = PortState::Done; }, } + Ok(()) } fn spawn<'env, 's>( diff --git a/crates/polars-stream/src/nodes/select.rs b/crates/polars-stream/src/nodes/select.rs index 568351ee4f47..688580e10319 100644 --- a/crates/polars-stream/src/nodes/select.rs +++ b/crates/polars-stream/src/nodes/select.rs @@ -26,9 +26,10 @@ impl ComputeNode for SelectNode { "select" } - fn update_state(&mut self, recv: &mut [PortState], send: &mut [PortState]) { + fn update_state(&mut self, recv: &mut [PortState], send: &mut [PortState]) -> PolarsResult<()> { assert!(recv.len() == 1 && send.len() == 1); recv.swap_with_slice(send); + Ok(()) } fn spawn<'env, 's>( @@ -59,20 +60,7 @@ impl ComputeNode for SelectNode { out._add_columns(selected, &slf.schema)?; out } else { - // Broadcast scalars. - let max_non_unit_length = selected - .iter() - .map(|s| s.len()) - .filter(|l| *l != 1) - .max() - .unwrap_or(1); - for s in &mut selected { - if s.len() != max_non_unit_length { - assert!(s.len() == 1, "got series of incompatible lengths"); - *s = s.new_from_index(0, max_non_unit_length); - } - } - unsafe { DataFrame::new_no_checks(selected) } + DataFrame::new_with_broadcast(selected)? }; let mut morsel = Morsel::new(ret, seq, source_token); diff --git a/crates/polars-stream/src/nodes/simple_projection.rs b/crates/polars-stream/src/nodes/simple_projection.rs index 1a643b642e73..d4e82dde8ad8 100644 --- a/crates/polars-stream/src/nodes/simple_projection.rs +++ b/crates/polars-stream/src/nodes/simple_projection.rs @@ -23,9 +23,10 @@ impl ComputeNode for SimpleProjectionNode { "simple_projection" } - fn update_state(&mut self, recv: &mut [PortState], send: &mut [PortState]) { + fn update_state(&mut self, recv: &mut [PortState], send: &mut [PortState]) -> PolarsResult<()> { assert!(recv.len() == 1 && send.len() == 1); recv.swap_with_slice(send); + Ok(()) } fn spawn<'env, 's>( diff --git a/crates/polars-stream/src/nodes/streaming_slice.rs b/crates/polars-stream/src/nodes/streaming_slice.rs index b46693bac808..950b39331588 100644 --- a/crates/polars-stream/src/nodes/streaming_slice.rs +++ b/crates/polars-stream/src/nodes/streaming_slice.rs @@ -30,13 +30,14 @@ impl ComputeNode for StreamingSliceNode { self.num_pipelines = num_pipelines; } - fn update_state(&mut self, recv: &mut [PortState], send: &mut [PortState]) { + fn update_state(&mut self, recv: &mut [PortState], send: &mut [PortState]) -> PolarsResult<()> { if self.stream_offset >= self.start_offset + self.length || self.length == 0 { recv[0] = PortState::Done; send[0] = PortState::Done; } else { recv.swap_with_slice(send); } + Ok(()) } fn spawn<'env, 's>( diff --git a/crates/polars-stream/src/nodes/zip.rs b/crates/polars-stream/src/nodes/zip.rs index b5b860880a1b..ff1e336a178f 100644 --- a/crates/polars-stream/src/nodes/zip.rs +++ b/crates/polars-stream/src/nodes/zip.rs @@ -4,6 +4,7 @@ use std::sync::Arc; use polars_core::functions::concat_df_horizontal; use polars_core::schema::Schema; use polars_core::series::Series; +use polars_error::polars_ensure; use super::compute_node_prelude::*; use crate::morsel::SourceToken; @@ -138,7 
+139,7 @@ impl ComputeNode for ZipNode { "zip" } - fn update_state(&mut self, recv: &mut [PortState], send: &mut [PortState]) { + fn update_state(&mut self, recv: &mut [PortState], send: &mut [PortState]) -> PolarsResult<()> { assert!(send.len() == 1); assert!(recv.len() == self.input_heads.len()); @@ -167,9 +168,9 @@ impl ComputeNode for ZipNode { } if !self.null_extend { - assert!( + polars_ensure!( !(at_least_one_non_broadcast_done && at_least_one_non_broadcast_nonempty), - "zip received non-equal length inputs" + ShapeMismatch: "zip node received non-equal length inputs" ); } @@ -196,6 +197,7 @@ impl ComputeNode for ZipNode { for r in recv { *r = new_recv_state; } + Ok(()) } fn spawn<'env, 's>( diff --git a/crates/polars-stream/src/physical_plan/fmt.rs b/crates/polars-stream/src/physical_plan/fmt.rs new file mode 100644 index 000000000000..20aa1cf1486f --- /dev/null +++ b/crates/polars-stream/src/physical_plan/fmt.rs @@ -0,0 +1,199 @@ +use std::fmt::Write; + +use polars_plan::plans::expr_ir::ExprIR; +use polars_plan::plans::{AExpr, EscapeLabel, FileScan, PathsDisplay}; +use polars_utils::arena::Arena; +use polars_utils::itertools::Itertools; +use slotmap::{Key, SecondaryMap, SlotMap}; + +use super::{PhysNode, PhysNodeKey, PhysNodeKind}; + +fn escape_graphviz(s: &str) -> String { + s.replace('\\', "\\\\") + .replace('\n', "\\n") + .replace('"', "\\\"") +} + +fn fmt_exprs(exprs: &[ExprIR], expr_arena: &Arena) -> String { + exprs + .iter() + .map(|e| escape_graphviz(&e.display(expr_arena).to_string())) + .collect_vec() + .join("\\n") +} + +#[recursive::recursive] +fn visualize_plan_rec( + node_key: PhysNodeKey, + phys_sm: &SlotMap, + expr_arena: &Arena, + visited: &mut SecondaryMap, + out: &mut Vec, +) { + if visited.contains_key(node_key) { + return; + } + visited.insert(node_key, ()); + + use std::slice::from_ref; + let (label, inputs) = match &phys_sm[node_key].kind { + PhysNodeKind::InMemorySource { df } => ( + format!( + "in-memory-source\\ncols: {}", + df.get_column_names().join(", ") + ), + &[][..], + ), + PhysNodeKind::Select { + input, + selectors, + extend_original, + } => { + let label = if *extend_original { + "with-columns" + } else { + "select" + }; + ( + format!("{label}\\n{}", fmt_exprs(selectors, expr_arena)), + from_ref(input), + ) + }, + PhysNodeKind::Reduce { input, exprs } => ( + format!("reduce\\n{}", fmt_exprs(exprs, expr_arena)), + from_ref(input), + ), + PhysNodeKind::StreamingSlice { + input, + offset, + length, + } => ( + format!("slice\\noffset: {offset}, length: {length}"), + from_ref(input), + ), + PhysNodeKind::Filter { input, predicate } => ( + format!("filter\\n{}", fmt_exprs(from_ref(predicate), expr_arena)), + from_ref(input), + ), + PhysNodeKind::SimpleProjection { input, columns } => ( + format!("select\\ncols: {}", columns.join(", ")), + from_ref(input), + ), + PhysNodeKind::InMemorySink { input } => ("in-memory-sink".to_string(), from_ref(input)), + PhysNodeKind::InMemoryMap { input, map: _ } => { + ("in-memory-map".to_string(), from_ref(input)) + }, + PhysNodeKind::Map { input, map: _ } => ("map".to_string(), from_ref(input)), + PhysNodeKind::Sort { + input, + by_column, + slice: _, + sort_options: _, + } => ( + format!("sort\\n{}", fmt_exprs(by_column, expr_arena)), + from_ref(input), + ), + PhysNodeKind::OrderedUnion { inputs } => ("ordered-union".to_string(), inputs.as_slice()), + PhysNodeKind::Zip { + inputs, + null_extend, + } => { + let label = if *null_extend { + "zip-null-extend" + } else { + "zip" + }; + (label.to_string(), 
inputs.as_slice()) + }, + PhysNodeKind::Multiplexer { input } => ("multiplexer".to_string(), from_ref(input)), + PhysNodeKind::FileScan { + paths, + file_info, + hive_parts, + output_schema: _, + scan_type, + predicate, + file_options, + } => { + let name = match scan_type { + FileScan::Parquet { .. } => "parquet-source", + FileScan::Csv { .. } => "csv-source", + FileScan::Ipc { .. } => "ipc-source", + FileScan::NDJson { .. } => "ndjson-source", + FileScan::Anonymous { .. } => "anonymous-source", + }; + + let mut out = name.to_string(); + let mut f = EscapeLabel(&mut out); + + { + let paths_display = PathsDisplay(paths.as_ref()); + + write!(f, "\npaths: {}", paths_display).unwrap(); + } + + { + let total_columns = + file_info.schema.len() - usize::from(file_options.row_index.is_some()); + let n_columns = file_options + .with_columns + .as_ref() + .map(|columns| columns.len()); + + if let Some(n) = n_columns { + write!(f, "\nprojection: {}/{total_columns}", n).unwrap(); + } else { + write!(f, "\nprojection: */{total_columns}").unwrap(); + } + } + + if let Some(polars_io::RowIndex { name, offset }) = &file_options.row_index { + write!(f, r#"\nrow index: name: "{}", offset: {}"#, name, offset).unwrap(); + } + + if let Some((offset, len)) = file_options.slice { + write!(f, "\nslice: offset: {}, len: {}", offset, len).unwrap(); + } + + if let Some(predicate) = predicate.as_ref() { + write!(f, "\nfilter: {}", predicate.display(expr_arena)).unwrap(); + } + + if let Some(v) = hive_parts + .as_deref() + .map(|x| x[0].get_statistics().column_stats().len()) + { + write!(f, "\nhive: {} columns", v).unwrap(); + } + + (out, &[][..]) + }, + }; + + out.push(format!( + "{} [label=\"{}\"];", + node_key.data().as_ffi(), + label + )); + for input in inputs { + visualize_plan_rec(*input, phys_sm, expr_arena, visited, out); + out.push(format!( + "{} -> {};", + input.data().as_ffi(), + node_key.data().as_ffi() + )); + } +} + +pub fn visualize_plan( + root: PhysNodeKey, + phys_sm: &SlotMap, + expr_arena: &Arena, +) -> String { + let mut visited: SecondaryMap = SecondaryMap::new(); + let mut out = Vec::with_capacity(phys_sm.len() + 2); + out.push("digraph polars {\nrankdir=\"BT\"".to_string()); + visualize_plan_rec(root, phys_sm, expr_arena, &mut visited, &mut out); + out.push("}".to_string()); + out.join("\n") +} diff --git a/crates/polars-stream/src/physical_plan/lower_expr.rs b/crates/polars-stream/src/physical_plan/lower_expr.rs new file mode 100644 index 000000000000..13a1a309e50b --- /dev/null +++ b/crates/polars-stream/src/physical_plan/lower_expr.rs @@ -0,0 +1,751 @@ +use std::sync::atomic::{AtomicU64, Ordering}; +use std::sync::Arc; + +use polars_core::frame::DataFrame; +use polars_core::prelude::{Field, InitHashMaps, PlHashMap, PlHashSet}; +use polars_core::schema::Schema; +use polars_error::PolarsResult; +use polars_expr::planner::get_expr_depth_limit; +use polars_expr::state::ExecutionState; +use polars_expr::{create_physical_expr, ExpressionConversionState}; +use polars_plan::plans::expr_ir::{ExprIR, OutputName}; +use polars_plan::plans::{AExpr, LiteralValue}; +use polars_plan::prelude::*; +use polars_utils::arena::{Arena, Node}; +use polars_utils::itertools::Itertools; +use slotmap::SlotMap; + +use super::{PhysNode, PhysNodeKey, PhysNodeKind}; + +type IRNodeKey = Node; + +fn unique_column_name() -> ColumnName { + static COUNTER: AtomicU64 = AtomicU64::new(0); + let idx = COUNTER.fetch_add(1, Ordering::Relaxed); + format!("__POLARS_STMP_{idx}").into() +} + +struct LowerExprContext<'a> { + expr_arena: 
&'a mut Arena, + phys_sm: &'a mut SlotMap, + is_elementwise_cache: PlHashMap, + is_input_independent_cache: PlHashMap, +} + +#[recursive::recursive] +fn is_elementwise_rec( + expr_key: IRNodeKey, + arena: &Arena, + cache: &mut PlHashMap, +) -> bool { + if let Some(ret) = cache.get(&expr_key) { + return *ret; + } + + let ret = match arena.get(expr_key) { + AExpr::Explode(_) => false, + AExpr::Alias(inner, _) => is_elementwise_rec(*inner, arena, cache), + AExpr::Column(_) => true, + AExpr::Literal(lit) => !matches!(lit, LiteralValue::Series(_) | LiteralValue::Range { .. }), + AExpr::BinaryExpr { left, op: _, right } => { + is_elementwise_rec(*left, arena, cache) && is_elementwise_rec(*right, arena, cache) + }, + AExpr::Cast { + expr, + data_type: _, + options: _, + } => is_elementwise_rec(*expr, arena, cache), + AExpr::Sort { .. } | AExpr::SortBy { .. } | AExpr::Gather { .. } => false, + AExpr::Filter { .. } => false, + AExpr::Agg(_) => false, + AExpr::Ternary { + predicate, + truthy, + falsy, + } => { + is_elementwise_rec(*predicate, arena, cache) + && is_elementwise_rec(*truthy, arena, cache) + && is_elementwise_rec(*falsy, arena, cache) + }, + AExpr::AnonymousFunction { + input: _, + function: _, + output_type: _, + options, + } => options.is_elementwise(), + AExpr::Function { + input, + function, + options, + } => match function { + FunctionExpr::AsStruct => input + .iter() + .all(|expr| is_elementwise_rec(expr.node(), arena, cache)), + _ => options.is_elementwise(), + }, + + AExpr::Window { .. } => false, + AExpr::Slice { .. } => false, + AExpr::Len => false, + }; + + cache.insert(expr_key, ret); + ret +} + +fn is_elementwise(expr_key: IRNodeKey, ctx: &mut LowerExprContext) -> bool { + is_elementwise_rec(expr_key, ctx.expr_arena, &mut ctx.is_elementwise_cache) +} + +#[recursive::recursive] +fn is_input_independent_rec( + expr_key: IRNodeKey, + arena: &Arena, + cache: &mut PlHashMap, +) -> bool { + if let Some(ret) = cache.get(&expr_key) { + return *ret; + } + + let ret = match arena.get(expr_key) { + AExpr::Explode(inner) + | AExpr::Alias(inner, _) + | AExpr::Cast { + expr: inner, + data_type: _, + options: _, + } + | AExpr::Sort { + expr: inner, + options: _, + } => is_input_independent_rec(*inner, arena, cache), + AExpr::Column(_) => false, + AExpr::Literal(_) => true, + AExpr::BinaryExpr { left, op: _, right } => { + is_input_independent_rec(*left, arena, cache) + && is_input_independent_rec(*right, arena, cache) + }, + AExpr::Gather { + expr, + idx, + returns_scalar: _, + } => { + is_input_independent_rec(*expr, arena, cache) + && is_input_independent_rec(*idx, arena, cache) + }, + AExpr::SortBy { + expr, + by, + sort_options: _, + } => { + is_input_independent_rec(*expr, arena, cache) + && by + .iter() + .all(|expr| is_input_independent_rec(*expr, arena, cache)) + }, + AExpr::Filter { input, by } => { + is_input_independent_rec(*input, arena, cache) + && is_input_independent_rec(*by, arena, cache) + }, + AExpr::Agg(agg_expr) => match agg_expr.get_input() { + polars_plan::plans::NodeInputs::Leaf => true, + polars_plan::plans::NodeInputs::Single(expr) => { + is_input_independent_rec(expr, arena, cache) + }, + polars_plan::plans::NodeInputs::Many(exprs) => exprs + .iter() + .all(|expr| is_input_independent_rec(*expr, arena, cache)), + }, + AExpr::Ternary { + predicate, + truthy, + falsy, + } => { + is_input_independent_rec(*predicate, arena, cache) + && is_input_independent_rec(*truthy, arena, cache) + && is_input_independent_rec(*falsy, arena, cache) + }, + AExpr::AnonymousFunction { 
+ input, + function: _, + output_type: _, + options: _, + } + | AExpr::Function { + input, + function: _, + options: _, + } => input + .iter() + .all(|expr| is_input_independent_rec(expr.node(), arena, cache)), + AExpr::Window { + function, + partition_by, + order_by, + options: _, + } => { + is_input_independent_rec(*function, arena, cache) + && partition_by + .iter() + .all(|expr| is_input_independent_rec(*expr, arena, cache)) + && order_by + .iter() + .all(|(expr, _options)| is_input_independent_rec(*expr, arena, cache)) + }, + AExpr::Slice { + input, + offset, + length, + } => { + is_input_independent_rec(*input, arena, cache) + && is_input_independent_rec(*offset, arena, cache) + && is_input_independent_rec(*length, arena, cache) + }, + AExpr::Len => false, + }; + + cache.insert(expr_key, ret); + ret +} + +fn is_input_independent(expr_key: IRNodeKey, ctx: &mut LowerExprContext) -> bool { + is_input_independent_rec( + expr_key, + ctx.expr_arena, + &mut ctx.is_input_independent_cache, + ) +} + +fn build_input_independent_node_with_ctx( + exprs: &[ExprIR], + ctx: &mut LowerExprContext, +) -> PolarsResult { + let expr_depth_limit = get_expr_depth_limit()?; + let mut state = ExpressionConversionState::new(false, expr_depth_limit); + let empty = DataFrame::empty(); + let execution_state = ExecutionState::new(); + let columns = exprs + .iter() + .map(|expr| { + let phys_expr = + create_physical_expr(expr, Context::Default, ctx.expr_arena, None, &mut state)?; + + phys_expr.evaluate(&empty, &execution_state) + }) + .try_collect_vec()?; + + let df = Arc::new(DataFrame::new_with_broadcast(columns)?); + Ok(ctx.phys_sm.insert(PhysNode::new( + Arc::new(df.schema()), + PhysNodeKind::InMemorySource { df }, + ))) +} + +fn simplify_input_nodes( + orig_input: PhysNodeKey, + mut input_nodes: PlHashSet, + ctx: &mut LowerExprContext, +) -> PolarsResult> { + // Flatten nested zips (ensures the original input columns only occur once). + if input_nodes.len() > 1 { + let mut flattened_input_nodes = PlHashSet::with_capacity(input_nodes.len()); + for input_node in input_nodes { + if let PhysNodeKind::Zip { + inputs, + null_extend: false, + } = &ctx.phys_sm[input_node].kind + { + flattened_input_nodes.extend(inputs); + ctx.phys_sm.remove(input_node); + } else { + flattened_input_nodes.insert(input_node); + } + } + input_nodes = flattened_input_nodes; + } + + // Merge reduce nodes that directly operate on the original input. + let mut combined_exprs = vec![]; + input_nodes = input_nodes + .into_iter() + .filter(|input_node| { + if let PhysNodeKind::Reduce { + input: inner, + exprs, + } = &ctx.phys_sm[*input_node].kind + { + if *inner == orig_input { + combined_exprs.extend(exprs.iter().cloned()); + ctx.phys_sm.remove(*input_node); + return false; + } + } + true + }) + .collect(); + if !combined_exprs.is_empty() { + let output_schema = schema_for_select(orig_input, &combined_exprs, ctx)?; + let kind = PhysNodeKind::Reduce { + input: orig_input, + exprs: combined_exprs, + }; + let reduce_node_key = ctx.phys_sm.insert(PhysNode::new(output_schema, kind)); + input_nodes.insert(reduce_node_key); + } + + Ok(input_nodes) +} + +fn build_fallback_node_with_ctx( + input: PhysNodeKey, + exprs: &[ExprIR], + ctx: &mut LowerExprContext, +) -> PolarsResult { + // Pre-select only the columns that are needed for this fallback expression. 
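The pre-selection above is driven by collecting the leaf (column) names referenced by the fallback expressions (via `aexpr_to_leaf_names_iter`). A toy expression tree showing that traversal; `Expr` here is an illustrative stand-in, not the real `AExpr`:

// A toy expression tree standing in for `AExpr`, to show how the leaf
// (column) names used by a fallback expression are collected.
enum Expr {
    Column(String),
    Literal(i64),
    BinaryExpr(Box<Expr>, Box<Expr>),
}

fn leaf_names(expr: &Expr, out: &mut Vec<String>) {
    match expr {
        Expr::Column(name) => out.push(name.clone()),
        Expr::Literal(_) => {},
        Expr::BinaryExpr(l, r) => {
            leaf_names(l, out);
            leaf_names(r, out);
        },
    }
}

fn main() {
    // (a + 1) + b  requires only the columns {a, b}.
    let e = Expr::BinaryExpr(
        Box::new(Expr::BinaryExpr(
            Box::new(Expr::Column("a".into())),
            Box::new(Expr::Literal(1)),
        )),
        Box::new(Expr::Column("b".into())),
    );
    let mut cols = Vec::new();
    leaf_names(&e, &mut cols);
    assert_eq!(cols, vec!["a".to_string(), "b".to_string()]);
}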
+ let input_schema = &ctx.phys_sm[input].output_schema; + let select_names: PlHashSet<_> = exprs + .iter() + .flat_map(|expr| polars_plan::utils::aexpr_to_leaf_names_iter(expr.node(), ctx.expr_arena)) + .collect(); + let input_node = if input_schema + .iter_names() + .any(|name| !select_names.contains(name.as_str())) + { + let select_exprs = select_names + .into_iter() + .map(|name| { + ExprIR::new( + ctx.expr_arena.add(AExpr::Column(name.clone())), + OutputName::ColumnLhs(name), + ) + }) + .collect_vec(); + build_select_node_with_ctx(input, &select_exprs, ctx)? + } else { + input + }; + + let output_schema = schema_for_select(input_node, exprs, ctx)?; + let expr_depth_limit = get_expr_depth_limit()?; + let mut conv_state = ExpressionConversionState::new(false, expr_depth_limit); + let phys_exprs = exprs + .iter() + .map(|expr| { + create_physical_expr( + expr, + Context::Default, + ctx.expr_arena, + None, + &mut conv_state, + ) + }) + .try_collect_vec()?; + let map = move |df| { + let exec_state = ExecutionState::new(); + let columns = phys_exprs + .iter() + .map(|phys_expr| phys_expr.evaluate(&df, &exec_state)) + .try_collect()?; + DataFrame::new_with_broadcast(columns) + }; + let kind = PhysNodeKind::InMemoryMap { + input: input_node, + map: Arc::new(map), + }; + Ok(ctx.phys_sm.insert(PhysNode::new(output_schema, kind))) +} + +// In the recursive lowering we don't bother with named expressions at all, so +// we work directly with Nodes. +#[recursive::recursive] +fn lower_exprs_with_ctx( + input: PhysNodeKey, + exprs: &[Node], + ctx: &mut LowerExprContext, +) -> PolarsResult<(PhysNodeKey, Vec)> { + // We have to catch this case separately, in case all the input independent expressions are elementwise. + // TODO: we shouldn't always do this when recursing, e.g. pl.col.a.sum() + 1 will still hit this in the recursion. + if exprs.iter().all(|e| is_input_independent(*e, ctx)) { + let expr_irs = exprs + .iter() + .map(|e| ExprIR::new(*e, OutputName::Alias(unique_column_name()))) + .collect_vec(); + let node = build_input_independent_node_with_ctx(&expr_irs, ctx)?; + let out_exprs = expr_irs + .iter() + .map(|e| ctx.expr_arena.add(AExpr::Column(e.output_name().into()))) + .collect(); + return Ok((node, out_exprs)); + } + + // Fallback expressions that can directly be applied to the original input. + let mut fallback_subset = Vec::new(); + + // Nodes containing the columns used for executing transformed expressions. + let mut input_nodes = PlHashSet::new(); + + // The final transformed expressions that will be selected from the zipped + // together transformed nodes. + let mut transformed_exprs = Vec::with_capacity(exprs.len()); + + for expr in exprs.iter().copied() { + if is_elementwise(expr, ctx) { + if !is_input_independent(expr, ctx) { + input_nodes.insert(input); + } + transformed_exprs.push(expr); + continue; + } + + match ctx.expr_arena.get(expr).clone() { + AExpr::Explode(inner) => { + // While explode is streamable, it is not elementwise, so we + // have to transform it to a select node. 
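A small illustration of why explode cannot be treated as elementwise: its output length depends on the values themselves, so it has to become its own select node rather than being fused into a per-row expression. The `explode` function below is a simplified stand-in operating on plain vectors:

// Explode maps each list to its elements, so the output length depends on the
// data; an elementwise (per-row) kernel could never change the row count.
fn explode(lists: &[Vec<i64>]) -> Vec<i64> {
    lists.iter().flatten().copied().collect()
}

fn main() {
    let input = vec![vec![1, 2], vec![], vec![3, 4, 5]];
    assert_eq!(explode(&input), vec![1, 2, 3, 4, 5]); // 3 rows in, 5 rows out
}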
+ let (trans_input, trans_exprs) = lower_exprs_with_ctx(input, &[inner], ctx)?; + let exploded_name = unique_column_name(); + let trans_inner = ctx.expr_arena.add(AExpr::Explode(trans_exprs[0])); + let explode_expr = ExprIR::new(trans_inner, OutputName::Alias(exploded_name.clone())); + let output_schema = schema_for_select(trans_input, &[explode_expr.clone()], ctx)?; + let node_kind = PhysNodeKind::Select { + input: trans_input, + selectors: vec![explode_expr.clone()], + extend_original: false, + }; + let node_key = ctx.phys_sm.insert(PhysNode::new(output_schema, node_kind)); + input_nodes.insert(node_key); + transformed_exprs.push(ctx.expr_arena.add(AExpr::Column(exploded_name))); + }, + AExpr::Alias(_, _) => unreachable!("alias found in physical plan"), + AExpr::Column(_) => unreachable!("column should always be streamable"), + AExpr::Literal(_) => { + let out_name = unique_column_name(); + let inner_expr = ExprIR::new(expr, OutputName::Alias(out_name.clone())); + input_nodes.insert(build_input_independent_node_with_ctx(&[inner_expr], ctx)?); + transformed_exprs.push(ctx.expr_arena.add(AExpr::Column(out_name))); + }, + AExpr::BinaryExpr { left, op, right } => { + let (trans_input, trans_exprs) = lower_exprs_with_ctx(input, &[left, right], ctx)?; + let bin_expr = AExpr::BinaryExpr { + left: trans_exprs[0], + op, + right: trans_exprs[1], + }; + input_nodes.insert(trans_input); + transformed_exprs.push(ctx.expr_arena.add(bin_expr)); + }, + AExpr::Ternary { + predicate, + truthy, + falsy, + } => { + let (trans_input, trans_exprs) = + lower_exprs_with_ctx(input, &[predicate, truthy, falsy], ctx)?; + let tern_expr = AExpr::Ternary { + predicate: trans_exprs[0], + truthy: trans_exprs[1], + falsy: trans_exprs[2], + }; + input_nodes.insert(trans_input); + transformed_exprs.push(ctx.expr_arena.add(tern_expr)); + }, + AExpr::Cast { + expr: inner, + data_type, + options, + } => { + let (trans_input, trans_exprs) = lower_exprs_with_ctx(input, &[inner], ctx)?; + input_nodes.insert(trans_input); + transformed_exprs.push(ctx.expr_arena.add(AExpr::Cast { + expr: trans_exprs[0], + data_type, + options, + })); + }, + AExpr::Sort { + expr: inner, + options, + } => { + // As we'll refer to the sorted column twice, ensure the inner + // expr is available as a column by selecting first. + let sorted_name = unique_column_name(); + let inner_expr_ir = ExprIR::new(inner, OutputName::Alias(sorted_name.clone())); + let select_node = build_select_node_with_ctx(input, &[inner_expr_ir.clone()], ctx)?; + let col_expr = ctx.expr_arena.add(AExpr::Column(sorted_name.clone())); + let kind = PhysNodeKind::Sort { + input: select_node, + by_column: vec![ExprIR::new(col_expr, OutputName::Alias(sorted_name))], + slice: None, + sort_options: (&options).into(), + }; + let output_schema = ctx.phys_sm[select_node].output_schema.clone(); + let node_key = ctx.phys_sm.insert(PhysNode::new(output_schema, kind)); + input_nodes.insert(node_key); + transformed_exprs.push(col_expr); + }, + AExpr::SortBy { + expr: inner, + by, + sort_options, + } => { + // Select our inputs (if we don't do this we'll waste time sorting irrelevant columns). 
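The sort-by lowering below materializes the key expressions under temporary names, sorts by them, and then drops the keys again with a follow-up select. A DataFrame-free sketch of that shape, with plain vectors standing in for columns and an illustrative `sort_by_keys` helper:

// Sort one "column" by separate key columns, then drop the keys again,
// mirroring the select -> sort -> select sequence built below.
fn sort_by_keys(values: Vec<&str>, keys: Vec<i64>) -> Vec<&str> {
    let mut rows: Vec<(i64, &str)> = keys.into_iter().zip(values).collect();
    rows.sort_by_key(|(k, _)| *k);
    rows.into_iter().map(|(_, v)| v).collect() // the temporary key column is dropped here
}

fn main() {
    assert_eq!(sort_by_keys(vec!["b", "c", "a"], vec![2, 3, 1]), vec!["a", "b", "c"]);
}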
+ let sorted_name = unique_column_name(); + let by_names = by.iter().map(|_| unique_column_name()).collect_vec(); + let all_inner_expr_irs = [(&sorted_name, inner)] + .into_iter() + .chain(by_names.iter().zip(by.iter().copied())) + .map(|(name, inner)| ExprIR::new(inner, OutputName::Alias(name.clone()))) + .collect_vec(); + let select_node = build_select_node_with_ctx(input, &all_inner_expr_irs, ctx)?; + + // Sort the inputs. + let kind = PhysNodeKind::Sort { + input: select_node, + by_column: by_names + .into_iter() + .map(|name| { + ExprIR::new( + ctx.expr_arena.add(AExpr::Column(name.clone())), + OutputName::Alias(name), + ) + }) + .collect(), + slice: None, + sort_options, + }; + let output_schema = ctx.phys_sm[select_node].output_schema.clone(); + let sort_node_key = ctx.phys_sm.insert(PhysNode::new(output_schema, kind)); + + // Drop the by columns. + let sorted_col_expr = ctx.expr_arena.add(AExpr::Column(sorted_name.clone())); + let sorted_col_ir = + ExprIR::new(sorted_col_expr, OutputName::Alias(sorted_name.clone())); + let post_sort_select_node = + build_select_node_with_ctx(sort_node_key, &[sorted_col_ir], ctx)?; + input_nodes.insert(post_sort_select_node); + transformed_exprs.push(sorted_col_expr); + }, + AExpr::Gather { .. } => todo!(), + AExpr::Filter { input: inner, by } => { + // Select our inputs (if we don't do this we'll waste time filtering irrelevant columns). + let out_name = unique_column_name(); + let by_name = unique_column_name(); + let inner_expr_ir = ExprIR::new(inner, OutputName::Alias(out_name.clone())); + let by_expr_ir = ExprIR::new(by, OutputName::Alias(by_name.clone())); + let select_node = + build_select_node_with_ctx(input, &[inner_expr_ir, by_expr_ir], ctx)?; + + // Add a filter node. + let predicate = ExprIR::new( + ctx.expr_arena.add(AExpr::Column(by_name.clone())), + OutputName::Alias(by_name), + ); + let kind = PhysNodeKind::Filter { + input: select_node, + predicate, + }; + let output_schema = ctx.phys_sm[select_node].output_schema.clone(); + let filter_node_key = ctx.phys_sm.insert(PhysNode::new(output_schema, kind)); + input_nodes.insert(filter_node_key); + transformed_exprs.push(ctx.expr_arena.add(AExpr::Column(out_name))); + }, + AExpr::Agg(mut agg) => match agg { + // Change agg mutably so we can share the codepath for all of these. + IRAggExpr::Min { + input: ref mut inner, + .. + } + | IRAggExpr::Max { + input: ref mut inner, + .. + } + | IRAggExpr::Sum(ref mut inner) + | IRAggExpr::Mean(ref mut inner) => { + let (trans_input, trans_exprs) = lower_exprs_with_ctx(input, &[*inner], ctx)?; + *inner = trans_exprs[0]; + + let out_name = unique_column_name(); + let trans_agg_expr = ctx.expr_arena.add(AExpr::Agg(agg)); + let expr_ir = ExprIR::new(trans_agg_expr, OutputName::Alias(out_name.clone())); + let output_schema = schema_for_select(trans_input, &[expr_ir.clone()], ctx)?; + let kind = PhysNodeKind::Reduce { + input: trans_input, + exprs: vec![expr_ir], + }; + let reduce_node_key = ctx.phys_sm.insert(PhysNode::new(output_schema, kind)); + input_nodes.insert(reduce_node_key); + transformed_exprs.push(ctx.expr_arena.add(AExpr::Column(out_name))); + }, + IRAggExpr::Median(_) + | IRAggExpr::NUnique(_) + | IRAggExpr::First(_) + | IRAggExpr::Last(_) + | IRAggExpr::Implode(_) + | IRAggExpr::Quantile { .. 
}
+                | IRAggExpr::Count(_, _)
+                | IRAggExpr::Std(_, _)
+                | IRAggExpr::Var(_, _)
+                | IRAggExpr::AggGroups(_) => {
+                    let out_name = unique_column_name();
+                    fallback_subset.push(ExprIR::new(expr, OutputName::Alias(out_name.clone())));
+                    transformed_exprs.push(ctx.expr_arena.add(AExpr::Column(out_name)));
+                },
+            },
+            AExpr::AnonymousFunction {
+                ..
+            }
+            | AExpr::Function {
+                ..
+            }
+            | AExpr::Len // TODO: this one makes me really sad, make this streaming ASAP.
+            | AExpr::Slice { .. }
+            | AExpr::Window { .. } => {
+                let out_name = unique_column_name();
+                fallback_subset.push(ExprIR::new(expr, OutputName::Alias(out_name.clone())));
+                transformed_exprs.push(ctx.expr_arena.add(AExpr::Column(out_name)));
+            }
+        }
+    }
+
+    if !fallback_subset.is_empty() {
+        input_nodes.insert(build_fallback_node_with_ctx(input, &fallback_subset, ctx)?);
+    }
+
+    // Simplify the input nodes (also ensures the original input only occurs
+    // once in the zip).
+    input_nodes = simplify_input_nodes(input, input_nodes, ctx)?;
+
+    if input_nodes.len() == 1 {
+        // No need for any multiplexing/zipping, can directly execute.
+        return Ok((input_nodes.into_iter().next().unwrap(), transformed_exprs));
+    }
+
+    let zip_inputs = input_nodes.into_iter().collect_vec();
+    let output_schema = zip_inputs
+        .iter()
+        .flat_map(|node| ctx.phys_sm[*node].output_schema.iter_fields())
+        .collect();
+    let zip_kind = PhysNodeKind::Zip {
+        inputs: zip_inputs,
+        null_extend: false,
+    };
+    let zip_node = ctx
+        .phys_sm
+        .insert(PhysNode::new(Arc::new(output_schema), zip_kind));
+
+    Ok((zip_node, transformed_exprs))
+}
+
+/// Computes the schema that selecting the given expressions on the input node
+/// would result in.
+fn schema_for_select(
+    input: PhysNodeKey,
+    exprs: &[ExprIR],
+    ctx: &mut LowerExprContext,
+) -> PolarsResult<Arc<Schema>> {
+    let input_schema = &ctx.phys_sm[input].output_schema;
+    let output_schema: Schema = exprs
+        .iter()
+        .map(|e| {
+            let name = e.output_name();
+            let dtype = ctx.expr_arena.get(e.node()).to_dtype(
+                input_schema,
+                Context::Default,
+                ctx.expr_arena,
+            )?;
+            PolarsResult::Ok(Field::new(name, dtype))
+        })
+        .try_collect()?;
+    Ok(Arc::new(output_schema))
+}
+
+fn build_select_node_with_ctx(
+    input: PhysNodeKey,
+    exprs: &[ExprIR],
+    ctx: &mut LowerExprContext,
+) -> PolarsResult<PhysNodeKey> {
+    if exprs.iter().all(|e| is_input_independent(e.node(), ctx)) {
+        return build_input_independent_node_with_ctx(exprs, ctx);
+    }
+
+    // Are we only selecting simple columns, with the same name?
+    let all_simple_columns: Option<Vec<String>> = exprs
+        .iter()
+        .map(|e| match ctx.expr_arena.get(e.node()) {
+            AExpr::Column(name) if name.as_ref() == e.output_name() => Some(name.to_string()),
+            _ => None,
+        })
+        .collect();
+
+    if let Some(columns) = all_simple_columns {
+        let input_schema = ctx.phys_sm[input].output_schema.clone();
+        if input_schema.len() == columns.len()
+            && input_schema.iter_names().zip(&columns).all(|(l, r)| l == r)
+        {
+            // Input node already has the correct schema, just pass through.
+            return Ok(input);
+        }
+
+        let output_schema = Arc::new(input_schema.select(&columns)?);
+        let node_kind = PhysNodeKind::SimpleProjection { input, columns };
+        return Ok(ctx.phys_sm.insert(PhysNode::new(output_schema, node_kind)));
+    }
+
+    let node_exprs = exprs.iter().map(|e| e.node()).collect_vec();
+    let (transformed_input, transformed_exprs) = lower_exprs_with_ctx(input, &node_exprs, ctx)?;
+    let trans_expr_irs = exprs
+        .iter()
+        .zip(transformed_exprs)
+        .map(|(e, te)| ExprIR::new(te, OutputName::Alias(e.output_name().into())))
+        .collect_vec();
+    let output_schema = schema_for_select(transformed_input, &trans_expr_irs, ctx)?;
+    let node_kind = PhysNodeKind::Select {
+        input: transformed_input,
+        selectors: trans_expr_irs,
+        extend_original: false,
+    };
+    Ok(ctx.phys_sm.insert(PhysNode::new(output_schema, node_kind)))
+}
+
+/// Lowers an input node plus a set of expressions on that input node to an
+/// equivalent (input node, set of expressions) pair, ensuring that the new set
+/// of expressions can run on the streaming engine.
+///
+/// Ensures that if the input node is transformed it has unique column names.
+pub fn lower_exprs(
+    input: PhysNodeKey,
+    exprs: &[ExprIR],
+    expr_arena: &mut Arena<AExpr>,
+    phys_sm: &mut SlotMap<PhysNodeKey, PhysNode>,
+) -> PolarsResult<(PhysNodeKey, Vec<ExprIR>)> {
+    let mut ctx = LowerExprContext {
+        expr_arena,
+        phys_sm,
+        is_elementwise_cache: PlHashMap::new(),
+        is_input_independent_cache: PlHashMap::new(),
+    };
+    let node_exprs = exprs.iter().map(|e| e.node()).collect_vec();
+    let (transformed_input, transformed_exprs) =
+        lower_exprs_with_ctx(input, &node_exprs, &mut ctx)?;
+    let trans_expr_irs = exprs
+        .iter()
+        .zip(transformed_exprs)
+        .map(|(e, te)| ExprIR::new(te, OutputName::Alias(e.output_name().into())))
+        .collect_vec();
+    Ok((transformed_input, trans_expr_irs))
+}
+
+/// Builds a selection node given an input node and the expressions to select for.
+pub fn build_select_node( + input: PhysNodeKey, + exprs: &[ExprIR], + expr_arena: &mut Arena, + phys_sm: &mut SlotMap, +) -> PolarsResult { + let mut ctx = LowerExprContext { + expr_arena, + phys_sm, + is_elementwise_cache: PlHashMap::new(), + is_input_independent_cache: PlHashMap::new(), + }; + build_select_node_with_ctx(input, exprs, &mut ctx) +} diff --git a/crates/polars-stream/src/physical_plan/lower_ir.rs b/crates/polars-stream/src/physical_plan/lower_ir.rs index 75ae7daeb728..6e1a8bc4e056 100644 --- a/crates/polars-stream/src/physical_plan/lower_ir.rs +++ b/crates/polars-stream/src/physical_plan/lower_ir.rs @@ -1,13 +1,16 @@ use std::sync::Arc; +use polars_core::prelude::{InitHashMaps, PlHashMap, PlIndexMap}; +use polars_core::schema::Schema; use polars_error::PolarsResult; -use polars_expr::reduce::can_convert_into_reduction; -use polars_plan::plans::{AExpr, Context, IR}; +use polars_plan::plans::expr_ir::{ExprIR, OutputName}; +use polars_plan::plans::{AExpr, ColumnName, Context, IR}; use polars_plan::prelude::SinkType; use polars_utils::arena::{Arena, Node}; +use polars_utils::itertools::Itertools; use slotmap::SlotMap; -use super::{PhysNode, PhysNodeKey}; +use super::{PhysNode, PhysNodeKey, PhysNodeKind}; fn is_streamable(node: Node, arena: &Arena) -> bool { polars_plan::plans::is_streamable(node, arena, Context::Default) @@ -17,164 +20,192 @@ fn is_streamable(node: Node, arena: &Arena) -> bool { pub fn lower_ir( node: Node, ir_arena: &mut Arena, - expr_arena: &Arena, + expr_arena: &mut Arena, phys_sm: &mut SlotMap, + schema_cache: &mut PlHashMap>, ) -> PolarsResult { let ir_node = ir_arena.get(node); - match ir_node { + let output_schema = IR::schema_with_cache(node, ir_arena, schema_cache); + let node_kind = match ir_node { IR::SimpleProjection { input, columns } => { - let input_ir_node = ir_arena.get(*input); - let input_schema = input_ir_node.schema(ir_arena).into_owned(); let columns = columns.iter_names().map(|s| s.to_string()).collect(); - let input = lower_ir(*input, ir_arena, expr_arena, phys_sm)?; - Ok(phys_sm.insert(PhysNode::SimpleProjection { - input, + let phys_input = lower_ir(*input, ir_arena, expr_arena, phys_sm, schema_cache)?; + PhysNodeKind::SimpleProjection { + input: phys_input, columns, - input_schema, - })) + } }, - // TODO: split partially streamable selections to avoid fallback as much as possible. - IR::Select { - input, - expr, - schema, - .. - } if expr.iter().all(|e| is_streamable(e.node(), expr_arena)) => { + IR::Select { input, expr, .. } => { let selectors = expr.clone(); - let output_schema = schema.clone(); - let input = lower_ir(*input, ir_arena, expr_arena, phys_sm)?; - Ok(phys_sm.insert(PhysNode::Select { - input, - selectors, - output_schema, - extend_original: false, - })) - }, - // TODO: split reductions and streamable selections. E.g. sum(a) + sum(b) should be split - // into Select(a + b) -> Reduce(sum(a), sum(b) - IR::Select { - input, - expr, - schema: output_schema, - .. 
- } if expr - .iter() - .all(|e| can_convert_into_reduction(e.node(), expr_arena)) => - { - let exprs = expr.clone(); - let input_ir_node = ir_arena.get(*input); - let input_schema = input_ir_node.schema(ir_arena).into_owned(); - let output_schema = output_schema.clone(); - let input_node = lower_ir(*input, ir_arena, expr_arena, phys_sm)?; - Ok(phys_sm.insert(PhysNode::Reduce { - input: input_node, - exprs, - input_schema, - output_schema, - })) + let phys_input = lower_ir(*input, ir_arena, expr_arena, phys_sm, schema_cache)?; + return super::lower_expr::build_select_node( + phys_input, &selectors, expr_arena, phys_sm, + ); }, - // TODO: split partially streamable selections to avoid fallback as much as possible. - IR::HStack { - input, - exprs, - schema, - .. - } if exprs.iter().all(|e| is_streamable(e.node(), expr_arena)) => { + IR::HStack { input, exprs, .. } + if exprs.iter().all(|e| is_streamable(e.node(), expr_arena)) => + { + // FIXME: constant literal columns should be broadcasted with hstack. let selectors = exprs.clone(); - let output_schema = schema.clone(); - let input = lower_ir(*input, ir_arena, expr_arena, phys_sm)?; - Ok(phys_sm.insert(PhysNode::Select { - input, + let phys_input = lower_ir(*input, ir_arena, expr_arena, phys_sm, schema_cache)?; + PhysNodeKind::Select { + input: phys_input, selectors, - output_schema, extend_original: true, - })) + } + }, + + IR::HStack { input, exprs, .. } => { + // We already handled the all-streamable case above, so things get more complicated. + // For simplicity we just do a normal select with all the original columns prepended. + // + // FIXME: constant literal columns should be broadcasted with hstack. + let exprs = exprs.clone(); + let phys_input = lower_ir(*input, ir_arena, expr_arena, phys_sm, schema_cache)?; + let input_schema = &phys_sm[phys_input].output_schema; + let mut selectors = PlIndexMap::with_capacity(input_schema.len() + exprs.len()); + for name in input_schema.iter_names() { + let col_name: Arc = name.as_str().into(); + let col_expr = expr_arena.add(AExpr::Column(col_name.clone())); + selectors.insert( + name.clone(), + ExprIR::new(col_expr, OutputName::ColumnLhs(col_name)), + ); + } + for expr in exprs { + selectors.insert(expr.output_name().into(), expr); + } + let selectors = selectors.into_values().collect_vec(); + return super::lower_expr::build_select_node( + phys_input, &selectors, expr_arena, phys_sm, + ); }, IR::Slice { input, offset, len } => { if *offset >= 0 { let offset = *offset as usize; let length = *len as usize; - let input = lower_ir(*input, ir_arena, expr_arena, phys_sm)?; - Ok(phys_sm.insert(PhysNode::StreamingSlice { - input, + let phys_input = lower_ir(*input, ir_arena, expr_arena, phys_sm, schema_cache)?; + PhysNodeKind::StreamingSlice { + input: phys_input, offset, length, - })) + } } else { todo!() } }, - IR::Filter { input, predicate } if is_streamable(predicate.node(), expr_arena) => { + IR::Filter { input, predicate } => { let predicate = predicate.clone(); - let input = lower_ir(*input, ir_arena, expr_arena, phys_sm)?; - Ok(phys_sm.insert(PhysNode::Filter { input, predicate })) + let phys_input = lower_ir(*input, ir_arena, expr_arena, phys_sm, schema_cache)?; + let cols_and_predicate = output_schema + .iter_names() + .map(|name| { + let name: ColumnName = name.as_str().into(); + ExprIR::new( + expr_arena.add(AExpr::Column(name.clone())), + OutputName::ColumnLhs(name), + ) + }) + .chain([predicate]) + .collect_vec(); + let (trans_input, mut trans_cols_and_predicate) = 
super::lower_expr::lower_exprs( + phys_input, + &cols_and_predicate, + expr_arena, + phys_sm, + )?; + + let filter_schema = phys_sm[trans_input].output_schema.clone(); + let filter = PhysNodeKind::Filter { + input: trans_input, + predicate: trans_cols_and_predicate.last().unwrap().clone(), + }; + + let post_filter = phys_sm.insert(PhysNode::new(filter_schema, filter)); + trans_cols_and_predicate.pop(); // Remove predicate. + return super::lower_expr::build_select_node( + post_filter, + &trans_cols_and_predicate, + expr_arena, + phys_sm, + ); }, IR::DataFrameScan { df, - output_schema, + output_schema: projection, filter, - schema: input_schema, + schema, .. } => { - if let Some(filter) = filter { - if !is_streamable(filter.node(), expr_arena) { - todo!() - } - } - - let mut phys_node = phys_sm.insert(PhysNode::InMemorySource { df: df.clone() }); + let mut schema = schema.clone(); // This is initially the schema of df, but can change with the projection. + let mut node_kind = PhysNodeKind::InMemorySource { df: df.clone() }; - if let Some(schema) = output_schema { - phys_node = phys_sm.insert(PhysNode::SimpleProjection { - input: phys_node, - input_schema: input_schema.clone(), - columns: schema.iter_names().map(|s| s.to_string()).collect(), - }) + // Do we need to apply a projection? + if let Some(projection_schema) = projection { + if projection_schema.len() != schema.len() + || projection_schema + .iter_names() + .zip(schema.iter_names()) + .any(|(l, r)| l != r) + { + let phys_input = phys_sm.insert(PhysNode::new(schema, node_kind)); + node_kind = PhysNodeKind::SimpleProjection { + input: phys_input, + columns: projection_schema + .iter_names() + .map(|s| s.to_string()) + .collect(), + }; + schema = projection_schema.clone(); + } } if let Some(predicate) = filter.clone() { - phys_node = phys_sm.insert(PhysNode::Filter { - input: phys_node, + if !is_streamable(predicate.node(), expr_arena) { + todo!() + } + + let phys_input = phys_sm.insert(PhysNode::new(schema, node_kind)); + node_kind = PhysNodeKind::Filter { + input: phys_input, predicate, - }) + }; } - Ok(phys_node) + node_kind }, IR::Sink { input, payload } => { if *payload == SinkType::Memory { - let schema = ir_node.schema(ir_arena).into_owned(); - let input = lower_ir(*input, ir_arena, expr_arena, phys_sm)?; - return Ok(phys_sm.insert(PhysNode::InMemorySink { input, schema })); + let phys_input = lower_ir(*input, ir_arena, expr_arena, phys_sm, schema_cache)?; + PhysNodeKind::InMemorySink { input: phys_input } + } else { + todo!() } - - todo!() }, IR::MapFunction { input, function } => { - let input_schema = ir_arena.get(*input).schema(ir_arena).into_owned(); let function = function.clone(); - let input = lower_ir(*input, ir_arena, expr_arena, phys_sm)?; + let phys_input = lower_ir(*input, ir_arena, expr_arena, phys_sm, schema_cache)?; - let phys_node = if function.is_streamable() { + if function.is_streamable() { let map = Arc::new(move |df| function.evaluate(df)); - PhysNode::Map { input, map } + PhysNodeKind::Map { + input: phys_input, + map, + } } else { let map = Arc::new(move |df| function.evaluate(df)); - PhysNode::InMemoryMap { - input, - input_schema, + PhysNodeKind::InMemoryMap { + input: phys_input, map, } - }; - - Ok(phys_sm.insert(phys_node)) + } }, IR::Sort { @@ -182,16 +213,11 @@ pub fn lower_ir( by_column, slice, sort_options, - } => { - let input_schema = ir_arena.get(*input).schema(ir_arena).into_owned(); - let phys_node = PhysNode::Sort { - input_schema, - by_column: by_column.clone(), - slice: *slice, - 
sort_options: sort_options.clone(), - input: lower_ir(*input, ir_arena, expr_arena, phys_sm)?, - }; - Ok(phys_sm.insert(phys_node)) + } => PhysNodeKind::Sort { + by_column: by_column.clone(), + slice: *slice, + sort_options: sort_options.clone(), + input: lower_ir(*input, ir_arena, expr_arena, phys_sm, schema_cache)?, }, IR::Union { inputs, options } => { @@ -202,9 +228,9 @@ pub fn lower_ir( let inputs = inputs .clone() // Needed to borrow ir_arena mutably. .into_iter() - .map(|input| lower_ir(input, ir_arena, expr_arena, phys_sm)) + .map(|input| lower_ir(input, ir_arena, expr_arena, phys_sm, schema_cache)) .collect::>()?; - Ok(phys_sm.insert(PhysNode::OrderedUnion { inputs })) + PhysNodeKind::OrderedUnion { inputs } }, IR::HConcat { @@ -212,26 +238,44 @@ pub fn lower_ir( schema: _, options: _, } => { - let input_schemas = inputs - .iter() - .map(|input| { - let input_ir_node = ir_arena.get(*input); - input_ir_node.schema(ir_arena).into_owned() - }) - .collect(); - let inputs = inputs .clone() // Needed to borrow ir_arena mutably. .into_iter() - .map(|input| lower_ir(input, ir_arena, expr_arena, phys_sm)) + .map(|input| lower_ir(input, ir_arena, expr_arena, phys_sm, schema_cache)) .collect::>()?; - Ok(phys_sm.insert(PhysNode::Zip { + PhysNodeKind::Zip { inputs, - input_schemas, null_extend: true, - })) + } + }, + + v @ IR::Scan { .. } => { + let IR::Scan { + paths, + file_info, + hive_parts, + output_schema, + scan_type, + predicate, + file_options, + } = v.clone() + else { + unreachable!(); + }; + + PhysNodeKind::FileScan { + paths, + file_info, + hive_parts, + output_schema, + scan_type, + predicate, + file_options, + } }, _ => todo!(), - } + }; + + Ok(phys_sm.insert(PhysNode::new(output_schema, node_kind))) } diff --git a/crates/polars-stream/src/physical_plan/mod.rs b/crates/polars-stream/src/physical_plan/mod.rs index dbc48a82077e..fd59a9ffc8d2 100644 --- a/crates/polars-stream/src/physical_plan/mod.rs +++ b/crates/polars-stream/src/physical_plan/mod.rs @@ -1,15 +1,23 @@ +use std::path::PathBuf; use std::sync::Arc; use polars_core::frame::DataFrame; -use polars_core::prelude::SortMultipleOptions; -use polars_core::schema::Schema; -use polars_plan::plans::DataFrameUdf; +use polars_core::prelude::{PlHashMap, SortMultipleOptions}; +use polars_core::schema::{Schema, SchemaRef}; +use polars_error::PolarsResult; +use polars_plan::plans::hive::HivePartitions; +use polars_plan::plans::{AExpr, DataFrameUdf, FileInfo, FileScan, IR}; use polars_plan::prelude::expr_ir::ExprIR; +mod fmt; +mod lower_expr; mod lower_ir; mod to_graph; -pub use lower_ir::lower_ir; +pub use fmt::visualize_plan; +use polars_plan::prelude::FileScanOptions; +use polars_utils::arena::{Arena, Node}; +use slotmap::{Key, SecondaryMap, SlotMap}; pub use to_graph::physical_plan_to_graph; slotmap::new_key_type! { @@ -22,7 +30,22 @@ slotmap::new_key_type! { /// A physical plan is created when the `IR` is translated to a directed /// acyclic graph of operations that can run on the streaming engine. 
 #[derive(Clone, Debug)]
-pub enum PhysNode {
+pub struct PhysNode {
+    output_schema: Arc<Schema>,
+    kind: PhysNodeKind,
+}
+
+impl PhysNode {
+    pub fn new(output_schema: Arc<Schema>, kind: PhysNodeKind) -> Self {
+        Self {
+            output_schema,
+            kind,
+        }
+    }
+}
+
+#[derive(Clone, Debug)]
+pub enum PhysNodeKind {
     InMemorySource {
         df: Arc<DataFrame>,
     },
@@ -31,14 +54,11 @@ pub enum PhysNode {
         input: PhysNodeKey,
         selectors: Vec<ExprIR>,
         extend_original: bool,
-        output_schema: Arc<Schema>,
     },
 
     Reduce {
         input: PhysNodeKey,
         exprs: Vec<ExprIR>,
-        input_schema: Arc<Schema>,
-        output_schema: Arc<Schema>,
     },
 
     StreamingSlice {
@@ -54,18 +74,15 @@ pub enum PhysNode {
 
     SimpleProjection {
         input: PhysNodeKey,
-        input_schema: Arc<Schema>,
         columns: Vec<String>,
     },
 
     InMemorySink {
         input: PhysNodeKey,
-        schema: Arc<Schema>,
     },
 
     InMemoryMap {
        input: PhysNodeKey,
-        input_schema: Arc<Schema>,
         map: Arc<dyn DataFrameUdf>,
     },
 
@@ -76,7 +93,6 @@ pub enum PhysNode {
 
     Sort {
         input: PhysNodeKey,
-        input_schema: Arc<Schema>, // TODO: remove when not using fallback impl.
         by_column: Vec<ExprIR>,
         slice: Option<(i64, usize)>,
         sort_options: SortMultipleOptions,
@@ -88,7 +104,6 @@ pub enum PhysNode {
 
     Zip {
         inputs: Vec<PhysNodeKey>,
-        input_schemas: Vec<Arc<Schema>>,
         /// If true shorter inputs are extended with nulls to the longest input,
         /// if false all inputs must be the same length, or have length 1 in
         /// which case they are broadcast.
@@ -99,4 +114,78 @@ pub enum PhysNode {
     Multiplexer {
         input: PhysNodeKey,
     },
+
+    FileScan {
+        paths: Arc<Vec<PathBuf>>,
+        file_info: FileInfo,
+        hive_parts: Option<Arc<Vec<HivePartitions>>>,
+        predicate: Option<ExprIR>,
+        output_schema: Option<SchemaRef>,
+        scan_type: FileScan,
+        file_options: FileScanOptions,
+    },
+}
+
+#[recursive::recursive]
+fn insert_multiplexers(
+    node: PhysNodeKey,
+    phys_sm: &mut SlotMap<PhysNodeKey, PhysNode>,
+    referenced: &mut SecondaryMap<PhysNodeKey, ()>,
+) {
+    let seen_before = referenced.insert(node, ()).is_some();
+    if seen_before && !matches!(phys_sm[node].kind, PhysNodeKind::Multiplexer { .. }) {
+        // This node is referenced at least twice. We first set the input key to
+        // null and then update it to avoid a double-mutable-borrow issue.
+        let input_schema = phys_sm[node].output_schema.clone();
+        let orig_input_node = core::mem::replace(
+            &mut phys_sm[node],
+            PhysNode::new(
+                input_schema,
+                PhysNodeKind::Multiplexer {
+                    input: PhysNodeKey::null(),
+                },
+            ),
+        );
+        let orig_input_key = phys_sm.insert(orig_input_node);
+        phys_sm[node].kind = PhysNodeKind::Multiplexer {
+            input: orig_input_key,
+        };
+    }
+
+    if !seen_before {
+        match &phys_sm[node].kind {
+            PhysNodeKind::InMemorySource { .. } | PhysNodeKind::FileScan { .. } => {},
+            PhysNodeKind::Select { input, .. }
+            | PhysNodeKind::Reduce { input, .. }
+            | PhysNodeKind::StreamingSlice { input, .. }
+            | PhysNodeKind::Filter { input, .. }
+            | PhysNodeKind::SimpleProjection { input, .. }
+            | PhysNodeKind::InMemorySink { input }
+            | PhysNodeKind::InMemoryMap { input, .. }
+            | PhysNodeKind::Map { input, .. }
+            | PhysNodeKind::Sort { input, .. }
+            | PhysNodeKind::Multiplexer { input } => {
+                insert_multiplexers(*input, phys_sm, referenced);
+            },
+
+            PhysNodeKind::OrderedUnion { inputs } | PhysNodeKind::Zip { inputs, .. } => {
+                for input in inputs.clone() {
+                    insert_multiplexers(input, phys_sm, referenced);
+                }
+            },
+        }
+    }
+}
+
+pub fn build_physical_plan(
+    root: Node,
+    ir_arena: &mut Arena<IR>,
+    expr_arena: &mut Arena<AExpr>,
+    phys_sm: &mut SlotMap<PhysNodeKey, PhysNode>,
+    schema_cache: &mut PlHashMap<Node, Arc<Schema>>,
+) -> PolarsResult<PhysNodeKey> {
+    let phys_root = lower_ir::lower_ir(root, ir_arena, expr_arena, phys_sm, schema_cache)?;
+    let mut referenced = SecondaryMap::with_capacity(phys_sm.capacity());
+    insert_multiplexers(phys_root, phys_sm, &mut referenced);
+    Ok(phys_root)
 }
diff --git a/crates/polars-stream/src/physical_plan/to_graph.rs b/crates/polars-stream/src/physical_plan/to_graph.rs
index 2e4efd9410bc..44e32e6fc348 100644
--- a/crates/polars-stream/src/physical_plan/to_graph.rs
+++ b/crates/polars-stream/src/physical_plan/to_graph.rs
@@ -6,14 +6,16 @@ use polars_expr::planner::{create_physical_expr, get_expr_depth_limit, ExpressionConversionState};
 use polars_expr::reduce::into_reduction;
 use polars_expr::state::ExecutionState;
 use polars_mem_engine::create_physical_plan;
+use polars_plan::global::_set_n_rows_for_scan;
 use polars_plan::plans::expr_ir::ExprIR;
 use polars_plan::plans::{AExpr, ArenaExprIter, Context, IR};
 use polars_plan::prelude::FunctionFlags;
 use polars_utils::arena::{Arena, Node};
+use polars_utils::itertools::Itertools;
 use recursive::recursive;
 use slotmap::{SecondaryMap, SlotMap};
 
-use super::{PhysNode, PhysNodeKey};
+use super::{PhysNode, PhysNodeKey, PhysNodeKind};
 use crate::expression::StreamExpr;
 use crate::graph::{Graph, GraphNodeKey};
 use crate::nodes;
@@ -52,6 +54,7 @@ struct GraphConversionContext<'a> {
 }
 
 pub fn physical_plan_to_graph(
+    root: PhysNodeKey,
     phys_sm: &SlotMap<PhysNodeKey, PhysNode>,
     expr_arena: &Arena<AExpr>,
 ) -> PolarsResult<(Graph, SecondaryMap<PhysNodeKey, GraphNodeKey>)> {
@@ -64,9 +67,7 @@ pub fn physical_plan_to_graph(
         expr_conversion_state: ExpressionConversionState::new(false, expr_depth_limit),
     };
 
-    for key in phys_sm.keys() {
-        to_graph_rec(key, &mut ctx)?;
-    }
+    to_graph_rec(root, &mut ctx)?;
 
     Ok((ctx.graph, ctx.phys_to_graph))
 }
@@ -81,8 +82,9 @@ fn to_graph_rec<'a>(
         return Ok(*graph_key);
     }
 
-    use PhysNode::*;
-    let graph_key = match &ctx.phys_sm[phys_node_key] {
+    use PhysNodeKind::*;
+    let node = &ctx.phys_sm[phys_node_key];
+    let graph_key = match &node.kind {
         InMemorySource { df } => ctx.graph.add_node(
             nodes::in_memory_source::InMemorySourceNode::new(df.clone()),
             [],
@@ -112,7 +114,6 @@ fn to_graph_rec<'a>(
         Select {
             selectors,
             input,
-            output_schema,
             extend_original,
         } => {
             let phys_selectors = selectors
@@ -123,27 +124,22 @@ fn to_graph_rec<'a>(
             ctx.graph.add_node(
                 nodes::select::SelectNode::new(
                     phys_selectors,
-                    output_schema.clone(),
+                    node.output_schema.clone(),
                     *extend_original,
                 ),
                 [input_key],
             )
         },
-        Reduce {
-            input,
-            exprs,
-            input_schema,
-            output_schema,
-        } => {
+        Reduce { input, exprs } => {
             let input_key = to_graph_rec(*input, ctx)?;
+            let input_schema = &ctx.phys_sm[*input].output_schema;
+
             let mut reductions = Vec::with_capacity(exprs.len());
             let mut inputs = Vec::with_capacity(reductions.len());
             for e in exprs {
                 let (red, input_node) =
-                    into_reduction(e.node(), ctx.expr_arena, input_schema.as_ref())?
- .expect("invariant"); + into_reduction(e.node(), ctx.expr_arena, input_schema)?.expect("invariant"); reductions.push(red); let input_phys = @@ -153,41 +149,33 @@ fn to_graph_rec<'a>( } ctx.graph.add_node( - nodes::reduce::ReduceNode::new(inputs, reductions, output_schema.clone()), + nodes::reduce::ReduceNode::new(inputs, reductions, node.output_schema.clone()), [input_key], ) }, - SimpleProjection { - input, - columns, - input_schema, - } => { + SimpleProjection { input, columns } => { + let input_schema = ctx.phys_sm[*input].output_schema.clone(); let input_key = to_graph_rec(*input, ctx)?; ctx.graph.add_node( - nodes::simple_projection::SimpleProjectionNode::new( - columns.clone(), - input_schema.clone(), - ), + nodes::simple_projection::SimpleProjectionNode::new(columns.clone(), input_schema), [input_key], ) }, - InMemorySink { input, schema } => { + InMemorySink { input } => { + let input_schema = ctx.phys_sm[*input].output_schema.clone(); let input_key = to_graph_rec(*input, ctx)?; ctx.graph.add_node( - nodes::in_memory_sink::InMemorySinkNode::new(schema.clone()), + nodes::in_memory_sink::InMemorySinkNode::new(input_schema), [input_key], ) }, - InMemoryMap { - input, - input_schema, - map, - } => { + InMemoryMap { input, map } => { + let input_schema = ctx.phys_sm[*input].output_schema.clone(); let input_key = to_graph_rec(*input, ctx)?; ctx.graph.add_node( - nodes::in_memory_map::InMemoryMapNode::new(input_schema.clone(), map.clone()), + nodes::in_memory_map::InMemoryMapNode::new(input_schema, map.clone()), [input_key], ) }, @@ -200,11 +188,11 @@ fn to_graph_rec<'a>( Sort { input, - input_schema, by_column, slice, sort_options, } => { + let input_schema = ctx.phys_sm[*input].output_schema.clone(); let lmdf = Arc::new(LateMaterializedDataFrame::default()); let mut lp_arena = Arena::default(); let df_node = lp_arena.add(lmdf.clone().as_ir_node(input_schema.clone())); @@ -223,7 +211,7 @@ fn to_graph_rec<'a>( let input_key = to_graph_rec(*input, ctx)?; ctx.graph.add_node( nodes::in_memory_map::InMemoryMapNode::new( - input_schema.clone(), + input_schema, Arc::new(move |df| { lmdf.set_materialized_dataframe(df); let mut state = ExecutionState::new(); @@ -245,15 +233,18 @@ fn to_graph_rec<'a>( Zip { inputs, - input_schemas, null_extend, } => { + let input_schemas = inputs + .iter() + .map(|i| ctx.phys_sm[*i].output_schema.clone()) + .collect_vec(); let input_keys = inputs .iter() .map(|i| to_graph_rec(*i, ctx)) - .collect::, _>>()?; + .try_collect_vec()?; ctx.graph.add_node( - nodes::zip::ZipNode::new(*null_extend, input_schemas.clone()), + nodes::zip::ZipNode::new(*null_extend, input_schemas), input_keys, ) }, @@ -263,6 +254,69 @@ fn to_graph_rec<'a>( ctx.graph .add_node(nodes::multiplexer::MultiplexerNode::new(), [input_key]) }, + + v @ FileScan { .. 
} => { + let FileScan { + paths, + file_info, + hive_parts, + output_schema, + scan_type, + predicate, + mut file_options, + } = v.clone() + else { + unreachable!() + }; + + file_options.slice = if let Some((offset, len)) = file_options.slice { + Some((offset, _set_n_rows_for_scan(Some(len)).unwrap())) + } else { + _set_n_rows_for_scan(None).map(|x| (0, x)) + }; + + let predicate = predicate + .map(|pred| { + create_physical_expr( + &pred, + Context::Default, + ctx.expr_arena, + output_schema.as_ref(), + &mut ctx.expr_conversion_state, + ) + }) + .map_or(Ok(None), |v| v.map(Some))?; + + { + use polars_plan::prelude::FileScan; + + match scan_type { + FileScan::Parquet { + options, + cloud_options, + metadata: _, + } => { + if std::env::var("POLARS_DISABLE_PARQUET_SOURCE").as_deref() != Ok("1") { + ctx.graph.add_node( + nodes::parquet_source::ParquetSourceNode::new( + paths, + file_info, + hive_parts, + predicate, + options, + cloud_options, + file_options, + ), + [], + ) + } else { + todo!() + } + }, + _ => todo!(), + } + } + }, }; ctx.phys_to_graph.insert(phys_node_key, graph_key); diff --git a/crates/polars-stream/src/skeleton.rs b/crates/polars-stream/src/skeleton.rs index 64fcdc4d5c5e..435e12d39ef5 100644 --- a/crates/polars-stream/src/skeleton.rs +++ b/crates/polars-stream/src/skeleton.rs @@ -15,13 +15,23 @@ fn is_streamable(node: Node, arena: &Arena) -> bool { pub fn run_query( node: Node, mut ir_arena: Arena, - expr_arena: &Arena, + expr_arena: &mut Arena, ) -> PolarsResult { let mut phys_sm = SlotMap::with_capacity_and_key(ir_arena.len()); - - let root = crate::physical_plan::lower_ir(node, &mut ir_arena, expr_arena, &mut phys_sm)?; + let mut schema_cache = PlHashMap::with_capacity(ir_arena.len()); + let root = crate::physical_plan::build_physical_plan( + node, + &mut ir_arena, + expr_arena, + &mut phys_sm, + &mut schema_cache, + )?; + if let Ok(visual_path) = std::env::var("POLARS_VISUALIZE_PHYSICAL_PLAN") { + let visualization = crate::physical_plan::visualize_plan(root, &phys_sm, expr_arena); + std::fs::write(visual_path, visualization).unwrap(); + } let (mut graph, phys_to_graph) = - crate::physical_plan::physical_plan_to_graph(&phys_sm, expr_arena)?; + crate::physical_plan::physical_plan_to_graph(root, &phys_sm, expr_arena)?; let mut results = crate::execute::execute_graph(&mut graph)?; Ok(results.remove(phys_to_graph[root]).unwrap()) } diff --git a/crates/polars-stream/src/utils/mod.rs b/crates/polars-stream/src/utils/mod.rs index 018b893ea992..f8d0d74ff027 100644 --- a/crates/polars-stream/src/utils/mod.rs +++ b/crates/polars-stream/src/utils/mod.rs @@ -1,3 +1,5 @@ pub mod in_memory_linearize; pub mod late_materialized_df; pub mod linearizer; +pub mod notify_channel; +pub mod task_handles_ext; diff --git a/crates/polars-stream/src/utils/notify_channel.rs b/crates/polars-stream/src/utils/notify_channel.rs new file mode 100644 index 000000000000..5aaef03ddc61 --- /dev/null +++ b/crates/polars-stream/src/utils/notify_channel.rs @@ -0,0 +1,56 @@ +use tokio::sync::mpsc::error::TrySendError; +use tokio::sync::mpsc::{channel, Receiver, Sender}; + +/// Receiver that calls `notify()` before `recv()` +pub struct NotifyReceiver { + receiver: Receiver, + /// We use a channel for notify because it lets the sender know when the receiver has been + /// dropped. 
+    notify: Sender<()>,
+}
+
+impl<T> NotifyReceiver<T> {
+    pub async fn recv(&mut self) -> Option<T> {
+        match self.notify.try_send(()) {
+            Err(TrySendError::Closed(_)) => None,
+            Ok(_) => self.receiver.recv().await,
+            v @ Err(TrySendError::Full(_)) => {
+                v.unwrap();
+                unreachable!();
+            },
+        }
+    }
+}
+
+/// The notify allows us to make the producer only produce values when requested. Otherwise it would
+/// produce a new value as soon as the previous value was consumed (as there would be channel
+/// capacity).
+pub fn notify_channel<T>() -> (Sender<T>, Receiver<()>, NotifyReceiver<T>) {
+    let (tx, rx) = channel::<T>(1);
+    let (notify_tx, notify_rx) = channel(1);
+
+    (
+        tx,
+        notify_rx,
+        NotifyReceiver {
+            receiver: rx,
+            notify: notify_tx,
+        },
+    )
+}
+
+mod tests {
+
+    #[test]
+    fn test_notify_channel() {
+        use futures::FutureExt;
+
+        use super::notify_channel;
+        let (tx, mut notify, mut rx) = notify_channel();
+        assert!(notify.recv().now_or_never().is_none());
+        assert!(rx.recv().now_or_never().is_none());
+        assert_eq!(notify.recv().now_or_never().unwrap(), Some(()));
+        assert!(tx.try_send(()).is_ok());
+        assert!(rx.recv().now_or_never().is_some());
+    }
+}
diff --git a/crates/polars-stream/src/utils/task_handles_ext.rs b/crates/polars-stream/src/utils/task_handles_ext.rs
new file mode 100644
index 000000000000..edeca1558e80
--- /dev/null
+++ b/crates/polars-stream/src/utils/task_handles_ext.rs
@@ -0,0 +1,20 @@
+use std::future::Future;
+use std::pin::Pin;
+use std::task::{Context, Poll};
+
+/// Calls [`tokio::task::JoinHandle::abort`] on the join handle when dropped.
+pub struct AbortOnDropHandle<T>(pub tokio::task::JoinHandle<T>);
+
+impl<T> Future for AbortOnDropHandle<T> {
+    type Output = Result<T, tokio::task::JoinError>;
+
+    fn poll(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Self::Output> {
+        Pin::new(&mut self.0).poll(cx)
+    }
+}
+
+impl<T> Drop for AbortOnDropHandle<T> {
+    fn drop(&mut self) {
+        self.0.abort();
+    }
+}
diff --git a/crates/polars-time/Cargo.toml b/crates/polars-time/Cargo.toml
index 9fa609614c59..a716974f0e76 100644
--- a/crates/polars-time/Cargo.toml
+++ b/crates/polars-time/Cargo.toml
@@ -34,7 +34,7 @@ dtype-datetime = ["polars-core/dtype-datetime", "temporal"]
 dtype-time = ["polars-core/dtype-time", "temporal"]
 dtype-duration = ["polars-core/dtype-duration", "temporal"]
 month_start = []
-month_end = []
+month_end = ["month_start"]
 offset_by = []
 rolling_window = ["polars-core/rolling_window"]
 rolling_window_by = ["polars-core/rolling_window_by", "dtype-duration"]
diff --git a/crates/polars-time/src/round.rs b/crates/polars-time/src/round.rs
index 4bb6f2a3386f..7fd48a407f51 100644
--- a/crates/polars-time/src/round.rs
+++ b/crates/polars-time/src/round.rs
@@ -5,6 +5,12 @@ use polars_core::prelude::*;
 use polars_utils::cache::FastFixedCache;
 
 use crate::prelude::*;
+use crate::truncate::fast_truncate;
+
+#[inline(always)]
+fn fast_round(t: i64, every: i64) -> i64 {
+    fast_truncate(t + every / 2, every)
+}
 
 pub trait PolarsRound {
     fn round(&self, every: &StringChunked, tz: Option<&Tz>) -> PolarsResult<Self>
@@ -35,11 +41,7 @@ impl PolarsRound for DatetimeChunked {
                 TimeUnit::Nanoseconds => every_parsed.duration_ns(),
             };
             return Ok(self
-                .apply_values(|t| {
-                    // Round half-way values away from zero
-                    let half_away = t.signum() * every / 2;
-                    t + half_away - (t + half_away) % every
-                })
+                .apply_values(|t| fast_round(t, every))
                 .into_datetime(self.time_unit(), time_zone.clone()));
         } else {
             let w = Window::new(every_parsed, every_parsed, offset);
diff --git a/crates/polars-time/src/truncate.rs b/crates/polars-time/src/truncate.rs
index 
991ce50b547a..d3c74420252f 100644 --- a/crates/polars-time/src/truncate.rs +++ b/crates/polars-time/src/truncate.rs @@ -12,6 +12,12 @@ pub trait PolarsTruncate { Self: Sized; } +#[inline(always)] +pub(crate) fn fast_truncate(t: i64, every: i64) -> i64 { + let remainder = t % every; + t - (remainder + every * (remainder < 0) as i64) +} + impl PolarsTruncate for DatetimeChunked { fn truncate(&self, tz: Option<&Tz>, every: &StringChunked) -> PolarsResult { let time_zone = self.time_zone(); @@ -35,10 +41,7 @@ impl PolarsTruncate for DatetimeChunked { TimeUnit::Nanoseconds => every_parsed.duration_ns(), }; return Ok(self - .apply_values(|t| { - let remainder = t % every; - t - (remainder + every * (remainder < 0) as i64) - }) + .apply_values(|t| fast_truncate(t, every)) .into_datetime(self.time_unit(), time_zone.clone())); } else { let w = Window::new(every_parsed, every_parsed, offset); diff --git a/crates/polars-time/src/upsample.rs b/crates/polars-time/src/upsample.rs index 692f1a35744c..235ec383fbc8 100644 --- a/crates/polars-time/src/upsample.rs +++ b/crates/polars-time/src/upsample.rs @@ -121,7 +121,6 @@ fn upsample_impl( stable: bool, ) -> PolarsResult { let s = source.column(index_column)?; - s.ensure_sorted_arg("upsample")?; let time_type = s.dtype(); if matches!(time_type, DataType::Date) { let mut df = source.clone(); @@ -184,6 +183,7 @@ fn upsample_single_impl( index_column: &Series, every: Duration, ) -> PolarsResult { + index_column.ensure_sorted_arg("upsample")?; let index_col_name = index_column.name(); use DataType::*; diff --git a/crates/polars-time/src/windows/group_by.rs b/crates/polars-time/src/windows/group_by.rs index 380a92180322..9ba3a2d3dbc2 100644 --- a/crates/polars-time/src/windows/group_by.rs +++ b/crates/polars-time/src/windows/group_by.rs @@ -557,7 +557,9 @@ pub(crate) fn group_by_values_iter_lookahead_collected( } /// Different from `group_by_windows`, where define window buckets and search which values fit that -/// pre-defined bucket, this function defines every window based on the: +/// pre-defined bucket. 
+///
+/// This function defines every window based on the:
 /// - timestamp (lower bound)
 /// - timestamp + period (upper bound)
 /// where timestamps are the individual values in the array `time`
diff --git a/crates/polars-utils/Cargo.toml b/crates/polars-utils/Cargo.toml
index d8b2d0bc9f73..6e2ac16c6e85 100644
--- a/crates/polars-utils/Cargo.toml
+++ b/crates/polars-utils/Cargo.toml
@@ -16,11 +16,13 @@ bytemuck = { workspace = true }
 bytes = { workspace = true }
 hashbrown = { workspace = true }
 indexmap = { workspace = true }
+libc = { workspace = true }
 memmap = { workspace = true, optional = true }
 num-traits = { workspace = true }
 once_cell = { workspace = true }
 raw-cpuid = { workspace = true }
 rayon = { workspace = true }
+serde = { workspace = true, optional = true }
 smartstring = { workspace = true }
 stacker = { workspace = true }
 sysinfo = { version = "0.31", default-features = false, features = ["system"], optional = true }
@@ -35,3 +37,4 @@ version_check = { workspace = true }
 mmap = ["memmap"]
 bigidx = []
 nightly = []
+ir_serde = ["serde"]
diff --git a/crates/polars-utils/src/arena.rs b/crates/polars-utils/src/arena.rs
index 06741ff454fe..d5748725c4d1 100644
--- a/crates/polars-utils/src/arena.rs
+++ b/crates/polars-utils/src/arena.rs
@@ -1,5 +1,8 @@
 use std::sync::atomic::{AtomicU32, Ordering};
 
+#[cfg(feature = "ir_serde")]
+use serde::{Deserialize, Serialize};
+
 use crate::error::*;
 use crate::slice::GetSaferUnchecked;
@@ -21,6 +24,7 @@ fn index_of<T>(slice: &[T], item: &T) -> Option<usize> {
 #[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Ord, PartialOrd)]
 #[repr(transparent)]
+#[cfg_attr(feature = "ir_serde", derive(Serialize, Deserialize))]
 pub struct Node(pub usize);
 
 impl Default for Node {
@@ -32,6 +36,7 @@ impl Default for Node {
 static ARENA_VERSION: AtomicU32 = AtomicU32::new(0);
 
 #[derive(Debug, Clone)]
+#[cfg_attr(feature = "ir_serde", derive(Serialize, Deserialize))]
 pub struct Arena<T> {
     version: u32,
     items: Vec<T>,
diff --git a/crates/polars-utils/src/mem.rs b/crates/polars-utils/src/mem.rs
index d4f4e3d028fd..8462d1a57538 100644
--- a/crates/polars-utils/src/mem.rs
+++ b/crates/polars-utils/src/mem.rs
@@ -1,3 +1,15 @@
+use once_cell::sync::Lazy;
+static PAGE_SIZE: Lazy<usize> = Lazy::new(|| {
+    #[cfg(target_family = "unix")]
+    unsafe {
+        libc::sysconf(libc::_SC_PAGESIZE) as usize
+    }
+    #[cfg(not(target_family = "unix"))]
+    {
+        4096
+    }
+});
+
 /// # Safety
 /// This may break aliasing rules, make sure you are the only owner.
 #[allow(clippy::mut_from_ref)]
@@ -10,7 +22,7 @@ pub unsafe fn to_mutable_slice<T>(s: &[T]) -> &mut [T] {
 /// # Safety
 ///
 /// This should only be called with pointers to valid memory.
-pub unsafe fn prefetch_l2(ptr: *const u8) {
+unsafe fn prefetch_l2_impl(ptr: *const u8) {
     #[cfg(target_arch = "x86_64")]
     {
         use std::arch::x86_64::*;
@@ -23,3 +35,54 @@ pub unsafe fn prefetch_l2(ptr: *const u8) {
         unsafe { _prefetch(ptr as *const _, _PREFETCH_READ, _PREFETCH_LOCALITY2) };
     }
 }
+
+/// Attempt to prefetch the memory in the slice to the L2 cache.
+pub fn prefetch_l2(slice: &[u8]) {
+    if slice.is_empty() {
+        return;
+    }
+
+    // @TODO: We can play a bit more with this prefetching. Maybe introduce a maximum number of
+    // prefetches as to not overwhelm the processor. The linear prefetcher should pick it up
+    // at a certain point.
+
+    for i in (0..slice.len()).step_by(*PAGE_SIZE) {
+        unsafe { prefetch_l2_impl(slice[i..].as_ptr()) };
+    }
+
+    unsafe { prefetch_l2_impl(slice[slice.len() - 1..].as_ptr()) }
+}
+
+/// `madvise()` with `MADV_SEQUENTIAL` on unix systems.
This is a no-op on non-unix systems. +pub fn madvise_sequential(slice: &[u8]) { + #[cfg(target_family = "unix")] + madvise(slice, libc::MADV_SEQUENTIAL); +} + +/// `madvise()` with `MADV_WILLNEED` on unix systems. This is a no-op on non-unix systems. +pub fn madvise_willneed(slice: &[u8]) { + #[cfg(target_family = "unix")] + madvise(slice, libc::MADV_WILLNEED); +} + +/// `madvise()` with `MADV_POPULATE_READ` on linux systems. This a no-op on non-linux systems. +pub fn madvise_populate_read(#[allow(unused)] slice: &[u8]) { + #[cfg(target_os = "linux")] + madvise(slice, libc::MADV_POPULATE_READ); +} + +#[cfg(target_family = "unix")] +fn madvise(slice: &[u8], advice: libc::c_int) { + let ptr = slice.as_ptr(); + + let align = ptr as usize % *PAGE_SIZE; + let ptr = ptr.wrapping_sub(align); + let len = slice.len() + align; + + if unsafe { libc::madvise(ptr as *mut libc::c_void, len, advice) } != 0 { + let err = std::io::Error::last_os_error(); + if let std::io::ErrorKind::InvalidInput = err.kind() { + panic!("{}", err); + } + } +} diff --git a/crates/polars-utils/src/mmap.rs b/crates/polars-utils/src/mmap.rs index 5bd8e2df12a5..d8db6d0ae671 100644 --- a/crates/polars-utils/src/mmap.rs +++ b/crates/polars-utils/src/mmap.rs @@ -93,19 +93,7 @@ mod private { /// Attempt to prefetch the memory belonging to to this [`MemSlice`] #[inline] pub fn prefetch(&self) { - if self.len() == 0 { - return; - } - - // @TODO: We can play a bit more with this prefetching. Maybe introduce a maximum number of - // prefetches as to not overwhelm the processor. The linear prefetcher should pick it up - // at a certain point. - - const PAGE_SIZE: usize = 4096; - for i in 0..self.len() / PAGE_SIZE { - unsafe { prefetch_l2(self[i * PAGE_SIZE..].as_ptr()) }; - } - unsafe { prefetch_l2(self[self.len() - 1..].as_ptr()) } + prefetch_l2(self.as_ref()); } /// # Panics diff --git a/crates/polars/Cargo.toml b/crates/polars/Cargo.toml index dc1e5c952371..196f1832ada4 100644 --- a/crates/polars/Cargo.toml +++ b/crates/polars/Cargo.toml @@ -228,6 +228,7 @@ zip_with = ["polars-core/zip_with"] bigidx = ["polars-core/bigidx", "polars-lazy?/bigidx", "polars-ops/big_idx"] polars_cloud = ["polars-lazy?/polars_cloud"] +ir_serde = ["polars-plan/ir_serde"] test = [ "lazy", diff --git a/crates/polars/src/lib.rs b/crates/polars/src/lib.rs index 00086736c6e5..9910df124fa5 100644 --- a/crates/polars/src/lib.rs +++ b/crates/polars/src/lib.rs @@ -405,6 +405,7 @@ //! `T` in complex lazy expressions. However this does require `unsafe` code allow this. //! * `POLARS_NO_PARQUET_STATISTICS` -> if set, statistics in parquet files are ignored. //! * `POLARS_PANIC_ON_ERR` -> panic instead of returning an Error. +//! * `POLARS_BACKTRACE_IN_ERR` -> include a Rust backtrace in Error messages. //! * `POLARS_NO_CHUNKED_JOIN` -> force rechunk before joins. //! //! 
## User guide diff --git a/crates/polars/tests/it/io/parquet/arrow/mod.rs b/crates/polars/tests/it/io/parquet/arrow/mod.rs index 11bee66dba73..f5e0b2e39e3d 100644 --- a/crates/polars/tests/it/io/parquet/arrow/mod.rs +++ b/crates/polars/tests/it/io/parquet/arrow/mod.rs @@ -17,8 +17,6 @@ use polars_parquet::write::*; use super::read::file::FileReader; -type ArrayStats = (Box, Statistics); - fn new_struct( arrays: Vec>, names: Vec, @@ -32,33 +30,17 @@ fn new_struct( StructArray::new(ArrowDataType::Struct(fields), arrays, validity) } -pub fn read_column(mut reader: R, column: &str) -> PolarsResult { +pub fn read_column(mut reader: R, column: &str) -> PolarsResult> { let metadata = p_read::read_metadata(&mut reader)?; let schema = p_read::infer_schema(&metadata)?; - let row_group = &metadata.row_groups[0]; - - // verify that we can read indexes - if p_read::indexes::has_indexes(row_group) { - let _indexes = p_read::indexes::read_filtered_pages( - &mut reader, - row_group, - &schema.fields, - |_, _| vec![], - )?; - } - let schema = schema.filter(|_, f| f.name == column); - let field = &schema.fields[0]; - - let statistics = deserialize(field, row_group)?; - let mut reader = FileReader::new(reader, metadata.row_groups, schema, None); let array = reader.next().unwrap()?.into_arrays().pop().unwrap(); - Ok((array, statistics)) + Ok(array) } pub fn pyarrow_nested_edge(column: &str) -> Box { @@ -1300,10 +1282,6 @@ fn integration_read(data: &[u8], limit: Option) -> PolarsResult, ) -> PolarsResult<()> { - round_trip_opt_stats(column, file, version, compression, encodings, true) + round_trip_opt_stats(column, file, version, compression, encodings) } fn round_trip_opt_stats( @@ -18,9 +18,8 @@ fn round_trip_opt_stats( version: Version, compression: CompressionOptions, encodings: Vec, - check_stats: bool, ) -> PolarsResult<()> { - let (array, statistics) = match file { + let (array, _statistics) = match file { "nested" => ( pyarrow_nested_nullable(column), pyarrow_nested_nullable_statistics(column), @@ -68,12 +67,9 @@ fn round_trip_opt_stats( std::fs::write("list_struct_list_nullable.parquet", &data).unwrap(); - let (result, stats) = read_column(&mut Cursor::new(data), "a1")?; + let result = read_column(&mut Cursor::new(data), "a1")?; assert_eq!(array.as_ref(), result.as_ref()); - if check_stats { - assert_eq!(statistics, stats); - } Ok(()) } @@ -364,7 +360,6 @@ fn list_nested_inner_required_required_i64() -> PolarsResult<()> { Version::V1, CompressionOptions::Uncompressed, vec![Encoding::Plain], - false, ) } @@ -376,7 +371,6 @@ fn v1_nested_struct_list_nullable() -> PolarsResult<()> { Version::V1, CompressionOptions::Uncompressed, vec![Encoding::Plain], - true, ) } @@ -388,7 +382,6 @@ fn v1_nested_list_struct_list_nullable() -> PolarsResult<()> { Version::V1, CompressionOptions::Uncompressed, vec![Encoding::Plain], - true, ) } diff --git a/crates/polars/tests/it/io/parquet/read/indexes.rs b/crates/polars/tests/it/io/parquet/read/indexes.rs deleted file mode 100644 index e55c8b37a474..000000000000 --- a/crates/polars/tests/it/io/parquet/read/indexes.rs +++ /dev/null @@ -1,143 +0,0 @@ -use polars_parquet::parquet::error::ParquetError; -use polars_parquet::parquet::indexes::{ - BooleanIndex, BoundaryOrder, ByteIndex, Index, NativeIndex, PageIndex, PageLocation, -}; -use polars_parquet::parquet::read::{read_columns_indexes, read_metadata, read_pages_locations}; -use polars_parquet::parquet::schema::types::{ - FieldInfo, PhysicalType, PrimitiveConvertedType, PrimitiveLogicalType, PrimitiveType, -}; -use 
polars_parquet::parquet::schema::Repetition; - -/* -import pyspark.sql # 3.2.1 -spark = pyspark.sql.SparkSession.builder.getOrCreate() -spark.conf.set("parquet.bloom.filter.enabled", True) -spark.conf.set("parquet.bloom.filter.expected.ndv", 10) -spark.conf.set("parquet.bloom.filter.max.bytes", 32) - -data = [(i, f"{i}", False) for i in range(10)] -df = spark.createDataFrame(data, ["id", "string", "bool"]).repartition(1) - -df.write.parquet("bla.parquet", mode = "overwrite") -*/ -const FILE: &[u8] = &[ - 80, 65, 82, 49, 21, 0, 21, 172, 1, 21, 138, 1, 21, 169, 161, 209, 137, 5, 28, 21, 20, 21, 0, - 21, 6, 21, 8, 0, 0, 86, 24, 2, 0, 0, 0, 20, 1, 0, 13, 1, 17, 9, 1, 22, 1, 1, 0, 3, 1, 5, 12, 0, - 0, 0, 4, 1, 5, 12, 0, 0, 0, 5, 1, 5, 12, 0, 0, 0, 6, 1, 5, 12, 0, 0, 0, 7, 1, 5, 72, 0, 0, 0, - 8, 0, 0, 0, 0, 0, 0, 0, 9, 0, 0, 0, 0, 0, 0, 0, 21, 0, 21, 112, 21, 104, 21, 138, 239, 232, - 170, 15, 28, 21, 20, 21, 0, 21, 6, 21, 8, 0, 0, 56, 40, 2, 0, 0, 0, 20, 1, 1, 0, 0, 0, 48, 1, - 5, 0, 49, 1, 5, 0, 50, 1, 5, 0, 51, 1, 5, 0, 52, 1, 5, 0, 53, 1, 5, 60, 54, 1, 0, 0, 0, 55, 1, - 0, 0, 0, 56, 1, 0, 0, 0, 57, 21, 0, 21, 16, 21, 20, 21, 202, 209, 169, 227, 4, 28, 21, 20, 21, - 0, 21, 6, 21, 8, 0, 0, 8, 28, 2, 0, 0, 0, 20, 1, 0, 0, 25, 17, 2, 25, 24, 8, 0, 0, 0, 0, 0, 0, - 0, 0, 25, 24, 8, 9, 0, 0, 0, 0, 0, 0, 0, 21, 2, 25, 22, 0, 0, 25, 17, 2, 25, 24, 1, 48, 25, 24, - 1, 57, 21, 2, 25, 22, 0, 0, 25, 17, 2, 25, 24, 1, 0, 25, 24, 1, 0, 21, 2, 25, 22, 0, 0, 25, 28, - 22, 8, 21, 188, 1, 22, 0, 0, 0, 25, 28, 22, 196, 1, 21, 150, 1, 22, 0, 0, 0, 25, 28, 22, 218, - 2, 21, 66, 22, 0, 0, 0, 21, 64, 28, 28, 0, 0, 28, 28, 0, 0, 28, 28, 0, 0, 0, 24, 130, 24, 8, - 134, 8, 68, 6, 2, 101, 128, 10, 64, 2, 38, 78, 114, 1, 64, 38, 1, 192, 194, 152, 64, 70, 0, 36, - 56, 121, 64, 0, 21, 64, 28, 28, 0, 0, 28, 28, 0, 0, 28, 28, 0, 0, 0, 8, 17, 10, 29, 5, 88, 194, - 0, 35, 208, 25, 16, 70, 68, 48, 38, 17, 16, 140, 68, 98, 56, 0, 131, 4, 193, 40, 129, 161, 160, - 1, 96, 21, 64, 28, 28, 0, 0, 28, 28, 0, 0, 28, 28, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 21, 2, 25, 76, 72, 12, 115, 112, - 97, 114, 107, 95, 115, 99, 104, 101, 109, 97, 21, 6, 0, 21, 4, 37, 2, 24, 2, 105, 100, 0, 21, - 12, 37, 2, 24, 6, 115, 116, 114, 105, 110, 103, 37, 0, 76, 28, 0, 0, 0, 21, 0, 37, 2, 24, 4, - 98, 111, 111, 108, 0, 22, 20, 25, 28, 25, 60, 38, 8, 28, 21, 4, 25, 53, 0, 6, 8, 25, 24, 2, - 105, 100, 21, 2, 22, 20, 22, 222, 1, 22, 188, 1, 38, 8, 60, 24, 8, 9, 0, 0, 0, 0, 0, 0, 0, 24, - 8, 0, 0, 0, 0, 0, 0, 0, 0, 22, 0, 40, 8, 9, 0, 0, 0, 0, 0, 0, 0, 24, 8, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 25, 28, 21, 0, 21, 0, 21, 2, 0, 22, 226, 4, 0, 22, 158, 4, 21, 22, 22, 156, 3, 21, 62, 0, - 38, 196, 1, 28, 21, 12, 25, 53, 0, 6, 8, 25, 24, 6, 115, 116, 114, 105, 110, 103, 21, 2, 22, - 20, 22, 158, 1, 22, 150, 1, 38, 196, 1, 60, 54, 0, 40, 1, 57, 24, 1, 48, 0, 25, 28, 21, 0, 21, - 0, 21, 2, 0, 22, 192, 5, 0, 22, 180, 4, 21, 24, 22, 218, 3, 21, 34, 0, 38, 218, 2, 28, 21, 0, - 25, 53, 0, 6, 8, 25, 24, 4, 98, 111, 111, 108, 21, 2, 22, 20, 22, 62, 22, 66, 38, 218, 2, 60, - 24, 1, 0, 24, 1, 0, 22, 0, 40, 1, 0, 24, 1, 0, 0, 25, 28, 21, 0, 21, 0, 21, 2, 0, 22, 158, 6, - 0, 22, 204, 4, 21, 22, 22, 252, 3, 21, 34, 0, 22, 186, 3, 22, 20, 38, 8, 22, 148, 3, 20, 0, 0, - 25, 44, 24, 24, 111, 114, 103, 46, 97, 112, 97, 99, 104, 101, 46, 115, 112, 97, 114, 107, 46, - 118, 101, 114, 115, 105, 111, 110, 24, 5, 51, 46, 50, 46, 49, 0, 24, 41, 111, 114, 103, 46, 97, - 112, 97, 99, 104, 101, 46, 115, 112, 97, 114, 107, 46, 
115, 113, 108, 46, 112, 97, 114, 113, - 117, 101, 116, 46, 114, 111, 119, 46, 109, 101, 116, 97, 100, 97, 116, 97, 24, 213, 1, 123, 34, - 116, 121, 112, 101, 34, 58, 34, 115, 116, 114, 117, 99, 116, 34, 44, 34, 102, 105, 101, 108, - 100, 115, 34, 58, 91, 123, 34, 110, 97, 109, 101, 34, 58, 34, 105, 100, 34, 44, 34, 116, 121, - 112, 101, 34, 58, 34, 108, 111, 110, 103, 34, 44, 34, 110, 117, 108, 108, 97, 98, 108, 101, 34, - 58, 116, 114, 117, 101, 44, 34, 109, 101, 116, 97, 100, 97, 116, 97, 34, 58, 123, 125, 125, 44, - 123, 34, 110, 97, 109, 101, 34, 58, 34, 115, 116, 114, 105, 110, 103, 34, 44, 34, 116, 121, - 112, 101, 34, 58, 34, 115, 116, 114, 105, 110, 103, 34, 44, 34, 110, 117, 108, 108, 97, 98, - 108, 101, 34, 58, 116, 114, 117, 101, 44, 34, 109, 101, 116, 97, 100, 97, 116, 97, 34, 58, 123, - 125, 125, 44, 123, 34, 110, 97, 109, 101, 34, 58, 34, 98, 111, 111, 108, 34, 44, 34, 116, 121, - 112, 101, 34, 58, 34, 98, 111, 111, 108, 101, 97, 110, 34, 44, 34, 110, 117, 108, 108, 97, 98, - 108, 101, 34, 58, 116, 114, 117, 101, 44, 34, 109, 101, 116, 97, 100, 97, 116, 97, 34, 58, 123, - 125, 125, 93, 125, 0, 24, 74, 112, 97, 114, 113, 117, 101, 116, 45, 109, 114, 32, 118, 101, - 114, 115, 105, 111, 110, 32, 49, 46, 49, 50, 46, 50, 32, 40, 98, 117, 105, 108, 100, 32, 55, - 55, 101, 51, 48, 99, 56, 48, 57, 51, 51, 56, 54, 101, 99, 53, 50, 99, 51, 99, 102, 97, 54, 99, - 51, 52, 98, 55, 101, 102, 51, 51, 50, 49, 51, 50, 50, 99, 57, 52, 41, 25, 60, 28, 0, 0, 28, 0, - 0, 28, 0, 0, 0, 182, 2, 0, 0, 80, 65, 82, 49, -]; - -#[test] -fn test() -> Result<(), ParquetError> { - let mut reader = std::io::Cursor::new(FILE); - - let expected_index = vec![ - Box::new(NativeIndex:: { - primitive_type: PrimitiveType::from_physical("id".to_string(), PhysicalType::Int64), - indexes: vec![PageIndex { - min: Some(0), - max: Some(9), - null_count: Some(0), - }], - boundary_order: BoundaryOrder::Ascending, - }) as Box, - Box::new(ByteIndex { - primitive_type: PrimitiveType { - field_info: FieldInfo { - name: "string".to_string(), - repetition: Repetition::Optional, - id: None, - }, - logical_type: Some(PrimitiveLogicalType::String), - converted_type: Some(PrimitiveConvertedType::Utf8), - physical_type: PhysicalType::ByteArray, - }, - indexes: vec![PageIndex { - min: Some(b"0".to_vec()), - max: Some(b"9".to_vec()), - null_count: Some(0), - }], - boundary_order: BoundaryOrder::Ascending, - }), - Box::new(BooleanIndex { - indexes: vec![PageIndex { - min: Some(false), - max: Some(false), - null_count: Some(0), - }], - boundary_order: BoundaryOrder::Ascending, - }), - ]; - let expected_page_locations = vec![ - vec![PageLocation { - offset: 4, - compressed_page_size: 94, - first_row_index: 0, - }], - vec![PageLocation { - offset: 98, - compressed_page_size: 75, - first_row_index: 0, - }], - vec![PageLocation { - offset: 173, - compressed_page_size: 33, - first_row_index: 0, - }], - ]; - - let metadata = read_metadata(&mut reader)?; - let columns = &metadata.row_groups[0].columns(); - - let indexes = read_columns_indexes(&mut reader, columns)?; - assert_eq!(&indexes, &expected_index); - - let pages = read_pages_locations(&mut reader, columns)?; - assert_eq!(pages, expected_page_locations); - - Ok(()) -} diff --git a/crates/polars/tests/it/io/parquet/read/mod.rs b/crates/polars/tests/it/io/parquet/read/mod.rs index f9e16619556c..73625107685f 100644 --- a/crates/polars/tests/it/io/parquet/read/mod.rs +++ b/crates/polars/tests/it/io/parquet/read/mod.rs @@ -6,7 +6,6 @@ mod boolean; mod dictionary; pub(crate) mod file; mod 
fixed_binary; -mod indexes; mod primitive; mod primitive_nested; pub(crate) mod row_group; @@ -159,6 +158,7 @@ where .map(|dict| dictionary::deserialize(&dict, column.physical_type())) .transpose()?; while let Some(page) = iterator.next().transpose()? { + let page = page.decompress(&mut iterator)?; if !has_filled { struct_::extend_validity(&mut validity, &page)?; } diff --git a/crates/polars/tests/it/io/parquet/read/primitive.rs b/crates/polars/tests/it/io/parquet/read/primitive.rs index d9665f353c53..960c502fb82d 100644 --- a/crates/polars/tests/it/io/parquet/read/primitive.rs +++ b/crates/polars/tests/it/io/parquet/read/primitive.rs @@ -26,7 +26,6 @@ impl<'a, T: NativeType> PageState<'a, T> { page: &'a DataPage, dict: Option<&'a PrimitivePageDict>, ) -> Result { - assert!(page.selected_rows().is_none()); NativePageState::try_new(page, dict).map(Self::Nominal) } } diff --git a/crates/polars/tests/it/io/parquet/write/binary.rs b/crates/polars/tests/it/io/parquet/write/binary.rs index bb9abc62c258..8176a42cbf83 100644 --- a/crates/polars/tests/it/io/parquet/write/binary.rs +++ b/crates/polars/tests/it/io/parquet/write/binary.rs @@ -83,6 +83,6 @@ pub fn array_to_page_v1( DataPageHeader::V1(header), CowBuffer::Owned(buffer), descriptor.clone(), - Some(array.len()), + array.len(), ))) } diff --git a/crates/polars/tests/it/io/parquet/write/indexes.rs b/crates/polars/tests/it/io/parquet/write/indexes.rs deleted file mode 100644 index 3f5f15c92828..000000000000 --- a/crates/polars/tests/it/io/parquet/write/indexes.rs +++ /dev/null @@ -1,100 +0,0 @@ -use std::io::Cursor; - -use polars_parquet::parquet::compression::CompressionOptions; -use polars_parquet::parquet::error::ParquetResult; -use polars_parquet::parquet::indexes::{ - BoundaryOrder, Index, NativeIndex, PageIndex, PageLocation, -}; -use polars_parquet::parquet::metadata::SchemaDescriptor; -use polars_parquet::parquet::read::{read_columns_indexes, read_metadata, read_pages_locations}; -use polars_parquet::parquet::schema::types::{ParquetType, PhysicalType, PrimitiveType}; -use polars_parquet::parquet::write::{ - Compressor, DynIter, DynStreamingIterator, FileWriter, Version, WriteOptions, -}; - -use super::primitive::array_to_page_v1; - -fn write_file() -> ParquetResult> { - let page1 = vec![Some(0), Some(1), None, Some(3), Some(4), Some(5), Some(6)]; - let page2 = vec![Some(10), Some(11)]; - - let options = WriteOptions { - write_statistics: true, - version: Version::V1, - }; - - let schema = SchemaDescriptor::new( - "schema".to_string(), - vec![ParquetType::from_physical( - "col1".to_string(), - PhysicalType::Int32, - )], - ); - - let pages = vec![ - array_to_page_v1::(&page1, &options, &schema.columns()[0].descriptor), - array_to_page_v1::(&page2, &options, &schema.columns()[0].descriptor), - ]; - - let pages = DynStreamingIterator::new(Compressor::new( - DynIter::new(pages.into_iter()), - CompressionOptions::Uncompressed, - vec![], - )); - let columns = std::iter::once(Ok(pages)); - - let writer = Cursor::new(vec![]); - let mut writer = FileWriter::new(writer, schema, options, None); - - writer.write(DynIter::new(columns))?; - writer.end(None)?; - - Ok(writer.into_inner().into_inner()) -} - -#[test] -fn read_indexes_and_locations() -> ParquetResult<()> { - let data = write_file()?; - let mut reader = Cursor::new(data); - - let metadata = read_metadata(&mut reader)?; - - let columns = &metadata.row_groups[0].columns(); - - let expected_page_locations = vec![vec![ - PageLocation { - offset: 4, - compressed_page_size: 63, - 
first_row_index: 0, - }, - PageLocation { - offset: 67, - compressed_page_size: 47, - first_row_index: 7, - }, - ]]; - let expected_index = vec![Box::new(NativeIndex:: { - primitive_type: PrimitiveType::from_physical("col1".to_string(), PhysicalType::Int32), - indexes: vec![ - PageIndex { - min: Some(0), - max: Some(6), - null_count: Some(1), - }, - PageIndex { - min: Some(10), - max: Some(11), - null_count: Some(0), - }, - ], - boundary_order: BoundaryOrder::Unordered, - }) as Box]; - - let indexes = read_columns_indexes(&mut reader, columns)?; - assert_eq!(&indexes, &expected_index); - - let pages = read_pages_locations(&mut reader, columns)?; - assert_eq!(pages, expected_page_locations); - - Ok(()) -} diff --git a/crates/polars/tests/it/io/parquet/write/mod.rs b/crates/polars/tests/it/io/parquet/write/mod.rs index 7f066fe726e4..9d1686ffdf87 100644 --- a/crates/polars/tests/it/io/parquet/write/mod.rs +++ b/crates/polars/tests/it/io/parquet/write/mod.rs @@ -1,5 +1,4 @@ mod binary; -mod indexes; mod primitive; mod sidecar; diff --git a/crates/polars/tests/it/io/parquet/write/primitive.rs b/crates/polars/tests/it/io/parquet/write/primitive.rs index 044925c5bb11..210bf0e6cefb 100644 --- a/crates/polars/tests/it/io/parquet/write/primitive.rs +++ b/crates/polars/tests/it/io/parquet/write/primitive.rs @@ -74,6 +74,6 @@ pub fn array_to_page_v1( DataPageHeader::V1(header), CowBuffer::Owned(buffer), descriptor.clone(), - Some(array.len()), + array.len(), ))) } diff --git a/docs/src/rust/user-guide/expressions/lists.rs b/docs/src/rust/user-guide/expressions/lists.rs index 530ae4d79892..c03824c7e368 100644 --- a/docs/src/rust/user-guide/expressions/lists.rs +++ b/docs/src/rust/user-guide/expressions/lists.rs @@ -141,7 +141,10 @@ fn main() -> Result<(), Box> { ListPrimitiveChunkedBuilder::new("Array_2", 8, 8, DataType::Int32); col2.append_slice(&[1, 7, 3]); col2.append_slice(&[8, 1, 0]); - let array_df = DataFrame::new([col1.finish(), col2.finish()].into())?; + let array_df = DataFrame::new(vec![ + col1.finish().into_series(), + col2.finish().into_series(), + ])?; println!("{}", &array_df); // --8<-- [end:array_df] diff --git a/py-polars/docs/source/reference/dataframe/export.rst b/py-polars/docs/source/reference/dataframe/export.rst index c9446dd2e2d3..8ebb005221eb 100644 --- a/py-polars/docs/source/reference/dataframe/export.rst +++ b/py-polars/docs/source/reference/dataframe/export.rst @@ -8,6 +8,7 @@ Export DataFrame data to other formats: .. autosummary:: :toctree: api/ + DataFrame.__array__ DataFrame.__arrow_c_stream__ DataFrame.__dataframe__ DataFrame.to_arrow diff --git a/py-polars/docs/source/reference/dataframe/modify_select.rst b/py-polars/docs/source/reference/dataframe/modify_select.rst index 11042e70c7bd..4d9e62556533 100644 --- a/py-polars/docs/source/reference/dataframe/modify_select.rst +++ b/py-polars/docs/source/reference/dataframe/modify_select.rst @@ -6,6 +6,7 @@ Manipulation/selection .. 
autosummary:: :toctree: api/ + DataFrame.__getitem__ DataFrame.bottom_k DataFrame.cast DataFrame.clear diff --git a/py-polars/docs/source/reference/expressions/aggregation.rst b/py-polars/docs/source/reference/expressions/aggregation.rst index d57b76618b31..05f4ce1fabfb 100644 --- a/py-polars/docs/source/reference/expressions/aggregation.rst +++ b/py-polars/docs/source/reference/expressions/aggregation.rst @@ -7,6 +7,9 @@ Aggregation :toctree: api/ Expr.agg_groups + Expr.all + Expr.any + Expr.approx_n_unique Expr.arg_max Expr.arg_min Expr.count @@ -18,6 +21,7 @@ Aggregation Expr.mean Expr.median Expr.min + Expr.n_unique Expr.nan_max Expr.nan_min Expr.product diff --git a/py-polars/docs/source/reference/expressions/col.rst b/py-polars/docs/source/reference/expressions/col.rst index 09b5c33e82f7..612e56e4cd63 100644 --- a/py-polars/docs/source/reference/expressions/col.rst +++ b/py-polars/docs/source/reference/expressions/col.rst @@ -2,7 +2,7 @@ polars.col ========== -Create an expression representing column(s) in a dataframe. +Create an expression representing column(s) in a DataFrame. ``col`` is technically not a function, but it can be used like one. diff --git a/py-polars/docs/source/reference/expressions/functions.rst b/py-polars/docs/source/reference/expressions/functions.rst index 4a8ca0425fca..9831d07cb803 100644 --- a/py-polars/docs/source/reference/expressions/functions.rst +++ b/py-polars/docs/source/reference/expressions/functions.rst @@ -97,7 +97,6 @@ These functions are available from the Polars module root and can be used as exp Expr.any Expr.approx_n_unique Expr.count - Expr.exclude Expr.first Expr.head Expr.implode diff --git a/py-polars/docs/source/reference/series/aggregation.rst b/py-polars/docs/source/reference/series/aggregation.rst index 2f6f8776ea34..fe74d9eb4fd0 100644 --- a/py-polars/docs/source/reference/series/aggregation.rst +++ b/py-polars/docs/source/reference/series/aggregation.rst @@ -8,6 +8,7 @@ Aggregation Series.arg_max Series.arg_min + Series.count Series.implode Series.max Series.mean diff --git a/py-polars/docs/source/reference/series/export.rst b/py-polars/docs/source/reference/series/export.rst index 2be5814ba201..06201adc2b1f 100644 --- a/py-polars/docs/source/reference/series/export.rst +++ b/py-polars/docs/source/reference/series/export.rst @@ -8,6 +8,7 @@ Export Series data to other formats: .. autosummary:: :toctree: api/ + Series.__array__ Series.__arrow_c_stream__ Series.to_arrow Series.to_frame diff --git a/py-polars/docs/source/reference/series/index.rst b/py-polars/docs/source/reference/series/index.rst index a8476da64b97..5e054f4a2366 100644 --- a/py-polars/docs/source/reference/series/index.rst +++ b/py-polars/docs/source/reference/series/index.rst @@ -19,6 +19,7 @@ This page gives an overview of all public Series methods. export list modify_select + operators miscellaneous plot string diff --git a/py-polars/docs/source/reference/series/modify_select.rst b/py-polars/docs/source/reference/series/modify_select.rst index d7ad90029349..3b15ec11ecb3 100644 --- a/py-polars/docs/source/reference/series/modify_select.rst +++ b/py-polars/docs/source/reference/series/modify_select.rst @@ -6,6 +6,7 @@ Manipulation/selection .. 
autosummary:: :toctree: api/ + Series.__getitem__ Series.alias Series.append Series.arg_sort diff --git a/py-polars/docs/source/reference/series/operators.rst b/py-polars/docs/source/reference/series/operators.rst new file mode 100644 index 000000000000..e01c1b39e9de --- /dev/null +++ b/py-polars/docs/source/reference/series/operators.rst @@ -0,0 +1,31 @@ +========= +Operators +========= + +Polars supports native Python operators for all common operations; +many of these operators are also available as methods on the :class:`Series` +class. + +Comparison +~~~~~~~~~~ + +.. currentmodule:: polars +.. autosummary:: + :toctree: api/ + + Series.eq + Series.eq_missing + Series.ge + Series.gt + Series.le + Series.lt + Series.ne + Series.ne_missing + +Numeric +~~~~~~~ + +.. autosummary:: + :toctree: api/ + + Series.pow diff --git a/py-polars/polars/_typing.py b/py-polars/polars/_typing.py index 428c13da0e96..9b0cc722de57 100644 --- a/py-polars/polars/_typing.py +++ b/py-polars/polars/_typing.py @@ -70,6 +70,7 @@ def __arrow_c_stream__(self, requested_schema: object | None = None) -> object: Type[List[Any]], Type[Tuple[Any, ...]], Type[bytes], + Type[object], Type["Decimal"], Type[None], ] diff --git a/py-polars/polars/_utils/cloud.py b/py-polars/polars/_utils/cloud.py index 5b427fce4059..62d1dfd3b6ec 100644 --- a/py-polars/polars/_utils/cloud.py +++ b/py-polars/polars/_utils/cloud.py @@ -3,17 +3,13 @@ from typing import TYPE_CHECKING import polars.polars as plr -from polars._utils.various import normalize_filepath if TYPE_CHECKING: - from pathlib import Path - from polars import LazyFrame def prepare_cloud_plan( lf: LazyFrame, - uri: Path | str, **optimizations: bool, ) -> bytes: """ @@ -23,9 +19,6 @@ def prepare_cloud_plan( ---------- lf The LazyFrame to prepare. - uri - Path to which the file should be written. - Must be a URI to an accessible object store location. **optimizations Optimizations to enable or disable in the query optimizer, e.g. `projection_pushdown=False`. @@ -41,6 +34,5 @@ def prepare_cloud_plan( ComputeError If the given LazyFrame cannot be serialized. """ - uri = normalize_filepath(uri) pylf = lf._set_sink_optimizations(**optimizations) - return plr.prepare_cloud_plan(pylf, uri) + return plr.prepare_cloud_plan(pylf) diff --git a/py-polars/polars/_utils/construction/series.py b/py-polars/polars/_utils/construction/series.py index f13b9f5b0ec5..379bdbeb0a30 100644 --- a/py-polars/polars/_utils/construction/series.py +++ b/py-polars/polars/_utils/construction/series.py @@ -179,7 +179,7 @@ def sequence_to_pyseries( python_dtype = type(value) # temporal branch - if python_dtype in py_temporal_types: + if issubclass(python_dtype, tuple(py_temporal_types)): if dtype is None: dtype = parse_into_dtype(python_dtype) # construct from integer elif dtype in py_temporal_types: diff --git a/py-polars/polars/dataframe/frame.py b/py-polars/polars/dataframe/frame.py index 427ac0031d56..53eb82e5342e 100644 --- a/py-polars/polars/dataframe/frame.py +++ b/py-polars/polars/dataframe/frame.py @@ -1218,7 +1218,130 @@ def __getitem__( | tuple[MultiIndexSelector, MultiColSelector] ), ) -> DataFrame | Series | Any: - """Get part of the DataFrame as a new DataFrame, Series, or scalar.""" + """ + Get part of the DataFrame as a new DataFrame, Series, or scalar. + + Parameters + ---------- + key + Rows / columns to select. This is easiest to explain via example. Suppose + we have a DataFrame with columns `'a'`, `'d'`, `'c'`, `'d'`. 
Here is what + various types of `key` would do: + + - `df[0, 'a']` extracts the first element of column `'a'` and returns a + scalar. + - `df[0]` extracts the first row and returns a Dataframe. + - `df['a']` extracts column `'a'` and returns a Series. + - `df[0:2]` extracts the first two rows and returns a Dataframe. + - `df[0:2, 'a']` extracts the first two rows from column `'a'` and returns + a Series. + - `df[0:2, 0]` extracts the first two rows from the first column and returns + a Series. + - `df[[0, 1], [0, 1, 2]]` extracts the first two rows and the first three + columns and returns a Dataframe. + - `df[0: 2, ['a', 'c']]` extracts the first two rows from columns `'a'` and + `'c'` and returns a Dataframe. + - `df[:, 0: 2]` extracts all rows from the first two columns and returns a + Dataframe. + - `df[:, 'a': 'c']` extracts all rows and all columns positioned between + `'a'` and `'c'` *inclusive* and returns a Dataframe. In our example, + that would extract columns `'a'`, `'d'`, and `'c'`. + + Returns + ------- + DataFrame, Series, or scalar, depending on `key`. + + Examples + -------- + >>> df = pl.DataFrame( + ... {"a": [1, 2, 3], "d": [4, 5, 6], "c": [1, 3, 2], "b": [7, 8, 9]} + ... ) + >>> df[0] + shape: (1, 4) + ┌─────┬─────┬─────┬─────┐ + │ a ┆ d ┆ c ┆ b │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═════╡ + │ 1 ┆ 4 ┆ 1 ┆ 7 │ + └─────┴─────┴─────┴─────┘ + >>> df[0, "a"] + 1 + >>> df["a"] + shape: (3,) + Series: 'a' [i64] + [ + 1 + 2 + 3 + ] + >>> df[0:2] + shape: (2, 4) + ┌─────┬─────┬─────┬─────┐ + │ a ┆ d ┆ c ┆ b │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╪═════╡ + │ 1 ┆ 4 ┆ 1 ┆ 7 │ + │ 2 ┆ 5 ┆ 3 ┆ 8 │ + └─────┴─────┴─────┴─────┘ + >>> df[0:2, "a"] + shape: (2,) + Series: 'a' [i64] + [ + 1 + 2 + ] + >>> df[0:2, 0] + shape: (2,) + Series: 'a' [i64] + [ + 1 + 2 + ] + >>> df[[0, 1], [0, 1, 2]] + shape: (2, 3) + ┌─────┬─────┬─────┐ + │ a ┆ d ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 4 ┆ 1 │ + │ 2 ┆ 5 ┆ 3 │ + └─────┴─────┴─────┘ + >>> df[0:2, ["a", "c"]] + shape: (2, 2) + ┌─────┬─────┐ + │ a ┆ c │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 1 │ + │ 2 ┆ 3 │ + └─────┴─────┘ + >>> df[:, 0:2] + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ d │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 5 │ + │ 3 ┆ 6 │ + └─────┴─────┘ + >>> df[:, "a":"c"] + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ a ┆ d ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 4 ┆ 1 │ + │ 2 ┆ 5 ┆ 3 │ + │ 3 ┆ 6 ┆ 2 │ + └─────┴─────┴─────┘ + """ return get_df_item_by_key(self, key) def __setitem__( @@ -6462,7 +6585,7 @@ def join_asof( tolerance: str | int | float | timedelta | None = None, allow_parallel: bool = True, force_parallel: bool = False, - coalesce: bool | None = None, + coalesce: bool = True, ) -> DataFrame: """ Perform an asof join. @@ -6540,9 +6663,8 @@ def join_asof( Force the physical plan to evaluate the computation of both DataFrames up to the join in parallel. coalesce - Coalescing behavior (merging of join columns). + Coalescing behavior (merging of `on` / `left_on` / `right_on` columns): - - None: -> join specific. - True: -> Always coalesce join columns. - False: -> Never coalesce join columns. @@ -6616,6 +6738,20 @@ def join_asof( - date `2016-03-01` from `population` is matched with `2016-01-01` from `gdp`; - date `2018-08-01` from `population` is matched with `2018-01-01` from `gdp`. 
+ You can verify this by passing `coalesce=False`: + + >>> population.join_asof(gdp, on="date", strategy="backward", coalesce=False) + shape: (3, 4) + ┌────────────┬────────────┬────────────┬──────┐ + │ date ┆ population ┆ date_right ┆ gdp │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ date ┆ f64 ┆ date ┆ i64 │ + ╞════════════╪════════════╪════════════╪══════╡ + │ 2016-03-01 ┆ 82.19 ┆ 2016-01-01 ┆ 4164 │ + │ 2018-08-01 ┆ 82.66 ┆ 2018-01-01 ┆ 4566 │ + │ 2019-01-01 ┆ 83.12 ┆ 2019-01-01 ┆ 4696 │ + └────────────┴────────────┴────────────┴──────┘ + If we instead use `strategy='forward'`, then each date from `population` which doesn't have an exact match is matched with the closest later date from `gdp`: diff --git a/py-polars/polars/datatypes/_parse.py b/py-polars/polars/datatypes/_parse.py index 2649bc7905ec..e7ac78cae6dd 100644 --- a/py-polars/polars/datatypes/_parse.py +++ b/py-polars/polars/datatypes/_parse.py @@ -76,10 +76,10 @@ def parse_py_type_into_dtype(input: PythonDataType | type[object]) -> PolarsData return String() elif input is bool: return Boolean() - elif input is date: - return Date() - elif input is datetime: + elif isinstance(input, type) and issubclass(input, datetime): # type: ignore[redundant-expr] return Datetime("us") + elif isinstance(input, type) and issubclass(input, date): # type: ignore[redundant-expr] + return Date() elif input is timedelta: return Duration elif input is time: @@ -97,16 +97,14 @@ def parse_py_type_into_dtype(input: PythonDataType | type[object]) -> PolarsData # this is required as pass through. Don't remove elif input == Unknown: return Unknown - elif hasattr(input, "__origin__") and hasattr(input, "__args__"): return _parse_generic_into_dtype(input) - else: _raise_on_invalid_dtype(input) def _parse_generic_into_dtype(input: Any) -> PolarsDataType: - """Parse a generic type into a Polars data type.""" + """Parse a generic type (from typing annotation) into a Polars data type.""" base_type = input.__origin__ if base_type not in (tuple, list): _raise_on_invalid_dtype(input) @@ -124,19 +122,19 @@ def _parse_generic_into_dtype(input: Any) -> PolarsDataType: PY_TYPE_STR_TO_DTYPE: SchemaDict = { - "int": Int64(), - "float": Float64(), + "Decimal": Decimal, + "NoneType": Null(), "bool": Boolean(), - "str": String(), "bytes": Binary(), "date": Date(), - "time": Time(), "datetime": Datetime("us"), + "float": Float64(), + "int": Int64(), + "list": List, "object": Object(), - "NoneType": Null(), + "str": String(), + "time": Time(), "timedelta": Duration, - "Decimal": Decimal, - "list": List, "tuple": List, } @@ -177,5 +175,7 @@ def _parse_union_type_into_dtype(input: Any) -> PolarsDataType: def _raise_on_invalid_dtype(input: Any) -> NoReturn: """Raise an informative error if the input could not be parsed.""" - msg = f"cannot parse input of type {type(input).__name__!r} into Polars data type: {input!r}" + input_type = input if type(input) is type else f"of type {type(input).__name__!r}" + input_detail = "" if type(input) is type else f" (given: {input!r})" + msg = f"cannot parse input {input_type} into Polars data type{input_detail}" raise TypeError(msg) from None diff --git a/py-polars/polars/datatypes/classes.py b/py-polars/polars/datatypes/classes.py index 08aeb53c5674..b815d7d17608 100644 --- a/py-polars/polars/datatypes/classes.py +++ b/py-polars/polars/datatypes/classes.py @@ -83,6 +83,14 @@ def is_temporal(cls) -> bool: # noqa: D102 def is_nested(cls) -> bool: # noqa: D102 ... 
+ @classmethod + def from_python(cls, py_type: PythonDataType) -> PolarsDataType: # noqa: D102 + ... + + @classmethod + def to_python(self) -> PythonDataType: # noqa: D102 + ... + class DataType(metaclass=DataTypeClass): """Base class for all Polars data types.""" @@ -180,6 +188,49 @@ def is_nested(cls) -> bool: """Check whether the data type is a nested type.""" return issubclass(cls, NestedType) + @classmethod + def from_python(cls, py_type: PythonDataType) -> PolarsDataType: + """ + Return the Polars data type corresponding to a given Python type. + + Notes + ----- + Not every Python type has a corresponding Polars data type; in general + you should declare Polars data types explicitly to exactly specify + the desired type and its properties (such as scale/unit). + + Examples + -------- + >>> pl.DataType.from_python(int) + Int64 + >>> pl.DataType.from_python(float) + Float64 + >>> from datetime import tzinfo + >>> pl.DataType.from_python(tzinfo) # doctest: +SKIP + TypeError: cannot parse input into Polars data type + """ + from polars.datatypes._parse import parse_into_dtype + + return parse_into_dtype(py_type) + + @classinstmethod # type: ignore[arg-type] + def to_python(self) -> PythonDataType: + """ + Return the Python type corresponding to this Polars data type. + + Examples + -------- + >>> pl.Int16().to_python() + + >>> pl.Float32().to_python() + + >>> pl.Array(pl.Date(), 10).to_python() + + """ + from polars.datatypes import dtype_to_py_type + + return dtype_to_py_type(self) + class NumericType(DataType): """Base class for numeric data types.""" diff --git a/py-polars/polars/datatypes/convert.py b/py-polars/polars/datatypes/convert.py index a965422c7530..1b0806b2ea75 100644 --- a/py-polars/polars/datatypes/convert.py +++ b/py-polars/polars/datatypes/convert.py @@ -19,6 +19,7 @@ Datetime, Decimal, Duration, + Enum, Field, Float32, Float64, @@ -134,55 +135,60 @@ class _DataTypeMappings: @functools.lru_cache # noqa: B019 def DTYPE_TO_FFINAME(self) -> dict[PolarsDataType, str]: return { - Int8: "i8", - Int16: "i16", - Int32: "i32", - Int64: "i64", - UInt8: "u8", - UInt16: "u16", - UInt32: "u32", - UInt64: "u64", - Float32: "f32", - Float64: "f64", - Decimal: "decimal", + Binary: "binary", Boolean: "bool", - String: "str", - List: "list", + Categorical: "categorical", Date: "date", Datetime: "datetime", + Decimal: "decimal", Duration: "duration", - Time: "time", + Float32: "f32", + Float64: "f64", + Int16: "i16", + Int32: "i32", + Int64: "i64", + Int8: "i8", + List: "list", Object: "object", - Categorical: "categorical", + String: "str", Struct: "struct", - Binary: "binary", + Time: "time", + UInt16: "u16", + UInt32: "u32", + UInt64: "u64", + UInt8: "u8", } @property @functools.lru_cache # noqa: B019 def DTYPE_TO_PY_TYPE(self) -> dict[PolarsDataType, PythonDataType]: return { - Float64: float, + Array: list, + Binary: bytes, + Boolean: bool, + Date: date, + Datetime: datetime, + Decimal: PyDecimal, + Duration: timedelta, Float32: float, - Int64: int, - Int32: int, + Float64: float, Int16: int, + Int32: int, + Int64: int, Int8: int, + List: list, + Null: None.__class__, + Object: object, String: str, - UInt8: int, + Struct: dict, + Time: time, UInt16: int, UInt32: int, UInt64: int, - Decimal: PyDecimal, - Boolean: bool, - Duration: timedelta, - Datetime: datetime, - Date: date, - Time: time, - Binary: bytes, - List: list, - Array: list, - Null: None.__class__, + UInt8: int, + # the below mappings are appropriate as we restrict cat/enum to strings + Enum: str, + Categorical: str, } 
@property @@ -190,32 +196,32 @@ def DTYPE_TO_PY_TYPE(self) -> dict[PolarsDataType, PythonDataType]: def NUMPY_KIND_AND_ITEMSIZE_TO_DTYPE(self) -> dict[tuple[str, int], PolarsDataType]: return { # (np.dtype().kind, np.dtype().itemsize) + ("M", 8): Datetime, ("b", 1): Boolean, + ("f", 4): Float32, + ("f", 8): Float64, ("i", 1): Int8, ("i", 2): Int16, ("i", 4): Int32, ("i", 8): Int64, + ("m", 8): Duration, ("u", 1): UInt8, ("u", 2): UInt16, ("u", 4): UInt32, ("u", 8): UInt64, - ("f", 4): Float32, - ("f", 8): Float64, - ("m", 8): Duration, - ("M", 8): Datetime, } @property @functools.lru_cache # noqa: B019 def PY_TYPE_TO_ARROW_TYPE(self) -> dict[PythonDataType, pa.lib.DataType]: return { + bool: pa.bool_(), + date: pa.date32(), + datetime: pa.timestamp("us"), float: pa.float64(), int: pa.int64(), str: pa.large_utf8(), - bool: pa.bool_(), - date: pa.date32(), time: pa.time64("us"), - datetime: pa.timestamp("us"), timedelta: pa.duration("us"), None.__class__: pa.null(), } @@ -338,7 +344,7 @@ def maybe_cast(el: Any, dtype: PolarsDataType) -> Any: py_type = dtype_to_py_type(dtype) if not isinstance(el, py_type): try: - el = py_type(el) # type: ignore[call-arg, misc] + el = py_type(el) # type: ignore[call-arg] except Exception: msg = f"cannot convert Python type {type(el).__name__!r} to {dtype!r}" raise TypeError(msg) from None diff --git a/py-polars/polars/expr/binary.py b/py-polars/polars/expr/binary.py index cac394aa457a..7ea6dc4d79ea 100644 --- a/py-polars/polars/expr/binary.py +++ b/py-polars/polars/expr/binary.py @@ -257,15 +257,20 @@ def size(self, unit: SizeUnit = "b") -> Expr: r""" Get the size of binary values in the given unit. + Parameters + ---------- + unit : {'b', 'kb', 'mb', 'gb', 'tb'} + Scale the returned size to the given unit. + Returns ------- Expr - Expression of data type :class:`UInt32`. + Expression of data type :class:`UInt32` or `Float64`. Examples -------- >>> from os import urandom - >>> df = pl.DataFrame({"data": [urandom(n) for n in (512, 256, 2560, 1024)]}) + >>> df = pl.DataFrame({"data": [urandom(n) for n in (512, 256, 1024)]}) >>> df.with_columns( # doctest: +IGNORE_RESULT ... n_bytes=pl.col("data").bin.size(), ... n_kilobytes=pl.col("data").bin.size("kb"), @@ -278,7 +283,6 @@ def size(self, unit: SizeUnit = "b") -> Expr: ╞═════════════════════════════════╪═════════╪═════════════╡ │ b"y?~B\x83\xf4V\x07\xd3\xfb\xb… ┆ 512 ┆ 0.5 │ │ b"\xee$4@f\xc14\x07\x8e\x88\x1… ┆ 256 ┆ 0.25 │ - │ b"~\x17\x9c\xb1\xf4\xdb?\xe9\x… ┆ 2560 ┆ 2.5 │ │ b"\x80\xbd\xb9nEq;2\x99$\xf9\x… ┆ 1024 ┆ 1.0 │ └─────────────────────────────────┴─────────┴─────────────┘ """ diff --git a/py-polars/polars/expr/datetime.py b/py-polars/polars/expr/datetime.py index cdf6ccb6516f..9a03b46b12d3 100644 --- a/py-polars/polars/expr/datetime.py +++ b/py-polars/polars/expr/datetime.py @@ -284,10 +284,12 @@ def round(self, every: str | dt.timedelta | IntoExprColumn) -> Expr: This functionality is considered **unstable**. It may be changed at any point without it being considered a breaking change. - Each date/datetime in the first half of the interval - is mapped to the start of its bucket. - Each date/datetime in the second half of the interval - is mapped to the end of its bucket. + - Each date/datetime in the first half of the interval + is mapped to the start of its bucket. + - Each date/datetime in the second half of the interval + is mapped to the end of its bucket. + - Half-way points are mapped to the start of their bucket. 
+ Ambiguous results are localised using the DST offset of the original timestamp - for example, rounding `'2022-11-06 01:20:00 CST'` by `'1h'` results in `'2022-11-06 01:00:00 CST'`, whereas rounding `'2022-11-06 01:20:00 CDT'` by diff --git a/py-polars/polars/expr/expr.py b/py-polars/polars/expr/expr.py index 9acae4b745c9..54c9ba55e09d 100644 --- a/py-polars/polars/expr/expr.py +++ b/py-polars/polars/expr/expr.py @@ -681,9 +681,9 @@ def alias(self, name: str) -> Expr: See Also -------- - map - prefix - suffix + name.map + name.prefix + name.suffix Examples -------- @@ -4300,14 +4300,14 @@ def map_batches( Dtype of the output Series. If not set, the dtype will be inferred based on the first non-null value that is returned by the function. - is_elementwise - If set to true this can run in the streaming engine, but may yield - incorrect results in group-by. Ensure you know what you are doing! agg_list Aggregate the values of the expression into a list before applying the function. This parameter only works in a group-by context. The function will be invoked only once on a list of groups, rather than once per group. + is_elementwise + If set to true this can run in the streaming engine, but may yield + incorrect results in group-by. Ensure you know what you are doing! returns_scalar If the function returns a scalar, by default it will be wrapped in a list in the output, since the assumption is that the function @@ -4745,7 +4745,7 @@ def flatten(self) -> Expr: """ Flatten a list or string column. - Alias for :func:`polars.expr.list.ExprListNameSpace.explode`. + Alias for :func:`Expr.list.explode`. Examples -------- @@ -4885,7 +4885,7 @@ def head(self, n: int | Expr = 10) -> Expr: Examples -------- >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7]}) - >>> df.head(3) + >>> df.select(pl.col("foo").head(3)) shape: (3, 1) ┌─────┐ │ foo │ @@ -4911,7 +4911,7 @@ def tail(self, n: int | Expr = 10) -> Expr: Examples -------- >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7]}) - >>> df.tail(3) + >>> df.select(pl.col("foo").tail(3)) shape: (3, 1) ┌─────┐ │ foo │ @@ -4942,7 +4942,7 @@ def limit(self, n: int | Expr = 10) -> Expr: Examples -------- >>> df = pl.DataFrame({"foo": [1, 2, 3, 4, 5, 6, 7]}) - >>> df.limit(3) + >>> df.select(pl.col("foo").limit(3)) shape: (3, 1) ┌─────┐ │ foo │ @@ -9213,6 +9213,9 @@ def shuffle(self, seed: int | None = None) -> Expr: """ Shuffle the contents of this expression. + Note this is shuffled independently of any other column or Expression. If you + want each row to stay the same use df.sample(shuffle=True) + Parameters ---------- seed diff --git a/py-polars/polars/expr/list.py b/py-polars/polars/expr/list.py index 390904997697..5655b58c86cb 100644 --- a/py-polars/polars/expr/list.py +++ b/py-polars/polars/expr/list.py @@ -1005,7 +1005,7 @@ def explode(self) -> Expr: See Also -------- - ExprNameSpace.reshape: Reshape this Expr to a flat Series or a Series of Lists. + Expr.reshape: Reshape this Expr to a flat Series or a Series of Lists. Examples -------- diff --git a/py-polars/polars/expr/name.py b/py-polars/polars/expr/name.py index 9c730d2d3206..8b6fe24d8dea 100644 --- a/py-polars/polars/expr/name.py +++ b/py-polars/polars/expr/name.py @@ -286,17 +286,22 @@ def to_uppercase(self) -> Expr: def map_fields(self, function: Callable[[str], str]) -> Expr: """ - Rename fields of a struct by mapping a function over the field name. + Rename fields of a struct by mapping a function over the field name(s). Notes ----- - This only take effects for struct. 
+ This only takes effect for struct columns. Parameters ---------- function Function that maps a field name to a new name. + See Also + -------- + prefix_fields + suffix_fields + Examples -------- >>> df = pl.DataFrame({"x": {"a": 1, "b": 2}}) @@ -307,16 +312,21 @@ def map_fields(self, function: Callable[[str], str]) -> Expr: def prefix_fields(self, prefix: str) -> Expr: """ - Add a prefix to all fields name of a struct. + Add a prefix to all field names of a struct. Notes ----- - This only take effects for struct. + This only takes effect for struct columns. Parameters ---------- prefix - Prefix to add to the filed name + Prefix to add to the field name. + + See Also + -------- + map_fields + suffix_fields Examples -------- @@ -328,16 +338,21 @@ def prefix_fields(self, prefix: str) -> Expr: def suffix_fields(self, suffix: str) -> Expr: """ - Add a suffix to all fields name of a struct. + Add a suffix to all field names of a struct. Notes ----- - This only take effects for struct. + This only takes effect for struct columns. Parameters ---------- suffix - Suffix to add to the filed name + Suffix to add to the field name. + + See Also + -------- + map_fields + prefix_fields Examples -------- diff --git a/py-polars/polars/expr/string.py b/py-polars/polars/expr/string.py index d6232f7ff19b..bf8e739462b2 100644 --- a/py-polars/polars/expr/string.py +++ b/py-polars/polars/expr/string.py @@ -923,7 +923,7 @@ def contains( self, pattern: str | Expr, *, literal: bool = False, strict: bool = True ) -> Expr: """ - Check if string contains a substring that matches a pattern. + Check if the string contains a substring that matches a pattern. Parameters ---------- @@ -1034,7 +1034,7 @@ def find( See Also -------- - contains : Check if string contains a substring that matches a regex. + contains : Check if the string contains a substring that matches a pattern. Examples -------- @@ -1093,7 +1093,7 @@ def ends_with(self, suffix: str | Expr) -> Expr: See Also -------- - contains : Check if string contains a substring that matches a regex. + contains : Check if the string contains a substring that matches a pattern. starts_with : Check if string values start with a substring. Examples @@ -1156,7 +1156,7 @@ def starts_with(self, prefix: str | Expr) -> Expr: See Also -------- - contains : Check if string contains a substring that matches a regex. + contains : Check if the string contains a substring that matches a pattern. ends_with : Check if string values end with a substring. 
Examples diff --git a/py-polars/polars/functions/lit.py b/py-polars/polars/functions/lit.py index 700c65dbee7c..8853963cbeed 100644 --- a/py-polars/polars/functions/lit.py +++ b/py-polars/polars/functions/lit.py @@ -6,12 +6,8 @@ from typing import TYPE_CHECKING, Any import polars._reexport as pl -from polars._utils.convert import ( - time_to_int, - timedelta_to_int, -) from polars._utils.wrap import wrap_expr -from polars.datatypes import Date, Datetime, Duration, Enum, Time +from polars.datatypes import Date, Datetime, Duration, Enum from polars.dependencies import _check_for_numpy from polars.dependencies import numpy as np @@ -114,17 +110,13 @@ def lit( return expr elif isinstance(value, timedelta): - if dtype is not None and (tu := getattr(dtype, "time_unit", "us")) is not None: - time_unit = tu # type: ignore[assignment] - else: - time_unit = "us" - - td_int = timedelta_to_int(value, time_unit) - return lit(td_int).cast(Duration(time_unit)) + expr = wrap_expr(plr.lit(value, allow_object=False)) + if dtype is not None and (tu := getattr(dtype, "time_unit", None)) is not None: + expr = expr.cast(Duration(tu)) + return expr elif isinstance(value, time): - time_int = time_to_int(value) - return lit(time_int).cast(Time) + return wrap_expr(plr.lit(value, allow_object=False)) elif isinstance(value, date): if dtype == Datetime: diff --git a/py-polars/polars/io/database/_executor.py b/py-polars/polars/io/database/_executor.py index ef044d70d139..0c8513ff2f4d 100644 --- a/py-polars/polars/io/database/_executor.py +++ b/py-polars/polars/io/database/_executor.py @@ -384,7 +384,7 @@ def _normalise_cursor(self, conn: Any) -> Cursor: return conn.engine.raw_connection().cursor() elif conn.engine.driver == "duckdb_engine": self.driver_name = "duckdb" - return conn.engine.raw_connection().driver_connection.c + return conn.engine.raw_connection().driver_connection elif self._is_alchemy_engine(conn): # note: if we create it, we can close it self.can_close_cursor = True diff --git a/py-polars/polars/lazyframe/engine_config.py b/py-polars/polars/lazyframe/engine_config.py index 8dd75ebc48b6..ee6c2f8b7941 100644 --- a/py-polars/polars/lazyframe/engine_config.py +++ b/py-polars/polars/lazyframe/engine_config.py @@ -18,7 +18,7 @@ class GPUEngine: - `device`: Select the device to run the query on. - `memory_resource`: Set an RMM memory resource for - device-side allocations. + device-side allocations. """ device: int | None diff --git a/py-polars/polars/lazyframe/frame.py b/py-polars/polars/lazyframe/frame.py index 8ce2a1d2e362..ff4ab963faaf 100644 --- a/py-polars/polars/lazyframe/frame.py +++ b/py-polars/polars/lazyframe/frame.py @@ -2,6 +2,7 @@ import contextlib import os +import warnings from datetime import date, datetime, time, timedelta from functools import lru_cache, partial, reduce from io import BytesIO, StringIO @@ -41,6 +42,7 @@ _in_notebook, _is_generator, extend_bool, + find_stacklevel, is_bool_sequence, is_sequence, issue_warning, @@ -680,7 +682,7 @@ def serialize( The format in which to serialize. Options: - `"binary"`: Serialize to binary format (bytes). This is the default. - - `"json"`: Serialize to JSON format (string). + - `"json"`: Serialize to JSON format (string) (deprecated). 
See Also -------- @@ -716,6 +718,11 @@ def serialize( if format == "binary": serializer = self._ldf.serialize_binary elif format == "json": + msg = "'json' serialization format of LazyFrame is deprecated" + warnings.warn( + msg, + stacklevel=find_stacklevel(), + ) serializer = self._ldf.serialize_json else: msg = f"`format` must be one of {{'binary', 'json'}}, got {format!r}" @@ -3993,7 +4000,7 @@ def join_asof( tolerance: str | int | float | timedelta | None = None, allow_parallel: bool = True, force_parallel: bool = False, - coalesce: bool | None = None, + coalesce: bool = True, ) -> LazyFrame: """ Perform an asof join. @@ -4071,53 +4078,214 @@ def join_asof( Force the physical plan to evaluate the computation of both DataFrames up to the join in parallel. coalesce - Coalescing behavior (merging of join columns). + Coalescing behavior (merging of `on` / `left_on` / `right_on` columns): - - None: -> join specific. - True: -> Always coalesce join columns. - False: -> Never coalesce join columns. Note that joining on any other expressions than `col` will turn off coalescing. - Examples -------- - >>> from datetime import datetime + >>> from datetime import date >>> gdp = pl.LazyFrame( ... { - ... "date": [ - ... datetime(2016, 1, 1), - ... datetime(2017, 1, 1), - ... datetime(2018, 1, 1), - ... datetime(2019, 1, 1), - ... ], # note record date: Jan 1st (sorted!) - ... "gdp": [4164, 4411, 4566, 4696], + ... "date": pl.date_range( + ... date(2016, 1, 1), + ... date(2020, 1, 1), + ... "1y", + ... eager=True, + ... ), + ... "gdp": [4164, 4411, 4566, 4696, 4827], ... } - ... ).set_sorted("date") + ... ) + >>> gdp.collect() + shape: (5, 2) + ┌────────────┬──────┐ + │ date ┆ gdp │ + │ --- ┆ --- │ + │ date ┆ i64 │ + ╞════════════╪══════╡ + │ 2016-01-01 ┆ 4164 │ + │ 2017-01-01 ┆ 4411 │ + │ 2018-01-01 ┆ 4566 │ + │ 2019-01-01 ┆ 4696 │ + │ 2020-01-01 ┆ 4827 │ + └────────────┴──────┘ + >>> population = pl.LazyFrame( ... { - ... "date": [ - ... datetime(2016, 5, 12), - ... datetime(2017, 5, 12), - ... datetime(2018, 5, 12), - ... datetime(2019, 5, 12), - ... ], # note record date: May 12th (sorted!) - ... "population": [82.19, 82.66, 83.12, 83.52], + ... "date": [date(2016, 3, 1), date(2018, 8, 1), date(2019, 1, 1)], + ... "population": [82.19, 82.66, 83.12], ... } - ... ).set_sorted("date") + ... ).sort("date") + >>> population.collect() + shape: (3, 2) + ┌────────────┬────────────┐ + │ date ┆ population │ + │ --- ┆ --- │ + │ date ┆ f64 │ + ╞════════════╪════════════╡ + │ 2016-03-01 ┆ 82.19 │ + │ 2018-08-01 ┆ 82.66 │ + │ 2019-01-01 ┆ 83.12 │ + └────────────┴────────────┘ + + Note how the dates don't quite match. 
If we join them using `join_asof` and + `strategy='backward'`, then each date from `population` which doesn't have an + exact match is matched with the closest earlier date from `gdp`: + >>> population.join_asof(gdp, on="date", strategy="backward").collect() - shape: (4, 3) - ┌─────────────────────┬────────────┬──────┐ - │ date ┆ population ┆ gdp │ - │ --- ┆ --- ┆ --- │ - │ datetime[μs] ┆ f64 ┆ i64 │ - ╞═════════════════════╪════════════╪══════╡ - │ 2016-05-12 00:00:00 ┆ 82.19 ┆ 4164 │ - │ 2017-05-12 00:00:00 ┆ 82.66 ┆ 4411 │ - │ 2018-05-12 00:00:00 ┆ 83.12 ┆ 4566 │ - │ 2019-05-12 00:00:00 ┆ 83.52 ┆ 4696 │ - └─────────────────────┴────────────┴──────┘ + shape: (3, 3) + ┌────────────┬────────────┬──────┐ + │ date ┆ population ┆ gdp │ + │ --- ┆ --- ┆ --- │ + │ date ┆ f64 ┆ i64 │ + ╞════════════╪════════════╪══════╡ + │ 2016-03-01 ┆ 82.19 ┆ 4164 │ + │ 2018-08-01 ┆ 82.66 ┆ 4566 │ + │ 2019-01-01 ┆ 83.12 ┆ 4696 │ + └────────────┴────────────┴──────┘ + + Note how: + + - date `2016-03-01` from `population` is matched with `2016-01-01` from `gdp`; + - date `2018-08-01` from `population` is matched with `2018-01-01` from `gdp`. + + You can verify this by passing `coalesce=False`: + + >>> population.join_asof( + ... gdp, on="date", strategy="backward", coalesce=False + ... ).collect() + shape: (3, 4) + ┌────────────┬────────────┬────────────┬──────┐ + │ date ┆ population ┆ date_right ┆ gdp │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ date ┆ f64 ┆ date ┆ i64 │ + ╞════════════╪════════════╪════════════╪══════╡ + │ 2016-03-01 ┆ 82.19 ┆ 2016-01-01 ┆ 4164 │ + │ 2018-08-01 ┆ 82.66 ┆ 2018-01-01 ┆ 4566 │ + │ 2019-01-01 ┆ 83.12 ┆ 2019-01-01 ┆ 4696 │ + └────────────┴────────────┴────────────┴──────┘ + + If we instead use `strategy='forward'`, then each date from `population` which + doesn't have an exact match is matched with the closest later date from `gdp`: + + >>> population.join_asof(gdp, on="date", strategy="forward").collect() + shape: (3, 3) + ┌────────────┬────────────┬──────┐ + │ date ┆ population ┆ gdp │ + │ --- ┆ --- ┆ --- │ + │ date ┆ f64 ┆ i64 │ + ╞════════════╪════════════╪══════╡ + │ 2016-03-01 ┆ 82.19 ┆ 4411 │ + │ 2018-08-01 ┆ 82.66 ┆ 4696 │ + │ 2019-01-01 ┆ 83.12 ┆ 4696 │ + └────────────┴────────────┴──────┘ + + Note how: + + - date `2016-03-01` from `population` is matched with `2017-01-01` from `gdp`; + - date `2018-08-01` from `population` is matched with `2019-01-01` from `gdp`. + + Finally, `strategy='nearest'` gives us a mix of the two results above, as each + date from `population` which doesn't have an exact match is matched with the + closest date from `gdp`, regardless of whether it's earlier or later: + + >>> population.join_asof(gdp, on="date", strategy="nearest").collect() + shape: (3, 3) + ┌────────────┬────────────┬──────┐ + │ date ┆ population ┆ gdp │ + │ --- ┆ --- ┆ --- │ + │ date ┆ f64 ┆ i64 │ + ╞════════════╪════════════╪══════╡ + │ 2016-03-01 ┆ 82.19 ┆ 4164 │ + │ 2018-08-01 ┆ 82.66 ┆ 4696 │ + │ 2019-01-01 ┆ 83.12 ┆ 4696 │ + └────────────┴────────────┴──────┘ + + Note how: + + - date `2016-03-01` from `population` is matched with `2016-01-01` from `gdp`; + - date `2018-08-01` from `population` is matched with `2019-01-01` from `gdp`. + + They `by` argument allows joining on another column first, before the asof join. + In this example we join by `country` first, then asof join by date, as above. + + >>> gdp_dates = pl.date_range( # fmt: skip + ... date(2016, 1, 1), date(2020, 1, 1), "1y", eager=True + ... ) + >>> gdp2 = pl.LazyFrame( + ... { + ... 
"country": ["Germany"] * 5 + ["Netherlands"] * 5, + ... "date": pl.concat([gdp_dates, gdp_dates]), + ... "gdp": [4164, 4411, 4566, 4696, 4827, 784, 833, 914, 910, 909], + ... } + ... ).sort("country", "date") + >>> + >>> gdp2.collect() + shape: (10, 3) + ┌─────────────┬────────────┬──────┐ + │ country ┆ date ┆ gdp │ + │ --- ┆ --- ┆ --- │ + │ str ┆ date ┆ i64 │ + ╞═════════════╪════════════╪══════╡ + │ Germany ┆ 2016-01-01 ┆ 4164 │ + │ Germany ┆ 2017-01-01 ┆ 4411 │ + │ Germany ┆ 2018-01-01 ┆ 4566 │ + │ Germany ┆ 2019-01-01 ┆ 4696 │ + │ Germany ┆ 2020-01-01 ┆ 4827 │ + │ Netherlands ┆ 2016-01-01 ┆ 784 │ + │ Netherlands ┆ 2017-01-01 ┆ 833 │ + │ Netherlands ┆ 2018-01-01 ┆ 914 │ + │ Netherlands ┆ 2019-01-01 ┆ 910 │ + │ Netherlands ┆ 2020-01-01 ┆ 909 │ + └─────────────┴────────────┴──────┘ + >>> pop2 = pl.LazyFrame( + ... { + ... "country": ["Germany"] * 3 + ["Netherlands"] * 3, + ... "date": [ + ... date(2016, 3, 1), + ... date(2018, 8, 1), + ... date(2019, 1, 1), + ... date(2016, 3, 1), + ... date(2018, 8, 1), + ... date(2019, 1, 1), + ... ], + ... "population": [82.19, 82.66, 83.12, 17.11, 17.32, 17.40], + ... } + ... ).sort("country", "date") + >>> + >>> pop2.collect() + shape: (6, 3) + ┌─────────────┬────────────┬────────────┐ + │ country ┆ date ┆ population │ + │ --- ┆ --- ┆ --- │ + │ str ┆ date ┆ f64 │ + ╞═════════════╪════════════╪════════════╡ + │ Germany ┆ 2016-03-01 ┆ 82.19 │ + │ Germany ┆ 2018-08-01 ┆ 82.66 │ + │ Germany ┆ 2019-01-01 ┆ 83.12 │ + │ Netherlands ┆ 2016-03-01 ┆ 17.11 │ + │ Netherlands ┆ 2018-08-01 ┆ 17.32 │ + │ Netherlands ┆ 2019-01-01 ┆ 17.4 │ + └─────────────┴────────────┴────────────┘ + >>> pop2.join_asof(gdp2, by="country", on="date", strategy="nearest").collect() + shape: (6, 4) + ┌─────────────┬────────────┬────────────┬──────┐ + │ country ┆ date ┆ population ┆ gdp │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ date ┆ f64 ┆ i64 │ + ╞═════════════╪════════════╪════════════╪══════╡ + │ Germany ┆ 2016-03-01 ┆ 82.19 ┆ 4164 │ + │ Germany ┆ 2018-08-01 ┆ 82.66 ┆ 4696 │ + │ Germany ┆ 2019-01-01 ┆ 83.12 ┆ 4696 │ + │ Netherlands ┆ 2016-03-01 ┆ 17.11 ┆ 784 │ + │ Netherlands ┆ 2018-08-01 ┆ 17.32 ┆ 910 │ + │ Netherlands ┆ 2019-01-01 ┆ 17.4 ┆ 910 │ + └─────────────┴────────────┴────────────┴──────┘ + """ if not isinstance(other, LazyFrame): msg = f"expected `other` join table to be a LazyFrame, not a {type(other).__name__!r}" diff --git a/py-polars/polars/schema.py b/py-polars/polars/schema.py index 718ffec75b93..019d2d2f3ad0 100644 --- a/py-polars/polars/schema.py +++ b/py-polars/polars/schema.py @@ -1,9 +1,13 @@ from __future__ import annotations from collections import OrderedDict -from typing import TYPE_CHECKING, Iterable, Mapping +from collections.abc import Mapping +from typing import TYPE_CHECKING, Iterable + +from polars.datatypes._parse import parse_into_dtype if TYPE_CHECKING: + from polars._typing import PythonDataType from polars.datatypes import DataType BaseSchema = OrderedDict[str, DataType] @@ -49,10 +53,19 @@ class Schema(BaseSchema): def __init__( self, - schema: Mapping[str, DataType] | Iterable[tuple[str, DataType]] | None = None, + schema: ( + Mapping[str, DataType | PythonDataType] + | Iterable[tuple[str, DataType | PythonDataType]] + | None + ) = None, ): - schema = schema or {} - super().__init__(schema) + input = ( + schema.items() if schema and isinstance(schema, Mapping) else (schema or {}) + ) + super().__init__({name: parse_into_dtype(tp) for name, tp in input}) # type: ignore[misc] + + def __setitem__(self, name: str, dtype: DataType | PythonDataType) -> 
None: + super().__setitem__(name, parse_into_dtype(dtype)) # type: ignore[assignment] def names(self) -> list[str]: """Get the column names of the schema.""" @@ -65,3 +78,15 @@ def dtypes(self) -> list[DataType]: def len(self) -> int: """Get the number of columns in the schema.""" return len(self) + + def to_python(self) -> dict[str, type]: + """ + Return Schema as a dictionary of column names and their Python types. + + Examples + -------- + >>> s = pl.Schema({"x": pl.Int8(), "y": pl.String(), "z": pl.Duration("ms")}) + >>> s.to_python() + {'x': , 'y': , 'z': } + """ + return {name: tp.to_python() for name, tp in self.items()} diff --git a/py-polars/polars/selectors.py b/py-polars/polars/selectors.py index a42f346c6562..2e56aa3fb91e 100644 --- a/py-polars/polars/selectors.py +++ b/py-polars/polars/selectors.py @@ -2494,7 +2494,7 @@ def starts_with(*prefix: str) -> SelectorType: def string(*, include_categorical: bool = False) -> SelectorType: """ - Select all String (and, optionally, Categorical) string columns . + Select all String (and, optionally, Categorical) string columns. See Also -------- diff --git a/py-polars/polars/series/datetime.py b/py-polars/polars/series/datetime.py index 8c8bfb32bad8..928c81410a62 100644 --- a/py-polars/polars/series/datetime.py +++ b/py-polars/polars/series/datetime.py @@ -1764,10 +1764,12 @@ def round(self, every: str | dt.timedelta | IntoExprColumn) -> Series: This functionality is considered **unstable**. It may be changed at any point without it being considered a breaking change. - Each date/datetime in the first half of the interval is mapped to the start of - its bucket. - Each date/datetime in the second half of the interval is mapped to the end of - its bucket. + - Each date/datetime in the first half of the interval + is mapped to the start of its bucket. + - Each date/datetime in the second half of the interval + is mapped to the end of its bucket. + - Half-way points are mapped to the start of their bucket. + Ambiguous results are localized using the DST offset of the original timestamp - for example, rounding `'2022-11-06 01:20:00 CST'` by `'1h'` results in `'2022-11-06 01:00:00 CST'`, whereas rounding `'2022-11-06 01:20:00 CDT'` by diff --git a/py-polars/polars/series/series.py b/py-polars/polars/series/series.py index b61198fb9deb..901e424641fa 100644 --- a/py-polars/polars/series/series.py +++ b/py-polars/polars/series/series.py @@ -1232,7 +1232,31 @@ def __getitem__(self, key: MultiIndexSelector) -> Series: ... def __getitem__( self, key: SingleIndexSelector | MultiIndexSelector ) -> Any | Series: - """Get part of the Series as a new Series or scalar.""" + """ + Get part of the Series as a new Series or scalar. + + Parameters + ---------- + key + Row(s) to select. + + Returns + ------- + Series or scalar, depending on `key`. + + Examples + -------- + >>> s = pl.Series("a", [1, 4, 2]) + >>> s[0] + 1 + >>> s[0:2] + shape: (2,) + Series: 'a' [i64] + [ + 1 + 4 + ] + """ return get_series_item_by_key(self, key) def __setitem__( @@ -2404,7 +2428,7 @@ def hist( If None given, we determine the boundaries based on the data. bin_count If no bins provided, this will be used to determine - the distance of the bins + the distance of the bins. include_breakpoint Include a column that indicates the upper breakpoint. 
include_category @@ -2418,18 +2442,17 @@ def hist( -------- >>> a = pl.Series("a", [1, 3, 8, 8, 2, 1, 3]) >>> a.hist(bin_count=4) - shape: (5, 3) - ┌────────────┬─────────────┬───────┐ - │ breakpoint ┆ category ┆ count │ - │ --- ┆ --- ┆ --- │ - │ f64 ┆ cat ┆ u32 │ - ╞════════════╪═════════════╪═══════╡ - │ 0.0 ┆ (-inf, 0.0] ┆ 0 │ - │ 2.25 ┆ (0.0, 2.25] ┆ 3 │ - │ 4.5 ┆ (2.25, 4.5] ┆ 2 │ - │ 6.75 ┆ (4.5, 6.75] ┆ 0 │ - │ inf ┆ (6.75, inf] ┆ 2 │ - └────────────┴─────────────┴───────┘ + shape: (4, 3) + ┌────────────┬───────────────┬───────┐ + │ breakpoint ┆ category ┆ count │ + │ --- ┆ --- ┆ --- │ + │ f64 ┆ cat ┆ u32 │ + ╞════════════╪═══════════════╪═══════╡ + │ 2.75 ┆ (0.993, 2.75] ┆ 3 │ + │ 4.5 ┆ (2.75, 4.5] ┆ 2 │ + │ 6.25 ┆ (4.5, 6.25] ┆ 0 │ + │ 8.0 ┆ (6.25, 8.0] ┆ 2 │ + └────────────┴───────────────┴───────┘ """ out = ( self.to_frame() diff --git a/py-polars/polars/series/string.py b/py-polars/polars/series/string.py index 30c5ac90b5c1..cb6d9fbb999d 100644 --- a/py-polars/polars/series/string.py +++ b/py-polars/polars/series/string.py @@ -376,7 +376,7 @@ def contains( self, pattern: str | Expr, *, literal: bool = False, strict: bool = True ) -> Series: """ - Check if strings in Series contain a substring that matches a regex. + Check if the string contains a substring that matches a pattern. Parameters ---------- @@ -480,7 +480,7 @@ def find( See Also -------- - contains : Check if string contains a substring that matches a regex. + contains : Check if the string contains a substring that matches a pattern. Examples -------- @@ -535,7 +535,7 @@ def ends_with(self, suffix: str | Expr) -> Series: See Also -------- - contains : Check if string contains a substring that matches a regex. + contains : Check if the string contains a substring that matches a pattern. starts_with : Check if string values start with a substring. Examples @@ -562,7 +562,7 @@ def starts_with(self, prefix: str | Expr) -> Series: See Also -------- - contains : Check if string contains a substring that matches a regex. + contains : Check if the string contains a substring that matches a pattern. ends_with : Check if string values end with a substring. 
Examples diff --git a/py-polars/tests/unit/cloud/test_prepare_cloud_plan.py b/py-polars/tests/unit/cloud/test_prepare_cloud_plan.py index 825c2c130e57..c54d0c7fc7ad 100644 --- a/py-polars/tests/unit/cloud/test_prepare_cloud_plan.py +++ b/py-polars/tests/unit/cloud/test_prepare_cloud_plan.py @@ -9,7 +9,6 @@ from polars.exceptions import InvalidOperationError CLOUD_SOURCE = "s3://my-nonexistent-bucket/dataset" -CLOUD_SINK = "s3://my-nonexistent-bucket/result" @pytest.mark.parametrize( @@ -22,37 +21,20 @@ ], ) def test_prepare_cloud_plan(lf: pl.LazyFrame) -> None: - result = prepare_cloud_plan(lf, CLOUD_SINK) + result = prepare_cloud_plan(lf) assert isinstance(result, bytes) deserialized = pl.LazyFrame.deserialize(BytesIO(result)) assert isinstance(deserialized, pl.LazyFrame) -def test_prepare_cloud_plan_sink_added() -> None: - lf = pl.LazyFrame({"a": [1, 2], "b": [3, 4]}) - - result = prepare_cloud_plan(lf, CLOUD_SINK) - - deserialized = pl.LazyFrame.deserialize(BytesIO(result)) - assert "SINK (cloud)" in deserialized.explain() - - -def test_prepare_cloud_plan_invalid_sink_uri() -> None: - lf = pl.LazyFrame({"a": [1, 2], "b": [3, 4]}) - local_path = "~/local/result.parquet" - - with pytest.raises(InvalidOperationError, match="non-cloud paths not supported"): - prepare_cloud_plan(lf, local_path) - - def test_prepare_cloud_plan_optimization_toggle() -> None: lf = pl.LazyFrame({"a": [1, 2], "b": [3, 4]}) with pytest.raises(TypeError, match="unexpected keyword argument"): - prepare_cloud_plan(lf, CLOUD_SINK, nonexistent_optimization=False) + prepare_cloud_plan(lf, nonexistent_optimization=False) - result = prepare_cloud_plan(lf, CLOUD_SINK, projection_pushdown=False) + result = prepare_cloud_plan(lf, projection_pushdown=False) assert isinstance(result, bytes) # TODO: How to check that this optimization was toggled correctly? @@ -92,7 +74,7 @@ def test_prepare_cloud_plan_fail_on_udf(lf: pl.LazyFrame) -> None: InvalidOperationError, match="logical plan ineligible for execution on Polars Cloud", ): - prepare_cloud_plan(lf, CLOUD_SINK) + prepare_cloud_plan(lf) @pytest.mark.parametrize( @@ -109,7 +91,7 @@ def test_prepare_cloud_plan_fail_on_local_data_source(lf: pl.LazyFrame) -> None: InvalidOperationError, match="logical plan ineligible for execution on Polars Cloud", ): - prepare_cloud_plan(lf, CLOUD_SINK) + prepare_cloud_plan(lf) @pytest.mark.write_disk() @@ -124,4 +106,4 @@ def test_prepare_cloud_plan_fail_on_python_scan(tmp_path: Path) -> None: InvalidOperationError, match="logical plan ineligible for execution on Polars Cloud", ): - prepare_cloud_plan(lf, CLOUD_SINK) + prepare_cloud_plan(lf) diff --git a/py-polars/tests/unit/constructors/test_series.py b/py-polars/tests/unit/constructors/test_series.py index cfc0c76b6dc2..c31a5b48ce68 100644 --- a/py-polars/tests/unit/constructors/test_series.py +++ b/py-polars/tests/unit/constructors/test_series.py @@ -5,6 +5,7 @@ from typing import TYPE_CHECKING, Any import numpy as np +import pandas as pd import pytest import polars as pl @@ -148,6 +149,12 @@ def test_series_init_np_temporal_with_nat_15518() -> None: assert_series_equal(result, expected) +def test_series_init_pandas_timestamp_18127() -> None: + result = pl.Series([pd.Timestamp("2000-01-01T00:00:00.123456789", tz="UTC")]) + # Note: time unit is not (yet) respected, it should be Datetime('ns', 'UTC'). 
+ assert result.dtype == pl.Datetime("us", "UTC") + + def test_series_init_np_2d_zero_zero_shape() -> None: arr = np.array([]).reshape(0, 0) with pytest.raises( diff --git a/py-polars/tests/unit/dataframe/test_upsample.py b/py-polars/tests/unit/dataframe/test_upsample.py index 163245fb1502..21160ad54df8 100644 --- a/py-polars/tests/unit/dataframe/test_upsample.py +++ b/py-polars/tests/unit/dataframe/test_upsample.py @@ -216,3 +216,71 @@ def test_upsample_index_invalid( every="1h", maintain_order=maintain_order, ) + + +def test_upsample_sorted_only_within_group() -> None: + df = pl.DataFrame( + { + "time": [ + datetime(2021, 4, 1), + datetime(2021, 2, 1), + datetime(2021, 5, 1), + datetime(2021, 6, 1), + ], + "admin": ["Netherlands", "Åland", "Åland", "Netherlands"], + "test2": [1, 0, 2, 3], + } + ) + + up = df.upsample( + time_column="time", + every="1mo", + group_by="admin", + maintain_order=True, + ).select(pl.all().forward_fill()) + + expected = pl.DataFrame( + { + "time": [ + datetime(2021, 4, 1, 0, 0), + datetime(2021, 5, 1, 0, 0), + datetime(2021, 6, 1, 0, 0), + datetime(2021, 2, 1, 0, 0), + datetime(2021, 3, 1, 0, 0), + datetime(2021, 4, 1, 0, 0), + datetime(2021, 5, 1, 0, 0), + ], + "admin": [ + "Netherlands", + "Netherlands", + "Netherlands", + "Åland", + "Åland", + "Åland", + "Åland", + ], + "test2": [1, 1, 3, 0, 0, 0, 2], + } + ) + + assert_frame_equal(up, expected) + + +def test_upsample_sorted_only_within_group_but_no_group_by_provided() -> None: + df = pl.DataFrame( + { + "time": [ + datetime(2021, 4, 1), + datetime(2021, 2, 1), + datetime(2021, 5, 1), + datetime(2021, 6, 1), + ], + "admin": ["Netherlands", "Åland", "Åland", "Netherlands"], + "test2": [1, 0, 2, 3], + } + ) + with pytest.raises( + InvalidOperationError, + match=r"argument in operation 'upsample' is not sorted, please sort the 'expr/series/column' first", + ): + df.upsample(time_column="time", every="1mo") diff --git a/py-polars/tests/unit/io/test_lazy_count_star.py b/py-polars/tests/unit/io/test_lazy_count_star.py index 222f0cb25607..a20f4ea75a36 100644 --- a/py-polars/tests/unit/io/test_lazy_count_star.py +++ b/py-polars/tests/unit/io/test_lazy_count_star.py @@ -5,6 +5,7 @@ if TYPE_CHECKING: from pathlib import Path +import gzip from tempfile import NamedTemporaryFile import pytest @@ -82,3 +83,30 @@ def test_count_ndjson(io_files_path: Path, path: str, n_rows: int) -> None: # Check if we are using our fast count star assert "FAST_COUNT" in lf.explain() assert_frame_equal(lf.collect(), expected) + + +def test_count_compressed_csv_18057(io_files_path: Path) -> None: + csv_file = io_files_path / "gzipped.csv.gz" + + expected = pl.DataFrame( + {"a": [1, 2, 3], "b": ["a", "b", "c"], "c": [1.0, 2.0, 3.0]} + ) + lf = pl.scan_csv(csv_file, truncate_ragged_lines=True) + out = lf.collect() + assert_frame_equal(out, expected) + # This also tests: + # #18070 "CSV count_rows does not skip empty lines at file start" + # as the file has an empty line at the beginning. 
+ assert lf.select(pl.len()).collect().item() == 3 + + +def test_count_compressed_ndjson(tmp_path: Path) -> None: + tmp_path.mkdir(exist_ok=True) + path = tmp_path / "data.jsonl.gz" + df = pl.DataFrame({"x": range(5)}) + + with gzip.open(path, "wb") as f: + df.write_ndjson(f) + + lf = pl.scan_ndjson(path) + assert lf.select(pl.len()).collect().item() == 5 diff --git a/py-polars/tests/unit/io/test_lazy_csv.py b/py-polars/tests/unit/io/test_lazy_csv.py index c2351ec109bc..5672c4b1b7c4 100644 --- a/py-polars/tests/unit/io/test_lazy_csv.py +++ b/py-polars/tests/unit/io/test_lazy_csv.py @@ -438,18 +438,3 @@ def test_scan_csv_with_column_names_nonexistent_file() -> None: # Upon collection, it should fail with pytest.raises(FileNotFoundError): result.collect() - - -def test_scan_csv_compressed_row_count_18057(io_files_path: Path) -> None: - csv_file = io_files_path / "gzipped.csv.gz" - - expected = pl.DataFrame( - {"a": [1, 2, 3], "b": ["a", "b", "c"], "c": [1.0, 2.0, 3.0]} - ) - lf = pl.scan_csv(csv_file, truncate_ragged_lines=True) - out = lf.collect() - assert_frame_equal(out, expected) - # This also tests: - # #18070 "CSV count_rows does not skip empty lines at file start" - # as the file has an empty line at the beginning. - assert lf.select(pl.len()).collect().item() == 3 diff --git a/py-polars/tests/unit/io/test_parquet.py b/py-polars/tests/unit/io/test_parquet.py index 2f9c3436c4be..f93090ad8302 100644 --- a/py-polars/tests/unit/io/test_parquet.py +++ b/py-polars/tests/unit/io/test_parquet.py @@ -3,7 +3,7 @@ import io from datetime import datetime, time, timezone from decimal import Decimal -from typing import TYPE_CHECKING, Any, cast +from typing import TYPE_CHECKING, Any, Literal, cast import fsspec import numpy as np @@ -18,7 +18,7 @@ import polars as pl from polars.exceptions import ComputeError from polars.testing import assert_frame_equal, assert_series_equal -from polars.testing.parametric import dataframes +from polars.testing.parametric import column, dataframes if TYPE_CHECKING: from pathlib import Path @@ -1059,8 +1059,8 @@ def test_hybrid_rle() -> None: f = io.BytesIO() df.write_parquet(f) f.seek(0) - for column in pq.ParquetFile(f).metadata.to_dict()["row_groups"][0]["columns"]: - assert "RLE_DICTIONARY" in column["encodings"] + for col in pq.ParquetFile(f).metadata.to_dict()["row_groups"][0]["columns"]: + assert "RLE_DICTIONARY" in col["encodings"] f.seek(0) assert_frame_equal(pl.read_parquet(f), df) @@ -1450,9 +1450,6 @@ def test_null_array_dict_pages_18085() -> None: row_group_size=st.integers(min_value=10, max_value=1000), ) def test_delta_encoding_roundtrip(df: pl.DataFrame, row_group_size: int) -> None: - print(df.schema) - print(df) - f = io.BytesIO() pq.write_table( df.to_arrow(), @@ -1510,3 +1507,226 @@ def test_delta_strings_encoding_roundtrip( f.seek(0) assert_frame_equal(pl.read_parquet(f), df) + + +EQUALITY_OPERATORS = ["__eq__", "__lt__", "__le__", "__gt__", "__ge__"] +BOOLEAN_OPERATORS = ["__or__", "__and__"] + + +@given( + df=dataframes( + min_size=0, max_size=100, min_cols=2, max_cols=5, allowed_dtypes=[pl.Int32] + ), + first_op=st.sampled_from(EQUALITY_OPERATORS), + second_op=st.sampled_from( + [None] + + [ + (booljoin, eq) + for booljoin in BOOLEAN_OPERATORS + for eq in EQUALITY_OPERATORS + ] + ), + l1=st.integers(min_value=0, max_value=1000), + l2=st.integers(min_value=0, max_value=1000), + r1=st.integers(min_value=0, max_value=1000), + r2=st.integers(min_value=0, max_value=1000), +) +@pytest.mark.parametrize("parallel_st", ["auto", "prefiltered"]) 
+@settings(
+    deadline=None,
+    suppress_health_check=[HealthCheck.function_scoped_fixture],
+)
+@pytest.mark.write_disk()
+def test_predicate_filtering(
+    tmp_path: Path,
+    df: pl.DataFrame,
+    first_op: str,
+    second_op: None | tuple[str, str],
+    l1: int,
+    l2: int,
+    r1: int,
+    r2: int,
+    parallel_st: Literal["auto", "prefiltered"],
+) -> None:
+    tmp_path.mkdir(exist_ok=True)
+    f = tmp_path / "test.parquet"
+
+    df.write_parquet(f, row_group_size=5)
+
+    cols = df.columns
+
+    l1s = cols[l1 % len(cols)]
+    l2s = cols[l2 % len(cols)]
+    expr = (getattr(pl.col(l1s), first_op))(pl.col(l2s))
+
+    if second_op is not None:
+        r1s = cols[r1 % len(cols)]
+        r2s = cols[r2 % len(cols)]
+        expr = getattr(expr, second_op[0])(
+            (getattr(pl.col(r1s), second_op[1]))(pl.col(r2s))
+        )
+
+    result = pl.scan_parquet(f, parallel=parallel_st).filter(expr).collect()
+    assert_frame_equal(result, df.filter(expr))
+
+
+@given(
+    df=dataframes(
+        min_size=1,
+        max_size=5,
+        min_cols=1,
+        max_cols=1,
+        excluded_dtypes=[pl.Decimal, pl.Categorical, pl.Enum],
+    ),
+    offset=st.integers(0, 100),
+    length=st.integers(0, 100),
+)
+@settings(
+    suppress_health_check=[HealthCheck.function_scoped_fixture],
+)
+@pytest.mark.write_disk()
+def test_slice_roundtrip(
+    df: pl.DataFrame, offset: int, length: int, tmp_path: Path
+) -> None:
+    tmp_path.mkdir(exist_ok=True)
+    f = tmp_path / "test.parquet"
+
+    offset %= df.height + 1
+    length %= df.height - offset + 1
+
+    df.write_parquet(f)
+
+    scanned = pl.scan_parquet(f).slice(offset, length).collect()
+    assert_frame_equal(scanned, df.slice(offset, length))
+
+
+@pytest.mark.write_disk()
+def test_struct_prefiltered(tmp_path: Path) -> None:
+    tmp_path.mkdir(exist_ok=True)
+    f = tmp_path / "test.parquet"
+
+    df = pl.DataFrame({"a": {"x": 1, "y": 2}})
+    df.write_parquet(f)
+
+    (
+        pl.scan_parquet(f, parallel="prefiltered")
+        .filter(pl.col("a").struct.field("x") == 1)
+        .collect()
+    )
+
+
+@pytest.mark.parametrize(
+    "data",
+    [
+        (
+            [{"x": ""}, {"x": "0"}],
+            pa.struct([pa.field("x", pa.string(), nullable=True)]),
+        ),
+        (
+            [{"x": ""}, {"x": "0"}],
+            pa.struct([pa.field("x", pa.string(), nullable=False)]),
+        ),
+        ([[""], ["0"]], pa.list_(pa.field("item", pa.string(), nullable=False))),
+        ([[""], ["0"]], pa.list_(pa.field("item", pa.string(), nullable=True))),
+        ([[""], ["0"]], pa.list_(pa.field("item", pa.string(), nullable=False), 1)),
+        ([[""], ["0"]], pa.list_(pa.field("item", pa.string(), nullable=True), 1)),
+        (
+            [["", "1"], ["0", "2"]],
+            pa.list_(pa.field("item", pa.string(), nullable=False), 2),
+        ),
+        (
+            [["", "1"], ["0", "2"]],
+            pa.list_(pa.field("item", pa.string(), nullable=True), 2),
+        ),
+    ],
+)
+@pytest.mark.parametrize("nullable", [False, True])
+@pytest.mark.write_disk()
+def test_nested_skip_18303(
+    data: tuple[list[dict[str, str] | list[str]], pa.DataType],
+    nullable: bool,
+    tmp_path: Path,
+) -> None:
+    tmp_path.mkdir(exist_ok=True)
+    f = tmp_path / "test.parquet"
+
+    schema = pa.schema([pa.field("a", data[1], nullable=nullable)])
+    tb = pa.table({"a": data[0]}, schema=schema)
+    pq.write_table(tb, f)
+
+    scanned = pl.scan_parquet(f).slice(1, 1).collect()
+
+    assert_frame_equal(scanned, pl.DataFrame(tb).slice(1, 1))
+
+
+@given(
+    df=dataframes(
+        min_size=0,
+        max_size=100,
+        min_cols=2,
+        max_cols=5,
+        allowed_dtypes=[pl.String, pl.Binary],
+        include_cols=[
+            column("filter_col", pl.Int8, st.integers(0, 1), allow_null=False)
+        ],
+    ),
+)
+@settings(
+    deadline=None,
+    suppress_health_check=[HealthCheck.function_scoped_fixture],
+)
+@pytest.mark.write_disk()
+def test_delta_length_byte_array_prefiltering(
+    tmp_path: Path,
+    df: pl.DataFrame,
+) -> None:
+    tmp_path.mkdir(exist_ok=True)
+    f = tmp_path / "test.parquet"
+
+    cols = df.columns
+
+    encodings = {col: "DELTA_LENGTH_BYTE_ARRAY" for col in cols}
+    encodings["filter_col"] = "PLAIN"
+
+    pq.write_table(
+        df.to_arrow(),
+        f,
+        use_dictionary=False,
+        column_encoding=encodings,
+    )
+
+    expr = pl.col("filter_col") == 0
+    result = pl.scan_parquet(f, parallel="prefiltered").filter(expr).collect()
+    assert_frame_equal(result, df.filter(expr))
+
+
+@given(
+    df=dataframes(
+        min_size=0,
+        max_size=10,
+        min_cols=1,
+        max_cols=5,
+        excluded_dtypes=[pl.Decimal, pl.Categorical, pl.Enum],
+        include_cols=[
+            column("filter_col", pl.Int8, st.integers(0, 1), allow_null=False)
+        ],
+    ),
+)
+@settings(
+    deadline=None,
+    suppress_health_check=[HealthCheck.function_scoped_fixture],
+)
+@pytest.mark.write_disk()
+def test_general_prefiltering(
+    tmp_path: Path,
+    df: pl.DataFrame,
+) -> None:
+    tmp_path.mkdir(exist_ok=True)
+    f = tmp_path / "test.parquet"
+
+    df.write_parquet(f)
+
+    expr = pl.col("filter_col") == 0
+
+    result = pl.scan_parquet(f, parallel="prefiltered").filter(expr).collect()
+    assert_frame_equal(result, df.filter(expr))
diff --git a/py-polars/tests/unit/io/test_scan.py b/py-polars/tests/unit/io/test_scan.py
index a1094ec778f0..8476b8b5f666 100644
--- a/py-polars/tests/unit/io/test_scan.py
+++ b/py-polars/tests/unit/io/test_scan.py
@@ -640,18 +640,14 @@ def test_scan_include_file_name(
     streaming: bool,
 ) -> None:
     tmp_path.mkdir(exist_ok=True)
-    paths: list[Path] = []
     dfs: list[pl.DataFrame] = []
 
     for x in ["1", "2"]:
-        paths.append(Path(f"{tmp_path}/{x}.bin").absolute())
-        dfs.append(pl.DataFrame({"x": x}))
-        write_func(dfs[-1], paths[-1])
-
-    df = pl.concat(dfs).with_columns(
-        pl.Series("path", map(str, paths), dtype=pl.String)
-    )
+        path = Path(f"{tmp_path}/{x}.bin").absolute()
+        dfs.append(pl.DataFrame({"x": 10 * [x]}).with_columns(path=pl.lit(str(path))))
+        write_func(dfs[-1].drop("path"), path)
 
+    df = pl.concat(dfs)
     assert df.columns == ["x", "path"]
 
     with pytest.raises(
diff --git a/py-polars/tests/unit/lazyframe/test_serde.py b/py-polars/tests/unit/lazyframe/test_serde.py
index 515ce490693e..86a5c932b7f5 100644
--- a/py-polars/tests/unit/lazyframe/test_serde.py
+++ b/py-polars/tests/unit/lazyframe/test_serde.py
@@ -40,6 +40,7 @@ def test_lf_serde_roundtrip_binary(lf: pl.LazyFrame) -> None:
         ],
     )
 )
+@pytest.mark.filterwarnings("ignore")
 def test_lf_serde_roundtrip_json(lf: pl.LazyFrame) -> None:
     serialized = lf.serialize(format="json")
     result = pl.LazyFrame.deserialize(io.StringIO(serialized), format="json")
@@ -52,13 +53,7 @@ def lf() -> pl.LazyFrame:
     return pl.LazyFrame({"a": [1, 2, 3], "b": ["a", "b", "c"]}).select("a").sum()
 
 
-def test_lf_serde(lf: pl.LazyFrame) -> None:
-    serialized = lf.serialize()
-    assert isinstance(serialized, bytes)
-    result = pl.LazyFrame.deserialize(io.BytesIO(serialized))
-    assert_frame_equal(result, lf)
-
-
+@pytest.mark.filterwarnings("ignore")
 def test_lf_serde_json_stringio(lf: pl.LazyFrame) -> None:
     serialized = lf.serialize(format="json")
     assert isinstance(serialized, str)
@@ -66,6 +61,13 @@ def test_lf_serde_json_stringio(lf: pl.LazyFrame) -> None:
     assert_frame_equal(result, lf)
 
 
+def test_lf_serde(lf: pl.LazyFrame) -> None:
+    serialized = lf.serialize()
+    assert isinstance(serialized, bytes)
+    result = pl.LazyFrame.deserialize(io.BytesIO(serialized))
+    assert_frame_equal(result, lf)
+
+
 @pytest.mark.parametrize(
     ("format", "buf"),
     [
@@ -74,6 +76,7 @@ def test_lf_serde_json_stringio(lf: pl.LazyFrame) -> None:
         ("json", io.BytesIO()),
     ],
 )
+@pytest.mark.filterwarnings("ignore")
 def test_lf_serde_to_from_buffer(
     lf: pl.LazyFrame, format: SerializationFormat, buf: io.IOBase
 ) -> None:
diff --git a/py-polars/tests/unit/operations/namespaces/temporal/test_datetime.py b/py-polars/tests/unit/operations/namespaces/temporal/test_datetime.py
index b212c7be133d..cd4ee34bde71 100644
--- a/py-polars/tests/unit/operations/namespaces/temporal/test_datetime.py
+++ b/py-polars/tests/unit/operations/namespaces/temporal/test_datetime.py
@@ -1427,3 +1427,42 @@ def test_literal_from_datetime(
 
     assert out.schema == OrderedDict({"literal": dtype})
     assert out.item() == value
+
+
+@pytest.mark.parametrize(
+    "value",
+    [
+        time(0),
+        time(hour=1),
+        time(hour=16, minute=43, microsecond=500),
+        time(hour=23, minute=59, second=59, microsecond=999999),
+    ],
+)
+def test_literal_from_time(value: time) -> None:
+    out = pl.select(pl.lit(value))
+    assert out.schema == OrderedDict({"literal": pl.Time})
+    assert out.item() == value
+
+
+@pytest.mark.parametrize(
+    "dtype",
+    [
+        None,
+        pl.Duration("ms"),
+        pl.Duration("us"),
+        pl.Duration("ns"),
+    ],
+)
+@pytest.mark.parametrize(
+    "value",
+    [
+        timedelta(0),
+        timedelta(hours=1),
+        timedelta(days=-99999),
+        timedelta(days=99999),
+    ],
+)
+def test_literal_from_timedelta(value: time, dtype: pl.Duration | None) -> None:
+    out = pl.select(pl.lit(value, dtype=dtype))
+    assert out.schema == OrderedDict({"literal": dtype or pl.Duration("us")})
+    assert out.item() == value
diff --git a/py-polars/tests/unit/operations/namespaces/temporal/test_round.py b/py-polars/tests/unit/operations/namespaces/temporal/test_round.py
index 1ac7acc3edcd..49ed4328b8f0 100644
--- a/py-polars/tests/unit/operations/namespaces/temporal/test_round.py
+++ b/py-polars/tests/unit/operations/namespaces/temporal/test_round.py
@@ -189,3 +189,51 @@ def test_round_datetime_w_expression(time_unit: TimeUnit) -> None:
     result = df.select(pl.col("a").dt.round(pl.col("b")))["a"]
     assert result[0] == datetime(2020, 1, 1)
     assert result[1] == datetime(2020, 1, 21)
+
+
+@pytest.mark.parametrize(
+    ("time_unit", "expected"),
+    [
+        ("ms", 0),
+        ("us", 0),
+        ("ns", 0),
+    ],
+)
+def test_round_negative_towards_epoch_18239(time_unit: TimeUnit, expected: int) -> None:
+    s = pl.Series([datetime(1970, 1, 1)], dtype=pl.Datetime(time_unit))
+    s = s.dt.offset_by(f"-1{time_unit}")
+    result = s.dt.round(f"2{time_unit}").dt.timestamp(time_unit="ns").item()
+    assert result == expected
+    result = (
+        s.dt.replace_time_zone("Europe/London")
+        .dt.round(f"2{time_unit}")
+        .dt.replace_time_zone(None)
+        .dt.timestamp(time_unit="ns")
+        .item()
+    )
+    assert result == expected
+
+
+@pytest.mark.parametrize(
+    ("time_unit", "expected"),
+    [
+        ("ms", 2_000_000),
+        ("us", 2_000),
+        ("ns", 2),
+    ],
+)
+def test_round_positive_away_from_epoch_18239(
+    time_unit: TimeUnit, expected: int
+) -> None:
+    s = pl.Series([datetime(1970, 1, 1)], dtype=pl.Datetime(time_unit))
+    s = s.dt.offset_by(f"1{time_unit}")
+    result = s.dt.round(f"2{time_unit}").dt.timestamp(time_unit="ns").item()
+    assert result == expected
+    result = (
+        s.dt.replace_time_zone("Europe/London")
+        .dt.round(f"2{time_unit}")
+        .dt.replace_time_zone(None)
+        .dt.timestamp(time_unit="ns")
+        .item()
+    )
+    assert result == expected
diff --git a/py-polars/tests/unit/operations/rolling/test_rolling.py b/py-polars/tests/unit/operations/rolling/test_rolling.py
index 8e5bbfd69bd1..d934683d645f 100644
--- a/py-polars/tests/unit/operations/rolling/test_rolling.py
+++ b/py-polars/tests/unit/operations/rolling/test_rolling.py
@@ -589,6 +589,44 @@ def test_rolling_cov_corr() -> None:
     assert res["corr"][:2] == [None] * 2
 
 
+def test_rolling_cov_corr_nulls() -> None:
+    df1 = pl.DataFrame(
+        {"a": [1.06, 1.07, 0.93, 0.78, 0.85], "lag_a": [1.0, 1.06, 1.07, 0.93, 0.78]}
+    )
+    df2 = pl.DataFrame(
+        {
+            "a": [1.0, 1.06, 1.07, 0.93, 0.78, 0.85],
+            "lag_a": [None, 1.0, 1.06, 1.07, 0.93, 0.78],
+        }
+    )
+
+    val_1 = df1.select(
+        pl.rolling_corr("a", "lag_a", window_size=10, min_periods=5, ddof=1)
+    )
+    val_2 = df2.select(
+        pl.rolling_corr("a", "lag_a", window_size=10, min_periods=5, ddof=1)
+    )
+
+    df1_expected = pl.DataFrame({"a": [None, None, None, None, 0.62204709]})
+    df2_expected = pl.DataFrame({"a": [None, None, None, None, None, 0.62204709]})
+
+    assert_frame_equal(val_1, df1_expected, atol=0.0000001)
+    assert_frame_equal(val_2, df2_expected, atol=0.0000001)
+
+    val_1 = df1.select(
+        pl.rolling_cov("a", "lag_a", window_size=10, min_periods=5, ddof=1)
+    )
+    val_2 = df2.select(
+        pl.rolling_cov("a", "lag_a", window_size=10, min_periods=5, ddof=1)
+    )
+
+    df1_expected = pl.DataFrame({"a": [None, None, None, None, 0.009445]})
+    df2_expected = pl.DataFrame({"a": [None, None, None, None, None, 0.009445]})
+
+    assert_frame_equal(val_1, df1_expected, atol=0.0000001)
+    assert_frame_equal(val_2, df2_expected, atol=0.0000001)
+
+
 @pytest.mark.parametrize("time_unit", ["ms", "us", "ns"])
 def test_rolling_empty_window_9406(time_unit: TimeUnit) -> None:
     datecol = pl.Series(
diff --git a/py-polars/tests/unit/operations/test_filter.py b/py-polars/tests/unit/operations/test_filter.py
index df796b44b991..eed550fac516 100644
--- a/py-polars/tests/unit/operations/test_filter.py
+++ b/py-polars/tests/unit/operations/test_filter.py
@@ -285,3 +285,16 @@ def test_filter_group_aware_17030() -> None:
         (group_count > 2) & (group_cum_count > 1) & (group_cum_count < group_count)
     )
     assert df.filter(filter_expr)["foo"].to_list() == ["1", "2"]
+
+
+def test_invalid_filter_18295() -> None:
+    codes = ["a"] * 5 + ["b"] * 5
+    values = list(range(-2, 3)) + list(range(2, -3, -1))
+    df = pl.DataFrame({"code": codes, "value": values})
+    with pytest.raises(pl.exceptions.ShapeError):
+        df.group_by("code").agg(
+            pl.col("value")
+            .ewm_mean(span=2, ignore_nulls=True)
+            .tail(3)
+            .filter(pl.col("value") > 0),
+        ).sort("code")
diff --git a/py-polars/tests/unit/operations/test_slice.py b/py-polars/tests/unit/operations/test_slice.py
index 7c8fb22665c1..692fcb5634dc 100644
--- a/py-polars/tests/unit/operations/test_slice.py
+++ b/py-polars/tests/unit/operations/test_slice.py
@@ -243,3 +243,33 @@ def test_double_sort_slice_pushdown_15779() -> None:
     assert (
         pl.LazyFrame({"foo": [1, 2]}).sort("foo").head(0).sort("foo").collect()
     ).shape == (0, 1)
+
+
+def test_slice_pushdown_simple_projection_18288() -> None:
+    lf = pl.DataFrame({"col": ["0", "notanumber"]}).lazy()
+    lf = lf.with_columns([pl.col("col").cast(pl.Int64)])
+    lf = lf.with_columns([pl.col("col"), pl.lit(None)])
+    assert lf.head(1).collect().to_dict(as_series=False) == {
+        "col": [0],
+        "literal": [None],
+    }
+
+
+def test_group_by_slice_all_keys() -> None:
+    df = pl.DataFrame(
+        {
+            "a": ["Tom", "Nick", "Marry", "Krish", "Jack", None],
+            "b": [
+                "2020-01-01",
+                "2020-01-02",
+                "2020-01-03",
+                "2020-01-04",
+                "2020-01-05",
+                None,
+            ],
+            "c": [5, 6, 6, 7, 8, 5],
+        }
+    )
+
+    gb = df.group_by(["a", "b", "c"], maintain_order=True)
+    assert_frame_equal(gb.tail(1), gb.head(1))
diff --git a/py-polars/tests/unit/operations/test_statistics.py b/py-polars/tests/unit/operations/test_statistics.py
index 8aa1b0ae6811..ed8b964582cb 100644
--- a/py-polars/tests/unit/operations/test_statistics.py
+++ b/py-polars/tests/unit/operations/test_statistics.py
@@ -7,6 +7,7 @@
 import pytest
 
 import polars as pl
+from polars import StringCache
 from polars.testing import assert_frame_equal
 
 
@@ -37,16 +38,21 @@ def test_corr_nan() -> None:
     assert str(df.select(pl.corr("a", "b", ddof=1))[0, 0]) == "nan"
 
 
+@StringCache()
 def test_hist() -> None:
-    a = pl.Series("a", [1, 3, 8, 8, 2, 1, 3])
-    assert (
-        str(a.hist(bin_count=4).to_dict(as_series=False))
-        == "{'breakpoint': [0.0, 2.25, 4.5, 6.75, inf], 'category': ['(-inf, 0.0]', '(0.0, 2.25]', '(2.25, 4.5]', '(4.5, 6.75]', '(6.75, inf]'], 'count': [0, 3, 2, 0, 2]}"
+    s = pl.Series("a", [1, 3, 8, 8, 2, 1, 3])
+    out = s.hist(bin_count=4)
+    expected = pl.DataFrame(
+        {
+            "breakpoint": pl.Series([2.75, 4.5, 6.25, 8.0], dtype=pl.Float64),
+            "category": pl.Series(
+                ["(0.993, 2.75]", "(2.75, 4.5]", "(4.5, 6.25]", "(6.25, 8.0]"],
+                dtype=pl.Categorical,
+            ),
+            "count": pl.Series([3, 2, 0, 2], dtype=pl.get_index_type()),
+        }
     )
-
-    assert a.hist(
-        bins=[0, 2], include_category=False, include_breakpoint=False
-    ).to_series().to_list() == [0, 3, 4]
+    assert_frame_equal(out, expected, categorical_as_str=True)
 
 
 @pytest.mark.parametrize("values", [[], [None]])
diff --git a/py-polars/tests/unit/sql/test_set_ops.py b/py-polars/tests/unit/sql/test_set_ops.py
index 64508887d1c5..f148d561c31b 100644
--- a/py-polars/tests/unit/sql/test_set_ops.py
+++ b/py-polars/tests/unit/sql/test_set_ops.py
@@ -69,6 +69,26 @@ def test_except_intersect_by_name() -> None:
     assert res_i.columns == ["x", "y", "z"]
 
 
+@pytest.mark.parametrize(
+    ("op", "op_subtype"),
+    [
+        ("EXCEPT", "ALL"),
+        ("EXCEPT", "ALL BY NAME"),
+        ("INTERSECT", "ALL"),
+        ("INTERSECT", "ALL BY NAME"),
+    ],
+)
+def test_except_intersect_all_unsupported(op: str, op_subtype: str) -> None:
+    df1 = pl.DataFrame({"n": [1, 1, 1, 2, 2, 2, 3]})  # noqa: F841
+    df2 = pl.DataFrame({"n": [1, 1, 2, 2]})  # noqa: F841
+
+    with pytest.raises(
+        SQLInterfaceError,
+        match=f"'{op} {op_subtype}' is not supported",
+    ):
+        pl.sql(f"SELECT * FROM df1 {op} {op_subtype} SELECT * FROM df2")
+
+
 @pytest.mark.parametrize("op", ["EXCEPT", "INTERSECT", "UNION"])
 def test_except_intersect_errors(op: str) -> None:
     df1 = pl.DataFrame({"x": [1, 9, 1, 1], "y": [2, 3, 4, 4], "z": [5, 5, 5, 5]})  # noqa: F841
diff --git a/py-polars/tests/unit/test_errors.py b/py-polars/tests/unit/test_errors.py
index 244555715657..2087387b1a8a 100644
--- a/py-polars/tests/unit/test_errors.py
+++ b/py-polars/tests/unit/test_errors.py
@@ -1,7 +1,7 @@
 from __future__ import annotations
 
 import io
-from datetime import date, datetime, time
+from datetime import date, datetime, time, tzinfo
 from decimal import Decimal
 from typing import TYPE_CHECKING, Any
 
@@ -326,10 +326,16 @@ def test_datetime_time_add_err() -> None:
 def test_invalid_dtype() -> None:
     with pytest.raises(
         TypeError,
-        match="cannot parse input of type 'str' into Polars data type: 'mayonnaise'",
+        match=r"cannot parse input of type 'str' into Polars data type \(given: 'mayonnaise'\)",
     ):
         pl.Series([1, 2], dtype="mayonnaise")  # type: ignore[arg-type]
 
+    with pytest.raises(
+        TypeError,
+        match="cannot parse input into Polars data type",
+    ):
+        pl.Series([None], dtype=tzinfo)  # type: ignore[arg-type]
+
 
 def test_arr_eval_named_cols() -> None:
     df = pl.DataFrame({"A": ["a", "b"], "B": [["a", "b"], ["c", "d"]]})
@@ -484,7 +490,7 @@ def test_skip_nulls_err() -> None:
 
     with pytest.raises(
         ComputeError,
-        match=r"The output type of the 'apply' function cannot be determined",
+        match=r"The output type of the 'map_elements' function cannot be determined",
     ):
         df.with_columns(pl.col("foo").map_elements(lambda x: x, skip_nulls=True))
 
diff --git a/py-polars/tests/unit/test_schema.py b/py-polars/tests/unit/test_schema.py
index 8ccb4497ac0f..4c09c986eeb9 100644
--- a/py-polars/tests/unit/test_schema.py
+++ b/py-polars/tests/unit/test_schema.py
@@ -1,4 +1,5 @@
 import pickle
+from datetime import datetime
 
 import polars as pl
 
@@ -14,20 +15,29 @@ def test_schema() -> None:
 
 
 def test_schema_parse_nonpolars_dtypes() -> None:
-    # Currently, no parsing is being done.
-    s = pl.Schema({"foo": pl.List, "bar": int})  # type: ignore[arg-type]
+    cardinal_directions = pl.Enum(["north", "south", "east", "west"])
+
+    s = pl.Schema({"foo": pl.List, "bar": int, "baz": cardinal_directions})  # type: ignore[arg-type]
+    s["ham"] = datetime
 
     assert s["foo"] == pl.List
-    assert s["bar"] is int
-    assert s.len() == 2
-    assert s.names() == ["foo", "bar"]
-    assert s.dtypes() == [pl.List, int]
+    assert s["bar"] == pl.Int64
+    assert s["baz"] == cardinal_directions
+    assert s["ham"] == pl.Datetime("us")
+
+    assert s.len() == 4
+    assert s.names() == ["foo", "bar", "baz", "ham"]
+    assert s.dtypes() == [pl.List, pl.Int64, cardinal_directions, pl.Datetime("us")]
+
+    assert list(s.to_python().values()) == [list, int, str, datetime]
+    assert [tp.to_python() for tp in s.dtypes()] == [list, int, str, datetime]
 
 
 def test_schema_equality() -> None:
     s1 = pl.Schema({"foo": pl.Int8(), "bar": pl.Float64()})
     s2 = pl.Schema({"foo": pl.Int8(), "bar": pl.String()})
     s3 = pl.Schema({"bar": pl.Float64(), "foo": pl.Int8()})
+
     assert s1 == s1
     assert s2 == s2
     assert s3 == s3
@@ -37,14 +47,38 @@ def test_schema_equality() -> None:
 
 
 def test_schema_picklable() -> None:
-    s = pl.Schema({"foo": pl.Int8(), "bar": pl.String()})
-
+    s = pl.Schema(
+        {
+            "foo": pl.Int8(),
+            "bar": pl.String(),
+            "ham": pl.Struct({"x": pl.List(pl.Date)}),
+        }
+    )
     pickled = pickle.dumps(s)
     s2 = pickle.loads(pickled)
-
     assert s == s2
 
 
+def test_schema_python() -> None:
+    input = {
+        "foo": pl.Int8(),
+        "bar": pl.String(),
+        "baz": pl.Categorical("lexical"),
+        "ham": pl.Object(),
+        "spam": pl.Struct({"time": pl.List(pl.Duration), "dist": pl.Float64}),
+    }
+    expected = {
+        "foo": int,
+        "bar": str,
+        "baz": str,
+        "ham": object,
+        "spam": dict,
+    }
+    for schema in (input, input.items(), list(input.items())):
+        s = pl.Schema(schema)
+        assert expected == s.to_python()
+
+
 def test_schema_in_map_elements_returns_scalar() -> None:
     schema = pl.Schema([("portfolio", pl.String()), ("irr", pl.Float64())])
 
@@ -62,6 +96,11 @@ def test_schema_in_map_elements_returns_scalar() -> None:
         )
         .alias("irr")
     )
-
     assert (q.collect_schema()) == schema
     assert q.collect().schema == schema
+
+
+def test_ir_cache_unique_18198() -> None:
+    lf = pl.LazyFrame({"a": [1]})
+    lf.collect_schema()
+    assert pl.concat([lf, lf]).collect().to_dict(as_series=False) == {"a": [1, 1]}
diff --git a/rust-toolchain.toml b/rust-toolchain.toml
index a6e580b10775..ef33a7d39711 100644
--- a/rust-toolchain.toml
+++ b/rust-toolchain.toml
@@ -1,2 +1,2 @@
 [toolchain]
-channel = "nightly-2024-07-26"
+channel = "nightly-2024-08-26"