From afce7e5a095c219ca5ff0d8f760dc97c596c6aa5 Mon Sep 17 00:00:00 2001 From: nameexhaustion Date: Sat, 6 Jul 2024 04:12:37 +1000 Subject: [PATCH] docs(python): Add examples for scanning hive datasets to user guide (#17431) --- docs/src/python/user-guide/io/hive.py | 118 ++++++++++++++++++++++++++ docs/user-guide/io/hive.md | 71 ++++++++++++++++ docs/user-guide/io/index.md | 1 + mkdocs.yml | 1 + 4 files changed, 191 insertions(+) create mode 100644 docs/src/python/user-guide/io/hive.py create mode 100644 docs/user-guide/io/hive.md diff --git a/docs/src/python/user-guide/io/hive.py b/docs/src/python/user-guide/io/hive.py new file mode 100644 index 000000000000..55e8825c3ee7 --- /dev/null +++ b/docs/src/python/user-guide/io/hive.py @@ -0,0 +1,118 @@ +# --8<-- [start:init_paths] +import polars as pl +from pathlib import Path + +dfs = [ + pl.DataFrame({"x": [1, 2]}), + pl.DataFrame({"x": [3, 4, 5]}), + pl.DataFrame({"x": [6, 7]}), + pl.DataFrame({"x": [8, 9, 10, 11]}), +] + +parts = [ + "year=2023/month=11", + "year=2023/month=12", + "year=2024/month=01", + "year=2024/month=02", +] + +for df, part in zip(dfs, parts): + path = Path("docs/data/hive/") / part / "data.parquet" + Path(path).parent.mkdir(exist_ok=True, parents=True) + df.write_parquet(path) + + path = Path("docs/data/hive_mixed/") / part / "data.parquet" + Path(path).parent.mkdir(exist_ok=True, parents=True) + df.write_parquet(path) + +Path("docs/data/hive_mixed/description.txt").touch() + + +def print_paths(path: str) -> None: + def dir_recurse(path: Path): + if path.is_dir(): + for p in path.iterdir(): + yield from dir_recurse(p) + else: + yield path + + df = ( + pl.Series( + "File path", + (str(x) for x in dir_recurse(Path(path))), + dtype=pl.String, + ) + .sort() + .to_frame() + ) + + with pl.Config( + tbl_hide_column_data_types=True, + tbl_hide_dataframe_shape=True, + fmt_str_lengths=999, + ): + print(df) + + +print_paths("docs/data/hive/") +# --8<-- [end:init_paths] + +# --8<-- [start:show_mixed_paths] 
+print_paths("docs/data/hive_mixed/") +# --8<-- [end:show_mixed_paths] + +# --8<-- [start:scan_dir] +import polars as pl + +df = pl.scan_parquet("docs/data/hive/").collect() + +with pl.Config(tbl_rows=99): + print(df) +# --8<-- [end:scan_dir] + +# --8<-- [start:scan_dir_err] +from pathlib import Path + +try: + pl.scan_parquet("docs/data/hive_mixed/").collect() +except Exception as e: + print(e) + +# --8<-- [end:scan_dir_err] + +# --8<-- [start:scan_glob] +df = pl.scan_parquet( + # Glob to match all files ending in `.parquet` + "docs/data/hive_mixed/**/*.parquet", + hive_partitioning=True, +).collect() + +with pl.Config(tbl_rows=99): + print(df) + +# --8<-- [end:scan_glob] + +# --8<-- [start:scan_file_no_hive] +df = pl.scan_parquet( + [ + "docs/data/hive/year=2024/month=01/data.parquet", + "docs/data/hive/year=2024/month=02/data.parquet", + ], +).collect() + +print(df) + +# --8<-- [end:scan_file_no_hive] + +# --8<-- [start:scan_file_hive] +df = pl.scan_parquet( + [ + "docs/data/hive/year=2024/month=01/data.parquet", + "docs/data/hive/year=2024/month=02/data.parquet", + ], + hive_partitioning=True, +).collect() + +print(df) + +# --8<-- [end:scan_file_hive] diff --git a/docs/user-guide/io/hive.md b/docs/user-guide/io/hive.md new file mode 100644 index 000000000000..db95f252ad8f --- /dev/null +++ b/docs/user-guide/io/hive.md @@ -0,0 +1,71 @@ +## Hive partitioned data + +Polars supports scanning hive partitioned parquet and IPC datasets, with planned support for other +formats in the future. + +Hive partition parsing is enabled by default if `scan_parquet` receives a single directory path, +otherwise it is disabled by default. This can be explicitly configured using the `hive_partitioning` +parameter. 
+ +### Scanning a hive directory + +For this example the following directory structure is used: + +```python exec="on" result="text" session="user-guide/io/hive" +--8<-- "python/user-guide/io/hive.py:init_paths" +``` + +Simply pass the directory to `scan_parquet`, and all files will be loaded with the hive parts in the +path included in the output: + +{{code_block('user-guide/io/hive','scan_dir',['scan_parquet'])}} + +```python exec="on" result="text" session="user-guide/io/hive" +--8<-- "python/user-guide/io/hive.py:scan_dir" +``` + +### Handling mixed files + +Passing a directory to `scan_parquet` may not work if there are extra non-data files next to the +data files. + +For this example the following directory structure is used: + +```python exec="on" result="text" session="user-guide/io/hive" +--8<-- "python/user-guide/io/hive.py:show_mixed_paths" +``` + +{{code_block('user-guide/io/hive','scan_dir_err',['scan_parquet'])}} + +The above fails as `description.txt` is not a valid parquet file: + +```python exec="on" result="text" session="user-guide/io/hive" +--8<-- "python/user-guide/io/hive.py:scan_dir_err" +``` + +In this situation, a glob pattern can be used to be more specific about which files to load. 
Note +that `hive_partitioning` must be explicitly set to `True`: + +{{code_block('user-guide/io/hive','scan_glob',['scan_parquet'])}} + +```python exec="on" result="text" session="user-guide/io/hive" +--8<-- "python/user-guide/io/hive.py:scan_glob" +``` + +### Scanning file paths with hive parts + +`hive_partitioning` is not enabled by default for file paths: + +{{code_block('user-guide/io/hive','scan_file_no_hive',['scan_parquet'])}} + +```python exec="on" result="text" session="user-guide/io/hive" +--8<-- "python/user-guide/io/hive.py:scan_file_no_hive" +``` + +Pass `hive_partitioning=True` to enable hive partition parsing: + +{{code_block('user-guide/io/hive','scan_file_hive',['scan_parquet'])}} + +```python exec="on" result="text" session="user-guide/io/hive" +--8<-- "python/user-guide/io/hive.py:scan_file_hive" +``` diff --git a/docs/user-guide/io/index.md b/docs/user-guide/io/index.md index 5a3548871e8a..4495cefc9257 100644 --- a/docs/user-guide/io/index.md +++ b/docs/user-guide/io/index.md @@ -7,6 +7,7 @@ Reading and writing your data is crucial for a DataFrame library. In this chapte - [Parquet](parquet.md) - [Json](json.md) - [Multiple](multiple.md) +- [Hive](hive.md) - [Database](database.md) - [Cloud storage](cloud-storage.md) - [Google Big Query](bigquery.md) diff --git a/mkdocs.yml b/mkdocs.yml index d2e3c1e637fa..87e1b03a0212 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -65,6 +65,7 @@ nav: - user-guide/io/parquet.md - user-guide/io/json.md - user-guide/io/multiple.md + - user-guide/io/hive.md - user-guide/io/database.md - user-guide/io/cloud-storage.md - user-guide/io/bigquery.md