From 2b54214f862d2792e393fa00edb0fdf238f3d330 Mon Sep 17 00:00:00 2001
From: nameexhaustion
Date: Mon, 8 Jul 2024 16:50:13 +1000
Subject: [PATCH] docs(python): Add example for writing hive partitioned parquet to user guide (#17483)

---
 docs/src/python/user-guide/io/hive.py | 13 +++++++++++++
 docs/user-guide/io/hive.md            | 32 +++++++++++++++++++++++++++-
 2 files changed, 44 insertions(+), 1 deletion(-)

diff --git a/docs/src/python/user-guide/io/hive.py b/docs/src/python/user-guide/io/hive.py
index 55e8825c3ee7..215b4eea6f87 100644
--- a/docs/src/python/user-guide/io/hive.py
+++ b/docs/src/python/user-guide/io/hive.py
@@ -116,3 +116,16 @@ def dir_recurse(path: Path):
 
 print(df)
 # --8<-- [end:scan_file_hive]
+
+# --8<-- [start:write_parquet_partitioned_show_data]
+df = pl.DataFrame({"a": [1, 1, 2, 2, 3], "b": [1, 1, 1, 2, 2], "c": 1})
+print(df)
+# --8<-- [end:write_parquet_partitioned_show_data]
+
+# --8<-- [start:write_parquet_partitioned]
+df.write_parquet_partitioned("docs/data/hive_write/", ["a", "b"])
+# --8<-- [end:write_parquet_partitioned]
+
+# --8<-- [start:write_parquet_partitioned_show_paths]
+print_paths("docs/data/hive_write/")
+# --8<-- [end:write_parquet_partitioned_show_paths]
diff --git a/docs/user-guide/io/hive.md b/docs/user-guide/io/hive.md
index db95f252ad8f..27af6b6c18ee 100644
--- a/docs/user-guide/io/hive.md
+++ b/docs/user-guide/io/hive.md
@@ -1,4 +1,4 @@
-## Hive partitioned data
+## Scanning hive partitioned data
 
 Polars supports scanning hive partitioned parquet and IPC datasets, with planned support for other
 formats in the future.
@@ -69,3 +69,33 @@ Pass `hive_partitioning=True` to enable hive partition parsing:
 ```python exec="on" result="text" session="user-guide/io/hive"
 --8<-- "python/user-guide/io/hive.py:scan_file_hive"
 ```
+
+## Writing hive partitioned data
+
+> Note: The following functionality is considered _unstable_, and is subject to change.
+
+Polars supports writing hive partitioned parquet datasets, with planned support for other formats.
+
+### Example
+
+For this example, the following DataFrame is used:
+
+{{code_block('user-guide/io/hive','write_parquet_partitioned_show_data',['write_parquet_partitioned'])}}
+
+```python exec="on" result="text" session="user-guide/io/hive"
+--8<-- "python/user-guide/io/hive.py:write_parquet_partitioned_show_data"
+```
+
+We will write it to a hive partitioned parquet dataset, partitioned by the columns `a` and `b`:
+
+{{code_block('user-guide/io/hive','write_parquet_partitioned',['write_parquet_partitioned'])}}
+
+```python exec="on" result="text" session="user-guide/io/hive"
+--8<-- "python/user-guide/io/hive.py:write_parquet_partitioned"
+```
+
+The output is a hive partitioned parquet dataset with the following paths:
+
+```python exec="on" result="text" session="user-guide/io/hive"
+--8<-- "python/user-guide/io/hive.py:write_parquet_partitioned_show_paths"
+```
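
Read end to end, the workflow this patch documents is: build a DataFrame, write it out partitioned by one or more columns, then scan it back with hive partition parsing enabled. Below is a minimal sketch of that round trip, combining the `write_parquet_partitioned` call added here with the `scan_parquet(..., hive_partitioning=True)` usage covered earlier on the same user guide page; the output directory is illustrative, and the write API is unstable per the patch's own note.

```python
import polars as pl

# The same small frame used in the example above; the scalar 1 is broadcast to column "c".
df = pl.DataFrame({"a": [1, 1, 2, 2, 3], "b": [1, 1, 1, 2, 2], "c": 1})

# Write a hive partitioned parquet dataset, partitioned by columns "a" and "b".
# This produces paths of the form <root>/a=<value>/b=<value>/*.parquet.
# Note: write_parquet_partitioned is marked unstable and may change.
df.write_parquet_partitioned("docs/data/hive_write/", ["a", "b"])

# Scan the dataset back; hive_partitioning=True parses the a=... and b=...
# directory names back into columns.
out = pl.scan_parquet("docs/data/hive_write/", hive_partitioning=True).collect()
print(out)
```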