docs(python): Add examples for scanning hive datasets to user guide (#17431)
pola-rs · Jul 5, 2024 · afce7e5 · afce7e5
1 parent 34126ca
commit afce7e5
Show file tree

Hide file tree

Showing 4 changed files with 191 additions and 0 deletions.
diff --git a/docs/src/python/user-guide/io/hive.py b/docs/src/python/user-guide/io/hive.py
@@ -0,0 +1,118 @@
+# --8<-- [start:init_paths]
+import polars as pl
+from pathlib import Path
+
+dfs = [
+    pl.DataFrame({"x": [1, 2]}),
+    pl.DataFrame({"x": [3, 4, 5]}),
+    pl.DataFrame({"x": [6, 7]}),
+    pl.DataFrame({"x": [8, 9, 10, 11]}),
+]
+
+parts = [
+    "year=2023/month=11",
+    "year=2023/month=12",
+    "year=2024/month=01",
+    "year=2024/month=02",
+]
+
+# Write each frame twice: once under `docs/data/hive/` and once under
+# `docs/data/hive_mixed/`, each time at `<root>/<part>/data.parquet`.
+for df, part in zip(dfs, parts):
+    path = Path("docs/data/hive/") / part / "data.parquet"
+    # Create the nested `year=.../month=...` directories if they are missing.
+    Path(path).parent.mkdir(exist_ok=True, parents=True)
+    df.write_parquet(path)
+
+    path = Path("docs/data/hive_mixed/") / part / "data.parquet"
+    Path(path).parent.mkdir(exist_ok=True, parents=True)
+    df.write_parquet(path)
+
+# Add a non-parquet file next to the partition directories of the "mixed"
+# dataset; the `scan_dir_err` section below scans this directory inside a
+# try/except and prints the error.
+Path("docs/data/hive_mixed/description.txt").touch()
+
+
+def print_paths(path: str) -> None:
+    """Print a sorted one-column table ("File path") of all files under `path`.
+
+    Directories are descended into recursively; only non-directory entries
+    are listed. Nothing is returned.
+    """
+    def dir_recurse(path: Path):
+        # Generator: recurse into directories, yield every non-directory path.
+        if path.is_dir():
+            for p in path.iterdir():
+                yield from dir_recurse(p)
+        else:
+            yield path
+
+    # Collect the paths as strings into a Series, sort them, and wrap the
+    # result in a single-column DataFrame for printing.
+    df = (
+        pl.Series(
+            "File path",
+            (str(x) for x in dir_recurse(Path(path))),
+            dtype=pl.String,
+        )
+        .sort()
+        .to_frame()
+    )
+
+    # Temporarily hide the dtype row and the shape header; the large string
+    # length limit is presumably there so long paths print untruncated.
+    with pl.Config(
+        tbl_hide_column_data_types=True,
+        tbl_hide_dataframe_shape=True,
+        fmt_str_lengths=999,
+    ):
+        print(df)
+
+
+print_paths("docs/data/hive/")
+# --8<-- [end:init_paths]
+
+# --8<-- [start:show_mixed_paths]
+print_paths("docs/data/hive_mixed/")
+# --8<-- [end:show_mixed_paths]
+
+# --8<-- [start:scan_dir]
+import polars as pl
+
+df = pl.scan_parquet("docs/data/hive/").collect()
+
+with pl.Config(tbl_rows=99):
+    print(df)
+# --8<-- [end:scan_dir]
+
+# --8<-- [start:scan_dir_err]
+from pathlib import Path
+
+try:
+    pl.scan_parquet("docs/data/hive_mixed/").collect()
+except Exception as e:
+    print(e)
+
+# --8<-- [end:scan_dir_err]
+
+# --8<-- [start:scan_glob]
+df = pl.scan_parquet(
+    # Glob to match all files ending in `.parquet`
+    "docs/data/hive_mixed/**/*.parquet",
+    hive_partitioning=True,
+).collect()
+
+with pl.Config(tbl_rows=99):
+    print(df)
+
+# --8<-- [end:scan_glob]
+
+# --8<-- [start:scan_file_no_hive]
+df = pl.scan_parquet(
+    [
+        "docs/data/hive/year=2024/month=01/data.parquet",
+        "docs/data/hive/year=2024/month=02/data.parquet",
+    ],
+).collect()
+
+print(df)
+
+# --8<-- [end:scan_file_no_hive]
+
+# --8<-- [start:scan_file_hive]
+df = pl.scan_parquet(
+    [
+        "docs/data/hive/year=2024/month=01/data.parquet",
+        "docs/data/hive/year=2024/month=02/data.parquet",
+    ],
+    hive_partitioning=True,
+).collect()
+
+print(df)
+
+# --8<-- [end:scan_file_hive]
diff --git a/docs/user-guide/io/hive.md b/docs/user-guide/io/hive.md
@@ -0,0 +1,71 @@
+## Hive partitioned data
+
+Polars supports scanning hive partitioned parquet and IPC datasets, with planned support for other
+formats in the future.
+
+Hive partition parsing is enabled by default if `scan_parquet` receives a single directory path,
+otherwise it is disabled by default. This can be explicitly configured using the `hive_partitioning`
+parameter.
+
+### Scanning a hive directory
+
+For this example the following directory structure is used:
+
+```python exec="on" result="text" session="user-guide/io/hive"
+--8<-- "python/user-guide/io/hive.py:init_paths"
+```
+
+Simply pass the directory to `scan_parquet`, and all files will be loaded with the hive parts in the
+path included in the output:
+
+{{code_block('user-guide/io/hive','scan_dir',['scan_parquet'])}}
+
+```python exec="on" result="text" session="user-guide/io/hive"
+--8<-- "python/user-guide/io/hive.py:scan_dir"
+```
+
+### Handling mixed files
+
+Passing a directory to `scan_parquet` may not work if there are extra non-data files next to the
+data files.
+
+For this example the following directory structure is used:
+
+```python exec="on" result="text" session="user-guide/io/hive"
+--8<-- "python/user-guide/io/hive.py:show_mixed_paths"
+```
+
+{{code_block('user-guide/io/hive','scan_dir_err',['scan_parquet'])}}
+
+The above fails as `description.txt` is not a valid parquet file:
+
+```python exec="on" result="text" session="user-guide/io/hive"
+--8<-- "python/user-guide/io/hive.py:scan_dir_err"
+```
+
+In this situation, a glob pattern can be used to be more specific about which files to load. Note
+that `hive_partitioning` must be explicitly set to `True`:
+
+{{code_block('user-guide/io/hive','scan_glob',['scan_parquet'])}}
+
+```python exec="on" result="text" session="user-guide/io/hive"
+--8<-- "python/user-guide/io/hive.py:scan_glob"
+```
+
+### Scanning file paths with hive parts
+
+`hive_partitioning` is not enabled by default for file paths:
+
+{{code_block('user-guide/io/hive','scan_file_no_hive',['scan_parquet'])}}
+
+```python exec="on" result="text" session="user-guide/io/hive"
+--8<-- "python/user-guide/io/hive.py:scan_file_no_hive"
+```
+
+Pass `hive_partitioning=True` to enable hive partition parsing:
+
+{{code_block('user-guide/io/hive','scan_file_hive',['scan_parquet'])}}
+
+```python exec="on" result="text" session="user-guide/io/hive"
+--8<-- "python/user-guide/io/hive.py:scan_file_hive"
+```
diff --git a/docs/user-guide/io/index.md b/docs/user-guide/io/index.md
@@ -7,6 +7,7 @@ Reading and writing your data is crucial for a DataFrame library. In this chapte
 - [Parquet](parquet.md)
 - [Json](json.md)
 - [Multiple](multiple.md)
+- [Hive](hive.md)
 - [Database](database.md)
 - [Cloud storage](cloud-storage.md)
 - [Google Big Query](bigquery.md)
diff --git a/mkdocs.yml b/mkdocs.yml
@@ -65,6 +65,7 @@ nav:
       - user-guide/io/parquet.md
       - user-guide/io/json.md
       - user-guide/io/multiple.md
+      - user-guide/io/hive.md
       - user-guide/io/database.md
       - user-guide/io/cloud-storage.md
       - user-guide/io/bigquery.md