From ef49f7f4199531fe7e5cf51e619975ec98519fee Mon Sep 17 00:00:00 2001 From: nameexhaustion Date: Fri, 19 Jul 2024 21:22:22 +1000 Subject: [PATCH] docs(python): Add Hugging Face section to user guide (#17721) --- docs/_build/API_REFERENCE_LINKS.yml | 1 + docs/src/python/user-guide/io/hugging-face.py | 20 +++++ docs/user-guide/io/hugging-face.md | 81 +++++++++++++++++++ docs/user-guide/io/index.md | 1 + mkdocs.yml | 1 + 5 files changed, 104 insertions(+) create mode 100644 docs/src/python/user-guide/io/hugging-face.py create mode 100644 docs/user-guide/io/hugging-face.md diff --git a/docs/_build/API_REFERENCE_LINKS.yml b/docs/_build/API_REFERENCE_LINKS.yml index 4cd8c18a3b05..eb868c5bab8c 100644 --- a/docs/_build/API_REFERENCE_LINKS.yml +++ b/docs/_build/API_REFERENCE_LINKS.yml @@ -69,6 +69,7 @@ python: read_parquet: https://docs.pola.rs/api/python/stable/reference/api/polars.read_parquet.html write_parquet: https://docs.pola.rs/api/python/stable/reference/api/polars.DataFrame.write_parquet.html scan_parquet: https://docs.pola.rs/api/python/stable/reference/api/polars.scan_parquet.html + scan_ipc: https://docs.pola.rs/api/python/stable/reference/api/polars.scan_ipc.html read_json: https://docs.pola.rs/api/python/stable/reference/api/polars.read_json.html read_ndjson: https://docs.pola.rs/api/python/stable/reference/api/polars.read_ndjson.html write_ndjson: https://docs.pola.rs/api/python/stable/reference/api/polars.DataFrame.write_ndjson.html diff --git a/docs/src/python/user-guide/io/hugging-face.py b/docs/src/python/user-guide/io/hugging-face.py new file mode 100644 index 000000000000..09e162863884 --- /dev/null +++ b/docs/src/python/user-guide/io/hugging-face.py @@ -0,0 +1,20 @@ +# --8<-- [start:setup] +import polars as pl + +# --8<-- [end:setup] + +# --8<-- [start:scan_iris_csv] +print(pl.scan_csv("hf://datasets/nameexhaustion/polars-docs/iris.csv").collect()) +# --8<-- [end:scan_iris_csv] + +# --8<-- [start:scan_iris_ndjson] 
+print(pl.scan_ndjson("hf://datasets/nameexhaustion/polars-docs/iris.jsonl").collect()) +# --8<-- [end:scan_iris_ndjson] + +# --8<-- [start:scan_parquet_hive] +print(pl.scan_parquet("hf://datasets/nameexhaustion/polars-docs/hive_dates/").collect()) +# --8<-- [end:scan_parquet_hive] + +# --8<-- [start:scan_ipc] +print(pl.scan_ipc("hf://spaces/nameexhaustion/polars-docs/orders.feather").collect()) +# --8<-- [end:scan_ipc] diff --git a/docs/user-guide/io/hugging-face.md b/docs/user-guide/io/hugging-face.md new file mode 100644 index 000000000000..7bb0b425d7e4 --- /dev/null +++ b/docs/user-guide/io/hugging-face.md @@ -0,0 +1,81 @@ +# Hugging Face + +## Scanning datasets from Hugging Face + +All cloud-enabled scan functions also transparently support scanning from Hugging Face: + +- [scan_parquet](https://docs.pola.rs/api/python/stable/reference/api/polars.scan_parquet.html) +- [scan_csv](https://docs.pola.rs/api/python/stable/reference/api/polars.scan_csv.html) +- [scan_ipc](https://docs.pola.rs/api/python/stable/reference/api/polars.scan_ipc.html) +- [scan_ndjson](https://docs.pola.rs/api/python/stable/reference/api/polars.scan_ndjson.html) + +### Path format + +To scan from Hugging Face, a `hf://` path can be passed to the scan functions. The `hf://` path format is defined as `hf://BUCKET/REPOSITORY@REVISION/PATH`, where: + +- `BUCKET` is one of `datasets` or `spaces` +- `REPOSITORY` is the location of the repository, this is usually in the format of `username/repo_name`. A branch can also be optionally specified by appending `@branch` +- `REVISION` is the name of the branch (or commit) to use. This is optional and defaults to `main` if not given. +- `PATH` is a file or directory path, or a glob pattern from the repository root. 
+ +Example `hf://` paths: + +| Path | Path components | +| ----------------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| hf://datasets/nameexhaustion/polars-docs/iris.csv | Bucket: datasets
Repository: nameexhaustion/polars-docs
Branch: main
Path: iris.csv
[Web URL](https://huggingface.co/datasets/nameexhaustion/polars-docs/tree/main/) | +| hf://datasets/nameexhaustion/polars-docs@foods/\*.csv | Bucket: datasets
Repository: nameexhaustion/polars-docs
Branch: foods
Path: \*.csv
[Web URL](https://huggingface.co/datasets/nameexhaustion/polars-docs/tree/foods/) | +| hf://datasets/nameexhaustion/polars-docs/hive_dates/ | Bucket: datasets
Repository: nameexhaustion/polars-docs
Branch: main
Path: hive_dates/
[Web URL](https://huggingface.co/datasets/nameexhaustion/polars-docs/tree/main/hive_dates/) | +| hf://spaces/nameexhaustion/polars-docs/orders.feather | Bucket: spaces
Repository: nameexhaustion/polars-docs
Branch: main
Path: orders.feather
[Web URL](https://huggingface.co/spaces/nameexhaustion/polars-docs/tree/main/) | + +### Authentication + +A Hugging Face API key can be passed to Polars to access private locations using either of the following methods: + +- Passing a `token` in `storage_options` to the scan function, e.g. `scan_parquet(..., storage_options={'token': '<your HF token>'})` +- Setting the `HF_TOKEN` environment variable, e.g. `export HF_TOKEN=<your HF token>` + +### Examples + +#### CSV + +```python exec="on" result="text" session="user-guide/io/hugging-face" +--8<-- "python/user-guide/io/hugging-face.py:setup" +``` + +{{code_block('user-guide/io/hugging-face','scan_iris_csv',['scan_csv'])}} + +```python exec="on" result="text" session="user-guide/io/hugging-face" +--8<-- "python/user-guide/io/hugging-face.py:scan_iris_csv" +``` + +See this file at [https://huggingface.co/datasets/nameexhaustion/polars-docs/blob/main/iris.csv](https://huggingface.co/datasets/nameexhaustion/polars-docs/blob/main/iris.csv) + +#### NDJSON + +{{code_block('user-guide/io/hugging-face','scan_iris_ndjson',['scan_ndjson'])}} + +```python exec="on" result="text" session="user-guide/io/hugging-face" +--8<-- "python/user-guide/io/hugging-face.py:scan_iris_ndjson" +``` + +See this file at [https://huggingface.co/datasets/nameexhaustion/polars-docs/blob/main/iris.jsonl](https://huggingface.co/datasets/nameexhaustion/polars-docs/blob/main/iris.jsonl) + +#### Parquet + +{{code_block('user-guide/io/hugging-face','scan_parquet_hive',['scan_parquet'])}} + +```python exec="on" result="text" session="user-guide/io/hugging-face" +--8<-- "python/user-guide/io/hugging-face.py:scan_parquet_hive" +``` + +See this folder at [https://huggingface.co/datasets/nameexhaustion/polars-docs/tree/main/hive_dates/](https://huggingface.co/datasets/nameexhaustion/polars-docs/tree/main/hive_dates/) + +#### IPC + +{{code_block('user-guide/io/hugging-face','scan_ipc',['scan_ipc'])}} + +```python exec="on" result="text" session="user-guide/io/hugging-face" +--8<-- 
"python/user-guide/io/hugging-face.py:scan_ipc" +``` + +See this file at [https://huggingface.co/spaces/nameexhaustion/polars-docs/blob/main/orders.feather](https://huggingface.co/spaces/nameexhaustion/polars-docs/blob/main/orders.feather) diff --git a/docs/user-guide/io/index.md b/docs/user-guide/io/index.md index 4495cefc9257..75cac3f47a5e 100644 --- a/docs/user-guide/io/index.md +++ b/docs/user-guide/io/index.md @@ -11,3 +11,4 @@ Reading and writing your data is crucial for a DataFrame library. In this chapte - [Database](database.md) - [Cloud storage](cloud-storage.md) - [Google Big Query](bigquery.md) +- [Hugging Face](hugging-face.md) diff --git a/mkdocs.yml b/mkdocs.yml index 7dda2d2d3b49..ed93b9badcfe 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -69,6 +69,7 @@ nav: - user-guide/io/database.md - user-guide/io/cloud-storage.md - user-guide/io/bigquery.md + - user-guide/io/hugging-face.md - SQL: - user-guide/sql/intro.md - user-guide/sql/show.md