From dd5476c189f77e22e76a4b9ae32848b911358e11 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Edwin=20Vehmaanper=C3=A4?= Date: Sun, 4 Aug 2024 01:04:15 +0300 Subject: [PATCH 1/7] docs(python): Fix LazyFrame fetch method references --- .../source/reference/lazyframe/miscellaneous.rst | 1 + py-polars/polars/lazyframe/frame.py | 13 ------------- 2 files changed, 1 insertion(+), 13 deletions(-) diff --git a/py-polars/docs/source/reference/lazyframe/miscellaneous.rst b/py-polars/docs/source/reference/lazyframe/miscellaneous.rst index 2a0ab647766d..0cdf5096e445 100644 --- a/py-polars/docs/source/reference/lazyframe/miscellaneous.rst +++ b/py-polars/docs/source/reference/lazyframe/miscellaneous.rst @@ -10,6 +10,7 @@ Miscellaneous LazyFrame.collect LazyFrame.collect_async LazyFrame.collect_schema + LazyFrame.fetch LazyFrame.lazy LazyFrame.map_batches LazyFrame.pipe diff --git a/py-polars/polars/lazyframe/frame.py b/py-polars/polars/lazyframe/frame.py index cf98067d1809..ee03dcc18821 100644 --- a/py-polars/polars/lazyframe/frame.py +++ b/py-polars/polars/lazyframe/frame.py @@ -1894,7 +1894,6 @@ def collect( See Also -------- - fetch: Run the query on the first `n` rows only for debugging purposes. explain : Print the query plan that is evaluated with collect. profile : Collect the LazyFrame and time each node in the computation graph. polars.collect_all : Collect multiple LazyFrames at the same time. @@ -5188,12 +5187,6 @@ def limit(self, n: int = 5) -> LazyFrame: n Number of rows to return. - Notes - ----- - Consider using the :func:`fetch` operation if you only want to test your - query. The :func:`fetch` operation will load the first `n` rows at the scan - level, whereas the :func:`head`/:func:`limit` are applied at the end. - Examples -------- >>> lf = pl.LazyFrame( @@ -5237,12 +5230,6 @@ def head(self, n: int = 5) -> LazyFrame: n Number of rows to return. - Notes - ----- - Consider using the :func:`fetch` operation if you only want to test your - query. The :func:`fetch` operation will load the first `n` rows at the scan - level, whereas the :func:`head`/:func:`limit` are applied at the end. - Examples -------- >>> lf = pl.LazyFrame( From e71333cadf9beddb819ff0ca186da9b415f0baf5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Edwin=20Vehmaanper=C3=A4?= Date: Mon, 5 Aug 2024 19:45:22 +0300 Subject: [PATCH 2/7] Remove fetch again --- py-polars/docs/source/reference/lazyframe/miscellaneous.rst | 1 - 1 file changed, 1 deletion(-) diff --git a/py-polars/docs/source/reference/lazyframe/miscellaneous.rst b/py-polars/docs/source/reference/lazyframe/miscellaneous.rst index 0cdf5096e445..2a0ab647766d 100644 --- a/py-polars/docs/source/reference/lazyframe/miscellaneous.rst +++ b/py-polars/docs/source/reference/lazyframe/miscellaneous.rst @@ -10,7 +10,6 @@ Miscellaneous LazyFrame.collect LazyFrame.collect_async LazyFrame.collect_schema - LazyFrame.fetch LazyFrame.lazy LazyFrame.map_batches LazyFrame.pipe From 3c86c0afebaa780f5c0e9568f0b077f593514fc8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Edwin=20Vehmaanper=C3=A4?= Date: Mon, 5 Aug 2024 19:45:48 +0300 Subject: [PATCH 3/7] Remove fetch from user-guide --- .../src/python/user-guide/lazy/execution.py | 8 ------ docs/source/user-guide/lazy/execution.md | 27 ------------------- 2 files changed, 35 deletions(-) diff --git a/docs/source/src/python/user-guide/lazy/execution.py b/docs/source/src/python/user-guide/lazy/execution.py index c4a2ed165922..ffef5ba20e8a 100644 --- a/docs/source/src/python/user-guide/lazy/execution.py +++ b/docs/source/src/python/user-guide/lazy/execution.py @@ -28,12 +28,4 @@ .collect(streaming=True) ) # --8<-- [end:stream] -# --8<-- [start:partial] -q9 = ( - pl.scan_csv(f"docs/assets/data/reddit.csv") - .with_columns(pl.col("name").str.to_uppercase()) - .filter(pl.col("comment_karma") > 0) - .fetch(n_rows=int(100)) -) -# --8<-- [end:partial] """ diff --git a/docs/source/user-guide/lazy/execution.md b/docs/source/user-guide/lazy/execution.md index 975f52a0ac4a..77713be810f9 100644 --- a/docs/source/user-guide/lazy/execution.md +++ b/docs/source/user-guide/lazy/execution.md @@ -49,31 +49,4 @@ If your data requires more memory than you have available Polars may be able to We look at [streaming in more detail here](streaming.md). -### Execution on a partial dataset - -While you're writing, optimizing or checking your query on a large dataset, querying all available data may lead to a slow development process. - -You can instead execute the query with the `.fetch` method. The `.fetch` method takes a parameter `n_rows` and tries to 'fetch' that number of rows at the data source. The number of rows cannot be guaranteed, however, as the lazy API does not count how many rows there are at each stage of the query. - -Here we "fetch" 100 rows from the source file and apply the predicates. - -{{code_block('user-guide/lazy/execution','partial',['scan_csv','collect','fetch'])}} - -```text -shape: (27, 6) -┌───────┬───────────────────────────┬─────────────┬────────────┬───────────────┬────────────┐ -│ id ┆ name ┆ created_utc ┆ updated_on ┆ comment_karma ┆ link_karma │ -│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ -│ i64 ┆ str ┆ i64 ┆ i64 ┆ i64 ┆ i64 │ -╞═══════╪═══════════════════════════╪═════════════╪════════════╪═══════════════╪════════════╡ -│ 6 ┆ TAOJIANLONG_JASONBROKEN ┆ 1397113510 ┆ 1536527864 ┆ 4 ┆ 0 │ -│ 17 ┆ SSAIG_JASONBROKEN ┆ 1397113544 ┆ 1536527864 ┆ 1 ┆ 0 │ -│ 19 ┆ FDBVFDSSDGFDS_JASONBROKEN ┆ 1397113552 ┆ 1536527864 ┆ 3 ┆ 0 │ -│ 37 ┆ IHATEWHOWEARE_JASONBROKEN ┆ 1397113636 ┆ 1536527864 ┆ 61 ┆ 0 │ -│ … ┆ … ┆ … ┆ … ┆ … ┆ … │ -│ 77763 ┆ LUNCHY ┆ 1137599510 ┆ 1536528275 ┆ 65 ┆ 0 │ -│ 77765 ┆ COMPOSTELLAS ┆ 1137474000 ┆ 1536528276 ┆ 6 ┆ 0 │ -│ 77766 ┆ GENERICBOB ┆ 1137474000 ┆ 1536528276 ┆ 291 ┆ 14 │ -│ 77768 ┆ TINHEADNED ┆ 1139665457 ┆ 1536497404 ┆ 4434 ┆ 103 │ -└───────┴───────────────────────────┴─────────────┴────────────┴───────────────┴────────────┘ ``` From ff415b13deed3da3f3c02d9153ae6d297829ca6d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Edwin=20Vehmaanper=C3=A4?= Date: Wed, 7 Aug 2024 20:08:39 +0300 Subject: [PATCH 4/7] Fix lint --- docs/source/user-guide/lazy/execution.md | 2 -- 1 file changed, 2 deletions(-) diff --git a/docs/source/user-guide/lazy/execution.md b/docs/source/user-guide/lazy/execution.md index 77713be810f9..d5fcf1a89675 100644 --- a/docs/source/user-guide/lazy/execution.md +++ b/docs/source/user-guide/lazy/execution.md @@ -48,5 +48,3 @@ If your data requires more memory than you have available Polars may be able to {{code_block('user-guide/lazy/execution','stream',['scan_csv','collect'])}} We look at [streaming in more detail here](streaming.md). - -``` From 319b22a45c4e15e4b48b295bfa5e720099f762fe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Edwin=20Vehmaanper=C3=A4?= Date: Tue, 24 Sep 2024 18:57:08 +0300 Subject: [PATCH 5/7] Add a modified user-guide section back --- .../src/python/user-guide/lazy/execution.py | 9 +++++++++ docs/source/user-guide/lazy/execution.md | 19 +++++++++++++++++++ 2 files changed, 28 insertions(+) diff --git a/docs/source/src/python/user-guide/lazy/execution.py b/docs/source/src/python/user-guide/lazy/execution.py index ffef5ba20e8a..d410c8a45371 100644 --- a/docs/source/src/python/user-guide/lazy/execution.py +++ b/docs/source/src/python/user-guide/lazy/execution.py @@ -28,4 +28,13 @@ .collect(streaming=True) ) # --8<-- [end:stream] +# --8<-- [start:partial] +q9 = ( + pl.scan_csv(f"docs/assets/data/reddit.csv") + .head(10) + .with_columns(pl.col("name").str.to_uppercase()) + .filter(pl.col("comment_karma") > 0) + .collect() +) +# --8<-- [end:partial] """ diff --git a/docs/source/user-guide/lazy/execution.md b/docs/source/user-guide/lazy/execution.md index d5fcf1a89675..618926852f75 100644 --- a/docs/source/user-guide/lazy/execution.md +++ b/docs/source/user-guide/lazy/execution.md @@ -48,3 +48,22 @@ If your data requires more memory than you have available Polars may be able to {{code_block('user-guide/lazy/execution','stream',['scan_csv','collect'])}} We look at [streaming in more detail here](streaming.md). + +### Execution on a partial dataset + +While you're writing, optimizing or checking your query on a large dataset, querying all available data may lead to a slow development process. + +You can instead limit the number of scanned partitions or use .head early in the query when testing. Keep in mind that aggregations and filters may behave unpredictably on subsets of data. + +{{code_block('user-guide/lazy/execution','partial',['scan_csv','collect','head'])}} + +```text +shape: (1, 6) +┌─────┬─────────────────────────┬─────────────┬────────────┬───────────────┬────────────┐ +│ id ┆ name ┆ created_utc ┆ updated_on ┆ comment_karma ┆ link_karma │ +│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ +│ i64 ┆ str ┆ i64 ┆ i64 ┆ i64 ┆ i64 │ +╞═════╪═════════════════════════╪═════════════╪════════════╪═══════════════╪════════════╡ +│ 6 ┆ TAOJIANLONG_JASONBROKEN ┆ 1397113510 ┆ 1536527864 ┆ 4 ┆ 0 │ +└─────┴─────────────────────────┴─────────────┴────────────┴───────────────┴────────────┘ + ``` \ No newline at end of file From 0ac3f52d869ddd4ed62935283c0def3f15ed3d0d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Edwin=20Vehmaanper=C3=A4?= Date: Tue, 24 Sep 2024 19:01:21 +0300 Subject: [PATCH 6/7] Lint --- docs/source/user-guide/lazy/execution.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/user-guide/lazy/execution.md b/docs/source/user-guide/lazy/execution.md index 618926852f75..67ab9ad798a9 100644 --- a/docs/source/user-guide/lazy/execution.md +++ b/docs/source/user-guide/lazy/execution.md @@ -66,4 +66,4 @@ shape: (1, 6) ╞═════╪═════════════════════════╪═════════════╪════════════╪═══════════════╪════════════╡ │ 6 ┆ TAOJIANLONG_JASONBROKEN ┆ 1397113510 ┆ 1536527864 ┆ 4 ┆ 0 │ └─────┴─────────────────────────┴─────────────┴────────────┴───────────────┴────────────┘ - ``` \ No newline at end of file +``` From ea5cd0c4fa6c85a2f6bca340b51b443d2a96bfee Mon Sep 17 00:00:00 2001 From: "Edwin V." Date: Tue, 24 Sep 2024 16:46:32 +0000 Subject: [PATCH 7/7] Update docs/source/user-guide/lazy/execution.md MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Rodrigo Girão Serrão <5621605+rodrigogiraoserrao@users.noreply.github.com> --- docs/source/user-guide/lazy/execution.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/source/user-guide/lazy/execution.md b/docs/source/user-guide/lazy/execution.md index 67ab9ad798a9..da3e154270b2 100644 --- a/docs/source/user-guide/lazy/execution.md +++ b/docs/source/user-guide/lazy/execution.md @@ -53,7 +53,8 @@ We look at [streaming in more detail here](streaming.md). While you're writing, optimizing or checking your query on a large dataset, querying all available data may lead to a slow development process. -You can instead limit the number of scanned partitions or use .head early in the query when testing. Keep in mind that aggregations and filters may behave unpredictably on subsets of data. +Instead, you can scan a subset of your partitions or use `.head`/`.collect` at the beginning and end of your query, respectively. +Keep in mind that the results of aggregations and filters on subsets of your data may not be representative of the result you would get on the full data. {{code_block('user-guide/lazy/execution','partial',['scan_csv','collect','head'])}}