From cca31b3b8f4e7ea256992f8550854469c5c39c5a Mon Sep 17 00:00:00 2001 From: Ritchie Vink Date: Sun, 13 Oct 2024 15:40:58 +0200 Subject: [PATCH] fix(python): Don't trigger row limit in array construction (#19215) --- crates/polars-error/src/constants.rs | 4 +- .../polars/_utils/construction/series.py | 63 +++++++++++++------ 2 files changed, 45 insertions(+), 22 deletions(-) diff --git a/crates/polars-error/src/constants.rs b/crates/polars-error/src/constants.rs index 910c1e62a499..b6367e3abb2e 100644 --- a/crates/polars-error/src/constants.rs +++ b/crates/polars-error/src/constants.rs @@ -11,7 +11,7 @@ pub static FALSE: &str = "false"; #[cfg(not(feature = "python"))] pub static LENGTH_LIMIT_MSG: &str = - "polars' maximum length reached. Consider compiling with 'bigidx' feature."; + "Polars' maximum length reached. Consider compiling with 'bigidx' feature."; #[cfg(feature = "python")] pub static LENGTH_LIMIT_MSG: &str = - "polars' maximum length reached. Consider installing 'polars-u64-idx'."; + "Polars' maximum length reached. Consider installing 'polars-u64-idx'."; diff --git a/py-polars/polars/_utils/construction/series.py b/py-polars/polars/_utils/construction/series.py index 121378b6d873..f8b700badc20 100644 --- a/py-polars/polars/_utils/construction/series.py +++ b/py-polars/polars/_utils/construction/series.py @@ -36,6 +36,7 @@ Object, Struct, Time, + UInt32, Unknown, dtype_to_py_type, is_polars_dtype, @@ -57,9 +58,10 @@ from polars.dependencies import numpy as np from polars.dependencies import pandas as pd from polars.dependencies import pyarrow as pa +from polars.functions.eager import concat with contextlib.suppress(ImportError): # Module not available when building docs - from polars.polars import PySeries + from polars.polars import PySeries, get_index_type if TYPE_CHECKING: from collections.abc import Iterable, Sequence @@ -454,27 +456,48 @@ def numpy_to_pyseries( return constructor( name, values, nan_to_null if dtype in (np.float32, np.float64) else strict ) - elif sum(values.shape) == 0: - # Optimize by ingesting 1D and reshaping in Rust - original_shape = values.shape - values = values.reshape(-1) - py_s = numpy_to_pyseries( - name, - values, - strict=strict, - nan_to_null=nan_to_null, - ) - return wrap_s(py_s).reshape(original_shape)._s else: original_shape = values.shape - values = values.reshape(-1) - py_s = numpy_to_pyseries( - name, - values, - strict=strict, - nan_to_null=nan_to_null, - ) - return wrap_s(py_s).reshape(original_shape)._s + values_1d = values.reshape(-1) + + if get_index_type() == UInt32: + limit = 2**32 - 1 + else: + limit = 2**64 - 1 + + if values.size <= limit: + py_s = numpy_to_pyseries( + name, + values_1d, + strict=strict, + nan_to_null=nan_to_null, + ) + return wrap_s(py_s).reshape(original_shape)._s + else: + # Process in chunk, so we don't trigger ROWS_LIMIT + offset = 0 + chunks = [] + + # Tuples are immutable, so convert to list + original_shape_chunk = list(original_shape) + # Rows size is now changed, so infer + original_shape_chunk[0] = -1 + original_shape_chunk_t = tuple(original_shape_chunk) + while True: + chunk = values_1d[offset : offset + limit] + offset += limit + if chunk.shape[0] == 0: + break + + py_s = numpy_to_pyseries( + name, + chunk, + strict=strict, + nan_to_null=nan_to_null, + ) + chunks.append(wrap_s(py_s).reshape(original_shape_chunk_t)) + + return concat(chunks)._s def series_to_pyseries(