Skip to content

Commit cae486d

Browse files
authored
Merge pull request #403 from lincc-frameworks/list_struct_partial_loads
Raise a helpful error for failed partial loading of list-struct columns
2 parents a7d8124 + 7e6f939 commit cae486d

File tree

3 files changed

+70
-4
lines changed

3 files changed

+70
-4
lines changed

src/nested_pandas/nestedframe/io.py

Lines changed: 56 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
import pyarrow as pa
99
import pyarrow.fs
1010
import pyarrow.parquet as pq
11+
from pyarrow.lib import ArrowInvalid
1112
from upath import UPath
1213

1314
from ..series.dtype import NestedDtype
@@ -95,6 +96,12 @@ def read_parquet(
9596
like ```pd.read_parquet("data.parquet", columns=["nested.a", "nested"])```
9697
from working, as this implies both full and partial load of "nested".
9798
99+
Additionally with partial loading, be aware that nested-pandas (and pyarrow)
100+
only supports partial loading of struct of list columns. Your data may be
101+
stored as a list of structs, which can be read by nested-pandas, but without
102+
support for partial loading. We try to throw a helpful error message in these
103+
cases.
104+
98105
Furthermore, there are some cases where subcolumns will have the same name
99106
as a top-level column. For example, if you have a column "nested" with
100107
subcolumns "nested.a" and "nested.b", and also a top-level column "a". In
@@ -214,25 +221,70 @@ def _read_parquet_into_table(
214221
return _read_remote_parquet_directory(
215222
path_to_data, filesystem, storage_options, columns, **kwargs
216223
)
224+
217225
with fsspec.parquet.open_parquet_file(
218226
path_to_data.path,
219227
columns=columns,
220228
storage_options=storage_options,
221229
fs=filesystem,
222230
engine="pyarrow",
223231
) as parquet_file:
224-
return pq.read_table(parquet_file, columns=columns, **kwargs)
232+
return _read_table_with_partial_load_check(parquet_file, columns=columns, **kwargs)
225233

226234
# All other cases, including file-like objects, directories, and
227235
# even lists of the foregoing.
228236

229237
# If `filesystem` is specified - use it, passing it as part of **kwargs
230238
if kwargs.get("filesystem") is not None:
231-
return pq.read_table(data, columns=columns, **kwargs)
239+
return _read_table_with_partial_load_check(data, columns=columns, **kwargs)
232240

233241
# Otherwise convert with a special function
234242
data, filesystem = _transform_read_parquet_data_arg(data)
235-
return pq.read_table(data, filesystem=filesystem, columns=columns, **kwargs)
243+
return _read_table_with_partial_load_check(data, columns=columns, filesystem=filesystem, **kwargs)
244+
245+
246+
def _read_table_with_partial_load_check(data, columns=None, filesystem=None, **kwargs):
    """Read a pyarrow table, explaining failed partial loads of nested columns.

    Parameters
    ----------
    data
        Parquet source handed straight to ``pq.read_table``.
    columns : list of str, optional
        Columns to load. A name containing ``"."`` is treated as a request to
        partially load a nested column.
    filesystem : optional
        Filesystem forwarded to both ``pq.read_table`` and the schema check.
    **kwargs
        Additional keyword arguments forwarded to ``pq.read_table``.

    Returns
    -------
    pyarrow.Table
        The table returned by ``pq.read_table``.

    Raises
    ------
    ValueError
        If the read fails with a "No match for" error and the schema check
        finds that a dotted column refers to a non-struct base column.
    ArrowInvalid
        The original pyarrow error in every other failure case.
    """
    try:
        # `filesystem` is a named parameter here, so it is no longer part of
        # **kwargs and has to be forwarded explicitly; otherwise the read
        # silently ignores the filesystem the caller asked for.
        return pq.read_table(data, columns=columns, filesystem=filesystem, **kwargs)
    except ArrowInvalid as e:
        # Only "No match for" errors can stem from a failed partial load, and
        # only when at least one requested column uses the dotted syntax.
        unmatched_column = "No match for" in str(e)
        requests_subcolumn = columns is not None and any("." in col for col in columns)
        if unmatched_column and requests_subcolumn:
            try:
                _validate_structs_from_schema(data, columns=columns, filesystem=filesystem)
            except ValueError as validation_error:
                raise validation_error from e  # Chain the exceptions for better context
        # Not a partial-load problem (or validation found nothing): re-raise as is.
        raise
262+
263+
264+
def _validate_structs_from_schema(data, columns=None, filesystem=None):
265+
"""Validate that nested columns are structs"""
266+
if columns is not None:
267+
schema = pq.read_schema(data, filesystem=filesystem)
268+
for col in columns:
269+
# check if column is a partial load of a nested structure
270+
if "." in col:
271+
# first check if column exists as a top-level column
272+
if col in schema.names:
273+
continue
274+
# if not, inspect the base column name type
275+
else:
276+
if col.split(".")[0] in schema.names:
277+
# check if the column is a list-struct
278+
col_type = schema.field(col.split(".")[0]).type
279+
if not pa.types.is_struct(col_type):
280+
base_col = col.split(".")[0]
281+
raise ValueError(
282+
f"The provided column '{col}' signals to partially load a nested structure, "
283+
f"but the nested structure '{base_col}' is not a struct. "
284+
"Partial loading of nested structures is only supported for struct of list "
285+
f"columns. To resolve this, fully load the column '{base_col}' "
286+
f"instead of partially loading it and perform column selection afterwards."
287+
)
236288

237289

238290
def _is_local_dir(upath: UPath) -> bool:
@@ -273,7 +325,7 @@ def _read_remote_parquet_directory(
273325
fs=filesystem,
274326
engine="pyarrow",
275327
) as parquet_file:
276-
table = pq.read_table(parquet_file, columns=columns, **kwargs)
328+
table = _read_table_with_partial_load_check(parquet_file, columns=columns, **kwargs)
277329
tables.append(table)
278330
return pa.concat_tables(tables)
279331

7.51 KB
Binary file not shown.

tests/nested_pandas/nestedframe/test_io.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -459,3 +459,17 @@ def test__get_storage_options():
459459
assert storage_opts is not None
460460
# S3 should NOT have the block_size override (only HTTP/HTTPS)
461461
assert storage_opts.get("block_size") != FSSPEC_BLOCK_SIZE
462+
463+
464+
def test_list_struct_partial_loading_error():
    """Test that attempting to partially load a list-struct raises the helpful error."""
    # pyarrow's ArrowInvalid is itself a ValueError, so the message is matched
    # to make sure the nested-pandas explanation is raised, not the raw
    # pyarrow "No match for" error.
    with pytest.raises(ValueError, match="is not a struct"):
        read_parquet("tests/list_struct_data/list_struct.parquet", columns=["lightcurve.hmjd"])
469+
470+
471+
def test_normal_loading_error():
    """Test that making a normal naming mistake raises the normal pyarrow error."""
    # `match` is a regular expression applied with re.search, so the literal
    # prefix is enough; a trailing "*" would only quantify the final "r".
    with pytest.raises(ValueError, match="No match for"):
        read_parquet("tests/test_data/nested.parquet", columns=["not_a_column"])

0 commit comments

Comments
 (0)