|
8 | 8 | import pyarrow as pa |
9 | 9 | import pyarrow.fs |
10 | 10 | import pyarrow.parquet as pq |
| 11 | +from pyarrow.lib import ArrowInvalid |
11 | 12 | from upath import UPath |
12 | 13 |
|
13 | 14 | from ..series.dtype import NestedDtype |
@@ -95,6 +96,12 @@ def read_parquet( |
95 | 96 | like ```pd.read_parquet("data.parquet", columns=["nested.a", "nested"])``` |
96 | 97 | from working, as this implies both full and partial load of "nested". |
97 | 98 |
|
| 99 | + Additionally, with partial loading, be aware that nested-pandas (and pyarrow)
| 100 | + only supports partial loading of struct-of-list columns. Your data may be
| 101 | + stored as a list of structs, which nested-pandas can still read, but without
| 102 | + support for partial loading. We raise a helpful error message in these
| 103 | + cases.
| 104 | +
|
98 | 105 | Furthermore, there are some cases where subcolumns will have the same name |
99 | 106 | as a top-level column. For example, if you have a column "nested" with |
100 | 107 | subcolumns "nested.a" and "nested.b", and also a top-level column "a". In |
@@ -214,25 +221,70 @@ def _read_parquet_into_table( |
214 | 221 | return _read_remote_parquet_directory( |
215 | 222 | path_to_data, filesystem, storage_options, columns, **kwargs |
216 | 223 | ) |
| 224 | + |
217 | 225 | with fsspec.parquet.open_parquet_file( |
218 | 226 | path_to_data.path, |
219 | 227 | columns=columns, |
220 | 228 | storage_options=storage_options, |
221 | 229 | fs=filesystem, |
222 | 230 | engine="pyarrow", |
223 | 231 | ) as parquet_file: |
224 | | - return pq.read_table(parquet_file, columns=columns, **kwargs) |
| 232 | + return _read_table_with_partial_load_check(parquet_file, columns=columns, **kwargs) |
225 | 233 |
|
226 | 234 | # All other cases, including file-like objects, directories, and |
227 | 235 | # even lists of the foregoing. |
228 | 236 |
|
229 | 237 | # If `filesystem` is specified - use it, passing it as part of **kwargs |
230 | 238 | if kwargs.get("filesystem") is not None: |
231 | | - return pq.read_table(data, columns=columns, **kwargs) |
| 239 | + return _read_table_with_partial_load_check(data, columns=columns, **kwargs) |
232 | 240 |
|
233 | 241 | # Otherwise convert with a special function |
234 | 242 | data, filesystem = _transform_read_parquet_data_arg(data) |
235 | | - return pq.read_table(data, filesystem=filesystem, columns=columns, **kwargs) |
| 243 | + return _read_table_with_partial_load_check(data, columns=columns, filesystem=filesystem, **kwargs) |
| 244 | + |
| 245 | + |
def _read_table_with_partial_load_check(data, columns=None, filesystem=None, **kwargs):
    """Read a parquet table, diagnosing unsupported partial loads of nested columns.

    Thin wrapper around ``pq.read_table``. When the read fails with an
    ``ArrowInvalid`` "No match for" error and dotted column names were
    requested (e.g. ``"nested.a"``), the file schema is inspected so a
    clearer ``ValueError`` can be raised for columns whose parent is not a
    struct (partial loading is only supported for struct-of-list columns).

    Parameters
    ----------
    data : path, file-like object, or list thereof
        Forwarded to ``pq.read_table``.
    columns : list of str or None
        Columns to load; dotted names request partial loads of nested columns.
    filesystem : filesystem object or None
        Filesystem to read with. NOTE: some callers pass this explicitly while
        others supply it inside ``**kwargs``; both are honored.
    **kwargs
        Additional arguments forwarded to ``pq.read_table``.

    Returns
    -------
    pyarrow.Table

    Raises
    ------
    ValueError
        If a dotted column targets a non-struct parent column.
    pyarrow.lib.ArrowInvalid
        Re-raised when the failure is unrelated to partial loading or no
        better diagnosis is found.
    """
    # Bug fix: `filesystem` must actually reach pq.read_table. Merge it into
    # kwargs with setdefault so the call site that already passes filesystem
    # inside **kwargs does not hit a duplicate-keyword TypeError.
    if filesystem is not None:
        kwargs.setdefault("filesystem", filesystem)
    try:
        return pq.read_table(data, columns=columns, **kwargs)
    except ArrowInvalid as e:
        # Errors unrelated to missing (sub)columns are re-raised untouched.
        if "No match for" not in str(e):
            raise
        # Dotted names signal an attempted partial load of a nested column;
        # only then is the extra schema inspection worth its cost.
        if columns is not None and any("." in col for col in columns):
            try:
                _validate_structs_from_schema(
                    data, columns=columns, filesystem=kwargs.get("filesystem")
                )
            except ValueError as validation_error:
                # Chain so the original Arrow error remains visible in the traceback.
                raise validation_error from e
        raise
| 262 | + |
| 263 | + |
def _validate_structs_from_schema(data, columns=None, filesystem=None):
    """Verify that every dotted column name targets a struct-typed parent.

    Reads only the parquet schema (not the data) and raises a ``ValueError``
    with guidance when a requested subcolumn such as ``"nested.a"`` refers to
    a top-level column that is not a struct, since partial loading is only
    supported for struct-of-list columns. Columns without a dot, dotted names
    that exist verbatim as top-level columns, and dotted names whose base is
    absent from the schema are all left alone.
    """
    if columns is None:
        return
    schema = pq.read_schema(data, filesystem=filesystem)
    top_level = schema.names
    for col in columns:
        # Only dotted names can describe a partial load.
        if "." not in col:
            continue
        # A top-level column whose literal name contains a dot is valid as-is.
        if col in top_level:
            continue
        base_col = col.split(".")[0]
        if base_col not in top_level:
            continue
        # The parent must be struct-typed for subcolumn selection to work.
        if not pa.types.is_struct(schema.field(base_col).type):
            raise ValueError(
                f"The provided column '{col}' signals to partially load a nested structure, "
                f"but the nested structure '{base_col}' is not a struct. "
                "Partial loading of nested structures is only supported for struct of list "
                f"columns. To resolve this, fully load the column '{base_col}' "
                f"instead of partially loading it and perform column selection afterwards."
            )
236 | 288 |
|
237 | 289 |
|
238 | 290 | def _is_local_dir(upath: UPath) -> bool: |
@@ -273,7 +325,7 @@ def _read_remote_parquet_directory( |
273 | 325 | fs=filesystem, |
274 | 326 | engine="pyarrow", |
275 | 327 | ) as parquet_file: |
276 | | - table = pq.read_table(parquet_file, columns=columns, **kwargs) |
| 328 | + table = _read_table_with_partial_load_check(parquet_file, columns=columns, **kwargs) |
277 | 329 | tables.append(table) |
278 | 330 | return pa.concat_tables(tables) |
279 | 331 |
|
|
0 commit comments