Switch from_flat 'index' keyword to 'on' (#69)

* Switch from_flat 'index' keyword to 'on'
* Update nested-pandas version requirements
* Change test_dataset band dtype
* Require nested-pandas>=0.3.1
* Formatting

---------

Co-authored-by: Konstantin Malanchev <[email protected]>
lincc-frameworks · Nov 6, 2024 · f5019f7 · f5019f7
1 parent 973feca
commit f5019f7
Show file tree

Hide file tree

Showing 4 changed files with 13 additions and 7 deletions.
diff --git a/pyproject.toml b/pyproject.toml
@@ -16,7 +16,7 @@ classifiers = [
 dynamic = ["version"]
 requires-python = ">=3.9"
 dependencies = [
-    'nested-pandas>=0.2.1,<0.3',
+    'nested-pandas>=0.3.1,<0.4.0',
     'numpy',
     'dask>=2024.3.0',
     'dask[distributed]>=2024.3.0',

diff --git a/src/nested_dask/core.py b/src/nested_dask/core.py
@@ -287,7 +287,7 @@ def from_map(
         return NestedFrame.from_dask_dataframe(nf)
 
     @classmethod
-    def from_flat(cls, df, base_columns, nested_columns=None, index=None, name="nested"):
+    def from_flat(cls, df, base_columns, nested_columns=None, on=None, name="nested"):
         """Creates a NestedFrame with base and nested columns from a flat
         dataframe.
 
@@ -303,7 +303,7 @@ def from_flat(cls, df, base_columns, nested_columns=None, index=None, name="nest
             in the list will attempt to be packed into a single nested column
             with the name provided in `nested_name`. If None, is defined as all
             columns not in `base_columns`.
-        index: str, or None
+        on: str or None
             The name of a column to use as the new index. Typically, the index
             should have a unique value per row for base columns, and should
             repeat for nested columns. For example, a dataframe with two
@@ -323,15 +323,15 @@ def from_flat(cls, df, base_columns, nested_columns=None, index=None, name="nest
         meta = npd.NestedFrame(df[base_columns]._meta)
 
         if nested_columns is None:
-            nested_columns = [col for col in df.columns if (col not in base_columns) and col != index]
+            nested_columns = [col for col in df.columns if (col not in base_columns) and col != on]
 
         if len(nested_columns) > 0:
             nested_meta = pack(df[nested_columns]._meta, name)
             meta = meta.join(nested_meta)
 
         return df.map_partitions(
             lambda x: npd.NestedFrame.from_flat(
-                df=x, base_columns=base_columns, nested_columns=nested_columns, index=index, name=name
+                df=x, base_columns=base_columns, nested_columns=nested_columns, on=on, name=name
             ),
             meta=meta,
         )

diff --git a/tests/nested_dask/conftest.py b/tests/nested_dask/conftest.py
@@ -1,6 +1,8 @@
 import nested_dask as nd
 import nested_pandas as npd
 import numpy as np
+import pandas as pd
+import pyarrow as pa
 import pytest
 
 
@@ -18,7 +20,11 @@ def test_dataset():
     layer_data = {
         "t": randomstate.random(layer_size * n_base) * 20,
         "flux": randomstate.random(layer_size * n_base) * 100,
-        "band": randomstate.choice(["r", "g"], size=layer_size * n_base),
+        # Ensure pyarrow[string] dtype, not large_string
+        # https://github.com/lincc-frameworks/nested-dask/issues/71
+        "band": pd.Series(
+            randomstate.choice(["r", "g"], size=layer_size * n_base), dtype=pd.ArrowDtype(pa.string())
+        ),
         "index": np.arange(layer_size * n_base) % n_base,
     }
     layer_nf = npd.NestedFrame(data=layer_data).set_index("index").sort_index()

diff --git a/tests/nested_dask/test_nestedframe.py b/tests/nested_dask/test_nestedframe.py
@@ -173,7 +173,7 @@ def test_from_flat():
     assert len(ndf_comp) == 2
 
     # Check using an index
-    ndf = nd.NestedFrame.from_flat(nf, base_columns=["b"], index="a")
+    ndf = nd.NestedFrame.from_flat(nf, base_columns=["b"], on="a")
     assert list(ndf.columns) == ["b", "nested"]
     assert list(ndf["nested"].nest.fields) == ["c", "d"]
     ndf_comp = ndf.compute()