Skip to content

Commit

Permalink
Switch from_flat 'index' keyword to 'on' (#69)
Browse files Browse the repository at this point in the history
* Switch from_flat 'index' keyword to 'on'

* Update nested-pandas version requirements

* Change test_dataset band dtype

* Require nested-pandas>=0.3.1

* Formatting

---------

Co-authored-by: Konstantin Malanchev <[email protected]>
  • Loading branch information
wilsonbb and hombit authored Nov 6, 2024
1 parent 973feca commit f5019f7
Show file tree
Hide file tree
Showing 4 changed files with 13 additions and 7 deletions.
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ classifiers = [
dynamic = ["version"]
requires-python = ">=3.9"
dependencies = [
'nested-pandas>=0.2.1,<0.3',
'nested-pandas>=0.3.1,<0.4.0',
'numpy',
'dask>=2024.3.0',
'dask[distributed]>=2024.3.0',
Expand Down
8 changes: 4 additions & 4 deletions src/nested_dask/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -287,7 +287,7 @@ def from_map(
return NestedFrame.from_dask_dataframe(nf)

@classmethod
def from_flat(cls, df, base_columns, nested_columns=None, index=None, name="nested"):
def from_flat(cls, df, base_columns, nested_columns=None, on=None, name="nested"):
"""Creates a NestedFrame with base and nested columns from a flat
dataframe.
Expand All @@ -303,7 +303,7 @@ def from_flat(cls, df, base_columns, nested_columns=None, index=None, name="nest
in the list will attempt to be packed into a single nested column
with the name provided in `name`. If None, is defined as all
columns not in `base_columns`.
index: str, or None
on: str or None
The name of a column to use as the new index. Typically, the index
should have a unique value per row for base columns, and should
repeat for nested columns. For example, a dataframe with two
Expand All @@ -323,15 +323,15 @@ def from_flat(cls, df, base_columns, nested_columns=None, index=None, name="nest
meta = npd.NestedFrame(df[base_columns]._meta)

if nested_columns is None:
nested_columns = [col for col in df.columns if (col not in base_columns) and col != index]
nested_columns = [col for col in df.columns if (col not in base_columns) and col != on]

if len(nested_columns) > 0:
nested_meta = pack(df[nested_columns]._meta, name)
meta = meta.join(nested_meta)

return df.map_partitions(
lambda x: npd.NestedFrame.from_flat(
df=x, base_columns=base_columns, nested_columns=nested_columns, index=index, name=name
df=x, base_columns=base_columns, nested_columns=nested_columns, on=on, name=name
),
meta=meta,
)
Expand Down
8 changes: 7 additions & 1 deletion tests/nested_dask/conftest.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
import nested_dask as nd
import nested_pandas as npd
import numpy as np
import pandas as pd
import pyarrow as pa
import pytest


Expand All @@ -18,7 +20,11 @@ def test_dataset():
layer_data = {
"t": randomstate.random(layer_size * n_base) * 20,
"flux": randomstate.random(layer_size * n_base) * 100,
"band": randomstate.choice(["r", "g"], size=layer_size * n_base),
# Ensure pyarrow[string] dtype, not large_string
# https://github.com/lincc-frameworks/nested-dask/issues/71
"band": pd.Series(
randomstate.choice(["r", "g"], size=layer_size * n_base), dtype=pd.ArrowDtype(pa.string())
),
"index": np.arange(layer_size * n_base) % n_base,
}
layer_nf = npd.NestedFrame(data=layer_data).set_index("index").sort_index()
Expand Down
2 changes: 1 addition & 1 deletion tests/nested_dask/test_nestedframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -173,7 +173,7 @@ def test_from_flat():
assert len(ndf_comp) == 2

# Check using an index
ndf = nd.NestedFrame.from_flat(nf, base_columns=["b"], index="a")
ndf = nd.NestedFrame.from_flat(nf, base_columns=["b"], on="a")
assert list(ndf.columns) == ["b", "nested"]
assert list(ndf["nested"].nest.fields) == ["c", "d"]
ndf_comp = ndf.compute()
Expand Down

0 comments on commit f5019f7

Please sign in to comment.