12
12
from pandas .api .extensions import no_default
13
13
from pandas .core .computation .expr import PARSERS , PandasExprVisitor
14
14
15
- from nested_pandas .series import packer
15
+ from nested_pandas .nestedframe . utils import extract_nest_names
16
16
from nested_pandas .series .dtype import NestedDtype
17
-
18
- from ..series .packer import pack_sorted_df_into_struct
19
- from .utils import extract_nest_names
17
+ from nested_pandas .series .packer import pack , pack_lists , pack_sorted_df_into_struct
20
18
21
19
22
20
class NestedPandasExprVisitor (PandasExprVisitor ):
@@ -219,10 +217,8 @@ def __setitem__(self, key, value):
219
217
"." in key and key .split ("." )[0 ] in self .nested_columns
220
218
):
221
219
nested , col = key .split ("." )
222
- new_flat = self [nested ].nest .to_flat ()
223
- new_flat [col ] = value
224
- packed = packer .pack (new_flat )
225
- return super ().__setitem__ (nested , packed )
220
+ new_nested_series = self [nested ].nest .with_flat_field (col , value )
221
+ return super ().__setitem__ (nested , new_nested_series )
226
222
227
223
# Adding a new nested structure from a column
228
224
# Allows statements like ndf["new_nested.t"] = ndf["nested.t"] - 5
@@ -231,8 +227,9 @@ def __setitem__(self, key, value):
231
227
if isinstance (value , pd .Series ):
232
228
value .name = col
233
229
value = value .to_frame ()
234
- packed = packer .pack (value )
235
- return super ().__setitem__ (new_nested , packed )
230
+ new_df = self .add_nested (value , name = new_nested )
231
+ self ._update_inplace (new_df )
232
+ return None
236
233
237
234
return super ().__setitem__ (key , value )
238
235
@@ -242,6 +239,7 @@ def add_nested(
242
239
name : str ,
243
240
* ,
244
241
how : str = "left" ,
242
+ on : None | str | list [str ] = None ,
245
243
dtype : NestedDtype | pd .ArrowDtype | pa .DataType | None = None ,
246
244
) -> Self : # type: ignore[name-defined] # noqa: F821
247
245
"""Packs input object to a nested column and adds it to the NestedFrame
@@ -272,6 +270,8 @@ def add_nested(
272
270
index, and sort it lexicographically.
273
271
- inner: form intersection of calling frame's index with other
274
272
frame's index, preserving the order of the calling index.
273
+ on : str or None, default: None
274
+ Name of the single column to pack `obj` by and to join on; if None, the join uses the index.
275
275
dtype : dtype or None
276
276
NestedDtype to use for the nested column; pd.ArrowDtype or
277
277
pa.DataType can also be used to specify the nested dtype. If None,
@@ -282,13 +282,16 @@ def add_nested(
282
282
NestedFrame
283
283
A new NestedFrame with the added nested column.
284
284
"""
285
+ if on is not None and not isinstance (on , str ):
286
+ raise ValueError ("Currently we only support a single column for 'on'" )
285
287
# Add sources to objects
286
- packed = packer . pack (obj , name = name , dtype = dtype )
288
+ packed = pack (obj , name = name , on = on , dtype = dtype )
287
289
new_df = self .copy ()
288
- return new_df .join (packed , how = how )
290
+ res = new_df .join (packed , how = how , on = on )
291
+ return res
289
292
290
293
@classmethod
291
- def from_flat (cls , df , base_columns , nested_columns = None , index = None , name = "nested" ):
294
+ def from_flat (cls , df , base_columns , nested_columns = None , on : str | None = None , name = "nested" ):
292
295
"""Creates a NestedFrame with base and nested columns from a flat
293
296
dataframe.
294
297
@@ -304,7 +307,7 @@ def from_flat(cls, df, base_columns, nested_columns=None, index=None, name="nest
304
307
in the list will attempt to be packed into a single nested column
305
308
with the name provided in `name`. If None, is defined as all
306
309
columns not in `base_columns`.
307
- index : str, or None
310
+ on : str or None
308
311
The name of a column to use as the new index. Typically, the index
309
312
should have a unique value per row for base columns, and should
310
313
repeat for nested columns. For example, a dataframe with two
@@ -330,11 +333,11 @@ def from_flat(cls, df, base_columns, nested_columns=None, index=None, name="nest
330
333
"""
331
334
332
335
# Resolve new index
333
- if index is not None :
336
+ if on is not None :
334
337
# if a base column is chosen remove it
335
- if index in base_columns :
336
- base_columns = [col for col in base_columns if col != index ]
337
- df = df .set_index (index )
338
+ if on in base_columns :
339
+ base_columns = [col for col in base_columns if col != on ]
340
+ df = df .set_index (on )
338
341
339
342
# drop duplicates on index
340
343
out_df = df [base_columns ][~ df .index .duplicated (keep = "first" )]
@@ -401,7 +404,7 @@ def from_lists(cls, df, base_columns=None, list_columns=None, name="nested"):
401
404
raise ValueError ("No columns were assigned as list columns." )
402
405
403
406
# Pack list columns into a nested column
404
- packed_df = packer . pack_lists (df [list_columns ])
407
+ packed_df = pack_lists (df [list_columns ])
405
408
packed_df .name = name
406
409
407
410
# join the nested column to the base_column df
@@ -519,17 +522,33 @@ def query(self, expr: str, *, inplace: bool = False, **kwargs) -> NestedFrame |
519
522
# since it operated on the base attributes.
520
523
if isinstance (result , _SeriesFromNest ):
521
524
nest_name , flat_nest = result .nest_name , result .flat_nest
522
- new_flat_nest = flat_nest .loc [result ]
523
- result = self .copy ()
524
- result [nest_name ] = pack_sorted_df_into_struct (new_flat_nest )
525
+ # Reset index to "ordinal" like [0, 0, 0, 1, 1, 2, 2, 2]
526
+ list_index = self [nest_name ].array .get_list_index ()
527
+ flat_nest = flat_nest .set_index (list_index )
528
+ query_result = result .set_axis (list_index )
529
+ # Selecting flat values matching the query result
530
+ new_flat_nest = flat_nest [query_result ]
531
+ new_df = self ._set_filtered_flat_df (nest_name , new_flat_nest )
525
532
else :
526
- result = self .loc [result ]
533
+ new_df = self .loc [result ]
527
534
528
535
if inplace :
529
- self ._update_inplace (result )
536
+ self ._update_inplace (new_df )
530
537
return None
531
538
else :
532
- return result
539
+ return new_df
540
+
541
def _set_filtered_flat_df(self, nest_name, flat_df):
    """Replace a nested column with the packed form of a filtered flat frame.

    The flat frame is expected to carry a filtered "ordinal" index, i.e.
    positional row numbers of this frame rather than its real index labels
    (presumably derived from ``self[nest_name].array.get_list_index()`` --
    confirm against callers). For example ``flat_df.index`` may be
    ``[0, 2, 2, 2]`` while ``self.index`` is arbitrary, such as
    ``["a", "b", "a"]``.

    Parameters
    ----------
    nest_name : str
        Name of the nested column to overwrite.
    flat_df : pd.DataFrame
        Filtered flat representation of the nested column, indexed by
        ordinal row position.

    Returns
    -------
    NestedFrame
        A new frame with the same index as ``self`` in which ``nest_name``
        holds the re-packed ``flat_df``.
    """
    # Pack first; this does not depend on the frame copy made below.
    repacked = pack_sorted_df_into_struct(flat_df, name=nest_name)

    # Work on a positionally indexed copy so the assignment lines up with
    # the ordinal index of the packed values, not the original labels.
    ordinal_df = self.reset_index(drop=True)
    ordinal_df[nest_name] = repacked

    # Put the caller's original index back on the result.
    return ordinal_df.set_index(self.index)
533
552
534
553
def _resolve_dropna_target (self , on_nested , subset ):
535
554
"""resolves the target layer for a given set of dropna kwargs"""
@@ -654,34 +673,32 @@ def dropna(
654
673
return super ().dropna (
655
674
axis = axis , how = how , thresh = thresh , subset = subset , inplace = inplace , ignore_index = ignore_index
656
675
)
676
+ if ignore_index :
677
+ raise ValueError ("ignore_index is not supported for nested columns" )
657
678
if subset is not None :
658
679
subset = [col .split ("." )[- 1 ] for col in subset ]
680
+ target_flat = self [target ].nest .to_flat ()
681
+ target_flat = target_flat .set_index (self [target ].array .get_list_index ())
659
682
if inplace :
660
- target_flat = self [target ].nest .to_flat ()
661
683
target_flat .dropna (
662
684
axis = axis ,
663
685
how = how ,
664
686
thresh = thresh ,
665
687
subset = subset ,
666
- inplace = inplace ,
667
- ignore_index = ignore_index ,
688
+ inplace = True ,
668
689
)
669
- self [target ] = packer .pack_flat (target_flat )
670
- return self
671
- # Or if not inplace
672
- new_df = self .copy ()
673
- new_df [target ] = packer .pack_flat (
674
- new_df [target ]
675
- .nest .to_flat ()
676
- .dropna (
690
+ else :
691
+ target_flat = target_flat .dropna (
677
692
axis = axis ,
678
693
how = how ,
679
694
thresh = thresh ,
680
695
subset = subset ,
681
- inplace = inplace ,
682
- ignore_index = ignore_index ,
696
+ inplace = False ,
683
697
)
684
- )
698
+ new_df = self ._set_filtered_flat_df (nest_name = target , flat_df = target_flat )
699
+ if inplace :
700
+ self ._update_inplace (new_df )
701
+ return None
685
702
return new_df
686
703
687
704
def reduce (self , func , * args , ** kwargs ) -> NestedFrame : # type: ignore[override]
0 commit comments