10
10
from pandas ._libs import lib
11
11
from pandas ._typing import Any , AnyAll , Axis , IndexLabel
12
12
from pandas .api .extensions import no_default
13
+ from pandas .core .computation import ops
14
+ from pandas .core .computation .eval import Expr , ensure_scope
13
15
from pandas .core .computation .expr import PARSERS , PandasExprVisitor
16
+ from pandas .core .computation .parsing import clean_column_name
14
17
15
- from nested_pandas .nestedframe .utils import extract_nest_names
16
18
from nested_pandas .series .dtype import NestedDtype
17
19
from nested_pandas .series .packer import pack , pack_lists , pack_sorted_df_into_struct
18
20
@@ -79,6 +81,22 @@ class _NestResolver(dict):
79
81
def __init__(self, outer: NestedFrame):
    """
    Bind this resolver to ``outer`` and eagerly register a field resolver
    for every nested column that ``outer`` exposes right now.
    """
    super().__init__()
    self._outer = outer
    # Only the nested columns known at construction time are pre-loaded here.
    for nest_name in outer.nested_columns:
        self._initialize_field_resolver(nest_name, outer)
87
+
88
def _initialize_field_resolver(self, column: str, outer: NestedFrame):
    """
    Register a field resolver for the nested column ``column``.

    The resolver is stored under the column's own name. When the cleaned
    form of the name (as produced by ``clean_column_name``) differs from
    the original -- e.g. names with spaces or that are otherwise not
    identifier-like -- a second resolver for the same column is stored
    under the cleaned name, so references coming from the Pandas evaluator
    are captured too.
    """
    # Go through the parent class: item assignment is overridden on this class.
    store = super().__setitem__
    store(column, _NestedFieldResolver(column, outer))

    alias = clean_column_name(column)
    if alias == column:
        # Already identifier-like; no alias entry is needed.
        return
    store(alias, _NestedFieldResolver(column, outer))
82
100
83
101
def __contains__ (self , item ):
84
102
top_nest = item if "." not in item else item .split ("." )[0 ].strip ()
@@ -89,7 +107,7 @@ def __getitem__(self, item):
89
107
if not super ().__contains__ (top_nest ):
90
108
if top_nest not in self ._outer .nested_columns :
91
109
raise KeyError (f"Unknown nest { top_nest } " )
92
- super (). __setitem__ (top_nest , _NestedFieldResolver ( top_nest , self ._outer ) )
110
+ self . _initialize_field_resolver (top_nest , self ._outer )
93
111
return super ().__getitem__ (top_nest )
94
112
95
113
def __setitem__ (self , item , _ ):
@@ -133,6 +151,48 @@ def __getattr__(self, item_name: str):
133
151
raise AttributeError (f"No attribute { item_name } " )
134
152
135
153
154
+ def _subexprs_by_nest (parents : list , node ) -> dict [str , list ]:
155
+ """
156
+ Given an expression which contains references to both base and nested
157
+ columns, return a dictionary of the sub-expressions that should be
158
+ evaluated independently, keyed by nesting context.
159
+
160
+ The key of the dictionary is the name of the nested column, and will
161
+ be a blank string in the case of base columns. The value is a list
162
+ of the parent nodes that lead to sub-expressions that can be evaluated
163
+ successfully.
164
+
165
+ While this is not in use today for automatically splitting expressions,
166
+ it can be used to detect whether an expression is suitably structured
167
+ for evaluation: the returned dictionary should have a single key.
168
+ """
169
+ if isinstance (node , ops .Term ) and not isinstance (node , ops .Constant ):
170
+ if isinstance (node .value , _SeriesFromNest ):
171
+ return {node .value .nest_name : parents }
172
+ return {getattr (node , "upper_name" , "" ): parents }
173
+ if not isinstance (node , ops .Op ):
174
+ return {}
175
+ sources = [getattr (node , "lhs" , None ), getattr (node , "rhs" , None )]
176
+ result : dict [str , list ] = {}
177
+ for source in sources :
178
+ child = _subexprs_by_nest (parents , source )
179
+ for k , v in child .items ():
180
+ result .setdefault (k , []).append (v )
181
+ # After a complete traversal across sources, check for any necessary splits.
182
+ # If it's homogenous, move the split-node up the tree.
183
+ if len (result ) == 1 :
184
+ # Let the record of each parent node drift up the tree,
185
+ # and merge the subtrees into a single node, since by definition,
186
+ # this node is homogeneous over all of its children, and can
187
+ # be evaluated in a single step.
188
+ result = {k : [node ] for k in result }
189
+ # If the result is either empty or has more than one key, leave the result
190
+ # alone. Each key represents a different nest (with a blank string for the base),
191
+ # and the value is the highest point in the expression tree where the expression
192
+ # was still within a single nest.
193
+ return result
194
+
195
+
136
196
class NestedFrame (pd .DataFrame ):
137
197
"""A Pandas Dataframe extension with support for nested structure.
138
198
@@ -457,6 +517,39 @@ def eval(self, expr: str, *, inplace: bool = False, **kwargs) -> Any | None:
457
517
kwargs ["parser" ] = "nested-pandas"
458
518
return super ().eval (expr , ** kwargs )
459
519
520
def extract_nest_names(
    self,
    expr: str,
    local_dict=None,
    global_dict=None,
    resolvers=(),
    level: int = 0,
    target=None,
    **kwargs,
) -> set[str]:
    """
    Given a string expression, parse it and visit the resulting expression tree,
    surfacing the nesting types. The purpose is to identify expressions that attempt
    to mix base and nested columns, or columns from two different nests.

    Parameters
    ----------
    expr : str
        The expression to parse with the "nested-pandas" parser.
    local_dict, global_dict, target
        Forwarded unchanged to ``ensure_scope``.
    resolvers : iterable of dict-like or None
        Extra resolvers supplied by the caller. Any iterable (tuple or list)
        or ``None`` is accepted; they are placed ahead of this frame's own
        nest, column and index resolvers.
    level : int
        Scope level supplied by the caller; ``level + 1`` is passed on to
        ``ensure_scope``.
    **kwargs
        Accepted and ignored, so that callers can forward the same keyword
        arguments they pass to ``eval``.

    Returns
    -------
    set[str]
        The keys produced by ``_subexprs_by_nest`` for the parsed expression:
        one entry per nesting context referenced by ``expr``.
    """
    index_resolvers = self._get_index_resolvers()
    column_resolvers = self._get_cleaned_column_resolvers()
    # Normalise to a tuple first: concatenating a list (or None) with a tuple
    # would raise TypeError, and callers may hand resolvers over as a list.
    resolvers = tuple(resolvers or ()) + (_NestResolver(self), column_resolvers, index_resolvers)
    # Parser needs to be the "nested-pandas" parser.
    # We also need the same variable context that eval() will have, so that
    # backtick-quoted names are substituted as expected.
    env = ensure_scope(
        level + 1,
        global_dict=global_dict,
        local_dict=local_dict,
        resolvers=resolvers,
        target=target,
    )
    parsed_expr = Expr(expr, parser="nested-pandas", env=env)
    expr_tree = parsed_expr.terms
    separable = _subexprs_by_nest([], expr_tree)
    return set(separable.keys())
552
+
460
553
def query (self , expr : str , * , inplace : bool = False , ** kwargs ) -> NestedFrame | None :
461
554
"""
462
555
Query the columns of a NestedFrame with a boolean expression. Specified
@@ -514,7 +607,7 @@ def query(self, expr: str, *, inplace: bool = False, **kwargs) -> NestedFrame |
514
607
# At present, the query expression must be either entirely within a
515
608
# single nest, or have nothing but base columns. Mixed structures are not
516
609
# supported, so preflight the expression.
517
- nest_names = extract_nest_names (expr )
610
+ nest_names = self . extract_nest_names (expr , ** kwargs )
518
611
if len (nest_names ) > 1 :
519
612
raise ValueError ("Queries cannot target multiple structs/layers, write a separate query for each" )
520
613
result = self .eval (expr , ** kwargs )
0 commit comments