[describe-] collect aggr funcs that operate on list of values in Dict (…

…#2009) * [describe-] collect aggr funcs that operate on list of values in OrderedDict aggregator() converts a func, which operates on a list, into a _func, which operates on a srccol and a list of rows. The original functions were then added into Globals, but this caused problems for a function like `sum()` which appears naturally in Python code. I created an OrderedDict, and named it aggregators_vals as a place to store them. Other possible options: * We could include the optional funcvals along with func(srccol) for Aggregator. Describe Sheet could then grab the funcvals if it exists. * Describe Sheet could pass the srccol and list of rows, instead of vals. This is not ideal because it means for each aggregator, we call getValues once-more. This would cause a performance degradation. * Add them to vd.aggregators, possibly with the suffix "_vals", and create an Aggregator out of them as well. Have Describe Sheet pull aggrname_vals. * Similarly use the vd.aggregator_vals, but have a less terrible name. Optional: Do we want to do the work of porting quantiles and percentiles to be useable by Describe Sheet? Currently, an aggregator that does not go through aggregator() is not useable by Describe Sheet. * private functions should start with _
saulpw · Sep 9, 2023 · d7204d8 · d7204d8
1 parent dd5abba
commit d7204d8
Show file tree

Hide file tree

Showing 4 changed files with 56 additions and 12 deletions.
diff --git a/tests/golden/sum-freq-table.tsv b/tests/golden/sum-freq-table.tsv
@@ -0,0 +1,38 @@
+Units	count	Units_sum
+2	1	2
+3	1	3
+4	1	4
+5	1	5
+7	2	14
+11	1	11
+14	1	14
+15	1	15
+16	1	16
+27	1	27
+28	2	56
+29	1	29
+32	1	32
+35	1	35
+36	1	36
+42	1	42
+46	1	46
+50	2	100
+53	1	53
+55	1	55
+56	1	56
+57	1	57
+60	2	120
+62	1	62
+64	1	64
+66	1	66
+67	1	67
+74	1	74
+75	1	75
+76	1	76
+80	1	80
+81	1	81
+87	1	87
+90	2	180
+94	1	94
+95	1	95
+96	2	192
diff --git a/tests/sum-freq-table.vd b/tests/sum-freq-table.vd
@@ -0,0 +1,6 @@
+sheet	col	row	longname	input	keystrokes	comment
+			open-file	sample_data/sample.tsv	o	
+sample	Units		type-int		#	set type of current column to int
+sample	Units		aggregate-col	sum	+	Add aggregator to current column
+sample	Units		freq-col		Shift+F	open Frequency Table grouped on current column, with aggregations of other columns
+sample_Units_freq	Units		sort-asc		[	sort ascending by current column; replace any existing sort criteria
diff --git a/visidata/aggregators.py b/visidata/aggregators.py
@@ -56,10 +56,11 @@ def aggregators_set(col, aggs):
 
 
 class Aggregator:
-    def __init__(self, name, type, func, helpstr='foo'):
+    def __init__(self, name, type, funcRows, funcValues=None, helpstr='foo'):
         'Define aggregator `name` that calls func(col, rows)'
         self.type = type
-        self.func = func
+        self.func = funcRows  # funcRows(col, rows)
+        self.funcValues = funcValues  # funcValues(values, *args)
         self.helpstr = helpstr
         self.name = name
 
@@ -69,19 +70,18 @@ def __call__(self, *args, **kwargs):
 _defaggr = Aggregator
 
 @VisiData.api
-def aggregator(vd, name, func, helpstr='', *args, type=None):
-    'Define simple aggregator *name* that calls ``func(values, *args)`` to aggregate *values*.  Use *type* to force the default type of the aggregated column.'
-    def _func(col, rows):  # wrap builtins so they can have a .type
+def aggregator(vd, name, funcValues, helpstr='', *args, type=None):
+    'Define simple aggregator *name* that calls ``funcValues(values, *args)`` to aggregate *values*.  Use *type* to force the default type of the aggregated column.'
+    def _funcRows(col, rows):  # wrap builtins so they can have a .type
         vals = list(col.getValues(rows))
         try:
-            return func(vals, *args)
+            return funcValues(vals, *args)
         except Exception as e:
             if len(vals) == 0:
                 return None
             return e
 
-    vd.aggregators[name] = _defaggr(name, type, _func, helpstr)
-    vd.addGlobals({name: func})
+    vd.aggregators[name] = _defaggr(name, type, _funcRows, funcValues=funcValues, helpstr=helpstr)  # accepts a srccol + list of rows
 
 ## specific aggregator implementations
 
@@ -117,7 +117,7 @@ def _percentile(N, percent, key=lambda x:x):
 
 @functools.lru_cache(100)
 def percentile(pct, helpstr=''):
-    return _defaggr('p%s'%pct, None, lambda col,rows,pct=pct: _percentile(sorted(col.getValues(rows)), pct/100), helpstr)
+    return _defaggr('p%s'%pct, None, lambda col,rows,pct=pct: _percentile(sorted(col.getValues(rows)), pct/100), helpstr=helpstr)
 
 def quantiles(q, helpstr):
     return [percentile(round(100*i/q), helpstr) for i in range(1, q)]
@@ -145,7 +145,7 @@ def quantiles(q, helpstr):
     vd.aggregators[f'p{pct}'] = percentile(pct, f'{pct}th percentile')
 
 # returns keys of the row with the max value
-vd.aggregators['keymax'] = _defaggr('keymax', anytype, lambda col, rows: col.sheet.rowkey(max(col.getValueRows(rows))[1]), 'key of the maximum value')
+vd.aggregators['keymax'] = _defaggr('keymax', anytype, lambda col, rows: col.sheet.rowkey(max(col.getValueRows(rows))[1]), helpstr='key of the maximum value')
 
 
 ColumnsSheet.columns += [

diff --git a/visidata/features/describe.py b/visidata/features/describe.py
@@ -99,8 +99,8 @@ def reloadColumn(self, srccol):
                 for func in [min, max, sum, median]:  # use type
                     d[func.__name__] = self.calcStatistic(d, func, vals)
                 for aggrname in vd.options.describe_aggrs.split():
-                    func = vd.getGlobals()[aggrname]
-                    d[func.__name__] = self.calcStatistic(d, func, vals)
+                    aggr = vd.aggregators[aggrname].funcValues
+                    d[aggrname] = self.calcStatistic(d, aggr, vals)
 
     def calcStatistic(self, d, func, *args, **kwargs):
         r = wrapply(func, *args, **kwargs)