Skip to content

Commit

Permalink
[describe-] collect aggr funcs that operate on list of values in Dict (
Browse files Browse the repository at this point in the history
…#2009)

* [describe-] collect aggr funcs that operate on list of values in OrderedDict

aggregator() converts a func, which operates on a list, into a _func,
which operates on a srccol and a list of rows.

The original functions were then added into Globals, but this caused problems for
a function like `sum()` which appears naturally in Python code.

I created an OrderedDict, and named it aggregators_vals as a place to
store them.

Other possible options:

* We could include the optional funcvals along with func(srccol) for
Aggregator. Describe Sheet could then grab the funcvals if it exists.

* Describe Sheet could pass the srccol and list of rows, instead of
vals. This is not ideal because it means that for each aggregator we call
getValues once more, which would degrade performance.

* Add them to vd.aggregators, possibly with the suffix "_vals", and
create an Aggregator out of them as well. Have Describe Sheet pull
aggrname_vals.

* Similarly use the vd.aggregator_vals, but have a less terrible name.

Optional: Do we want to do the work of porting quantiles and percentiles to be usable by Describe Sheet? Currently, an aggregator that is not defined through aggregator() is not usable by Describe Sheet.

* private functions should start with _
  • Loading branch information
anjakefala authored Sep 9, 2023
1 parent dd5abba commit d7204d8
Show file tree
Hide file tree
Showing 4 changed files with 56 additions and 12 deletions.
38 changes: 38 additions & 0 deletions tests/golden/sum-freq-table.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
Units count Units_sum
2 1 2
3 1 3
4 1 4
5 1 5
7 2 14
11 1 11
14 1 14
15 1 15
16 1 16
27 1 27
28 2 56
29 1 29
32 1 32
35 1 35
36 1 36
42 1 42
46 1 46
50 2 100
53 1 53
55 1 55
56 1 56
57 1 57
60 2 120
62 1 62
64 1 64
66 1 66
67 1 67
74 1 74
75 1 75
76 1 76
80 1 80
81 1 81
87 1 87
90 2 180
94 1 94
95 1 95
96 2 192
6 changes: 6 additions & 0 deletions tests/sum-freq-table.vd
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
sheet col row longname input keystrokes comment
open-file sample_data/sample.tsv o
sample Units type-int # set type of current column to int
sample Units aggregate-col sum + Add aggregator to current column
sample Units freq-col Shift+F open Frequency Table grouped on current column, with aggregations of other columns
sample_Units_freq Units sort-asc [ sort ascending by current column; replace any existing sort criteria
20 changes: 10 additions & 10 deletions visidata/aggregators.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,10 +56,11 @@ def aggregators_set(col, aggs):


class Aggregator:
def __init__(self, name, type, func, helpstr='foo'):
def __init__(self, name, type, funcRows, funcValues=None, helpstr='foo'):
'Define aggregator `name` that calls func(col, rows)'
self.type = type
self.func = func
self.func = funcRows # funcRows(col, rows)
self.funcValues = funcValues # funcValues(values, *args)
self.helpstr = helpstr
self.name = name

Expand All @@ -69,19 +70,18 @@ def __call__(self, *args, **kwargs):
_defaggr = Aggregator

@VisiData.api
def aggregator(vd, name, func, helpstr='', *args, type=None):
'Define simple aggregator *name* that calls ``func(values, *args)`` to aggregate *values*. Use *type* to force the default type of the aggregated column.'
def _func(col, rows): # wrap builtins so they can have a .type
def aggregator(vd, name, funcValues, helpstr='', *args, type=None):
'Define simple aggregator *name* that calls ``funcValues(values, *args)`` to aggregate *values*. Use *type* to force the default type of the aggregated column.'
def _funcRows(col, rows): # wrap builtins so they can have a .type
vals = list(col.getValues(rows))
try:
return func(vals, *args)
return funcValues(vals, *args)
except Exception as e:
if len(vals) == 0:
return None
return e

vd.aggregators[name] = _defaggr(name, type, _func, helpstr)
vd.addGlobals({name: func})
vd.aggregators[name] = _defaggr(name, type, _funcRows, funcValues=funcValues, helpstr=helpstr) # accepts a srccol + list of rows

## specific aggregator implementations

Expand Down Expand Up @@ -117,7 +117,7 @@ def _percentile(N, percent, key=lambda x:x):

@functools.lru_cache(100)
def percentile(pct, helpstr=''):
return _defaggr('p%s'%pct, None, lambda col,rows,pct=pct: _percentile(sorted(col.getValues(rows)), pct/100), helpstr)
return _defaggr('p%s'%pct, None, lambda col,rows,pct=pct: _percentile(sorted(col.getValues(rows)), pct/100), helpstr=helpstr)

def quantiles(q, helpstr):
return [percentile(round(100*i/q), helpstr) for i in range(1, q)]
Expand Down Expand Up @@ -145,7 +145,7 @@ def quantiles(q, helpstr):
vd.aggregators[f'p{pct}'] = percentile(pct, f'{pct}th percentile')

# returns keys of the row with the max value
vd.aggregators['keymax'] = _defaggr('keymax', anytype, lambda col, rows: col.sheet.rowkey(max(col.getValueRows(rows))[1]), 'key of the maximum value')
vd.aggregators['keymax'] = _defaggr('keymax', anytype, lambda col, rows: col.sheet.rowkey(max(col.getValueRows(rows))[1]), helpstr='key of the maximum value')


ColumnsSheet.columns += [
Expand Down
4 changes: 2 additions & 2 deletions visidata/features/describe.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,8 +99,8 @@ def reloadColumn(self, srccol):
for func in [min, max, sum, median]: # use type
d[func.__name__] = self.calcStatistic(d, func, vals)
for aggrname in vd.options.describe_aggrs.split():
func = vd.getGlobals()[aggrname]
d[func.__name__] = self.calcStatistic(d, func, vals)
aggr = vd.aggregators[aggrname].funcValues
d[aggrname] = self.calcStatistic(d, aggr, vals)

def calcStatistic(self, d, func, *args, **kwargs):
r = wrapply(func, *args, **kwargs)
Expand Down

0 comments on commit d7204d8

Please sign in to comment.