From 8700e7c749ff606763d6578ec51cc89d40130ae6 Mon Sep 17 00:00:00 2001 From: anjakefala Date: Fri, 25 Aug 2023 17:42:08 -0700 Subject: [PATCH 1/2] [describe-] collect aggr funcs that operate on list of values in OrderedDict aggregator() converts a func, which operates on a list, into a _func, which operates on a srccol and a list of rows. The original functions were then added into Globals, but this caused problems for a function like `sum()` which appears naturally in Python code. I created an OrderedDict, and named it aggregators_vals as a place to store them. Other possible options: * We could include the optional funcvals along with func(srccol) for Aggregator. Describe Sheet could then grab the funcvals if it exists. * Describe Sheet could pass the srccol and list of rows, instead of vals. This is not ideal because it means for each aggregator, we call getValues once-more. This would cause a performance degradation. * Add them to vd.aggregators, possibly with the suffix "_vals", and create an Aggregator out of them as well. Have Describe Sheet pull aggrname_vals. * Similarly use the vd.aggregator_vals, but have a less terrible name. Optional: Do we want to do the work of porting quantiles and percentiles to be useable by Describe Sheet? Currently, an aggregator that does not go through aggregator() is not useable by Describe Sheet. --- tests/golden/sum-freq-table.tsv | 38 +++++++++++++++++++++++++++++++++ tests/sum-freq-table.vd | 6 ++++++ visidata/aggregators.py | 20 ++++++++--------- visidata/features/describe.py | 4 ++-- 4 files changed, 56 insertions(+), 12 deletions(-) create mode 100644 tests/golden/sum-freq-table.tsv create mode 100644 tests/sum-freq-table.vd diff --git a/tests/golden/sum-freq-table.tsv b/tests/golden/sum-freq-table.tsv new file mode 100644 index 000000000..e12bae018 --- /dev/null +++ b/tests/golden/sum-freq-table.tsv @@ -0,0 +1,38 @@ +Units count Units_sum +2 1 2 +3 1 3 +4 1 4 +5 1 5 +7 2 14 +11 1 11 +14 1 14 +15 1 15 +16 1 16 +27 1 27 +28 2 56 +29 1 29 +32 1 32 +35 1 35 +36 1 36 +42 1 42 +46 1 46 +50 2 100 +53 1 53 +55 1 55 +56 1 56 +57 1 57 +60 2 120 +62 1 62 +64 1 64 +66 1 66 +67 1 67 +74 1 74 +75 1 75 +76 1 76 +80 1 80 +81 1 81 +87 1 87 +90 2 180 +94 1 94 +95 1 95 +96 2 192 diff --git a/tests/sum-freq-table.vd b/tests/sum-freq-table.vd new file mode 100644 index 000000000..a5bddd9a9 --- /dev/null +++ b/tests/sum-freq-table.vd @@ -0,0 +1,6 @@ +sheet col row longname input keystrokes comment + open-file sample_data/sample.tsv o +sample Units type-int # set type of current column to int +sample Units aggregate-col sum + Add aggregator to current column +sample Units freq-col Shift+F open Frequency Table grouped on current column, with aggregations of other columns +sample_Units_freq Units sort-asc [ sort ascending by current column; replace any existing sort criteria diff --git a/visidata/aggregators.py b/visidata/aggregators.py index 0bed13def..d63b553f8 100644 --- a/visidata/aggregators.py +++ b/visidata/aggregators.py @@ -56,10 +56,11 @@ def aggregators_set(col, aggs): class Aggregator: - def __init__(self, name, type, func, helpstr='foo'): + def __init__(self, name, type, funcRows, funcValues=None, helpstr='foo'): 'Define aggregator `name` that calls func(col, rows)' self.type = type - self.func = func + self.func = funcRows # funcRows(col, rows) + self.funcValues = funcValues # funcValues(values, *args) self.helpstr = helpstr self.name = name @@ -69,19 +70,18 @@ def __call__(self, *args, **kwargs): _defaggr = Aggregator @VisiData.api -def aggregator(vd, name, func, helpstr='', *args, type=None): - 'Define simple aggregator *name* that calls ``func(values, *args)`` to aggregate *values*. Use *type* to force the default type of the aggregated column.' - def _func(col, rows): # wrap builtins so they can have a .type +def aggregator(vd, name, funcValues, helpstr='', *args, type=None): + 'Define simple aggregator *name* that calls ``funcValues(values, *args)`` to aggregate *values*. Use *type* to force the default type of the aggregated column.' + def funcRows(col, rows): # wrap builtins so they can have a .type vals = list(col.getValues(rows)) try: - return func(vals, *args) + return funcValues(vals, *args) except Exception as e: if len(vals) == 0: return None return e - vd.aggregators[name] = _defaggr(name, type, _func, helpstr) - vd.addGlobals({name: func}) + vd.aggregators[name] = _defaggr(name, type, funcRows, funcValues=funcValues, helpstr=helpstr) # accepts a srccol + list of rows ## specific aggregator implementations @@ -117,7 +117,7 @@ def _percentile(N, percent, key=lambda x:x): @functools.lru_cache(100) def percentile(pct, helpstr=''): - return _defaggr('p%s'%pct, None, lambda col,rows,pct=pct: _percentile(sorted(col.getValues(rows)), pct/100), helpstr) + return _defaggr('p%s'%pct, None, lambda col,rows,pct=pct: _percentile(sorted(col.getValues(rows)), pct/100), helpstr=helpstr) def quantiles(q, helpstr): return [percentile(round(100*i/q), helpstr) for i in range(1, q)] @@ -145,7 +145,7 @@ def quantiles(q, helpstr): vd.aggregators[f'p{pct}'] = percentile(pct, f'{pct}th percentile') # returns keys of the row with the max value -vd.aggregators['keymax'] = _defaggr('keymax', anytype, lambda col, rows: col.sheet.rowkey(max(col.getValueRows(rows))[1]), 'key of the maximum value') +vd.aggregators['keymax'] = _defaggr('keymax', anytype, lambda col, rows: col.sheet.rowkey(max(col.getValueRows(rows))[1]), helpstr='key of the maximum value') ColumnsSheet.columns += [ diff --git a/visidata/features/describe.py b/visidata/features/describe.py index c359e7535..3052e85ea 100644 --- a/visidata/features/describe.py +++ b/visidata/features/describe.py @@ -99,8 +99,8 @@ def reloadColumn(self, srccol): for func in [min, max, sum, median]: # use type d[func.__name__] = self.calcStatistic(d, func, vals) for aggrname in vd.options.describe_aggrs.split(): - func = vd.getGlobals()[aggrname] - d[func.__name__] = self.calcStatistic(d, func, vals) + aggr = vd.aggregators[aggrname].funcValues + d[aggrname] = self.calcStatistic(d, aggr, vals) def calcStatistic(self, d, func, *args, **kwargs): r = wrapply(func, *args, **kwargs) From 2b66c6f5552bdf4e509332e27e44d25ba49d6693 Mon Sep 17 00:00:00 2001 From: Saul Pwanson Date: Fri, 1 Sep 2023 17:38:31 -0700 Subject: [PATCH 2/2] private functions should start with _ --- visidata/aggregators.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/visidata/aggregators.py b/visidata/aggregators.py index d63b553f8..2ebde702e 100644 --- a/visidata/aggregators.py +++ b/visidata/aggregators.py @@ -72,7 +72,7 @@ def __call__(self, *args, **kwargs): @VisiData.api def aggregator(vd, name, funcValues, helpstr='', *args, type=None): 'Define simple aggregator *name* that calls ``funcValues(values, *args)`` to aggregate *values*. Use *type* to force the default type of the aggregated column.' - def funcRows(col, rows): # wrap builtins so they can have a .type + def _funcRows(col, rows): # wrap builtins so they can have a .type vals = list(col.getValues(rows)) try: return funcValues(vals, *args) @@ -81,7 +81,7 @@ def funcRows(col, rows): # wrap builtins so they can have a .type return None return e - vd.aggregators[name] = _defaggr(name, type, funcRows, funcValues=funcValues, helpstr=helpstr) # accepts a srccol + list of rows + vd.aggregators[name] = _defaggr(name, type, _funcRows, funcValues=funcValues, helpstr=helpstr) # accepts a srccol + list of rows ## specific aggregator implementations