diff --git a/ibis/expr/types/generic.py b/ibis/expr/types/generic.py index 0e77f6040e54..5365b99924f1 100644 --- a/ibis/expr/types/generic.py +++ b/ibis/expr/types/generic.py @@ -1999,20 +1999,72 @@ def nunique(self, where: ir.BooleanValue | None = None) -> ir.IntegerScalar: self, where=self._bind_to_parent_table(where) ).to_expr() - def topk(self, k: int, by: ir.Value | None = None) -> ir.Table: + def topk( + self, k: int, by: ir.Value | None = None, *, name: str | None = None + ) -> ir.Table: """Return a "top k" expression. + Computes a Table containing the top `k` values by a certain metric + (defaults to count). + Parameters ---------- k - Return this number of rows + The number of rows to return. by - An expression. Defaults to `count`. + The metric to compute "top" by. Defaults to `count`. + name + The name to use for the metric column. A suitable name will be + automatically generated if not provided. Returns ------- Table - A top-k expression + The top `k` values. + + Examples + -------- + >>> import ibis + >>> ibis.options.interactive = True + >>> t = ibis.examples.diamonds.fetch() + + Compute the top 3 diamond colors by frequency: + + >>> t.color.topk(3) + ┏━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┓ + ┃ color ┃ CountStar(diamonds) ┃ + ┡━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━┩ + │ string │ int64 │ + ├────────┼─────────────────────┤ + │ G │ 11292 │ + │ E │ 9797 │ + │ F │ 9542 │ + └────────┴─────────────────────┘ + + Compute the top 3 diamond colors by mean price: + + >>> t.color.topk(3, by=t.price.mean()) + ┏━━━━━━━━┳━━━━━━━━━━━━━┓ + ┃ color ┃ Mean(price) ┃ + ┡━━━━━━━━╇━━━━━━━━━━━━━┩ + │ string │ float64 │ + ├────────┼─────────────┤ + │ J │ 5323.818020 │ + │ I │ 5091.874954 │ + │ H │ 4486.669196 │ + └────────┴─────────────┘ + + Compute the top 2 diamond colors by max carat: + + >>> t.color.topk(2, by=t.carat.max(), name="max_carat") + ┏━━━━━━━━┳━━━━━━━━━━━┓ + ┃ color ┃ max_carat ┃ + ┡━━━━━━━━╇━━━━━━━━━━━┩ + │ string │ float64 │ + ├────────┼───────────┤ + │ J │ 5.01 │ + │ H │ 4.13 │ + └────────┴───────────┘ """ from ibis.expr.types.relations import bind @@ -2028,6 +2080,9 @@ def topk(self, k: int, by: ir.Value | None = None) -> ir.Table: (metric,) = bind(table, by) + if name is not None: + metric = metric.name(name) + return table.aggregate(metric, by=[self]).order_by(metric.desc()).limit(k) def arbitrary( @@ -2100,33 +2155,28 @@ def count(self, where: ir.BooleanValue | None = None) -> ir.IntegerScalar: """ return ops.Count(self, where=self._bind_to_parent_table(where)).to_expr() - def value_counts(self) -> ir.Table: + def value_counts(self, *, name: str | None = None) -> ir.Table: """Compute a frequency table. + Parameters + ---------- + name + The name to use for the frequency column. A suitable name will be + automatically generated if not provided. + Returns ------- Table - Frequency table expression + The frequency table. Examples -------- >>> import ibis >>> ibis.options.interactive = True - >>> t = ibis.memtable({"chars": char} for char in "aabcddd") - >>> t - ┏━━━━━━━━┓ - ┃ chars ┃ - ┡━━━━━━━━┩ - │ string │ - ├────────┤ - │ a │ - │ a │ - │ b │ - │ c │ - │ d │ - │ d │ - │ d │ - └────────┘ + >>> t = ibis.memtable({"chars": ["a", "a", "b", "c", "c", "c", "d", "d", "d", "d"]}) + + Compute the count of each unique value in "chars", ordered by "chars": + >>> t.chars.value_counts().order_by("chars") ┏━━━━━━━━┳━━━━━━━━━━━━━┓ ┃ chars ┃ chars_count ┃ @@ -2135,13 +2185,30 @@ def value_counts(self) -> ir.Table: ├────────┼─────────────┤ │ a │ 2 │ │ b │ 1 │ - │ c │ 1 │ - │ d │ 3 │ + │ c │ 3 │ + │ d │ 4 │ └────────┴─────────────┘ + + Compute the count of each unique value in "chars" as a column named + "freq", ordered by "freq": + + >>> t.chars.value_counts(name="freq").order_by("freq") + ┏━━━━━━━━┳━━━━━━━┓ + ┃ chars ┃ freq ┃ + ┡━━━━━━━━╇━━━━━━━┩ + │ string │ int64 │ + ├────────┼───────┤ + │ b │ 1 │ + │ a │ 2 │ + │ c │ 3 │ + │ d │ 4 │ + └────────┴───────┘ """ - name = self.get_name() - metric = _.count().name(f"{name}_count") - return self.as_table().group_by(name).aggregate(metric) + colname = self.get_name() + if name is None: + name = f"{colname}_count" + t = self.as_table() + return t.group_by(t[colname]).aggregate(t.count().name(name)) def first( self, diff --git a/ibis/tests/expr/test_analytics.py b/ibis/tests/expr/test_analytics.py index ab2f6a17b7af..348d04f1f01a 100644 --- a/ibis/tests/expr/test_analytics.py +++ b/ibis/tests/expr/test_analytics.py @@ -17,6 +17,7 @@ import ibis import ibis.expr.types as ir +from ibis import _ from ibis.common.annotations import ValidationError from ibis.tests.expr.mocks import MockBackend from ibis.tests.util import assert_equal @@ -110,3 +111,15 @@ def test_topk_function_late_bind(airlines): expr2 = airlines.dest.topk(5, by=airlines.arrdelay.mean()) assert_equal(expr1, expr2) + + +def test_topk_name(airlines): + expr1 = airlines.dest.topk(5, name="mycol") + expr2 = airlines.dest.topk(5, by=_.count().name("mycol")) + assert expr1.columns == ["dest", "mycol"] + assert_equal(expr1, expr2) + + expr3 = airlines.dest.topk(5, by=_.arrdelay.mean(), name="mycol") + expr4 = airlines.dest.topk(5, by=_.arrdelay.mean().name("mycol")) + assert expr3.columns == ["dest", "mycol"] + assert_equal(expr3, expr4) diff --git a/ibis/tests/expr/test_table.py b/ibis/tests/expr/test_table.py index 50fb132011f0..c9d7cf7ab0e9 100644 --- a/ibis/tests/expr/test_table.py +++ b/ibis/tests/expr/test_table.py @@ -871,19 +871,21 @@ def test_group_by_column_select_api(table): getattr(grouped.f, fn)() -def test_value_counts_convenience(table): - # #152 - result = table.g.value_counts() - expected = table.select("g").group_by("g").aggregate(g_count=lambda t: t.count()) +def test_value_counts(table): + expr1 = table.g.value_counts() + expr2 = table[["g"]].group_by("g").aggregate(g_count=_.count()) + assert expr1.columns == ["g", "g_count"] + assert_equal(expr1, expr2) - assert_equal(result, expected) + expr3 = table.g.value_counts(name="freq") + expr4 = table[["g"]].group_by("g").aggregate(freq=_.count()) + assert expr3.columns == ["g", "freq"] + assert_equal(expr3, expr4) -def test_isin_value_counts(table): - # #157, this code path was untested before - bool_clause = table.g.notin(["1", "4", "7"]) - # it works! - bool_clause.name("notin").value_counts() +def test_value_counts_on_window_function(table): + expr = (table.a - table.a.mean()).name("x").value_counts(name="count") + assert expr.columns == ["x", "count"] def test_value_counts_unnamed_expr(con): diff --git a/ibis/tests/expr/test_value_exprs.py b/ibis/tests/expr/test_value_exprs.py index 35a376ffc3ab..d1e1cd5e35c7 100644 --- a/ibis/tests/expr/test_value_exprs.py +++ b/ibis/tests/expr/test_value_exprs.py @@ -289,12 +289,6 @@ def test_isin_notin_list(table, container): assert isinstance(not_expr.op().arg, ops.InValues) -def test_value_counts(table, string_col): - bool_clause = table[string_col].notin(["1", "4", "7"]) - expr = table.filter(bool_clause)[string_col].value_counts() - assert isinstance(expr, ir.Table) - - def test_isin_notin_scalars(): a, b, c = (ibis.literal(x) for x in [1, 1, 2])