ibis-project · jcrist · Sep 11, 2024 · Sep 10, 2024 · Sep 10, 2024 · Sep 10, 2024
diff --git a/ibis/expr/types/generic.py b/ibis/expr/types/generic.py
@@ -1999,20 +1999,72 @@ def nunique(self, where: ir.BooleanValue | None = None) -> ir.IntegerScalar:
             self, where=self._bind_to_parent_table(where)
         ).to_expr()
 
-    def topk(self, k: int, by: ir.Value | None = None) -> ir.Table:
+    def topk(
+        self, k: int, by: ir.Value | None = None, *, name: str | None = None
+    ) -> ir.Table:
         """Return a "top k" expression.
 
+        Computes a table containing the `k` values of this column that rank
+        highest by a given metric (by default, how often each value occurs).
+
         Parameters
         ----------
         k
-            Return this number of rows
+            The number of rows to return.
         by
-            An expression. Defaults to `count`.
+            The metric to rank values by. Defaults to `count`.
+        name
+            The name to use for the metric column, overriding any name set on
+            `by`. A suitable name will be automatically generated if not provided.
 
         Returns
         -------
         Table
-            A top-k expression
+            The top `k` values.
+
+        Examples
+        --------
+        >>> import ibis
+        >>> ibis.options.interactive = True
+        >>> t = ibis.examples.diamonds.fetch()
+
+        Compute the top 3 diamond colors by frequency:
+
+        >>> t.color.topk(3)
+        ┏━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┓
+        ┃ color  ┃ CountStar(diamonds) ┃
+        ┡━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━┩
+        │ string │ int64               │
+        ├────────┼─────────────────────┤
+        │ G      │               11292 │
+        │ E      │                9797 │
+        │ F      │                9542 │
+        └────────┴─────────────────────┘
+
+        Compute the top 3 diamond colors by mean price:
+
+        >>> t.color.topk(3, by=t.price.mean())
+        ┏━━━━━━━━┳━━━━━━━━━━━━━┓
+        ┃ color  ┃ Mean(price) ┃
+        ┡━━━━━━━━╇━━━━━━━━━━━━━┩
+        │ string │ float64     │
+        ├────────┼─────────────┤
+        │ J      │ 5323.818020 │
+        │ I      │ 5091.874954 │
+        │ H      │ 4486.669196 │
+        └────────┴─────────────┘
+
+        Compute the top 2 diamond colors by max carat:
+
+        >>> t.color.topk(2, by=t.carat.max(), name="max_carat")
+        ┏━━━━━━━━┳━━━━━━━━━━━┓
+        ┃ color  ┃ max_carat ┃
+        ┡━━━━━━━━╇━━━━━━━━━━━┩
+        │ string │ float64   │
+        ├────────┼───────────┤
+        │ J      │      5.01 │
+        │ H      │      4.13 │
+        └────────┴───────────┘
         """
         from ibis.expr.types.relations import bind
 
@@ -2028,6 +2080,9 @@ def topk(self, k: int, by: ir.Value | None = None) -> ir.Table:
 
         (metric,) = bind(table, by)
 
+        if name is not None:
+            metric = metric.name(name)
+
         return table.aggregate(metric, by=[self]).order_by(metric.desc()).limit(k)
 
     def arbitrary(
@@ -2100,33 +2155,28 @@ def count(self, where: ir.BooleanValue | None = None) -> ir.IntegerScalar:
         """
         return ops.Count(self, where=self._bind_to_parent_table(where)).to_expr()
 
-    def value_counts(self) -> ir.Table:
+    def value_counts(self, *, name: str | None = None) -> ir.Table:
         """Compute a frequency table.
 
+        Parameters
+        ----------
+        name
+            The name to use for the frequency column. A suitable name will be
+            automatically generated if not provided.
+
         Returns
         -------
         Table
-            Frequency table expression
+            The frequency table.
 
         Examples
         --------
         >>> import ibis
         >>> ibis.options.interactive = True
-        >>> t = ibis.memtable({"chars": char} for char in "aabcddd")
-        >>> t
-        ┏━━━━━━━━┓
-        ┃ chars  ┃
-        ┡━━━━━━━━┩
-        │ string │
-        ├────────┤
-        │ a      │
-        │ a      │
-        │ b      │
-        │ c      │
-        │ d      │
-        │ d      │
-        │ d      │
-        └────────┘
+        >>> t = ibis.memtable({"chars": ["a", "a", "b", "c", "c", "c", "d", "d", "d", "d"]})
+
+        Compute the count of each unique value in "chars", ordered by "chars":
+
         >>> t.chars.value_counts().order_by("chars")
         ┏━━━━━━━━┳━━━━━━━━━━━━━┓
         ┃ chars  ┃ chars_count ┃
@@ -2135,13 +2185,30 @@ def value_counts(self) -> ir.Table:
         ├────────┼─────────────┤
         │ a      │           2 │
         │ b      │           1 │
-        │ c      │           1 │
-        │ d      │           3 │
+        │ c      │           3 │
+        │ d      │           4 │
         └────────┴─────────────┘
+
+        Compute the count of each unique value in "chars" as a column named
+        "freq", ordered by "freq":
+
+        >>> t.chars.value_counts(name="freq").order_by("freq")
+        ┏━━━━━━━━┳━━━━━━━┓
+        ┃ chars  ┃ freq  ┃
+        ┡━━━━━━━━╇━━━━━━━┩
+        │ string │ int64 │
+        ├────────┼───────┤
+        │ b      │     1 │
+        │ a      │     2 │
+        │ c      │     3 │
+        │ d      │     4 │
+        └────────┴───────┘
         """
-        name = self.get_name()
-        metric = _.count().name(f"{name}_count")
-        return self.as_table().group_by(name).aggregate(metric)
+        colname = self.get_name()
+        if name is None:
+            name = f"{colname}_count"
+        t = self.as_table()
+        return t.group_by(t[colname]).aggregate(t.count().name(name))
 
     def first(
         self,

diff --git a/ibis/tests/expr/test_analytics.py b/ibis/tests/expr/test_analytics.py
@@ -17,6 +17,7 @@
 
 import ibis
 import ibis.expr.types as ir
+from ibis import _
 from ibis.common.annotations import ValidationError
 from ibis.tests.expr.mocks import MockBackend
 from ibis.tests.util import assert_equal
@@ -110,3 +111,15 @@ def test_topk_function_late_bind(airlines):
     expr2 = airlines.dest.topk(5, by=airlines.arrdelay.mean())
 
     assert_equal(expr1, expr2)
+
+
+def test_topk_name(airlines):
+    expr1 = airlines.dest.topk(5, name="mycol")
+    expr2 = airlines.dest.topk(5, by=_.count().name("mycol"))
+    assert expr1.columns == ["dest", "mycol"]
+    assert_equal(expr1, expr2)
+
+    expr3 = airlines.dest.topk(5, by=_.arrdelay.mean(), name="mycol")
+    expr4 = airlines.dest.topk(5, by=_.arrdelay.mean().name("mycol"))
+    assert expr3.columns == ["dest", "mycol"]
+    assert_equal(expr3, expr4)
diff --git a/ibis/tests/expr/test_table.py b/ibis/tests/expr/test_table.py
@@ -871,19 +871,21 @@ def test_group_by_column_select_api(table):
         getattr(grouped.f, fn)()
 
 
-def test_value_counts_convenience(table):
-    # #152
-    result = table.g.value_counts()
-    expected = table.select("g").group_by("g").aggregate(g_count=lambda t: t.count())
+def test_value_counts(table):
+    expr1 = table.g.value_counts()
+    expr2 = table[["g"]].group_by("g").aggregate(g_count=_.count())
+    assert expr1.columns == ["g", "g_count"]
+    assert_equal(expr1, expr2)
 
-    assert_equal(result, expected)
+    expr3 = table.g.value_counts(name="freq")
+    expr4 = table[["g"]].group_by("g").aggregate(freq=_.count())
+    assert expr3.columns == ["g", "freq"]
+    assert_equal(expr3, expr4)
 
 
-def test_isin_value_counts(table):
-    # #157, this code path was untested before
-    bool_clause = table.g.notin(["1", "4", "7"])
-    # it works!
-    bool_clause.name("notin").value_counts()
+def test_value_counts_on_window_function(table):
+    expr = (table.a - table.a.mean()).name("x").value_counts(name="count")
+    assert expr.columns == ["x", "count"]
 
 
 def test_value_counts_unnamed_expr(con):

diff --git a/ibis/tests/expr/test_value_exprs.py b/ibis/tests/expr/test_value_exprs.py
@@ -289,12 +289,6 @@ def test_isin_notin_list(table, container):
     assert isinstance(not_expr.op().arg, ops.InValues)
 
 
-def test_value_counts(table, string_col):
-    bool_clause = table[string_col].notin(["1", "4", "7"])
-    expr = table.filter(bool_clause)[string_col].value_counts()
-    assert isinstance(expr, ir.Table)
-
-
 def test_isin_notin_scalars():
     a, b, c = (ibis.literal(x) for x in [1, 1, 2])