facebookincubator · aditi-pandit · Oct 5, 2023 · mbasmanova · Oct 5, 2023 · aditi-pandit
diff --git a/velox/core/PlanNode.cpp b/velox/core/PlanNode.cpp
@@ -376,6 +376,32 @@ RowTypePtr getGroupIdOutputType(
 
   return ROW(std::move(names), std::move(types));
 }
+
+std::vector<std::vector<std::string>> getGroupingSets(
+    const std::vector<std::vector<FieldAccessTypedExprPtr>>& groupingSetFields,
+    const std::vector<GroupIdNode::GroupingKeyInfo>& groupingKeyInfos) {
+  std::unordered_map<std::string, std::string> inputToOutputGroupingKeyMap;
+  for (const auto& groupKeyInfo : groupingKeyInfos) {
+    inputToOutputGroupingKeyMap[groupKeyInfo.input->name()] =
+        groupKeyInfo.output;
+  }
+
+  // Prestissimo passes grouping keys with their input column name to Velox.
+  // But Velox expects the output column name for the grouping key.
+  std::vector<std::vector<std::string>> groupingSets;
+  groupingSets.reserve(groupingSetFields.size());
+  for (const auto& groupFields : groupingSetFields) {
+    std::vector<std::string> groupingKeys;
+    groupingKeys.reserve(groupFields.size());
+    for (const auto& groupingField : groupFields) {
+      groupingKeys.push_back(
+          inputToOutputGroupingKeyMap[groupingField->name()]);
+    }
+    groupingSets.push_back(groupingKeys);
+  }
+  return groupingSets;
+}
+
 } // namespace
 
 GroupIdNode::GroupIdNode(
@@ -385,6 +411,29 @@ GroupIdNode::GroupIdNode(
     std::vector<FieldAccessTypedExprPtr> aggregationInputs,
     std::string groupIdName,
     PlanNodePtr source)
+    : PlanNode(std::move(id)),
+      sources_{source},
+      outputType_(getGroupIdOutputType(
+          groupingKeyInfos,
+          aggregationInputs,
+          groupIdName)),
+      groupingSets_(getGroupingSets(groupingSets, groupingKeyInfos)),
+      groupingKeyInfos_(std::move(groupingKeyInfos)),
+      aggregationInputs_(std::move(aggregationInputs)),
+      groupIdName_(std::move(groupIdName)) {
+  VELOX_USER_CHECK_GE(
+      groupingSets_.size(),
+      2,
+      "GroupIdNode requires two or more grouping sets.");
+}
+
+GroupIdNode::GroupIdNode(
+    PlanNodeId id,
+    std::vector<std::vector<std::string>> groupingSets,
+    std::vector<GroupingKeyInfo> groupingKeyInfos,
+    std::vector<FieldAccessTypedExprPtr> aggregationInputs,
+    std::string groupIdName,
+    PlanNodePtr source)
     : PlanNode(std::move(id)),
       sources_{source},
       outputType_(getGroupIdOutputType(
@@ -395,7 +444,7 @@ GroupIdNode::GroupIdNode(
       groupingKeyInfos_(std::move(groupingKeyInfos)),
       aggregationInputs_(std::move(aggregationInputs)),
       groupIdName_(std::move(groupIdName)) {
-  VELOX_CHECK_GE(
+  VELOX_USER_CHECK_GE(
       groupingSets_.size(),
       2,
       "GroupIdNode requires two or more grouping sets.");
@@ -407,7 +456,12 @@ void GroupIdNode::addDetails(std::stringstream& stream) const {
       stream << ", ";
     }
     stream << "[";
-    addFields(stream, groupingSets_[i]);
+    for (auto j = 0; j < groupingSets_[i].size(); j++) {
+      if (j > 0) {
+        stream << ", ";
+      }
+      stream << groupingSets_[i][j];
+    }
     stream << "]";
   }
 }
@@ -434,14 +488,16 @@ folly::dynamic GroupIdNode::serialize() const {
 // static
 PlanNodePtr GroupIdNode::create(const folly::dynamic& obj, void* context) {
   auto source = deserializeSingleSource(obj, context);
-  auto groupingSets = ISerializable::deserialize<
-      std::vector<std::vector<FieldAccessTypedExpr>>>(obj["groupingSets"]);
   std::vector<GroupingKeyInfo> groupingKeyInfos;
   for (const auto& info : obj["groupingKeyInfos"]) {
     groupingKeyInfos.push_back(
         {info["output"].asString(),
          ISerializable::deserialize<FieldAccessTypedExpr>(info["input"])});
   }
+
+  auto groupingSets =
+      ISerializable::deserialize<std::vector<std::vector<std::string>>>(
+          obj["groupingSets"]);
   return std::make_shared<GroupIdNode>(
       deserializePlanNodeId(obj),
       std::move(groupingSets),

diff --git a/velox/core/PlanNode.h b/velox/core/PlanNode.h
@@ -795,6 +795,7 @@ class GroupIdNode : public PlanNode {
   /// @param groupIdName Name of the column that will contain the grouping set
   /// ID (a zero based integer).
   /// @param source Input plan node.
+  /// NOTE: THIS FUNCTION IS DEPRECATED. PLEASE DO NOT USE.
   GroupIdNode(
       PlanNodeId id,
       std::vector<std::vector<FieldAccessTypedExprPtr>> groupingSets,
@@ -803,6 +804,25 @@ class GroupIdNode : public PlanNode {
       std::string groupIdName,
       PlanNodePtr source);
 
+  /// @param id Plan node ID.
+  /// @param groupingSets A list of grouping key sets. Grouping keys within the
+  /// set must be unique, but grouping keys across sets may repeat.
+  /// Note: groupingSets are specified using output column names.
+  /// @param groupingKeyInfos The names and order of the grouping keys in the
+  /// output.
+  /// @param aggregationInputs Columns that contain inputs to the aggregate
+  /// functions.
+  /// @param groupIdName Name of the column that will contain the grouping set
+  /// ID (a zero based integer).
+  /// @param source Input plan node.
+  GroupIdNode(
+      PlanNodeId id,
+      std::vector<std::vector<std::string>> groupingSets,
+      std::vector<GroupingKeyInfo> groupingKeyInfos,
+      std::vector<FieldAccessTypedExprPtr> aggregationInputs,
+      std::string groupIdName,
+      PlanNodePtr source);
+
   const RowTypePtr& outputType() const override {
     return outputType_;
   }
@@ -811,8 +831,7 @@ class GroupIdNode : public PlanNode {
     return sources_;
   }
 
-  const std::vector<std::vector<FieldAccessTypedExprPtr>>& groupingSets()
-      const {
+  const std::vector<std::vector<std::string>>& groupingSets() const {
     return groupingSets_;
   }
 
@@ -845,7 +864,12 @@ class GroupIdNode : public PlanNode {
 
   const std::vector<PlanNodePtr> sources_;
   const RowTypePtr outputType_;
-  const std::vector<std::vector<FieldAccessTypedExprPtr>> groupingSets_;
+
+  // Specifies groupingSets with output column names.
+  // This allows for the case when a single input column could map
+  // to multiple output columns which are used in separate grouping sets.
+  const std::vector<std::vector<std::string>> groupingSets_;
+
   const std::vector<GroupingKeyInfo> groupingKeyInfos_;
   const std::vector<FieldAccessTypedExprPtr> aggregationInputs_;
   const std::string groupIdName_;

diff --git a/velox/docs/develop/operators.rst b/velox/docs/develop/operators.rst
@@ -232,13 +232,83 @@ followed by the group ID column. The type of group ID column is BIGINT.
      - Description
    * - groupingSets
      - List of grouping key sets. Keys within each set must be unique, but keys can repeat across the sets.
+     - Grouping keys are specified with their output names.
    * - groupingKeyInfos
      - The names and order of the grouping key columns in the output.
    * - aggregationInputs
      - Input columns to duplicate.
    * - groupIdName
      - The name for the group-id column that identifies the grouping set. Zero-based integer corresponding to the position of the grouping set in the 'groupingSets' list.
 
+GroupIdNode is typically used to compute GROUPING SETS, CUBE and ROLLUP.
+
+While usually GroupingSets do not repeat with the same grouping key column, there are some use-cases where
+they might. To illustrate why GroupingSets might do so lets examine the following SQL query:
+
+.. code-block:: sql
+
+  SELECT count(orderkey), count(DISTINCT orderkey) FROM orders;
+
+In this query the user wants to compute global aggregates using the same column, though with
+and without the DISTINCT clause. With a particular optimization strategy
+`optimize.mixed-distinct-aggregations <https://www.qubole.com/blog/presto-optimizes-aggregations-over-distinct-values>`_, Presto uses GroupIdNode to compute these.
+
+First, the optimizer creates a GroupIdNode to duplicate every row assigning one copy
+to group 0 and another to group 1. This is achieved using the GroupIdNode with 2 grouping sets
+each using orderkey as a grouping key. In order to disambiguate the
+groups the orderkey column is aliased as a grouping key for one of the
+grouping sets.
+
+Lets say the orders table has 5 rows:
+
+.. code-block::
+
+  orderkey
+     1
+     2
+     2
+     3
+     4
+
+The GroupIdNode would transform this into:
+
+.. code-block::
+
+    orderkey   orderkey1   group_id
+    1             null        0
+    2             null        0
+    2             null        0
+    3             null        0
+    4             null        0
+    null           1          1
+    null           2          1
+    null           2          1
+    null           3          1
+    null           4          1
+
+Then Presto plans an aggregation using (orderkey, group_id) and count(orderkey1).
+
+This results in the following 5 rows:
+
+.. code-block::
+
+    orderkey     group_id     count(orderkey1) as c
+    1                0         null
+    2                0         null
+    3                0         null
+    4                0         null
+    null             1          5
+
+Then Presto plans a second aggregation with no keys and count(orderkey), arbitrary(c).
+Since both aggregations ignore nulls this correctly computes the number of
+distinct orderkeys and the count of all orderkeys.
+
+.. code-block::
+
+    count(orderkey)     arbitrary(c)
+     4                     5
+
+
 HashJoinNode and MergeJoinNode
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 

diff --git a/velox/exec/GroupId.cpp b/velox/exec/GroupId.cpp
@@ -29,24 +29,27 @@ GroupId::GroupId(
           "GroupId") {
   const auto& inputType = groupIdNode->sources()[0]->outputType();
 
-  std::unordered_map<std::string, column_index_t>
-      inputToOutputGroupingKeyMapping;
+  std::unordered_map<column_index_t, column_index_t>
+      outputToInputGroupingKeyMapping;
   for (const auto& groupingKeyInfo : groupIdNode->groupingKeyInfos()) {
-    inputToOutputGroupingKeyMapping[groupingKeyInfo.input->name()] =
-        outputType_->getChildIdx(groupingKeyInfo.output);
+    outputToInputGroupingKeyMapping[outputType_->getChildIdx(
+        groupingKeyInfo.output)] =
+        inputType->getChildIdx(groupingKeyInfo.input->name());
   }
 
-  auto numGroupingSets = groupIdNode->groupingSets().size();
-  groupingKeyMappings_.reserve(numGroupingSets);
-
   auto numGroupingKeys = groupIdNode->numGroupingKeys();
 
+  groupingKeyMappings_.reserve(groupIdNode->groupingSets().size());
   for (const auto& groupingSet : groupIdNode->groupingSets()) {
     std::vector<column_index_t> mappings(numGroupingKeys, kMissingGroupingKey);
     for (const auto& groupingKey : groupingSet) {
-      auto outputChannel =
-          inputToOutputGroupingKeyMapping.at(groupingKey->name());
-      auto inputChannel = inputType->getChildIdx(groupingKey->name());
+      auto outputChannel = outputType_->getChildIdx(groupingKey);
+      VELOX_USER_CHECK_NE(
+          outputToInputGroupingKeyMapping.count(outputChannel),
+          0,
+          "GroupIdNode didn't map grouping key {} to input channel",
+          groupingKey);
+      auto inputChannel = outputToInputGroupingKeyMapping.at(outputChannel);
       mappings[outputChannel] = inputChannel;
     }
 

diff --git a/velox/exec/tests/AggregationTest.cpp b/velox/exec/tests/AggregationTest.cpp
@@ -1459,7 +1459,7 @@ TEST_F(AggregationTest, groupingSets) {
   auto plan =
       PlanBuilder()
           .values({data})
-          .groupId({{"k1"}, {"k2"}}, {"a", "b"})
+          .groupId({"k1", "k2"}, {{"k1"}, {"k2"}}, {"a", "b"})
           .singleAggregation(
               {"k1", "k2", "group_id"},
               {"count(1) as count_1", "sum(a) as sum_a", "max(b) as max_b"})
@@ -1474,7 +1474,7 @@ TEST_F(AggregationTest, groupingSets) {
   // group_id column.
   plan = PlanBuilder()
              .values({data})
-             .groupId({{"k1"}, {"k2"}}, {"a", "b"})
+             .groupId({"k1", "k2"}, {{"k1"}, {"k2"}}, {"a", "b"})
              .project(
                  {"k1",
                   "k2",
@@ -1497,14 +1497,15 @@ TEST_F(AggregationTest, groupingSets) {
       "SELECT null, k2, count(1), null, max(b) FROM tmp GROUP BY k2");
 
   // Cube.
-  plan = PlanBuilder()
-             .values({data})
-             .groupId({{"k1", "k2"}, {"k1"}, {"k2"}, {}}, {"a", "b"})
-             .singleAggregation(
-                 {"k1", "k2", "group_id"},
-                 {"count(1) as count_1", "sum(a) as sum_a", "max(b) as max_b"})
-             .project({"k1", "k2", "count_1", "sum_a", "max_b"})
-             .planNode();
+  plan =
+      PlanBuilder()
+          .values({data})
+          .groupId({"k1", "k2"}, {{"k1", "k2"}, {"k1"}, {"k2"}, {}}, {"a", "b"})
+          .singleAggregation(
+              {"k1", "k2", "group_id"},
+              {"count(1) as count_1", "sum(a) as sum_a", "max(b) as max_b"})
+          .project({"k1", "k2", "count_1", "sum_a", "max_b"})
+          .planNode();
 
   assertQuery(
       plan,
@@ -1513,7 +1514,7 @@ TEST_F(AggregationTest, groupingSets) {
   // Rollup.
   plan = PlanBuilder()
              .values({data})
-             .groupId({{"k1", "k2"}, {"k1"}, {}}, {"a", "b"})
+             .groupId({"k1", "k2"}, {{"k1", "k2"}, {"k1"}, {}}, {"a", "b"})
              .singleAggregation(
                  {"k1", "k2", "group_id"},
                  {"count(1) as count_1", "sum(a) as sum_a", "max(b) as max_b"})
@@ -1544,7 +1545,7 @@ TEST_F(AggregationTest, groupingSetsOutput) {
   auto reversedOrderPlan =
       PlanBuilder()
           .values({data})
-          .groupId({{"k2", "k1"}, {}}, {"a", "b"})
+          .groupId({"k2", "k1"}, {{"k2", "k1"}, {}}, {"a", "b"})
           .capturePlanNode(reversedOrderGroupIdNode)
           .singleAggregation(
               {"k2", "k1", "group_id"},
@@ -1555,7 +1556,7 @@ TEST_F(AggregationTest, groupingSetsOutput) {
   auto orderPlan =
       PlanBuilder()
           .values({data})
-          .groupId({{"k1", "k2"}, {}}, {"a", "b"})
+          .groupId({"k1", "k2"}, {{"k1", "k2"}, {}}, {"a", "b"})
           .capturePlanNode(orderGroupIdNode)
           .singleAggregation(
               {"k1", "k2", "group_id"},
@@ -1584,6 +1585,32 @@ TEST_F(AggregationTest, groupingSetsOutput) {
   assertEqualResults(orderResult.second, reversedOrderResult.second);
 }
 
+TEST_F(AggregationTest, groupingSetsSameKey) {
+  auto data = makeRowVector(
+      {"o_key", "o_status"},
+      {makeFlatVector<int64_t>({0, 1, 2, 3, 4}),
+       makeFlatVector<std::string>({"", "x", "xx", "xxx", "xxxx"})});
+
+  createDuckDbTable({data});
+
+  auto plan = PlanBuilder()
+                  .values({data})
+                  .groupId(
+                      {"o_key", "o_key as o_key_1"},
+                      {{"o_key", "o_key_1"}, {"o_key"}, {"o_key_1"}, {}},
+                      {"o_status"})
+                  .singleAggregation(
+                      {"o_key", "o_key_1", "group_id"},
+                      {"max(o_status) as max_o_status"})
+                  .project({"o_key", "o_key_1", "max_o_status"})
+                  .planNode();
+
+  assertQuery(
+      plan,
+      "SELECT o_key, o_key_1, max(o_status) as max_o_status FROM ("
+      "select o_key, o_key as o_key_1, o_status FROM tmp) GROUP BY GROUPING SETS ((o_key, o_key_1), (o_key), (o_key_1), ())");
+}
+
 TEST_F(AggregationTest, outputBatchSizeCheckWithSpill) {
   const int numVectors = 5;
   const int vectorSize = 20;