Skip to content

Commit 58d0528

Browse files
committed
HIVE-29197: Disable vectorization for multi-column COUNT(DISTINCT)
1 parent 481d274 commit 58d0528

File tree

3 files changed

+626
-0
lines changed

3 files changed

+626
-0
lines changed

ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/Vectorizer.java

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4501,6 +4501,12 @@ public static ImmutablePair<VectorAggregationDesc,String> getVectorAggregationDe
45014501
vecAggrClasses = new Class[] {
45024502
VectorUDAFComputeDsKllSketchDouble.class, VectorUDAFComputeDsKllSketchFinal.class
45034503
};
4504+
} else if (VECTORIZABLE_UDAF.COUNT.toString().equalsIgnoreCase(aggregationName) && parameterList.size() > 1) {
4505+
// Handle unsupported multi-column COUNT DISTINCT
4506+
String issue = "Unsupported COUNT DISTINCT with multiple columns: "
4507+
+ aggregationName + "(" + parameterList + "). "
4508+
+ "Hive only supports COUNT(DISTINCT col) in vectorized execution. ";
4509+
return new ImmutablePair<>(null, issue);
45044510
} else {
45054511
VectorizedUDAFs annotation =
45064512
AnnotationUtils.getAnnotation(evaluator.getClass(), VectorizedUDAFs.class);
Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
drop table if exists test_vector;
2+
create external table test_vector(id string, pid bigint) PARTITIONED BY (full_date int);
3+
insert into test_vector (pid, full_date, id) values (1, '20240305', '6150');
4+
5+
--------------------------------------------------------------------------------
6+
-- 1. Basic COUNT cases (valid in vectorization)
7+
--------------------------------------------------------------------------------
8+
SELECT COUNT(pid) AS cnt_col, COUNT(*) AS cnt_star, COUNT(20240305) AS cnt_const, COUNT(DISTINCT pid) as cnt_distinct, COUNT(1) AS CNT
9+
FROM test_vector WHERE full_date=20240305;
10+
EXPLAIN VECTORIZATION EXPRESSION
11+
SELECT COUNT(pid) AS cnt_col, COUNT(*) AS cnt_star, COUNT(20240305) AS cnt_const,COUNT(DISTINCT pid) as cnt_distinct, COUNT(1) AS CNT
12+
FROM test_vector WHERE full_date=20240305;
13+
14+
--------------------------------------------------------------------------------
15+
-- 2. COUNT with DISTINCT column + constant (INVALID in vectorization)
16+
--------------------------------------------------------------------------------
17+
SELECT COUNT(DISTINCT pid, 20240305) AS CNT FROM test_vector WHERE full_date=20240305;
18+
EXPLAIN VECTORIZATION EXPRESSION
19+
SELECT COUNT(DISTINCT pid, 20240305) AS CNT FROM test_vector WHERE full_date=20240305;
20+
21+
--------------------------------------------------------------------------------
22+
-- 3. COUNT(DISTINCT pid, full_date) (multi-column DISTINCT: INVALID in vectorization, query still runs non-vectorized)
23+
--------------------------------------------------------------------------------
24+
SELECT COUNT(DISTINCT pid, full_date) AS CNT FROM test_vector WHERE full_date=20240305;
25+
EXPLAIN VECTORIZATION EXPRESSION
26+
SELECT COUNT(DISTINCT pid, full_date) AS CNT FROM test_vector WHERE full_date=20240305;
27+
28+
--------------------------------------------------------------------------------
29+
-- 4. COUNT(DISTINCT pid, full_date, id) (multi-column DISTINCT: INVALID in vectorization, query still runs non-vectorized)
30+
--------------------------------------------------------------------------------
31+
SELECT COUNT(DISTINCT pid, full_date, id) AS CNT FROM test_vector WHERE full_date=20240305;
32+
EXPLAIN VECTORIZATION EXPRESSION
33+
SELECT COUNT(DISTINCT pid, full_date, id) AS CNT FROM test_vector WHERE full_date=20240305;
34+
35+
DROP TABLE test_vector;

0 commit comments

Comments
 (0)