Skip to content

Commit

Permalink
[VL] Fix spark34 group-by.sql(.out) in GlutenSQLQueryTestSuite (apach…
Browse files Browse the repository at this point in the history
  • Loading branch information
liujiayi771 authored Mar 28, 2024
1 parent 8f5ad48 commit 657966e
Show file tree
Hide file tree
Showing 4 changed files with 61 additions and 200 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -234,7 +234,7 @@ object VeloxSQLQueryTestSettings extends SQLQueryTestSettings {
private val OVERWRITE_SQL_QUERY_LIST: Set[String] = Set(
// Velox corr has better computation logic but it fails Spark's precision check.
// Remove -- SPARK-24369 multiple distinct aggregations having the same argument set,
// -- SPARK-37613: Support ANSI Aggregate Function: regr_r2
"group-by.sql",
// Remove -- SPARK-24369 multiple distinct aggregations having the same argument set
"udf/udf-group-by.sql"
Expand Down
65 changes: 18 additions & 47 deletions gluten-ut/spark34/src/test/resources/sql-tests/inputs/group-by.sql
Original file line number Diff line number Diff line change
Expand Up @@ -7,12 +7,6 @@
CREATE OR REPLACE TEMPORARY VIEW testData AS SELECT * FROM VALUES
(1, 1), (1, 2), (2, 1), (2, 2), (3, 1), (3, 2), (null, 1), (3, null), (null, null)
AS testData(a, b);
CREATE OR REPLACE TEMPORARY VIEW testRegression AS SELECT * FROM VALUES
(1, 10, null), (2, 10, 11), (2, 20, 22), (2, 25, null), (2, 30, 35)
AS testRegression(k, y, x);
CREATE OR REPLACE TEMPORARY VIEW aggr AS SELECT * FROM VALUES
(0, 0), (0, 10), (0, 20), (0, 30), (0, 40), (1, 10), (1, 20), (2, 10), (2, 20), (2, 25), (2, 30), (3, 60), (4, null)
AS aggr(k, v);

-- Aggregate with empty GroupBy expressions.
SELECT a, COUNT(b) FROM testData;
Expand Down Expand Up @@ -40,6 +34,9 @@ SELECT a + b, COUNT(b) FROM testData GROUP BY a + b;
SELECT a + 2, COUNT(b) FROM testData GROUP BY a + 1;
SELECT a + 1 + 1, COUNT(b) FROM testData GROUP BY a + 1;

-- struct() in group by
-- The grouping key is a one-field struct (field aa = a + 0.1) instead of a plain
-- column or arithmetic expression, and only the per-group row count is selected.
SELECT count(1) FROM testData GROUP BY struct(a + 0.1 AS aa);

-- Aggregate with nulls.
SELECT SKEWNESS(a), KURTOSIS(a), MIN(a), MAX(a), AVG(a), VARIANCE(a), STDDEV(a), SUM(a), COUNT(a)
FROM testData;
Expand Down Expand Up @@ -233,17 +230,6 @@ FROM VALUES (CAST(NULL AS DOUBLE)), (CAST(NULL AS DOUBLE)), (CAST(NULL AS DOUBLE
SELECT histogram_numeric(col, 3)
FROM VALUES (CAST(NULL AS INT)), (CAST(NULL AS INT)), (CAST(NULL AS INT)) AS tab(col);


-- SPARK-37613: Support ANSI Aggregate Function: regr_count
SELECT regr_count(y, x) FROM testRegression;
SELECT regr_count(y, x) FROM testRegression WHERE x IS NOT NULL;
SELECT k, count(*), regr_count(y, x) FROM testRegression GROUP BY k;
SELECT k, count(*) FILTER (WHERE x IS NOT NULL), regr_count(y, x) FROM testRegression GROUP BY k;

-- SPARK-37613: Support ANSI Aggregate Function: regr_r2
SELECT regr_r2(y, x) FROM testRegression;
SELECT regr_r2(y, x) FROM testRegression WHERE x IS NOT NULL;

-- SPARK-27974: Support ANSI Aggregate Function: array_agg
SELECT
collect_list(col),
Expand All @@ -258,34 +244,19 @@ FROM VALUES
(1,4),(2,3),(1,4),(2,4) AS v(a,b)
GROUP BY a;

-- SPARK-37614: Support ANSI Aggregate Function: regr_avgx & regr_avgy
SELECT regr_avgx(y, x), regr_avgy(y, x) FROM testRegression;
SELECT regr_avgx(y, x), regr_avgy(y, x) FROM testRegression WHERE x IS NOT NULL AND y IS NOT NULL;
SELECT k, avg(x), avg(y), regr_avgx(y, x), regr_avgy(y, x) FROM testRegression GROUP BY k;
SELECT k, avg(x) FILTER (WHERE x IS NOT NULL AND y IS NOT NULL), avg(y) FILTER (WHERE x IS NOT NULL AND y IS NOT NULL), regr_avgx(y, x), regr_avgy(y, x) FROM testRegression GROUP BY k;

-- SPARK-37676: Support ANSI Aggregation Function: percentile_cont
SELECT
percentile_cont(0.25) WITHIN GROUP (ORDER BY v),
percentile_cont(0.25) WITHIN GROUP (ORDER BY v DESC)
FROM aggr;
SELECT
k,
percentile_cont(0.25) WITHIN GROUP (ORDER BY v),
percentile_cont(0.25) WITHIN GROUP (ORDER BY v DESC)
FROM aggr
GROUP BY k
ORDER BY k;

-- SPARK-37691: Support ANSI Aggregation Function: percentile_disc
SELECT
percentile_disc(0.25) WITHIN GROUP (ORDER BY v),
percentile_disc(0.25) WITHIN GROUP (ORDER BY v DESC)
FROM aggr;
SELECT
k,
percentile_disc(0.25) WITHIN GROUP (ORDER BY v),
percentile_disc(0.25) WITHIN GROUP (ORDER BY v DESC)
FROM aggr
GROUP BY k
ORDER BY k;
SELECT mode(a), mode(b) FROM testData;
SELECT a, mode(b) FROM testData GROUP BY a ORDER BY a;


-- SPARK-44846: PushFoldableIntoBranches in complex grouping expressions cause bindReference error
-- Regression query: two stacked aggregations whose grouping keys are aliases of
-- if() expressions (b = if(a < 0, 0, a), then c = if(b > 1, 1, b)), with the
-- outermost projection multiplying the grouping key by a literal (c * 2).
-- With inputs -1, 1, 2: b takes the values 0, 1, 2; c takes 0, 1, 1; so the
-- result is one row per distinct c, i.e. d = 0 and d = 2.
-- NOTE: the results file (group-by.sql.out) repeats this statement text, so
-- any change to the statement must be mirrored there.
SELECT c * 2 AS d
FROM (
SELECT if(b > 1, 1, b) AS c
FROM (
SELECT if(a < 0, 0, a) AS b
FROM VALUES (-1), (1), (2) AS t1(a)
) t2
GROUP BY b
) t3
GROUP BY c;
192 changes: 41 additions & 151 deletions gluten-ut/spark34/src/test/resources/sql-tests/results/group-by.sql.out
Original file line number Diff line number Diff line change
@@ -1,7 +1,4 @@
-- Automatically generated by SQLQueryTestSuite
-- Number of queries: 101


-- !query
CREATE OR REPLACE TEMPORARY VIEW testData AS SELECT * FROM VALUES
(1, 1), (1, 2), (2, 1), (2, 2), (3, 1), (3, 2), (null, 1), (3, null), (null, null)
Expand All @@ -12,26 +9,6 @@ struct<>



-- !query
CREATE OR REPLACE TEMPORARY VIEW testRegression AS SELECT * FROM VALUES
(1, 10, null), (2, 10, 11), (2, 20, 22), (2, 25, null), (2, 30, 35)
AS testRegression(k, y, x)
-- !query schema
struct<>
-- !query output



-- !query
CREATE OR REPLACE TEMPORARY VIEW aggr AS SELECT * FROM VALUES
(0, 0), (0, 10), (0, 20), (0, 30), (0, 40), (1, 10), (1, 20), (2, 10), (2, 20), (2, 25), (2, 30), (3, 60), (4, null)
AS aggr(k, v)
-- !query schema
struct<>
-- !query output



-- !query
SELECT a, COUNT(b) FROM testData
-- !query schema
Expand Down Expand Up @@ -168,6 +145,17 @@ struct<((a + 1) + 1):int,count(b):bigint>
NULL 1


-- !query
SELECT count(1) FROM testData GROUP BY struct(a + 0.1 AS aa)
-- !query schema
struct<count(1):bigint>
-- !query output
2
2
2
3


-- !query
SELECT SKEWNESS(a), KURTOSIS(a), MIN(a), MAX(a), AVG(a), VARIANCE(a), STDDEV(a), SUM(a), COUNT(a)
FROM testData
Expand Down Expand Up @@ -274,6 +262,7 @@ org.apache.spark.sql.AnalysisException
}
}


-- !query
set spark.sql.groupByAliases=false
-- !query schema
Expand Down Expand Up @@ -304,6 +293,7 @@ org.apache.spark.sql.AnalysisException
} ]
}


-- !query
SELECT a, COUNT(1) FROM testData WHERE false GROUP BY a
-- !query schema
Expand Down Expand Up @@ -341,7 +331,6 @@ struct<1:int>
-- !query output



-- !query
SELECT 1 FROM range(10) HAVING true
-- !query schema
Expand Down Expand Up @@ -376,6 +365,7 @@ org.apache.spark.sql.AnalysisException
} ]
}


-- !query
SET spark.sql.legacy.parser.havingWithoutGroupByAsWhere=true
-- !query schema
Expand Down Expand Up @@ -423,6 +413,7 @@ org.apache.spark.sql.AnalysisException
} ]
}


-- !query
SELECT id FROM range(10) HAVING id > 0
-- !query schema
Expand Down Expand Up @@ -569,6 +560,7 @@ org.apache.spark.sql.AnalysisException
} ]
}


-- !query
SELECT some(1S)
-- !query schema
Expand All @@ -594,6 +586,7 @@ org.apache.spark.sql.AnalysisException
} ]
}


-- !query
SELECT any(1L)
-- !query schema
Expand All @@ -619,6 +612,7 @@ org.apache.spark.sql.AnalysisException
} ]
}


-- !query
SELECT every("true")
-- !query schema
Expand All @@ -644,6 +638,7 @@ org.apache.spark.sql.AnalysisException
} ]
}


-- !query
SELECT bool_and(1.0)
-- !query schema
Expand Down Expand Up @@ -1054,56 +1049,6 @@ struct<histogram_numeric(col, 3):array<struct<x:int,y:double>>>
NULL


-- !query
SELECT regr_count(y, x) FROM testRegression
-- !query schema
struct<regr_count(y, x):bigint>
-- !query output
3


-- !query
SELECT regr_count(y, x) FROM testRegression WHERE x IS NOT NULL
-- !query schema
struct<regr_count(y, x):bigint>
-- !query output
3


-- !query
SELECT k, count(*), regr_count(y, x) FROM testRegression GROUP BY k
-- !query schema
struct<k:int,count(1):bigint,regr_count(y, x):bigint>
-- !query output
1 1 0
2 4 3


-- !query
SELECT k, count(*) FILTER (WHERE x IS NOT NULL), regr_count(y, x) FROM testRegression GROUP BY k
-- !query schema
struct<k:int,count(1) FILTER (WHERE (x IS NOT NULL)):bigint,regr_count(y, x):bigint>
-- !query output
1 0 0
2 3 3


-- !query
SELECT regr_r2(y, x) FROM testRegression
-- !query schema
struct<regr_r2(y, x):double>
-- !query output
0.997690531177829


-- !query
SELECT regr_r2(y, x) FROM testRegression WHERE x IS NOT NULL
-- !query schema
struct<regr_r2(y, x):double>
-- !query output
0.997690531177829


-- !query
SELECT
collect_list(col),
Expand Down Expand Up @@ -1132,92 +1077,37 @@ struct<a:int,collect_list(b):array<int>,collect_list(b):array<int>>


-- !query
SELECT regr_avgx(y, x), regr_avgy(y, x) FROM testRegression
-- !query schema
struct<regr_avgx(y, x):double,regr_avgy(y, x):double>
-- !query output
22.666666666666668 20.0


-- !query
SELECT regr_avgx(y, x), regr_avgy(y, x) FROM testRegression WHERE x IS NOT NULL AND y IS NOT NULL
SELECT mode(a), mode(b) FROM testData
-- !query schema
struct<regr_avgx(y, x):double,regr_avgy(y, x):double>
struct<mode(a):int,mode(b):int>
-- !query output
22.666666666666668 20.0
3 1


-- !query
SELECT k, avg(x), avg(y), regr_avgx(y, x), regr_avgy(y, x) FROM testRegression GROUP BY k
SELECT a, mode(b) FROM testData GROUP BY a ORDER BY a
-- !query schema
struct<k:int,avg(x):double,avg(y):double,regr_avgx(y, x):double,regr_avgy(y, x):double>
struct<a:int,mode(b):int>
-- !query output
1 NULL 10.0 NULL NULL
2 22.666666666666668 21.25 22.666666666666668 20.0


-- !query
SELECT k, avg(x) FILTER (WHERE x IS NOT NULL AND y IS NOT NULL), avg(y) FILTER (WHERE x IS NOT NULL AND y IS NOT NULL), regr_avgx(y, x), regr_avgy(y, x) FROM testRegression GROUP BY k
-- !query schema
struct<k:int,avg(x) FILTER (WHERE ((x IS NOT NULL) AND (y IS NOT NULL))):double,avg(y) FILTER (WHERE ((x IS NOT NULL) AND (y IS NOT NULL))):double,regr_avgx(y, x):double,regr_avgy(y, x):double>
-- !query output
1 NULL NULL NULL NULL
2 22.666666666666668 20.0 22.666666666666668 20.0


-- !query
SELECT
percentile_cont(0.25) WITHIN GROUP (ORDER BY v),
percentile_cont(0.25) WITHIN GROUP (ORDER BY v DESC)
FROM aggr
-- !query schema
struct<percentile_cont(0.25) WITHIN GROUP (ORDER BY v):double,percentile_cont(0.25) WITHIN GROUP (ORDER BY v DESC):double>
-- !query output
10.0 30.0


-- !query
SELECT
k,
percentile_cont(0.25) WITHIN GROUP (ORDER BY v),
percentile_cont(0.25) WITHIN GROUP (ORDER BY v DESC)
FROM aggr
GROUP BY k
ORDER BY k
-- !query schema
struct<k:int,percentile_cont(0.25) WITHIN GROUP (ORDER BY v):double,percentile_cont(0.25) WITHIN GROUP (ORDER BY v DESC):double>
-- !query output
0 10.0 30.0
1 12.5 17.5
2 17.5 26.25
3 60.0 60.0
4 NULL NULL
NULL 1
1 1
2 1
3 1


-- !query
SELECT
percentile_disc(0.25) WITHIN GROUP (ORDER BY v),
percentile_disc(0.25) WITHIN GROUP (ORDER BY v DESC)
FROM aggr
SELECT c * 2 AS d
FROM (
SELECT if(b > 1, 1, b) AS c
FROM (
SELECT if(a < 0, 0, a) AS b
FROM VALUES (-1), (1), (2) AS t1(a)
) t2
GROUP BY b
) t3
GROUP BY c
-- !query schema
struct<percentile_disc(0.25) WITHIN GROUP (ORDER BY v):double,percentile_disc(0.25) WITHIN GROUP (ORDER BY v DESC):double>
struct<d:int>
-- !query output
10.0 30.0


-- !query
SELECT
k,
percentile_disc(0.25) WITHIN GROUP (ORDER BY v),
percentile_disc(0.25) WITHIN GROUP (ORDER BY v DESC)
FROM aggr
GROUP BY k
ORDER BY k
-- !query schema
struct<k:int,percentile_disc(0.25) WITHIN GROUP (ORDER BY v):double,percentile_disc(0.25) WITHIN GROUP (ORDER BY v DESC):double>
-- !query output
0 10.0 30.0
1 10.0 20.0
2 10.0 30.0
3 60.0 60.0
4 NULL NULL
0
2
Loading

0 comments on commit 657966e

Please sign in to comment.