[VL] Fix spark34 group-by.sql(.out) in GlutenSQLQueryTestSuite (apache#5162)
mskapilks · Mar 28, 2024 · 657966e · 657966e
1 parent 8f5ad48
commit 657966e
Show file tree

Hide file tree

Showing 4 changed files with 61 additions and 200 deletions.
diff --git a/...en-ut/spark33/src/test/scala/io/glutenproject/utils/velox/VeloxSQLQueryTestSettings.scala b/...en-ut/spark33/src/test/scala/io/glutenproject/utils/velox/VeloxSQLQueryTestSettings.scala
@@ -234,7 +234,7 @@ object VeloxSQLQueryTestSettings extends SQLQueryTestSettings {
   private val OVERWRITE_SQL_QUERY_LIST: Set[String] = Set(
     // Velox corr has better computation logic but it fails Spark's precision check.
     // Remove -- SPARK-24369 multiple distinct aggregations having the same argument set,
-    // -- SPARK-37613: Support ANSI Aggregate Function: regr_r2
+    //        -- SPARK-37613: Support ANSI Aggregate Function: regr_r2
     "group-by.sql",
     // Remove -- SPARK-24369 multiple distinct aggregations having the same argument set
     "udf/udf-group-by.sql"

diff --git a/gluten-ut/spark34/src/test/resources/sql-tests/inputs/group-by.sql b/gluten-ut/spark34/src/test/resources/sql-tests/inputs/group-by.sql
@@ -7,12 +7,6 @@
 CREATE OR REPLACE TEMPORARY VIEW testData AS SELECT * FROM VALUES
 (1, 1), (1, 2), (2, 1), (2, 2), (3, 1), (3, 2), (null, 1), (3, null), (null, null)
 AS testData(a, b);
-CREATE OR REPLACE TEMPORARY VIEW testRegression AS SELECT * FROM VALUES
-(1, 10, null), (2, 10, 11), (2, 20, 22), (2, 25, null), (2, 30, 35)
-AS testRegression(k, y, x);
-CREATE OR REPLACE TEMPORARY VIEW aggr AS SELECT * FROM VALUES
-(0, 0), (0, 10), (0, 20), (0, 30), (0, 40), (1, 10), (1, 20), (2, 10), (2, 20), (2, 25), (2, 30), (3, 60), (4, null)
-AS aggr(k, v);
 
 -- Aggregate with empty GroupBy expressions.
 SELECT a, COUNT(b) FROM testData;
@@ -40,6 +34,9 @@ SELECT a + b, COUNT(b) FROM testData GROUP BY a + b;
 SELECT a + 2, COUNT(b) FROM testData GROUP BY a + 1;
 SELECT a + 1 + 1, COUNT(b) FROM testData GROUP BY a + 1;
 
+-- struct() in group by
+SELECT count(1) FROM testData GROUP BY struct(a + 0.1 AS aa);
+
 -- Aggregate with nulls.
 SELECT SKEWNESS(a), KURTOSIS(a), MIN(a), MAX(a), AVG(a), VARIANCE(a), STDDEV(a), SUM(a), COUNT(a)
 FROM testData;
@@ -233,17 +230,6 @@ FROM VALUES (CAST(NULL AS DOUBLE)), (CAST(NULL AS DOUBLE)), (CAST(NULL AS DOUBLE
 SELECT histogram_numeric(col, 3)
 FROM VALUES (CAST(NULL AS INT)), (CAST(NULL AS INT)), (CAST(NULL AS INT)) AS tab(col);
 
-
--- SPARK-37613: Support ANSI Aggregate Function: regr_count
-SELECT regr_count(y, x) FROM testRegression;
-SELECT regr_count(y, x) FROM testRegression WHERE x IS NOT NULL;
-SELECT k, count(*), regr_count(y, x) FROM testRegression GROUP BY k;
-SELECT k, count(*) FILTER (WHERE x IS NOT NULL), regr_count(y, x) FROM testRegression GROUP BY k;
-
--- SPARK-37613: Support ANSI Aggregate Function: regr_r2
-SELECT regr_r2(y, x) FROM testRegression;
-SELECT regr_r2(y, x) FROM testRegression WHERE x IS NOT NULL;
-
 -- SPARK-27974: Support ANSI Aggregate Function: array_agg
 SELECT
   collect_list(col),
@@ -258,34 +244,19 @@ FROM VALUES
   (1,4),(2,3),(1,4),(2,4) AS v(a,b)
 GROUP BY a;
 
--- SPARK-37614: Support ANSI Aggregate Function: regr_avgx & regr_avgy
-SELECT regr_avgx(y, x), regr_avgy(y, x) FROM testRegression;
-SELECT regr_avgx(y, x), regr_avgy(y, x) FROM testRegression WHERE x IS NOT NULL AND y IS NOT NULL;
-SELECT k, avg(x), avg(y), regr_avgx(y, x), regr_avgy(y, x) FROM testRegression GROUP BY k;
-SELECT k, avg(x) FILTER (WHERE x IS NOT NULL AND y IS NOT NULL), avg(y) FILTER (WHERE x IS NOT NULL AND y IS NOT NULL), regr_avgx(y, x), regr_avgy(y, x) FROM testRegression GROUP BY k;
 
--- SPARK-37676: Support ANSI Aggregation Function: percentile_cont
-SELECT
-  percentile_cont(0.25) WITHIN GROUP (ORDER BY v),
-  percentile_cont(0.25) WITHIN GROUP (ORDER BY v DESC)
-FROM aggr;
-SELECT
-  k,
-  percentile_cont(0.25) WITHIN GROUP (ORDER BY v),
-  percentile_cont(0.25) WITHIN GROUP (ORDER BY v DESC)
-FROM aggr
-GROUP BY k
-ORDER BY k;
-
--- SPARK-37691: Support ANSI Aggregation Function: percentile_disc
-SELECT
-  percentile_disc(0.25) WITHIN GROUP (ORDER BY v),
-  percentile_disc(0.25) WITHIN GROUP (ORDER BY v DESC)
-FROM aggr;
-SELECT
-  k,
-  percentile_disc(0.25) WITHIN GROUP (ORDER BY v),
-  percentile_disc(0.25) WITHIN GROUP (ORDER BY v DESC)
-FROM aggr
-GROUP BY k
-ORDER BY k;
+SELECT mode(a), mode(b) FROM testData;
+SELECT a, mode(b) FROM testData GROUP BY a ORDER BY a;
+
+
+-- SPARK-44846: PushFoldableIntoBranches in complex grouping expressions cause bindReference error
+SELECT c * 2 AS d
+FROM (
+         SELECT if(b > 1, 1, b) AS c
+         FROM (
+                  SELECT if(a < 0, 0, a) AS b
+                  FROM VALUES (-1), (1), (2) AS t1(a)
+              ) t2
+         GROUP BY b
+     ) t3
+GROUP BY c;
diff --git a/gluten-ut/spark34/src/test/resources/sql-tests/results/group-by.sql.out b/gluten-ut/spark34/src/test/resources/sql-tests/results/group-by.sql.out
@@ -1,7 +1,4 @@
 -- Automatically generated by SQLQueryTestSuite
--- Number of queries: 101
-
-
 -- !query
 CREATE OR REPLACE TEMPORARY VIEW testData AS SELECT * FROM VALUES
 (1, 1), (1, 2), (2, 1), (2, 2), (3, 1), (3, 2), (null, 1), (3, null), (null, null)
@@ -12,26 +9,6 @@ struct<>
 
 
 
--- !query
-CREATE OR REPLACE TEMPORARY VIEW testRegression AS SELECT * FROM VALUES
-(1, 10, null), (2, 10, 11), (2, 20, 22), (2, 25, null), (2, 30, 35)
-AS testRegression(k, y, x)
--- !query schema
-struct<>
--- !query output
-
-
-
--- !query
-CREATE OR REPLACE TEMPORARY VIEW aggr AS SELECT * FROM VALUES
-(0, 0), (0, 10), (0, 20), (0, 30), (0, 40), (1, 10), (1, 20), (2, 10), (2, 20), (2, 25), (2, 30), (3, 60), (4, null)
-AS aggr(k, v)
--- !query schema
-struct<>
--- !query output
-
-
-
 -- !query
 SELECT a, COUNT(b) FROM testData
 -- !query schema
@@ -168,6 +145,17 @@ struct<((a + 1) + 1):int,count(b):bigint>
 NULL	1
 
 
+-- !query
+SELECT count(1) FROM testData GROUP BY struct(a + 0.1 AS aa)
+-- !query schema
+struct<count(1):bigint>
+-- !query output
+2
+2
+2
+3
+
+
 -- !query
 SELECT SKEWNESS(a), KURTOSIS(a), MIN(a), MAX(a), AVG(a), VARIANCE(a), STDDEV(a), SUM(a), COUNT(a)
 FROM testData
@@ -274,6 +262,7 @@ org.apache.spark.sql.AnalysisException
   }
 }
 
+
 -- !query
 set spark.sql.groupByAliases=false
 -- !query schema
@@ -304,6 +293,7 @@ org.apache.spark.sql.AnalysisException
   } ]
 }
 
+
 -- !query
 SELECT a, COUNT(1) FROM testData WHERE false GROUP BY a
 -- !query schema
@@ -341,7 +331,6 @@ struct<1:int>
 -- !query output
 
 
-
 -- !query
 SELECT 1 FROM range(10) HAVING true
 -- !query schema
@@ -376,6 +365,7 @@ org.apache.spark.sql.AnalysisException
   } ]
 }
 
+
 -- !query
 SET spark.sql.legacy.parser.havingWithoutGroupByAsWhere=true
 -- !query schema
@@ -423,6 +413,7 @@ org.apache.spark.sql.AnalysisException
   } ]
 }
 
+
 -- !query
 SELECT id FROM range(10) HAVING id > 0
 -- !query schema
@@ -569,6 +560,7 @@ org.apache.spark.sql.AnalysisException
   } ]
 }
 
+
 -- !query
 SELECT some(1S)
 -- !query schema
@@ -594,6 +586,7 @@ org.apache.spark.sql.AnalysisException
   } ]
 }
 
+
 -- !query
 SELECT any(1L)
 -- !query schema
@@ -619,6 +612,7 @@ org.apache.spark.sql.AnalysisException
   } ]
 }
 
+
 -- !query
 SELECT every("true")
 -- !query schema
@@ -644,6 +638,7 @@ org.apache.spark.sql.AnalysisException
   } ]
 }
 
+
 -- !query
 SELECT bool_and(1.0)
 -- !query schema
@@ -1054,56 +1049,6 @@ struct<histogram_numeric(col, 3):array<struct<x:int,y:double>>>
 NULL
 
 
--- !query
-SELECT regr_count(y, x) FROM testRegression
--- !query schema
-struct<regr_count(y, x):bigint>
--- !query output
-3
-
-
--- !query
-SELECT regr_count(y, x) FROM testRegression WHERE x IS NOT NULL
--- !query schema
-struct<regr_count(y, x):bigint>
--- !query output
-3
-
-
--- !query
-SELECT k, count(*), regr_count(y, x) FROM testRegression GROUP BY k
--- !query schema
-struct<k:int,count(1):bigint,regr_count(y, x):bigint>
--- !query output
-1	1	0
-2	4	3
-
-
--- !query
-SELECT k, count(*) FILTER (WHERE x IS NOT NULL), regr_count(y, x) FROM testRegression GROUP BY k
--- !query schema
-struct<k:int,count(1) FILTER (WHERE (x IS NOT NULL)):bigint,regr_count(y, x):bigint>
--- !query output
-1	0	0
-2	3	3
-
-
--- !query
-SELECT regr_r2(y, x) FROM testRegression
--- !query schema
-struct<regr_r2(y, x):double>
--- !query output
-0.997690531177829
-
-
--- !query
-SELECT regr_r2(y, x) FROM testRegression WHERE x IS NOT NULL
--- !query schema
-struct<regr_r2(y, x):double>
--- !query output
-0.997690531177829
-
-
 -- !query
 SELECT
   collect_list(col),
@@ -1132,92 +1077,37 @@ struct<a:int,collect_list(b):array<int>,collect_list(b):array<int>>
 
 
 -- !query
-SELECT regr_avgx(y, x), regr_avgy(y, x) FROM testRegression
--- !query schema
-struct<regr_avgx(y, x):double,regr_avgy(y, x):double>
--- !query output
-22.666666666666668	20.0
-
-
--- !query
-SELECT regr_avgx(y, x), regr_avgy(y, x) FROM testRegression WHERE x IS NOT NULL AND y IS NOT NULL
+SELECT mode(a), mode(b) FROM testData
 -- !query schema
-struct<regr_avgx(y, x):double,regr_avgy(y, x):double>
+struct<mode(a):int,mode(b):int>
 -- !query output
-22.666666666666668	20.0
+3	1
 
 
 -- !query
-SELECT k, avg(x), avg(y), regr_avgx(y, x), regr_avgy(y, x) FROM testRegression GROUP BY k
+SELECT a, mode(b) FROM testData GROUP BY a ORDER BY a
 -- !query schema
-struct<k:int,avg(x):double,avg(y):double,regr_avgx(y, x):double,regr_avgy(y, x):double>
+struct<a:int,mode(b):int>
 -- !query output
-1	NULL	10.0	NULL	NULL
-2	22.666666666666668	21.25	22.666666666666668	20.0
-
-
--- !query
-SELECT k, avg(x) FILTER (WHERE x IS NOT NULL AND y IS NOT NULL), avg(y) FILTER (WHERE x IS NOT NULL AND y IS NOT NULL), regr_avgx(y, x), regr_avgy(y, x) FROM testRegression GROUP BY k
--- !query schema
-struct<k:int,avg(x) FILTER (WHERE ((x IS NOT NULL) AND (y IS NOT NULL))):double,avg(y) FILTER (WHERE ((x IS NOT NULL) AND (y IS NOT NULL))):double,regr_avgx(y, x):double,regr_avgy(y, x):double>
--- !query output
-1	NULL	NULL	NULL	NULL
-2	22.666666666666668	20.0	22.666666666666668	20.0
-
-
--- !query
-SELECT
-  percentile_cont(0.25) WITHIN GROUP (ORDER BY v),
-  percentile_cont(0.25) WITHIN GROUP (ORDER BY v DESC)
-FROM aggr
--- !query schema
-struct<percentile_cont(0.25) WITHIN GROUP (ORDER BY v):double,percentile_cont(0.25) WITHIN GROUP (ORDER BY v DESC):double>
--- !query output
-10.0	30.0
-
-
--- !query
-SELECT
-  k,
-  percentile_cont(0.25) WITHIN GROUP (ORDER BY v),
-  percentile_cont(0.25) WITHIN GROUP (ORDER BY v DESC)
-FROM aggr
-GROUP BY k
-ORDER BY k
--- !query schema
-struct<k:int,percentile_cont(0.25) WITHIN GROUP (ORDER BY v):double,percentile_cont(0.25) WITHIN GROUP (ORDER BY v DESC):double>
--- !query output
-0	10.0	30.0
-1	12.5	17.5
-2	17.5	26.25
-3	60.0	60.0
-4	NULL	NULL
+NULL	1
+1	1
+2	1
+3	1
 
 
 -- !query
-SELECT
-  percentile_disc(0.25) WITHIN GROUP (ORDER BY v),
-  percentile_disc(0.25) WITHIN GROUP (ORDER BY v DESC)
-FROM aggr
+SELECT c * 2 AS d
+FROM (
+         SELECT if(b > 1, 1, b) AS c
+         FROM (
+                  SELECT if(a < 0, 0, a) AS b
+                  FROM VALUES (-1), (1), (2) AS t1(a)
+              ) t2
+         GROUP BY b
+     ) t3
+GROUP BY c
 -- !query schema
-struct<percentile_disc(0.25) WITHIN GROUP (ORDER BY v):double,percentile_disc(0.25) WITHIN GROUP (ORDER BY v DESC):double>
+struct<d:int>
 -- !query output
-10.0	30.0
-
-
--- !query
-SELECT
-  k,
-  percentile_disc(0.25) WITHIN GROUP (ORDER BY v),
-  percentile_disc(0.25) WITHIN GROUP (ORDER BY v DESC)
-FROM aggr
-GROUP BY k
-ORDER BY k
--- !query schema
-struct<k:int,percentile_disc(0.25) WITHIN GROUP (ORDER BY v):double,percentile_disc(0.25) WITHIN GROUP (ORDER BY v DESC):double>
--- !query output
-0	10.0	30.0
-1	10.0	20.0
-2	10.0	30.0
-3	60.0	60.0
-4	NULL	NULL
+0
+2