From 1baf0dbc597915d199c133941647a56205773ac0 Mon Sep 17 00:00:00 2001 From: Jibing-Li <64681310+Jibing-Li@users.noreply.github.com> Date: Fri, 27 Sep 2024 13:46:24 +0800 Subject: [PATCH] [improvement](statistics)Reduce partition column sample BE memory consumption. (#41203) (#41387) backport: https://github.com/apache/doris/pull/41203 --- .../doris/statistics/BaseAnalysisTask.java | 42 ++++++------------- .../statistics/ExternalAnalysisTask.java | 7 +--- .../doris/statistics/OlapAnalysisTask.java | 15 +++---- .../statistics/BaseAnalysisTaskTest.java | 2 +- .../statistics/OlapAnalysisTaskTest.java | 25 ++++++----- .../test_hive_statistics_all_type_p0.groovy | 29 ++++++++++++- 6 files changed, 64 insertions(+), 56 deletions(-) diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/BaseAnalysisTask.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/BaseAnalysisTask.java index 52c8ce932a54b4..a7aaaf2c037c07 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/statistics/BaseAnalysisTask.java +++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/BaseAnalysisTask.java @@ -82,30 +82,6 @@ public abstract class BaseAnalysisTask { + "NOW() " + "FROM `${catalogName}`.`${dbName}`.`${tblName}` ${index} ${sampleHints} ${limit}"; - protected static final String DUJ1_ANALYZE_STRING_TEMPLATE = "SELECT " - + "CONCAT('${tblId}', '-', '${idxId}', '-', '${colId}') AS `id`, " - + "${catalogId} AS `catalog_id`, " - + "${dbId} AS `db_id`, " - + "${tblId} AS `tbl_id`, " - + "${idxId} AS `idx_id`, " - + "'${colId}' AS `col_id`, " - + "NULL AS `part_id`, " - + "${rowCount} AS `row_count`, " - + "${ndvFunction} as `ndv`, " - + "IFNULL(SUM(IF(`t1`.`column_key` IS NULL, `t1`.`count`, 0)), 0) * ${scaleFactor} as `null_count`, " - + "SUBSTRING(CAST(${min} AS STRING), 1, 1024) AS `min`, " - + "SUBSTRING(CAST(${max} AS STRING), 1, 1024) AS `max`, " - + "${dataSizeFunction} * ${scaleFactor} AS `data_size`, " - + "NOW() " - + "FROM ( " - + " SELECT t0.`colValue` as `column_key`, COUNT(1) as `count` " - + " FROM " - + " (SELECT SUBSTRING(CAST(`${colName}` AS STRING), 1, 1024) AS `colValue` " - + " FROM `${catalogName}`.`${dbName}`.`${tblName}` ${index} " - + " ${sampleHints} ${limit}) as `t0` " - + " GROUP BY `t0`.`colValue` " - + ") as `t1` "; - protected static final String DUJ1_ANALYZE_TEMPLATE = "SELECT " + "CONCAT('${tblId}', '-', '${idxId}', '-', '${colId}') AS `id`, " + "${catalogId} AS `catalog_id`, " @@ -122,11 +98,11 @@ public abstract class BaseAnalysisTask { + "${dataSizeFunction} * ${scaleFactor} AS `data_size`, " + "NOW() " + "FROM ( " - + " SELECT t0.`${colName}` as `column_key`, COUNT(1) as `count` " + + " SELECT t0.`colValue` as `column_key`, COUNT(1) as `count`, SUM(`len`) as `column_length` " + " FROM " - + " (SELECT `${colName}` FROM `${catalogName}`.`${dbName}`.`${tblName}` ${index} " - + " ${sampleHints} ${limit}) as `t0` " - + " GROUP BY `t0`.`${colName}` " + + " (SELECT ${subStringColName} AS `colValue`, LENGTH(`${colName}`) as `len` " + + " FROM `${catalogName}`.`${dbName}`.`${tblName}` ${index} ${sampleHints} ${limit}) as `t0` " + + " GROUP BY `t0`.`colValue` " + ") as `t1` "; protected static final String ANALYZE_PARTITION_COLUMN_TEMPLATE = " SELECT " @@ -230,7 +206,7 @@ public long getJobId() { protected String getDataSizeFunction(Column column, boolean useDuj1) { if (useDuj1) { if (column.getType().isStringType()) { - return "SUM(LENGTH(`column_key`) * count)"; + return "SUM(`column_length`)"; } else { return "SUM(t1.count) * " + column.getType().getSlotSize(); } @@ -243,6 +219,14 @@ protected String getDataSizeFunction(Column column, boolean useDuj1) { } } + protected String getStringTypeColName(Column column) { + if (column.getType().isStringType()) { + return "xxhash_64(SUBSTRING(CAST(`${colName}` AS STRING), 1, 1024))"; + } else { + return "`${colName}`"; + } + } + protected String getMinFunction() { if (tableSample == null) { return "CAST(MIN(`${colName}`) as ${type}) "; diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/ExternalAnalysisTask.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/ExternalAnalysisTask.java index c054c7bc5b075e..979ff04f79e68d 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/statistics/ExternalAnalysisTask.java +++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/ExternalAnalysisTask.java @@ -132,11 +132,8 @@ protected void getColumnStats() throws Exception { params.put("ndvFunction", "ROUND(NDV(`${colName}`) * ${scaleFactor})"); params.put("rowCount", "ROUND(count(1) * ${scaleFactor})"); } else { - if (col.getType().isStringType()) { - sb.append(DUJ1_ANALYZE_STRING_TEMPLATE); - } else { - sb.append(DUJ1_ANALYZE_TEMPLATE); - } + sb.append(DUJ1_ANALYZE_TEMPLATE); + params.put("subStringColName", getStringTypeColName(col)); params.put("dataSizeFunction", getDataSizeFunction(col, true)); params.put("ndvFunction", getNdvFunction("ROUND(SUM(t1.count) * ${scaleFactor})")); params.put("rowCount", "ROUND(SUM(t1.count) * ${scaleFactor})"); diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/OlapAnalysisTask.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/OlapAnalysisTask.java index 7f96c52e81b248..5e5d6ac474042f 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/statistics/OlapAnalysisTask.java +++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/OlapAnalysisTask.java @@ -134,8 +134,8 @@ protected void doSample() throws Exception { params.put("colId", StatisticsUtil.escapeSQL(String.valueOf(info.colName))); params.put("dataSizeFunction", getDataSizeFunction(col, false)); params.put("dbName", db.getFullName()); - params.put("colName", StatisticsUtil.escapeColumnName(info.colName)); - params.put("tblName", tbl.getName()); + params.put("colName", StatisticsUtil.escapeColumnName(String.valueOf(info.colName))); + params.put("tblName", String.valueOf(tbl.getName())); params.put("scaleFactor", String.valueOf(scaleFactor)); params.put("sampleHints", tabletStr.isEmpty() ? "" : String.format("TABLET(%s)", tabletStr)); params.put("ndvFunction", getNdvFunction(String.valueOf(totalRowCount))); @@ -168,11 +168,8 @@ protected void doSample() throws Exception { sql = stringSubstitutor.replace(LINEAR_ANALYZE_TEMPLATE); } else { params.put("dataSizeFunction", getDataSizeFunction(col, true)); - if (col.getType().isStringType()) { - sql = stringSubstitutor.replace(DUJ1_ANALYZE_STRING_TEMPLATE); - } else { - sql = stringSubstitutor.replace(DUJ1_ANALYZE_TEMPLATE); - } + params.put("subStringColName", getStringTypeColName(col)); + sql = stringSubstitutor.replace(DUJ1_ANALYZE_TEMPLATE); } LOG.info("Sample for column [{}]. Total rows [{}], rows to sample [{}], scale factor [{}], " + "limited [{}], distribute column [{}], partition column [{}], key column [{}], " @@ -195,8 +192,8 @@ protected ResultRow collectBasicStat(AutoCloseConnectContext context) { long startTime = System.currentTimeMillis(); Map params = new HashMap<>(); params.put("dbName", db.getFullName()); - params.put("colName", StatisticsUtil.escapeColumnName(info.colName)); - params.put("tblName", tbl.getName()); + params.put("colName", StatisticsUtil.escapeColumnName(String.valueOf(info.colName))); + params.put("tblName", String.valueOf(tbl.getName())); params.put("index", getIndex()); StringSubstitutor stringSubstitutor = new StringSubstitutor(params); String sql = stringSubstitutor.replace(BASIC_STATS_TEMPLATE); diff --git a/fe/fe-core/src/test/java/org/apache/doris/statistics/BaseAnalysisTaskTest.java b/fe/fe-core/src/test/java/org/apache/doris/statistics/BaseAnalysisTaskTest.java index 187c4d207dfcad..86ccfff26e0375 100644 --- a/fe/fe-core/src/test/java/org/apache/doris/statistics/BaseAnalysisTaskTest.java +++ b/fe/fe-core/src/test/java/org/apache/doris/statistics/BaseAnalysisTaskTest.java @@ -31,7 +31,7 @@ public void testGetFunctions() { OlapAnalysisTask olapAnalysisTask = new OlapAnalysisTask(); Column column = new Column("string_column", PrimitiveType.STRING); String dataSizeFunction = olapAnalysisTask.getDataSizeFunction(column, true); - Assertions.assertEquals("SUM(LENGTH(`column_key`) * count)", dataSizeFunction); + Assertions.assertEquals("SUM(`column_length`)", dataSizeFunction); dataSizeFunction = olapAnalysisTask.getDataSizeFunction(column, false); Assertions.assertEquals("SUM(LENGTH(`${colName}`))", dataSizeFunction); diff --git a/fe/fe-core/src/test/java/org/apache/doris/statistics/OlapAnalysisTaskTest.java b/fe/fe-core/src/test/java/org/apache/doris/statistics/OlapAnalysisTaskTest.java index 75506b1c85a014..5bb0920e4338b8 100644 --- a/fe/fe-core/src/test/java/org/apache/doris/statistics/OlapAnalysisTaskTest.java +++ b/fe/fe-core/src/test/java/org/apache/doris/statistics/OlapAnalysisTaskTest.java @@ -160,9 +160,11 @@ public void runQuery(String sql) { + "SUBSTRING(CAST('1' AS STRING), 1, 1024) AS `min`," + " SUBSTRING(CAST('2' AS STRING), 1, 1024) AS `max`, " + "SUM(t1.count) * 4 * 5.0 AS `data_size`, NOW() " - + "FROM ( SELECT t0.`${colName}` as `column_key`, COUNT(1) " - + "as `count` FROM (SELECT `${colName}` FROM `catalogName`.`${dbName}`.`${tblName}`" - + " limit 100) as `t0` GROUP BY `t0`.`${colName}` ) as `t1` ", sql); + + "FROM ( SELECT t0.`colValue` as `column_key`, COUNT(1) " + + "as `count`, SUM(`len`) as `column_length` FROM " + + "(SELECT `null` AS `colValue`, LENGTH(`null`) as `len` " + + "FROM `catalogName`.`${dbName}`.`null`" + + " limit 100) as `t0` GROUP BY `t0`.`colValue` ) as `t1` ", sql); return; } }; @@ -232,12 +234,12 @@ public void runQuery(String sql) { + "SELECT CONCAT(30001, '-', -1, '-', 'null') AS `id`, " + "10001 AS `catalog_id`, 20001 AS `db_id`, 30001 AS `tbl_id`, " + "-1 AS `idx_id`, 'null' AS `col_id`, NULL AS `part_id`, " - + "500 AS `row_count`, ROUND(NDV(`${colName}`) * 5.0) as `ndv`, " - + "ROUND(SUM(CASE WHEN `${colName}` IS NULL THEN 1 ELSE 0 END) * 5.0) " + + "500 AS `row_count`, ROUND(NDV(`null`) * 5.0) as `ndv`, " + + "ROUND(SUM(CASE WHEN `null` IS NULL THEN 1 ELSE 0 END) * 5.0) " + "AS `null_count`, SUBSTRING(CAST('1' AS STRING), 1, 1024) AS `min`, " + "SUBSTRING(CAST('2' AS STRING), 1, 1024) AS `max`, " - + "SUM(LENGTH(`${colName}`)) * 5.0 AS `data_size`, NOW() " - + "FROM `catalogName`.`${dbName}`.`${tblName}` limit 100", sql); + + "SUM(LENGTH(`null`)) * 5.0 AS `data_size`, NOW() " + + "FROM `catalogName`.`${dbName}`.`null` limit 100", sql); return; } }; @@ -320,9 +322,12 @@ public void runQuery(String sql) { + "IS NULL, `t1`.`count`, 0)), 0) * 5.0 as `null_count`, " + "SUBSTRING(CAST('1' AS STRING), 1, 1024) AS `min`, " + "SUBSTRING(CAST('2' AS STRING), 1, 1024) AS `max`, " - + "SUM(LENGTH(`column_key`) * count) * 5.0 AS `data_size`, NOW() " - + "FROM ( SELECT t0.`colValue` as `column_key`, COUNT(1) as `count` FROM " - + "(SELECT SUBSTRING(CAST(`${colName}` AS STRING), 1, 1024) AS `colValue` FROM `catalogName`.`${dbName}`.`${tblName}` limit 100) as `t0` GROUP BY `t0`.`colValue` ) as `t1` ", sql); + + "SUM(`column_length`) * 5.0 AS `data_size`, NOW() " + + "FROM ( SELECT t0.`colValue` as `column_key`, COUNT(1) as `count`, SUM(`len`) as " + + "`column_length` FROM (SELECT xxhash_64(SUBSTRING(CAST(`null` AS STRING), 1, 1024)) " + + "AS `colValue`, LENGTH(`null`) as `len`" + + " FROM `catalogName`.`${dbName}`.`null` limit 100) as `t0` " + + "GROUP BY `t0`.`colValue` ) as `t1` ", sql); return; } }; diff --git a/regression-test/suites/external_table_p0/hive/test_hive_statistics_all_type_p0.groovy b/regression-test/suites/external_table_p0/hive/test_hive_statistics_all_type_p0.groovy index 496cab8825a713..7d85f288bd7b94 100644 --- a/regression-test/suites/external_table_p0/hive/test_hive_statistics_all_type_p0.groovy +++ b/regression-test/suites/external_table_p0/hive/test_hive_statistics_all_type_p0.groovy @@ -18,7 +18,7 @@ suite("test_hive_statistics_all_type_p0", "all_types,p0,external,hive,external_docker,external_docker_hive") { String enabled = context.config.otherConfigs.get("enableHiveTest") if (enabled == null || !enabled.equalsIgnoreCase("true")) { - logger.info("diable Hive test.") + logger.info("disable Hive test.") return; } @@ -34,10 +34,35 @@ suite("test_hive_statistics_all_type_p0", "all_types,p0,external,hive,external_d 'hive.metastore.uris' = 'thrift://${externalEnvIp}:${hms_port}' );""" sql """use `${catalog_name}`.`default`""" - sql """analyze table orc_all_types with sync""" + sql """analyze table orc_all_types with sync with sample rows 4000000""" def result = sql """show column stats orc_all_types;""" assertEquals(16, result.size()) + result = sql """show column stats orc_all_types (int_col);""" + assertEquals("int_col", result[0][0]) + assertEquals("3600.0", result[0][2]) + assertEquals("3240.0", result[0][3]) + assertEquals("361.0", result[0][4]) + assertEquals("14400.0", result[0][5]) + + result = sql """show column stats orc_all_types (string_col);""" + assertEquals("string_col", result[0][0]) + assertEquals("3600.0", result[0][2]) + assertEquals("3254.0", result[0][3]) + assertEquals("347.0", result[0][4]) + assertEquals("453634.0", result[0][5]) + + result = sql """show column stats orc_all_types (varchar_col);""" + assertEquals("varchar_col", result[0][0]) + assertEquals("3600.0", result[0][2]) + assertEquals("6.0", result[0][3]) + assertEquals("0.0", result[0][4]) + assertEquals("35950.0", result[0][5]) + + sql """drop stats orc_all_types""" + sql """analyze table orc_all_types with sync""" + result = sql """show column stats orc_all_types;""" + assertEquals(16, result.size()) result = sql """show column stats orc_all_types (int_col);""" assertEquals("int_col", result[0][0]) assertEquals("3600.0", result[0][2])