Skip to content

Commit

Permalink
[improvement](statistics) Reduce partition column sample BE memory consumption. (apache#41203) (apache#41387)
Browse files Browse the repository at this point in the history

backport: apache#41203
  • Loading branch information
Jibing-Li authored Sep 27, 2024
1 parent e0c9cbd commit 1baf0db
Show file tree
Hide file tree
Showing 6 changed files with 64 additions and 56 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -82,30 +82,6 @@ public abstract class BaseAnalysisTask {
+ "NOW() "
+ "FROM `${catalogName}`.`${dbName}`.`${tblName}` ${index} ${sampleHints} ${limit}";

protected static final String DUJ1_ANALYZE_STRING_TEMPLATE = "SELECT "
+ "CONCAT('${tblId}', '-', '${idxId}', '-', '${colId}') AS `id`, "
+ "${catalogId} AS `catalog_id`, "
+ "${dbId} AS `db_id`, "
+ "${tblId} AS `tbl_id`, "
+ "${idxId} AS `idx_id`, "
+ "'${colId}' AS `col_id`, "
+ "NULL AS `part_id`, "
+ "${rowCount} AS `row_count`, "
+ "${ndvFunction} as `ndv`, "
+ "IFNULL(SUM(IF(`t1`.`column_key` IS NULL, `t1`.`count`, 0)), 0) * ${scaleFactor} as `null_count`, "
+ "SUBSTRING(CAST(${min} AS STRING), 1, 1024) AS `min`, "
+ "SUBSTRING(CAST(${max} AS STRING), 1, 1024) AS `max`, "
+ "${dataSizeFunction} * ${scaleFactor} AS `data_size`, "
+ "NOW() "
+ "FROM ( "
+ " SELECT t0.`colValue` as `column_key`, COUNT(1) as `count` "
+ " FROM "
+ " (SELECT SUBSTRING(CAST(`${colName}` AS STRING), 1, 1024) AS `colValue` "
+ " FROM `${catalogName}`.`${dbName}`.`${tblName}` ${index} "
+ " ${sampleHints} ${limit}) as `t0` "
+ " GROUP BY `t0`.`colValue` "
+ ") as `t1` ";

protected static final String DUJ1_ANALYZE_TEMPLATE = "SELECT "
+ "CONCAT('${tblId}', '-', '${idxId}', '-', '${colId}') AS `id`, "
+ "${catalogId} AS `catalog_id`, "
Expand All @@ -122,11 +98,11 @@ public abstract class BaseAnalysisTask {
+ "${dataSizeFunction} * ${scaleFactor} AS `data_size`, "
+ "NOW() "
+ "FROM ( "
+ " SELECT t0.`${colName}` as `column_key`, COUNT(1) as `count` "
+ " SELECT t0.`colValue` as `column_key`, COUNT(1) as `count`, SUM(`len`) as `column_length` "
+ " FROM "
+ " (SELECT `${colName}` FROM `${catalogName}`.`${dbName}`.`${tblName}` ${index} "
+ " ${sampleHints} ${limit}) as `t0` "
+ " GROUP BY `t0`.`${colName}` "
+ " (SELECT ${subStringColName} AS `colValue`, LENGTH(`${colName}`) as `len` "
+ " FROM `${catalogName}`.`${dbName}`.`${tblName}` ${index} ${sampleHints} ${limit}) as `t0` "
+ " GROUP BY `t0`.`colValue` "
+ ") as `t1` ";

protected static final String ANALYZE_PARTITION_COLUMN_TEMPLATE = " SELECT "
Expand Down Expand Up @@ -230,7 +206,7 @@ public long getJobId() {
protected String getDataSizeFunction(Column column, boolean useDuj1) {
if (useDuj1) {
if (column.getType().isStringType()) {
return "SUM(LENGTH(`column_key`) * count)";
return "SUM(`column_length`)";
} else {
return "SUM(t1.count) * " + column.getType().getSlotSize();
}
Expand All @@ -243,6 +219,14 @@ protected String getDataSizeFunction(Column column, boolean useDuj1) {
}
}

/**
 * Builds the SQL expression used as the sample group-by key for the given column
 * in the DUJ1 analyze template (bound to the {@code subStringColName} placeholder).
 * String-type columns are truncated to 1024 chars and hashed with xxhash_64 so the
 * BE aggregates over fixed-size hashes rather than raw string values; all other
 * types use the column reference unchanged.
 *
 * @param column column being analyzed
 * @return SQL snippet with the {@code ${colName}} placeholder left unresolved
 */
protected String getStringTypeColName(Column column) {
    return column.getType().isStringType()
            ? "xxhash_64(SUBSTRING(CAST(`${colName}` AS STRING), 1, 1024))"
            : "`${colName}`";
}

protected String getMinFunction() {
if (tableSample == null) {
return "CAST(MIN(`${colName}`) as ${type}) ";
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -132,11 +132,8 @@ protected void getColumnStats() throws Exception {
params.put("ndvFunction", "ROUND(NDV(`${colName}`) * ${scaleFactor})");
params.put("rowCount", "ROUND(count(1) * ${scaleFactor})");
} else {
if (col.getType().isStringType()) {
sb.append(DUJ1_ANALYZE_STRING_TEMPLATE);
} else {
sb.append(DUJ1_ANALYZE_TEMPLATE);
}
sb.append(DUJ1_ANALYZE_TEMPLATE);
params.put("subStringColName", getStringTypeColName(col));
params.put("dataSizeFunction", getDataSizeFunction(col, true));
params.put("ndvFunction", getNdvFunction("ROUND(SUM(t1.count) * ${scaleFactor})"));
params.put("rowCount", "ROUND(SUM(t1.count) * ${scaleFactor})");
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -134,8 +134,8 @@ protected void doSample() throws Exception {
params.put("colId", StatisticsUtil.escapeSQL(String.valueOf(info.colName)));
params.put("dataSizeFunction", getDataSizeFunction(col, false));
params.put("dbName", db.getFullName());
params.put("colName", StatisticsUtil.escapeColumnName(info.colName));
params.put("tblName", tbl.getName());
params.put("colName", StatisticsUtil.escapeColumnName(String.valueOf(info.colName)));
params.put("tblName", String.valueOf(tbl.getName()));
params.put("scaleFactor", String.valueOf(scaleFactor));
params.put("sampleHints", tabletStr.isEmpty() ? "" : String.format("TABLET(%s)", tabletStr));
params.put("ndvFunction", getNdvFunction(String.valueOf(totalRowCount)));
Expand Down Expand Up @@ -168,11 +168,8 @@ protected void doSample() throws Exception {
sql = stringSubstitutor.replace(LINEAR_ANALYZE_TEMPLATE);
} else {
params.put("dataSizeFunction", getDataSizeFunction(col, true));
if (col.getType().isStringType()) {
sql = stringSubstitutor.replace(DUJ1_ANALYZE_STRING_TEMPLATE);
} else {
sql = stringSubstitutor.replace(DUJ1_ANALYZE_TEMPLATE);
}
params.put("subStringColName", getStringTypeColName(col));
sql = stringSubstitutor.replace(DUJ1_ANALYZE_TEMPLATE);
}
LOG.info("Sample for column [{}]. Total rows [{}], rows to sample [{}], scale factor [{}], "
+ "limited [{}], distribute column [{}], partition column [{}], key column [{}], "
Expand All @@ -195,8 +192,8 @@ protected ResultRow collectBasicStat(AutoCloseConnectContext context) {
long startTime = System.currentTimeMillis();
Map<String, String> params = new HashMap<>();
params.put("dbName", db.getFullName());
params.put("colName", StatisticsUtil.escapeColumnName(info.colName));
params.put("tblName", tbl.getName());
params.put("colName", StatisticsUtil.escapeColumnName(String.valueOf(info.colName)));
params.put("tblName", String.valueOf(tbl.getName()));
params.put("index", getIndex());
StringSubstitutor stringSubstitutor = new StringSubstitutor(params);
String sql = stringSubstitutor.replace(BASIC_STATS_TEMPLATE);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ public void testGetFunctions() {
OlapAnalysisTask olapAnalysisTask = new OlapAnalysisTask();
Column column = new Column("string_column", PrimitiveType.STRING);
String dataSizeFunction = olapAnalysisTask.getDataSizeFunction(column, true);
Assertions.assertEquals("SUM(LENGTH(`column_key`) * count)", dataSizeFunction);
Assertions.assertEquals("SUM(`column_length`)", dataSizeFunction);
dataSizeFunction = olapAnalysisTask.getDataSizeFunction(column, false);
Assertions.assertEquals("SUM(LENGTH(`${colName}`))", dataSizeFunction);

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -160,9 +160,11 @@ public void runQuery(String sql) {
+ "SUBSTRING(CAST('1' AS STRING), 1, 1024) AS `min`,"
+ " SUBSTRING(CAST('2' AS STRING), 1, 1024) AS `max`, "
+ "SUM(t1.count) * 4 * 5.0 AS `data_size`, NOW() "
+ "FROM ( SELECT t0.`${colName}` as `column_key`, COUNT(1) "
+ "as `count` FROM (SELECT `${colName}` FROM `catalogName`.`${dbName}`.`${tblName}`"
+ " limit 100) as `t0` GROUP BY `t0`.`${colName}` ) as `t1` ", sql);
+ "FROM ( SELECT t0.`colValue` as `column_key`, COUNT(1) "
+ "as `count`, SUM(`len`) as `column_length` FROM "
+ "(SELECT `null` AS `colValue`, LENGTH(`null`) as `len` "
+ "FROM `catalogName`.`${dbName}`.`null`"
+ " limit 100) as `t0` GROUP BY `t0`.`colValue` ) as `t1` ", sql);
return;
}
};
Expand Down Expand Up @@ -232,12 +234,12 @@ public void runQuery(String sql) {
+ "SELECT CONCAT(30001, '-', -1, '-', 'null') AS `id`, "
+ "10001 AS `catalog_id`, 20001 AS `db_id`, 30001 AS `tbl_id`, "
+ "-1 AS `idx_id`, 'null' AS `col_id`, NULL AS `part_id`, "
+ "500 AS `row_count`, ROUND(NDV(`${colName}`) * 5.0) as `ndv`, "
+ "ROUND(SUM(CASE WHEN `${colName}` IS NULL THEN 1 ELSE 0 END) * 5.0) "
+ "500 AS `row_count`, ROUND(NDV(`null`) * 5.0) as `ndv`, "
+ "ROUND(SUM(CASE WHEN `null` IS NULL THEN 1 ELSE 0 END) * 5.0) "
+ "AS `null_count`, SUBSTRING(CAST('1' AS STRING), 1, 1024) AS `min`, "
+ "SUBSTRING(CAST('2' AS STRING), 1, 1024) AS `max`, "
+ "SUM(LENGTH(`${colName}`)) * 5.0 AS `data_size`, NOW() "
+ "FROM `catalogName`.`${dbName}`.`${tblName}` limit 100", sql);
+ "SUM(LENGTH(`null`)) * 5.0 AS `data_size`, NOW() "
+ "FROM `catalogName`.`${dbName}`.`null` limit 100", sql);
return;
}
};
Expand Down Expand Up @@ -320,9 +322,12 @@ public void runQuery(String sql) {
+ "IS NULL, `t1`.`count`, 0)), 0) * 5.0 as `null_count`, "
+ "SUBSTRING(CAST('1' AS STRING), 1, 1024) AS `min`, "
+ "SUBSTRING(CAST('2' AS STRING), 1, 1024) AS `max`, "
+ "SUM(LENGTH(`column_key`) * count) * 5.0 AS `data_size`, NOW() "
+ "FROM ( SELECT t0.`colValue` as `column_key`, COUNT(1) as `count` FROM "
+ "(SELECT SUBSTRING(CAST(`${colName}` AS STRING), 1, 1024) AS `colValue` FROM `catalogName`.`${dbName}`.`${tblName}` limit 100) as `t0` GROUP BY `t0`.`colValue` ) as `t1` ", sql);
+ "SUM(`column_length`) * 5.0 AS `data_size`, NOW() "
+ "FROM ( SELECT t0.`colValue` as `column_key`, COUNT(1) as `count`, SUM(`len`) as "
+ "`column_length` FROM (SELECT xxhash_64(SUBSTRING(CAST(`null` AS STRING), 1, 1024)) "
+ "AS `colValue`, LENGTH(`null`) as `len`"
+ " FROM `catalogName`.`${dbName}`.`null` limit 100) as `t0` "
+ "GROUP BY `t0`.`colValue` ) as `t1` ", sql);
return;
}
};
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
suite("test_hive_statistics_all_type_p0", "all_types,p0,external,hive,external_docker,external_docker_hive") {
String enabled = context.config.otherConfigs.get("enableHiveTest")
if (enabled == null || !enabled.equalsIgnoreCase("true")) {
logger.info("diable Hive test.")
logger.info("disable Hive test.")
return;
}

Expand All @@ -34,10 +34,35 @@ suite("test_hive_statistics_all_type_p0", "all_types,p0,external,hive,external_d
'hive.metastore.uris' = 'thrift://${externalEnvIp}:${hms_port}'
);"""
sql """use `${catalog_name}`.`default`"""
sql """analyze table orc_all_types with sync"""
sql """analyze table orc_all_types with sync with sample rows 4000000"""
def result = sql """show column stats orc_all_types;"""
assertEquals(16, result.size())

result = sql """show column stats orc_all_types (int_col);"""
assertEquals("int_col", result[0][0])
assertEquals("3600.0", result[0][2])
assertEquals("3240.0", result[0][3])
assertEquals("361.0", result[0][4])
assertEquals("14400.0", result[0][5])

result = sql """show column stats orc_all_types (string_col);"""
assertEquals("string_col", result[0][0])
assertEquals("3600.0", result[0][2])
assertEquals("3254.0", result[0][3])
assertEquals("347.0", result[0][4])
assertEquals("453634.0", result[0][5])

result = sql """show column stats orc_all_types (varchar_col);"""
assertEquals("varchar_col", result[0][0])
assertEquals("3600.0", result[0][2])
assertEquals("6.0", result[0][3])
assertEquals("0.0", result[0][4])
assertEquals("35950.0", result[0][5])

sql """drop stats orc_all_types"""
sql """analyze table orc_all_types with sync"""
result = sql """show column stats orc_all_types;"""
assertEquals(16, result.size())
result = sql """show column stats orc_all_types (int_col);"""
assertEquals("int_col", result[0][0])
assertEquals("3600.0", result[0][2])
Expand Down

0 comments on commit 1baf0db

Please sign in to comment.