diff --git a/docs/en/docs/query-acceleration/statistics.md b/docs/en/docs/query-acceleration/statistics.md index 4d37d278eef4f9..9cf75cdd5ec329 100644 --- a/docs/en/docs/query-acceleration/statistics.md +++ b/docs/en/docs/query-acceleration/statistics.md @@ -69,7 +69,7 @@ Column statistics collection syntax: ```SQL ANALYZE < TABLE | DATABASE table_name | db_name > - [ PARTITIONS (partition_name [, ...]) ] + [ PARTITIONS [(*) | (partition_name [, ...]) | WITH RECENT COUNT] ] [ (column_name [, ...]) ] [ [ WITH SYNC ] [ WITH INCREMENTAL ] [ WITH SAMPLE PERCENT | ROWS ] [ WITH PERIOD ]] [ PROPERTIES ("key" = "value", ...) ]; @@ -78,7 +78,7 @@ ANALYZE < TABLE | DATABASE table_name | db_name > Explanation: - Table_name: The target table for the specified. It can be a `db_name.table_name` form. -- partition_name: The specified target partitions(for hive external table only)。Must be partitions exist in `table_name`. Multiple partition names are separated by commas. e.g. (nation=US/city=Washington) +- partition_name: The specified target partitions (for Hive external tables only). Must be partitions that exist in `table_name`. Multiple partition names are separated by commas. e.g. for a single-level partition: PARTITIONS(`event_date=20230706`), for a multi-level partition: PARTITIONS(`nation=US/city=Washington`). PARTITIONS(*) specifies all partitions, and PARTITIONS WITH RECENT 30 specifies the latest 30 partitions. - Column_name: The specified target column. Must be `table_name` a column that exists in. Multiple column names are separated by commas. - Sync: Synchronizes the collection of statistics. Return after collection. If not specified, it will be executed asynchronously and the job ID will be returned. - Incremental: Incrementally gather statistics. 
@@ -673,4 +673,4 @@ When executing ANALYZE, statistical data is written to the internal table __inte ### ANALYZE Failure for Large Tables -Due to the strict resource limitations for ANALYZE, the ANALYZE operation for large tables might experience timeouts or exceed the memory limit of BE. In such cases, it's recommended to use ANALYZE ... WITH SAMPLE.... Additionally, for scenarios involving dynamic partitioned tables, it's highly recommended to use ANALYZE ... WITH INCREMENTAL.... This statement processes only the partitions with updated data incrementally, avoiding redundant computations and improving efficiency. \ No newline at end of file +Due to the strict resource limitations for ANALYZE, the ANALYZE operation for large tables might experience timeouts or exceed the memory limit of BE. In such cases, it's recommended to use ANALYZE ... WITH SAMPLE.... Additionally, for scenarios involving dynamic partitioned tables, it's highly recommended to use ANALYZE ... WITH INCREMENTAL.... This statement processes only the partitions with updated data incrementally, avoiding redundant computations and improving efficiency. diff --git a/docs/zh-CN/docs/query-acceleration/statistics.md b/docs/zh-CN/docs/query-acceleration/statistics.md index ad14c1c9504df0..784dc0def7c928 100644 --- a/docs/zh-CN/docs/query-acceleration/statistics.md +++ b/docs/zh-CN/docs/query-acceleration/statistics.md @@ -66,7 +66,7 @@ Doris 查询优化器使用统计信息来确定查询最有效的执行计划 ```SQL ANALYZE < TABLE | DATABASE table_name | db_name > - [ PARTITIONS (partition_name [, ...]) ] + [ PARTITIONS [(*) | (partition_name [, ...]) | WITH RECENT COUNT] ] [ (column_name [, ...]) ] [ [ WITH SYNC ] [WITH INCREMENTAL] [ WITH SAMPLE PERCENT | ROWS ] [ WITH PERIOD ] ] [ PROPERTIES ("key" = "value", ...) 
]; @@ -75,7 +75,7 @@ ANALYZE < TABLE | DATABASE table_name | db_name > 其中: - table_name: 指定的的目标表。可以是  `db_name.table_name`  形式。 -- partition_name: 指定的目标分区(目前只针对Hive外表)。必须是  `table_name`  中存在的分区,多个列名称用逗号分隔。分区名样例:event_date=20230706, nation=CN/city=Beijing +- partition_name: 指定的目标分区(目前只针对Hive外表)。必须是  `table_name`  中存在的分区,多个分区名称用逗号分隔。分区名样例: 单层分区PARTITIONS(`event_date=20230706`),多层分区PARTITIONS(`nation=CN/city=Beijing`)。PARTITIONS (*)指定所有分区,PARTITIONS WITH RECENT 100指定最新的100个分区。 - column_name: 指定的目标列。必须是  `table_name`  中存在的列,多个列名称用逗号分隔。 - sync:同步收集统计信息。收集完后返回。若不指定则异步执行并返回任务 ID。 - period:周期性收集统计信息。单位为秒,指定后会定期收集相应的统计信息。 diff --git a/fe/fe-core/src/main/cup/sql_parser.cup b/fe/fe-core/src/main/cup/sql_parser.cup index f0e83f09646f12..baec3cd18d00b4 100644 --- a/fe/fe-core/src/main/cup/sql_parser.cup +++ b/fe/fe-core/src/main/cup/sql_parser.cup @@ -528,6 +528,7 @@ terminal String KW_QUOTA, KW_RANDOM, KW_RANGE, + KW_RECENT, KW_READ, KW_REBALANCE, KW_RECOVER, @@ -5900,6 +5901,14 @@ partition_names ::= {: RESULT = new PartitionNames(true, Lists.newArrayList(partName)); :} + | KW_PARTITIONS LPAREN STAR RPAREN + {: + RESULT = new PartitionNames(true); + :} + | KW_PARTITIONS KW_WITH KW_RECENT INTEGER_LITERAL:count + {: + RESULT = new PartitionNames(count); + :} ; opt_table_sample ::= diff --git a/fe/fe-core/src/main/java/org/apache/doris/analysis/AnalyzeTblStmt.java b/fe/fe-core/src/main/java/org/apache/doris/analysis/AnalyzeTblStmt.java index ed5dda22498a36..874b83b280d886 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/analysis/AnalyzeTblStmt.java +++ b/fe/fe-core/src/main/java/org/apache/doris/analysis/AnalyzeTblStmt.java @@ -84,7 +84,7 @@ public class AnalyzeTblStmt extends AnalyzeStmt { private final TableName tableName; private List columnNames; - private List partitionNames; + private PartitionNames partitionNames; private boolean isAllColumns; // after analyzed @@ -97,7 +97,7 @@ public AnalyzeTblStmt(TableName tableName, AnalyzeProperties properties) { 
super(properties); this.tableName = tableName; - this.partitionNames = partitionNames == null ? null : partitionNames.getPartitionNames(); + this.partitionNames = partitionNames; this.columnNames = columnNames; this.analyzeProperties = properties; this.isAllColumns = columnNames == null; @@ -240,14 +240,34 @@ public Set getColumnNames() { } public Set getPartitionNames() { - Set partitions = partitionNames == null ? table.getPartitionNames() : Sets.newHashSet(partitionNames); - if (isSamplingPartition()) { - int partNum = ConnectContext.get().getSessionVariable().getExternalTableAnalyzePartNum(); - partitions = partitions.stream().limit(partNum).collect(Collectors.toSet()); + if (partitionNames == null || partitionNames.getPartitionNames() == null) { + return null; } + Set partitions = Sets.newHashSet(); + partitions.addAll(partitionNames.getPartitionNames()); + /* + if (isSamplingPartition()) { + int partNum = ConnectContext.get().getSessionVariable().getExternalTableAnalyzePartNum(); + partitions = partitions.stream().limit(partNum).collect(Collectors.toSet()); + } + */ return partitions; } + public boolean isAllPartitions() { + if (partitionNames == null) { + return false; + } + return partitionNames.isAllPartitions(); + } + + public long getPartitionCount() { + if (partitionNames == null) { + return 0; + } + return partitionNames.getCount(); + } + public boolean isPartitionOnly() { return partitionNames != null; } diff --git a/fe/fe-core/src/main/java/org/apache/doris/analysis/PartitionNames.java b/fe/fe-core/src/main/java/org/apache/doris/analysis/PartitionNames.java index 1140dfc6777641..ca26a2978e0e54 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/analysis/PartitionNames.java +++ b/fe/fe-core/src/main/java/org/apache/doris/analysis/PartitionNames.java @@ -48,15 +48,37 @@ public class PartitionNames implements ParseNode, Writable { // true if these partitions are temp partitions @SerializedName(value = "isTemp") private final boolean isTemp; + private 
final boolean allPartitions; + private final long count; + // Default partition count to collect statistic for external table. + private static final long DEFAULT_PARTITION_COUNT = 100; public PartitionNames(boolean isTemp, List partitionNames) { this.partitionNames = partitionNames; this.isTemp = isTemp; + this.allPartitions = false; + this.count = 0; } public PartitionNames(PartitionNames other) { this.partitionNames = Lists.newArrayList(other.partitionNames); this.isTemp = other.isTemp; + this.allPartitions = other.allPartitions; + this.count = 0; + } + + public PartitionNames(boolean allPartitions) { + this.partitionNames = null; + this.isTemp = false; + this.allPartitions = allPartitions; + this.count = 0; + } + + public PartitionNames(long partitionCount) { + this.partitionNames = null; + this.isTemp = false; + this.allPartitions = false; + this.count = partitionCount; } public List getPartitionNames() { @@ -67,9 +89,23 @@ public boolean isTemp() { return isTemp; } + public boolean isAllPartitions() { + return allPartitions; + } + + public long getCount() { + return count; + } + @Override public void analyze(Analyzer analyzer) throws AnalysisException { - if (partitionNames.isEmpty()) { + if (allPartitions && count > 0) { + throw new AnalysisException("All partition and partition count couldn't be set at the same time."); + } + if (allPartitions || count > 0) { + return; + } + if (partitionNames == null || partitionNames.isEmpty()) { throw new AnalysisException("No partition specified in partition lists"); } // check if partition name is not empty string diff --git a/fe/fe-core/src/main/java/org/apache/doris/catalog/external/HMSExternalTable.java b/fe/fe-core/src/main/java/org/apache/doris/catalog/external/HMSExternalTable.java index c1de1ea98d7555..d691c0c6e5f4c7 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/catalog/external/HMSExternalTable.java +++ b/fe/fe-core/src/main/java/org/apache/doris/catalog/external/HMSExternalTable.java @@ -57,6 +57,7 @@ 
import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; +import java.io.IOException; import java.math.BigDecimal; import java.math.BigInteger; import java.time.LocalDate; @@ -628,6 +629,12 @@ private void setStatData(Column col, ColumnStatisticsData data, ColumnStatisticB builder.setMaxValue(Double.MAX_VALUE); } } + + @Override + public void gsonPostProcess() throws IOException { + super.gsonPostProcess(); + estimatedRowCount = -1; + } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/AnalysisInfo.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/AnalysisInfo.java index 00b8c7cdaaea50..ec59c61fffe4d0 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/statistics/AnalysisInfo.java +++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/AnalysisInfo.java @@ -155,13 +155,19 @@ public enum ScheduleType { // True means this task is a table level task for external table. // This kind of task is mainly to collect the number of rows of a table. 
@SerializedName("externalTableLevelTask") - public boolean externalTableLevelTask; + public final boolean externalTableLevelTask; @SerializedName("partitionOnly") - public boolean partitionOnly; + public final boolean partitionOnly; @SerializedName("samplingPartition") - public boolean samplingPartition; + public final boolean samplingPartition; + + @SerializedName("isAllPartition") + public final boolean isAllPartition; + + @SerializedName("partitionCount") + public final long partitionCount; // For serialize @SerializedName("cronExpr") @@ -181,7 +187,7 @@ public AnalysisInfo(long jobId, long taskId, List taskIds, String catalogN int samplePercent, int sampleRows, int maxBucketNum, long periodTimeInMs, String message, long lastExecTimeInMs, long timeCostInMs, AnalysisState state, ScheduleType scheduleType, boolean isExternalTableLevelTask, boolean partitionOnly, boolean samplingPartition, - CronExpression cronExpression, boolean forceFull) { + boolean isAllPartition, long partitionCount, CronExpression cronExpression, boolean forceFull) { this.jobId = jobId; this.taskId = taskId; this.taskIds = taskIds; @@ -208,6 +214,8 @@ public AnalysisInfo(long jobId, long taskId, List taskIds, String catalogN this.externalTableLevelTask = isExternalTableLevelTask; this.partitionOnly = partitionOnly; this.samplingPartition = samplingPartition; + this.isAllPartition = isAllPartition; + this.partitionCount = partitionCount; this.cronExpression = cronExpression; if (cronExpression != null) { this.cronExprStr = cronExpression.getCronExpression(); diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/AnalysisInfoBuilder.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/AnalysisInfoBuilder.java index 0c296ace91da14..c17bbc69d91c6d 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/statistics/AnalysisInfoBuilder.java +++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/AnalysisInfoBuilder.java @@ -56,6 +56,8 @@ public class AnalysisInfoBuilder { private 
boolean externalTableLevelTask; private boolean partitionOnly; private boolean samplingPartition; + private boolean isAllPartition; + private long partitionCount; private CronExpression cronExpression; @@ -91,6 +93,8 @@ public AnalysisInfoBuilder(AnalysisInfo info) { externalTableLevelTask = info.externalTableLevelTask; partitionOnly = info.partitionOnly; samplingPartition = info.samplingPartition; + isAllPartition = info.isAllPartition; + partitionCount = info.partitionCount; cronExpression = info.cronExpression; forceFull = info.forceFull; } @@ -225,6 +229,16 @@ public AnalysisInfoBuilder setSamplingPartition(boolean samplingPartition) { return this; } + public AnalysisInfoBuilder setAllPartition(boolean isAllPartition) { + this.isAllPartition = isAllPartition; + return this; + } + + public AnalysisInfoBuilder setPartitionCount(long partitionCount) { + this.partitionCount = partitionCount; + return this; + } + public void setCronExpression(CronExpression cronExpression) { this.cronExpression = cronExpression; } @@ -237,6 +251,38 @@ public AnalysisInfo build() { return new AnalysisInfo(jobId, taskId, taskIds, catalogName, dbName, tblName, colToPartitions, partitionNames, colName, indexId, jobType, analysisMode, analysisMethod, analysisType, samplePercent, sampleRows, maxBucketNum, periodTimeInMs, message, lastExecTimeInMs, timeCostInMs, state, scheduleType, - externalTableLevelTask, partitionOnly, samplingPartition, cronExpression, forceFull); + externalTableLevelTask, partitionOnly, samplingPartition, isAllPartition, partitionCount, + cronExpression, forceFull); + } + + public AnalysisInfoBuilder copy() { + return new AnalysisInfoBuilder() + .setJobId(jobId) + .setTaskId(taskId) + .setTaskIds(taskIds) + .setCatalogName(catalogName) + .setDbName(dbName) + .setTblName(tblName) + .setColToPartitions(colToPartitions) + .setColName(colName) + .setIndexId(indexId) + .setJobType(jobType) + .setAnalysisMode(analysisMode) + .setAnalysisMethod(analysisMethod) + 
.setAnalysisType(analysisType) + .setSamplePercent(samplePercent) + .setSampleRows(sampleRows) + .setPeriodTimeInMs(periodTimeInMs) + .setMaxBucketNum(maxBucketNum) + .setMessage(message) + .setLastExecTimeInMs(lastExecTimeInMs) + .setTimeCostInMs(timeCostInMs) + .setState(state) + .setScheduleType(scheduleType) + .setExternalTableLevelTask(externalTableLevelTask) + .setSamplingPartition(samplingPartition) + .setPartitionOnly(partitionOnly) + .setAllPartition(isAllPartition) + .setPartitionCount(partitionCount); } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/AnalysisManager.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/AnalysisManager.java index 131f650e4dbabd..83c3cc84e494bd 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/statistics/AnalysisManager.java +++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/AnalysisManager.java @@ -363,7 +363,7 @@ protected AnalysisInfo buildAndAssignJob(AnalyzeTblStmt stmt) throws DdlExceptio boolean isSync = stmt.isSync(); Map analysisTaskInfos = new HashMap<>(); createTaskForEachColumns(jobInfo, analysisTaskInfos, isSync); - if (stmt.isAllColumns() + if (!jobInfo.partitionOnly && stmt.isAllColumns() && StatisticsUtil.isExternalTable(jobInfo.catalogName, jobInfo.dbName, jobInfo.tblName)) { createTableLevelTaskForExternalTable(jobInfo, analysisTaskInfos, isSync); } @@ -453,7 +453,7 @@ private Map> validateAndGetPartitions(TableIf table, Set> existColAndPartsForStats = StatisticsRepository + Map> existColAndPartsForStats = StatisticsRepository .fetchColAndPartsForStats(tableId); if (existColAndPartsForStats.isEmpty()) { @@ -461,12 +461,12 @@ private Map> validateAndGetPartitions(TableIf table, Set existPartIdsForStats = new HashSet<>(); + Set existPartIdsForStats = new HashSet<>(); existColAndPartsForStats.values().forEach(existPartIdsForStats::addAll); - Map idToPartition = StatisticsUtil.getPartitionIdToName(table); + Set idToPartition = StatisticsUtil.getPartitionIds(table); // 
Get an invalid set of partitions (those partitions were deleted) - Set invalidPartIds = existPartIdsForStats.stream() - .filter(id -> !idToPartition.containsKey(id)).collect(Collectors.toSet()); + Set invalidPartIds = existPartIdsForStats.stream() + .filter(id -> !idToPartition.contains(id)).collect(Collectors.toSet()); if (!invalidPartIds.isEmpty()) { // Delete invalid partition statistics to avoid affecting table statistics @@ -496,6 +496,8 @@ public AnalysisInfo buildAnalysisJobInfo(AnalyzeTblStmt stmt) throws DdlExceptio Set partitionNames = stmt.getPartitionNames(); boolean partitionOnly = stmt.isPartitionOnly(); boolean isSamplingPartition = stmt.isSamplingPartition(); + boolean isAllPartition = stmt.isAllPartitions(); + long partitionCount = stmt.getPartitionCount(); int samplePercent = stmt.getSamplePercent(); int sampleRows = stmt.getSampleRows(); AnalysisType analysisType = stmt.getAnalysisType(); @@ -516,6 +518,8 @@ public AnalysisInfo buildAnalysisJobInfo(AnalyzeTblStmt stmt) throws DdlExceptio infoBuilder.setPartitionNames(partitionNames); infoBuilder.setPartitionOnly(partitionOnly); infoBuilder.setSamplingPartition(isSamplingPartition); + infoBuilder.setAllPartition(isAllPartition); + infoBuilder.setPartitionCount(partitionCount); infoBuilder.setJobType(JobType.MANUAL); infoBuilder.setState(AnalysisState.PENDING); infoBuilder.setLastExecTimeInMs(System.currentTimeMillis()); diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/ColumnStatistic.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/ColumnStatistic.java index 84f8d5bfbfe0fc..67bc308bcaa833 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/statistics/ColumnStatistic.java +++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/ColumnStatistic.java @@ -93,7 +93,7 @@ public class ColumnStatistic { public final Histogram histogram; @SerializedName("partitionIdToColStats") - public final Map partitionIdToColStats = new HashMap<>(); + public final Map 
partitionIdToColStats = new HashMap<>(); public final String updatedTime; @@ -120,7 +120,7 @@ public ColumnStatistic(double count, double ndv, ColumnStatistic original, doubl } public static ColumnStatistic fromResultRow(List resultRows) { - Map partitionIdToColStats = new HashMap<>(); + Map partitionIdToColStats = new HashMap<>(); ColumnStatistic columnStatistic = null; try { for (ResultRow resultRow : resultRows) { @@ -128,7 +128,7 @@ public static ColumnStatistic fromResultRow(List resultRows) { if (partId == null) { columnStatistic = fromResultRow(resultRow); } else { - partitionIdToColStats.put(Long.parseLong(partId), fromResultRow(resultRow)); + partitionIdToColStats.put(partId, fromResultRow(resultRow)); } } } catch (Throwable t) { @@ -392,7 +392,7 @@ public boolean isUnKnown() { return isUnKnown; } - public void putPartStats(long partId, ColumnStatistic columnStatistic) { + public void putPartStats(String partId, ColumnStatistic columnStatistic) { this.partitionIdToColStats.put(partId, columnStatistic); } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/ColumnStatisticBuilder.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/ColumnStatisticBuilder.java index 4a3af054df7382..fa4cf7ebc99cb4 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/statistics/ColumnStatisticBuilder.java +++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/ColumnStatisticBuilder.java @@ -40,7 +40,7 @@ public class ColumnStatisticBuilder { private ColumnStatistic original; - private Map partitionIdToColStats = new HashMap<>(); + private Map partitionIdToColStats = new HashMap<>(); private String updatedTime; diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/HMSAnalysisTask.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/HMSAnalysisTask.java index 512aa9982ffed5..973e5e76aa010a 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/statistics/HMSAnalysisTask.java +++ 
b/fe/fe-core/src/main/java/org/apache/doris/statistics/HMSAnalysisTask.java @@ -26,7 +26,7 @@ import org.apache.doris.qe.StmtExecutor; import org.apache.doris.statistics.util.StatisticsUtil; -import org.apache.commons.lang3.StringUtils; +import com.google.common.collect.Lists; import org.apache.commons.text.StringSubstitutor; import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; @@ -35,10 +35,13 @@ import java.time.LocalDateTime; import java.time.ZoneId; import java.util.ArrayList; +import java.util.Date; import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.Set; +import java.util.StringJoiner; +import java.util.stream.Collectors; public class HMSAnalysisTask extends BaseAnalysisTask { private static final Logger LOG = LogManager.getLogger(HMSAnalysisTask.class); @@ -48,7 +51,7 @@ public class HMSAnalysisTask extends BaseAnalysisTask { public static final String NUM_FILES = "numFiles"; public static final String TIMESTAMP = "transient_lastDdlTime"; - private static final String ANALYZE_SQL_TABLE_TEMPLATE = "INSERT INTO " + private static final String ANALYZE_TABLE_TEMPLATE = "INSERT INTO " + "${internalDB}.${columnStatTbl}" + " SELECT " + "CONCAT(${tblId}, '-', ${idxId}, '-', '${colId}') AS id, " @@ -67,10 +70,8 @@ public class HMSAnalysisTask extends BaseAnalysisTask { + "NOW() " + "FROM `${catalogName}`.`${dbName}`.`${tblName}`"; - private static final String ANALYZE_SQL_PARTITION_TEMPLATE = "INSERT INTO " - + "${internalDB}.${columnStatTbl}" - + " SELECT " - + "CONCAT(${tblId}, '-', ${idxId}, '-', '${colId}') AS id, " + private static final String ANALYZE_PARTITION_TEMPLATE = " SELECT " + + "CONCAT(${tblId}, '-', ${idxId}, '-', '${colId}', '-', ${partId}) AS id, " + "${catalogId} AS catalog_id, " + "${dbId} AS db_id, " + "${tblId} AS tbl_id, " @@ -83,22 +84,22 @@ public class HMSAnalysisTask extends BaseAnalysisTask { + "MIN(`${colName}`) AS min, " + "MAX(`${colName}`) AS max, " + 
"${dataSizeFunction} AS data_size, " - + "NOW() " - + "FROM `${catalogName}`.`${dbName}`.`${tblName}`"; + + "NOW() FROM `${catalogName}`.`${dbName}`.`${tblName}` where "; private static final String ANALYZE_TABLE_COUNT_TEMPLATE = "SELECT COUNT(1) as rowCount " + "FROM `${catalogName}`.`${dbName}`.`${tblName}`"; + // cache stats for each partition, it would be inserted into column_statistics in a batch. + private final List> buf = new ArrayList<>(); + private final boolean isTableLevelTask; - private final boolean isSamplingPartition; private final boolean isPartitionOnly; - private final Set partitionNames; + private Set partitionNames; private HMSExternalTable table; public HMSAnalysisTask(AnalysisInfo info) { super(info); isTableLevelTask = info.externalTableLevelTask; - isSamplingPartition = info.samplingPartition; isPartitionOnly = info.partitionOnly; partitionNames = info.partitionNames; table = (HMSExternalTable) tbl; @@ -113,7 +114,7 @@ public void doExecute() throws Exception { } /** - * Get table row count and insert the result to __internal_schema.table_statistics + * Get table row count */ private void getTableStats() throws Exception { Map params = buildTableStatsParams(null); @@ -147,55 +148,15 @@ private void getTableColumnStats() throws Exception { // 0 AS data_size, // NOW() FROM `hive`.`tpch100`.`region` if (isPartitionOnly) { - for (String partId : partitionNames) { - StringBuilder sb = new StringBuilder(); - sb.append(ANALYZE_SQL_TABLE_TEMPLATE); - sb.append(" where "); - String[] splits = partId.split("/"); - for (int i = 0; i < splits.length; i++) { - String value = splits[i].split("=")[1]; - splits[i] = splits[i].replace(value, "\'" + value + "\'"); - } - sb.append(StringUtils.join(splits, " and ")); - Map params = buildTableStatsParams(partId); - params.put("internalDB", FeConstants.INTERNAL_DB_NAME); - params.put("columnStatTbl", StatisticConstants.STATISTIC_TBL_NAME); - params.put("colName", col.getName()); - params.put("colId", 
info.colName); - params.put("dataSizeFunction", getDataSizeFunction(col)); - StringSubstitutor stringSubstitutor = new StringSubstitutor(params); - String sql = stringSubstitutor.replace(sb.toString()); - executeInsertSql(sql); + getPartitionNames(); + List partitionAnalysisSQLs = new ArrayList<>(); + for (String partId : this.partitionNames) { + partitionAnalysisSQLs.add(generateSqlForPartition(partId)); } + execSQLs(partitionAnalysisSQLs); } else { StringBuilder sb = new StringBuilder(); - sb.append(ANALYZE_SQL_TABLE_TEMPLATE); - if (isSamplingPartition) { - sb.append(" where 1=1 "); - String[] splitExample = partitionNames.stream().findFirst().get().split("/"); - int parts = splitExample.length; - List partNames = new ArrayList<>(); - for (String split : splitExample) { - partNames.add(split.split("=")[0]); - } - List> valueLists = new ArrayList<>(); - for (int i = 0; i < parts; i++) { - valueLists.add(new ArrayList<>()); - } - for (String partId : partitionNames) { - String[] partIds = partId.split("/"); - for (int i = 0; i < partIds.length; i++) { - valueLists.get(i).add("\'" + partIds[i].split("=")[1] + "\'"); - } - } - for (int i = 0; i < parts; i++) { - sb.append(" and "); - sb.append(partNames.get(i)); - sb.append(" in ("); - sb.append(StringUtils.join(valueLists.get(i), ",")); - sb.append(") "); - } - } + sb.append(ANALYZE_TABLE_TEMPLATE); Map params = buildTableStatsParams("NULL"); params.put("internalDB", FeConstants.INTERNAL_DB_NAME); params.put("columnStatTbl", StatisticConstants.STATISTIC_TBL_NAME); @@ -208,6 +169,80 @@ private void getTableColumnStats() throws Exception { } } + private void getPartitionNames() { + if (partitionNames == null) { + if (info.isAllPartition) { + partitionNames = table.getPartitionNames(); + } else if (info.partitionCount > 0) { + partitionNames = table.getPartitionNames().stream() + .limit(info.partitionCount).collect(Collectors.toSet()); + } + if (partitionNames == null || partitionNames.isEmpty()) { + throw new 
RuntimeException("Not a partition table or no partition specified."); + } + } + } + + private String generateSqlForPartition(String partId) { + StringBuilder sb = new StringBuilder(); + sb.append(ANALYZE_PARTITION_TEMPLATE); + String[] splits = partId.split("/"); + for (int i = 0; i < splits.length; i++) { + String[] kv = splits[i].split("="); + sb.append(kv[0]); + sb.append("='"); + sb.append(kv[1]); + sb.append("'"); + if (i < splits.length - 1) { + sb.append(" and "); + } + } + Map params = buildTableStatsParams(partId); + params.put("internalDB", FeConstants.INTERNAL_DB_NAME); + params.put("columnStatTbl", StatisticConstants.STATISTIC_TBL_NAME); + params.put("colName", col.getName()); + params.put("colId", info.colName); + params.put("dataSizeFunction", getDataSizeFunction(col)); + return new StringSubstitutor(params).replace(sb.toString()); + } + + public void execSQLs(List partitionAnalysisSQLs) throws Exception { + long startTime = System.currentTimeMillis(); + LOG.debug("analyze task {} start at {}", info.toString(), new Date()); + try (AutoCloseConnectContext r = StatisticsUtil.buildConnectContext()) { + List> sqlGroups = Lists.partition(partitionAnalysisSQLs, StatisticConstants.UNION_ALL_LIMIT); + for (List group : sqlGroups) { + if (killed) { + return; + } + StringJoiner partitionCollectSQL = new StringJoiner("UNION ALL"); + group.forEach(partitionCollectSQL::add); + stmtExecutor = new StmtExecutor(r.connectContext, partitionCollectSQL.toString()); + buf.add(stmtExecutor.executeInternalQuery() + .stream().map(ColStatsData::new).collect(Collectors.toList())); + QueryState queryState = r.connectContext.getState(); + if (queryState.getStateType().equals(QueryState.MysqlStateType.ERR)) { + throw new RuntimeException(String.format("Failed to analyze %s.%s.%s, error: %s sql: %s", + info.catalogName, info.dbName, info.colName, partitionCollectSQL, + queryState.getErrorMessage())); + } + } + for (List colStatsDataList : buf) { + StringBuilder batchInsertSQL = + 
new StringBuilder("INSERT INTO " + StatisticConstants.FULL_QUALIFIED_STATS_TBL_NAME + + " VALUES "); + StringJoiner sj = new StringJoiner(","); + colStatsDataList.forEach(c -> sj.add(c.toSQL(true))); + batchInsertSQL.append(sj); + stmtExecutor = new StmtExecutor(r.connectContext, batchInsertSQL.toString()); + executeWithExceptionOnFail(stmtExecutor); + } + } finally { + LOG.debug("analyze task {} end. cost {}ms", info, System.currentTimeMillis() - startTime); + } + + } + private void executeInsertSql(String sql) throws Exception { long startTime = System.currentTimeMillis(); try (AutoCloseConnectContext r = StatisticsUtil.buildConnectContext()) { @@ -270,7 +305,8 @@ private void setParameterData(Map parameters, Map key StatsId statsId = new StatsId(r); long tblId = statsId.tblId; long idxId = statsId.idxId; - long partId = statsId.partId; + String partId = statsId.partId; String colId = statsId.colId; ColumnStatistic partStats = ColumnStatistic.fromResultRow(r); keyToColStats.get(new StatisticsCacheKey(tblId, idxId, colId)).putPartStats(partId, partStats); diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticsCleaner.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticsCleaner.java index 849b68fe94ad4f..6521a8b4a5999b 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticsCleaner.java +++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticsCleaner.java @@ -228,11 +228,11 @@ private long findExpiredStats(OlapTable statsTbl, ExpiredStats expiredStats, lon continue; } OlapTable olapTable = (OlapTable) t; - Long partId = statsId.partId; + String partId = statsId.partId; if (partId == null) { continue; } - if (!olapTable.getPartitionIds().contains(partId)) { + if (!olapTable.getPartitionIds().contains(Long.parseLong(partId))) { expiredStats.ids.add(id); } } catch (Exception e) { diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticsRepository.java 
b/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticsRepository.java index f9b18f41e4531d..cd3cc67f3c91c7 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticsRepository.java +++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticsRepository.java @@ -179,7 +179,7 @@ private static String constructId(Object... params) { return stringJoiner.toString(); } - public static void dropStatistics(Set partIds) throws DdlException { + public static void dropStatistics(Set partIds) throws DdlException { dropStatisticsByPartId(partIds, StatisticConstants.STATISTIC_TBL_NAME); } @@ -202,7 +202,7 @@ public static void dropStatisticsByColName(long tblId, Set colNames, Str } } - public static void dropStatisticsByPartId(Set partIds, String statsTblName) throws DdlException { + public static void dropStatisticsByPartId(Set partIds, String statsTblName) throws DdlException { Map params = new HashMap<>(); String right = StatisticsUtil.joinElementsToString(partIds, ","); String inPredicate = String.format(" part_id IN (%s)", right); @@ -296,14 +296,14 @@ public static List fetchStatsFullName(long limit, long offset) { return StatisticsUtil.execStatisticQuery(new StringSubstitutor(params).replace(FETCH_STATS_FULL_NAME)); } - public static Map> fetchColAndPartsForStats(long tblId) { + public static Map> fetchColAndPartsForStats(long tblId) { Map params = Maps.newHashMap(); params.put("tblId", String.valueOf(tblId)); StringSubstitutor stringSubstitutor = new StringSubstitutor(params); String partSql = stringSubstitutor.replace(FETCH_STATS_PART_ID); List resultRows = StatisticsUtil.execStatisticQuery(partSql); - Map> columnToPartitions = Maps.newHashMap(); + Map> columnToPartitions = Maps.newHashMap(); resultRows.forEach(row -> { try { diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/StatsId.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/StatsId.java index c7af03a8d9e2c3..3f9b2641b75224 100644 --- 
a/fe/fe-core/src/main/java/org/apache/doris/statistics/StatsId.java +++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/StatsId.java @@ -32,7 +32,7 @@ public class StatsId { public final String colId; // nullable - public final Long partId; + public final String partId; public StatsId(ResultRow row) { this.id = row.get(0); @@ -41,7 +41,7 @@ public StatsId(ResultRow row) { this.tblId = Long.parseLong(row.get(3)); this.idxId = Long.parseLong(row.get(4)); this.colId = row.get(5); - this.partId = row.get(6) == null ? null : Long.parseLong(row.get(6)); + this.partId = row.get(6); } public String toSQL() { @@ -51,8 +51,8 @@ public String toSQL() { sj.add(String.valueOf(dbId)); sj.add(String.valueOf(tblId)); sj.add(String.valueOf(idxId)); - sj.add(StatisticsUtil.quote(String.valueOf(colId))); - sj.add(String.valueOf(partId)); + sj.add(StatisticsUtil.quote(colId)); + sj.add(StatisticsUtil.quote(partId)); return sj.toString(); } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/util/StatisticsUtil.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/util/StatisticsUtil.java index ed95f4bd1f3645..3d2d0b171882a5 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/statistics/util/StatisticsUtil.java +++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/util/StatisticsUtil.java @@ -103,6 +103,7 @@ import java.util.Map; import java.util.Objects; import java.util.Optional; +import java.util.Set; import java.util.StringJoiner; import java.util.UUID; import java.util.function.Function; @@ -439,6 +440,15 @@ public static Map getPartitionIdToName(TableIf table) { )); } + public static Set getPartitionIds(TableIf table) { + if (table instanceof OlapTable) { + return ((OlapTable) table).getPartitionIds().stream().map(String::valueOf).collect(Collectors.toSet()); + } else if (table instanceof ExternalTable) { + return table.getPartitionNames(); + } + throw new RuntimeException(String.format("Not supported Table %s", table.getClass().getName())); + 
} + public static String joinElementsToString(Collection values, String delimiter) { StringJoiner builder = new StringJoiner(delimiter); values.forEach(v -> builder.add(String.valueOf(v))); @@ -512,7 +522,11 @@ public static long getHiveRowCount(HMSExternalTable table, boolean isInit) { } // Table parameters contains row count, simply get and return it. if (parameters.containsKey(NUM_ROWS)) { - return Long.parseLong(parameters.get(NUM_ROWS)); + long rows = Long.parseLong(parameters.get(NUM_ROWS)); + // Sometimes, the NUM_ROWS in hms is 0 but actually is not. Need to check TOTAL_SIZE if NUM_ROWS is 0. + if (rows != 0) { + return rows; + } } if (!parameters.containsKey(TOTAL_SIZE) || isInit) { return -1; diff --git a/fe/fe-core/src/main/jflex/sql_scanner.flex b/fe/fe-core/src/main/jflex/sql_scanner.flex index 9845c2c8df5dc4..0f8eaa5d9bc4d4 100644 --- a/fe/fe-core/src/main/jflex/sql_scanner.flex +++ b/fe/fe-core/src/main/jflex/sql_scanner.flex @@ -381,6 +381,7 @@ import org.apache.doris.qe.SqlModeHelper; keywordMap.put("read", new Integer(SqlParserSymbols.KW_READ)); keywordMap.put("real", new Integer(SqlParserSymbols.KW_DOUBLE)); keywordMap.put("rebalance", new Integer(SqlParserSymbols.KW_REBALANCE)); + keywordMap.put("recent", new Integer(SqlParserSymbols.KW_RECENT)); keywordMap.put("recover", new Integer(SqlParserSymbols.KW_RECOVER)); keywordMap.put("recycle", new Integer(SqlParserSymbols.KW_RECYCLE)); keywordMap.put("refresh", new Integer(SqlParserSymbols.KW_REFRESH)); diff --git a/regression-test/data/external_table_p2/hive/test_hive_partition_statistic.out b/regression-test/data/external_table_p2/hive/test_hive_partition_statistic.out new file mode 100644 index 00000000000000..0e32ebe4775654 --- /dev/null +++ b/regression-test/data/external_table_p2/hive/test_hive_partition_statistic.out @@ -0,0 +1,87 @@ +-- This file is automatically generated. 
You should know what you did if you want to edit this +-- !01 -- +event_day=1956-09-07 39 +event_day=2008-09-25 39 + +-- !1 -- +event_day=2008-09-25 10000 1 0 0 0 +event_day=2008-09-25 10000 1 0 2008-09-25 2008-09-25 +event_day=2008-09-25 10000 11 0 0 10 +event_day=2008-09-25 10000 13 0 MFGR#12 MFGR#52 +event_day=2008-09-25 10000 13 0 antique wheat +event_day=2008-09-25 10000 16 0 JUMBO BAG WRAP PACK +event_day=2008-09-25 10000 17 0 1 48 +event_day=2008-09-25 10000 17 0 64078 113087 +event_day=2008-09-25 10000 17 0 754035 763603 +event_day=2008-09-25 10000 17 0 ECONOMY ANODIZED BRASS STANDARD POLISHED TIN +event_day=2008-09-25 10000 17 0 MFGR#1221 MFGR#528 +event_day=2008-09-25 10000 17 0 burnished drab violet firebrick +event_day=2008-09-25 10000 2362 0 19920101 19980802 +event_day=2008-09-25 10000 2382 0 19920203 19981027 +event_day=2008-09-25 10000 25 0 ALGERIA VIETNAM +event_day=2008-09-25 10000 25 0 ALGERIA VIETNAM +event_day=2008-09-25 10000 250 0 ALGERIA 0 VIETNAM 9 +event_day=2008-09-25 10000 250 0 ALGERIA 0 VIETNAM 9 +event_day=2008-09-25 10000 5 0 1-URGENT 5-LOW +event_day=2008-09-25 10000 5 0 AFRICA MIDDLE EAST +event_day=2008-09-25 10000 5 0 AFRICA MIDDLE EAST +event_day=2008-09-25 10000 5 0 AUTOMOBILE MACHINERY +event_day=2008-09-25 10000 5 0 MFGR#1 MFGR#5 +event_day=2008-09-25 10000 50 0 1 50 +event_day=2008-09-25 10000 6074 0 96748 9388900 +event_day=2008-09-25 10000 7 0 1 7 +event_day=2008-09-25 10000 7 0 AIR TRUCK +event_day=2008-09-25 10000 845 0 106797 9423950 +event_day=2008-09-25 10000 9 0 0 8 +event_day=2008-09-25 10000 9775 0 119 2999848 +event_day=2008-09-25 10000 9794 0 107970 45833194 +event_day=2008-09-25 10000 9837 0 MGHV8XBriO zzlztYTFMFW +event_day=2008-09-25 10000 9846 0 Customer#000000119 Customer#002999848 +event_day=2008-09-25 10000 9861 0 13091 599962401 +event_day=2008-09-25 10000 9879 0 10-100-337-6599 34-999-684-2905 +event_day=2008-09-25 10000 9883 0 Supplier#000000001 Supplier#000199983 +event_day=2008-09-25 10000 9896 0 
B5YhCdkaxR232CrXx zyxtAvAViHMabnr,1UQybiW +event_day=2008-09-25 10000 9927 0 10-105-800-9296 34-998-982-7450 +event_day=2008-09-25 10000 9971 0 1 199983 + +-- !2 -- +event_day=1956-09-07 10000 1 0 0 0 +event_day=1956-09-07 10000 1 0 1956-09-07 1956-09-07 +event_day=1956-09-07 10000 11 0 0 10 +event_day=1956-09-07 10000 13 0 MFGR#12 MFGR#52 +event_day=1956-09-07 10000 13 0 antique wheat +event_day=1956-09-07 10000 16 0 JUMBO BAG WRAP PACK +event_day=1956-09-07 10000 17 0 1 48 +event_day=1956-09-07 10000 17 0 64078 113087 +event_day=1956-09-07 10000 17 0 754035 763603 +event_day=1956-09-07 10000 17 0 ECONOMY ANODIZED BRASS STANDARD POLISHED TIN +event_day=1956-09-07 10000 17 0 MFGR#1221 MFGR#528 +event_day=1956-09-07 10000 17 0 burnished drab violet firebrick +event_day=1956-09-07 10000 2362 0 19920101 19980802 +event_day=1956-09-07 10000 2382 0 19920203 19981027 +event_day=1956-09-07 10000 25 0 ALGERIA VIETNAM +event_day=1956-09-07 10000 25 0 ALGERIA VIETNAM +event_day=1956-09-07 10000 250 0 ALGERIA 0 VIETNAM 9 +event_day=1956-09-07 10000 250 0 ALGERIA 0 VIETNAM 9 +event_day=1956-09-07 10000 5 0 1-URGENT 5-LOW +event_day=1956-09-07 10000 5 0 AFRICA MIDDLE EAST +event_day=1956-09-07 10000 5 0 AFRICA MIDDLE EAST +event_day=1956-09-07 10000 5 0 AUTOMOBILE MACHINERY +event_day=1956-09-07 10000 5 0 MFGR#1 MFGR#5 +event_day=1956-09-07 10000 50 0 1 50 +event_day=1956-09-07 10000 6074 0 96748 9388900 +event_day=1956-09-07 10000 7 0 1 7 +event_day=1956-09-07 10000 7 0 AIR TRUCK +event_day=1956-09-07 10000 845 0 106797 9423950 +event_day=1956-09-07 10000 9 0 0 8 +event_day=1956-09-07 10000 9775 0 119 2999848 +event_day=1956-09-07 10000 9794 0 107970 45833194 +event_day=1956-09-07 10000 9837 0 MGHV8XBriO zzlztYTFMFW +event_day=1956-09-07 10000 9846 0 Customer#000000119 Customer#002999848 +event_day=1956-09-07 10000 9861 0 13091 599962401 +event_day=1956-09-07 10000 9879 0 10-100-337-6599 34-999-684-2905 +event_day=1956-09-07 10000 9883 0 Supplier#000000001 Supplier#000199983 
+event_day=1956-09-07 10000 9896 0 B5YhCdkaxR232CrXx zyxtAvAViHMabnr,1UQybiW +event_day=1956-09-07 10000 9927 0 10-105-800-9296 34-998-982-7450 +event_day=1956-09-07 10000 9971 0 1 199983 + diff --git a/regression-test/suites/external_table_p2/hive/test_hive_partition_statistic.groovy b/regression-test/suites/external_table_p2/hive/test_hive_partition_statistic.groovy new file mode 100644 index 00000000000000..9f4b462237fa3f --- /dev/null +++ b/regression-test/suites/external_table_p2/hive/test_hive_partition_statistic.groovy @@ -0,0 +1,53 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +suite("test_hive_partition_statistic", "p2,external,hive,external_remote,external_remote_hive") { + String enabled = context.config.otherConfigs.get("enableExternalHiveTest") + if (enabled != null && enabled.equalsIgnoreCase("true")) { + String extHiveHmsHost = context.config.otherConfigs.get("extHiveHmsHost") + String extHiveHmsPort = context.config.otherConfigs.get("extHiveHmsPort") + String catalog_name = "test_hive_partition_statistic" + sql """drop catalog if exists ${catalog_name};""" + sql """ + create catalog if not exists ${catalog_name} properties ( + 'type'='hms', + 'hadoop.username' = 'hadoop', + 'hive.metastore.uris' = 'thrift://${extHiveHmsHost}:${extHiveHmsPort}' + ); + """ + logger.info("catalog " + catalog_name + " created") + + sql """use ${catalog_name}.multi_partition""" + sql """analyze table multi_partition_orc partitions (`event_day=2008-09-25`, `event_day=1956-09-07`) with sync""" + + def ctlId + def result = sql """show proc '/catalogs'""" + + for (int i = 0; i < result.size(); i++) { + if (result[i][1] == catalog_name) { + ctlId = result[i][0] + } + } + + qt_01 """select part_id, count(*) from internal.__internal_schema.column_statistics where catalog_id='$ctlId' group by part_id order by part_id;""" + order_qt_1 """select part_id, count, ndv, null_count, min, max from internal.__internal_schema.column_statistics where catalog_id='$ctlId' and part_id='event_day=2008-09-25'""" + order_qt_2 """select part_id, count, ndv, null_count, min, max from internal.__internal_schema.column_statistics where catalog_id='$ctlId' and part_id='event_day=1956-09-07'""" + + sql """drop catalog ${catalog_name}"""; + } +} +