Skip to content

Commit

Permalink
[multistage] Refactor query planner and dispatcher (apache#10748)
Browse files Browse the repository at this point in the history
  • Loading branch information
xiangfu0 authored May 11, 2023
1 parent 3a8c578 commit fe98bb0
Show file tree
Hide file tree
Showing 46 changed files with 1,526 additions and 711 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@
import org.apache.pinot.query.QueryEnvironment;
import org.apache.pinot.query.catalog.PinotCatalog;
import org.apache.pinot.query.mailbox.MailboxService;
import org.apache.pinot.query.planner.QueryPlan;
import org.apache.pinot.query.planner.DispatchableSubPlan;
import org.apache.pinot.query.routing.WorkerManager;
import org.apache.pinot.query.service.QueryConfig;
import org.apache.pinot.query.service.dispatch.QueryDispatcher;
Expand Down Expand Up @@ -175,7 +175,7 @@ private BrokerResponse handleRequest(long requestId, String query, @Nullable Sql
return new BrokerResponseNative(QueryException.getException(QueryException.SQL_PARSING_ERROR, e));
}

QueryPlan queryPlan = queryPlanResult.getQueryPlan();
DispatchableSubPlan dispatchableSubPlan = queryPlanResult.getQueryPlan();
Set<String> tableNames = queryPlanResult.getTableNames();

// Compilation Time. This includes the time taken for parsing, compiling, create stage plans and assigning workers.
Expand All @@ -201,13 +201,13 @@ private BrokerResponse handleRequest(long requestId, String query, @Nullable Sql

ResultTable queryResults;
Map<Integer, ExecutionStatsAggregator> stageIdStatsMap = new HashMap<>();
for (Integer stageId : queryPlan.getDispatchablePlanMetadataMap().keySet()) {
for (int stageId = 0; stageId < dispatchableSubPlan.getQueryStageList().size(); stageId++) {
stageIdStatsMap.put(stageId, new ExecutionStatsAggregator(traceEnabled));
}

long executionStartTimeNs = System.nanoTime();
try {
queryResults = _queryDispatcher.submitAndReduce(requestId, queryPlan, _mailboxService, queryTimeoutMs,
queryResults = _queryDispatcher.submitAndReduce(requestId, dispatchableSubPlan, _mailboxService, queryTimeoutMs,
sqlNodeAndOptions.getOptions(), stageIdStatsMap, traceEnabled);
} catch (Exception e) {
LOGGER.info("query execution failed", e);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,6 @@

import com.google.common.annotations.VisibleForTesting;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Properties;
import java.util.Set;
import javax.annotation.Nullable;
Expand Down Expand Up @@ -60,9 +58,13 @@
import org.apache.calcite.tools.RelBuilder;
import org.apache.pinot.common.config.provider.TableCache;
import org.apache.pinot.query.context.PlannerContext;
import org.apache.pinot.query.planner.DispatchableSubPlan;
import org.apache.pinot.query.planner.PlannerUtils;
import org.apache.pinot.query.planner.QueryPlan;
import org.apache.pinot.query.planner.logical.StagePlanner;
import org.apache.pinot.query.planner.SubPlan;
import org.apache.pinot.query.planner.logical.PinotLogicalQueryPlanner;
import org.apache.pinot.query.planner.logical.RelToPlanNodeConverter;
import org.apache.pinot.query.planner.physical.PinotDispatchPlanner;
import org.apache.pinot.query.routing.WorkerManager;
import org.apache.pinot.query.type.TypeFactory;
import org.apache.pinot.sql.parsers.CalciteSqlParser;
Expand All @@ -72,7 +74,7 @@
/**
* The {@code QueryEnvironment} contains the main entrypoint for query planning.
*
* <p>It provide the higher level entry interface to convert a SQL string into a {@link QueryPlan}.
* <p>It provides the higher-level entry interface to convert a SQL string into a {@link DispatchableSubPlan}.
*/
public class QueryEnvironment {
// Calcite configurations
Expand Down Expand Up @@ -155,9 +157,12 @@ public QueryPlannerResult planQuery(String sqlQuery, SqlNodeAndOptions sqlNodeAn
try (PlannerContext plannerContext = new PlannerContext(_config, _catalogReader, _typeFactory, _hepProgram)) {
plannerContext.setOptions(sqlNodeAndOptions.getOptions());
RelRoot relRoot = compileQuery(sqlNodeAndOptions.getSqlNode(), plannerContext);
Set<String> tableNames = getTableNamesFromRelRoot(relRoot.rel);
return new QueryPlannerResult(toDispatchablePlan(relRoot, plannerContext, requestId, tableNames), null,
tableNames);
SubPlan subPlanRoot = toSubPlan(relRoot);
// TODO: the current code assumes only one SubPlan per query, but we should support multiple SubPlans per query.
// Each SubPlan should be able to run independently from Broker then set the results into the dependent
// SubPlan for further processing.
DispatchableSubPlan dispatchableSubPlan = toDispatchableSubPlan(subPlanRoot, plannerContext, requestId);
return new QueryPlannerResult(dispatchableSubPlan, null, dispatchableSubPlan.getTableNames());
} catch (CalciteContextException e) {
throw new RuntimeException("Error composing query plan for '" + sqlQuery
+ "': " + e.getMessage() + "'", e);
Expand All @@ -170,7 +175,8 @@ public QueryPlannerResult planQuery(String sqlQuery, SqlNodeAndOptions sqlNodeAn
* Explain a SQL query.
*
* Similar to {@link QueryEnvironment#planQuery(String, SqlNodeAndOptions, long)}, this API runs the query
* compilation. But it doesn't run the distributed {@link QueryPlan} generation, instead it only returns the
* compilation. But it doesn't run the distributed {@link DispatchableSubPlan} generation, instead it only
* returns the
* explained logical plan.
*
* @param sqlQuery SQL query string.
Expand All @@ -185,15 +191,15 @@ public QueryPlannerResult explainQuery(String sqlQuery, SqlNodeAndOptions sqlNod
SqlExplainFormat format = explain.getFormat() == null ? SqlExplainFormat.DOT : explain.getFormat();
SqlExplainLevel level =
explain.getDetailLevel() == null ? SqlExplainLevel.DIGEST_ATTRIBUTES : explain.getDetailLevel();
Set<String> tableNames = getTableNamesFromRelRoot(relRoot.rel);
Set<String> tableNames = RelToPlanNodeConverter.getTableNamesFromRelRoot(relRoot.rel);
return new QueryPlannerResult(null, PlannerUtils.explainPlan(relRoot.rel, format, level), tableNames);
} catch (Exception e) {
throw new RuntimeException("Error explain query plan for: " + sqlQuery, e);
}
}

@VisibleForTesting
public QueryPlan planQuery(String sqlQuery) {
public DispatchableSubPlan planQuery(String sqlQuery) {
return planQuery(sqlQuery, CalciteSqlParser.compileToSqlNodeAndOptions(sqlQuery), 0).getQueryPlan();
}

Expand All @@ -206,12 +212,13 @@ public String explainQuery(String sqlQuery) {
* Results of planning a query
*/
public static class QueryPlannerResult {
private QueryPlan _queryPlan;
private DispatchableSubPlan _dispatchableSubPlan;
private String _explainPlan;
Set<String> _tableNames;

QueryPlannerResult(@Nullable QueryPlan queryPlan, @Nullable String explainPlan, Set<String> tableNames) {
_queryPlan = queryPlan;
QueryPlannerResult(@Nullable DispatchableSubPlan dispatchableSubPlan, @Nullable String explainPlan,
Set<String> tableNames) {
_dispatchableSubPlan = dispatchableSubPlan;
_explainPlan = explainPlan;
_tableNames = tableNames;
}
Expand All @@ -220,8 +227,8 @@ public String getExplainPlan() {
return _explainPlan;
}

public QueryPlan getQueryPlan() {
return _queryPlan;
public DispatchableSubPlan getQueryPlan() {
return _dispatchableSubPlan;
}

// Returns all the table names in the query.
Expand Down Expand Up @@ -297,11 +304,20 @@ private RelNode optimize(RelRoot relRoot, PlannerContext plannerContext) {
}
}

private QueryPlan toDispatchablePlan(RelRoot relRoot, PlannerContext plannerContext, long requestId,
Set<String> tableNames) {
// 5. construct a dispatchable query plan.
StagePlanner queryStagePlanner = new StagePlanner(plannerContext, _workerManager, requestId, _tableCache);
return queryStagePlanner.makePlan(relRoot, tableNames);
private SubPlan toSubPlan(RelRoot relRoot) {
  // 5. Run the logical planner over the relational root, then split the resulting query plan
  // into sub-plans and return the root sub-plan.
  PinotLogicalQueryPlanner logicalPlanner = new PinotLogicalQueryPlanner();
  return logicalPlanner.makePlan(logicalPlanner.planQuery(relRoot));
}

private DispatchableSubPlan toDispatchableSubPlan(SubPlan subPlan, PlannerContext plannerContext, long requestId) {
  // 6. Turn the logical sub-plan into a dispatchable one by attaching worker/routing information.
  PinotDispatchPlanner dispatchPlanner =
      new PinotDispatchPlanner(plannerContext, _workerManager, requestId, _tableCache);
  return dispatchPlanner.createDispatchableSubPlan(subPlan);
}

// --------------------------------------------------------------------------
Expand All @@ -311,17 +327,4 @@ private QueryPlan toDispatchablePlan(RelRoot relRoot, PlannerContext plannerCont
// Returns the hint strategy table registering the Pinot-supported SQL hints with Calcite.
private HintStrategyTable getHintStrategyTable() {
  return PinotHintStrategyTable.PINOT_HINT_STRATEGY_TABLE;
}


/**
 * Collects the names of all tables referenced by the given relational plan.
 *
 * @param relRoot root node of the relational plan to scan.
 * @return set of de-duplicated table names with Calcite's enclosing square brackets removed.
 */
private Set<String> getTableNamesFromRelRoot(RelNode relRoot) {
  Set<String> tableNames = new HashSet<>();
  for (String qualifiedTableName : RelOptUtil.findAllTableQualifiedNames(relRoot)) {
    // Calcite encloses table and schema names in square brackets to properly quote and delimit them in SQL
    // statements, particularly to handle cases when they contain special characters or reserved keywords.
    // Strip one enclosing bracket pair directly instead of re-compiling a regex per element.
    if (qualifiedTableName.length() >= 2 && qualifiedTableName.startsWith("[")
        && qualifiedTableName.endsWith("]")) {
      tableNames.add(qualifiedTableName.substring(1, qualifiedTableName.length() - 1));
    } else {
      tableNames.add(qualifiedTableName);
    }
  }
  return tableNames;
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,119 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.pinot.query.planner;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.apache.pinot.core.routing.TimeBoundaryInfo;
import org.apache.pinot.query.routing.QueryServerInstance;
import org.apache.pinot.query.routing.StageMetadata;
import org.apache.pinot.query.routing.WorkerMetadata;


/**
 * The {@code DispatchablePlanFragment} wraps a {@link PlanFragment} together with its worker
 * assignment, server-to-worker mapping, segment routing, and custom properties needed to
 * dispatch the fragment to query servers.
 */
public class DispatchablePlanFragment {

  // Keys under which the table name and time boundary info are stored in _customProperties.
  public static final String TABLE_NAME_KEY = "tableName";
  public static final String TIME_BOUNDARY_COLUMN_KEY = "timeBoundaryInfo.timeColumn";
  public static final String TIME_BOUNDARY_VALUE_KEY = "timeBoundaryInfo.timeValue";
  private final PlanFragment _planFragment;
  private final List<WorkerMetadata> _workerMetadataList;

  // This is used at broker stage - we don't need to ship it to the server.
  private final Map<QueryServerInstance, List<Integer>> _serverInstanceToWorkerIdMap;

  // used for table scan stage - we use ServerInstance instead of VirtualServer
  // here because all virtual servers that share a server instance will have the
  // same segments on them
  private final Map<Integer, Map<String, List<String>>> _workerIdToSegmentsMap;

  // used for passing custom properties to build StageMetadata on the server.
  private final Map<String, String> _customProperties;

  // Convenience constructor: starts with empty worker, server and property maps that are
  // expected to be populated later via the setters below.
  public DispatchablePlanFragment(PlanFragment planFragment) {
    this(planFragment, new ArrayList<>(), new HashMap<>(), new HashMap<>());
  }

  public DispatchablePlanFragment(PlanFragment planFragment, List<WorkerMetadata> workerMetadataList,
      Map<QueryServerInstance, List<Integer>> serverInstanceToWorkerIdMap, Map<String, String> customPropertyMap) {
    _planFragment = planFragment;
    _workerMetadataList = workerMetadataList;
    _serverInstanceToWorkerIdMap = serverInstanceToWorkerIdMap;
    // Segment routing is always populated separately via setWorkerIdToSegmentsMap.
    _workerIdToSegmentsMap = new HashMap<>();
    _customProperties = customPropertyMap;
  }

  public PlanFragment getPlanFragment() {
    return _planFragment;
  }

  public List<WorkerMetadata> getWorkerMetadataList() {
    return _workerMetadataList;
  }

  public Map<QueryServerInstance, List<Integer>> getServerInstanceToWorkerIdMap() {
    return _serverInstanceToWorkerIdMap;
  }

  public Map<String, String> getCustomProperties() {
    return _customProperties;
  }

  // Returns the table name stored in the custom properties, or null if none was set.
  public String getTableName() {
    return _customProperties.get(TABLE_NAME_KEY);
  }

  // Returns the previously stored table name (Map.put semantics), or null if none was set.
  public String setTableName(String tableName) {
    return _customProperties.put(TABLE_NAME_KEY, tableName);
  }

  // NOTE(review): if setTimeBoundaryInfo was never called, this builds a TimeBoundaryInfo from
  // null column/value — callers presumably only invoke this for fragments with a time boundary;
  // confirm against call sites.
  public TimeBoundaryInfo getTimeBoundary() {
    return new TimeBoundaryInfo(_customProperties.get(TIME_BOUNDARY_COLUMN_KEY),
        _customProperties.get(TIME_BOUNDARY_VALUE_KEY));
  }

  // Stores the time boundary as two custom-property entries so it survives the trip to the server.
  public void setTimeBoundaryInfo(TimeBoundaryInfo timeBoundaryInfo) {
    _customProperties.put(TIME_BOUNDARY_COLUMN_KEY, timeBoundaryInfo.getTimeColumn());
    _customProperties.put(TIME_BOUNDARY_VALUE_KEY, timeBoundaryInfo.getTimeValue());
  }

  public Map<Integer, Map<String, List<String>>> getWorkerIdToSegmentsMap() {
    return _workerIdToSegmentsMap;
  }

  // Replaces the current worker-to-segments routing with the given one (in place, keeping the
  // same map instance that may already be shared).
  public void setWorkerIdToSegmentsMap(Map<Integer, Map<String, List<String>>> workerIdToSegmentsMap) {
    _workerIdToSegmentsMap.clear();
    _workerIdToSegmentsMap.putAll(workerIdToSegmentsMap);
  }

  // Replaces the current worker metadata list in place.
  public void setWorkerMetadataList(List<WorkerMetadata> workerMetadataList) {
    _workerMetadataList.clear();
    _workerMetadataList.addAll(workerMetadataList);
  }

  // Projects this fragment down to the StageMetadata shipped to the servers (worker metadata plus
  // custom properties; the broker-only server-to-worker map is intentionally excluded).
  public StageMetadata toStageMetadata() {
    return new StageMetadata(_workerMetadataList, _customProperties);
  }

  // Replaces the current server-to-worker-id mapping in place.
  public void setServerInstanceToWorkerIdMap(Map<QueryServerInstance, List<Integer>> serverInstanceToWorkerIdMap) {
    _serverInstanceToWorkerIdMap.clear();
    _serverInstanceToWorkerIdMap.putAll(serverInstanceToWorkerIdMap);
  }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.pinot.query.planner;

import java.util.List;
import java.util.Set;
import org.apache.calcite.util.Pair;


/**
 * The {@code DispatchableSubPlan} is the dispatchable query execution plan from the result of
 * {@link org.apache.pinot.query.planner.logical.LogicalPlanner} and
 * {@link org.apache.pinot.query.planner.physical.PinotDispatchPlanner}.
 *
 * <p>The {@code DispatchableSubPlan} should contain the necessary stage boundary information and
 * the cross exchange information for:
 * <ul>
 *   <li>dispatch individual stages to executor.</li>
 *   <li>instruction for stage executor to establish connection channels to other stages.</li>
 *   <li>instruction for encoding data blocks & transferring between stages based on partitioning scheme.</li>
 * </ul>
 */
public class DispatchableSubPlan {
  // Result schema as (column index, column name) pairs.
  private final List<Pair<Integer, String>> _queryResultFields;
  // Dispatchable fragments; the list index serves as the stage id.
  private final List<DispatchablePlanFragment> _queryStageList;
  // All table names referenced by the query.
  private final Set<String> _tableNames;

  public DispatchableSubPlan(List<Pair<Integer, String>> fields, List<DispatchablePlanFragment> queryStageList,
      Set<String> tableNames) {
    _queryResultFields = fields;
    _queryStageList = queryStageList;
    _tableNames = tableNames;
  }

  /**
   * Get the list of dispatchable plan fragments, one per stage (list index is the stage id).
   * @return list of dispatchable plan fragments.
   */
  public List<DispatchablePlanFragment> getQueryStageList() {
    return _queryStageList;
  }

  /**
   * Get the query result fields as (column index, column name) pairs.
   * @return query result fields.
   */
  public List<Pair<Integer, String>> getQueryResultFields() {
    return _queryResultFields;
  }

  /**
   * Get the names of all tables referenced by the query.
   * @return table names.
   */
  public Set<String> getTableNames() {
    return _tableNames;
  }
}
Loading

0 comments on commit fe98bb0

Please sign in to comment.