[FLINK-36067][runtime] Support optimize stream graph based on input info.
JunRuiLee committed Dec 12, 2024
1 parent 02cdbf3 commit 9b2cc2a
Showing 20 changed files with 628 additions and 86 deletions.
@@ -63,6 +63,7 @@ public class IntermediateResult {
private final int numParallelProducers;

private final ExecutionPlanSchedulingContext executionPlanSchedulingContext;
private final boolean produceBroadcastResult;

private int partitionsAssigned;

@@ -102,6 +103,8 @@ public IntermediateResult(
this.shuffleDescriptorCache = new HashMap<>();

this.executionPlanSchedulingContext = checkNotNull(executionPlanSchedulingContext);

this.produceBroadcastResult = intermediateDataSet.isBroadcast();
}

public boolean areAllConsumerVerticesCreated() {
@@ -207,6 +210,10 @@ public boolean isForward() {
return intermediateDataSet.isForward();
}

public boolean isEveryConsumerConsumeAllSubPartitions() {
return !produceBroadcastResult && intermediateDataSet.isBroadcast();
}

public int getConnectionIndex() {
return connectionIndex;
}
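
A note on the interplay of the two flags above: produceBroadcastResult snapshots the data set's broadcast flag at construction time, while intermediateDataSet.isBroadcast() reflects later rewrites (see updateOutputPattern further down), so isEveryConsumerConsumeAllSubPartitions() is true exactly when a result that was produced as non-broadcast is later consumed as broadcast. A minimal, self-contained sketch of that behavior (the class shapes are illustrative; only the two flags mirror the diff):

// Sketch: a result created before its edge is rewritten to broadcast.
public class BroadcastFlagSketch {
    static class DataSet {
        private boolean broadcast;
        boolean isBroadcast() { return broadcast; }
        void markBroadcast() { broadcast = true; }
    }

    static class Result {
        private final DataSet dataSet;
        private final boolean produceBroadcastResult; // frozen at creation

        Result(DataSet dataSet) {
            this.dataSet = dataSet;
            this.produceBroadcastResult = dataSet.isBroadcast();
        }

        boolean isEveryConsumerConsumeAllSubPartitions() {
            // True only for the "became broadcast later" case: the data is
            // spread over all subpartitions, so each consumer reads them all.
            return !produceBroadcastResult && dataSet.isBroadcast();
        }
    }

    public static void main(String[] args) {
        DataSet ds = new DataSet();
        Result result = new Result(ds);
        System.out.println(result.isEveryConsumerConsumeAllSubPartitions()); // false
        ds.markBroadcast(); // edge rewritten to broadcast after creation
        System.out.println(result.isEveryConsumerConsumeAllSubPartitions()); // true
    }
}
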
@@ -35,6 +35,14 @@ public interface IntermediateResultInfo {
*/
boolean isBroadcast();

/**
* Indicates whether every downstream consumer needs to consume all produced sub-partitions.
*
* @return true if every downstream consumer needs to consume all produced sub-partitions, false
* otherwise.
*/
boolean isEveryConsumerConsumeAllSubPartitions();

/**
* Whether it is a pointwise result.
*
@@ -84,7 +84,8 @@ public static Map<IntermediateDataSetID, JobVertexInputInfo> computeVertexInputI
parallelism,
input::getNumSubpartitions,
isDynamicGraph,
input.isBroadcast()));
input.isBroadcast(),
input.isEveryConsumerConsumeAllSubPartitions()));
}
}

@@ -124,6 +125,7 @@ static JobVertexInputInfo computeVertexInputInfoForPointwise(
1,
() -> numOfSubpartitionsRetriever.apply(start),
isDynamicGraph,
false,
false);
executionVertexInputInfos.add(
new ExecutionVertexInputInfo(index, partitionRange, subpartitionRange));
@@ -145,6 +147,7 @@ static JobVertexInputInfo computeVertexInputInfoForPointwise(
numConsumers,
() -> numOfSubpartitionsRetriever.apply(finalPartitionNum),
isDynamicGraph,
false,
false);
executionVertexInputInfos.add(
new ExecutionVertexInputInfo(i, partitionRange, subpartitionRange));
@@ -165,14 +168,16 @@ static JobVertexInputInfo computeVertexInputInfoForPointwise(
* @param numOfSubpartitionsRetriever a retriever to get the number of subpartitions
* @param isDynamicGraph whether the graph is dynamic
* @param isBroadcast whether the edge is broadcast
* @param consumeAllSubpartitions whether the edge should consume all subpartitions
* @return the computed {@link JobVertexInputInfo}
*/
static JobVertexInputInfo computeVertexInputInfoForAllToAll(
int sourceCount,
int targetCount,
Function<Integer, Integer> numOfSubpartitionsRetriever,
boolean isDynamicGraph,
boolean isBroadcast) {
boolean isBroadcast,
boolean consumeAllSubpartitions) {
final List<ExecutionVertexInputInfo> executionVertexInputInfos = new ArrayList<>();
IndexRange partitionRange = new IndexRange(0, sourceCount - 1);
for (int i = 0; i < targetCount; ++i) {
@@ -182,7 +187,8 @@ static JobVertexInputInfo computeVertexInputInfoForAllToAll(
targetCount,
() -> numOfSubpartitionsRetriever.apply(0),
isDynamicGraph,
isBroadcast);
isBroadcast,
consumeAllSubpartitions);
executionVertexInputInfos.add(
new ExecutionVertexInputInfo(i, partitionRange, subpartitionRange));
}
@@ -199,6 +205,7 @@ static JobVertexInputInfo computeVertexInputInfoForAllToAll(
* @param numOfSubpartitionsSupplier a supplier to get the number of subpartitions
* @param isDynamicGraph whether the graph is dynamic
* @param isBroadcast whether the edge is broadcast
* @param consumeAllSubpartitions whether the edge should consume all subpartitions
* @return the computed subpartition range
*/
@VisibleForTesting
@@ -207,16 +214,21 @@ static IndexRange computeConsumedSubpartitionRange(
int numConsumers,
Supplier<Integer> numOfSubpartitionsSupplier,
boolean isDynamicGraph,
boolean isBroadcast) {
boolean isBroadcast,
boolean consumeAllSubpartitions) {
int consumerIndex = consumerSubtaskIndex % numConsumers;
if (!isDynamicGraph) {
return new IndexRange(consumerIndex, consumerIndex);
} else {
int numSubpartitions = numOfSubpartitionsSupplier.get();
if (isBroadcast) {
// broadcast results have only one subpartition, and will be consumed multiple times.
checkArgument(numSubpartitions == 1);
return new IndexRange(0, 0);
if (consumeAllSubpartitions) {
return new IndexRange(0, numSubpartitions - 1);
} else {
// broadcast results have only one subpartition, and will be consumed multiple times.
checkArgument(numSubpartitions == 1);
return new IndexRange(0, 0);
}
} else {
checkArgument(consumerIndex < numConsumers);
checkArgument(numConsumers <= numSubpartitions);
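
To make the new consumeAllSubpartitions branch concrete, here is a standalone, simplified re-implementation with a worked example. The branch structure follows the diff; IndexRange is replaced by an int[] pair, and the final non-broadcast branch is condensed to an even split:

import java.util.function.Supplier;

public class SubpartitionRangeSketch {
    /** Simplified version of computeConsumedSubpartitionRange; returns {start, end}. */
    static int[] consumedRange(
            int consumerSubtaskIndex,
            int numConsumers,
            Supplier<Integer> numOfSubpartitions,
            boolean isDynamicGraph,
            boolean isBroadcast,
            boolean consumeAllSubpartitions) {
        int consumerIndex = consumerSubtaskIndex % numConsumers;
        if (!isDynamicGraph) {
            // Static graph: subpartition index equals consumer index.
            return new int[] {consumerIndex, consumerIndex};
        }
        int n = numOfSubpartitions.get();
        if (isBroadcast) {
            if (consumeAllSubpartitions) {
                // Rewritten-to-broadcast result: the data sits in all n
                // subpartitions, so every consumer reads the full range.
                return new int[] {0, n - 1};
            }
            // Native broadcast result: a single subpartition, read by all.
            return new int[] {0, 0};
        }
        // Non-broadcast dynamic case (condensed): split subpartitions evenly.
        return new int[] {
            consumerIndex * n / numConsumers,
            (consumerIndex + 1) * n / numConsumers - 1
        };
    }

    public static void main(String[] args) {
        // 4 subpartitions, edge rewritten to broadcast: consumer 1 of 3
        // reads subpartitions 0..3 instead of only subpartition 0.
        int[] r = consumedRange(1, 3, () -> 4, true, true, true);
        System.out.println(r[0] + ".." + r[1]); // prints 0..3
    }
}
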
@@ -246,6 +258,11 @@ public boolean isBroadcast() {
return intermediateResult.isBroadcast();
}

@Override
public boolean isEveryConsumerConsumeAllSubPartitions() {
return intermediateResult.isEveryConsumerConsumeAllSubPartitions();
}

@Override
public boolean isPointwise() {
return intermediateResult.getConsumingDistributionPattern()
@@ -134,6 +134,18 @@ public void configure(
}
}

public void updateOutputPattern(
DistributionPattern distributionPattern, boolean isBroadcast, boolean isForward) {
checkState(consumers.isEmpty(), "The output job edges have already been added.");
checkState(
numJobEdgesToCreate == 1,
"Modification is not allowed when the subscribing output is reused.");

this.distributionPattern = distributionPattern;
this.isBroadcast = isBroadcast;
this.isForward = isForward;
}
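
The two checkState guards define when a rewrite is legal: before any consumer JobEdge has been attached, and only while a single output edge is planned for this data set. A hypothetical call site (not part of this diff; dataSet and the surrounding context are assumptions) that flips an edge to broadcast could look like:

import org.apache.flink.runtime.jobgraph.DistributionPattern;
import org.apache.flink.runtime.jobgraph.IntermediateDataSet;

// Hypothetical caller, for illustration only: rewrite the edge while the
// data set still has no consumers, using the method added in this diff.
class UpdateOutputPatternUsage {
    static void rewriteToBroadcast(IntermediateDataSet dataSet) {
        dataSet.updateOutputPattern(
                DistributionPattern.ALL_TO_ALL,
                /* isBroadcast */ true,
                /* isForward */ false);
        // Once a consumer JobEdge has been attached, the same call would fail
        // the first checkState: "The output job edges have already been added."
    }
}
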

public void increaseNumJobEdgesToCreate() {
this.numJobEdgesToCreate++;
}
@@ -44,11 +44,14 @@ abstract class AbstractBlockingResultInfo implements BlockingResultInfo {
protected final Map<Integer, long[]> subpartitionBytesByPartitionIndex;

AbstractBlockingResultInfo(
IntermediateDataSetID resultId, int numOfPartitions, int numOfSubpartitions) {
IntermediateDataSetID resultId,
int numOfPartitions,
int numOfSubpartitions,
Map<Integer, long[]> subpartitionBytesByPartitionIndex) {
this.resultId = checkNotNull(resultId);
this.numOfPartitions = numOfPartitions;
this.numOfSubpartitions = numOfSubpartitions;
this.subpartitionBytesByPartitionIndex = new HashMap<>();
this.subpartitionBytesByPartitionIndex = subpartitionBytesByPartitionIndex;
}

@Override
@@ -72,4 +75,9 @@ public void resetPartitionInfo(int partitionIndex) {
int getNumOfRecordedPartitions() {
return subpartitionBytesByPartitionIndex.size();
}

@Override
public Map<Integer, long[]> getSubpartitionBytesByPartitionIndex() {
return new HashMap<>(subpartitionBytesByPartitionIndex);
}
}
@@ -274,9 +274,11 @@ public void onNewJobVerticesAdded(List<JobVertex> newVertices, int pendingOperat
// 4. update json plan
getExecutionGraph().setJsonPlan(JsonPlanGenerator.generatePlan(getJobGraph()));

// 5. try aggregate subpartition bytes
// 5. update the DistributionPattern of the upstream results consumed by the newly created
// JobVertex and aggregate subpartition bytes.
for (JobVertex newVertex : newVertices) {
for (JobEdge input : newVertex.getInputs()) {
tryUpdateResultInfo(input.getSourceId(), input.getDistributionPattern());
Optional.ofNullable(blockingResultInfos.get(input.getSourceId()))
.ifPresent(this::maybeAggregateSubpartitionBytes);
}
@@ -932,21 +934,24 @@ private static void resetDynamicParallelism(Iterable<JobVertex> vertices) {
}
}

private static BlockingResultInfo createFromIntermediateResult(IntermediateResult result) {
private static BlockingResultInfo createFromIntermediateResult(
IntermediateResult result, Map<Integer, long[]> subpartitionBytesByPartitionIndex) {
checkArgument(result != null);
// Note that for a dynamic graph, different partitions in the same result have the same
// number of subpartitions.
if (result.getConsumingDistributionPattern() == DistributionPattern.POINTWISE) {
return new PointwiseBlockingResultInfo(
result.getId(),
result.getNumberOfAssignedPartitions(),
result.getPartitions()[0].getNumberOfSubpartitions());
result.getPartitions()[0].getNumberOfSubpartitions(),
subpartitionBytesByPartitionIndex);
} else {
return new AllToAllBlockingResultInfo(
result.getId(),
result.getNumberOfAssignedPartitions(),
result.getPartitions()[0].getNumberOfSubpartitions(),
result.isBroadcast());
result.isBroadcast(),
subpartitionBytesByPartitionIndex);
}
}

@@ -960,6 +965,26 @@ SpeculativeExecutionHandler getSpeculativeExecutionHandler() {
return speculativeExecutionHandler;
}

private void tryUpdateResultInfo(IntermediateDataSetID id, DistributionPattern targetPattern) {
if (blockingResultInfos.containsKey(id)) {
BlockingResultInfo resultInfo = blockingResultInfos.get(id);
IntermediateResult result = getExecutionGraph().getAllIntermediateResults().get(id);

if ((targetPattern == DistributionPattern.ALL_TO_ALL && resultInfo.isPointwise())
|| (targetPattern == DistributionPattern.POINTWISE
&& !resultInfo.isPointwise())) {

BlockingResultInfo newInfo =
createFromIntermediateResult(
result, resultInfo.getSubpartitionBytesByPartitionIndex());

blockingResultInfos.put(id, newInfo);
} else if (targetPattern == DistributionPattern.ALL_TO_ALL) {
((AllToAllBlockingResultInfo) resultInfo).setBroadcast(result.isBroadcast());
}
}
}
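
In effect, tryUpdateResultInfo keeps the cached blockingResultInfos consistent with edges whose distribution pattern was rewritten after the info was first created: if the pattern shape changed (pointwise vs. all-to-all), the info is rebuilt with the already-recorded bytes carried over through getSubpartitionBytesByPartitionIndex(); if the pattern stayed all-to-all, only the broadcast flag is pushed into the existing info (see AllToAllBlockingResultInfo.setBroadcast below). A condensed sketch of that decision, with simplified stand-in types:

import java.util.HashMap;
import java.util.Map;

public class ResultInfoUpdateSketch {
    enum Pattern { ALL_TO_ALL, POINTWISE }

    static class Info {
        final boolean pointwise;
        boolean broadcast;
        final Map<Integer, long[]> recordedBytes;

        Info(boolean pointwise, boolean broadcast, Map<Integer, long[]> bytes) {
            this.pointwise = pointwise;
            this.broadcast = broadcast;
            this.recordedBytes = bytes;
        }
    }

    /** Mirrors the three cases of tryUpdateResultInfo in condensed form. */
    static Info update(Info info, Pattern target, boolean resultIsBroadcast) {
        boolean shapeChanged =
                (target == Pattern.ALL_TO_ALL && info.pointwise)
                        || (target == Pattern.POINTWISE && !info.pointwise);
        if (shapeChanged) {
            // Rebuild under the new pattern, keeping the recorded bytes.
            return new Info(
                    target == Pattern.POINTWISE, resultIsBroadcast, info.recordedBytes);
        } else if (target == Pattern.ALL_TO_ALL) {
            // Same shape: only the broadcast flag may have changed.
            info.broadcast = resultIsBroadcast;
        }
        return info; // POINTWISE -> POINTWISE: nothing to do.
    }

    public static void main(String[] args) {
        Info info = new Info(true, false, new HashMap<Integer, long[]>());
        info = update(info, Pattern.ALL_TO_ALL, true);
        System.out.println(info.pointwise + " " + info.broadcast); // false true
    }
}
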

private class DefaultBatchJobRecoveryContext implements BatchJobRecoveryContext {

private final FailoverStrategy restartStrategyOnResultConsumable =
@@ -21,6 +21,7 @@
import org.apache.flink.runtime.jobgraph.JobGraph;
import org.apache.flink.streaming.api.graph.ExecutionPlan;
import org.apache.flink.streaming.api.graph.StreamGraph;
import org.apache.flink.util.DynamicCodeLoadingException;

import java.util.concurrent.Executor;

@@ -46,7 +47,8 @@ public class AdaptiveExecutionHandlerFactory {
public static AdaptiveExecutionHandler create(
ExecutionPlan executionPlan,
ClassLoader userClassLoader,
Executor serializationExecutor) {
Executor serializationExecutor)
throws DynamicCodeLoadingException {
if (executionPlan instanceof JobGraph) {
return new NonAdaptiveExecutionHandler((JobGraph) executionPlan);
} else {
@@ -18,6 +18,7 @@

package org.apache.flink.runtime.scheduler.adaptivebatch;

import org.apache.flink.annotation.VisibleForTesting;
import org.apache.flink.runtime.executiongraph.IndexRange;
import org.apache.flink.runtime.executiongraph.ResultPartitionBytes;
import org.apache.flink.runtime.jobgraph.IntermediateDataSetID;
@@ -26,7 +27,9 @@

import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.stream.Collectors;

@@ -35,21 +38,35 @@
/** Information of All-To-All result. */
public class AllToAllBlockingResultInfo extends AbstractBlockingResultInfo {

private final boolean isBroadcast;
private boolean isBroadcast;

private boolean everyConsumerConsumeAllSubPartitions;

/**
* Aggregated subpartition bytes, which aggregates the subpartition bytes with the same
* subpartition index in different partitions. Note that we can aggregate them because they will
* be consumed by the same downstream task.
*/
@Nullable private List<Long> aggregatedSubpartitionBytes;
@Nullable protected List<Long> aggregatedSubpartitionBytes;

@VisibleForTesting
AllToAllBlockingResultInfo(
IntermediateDataSetID resultId,
int numOfPartitions,
int numOfSubpartitions,
boolean isBroadcast,
boolean everyConsumerConsumeAllSubPartitions) {
this(resultId, numOfPartitions, numOfSubpartitions, isBroadcast, new HashMap<>());
this.everyConsumerConsumeAllSubPartitions = everyConsumerConsumeAllSubPartitions;
}

AllToAllBlockingResultInfo(
IntermediateDataSetID resultId,
int numOfPartitions,
int numOfSubpartitions,
boolean isBroadcast) {
super(resultId, numOfPartitions, numOfSubpartitions);
boolean isBroadcast,
Map<Integer, long[]> subpartitionBytesByPartitionIndex) {
super(resultId, numOfPartitions, numOfSubpartitions, subpartitionBytesByPartitionIndex);
this.isBroadcast = isBroadcast;
}

@@ -58,6 +75,21 @@ public boolean isBroadcast() {
return isBroadcast;
}

@Override
public boolean isEveryConsumerConsumeAllSubPartitions() {
return everyConsumerConsumeAllSubPartitions;
}

void setBroadcast(boolean broadcast) {
if (!this.isBroadcast && broadcast) {
everyConsumerConsumeAllSubPartitions = true;
} else if (this.isBroadcast && !broadcast) {
everyConsumerConsumeAllSubPartitions = false;
}

isBroadcast = broadcast;
}
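
The two branches form a small state machine: flipping a non-broadcast result to broadcast raises the consume-all flag (the data is already spread over all subpartitions), and flipping back lowers it; a result that was broadcast from the start never sets the flag, matching IntermediateResult.isEveryConsumerConsumeAllSubPartitions() above. A tiny self-contained check (only setBroadcast mirrors the diff; the rest is scaffolding):

public class SetBroadcastSketch {
    private boolean isBroadcast;
    private boolean everyConsumerConsumeAllSubPartitions;

    void setBroadcast(boolean broadcast) {
        if (!this.isBroadcast && broadcast) {
            // non-broadcast -> broadcast: consumers must read all subpartitions
            everyConsumerConsumeAllSubPartitions = true;
        } else if (this.isBroadcast && !broadcast) {
            // broadcast -> non-broadcast: back to normal range assignment
            everyConsumerConsumeAllSubPartitions = false;
        }
        isBroadcast = broadcast;
    }

    public static void main(String[] args) {
        SetBroadcastSketch info = new SetBroadcastSketch();
        info.setBroadcast(true);
        System.out.println(info.everyConsumerConsumeAllSubPartitions); // true
        info.setBroadcast(false);
        System.out.println(info.everyConsumerConsumeAllSubPartitions); // false
    }
}
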

@Override
public boolean isPointwise() {
return false;
@@ -83,7 +115,7 @@ public long getNumBytesProduced() {
List<Long> bytes =
Optional.ofNullable(aggregatedSubpartitionBytes)
.orElse(getAggregatedSubpartitionBytesInternal());
if (isBroadcast) {
if (isBroadcast && !everyConsumerConsumeAllSubPartitions) {
return bytes.get(0);
} else {
return bytes.stream().reduce(0L, Long::sum);
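
A worked example of the changed accounting, under the diff's semantics: take a result with two subpartitions whose aggregated sizes across all partitions are [60, 40]. For a native broadcast result this situation cannot arise (the checkArgument above enforces a single subpartition), and bytes.get(0) is already the size of one logical copy of the data. For a result rewritten to broadcast, however, the data is still spread over both subpartitions, so one logical copy is the sum 60 + 40 = 100; returning bytes.get(0) here would undercount the produced bytes.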
@@ -22,6 +22,8 @@
import org.apache.flink.runtime.executiongraph.IntermediateResultInfo;
import org.apache.flink.runtime.executiongraph.ResultPartitionBytes;

import java.util.Map;

/**
* The blocking result info, which will be used to calculate the vertex parallelism and input infos.
*/
@@ -67,4 +69,12 @@ public interface BlockingResultInfo extends IntermediateResultInfo {

/** Aggregates the subpartition bytes to reduce space usage. */
void aggregateSubpartitionBytes();

/**
* Gets subpartition bytes by partition index.
*
* @return a map with integer keys representing partition indices and long array values
* representing subpartition bytes.
*/
Map<Integer, long[]> getSubpartitionBytesByPartitionIndex();
}