fix conflict

apache · Aug 27, 2023 · 22b8417 · 22b8417
2 parents 0ce5938 + 2b657c5
commit 22b8417
Show file tree

Hide file tree

Showing 38 changed files with 1,402 additions and 104 deletions.
diff --git a/...nt-spark/spark-3/src/main/java/org/apache/spark/shuffle/celeborn/SparkShuffleManager.java b/...nt-spark/spark-3/src/main/java/org/apache/spark/shuffle/celeborn/SparkShuffleManager.java
@@ -129,7 +129,15 @@ public <K, V, C> ShuffleHandle registerShuffle(
 
     if (fallbackPolicyRunner.applyAllFallbackPolicy(
         lifecycleManager, dependency.partitioner().numPartitions())) {
-      logger.warn("Fallback to SortShuffleManager!");
+      if (conf.getBoolean("spark.dynamicAllocation.enabled", false)) {
+        logger.error(
+            "DRA is enabled but we fallback to vanilla Spark SortShuffleManager for "
+                + "shuffle: {} due to fallback policy. It may cause block can not found when reducer "
+                + "task fetch data.",
+            shuffleId);
+      } else {
+        logger.warn("Fallback to vanilla Spark SortShuffleManager for shuffle: {}", shuffleId);
+      }
       sortShuffleIds.add(shuffleId);
       return sortShuffleManager().registerShuffle(shuffleId, dependency);
     } else {

diff --git a/...park/spark-3/src/main/scala/org/apache/spark/shuffle/celeborn/CelebornShuffleReader.scala b/...park/spark-3/src/main/scala/org/apache/spark/shuffle/celeborn/CelebornShuffleReader.scala
@@ -97,11 +97,17 @@ class CelebornShuffleReader[K, C](
     }).flatMap(
       serializerInstance.deserializeStream(_).asKeyValueIterator)
 
+    val iterWithUpdatedRecordsRead =
+      if (GlutenColumnarBatchSerdeHelper.isGlutenSerde(serializerInstance.getClass.getName)) {
+        GlutenColumnarBatchSerdeHelper.withUpdatedRecordsRead(recordIter, metrics)
+      } else {
+        recordIter.map { record =>
+          metrics.incRecordsRead(1)
+          record
+        }
+      }
     val metricIter = CompletionIterator[(Any, Any), Iterator[(Any, Any)]](
-      recordIter.map { record =>
-        metrics.incRecordsRead(1)
-        record
-      },
+      iterWithUpdatedRecordsRead,
       context.taskMetrics().mergeShuffleReadMetrics())
 
     // An interruptible iterator must be used here in order to support task cancellation

diff --git a/...k-3/src/main/scala/org/apache/spark/shuffle/celeborn/GlutenColumnarBatchSerdeHelper.scala b/...k-3/src/main/scala/org/apache/spark/shuffle/celeborn/GlutenColumnarBatchSerdeHelper.scala
@@ -0,0 +1,44 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.shuffle.celeborn
+
+import org.apache.spark.shuffle.ShuffleReadMetricsReporter
+import org.apache.spark.sql.vectorized.ColumnarBatch
+
+/**
+ * Helper object for interoperating with Gluten's Celeborn columnar batch serializer.
+ *
+ * `isGlutenSerde` reports whether a serializer class name is exactly the fully qualified
+ * name of Gluten's `CelebornColumnarBatchSerializer`; the comparison is done on the name
+ * string, so this object does not reference the Gluten class itself.
+ *
+ * `withUpdatedRecordsRead` wraps a record iterator so that, as each record is pulled from
+ * the returned iterator, the records-read metric is increased by `numRows()` of the
+ * record's value and the record is passed through unchanged. The value is cast to
+ * `ColumnarBatch` without a type check, so callers are expected to use this method only
+ * when the values really are columnar batches (for example after `isGlutenSerde`
+ * returned true).
+ */
+object GlutenColumnarBatchSerdeHelper {
+
+  def isGlutenSerde(serdeName: String): Boolean = {
+    // scalastyle:off
+    // see Gluten
+    // https://github.com/oap-project/gluten/blob/main/gluten-celeborn/src/main/scala/org/apache/spark/shuffle/CelebornColumnarBatchSerializer.scala
+    // scalastyle:on
+    "org.apache.spark.shuffle.CelebornColumnarBatchSerializer".equals(serdeName)
+  }
+
+  def withUpdatedRecordsRead(
+      input: Iterator[(Any, Any)],
+      metrics: ShuffleReadMetricsReporter): Iterator[(Any, Any)] = {
+    input.map { record =>
+      metrics.incRecordsRead(record._2.asInstanceOf[ColumnarBatch].numRows())
+      record
+    }
+  }
+}
diff --git a/client/src/main/java/org/apache/celeborn/client/read/DfsPartitionReader.java b/client/src/main/java/org/apache/celeborn/client/read/DfsPartitionReader.java
@@ -47,7 +47,7 @@
 public class DfsPartitionReader implements PartitionReader {
   private static Logger logger = LoggerFactory.getLogger(DfsPartitionReader.class);
   PartitionLocation location;
-  private final int shuffleChunkSize;
+  private final long shuffleChunkSize;
   private final int fetchMaxReqsInFlight;
   private final LinkedBlockingQueue<ByteBuf> results;
   private final AtomicReference<IOException> exception = new AtomicReference<>();
@@ -66,7 +66,7 @@ public DfsPartitionReader(
       int startMapIndex,
       int endMapIndex)
       throws IOException {
-    shuffleChunkSize = (int) conf.shuffleChunkSize();
+    shuffleChunkSize = conf.dfsReadChunkSize();
     fetchMaxReqsInFlight = conf.clientFetchMaxReqsInFlight();
     results = new LinkedBlockingQueue<>();
 

diff --git a/common/src/main/proto/TransportMessages.proto b/common/src/main/proto/TransportMessages.proto
@@ -151,6 +151,7 @@ message PbHeartbeatFromWorker {
   string requestId = 8;
   map<string, PbResourceConsumption> userResourceConsumption = 9;
   map<string, int64> estimatedAppDiskUsage = 10;
+  bool highWorkload = 11;
 }
 
 message PbHeartbeatFromWorkerResponse {

diff --git a/common/src/main/scala/org/apache/celeborn/common/CelebornConf.scala b/common/src/main/scala/org/apache/celeborn/common/CelebornConf.scala
@@ -654,6 +654,7 @@ class CelebornConf(loadDefaults: Boolean) extends Cloneable with Logging with Se
   def workerPushMaxComponents: Int = get(WORKER_PUSH_COMPOSITEBUFFER_MAXCOMPONENTS)
   def workerFetchHeartbeatEnabled: Boolean = get(WORKER_FETCH_HEARTBEAT_ENABLED)
   def workerPartitionSplitEnabled: Boolean = get(WORKER_PARTITION_SPLIT_ENABLED)
+  def workerActiveConnectionMax: Option[Long] = get(WORKER_ACTIVE_CONNECTION_MAX)
 
   // //////////////////////////////////////////////////////
   //                 Metrics System                      //
@@ -804,6 +805,7 @@ class CelebornConf(loadDefaults: Boolean) extends Cloneable with Logging with Se
   def shuffleExpiredCheckIntervalMs: Long = get(SHUFFLE_EXPIRED_CHECK_INTERVAL)
   def shuffleManagerPort: Int = get(CLIENT_SHUFFLE_MANAGER_PORT)
   def shuffleChunkSize: Long = get(SHUFFLE_CHUNK_SIZE)
+  def dfsReadChunkSize: Long = get(CLIENT_FETCH_DFS_READ_CHUNK_SIZE)
   def shufflePartitionSplitMode: PartitionSplitMode =
     PartitionSplitMode.valueOf(get(SHUFFLE_PARTITION_SPLIT_MODE))
   def shufflePartitionSplitThreshold: Long = get(SHUFFLE_PARTITION_SPLIT_THRESHOLD)
@@ -1945,13 +1947,21 @@ object CelebornConf extends Logging {
 
   val SHUFFLE_CHUNK_SIZE: ConfigEntry[Long] =
     buildConf("celeborn.shuffle.chunk.size")
-      .categories("client", "worker")
+      .categories("worker")
       .version("0.2.0")
       .doc("Max chunk size of reducer's merged shuffle data. For example, if a reducer's " +
         "shuffle data is 128M and the data will need 16 fetch chunk requests to fetch.")
       .bytesConf(ByteUnit.BYTE)
       .createWithDefaultString("8m")
 
+  val CLIENT_FETCH_DFS_READ_CHUNK_SIZE: ConfigEntry[Long] =
+    buildConf("celeborn.client.fetch.dfsReadChunkSize")
+      .categories("client")
+      .version("0.3.1")
+      .doc("Max chunk size for DfsPartitionReader.")
+      .bytesConf(ByteUnit.BYTE)
+      .createWithDefaultString("8m")
+
   val WORKER_PARTITION_SPLIT_ENABLED: ConfigEntry[Boolean] =
     buildConf("celeborn.worker.shuffle.partitionSplit.enabled")
       .withAlternative("celeborn.worker.partition.split.enabled")
@@ -2685,6 +2695,16 @@ object CelebornConf extends Logging {
       .booleanConf
       .createWithDefault(false)
 
+  val WORKER_ACTIVE_CONNECTION_MAX: OptionalConfigEntry[Long] =
+    buildConf("celeborn.worker.activeConnection.max")
+      .categories("worker")
+      .doc("If the number of active connections on a worker exceeds this configuration value, " +
+        "the worker will be marked as high-load in the heartbeat report, " +
+        "and the master will not include that node in the response of RequestSlots.")
+      .version("0.3.1")
+      .longConf
+      .createOptional
+
   val APPLICATION_HEARTBEAT_INTERVAL: ConfigEntry[Long] =
     buildConf("celeborn.client.application.heartbeatInterval")
       .withAlternative("celeborn.application.heartbeatInterval")

diff --git a/common/src/main/scala/org/apache/celeborn/common/protocol/message/ControlMessages.scala b/common/src/main/scala/org/apache/celeborn/common/protocol/message/ControlMessages.scala
@@ -113,6 +113,7 @@ object ControlMessages extends Logging {
       userResourceConsumption: util.Map[UserIdentifier, ResourceConsumption],
       activeShuffleKeys: util.Set[String],
       estimatedAppDiskUsage: util.HashMap[String, java.lang.Long],
+      highWorkload: Boolean,
       override var requestId: String = ZERO_UUID) extends MasterRequestMessage
 
   case class HeartbeatFromWorkerResponse(
@@ -446,6 +447,7 @@ object ControlMessages extends Logging {
           userResourceConsumption,
           activeShuffleKeys,
           estimatedAppDiskUsage,
+          highWorkload,
           requestId) =>
       val pbDisks = disks.map(PbSerDeUtils.toPbDiskInfo).asJava
       val pbUserResourceConsumption =
@@ -460,6 +462,7 @@ object ControlMessages extends Logging {
         .setReplicatePort(replicatePort)
         .addAllActiveShuffleKeys(activeShuffleKeys)
         .putAllEstimatedAppDiskUsage(estimatedAppDiskUsage)
+        .setHighWorkload(highWorkload)
         .setRequestId(requestId)
         .build().toByteArray
       new TransportMessage(MessageType.HEARTBEAT_FROM_WORKER, payload)
@@ -824,6 +827,7 @@ object ControlMessages extends Logging {
           userResourceConsumption,
           activeShuffleKeys,
           estimatedAppDiskUsage,
+          pbHeartbeatFromWorker.getHighWorkload,
           pbHeartbeatFromWorker.getRequestId)
 
       case HEARTBEAT_FROM_WORKER_RESPONSE_VALUE =>