[CELEBORN-1496] Differentiate map results with only different stageAttemptId

### What changes were proposed in this pull request?
Encode the map attempt number as `attemptNumber = (stageAttemptId << 16) | taskAttemptNumber` so that map results that differ only in their stageAttemptId can be differentiated.
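
For reference, here is a minimal sketch of the packing scheme (the `AttemptIdEncoding` class name and the decode helpers are illustrative only and not part of this PR):

```java
public final class AttemptIdEncoding {

  // High 16 bits carry the stage attempt number, low 16 bits the task attempt number.
  static int encode(int stageAttemptNumber, int taskAttemptNumber) {
    return (stageAttemptNumber << 16) | taskAttemptNumber;
  }

  static int decodeStageAttempt(int encoded) {
    return encoded >>> 16;
  }

  static int decodeTaskAttempt(int encoded) {
    return encoded & 0xFFFF;
  }

  public static void main(String[] args) {
    // Two attempts of the same map task that previously collided on attemptNumber = 1:
    System.out.println(encode(0, 1)); // 1     (stage attempt 0, task attempt 1)
    System.out.println(encode(1, 1)); // 65537 (stage attempt 1, task attempt 1)
  }
}
```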

### Why are the changes needed?
If map tasks that differ only in their stageAttemptId cannot be distinguished, shuffle read may mix the shuffle write batches of two different map task attempts, causing a data correctness issue. For example, a map task in stage attempt 0 and its re-run in stage attempt 1 can both report task attemptNumber = 1, so their pushed batches could not previously be told apart during shuffle read.

### Does this PR introduce _any_ user-facing change?
No

### How was this patch tested?
Added a unit test: org.apache.spark.shuffle.celeborn.SparkShuffleManagerSuite#testWrongSparkConf_MaxAttemptLimit

Closes apache#2609 from jiang13021/spark_stage_attempt_id.

Lead-authored-by: jiang13021 <[email protected]>
Co-authored-by: Fu Chen <[email protected]>
Co-authored-by: Shuang <[email protected]>
Signed-off-by: Shuang <[email protected]>
3 people committed Aug 30, 2024
1 parent 3ee672e commit 3853075
Showing 12 changed files with 160 additions and 29 deletions.
@@ -0,0 +1,53 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.spark.shuffle.celeborn;

import org.apache.spark.SparkConf;
import org.apache.spark.TaskContext;
import org.apache.spark.scheduler.DAGScheduler;

public class SparkCommonUtils {
  public static void validateAttemptConfig(SparkConf conf) throws IllegalArgumentException {
    int maxStageAttempts =
        conf.getInt(
            "spark.stage.maxConsecutiveAttempts",
            DAGScheduler.DEFAULT_MAX_CONSECUTIVE_STAGE_ATTEMPTS());
    // In Spark 2, the parameter is referred to as MAX_TASK_FAILURES, while in Spark 3, it has been
    // changed to TASK_MAX_FAILURES. The default value for both is consistently set to 4.
    int maxTaskAttempts = conf.getInt("spark.task.maxFailures", 4);
    if (maxStageAttempts >= (1 << 15) || maxTaskAttempts >= (1 << 16)) {
      // The map attemptId is a non-negative number constructed from
      // both stageAttemptNumber and taskAttemptNumber.
      // The high 16 bits of the map attemptId are used for the stageAttemptNumber,
      // and the low 16 bits are used for the taskAttemptNumber.
      // So spark.stage.maxConsecutiveAttempts should be less than 32768 (1 << 15)
      // and spark.task.maxFailures should be less than 65536 (1 << 16).
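      // For example, 32768 << 16 equals 1 << 31, which overflows into the sign bit of a
      // 32-bit int and would make the encoded attemptId negative.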
      throw new IllegalArgumentException(
          "The spark.stage.maxConsecutiveAttempts should be less than 32768 (currently "
              + maxStageAttempts
              + ") "
              + "and spark.task.maxFailures should be less than 65536 (currently "
              + maxTaskAttempts
              + ").");
    }
  }

  public static int getEncodedAttemptNumber(TaskContext context) {
    return (context.stageAttemptNumber() << 16) | context.attemptNumber();
  }
}
@@ -68,6 +68,7 @@ public class HashBasedShuffleWriter<K, V, C> extends ShuffleWriter<K, V> {
private final ShuffleWriteMetrics writeMetrics;
private final int shuffleId;
private final int mapId;
private final int encodedAttemptId;
private final TaskContext taskContext;
private final ShuffleClient shuffleClient;
private final int numMappers;
@@ -112,6 +113,7 @@ public HashBasedShuffleWriter(
this.mapId = mapId;
this.dep = handle.dependency();
this.shuffleId = shuffleId;
this.encodedAttemptId = SparkCommonUtils.getEncodedAttemptNumber(taskContext);
SerializerInstance serializer = dep.serializer().newInstance();
this.partitioner = dep.partitioner();
this.writeMetrics = taskContext.taskMetrics().shuffleWriteMetrics();
@@ -146,7 +148,7 @@ public HashBasedShuffleWriter(
new DataPusher(
shuffleId,
mapId,
taskContext.attemptNumber(),
encodedAttemptId,
taskContext.taskAttemptId(),
numMappers,
numPartitions,
@@ -279,7 +281,7 @@ private void pushGiantRecord(int partitionId, byte[] buffer, int numBytes) throw
shuffleClient.pushData(
shuffleId,
mapId,
taskContext.attemptNumber(),
encodedAttemptId,
partitionId,
buffer,
0,
@@ -333,7 +335,7 @@ private void close() throws IOException, InterruptedException {
// here we wait for all the in-flight batches sent by the dataPusher thread to return
dataPusher.waitOnTermination();
sendBufferPool.returnPushTaskQueue(dataPusher.getIdleQueue());
shuffleClient.prepareForMergeData(shuffleId, mapId, taskContext.attemptNumber());
shuffleClient.prepareForMergeData(shuffleId, mapId, encodedAttemptId);

// merge and push residual data to reduce network traffic
// NB: since the dataPusher thread has no in-flight data at this point,
@@ -345,7 +347,7 @@ private void close() throws IOException, InterruptedException {
shuffleClient.mergeData(
shuffleId,
mapId,
taskContext.attemptNumber(),
encodedAttemptId,
i,
sendBuffers[i],
0,
@@ -358,7 +360,7 @@ private void close() throws IOException, InterruptedException {
writeMetrics.incBytesWritten(bytesWritten);
}
}
shuffleClient.pushMergedData(shuffleId, mapId, taskContext.attemptNumber());
shuffleClient.pushMergedData(shuffleId, mapId, encodedAttemptId);

updateMapStatus();

@@ -367,7 +369,7 @@ private void close() throws IOException, InterruptedException {
sendOffsets = null;

long waitStartTime = System.nanoTime();
shuffleClient.mapperEnd(shuffleId, mapId, taskContext.attemptNumber(), numMappers);
shuffleClient.mapperEnd(shuffleId, mapId, encodedAttemptId, numMappers);
writeMetrics.incWriteTime(System.nanoTime() - waitStartTime);

BlockManagerId bmId = SparkEnv.get().blockManager().shuffleServerId();
@@ -404,7 +406,7 @@ public Option<MapStatus> stop(boolean success) {
}
}
} finally {
shuffleClient.cleanup(shuffleId, mapId, taskContext.attemptNumber());
shuffleClient.cleanup(shuffleId, mapId, encodedAttemptId);
}
}
}
@@ -62,6 +62,7 @@ public class SortBasedShuffleWriter<K, V, C> extends ShuffleWriter<K, V> {
private final ShuffleWriteMetrics writeMetrics;
private final int shuffleId;
private final int mapId;
private final int encodedAttemptId;
private final TaskContext taskContext;
private final ShuffleClient shuffleClient;
private final int numMappers;
@@ -102,6 +103,7 @@ public SortBasedShuffleWriter(
this.mapId = taskContext.partitionId();
this.dep = dep;
this.shuffleId = shuffleId;
this.encodedAttemptId = SparkCommonUtils.getEncodedAttemptNumber(taskContext);
SerializerInstance serializer = dep.serializer().newInstance();
this.partitioner = dep.partitioner();
this.writeMetrics = taskContext.taskMetrics().shuffleWriteMetrics();
@@ -130,7 +132,7 @@ public SortBasedShuffleWriter(
taskContext,
shuffleId,
mapId,
taskContext.attemptNumber(),
encodedAttemptId,
taskContext.taskAttemptId(),
numMappers,
numPartitions,
@@ -285,7 +287,7 @@ private void pushGiantRecord(int partitionId, byte[] buffer, int numBytes) throw
shuffleClient.pushData(
shuffleId,
mapId,
taskContext.attemptNumber(),
encodedAttemptId,
partitionId,
buffer,
0,
@@ -309,12 +311,12 @@ private void close() throws IOException {
pusher.close(true);
writeMetrics.incWriteTime(System.nanoTime() - pushStartTime);

shuffleClient.pushMergedData(shuffleId, mapId, taskContext.attemptNumber());
shuffleClient.pushMergedData(shuffleId, mapId, encodedAttemptId);

updateMapStatus();

long waitStartTime = System.nanoTime();
shuffleClient.mapperEnd(shuffleId, mapId, taskContext.attemptNumber(), numMappers);
shuffleClient.mapperEnd(shuffleId, mapId, encodedAttemptId, numMappers);
writeMetrics.incWriteTime(System.nanoTime() - waitStartTime);
}

@@ -350,7 +352,7 @@ public Option<MapStatus> stop(boolean success) {
} catch (IOException e) {
return Option.apply(null);
} finally {
shuffleClient.cleanup(shuffleId, mapId, taskContext.attemptNumber());
shuffleClient.cleanup(shuffleId, mapId, encodedAttemptId);
}
}

@@ -66,6 +66,7 @@ public class SparkShuffleManager implements ShuffleManager {
private ExecutorShuffleIdTracker shuffleIdTracker = new ExecutorShuffleIdTracker();

public SparkShuffleManager(SparkConf conf, boolean isDriver) {
SparkCommonUtils.validateAttemptConfig(conf);
this.conf = conf;
this.isDriver = isDriver;
this.celebornConf = SparkUtils.fromSparkConf(conf);
@@ -56,6 +56,7 @@ class CelebornShuffleReader[K, C](
handle.extension)

private val exceptionRef = new AtomicReference[IOException]
private val encodedAttemptId = SparkCommonUtils.getEncodedAttemptNumber(context)

override def read(): Iterator[Product2[K, C]] = {

@@ -96,7 +97,7 @@ class CelebornShuffleReader[K, C](
val inputStream = shuffleClient.readPartition(
shuffleId,
partitionId,
context.attemptNumber(),
encodedAttemptId,
startMapIndex,
endMapIndex,
metricsCallback)
@@ -20,6 +20,7 @@ package org.apache.spark.shuffle.celeborn
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.internal.SQLConf
import org.junit
import org.junit.Assert
import org.junit.runner.RunWith
import org.junit.runners.JUnit4

@@ -67,4 +68,33 @@ class SparkShuffleManagerSuite extends Logging {
    sc.stop()
  }

  @junit.Test
  def testWrongSparkConfMaxAttemptLimit(): Unit = {
    val conf = new SparkConf().setIfMissing("spark.master", "local")
      .setIfMissing(
        "spark.shuffle.manager",
        "org.apache.spark.shuffle.celeborn.SparkShuffleManager")
      .set(s"spark.${CelebornConf.MASTER_ENDPOINTS.key}", "localhost:9097")
      .set(s"spark.${CelebornConf.CLIENT_PUSH_REPLICATE_ENABLED.key}", "false")
      .set("spark.shuffle.service.enabled", "false")
      .set(SQLConf.ADAPTIVE_EXECUTION_ENABLED.key, "true")

    // default conf, should succeed
    new SparkShuffleManager(conf, true)

    conf
      .set("spark.stage.maxConsecutiveAttempts", "32768")
      .set("spark.task.maxFailures", "10")
    try {
      new SparkShuffleManager(conf, true)
      Assert.fail()
    } catch {
      case e: IllegalArgumentException =>
        Assert.assertTrue(
          e.getMessage.contains("The spark.stage.maxConsecutiveAttempts should be less than 32768"))
      case _: Throwable =>
        Assert.fail()
    }
  }

}
@@ -17,7 +17,7 @@

package org.apache.spark.shuffle.celeborn

import org.apache.spark.{ShuffleDependency, SparkConf}
import org.apache.spark.{ShuffleDependency, SparkConf, TaskContext}
import org.apache.spark.serializer.{KryoSerializer, KryoSerializerInstance}
import org.apache.spark.sql.execution.UnsafeRowSerializer
import org.apache.spark.sql.execution.columnar.CelebornColumnarBatchSerializerInstance
@@ -45,14 +45,17 @@ class CelebornColumnarShuffleReaderSuite {

var shuffleClient: MockedStatic[ShuffleClient] = null
try {
val taskContext = Mockito.mock(classOf[TaskContext])
Mockito.when(taskContext.stageAttemptNumber).thenReturn(0)
Mockito.when(taskContext.attemptNumber).thenReturn(0)
shuffleClient = Mockito.mockStatic(classOf[ShuffleClient])
val shuffleReader = SparkUtils.createColumnarShuffleReader(
handle,
0,
10,
0,
10,
null,
taskContext,
new CelebornConf(),
null,
new ExecutorShuffleIdTracker())
@@ -68,6 +71,9 @@ def columnarShuffleReaderNewSerializerInstance(): Unit = {
def columnarShuffleReaderNewSerializerInstance(): Unit = {
var shuffleClient: MockedStatic[ShuffleClient] = null
try {
val taskContext = Mockito.mock(classOf[TaskContext])
Mockito.when(taskContext.stageAttemptNumber).thenReturn(0)
Mockito.when(taskContext.attemptNumber).thenReturn(0)
shuffleClient = Mockito.mockStatic(classOf[ShuffleClient])
val shuffleReader = SparkUtils.createColumnarShuffleReader(
new CelebornShuffleHandle[Int, String, String](
@@ -83,7 +89,7 @@
10,
0,
10,
null,
taskContext,
new CelebornConf(),
null,
new ExecutorShuffleIdTracker())
@@ -68,6 +68,7 @@ public class HashBasedShuffleWriter<K, V, C> extends ShuffleWriter<K, V> {
private final ShuffleWriteMetricsReporter writeMetrics;
private final int shuffleId;
private final int mapId;
private final int encodedAttemptId;
private final TaskContext taskContext;
private final ShuffleClient shuffleClient;
private final int numMappers;
@@ -112,6 +113,7 @@ public HashBasedShuffleWriter(
this.mapId = taskContext.partitionId();
this.dep = handle.dependency();
this.shuffleId = shuffleId;
this.encodedAttemptId = SparkCommonUtils.getEncodedAttemptNumber(taskContext);
SerializerInstance serializer = dep.serializer().newInstance();
this.partitioner = dep.partitioner();
this.writeMetrics = metrics;
@@ -142,7 +144,7 @@ public HashBasedShuffleWriter(
new DataPusher(
shuffleId,
mapId,
taskContext.attemptNumber(),
encodedAttemptId,
taskContext.taskAttemptId(),
numMappers,
numPartitions,
@@ -279,7 +281,7 @@ protected void pushGiantRecord(int partitionId, byte[] buffer, int numBytes) thr
shuffleClient.pushData(
shuffleId,
mapId,
taskContext.attemptNumber(),
encodedAttemptId,
partitionId,
buffer,
0,
@@ -343,7 +345,7 @@ protected void mergeData(int partitionId, byte[] buffer, int offset, int length)
shuffleClient.mergeData(
shuffleId,
mapId,
taskContext.attemptNumber(),
encodedAttemptId,
partitionId,
buffer,
offset,
@@ -368,14 +370,14 @@ private void close() throws IOException, InterruptedException {
long pushMergedDataTime = System.nanoTime();
dataPusher.waitOnTermination();
sendBufferPool.returnPushTaskQueue(dataPusher.getIdleQueue());
shuffleClient.prepareForMergeData(shuffleId, mapId, taskContext.attemptNumber());
shuffleClient.prepareForMergeData(shuffleId, mapId, encodedAttemptId);
closeWrite();
shuffleClient.pushMergedData(shuffleId, mapId, taskContext.attemptNumber());
shuffleClient.pushMergedData(shuffleId, mapId, encodedAttemptId);
writeMetrics.incWriteTime(System.nanoTime() - pushMergedDataTime);
updateRecordsWrittenMetrics();

long waitStartTime = System.nanoTime();
shuffleClient.mapperEnd(shuffleId, mapId, taskContext.attemptNumber(), numMappers);
shuffleClient.mapperEnd(shuffleId, mapId, encodedAttemptId, numMappers);
writeMetrics.incWriteTime(System.nanoTime() - waitStartTime);

BlockManagerId bmId = SparkEnv.get().blockManager().shuffleServerId();
@@ -408,7 +410,7 @@ public Option<MapStatus> stop(boolean success) {
}
}
} finally {
shuffleClient.cleanup(shuffleId, mapId, taskContext.attemptNumber());
shuffleClient.cleanup(shuffleId, mapId, encodedAttemptId);
}
}
