[CELEBORN-764] Fix celeborn on HDFS might clean using app directories. #1678

Closed · wants to merge 31 commits

Commits (31; the diff below shows changes from 4 commits)
c6b50e3
[CELEBORN-764] Fix celeborn on HDFS might clean using app directories.
FMX Jul 4, 2023
c407201
Update common/src/main/scala/org/apache/celeborn/common/CelebornConf.…
FMX Jul 5, 2023
d4e3595
update
FMX Jul 5, 2023
113f7df
update.
FMX Jul 5, 2023
cd37620
Update common/src/main/scala/org/apache/celeborn/common/util/Utils.scala
pan3793 Jul 5, 2023
b21b056
update.
FMX Jul 5, 2023
344787e
Merge remote-tracking branch 'origin/CELEBORN-764' into CELEBORN-764
FMX Jul 5, 2023
dd6f55d
update.
FMX Jul 5, 2023
84ceda8
address comments.
FMX Jul 5, 2023
acbcdc8
Update common/src/main/scala/org/apache/celeborn/common/util/Celeborn…
pan3793 Jul 5, 2023
59b99c3
Update common/src/main/scala/org/apache/celeborn/common/util/Celeborn…
pan3793 Jul 5, 2023
393bb53
Add new configs to avoid checking frequently.
FMX Jul 5, 2023
fc2d0d7
fix ut.
FMX Jul 5, 2023
f23aca4
update spelling and docs.
FMX Jul 5, 2023
ad21966
fix ut.
FMX Jul 5, 2023
760b924
test on cluster.
FMX Jul 5, 2023
8df0d4b
address comments.
FMX Jul 5, 2023
af59f07
update.
FMX Jul 5, 2023
22b6989
refine.
FMX Jul 5, 2023
9e47b1a
Update master/src/main/scala/org/apache/celeborn/service/deploy/maste…
pan3793 Jul 5, 2023
ff715b8
Update master/src/main/scala/org/apache/celeborn/service/deploy/maste…
pan3793 Jul 5, 2023
59d0869
Update master/src/main/scala/org/apache/celeborn/service/deploy/maste…
pan3793 Jul 5, 2023
2999a49
Update master/src/main/scala/org/apache/celeborn/service/deploy/maste…
pan3793 Jul 5, 2023
1d793b7
refine.
FMX Jul 5, 2023
d18430d
Merge remote-tracking branch 'origin/CELEBORN-764' into CELEBORN-764
FMX Jul 5, 2023
2fbcc57
refine.
FMX Jul 5, 2023
9499121
refine.
FMX Jul 5, 2023
1643cb6
refine.
FMX Jul 5, 2023
93689d9
refine.
FMX Jul 5, 2023
063bc9c
refine.
FMX Jul 5, 2023
e6ff705
refine.
FMX Jul 5, 2023
6 changes: 3 additions & 3 deletions README.md
@@ -128,7 +128,7 @@ celeborn.worker.commitFiles.threads 128
celeborn.master.slot.assign.policy roundrobin
celeborn.rpc.askTimeout 240s
celeborn.worker.flusher.hdfs.buffer.size 4m
- celeborn.worker.storage.hdfs.dir hdfs://<namenode>/celeborn
+ celeborn.storage.hdfs.dir hdfs://<namenode>/celeborn
celeborn.worker.replicate.fastFail.duration 240s

# If your hosts have disk raid or use lvm, set celeborn.worker.monitor.disk.enabled to false
@@ -175,7 +175,7 @@ celeborn.worker.commitFiles.threads 128
celeborn.master.slot.assign.policy roundrobin
celeborn.rpc.askTimeout 240s
celeborn.worker.flusher.hdfs.buffer.size 4m
- celeborn.worker.storage.hdfs.dir hdfs://<namenode>/celeborn
+ celeborn.storage.hdfs.dir hdfs://<namenode>/celeborn
celeborn.worker.replicate.fastFail.duration 240s

# If your hosts have disk raid or use lvm, set celeborn.worker.monitor.disk.enabled to false
@@ -259,7 +259,7 @@ spark.celeborn.client.push.replicate.enabled true
spark.sql.adaptive.localShuffleReader.enabled false

# If Celeborn is using HDFS
- spark.celeborn.worker.storage.hdfs.dir hdfs://<namenode>/celeborn
+ spark.celeborn.storage.hdfs.dir hdfs://<namenode>/celeborn

# we recommend enabling aqe support to gain better performance
spark.sql.adaptive.enabled true
14 changes: 3 additions & 11 deletions client/src/main/java/org/apache/celeborn/client/ShuffleClient.java
@@ -20,7 +20,6 @@
import java.io.IOException;
import java.util.concurrent.ConcurrentHashMap;

- import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -30,7 +29,7 @@
import org.apache.celeborn.common.identity.UserIdentifier;
import org.apache.celeborn.common.protocol.PartitionLocation;
import org.apache.celeborn.common.rpc.RpcEndpointRef;
- import org.apache.celeborn.common.util.CelebornHadoopUtils$;
+ import org.apache.celeborn.common.util.Utils;
import org.apache.celeborn.common.write.PushState;

/**
@@ -85,16 +84,9 @@ public static FileSystem getHdfsFs(CelebornConf conf) {
if (null == hdfsFs) {
synchronized (ShuffleClient.class) {
if (null == hdfsFs) {
- Configuration hdfsConfiguration = CelebornHadoopUtils$.MODULE$.newConfiguration(conf);
- // enable fs cache to avoid too many fs instances
- hdfsConfiguration.set("fs.hdfs.impl.disable.cache", "false");
- hdfsConfiguration.set("fs.viewfs.impl.disable.cache", "false");
- logger.info(
-     "Celeborn client will ignore cluster"
-         + " settings about fs.hdfs/viewfs.impl.disable.cache and set it to false");
try {
- hdfsFs = FileSystem.get(hdfsConfiguration);
- } catch (IOException e) {
+ hdfsFs = Utils.getHadoopFS(conf);
+ } catch (Exception e) {
System.err.println("Celeborn initialize hdfs failed.");
e.printStackTrace(System.err);
}
CelebornConf.scala
@@ -1955,11 +1955,10 @@ object CelebornConf extends Logging {
.createWithDefault(16)

val HDFS_DIR: OptionalConfigEntry[String] =
- buildConf("celeborn.worker.storage.hdfs.dir")
-   .withAlternative("celeborn.storage.hdfs.dir")
-   .categories("worker")
+ buildConf("celeborn.storage.hdfs.dir")
+   .categories("worker", "master", "client")
.version("0.2.0")
- .doc("HDFS dir configuration for Celeborn to access HDFS.")
+ .doc("HDFS base directory for Celeborn to store shuffle data.")
.stringConf
.createOptional

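For context, a minimal sketch (not code from this change) of how a component resolves the HDFS working path from this entry. It assumes the existing CelebornConf accessors hdfsDir, hasHDFSStorage, and workerWorkingDir that the worker and master code in this PR uses; resolveHdfsWorkPath is a hypothetical helper:

import org.apache.hadoop.fs.Path

import org.apache.celeborn.common.CelebornConf

object HdfsPathSketch {
  // Returns the HDFS work path only when HDFS storage is enabled and
  // celeborn.storage.hdfs.dir is set; conf.hdfsDir is empty when the entry is unset.
  def resolveHdfsWorkPath(conf: CelebornConf): Option[Path] = {
    if (conf.hasHDFSStorage && !conf.hdfsDir.isEmpty) {
      Some(new Path(conf.hdfsDir, conf.workerWorkingDir))
    } else {
      None
    }
  }
}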
CelebornHadoopUtils.scala
@@ -18,12 +18,23 @@
package org.apache.celeborn.common.util

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}

import org.apache.celeborn.common.CelebornConf
import org.apache.celeborn.common.internal.Logging

- object CelebornHadoopUtils {
+ object CelebornHadoopUtils extends Logging {
private[celeborn] def newConfiguration(conf: CelebornConf): Configuration = {
val hadoopConf = new Configuration()
if (!conf.hdfsDir.isEmpty) {
val path = new Path(conf.hdfsDir)
val scheme = path.toUri.getScheme
val disableCacheName = String.format("fs.%s.impl.disable.cache", scheme)
hadoopConf.set("dfs.replication", "2")
hadoopConf.set(disableCacheName, "false")
logInfo(s"Celeborn will ignore cluster settings $disableCacheName and " +
"set it to false")
}
appendSparkHadoopConfigs(conf, hadoopConf)
hadoopConf
}
@@ -34,4 +45,8 @@ object CelebornHadoopUtils {
hadoopConf.set(key.substring("celeborn.hadoop.".length), value)
}
}

def getHadoopFS(conf: CelebornConf): FileSystem = {
new Path(conf.hdfsDir).getFileSystem(CelebornHadoopUtils.newConfiguration(conf))
}
}
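A minimal usage sketch of the new helper, assuming a standalone caller that only probes the configured HDFS directory; CheckHdfsDir is a hypothetical object, not part of this PR:

import org.apache.hadoop.fs.Path

import org.apache.celeborn.common.CelebornConf
import org.apache.celeborn.common.util.CelebornHadoopUtils

object CheckHdfsDir {
  def main(args: Array[String]): Unit = {
    val conf = new CelebornConf()
    // getHadoopFS builds its Configuration via newConfiguration(conf), which, when
    // celeborn.storage.hdfs.dir is set, sets fs.<scheme>.impl.disable.cache to false
    // (so FileSystem instances are cached) and dfs.replication to 2, as shown above.
    val fs = CelebornHadoopUtils.getHadoopFS(conf)
    val base = new Path(conf.hdfsDir)
    println(s"HDFS dir $base exists: ${fs.exists(base)}")
  }
}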
Utils.scala
@@ -1072,4 +1072,5 @@ object Utils extends Logging {
}
labelPart(0).trim -> labelPart(1).trim
}

}
1 change: 1 addition & 0 deletions docs/configuration/client.md
@@ -97,4 +97,5 @@ license: |
| celeborn.client.spark.shuffle.writer | HASH | Celeborn supports the following kind of shuffle writers. 1. hash: hash-based shuffle writer works fine when shuffle partition count is normal; 2. sort: sort-based shuffle writer works fine when memory pressure is high or shuffle partition count is huge. | 0.3.0 |
| celeborn.master.endpoints | &lt;localhost&gt;:9097 | Endpoints of master nodes for celeborn client to connect, allowed pattern is: `<host1>:<port1>[,<host2>:<port2>]*`, e.g. `clb1:9097,clb2:9098,clb3:9099`. If the port is omitted, 9097 will be used. | 0.2.0 |
| celeborn.shuffle.chunk.size | 8m | Max chunk size of reducer's merged shuffle data. For example, if a reducer's shuffle data is 128M and the data will need 16 fetch chunk requests to fetch. | 0.2.0 |
+| celeborn.storage.hdfs.dir | &lt;undefined&gt; | HDFS base directory for Celeborn to store shuffle data. | 0.2.0 |
<!--end-include-->
1 change: 1 addition & 0 deletions docs/configuration/master.md
@@ -34,4 +34,5 @@ license: |
| celeborn.master.slot.assign.policy | ROUNDROBIN | Policy for master to assign slots, Celeborn supports two types of policy: roundrobin and loadaware. Loadaware policy will be ignored when `HDFS` is enabled in `celeborn.storage.activeTypes` | 0.3.0 |
| celeborn.master.userResourceConsumption.update.interval | 30s | Time length for a window about compute user resource consumption. | 0.3.0 |
| celeborn.storage.activeTypes | HDD,SSD | Enabled storage levels. Available options: HDD,SSD,HDFS. | 0.3.0 |
+| celeborn.storage.hdfs.dir | &lt;undefined&gt; | HDFS base directory for Celeborn to store shuffle data. | 0.2.0 |
<!--end-include-->
2 changes: 1 addition & 1 deletion docs/configuration/worker.md
@@ -23,6 +23,7 @@ license: |
| celeborn.master.estimatedPartitionSize.minSize | 8mb | Ignore partition size smaller than this configuration of partition size for estimation. | 0.3.0 |
| celeborn.shuffle.chunk.size | 8m | Max chunk size of reducer's merged shuffle data. For example, if a reducer's shuffle data is 128M and the data will need 16 fetch chunk requests to fetch. | 0.2.0 |
| celeborn.storage.activeTypes | HDD,SSD | Enabled storage levels. Available options: HDD,SSD,HDFS. | 0.3.0 |
+| celeborn.storage.hdfs.dir | &lt;undefined&gt; | HDFS base directory for Celeborn to store shuffle data. | 0.2.0 |
| celeborn.worker.bufferStream.threadsPerMountpoint | 8 | Threads count for read buffer per mount point. | 0.3.0 |
| celeborn.worker.closeIdleConnections | false | Whether worker will close idle connections. | 0.2.0 |
| celeborn.worker.commitFiles.threads | 32 | Thread number of worker to commit shuffle data files asynchronously. It's recommended to set at least `128` when `HDFS` is enabled in `celeborn.storage.activeTypes`. | 0.3.0 |
@@ -92,7 +93,6 @@ license: |
| celeborn.worker.storage.checkDirsEmpty.timeout | 1000ms | The wait time per retry for a worker to check if the working directory is cleaned up before registering with the master. | 0.3.0 |
| celeborn.worker.storage.dirs | &lt;undefined&gt; | Directory list to store shuffle data. It's recommended to configure one directory on each disk. Storage size limit can be set for each directory. For the sake of performance, there should be no more than 2 flush threads on the same disk partition if you are using HDD, and should be 8 or more flush threads on the same disk partition if you are using SSD. For example: `dir1[:capacity=][:disktype=][:flushthread=],dir2[:capacity=][:disktype=][:flushthread=]` | 0.2.0 |
| celeborn.worker.storage.disk.reserve.size | 5G | Celeborn worker reserved space for each disk. | 0.3.0 |
-| celeborn.worker.storage.hdfs.dir | &lt;undefined&gt; | HDFS dir configuration for Celeborn to access HDFS. | 0.2.0 |
| celeborn.worker.storage.workingDir | celeborn-worker/shuffle_data | Worker's working dir path name. | 0.3.0 |
| celeborn.worker.writer.close.timeout | 120s | Timeout for a file writer to close | 0.2.0 |
| celeborn.worker.writer.create.maxAttempts | 3 | Retry count for a file writer to create if its creation was failed. | 0.2.0 |
@@ -138,6 +138,12 @@ public void updateAppHeartbeatMeta(String appId, long time, long totalWritten, l
partitionTotalFileCount.add(fileCount);
}

public Set<String> getActiveAppIds() {
Reviewer comment (Contributor): Better to use the appHeartbeatTime map to get the active appIds; otherwise the master may hit a corner case.

Reviewer comment (Contributor): IMO, we can directly use the appHeartbeatTime keySet.

Reviewer comment (Contributor): +1, one application may have many shuffle keys.

return registeredShuffle.stream()
.map(key -> Utils.splitShuffleKey(key)._1)
.collect(Collectors.toSet());
}

public void updateAppLostMeta(String appId) {
registeredShuffle.stream()
.filter(shuffle -> shuffle.startsWith(appId))
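A minimal sketch of the reviewers' suggestion: derive the active application IDs from the heartbeat map instead of parsing registered shuffle keys. The surrounding class is Java; this sketch uses Scala for brevity and assumes an appHeartbeatTime field keyed by appId, as the discussion implies:

import java.util.{Set => JSet}
import java.util.concurrent.ConcurrentHashMap

object ActiveAppIdsSketch {
  // Assumed field: appId -> last heartbeat timestamp in milliseconds.
  val appHeartbeatTime = new ConcurrentHashMap[String, java.lang.Long]()

  // Suggested alternative: the active app IDs are simply the map's key set, which
  // avoids parsing every registered shuffle key (an app may have many) and sidesteps
  // the corner case the reviewers mention.
  def getActiveAppIds: JSet[String] = appHeartbeatTime.keySet()
}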
Master.scala
@@ -25,6 +25,8 @@ import java.util.concurrent.{ConcurrentHashMap, ScheduledFuture, TimeUnit}
import scala.collection.JavaConverters._
import scala.util.Random

import org.apache.hadoop.fs.{FileSystem, Path}

import org.apache.celeborn.common.CelebornConf
import org.apache.celeborn.common.client.MasterClient
import org.apache.celeborn.common.identity.UserIdentifier
@@ -141,6 +143,7 @@
// init and register master metrics
val resourceConsumptionSource = new ResourceConsumptionSource(conf)
private val masterSource = new MasterSource(conf)
private var hadoopFs: FileSystem = _
masterSource.addGauge(MasterSource.REGISTERED_SHUFFLE_COUNT) { () =>
statusSystem.registeredShuffle.size
}
@@ -653,11 +656,33 @@
override def run(): Unit = {
statusSystem.handleAppLost(appId, requestId)
logInfo(s"Removed application $appId")
// only leader can clean hdfs dirs
if (conf.hasHDFSStorage && !conf.hdfsDir.isEmpty) {
cleanExpiredAppDirsOnHDFS()
Reviewer comment (Contributor): This may cost a lot, since applications may be lost frequently in a big cluster. We had better not clean expired app dirs in handleApplicationLost; instead we should do it in timeoutDeadApplications, after handleApplicationLost. And if it is still costly, I suggest we cache the list and refresh it every few (e.g. 3) minutes.

Author reply (Contributor): Sounds reasonable. I'll move the cleanup logic to timeoutDeadApplications. I think listing directories won't cost a lot.

Reviewer comment (Contributor): I also think it's not a good idea to call cleanExpiredAppDirsOnHDFS when an application is lost. IMO we can just use forwardMessageThread.scheduleAtFixedRate to check, like checkForWorkerTimeOutTask and checkForApplicationTimeOutTask. We also need to change forwardMessageThread from a single thread to multiple threads.

Reviewer comment (Contributor): Also, we need to ensure that the leader has replayed all Raft logs before cleanup.

}
context.reply(ApplicationLostResponse(StatusCode.SUCCESS))
}
})
}

private def cleanExpiredAppDirsOnHDFS(): Unit = {
val activeAppIds = statusSystem.getActiveAppIds
if (hadoopFs == null) {
hadoopFs = Utils.getHadoopFS(conf)
}
val hdfsWorkPath = new Path(conf.hdfsDir, conf.workerWorkingDir)
if (hadoopFs.exists(hdfsWorkPath)) {
val iter = hadoopFs.listStatusIterator(hdfsWorkPath)
while (iter.hasNext) {
val fileStatus = iter.next()
if (!activeAppIds.contains(fileStatus.getPath.getName)) {
logDebug(s"Clean hdfs dir ${fileStatus.getPath.toString}")
hadoopFs.delete(fileStatus.getPath, true)
}
}
}
}

private def handleHeartbeatFromApplication(
context: RpcCallContext,
appId: String,
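A self-contained sketch of the reviewers' suggested direction: run the cleanup on a fixed schedule instead of on every ApplicationLost event. The scheduler, interval, and stub below are illustrative assumptions; the PR's later commits add their own config "to avoid checking frequently" (see the commit list), and this wiring is not taken from them:

import java.util.concurrent.{Executors, TimeUnit}

object HdfsCleanupSchedulerSketch {
  // Assumed check interval (30 minutes).
  val checkIntervalMs: Long = 30 * 60 * 1000L

  // Stub standing in for Master.cleanExpiredAppDirsOnHDFS() shown above.
  def cleanExpiredAppDirsOnHDFS(): Unit =
    println("list the HDFS work dir and delete directories of inactive apps")

  def main(args: Array[String]): Unit = {
    val scheduler = Executors.newSingleThreadScheduledExecutor()
    // Periodic check, like checkForWorkerTimeOutTask / checkForApplicationTimeOutTask;
    // only the leader should run it, and only after its Raft logs are fully replayed.
    scheduler.scheduleAtFixedRate(
      new Runnable { override def run(): Unit = cleanExpiredAppDirsOnHDFS() },
      checkIntervalMs,
      checkIntervalMs,
      TimeUnit.MILLISECONDS)
  }
}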
StorageManager.scala
@@ -120,22 +120,12 @@ final private[worker] class StorageManager(conf: CelebornConf, workerSource: Abs
deviceMonitor.startCheck()

val hdfsDir = conf.hdfsDir
- if (!hdfsDir.isEmpty && conf.hasHDFSStorage) {
-   logInfo(s"Initialize HDFS support with path ${hdfsDir}")
- }
val hdfsPermission = new FsPermission("755")
val hdfsWriters = JavaUtils.newConcurrentHashMap[String, FileWriter]()
val (hdfsFlusher, _totalHdfsFlusherThread) =
if (!hdfsDir.isEmpty && conf.hasHDFSStorage) {
- val path = new Path(hdfsDir)
- val scheme = path.toUri.getScheme
- val disableCacheName = String.format("fs.%s.impl.disable.cache", scheme)
- val hdfsConfiguration = CelebornHadoopUtils.newConfiguration(conf)
- hdfsConfiguration.set("dfs.replication", "2")
- hdfsConfiguration.set(disableCacheName, "false")
- logInfo("Celeborn will ignore cluster settings " +
-   disableCacheName + " and set it to false")
- StorageManager.hadoopFs = path.getFileSystem(hdfsConfiguration)
+ logInfo(s"Initialize HDFS support with path ${hdfsDir}")
+ StorageManager.hadoopFs = Utils.getHadoopFS(conf)
(
Some(new HdfsFlusher(
workerSource,
@@ -527,19 +517,6 @@ final private[worker] class StorageManager(conf: CelebornConf, workerSource: Abs
case _ => // do nothing
}
}

- if (hadoopFs != null) {
- val hdfsWorkPath = new Path(hdfsDir, conf.workerWorkingDir)
- if (hadoopFs.exists(hdfsWorkPath)) {
- val iter = hadoopFs.listStatusIterator(hdfsWorkPath)
- while (iter.hasNext) {
- val fileStatus = iter.next()
- if (!appIds.contains(fileStatus.getPath.getName)) {
- hadoopFs.delete(fileStatus.getPath, true)
- }
- }
- }
- }
}

private def deleteDirectory(dir: File, threadPool: ThreadPoolExecutor): Unit = {