From 8ac0c18e89a7fa8b6b39ad136d4d9657fa04c84a Mon Sep 17 00:00:00 2001 From: Nicholas Jiang Date: Sat, 4 Jan 2025 08:28:27 +0800 Subject: [PATCH] [GLUTEN-8398] Bump Celeborn to 0.4.3 and 0.5.2 (#8399) --- .github/workflows/velox_backend.yml | 8 +++++-- dev/docker/Dockerfile.centos8-dynamic-build | 2 +- .../celeborn/CelebornShuffleManager.java | 22 ++++++++++++++++--- tools/gluten-it/pom.xml | 2 +- 4 files changed, 27 insertions(+), 7 deletions(-) diff --git a/.github/workflows/velox_backend.yml b/.github/workflows/velox_backend.yml index ada2ea3f23fa..ec79bc8b1bea 100644 --- a/.github/workflows/velox_backend.yml +++ b/.github/workflows/velox_backend.yml @@ -544,7 +544,7 @@ jobs: fail-fast: false matrix: spark: [ "spark-3.2" ] - celeborn: [ "celeborn-0.5.2", "celeborn-0.4.2", "celeborn-0.3.2-incubating" ] + celeborn: [ "celeborn-0.5.2", "celeborn-0.4.3", "celeborn-0.3.2-incubating" ] runs-on: ubuntu-20.04 container: apache/gluten:centos-8 steps: @@ -566,12 +566,16 @@ jobs: - name: TPC-H SF1.0 && TPC-DS SF1.0 Parquet local spark3.2 with ${{ matrix.celeborn }} run: | EXTRA_PROFILE="" - if [ "${{ matrix.celeborn }}" = "celeborn-0.4.2" ]; then + if [ "${{ matrix.celeborn }}" = "celeborn-0.4.3" ]; then EXTRA_PROFILE="-Pceleborn-0.4" elif [ "${{ matrix.celeborn }}" = "celeborn-0.5.2" ]; then EXTRA_PROFILE="-Pceleborn-0.5" fi echo "EXTRA_PROFILE: ${EXTRA_PROFILE}" + if [ ! -e "/opt/apache-${{ matrix.celeborn }}-bin.tgz" ]; then + echo "WARNING: please pre-install your required package in docker image since the downloading is throttled by this site." + wget -nv https://archive.apache.org/dist/celeborn/${{ matrix.celeborn }}/apache-${{ matrix.celeborn }}-bin.tgz -P /opt/ + fi cd /opt && mkdir -p celeborn && \ tar xzf apache-${{ matrix.celeborn }}-bin.tgz -C /opt/celeborn --strip-components=1 && cd celeborn && \ mv ./conf/celeborn-env.sh.template ./conf/celeborn-env.sh && \ diff --git a/dev/docker/Dockerfile.centos8-dynamic-build b/dev/docker/Dockerfile.centos8-dynamic-build index e0229697f61d..daeff3b729bd 100644 --- a/dev/docker/Dockerfile.centos8-dynamic-build +++ b/dev/docker/Dockerfile.centos8-dynamic-build @@ -16,7 +16,7 @@ RUN wget --no-check-certificate https://downloads.apache.org/maven/maven-3/3.8.8 ENV PATH=${PATH}:/usr/lib/maven/bin RUN wget -nv https://archive.apache.org/dist/celeborn/celeborn-0.3.2-incubating/apache-celeborn-0.3.2-incubating-bin.tgz -P /opt/ -RUN wget -nv https://archive.apache.org/dist/celeborn/celeborn-0.4.2/apache-celeborn-0.4.2-bin.tgz -P /opt/ +RUN wget -nv https://archive.apache.org/dist/celeborn/celeborn-0.4.3/apache-celeborn-0.4.3-bin.tgz -P /opt/ RUN wget -nv https://archive.apache.org/dist/celeborn/celeborn-0.5.2/apache-celeborn-0.5.2-bin.tgz -P /opt/ RUN git clone --depth=1 https://github.com/apache/incubator-gluten /opt/gluten diff --git a/gluten-celeborn/src-celeborn/main/java/org/apache/spark/shuffle/gluten/celeborn/CelebornShuffleManager.java b/gluten-celeborn/src-celeborn/main/java/org/apache/spark/shuffle/gluten/celeborn/CelebornShuffleManager.java index a4d4c4f5c538..00c87b391f3a 100644 --- a/gluten-celeborn/src-celeborn/main/java/org/apache/spark/shuffle/gluten/celeborn/CelebornShuffleManager.java +++ b/gluten-celeborn/src-celeborn/main/java/org/apache/spark/shuffle/gluten/celeborn/CelebornShuffleManager.java @@ -190,6 +190,8 @@ private void initializeLifecycleManager() { private ShuffleHandle registerCelebornShuffleHandle( int shuffleId, ShuffleDependency dependency) { + // for Celeborn 0.4.0 + CelebornUtils.registerAppShuffleDeterminate(lifecycleManager, shuffleId, dependency); return CelebornUtils.getCelebornShuffleHandle( appUniqueId, lifecycleManager.getHost(), @@ -207,9 +209,6 @@ public ShuffleHandle registerShuffle( appUniqueId = SparkUtils.appUniqueId(dependency.rdd().context()); initializeLifecycleManager(); - // for Celeborn 0.4.0 - CelebornUtils.registerAppShuffleDeterminate(lifecycleManager, shuffleId, dependency); - // Note: generate app unique id at driver side, make sure dependency.rdd.context // is the same SparkContext among different shuffleIds. // This method may be called many times. @@ -307,6 +306,23 @@ public ShuffleWriter getWriter( false, extension); + // for Celeborn 0.5.2 + try { + Field field = CelebornShuffleHandle.class.getDeclaredField("throwsFetchFailure"); + field.setAccessible(true); + boolean throwsFetchFailure = (boolean) field.get(handle); + if (throwsFetchFailure) { + Method addFailureListenerMethod = + SparkUtils.class.getMethod( + "addFailureListenerIfBarrierTask", + ShuffleClient.class, + TaskContext.class, + CelebornShuffleHandle.class); + addFailureListenerMethod.invoke(null, shuffleClient, context, h); + } + } catch (NoSuchFieldException | NoSuchMethodException ignored) { + } + int shuffleId; // for Celeborn 0.4.0 diff --git a/tools/gluten-it/pom.xml b/tools/gluten-it/pom.xml index 8120bc5ea1eb..9b1cf10df891 100644 --- a/tools/gluten-it/pom.xml +++ b/tools/gluten-it/pom.xml @@ -177,7 +177,7 @@ celeborn-0.4 - 0.4.2 + 0.4.3