[feature](refactor) New Spark Load (#214)
Currently, Spark Load is integrated into the Doris core, which brings the following problems:

- Because the Doris core depends on Spark-related libraries, security issues in Spark itself become security issues in Doris.
- Changes to the Spark ETL logic are tied to the Doris release cycle, which hinders rapid iteration and bug fixing.
- The dependency on the Hadoop ecosystem further increases the complexity of the Doris system.

Therefore, separating Spark Load's task submission, task management, and Spark DPP processing from the Doris core reduces the complexity of the core and helps unify Doris's Spark ecosystem tools.
gnehil authored Dec 24, 2024
1 parent 157ef07 commit 2c3d9dd
Showing 74 changed files with 13,141 additions and 1 deletion.
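
Judging from the paths added in this commit, the new spark-load directory introduces four Maven modules; the role notes below are inferred from the module names and the commit message, not stated in the diff itself:

spark-load/
├── spark-load-core/      (Spark Load job submission and management)
├── spark-load-dpp/       (Spark DPP data processing)
├── spark-load-dist/      (distribution packaging)
└── spark-load-common/    (shared code)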
15 changes: 15 additions & 0 deletions .gitignore
@@ -12,6 +12,21 @@ dependency-reduced-pom.xml
 target
 .flattened-pom.xml
 
+spark-load/.idea/
+spark-load/target
+spark-load/spark-load-core/dependency-reduced-pom.xml
+spark-load/spark-load-core/output/
+spark-load/spark-load-core/target/
+spark-load/spark-load-core/.idea/
+spark-load/spark-load-dist/dependency-reduced-pom.xml
+spark-load/spark-load-dist/target/
+spark-load/spark-load-dpp/dependency-reduced-pom.xml
+spark-load/spark-load-dpp/.flattened-pom.xml
+spark-load/spark-load-dpp/target/
+spark-load/spark-load-common/dependency-reduced-pom.xml
+spark-load/spark-load-common/target/
+
+
 ### Java template
 # Compiled class file
 *.class
@@ -19,7 +19,8 @@ package org.apache.doris.spark.sql
 
 import org.apache.spark.sql.SparkSession
 import org.apache.spark.{SparkConf, SparkContext}
-import org.junit.{Ignore, Test}
+import org.junit.Ignore
+import org.junit.Test
 
 // This test needs real connection info to run.
 // Set the connection info before commenting out this @Ignore.
175 changes: 175 additions & 0 deletions spark-load/build.sh
@@ -0,0 +1,175 @@
#!/usr/bin/env bash
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

##############################################################
# This script is used to compile Spark-Load
# Usage:
# sh build.sh
#
##############################################################

# Bugzilla 37848: When no TTY is available, don't output to console
have_tty=0
# shellcheck disable=SC2006
if [[ "`tty`" != "not a tty" ]]; then
    have_tty=1
fi

# Only use colors if connected to a terminal
if [[ ${have_tty} -eq 1 ]]; then
    PRIMARY=$(printf '\033[38;5;082m')
    RED=$(printf '\033[31m')
    GREEN=$(printf '\033[32m')
    YELLOW=$(printf '\033[33m')
    BLUE=$(printf '\033[34m')
    WHITE=$(printf '\033[37m')
    BOLD=$(printf '\033[1m')
    RESET=$(printf '\033[0m')
else
    PRIMARY=""
    RED=""
    GREEN=""
    YELLOW=""
    BLUE=""
    WHITE=""
    BOLD=""
    RESET=""
fi

echo_r () {
    # Color red: Error, Failed
    [[ $# -ne 1 ]] && return 1
    # shellcheck disable=SC2059
    printf "[%sDoris%s] %s$1%s\n" "$BLUE" "$RESET" "$RED" "$RESET"
}

echo_g () {
    # Color green: Success
    [[ $# -ne 1 ]] && return 1
    # shellcheck disable=SC2059
    printf "[%sDoris%s] %s$1%s\n" "$BLUE" "$RESET" "$GREEN" "$RESET"
}

echo_y () {
    # Color yellow: Warning
    [[ $# -ne 1 ]] && return 1
    # shellcheck disable=SC2059
    printf "[%sDoris%s] %s$1%s\n" "$BLUE" "$RESET" "$YELLOW" "$RESET"
}

echo_w () {
    # Color white: Info
    [[ $# -ne 1 ]] && return 1
    # shellcheck disable=SC2059
    printf "[%sDoris%s] %s$1%s\n" "$BLUE" "$RESET" "$WHITE" "$RESET"
}

# OS specific support. $var _must_ be set to either true or false.
cygwin=false
os400=false
# shellcheck disable=SC2006
case "`uname`" in
CYGWIN*) cygwin=true;;
OS400*) os400=true;;
esac

# resolve links - $0 may be a softlink
PRG="$0"

while [[ -h "$PRG" ]]; do
    # shellcheck disable=SC2006
    ls=`ls -ld "$PRG"`
    # shellcheck disable=SC2006
    link=`expr "$ls" : '.*-> \(.*\)$'`
    if expr "$link" : '/.*' > /dev/null; then
        PRG="$link"
    else
        # shellcheck disable=SC2006
        PRG=`dirname "$PRG"`/"$link"
    fi
done

# Get standard environment variables
ROOT=$(cd "$(dirname "$PRG")" &>/dev/null && pwd)
DORIS_HOME=$(cd "$ROOT/../" &>/dev/null && pwd)
export DORIS_HOME

# env.sh is expected to provide common build settings, e.g. MVN_BIN used below
. "${DORIS_HOME}"/env.sh

# include custom environment variables
if [[ -f ${DORIS_HOME}/custom_env.sh ]]; then
    . "${DORIS_HOME}"/custom_env.sh
fi

# Prompt for the target Spark major version; the choice is reported
# via the function's return code: 1 = Spark 2.x, 2 = Spark 3.x.
selectSpark() {
    echo 'Spark-Load supports multiple versions of Spark. Which version do you need?'
    select spark in "2.x" "3.x" "other"
    do
        case $spark in
            "2.x")
                return 1
                ;;
            "3.x")
                return 2
                ;;
            *)
                echo "invalid selection, exiting..."
                exit 1
                ;;
        esac
    done
}

SPARK_VERSION=0
selectSpark
SparkVer=$?
# Map the selection to the Maven profile names used by the build
if [ ${SparkVer} -eq 1 ]; then
    SPARK_VERSION="spark2"
    SCALA_VERSION="scala_2.11"
elif [ ${SparkVer} -eq 2 ]; then
    SPARK_VERSION="spark3"
    SCALA_VERSION="scala_2.12"
fi

echo_g "Spark Load will be built with profiles: ${SPARK_VERSION} and ${SCALA_VERSION}"
echo_g "build starting..."

# Activate the Maven profiles for the selected Spark and Scala versions;
# any extra command-line arguments are passed through to Maven.
${MVN_BIN} clean package -P${SPARK_VERSION},${SCALA_VERSION} "$@"

EXIT_CODE=$?
if [ $EXIT_CODE -eq 0 ]; then
    DIST_DIR=${DORIS_HOME}/dist
    [ ! -d "$DIST_DIR" ] && mkdir "$DIST_DIR"
    # refresh the packaged spark-load artifact in the dist directory
    dist_jar=$(ls "${ROOT}"/target | grep "spark-load-")
    rm -rf "${DIST_DIR}"/"${dist_jar}"
    cp "${ROOT}"/target/"${dist_jar}" "$DIST_DIR"

    echo_g "*****************************************************************"
    echo_g "Successfully built Spark-Load"
    echo_g "dist: $DIST_DIR/$dist_jar"
    echo_g "*****************************************************************"
    exit 0
else
    echo_r "Failed to build Spark-Load"
    exit $EXIT_CODE
fi
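
For reference, a typical interactive run of build.sh looks roughly like the sketch below. This is illustrative only: the numbered menu and the "#? " prompt are the defaults of bash's select builtin, Maven output is elided, and the dist path and jar name are placeholders.

$ sh build.sh
Spark-Load supports multiple versions of Spark. Which version do you need?
1) 2.x
2) 3.x
3) other
#? 2
[Doris] Spark Load will be built with profiles: spark3 and scala_2.12
[Doris] build starting...
...
[Doris] *****************************************************************
[Doris] Successfully built Spark-Load
[Doris] dist: /path/to/doris/dist/spark-load-dist.jar
[Doris] *****************************************************************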