Commit

Add spark 3.5
julien bignon committed Sep 5, 2024

Verified: signed with the committer's verified signature (theseion, Max Leske).
1 parent 71d65ed commit d29281f
Showing 27 changed files with 778 additions and 5 deletions.
3 changes: 2 additions & 1 deletion technologies/job/spark/spark-2.4/context.yaml
@@ -2,4 +2,5 @@ id: "2.4"
label: "2.4"
available: true
recommended: false
trustLevel: stable
trustLevel: deprecated
deprecationDate: "2024-09-01T00:00:00Z"
3 changes: 2 additions & 1 deletion technologies/job/spark/spark-3.0/context.yaml
@@ -2,4 +2,5 @@ id: "3.0"
label: "3.0"
available: true
recommended: false
trustLevel: stable
trustLevel: deprecated
deprecationDate: "2024-09-01T00:00:00Z"
5 changes: 3 additions & 2 deletions technologies/job/spark/spark-3.1/context.yaml
@@ -1,5 +1,6 @@
id: "3.1"
label: "3.1"
available: true
recommended: true
trustLevel: stable
recommended: false
trustLevel: deprecated
deprecationDate: "2024-09-01T00:00:00Z"
22 changes: 22 additions & 0 deletions technologies/job/spark/spark-3.5/build.gradle.kts
@@ -0,0 +1,22 @@
/*
* SPDX-License-Identifier: Apache-2.0
*
* Copyright 2019-2021.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* https://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import com.bmuschko.gradle.docker.DockerRemoteApiPlugin
import com.saagie.technologies.SaagieTechnologiesGradlePlugin

apply<DockerRemoteApiPlugin>()
apply<SaagieTechnologiesGradlePlugin>()
5 changes: 5 additions & 0 deletions technologies/job/spark/spark-3.5/context.yaml
@@ -0,0 +1,5 @@
id: "3.5"
label: "3.5"
available: true
recommended: true
trustLevel: stable
22 changes: 22 additions & 0 deletions technologies/job/spark/spark-3.5/innerContexts/jre/Dockerfile
@@ -0,0 +1,22 @@
ARG jre_major
FROM spark:3.5.2-scala2.12-java$jre_major-ubuntu

ENV PATH "$PATH:$SPARK_HOME/bin"
ENV LANG C.UTF-8

# LIGHT DEPENDENCIES START
USER root
RUN apt update -qq && apt install -yqq --no-install-recommends \
wget curl unzip krb5-user zip && \
rm -rf /var/lib/apt/lists/*

COPY entrypoint.sh /opt/
RUN chmod 755 /opt/entrypoint.sh

USER spark

# See the Hadoop version used by Spark and update it if necessary.
# See https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-aws/3.3.4 to get the matching version of aws-java-sdk-bundle
RUN wget -nv https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk-bundle/1.12.262/aws-java-sdk-bundle-1.12.262.jar && \
wget -nv https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-aws/3.3.4/hadoop-aws-3.3.4.jar && \
mv *.jar /opt/spark/jars/
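
The pinned hadoop-aws 3.3.4 / aws-java-sdk-bundle 1.12.262 pair is what enables S3 access through the s3a:// scheme. A minimal sketch of a job that relies on these jars (the bucket name, jar name, and credential variables are hypothetical):

    spark-submit \
      --conf spark.hadoop.fs.s3a.access.key="$AWS_ACCESS_KEY_ID" \
      --conf spark.hadoop.fs.s3a.secret.key="$AWS_SECRET_ACCESS_KEY" \
      --class Main my-job.jar s3a://my-bucket/input/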
18 changes: 18 additions & 0 deletions technologies/job/spark/spark-3.5/innerContexts/jre/context.yaml
@@ -0,0 +1,18 @@
id: java-scala
label: Java/Scala
available: true
trustLevel: stable
job:
features:
- type: COMMAND_LINE
label: Command line
mandatory: true
comment: Linux shell command to launch the job.
defaultValue: "spark-submit \\\n--conf spark.executor.memory=1G \\\n--conf spark.executor.cores=1 \\\n--conf spark.kubernetes.executor.limit.cores=1 \\\n--conf spark.executor.instances=2 \\\n--class=Main {file} arg1 arg2"
- type: ARTIFACT
label: Package
mandatory: true
comment: "Compatible upload file : .jar"
- type: SCHEDULER
label: Scheduled
mandatory: true
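
For readability, the escaped defaultValue above renders as the following multi-line command, where {file} is the platform's placeholder for the uploaded artifact:

    spark-submit \
    --conf spark.executor.memory=1G \
    --conf spark.executor.cores=1 \
    --conf spark.kubernetes.executor.limit.cores=1 \
    --conf spark.executor.instances=2 \
    --class=Main {file} arg1 arg2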
141 changes: 141 additions & 0 deletions technologies/job/spark/spark-3.5/innerContexts/jre/entrypoint.sh
@@ -0,0 +1,141 @@
#!/bin/bash
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Prevent any errors from being silently ignored
set -eo pipefail

attempt_setup_fake_passwd_entry() {
# Check whether there is a passwd entry for the container UID
local myuid; myuid="$(id -u)"
# If there is no passwd entry for the container UID, attempt to fake one
# You can also refer to https://github.com/docker-library/official-images/pull/13089#issuecomment-1534706523
# This resolves the OpenShift random-UID case.
# See also: https://github.com/docker-library/postgres/pull/448
if ! getent passwd "$myuid" &> /dev/null; then
local wrapper
for wrapper in {/usr,}/lib{/*,}/libnss_wrapper.so; do
if [ -s "$wrapper" ]; then
NSS_WRAPPER_PASSWD="$(mktemp)"
NSS_WRAPPER_GROUP="$(mktemp)"
export LD_PRELOAD="$wrapper" NSS_WRAPPER_PASSWD NSS_WRAPPER_GROUP
local mygid; mygid="$(id -g)"
printf 'spark:x:%s:%s:%s:%s:/bin/false\n' "$myuid" "$mygid" "${SPARK_USER_NAME:-anonymous uid}" "$SPARK_HOME" > "$NSS_WRAPPER_PASSWD"
printf 'spark:x:%s:\n' "$mygid" > "$NSS_WRAPPER_GROUP"
break
fi
done
fi
}

if [ -z "$JAVA_HOME" ]; then
JAVA_HOME=$(java -XshowSettings:properties -version 2>&1 > /dev/null | grep 'java.home' | awk '{print $3}')
fi

SPARK_CLASSPATH="$SPARK_CLASSPATH:${SPARK_HOME}/jars/*"
for v in "${!SPARK_JAVA_OPT_@}"; do
SPARK_EXECUTOR_JAVA_OPTS+=( "${!v}" )
done

if [ -n "$SPARK_EXTRA_CLASSPATH" ]; then
SPARK_CLASSPATH="$SPARK_CLASSPATH:$SPARK_EXTRA_CLASSPATH"
fi

if ! [ -z "${PYSPARK_PYTHON+x}" ]; then
export PYSPARK_PYTHON
fi
if ! [ -z "${PYSPARK_DRIVER_PYTHON+x}" ]; then
export PYSPARK_DRIVER_PYTHON
fi

# If HADOOP_HOME is set and SPARK_DIST_CLASSPATH is not set, set it here so Hadoop jars are available to the executor.
# It does not set SPARK_DIST_CLASSPATH if already set, to avoid overriding customizations of this value from elsewhere e.g. Docker/K8s.
if [ -n "${HADOOP_HOME}" ] && [ -z "${SPARK_DIST_CLASSPATH}" ]; then
export SPARK_DIST_CLASSPATH="$($HADOOP_HOME/bin/hadoop classpath)"
fi

if ! [ -z "${HADOOP_CONF_DIR+x}" ]; then
SPARK_CLASSPATH="$HADOOP_CONF_DIR:$SPARK_CLASSPATH";
fi

if ! [ -z "${SPARK_CONF_DIR+x}" ]; then
SPARK_CLASSPATH="$SPARK_CONF_DIR:$SPARK_CLASSPATH";
elif ! [ -z "${SPARK_HOME+x}" ]; then
SPARK_CLASSPATH="$SPARK_HOME/conf:$SPARK_CLASSPATH";
fi

# SPARK-43540: add current working directory into executor classpath
SPARK_CLASSPATH="$SPARK_CLASSPATH:$PWD"

# Switch to the spark user if no USER is specified (root by default); otherwise use USER directly
switch_spark_if_root() {
if [ $(id -u) -eq 0 ]; then
echo gosu spark
fi
}

case "$1" in
driver)
shift 1
CMD=(
"$SPARK_HOME/bin/spark-submit"
--conf "spark.driver.bindAddress=$SPARK_DRIVER_BIND_ADDRESS"
--conf "spark.executorEnv.SPARK_DRIVER_POD_IP=$SPARK_DRIVER_BIND_ADDRESS"
--deploy-mode client
"$@"
)
attempt_setup_fake_passwd_entry
# Execute the container CMD under tini for better hygiene
exec $(switch_spark_if_root) /usr/bin/tini -s -- "${CMD[@]}"
;;
executor)
shift 1
CMD=(
${JAVA_HOME}/bin/java
"${SPARK_EXECUTOR_JAVA_OPTS[@]}"
-Xms"$SPARK_EXECUTOR_MEMORY"
-Xmx"$SPARK_EXECUTOR_MEMORY"
-cp "$SPARK_CLASSPATH:$SPARK_DIST_CLASSPATH"
org.apache.spark.scheduler.cluster.k8s.KubernetesExecutorBackend
--driver-url "$SPARK_DRIVER_URL"
--executor-id "$SPARK_EXECUTOR_ID"
--cores "$SPARK_EXECUTOR_CORES"
--app-id "$SPARK_APPLICATION_ID"
--hostname "$SPARK_EXECUTOR_POD_IP"
--resourceProfileId "$SPARK_RESOURCE_PROFILE_ID"
--podName "$SPARK_EXECUTOR_POD_NAME"
)
attempt_setup_fake_passwd_entry
# Execute the container CMD under tini for better hygiene
exec $(switch_spark_if_root) /usr/bin/tini -s -- "${CMD[@]}"
;;

*)
# BEGIN SAAGIE SPECIFIC CODE
cd /sandbox
mkdir -p /opt/spark/conf/
cat conf/*.conf > /opt/spark/conf/spark-defaults.conf
if test -f main_script;
then
CMD=(/bin/sh ./main_script)
exec "${CMD[@]}"
else
# END SAAGIE SPECIFIC CODE
# Non-spark-on-k8s command provided, proceeding in pass-through mode...
exec "$@"
fi;
;;
esac
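
For context, when Spark on Kubernetes schedules a driver pod from this image, the container arguments land in the driver) branch above along roughly these lines (a hypothetical invocation; the exact flags come from the platform, not from this commit):

    /opt/entrypoint.sh driver \
      --properties-file /opt/spark/conf/spark.properties \
      --class Main local:///opt/spark/work-dir/my-job.jar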
@@ -0,0 +1,32 @@
/*
* SPDX-License-Identifier: Apache-2.0
*
* Copyright 2019-2021.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* https://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import com.bmuschko.gradle.docker.DockerRemoteApiPlugin
import com.saagie.technologies.SaagieTechnologiesGradlePlugin
import com.saagie.technologies.readDockerInfo
import com.saagie.technologies.getVersionForDocker


apply<DockerRemoteApiPlugin>()
apply<SaagieTechnologiesGradlePlugin>()

tasks.withType(com.bmuschko.gradle.docker.tasks.image.DockerBuildImage::class) {
this.buildArgs.put(
"jre_major",
"11"
)
}
Empty file.
@@ -0,0 +1,4 @@
image: saagie/spark
baseTag: 3.5-jre-11
dynamicVersion: 1.125.0
version: 3.5-jre-11-1.125.0
@@ -0,0 +1,62 @@
schemaVersion: "2.0.0"

metadataTest:
env:
- key: LANG
value: "C.UTF-8"
- key: JAVA_HOME
value: "/opt/java/openjdk"
- key: SPARK_HOME
value: "/opt/spark"

fileExistenceTests:
- name: "entrypoint.sh"
path: "/opt/entrypoint.sh"
shouldExist: true
permissions: "-rwxr-xr-x"

- name: "kinit"
path: "/usr/bin/kinit"
shouldExist: true
permissions: "-rwxr-xr-x"

commandTests:
- name: "Workdir"
command: "pwd"
expectedOutput: ["/opt/spark/work-dir"]

- name: "Spark version"
command: "/opt/spark/bin/spark-submit"
args: ["--version"]
expectedError: ["version 3.5.*"]

- name: "krb5-user installation"
command: "kinit"
expectedError: ["kinit: Client's credentials have been revoked while getting initial credentials"]
exitCode: 1

- name: "wget"
args: ["--help"]
command: "wget"
exitCode: 0

- name: "curl"
args: ["--help"]
command: "curl"
exitCode: 0

- name: "unzip"
args: ["--help"]
command: "unzip"
exitCode: 0

- name: "tar"
args: ["--help"]
command: "tar"
exitCode: 0

- name: "tini"
command: "/usr/bin/tini"
args: ["--version"]
expectedOutput: ["tini version 0.18.0.*"]
exitCode: 0
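
This file follows the container-structure-test schema (schemaVersion 2.0.0). Assuming the tag built from dockerInfo.yaml above, the suite could be run locally with:

    container-structure-test test \
      --image saagie/spark:3.5-jre-11-1.125.0 \
      --config image_test.yaml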
@@ -0,0 +1,5 @@
id: "11"
label: "11"
available: true
trustLevel: stable
recommended: true
@@ -0,0 +1,32 @@
/*
* SPDX-License-Identifier: Apache-2.0
*
* Copyright 2019-2021.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* https://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import com.bmuschko.gradle.docker.DockerRemoteApiPlugin
import com.saagie.technologies.SaagieTechnologiesGradlePlugin
import com.saagie.technologies.readDockerInfo
import com.saagie.technologies.getVersionForDocker


apply<DockerRemoteApiPlugin>()
apply<SaagieTechnologiesGradlePlugin>()

tasks.withType(com.bmuschko.gradle.docker.tasks.image.DockerBuildImage::class) {
this.buildArgs.put(
"jre_major",
"17"
)
}
Empty file.
@@ -0,0 +1,4 @@
image: saagie/spark
baseTag: 3.5-jre-17
dynamicVersion: 1.125.0
version: 3.5-jre-17-1.125.0
@@ -0,0 +1,62 @@
schemaVersion: "2.0.0"

metadataTest:
env:
- key: LANG
value: "C.UTF-8"
- key: JAVA_HOME
value: "/opt/java/openjdk"
- key: SPARK_HOME
value: "/opt/spark"

fileExistenceTests:
- name: "entrypoint.sh"
path: "/opt/entrypoint.sh"
shouldExist: true
permissions: "-rwxr-xr-x"

- name: "kinit"
path: "/usr/bin/kinit"
shouldExist: true
permissions: "-rwxr-xr-x"

commandTests:
- name: "Workdir"
command: "pwd"
expectedOutput: ["/opt/spark/work-dir"]

- name: "Spark version"
command: "/opt/spark/bin/spark-submit"
args: ["--version"]
expectedError: ["version 3.5.*"]

- name: "krb5-user installation"
command: "kinit"
expectedError: ["kinit: Client's credentials have been revoked while getting initial credentials"]
exitCode: 1

- name: "wget"
args: ["--help"]
command: "wget"
exitCode: 0

- name: "curl"
args: ["--help"]
command: "curl"
exitCode: 0

- name: "unzip"
args: ["--help"]
command: "unzip"
exitCode: 0

- name: "tar"
args: ["--help"]
command: "tar"
exitCode: 0

- name: "tini"
command: "/usr/bin/tini"
args: ["--version"]
expectedOutput: ["tini version 0.19.0.*"]
exitCode: 0
@@ -0,0 +1,5 @@
id: "17"
label: "17"
available: true
trustLevel: stable
recommended: true
40 changes: 40 additions & 0 deletions technologies/job/spark/spark-3.5/innerContexts/python/Dockerfile
@@ -0,0 +1,40 @@
ARG base_img

FROM spark:3.5.2 AS SPARK_BASE

FROM ${base_img} AS BASE_IMG

COPY --from=SPARK_BASE /opt/spark /opt/spark
COPY --from=SPARK_BASE /usr/bin/tini /usr/bin/tini

COPY --from=SPARK_BASE /opt/java/openjdk /opt/java/openjdk

ENV JAVA_HOME /opt/java/openjdk
ENV LANG C.UTF-8
ENV SPARK_HOME /opt/spark

# See https://github.com/apache/spark-docker/blob/master/Dockerfile.template#L19
ARG spark_uid=185

RUN groupadd --system --gid=${spark_uid} spark && \
useradd --system --uid=${spark_uid} --gid=spark spark

RUN apt update -qq && apt install -yqq --no-install-recommends \
gosu && \
rm -rf /var/lib/apt/lists/*

RUN pip --no-cache-dir install --upgrade pip \
&& pip --no-cache-dir install pyspark==3.5.2 \
&& rm -rf /root/.cache \
&& rm -rf /boot/.cache/pip \
&& rm -rf ~/.cache/pip

# As long as base image is from saagie, no need to add krb5 or LD_LIBRARY_PATH

# Move scripts and frequently changing directives to the end of the build
COPY entrypoint.sh /opt/
RUN chmod 755 /opt/entrypoint.sh

WORKDIR /opt/spark/work-dir

ENTRYPOINT [ "/opt/entrypoint.sh" ]
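
A quick sanity check that the copied Spark distribution and the pip-installed PySpark agree on 3.5.2 (the image tag is assumed from this context's dockerInfo.yaml):

    docker run --rm --entrypoint python saagie/spark:3.5-py-3.12-1.139.0 \
      -c "import pyspark; print(pyspark.__version__)"   # expects 3.5.2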
22 changes: 22 additions & 0 deletions technologies/job/spark/spark-3.5/innerContexts/python/context.yaml
@@ -0,0 +1,22 @@
id: python
label: Python
available: true
trustLevel: stable
job:
features:
- type: COMMAND_LINE
label: Command line
mandatory: true
comment: Linux shell command to launch the job.
defaultValue: "spark-submit \\\n--conf spark.executor.memory=1G \\\n--conf spark.executor.cores=1 \\\n--conf spark.kubernetes.executor.limit.cores=1 \\\n--conf spark.executor.instances=2 \\\n--py-files={file} local://__main__.py"
- type: ARTIFACT
label: Package
mandatory: true
comment: "Compatible upload file : .py or .zip"
- type: SCHEDULER
label: Scheduled
mandatory: true
- type: AI_DESCRIPTION_GENERATOR
label: AI description generator enabled
mandatory: true
comment: Activation of the AI-based automatic description generation function.
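
As with the Java/Scala context, the escaped defaultValue renders as:

    spark-submit \
    --conf spark.executor.memory=1G \
    --conf spark.executor.cores=1 \
    --conf spark.kubernetes.executor.limit.cores=1 \
    --conf spark.executor.instances=2 \
    --py-files={file} local://__main__.py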
182 changes: 182 additions & 0 deletions technologies/job/spark/spark-3.5/innerContexts/python/entrypoint.sh
@@ -0,0 +1,182 @@
#!/bin/bash
# FROM https://github.com/apache/spark-docker/blob/master/entrypoint.sh.template
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Prevent any errors from being silently ignored
set -eo pipefail

attempt_setup_fake_passwd_entry() {
# Check whether there is a passwd entry for the container UID
local myuid; myuid="$(id -u)"
# If there is no passwd entry for the container UID, attempt to fake one
# You can also refer to https://github.com/docker-library/official-images/pull/13089#issuecomment-1534706523
# This resolves the OpenShift random-UID case.
# See also: https://github.com/docker-library/postgres/pull/448
if ! getent passwd "$myuid" &> /dev/null; then
local wrapper
for wrapper in {/usr,}/lib{/*,}/libnss_wrapper.so; do
if [ -s "$wrapper" ]; then
NSS_WRAPPER_PASSWD="$(mktemp)"
NSS_WRAPPER_GROUP="$(mktemp)"
export LD_PRELOAD="$wrapper" NSS_WRAPPER_PASSWD NSS_WRAPPER_GROUP
local mygid; mygid="$(id -g)"
printf 'spark:x:%s:%s:%s:%s:/bin/false\n' "$myuid" "$mygid" "${SPARK_USER_NAME:-anonymous uid}" "$SPARK_HOME" > "$NSS_WRAPPER_PASSWD"
printf 'spark:x:%s:\n' "$mygid" > "$NSS_WRAPPER_GROUP"
break
fi
done
fi
}

if [ -z "$JAVA_HOME" ]; then
JAVA_HOME=$(java -XshowSettings:properties -version 2>&1 > /dev/null | grep 'java.home' | awk '{print $3}')
fi

# BEGIN SAAGIE SPECIFIC CODE
cd /sandbox
# Parse the main_script content; if --py-files is present, extract the file URL and inject it
if [ -f main_script ] && grep -q "\--py-files" main_script;
then
PYSPARK_FILES="`grep -Po '.*--py-files=\K[^ ]+' main_script`"
fi;

if [ -n "$PYSPARK_FILES" ]; then
PYTHONPATH="$PYTHONPATH:$PYSPARK_FILES"
# Copy and unzip the py-files
if [[ $PYSPARK_FILES == *[,]* ]];then
echo "PYSPARK_FILES contains comma"
pyfiles=$(echo $PYSPARK_FILES | tr "," "\n")

for file in $pyfiles
do
echo ">>> [$file]"
wget -nv $file
done
else
echo ">>> [$PYSPARK_FILES]"
wget -nv $PYSPARK_FILES
fi
# [ -f *.zip ] breaks when several archives match; use a glob check instead
if compgen -G "*.zip" > /dev/null
then
unzip -q -o '*.zip'
fi
if [ -f "requirements.txt" ]
then
pip install -r requirements.txt
fi
rm -Rf /opt/spark/work-dir
ln -s /sandbox/ /opt/spark/work-dir
fi
# END SAAGIE SPECIFIC CODE

SPARK_CLASSPATH="$SPARK_CLASSPATH:${SPARK_HOME}/jars/*"
for v in "${!SPARK_JAVA_OPT_@}"; do
SPARK_EXECUTOR_JAVA_OPTS+=( "${!v}" )
done

if [ -n "$SPARK_EXTRA_CLASSPATH" ]; then
SPARK_CLASSPATH="$SPARK_CLASSPATH:$SPARK_EXTRA_CLASSPATH"
fi

if ! [ -z "${PYSPARK_PYTHON+x}" ]; then
export PYSPARK_PYTHON
fi
if ! [ -z "${PYSPARK_DRIVER_PYTHON+x}" ]; then
export PYSPARK_DRIVER_PYTHON
fi

# If HADOOP_HOME is set and SPARK_DIST_CLASSPATH is not set, set it here so Hadoop jars are available to the executor.
# It does not set SPARK_DIST_CLASSPATH if already set, to avoid overriding customizations of this value from elsewhere e.g. Docker/K8s.
if [ -n "${HADOOP_HOME}" ] && [ -z "${SPARK_DIST_CLASSPATH}" ]; then
export SPARK_DIST_CLASSPATH="$($HADOOP_HOME/bin/hadoop classpath)"
fi

if ! [ -z "${HADOOP_CONF_DIR+x}" ]; then
SPARK_CLASSPATH="$HADOOP_CONF_DIR:$SPARK_CLASSPATH";
fi

if ! [ -z "${SPARK_CONF_DIR+x}" ]; then
SPARK_CLASSPATH="$SPARK_CONF_DIR:$SPARK_CLASSPATH";
elif ! [ -z "${SPARK_HOME+x}" ]; then
SPARK_CLASSPATH="$SPARK_HOME/conf:$SPARK_CLASSPATH";
fi

# SPARK-43540: add current working directory into executor classpath
SPARK_CLASSPATH="$SPARK_CLASSPATH:$PWD"

# Switch to the spark user if no USER is specified (root by default); otherwise use USER directly
# SAAGIE: disabled below because main_script is only readable by the root user.
switch_spark_if_root() {
# if [ $(id -u) -eq 0 ]; then
# echo gosu spark
# fi
echo ""
}

case "$1" in
driver)
shift 1
CMD=(
"$SPARK_HOME/bin/spark-submit"
--conf "spark.driver.bindAddress=$SPARK_DRIVER_BIND_ADDRESS"
--conf "spark.executorEnv.SPARK_DRIVER_POD_IP=$SPARK_DRIVER_BIND_ADDRESS"
--py-files=/sandbox/* # SAAGIE SPECIFIC CODE
--deploy-mode client
"$@"
)
attempt_setup_fake_passwd_entry
# Execute the container CMD under tini for better hygiene
exec $(switch_spark_if_root) /usr/bin/tini -s -- "${CMD[@]}"
;;
executor)
shift 1
CMD=(
${JAVA_HOME}/bin/java
"${SPARK_EXECUTOR_JAVA_OPTS[@]}"
-Xms"$SPARK_EXECUTOR_MEMORY"
-Xmx"$SPARK_EXECUTOR_MEMORY"
-cp "$SPARK_CLASSPATH:$SPARK_DIST_CLASSPATH"
org.apache.spark.scheduler.cluster.k8s.KubernetesExecutorBackend
--driver-url "$SPARK_DRIVER_URL"
--executor-id "$SPARK_EXECUTOR_ID"
--cores "$SPARK_EXECUTOR_CORES"
--app-id "$SPARK_APPLICATION_ID"
--hostname "$SPARK_EXECUTOR_POD_IP"
--resourceProfileId "$SPARK_RESOURCE_PROFILE_ID"
--podName "$SPARK_EXECUTOR_POD_NAME"
)
attempt_setup_fake_passwd_entry
# Execute the container CMD under tini for better hygiene
exec $(switch_spark_if_root) /usr/bin/tini -s -- "${CMD[@]}"
;;

*)
# BEGIN SAAGIE SPECIFIC CODE
mkdir -p /opt/spark/conf/
cat conf/*.conf > /opt/spark/conf/spark-defaults.conf
echo "spark.kubernetes.driver.pod.name $HOSTNAME" >> /opt/spark/conf/spark-defaults.conf
if test -f main_script;
then
CMD=(/bin/sh ./main_script)
exec "${CMD[@]}"
else
# END SAAGIE SPECIFIC CODE
# Non-spark-on-k8s command provided, proceeding in pass-through mode...
exec "$@"
fi;
;;
esac
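
The --py-files extraction near the top of this script matches a main_script of roughly the shape below (the URL is hypothetical); grep -Po '.*--py-files=\K[^ ]+' captures everything after the equals sign up to the next space:

    spark-submit \
    --conf spark.executor.instances=2 \
    --py-files=https://minio.example.com/sandbox/job.zip local://__main__.py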
@@ -0,0 +1,35 @@
/*
* SPDX-License-Identifier: Apache-2.0
*
* Copyright 2019-2021.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* https://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import com.bmuschko.gradle.docker.DockerRemoteApiPlugin
import com.saagie.technologies.SaagieTechnologiesGradlePlugin
import com.saagie.technologies.readDockerInfo
import com.saagie.technologies.getVersionForDocker


apply<DockerRemoteApiPlugin>()
apply<SaagieTechnologiesGradlePlugin>()

val dockerInfo = readDockerInfo(projectDir)

tasks.withType(com.bmuschko.gradle.docker.tasks.image.DockerBuildImage::class) {
this.buildArgs.put(
"base_img",
"saagie/python:3.12-1.183.0"
)
}

Empty file.
@@ -0,0 +1,4 @@
image: saagie/spark
baseTag: 3.5-py-3.12
dynamicVersion: 1.139.0_SDKTECHNO-207
version: 3.5-py-3.12-1.139.0
@@ -0,0 +1,67 @@
schemaVersion: "2.0.0"

metadataTest:
env:
- key: LANG
value: "C.UTF-8"
- key: JAVA_HOME
value: "/opt/java/openjdk"
- key: SPARK_HOME
value: "/opt/spark"

fileExistenceTests:
- name: "entrypoint.sh"
path: "/opt/entrypoint.sh"
shouldExist: true
permissions: "-rwxr-xr-x"

- name: "kinit"
path: "/usr/bin/kinit"
shouldExist: true
permissions: "-rwxr-xr-x"

commandTests:
- name: "Workdir"
command: "pwd"
expectedOutput: ["/opt/spark/work-dir"]

- name: "Spark version"
command: "/opt/spark/bin/spark-submit"
args: ["--version"]
expectedError: ["version 3.5.*"]

- name: "python installation"
command: "which"
args: ["python"]
expectedOutput: ["/usr/local/bin/python"]

- name: "krb5-user installation"
command: "kinit"
expectedError: ["kinit: Program lacks support for encryption type while getting initial credentials"]
exitCode: 1

- name: "wget"
args: ["--help"]
command: "wget"
exitCode: 0

- name: "curl"
args: ["--help"]
command: "curl"
exitCode: 0

- name: "unzip"
args: ["--help"]
command: "unzip"
exitCode: 0

- name: "tar"
args: ["--help"]
command: "tar"
exitCode: 0

- name: "tini"
command: "/usr/bin/tini"
args: ["--version"]
expectedOutput: ["tini version 0.18.0.*"]
exitCode: 0
@@ -0,0 +1,5 @@
id: "3.12"
label: "3.12"
available: true
trustLevel: stable
recommended: true
3 changes: 2 additions & 1 deletion technologies/job/spark/spark-aws-3.1/context.yaml
@@ -2,4 +2,5 @@ id: 3.1-aws
 label: 3.1 AWS
 available: true
 recommended: true
-trustLevel: stable
+trustLevel: deprecated
+deprecationDate: "2024-09-01T00:00:00Z"
