From b0c50cc53a6047eff24f04a738829f6cd5f4d0e0 Mon Sep 17 00:00:00 2001
From: Jean Ruggiero
Date: Tue, 3 Dec 2024 10:46:07 -0800
Subject: [PATCH 1/4] Update Java and Hadoop versions and the Docker base image

---
 cluster/Dockerfile         |  8 ++++----
 cluster/docker-compose.yml | 18 +++++++++---------
 2 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/cluster/Dockerfile b/cluster/Dockerfile
index ff1c3bd..a471ed0 100644
--- a/cluster/Dockerfile
+++ b/cluster/Dockerfile
@@ -1,14 +1,14 @@
-FROM java:8-jre-alpine
+FROM eclipse-temurin:11-jdk
 
-RUN apk update
-RUN apk add ca-certificates wget bash procps coreutils
+RUN apt update
+RUN apt install -yy ca-certificates wget bash procps coreutils
 RUN update-ca-certificates
 
 RUN mkdir -p /opt
 WORKDIR /opt
 
 ARG HADOOP_VERSION
-RUN wget http://apache.mirrors.lucidnetworks.net/hadoop/common/hadoop-${HADOOP_VERSION}/hadoop-${HADOOP_VERSION}.tar.gz && \
+RUN wget https://dlcdn.apache.org/hadoop/common/hadoop-${HADOOP_VERSION}/hadoop-${HADOOP_VERSION}.tar.gz && \
     tar -xzvf hadoop-${HADOOP_VERSION}.tar.gz && \
     rm hadoop-${HADOOP_VERSION}.tar.gz && \
     mv hadoop-${HADOOP_VERSION} hadoop
diff --git a/cluster/docker-compose.yml b/cluster/docker-compose.yml
index 199afd2..50c03a6 100644
--- a/cluster/docker-compose.yml
+++ b/cluster/docker-compose.yml
@@ -5,10 +5,10 @@ services:
       context: .
       dockerfile: Dockerfile
       args:
-        HADOOP_VERSION: 2.9.2
-        SPARK_VERSION: 2.4.4
-        SPARK_VARIANT: without-hadoop-scala-2.12
-    command: sbin/start-master.sh
+        HADOOP_VERSION: 3.3.5
+        SPARK_VERSION: 3.5.1
+        SPARK_VARIANT: without-hadoop
+    command: /opt/spark/sbin/start-master.sh
     restart: on-failure
     hostname: master
     environment:
@@ -37,10 +37,10 @@ services:
       context: .
       dockerfile: Dockerfile
       args:
-        HADOOP_VERSION: 2.9.2
-        SPARK_VERSION: 2.4.4
-        SPARK_VARIANT: without-hadoop-scala-2.12
-    command: sbin/start-slave.sh spark://master:7077
+        HADOOP_VERSION: 3.3.5
+        SPARK_VERSION: 3.5.1
+        SPARK_VARIANT: without-hadoop
+    command: /opt/spark/sbin/start-slave.sh spark://master:7077
     restart: on-failure
     hostname: worker-1
     environment:
@@ -70,7 +70,7 @@ services:
       - ./data:/data
 
   repl:
-    image: java:8-jre-alpine
+    image: eclipse-temurin:11-jdk
     command: java -jar /sparkplug-repl.jar
     restart: on-failure
     hostname: repl

From c0a4e1da78ecc1e33379a5327f50638cd8abe2c5 Mon Sep 17 00:00:00 2001
From: Jean Ruggiero
Date: Tue, 3 Dec 2024 10:46:25 -0800
Subject: [PATCH 2/4] Update to the new docker compose command

---
 cluster/submit.sh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/cluster/submit.sh b/cluster/submit.sh
index 9128aa6..fd7d2dc 100755
--- a/cluster/submit.sh
+++ b/cluster/submit.sh
@@ -12,7 +12,7 @@ if [[ ! -f jars/$APP_JAR ]]; then
   exit 2
 fi
 
-docker-compose exec master \
-  bin/spark-submit \
+docker compose exec master \
+  /opt/spark/bin/spark-submit \
   --master spark://master:7077 \
   /mnt/jars/$APP_JAR

From 357c395cfdc3c72f2205d564de45c0b6efa0b90d Mon Sep 17 00:00:00 2001
From: Jean Ruggiero
Date: Tue, 3 Dec 2024 10:53:50 -0800
Subject: [PATCH 3/4] Remove check for executable Spark script

---
 cluster/entry.sh | 16 ++++------------
 1 file changed, 4 insertions(+), 12 deletions(-)

diff --git a/cluster/entry.sh b/cluster/entry.sh
index 3d50010..1170501 100755
--- a/cluster/entry.sh
+++ b/cluster/entry.sh
@@ -1,13 +1,5 @@
-#!/bin/sh
+#!/bin/bash
 
-COMMAND="$1"
-SPARK_SCRIPT="${SPARK_HOME}/${COMMAND}"
-shift || exit 2
-
-if [[ -x $SPARK_SCRIPT ]]; then
-  export SPARK_NO_DAEMONIZE=1
-  export SPARK_DIST_CLASSPATH=$(/opt/hadoop/bin/hadoop classpath)
-  exec "$SPARK_SCRIPT" "$@"
-else
-  exec "$COMMAND" "$@"
-fi
+export SPARK_NO_DAEMONIZE=1
+export SPARK_DIST_CLASSPATH=$(/opt/hadoop/bin/hadoop classpath)
+exec "$@"

From 4f78c080638c478ed981854c975f99d054f29306 Mon Sep 17 00:00:00 2001
From: Greg Look
Date: Tue, 3 Dec 2024 13:21:54 -0800
Subject: [PATCH 4/4] Get the Docker cluster working again!

---
 cluster/.gitignore                 |  2 +-
 cluster/Dockerfile                 | 11 +++---
 cluster/README.md                  | 54 ++++++++++++------------------
 cluster/code/.keep                 |  0
 cluster/docker-compose.yml         | 11 +++---
 cluster/{entry.sh => spark-env.sh} |  5 ++-
 cluster/submit.sh                  | 12 +++----
 sparkplug-repl/README.md           |  6 ++--
 sparkplug-repl/project.clj         |  2 +-
 9 files changed, 44 insertions(+), 59 deletions(-)
 create mode 100644 cluster/code/.keep
 rename cluster/{entry.sh => spark-env.sh} (72%)
 mode change 100755 => 100644

diff --git a/cluster/.gitignore b/cluster/.gitignore
index df21b51..5a02aa9 100644
--- a/cluster/.gitignore
+++ b/cluster/.gitignore
@@ -1,2 +1,2 @@
+/code
 /data
-/jars
diff --git a/cluster/Dockerfile b/cluster/Dockerfile
index a471ed0..d63e9ff 100644
--- a/cluster/Dockerfile
+++ b/cluster/Dockerfile
@@ -1,7 +1,7 @@
 FROM eclipse-temurin:11-jdk
 
 RUN apt update
-RUN apt install -yy ca-certificates wget bash procps coreutils
+RUN apt install -yy ca-certificates wget bash procps coreutils python3
 RUN update-ca-certificates
 
 RUN mkdir -p /opt
@@ -14,15 +14,14 @@ RUN wget https://dlcdn.apache.org/hadoop/common/hadoop-${HADOOP_VERSION}/hadoop-
     mv hadoop-${HADOOP_VERSION} hadoop
 
 ARG SPARK_VERSION
-ARG SPARK_VARIANT
+ARG SPARK_VARIANT=without-hadoop
 RUN wget https://archive.apache.org/dist/spark/spark-${SPARK_VERSION}/spark-${SPARK_VERSION}-bin-${SPARK_VARIANT}.tgz && \
     tar -xzvf spark-${SPARK_VERSION}-bin-${SPARK_VARIANT}.tgz && \
     rm spark-${SPARK_VERSION}-bin-${SPARK_VARIANT}.tgz && \
     mv spark-${SPARK_VERSION}-bin-${SPARK_VARIANT} spark
 
-ENV SPARK_HOME /opt/spark
+ENV HADOOP_HOME=/opt/hadoop
+ENV SPARK_HOME=/opt/spark
+ADD spark-env.sh /opt/spark/conf/spark-env.sh
 
 RUN mkdir -p /tmp/spark-events
-
-ADD entry.sh /opt
-ENTRYPOINT ["/opt/entry.sh"]
diff --git a/cluster/README.md b/cluster/README.md
index 622db2e..2f46896 100644
--- a/cluster/README.md
+++ b/cluster/README.md
@@ -10,58 +10,48 @@ issues that do not occur in local development contexts.
 
 Initialize the cluster, containing a master and one worker:
 
-```
-docker-compose -f docker-compose.yml up -d master worker-1
+```shell
+docker compose up -d
 ```
 
 You can submit an application with the submit script:
 
+```shell
+cp $PROJECT/target/uberjar/my-app.jar cluster/code/
+./submit.sh my-app.jar
 ```
-# Launch the containers
-$ docker-compose up -d
-
-# Copy uberjar to `jars` dir, your exact steps may vary
-$ lein uberjar
-$ cp $PROJECT/target/uberjar/my-app.jar docker/jars/
-$ ./submit.sh my-app.jar
-```
 
-You can also submit an application using the Spark master's REST API:
-
-```
-# Place a JSON request body in a file
-$ cat request.json
+You can also submit an application using the Spark master's REST API. First,
+create a JSON file with the request body:
+
+```json
 {
   "action": "CreateSubmissionRequest",
   "appArgs": ["file:///data/hamlet.txt"],
-  "appResource": "file:///mnt/jars/spark-word-count.jar",
-  "clientSparkVersion": "2.4.4",
+  "appResource": "file:///mnt/code/my-app.jar",
+  "clientSparkVersion": "3.5.1",
   "environmentVariables": {"SPARK_ENV_LOADED": "1"},
-  "mainClass": "spark_word_count.main",
+  "mainClass": "my_app.main",
   "sparkProperties": {
-    "spark.jars": "file:///mnt/jars/spark-word-count.jar",
-    "spark.executor.cores": 1,
-    "spark.executor.count": 1,
-    "spark.executor.memory": "1G",
+    "spark.app.name": "my-app",
+    "spark.submit.deployMode": "cluster",
+    "spark.jars": "file:///mnt/code/my-app.jar",
     "spark.driver.cores": 1,
     "spark.driver.memory": "1G",
     "spark.driver.supervise": "false",
-    "spark.app.name": "sparkplug",
-    "spark.submit.deployMode": "cluster",
+    "spark.executor.cores": 1,
+    "spark.executor.count": 1,
+    "spark.executor.memory": "1G",
     "spark.logConf": "true"
   }
 }
+```
 
-$ curl -X POST --data @request.json http://localhost:6066/v1/submissions/create
-{
-  "action" : "CreateSubmissionResponse",
-  "message" : "Driver successfully submitted as driver-20200324235704-0000",
-  "serverSparkVersion" : "2.4.4",
-  "submissionId" : "driver-20200324235704-0000",
-  "success" : true
-}
+Then submit it to the scheduling HTTP endpoint:
+
+```shell
+curl http://localhost:6066/v1/submissions/create --data @request.json
 ```
 
 ## Endpoints
diff --git a/cluster/code/.keep b/cluster/code/.keep
new file mode 100644
index 0000000..e69de29
diff --git a/cluster/docker-compose.yml b/cluster/docker-compose.yml
index 50c03a6..fda6e13 100644
--- a/cluster/docker-compose.yml
+++ b/cluster/docker-compose.yml
@@ -1,4 +1,3 @@
-version: "3"
 services:
   master:
     build:
@@ -7,7 +6,6 @@ services:
       args:
         HADOOP_VERSION: 3.3.5
         SPARK_VERSION: 3.5.1
-        SPARK_VARIANT: without-hadoop
     command: /opt/spark/sbin/start-master.sh
     restart: on-failure
     hostname: master
@@ -30,7 +28,7 @@ services:
       - 7077:7077
       - 8080:8080
     volumes:
-      - ./jars:/mnt/jars
+      - ./code:/mnt/code
 
   worker-1:
     build:
@@ -39,8 +37,7 @@ services:
       args:
         HADOOP_VERSION: 3.3.5
         SPARK_VERSION: 3.5.1
-        SPARK_VARIANT: without-hadoop
-    command: /opt/spark/sbin/start-slave.sh spark://master:7077
+    command: /opt/spark/sbin/start-worker.sh spark://master:7077
     restart: on-failure
     hostname: worker-1
     environment:
@@ -66,7 +63,7 @@ services:
       - 8081:8081
       - 8881:8881
     volumes:
-      - ./jars:/mnt/jars
+      - ./code:/mnt/code
       - ./data:/data
 
   repl:
@@ -81,7 +78,7 @@ services:
       - 4050:4040
       - 8765:8765
     volumes:
-      - ./jars/sparkplug-repl.jar:/sparkplug-repl.jar
+      - ./code/sparkplug-repl.jar:/sparkplug-repl.jar
       - ./data:/data
 
 networks:
diff --git a/cluster/entry.sh b/cluster/spark-env.sh
old mode 100755
new mode 100644
similarity index 72%
rename from cluster/entry.sh
rename to cluster/spark-env.sh
index 1170501..38a1784
--- a/cluster/entry.sh
+++ b/cluster/spark-env.sh
@@ -1,5 +1,4 @@
-#!/bin/bash
+# Spark environment customizations
 
-export SPARK_NO_DAEMONIZE=1
 export SPARK_DIST_CLASSPATH=$(/opt/hadoop/bin/hadoop classpath)
-exec "$@"
+export SPARK_NO_DAEMONIZE=1
diff --git a/cluster/submit.sh b/cluster/submit.sh
index fd7d2dc..09bf5f2 100755
--- a/cluster/submit.sh
+++ b/cluster/submit.sh
@@ -1,18 +1,18 @@
 #!/bin/bash
 
-APP_JAR="$1"
+APP_DRIVER="$1"
 
-if [[ -z $APP_JAR ]]; then
-  echo "No application jar file provided!" >&2
+if [[ -z $APP_DRIVER ]]; then
+  echo "No application driver code provided!" >&2
   exit 1
 fi
 
-if [[ ! -f jars/$APP_JAR ]]; then
-  echo "Couldn't find jars/$APP_JAR - did you copy it in place?" >&2
+if [[ ! -f code/$APP_DRIVER ]]; then
+  echo "Couldn't find code/$APP_DRIVER - did you copy it in place?" >&2
   exit 2
 fi
 
 docker compose exec master \
   /opt/spark/bin/spark-submit \
   --master spark://master:7077 \
-  /mnt/jars/$APP_JAR
+  /mnt/code/$APP_DRIVER
diff --git a/sparkplug-repl/README.md b/sparkplug-repl/README.md
index 3f56c92..0c7aba5 100644
--- a/sparkplug-repl/README.md
+++ b/sparkplug-repl/README.md
@@ -9,9 +9,9 @@ connected to a Spark cluster.
 
 First, build the REPL uberjar and copy it into the Docker cluster:
 
-```
-$ lein uberjar
-$ cp target/uberjar/sparkplug-repl.jar ../cluster/jars
+```shell
+lein uberjar
+cp target/uberjar/sparkplug-repl.jar ../cluster/code
 ```
 
 Next, start up the REPL container in another terminal:
diff --git a/sparkplug-repl/project.clj b/sparkplug-repl/project.clj
index 7e6e784..eccff4b 100644
--- a/sparkplug-repl/project.clj
+++ b/sparkplug-repl/project.clj
@@ -17,7 +17,7 @@
 
   :profiles
   {:default
-   [:base :system :user :provided :spark-3.1 :dev]
+   [:base :system :user :provided :spark-3.5 :dev]
 
   :repl
   {:repl-options
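
After this series, the end-to-end smoke test for the cluster boils down to the steps
below. This is a sketch assembled from the README examples in the patches above;
`my-app.jar` stands in for whatever uberjar your project builds:

```shell
cd cluster
docker compose up -d                          # build the images, start the master and worker containers
cp $PROJECT/target/uberjar/my-app.jar code/   # stage the driver jar in the shared ./code volume
./submit.sh my-app.jar                        # run spark-submit on the master against spark://master:7077
```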