This repository has been archived by the owner on Sep 10, 2022. It is now read-only.

Build spark/hadoop from source and pip install the local package #7

Open · wants to merge 1 commit into master
33 changes: 14 additions & 19 deletions python-3.6/Dockerfile
@@ -1,31 +1,26 @@
FROM quay.io/thoth-station/s2i-thoth-ubi8-py36:v0.15.0

ARG PKG_ROOT=/opt/app-root
ARG SPARK_VERSION=2.4.5
ARG HADOOP_VERSION=2.8.5
ARG JAVA_VERSION=1.8.0
ARG SPARK_SOURCE_REPO=https://github.com/apache/spark.git
ARG SPARK_SOURCE_REPO_BRANCH=v${SPARK_VERSION}
ARG SPARK_SOURCE_REPO_TARGET_DIR=spark
ARG SPARK_BUILD_ARGS="-Phive -Phive-thriftserver -Pkubernetes -Dhadoop.version=${HADOOP_VERSION}"


Review comment: Why do you think `-Phive-thriftserver` is necessary? Will this image replace the images in Data Catalog?

Reply from the author (Contributor):

The Spark build args are taken from the examples in the Apache Spark docs. If there is a reason to remove `hive-thriftserver` that won't break the ODH data engineering workflow, then I have no objection to removing it as a default arg. Otherwise, it can be removed at build time by overriding `SPARK_BUILD_ARGS`.
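
For example (a sketch only, not part of this PR; the image tag and build context below are placeholders), the profile could be dropped at build time like this:

```
# Override the default SPARK_BUILD_ARGS to drop the hive-thriftserver profile.
# The hadoop.version passed here should still match the image's HADOOP_VERSION.
docker build \
  --build-arg SPARK_BUILD_ARGS="-Phive -Pkubernetes -Dhadoop.version=2.8.5" \
  -t s2i-spark-container:no-thriftserver \
  python-3.6/
```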


USER root
# Download and extract the binaries for Spark and Hadoop
RUN wget https://archive.apache.org/dist/spark/spark-${SPARK_VERSION}/spark-${SPARK_VERSION}-bin-without-hadoop.tgz &&\
tar -C ${PKG_ROOT} -zxf spark-${SPARK_VERSION}-bin-without-hadoop.tgz &&\
mv ${PKG_ROOT}/spark-${SPARK_VERSION}-bin-without-hadoop ${PKG_ROOT}/spark-${SPARK_VERSION} &&\
rm spark-${SPARK_VERSION}-bin-without-hadoop.tgz
RUN wget https://downloads.apache.org/hadoop/common/hadoop-${HADOOP_VERSION}/hadoop-${HADOOP_VERSION}.tar.gz &&\
tar -C ${PKG_ROOT} -xf hadoop-${HADOOP_VERSION}.tar.gz &&\
rm hadoop-${HADOOP_VERSION}.tar.gz

# Install java to execute hadoop jars
# Install openjdk so we can build the hadoop jars
RUN yum -y install java-$JAVA_VERSION-openjdk maven &&\
yum clean all

# Setup required env vars for spark and hadoop
ENV JAVA_HOME=/usr/lib/jvm/jre
ENV SPARK_HOME=${PKG_ROOT}/spark-${SPARK_VERSION}

# Add HADOOP_CONF_DIR to spark-env.sh based on output from running "hadoop classpath"
RUN cp ${PKG_ROOT}/spark-${SPARK_VERSION}/conf/spark-env.sh.template ${PKG_ROOT}/spark-${SPARK_VERSION}/conf/spark-env.sh &&\
echo "HADOOP_CONF_DIR=$(${PKG_ROOT}/hadoop-${HADOOP_VERSION}/bin/hadoop classpath)" >> ${PKG_ROOT}/spark-${SPARK_VERSION}/conf/spark-env.sh

USER 1001
RUN fix-permissions ${PKG_ROOT}
# Build the Apache spark and hadoop binaries from source
# After the build is complete create and install the python wheel file using pip
RUN git clone ${SPARK_SOURCE_REPO} -b ${SPARK_SOURCE_REPO_BRANCH} ${SPARK_SOURCE_REPO_TARGET_DIR}
RUN cd ${SPARK_SOURCE_REPO_TARGET_DIR} &&\
dev/make-distribution.sh --name spark-${SPARK_VERSION}-hadoop-${HADOOP_VERSION} ${SPARK_BUILD_ARGS}
RUN cd ${SPARK_SOURCE_REPO_TARGET_DIR}/python && python setup.py bdist_wheel && pip install dist/*whl &&\
cd ${HOME} && rm -rf ${SPARK_SOURCE_REPO_TARGET_DIR}

RUN fix-permissions /opt/app-root
22 changes: 22 additions & 0 deletions python-3.6/README.md
@@ -0,0 +1,22 @@
# s2i-spark-container

A base image for s2i builds, based on Python 3.6, with Java and the Spark & Hadoop binaries.

## ARGUMENTS
* `SPARK_VERSION` - Version of Spark to build from source.
* `HADOOP_VERSION` - Version of Hadoop to bundle with this Spark version.
* `JAVA_VERSION` - Version of OpenJDK to install in this image.
* `SPARK_SOURCE_REPO` - Git repo to clone for the Spark source.
* `SPARK_SOURCE_REPO_BRANCH` - Git branch (or tag) to use for the Spark build. Defaults to the tag format used in the Apache Spark repo, `v${SPARK_VERSION}`.
* `SPARK_SOURCE_REPO_TARGET_DIR` - Directory name to use as the destination for the cloned Spark repo.
* `SPARK_BUILD_ARGS` - Build arguments passed to Spark's `dev/make-distribution.sh`. See [Building a Runnable Distribution](https://spark.apache.org/docs/latest/building-spark.html#building-a-runnable-distribution) for the Spark version being built.

```
SPARK_VERSION=2.4.5
HADOOP_VERSION=2.8.5
JAVA_VERSION=1.8.0
SPARK_IMAGE_TAG=spark-${SPARK_VERSION}_hadoop-${HADOOP_VERSION}

# Build the container image that is used as the base image for the jupyter s2i spark notebook
docker build --build-arg SPARK_VERSION=${SPARK_VERSION} --build-arg HADOOP_VERSION=${HADOOP_VERSION} --build-arg JAVA_VERSION=${JAVA_VERSION} -t s2i-spark-container:${SPARK_IMAGE_TAG} https://github.com/lavlas/s2i-spark-container.git#:python-3.6
```
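
A quick smoke test of the resulting image (a sketch; it assumes the tag produced by the build command above and that `pyspark` was pip-installed into the image's Python environment):

```
# Import the freshly built pyspark package and print its version
docker run --rm s2i-spark-container:spark-2.4.5_hadoop-2.8.5 \
  python -c "import pyspark; print(pyspark.__version__)"
```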