Build spark/hadoop from source and pip install the local package

opendatahub-io-contrib · Sep 4, 2020 · e704e4e · e704e4e
1 parent 8b15be8
commit e704e4e
Showing 1 changed file with 14 additions and 19 deletions.
diff --git a/python-3.6/Dockerfile b/python-3.6/Dockerfile
@@ -1,31 +1,26 @@
 FROM quay.io/thoth-station/s2i-thoth-ubi8-py36:v0.15.0
 
-ARG PKG_ROOT=/opt/app-root
 ARG SPARK_VERSION=2.4.5
 ARG HADOOP_VERSION=2.8.5
 ARG JAVA_VERSION=1.8.0
+ARG SPARK_SOURCE_REPO=https://github.com/apache/spark.git
+ARG SPARK_SOURCE_REPO_BRANCH=v${SPARK_VERSION}
+ARG SPARK_SOURCE_REPO_TARGET_DIR=spark
+ARG SPARK_BUILD_ARGS="-Phive -Phive-thriftserver -Pkubernetes -Dhadoop.version=${HADOOP_VERSION}"
 
 USER root
-# Download and extract the binaries for Spark and Hadoop
-RUN wget https://archive.apache.org/dist/spark/spark-${SPARK_VERSION}/spark-${SPARK_VERSION}-bin-without-hadoop.tgz &&\
-    tar -C ${PKG_ROOT} -zxf spark-${SPARK_VERSION}-bin-without-hadoop.tgz &&\
-    mv ${PKG_ROOT}/spark-${SPARK_VERSION}-bin-without-hadoop ${PKG_ROOT}/spark-${SPARK_VERSION} &&\
-    rm spark-${SPARK_VERSION}-bin-without-hadoop.tgz
-RUN wget https://downloads.apache.org/hadoop/common/hadoop-${HADOOP_VERSION}/hadoop-${HADOOP_VERSION}.tar.gz &&\
-    tar -C ${PKG_ROOT} -xf hadoop-${HADOOP_VERSION}.tar.gz &&\
-    rm hadoop-${HADOOP_VERSION}.tar.gz
-
-# Install java to execute hadoop jars
+# Install OpenJDK and Maven so we can build Spark and the Hadoop jars from source
 RUN yum -y install java-$JAVA_VERSION-openjdk maven &&\
     yum clean all
-
-# Setup required env vars for spark and hadoop
 ENV JAVA_HOME=/usr/lib/jvm/jre
-ENV SPARK_HOME=${PKG_ROOT}/spark-${SPARK_VERSION}
-
-# Add HADOOP_CONF_DIR to spark-env.sh based on output from running "hadoop classpath"
-RUN cp ${PKG_ROOT}/spark-${SPARK_VERSION}/conf/spark-env.sh.template ${PKG_ROOT}/spark-${SPARK_VERSION}/conf/spark-env.sh &&\
-    echo "HADOOP_CONF_DIR=$(${PKG_ROOT}/hadoop-${HADOOP_VERSION}/bin/hadoop classpath)" >> ${PKG_ROOT}/spark-${SPARK_VERSION}/conf/spark-env.sh
 
 USER 1001
-RUN fix-permissions ${PKG_ROOT}
+# Clone, build and pip-install Spark (with Hadoop) from source in ONE layer, so the
+# source tree deleted at the end is never committed to an intermediate image layer.
+RUN git clone ${SPARK_SOURCE_REPO} -b ${SPARK_SOURCE_REPO_BRANCH} ${SPARK_SOURCE_REPO_TARGET_DIR} &&\
+    (cd ${SPARK_SOURCE_REPO_TARGET_DIR} &&\
+     dev/make-distribution.sh --name spark-${SPARK_VERSION}-hadoop-${HADOOP_VERSION} ${SPARK_BUILD_ARGS} &&\
+     cd python && python setup.py bdist_wheel && pip install --no-cache-dir dist/*whl) &&\
+    rm -rf ${SPARK_SOURCE_REPO_TARGET_DIR}
+
+RUN fix-permissions /opt/app-root