diff --git a/python-3.6/Dockerfile b/python-3.6/Dockerfile
index bfba325..77b400a 100644
--- a/python-3.6/Dockerfile
+++ b/python-3.6/Dockerfile
@@ -1,31 +1,26 @@
 FROM quay.io/thoth-station/s2i-thoth-ubi8-py36:v0.15.0
-ARG PKG_ROOT=/opt/app-root
 ARG SPARK_VERSION=2.4.5
 ARG HADOOP_VERSION=2.8.5
 ARG JAVA_VERSION=1.8.0
+ARG SPARK_SOURCE_REPO=https://github.com/apache/spark.git
+ARG SPARK_SOURCE_REPO_BRANCH=v${SPARK_VERSION}
+ARG SPARK_SOURCE_REPO_TARGET_DIR=spark
+ARG SPARK_BUILD_ARGS="-Phive -Phive-thriftserver -Pkubernetes -Dhadoop.version=${HADOOP_VERSION}"
 
 USER root
 
-# Download and extract the binaries for Spark and Hadoop
-RUN wget https://archive.apache.org/dist/spark/spark-${SPARK_VERSION}/spark-${SPARK_VERSION}-bin-without-hadoop.tgz &&\
-    tar -C ${PKG_ROOT} -zxf spark-${SPARK_VERSION}-bin-without-hadoop.tgz &&\
-    mv ${PKG_ROOT}/spark-${SPARK_VERSION}-bin-without-hadoop ${PKG_ROOT}/spark-${SPARK_VERSION} &&\
-    rm spark-${SPARK_VERSION}-bin-without-hadoop.tgz
-RUN wget https://downloads.apache.org/hadoop/common/hadoop-${HADOOP_VERSION}/hadoop-${HADOOP_VERSION}.tar.gz &&\
-    tar -C ${PKG_ROOT} -xf hadoop-${HADOOP_VERSION}.tar.gz &&\
-    rm hadoop-${HADOOP_VERSION}.tar.gz
-
-# Install java to execute hadoop jars
+# Install OpenJDK and Maven so we can build the Spark and Hadoop jars
 RUN yum -y install java-$JAVA_VERSION-openjdk maven &&\
     yum clean all
-
-# Setup required env vars for spark and hadoop
 ENV JAVA_HOME=/usr/lib/jvm/jre
-ENV SPARK_HOME=${PKG_ROOT}/spark-${SPARK_VERSION}
-
-# Add HADOOP_CONF_DIR to spark-env.sh based on output from running "hadoop classpath"
-RUN cp ${PKG_ROOT}/spark-${SPARK_VERSION}/conf/spark-env.sh.template ${PKG_ROOT}/spark-${SPARK_VERSION}/conf/spark-env.sh &&\
-    echo "HADOOP_CONF_DIR=$(${PKG_ROOT}/hadoop-${HADOOP_VERSION}/bin/hadoop classpath)" >> ${PKG_ROOT}/spark-${SPARK_VERSION}/conf/spark-env.sh
 
 USER 1001
-RUN fix-permissions ${PKG_ROOT}
+# Build the Apache Spark and Hadoop binaries from source.
+# After the build is complete, create and install the PySpark wheel file using pip.
+RUN git clone ${SPARK_SOURCE_REPO} -b ${SPARK_SOURCE_REPO_BRANCH} ${SPARK_SOURCE_REPO_TARGET_DIR}
+RUN cd ${SPARK_SOURCE_REPO_TARGET_DIR} &&\
+    dev/make-distribution.sh --name spark-${SPARK_VERSION}-hadoop-${HADOOP_VERSION} ${SPARK_BUILD_ARGS}
+RUN cd ${SPARK_SOURCE_REPO_TARGET_DIR}/python && python setup.py bdist_wheel && pip install dist/*.whl &&\
+    cd ${HOME} && rm -rf ${SPARK_SOURCE_REPO_TARGET_DIR}
+
+RUN fix-permissions /opt/app-root
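
The new ARGs make the Spark version, Hadoop version, and Maven profiles overridable at image-build time. A minimal sketch of how the image might be built and smoke-tested (the image tag spark-py36 is illustrative, not part of this patch):

    # Build the image, optionally overriding the default versions via --build-arg
    docker build \
        --build-arg SPARK_VERSION=2.4.5 \
        --build-arg HADOOP_VERSION=2.8.5 \
        -t spark-py36 python-3.6/

    # Verify that the pip-installed PySpark wheel from the last RUN is importable
    docker run --rm spark-py36 python -c "import pyspark; print(pyspark.__version__)"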