Skip to content
This repository has been archived by the owner on Sep 10, 2022. It is now read-only.

Commit

Permalink
Build spark/hadoop from source and pip install the local package
Browse files Browse the repository at this point in the history
  • Loading branch information
LaVLaS committed Sep 4, 2020
1 parent 8b15be8 commit e704e4e
Showing 1 changed file with 14 additions and 19 deletions.
33 changes: 14 additions & 19 deletions python-3.6/Dockerfile
Original file line number Diff line number Diff line change
@@ -1,31 +1,26 @@
# Base: Thoth Station s2i UBI8 + Python 3.6 builder image (tag pinned — good for reproducibility).
FROM quay.io/thoth-station/s2i-thoth-ubi8-py36:v0.15.0

# Build-time configuration: install root, component versions, and where to fetch Spark sources.
ARG PKG_ROOT=/opt/app-root
ARG SPARK_VERSION=2.4.5
ARG HADOOP_VERSION=2.8.5
ARG JAVA_VERSION=1.8.0
ARG SPARK_SOURCE_REPO=https://github.com/apache/spark.git
# Defaults to the upstream release tag matching SPARK_VERSION (e.g. "v2.4.5").
ARG SPARK_SOURCE_REPO_BRANCH=v${SPARK_VERSION}
ARG SPARK_SOURCE_REPO_TARGET_DIR=spark
# Maven profiles/properties forwarded to Spark's dev/make-distribution.sh below.
ARG SPARK_BUILD_ARGS="-Phive -Phive-thriftserver -Pkubernetes -Dhadoop.version=${HADOOP_VERSION}"

# Root is required for the yum install below; the build drops to UID 1001 later.
USER root
# Download and extract the binaries for Spark and Hadoop
# NOTE(review): this is a flattened diff view; per the commit message
# ("Build spark/hadoop from source"), the two wget/tar steps below appear to
# be the DELETED side of the diff, replaced by the git-clone source build
# further down — confirm against the full post-commit file.
# NOTE(review): downloads are not checksum-verified (no sha512 check) — worth
# adding if these lines are still live.
RUN wget https://archive.apache.org/dist/spark/spark-${SPARK_VERSION}/spark-${SPARK_VERSION}-bin-without-hadoop.tgz &&\
tar -C ${PKG_ROOT} -zxf spark-${SPARK_VERSION}-bin-without-hadoop.tgz &&\
mv ${PKG_ROOT}/spark-${SPARK_VERSION}-bin-without-hadoop ${PKG_ROOT}/spark-${SPARK_VERSION} &&\
rm spark-${SPARK_VERSION}-bin-without-hadoop.tgz
RUN wget https://downloads.apache.org/hadoop/common/hadoop-${HADOOP_VERSION}/hadoop-${HADOOP_VERSION}.tar.gz &&\
tar -C ${PKG_ROOT} -xf hadoop-${HADOOP_VERSION}.tar.gz &&\
rm hadoop-${HADOOP_VERSION}.tar.gz

# Install java to execute hadoop jars
# Install openjdk so we can build the hadoop jars
# NOTE(review): the two comment lines above are the old/new sides of the same
# diff hunk; keep only the one matching the final file. Package versions are
# unpinned beyond JAVA_VERSION (hadolint DL3041) — acceptable here since UBI
# repos only carry one maven stream, but worth noting.
RUN yum -y install java-$JAVA_VERSION-openjdk maven &&\
yum clean all

# Setup required env vars for spark and hadoop
# JAVA_HOME points at the JRE symlink created by the openjdk package above.
ENV JAVA_HOME=/usr/lib/jvm/jre
ENV SPARK_HOME=${PKG_ROOT}/spark-${SPARK_VERSION}

# Add HADOOP_CONF_DIR to spark-env.sh based on output from running "hadoop classpath"
# NOTE(review): "hadoop classpath" emits a colon-separated jar classpath, not a
# configuration directory. For Spark "without hadoop" builds the documented
# variable is SPARK_DIST_CLASSPATH=$(hadoop classpath); assigning that output
# to HADOOP_CONF_DIR looks like a bug — verify against Spark's
# "Using Spark's Hadoop Free Build" docs. (This hunk may also be on the
# deleted side of the diff, since it depends on the binary download above.)
RUN cp ${PKG_ROOT}/spark-${SPARK_VERSION}/conf/spark-env.sh.template ${PKG_ROOT}/spark-${SPARK_VERSION}/conf/spark-env.sh &&\
echo "HADOOP_CONF_DIR=$(${PKG_ROOT}/hadoop-${HADOOP_VERSION}/bin/hadoop classpath)" >> ${PKG_ROOT}/spark-${SPARK_VERSION}/conf/spark-env.sh

# Drop root: remaining build steps (clone, maven build, pip install) run as the
# default s2i user. fix-permissions is a helper provided by the base image.
USER 1001
RUN fix-permissions ${PKG_ROOT}
# Build the Apache spark and hadoop binaries from source
# After the build is complete create and install the python wheel file using pip
# NOTE(review): a shallow clone (--depth 1 -b <tag>) would cut image build time;
# also "RUN cd … && …" is flagged by hadolint DL3003 (prefer WORKDIR), and the
# clone/build/install could share one layer so the source tree is not baked
# into an intermediate layer before its rm -rf.
RUN git clone ${SPARK_SOURCE_REPO} -b ${SPARK_SOURCE_REPO_BRANCH} ${SPARK_SOURCE_REPO_TARGET_DIR}
RUN cd ${SPARK_SOURCE_REPO_TARGET_DIR} &&\
dev/make-distribution.sh --name spark-${SPARK_VERSION}-hadoop-${HADOOP_VERSION} ${SPARK_BUILD_ARGS}
# Build the PySpark wheel from the cloned tree, install it, then remove the
# source checkout. NOTE(review): "pip install --no-cache-dir dist/*whl" would
# avoid leaving the pip cache in this layer (hadolint DL3042).
RUN cd ${SPARK_SOURCE_REPO_TARGET_DIR}/python && python setup.py bdist_wheel && pip install dist/*whl &&\
cd ${HOME} && rm -rf ${SPARK_SOURCE_REPO_TARGET_DIR}

# Re-run fix-permissions so everything written above is group-writable for
# arbitrary-UID OpenShift runtimes (/opt/app-root == PKG_ROOT default).
RUN fix-permissions /opt/app-root

0 comments on commit e704e4e

Please sign in to comment.