From 21f919690b1bcd312c4f529c814951d12be244d4 Mon Sep 17 00:00:00 2001
From: Landon LaSmith
Date: Wed, 2 Sep 2020 23:38:43 -0400
Subject: [PATCH] Build spark/hadoop from source and pip install the local package

---
Review notes (ignored by git-am):
* The clone, build, wheel install and source cleanup now run in a single RUN
  so the removed source tree does not persist in an earlier image layer.
* The README build example passes build args as NAME=value and selects the
  python-3.6 sub-directory of the git context with the "#:<dir>" fragment.
* Leading whitespace of continuation lines was reconstructed; if a hunk is
  rejected, retry with "git apply --ignore-whitespace".

 python-3.6/Dockerfile | 33 ++++++++++++++-------------------
 python-3.6/README.md  | 22 ++++++++++++++++++++++
 2 files changed, 36 insertions(+), 19 deletions(-)
 create mode 100644 python-3.6/README.md

diff --git a/python-3.6/Dockerfile b/python-3.6/Dockerfile
index bfba325..77b400a 100644
--- a/python-3.6/Dockerfile
+++ b/python-3.6/Dockerfile
@@ -1,31 +1,26 @@
 FROM quay.io/thoth-station/s2i-thoth-ubi8-py36:v0.15.0
 
-ARG PKG_ROOT=/opt/app-root
 ARG SPARK_VERSION=2.4.5
 ARG HADOOP_VERSION=2.8.5
 ARG JAVA_VERSION=1.8.0
+ARG SPARK_SOURCE_REPO=https://github.com/apache/spark.git
+ARG SPARK_SOURCE_REPO_BRANCH=v${SPARK_VERSION}
+ARG SPARK_SOURCE_REPO_TARGET_DIR=spark
+ARG SPARK_BUILD_ARGS="-Phive -Phive-thriftserver -Pkubernetes -Dhadoop.version=${HADOOP_VERSION}"
 
 USER root
-# Download and extract the binaries for Spark and Hadoop
-RUN wget https://archive.apache.org/dist/spark/spark-${SPARK_VERSION}/spark-${SPARK_VERSION}-bin-without-hadoop.tgz &&\
-    tar -C ${PKG_ROOT} -zxf spark-${SPARK_VERSION}-bin-without-hadoop.tgz &&\
-    mv ${PKG_ROOT}/spark-${SPARK_VERSION}-bin-without-hadoop ${PKG_ROOT}/spark-${SPARK_VERSION} &&\
-    rm spark-${SPARK_VERSION}-bin-without-hadoop.tgz
-RUN wget https://downloads.apache.org/hadoop/common/hadoop-${HADOOP_VERSION}/hadoop-${HADOOP_VERSION}.tar.gz &&\
-    tar -C ${PKG_ROOT} -xf hadoop-${HADOOP_VERSION}.tar.gz &&\
-    rm hadoop-${HADOOP_VERSION}.tar.gz
-
-# Install java to execute hadoop jars
+# Install openjdk so we can build the hadoop jars
 RUN yum -y install java-$JAVA_VERSION-openjdk maven &&\
     yum clean all
-
-# Setup required env vars for spark and hadoop
 ENV JAVA_HOME=/usr/lib/jvm/jre
-ENV SPARK_HOME=${PKG_ROOT}/spark-${SPARK_VERSION}
-
-# Add HADOOP_CONF_DIR to spark-env.sh based on output from running "hadoop classpath"
-RUN cp ${PKG_ROOT}/spark-${SPARK_VERSION}/conf/spark-env.sh.template ${PKG_ROOT}/spark-${SPARK_VERSION}/conf/spark-env.sh &&\
-    echo "HADOOP_CONF_DIR=$(${PKG_ROOT}/hadoop-${HADOOP_VERSION}/bin/hadoop classpath)" >> ${PKG_ROOT}/spark-${SPARK_VERSION}/conf/spark-env.sh
 
 USER 1001
-RUN fix-permissions ${PKG_ROOT}
+# Clone, build and pip install Spark (with the bundled hadoop jars) in ONE layer:
+# the source tree is deleted at the end, which only saves space within the same RUN
+RUN git clone ${SPARK_SOURCE_REPO} -b ${SPARK_SOURCE_REPO_BRANCH} ${SPARK_SOURCE_REPO_TARGET_DIR} &&\
+    cd ${SPARK_SOURCE_REPO_TARGET_DIR} &&\
+    dev/make-distribution.sh --name spark-${SPARK_VERSION}-hadoop-${HADOOP_VERSION} ${SPARK_BUILD_ARGS} &&\
+    cd python && python setup.py bdist_wheel && pip install --no-cache-dir dist/*.whl &&\
+    cd ${HOME} && rm -rf ${SPARK_SOURCE_REPO_TARGET_DIR}
+
+RUN fix-permissions /opt/app-root
diff --git a/python-3.6/README.md b/python-3.6/README.md
new file mode 100644
index 0000000..f299e2e
--- /dev/null
+++ b/python-3.6/README.md
@@ -0,0 +1,22 @@
+# s2i-spark-container
+
+A base image for s2i builds based on Python 3.6 with Java and Spark & Hadoop binaries
+
+## ARGUMENTS
+* `SPARK_VERSION` - Version of spark to build from source
+* `HADOOP_VERSION` - Version of hadoop to bundle with this spark version
+* `JAVA_VERSION` - Version of openjdk to install in this image.
+* `SPARK_SOURCE_REPO` - Git repo to clone for the spark source
+* `SPARK_SOURCE_REPO_BRANCH` - Git branch to use for the spark build. Defaults to the tagging format used in the Apache Spark repo "v${SPARK_VERSION}"
+* `SPARK_SOURCE_REPO_TARGET_DIR` - Directory name to use for the destination clone of the spark repo.
+* `SPARK_BUILD_ARGS` - Build arguments that will be passed to the spark `dev/make-distribution.sh` script. See [Building a Runnable Distribution](https://spark.apache.org/docs/latest/building-spark.html#building-a-runnable-distribution) for the spark version being built.
+
+```
+SPARK_VERSION=2.4.5
+HADOOP_VERSION=2.8.5
+JAVA_VERSION=1.8.0
+SPARK_IMAGE_TAG=spark-${SPARK_VERSION}_hadoop-${HADOOP_VERSION}
+
+# Build the container image that is used as the base image for the jupyter s2i spark notebook
+docker build --build-arg SPARK_VERSION=${SPARK_VERSION} --build-arg HADOOP_VERSION=${HADOOP_VERSION} --build-arg JAVA_VERSION=${JAVA_VERSION} -t s2i-spark-container:${SPARK_IMAGE_TAG} https://github.com/lavlas/s2i-spark-container.git#:python-3.6
+```
\ No newline at end of file