#!/usr/bin/env bash
echo "Usage : "
echo " To effectively configure Java 8 for Spark "
echo " Instead of running as "
echo " sh configure_pyspark.sh "
echo " or "
echo " ./configure_pyspark.sh "
echo " Run this shell script as : "
echo " source ~/.configure_pyspark.sh" to properly export variables
echo
echo ">>>>>>>>> Configuring PySpark with Python 3.7 ........."
echo "python3.7 resolved to: $(which python3.7)"
echo
export PYSPARK_PYTHON=python3.7
export PYSPARK_DRIVER_PYTHON=python3.7
echo "For PySpark configured $PYSPARK_PYTHON"
echo
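# Sanity check (a minimal sketch): PYSPARK_PYTHON must resolve to a real
# interpreter on this machine; warn early if python3.7 is not on PATH.
if ! command -v python3.7 >/dev/null 2>&1; then
    echo "WARNING: python3.7 not found on PATH; PySpark will not start correctly."
fi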
echo ">>>>>>>>> Configuring PySpark with Java 8 ........."
echo
echo "Current JAVA_HOME="$JAVA_HOME
curr_java_exec_path=$(readlink -f $(which java))
echo "Current Java="$curr_java_exec_path
echo
echo "Current PATH="$PATH
echo
# Extract the Java 8 base directory from the registered alternatives;
# take only the first match in case several java-8 alternatives exist
java_8_exec_path=$(update-alternatives --list java | grep java-8 | head -n 1)
JAVA_8_HOME=$(dirname "$(dirname "$java_8_exec_path")")
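# Guard (a minimal sketch): if no java-8 alternative is registered,
# java_8_exec_path is empty and a bogus JAVA_HOME would be exported.
# `return` works when the script is sourced; fall back to `exit` otherwise.
if [ -z "$java_8_exec_path" ]; then
    echo "ERROR: no Java 8 alternative found (e.g. install openjdk-8-jdk)"
    return 1 2>/dev/null || exit 1
fi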
# Set JAVA_HOME and PATH to point to Java 8
export JAVA_HOME=$JAVA_8_HOME
echo
echo "For PySpark configured JAVA_HOME=$JAVA_HOME"
echo
OPT_HOME=/home/lalitstar/opt
# Set HADOOP_HOME to point to the Hadoop installation under $OPT_HOME
export HADOOP_HOME="${OPT_HOME}/hadoop/hadoop-3.3.0"
echo
echo "Configured HADOOP_HOME=${HADOOP_HOME} for PySpark"
echo
# Set SPARK_HOME to point to the Spark installation under $OPT_HOME
export SPARK_HOME="${OPT_HOME}/spark/spark-3.0.0-bin-without-hadoop"
echo
echo "Configured SPARK_HOME=${SPARK_HOME} for PySpark"
echo
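# Optional existence checks (a sketch, assuming the layout above): warn early
# if either installation directory is missing, instead of failing later.
[ -d "$HADOOP_HOME" ] || echo "WARNING: HADOOP_HOME=$HADOOP_HOME does not exist"
[ -d "$SPARK_HOME" ] || echo "WARNING: SPARK_HOME=$SPARK_HOME does not exist"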
# Configure PATH
export PATH="$JAVA_HOME/bin:$HADOOP_HOME/bin:$SPARK_HOME/bin:$PATH"
echo
echo "For PySpark configured PATH=$PATH"
echo
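# Note (hedged addition): this Spark build is "without hadoop", so Spark needs
# Hadoop's jars on its classpath; Spark's docs suggest deriving it from the
# hadoop launcher, which is on PATH at this point.
export SPARK_DIST_CLASSPATH="$(hadoop classpath)"
echo "Configured SPARK_DIST_CLASSPATH from 'hadoop classpath'"
echo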
echo "hadoop version"
hadoop version
echo
echo "spark-submit --version"
spark-submit --version
echo
### Alternative (legacy): configure IPython as the PySpark driver
### (first ensure to run: sudo pip3 install ipython)
### export PYSPARK_SUBMIT_ARGS="pyspark-shell"
### export PYSPARK_DRIVER_PYTHON=ipython
### export PYSPARK_DRIVER_PYTHON_OPTS='notebook'
# Configure Jupyter Notebook as the PySpark driver (first ensure to run: sudo pip3 install jupyter)
export PYSPARK_DRIVER_PYTHON=jupyter
export PYSPARK_DRIVER_PYTHON_OPTS='notebook'
echo
echo "For PySpark configured Jupyter Notebook"
echo
echo "************************************"
echo "* To start PySpark, Type : pyspark *"
echo "************************************"
echo
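# With PYSPARK_DRIVER_PYTHON=jupyter, `pyspark` starts a notebook server
# instead of a REPL; the SparkSession is available as `spark` in each notebook.
# Hypothetical first cell to verify the setup:
#   spark.range(5).show()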
# for Spark 2.4.5 with Hadoop 3.2.1 (Mac OS)
# pyspark --packages com.amazonaws:aws-java-sdk-bundle:1.11.819,org.apache.hadoop:hadoop-aws:3.2.1
# for Spark 3.0.0 with Hadoop 3.3.0 (Ubuntu 20.04)
# pyspark --packages com.amazonaws:aws-java-sdk-bundle:1.11.819,org.apache.hadoop:hadoop-aws:3.3.0
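# Example (a sketch; bucket and path are hypothetical) of reading from S3 once
# the hadoop-aws package above is on the classpath, assuming credentials come
# from the default AWS provider chain:
#   df = spark.read.csv("s3a://your-bucket/path/data.csv", header=True)
#   df.show()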