Commit: Add flintrock readme
Jay Chia authored and Jay Chia committed Jan 10, 2025
1 parent f1cbdf1 commit 0960399

Showing 5 changed files with 2,758 additions and 1 deletion.
10 changes: 9 additions & 1 deletion pyproject.toml
@@ -18,7 +18,15 @@ classifiers = [
"Framework :: Daft",
"Programming Language :: Python :: 3",
]
dependencies = ["ray[default]==2.4.0", "pandas", "dask[distributed,dataframe]==2023.5.0", "ipython", "pyarrow==12.0.0", "s3fs"]
dependencies = [
"ray[default]==2.4.0",
"pandas",
"dask[distributed,dataframe]==2023.5.0",
"ipython",
"pyarrow==12.0.0",
"s3fs",
"flintrock>=2.1.0",
]
dynamic = ["version"]

[project.scripts]
114 changes: 114 additions & 0 deletions spark_aws_ec2/README.md
@@ -0,0 +1,114 @@
# Overview

This folder contains utilities to run Spark clusters on AWS EC2.

We use the [`flintrock` utility](https://github.com/nchammas/flintrock) to do so.

## Setup

### AWS Setup

Create a keypair in the EC2 console, download the `.pem` file, and save it to `~/.ssh/my-keypair.pem`.

Then, fix its permissions with:

```bash
chmod 600 ~/.ssh/my-keypair.pem
```
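
If you prefer the command line, the keypair can also be created with the AWS CLI. This is a sketch that assumes the AWS CLI is installed and configured for your target region; `my-keypair` is a placeholder name:

```bash
# Create the keypair and save the private key locally (placeholder name "my-keypair")
aws ec2 create-key-pair \
  --key-name my-keypair \
  --query 'KeyMaterial' \
  --output text > ~/.ssh/my-keypair.pem

# Restrict permissions so SSH will accept the key
chmod 600 ~/.ssh/my-keypair.pem
```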

### Local Machine

> NOTE: This requires `uv` to be installed.

Install `flintrock` as a uv tool:

```bash
uv tool install flintrock
```
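
To sanity-check the installation, print the CLI help:

```bash
uvx flintrock --help
```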

Now configure the tool.

```bash
uvx flintrock configure
```

Ensure that inside your configuration:

1. You configure the appropriate AWS Region to spin your cluster up in
2. You select the appropriate machine types
3. You select the appropriate AMI
4. You select the correct **name** of the keypair (in AWS), and also the corresponding path to the `.pem` file that was saved to your local machine
5. Ensure that Spark is installed

An example configuration is provided here in `example-flintrock-config.yaml`.
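
If you would rather keep the configuration in this folder than in the default `~/.config/flintrock/config.yaml`, `flintrock` also accepts a `--config` flag pointing at a specific file; a sketch, assuming you run commands from this folder:

```bash
uvx flintrock --config ./example-flintrock-config.yaml launch my-spark-cluster
```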

## Start a cluster

```bash
uvx flintrock launch my-spark-cluster
```
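
Most configuration values can also be overridden at launch time on the command line. For example, a sketch launching a larger cluster (the slave count here is a placeholder):

```bash
uvx flintrock launch my-spark-cluster --num-slaves 4
```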

When this is done, you can describe the cluster with:

```bash
uvx flintrock describe
```

The output looks as follows:

```
my-spark-cluster:
  state: running
  node-count: 2
  master: ec2-XX-XX-XX-XX.us-west-2.compute.amazonaws.com
  slaves:
    - ec2-XX-XX-XX-XX.us-west-2.compute.amazonaws.com
```

## Launch a job

Spark launches jobs using the `spark-submit` binary, which is packaged with every Spark release. However, one significant limitation of `spark-submit`
against a standalone cluster is that it cannot submit a **Python job to run inside the cluster**: the driver of a Python job always runs on the client (i.e. your laptop),
which leads to problems such as Python environment mismatches between client and cluster and high client-cluster communication latency.

Thus the preferred way to launch work on your newly created cluster is from inside the cluster itself, by SSH'ing into the master node.

1. Copy the necessary Python file(s) into the cluster

```bash
uvx flintrock run-command my-spark-cluster mkdir /home/ec2-user/workdir
uvx flintrock copy-file \
  --master-only \
  my-spark-cluster \
  ./app.py \
  /home/ec2-user/workdir/app.py
```

2. Log in to your master node and launch your work. We recommend launching inside `tmux` so that your jobs keep running even after you disconnect from the cluster.

```bash
uvx flintrock login my-spark-cluster
```

Now that you are in the cluster, launch a new tmux session:

```bash
sudo yum install tmux
tmux
```
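
If you get disconnected, or detach deliberately with `Ctrl-b d`, you can log back in with `uvx flintrock login my-spark-cluster` and reattach to your session:

```bash
tmux attach
```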

Now you can run your work using the cluster's `spark-submit` binary:

```bash
spark-submit \
  --master spark://$(hostname):7077 \
  sample-job.py
```
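
For longer-running jobs you may want to size the job explicitly. A sketch using standard `spark-submit` flags, with placeholder values:

```bash
spark-submit \
  --master spark://$(hostname):7077 \
  --name my-sample-job \
  --total-executor-cores 2 \
  --executor-memory 4g \
  sample-job.py
```

Note that any third-party Python packages your job imports must be present on every node; one option is `uvx flintrock run-command my-spark-cluster 'pip3 install <package>'`, assuming `pip3` is available on your AMI.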

## Tearing Down

Tear down the cluster using:

```bash
uvx flintrock destroy my-spark-cluster
```
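
If you only want to pause the cluster between work sessions rather than delete it, `flintrock` can also stop and later restart the instances (stopped instances still incur EBS storage charges):

```bash
uvx flintrock stop my-spark-cluster
uvx flintrock start my-spark-cluster
```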
64 changes: 64 additions & 0 deletions spark_aws_ec2/example-flintrock-config.yaml
@@ -0,0 +1,64 @@
services:
  spark:
    version: 3.5.4
    # git-commit: latest  # if not 'latest', provide a full commit SHA; e.g. d6dc12ef0146ae409834c78737c116050961f350
    # git-repository:  # optional; defaults to https://github.com/apache/spark
    # optional; defaults to download from a dynamically selected Apache mirror
    #   - can be http, https, or s3 URL
    #   - must contain a {v} template corresponding to the version
    #   - Spark must be pre-built
    #   - files must be named according to the release pattern shown here: https://dist.apache.org/repos/dist/release/spark/
    # download-source: "https://www.example.com/files/spark/{v}/"
    # download-source: "s3://some-bucket/spark/{v}/"
    # executor-instances: 1
  hdfs:
    version: 3.3.6
    # optional; defaults to download from a dynamically selected Apache mirror
    #   - can be http, https, or s3 URL
    #   - must contain a {v} template corresponding to the version
    #   - files must be named according to the release pattern shown here: https://dist.apache.org/repos/dist/release/hadoop/common/
    # download-source: "https://www.example.com/files/hadoop/{v}/"
    # download-source: "http://www-us.apache.org/dist/hadoop/common/hadoop-{v}/"
    # download-source: "s3://some-bucket/hadoop/{v}/"

provider: ec2

providers:
  ec2:
    key-name: jay-mbp
    identity-file: /Users/jaychia/.ssh/jay-mbp.pem
    instance-type: m5.large
    region: us-west-2
    # availability-zone: <name>
    ami: ami-0575ac0e31eace5d0  # Amazon Linux 2, us-west-2
    user: ec2-user
    # ami: ami-61bbf104  # CentOS 7, us-east-1
    # user: centos
    # spot-price: <price>
    # vpc-id: <id>
    # subnet-id: <id>
    # placement-group: <name>
    # security-groups:
    #   - group-name1
    #   - group-name2
    # instance-profile-name:
    # tags:
    #   - key1,value1
    #   - key2, value2  # leading/trailing spaces are trimmed
    #   - key3,  # value will be empty
    # min-root-ebs-size-gb: <size-gb>
    tenancy: default  # default | dedicated
    ebs-optimized: no  # yes | no
    instance-initiated-shutdown-behavior: terminate  # terminate | stop
    # user-data: /path/to/userdata/script
    # authorize-access-from:
    #   - 10.0.0.42/32
    #   - sg-xyz4654564xyz

launch:
  num-slaves: 1
  install-hdfs: True
  install-spark: True
  # java-version: 8

debug: false
16 changes: 16 additions & 0 deletions spark_aws_ec2/sample-job.py
@@ -0,0 +1,16 @@
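# A minimal PySpark job: builds a small in-memory DataFrame and prints it.
# Submit it from the master node with the cluster's spark-submit (see README.md in this folder).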
if __name__ == "__main__":
    from pyspark.sql import SparkSession

    spark = SparkSession.builder.appName("demo").getOrCreate()

    df = spark.createDataFrame(
        [
            ("sue", 32),
            ("li", 3),
            ("bob", 75),
            ("heo", 13),
        ],
        ["first_name", "age"],
    )

    df.show()