[DO NOT MERGE] Kube scheduler #4

Open · wants to merge 10 commits into main
1 change: 1 addition & 0 deletions .gitignore
@@ -0,0 +1 @@
*.egg-info
60 changes: 60 additions & 0 deletions cluster_setup/aws/README.md
@@ -0,0 +1,60 @@

## Setup

```
conda create -n skyburst python=3.10
```
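
Activate the environment before installing anything into it. If you also want this repo's `skyburst` package (defined by the top-level `pyproject.toml`), an editable install should work; this is a sketch and assumes you run it from the repository root:

```
conda activate skyburst
# optional: install the skyburst package (run from the repository root)
pip install -e .
```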

### Installing or updating kubectl

https://docs.aws.amazon.com/eks/latest/userguide/install-kubectl.html

```
# Kubernetes 1.26
curl -O https://s3.us-west-2.amazonaws.com/amazon-eks/1.26.2/2023-03-17/bin/darwin/amd64/kubectl
chmod +x ./kubectl
mkdir -p $HOME/bin && mv ./kubectl $HOME/bin/kubectl && export PATH=$HOME/bin:$PATH
echo 'export PATH=$PATH:$HOME/bin' >> ~/.bash_profile
```

Test it:

```
kubectl version --short --client
```

### Installing or updating eksctl

https://github.com/weaveworks/eksctl/blob/main/README.md#installation

```
# NOTE: this also installs kubectl via Homebrew, which may conflict with the kubectl installed in the earlier step
brew tap weaveworks/tap
brew install weaveworks/tap/eksctl
```
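
Sanity-check the install:

```
eksctl version
```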

### Kubeflow PyTorch jobs

https://www.kubeflow.org/docs/components/training/pytorch/

https://googlecloudplatform.github.io/kubeflow-gke-docs/docs/pipelines/enable-gpu-and-tpu/


```
# create a cluster (alternatively, ./start_eks.sh in this directory creates one with a p3.16xlarge GPU nodegroup)
eksctl create cluster --name skyburst --region us-east-1

# check the cluster
kubectl get nodes -o wide

# install the Kubeflow PyTorch training operator
# https://github.com/kubeflow/training-operator#installation
kubectl apply -k "github.com/kubeflow/training-operator/manifests/overlays/standalone?ref=v1.5.0"

# run the job
kubectl create -f simple.yaml

# monitor the job
kubectl get -o yaml pytorchjobs pytorch-simple -n kubeflow
```
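
To follow the training output, tail the logs of the master replica's pod. The pod name below assumes the training operator's default `<job-name>-<replica>-<index>` naming; adjust if your pod is named differently:

```
# list PyTorchJobs and their status
kubectl get pytorchjobs -n kubeflow

# stream logs from the master replica (pod name assumes the default naming convention)
kubectl logs -n kubeflow pytorch-simple-master-0 -f

# clean up when done
kubectl delete pytorchjobs pytorch-simple -n kubeflow
```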

26 changes: 26 additions & 0 deletions cluster_setup/aws/simple.yaml
@@ -0,0 +1,26 @@
apiVersion: "kubeflow.org/v1"
kind: PyTorchJob
metadata:
  name: pytorch-simple
  namespace: kubeflow
spec:
  pytorchReplicaSpecs:
    Master:
      replicas: 1
      restartPolicy: OnFailure
      template:
        spec:
          containers:
            - name: pytorch
              image: docker.io/kubeflowkatib/pytorch-mnist:v1beta1-45c5727
              resources:
                limits:
                  cpu: "4"
                  nvidia.com/gpu: "1"
              imagePullPolicy: Always
              command:
                - "python3"
                - "/opt/pytorch-mnist/mnist.py"
                - "--epochs=1"
          nodeSelector:
            beta.kubernetes.io/instance-type: p3.16xlarge
7 changes: 7 additions & 0 deletions cluster_setup/aws/start_eks.sh
@@ -0,0 +1,7 @@
#!/bin/bash
eksctl create cluster \
    --name skyburst \
    --nodegroup-name skyburst-nodegroup \
    --node-type p3.16xlarge \
    --nodes 1 \
    --region us-east-1
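
# delete the cluster when finished
# eksctl delete cluster --name skyburst --region us-east-1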
119 changes: 119 additions & 0 deletions cluster_setup/gcp/README.md
@@ -0,0 +1,119 @@

## Setup

```
conda create -n skyburst python=3.10
```

### Installing or updating kubectl

https://docs.aws.amazon.com/eks/latest/userguide/install-kubectl.html

```
# Kubernetes 1.26
curl -O https://s3.us-west-2.amazonaws.com/amazon-eks/1.26.2/2023-03-17/bin/darwin/amd64/kubectl
chmod +x ./kubectl
mkdir -p $HOME/bin && mv ./kubectl $HOME/bin/kubectl && export PATH=$HOME/bin:$PATH
echo 'export PATH=$PATH:$HOME/bin' >> ~/.bash_profile
```

Test it:

```
kubectl version --short --client
```

### Set up GCP, create the GKE cluster, and deploy Kubeflow

Creating a Kubernetes cluster with 8 V100 GPUs on GCP and deploying Kubeflow involves several steps; the guide below walks through them end to end.

1. Set up the GCP environment:
a. Sign up for a Google Cloud Platform (GCP) account if you don't have one, and create a new project.
b. Install the Google Cloud SDK (gcloud) on your local machine: https://cloud.google.com/sdk/docs/install
c. Authenticate with your GCP account using the following command:
```
gcloud auth login
```
d. Set the default GCP project and region:
```
gcloud config set project <your-project-id>
gcloud config set compute/region <your-region>
```
e. Enable the required APIs:
```
gcloud services enable container.googleapis.com
gcloud services enable compute.googleapis.com
gcloud services enable iam.googleapis.com
gcloud services enable iap.googleapis.com
```

2. Create a Kubernetes cluster with GPU nodes:
a. Enable guest attributes in your GCP project's VM metadata:
```
gcloud compute project-info add-metadata --metadata enable-guest-attributes=TRUE
```
b. Create the cluster and a GPU node pool with 8 V100 GPUs per node:
```
./start_gke.sh
```
c. If you see `CRITICAL: ACTION REQUIRED: gke-gcloud-auth-plugin, which is needed for continued use of kubectl, was not found or is not executable.`, install the plugin by following the instructions here: https://cloud.google.com/blog/products/containers-kubernetes/kubectl-auth-changes-in-gke

d. Install the NVIDIA GPU drivers on the GPU nodes by applying GKE's driver-installer DaemonSet (a quick check that the GPUs are schedulable is shown after this list):
```
kubectl apply -f https://raw.githubusercontent.com/GoogleCloudPlatform/container-engine-accelerators/master/nvidia-driver-installer/cos/daemonset-preloaded.yaml
```

3. Install and set up kubectl:
a. Install kubectl on your local machine: https://kubernetes.io/docs/tasks/tools/install-kubectl/
b. Configure kubectl to use your GCP Kubernetes cluster:
```
gcloud container clusters get-credentials skyburst-gpu --zone us-central1-c
```

4. Deploy Kubeflow to the cluster:
a. Download and install the Kubeflow CLI (kfctl) from the GitHub releases page: https://github.com/kubeflow/kfctl/releases
b. Set up the environment variables:
```
export KF_NAME=<kubeflow-deployment-name>
export BASE_DIR=<path-to-kubeflow-storage>
export KF_DIR=${BASE_DIR}/${KF_NAME}
```
c. Point `CONFIG_URI` at the Kubeflow configuration file (kfctl fetches it in the next step):
```
export CONFIG_URI="https://raw.githubusercontent.com/kubeflow/manifests/master/kfdef/kfctl_gcp_iap.v1.3.0.yaml"
```
d. Create and apply the Kubeflow configuration:
```
mkdir -p ${KF_DIR}
cd ${KF_DIR}
kfctl apply -V -f ${CONFIG_URI}
```
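
Once the driver DaemonSet from step 2d has finished, the V100s should show up as allocatable `nvidia.com/gpu` resources on the GPU nodes. A quick check (a sketch; grepping the describe output is just one way to see it):

```
# confirm the GPU nodes advertise nvidia.com/gpu capacity/allocatable
kubectl describe nodes | grep -i "nvidia.com/gpu"
```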


### Kubeflow PyTorch jobs

https://www.kubeflow.org/docs/components/training/pytorch/

https://googlecloudplatform.github.io/kubeflow-gke-docs/docs/pipelines/enable-gpu-and-tpu/


```
# check the cluster
kubectl get nodes -o wide

# install the Kubeflow PyTorch training operator
# https://github.com/kubeflow/training-operator#installation
kubectl apply -k "github.com/kubeflow/training-operator/manifests/overlays/standalone?ref=v1.5.0"

# run the Kubeflow PyTorchJob defined in simple-kuberflow.yaml
kubectl create -f simple-kuberflow.yaml

# (alternative) run the plain Kubernetes Job defined in simple.yaml
kubectl create -f simple.yaml

# to delete the PyTorchJob (if required)
kubectl delete pytorchjobs pytorch-simple -n kubeflow

# monitor the PyTorchJob
kubectl get -o yaml pytorchjobs pytorch-simple -n kubeflow
```
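
For the plain Kubernetes Job variant (`simple.yaml`, created in the default namespace), the `get_logs.sh` helper in this directory prints the Job status, its pods, and the logs of the most recent pod:

```
./get_logs.sh pytorch-simple
```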

8 changes: 8 additions & 0 deletions cluster_setup/gcp/get_logs.sh
@@ -0,0 +1,8 @@
#!/bin/bash
# Usage: ./get_logs.sh <job-name>
job_name=$1
kubectl get -o yaml jobs "$job_name"
echo "======================"
kubectl get pods --selector=job-name="$job_name"
echo "======================"
# kubectl logs $(kubectl get pods --selector=job-name="$job_name" -o jsonpath='{.items[*].metadata.name}')
kubectl logs $(kubectl get pods --selector=job-name="$job_name" -o jsonpath='{.items[-1].metadata.name}')
28 changes: 28 additions & 0 deletions cluster_setup/gcp/inspect-resource.yaml
@@ -0,0 +1,28 @@
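# One-off Job that prints the Python version and nvidia-smi output on a GPU node.
# Usage (a sketch): kubectl create -f inspect-resource.yaml && kubectl logs job/inspect-resource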
apiVersion: batch/v1
kind: Job
metadata:
  name: inspect-resource
spec:
  backoffLimit: 0
  template:
    spec:
      containers:
        - name: pytorch
          # https://hub.docker.com/r/pytorch/pytorch/tags
          # https://cloud.google.com/deep-learning-containers/docs/choosing-container
          image: gcr.io/deeplearning-platform-release/pytorch-gpu.1-12
          resources:
            limits:
              cpu: "4"
              nvidia.com/gpu: "1"
            requests:
              cpu: "4"
          imagePullPolicy: Always
          command: ["/bin/bash", "-c"]
          args:
            - |
              python -V && nvidia-smi
      nodeSelector:
        cloud.google.com/gke-nodepool: gpu-pool
      restartPolicy: Never

28 changes: 28 additions & 0 deletions cluster_setup/gcp/simple-kuberflow.yaml
@@ -0,0 +1,28 @@
apiVersion: "kubeflow.org/v1"
kind: PyTorchJob
metadata:
  name: pytorch-simple
  namespace: kubeflow
spec:
  pytorchReplicaSpecs:
    Master:
      replicas: 1
      restartPolicy: OnFailure
      template:
        spec:
          containers:
            - name: pytorch
              image: docker.io/kubeflowkatib/pytorch-mnist:v1beta1-45c5727
              resources:
                limits:
                  cpu: "4"
                  nvidia.com/gpu: "1"
                requests:
                  cpu: "4"
              imagePullPolicy: Always
              command:
                - "python3"
                - "/opt/pytorch-mnist/mnist.py"
                - "--epochs=1"
          nodeSelector:
            cloud.google.com/gke-nodepool: gpu-pool
28 changes: 28 additions & 0 deletions cluster_setup/gcp/simple.yaml
@@ -0,0 +1,28 @@
apiVersion: batch/v1
kind: Job
metadata:
  name: pytorch-simple
spec:
  template:
    metadata:
      labels:
        app: pytorch-simple
    spec:
      containers:
        - name: pytorch
          image: docker.io/kubeflowkatib/pytorch-mnist:v1beta1-45c5727
          resources:
            limits:
              cpu: "4"
              nvidia.com/gpu: "1"
            requests:
              cpu: "4"
          imagePullPolicy: Always
          command:
            - "python3"
            - "/opt/pytorch-mnist/mnist.py"
            - "--epochs=1"
      nodeSelector:
        cloud.google.com/gke-nodepool: gpu-pool
      restartPolicy: Never

29 changes: 29 additions & 0 deletions cluster_setup/gcp/start_gke.sh
@@ -0,0 +1,29 @@
#!/bin/bash
CLUSTER_NAME=skyburst-gpu
ZONE=us-central1-c
POOL_NAME=gpu-pool
NUM_NODES=1

gcloud container clusters create $CLUSTER_NAME \
    --num-nodes $NUM_NODES \
    --accelerator "type=nvidia-tesla-v100,count=8" \
    --machine-type n1-standard-96 \
    --zone $ZONE \
    --cluster-version "latest" \
    --enable-autoupgrade \
    --enable-autorepair \
    --scopes cloud-platform \
    --network skypilot-vpc \
    --subnetwork skypilot-vpc

# create a node pool (a subset of nodes) in the cluster for scheduling
gcloud container node-pools create $POOL_NAME \
    --cluster $CLUSTER_NAME \
    --machine-type n1-standard-96 \
    --accelerator "type=nvidia-tesla-v100,count=8" \
    --num-nodes $NUM_NODES \
    --zone $ZONE
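
# fetch kubectl credentials for the new cluster
# gcloud container clusters get-credentials $CLUSTER_NAME --zone $ZONE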

# delete the cluster
# gcloud container clusters delete $CLUSTER_NAME --zone $ZONE

31 changes: 31 additions & 0 deletions pyproject.toml
@@ -0,0 +1,31 @@
[build-system]
requires = ["setuptools>=61.0"]
build-backend = "setuptools.build_meta"

[project]
name = "skyburst"
version = "0.0.1"
description = "SkyBurst is ....."
readme = "README.md"
requires-python = ">=3.8"
classifiers = [
"Programming Language :: Python :: 3",
"License :: OSI Approved :: Apache Software License",
]
dependencies = [
"cvxpy", "matplotlib", "seaborn", "jupyter",
"fastapi", "uvicorn", "kubernetes", "httpx", "click", "pydantic"
]

[project.optional-dependencies]
dev = ["black==23.3.0", "pylint==2.8.2"]

[project.urls]
"Homepage" = "https://github.com/michaelzhiluo/skyburst"
"Bug Tracker" = "https://github.com/michaelzhiluo/skyburst/issues"

[tool.setuptools.packages.find]
exclude = ["simulators", "docs", "scripts*", "tests*"]

[tool.wheel]
exclude = ["simulators", "docs", "scripts*", "tests*"]
4 changes: 0 additions & 4 deletions requirements.txt

This file was deleted.
