-
Notifications
You must be signed in to change notification settings - Fork 1
/
deploy.sh
executable file
·125 lines (99 loc) · 3.57 KB
/
deploy.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
#!/bin/bash
# Deploy an AKS cluster via Terraform, install the KubeRay operator with Helm,
# run the sample PyTorch MNIST RayJob, and expose the Ray dashboard.
#
# Fail fast: abort on errors, unset variables, and mid-pipeline failures.
set -euo pipefail

# Check if the user is logged into Azure CLI before doing anything else.
if ! az account show > /dev/null 2>&1; then
  # Diagnostics belong on stderr, not stdout.
  echo "Please login to Azure CLI using 'az login' before running this script." >&2
  exit 1
fi
# Create a Terraform plan; stop if planning fails (e.g. invalid config).
terraform plan -out main.tfplan || { echo "terraform plan failed" >&2; exit 1; }

# Apply the Terraform plan; stop if provisioning fails.
terraform apply main.tfplan || { echo "terraform apply failed" >&2; exit 1; }

# Retrieve the Terraform outputs and store in variables.
# Each capture is checked so a missing output does not silently yield "".
resource_group_name=$(terraform output -raw resource_group_name) || exit 1
system_node_pool_name=$(terraform output -raw system_node_pool_name) || exit 1
aks_cluster_name=$(terraform output -raw kubernetes_cluster_name) || exit 1
# Get AKS credentials for the cluster (merges the context into kubeconfig).
# Variables are quoted so names containing spaces/globs cannot split the args.
az aks get-credentials \
  --resource-group "$resource_group_name" \
  --name "$aks_cluster_name"

# Create the kuberay namespace.
# dry-run | apply makes this idempotent: a plain `create` fails on reruns
# when the namespace already exists.
kuberay_namespace="kuberay"
kubectl create namespace "$kuberay_namespace" --dry-run=client -o yaml | kubectl apply -f -
# Show which Kubernetes context kubectl is now pointing at, so the operator
# can confirm the right cluster is targeted before anything is installed.
current_context=$(kubectl config current-context)
printf 'Current Kubernetes Context: %s\n' "$current_context"

# List the cluster nodes as a quick health check.
kubectl get nodes

# Confirm Helm is installed and report its version.
helm version

# Register the KubeRay chart repository and refresh the local chart index.
helm repo add kuberay https://ray-project.github.io/kuberay-helm/
helm repo update
# Install or upgrade the KubeRay operator using Helm.
# --install makes this idempotent (installs if absent, upgrades otherwise);
# --wait blocks until the operator deployment is ready or the timeout hits.
helm upgrade \
  --install \
  --cleanup-on-fail \
  --wait \
  --timeout 10m0s \
  --namespace "$kuberay_namespace" \
  --create-namespace \
  kuberay-operator kuberay/kuberay-operator \
  --version 1.1.1

# Output the pods in the kuberay namespace (quoted to avoid word-splitting).
kubectl get pods -n "$kuberay_namespace"
# Download the PyTorch MNIST job YAML file.
# -f makes curl exit non-zero on HTTP errors instead of saving an HTML error
# page that kubectl would then fail to parse; -sS keeps output quiet but
# still prints real errors.
curl -fsSLO https://raw.githubusercontent.com/ray-project/kuberay/master/ray-operator/config/samples/pytorch-mnist/ray-job.pytorch-mnist.yaml

# Train a PyTorch Model on Fashion MNIST by submitting the RayJob.
kubectl apply -n "$kuberay_namespace" -f ray-job.pytorch-mnist.yaml

# Output the pods in the kuberay namespace.
kubectl get pods -n "$kuberay_namespace"
# Poll the RayJob until its deployment status reaches "Complete".
# NOTE(review): this assumes exactly one RayJob in the namespace
# (.items[0]) — confirm no other RayJobs are ever deployed here.
job_status=$(kubectl get rayjobs -n "$kuberay_namespace" -o jsonpath='{.items[0].status.jobDeploymentStatus}')
while [ "$job_status" != "Complete" ]; do
  # printf with \r overwrites the same line each poll; echo -ne is
  # non-portable and was replaced.
  printf 'Job Status: %s\r' "$job_status"
  sleep 30
  job_status=$(kubectl get rayjobs -n "$kuberay_namespace" -o jsonpath='{.items[0].status.jobDeploymentStatus}')
done
printf 'Job Status: %s\n' "$job_status"

# Deployment "Complete" does not imply the job itself passed — check the
# job-level status separately.
job_status=$(kubectl get rayjobs -n "$kuberay_namespace" -o jsonpath='{.items[0].status.jobStatus}')
if [ "$job_status" != "SUCCEEDED" ]; then
  echo "Job Failed!" >&2
  exit 1
fi
# If the job succeeded, find the Ray cluster head ClusterIP service name.
# A single awk replaces the grep|grep|awk chain: match both the cluster
# prefix and the ClusterIP type, then print the service name column.
rayclusterhead=$(kubectl get service -n "$kuberay_namespace" \
  | awk '/rayjob-pytorch-mnist-raycluster/ && /ClusterIP/ {print $1}')

# Guard: without this, an empty match would make kubectl expose fail with a
# confusing usage error.
if [ -z "$rayclusterhead" ]; then
  echo "Could not find the Ray cluster head service" >&2
  exit 1
fi

# Now create a service of type NodePort for the Ray cluster head
# (port 8265 is the Ray dashboard).
kubectl expose service "$rayclusterhead" \
  -n "$kuberay_namespace" \
  --port=80 \
  --target-port=8265 \
  --type=NodePort \
  --name=ray-dash
# Create an ingress for the KubeRay dashboard, routing / on the AKS
# app-routing (managed NGINX) ingress to the ray-dash service.
# The heredoc feeds kubectl directly — no need to pipe through cat.
kubectl apply -f - <<EOF
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
  name: ray-dash
  namespace: kuberay
  annotations:
    nginx.ingress.kubernetes.io/rewrite-target: /
spec:
  ingressClassName: webapprouting.kubernetes.azure.com
  rules:
  - http:
      paths:
      - backend:
          service:
            name: ray-dash
            port:
              number: 80
        path: /
        pathType: Prefix
EOF
# Now find the public IP address of the managed NGINX ingress controller.
# BUG FIX: the original used 'kc', an alias that is never defined in this
# script and would fail with "command not found" — it must be 'kubectl'.
lb_public_ip=$(kubectl get svc -n app-routing-system \
  -o jsonpath='{.items[?(@.metadata.name == "nginx")].status.loadBalancer.ingress[0].ip}')
echo "KubeRay Dashboard URL: http://$lb_public_ip/"
exit 0