update node selector
ddynwzh1992 committed Feb 19, 2025
1 parent a613872 commit aa08aea
Showing 2 changed files with 15 additions and 27 deletions.
4 changes: 2 additions & 2 deletions gen-ai/inference/llamacpp-rayserve-graviton/README.md
@@ -20,7 +20,7 @@ You may notice the example model used in this blueprint is formatted as GGUF whi
N_THREADS is the number of threads to use for inference; as a best practice, set it to the number of vCPUs on the host EC2 instance for optimal performance.
CMAKE_ARGS holds the C/C++ compile flags used when building llama.cpp (please refer to [this](https://github.com/aws/aws-graviton-getting-started/blob/main/c-c++.md) for more details about C/C++ compile flags for Graviton).
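
As a rough sketch of how these variables might be set, the values below are illustrative assumptions rather than part of the blueprint; pick N_THREADS to match your instance's vCPU count and use the Graviton guide linked above to choose compile flags.

```bash
# Example for a 64-vCPU Graviton instance such as c7g.16xlarge (hypothetical values).
export N_THREADS=64

# Hypothetical compile flags passed to the llama.cpp build; check the Graviton
# C/C++ guide linked above before settling on flags for your toolchain.
export CMAKE_ARGS="-DCMAKE_C_FLAGS=-mcpu=native -DCMAKE_CXX_FLAGS=-mcpu=native"
```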

After setting up all variables, run this command to create the kubenetes service
After setting up all variables, run this command to create the kubernetes service

```bash
kubectl create -f ray-service-llamacpp.yaml
```
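
Once the manifest is applied, it may help to confirm that the RayService and its pods come up before moving on. A minimal check, assuming the KubeRay operator is installed and the RayService keeps the name used elsewhere in this blueprint:

```bash
# Watch the Ray head and worker pods start (pod names are generated by KubeRay).
kubectl get pods -w

# Check the RayService status; the resource name is assumed to be ray-service-llamacpp.
kubectl get rayservice ray-service-llamacpp
```
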
@@ -44,7 +44,7 @@ Launch an EC2 instance as the client in the same AZ with the Ray cluster(For opt
### 2. Execute port forward for the ray service

```bash
kubectl port-forward svc/ray-service-llamacpp 8000:8000
kubectl port-forward service/ray-service-llamacpp-serve-svc 8000:8000
```
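
With the port forward in place, a quick request can confirm that the endpoint responds. This is only a sketch: the route and payload shape depend on the Serve application defined in ray-service-llamacpp.yaml, so both the path and the JSON body below are assumptions.

```bash
# Hypothetical smoke test; adjust the path and payload to match the Serve app's API.
curl -s http://localhost:8000/ \
  -H "Content-Type: application/json" \
  -d '{"prompt": "What is Arm Neoverse?"}'
```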

### 3. Configure environment
38 changes: 13 additions & 25 deletions gen-ai/inference/llamacpp-rayserve-graviton/ray-service-llamacpp.yaml
@@ -3,7 +3,7 @@ kind: Secret
metadata:
name: token
stringData:
token: $HUGGING_FACE_HUB_TOKEN
token: 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
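
(As an aside, if you prefer not to keep the Hugging Face token in the manifest itself, an equivalent Secret can be created from an environment variable; this is an alternative workflow, not part of the committed file.)

```bash
# Creates the same Secret (name "token", key "token") from $HUGGING_FACE_HUB_TOKEN
# without writing the token into ray-service-llamacpp.yaml.
kubectl create secret generic token \
  --from-literal=token="$HUGGING_FACE_HUB_TOKEN"
```
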
---
apiVersion: v1
kind: ConfigMap
@@ -145,22 +145,16 @@ spec:
ray-control-plane: "true"
spec:
nodeSelector:
ray-control-plane: "false"
model-inferencing: "cpu-arm"
tolerations:
- key: "model-inferencing"
operator: "Equal"
value: "cpu-arm"
effect: "NoSchedule"
kubernetes.io/arch: arm64
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: model-inferencing
operator: In
values:
- "cpu-arm"
- matchExpressions:
- key: kubernetes.io/arch
operator: In
values:
- arm64
restartPolicy: Always
containers:
- name: ray-head
@@ -209,22 +203,16 @@ spec:
template:
spec:
nodeSelector:
ray-control-plane: "false"
model-inferencing: "cpu-arm"
tolerations:
- key: "model-inferencing"
operator: "Equal"
value: "cpu-arm"
effect: "NoSchedule"
kubernetes.io/arch: arm64
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: model-inferencing
operator: In
values:
- "cpu-arm"
- matchExpressions:
- key: kubernetes.io/arch
operator: In
values:
- arm64
restartPolicy: Always
containers:
- name: llm
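
Before applying the updated manifest, it can be worth confirming that the cluster's nodes actually carry the labels this node selection relies on. A minimal check, assuming kubectl already points at the target cluster:

```bash
# kubernetes.io/arch is set automatically by the kubelet; model-inferencing is the
# custom label referenced in this manifest, if your node groups define it.
kubectl get nodes -L kubernetes.io/arch -L model-inferencing
```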
