Skip to content

Commit

Permalink
RUN-17256 Adjust to run.ai/reserve_for_gpu_index change (#69)
Browse files Browse the repository at this point in the history
  • Loading branch information
gshaibi authored Mar 31, 2024
1 parent f268b08 commit 6f72a74
Show file tree
Hide file tree
Showing 9 changed files with 720 additions and 244 deletions.
228 changes: 228 additions & 0 deletions design/samples/2.17/fractional_pod.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,228 @@
# Sample Pod object for a fractional-GPU workload (gpu-fraction: "0.5").
# This is a full object dump: it includes server-populated fields such as
# resourceVersion, uid, nodeName and the whole status stanza.
apiVersion: v1
kind: Pod
metadata:
  annotations:
    clusterId: d69dff42-4134-41d9-90fc-1c39505cb774
    cni.projectcalico.org/containerID: 4357ed00f685ddcfafb9b551fbad81f6ad9a39d74f0158fe22b6e937dad415df
    cni.projectcalico.org/podIP: 100.122.249.152/32
    cni.projectcalico.org/podIPs: 100.122.249.152/32
    # Annotation values must be strings, so numeric-looking values are quoted.
    gpu-fraction: "0.5"
    pod-group-name: pg-frac-1-0-2237ca39-cac0-4601-b658-8a3c5f406a4f
    received-resource-type: Fraction
    runai-allocated-gpu-memory: "7680"
    runai-allocated-gpus: "0.5"
    runai-allocated-mig-gpus: "0"
    runai-calculated-status: Running
    runai-job-id: 2237ca39-cac0-4601-b658-8a3c5f406a4f
    runai-node: i-0b498db53280b86a6
    runai/shared-gpu-configmap: frac-1-ns26p7c-runai-sh-gpu
    # Quoted: a plain scalar starting with '[' would be parsed as a flow
    # sequence. NOTE(review): the value looks like a redaction placeholder
    # for an e-mail address - confirm the intended value.
    user: "[email protected]"
    workloadId: 027397ab-4c3c-45f7-87d0-8b3bae4ded65
  creationTimestamp: "2024-03-31T09:03:22Z"
  generateName: frac-1-
  labels:
    app: runaijob
    controller-uid: 2237ca39-cac0-4601-b658-8a3c5f406a4f
    createdBy: RunaiJob
    project: pa
    release: frac-1
    run.ai/top-owner-uid: 027397ab-4c3c-45f7-87d0-8b3bae4ded65
    runai-gpu-group: df7c0dd3-9795-443c-85b9-acbf49c8fb6b
    runai/pod-index: 0-0
    workloadKind: TrainingWorkload
    workloadName: frac-1
  name: frac-1-0-0
  namespace: runai-pa
  ownerReferences:
  - apiVersion: run.ai/v1
    blockOwnerDeletion: true
    controller: true
    kind: RunaiJob
    name: frac-1
    uid: 2237ca39-cac0-4601-b658-8a3c5f406a4f
  resourceVersion: "10748"
  uid: a801b3c7-b9be-4830-821c-2456cad2234f
spec:
  affinity:
    nodeAffinity:
      requiredDuringSchedulingIgnoredDuringExecution:
        nodeSelectorTerms:
        - matchExpressions:
          - key: runai/node-pool
            operator: DoesNotExist
  containers:
  - env:
    - name: RUNAI_JOB_NAME
      value: frac-1
    - name: RUNAI_PROJECT
      value: pa
    - name: WANDB_NOTES
      # Single-quoted because the URL embeds JSON-like flow characters.
      value: 'https://shaibi-real.runailabs.com/trainings?columnFilter=[{"term":"frac-1","name":"name"}]&clusterId=d69dff42-4134-41d9-90fc-1c39505cb774'
    - name: POD_INDEX
      value: "0"
    - name: RUNAI_GPU_MEMORY_REQUEST
      value: "0.50"
    - name: RUNAI_GPU_MEMORY_LIMIT
      value: "0.50"
    # The next two variables are read from the per-container ConfigMap
    # frac-1-ns26p7c-runai-sh-gpu-0 rather than set inline.
    - name: NVIDIA_VISIBLE_DEVICES
      valueFrom:
        configMapKeyRef:
          key: RUNAI-VISIBLE-DEVICES
          name: frac-1-ns26p7c-runai-sh-gpu-0
    - name: RUNAI_NUM_OF_GPUS
      valueFrom:
        configMapKeyRef:
          key: RUNAI_NUM_OF_GPUS
          name: frac-1-ns26p7c-runai-sh-gpu-0
    - name: jobUUID
      value: 2237ca39-cac0-4601-b658-8a3c5f406a4f
    - name: JOB_UUID
      value: 2237ca39-cac0-4601-b658-8a3c5f406a4f
    - name: jobName
      value: frac-1
    - name: JOB_NAME
      value: frac-1
    - name: reporterGatewayURL
      value: runai-prometheus-pushgateway.runai.svc.cluster.local:9091
    - name: REPORTER_GATEWAY_URL
      value: runai-prometheus-pushgateway.runai.svc.cluster.local:9091
    - name: podUUID
      valueFrom:
        fieldRef:
          apiVersion: v1
          fieldPath: metadata.uid
    - name: POD_UUID
      valueFrom:
        fieldRef:
          apiVersion: v1
          fieldPath: metadata.uid
    - name: NODE_NAME
      valueFrom:
        fieldRef:
          apiVersion: v1
          fieldPath: spec.nodeName
    envFrom:
    - configMapRef:
        name: frac-1-ns26p7c-runai-sh-gpu-0-evar
        optional: false
    image: gshaibi/gpu-burn
    imagePullPolicy: IfNotPresent
    name: frac-1
    # Only CPU and memory are requested here; no nvidia.com/gpu resource
    # appears on this container.
    resources:
      requests:
        cpu: 100m
        memory: 100M
    securityContext:
      allowPrivilegeEscalation: false
      capabilities: {}
      seccompProfile:
        type: RuntimeDefault
    terminationMessagePath: /dev/termination-log
    terminationMessagePolicy: File
    volumeMounts:
    - mountPath: /var/run/secrets/kubernetes.io/serviceaccount
      name: kube-api-access-dfphn
      readOnly: true
    # Individual keys of the ConfigMap-backed volume are mounted as single
    # files via subPath.
    - mountPath: /etc/ld.so.preload
      name: frac-1-ns26p7c-runai-sh-gpu-0-vol
      readOnly: true
      subPath: ld.so.preload-key
    - mountPath: /etc/runai.d/memory
      name: frac-1-ns26p7c-runai-sh-gpu-0-vol
      readOnly: true
      subPath: config
    - mountPath: /etc/runai.d/pod_uuid
      name: frac-1-ns26p7c-runai-sh-gpu-0-vol
      readOnly: true
      subPath: pod-uuid
    - mountPath: /runai/shared
      name: runai-shared-directory
      readOnly: true
    - mountPath: /etc/runai.d/route
      name: frac-1-ns26p7c-runai-sh-gpu-0-vol
      readOnly: true
      subPath: route
  dnsPolicy: ClusterFirst
  enableServiceLinks: true
  nodeName: i-0b498db53280b86a6
  preemptionPolicy: PreemptLowerPriority
  priority: 0
  restartPolicy: Never
  schedulerName: runai-scheduler
  securityContext: {}
  serviceAccount: default
  serviceAccountName: default
  terminationGracePeriodSeconds: 30
  tolerations:
  - effect: NoExecute
    key: node.kubernetes.io/not-ready
    operator: Exists
    tolerationSeconds: 300
  - effect: NoExecute
    key: node.kubernetes.io/unreachable
    operator: Exists
    tolerationSeconds: 300
  volumes:
  - name: kube-api-access-dfphn
    projected:
      defaultMode: 420
      sources:
      - serviceAccountToken:
          expirationSeconds: 3607
          path: token
      - configMap:
          items:
          - key: ca.crt
            path: ca.crt
          name: kube-root-ca.crt
      - downwardAPI:
          items:
          - fieldRef:
              apiVersion: v1
              fieldPath: metadata.namespace
            path: namespace
  - configMap:
      defaultMode: 420
      name: frac-1-ns26p7c-runai-sh-gpu-0
    name: frac-1-ns26p7c-runai-sh-gpu-0-vol
  - hostPath:
      path: /var/lib/runai/shared
      type: DirectoryOrCreate
    name: runai-shared-directory
status:
  conditions:
  - lastProbeTime: null
    lastTransitionTime: "2024-03-31T09:03:27Z"
    status: "True"
    type: Initialized
  - lastProbeTime: null
    lastTransitionTime: "2024-03-31T09:03:51Z"
    status: "True"
    type: Ready
  - lastProbeTime: null
    lastTransitionTime: "2024-03-31T09:03:51Z"
    status: "True"
    type: ContainersReady
  - lastProbeTime: null
    lastTransitionTime: "2024-03-31T09:03:27Z"
    status: "True"
    type: PodScheduled
  containerStatuses:
  - containerID: containerd://4205608c75216bfe3d3a71ea7301f8bc041acba92673e033fc87be6d91867dc6
    image: docker.io/gshaibi/gpu-burn:latest
    imageID: docker.io/gshaibi/gpu-burn@sha256:ed07993b0581228c2bd7113fae0ed214549547f0fa91ba50165bc2473cfaf979
    lastState: {}
    name: frac-1
    ready: true
    restartCount: 0
    started: true
    state:
      running:
        startedAt: "2024-03-31T09:03:51Z"
  hostIP: 172.20.62.77
  phase: Running
  podIP: 100.122.249.152
  podIPs:
  - ip: 100.122.249.152
  qosClass: Burstable
  startTime: "2024-03-31T09:03:27Z"
124 changes: 124 additions & 0 deletions design/samples/2.17/fractional_pod_reservation.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,124 @@
# Sample Pod object for a GPU reservation pod in the runai-reservation
# namespace. It requests one whole nvidia.com/gpu and carries the
# run.ai/reserve_for_gpu_index annotation.
# This is a full object dump: it includes server-populated fields such as
# resourceVersion, uid, nodeName and the whole status stanza.
apiVersion: v1
kind: Pod
metadata:
  annotations:
    cni.projectcalico.org/containerID: 75affaf027829643896b3de5699d15fedb291f4f7efac6f00b0d0bbe9a2dd65a
    cni.projectcalico.org/podIP: 100.122.249.151/32
    cni.projectcalico.org/podIPs: 100.122.249.151/32
    pod-group-name: pg-runai-reservation-gpu-i-0b498db53280b86a6-fzdhl-3b47e794-97f0-4824-b7d5-bb44c122039e
    # NOTE(review): despite the "_index" suffix, the value in this sample is
    # a "GPU-<uuid>"-style identifier, not a numeric index - confirm that
    # consumers of this annotation expect this format.
    run.ai/reserve_for_gpu_index: GPU-8983c66a-23df-e63b-4c2f-afcae9ec79b3
    runai-job-id: 3b47e794-97f0-4824-b7d5-bb44c122039e
  creationTimestamp: "2024-03-31T09:03:25Z"
  labels:
    app: runai-reservation
    app.runai.resource.reservation: runai-reservation-gpu
    runai-gpu-group: df7c0dd3-9795-443c-85b9-acbf49c8fb6b
  name: runai-reservation-gpu-i-0b498db53280b86a6-fzdhl
  namespace: runai-reservation
  resourceVersion: "10625"
  uid: 3b47e794-97f0-4824-b7d5-bb44c122039e
spec:
  containers:
  - env:
    - name: POD_NAME
      valueFrom:
        fieldRef:
          apiVersion: v1
          fieldPath: metadata.name
    - name: POD_NAMESPACE
      valueFrom:
        fieldRef:
          apiVersion: v1
          fieldPath: metadata.namespace
    image: gcr.io/run-ai-prod/resource-reservation:v3.5.0
    imagePullPolicy: IfNotPresent
    name: runai-reservation
    # Resource quantities are quoted so they stay strings.
    resources:
      limits:
        nvidia.com/gpu: "1"
      requests:
        nvidia.com/gpu: "1"
    terminationMessagePath: /dev/termination-log
    terminationMessagePolicy: File
    volumeMounts:
    - mountPath: /var/run/secrets/kubernetes.io/serviceaccount
      name: kube-api-access-fnjgk
      readOnly: true
  dnsPolicy: ClusterFirst
  enableServiceLinks: true
  imagePullSecrets:
  - name: runai-reg-creds
  nodeName: i-0b498db53280b86a6
  preemptionPolicy: PreemptLowerPriority
  priority: 0
  restartPolicy: Always
  schedulerName: runai-scheduler
  securityContext: {}
  serviceAccount: runai-reservation-engine
  serviceAccountName: runai-reservation-engine
  terminationGracePeriodSeconds: 30
  tolerations:
  - effect: NoExecute
    key: node.kubernetes.io/not-ready
    operator: Exists
    tolerationSeconds: 300
  - effect: NoExecute
    key: node.kubernetes.io/unreachable
    operator: Exists
    tolerationSeconds: 300
  volumes:
  - name: kube-api-access-fnjgk
    projected:
      defaultMode: 420
      sources:
      - serviceAccountToken:
          expirationSeconds: 3607
          path: token
      - configMap:
          items:
          - key: ca.crt
            path: ca.crt
          name: kube-root-ca.crt
      - downwardAPI:
          items:
          - fieldRef:
              apiVersion: v1
              fieldPath: metadata.namespace
            path: namespace
status:
  conditions:
  - lastProbeTime: null
    lastTransitionTime: "2024-03-31T09:03:25Z"
    status: "True"
    type: Initialized
  - lastProbeTime: null
    lastTransitionTime: "2024-03-31T09:03:27Z"
    status: "True"
    type: Ready
  - lastProbeTime: null
    lastTransitionTime: "2024-03-31T09:03:27Z"
    status: "True"
    type: ContainersReady
  - lastProbeTime: null
    lastTransitionTime: "2024-03-31T09:03:25Z"
    status: "True"
    type: PodScheduled
  containerStatuses:
  - containerID: containerd://1063439dc8e82d20ef89a97ad9567d40d59d0d270ac5b8d4cab7f49a474e4398
    image: gcr.io/run-ai-prod/resource-reservation:v3.5.0
    imageID: gcr.io/run-ai-prod/resource-reservation@sha256:add1db641829508bbd1e74a7e757348159bc99b67844fc656acc1e795872d0a6
    lastState: {}
    name: runai-reservation
    ready: true
    restartCount: 0
    started: true
    state:
      running:
        startedAt: "2024-03-31T09:03:27Z"
  hostIP: 172.20.62.77
  phase: Running
  podIP: 100.122.249.151
  podIPs:
  - ip: 100.122.249.151
  qosClass: BestEffort
  startTime: "2024-03-31T09:03:25Z"
Loading

0 comments on commit 6f72a74

Please sign in to comment.