-
Notifications
You must be signed in to change notification settings - Fork 18
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
RUN-17256 Adjust to
run.ai/reserve_for_gpu_index
change (#69)
- Loading branch information
Showing
9 changed files
with
720 additions
and
244 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,228 @@ | ||
apiVersion: v1 | ||
kind: Pod | ||
metadata: | ||
annotations: | ||
clusterId: d69dff42-4134-41d9-90fc-1c39505cb774 | ||
cni.projectcalico.org/containerID: 4357ed00f685ddcfafb9b551fbad81f6ad9a39d74f0158fe22b6e937dad415df | ||
cni.projectcalico.org/podIP: 100.122.249.152/32 | ||
cni.projectcalico.org/podIPs: 100.122.249.152/32 | ||
gpu-fraction: "0.5" | ||
pod-group-name: pg-frac-1-0-2237ca39-cac0-4601-b658-8a3c5f406a4f | ||
received-resource-type: Fraction | ||
runai-allocated-gpu-memory: "7680" | ||
runai-allocated-gpus: "0.5" | ||
runai-allocated-mig-gpus: "0" | ||
runai-calculated-status: Running | ||
runai-job-id: 2237ca39-cac0-4601-b658-8a3c5f406a4f | ||
runai-node: i-0b498db53280b86a6 | ||
runai/shared-gpu-configmap: frac-1-ns26p7c-runai-sh-gpu | ||
user: [email protected] | ||
workloadId: 027397ab-4c3c-45f7-87d0-8b3bae4ded65 | ||
creationTimestamp: "2024-03-31T09:03:22Z" | ||
generateName: frac-1- | ||
labels: | ||
app: runaijob | ||
controller-uid: 2237ca39-cac0-4601-b658-8a3c5f406a4f | ||
createdBy: RunaiJob | ||
project: pa | ||
release: frac-1 | ||
run.ai/top-owner-uid: 027397ab-4c3c-45f7-87d0-8b3bae4ded65 | ||
runai-gpu-group: df7c0dd3-9795-443c-85b9-acbf49c8fb6b | ||
runai/pod-index: 0-0 | ||
workloadKind: TrainingWorkload | ||
workloadName: frac-1 | ||
name: frac-1-0-0 | ||
namespace: runai-pa | ||
ownerReferences: | ||
- apiVersion: run.ai/v1 | ||
blockOwnerDeletion: true | ||
controller: true | ||
kind: RunaiJob | ||
name: frac-1 | ||
uid: 2237ca39-cac0-4601-b658-8a3c5f406a4f | ||
resourceVersion: "10748" | ||
uid: a801b3c7-b9be-4830-821c-2456cad2234f | ||
spec: | ||
affinity: | ||
nodeAffinity: | ||
requiredDuringSchedulingIgnoredDuringExecution: | ||
nodeSelectorTerms: | ||
- matchExpressions: | ||
- key: runai/node-pool | ||
operator: DoesNotExist | ||
containers: | ||
- env: | ||
- name: RUNAI_JOB_NAME | ||
value: frac-1 | ||
- name: RUNAI_PROJECT | ||
value: pa | ||
- name: WANDB_NOTES | ||
value: https://shaibi-real.runailabs.com/trainings?columnFilter=[{"term":"frac-1","name":"name"}]&clusterId=d69dff42-4134-41d9-90fc-1c39505cb774 | ||
- name: POD_INDEX | ||
value: "0" | ||
- name: RUNAI_GPU_MEMORY_REQUEST | ||
value: "0.50" | ||
- name: RUNAI_GPU_MEMORY_LIMIT | ||
value: "0.50" | ||
- name: NVIDIA_VISIBLE_DEVICES | ||
valueFrom: | ||
configMapKeyRef: | ||
key: RUNAI-VISIBLE-DEVICES | ||
name: frac-1-ns26p7c-runai-sh-gpu-0 | ||
- name: RUNAI_NUM_OF_GPUS | ||
valueFrom: | ||
configMapKeyRef: | ||
key: RUNAI_NUM_OF_GPUS | ||
name: frac-1-ns26p7c-runai-sh-gpu-0 | ||
- name: jobUUID | ||
value: 2237ca39-cac0-4601-b658-8a3c5f406a4f | ||
- name: JOB_UUID | ||
value: 2237ca39-cac0-4601-b658-8a3c5f406a4f | ||
- name: jobName | ||
value: frac-1 | ||
- name: JOB_NAME | ||
value: frac-1 | ||
- name: reporterGatewayURL | ||
value: runai-prometheus-pushgateway.runai.svc.cluster.local:9091 | ||
- name: REPORTER_GATEWAY_URL | ||
value: runai-prometheus-pushgateway.runai.svc.cluster.local:9091 | ||
- name: podUUID | ||
valueFrom: | ||
fieldRef: | ||
apiVersion: v1 | ||
fieldPath: metadata.uid | ||
- name: POD_UUID | ||
valueFrom: | ||
fieldRef: | ||
apiVersion: v1 | ||
fieldPath: metadata.uid | ||
- name: NODE_NAME | ||
valueFrom: | ||
fieldRef: | ||
apiVersion: v1 | ||
fieldPath: spec.nodeName | ||
envFrom: | ||
- configMapRef: | ||
name: frac-1-ns26p7c-runai-sh-gpu-0-evar | ||
optional: false | ||
image: gshaibi/gpu-burn | ||
imagePullPolicy: IfNotPresent | ||
name: frac-1 | ||
resources: | ||
requests: | ||
cpu: 100m | ||
memory: 100M | ||
securityContext: | ||
allowPrivilegeEscalation: false | ||
capabilities: {} | ||
seccompProfile: | ||
type: RuntimeDefault | ||
terminationMessagePath: /dev/termination-log | ||
terminationMessagePolicy: File | ||
volumeMounts: | ||
- mountPath: /var/run/secrets/kubernetes.io/serviceaccount | ||
name: kube-api-access-dfphn | ||
readOnly: true | ||
- mountPath: /etc/ld.so.preload | ||
name: frac-1-ns26p7c-runai-sh-gpu-0-vol | ||
readOnly: true | ||
subPath: ld.so.preload-key | ||
- mountPath: /etc/runai.d/memory | ||
name: frac-1-ns26p7c-runai-sh-gpu-0-vol | ||
readOnly: true | ||
subPath: config | ||
- mountPath: /etc/runai.d/pod_uuid | ||
name: frac-1-ns26p7c-runai-sh-gpu-0-vol | ||
readOnly: true | ||
subPath: pod-uuid | ||
- mountPath: /runai/shared | ||
name: runai-shared-directory | ||
readOnly: true | ||
- mountPath: /etc/runai.d/route | ||
name: frac-1-ns26p7c-runai-sh-gpu-0-vol | ||
readOnly: true | ||
subPath: route | ||
dnsPolicy: ClusterFirst | ||
enableServiceLinks: true | ||
nodeName: i-0b498db53280b86a6 | ||
preemptionPolicy: PreemptLowerPriority | ||
priority: 0 | ||
restartPolicy: Never | ||
schedulerName: runai-scheduler | ||
securityContext: {} | ||
serviceAccount: default | ||
serviceAccountName: default | ||
terminationGracePeriodSeconds: 30 | ||
tolerations: | ||
- effect: NoExecute | ||
key: node.kubernetes.io/not-ready | ||
operator: Exists | ||
tolerationSeconds: 300 | ||
- effect: NoExecute | ||
key: node.kubernetes.io/unreachable | ||
operator: Exists | ||
tolerationSeconds: 300 | ||
volumes: | ||
- name: kube-api-access-dfphn | ||
projected: | ||
defaultMode: 420 | ||
sources: | ||
- serviceAccountToken: | ||
expirationSeconds: 3607 | ||
path: token | ||
- configMap: | ||
items: | ||
- key: ca.crt | ||
path: ca.crt | ||
name: kube-root-ca.crt | ||
- downwardAPI: | ||
items: | ||
- fieldRef: | ||
apiVersion: v1 | ||
fieldPath: metadata.namespace | ||
path: namespace | ||
- configMap: | ||
defaultMode: 420 | ||
name: frac-1-ns26p7c-runai-sh-gpu-0 | ||
name: frac-1-ns26p7c-runai-sh-gpu-0-vol | ||
- hostPath: | ||
path: /var/lib/runai/shared | ||
type: DirectoryOrCreate | ||
name: runai-shared-directory | ||
status: | ||
conditions: | ||
- lastProbeTime: null | ||
lastTransitionTime: "2024-03-31T09:03:27Z" | ||
status: "True" | ||
type: Initialized | ||
- lastProbeTime: null | ||
lastTransitionTime: "2024-03-31T09:03:51Z" | ||
status: "True" | ||
type: Ready | ||
- lastProbeTime: null | ||
lastTransitionTime: "2024-03-31T09:03:51Z" | ||
status: "True" | ||
type: ContainersReady | ||
- lastProbeTime: null | ||
lastTransitionTime: "2024-03-31T09:03:27Z" | ||
status: "True" | ||
type: PodScheduled | ||
containerStatuses: | ||
- containerID: containerd://4205608c75216bfe3d3a71ea7301f8bc041acba92673e033fc87be6d91867dc6 | ||
image: docker.io/gshaibi/gpu-burn:latest | ||
imageID: docker.io/gshaibi/gpu-burn@sha256:ed07993b0581228c2bd7113fae0ed214549547f0fa91ba50165bc2473cfaf979 | ||
lastState: {} | ||
name: frac-1 | ||
ready: true | ||
restartCount: 0 | ||
started: true | ||
state: | ||
running: | ||
startedAt: "2024-03-31T09:03:51Z" | ||
hostIP: 172.20.62.77 | ||
phase: Running | ||
podIP: 100.122.249.152 | ||
podIPs: | ||
- ip: 100.122.249.152 | ||
qosClass: Burstable | ||
startTime: "2024-03-31T09:03:27Z" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,124 @@ | ||
apiVersion: v1 | ||
kind: Pod | ||
metadata: | ||
annotations: | ||
cni.projectcalico.org/containerID: 75affaf027829643896b3de5699d15fedb291f4f7efac6f00b0d0bbe9a2dd65a | ||
cni.projectcalico.org/podIP: 100.122.249.151/32 | ||
cni.projectcalico.org/podIPs: 100.122.249.151/32 | ||
pod-group-name: pg-runai-reservation-gpu-i-0b498db53280b86a6-fzdhl-3b47e794-97f0-4824-b7d5-bb44c122039e | ||
run.ai/reserve_for_gpu_index: GPU-8983c66a-23df-e63b-4c2f-afcae9ec79b3 | ||
runai-job-id: 3b47e794-97f0-4824-b7d5-bb44c122039e | ||
creationTimestamp: "2024-03-31T09:03:25Z" | ||
labels: | ||
app: runai-reservation | ||
app.runai.resource.reservation: runai-reservation-gpu | ||
runai-gpu-group: df7c0dd3-9795-443c-85b9-acbf49c8fb6b | ||
name: runai-reservation-gpu-i-0b498db53280b86a6-fzdhl | ||
namespace: runai-reservation | ||
resourceVersion: "10625" | ||
uid: 3b47e794-97f0-4824-b7d5-bb44c122039e | ||
spec: | ||
containers: | ||
- env: | ||
- name: POD_NAME | ||
valueFrom: | ||
fieldRef: | ||
apiVersion: v1 | ||
fieldPath: metadata.name | ||
- name: POD_NAMESPACE | ||
valueFrom: | ||
fieldRef: | ||
apiVersion: v1 | ||
fieldPath: metadata.namespace | ||
image: gcr.io/run-ai-prod/resource-reservation:v3.5.0 | ||
imagePullPolicy: IfNotPresent | ||
name: runai-reservation | ||
resources: | ||
limits: | ||
nvidia.com/gpu: "1" | ||
requests: | ||
nvidia.com/gpu: "1" | ||
terminationMessagePath: /dev/termination-log | ||
terminationMessagePolicy: File | ||
volumeMounts: | ||
- mountPath: /var/run/secrets/kubernetes.io/serviceaccount | ||
name: kube-api-access-fnjgk | ||
readOnly: true | ||
dnsPolicy: ClusterFirst | ||
enableServiceLinks: true | ||
imagePullSecrets: | ||
- name: runai-reg-creds | ||
nodeName: i-0b498db53280b86a6 | ||
preemptionPolicy: PreemptLowerPriority | ||
priority: 0 | ||
restartPolicy: Always | ||
schedulerName: runai-scheduler | ||
securityContext: {} | ||
serviceAccount: runai-reservation-engine | ||
serviceAccountName: runai-reservation-engine | ||
terminationGracePeriodSeconds: 30 | ||
tolerations: | ||
- effect: NoExecute | ||
key: node.kubernetes.io/not-ready | ||
operator: Exists | ||
tolerationSeconds: 300 | ||
- effect: NoExecute | ||
key: node.kubernetes.io/unreachable | ||
operator: Exists | ||
tolerationSeconds: 300 | ||
volumes: | ||
- name: kube-api-access-fnjgk | ||
projected: | ||
defaultMode: 420 | ||
sources: | ||
- serviceAccountToken: | ||
expirationSeconds: 3607 | ||
path: token | ||
- configMap: | ||
items: | ||
- key: ca.crt | ||
path: ca.crt | ||
name: kube-root-ca.crt | ||
- downwardAPI: | ||
items: | ||
- fieldRef: | ||
apiVersion: v1 | ||
fieldPath: metadata.namespace | ||
path: namespace | ||
status: | ||
conditions: | ||
- lastProbeTime: null | ||
lastTransitionTime: "2024-03-31T09:03:25Z" | ||
status: "True" | ||
type: Initialized | ||
- lastProbeTime: null | ||
lastTransitionTime: "2024-03-31T09:03:27Z" | ||
status: "True" | ||
type: Ready | ||
- lastProbeTime: null | ||
lastTransitionTime: "2024-03-31T09:03:27Z" | ||
status: "True" | ||
type: ContainersReady | ||
- lastProbeTime: null | ||
lastTransitionTime: "2024-03-31T09:03:25Z" | ||
status: "True" | ||
type: PodScheduled | ||
containerStatuses: | ||
- containerID: containerd://1063439dc8e82d20ef89a97ad9567d40d59d0d270ac5b8d4cab7f49a474e4398 | ||
image: gcr.io/run-ai-prod/resource-reservation:v3.5.0 | ||
imageID: gcr.io/run-ai-prod/resource-reservation@sha256:add1db641829508bbd1e74a7e757348159bc99b67844fc656acc1e795872d0a6 | ||
lastState: {} | ||
name: runai-reservation | ||
ready: true | ||
restartCount: 0 | ||
started: true | ||
state: | ||
running: | ||
startedAt: "2024-03-31T09:03:27Z" | ||
hostIP: 172.20.62.77 | ||
phase: Running | ||
podIP: 100.122.249.151 | ||
podIPs: | ||
- ip: 100.122.249.151 | ||
qosClass: BestEffort | ||
startTime: "2024-03-31T09:03:25Z" |
Oops, something went wrong.