-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfinal_run.yaml
36 lines (36 loc) · 937 Bytes
/
final_run.yaml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
# One-shot Pod that runs transformer_main.py under torchrun three times in a
# row, with 2, 4 and then 8 processes per node (--algo 1f1b-1 --msbp).
# NOTE: metadata.generateName (instead of metadata.name) means the API server
# appends a random suffix; submit with `kubectl create -f`, since
# `kubectl apply` needs a fixed name.
apiVersion: v1
kind: Pod
metadata:
  generateName: resnet-custom-backprop-
spec:
  # Uncomment to pin the Pod to one specific node instead of relying on the
  # nodeSelector below.
  # nodeName: gpu8-vm13 #9
  containers:
    - name: custom-backprop
      image: bigballoon8/custom-backprop
      # `bash -c -- <script>`: the single args entry below is the script text.
      command: [/bin/bash, -c, --]
      args:
        # The runs are separated by `;`, so each one starts regardless of
        # whether the previous run succeeded.
        - |
          torchrun --nproc_per_node=2 transformer_main.py --algo 1f1b-1 --msbp ;
          torchrun --nproc_per_node=4 transformer_main.py --algo 1f1b-1 --msbp ;
          torchrun --nproc_per_node=8 transformer_main.py --algo 1f1b-1 --msbp ;
      # The script is resolved relative to this directory inside the image.
      workingDir: /workspace/experiment3(pipeline_parallelism)/
      env:
        # Env values must be strings, hence the quotes around the number.
        - name: OMP_NUM_THREADS
          value: "4"
        # Uncomment for verbose NCCL logging.
        # - name: NCCL_DEBUG
        #   value: "INFO"
      resources:
        limits:
          cpu: 32
          memory: 64Gi
          # Must cover the largest --nproc_per_node used above (8).
          nvidia.com/gpu: 8
      volumeMounts:
        # Memory-backed volume mounted over /dev/shm (see `volumes` below);
        # presumably to give the training processes more shared memory than
        # the container default - confirm.
        - mountPath: /dev/shm
          name: devshm
  nodeSelector:
    nvidia.com/gpu.product: NVIDIA-A100-SXM4-40GB
  # Run the script once; do not restart the container when it exits.
  restartPolicy: Never
  volumes:
    - name: devshm
      emptyDir:
        medium: Memory