Skip to content

Commit

Permalink
GNN training blog
Browse files Browse the repository at this point in the history
  • Loading branch information
aeli1 committed Nov 8, 2022
1 parent da7c5ea commit 19708c5
Show file tree
Hide file tree
Showing 2 changed files with 100 additions and 0 deletions.
68 changes: 68 additions & 0 deletions training/provisioner.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
# Karpenter Provisioner (karpenter.sh/v1alpha5) named "gpu-provisioner",
# applied to the cluster as a raw manifest. It allows x86-64 g4dn instances
# (spot or on-demand) and taints the resulting nodes with
# nvidia.com/gpu:NoSchedule.
resource "kubernetes_manifest" "provisioner_gpu_provisioner" {
  manifest = {
    apiVersion = "karpenter.sh/v1alpha5"
    kind       = "Provisioner"

    metadata = {
      name = "gpu-provisioner"
    }

    spec = {
      # Node constraints: instance type, capacity type and CPU architecture.
      requirements = [
        {
          key      = "node.kubernetes.io/instance-type"
          operator = "In"
          values   = ["g4dn.xlarge", "g4dn.4xlarge", "g4dn.8xlarge", "g4dn.16xlarge"]
        },
        {
          key      = "karpenter.sh/capacity-type"
          operator = "In"
          values   = ["spot", "on-demand"]
        },
        {
          key      = "kubernetes.io/arch"
          operator = "In"
          values   = ["amd64"]
        },
      ]

      # Nodes carry this taint, so pods need a matching toleration to be
      # scheduled onto them.
      taints = [
        {
          key    = "nvidia.com/gpu"
          effect = "NoSchedule"
        },
      ]

      # Per the Karpenter v1alpha5 API: seconds a node may sit empty before
      # it is removed.
      ttlSecondsAfterEmpty = 30

      # Cloud-provider settings: subnet / security-group discovery, instance
      # tags and the root volume definition.
      provider = {
        subnetSelector = {
          Name = "training-cluster-vpc-private*"
        }

        securityGroupSelector = {
          "karpenter.sh/discovery/training-cluster" = "training-cluster"
        }

        tags = {
          "karpenter.sh/discovery/training-cluster" = "training-cluster"
        }

        # 100Gi gp3 volume on /dev/xvda, deleted together with the instance.
        blockDeviceMappings = [
          {
            deviceName = "/dev/xvda"
            ebs = {
              volumeType          = "gp3"
              volumeSize          = "100Gi"
              iops                = 3000
              throughput          = 125
              deleteOnTermination = true
            }
          },
        ]
      }
    }
  }
}
32 changes: 32 additions & 0 deletions training/training-job.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
# Batch Job that runs the "gnn-trainer" container once, requesting one
# NVIDIA GPU and 250G of memory. The finished Job object is cleaned up
# 60 seconds after completion (ttlSecondsAfterFinished).
apiVersion: batch/v1
kind: Job
metadata:
  name: gnn-training-job
  namespace: default
  labels:
    jobname: gnn-training-job
spec:
  ttlSecondsAfterFinished: 60
  template:
    metadata:
      name: gnn-training-job-pod
      labels:
        app: gnn-training-job-pod
    spec:
      # The gpu-provisioner in training/provisioner.tf taints its nodes with
      # nvidia.com/gpu:NoSchedule. Without this toleration the pod cannot be
      # placed on those nodes and stays Pending.
      tolerations:
        - key: nvidia.com/gpu
          operator: Exists
          effect: NoSchedule
      containers:
        - name: gnn-trainer
          image: <obfuscated>
          imagePullPolicy: IfNotPresent
          resources:
            requests:
              memory: "250G"
            limits:
              nvidia.com/gpu: 1
          volumeMounts:
            # Memory-backed emptyDir mounted over /dev/shm.
            - name: dshm
              mountPath: /dev/shm
      restartPolicy: OnFailure
      volumes:
        - name: dshm
          emptyDir:
            medium: Memory

0 comments on commit 19708c5

Please sign in to comment.