Skip to content

Commit

Permalink
fixes #778 (#779)
Browse files Browse the repository at this point in the history
adding EFA devices to AWS named_resources

Co-authored-by: Alexander Jipa <[email protected]>
  • Loading branch information
Alexander Jipa and azzhipa committed Oct 19, 2023
1 parent 9421dfa commit a711634
Show file tree
Hide file tree
Showing 2 changed files with 21 additions and 3 deletions.
20 changes: 17 additions & 3 deletions torchx/specs/named_resources_aws.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,8 @@

from torchx.specs.api import Resource

+ EFA_DEVICE = "vpc.amazonaws.com/efa"
+
# ECS and EC2 have a mem tax, and AWS Batch currently uses hard memory limits,
# so we have to account for the mem tax when registering these resources for AWS;
# otherwise the job will be stuck in the job queue forever
Expand Down Expand Up @@ -63,20 +65,32 @@ def aws_p3_16xlarge() -> Resource:

def aws_p3dn_24xlarge() -> Resource:
return Resource(
- cpu=96, gpu=8, memMB=768 * GiB, capabilities={K8S_ITYPE: "p3dn.24xlarge"}
+ cpu=96,
+ gpu=8,
+ memMB=768 * GiB,
+ capabilities={K8S_ITYPE: "p3dn.24xlarge"},
+ devices={EFA_DEVICE: 1},
)


def aws_p4d_24xlarge() -> Resource:
return Resource(
- cpu=96, gpu=8, memMB=1152 * GiB, capabilities={K8S_ITYPE: "p4d.24xlarge"}
+ cpu=96,
+ gpu=8,
+ memMB=1152 * GiB,
+ capabilities={K8S_ITYPE: "p4d.24xlarge"},
+ devices={EFA_DEVICE: 4},
)


def aws_p4de_24xlarge() -> Resource:
# p4de has the same cpu, gpu, and memMB as p4d, but GPU memory is 2x (40GB per GPU on p4d vs 80GB per GPU on p4de)
return Resource(
- cpu=96, gpu=8, memMB=1152 * GiB, capabilities={K8S_ITYPE: "p4de.24xlarge"}
+ cpu=96,
+ gpu=8,
+ memMB=1152 * GiB,
+ capabilities={K8S_ITYPE: "p4de.24xlarge"},
+ devices={EFA_DEVICE: 4},
)


Expand Down
4 changes: 4 additions & 0 deletions torchx/specs/test/named_resources_aws_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@
aws_t3_medium,
aws_trn1_2xl,
aws_trn1_32xl,
+ EFA_DEVICE,
GiB,
K8S_ITYPE,
NAMED_RESOURCES,
Expand Down Expand Up @@ -60,6 +61,7 @@ def test_aws_p3(self) -> None:
self.assertEqual(96, p3dn_24.cpu)
self.assertEqual(p3_16.gpu, p3dn_24.gpu)
self.assertEqual(768 * GiB, p3dn_24.memMB)
+ self.assertEqual({EFA_DEVICE: 1}, p3dn_24.devices)

def test_aws_p4(self) -> None:
p4d = aws_p4d_24xlarge()
Expand All @@ -68,10 +70,12 @@ def test_aws_p4(self) -> None:
self.assertEqual(96, p4d.cpu)
self.assertEqual(8, p4d.gpu)
self.assertEqual(1152 * GiB, p4d.memMB)
+ self.assertEqual({EFA_DEVICE: 4}, p4d.devices)

self.assertEqual(p4de.cpu, p4d.cpu)
self.assertEqual(p4de.gpu, p4d.gpu)
self.assertEqual(p4de.memMB, p4d.memMB)
+ self.assertEqual({EFA_DEVICE: 4}, p4de.devices)

def test_aws_g4dn(self) -> None:
g4d = aws_g4dn_xlarge()
Expand Down

0 comments on commit a711634

Please sign in to comment.