Skip to content

Commit

Permalink
Support mounting Neuron devices for the local_docker scheduler
Browse files: browse the repository at this point in the history
  • Loading branch information
Ryan Li committed Jun 14, 2024
1 parent 2ec3673 commit 71e4164
Show file tree
Hide file tree
Showing 6 changed files with 41 additions and 10 deletions.
15 changes: 11 additions & 4 deletions torchx/schedulers/devices.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,26 +6,33 @@
# LICENSE file in the root directory of this source tree.

# pyre-strict
from functools import partial
import warnings
from typing import Callable, Dict, List, Mapping

from torchx.specs.api import DeviceMount
from torchx.specs.named_resources_aws import EFA_DEVICE, NEURON_DEVICE


def efa_to_devicemounts(num_devices: int) -> List[DeviceMount]:
def to_devicemounts(num_devices: int, device_type: str) -> List[DeviceMount]:
device_mounts = []
for device_index in range(0, num_devices):
device_mounts.append(
DeviceMount(
src_path="/dev/infiniband/uverbs" + str(device_index),
dst_path="/dev/infiniband/uverbs" + str(device_index),
src_path=device_type + str(device_index),
dst_path=device_type + str(device_index),
)
)
return device_mounts


neuron_to_devicemounts = partial(to_devicemounts, device_type="/dev/neuron")
efa_to_devicemounts = partial(to_devicemounts, device_type="/dev/infiniband/uverbs")


DEVICES: Mapping[str, Callable[[int], List[DeviceMount]]] = {
"vpc.amazonaws.com/efa": efa_to_devicemounts,
EFA_DEVICE: efa_to_devicemounts,
NEURON_DEVICE: neuron_to_devicemounts,
}


Expand Down
10 changes: 9 additions & 1 deletion torchx/schedulers/test/aws_batch_scheduler_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -361,7 +361,10 @@ def test_resource_devices(self) -> None:
image="",
mounts=[],
resource=specs.Resource(
cpu=1, memMB=1000, gpu=0, devices={"vpc.amazonaws.com/efa": 2}
cpu=1,
memMB=1000,
gpu=0,
devices={"vpc.amazonaws.com/efa": 2, "aws.amazon.com/neurondevice": 1},
),
)
props = _role_to_node_properties(role, 0)
Expand All @@ -379,6 +382,11 @@ def test_resource_devices(self) -> None:
"containerPath": "/dev/infiniband/uverbs1",
"permissions": ["READ", "WRITE", "MKNOD"],
},
{
"hostPath": "/dev/neuron0",
"containerPath": "/dev/neuron0",
"permissions": ["READ", "WRITE", "MKNOD"],
},
],
)

Expand Down
3 changes: 2 additions & 1 deletion torchx/schedulers/test/devices_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@

class DevicesTest(unittest.TestCase):
def test_get_efa(self) -> None:
devices = {"vpc.amazonaws.com/efa": 2}
devices = {"vpc.amazonaws.com/efa": 2, "aws.amazon.com/neurondevice": 1}
self.assertEqual(
get_device_mounts(devices),
[
Expand All @@ -28,6 +28,7 @@ def test_get_efa(self) -> None:
src_path="/dev/infiniband/uverbs1",
dst_path="/dev/infiniband/uverbs1",
),
DeviceMount(src_path="/dev/neuron0", dst_path="/dev/neuron0"),
],
)

Expand Down
11 changes: 9 additions & 2 deletions torchx/schedulers/test/docker_scheduler_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -161,12 +161,19 @@ def test_device_mounts(self) -> None:
def test_resource_devices(self) -> None:
app = _test_app()
app.roles[0].mounts = []
app.roles[0].resource.devices = {"vpc.amazonaws.com/efa": 1}
app.roles[0].resource.devices = {
"vpc.amazonaws.com/efa": 1,
"aws.amazon.com/neurondevice": 2,
}

info = self.scheduler.submit_dryrun(app, cfg={})
self.assertEqual(
info.request.containers[0].kwargs["devices"],
["/dev/infiniband/uverbs0:/dev/infiniband/uverbs0:rwm"],
[
"/dev/infiniband/uverbs0:/dev/infiniband/uverbs0:rwm",
"/dev/neuron0:/dev/neuron0:rwm",
"/dev/neuron1:/dev/neuron1:rwm",
],
)

@patch("os.environ", {"FOO_1": "f1", "BAR_1": "b1", "FOOBAR_1": "fb1"})
Expand Down
9 changes: 7 additions & 2 deletions torchx/specs/named_resources_aws.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@
from torchx.specs.api import Resource

EFA_DEVICE = "vpc.amazonaws.com/efa"
NEURON_DEVICE = "aws.amazon.com/neurondevice"

# ecs and ec2 have memtax and currently AWS Batch uses hard memory limits
# so we have to account for mem tax when registering these resources for AWS
Expand Down Expand Up @@ -255,7 +256,11 @@ def aws_g5_48xlarge() -> Resource:

def aws_trn1_2xlarge() -> Resource:
return Resource(
cpu=8, gpu=0, memMB=32 * GiB, capabilities={K8S_ITYPE: "trn1.2xlarge"}
cpu=8,
gpu=0,
memMB=32 * GiB,
capabilities={K8S_ITYPE: "trn1.2xlarge"},
devices={NEURON_DEVICE: 1},
)


Expand All @@ -265,7 +270,7 @@ def aws_trn1_32xlarge() -> Resource:
gpu=0,
memMB=512 * GiB,
capabilities={K8S_ITYPE: "trn1.32xlarge"},
devices={EFA_DEVICE: 8},
devices={EFA_DEVICE: 8, NEURON_DEVICE: 16},
)


Expand Down
3 changes: 3 additions & 0 deletions torchx/specs/test/named_resources_aws_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@
aws_trn1_2xlarge,
aws_trn1_32xlarge,
EFA_DEVICE,
NEURON_DEVICE,
GiB,
K8S_ITYPE,
NAMED_RESOURCES,
Expand Down Expand Up @@ -170,11 +171,13 @@ def test_aws_trn1(self) -> None:
self.assertEqual(8, trn1_2.cpu)
self.assertEqual(0, trn1_2.gpu)
self.assertEqual(32 * GiB, trn1_2.memMB)
self.assertEqual({NEURON_DEVICE: 1}, trn1_2.devices)

trn1_32 = aws_trn1_32xlarge()
self.assertEqual(trn1_32.cpu, trn1_2.cpu * 16)
self.assertEqual(trn1_32.gpu, trn1_2.gpu)
self.assertEqual(trn1_32.memMB, trn1_2.memMB * 16)
self.assertEqual({EFA_DEVICE: 8, NEURON_DEVICE: 16}, trn1_32.devices)

def test_aws_m5_2xlarge(self) -> None:
resource = aws_m5_2xlarge()
Expand Down

0 comments on commit 71e4164

Please sign in to comment.