Skip to content

Commit

Permalink
Expose first_device parameter for setting models, scripts, functions (#…
Browse files Browse the repository at this point in the history
…394)

Expose first_device parameter for setting models, scripts, functions
[ committed by @billschereriii ]
[ reviewed by @al-rigazzi @MattToast ]
  • Loading branch information
billschereriii authored Oct 12, 2023
1 parent a9e64c8 commit b509efd
Show file tree
Hide file tree
Showing 8 changed files with 158 additions and 36 deletions.
11 changes: 8 additions & 3 deletions doc/changelog.rst
Original file line number Diff line number Diff line change
Expand Up @@ -19,11 +19,16 @@ To be released at some future point in time

Description

- Expose first_device parameter for scripts, functions, models
- Added support for MINBATCHTIMEOUT in model execution
- Remove support for RedisAI 1.2.5, use RedisAI 1.2.7 commit
- Add support for multiple databases

Detailed Notes

- Added support for the first_device parameter for scripts, functions,
  and models. This causes them to be loaded onto num_devices devices,
  starting with the device numbered first_device (PR394_)
- Added support for MINBATCHTIMEOUT in model execution, which caps the delay
waiting for a minimum number of model execution operations to accumulate
before executing them as a batch (PR387_)
Expand All @@ -32,10 +37,13 @@ Detailed Notes
bug which breaks the build process on Mac OSX, it was decided to
use commit 634916c_ from RedisAI's GitHub repository, where such
bug has been fixed. This applies to all operating systems. (PR383_)
- Add support for creation of multiple databases with unique identifiers. (PR342_)

.. _PR394: https://github.com/CrayLabs/SmartSim/pull/394
.. _PR387: https://github.com/CrayLabs/SmartSim/pull/387
.. _PR383: https://github.com/CrayLabs/SmartSim/pull/383
.. _634916c: https://github.com/RedisAI/RedisAI/commit/634916c722e718cc6ea3fad46e63f7d798f9adc2
.. _PR342: https://github.com/CrayLabs/SmartSim/pull/342

0.5.1
-----
Expand All @@ -44,7 +52,6 @@ Released on 14 September, 2023

Description

- Add support for multiple databases
- Add typehints throughout the SmartSim codebase
- Provide support for Slurm heterogeneous jobs
- Provide better support for `PalsMpiexecSettings`
Expand All @@ -62,7 +69,6 @@ Description

Detailed Notes

- Add support for creation of multiple databases with unique identifiers. (PR342_)
- Add methods to allow users to inspect files attached to models and ensembles. (PR352_)
- Add a `smart info` target to provide rudimentary information about the SmartSim installation. (PR350_)
- Remove unnecessary generation producing unexpected directories in the test suite. (PR349_)
Expand All @@ -86,7 +92,6 @@ Detailed Notes
- Update pylint dependency, update .pylintrc, mitigate non-breaking issues, suppress api breaks. (PR311_)
- Refactor the `smart` CLI to use subparsers for better documentation and extension. (PR308_)

.. _PR342: https://github.com/CrayLabs/SmartSim/pull/342
.. _PR352: https://github.com/CrayLabs/SmartSim/pull/352
.. _PR351: https://github.com/CrayLabs/SmartSim/pull/351
.. _PR350: https://github.com/CrayLabs/SmartSim/pull/350
Expand Down
14 changes: 9 additions & 5 deletions smartsim/_core/entrypoints/colocated.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,8 @@ def launch_db_model(client: Client, db_model: t.List[str]) -> str:
parser.add_argument("--file", type=str)
parser.add_argument("--backend", type=str)
parser.add_argument("--device", type=str)
parser.add_argument("--devices_per_node", type=int)
parser.add_argument("--devices_per_node", type=int, default=1)
parser.add_argument("--first_device", type=int, default=0)
parser.add_argument("--batch_size", type=int, default=0)
parser.add_argument("--min_batch_size", type=int, default=0)
parser.add_argument("--min_batch_timeout", type=int, default=0)
Expand All @@ -100,7 +101,7 @@ def launch_db_model(client: Client, db_model: t.List[str]) -> str:
name=name,
model_file=args.file,
backend=args.backend,
fist_gpu=0,
first_gpu=args.first_device,
num_gpus=args.devices_per_node,
batch_size=args.batch_size,
min_batch_size=args.min_batch_size,
Expand Down Expand Up @@ -142,7 +143,8 @@ def launch_db_script(client: Client, db_script: t.List[str]) -> str:
parser.add_argument("--file", type=str)
parser.add_argument("--backend", type=str)
parser.add_argument("--device", type=str)
parser.add_argument("--devices_per_node", type=int)
parser.add_argument("--devices_per_node", type=int, default=1)
parser.add_argument("--first_device", type=int, default=0)
args = parser.parse_args(db_script)

if args.file and args.func:
Expand All @@ -151,13 +153,15 @@ def launch_db_script(client: Client, db_script: t.List[str]) -> str:
if args.func:
func = args.func.replace("\\n", "\n")
if args.devices_per_node > 1 and args.device.lower() == "gpu":
client.set_script_multigpu(args.name, func, 0, args.devices_per_node)
client.set_script_multigpu(
args.name, func, args.first_device, args.devices_per_node
)
else:
client.set_script(args.name, func, args.device)
elif args.file:
if args.devices_per_node > 1 and args.device.lower() == "gpu":
client.set_script_from_file_multigpu(
args.name, args.file, 0, args.devices_per_node
args.name, args.file, args.first_device, args.devices_per_node
)
else:
client.set_script_from_file(args.name, args.file, args.device)
Expand Down
3 changes: 2 additions & 1 deletion smartsim/_core/launcher/colocated.py
Original file line number Diff line number Diff line change
Expand Up @@ -218,6 +218,7 @@ def _build_db_model_cmd(db_models: t.List[DBModel]) -> t.List[str]:
cmd.append(f"--backend={db_model.backend}")
cmd.append(f"--device={db_model.device}")
cmd.append(f"--devices_per_node={db_model.devices_per_node}")
cmd.append(f"--first_device={db_model.first_device}")
if db_model.batch_size:
cmd.append(f"--batch_size={db_model.batch_size}")
if db_model.min_batch_size:
Expand Down Expand Up @@ -254,5 +255,5 @@ def _build_db_script_cmd(db_scripts: t.List[DBScript]) -> t.List[str]:
cmd.append(f"--file={db_script.file}")
cmd.append(f"--device={db_script.device}")
cmd.append(f"--devices_per_node={db_script.devices_per_node}")

cmd.append(f"--first_device={db_script.first_device}")
return cmd
47 changes: 36 additions & 11 deletions smartsim/entity/dbobject.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@ def __init__(
file_path: t.Optional[str],
device: t.Literal["CPU", "GPU"],
devices_per_node: int,
first_device: int,
) -> None:
self.name = name
self.func = func
Expand All @@ -56,7 +57,8 @@ def __init__(
self.file = self._check_filepath(file_path)
self.device = self._check_device(device)
self.devices_per_node = devices_per_node
self._check_devices(device, devices_per_node)
self.first_device = first_device
self._check_devices(device, devices_per_node, first_device)

@property
def devices(self) -> t.List[str]:
Expand Down Expand Up @@ -118,16 +120,28 @@ def _enumerate_devices(self) -> t.List[str]:

if self.device == "GPU" and self.devices_per_node > 1:
return [
f"{self.device}:{str(device_num)}"
for device_num in range(self.devices_per_node)
f"{self.device}:{device_num}"
for device_num in range(
self.first_device, self.first_device + self.devices_per_node
)
]

return [self.device]

@staticmethod
def _check_devices(
device: t.Literal["CPU", "GPU"], devices_per_node: int
device: t.Literal["CPU", "GPU"], devices_per_node: int, first_device: int,
) -> None:
if device == "CPU" and devices_per_node > 1:
raise SSUnsupportedError(
"Cannot set devices_per_node>1 if CPU is specified under devices"
)

if device == "CPU" and first_device > 0:
raise SSUnsupportedError(
"Cannot set first_device>0 if CPU is specified under devices"
)

if devices_per_node == 1:
return

Expand All @@ -136,10 +150,6 @@ def _check_devices(
msg += f"the device was set to {device} and \
devices_per_node=={devices_per_node}"
raise ValueError(msg)
if device == "CPU":
raise SSUnsupportedError(
"Cannot set devices_per_node>1 if CPU is specified under devices"
)


class DBScript(DBObject):
Expand All @@ -150,14 +160,17 @@ def __init__(
script_path: t.Optional[str] = None,
device: t.Literal["CPU", "GPU"] = "CPU",
devices_per_node: int = 1,
first_device: int = 0,
):
"""TorchScript code represenation
Device selection is either "GPU" or "CPU". If many devices are
present, a number can be passed for specification e.g. "GPU:1".
Setting ``devices_per_node=N``, with N greater than one will result
in the model being stored on the first N devices of type ``device``.
in the script being stored on the first N devices of type ``device``;
additionally setting ``first_device=M`` will instead result in the
script being stored on devices M through M + N - 1.
One of either script (in memory representation) or script_path (file)
must be provided
Expand All @@ -172,8 +185,12 @@ def __init__(
:type device: str, optional
:param devices_per_node: number of devices to store the script on
:type devices_per_node: int
:param first_device: first device to store the script on
:type first_device: int
"""
super().__init__(name, script, script_path, device, devices_per_node)
super().__init__(
name, script, script_path, device, devices_per_node, first_device
)
if not script and not script_path:
raise ValueError("Either script or script_path must be provided")

Expand All @@ -191,6 +208,8 @@ def __str__(self) -> str:
"s per node\n" if self.devices_per_node > 1 else " per node\n"
)
desc_str += "Devices: " + str(self.devices_per_node) + " " + devices_str
if self.first_device > 0:
desc_str += "First device: " + str(self.first_device) + "\n"
return desc_str


Expand All @@ -203,6 +222,7 @@ def __init__(
model_file: t.Optional[str] = None,
device: t.Literal["CPU", "GPU"] = "CPU",
devices_per_node: int = 1,
first_device: int = 0,
batch_size: int = 0,
min_batch_size: int = 0,
min_batch_timeout: int = 0,
Expand All @@ -227,6 +247,8 @@ def __init__(
:type device: str, optional
:param devices_per_node: number of devices to store the model on
:type devices_per_node: int
:param first_device: The first device to store the model on
:type first_device: int
:param batch_size: batch size for execution, defaults to 0
:type batch_size: int, optional
:param min_batch_size: minimum batch size for model execution, defaults to 0
Expand All @@ -240,7 +262,8 @@ def __init__(
:param outputs: model outputs (TF only), defaults to None
:type outputs: list[str], optional
"""
super().__init__(name, model, model_file, device, devices_per_node)
super().__init__(
name, model, model_file, device, devices_per_node, first_device)
self.backend = self._check_backend(backend)
if not model and not model_file:
raise ValueError("Either model or model_file must be provided")
Expand All @@ -264,6 +287,8 @@ def __str__(self) -> str:
"s per node\n" if self.devices_per_node > 1 else " per node\n"
)
desc_str += "Devices: " + str(self.devices_per_node) + " " + devices_str
if self.first_device > 0:
desc_str += "First_device: " + str(self.first_device) + "\n"
desc_str += "Backend: " + str(self.backend) + "\n"
if self.batch_size:
desc_str += "Batch size: " + str(self.batch_size) + "\n"
Expand Down
22 changes: 20 additions & 2 deletions smartsim/entity/ensemble.py
Original file line number Diff line number Diff line change
Expand Up @@ -362,6 +362,7 @@ def add_ml_model(
model_path: t.Optional[str] = None,
device: t.Literal["CPU", "GPU"] = "CPU",
devices_per_node: int = 1,
first_device: int = 0,
batch_size: int = 0,
min_batch_size: int = 0,
min_batch_timeout: int = 0,
Expand All @@ -388,6 +389,12 @@ def add_ml_model(
:type backend: str
:param device: name of device for execution, defaults to "CPU"
:type device: str, optional
:param devices_per_node: number of GPUs per node in multiGPU nodes,
defaults to 1
:type devices_per_node: int, optional
:param first_device: first device in multi-GPU nodes to use for execution,
defaults to 0; ignored if devices_per_node is 1
:type first_device: int, optional
:param batch_size: batch size for execution, defaults to 0
:type batch_size: int, optional
:param min_batch_size: minimum batch size for model execution, defaults to 0
Expand All @@ -408,6 +415,7 @@ def add_ml_model(
model_file=model_path,
device=device,
devices_per_node=devices_per_node,
first_device=first_device,
batch_size=batch_size,
min_batch_size=min_batch_size,
min_batch_timeout=min_batch_timeout,
Expand All @@ -426,6 +434,7 @@ def add_script(
script_path: t.Optional[str] = None,
device: t.Literal["CPU", "GPU"] = "CPU",
devices_per_node: int = 1,
first_device: int = 0,
) -> None:
"""TorchScript to launch with every entity belonging to this ensemble
Expand All @@ -452,13 +461,16 @@ def add_script(
:type device: str, optional
:param devices_per_node: number of devices on each host
:type devices_per_node: int
:param first_device: first device to use on each host
:type first_device: int
"""
db_script = DBScript(
name=name,
script=script,
script_path=script_path,
device=device,
devices_per_node=devices_per_node,
first_device=first_device,
)
self._db_scripts.append(db_script)
for entity in self.models:
Expand All @@ -470,6 +482,7 @@ def add_function(
function: t.Optional[str] = None,
device: t.Literal["CPU", "GPU"] = "CPU",
devices_per_node: int = 1,
first_device: int = 0,
) -> None:
"""TorchScript function to launch with every entity belonging to this ensemble
Expand All @@ -483,7 +496,9 @@ def add_function(
present, a number can be passed for specification e.g. "GPU:1".
Setting ``devices_per_node=N``, with N greater than one will result
in the model being stored in the first N devices of type ``device``.
in the script being stored in the first N devices of type ``device``;
alternatively, setting ``first_device=M`` will result in the script
being stored on devices M through M + N - 1.
:param name: key to store function under
:type name: str
Expand All @@ -493,9 +508,12 @@ def add_function(
:type device: str, optional
:param devices_per_node: number of devices on each host
:type devices_per_node: int
:param first_device: first device to use on each host
:type first_device: int
"""
db_script = DBScript(
name=name, script=function, device=device, devices_per_node=devices_per_node
name=name, script=function, device=device,
devices_per_node=devices_per_node, first_device=first_device
)
self._db_scripts.append(db_script)
for entity in self.models:
Expand Down
Loading

0 comments on commit b509efd

Please sign in to comment.