From f00c42601158d86bc951088eecb2f7fb92e07ea1 Mon Sep 17 00:00:00 2001 From: Bill Scherer <36514047+billschereriii@users.noreply.github.com> Date: Fri, 6 Oct 2023 14:21:36 -0500 Subject: [PATCH] Add support for MINBATCHTIMEOUT (#387) Add support for MINBATCHTIMEOUT [ committed by @billschereriii ] [ reviewed by @ashao @al-rigazzi] --- doc/changelog.rst | 5 +++++ smartsim/_core/entrypoints/colocated.py | 6 +++--- smartsim/_core/utils/redis.py | 2 ++ smartsim/entity/ensemble.py | 4 ++++ smartsim/entity/model.py | 4 ++++ 5 files changed, 18 insertions(+), 3 deletions(-) diff --git a/doc/changelog.rst b/doc/changelog.rst index e64f982f1..3d45a9f42 100644 --- a/doc/changelog.rst +++ b/doc/changelog.rst @@ -19,16 +19,21 @@ To be released at some future point in time Description +- Added support for MINBATCHTIMEOUT in model execution - Remove support for RedisAI 1.2.5, use RedisAI 1.2.7 commit Detailed Notes +- Added support for MINBATCHTIMEOUT in model execution, which caps the delay + waiting for a minimum number of model execution operations to accumulate + before executing them as a batch (PR387_) - RedisAI 1.2.5 is not supported anymore. The only RedisAI version is now 1.2.7. Since the officially released RedisAI 1.2.7 has a bug which breaks the build process on Mac OSX, it was decided to use commit 634916c_ from RedisAI's GitHub repository, where such bug has been fixed. This applies to all operating systems. (PR383_) + .. _PR387: https://github.com/CrayLabs/SmartSim/pull/387 .. _PR383: https://github.com/CrayLabs/SmartSim/pull/383 .. 
_634916c: https://github.com/RedisAI/RedisAI/commit/634916c722e718cc6ea3fad46e63f7d798f9adc2 diff --git a/smartsim/_core/entrypoints/colocated.py b/smartsim/_core/entrypoints/colocated.py index 81b1f35bb..45baf35cd 100644 --- a/smartsim/_core/entrypoints/colocated.py +++ b/smartsim/_core/entrypoints/colocated.py @@ -78,12 +78,10 @@ def launch_db_model(client: Client, db_model: t.List[str]) -> str: parser.add_argument("--devices_per_node", type=int) parser.add_argument("--batch_size", type=int, default=0) parser.add_argument("--min_batch_size", type=int, default=0) + parser.add_argument("--min_batch_timeout", type=int, default=0) parser.add_argument("--tag", type=str, default="") parser.add_argument("--inputs", nargs="+", default=None) parser.add_argument("--outputs", nargs="+", default=None) - - # Unused if we use SmartRedis - parser.add_argument("--min_batch_timeout", type=int, default=None) args = parser.parse_args(db_model) inputs = None @@ -106,6 +104,7 @@ def launch_db_model(client: Client, db_model: t.List[str]) -> str: num_gpus=args.devices_per_node, batch_size=args.batch_size, min_batch_size=args.min_batch_size, + min_batch_timeout=args.min_batch_timeout, tag=args.tag, inputs=inputs, outputs=outputs, @@ -118,6 +117,7 @@ def launch_db_model(client: Client, db_model: t.List[str]) -> str: device=args.device, batch_size=args.batch_size, min_batch_size=args.min_batch_size, + min_batch_timeout=args.min_batch_timeout, tag=args.tag, inputs=inputs, outputs=outputs, diff --git a/smartsim/_core/utils/redis.py b/smartsim/_core/utils/redis.py index 51f531e21..6570ab28b 100644 --- a/smartsim/_core/utils/redis.py +++ b/smartsim/_core/utils/redis.py @@ -172,6 +172,7 @@ def set_ml_model(db_model: DBModel, client: Client) -> None: device=device, batch_size=db_model.batch_size, min_batch_size=db_model.min_batch_size, + min_batch_timeout=db_model.min_batch_timeout, tag=db_model.tag, inputs=db_model.inputs, outputs=db_model.outputs, @@ -184,6 +185,7 @@ def 
set_ml_model(db_model: DBModel, client: Client) -> None: device=device, batch_size=db_model.batch_size, min_batch_size=db_model.min_batch_size, + min_batch_timeout=db_model.min_batch_timeout, tag=db_model.tag, inputs=db_model.inputs, outputs=db_model.outputs, diff --git a/smartsim/entity/ensemble.py b/smartsim/entity/ensemble.py index c3b034247..2781234e3 100644 --- a/smartsim/entity/ensemble.py +++ b/smartsim/entity/ensemble.py @@ -364,6 +364,7 @@ def add_ml_model( devices_per_node: int = 1, batch_size: int = 0, min_batch_size: int = 0, + min_batch_timeout: int = 0, tag: str = "", inputs: t.Optional[t.List[str]] = None, outputs: t.Optional[t.List[str]] = None, @@ -391,6 +392,8 @@ def add_ml_model( :type batch_size: int, optional :param min_batch_size: minimum batch size for model execution, defaults to 0 :type min_batch_size: int, optional + :param min_batch_timeout: time to wait for minimum batch size, defaults to 0 + :type min_batch_timeout: int, optional :param tag: additional tag for model information, defaults to "" :type tag: str, optional :param inputs: model inputs (TF only), defaults to None @@ -407,6 +410,7 @@ def add_ml_model( devices_per_node=devices_per_node, batch_size=batch_size, min_batch_size=min_batch_size, + min_batch_timeout=min_batch_timeout, tag=tag, inputs=inputs, outputs=outputs, diff --git a/smartsim/entity/model.py b/smartsim/entity/model.py index 35690c526..4f1d7d1e8 100644 --- a/smartsim/entity/model.py +++ b/smartsim/entity/model.py @@ -451,6 +451,7 @@ def add_ml_model( devices_per_node: int = 1, batch_size: int = 0, min_batch_size: int = 0, + min_batch_timeout: int = 0, tag: str = "", inputs: t.Optional[t.List[str]] = None, outputs: t.Optional[t.List[str]] = None, @@ -482,6 +483,8 @@ def add_ml_model( :type batch_size: int, optional :param min_batch_size: minimum batch size for model execution, defaults to 0 :type min_batch_size: int, optional + :param min_batch_timeout: time to wait for minimum batch size, defaults to 0 + :type 
min_batch_timeout: int, optional :param tag: additional tag for model information, defaults to "" :type tag: str, optional :param inputs: model inputs (TF only), defaults to None @@ -498,6 +501,7 @@ def add_ml_model( devices_per_node=devices_per_node, batch_size=batch_size, min_batch_size=min_batch_size, + min_batch_timeout=min_batch_timeout, tag=tag, inputs=inputs, outputs=outputs,