Skip to content

Commit

Permalink
Improvements to colocated DB launch (#366)
Browse files Browse the repository at this point in the history
This PR improves the way colocated DBs are launched. An erroneous
warning has now been suppressed, and colocated launcher script names
contain the name of the entity being launched, this avoids race conditions.
Colocated launcher scripts have been moved to a hidden directory
named `.smartsim`, to reduce cluttering in the user directories.

[ committed by @al-rigazzi ]
[ reviewed by @MattToast @mellis13 @ashao @ankona ]
  • Loading branch information
al-rigazzi authored Oct 4, 2023
1 parent aaeee3b commit 9471ab4
Show file tree
Hide file tree
Showing 5 changed files with 113 additions and 48 deletions.
9 changes: 7 additions & 2 deletions conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -677,6 +677,8 @@ def setup_test_colo(
exp: Experiment,
db_args: t.Dict[str, t.Any],
colo_settings: t.Optional[t.Dict[str, t.Any]] = None,
colo_model_name: t.Optional[str] = None,
port: t.Optional[int] = test_port
) -> Model:
"""Setup things needed for setting up the colo pinning tests"""
# get test setup
Expand All @@ -688,12 +690,15 @@ def setup_test_colo(
colo_settings = exp.create_run_settings(
exe=sys.executable, exe_args=[sr_test_script]
)
colo_model = exp.create_model("colocated_model", colo_settings)
colo_name = colo_model_name if colo_model_name else "colocated_model"
colo_model = exp.create_model(colo_name, colo_settings)
colo_model.set_path(test_dir)

if db_type in ["tcp", "deprecated"]:
db_args["port"] = 6780
db_args["port"] = port
db_args["ifname"] = "lo"
if db_type == "uds" and colo_model_name is not None:
db_args["unix_socket"] = f"/tmp/{colo_model_name}.socket"

colocate_fun: t.Dict[str, t.Callable[..., None]] = {
"tcp": colo_model.colocate_db_tcp,
Expand Down
18 changes: 9 additions & 9 deletions smartsim/_core/entrypoints/colocated.py
Original file line number Diff line number Diff line change
Expand Up @@ -177,14 +177,14 @@ def main(
global DBPID # pylint: disable=global-statement

lo_address = current_ip("lo")
try:
ip_addresses = [
current_ip(interface) for interface in network_interface.split(",")
]

except ValueError as e:
logger.warning(e)
ip_addresses = []
ip_addresses = []
if network_interface:
try:
ip_addresses = [
current_ip(interface) for interface in network_interface.split(",")
]
except ValueError as e:
logger.warning(e)

if all(lo_address == ip_address for ip_address in ip_addresses) or not ip_addresses:
cmd = command + [f"--bind {lo_address}"]
Expand Down Expand Up @@ -317,7 +317,7 @@ def register_signal_handlers() -> None:
LOCK.acquire(timeout=0.1)
logger.debug(f"Starting colocated database on host: {socket.gethostname()}")

# make sure to register the cleanup before the start
# make sure to register the cleanup before we start
# the proecss so our signaller will be able to stop
# the database process.
register_signal_handlers()
Expand Down
8 changes: 7 additions & 1 deletion smartsim/_core/launcher/step/step.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
import time
import typing as t

from os import makedirs
from smartsim.error.errors import SmartSimError

from ....log import get_logger
Expand Down Expand Up @@ -75,7 +76,12 @@ def get_step_file(

def get_colocated_launch_script(self) -> str:
# prep step for colocated launch if specifed in run settings
script_path = self.get_step_file(script_name=".colocated_launcher.sh")
script_path = self.get_step_file(
script_name=osp.join(
".smartsim", f"colocated_launcher_{self.entity_name}.sh"
)
)
makedirs(osp.dirname(script_path), exist_ok=True)

db_settings: t.Dict[str, str] = {}
if isinstance(self.step_settings, RunSettings):
Expand Down
2 changes: 1 addition & 1 deletion tests/backends/test_dbmodel.py
Original file line number Diff line number Diff line change
Expand Up @@ -807,7 +807,7 @@ def test_inconsistent_params_db_model():
# Create and save ML model to filesystem
model, inputs, outputs = create_tf_cnn()
with pytest.raises(SSUnsupportedError) as ex:
db_model = DBModel(
DBModel(
"cnn",
"TF",
model=model,
Expand Down
124 changes: 89 additions & 35 deletions tests/test_colo_model_local.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,70 +37,81 @@
else:
supported_dbs = ["uds", "tcp", "deprecated"]

is_mac = sys.platform == 'darwin'
is_mac = sys.platform == "darwin"

@pytest.mark.skipif(not is_mac, reason='MacOS-only test')

@pytest.mark.skipif(not is_mac, reason="MacOS-only test")
def test_macosx_warning(fileutils, coloutils):
db_args = {"custom_pinning":[1]}
db_type = 'uds' # Test is insensitive to choice of db
db_args = {"custom_pinning": [1]}
db_type = "uds" # Test is insensitive to choice of db

exp = Experiment("colocated_model_defaults", launcher="local")
with pytest.warns(
RuntimeWarning,
match="CPU pinning is not supported on MacOSX. Ignoring pinning specification."
match="CPU pinning is not supported on MacOSX. Ignoring pinning specification.",
):
colo_model = coloutils.setup_test_colo(
_ = coloutils.setup_test_colo(
fileutils,
db_type,
exp,
db_args,
)


def test_unsupported_limit_app(fileutils, coloutils):
db_args = {"limit_app_cpus":True}
db_type = 'uds' # Test is insensitive to choice of db
db_args = {"limit_app_cpus": True}
db_type = "uds" # Test is insensitive to choice of db

exp = Experiment("colocated_model_defaults", launcher="local")
with pytest.raises(SSUnsupportedError):
colo_model = coloutils.setup_test_colo(
coloutils.setup_test_colo(
fileutils,
db_type,
exp,
db_args,
)


@pytest.mark.skipif(is_mac, reason="Unsupported on MacOSX")
@pytest.mark.parametrize("custom_pinning", [1,"10","#",1.,['a'],[1.]])
@pytest.mark.parametrize("custom_pinning", [1, "10", "#", 1.0, ["a"], [1.0]])
def test_unsupported_custom_pinning(fileutils, coloutils, custom_pinning):
db_type = "uds" # Test is insensitive to choice of db
db_type = "uds" # Test is insensitive to choice of db
db_args = {"custom_pinning": custom_pinning}

exp = Experiment("colocated_model_defaults", launcher="local")
with pytest.raises(TypeError):
colo_model = coloutils.setup_test_colo(
coloutils.setup_test_colo(
fileutils,
db_type,
exp,
db_args,
)


@pytest.mark.skipif(is_mac, reason="Unsupported on MacOSX")
@pytest.mark.parametrize("pin_list, num_cpus, expected", [
pytest.param(None, 2, "0,1", id="Automatic creation of pinned cpu list"),
pytest.param([1,2], 2, "1,2", id="Individual ids only"),
pytest.param([range(2),3], 3, "0,1,3", id="Mixed ranges and individual ids"),
pytest.param(range(3), 3, "0,1,2", id="Range only"),
pytest.param([range(8, 10), range(6, 1, -2)], 4, "2,4,6,8,9", id="Multiple ranges"),
])
@pytest.mark.parametrize(
"pin_list, num_cpus, expected",
[
pytest.param(None, 2, "0,1", id="Automatic creation of pinned cpu list"),
pytest.param([1, 2], 2, "1,2", id="Individual ids only"),
pytest.param([range(2), 3], 3, "0,1,3", id="Mixed ranges and individual ids"),
pytest.param(range(3), 3, "0,1,2", id="Range only"),
pytest.param(
[range(8, 10), range(6, 1, -2)], 4, "2,4,6,8,9", id="Multiple ranges"
),
],
)
def test_create_pinning_string(pin_list, num_cpus, expected):
assert Model._create_pinning_string(pin_list, num_cpus) == expected


@pytest.mark.parametrize("db_type", supported_dbs)
def test_launch_colocated_model_defaults(fileutils, coloutils, db_type, launcher="local"):
def test_launch_colocated_model_defaults(
fileutils, coloutils, db_type, launcher="local"
):
"""Test the launch of a model with a colocated database and local launcher"""

db_args = { }
db_args = {}

exp = Experiment("colocated_model_defaults", launcher=launcher)
colo_model = coloutils.setup_test_colo(
Expand All @@ -114,7 +125,9 @@ def test_launch_colocated_model_defaults(fileutils, coloutils, db_type, launcher
true_pinning = None
else:
true_pinning = "0"
assert colo_model.run_settings.colocated_db_settings["custom_pinning"] == true_pinning
assert (
colo_model.run_settings.colocated_db_settings["custom_pinning"] == true_pinning
)
exp.start(colo_model, block=True)
statuses = exp.get_status(colo_model)
assert all([stat == status.STATUS_COMPLETED for stat in statuses])
Expand All @@ -124,9 +137,50 @@ def test_launch_colocated_model_defaults(fileutils, coloutils, db_type, launcher
statuses = exp.get_status(colo_model)
assert all([stat == status.STATUS_COMPLETED for stat in statuses])


@pytest.mark.parametrize("db_type", supported_dbs)
def test_colocated_model_disable_pinning(fileutils, coloutils, db_type, launcher="local"):
def test_launch_multiple_colocated_models(
fileutils, coloutils, wlmutils, db_type, launcher="local"
):
"""Test the concurrent launch of two models with a colocated database and local launcher
"""

db_args = {}

exp = Experiment("multi_colo_models", launcher=launcher)
colo_models = [
coloutils.setup_test_colo(
fileutils,
db_type,
exp,
db_args,
colo_model_name="colo0",
port=wlmutils.get_test_port(),
),
coloutils.setup_test_colo(
fileutils,
db_type,
exp,
db_args,
colo_model_name="colo1",
port=wlmutils.get_test_port() + 1,
),
]

exp.start(*colo_models, block=True)
statuses = exp.get_status(*colo_models)
assert all([stat == status.STATUS_COMPLETED for stat in statuses])

# test restarting the colocated model
exp.start(*colo_models, block=True)
statuses = exp.get_status(*colo_models)
assert all([stat == status.STATUS_COMPLETED for stat in statuses])


@pytest.mark.parametrize("db_type", supported_dbs)
def test_colocated_model_disable_pinning(
fileutils, coloutils, db_type, launcher="local"
):
exp = Experiment("colocated_model_pinning_auto_1cpu", launcher=launcher)
db_args = {
"db_cpus": 1,
Expand All @@ -144,9 +198,11 @@ def test_colocated_model_disable_pinning(fileutils, coloutils, db_type, launcher
statuses = exp.get_status(colo_model)
assert all([stat == status.STATUS_COMPLETED for stat in statuses])

@pytest.mark.parametrize("db_type", supported_dbs)
def test_colocated_model_pinning_auto_2cpu(fileutils, coloutils, db_type, launcher="local"):

@pytest.mark.parametrize("db_type", supported_dbs)
def test_colocated_model_pinning_auto_2cpu(
fileutils, coloutils, db_type, launcher="local"
):
exp = Experiment("colocated_model_pinning_auto_2cpu", launcher=launcher)

db_args = {
Expand All @@ -164,22 +220,22 @@ def test_colocated_model_pinning_auto_2cpu(fileutils, coloutils, db_type, launch
true_pinning = None
else:
true_pinning = "0,1"
assert colo_model.run_settings.colocated_db_settings["custom_pinning"] == true_pinning
assert (
colo_model.run_settings.colocated_db_settings["custom_pinning"] == true_pinning
)
exp.start(colo_model, block=True)
statuses = exp.get_status(colo_model)
assert all([stat == status.STATUS_COMPLETED for stat in statuses])


@pytest.mark.skipif(is_mac, reason="unsupported on MacOSX")
@pytest.mark.parametrize("db_type", supported_dbs)
def test_colocated_model_pinning_range(fileutils, coloutils, db_type, launcher="local"):
# Check to make sure that the CPU mask was correctly generated

exp = Experiment("colocated_model_pinning_manual", launcher=launcher)

db_args = {
"db_cpus": 2,
"custom_pinning": range(2)
}
db_args = {"db_cpus": 2, "custom_pinning": range(2)}

colo_model = coloutils.setup_test_colo(
fileutils,
Expand All @@ -192,17 +248,15 @@ def test_colocated_model_pinning_range(fileutils, coloutils, db_type, launcher="
statuses = exp.get_status(colo_model)
assert all([stat == status.STATUS_COMPLETED for stat in statuses])


@pytest.mark.skipif(is_mac, reason="unsupported on MacOSX")
@pytest.mark.parametrize("db_type", supported_dbs)
def test_colocated_model_pinning_list(fileutils, coloutils, db_type, launcher="local"):
# Check to make sure that the CPU mask was correctly generated

exp = Experiment("colocated_model_pinning_manual", launcher=launcher)

db_args = {
"db_cpus": 1,
"custom_pinning": [1]
}
db_args = {"db_cpus": 1, "custom_pinning": [1]}

colo_model = coloutils.setup_test_colo(
fileutils,
Expand All @@ -213,4 +267,4 @@ def test_colocated_model_pinning_list(fileutils, coloutils, db_type, launcher="l
assert colo_model.run_settings.colocated_db_settings["custom_pinning"] == "1"
exp.start(colo_model, block=True)
statuses = exp.get_status(colo_model)
assert all([stat == status.STATUS_COMPLETED for stat in statuses])
assert all([stat == status.STATUS_COMPLETED for stat in statuses])

0 comments on commit 9471ab4

Please sign in to comment.