Skip to content

Commit

Permalink
feat: Add scaling-group-type in agent.toml (#2796) (#2798)
Browse files Browse the repository at this point in the history
Co-authored-by: Joongi Kim <[email protected]>
  • Loading branch information
lablup-octodog and achimnol authored Sep 2, 2024
1 parent 43dc77b commit 1acd768
Show file tree
Hide file tree
Showing 5 changed files with 21 additions and 3 deletions.
1 change: 1 addition & 0 deletions changes/2796.feature.md
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Add an explicit configuration option `scaling-group-type` to `agent.toml` so that the agent can determine whether it belongs to an SFTP resource group or not
8 changes: 7 additions & 1 deletion configs/agent/sample.toml
Original file line number Diff line number Diff line change
Expand Up @@ -48,10 +48,16 @@ agent-sock-port = 6007
# This affects the per-node configuration scope.
# id = "i-something-special"

# Set the scaling group of this agent.
# Set the scaling group (aka resource group) of this agent.
# This affects the per-sgroup configuration scope.
scaling-group = "default"

# Set the type of scaling group (aka resource group) of this agent.
# - "compute": The agent hosts computing workloads, facing the internal cluster nodes.
# - "storage": The agent hosts storage-access containers, directly facing public/user-side networks.
# [default: "compute"]
# scaling-group-type = "compute"

# Set the volume mount path for the agent node.
# mount-path = "/vfroot"

Expand Down
7 changes: 6 additions & 1 deletion src/ai/backend/agent/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

from ai.backend.common import config
from ai.backend.common import validators as tx
from ai.backend.common.types import ResourceGroupType

from .affinity_map import AffinityPolicy
from .stats import StatModes
Expand Down Expand Up @@ -43,6 +44,9 @@
t.Key("region", default=None): t.Null | t.String,
t.Key("instance-type", default=None): t.Null | t.String,
t.Key("scaling-group", default="default"): t.String,
t.Key("scaling-group-type", default=ResourceGroupType.COMPUTE): t.Enum(
*(e.value for e in ResourceGroupType)
),
t.Key("pid-file", default=os.devnull): tx.Path(
type="file", allow_nonexisting=True, allow_devnull=True
),
Expand Down Expand Up @@ -81,7 +85,8 @@
t.Key("bind-host", default=""): t.String(allow_blank=True),
t.Key("advertised-host", default=None): t.Null | t.String(),
t.Key("port-range", default=(30000, 31000)): tx.PortRange,
t.Key("stats-type", default="docker"): t.Null | t.Enum(*[e.value for e in StatModes]),
t.Key("stats-type", default=StatModes.DOCKER): t.Null
| t.Enum(*(e.value for e in StatModes)),
t.Key("sandbox-type", default="docker"): t.Enum("docker", "jail"),
t.Key("jail-args", default=[]): t.List(t.String),
t.Key("scratch-type"): t.Enum("hostdir", "hostfile", "memory", "k8s-nfs"),
Expand Down
2 changes: 1 addition & 1 deletion src/ai/backend/agent/stats.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ def check_cgroup_available():
return not is_containerized() and sys.platform.startswith("linux")


class StatModes(enum.Enum):
class StatModes(enum.StrEnum):
CGROUP = "cgroup"
DOCKER = "docker"

Expand Down
6 changes: 6 additions & 0 deletions src/ai/backend/common/types.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,7 @@
"SlotName",
"IntrinsicSlotNames",
"ResourceSlot",
"ResourceGroupType",
"ReadableCIDR",
"HardwareMetadata",
"ModelServiceStatus",
Expand Down Expand Up @@ -294,6 +295,11 @@ class SessionResult(enum.StrEnum):
FAILURE = "failure"


class ResourceGroupType(enum.StrEnum):
COMPUTE = enum.auto()
STORAGE = enum.auto()


class ClusterMode(enum.StrEnum):
SINGLE_NODE = "single-node"
MULTI_NODE = "multi-node"
Expand Down

0 comments on commit 1acd768

Please sign in to comment.