Skip to content

Commit

Permalink
- [Bug] The shm_size property in resources doesn't take effect #1006
Browse files Browse the repository at this point in the history
  • Loading branch information
peterschmidt85 authored and Sergey Mezentsev committed Mar 13, 2024
1 parent 2af5da2 commit 0b8011c
Show file tree
Hide file tree
Showing 7 changed files with 16 additions and 4 deletions.
2 changes: 2 additions & 0 deletions runner/internal/shim/api/schemas.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ type DockerTaskBody struct {
Username string `json:"username"`
Password string `json:"password"`
ImageName string `json:"image_name"`
ShmSize int64 `json:"shm_size"`
}

type StopBody struct {
Expand All @@ -30,6 +31,7 @@ func (ra DockerTaskBody) TaskParams() shim.DockerImageConfig {
ImageName: ra.ImageName,
Username: ra.Username,
Password: ra.Password,
ShmSize: ra.ShmSize,
}
return res
}
3 changes: 2 additions & 1 deletion runner/internal/shim/docker.go
Original file line number Diff line number Diff line change
Expand Up @@ -185,7 +185,8 @@ func createContainer(ctx context.Context, client docker.APIClient, dockerParams
Resources: container.Resources{
DeviceRequests: gpuRequest,
},
Mounts: mounts,
Mounts: mounts,
ShmSize: taskParams.ShmSize,
}
resp, err := client.ContainerCreate(ctx, containerConfig, hostConfig, nil, nil, "")
if err != nil {
Expand Down
1 change: 1 addition & 0 deletions runner/internal/shim/models.go
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ type DockerImageConfig struct {
Username string
Password string
ImageName string
ShmSize int64
}

func (ra DockerImageConfig) EncodeRegistryAuth() (string, error) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -352,12 +352,14 @@ def _process_provisioning_with_shim(
username=interpolate(registry_auth.username),
password=interpolate(registry_auth.password),
image_name=job_spec.image_name,
shm_size=job_spec.requirements.resources.shm_size,
)
else:
shim_client.submit(
username="",
password="",
image_name=job_spec.image_name,
shm_size=job_spec.requirements.resources.shm_size,
)

job_model.status = JobStatus.PULLING
Expand Down
1 change: 1 addition & 0 deletions src/dstack/_internal/server/schemas/runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,7 @@ class DockerImageBody(BaseModel):
username: str
password: str
image_name: str
shm_size: int


class StopBody(BaseModel):
Expand Down
6 changes: 4 additions & 2 deletions src/dstack/_internal/server/services/runner/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
import requests.exceptions

from dstack._internal.core.models.repos.remote import RemoteRepoCreds
from dstack._internal.core.models.resources import Memory
from dstack._internal.core.models.runs import JobSpec, RunSpec
from dstack._internal.server.schemas.runner import (
DockerImageBody,
Expand Down Expand Up @@ -102,9 +103,10 @@ def healthcheck(self, unmask_exeptions: bool = False) -> Optional[HealthcheckRes
raise
return None

def submit(self, username: str, password: str, image_name: str):
def submit(self, username: str, password: str, image_name: str, shm_size: Optional[Memory]):
# Convert the requested shared-memory size to an integer byte count for the shim;
# 0 is sent when no size was requested. The factor must be 1024 ** 3 (the previous
# `1014` was a typo that under-sized the value by ~1%).
# NOTE(review): presumably `Memory` is expressed in GiB — confirm against its definition.
_shm_size = int(shm_size * 1024 * 1024 * 1024) if shm_size else 0
post_body = DockerImageBody(
username=username, password=password, image_name=image_name
username=username, password=password, image_name=image_name, shm_size=_shm_size
).dict()
resp = requests.post(
self._url("/api/submit"),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -221,7 +221,10 @@ async def test_provisioning_shim(self, test_db, session: AsyncSession):
RunnerTunnelMock.assert_called_once()
ShimClientMock.return_value.healthcheck.assert_called_once()
ShimClientMock.return_value.submit.assert_called_once_with(
username="", password="", image_name="dstackai/base:py3.11-0.4rc4-cuda-12.1"
username="",
password="",
image_name="dstackai/base:py3.11-0.4rc4-cuda-12.1",
shm_size=None,
)
await session.refresh(job)
assert job is not None
Expand Down

0 comments on commit 0b8011c

Please sign in to comment.