Skip to content

Commit

Permalink
Merge pull request #480 from lsst-sqre/tickets/DM-48838
Browse files Browse the repository at this point in the history
DM-48838: Wait for service account creation on lab spawn
  • Loading branch information
rra authored Feb 12, 2025
2 parents cec7137 + adaecc2 commit ebd70c8
Show file tree
Hide file tree
Showing 15 changed files with 772 additions and 595 deletions.
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ repos:
- id: trailing-whitespace

- repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.9.4
rev: v0.9.6
hooks:
- id: ruff
args: [--fix, --exit-non-zero-on-fix]
Expand Down
3 changes: 3 additions & 0 deletions changelog.d/20250211_112354_rra_DM_48838.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
### Bug fixes

- Wait for the default service account of the user's lab namespace to be created before creating the `Pod` object. Creation of the service account can be slow when Kubernetes is busy, and creation of the `Pod` object will fail if the service account does not exist.
4 changes: 2 additions & 2 deletions controller/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -24,11 +24,11 @@ dependencies = [
"httpx",
"fastapi>=0.100",
"jinja2",
"kubernetes_asyncio",
"kubernetes-asyncio",
"pydantic>2",
"pydantic-settings",
"PyYAML",
"safir[kubernetes]>=6.2.0",
"safir[kubernetes]>=9.3.0",
"semver",
"sse-starlette",
"uvicorn[standard]",
Expand Down
321 changes: 158 additions & 163 deletions controller/requirements/dev.txt

Large diffs are not rendered by default.

243 changes: 126 additions & 117 deletions controller/requirements/main.txt

Large diffs are not rendered by default.

84 changes: 84 additions & 0 deletions controller/src/controller/storage/kubernetes/deleter.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
V1Job,
V1PersistentVolumeClaim,
V1Service,
V1ServiceAccount,
)
from structlog.stdlib import BoundLogger

Expand All @@ -36,6 +37,7 @@
"JobStorage",
"KubernetesObjectDeleter",
"PersistentVolumeClaimStorage",
"ServiceAccountStorage",
"ServiceStorage",
]

Expand Down Expand Up @@ -272,6 +274,64 @@ async def list(
) from e
return objs.items

async def wait_for_creation(
self, name: str, namespace: str, timeout: Timeout
) -> None:
"""Wait for an object to be created.
Parameters
----------
name
Name of the object.
namespace
Namespace of the object.
timeout
How long to wait for the object to be created.
Raises
------
ControllerTimeoutError
Raised if the timeout expired.
KubernetesError
Raised for exceptions from the Kubernetes API server.
"""
logger = self._logger.bind(name=name, namespace=namespace)
obj = await self.read(name, namespace, timeout)
if obj:
return

# Wait for the object to be created.
watch_timeout = timeout.partial(timedelta(seconds=timeout.left() - 2))
watcher = KubernetesWatcher(
method=self._list,
object_type=self._type,
kind=self._kind,
name=name,
namespace=namespace,
timeout=watch_timeout,
logger=logger,
)
try:
async with watch_timeout.enforce():
async for event in watcher.watch():
if event.action == WatchEventType.ADDED:
return
except ControllerTimeoutError:
# If the watch had to be restarted because the resource version
# was too old and the object was deleted while the watch was
# restarting, we could have missed the delete event. Therefore,
# before timing out, do a final check with a short timeout to see
# if the object is gone.
read_timeout = timeout.partial(timedelta(seconds=2))
if await self.read(name, namespace, read_timeout):
return
raise
finally:
await watcher.close()

# This should be impossible; someone called stop on the watcher.
raise RuntimeError("Wait for object deletion unexpectedly stopped")

async def wait_for_deletion(
self, name: str, namespace: str, timeout: Timeout
) -> None:
Expand Down Expand Up @@ -404,3 +464,27 @@ def __init__(self, api_client: ApiClient, logger: BoundLogger) -> None:
kind="Service",
logger=logger,
)


class ServiceAccountStorage(KubernetesObjectDeleter[V1ServiceAccount]):
"""Storage layer for ``Service`` objects.
Parameters
----------
api_client
Kubernetes API client.
logger
Logger to use.
"""

def __init__(self, api_client: ApiClient, logger: BoundLogger) -> None:
api = client.CoreV1Api(api_client)
super().__init__(
create_method=api.create_namespaced_service_account,
delete_method=api.delete_namespaced_service_account,
list_method=api.list_namespaced_service_account,
read_method=api.read_namespaced_service_account,
object_type=V1ServiceAccount,
kind="ServiceAccount",
logger=logger,
)
14 changes: 13 additions & 1 deletion controller/src/controller/storage/kubernetes/lab.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,11 @@
ResourceQuotaStorage,
SecretStorage,
)
from .deleter import PersistentVolumeClaimStorage, ServiceStorage
from .deleter import (
PersistentVolumeClaimStorage,
ServiceAccountStorage,
ServiceStorage,
)
from .namespace import NamespaceStorage
from .pod import PodStorage

Expand Down Expand Up @@ -53,6 +57,7 @@ def __init__(self, api_client: ApiClient, logger: BoundLogger) -> None:
self._quota = ResourceQuotaStorage(api_client, logger)
self._secret = SecretStorage(api_client, logger)
self._service = ServiceStorage(api_client, logger)
self._service_account = ServiceAccountStorage(api_client, logger)

async def create(self, objects: LabObjects, timeout: Timeout) -> None:
"""Create all of the Kubernetes objects for a user's lab.
Expand Down Expand Up @@ -86,6 +91,13 @@ async def create(self, objects: LabObjects, timeout: Timeout) -> None:
namespace, objects.network_policy, timeout
)
await self._service.create(namespace, objects.service, timeout)

# The pod creation will fail if the namespace default service account
# doesn't exist yet, and sometimes that takes a while. Wait for it
# before creating the pod.
await self._service_account.wait_for_creation(
"default", namespace, timeout
)
await self._pod.create(namespace, objects.pod, timeout)

async def delete_namespace(self, name: str, timeout: Timeout) -> None:
Expand Down
19 changes: 18 additions & 1 deletion controller/tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,13 +3,19 @@
from __future__ import annotations

from collections.abc import AsyncIterator, Iterator
from contextlib import contextmanager

import pytest
import pytest_asyncio
import respx
from asgi_lifespan import LifespanManager
from fastapi import FastAPI
from httpx import ASGITransport, AsyncClient
from kubernetes_asyncio.client import (
V1Namespace,
V1ObjectMeta,
V1ServiceAccount,
)
from pydantic import SecretStr
from safir.testing.kubernetes import MockKubernetesApi, patch_kubernetes
from safir.testing.slack import MockSlackWebhook, mock_slack_webhook
Expand Down Expand Up @@ -125,7 +131,18 @@ def mock_gar() -> Iterator[MockArtifactRegistry]:

@pytest.fixture
def mock_kubernetes() -> Iterator[MockKubernetesApi]:
yield from patch_kubernetes()
with contextmanager(patch_kubernetes)() as mock:
# Add a hook to create the default service account on namespace
# creation.
async def create_default_sa(namespace: V1Namespace) -> None:
namespace = namespace.metadata.name
sa = V1ServiceAccount(
metadata=V1ObjectMeta(name="default", namespace=namespace)
)
await mock.create_namespaced_service_account(namespace, sa)

mock.register_create_hook_for_test("Namespace", create_default_sa)
yield mock


@pytest.fixture
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -404,6 +404,14 @@
}
}
},
{
"apiVersion": "v1",
"kind": "ServiceAccount",
"metadata": {
"name": "default",
"namespace": "fileservers"
}
},
{
"apiVersion": "gafaelfawr.lsst.io/v1alpha1",
"kind": "GafaelfawrIngress",
Expand Down
8 changes: 8 additions & 0 deletions controller/tests/data/standard/output/lab-objects.json
Original file line number Diff line number Diff line change
Expand Up @@ -789,5 +789,13 @@
"nublado.lsst.io/user": "rachel"
}
}
},
{
"apiVersion": "v1",
"kind": "ServiceAccount",
"metadata": {
"name": "default",
"namespace": "userlabs-rachel"
}
}
]
8 changes: 5 additions & 3 deletions controller/tests/handlers/files_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -171,8 +171,8 @@ async def test_cleanup_on_pod_exit(

# Check that the fileserver objects have been deleted
objs = mock_kubernetes.get_namespace_objects_for_test(namespace)
assert len(objs) == 1
assert objs[0].kind == "Namespace"
assert len(objs) == 2
assert sorted(o.kind for o in objs) == ["Namespace", "ServiceAccount"]


@pytest.mark.asyncio
Expand Down Expand Up @@ -437,4 +437,6 @@ def callback(method: str, *args: Any) -> None:
r = await client.get("/nublado/fileserver/v1/users")
assert r.json() == []
objs = mock_kubernetes.get_namespace_objects_for_test(namespace)
assert [o for o in objs if o.kind != "Namespace"] == []
assert [
o for o in objs if o.kind not in ("Namespace", "ServiceAccount")
] == []
41 changes: 41 additions & 0 deletions controller/tests/handlers/labs_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
CoreV1Event,
V1ObjectMeta,
V1ObjectReference,
V1ServiceAccount,
)
from safir.metrics import MockEventPublisher
from safir.testing.kubernetes import MockKubernetesApi
Expand Down Expand Up @@ -947,3 +948,43 @@ async def test_quota_no_spawn(client: AsyncClient) -> None:
headers=user.to_headers(),
)
assert r.status_code == 403


@pytest.mark.asyncio
async def test_wait_for_sa(
client: AsyncClient,
factory: Factory,
user: GafaelfawrUser,
mock_kubernetes: MockKubernetesApi,
) -> None:
"""Test waiting for the default service account during lab creation."""
lab = read_input_lab_specification_json("base", "lab-specification")

# Clear the callback added by the fixture so that we do not automatically
# create the default service account.
mock_kubernetes.register_create_hook_for_test("Namespace", None)

# Create the lab and confirm that it entered pending state.
r = await client.post(
f"/nublado/spawner/v1/labs/{user.username}/create",
json={"options": lab.options.model_dump(), "env": lab.env},
headers=user.to_headers(),
)
assert r.status_code == 201
await asyncio.sleep(0.1)
r = await client.get(f"/nublado/spawner/v1/labs/{user.username}")
assert r.status_code == 200
assert r.json()["status"] == "pending"

# Create the service account, which should cause the lab to complete.
namespace = f"userlabs-{user.username}"
await mock_kubernetes.create_namespaced_service_account(
namespace,
V1ServiceAccount(
metadata=V1ObjectMeta(name="default", namespace=namespace)
),
)
await asyncio.sleep(0.1)
r = await client.get(f"/nublado/spawner/v1/labs/{user.username}")
assert r.status_code == 200
assert r.json()["status"] == "running"
Loading

0 comments on commit ebd70c8

Please sign in to comment.