Skip to content

Commit

Permalink
refactor container creation codes a little
Browse files Browse the repository at this point in the history
  • Loading branch information
fregataa committed Jun 25, 2024
1 parent e59f214 commit 6af3d2c
Showing 1 changed file with 43 additions and 33 deletions.
76 changes: 43 additions & 33 deletions src/ai/backend/agent/docker/agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,30 @@ def container_from_docker_container(src: DockerContainer) -> Container:
)


async def _clean_scratch(
loop: asyncio.AbstractEventLoop,
scratch_type: str,
scratch_root: Path,
kernel_id: KernelId,
) -> None:
scratch_dir = scratch_root / str(kernel_id)
tmp_dir = scratch_root / f"{kernel_id}_tmp"
try:
if sys.platform.startswith("linux") and scratch_type == "memory":
await destroy_scratch_filesystem(scratch_dir)
await destroy_scratch_filesystem(tmp_dir)
await loop.run_in_executor(None, shutil.rmtree, scratch_dir)
await loop.run_in_executor(None, shutil.rmtree, tmp_dir)
elif sys.platform.startswith("linux") and scratch_type == "hostfile":
await destroy_loop_filesystem(scratch_root, kernel_id)
else:
await loop.run_in_executor(None, shutil.rmtree, scratch_dir)
except CalledProcessError:
pass
except FileNotFoundError:
pass


def _DockerError_reduce(self):
return (
type(self),
Expand Down Expand Up @@ -852,6 +876,18 @@ async def start_container(
if self.local_config["debug"]["log-kernel-config"]:
log.debug("full container config: {!r}", pretty(container_config))

async def _rollback_container_creation() -> None:
await _clean_scratch(
loop,
self.local_config["container"]["scratch-type"],
self.local_config["container"]["scratch-root"],
self.kernel_id,
)
self.port_pool.update(host_ports)
async with self.resource_lock:
for dev_name, device_alloc in resource_spec.allocations.items():
self.computers[dev_name].alloc_map.free(device_alloc)

# We are all set! Create and start the container.
async with closing_async(Docker()) as docker:
container: Optional[DockerContainer] = None
Expand Down Expand Up @@ -883,21 +919,7 @@ async def start_container(
raise
except Exception as e:
# Oops, we have to restore the allocated resources!
scratch_type = self.local_config["container"]["scratch-type"]
scratch_root = self.local_config["container"]["scratch-root"]
if sys.platform.startswith("linux") and scratch_type == "memory":
await destroy_scratch_filesystem(self.scratch_dir)
await destroy_scratch_filesystem(self.tmp_dir)
await loop.run_in_executor(None, shutil.rmtree, self.scratch_dir)
await loop.run_in_executor(None, shutil.rmtree, self.tmp_dir)
elif sys.platform.startswith("linux") and scratch_type == "hostfile":
await destroy_loop_filesystem(scratch_root, self.kernel_id)
else:
await loop.run_in_executor(None, shutil.rmtree, self.scratch_dir)
self.port_pool.update(host_ports)
async with self.resource_lock:
for dev_name, device_alloc in resource_spec.allocations.items():
self.computers[dev_name].alloc_map.free(device_alloc)
await _rollback_container_creation()
if container is not None:
raise ContainerCreationError(
container_id=container._id, message=f"unknown. {repr(e)}"
Expand Down Expand Up @@ -1512,24 +1534,12 @@ async def log_iter():
log.warning("container deletion timeout (k:{}, c:{})", kernel_id, container_id)

if not restarting:
scratch_type = self.local_config["container"]["scratch-type"]
scratch_root = self.local_config["container"]["scratch-root"]
scratch_dir = scratch_root / str(kernel_id)
tmp_dir = scratch_root / f"{kernel_id}_tmp"
try:
if sys.platform.startswith("linux") and scratch_type == "memory":
await destroy_scratch_filesystem(scratch_dir)
await destroy_scratch_filesystem(tmp_dir)
await loop.run_in_executor(None, shutil.rmtree, scratch_dir)
await loop.run_in_executor(None, shutil.rmtree, tmp_dir)
elif sys.platform.startswith("linux") and scratch_type == "hostfile":
await destroy_loop_filesystem(scratch_root, kernel_id)
else:
await loop.run_in_executor(None, shutil.rmtree, scratch_dir)
except CalledProcessError:
pass
except FileNotFoundError:
pass
await _clean_scratch(
loop,
self.local_config["container"]["scratch-type"],
self.local_config["container"]["scratch-root"],
kernel_id,
)

async def create_local_network(self, network_name: str) -> None:
async with closing_async(Docker()) as docker:
Expand Down

0 comments on commit 6af3d2c

Please sign in to comment.