Skip to content

Commit

Permalink
Merge pull request #3 from rage/add-support-for-custom-memory-limit
Browse files Browse the repository at this point in the history
Add support for custom memory and cpu limit
  • Loading branch information
nygrenh authored Jan 12, 2024
2 parents 0a68404 + 972c0b3 commit 2eccf40
Show file tree
Hide file tree
Showing 5 changed files with 133 additions and 30 deletions.
23 changes: 17 additions & 6 deletions src/controllers.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,14 +2,17 @@ import Router from "koa-router"
import { CustomContext, CustomState } from "./types"
import multer from "@koa/multer"
import gateKeeper, {
INSTANCES,
CPU_CORES_IN_SYSTEM,
getBusyInstances,
freeInstance,
getReservedMemory,
TOTAL_SYSTEM_MEMORY_GB,
} from "./middleware/gatekeeper"
import { BadRequestError } from "./util/error"
import handleSubmission, { RunResult } from "./sandbox"
import Axios from "axios"
import { SupportedMimeTypes } from "./util/file_extractor"
import extractResourceLimitsFromRequest from "./util/extractResourceLimitsFromRequest"

const upload = multer({ dest: "uploads/" })
export const ALLOWED_ALTERNATIVE_DOCKER_IMAGES = ["nygrenh/sandbox-next"]
Expand All @@ -19,7 +22,12 @@ const api = new Router<CustomState, CustomContext>()
.get("/status.json", async (ctx) => {
ctx.body = {
busy_instances: getBusyInstances(),
total_instances: INSTANCES,
// This is intentionally the same value as busy_instances. reserved_cpu_cores is the more descriptive name, but busy_instances is kept for backwards compatibility.
reserved_cpu_cores: getBusyInstances(),

total_instances: CPU_CORES_IN_SYSTEM,
reserved_memory: getReservedMemory(),
total_memory: TOTAL_SYSTEM_MEMORY_GB,
}
})

Expand All @@ -29,11 +37,13 @@ const api = new Router<CustomState, CustomContext>()
// concurrent tasks in a middleware because we want to do it before receiving
// the uploaded file.

const resourceLimits = extractResourceLimitsFromRequest(ctx.request.body)

if (
ctx.file.mimetype !== "application/x-tar" &&
ctx.file.mimetype !== "application/zstd"
) {
freeInstance()
freeInstance(resourceLimits)
throw new BadRequestError(
`Uploaded file type is not supported! Mimetype was: ${ctx.file.mimetype}}. Supported types are application/x-tar and application/zstd.`,
)
Expand All @@ -47,7 +57,7 @@ const api = new Router<CustomState, CustomContext>()
ALLOWED_ALTERNATIVE_DOCKER_IMAGES.indexOf(dockerImage) !== -1
)
) {
freeInstance()
freeInstance(resourceLimits)
throw new BadRequestError("Docker image was not whitelisted.")
}

Expand All @@ -66,12 +76,13 @@ const api = new Router<CustomState, CustomContext>()
dockerImage,
ctx.log.child({ async: true }),
ctx.file.mimetype as SupportedMimeTypes,
resourceLimits,
)
} catch (reason1) {
ctx.log.error("Handling submission failed.", { reason: reason1 })
return
} finally {
freeInstance()
freeInstance(resourceLimits)
}

ctx.log.info(`Notifying ${ctx.request.body.notify}...`, {
Expand All @@ -89,7 +100,7 @@ const api = new Router<CustomState, CustomContext>()
exit_code: output.exit_code,
})
} catch (reason2) {
ctx.log.error("Notifying failed", { error: reason2.message })
ctx.log.error("Notifying failed", { error: (reason2 as Error).message })
}
})

Expand Down
39 changes: 29 additions & 10 deletions src/middleware/gatekeeper.ts
Original file line number Diff line number Diff line change
@@ -1,31 +1,50 @@
import { CustomContext } from "../types"
import { cpus } from "os"
import { cpus, totalmem } from "os"
import { SandboxBusyError } from "../util/error"
import extractResourceLimitsFromRequest, {
ResourceLimits,
} from "../util/extractResourceLimitsFromRequest"

export const INSTANCES = cpus().length
let busyInstances = 0
export const CPU_CORES_IN_SYSTEM = cpus().length
export const TOTAL_SYSTEM_MEMORY_GB = totalmem() / 1024 ** 3

let reservedCPUCores = 0
let reservedMemory = 0

export function getBusyInstances(): number {
return busyInstances
return reservedCPUCores
}

/**
 * Returns the amount of memory currently reserved by accepted submissions.
 *
 * The value is the module-level counter that `reserveInstance` increases and
 * `freeInstance` decreases by each submission's `memoryGB` limit, so it is
 * expressed in gigabytes.
 */
export function getReservedMemory(): number {
return reservedMemory
}

export function freeInstance(): void {
busyInstances--
export function freeInstance(limits: ResourceLimits): void {
reservedCPUCores -= limits.cpus
reservedMemory -= limits.memoryGB
}

function reserveInstance() {
busyInstances++
function reserveInstance(limits: ResourceLimits): void {
reservedCPUCores += limits.cpus
reservedMemory += limits.memoryGB
}

// Enforces the server is not processing too many submissions at once.
const gateKeeper = async (
ctx: CustomContext,
next: () => Promise<unknown>,
): Promise<void> => {
if (busyInstances >= INSTANCES) {
const limits = extractResourceLimitsFromRequest(ctx.request.body)
console.info(
`Sandbox sumbission requesting ${limits.memoryGB}GB of memory and ${limits.cpus} CPUs`,
)
if (reservedCPUCores + limits.cpus > CPU_CORES_IN_SYSTEM) {
throw new SandboxBusyError()
}
if (reservedMemory + limits.memoryGB > TOTAL_SYSTEM_MEMORY_GB) {
throw new SandboxBusyError()
}
reserveInstance()
reserveInstance(limits)
await next()
}

Expand Down
25 changes: 21 additions & 4 deletions src/sandbox.ts
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ import winston from "winston"
import { exec as origExec } from "child_process"
import { readFile as origReadFile, unlink as origUnlink } from "fs"
import extract, { SupportedMimeTypes } from "./util/file_extractor"
import { ResourceLimits } from "./util/extractResourceLimitsFromRequest"
const exec = promisify(origExec)
const readFile = promisify(origReadFile)
const unlink = promisify(origUnlink)
Expand All @@ -29,6 +30,7 @@ const handleSubmission = async (
dockerImage: string | undefined,
log: winston.Logger,
mimetype: SupportedMimeTypes,
resourceLimits: ResourceLimits,
): Promise<RunResult> => {
log.info("Handling submission")
const outputPath = join("work", id)
Expand All @@ -38,7 +40,13 @@ const handleSubmission = async (
await exec(`chmod -R 777 ${outputPath}`)
try {
await exec(`chmod -R 777 ${outputPath}`)
const results = await runTests(outputPath, id, dockerImage, log)
const results = await runTests(
outputPath,
id,
dockerImage,
log,
resourceLimits,
)
return results
} catch (e) {
log.error(`Error while running: ${e}`)
Expand All @@ -62,6 +70,7 @@ async function runTests(
submission_id: string,
dockerImage: string | undefined,
log: winston.Logger,
resourceLimits: ResourceLimits,
): Promise<RunResult> {
const id = `sandbox-submission-${submission_id}`
let status = "failed"
Expand All @@ -79,11 +88,19 @@ async function runTests(
const image = dockerImage || "nygrenh/sandbox-next"
let command
if (SUPERDEBUG) {
command = `docker create --name '${id}' --memory 2G --kernel-memory=50M --pids-limit=200 --ulimit nproc=10000:10000 --cpus 1 --mount type=bind,source=${resolve(
command = `docker create --name '${id}' --memory '${
resourceLimits.memoryGB
}G' --kernel-memory=50M --pids-limit=200 --ulimit nproc=10000:10000 --cpus '${
resourceLimits.cpus
}' --mount type=bind,source=${resolve(
path,
)},target=/app -it '${image}' /bin/sleep infinity `
} else {
command = `docker create --name '${id}' --network none --memory 2G --kernel-memory=50M --pids-limit=200 --ulimit nproc=10000:10000 --cpus 1 --cap-drop SETPCAP --cap-drop SETFCAP --cap-drop AUDIT_WRITE --cap-drop SETGID --cap-drop SETUID --cap-drop NET_BIND_SERVICE --cap-drop SYS_CHROOT --cap-drop NET_RAW --mount type=bind,source=${resolve(
command = `docker create --name '${id}' --network none --memory '${
resourceLimits.memoryGB
}G' --kernel-memory=50M --pids-limit=200 --ulimit nproc=10000:10000 --cpus '${
resourceLimits.cpus
}' --cap-drop SETPCAP --cap-drop SETFCAP --cap-drop AUDIT_WRITE --cap-drop SETGID --cap-drop SETUID --cap-drop NET_BIND_SERVICE --cap-drop SYS_CHROOT --cap-drop NET_RAW --mount type=bind,source=${resolve(
path,
)},target=/app -it '${image}' /app/init`
}
Expand Down Expand Up @@ -114,7 +131,7 @@ async function runTests(
} catch (e) {
const executionEndTime = new Date().getTime()
const durationMs = executionEndTime - executionStartTime
log.error("Running tests failed", { error: e.message })
log.error("Running tests failed", { error: (e as Error).message })
// If the process died within the last 5 seconds before timeout, it was
// likely a timeout.
if (durationMs > timeout_ms - 5000) {
Expand Down
23 changes: 23 additions & 0 deletions src/util/extractResourceLimitsFromRequest.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
/** Resource limits applied to a single sandbox submission. */
export interface ResourceLimits {
  /** Memory limit in gigabytes. */
  memoryGB: number
  /** Number of CPUs (may be fractional). */
  cpus: number
}

const DEFAULT_MEMORY_REQUEST_GB = 1
const DEFAULT_CPUS_REQUEST = 1
const MAX_MEMORY_REQUEST_GB = 4
const MAX_CPUS_REQUEST = 2

/**
 * Converts one raw request field into a usable limit.
 *
 * @param raw - The value taken from the request body (usually a string from a form field, or undefined).
 * @param fallback - Value used when the field is missing or unusable.
 * @param max - Upper bound; larger requests are reduced to this value.
 * @returns A positive number no greater than `max`.
 */
function parseLimit(raw: unknown, fallback: number, max: number): number {
  const value = Number(raw ?? fallback)
  // NaN fails every comparison, so it would slip past a plain `> max` clamp
  // and then poison any counter it is added to. Zero and negative values are
  // rejected as well: they would reserve nothing (or give capacity back), and
  // a zero limit is presumably treated by Docker as "no limit" rather than
  // "nothing allowed" — see the Docker resource constraint documentation.
  if (Number.isNaN(value) || value <= 0) {
    return fallback
  }
  return Math.min(value, max)
}

/**
 * Extracts and validates the cpu and memory requests of a submission.
 *
 * Reads `memory_limit_gb` and `cpu_limit` from the request body. Missing or
 * invalid values (non-numeric, zero, negative) fall back to the default of 1,
 * and requests above the maximum are reduced to the maximum. A missing request
 * body is treated like an empty one.
 *
 * @param requestBody - The parsed request body; may be null or undefined.
 * @returns The limits to reserve for and enforce on the submission.
 */
export default function extractResourceLimitsFromRequest(
  // eslint-disable-next-line @typescript-eslint/no-explicit-any, @typescript-eslint/explicit-module-boundary-types
  requestBody: any,
): ResourceLimits {
  return {
    memoryGB: parseLimit(
      requestBody?.memory_limit_gb,
      DEFAULT_MEMORY_REQUEST_GB,
      MAX_MEMORY_REQUEST_GB,
    ),
    cpus: parseLimit(
      requestBody?.cpu_limit,
      DEFAULT_CPUS_REQUEST,
      MAX_CPUS_REQUEST,
    ),
  }
}
53 changes: 43 additions & 10 deletions tests/submissions.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -50,14 +50,47 @@ test("POST /tasks.json works", async () => {
"docker_image",
"eu.gcr.io/moocfi-public/tmc-sandbox-tmc-langs-rust",
)
.field("token", "SUPER_SECERET")
.field("token", "SUPER_SECRET")
.field("notify", notifyAddress)
.set("Accept", "application/json")
.expect("Content-Type", /json/)
.expect(200)
},
)
expect(notifyResult.token).toBe("SUPER_SECERET")
expect(notifyResult.token).toBe("SUPER_SECRET")
expect(notifyResult.exit_code).toBe("0")
expect(notifyResult.status).toBe("finished")
expect(notifyResult.vm_log.length).toBeGreaterThan(5)
const testOutput = JSON.parse(notifyResult.test_output)
expect(testOutput.status).toBe("PASSED")
expect(testOutput.testResults.length).toBe(1)
})

test("POST /tasks.json with higher resource limits works", async () => {
jest.setTimeout(60000)
const notifyResult: NotifyResult = await new Promise(
async (resolve, _reject) => {
const notifyAddress = createResultServer((res) => {
resolve(res)
})

await request(server)
.post("/tasks.json")
.attach("file", "tests/data/submission.tar")
.field(
"docker_image",
"eu.gcr.io/moocfi-public/tmc-sandbox-tmc-langs-rust",
)
.field("memory_limit_gb", "3")
.field("cpu_limit", "2")
.field("token", "SUPER_SECRET")
.field("notify", notifyAddress)
.set("Accept", "application/json")
.expect("Content-Type", /json/)
.expect(200)
},
)
expect(notifyResult.token).toBe("SUPER_SECRET")
expect(notifyResult.exit_code).toBe("0")
expect(notifyResult.status).toBe("finished")
expect(notifyResult.vm_log.length).toBeGreaterThan(5)
Expand All @@ -83,15 +116,15 @@ test("POST /tasks.json works with .tar.zst files", async () => {
"docker_image",
"eu.gcr.io/moocfi-public/tmc-sandbox-tmc-langs-rust",
)
.field("token", "SUPER_SECERET")
.field("token", "SUPER_SECRET")
.field("notify", notifyAddress)
.set("Accept", "application/json")
.expect("Content-Type", /json/)
.expect(200)
},
)

expect(notifyResult.token).toBe("SUPER_SECERET")
expect(notifyResult.token).toBe("SUPER_SECRET")
expect(notifyResult.exit_code).toBe("0")
expect(notifyResult.status).toBe("finished")
expect(notifyResult.vm_log.length).toBeGreaterThan(5)
Expand All @@ -117,15 +150,15 @@ testSkipOnCi("POST /tasks.json does not crash with fork bombs", async () => {
"docker_image",
"eu.gcr.io/moocfi-public/tmc-sandbox-tmc-langs-rust",
)
.field("token", "SUPER_SECERET")
.field("token", "SUPER_SECRET")
.field("notify", notifyAddress)
.set("Accept", "application/json")
.expect("Content-Type", /json/)
.expect(200)
},
)

expect(notifyResult.token).toBe("SUPER_SECERET")
expect(notifyResult.token).toBe("SUPER_SECRET")

// hard to predict what happens in this case
const case1 =
Expand Down Expand Up @@ -155,14 +188,14 @@ test("POST /tasks.json works when submission uses too much memory", async () =>
"docker_image",
"eu.gcr.io/moocfi-public/tmc-sandbox-tmc-langs-rust",
)
.field("token", "SUPER_SECERET")
.field("token", "SUPER_SECRET")
.field("notify", notifyAddress)
.set("Accept", "application/json")
.expect("Content-Type", /json/)
.expect(200)
},
)
expect(notifyResult.token).toBe("SUPER_SECERET")
expect(notifyResult.token).toBe("SUPER_SECRET")
expect(notifyResult.status).toBe("out-of-memory")
})

Expand All @@ -178,14 +211,14 @@ test("POST /tasks.json works with java", async () => {
.post("/tasks.json")
.attach("file", "tests/data/java.tar")
.field("docker_image", "eu.gcr.io/moocfi-public/tmc-sandbox-java")
.field("token", "SUPER_SECERET")
.field("token", "SUPER_SECRET")
.field("notify", notifyAddress)
.set("Accept", "application/json")
.expect("Content-Type", /json/)
.expect(200)
},
)
expect(notifyResult.token).toBe("SUPER_SECERET")
expect(notifyResult.token).toBe("SUPER_SECRET")
expect(notifyResult.exit_code).toBe("0")
expect(notifyResult.status).toBe("finished")
expect(notifyResult.vm_log.length).toBeGreaterThan(5)
Expand Down

0 comments on commit 2eccf40

Please sign in to comment.