Merge pull request #46 from UKGovernmentBEIS/craig/networking-issue-test

Add networking diagnostic test
UKGovernmentBEIS · Jan 8, 2025 · 9f2ada7 · 9f2ada7
2 parents fe5970a + d837974
commit 9f2ada7
Show file tree

Hide file tree

Showing 4 changed files with 168 additions and 0 deletions.
diff --git a/test/diagnostics/network-issues/README.md b/test/diagnostics/network-issues/README.md
@@ -0,0 +1,46 @@
+# Context
+
+In December 2024, UK AISI migrated to a new K8s Cluster whilst also migrating some evals
+from Docker to K8s.
+
+The agentic evals in question required both internet access and access to services
+deployed as part of the eval, resolved via DNS (e.g. a `victim` web server). These evals
+used the built-in Helm chart which uses Cilium Network Policies (CNP). The Helm chart
+also uses a release-scoped DNS service to resolve domain names like `victim` to the
+relevant Pod deployed as part of the eval.
+
+We observed Cilium dropping packets which ought to have been allowed by the Network
+Policies. For example, queries to `wikipedia.org` would be dropped even if the
+`allowDomains` field in `helm-values.yaml` was set to `["*"]`. The observed behaviour
+was that the request (e.g. `curl`) simply timed out. There may also have been issues
+accessing eval-specific services like a `victim` web server.
+
+This simple Inspect eval was used to measure the impact of the issue and evaluate
+potential mitigations and solutions.
+
+
+## Usage
+
+```bash
+python run.py
+```
+
+The scores will start to be computed after ~5 minutes (see `post_curl_sleep`).
+
+The mean score will be reported when the eval finishes. A score of 1.0 indicates that
+all `curl` commands succeeded, implying that both DNS and the HTTP requests were
+successful.
+
+Try adjusting values such as `epochs` or the `resources` in `helm-values.yaml` (to
+control the number of Pods per Node), uncommenting the `readinessProbe`, or switching
+from `allowDomains` to `allowEntities`.
+
+
+## Expectations
+
+We suspect the issue was caused by the interaction between CNPs at the cluster level
+and the CNPs at the eval level, resulting in many Cilium regenerations. With the issue
+resolved, the mean score is expected to be 1.0 without needing any changes to the
+`helm-values.yaml` file.
+
+When we were observing issues, the scores were ~0.6-0.8.
diff --git a/test/diagnostics/network-issues/helm-values.yaml b/test/diagnostics/network-issues/helm-values.yaml
@@ -0,0 +1,34 @@
+services:
+  default:
+    image: "python:3.12-bookworm"
+    command: ["tail", "-f", "/dev/null"]
+    runtimeClassName: gvisor
+    # Resources can be adjusted to tune the number of Pods per Node.
+    resources:
+      requests:
+        cpu: 0.1
+        memory: 0.2G
+      limits:
+        cpu: 0.1
+        memory: 0.2G
+    # One attempted fix for the issue was to add a readinessProbe which would check if
+    # the service is ready to accept traffic.
+    # Note that this readinessProbe is executed inside the container.
+    # readinessProbe:
+    #   exec:
+    #     command:
+    #       - /bin/sh
+    #       - -c
+    #       - curl -f -s httpstat.us/200
+allowDomains:
+  - "*"
+  # We also evaluated explicitly allowing only the required domains.
+  # - "amazon.com"
+  # - "google.com"
+  # - "yahoo.com"
+  # - "bing.com"
+  # - "wikipedia.org"
+# We also evaluated using Cilium entity-based policies for "all" or "world".
+# https://docs.cilium.io/en/stable/security/policy/language/#entities-based
+# allowEntities:
+#   - "all"
diff --git a/test/diagnostics/network-issues/run.py b/test/diagnostics/network-issues/run.py
@@ -0,0 +1,79 @@
+import asyncio
+import os
+import random
+
+from inspect_ai import Task, eval, task
+from inspect_ai.dataset import MemoryDataset, Sample
+from inspect_ai.model import (
+    ChatMessageAssistant,
+    ModelOutput,
+)
+from inspect_ai.scorer import includes
+from inspect_ai.solver import Generate, TaskState, solver
+from inspect_ai.util import sandbox
+
+# Runs a Task with many epochs (repeats) each of which simply curls a domain to quantify
+# DNS or network access issues. The scoring is 1.0 for success and 0.0 for a timeout or
+# other error.
+
+# Marker placed at the start of the solver output when the curl succeeds; it is also the
+# Sample target which the scorer looks for.
+success_str = "sandbox_exec_success"
+# Candidate domains; one is chosen at random for each curl.
+domains = ["google.com", "yahoo.com", "bing.com", "wikipedia.org", "amazon.com"]
+
+
+@task
+def internet_access_task(post_curl_sleep: int):
+    return Task(
+        dataset=MemoryDataset([Sample(input="Input", target=success_str)]),
+        sandbox=("k8s", "helm-values.yaml"),
+        solver=[internet_access_solver(post_curl_sleep)],
+        scorer=includes(),
+    )
+
+
+@solver
+def internet_access_solver(post_curl_sleep: int):
+    async def solve(state: TaskState, generate: Generate):
+        result = await curl_domain()
+        state.messages.append(ChatMessageAssistant(content=result, source="generate"))
+        state.output = ModelOutput.from_content(model="mock", content=result)
+        # Keep the eval going a while longer so that the Pod sticks around in case the
+        # issue is exacerbated by number of Pods or number of Cilium Network Policies.
+        await asyncio.sleep(post_curl_sleep)
+        return state
+
+    return solve
+
+
+async def curl_domain() -> str:
+    target_domain = random.choice(domains)
+    try:
+        result = await sandbox().exec(["curl", "-I", target_domain], timeout=20)
+    except TimeoutError:
+        return f"timeout\n{target_domain}"
+    if result.returncode != 0:
+        return f"error\n{target_domain}\n{result}"
+    return f"{success_str}\n{target_domain}\n{result}"
+
+
+def run_diagnostic_eval(
+    epochs: int = 500,
+    post_curl_sleep: int = 300,
+    max_helm_install: int = 100,
+    max_helm_uninstall: int = 100,
+    max_pod_ops: int = 200,
+) -> float:
+    os.environ["INSPECT_MAX_HELM_INSTALL"] = str(max_helm_install)
+    os.environ["INSPECT_MAX_HELM_UNINSTALL"] = str(max_helm_uninstall)
+    os.environ["INSPECT_MAX_POD_OPS"] = str(max_pod_ops)
+    logs = eval(
+        tasks=[internet_access_task(post_curl_sleep)],
+        model="mockllm/model",
+        max_samples=epochs,  # Let all epochs run concurrently.
+        epochs=epochs,
+    )
+    assert logs[0].results is not None
+    return logs[0].results.scores[0].metrics["accuracy"].value
+
+
+if __name__ == "__main__":
+    # Run the diagnostic with its default arguments when executed as a script.
+    run_diagnostic_eval()
diff --git a/test/diagnostics/network-issues/test_can_run_diagnostic.py b/test/diagnostics/network-issues/test_can_run_diagnostic.py
@@ -0,0 +1,9 @@
+from run import run_diagnostic_eval
+
+
+def test_can_run_diagnostic_network_issues() -> None:
+    score = run_diagnostic_eval(epochs=1, post_curl_sleep=0)
+
+    # Whilst we do verify the score, the purpose of this test is to ensure there haven't
+    # been any regressions in the ability to run the diagnostic.
+    assert score == 1.0