perf test improvements (#223)

* kube-burner timeouts and metric collection improvements * make the kube-burner timeout configurable, defaulting to 10m * attempt to collect the resource usage metrics even if the job fails * tests: configurable dqlite trace level We're adding two new test settings that can be used to enable dqlite tracing: * TEST_DQLITE_TRACE_LEVEL * TEST_RAFT_TRACE_LEVEL If set, those settings are added to the instance /var/snap/k8s/common/args/k8s-dqlite-env file. * tests: expose k8s-dqlite debug mode * Update copyright headers We'll update the copyright headers as expected by the "tox -e fmt" job: Copyright 2025 Canonical, Ltd.
canonical · Jan 17, 2025 · 8dc178b · 8dc178b
1 parent 93efe11
commit 8dc178b
Show file tree

Hide file tree

Showing 10 changed files with 70 additions and 16 deletions.
diff --git a/.github/workflows/performance.yaml b/.github/workflows/performance.yaml
@@ -94,11 +94,13 @@ jobs:
           mkdir -p ./results/base-code
           sg lxd -c 'tox -e performance'
       - name: Generate 3 node Graphs
+        if: always()
         run: |
           cd test/performance
           sudo Rscript parse-performance-metrics.R -p ./results/head -o ./results/head -f *three-node.log
           sudo Rscript parse-performance-metrics.R -p ./results/base-code -o ./results/base-code -f *three-node.log
       - name: Generate single node Graphs
+        if: always()
         run: |
           cd test/performance
           mkdir -p ./results/single-node
@@ -107,6 +109,7 @@ jobs:
           sudo Rscript parse-performance-metrics.R -p ./results/single-node -o ./results/single-node -f *single-node.log
       - name: Upload performance result
         uses: actions/upload-artifact@v4
+        if: always()
         with:
           name: performance-results
           path: ${{ github.workspace }}/test/performance/results

diff --git a/test/performance/tests/conftest.py b/test/performance/tests/conftest.py
@@ -1,5 +1,5 @@
 #
-# Copyright 2024 Canonical, Ltd.
+# Copyright 2025 Canonical, Ltd.
 #
 import itertools
 import logging

diff --git a/test/performance/tests/test_multi_node.py b/test/performance/tests/test_multi_node.py
@@ -1,5 +1,5 @@
 #
-# Copyright 2024 Canonical, Ltd.
+# Copyright 2025 Canonical, Ltd.
 #
 from typing import List
 
@@ -31,6 +31,9 @@ def test_three_node_load(instances: List[harness.Instance]):
 
     metrics.configure_kube_burner(cluster_node)
     process_dict = metrics.collect_metrics(instances)
-    metrics.run_kube_burner(cluster_node)
-    metrics.stop_metrics(instances, process_dict)
-    metrics.pull_metrics(instances, "three-node")
+    try:
+        metrics.run_kube_burner(cluster_node)
+    finally:
+        # Collect the metrics even if kube-burner fails.
+        metrics.stop_metrics(instances, process_dict)
+        metrics.pull_metrics(instances, "three-node")
diff --git a/test/performance/tests/test_single_node.py b/test/performance/tests/test_single_node.py
@@ -1,5 +1,5 @@
 #
-# Copyright 2024 Canonical, Ltd.
+# Copyright 2025 Canonical, Ltd.
 #
 from test_util import harness, metrics
 
@@ -8,6 +8,9 @@ def test_single_node_load(session_instance: harness.Instance):
     """Test the performance of a single node cluster with all features enabled."""
     metrics.configure_kube_burner(session_instance)
     process_dict = metrics.collect_metrics([session_instance])
-    metrics.run_kube_burner(session_instance)
-    metrics.stop_metrics([session_instance], process_dict)
-    metrics.pull_metrics([session_instance], "single-node")
+    try:
+        metrics.run_kube_burner(session_instance)
+    finally:
+        # Collect the metrics even if kube-burner fails.
+        metrics.stop_metrics([session_instance], process_dict)
+        metrics.pull_metrics([session_instance], "single-node")
diff --git a/test/performance/tests/test_util/config.py b/test/performance/tests/test_util/config.py
@@ -1,5 +1,5 @@
 #
-# Copyright 2024 Canonical, Ltd.
+# Copyright 2025 Canonical, Ltd.
 #
 import os
 from pathlib import Path
@@ -36,6 +36,9 @@
     or "https://github.com/kube-burner/kube-burner/releases/download/v1.2/kube-burner-1.2-Linux-x86_64.tar.gz"
 )
 
+# Global kube-burner invocation timeout.
+KUBE_BURNER_TIMEOUT = os.getenv("TEST_KUBE_BURNER_TIMEOUT") or "10m"
+
 # FLAVOR is the flavour to use for running the performance tests.
 FLAVOR = os.getenv("TEST_FLAVOR") or ""
 
@@ -48,3 +51,9 @@
 
 # LXD_PROFILE_NAME is the profile name to use for LXD containers.
 LXD_PROFILE_NAME = os.getenv("TEST_LXD_PROFILE_NAME") or "k8s-performance"
+
+# Enable k8s-dqlite debug logging.
+K8S_DQLITE_DEBUG = os.getenv("TEST_K8S_DQLITE_DEBUG") == "1"
+# Set the following to 1 for verbose dqlite trace messages.
+DQLITE_TRACE_LEVEL = os.getenv("TEST_DQLITE_TRACE_LEVEL")
+RAFT_TRACE_LEVEL = os.getenv("TEST_RAFT_TRACE_LEVEL")
diff --git a/test/performance/tests/test_util/harness/__init__.py b/test/performance/tests/test_util/harness/__init__.py
@@ -1,5 +1,5 @@
 #
-# Copyright 2024 Canonical, Ltd.
+# Copyright 2025 Canonical, Ltd.
 #
 from test_util.harness.base import Harness, HarnessError, Instance
 from test_util.harness.lxd import LXDHarness

diff --git a/test/performance/tests/test_util/harness/base.py b/test/performance/tests/test_util/harness/base.py
@@ -1,5 +1,5 @@
 #
-# Copyright 2024 Canonical, Ltd.
+# Copyright 2025 Canonical, Ltd.
 #
 import subprocess
 from functools import cached_property, partial

diff --git a/test/performance/tests/test_util/harness/lxd.py b/test/performance/tests/test_util/harness/lxd.py
@@ -1,5 +1,5 @@
 #
-# Copyright 2024 Canonical, Ltd.
+# Copyright 2025 Canonical, Ltd.
 #
 import logging
 import os

diff --git a/test/performance/tests/test_util/metrics.py b/test/performance/tests/test_util/metrics.py
@@ -1,5 +1,5 @@
 #
-# Copyright 2024 Canonical, Ltd.
+# Copyright 2025 Canonical, Ltd.
 #
 import os
 from typing import List
@@ -82,4 +82,13 @@ def run_kube_burner(instance: harness.Instance):
     """Copies kubeconfig and runs kube-burner on the instance."""
     instance.exec(["mkdir", "-p", "/root/.kube"])
     instance.exec(["k8s", "config", ">", "/root/.kube/config"])
-    instance.exec(["/root/kube-burner", "init", "-c", "/root/api-intensive.yaml"])
+    instance.exec(
+        [
+            "/root/kube-burner",
+            "init",
+            "--timeout",
+            config.KUBE_BURNER_TIMEOUT,
+            "-c",
+            "/root/api-intensive.yaml",
+        ]
+    )
diff --git a/test/performance/tests/test_util/util.py b/test/performance/tests/test_util/util.py
@@ -1,5 +1,5 @@
 #
-# Copyright 2024 Canonical, Ltd.
+# Copyright 2025 Canonical, Ltd.
 #
 import ipaddress
 import json
@@ -146,6 +146,30 @@ def _as_int(value: Optional[str]) -> Optional[int]:
         return None
 
 
+def configure_dqlite_logging(instance: harness.Instance):
+    """Configure k8s-dqlite logging (requires restart)."""
+    if config.DQLITE_TRACE_LEVEL:
+        instance.exec(
+            [
+                "echo",
+                f"LIBDQLITE_TRACE={config.DQLITE_TRACE_LEVEL}",
+                ">>",
+                "/var/snap/k8s/common/args/k8s-dqlite-env",
+            ]
+        )
+    if config.RAFT_TRACE_LEVEL:
+        instance.exec(
+            [
+                "echo",
+                f"LIBRAFT_TRACE={config.RAFT_TRACE_LEVEL}",
+                ">>",
+                "/var/snap/k8s/common/args/k8s-dqlite-env",
+            ]
+        )
+    if config.K8S_DQLITE_DEBUG:
+        instance.exec(["echo", "--debug", ">>", "/var/snap/k8s/common/args/k8s-dqlite"])
+
+
 def setup_k8s_snap(
     instance: harness.Instance,
     tmp_path: Path,
@@ -185,6 +209,9 @@ def setup_k8s_snap(
         cmd += [config.SNAP_NAME, "--channel", channel]
 
     instance.exec(cmd)
+
+    configure_dqlite_logging(instance)
+
     if connect_interfaces:
         LOG.info("Ensure k8s interfaces and network requirements")
         instance.exec(["/snap/k8s/current/k8s/hack/init.sh"], stdout=subprocess.DEVNULL)