From d69ce5d2dfd446854a9401b4e84605852d8709e3 Mon Sep 17 00:00:00 2001 From: Shreya Jayaraman <144164353+jshr-w@users.noreply.github.com> Date: Mon, 2 Dec 2024 12:54:32 -0800 Subject: [PATCH 01/23] fix service churn feature pipeline name (#417) --- jobs/competitive-test.yml | 8 + .../clusterloader2/autoscale/autoscale.py | 9 +- .../clusterloader2/kubernetes_client.py | 68 ++++- .../slo/config/deployment_template.yaml | 33 ++- .../slo/config/load-config.yaml | 63 +++- .../net-policy-enforcement-latency.yaml | 57 ++++ .../network-policy/net-policy-metrics.yaml | 122 ++++++++ .../slo/config/modules/reconcile-objects.yaml | 13 + modules/python/clusterloader2/slo/slo.py | 69 +++-- modules/python/csi/csi.py | 269 ++++++++++++++++++ modules/python/tests/test_csi.py | 267 +++++++++++++++++ .../python/tests/test_kubernetes_client.py | 124 +++++++- ...er-benchmark-virtualnodes10-pods100-EZ.yml | 2 +- ...erver-benchmark-virtualnodes10-pods100.yml | 6 +- ...rver-benchmark-virtualnodes100-pods10k.yml | 8 +- ...erver-benchmark-virtualnodes100-pods3k.yml | 4 +- .../CNI Benchmark/cilium-ab-testing.yml | 37 +++ ...-cluster-churn-nodes-cilium-nodesubnet.yml | 44 +++ .../network-churn/cilium-network-churn.yml | 96 +++++++ ...slo-servicediscovery-cilium-nodesubnet.yml | 44 +++ .../slo-servicediscovery-feature.yml | 2 +- .../CSI Benchmark/csi-attach-detach-1000.yml | 59 ++++ .../CSI Benchmark/csi-attach-detach-300.yml | 59 ++++ .../terraform-inputs/azure.tfvars | 4 + .../terraform-test-inputs/azure.json | 4 + .../kubernetes/storageclass.aws.yml | 16 ++ .../terraform-inputs/aws.tfvars | 82 ++++++ .../terraform-inputs/azure.tfvars | 35 +++ .../terraform-test-inputs/aws.json | 4 + .../terraform-test-inputs/azure.json | 5 + .../kubernetes/storageclass.aws.yml | 16 ++ .../terraform-inputs/aws.tfvars | 81 ++++++ .../terraform-inputs/azure.tfvars | 49 ++++ .../terraform-test-inputs/aws.json | 4 + .../terraform-test-inputs/azure.json | 5 + steps/collect-telescope-metadata.yml | 21 +- steps/engine/attach/collect.yml | 31 ++ steps/engine/attach/execute-aws.yml | 40 +++ steps/engine/attach/execute-azure.yml | 21 ++ steps/engine/attach/validate.yml | 21 ++ .../clusterloader2/cilium/scale-cluster.yml | 4 +- steps/engine/clusterloader2/slo/collect.yml | 8 +- steps/engine/clusterloader2/slo/execute.yml | 10 +- steps/provision-resources.yml | 2 +- steps/setup-tests.yml | 17 +- .../csi-attach-detach/collect-attach.yml | 20 ++ .../csi-attach-detach/execute-attach.yml | 22 ++ .../csi-attach-detach/validate-resources.yml | 18 ++ .../network-churn/collect-clusterloader2.yml | 23 ++ .../network-churn/execute-clusterloader2.yml | 17 ++ .../network-churn/validate-resources.yml | 20 ++ 51 files changed, 1993 insertions(+), 70 deletions(-) create mode 100644 modules/python/clusterloader2/slo/config/modules/network-policy/net-policy-enforcement-latency.yaml create mode 100644 modules/python/clusterloader2/slo/config/modules/network-policy/net-policy-metrics.yaml create mode 100644 modules/python/csi/csi.py create mode 100644 modules/python/tests/test_csi.py create mode 100644 pipelines/perf-eval/CNI Benchmark/cilium-ab-testing.yml create mode 100644 pipelines/perf-eval/CNI Benchmark/cilium-cluster-churn-nodes-cilium-nodesubnet.yml create mode 100644 pipelines/perf-eval/CNI Benchmark/network-churn/cilium-network-churn.yml create mode 100644 pipelines/perf-eval/CNI Benchmark/slo-servicediscovery-cilium-nodesubnet.yml create mode 100644 pipelines/perf-eval/CSI Benchmark/csi-attach-detach-1000.yml create mode 100644 pipelines/perf-eval/CSI Benchmark/csi-attach-detach-300.yml create mode 100644 scenarios/perf-eval/ab-testing-cilium-parameters/terraform-inputs/azure.tfvars create mode 100644 scenarios/perf-eval/ab-testing-cilium-parameters/terraform-test-inputs/azure.json create mode 100644 scenarios/perf-eval/storage-attach-detach-1000/kubernetes/storageclass.aws.yml create mode 100644 scenarios/perf-eval/storage-attach-detach-1000/terraform-inputs/aws.tfvars create mode 100644 scenarios/perf-eval/storage-attach-detach-1000/terraform-inputs/azure.tfvars create mode 100644 scenarios/perf-eval/storage-attach-detach-1000/terraform-test-inputs/aws.json create mode 100644 scenarios/perf-eval/storage-attach-detach-1000/terraform-test-inputs/azure.json create mode 100644 scenarios/perf-eval/storage-attach-detach-300/kubernetes/storageclass.aws.yml create mode 100644 scenarios/perf-eval/storage-attach-detach-300/terraform-inputs/aws.tfvars create mode 100644 scenarios/perf-eval/storage-attach-detach-300/terraform-inputs/azure.tfvars create mode 100644 scenarios/perf-eval/storage-attach-detach-300/terraform-test-inputs/aws.json create mode 100644 scenarios/perf-eval/storage-attach-detach-300/terraform-test-inputs/azure.json create mode 100644 steps/engine/attach/collect.yml create mode 100644 steps/engine/attach/execute-aws.yml create mode 100644 steps/engine/attach/execute-azure.yml create mode 100644 steps/engine/attach/validate.yml create mode 100644 steps/topology/csi-attach-detach/collect-attach.yml create mode 100644 steps/topology/csi-attach-detach/execute-attach.yml create mode 100644 steps/topology/csi-attach-detach/validate-resources.yml create mode 100644 steps/topology/network-churn/collect-clusterloader2.yml create mode 100644 steps/topology/network-churn/execute-clusterloader2.yml create mode 100644 steps/topology/network-churn/validate-resources.yml diff --git a/jobs/competitive-test.yml b/jobs/competitive-test.yml index c4872ad11..d09fee064 100644 --- a/jobs/competitive-test.yml +++ b/jobs/competitive-test.yml @@ -33,6 +33,9 @@ parameters: - name: run_id type: string default: '' +- name: run_id_2 + type: string + default: '' - name: timeout_in_minutes type: number default: 60 # default when not specified is 60 minutes @@ -48,6 +51,9 @@ parameters: - name: ssh_key_enabled type: boolean default: true +- name: use_secondary_cluster + type: boolean + default: false jobs: - job: ${{ parameters.cloud }} @@ -62,10 +68,12 @@ jobs: cloud: ${{ parameters.cloud }} region: ${{ parameters.regions[0] }} run_id: ${{ parameters.run_id }} + run_id_2: ${{ parameters.run_id_2 }} test_modules_dir: ${{ parameters.test_modules_dir }} retry_attempt_count: ${{ parameters.retry_attempt_count }} credential_type: ${{ parameters.credential_type }} ssh_key_enabled: ${{ parameters.ssh_key_enabled }} + use_secondary_cluster: ${{ parameters.use_secondary_cluster }} - template: /steps/provision-resources.yml parameters: cloud: ${{ parameters.cloud }} diff --git a/modules/python/clusterloader2/autoscale/autoscale.py b/modules/python/clusterloader2/autoscale/autoscale.py index 575e148bd..b607696f9 100644 --- a/modules/python/clusterloader2/autoscale/autoscale.py +++ b/modules/python/clusterloader2/autoscale/autoscale.py @@ -57,11 +57,11 @@ def collect_clusterloader2( index = match.group() if index not in summary: summary[index] = { - "up": { "failures": 0 }, + "up": { "failures": 0 }, "down": { "failures": 0 } } else: - continue + continue failure = testcase["failure"] if "WaitForRunningPodsUp" in name: @@ -76,7 +76,7 @@ def collect_clusterloader2( elif "WaitForNodesDown" in name: summary[index]["down"]["wait_for_nodes_seconds"] = -1 if failure else testcase["time"] summary[index]["down"]["failures"] += 1 if failure else 0 - + content = "" for index in summary: for key in summary[index]: @@ -85,6 +85,7 @@ def collect_clusterloader2( "wait_for_pods_seconds": summary[index][key]["wait_for_pods_seconds"], "autoscale_result": "success" if summary[index][key]["failures"] == 0 else "failure" } + # TODO: Expose optional parameter to include test details result = { "timestamp": datetime.now(timezone.utc).strftime('%Y-%m-%dT%H:%M:%SZ'), "autoscale_type": key, @@ -92,7 +93,7 @@ def collect_clusterloader2( "node_count": node_count, "pod_count": pod_count, "data": data, - "raw_data": raw_data, + # "raw_data": raw_data, "cloud_info": cloud_info, "run_id": run_id, "run_url": run_url diff --git a/modules/python/clusterloader2/kubernetes_client.py b/modules/python/clusterloader2/kubernetes_client.py index 6c30ff990..3d2e45bb7 100644 --- a/modules/python/clusterloader2/kubernetes_client.py +++ b/modules/python/clusterloader2/kubernetes_client.py @@ -1,6 +1,6 @@ +# TODO: Move this file to a separate folder called 'clients' from kubernetes import client, config - # https://kubernetes.io/docs/concepts/scheduling-eviction/taint-and-toleration/#taint-based-evictions # https://kubernetes.io/docs/reference/labels-annotations-taints/ builtin_taints_keys = [ @@ -20,14 +20,19 @@ class KubernetesClient: def __init__(self, kubeconfig=None): config.load_kube_config(kubeconfig) self.api = client.CoreV1Api() + self.app = client.AppsV1Api() + self.storage = client.StorageV1Api() + + def get_app_client(self): + return self.app def describe_node(self, node_name): return self.api.read_node(node_name) def get_nodes(self, label_selector=None, field_selector=None): return self.api.list_node(label_selector=label_selector, field_selector=field_selector).items - - def get_ready_nodes(self): + + def get_ready_nodes(self, label_selector=None, field_selector=None): """ Get a list of nodes that are ready to be scheduled. Should apply all those conditions: - 'Ready' condition status is True @@ -35,7 +40,7 @@ def get_ready_nodes(self): - Spec unschedulable is False - Spec taints do not have any of the builtin taints keys with effect 'NoSchedule' or 'NoExecute' """ - nodes = self.get_nodes() + nodes = self.get_nodes(label_selector=label_selector, field_selector=field_selector) return [ node for node in nodes if self._is_node_schedulable(node) and self._is_node_untainted(node) @@ -50,16 +55,63 @@ def _is_node_schedulable(self, node): ) if not is_schedulable: print(f"Node NOT Ready: '{node.metadata.name}' is not schedulable. status_conditions: {status_conditions}. unschedulable: {node.spec.unschedulable}") - + return is_schedulable - + def _is_node_untainted(self, node): if not node.spec.taints: return True - + for taint in node.spec.taints: if taint.key in builtin_taints_keys and taint.effect in ("NoSchedule", "NoExecute"): print(f"Node NOT Ready: '{node.metadata.name}' has taint '{taint.key}' with effect '{taint.effect}'") return False - return True \ No newline at end of file + return True + + def get_pods_by_namespace(self, namespace, label_selector=None, field_selector=None): + return self.api.list_namespaced_pod(namespace=namespace, label_selector=label_selector, field_selector=field_selector).items + + def get_running_pods_by_namespace(self, namespace=None, label_selector=None, field_selector=None): + pods = self.get_pods_by_namespace(namespace=namespace, label_selector=label_selector, field_selector=field_selector) + return [pod for pod in pods if pod.status.phase == "Running"] + + def get_persistent_volume_claims_by_namespace(self, namespace): + return self.api.list_namespaced_persistent_volume_claim(namespace=namespace).items + + def get_bound_persistent_volume_claims_by_namespace(self, namespace): + claims = self.get_persistent_volume_claims_by_namespace(namespace=namespace) + return [claim for claim in claims if claim.status.phase == "Bound"] + + def delete_persistent_volume_claim_by_namespace(self, namespace): + pvcs = self.get_persistent_volume_claims_by_namespace(namespace=namespace) + for pvc in pvcs: + try: + self.api.delete_namespaced_persistent_volume_claim(pvc.metadata.name, namespace, body=client.V1DeleteOptions()) + except client.rest.ApiException as e: + print(f"Error deleting PVC '{pvc.metadata.name}': {e}") + + def get_volume_attachments(self): + return self.storage.list_volume_attachment().items + + def get_attached_volume_attachments(self): + volume_attachments = self.get_volume_attachments() + return [attachment for attachment in volume_attachments if attachment.status.attached] + + def create_namespace(self, namespace): + """ + Returns the namespace object if it exists, otherwise creates it. + """ + try: + namespace = self.api.read_namespace(namespace) + print(f"Namespace '{namespace.metadata.name}' already exists.") + return namespace + except client.rest.ApiException as e: + if e.status == 404: + body = client.V1Namespace(metadata=client.V1ObjectMeta(name=namespace)) + return self.api.create_namespace(body) + else: + raise e + + def delete_namespace(self, namespace): + return self.api.delete_namespace(namespace) diff --git a/modules/python/clusterloader2/slo/config/deployment_template.yaml b/modules/python/clusterloader2/slo/config/deployment_template.yaml index 9e07175df..8c4c93615 100644 --- a/modules/python/clusterloader2/slo/config/deployment_template.yaml +++ b/modules/python/clusterloader2/slo/config/deployment_template.yaml @@ -3,6 +3,13 @@ {{$Image := DefaultParam .Image "mcr.microsoft.com/oss/kubernetes/pause:3.6"}} +{{$EnableNetworkPolicyEnforcementLatencyTest := DefaultParam .EnableNetworkPolicyEnforcementLatencyTest false}} +{{$TargetLabelValue := DefaultParam .TargetLabelValue "enforcement-latency"}} +# Run a server pod for network policy enforcement latency test only on every Nth pod. +# Default every third pod. +{{$NetPolServerOnEveryNthPod := 3}} +{{$RunNetPolicyTest := and $EnableNetworkPolicyEnforcementLatencyTest (eq (Mod .Index $NetPolServerOnEveryNthPod) 0)}} + apiVersion: apps/v1 kind: Deployment metadata: @@ -16,7 +23,7 @@ spec: replicas: {{.Replicas}} selector: matchLabels: - name: {{.Name}} + name: {{if $RunNetPolicyTest}}policy-load-{{end}}{{.Name}} strategy: type: RollingUpdate rollingUpdate: @@ -25,15 +32,30 @@ spec: template: metadata: labels: - name: {{.Name}} + name: {{if $RunNetPolicyTest}}policy-load-{{end}}{{.Name}} group: {{.Group}} {{if .SvcName}} svc: {{.SvcName}}-{{.Index}} {{end}} restart: {{.deploymentLabel}} +{{if $RunNetPolicyTest}} + net-pol-test: {{$TargetLabelValue}} +{{end}} spec: nodeSelector: slo: "true" +{{if $RunNetPolicyTest}} + hostNetwork: false + containers: + - image: nginx + name: nginx-server + ports: + - containerPort: 80 + resources: + requests: + cpu: {{$CpuRequest}} + memory: {{$MemoryRequest}} +{{else}} containers: - env: - name: ENV_VAR @@ -41,13 +63,12 @@ spec: image: {{$Image}} imagePullPolicy: IfNotPresent name: {{.Name}} - ports: + ports: [] resources: requests: cpu: {{$CpuRequest}} memory: {{$MemoryRequest}} - # Add not-ready/unreachable tolerations for 15 minutes so that node - # failure doesn't trigger pod deletion. +{{end}} tolerations: - key: "node.kubernetes.io/not-ready" operator: "Exists" @@ -60,4 +81,4 @@ spec: - key: "slo" operator: "Equal" value: "true" - effect: "NoSchedule" \ No newline at end of file + effect: "NoSchedule" diff --git a/modules/python/clusterloader2/slo/config/load-config.yaml b/modules/python/clusterloader2/slo/config/load-config.yaml index 61ceacd1b..ff496d7b6 100644 --- a/modules/python/clusterloader2/slo/config/load-config.yaml +++ b/modules/python/clusterloader2/slo/config/load-config.yaml @@ -2,6 +2,7 @@ name: load-config # Config options for test type {{$SERVICE_TEST := DefaultParam .CL2_SERVICE_TEST true}} +{{$NETWORK_TEST := DefaultParam .CL2_NETWORK_TEST false}} # Config options for test parameters {{$nodesPerNamespace := DefaultParam .CL2_NODES_PER_NAMESPACE 100}} @@ -12,12 +13,12 @@ name: load-config {{$groupName := DefaultParam .CL2_GROUP_NAME "service-discovery"}} # TODO(jshr-w): This should eventually use >1 namespace. -{{$namespaces := 1}} +{{$namespaces := DefaultParam .CL2_NO_OF_NAMESPACES 1}} {{$nodes := DefaultParam .CL2_NODES 1000}} {{$deploymentQPS := DivideFloat $loadTestThroughput $deploymentSize}} {{$operationTimeout := DefaultParam .CL2_OPERATION_TIMEOUT "15m"}} -{{$totalPods := MultiplyInt $namespaces $nodes $podsPerNode}} +{{$totalPods := MultiplyInt $namespaces $nodesPerNamespace $podsPerNode}} {{$podsPerNamespace := DivideInt $totalPods $namespaces}} {{$deploymentsPerNamespace := DivideInt $podsPerNamespace $deploymentSize}} @@ -29,9 +30,9 @@ name: load-config # Service test {{$BIG_GROUP_SIZE := DefaultParam .BIG_GROUP_SIZE 4000}} -{{$SMALL_GROUP_SIZE := DefaultParam .SMALL_GROUP_SIZE 20}} +{{$SMALL_GROUP_SIZE := DefaultParam .CL2_DEPLOYMENT_SIZE 20}} {{$bigDeploymentsPerNamespace := DefaultParam .bigDeploymentsPerNamespace 1}} -{{$smallDeploymentPods := SubtractInt $podsPerNamespace (MultiplyInt $bigDeploymentsPerNamespace $BIG_GROUP_SIZE)}} +{{$smallDeploymentPods := DivideInt $totalPods $namespaces}} {{$smallDeploymentsPerNamespace := DivideInt $smallDeploymentPods $SMALL_GROUP_SIZE}} namespace: @@ -53,7 +54,7 @@ tuningSets: qps: {{$deploymentQPS}} steps: - - name: Log - namespaces={{$namespaces}}, nodesPerNamespace={{$nodesPerNamespace}}, podsPerNode={{$podsPerNode}}, totalPods={{$totalPods}}, podsPerNamespace={{$podsPerNamespace}}, deploymentsPerNamespace={{$deploymentsPerNamespace}}, deploymentSize={{$deploymentSize}}, deploymentQPS={{$deploymentQPS}} + - name: Log - namespaces={{$namespaces}}, nodes={{$nodes}}, nodesPerNamespace={{$nodesPerNamespace}}, podsPerNode={{$podsPerNode}}, totalPods={{$totalPods}}, podsPerNamespace={{$podsPerNamespace}}, deploymentsPerNamespace={{$deploymentsPerNamespace}}, deploymentSize={{$deploymentSize}}, deploymentQPS={{$deploymentQPS}} measurements: - Identifier: Dummy Method: Sleep @@ -74,6 +75,13 @@ steps: action: start {{end}} +{{if $NETWORK_TEST}} + - module: + path: /modules/network-policy/net-policy-metrics.yaml + params: + action: start +{{end}} + {{range $i := Loop $repeats}} {{if $SERVICE_TEST}} - module: @@ -85,6 +93,15 @@ steps: bigServicesPerNamespace: {{$bigDeploymentsPerNamespace}} {{end}} +{{if $NETWORK_TEST}} + - module: + path: modules/network-policy/net-policy-enforcement-latency.yaml + params: + setup: true + run: true + testType: "pod-creation" +{{end}} + - module: path: /modules/reconcile-objects.yaml params: @@ -101,6 +118,27 @@ steps: Group: {{$groupName}} deploymentLabel: start +{{if $NETWORK_TEST}} + - module: + path: modules/network-policy/net-policy-metrics.yaml + params: + action: gather + usePolicyCreationMetrics: true + usePodCreationMetrics: true + + - module: + path: modules/network-policy/net-policy-enforcement-latency.yaml + params: + complete: true + testType: "pod-creation" + + - module: + path: modules/network-policy/net-policy-enforcement-latency.yaml + params: + run: true + testType: "policy-creation" +{{end}} + - module: path: /modules/reconcile-objects.yaml params: @@ -152,3 +190,18 @@ steps: params: action: gather group: {{$groupName}} + +{{if $NETWORK_TEST}} + - module: + path: modules/network-policy/net-policy-metrics.yaml + params: + action: gather + usePolicyCreationMetrics: true + usePodCreationMetrics: true + + - module: + path: modules/network-policy/net-policy-enforcement-latency.yaml + params: + complete: true + testType: "policy-creation" +{{end}} diff --git a/modules/python/clusterloader2/slo/config/modules/network-policy/net-policy-enforcement-latency.yaml b/modules/python/clusterloader2/slo/config/modules/network-policy/net-policy-enforcement-latency.yaml new file mode 100644 index 000000000..dab88c92d --- /dev/null +++ b/modules/python/clusterloader2/slo/config/modules/network-policy/net-policy-enforcement-latency.yaml @@ -0,0 +1,57 @@ +{{$NETWORK_POLICY_ENFORCEMENT_LATENCY_BASELINE := DefaultParam .CL2_NETWORK_POLICY_ENFORCEMENT_LATENCY_BASELINE false}} +{{$NET_POLICY_ENFORCEMENT_LATENCY_TARGET_LABEL_KEY := DefaultParam .CL2_NET_POLICY_ENFORCEMENT_LATENCY_TARGET_LABEL_KEY "net-pol-test"}} +{{$NET_POLICY_ENFORCEMENT_LATENCY_TARGET_LABEL_VALUE := DefaultParam .CL2_NET_POLICY_ENFORCEMENT_LATENCY_TARGET_LABEL_VALUE "enforcement-latency"}} +{{$NET_POLICY_ENFORCEMENT_LATENCY_NODE_LABEL_KEY := DefaultParam .CL2_NET_POLICY_ENFORCEMENT_LATENCY_NODE_LABEL_KEY "test"}} +{{$NET_POLICY_ENFORCEMENT_LATENCY_NODE_LABEL_VALUE := DefaultParam .CL2_NET_POLICY_ENFORCEMENT_LATENCY_NODE_LABEL_VALUE "net-policy-client"}} +{{$NET_POLICY_ENFORCEMENT_LATENCY_MAX_TARGET_PODS_PER_NS := DefaultParam .CL2_NET_POLICY_ENFORCEMENT_LATENCY_MAX_TARGET_PODS_PER_NS 100}} +{{$NET_POLICY_ENFORCEMENT_LOAD_COUNT := DefaultParam .CL2_NET_POLICY_ENFORCEMENT_LOAD_COUNT 1000}} +{{$NET_POLICY_ENFORCEMENT_LOAD_QPS := DefaultParam .CL2_NET_POLICY_ENFORCEMENT_LOAD_QPS 10}} +{{$NET_POLICY_ENFORCEMENT_LOAD_TARGET_NAME := DefaultParam .CL2_POLICY_ENFORCEMENT_LOAD_TARGET_NAME "small-deployment"}} + +{{$setup := DefaultParam .setup false}} +{{$run := DefaultParam .run false}} +{{$complete := DefaultParam .complete false}} +{{$testType := DefaultParam .testType "policy-creation"}} +# Target port needs to match the server container port of target pods that have +# "targetLabelKey: targetLabelValue" label selector. +{{$targetPort := 80}} + +steps: + {{if $setup}} +- name: Setup network policy enforcement latency measurement + measurements: + - Identifier: NetworkPolicyEnforcement + Method: NetworkPolicyEnforcement + Params: + action: setup + targetLabelKey: {{$NET_POLICY_ENFORCEMENT_LATENCY_TARGET_LABEL_KEY}} + targetLabelValue: {{$NET_POLICY_ENFORCEMENT_LATENCY_TARGET_LABEL_VALUE}} + baseline: {{$NETWORK_POLICY_ENFORCEMENT_LATENCY_BASELINE}} + testClientNodeSelectorKey: {{$NET_POLICY_ENFORCEMENT_LATENCY_NODE_LABEL_KEY}} + testClientNodeSelectorValue: {{$NET_POLICY_ENFORCEMENT_LATENCY_NODE_LABEL_VALUE}} + {{end}} + + {{if $run}} +- name: "Run pod creation network policy enforcement latency measurement (testType={{$testType}})" + measurements: + - Identifier: NetworkPolicyEnforcement + Method: NetworkPolicyEnforcement + Params: + action: run + testType: {{$testType}} + targetPort: {{$targetPort}} + maxTargets: {{$NET_POLICY_ENFORCEMENT_LATENCY_MAX_TARGET_PODS_PER_NS}} + policyLoadCount: {{$NET_POLICY_ENFORCEMENT_LOAD_COUNT}} + policyLoadQPS: {{$NET_POLICY_ENFORCEMENT_LOAD_QPS}} + policyLoadTargetBaseName: {{$NET_POLICY_ENFORCEMENT_LOAD_TARGET_NAME}} + {{end}} + + {{if $complete}} +- name: "Complete pod creation network policy enforcement latency measurement (testType={{$testType}})" + measurements: + - Identifier: NetworkPolicyEnforcement + Method: NetworkPolicyEnforcement + Params: + action: complete + testType: {{$testType}} + {{end}} \ No newline at end of file diff --git a/modules/python/clusterloader2/slo/config/modules/network-policy/net-policy-metrics.yaml b/modules/python/clusterloader2/slo/config/modules/network-policy/net-policy-metrics.yaml new file mode 100644 index 000000000..5be48be8b --- /dev/null +++ b/modules/python/clusterloader2/slo/config/modules/network-policy/net-policy-metrics.yaml @@ -0,0 +1,122 @@ +# Valid actions: "start", "gather" +{{$action := .action}} +{{$usePolicyCreationMetrics := DefaultParam .usePolicyCreationMetrics true}} +{{$usePodCreationMetrics := DefaultParam .usePodCreationMetrics true}} +{{$useCiliumMetrics := DefaultParam .useCiliumMetrics true}} + +# CL2 params +# Negative default values are used to turn thresholds off if not overridden. Thresholds are only enabled with values of zero or higher. +{{$NP_ENFORCE_POLICY_CREATION_99_THRESHOLD_SECONDS := DefaultParam .CL2_NP_ENFORCE_POLICY_CREATION_99_THRESHOLD_SECONDS -1}} +{{$NP_ENFORCE_POD_CREATION_99_THRESHOLD_SECONDS := DefaultParam .CL2_NP_ENFORCE_POD_CREATION_99_THRESHOLD_SECONDS -1}} +{{$NP_ENFORCE_POD_IP_ASSIGNED_99_THRESHOLD_SECONDS := DefaultParam .CL2_NP_ENFORCE_POD_IP_ASSIGNED_99_THRESHOLD_SECONDS -1}} +{{$CILIUM_POLICY_IMPORTS_ERROR_THRESHOLD := DefaultParam .CL2_CILIUM_POLICY_IMPORTS_ERROR_THRESHOLD 0}} +{{$CILIUM_ENDPOINT_REGEN_FAIL_PERC_THRESHOLD := DefaultParam .CL2_CILIUM_ENDPOINT_REGEN_FAIL_PERC_THRESHOLD 0.01}} +{{$CILIUM_POLICY_REGEN_TIME_99_THRESHOLD := DefaultParam .CL2_CILIUM_POLICY_REGEN_TIME_99_THRESHOLD -1}} +{{$CILIUM_ENDPOINT_REGEN_TIME_99_THRESHOLD := DefaultParam .CL2_CILIUM_ENDPOINT_REGEN_TIME_99_THRESHOLD -1}} + +steps: +- name: "{{$action}}ing network policy metrics" + measurements: + - Identifier: NetworkPolicyEnforcementLatency + Method: GenericPrometheusQuery + Params: + action: {{$action}} + metricName: "Network Policy Enforcement Latency" + metricVersion: v1 + unit: s + queries: + # Network policy enforcement metrics gathered from the test clients. + {{if $usePolicyCreationMetrics}} + - name: PolicyCreation - TargetCount + query: sum(policy_enforcement_latency_policy_creation_seconds_count) + - name: PolicyCreation - Perc50 + query: histogram_quantile(0.5, sum(policy_enforcement_latency_policy_creation_seconds_bucket) by (le)) + - name: PolicyCreation - Perc90 + query: histogram_quantile(0.9, sum(policy_enforcement_latency_policy_creation_seconds_bucket) by (le)) + - name: PolicyCreation - Perc95 + query: histogram_quantile(0.95, sum(policy_enforcement_latency_policy_creation_seconds_bucket) by (le)) + - name: PolicyCreation - Perc99 + query: histogram_quantile(0.99, sum(policy_enforcement_latency_policy_creation_seconds_bucket) by (le)) + {{if ge $NP_ENFORCE_POLICY_CREATION_99_THRESHOLD_SECONDS 0}} + threshold: {{$NP_ENFORCE_POLICY_CREATION_99_THRESHOLD_SECONDS}} + {{end}} + {{end}} + {{if $usePodCreationMetrics}} + - name: PodCreation - TargetCount + query: sum(pod_creation_reachability_latency_seconds_count) + - name: PodCreation - Perc50 + query: histogram_quantile(0.5, sum(rate(pod_creation_reachability_latency_seconds_bucket[%v])) by (le)) + - name: PodCreation - Perc90 + query: histogram_quantile(0.9, sum(rate(pod_creation_reachability_latency_seconds_bucket[%v])) by (le)) + - name: PodCreation - Perc95 + query: histogram_quantile(0.95, sum(rate(pod_creation_reachability_latency_seconds_bucket[%v])) by (le)) + - name: PodCreation - Perc99 + query: histogram_quantile(0.99, sum(rate(pod_creation_reachability_latency_seconds_bucket[%v])) by (le)) + {{if ge $NP_ENFORCE_POD_CREATION_99_THRESHOLD_SECONDS 0}} + threshold: {{$NP_ENFORCE_POD_CREATION_99_THRESHOLD_SECONDS}} + {{end}} + - name: PodIpAssignedLatency - TargetCount + query: sum(pod_ip_address_assigned_latency_seconds_count) + - name: PodIpAssignedLatency - Perc50 + query: histogram_quantile(0.50, sum(rate(pod_ip_address_assigned_latency_seconds_bucket[%v])) by (le)) + - name: PodIpAssignedLatency - Perc90 + query: histogram_quantile(0.90, sum(rate(pod_ip_address_assigned_latency_seconds_bucket[%v])) by (le)) + - name: PodIpAssignedLatency - Perc95 + query: histogram_quantile(0.95, sum(rate(pod_ip_address_assigned_latency_seconds_bucket[%v])) by (le)) + - name: PodIpAssignedLatency - Perc99 + query: histogram_quantile(0.99, sum(rate(pod_ip_address_assigned_latency_seconds_bucket[%v])) by (le)) + {{if ge $NP_ENFORCE_POD_IP_ASSIGNED_99_THRESHOLD_SECONDS 0}} + threshold: {{$NP_ENFORCE_POD_IP_ASSIGNED_99_THRESHOLD_SECONDS}} + {{end}} + {{end}} + + {{if $useCiliumMetrics}} + - Identifier: NetworkPolicyMetrics + Method: GenericPrometheusQuery + Params: + action: {{$action}} + metricName: "Network Policy Performance" + metricVersion: v1 + unit: s + queries: + # Cilium agent metrics that are related to network policies. + - name: Number of times a policy import has failed + # To be replaced with the new Cilium metric that counts all policy changes, not just import errors. + # With that, this can be a percentage of failed imports. + # https://github.com/cilium/cilium/pull/23349 + query: sum(cilium_policy_import_errors_total) + threshold: {{$CILIUM_POLICY_IMPORTS_ERROR_THRESHOLD}} + - name: Failed endpoint regenerations percentage + query: sum(cilium_endpoint_regenerations_total{outcome="fail"}) / sum(cilium_endpoint_regenerations_total) * 100 + threshold: {{$CILIUM_ENDPOINT_REGEN_FAIL_PERC_THRESHOLD}} + - name: Policy regeneration time - Perc50 + query: histogram_quantile(0.50, sum(cilium_policy_regeneration_time_stats_seconds_bucket{scope="total"}) by (le)) + - name: Policy regeneration time - Perc99 + query: histogram_quantile(0.99, sum(cilium_policy_regeneration_time_stats_seconds_bucket{scope="total"}) by (le)) + {{if ge $CILIUM_POLICY_REGEN_TIME_99_THRESHOLD 0}} + threshold: {{$CILIUM_POLICY_REGEN_TIME_99_THRESHOLD}} + {{end}} + - name: Time between a policy change and it being fully deployed into the datapath - Perc50 + query: histogram_quantile(0.50, sum(cilium_policy_implementation_delay_bucket) by (le)) + - name: Time between a policy change and it being fully deployed into the datapath - Perc99 + query: histogram_quantile(0.99, sum(cilium_policy_implementation_delay_bucket) by (le)) + - name: Latency of policy update trigger - Perc50 + query: histogram_quantile(0.50, sum(cilium_triggers_policy_update_call_duration_seconds_bucket{type="latency"}) by (le)) + - name: Latency of policy update trigger - Perc99 + query: histogram_quantile(0.99, sum(cilium_triggers_policy_update_call_duration_seconds_bucket{type="latency"}) by (le)) + - name: Duration of policy update trigger - Perc50 + query: histogram_quantile(0.50, sum(cilium_triggers_policy_update_call_duration_seconds_bucket{type="duration"}) by (le)) + - name: Duration of policy update trigger - Perc99 + query: histogram_quantile(0.99, sum(cilium_triggers_policy_update_call_duration_seconds_bucket{type="duration"}) by (le)) + - name: Endpoint regeneration latency - Perc50 + query: histogram_quantile(0.50, sum(cilium_endpoint_regeneration_time_stats_seconds_bucket{scope="total"}) by (le)) + - name: Endpoint regeneration latency - Perc99 + query: histogram_quantile(0.99, sum(cilium_endpoint_regeneration_time_stats_seconds_bucket{scope="total"}) by (le)) + {{if ge $CILIUM_ENDPOINT_REGEN_TIME_99_THRESHOLD 0}} + threshold: {{$CILIUM_ENDPOINT_REGEN_TIME_99_THRESHOLD}} + {{end}} + - name: Number of policies currently loaded + query: avg(cilium_policy) + - name: Number of endpoints labeled by policy enforcement status + query: sum(cilium_policy_endpoint_enforcement_status) + {{end}} \ No newline at end of file diff --git a/modules/python/clusterloader2/slo/config/modules/reconcile-objects.yaml b/modules/python/clusterloader2/slo/config/modules/reconcile-objects.yaml index e169dd699..994808424 100644 --- a/modules/python/clusterloader2/slo/config/modules/reconcile-objects.yaml +++ b/modules/python/clusterloader2/slo/config/modules/reconcile-objects.yaml @@ -14,6 +14,13 @@ {{$smallDeploymentSize := .smallDeploymentSize}} {{$smallDeploymentsPerNamespace := .smallDeploymentsPerNamespace}} +{{$NETWORK_TEST := DefaultParam .CL2_NETWORK_TEST false}} +{{$ENABLE_NETWORKPOLICIES := DefaultParam .CL2_NETWORK_TEST false}} +{{$ENABLE_NETWORK_POLICY_ENFORCEMENT_LATENCY_TEST := DefaultParam .CL2_ENABLE_NETWORK_POLICY_ENFORCEMENT_LATENCY_TEST false}} +{{$NET_POLICY_ENFORCEMENT_LATENCY_TARGET_LABEL_KEY := DefaultParam .CL2_NET_POLICY_ENFORCEMENT_LATENCY_TARGET_LABEL_KEY "net-pol-test"}} +{{$NET_POLICY_ENFORCEMENT_LATENCY_TARGET_LABEL_VALUE := DefaultParam .CL2_NET_POLICY_ENFORCEMENT_LATENCY_TARGET_LABEL_VALUE "enforcement-latency"}} +{{$NET_POLICY_SERVER_EVERY_NTH_POD := DefaultParam .CL2_NET_POLICY_SERVER_EVERY_NTH_POD 3}} + steps: - name: Starting measurement for '{{$actionName}}' measurements: @@ -32,6 +39,7 @@ steps: - name: {{$actionName}} phases: +{{if not $NETWORK_TEST}} - namespaceRange: min: 1 max: {{$namespaces}} @@ -45,6 +53,7 @@ steps: SvcName: big-service Group: {{.Group}} deploymentLabel: {{.deploymentLabel}} +{{end}} - namespaceRange: min: 1 max: {{$namespaces}} @@ -55,6 +64,10 @@ steps: objectTemplatePath: deployment_template.yaml templateFillMap: Replicas: {{$smallDeploymentSize}} + EnableNetworkPolicyEnforcementLatencyTest: {{$NETWORK_TEST}} + TargetLabelKey: {{$NET_POLICY_ENFORCEMENT_LATENCY_TARGET_LABEL_KEY}} + TargetLabelValue: {{$NET_POLICY_ENFORCEMENT_LATENCY_TARGET_LABEL_VALUE}} + NetPolServerOnEveryNthPod: {{$NET_POLICY_SERVER_EVERY_NTH_POD}} SvcName: small-service Group: {{.Group}} deploymentLabel: {{.deploymentLabel}} diff --git a/modules/python/clusterloader2/slo/slo.py b/modules/python/clusterloader2/slo/slo.py index 473226603..41d0bbaa1 100644 --- a/modules/python/clusterloader2/slo/slo.py +++ b/modules/python/clusterloader2/slo/slo.py @@ -7,9 +7,6 @@ from utils import parse_xml_to_json, run_cl2_command, get_measurement from kubernetes_client import KubernetesClient -DEFAULT_PODS_PER_NODE = 50 -LOAD_PODS_PER_NODE = 20 - DEFAULT_NODES_PER_NAMESPACE = 100 CPU_REQUEST_LIMIT_MILLI = 1 DAEMONSETS_PER_NODE = { @@ -24,13 +21,9 @@ } # TODO: Remove aks once CL2 update provider name to be azure -def calculate_config(cpu_per_node, node_count, provider, service_test): +def calculate_config(cpu_per_node, node_per_step, pods_per_node, provider): throughput = 100 - nodes_per_namespace = min(node_count, DEFAULT_NODES_PER_NAMESPACE) - - pods_per_node = DEFAULT_PODS_PER_NODE - if service_test: - pods_per_node = LOAD_PODS_PER_NODE + nodes_per_namespace = min(node_per_step, DEFAULT_NODES_PER_NAMESPACE) # Different cloud has different reserved values and number of daemonsets # Using the same percentage will lead to incorrect nodes number as the number of nodes grow @@ -40,33 +33,39 @@ def calculate_config(cpu_per_node, node_count, provider, service_test): cpu_request = (cpu_per_node * 1000 * capacity) // pods_per_node cpu_request = max(cpu_request, CPU_REQUEST_LIMIT_MILLI) - return throughput, nodes_per_namespace, pods_per_node, cpu_request + return throughput, nodes_per_namespace, cpu_request def configure_clusterloader2( cpu_per_node, node_count, node_per_step, max_pods, + pods_per_node, repeats, operation_timeout, + no_of_namespaces, + total_network_policies, provider, cilium_enabled, service_test, + network_test, override_file): steps = node_count // node_per_step - throughput, nodes_per_namespace, pods_per_node, cpu_request = calculate_config(cpu_per_node, node_per_step, provider, service_test) + throughput, nodes_per_namespace, cpu_request = calculate_config(cpu_per_node, node_per_step, pods_per_node, provider) with open(override_file, 'w') as file: file.write(f"CL2_LOAD_TEST_THROUGHPUT: {throughput}\n") file.write(f"CL2_NODES_PER_NAMESPACE: {nodes_per_namespace}\n") file.write(f"CL2_NODES_PER_STEP: {node_per_step}\n") + file.write(f"CL2_NODES: {node_count}\n") file.write(f"CL2_PODS_PER_NODE: {pods_per_node}\n") file.write(f"CL2_DEPLOYMENT_SIZE: {pods_per_node}\n") file.write(f"CL2_LATENCY_POD_CPU: {cpu_request}\n") file.write(f"CL2_REPEATS: {repeats}\n") file.write(f"CL2_STEPS: {steps}\n") file.write(f"CL2_OPERATION_TIMEOUT: {operation_timeout}\n") + file.write(f"CL2_NO_OF_NAMESPACES: {no_of_namespaces}\n") file.write("CL2_PROMETHEUS_TOLERATE_MASTER: true\n") file.write("CL2_PROMETHEUS_MEMORY_LIMIT_FACTOR: 30.0\n") file.write("CL2_PROMETHEUS_MEMORY_SCALE_FACTOR: 30.0\n") @@ -82,6 +81,27 @@ def configure_clusterloader2( if service_test: file.write("CL2_SERVICE_TEST: true\n") + if network_test: + file.write("CL2_NETWORK_TEST: true\n") + file.write("CL2_ENABLE_NETWORK_POLICY_ENFORCEMENT_LATENCY_TEST: true\n") + file.write("CL2_ENABLE_VIOLATIONS_FOR_API_CALL_PROMETHEUS_SIMPLE: true\n") + file.write("CL2_PROMETHEUS_SCRAPE_KUBE_PROXY: true\n") + file.write("CL2_NETWORK_PROGRAMMING_LATENCY_THRESHOLD: 30s\n") + file.write("CL2_ENABLE_VIOLATIONS_FOR_NETWORK_PROGRAMMING_LATENCIES: false\n") + file.write("CL2_NETWORK_LATENCY_THRESHOLD: 0s\n") + file.write("CL2_PROBE_MEASUREMENTS_PING_SLEEP_DURATION: 1s\n") + file.write("CL2_ENABLE_IN_CLUSTER_NETWORK_LATENCY: true\n") + file.write("CL2_PROBE_MEASUREMENTS_CHECK_PROBES_READY_TIMEOUT: 15m\n") + file.write("CL2_NETWORK_POLICY_ENFORCEMENT_LATENCY_BASELINE: false\n") + file.write("CL2_NET_POLICY_ENFORCEMENT_LATENCY_TARGET_LABEL_KEY: net-pol-test\n") + file.write("CL2_NET_POLICY_ENFORCEMENT_LATENCY_TARGET_LABEL_VALUE: enforcement-latency\n") + file.write("CL2_NET_POLICY_ENFORCEMENT_LATENCY_NODE_LABEL_KEY: test\n") + file.write("CL2_NET_POLICY_ENFORCEMENT_LATENCY_NODE_LABEL_VALUE: net-policy-client\n") + file.write("CL2_NET_POLICY_ENFORCEMENT_LATENCY_MAX_TARGET_PODS_PER_NS: 10\n") + file.write(f"CL2_NET_POLICY_ENFORCEMENT_LOAD_COUNT: {total_network_policies}\n") + file.write("CL2_NET_POLICY_ENFORCEMENT_LOAD_QPS: 10\n") + file.write("CL2_POLICY_ENFORCEMENT_LOAD_TARGET_NAME: small-deployment\n") + with open(override_file, 'r') as file: print(f"Content of file {override_file}:\n{file.read()}") @@ -109,12 +129,14 @@ def collect_clusterloader2( cpu_per_node, node_count, max_pods, + pods_per_node, repeats, cl2_report_dir, cloud_info, run_id, run_url, service_test, + network_test, result_file, test_type="default_config", ): @@ -128,9 +150,9 @@ def collect_clusterloader2( else: raise Exception(f"No testsuites found in the report! Raw data: {details}") - _, _, pods_per_node, _ = calculate_config(cpu_per_node, node_count, provider, service_test) pod_count = node_count * pods_per_node + # TODO: Expose optional parameter to include test details template = { "timestamp": datetime.now(timezone.utc).strftime('%Y-%m-%dT%H:%M:%SZ'), "cpu_per_node": cpu_per_node, @@ -141,7 +163,7 @@ def collect_clusterloader2( "group": None, "measurement": None, "result": None, - "test_details": details, + # "test_details": details, "cloud_info": cloud_info, "run_id": run_id, "run_url": run_url, @@ -191,13 +213,18 @@ def main(): parser_configure.add_argument("node_count", type=int, help="Number of nodes") parser_configure.add_argument("node_per_step", type=int, help="Number of nodes per scaling step") parser_configure.add_argument("max_pods", type=int, help="Maximum number of pods per node") + parser_configure.add_argument("pods_per_node", type=int, help="Number of pods per node") parser_configure.add_argument("repeats", type=int, help="Number of times to repeat the deployment churn") parser_configure.add_argument("operation_timeout", type=str, help="Timeout before failing the scale up test") + parser_configure.add_argument("no_of_namespaces", type=int, default=1, help="Number of namespaces to create") + parser_configure.add_argument("total_network_policies", type=int, help="Total number of network policies to create", default=1000) parser_configure.add_argument("provider", type=str, help="Cloud provider name") parser_configure.add_argument("cilium_enabled", type=eval, choices=[True, False], default=False, help="Whether cilium is enabled. Must be either True or False") parser_configure.add_argument("service_test", type=eval, choices=[True, False], default=False, help="Whether service test is running. Must be either True or False") + parser_configure.add_argument("network_test", type=eval, choices=[True, False], default=False, + help="Whether network test is running. Must be either True or False") parser_configure.add_argument("cl2_override_file", type=str, help="Path to the overrides of CL2 config file") # Sub-command for validate_clusterloader2 @@ -219,6 +246,7 @@ def main(): parser_collect.add_argument("cpu_per_node", type=int, help="CPU per node") parser_collect.add_argument("node_count", type=int, help="Number of nodes") parser_collect.add_argument("max_pods", type=int, help="Maximum number of pods per node") + parser_collect.add_argument("pods_per_node", type=int, help="Number of pods per node") parser_collect.add_argument("repeats", type=int, help="Number of times to repeat the deployment churn") parser_collect.add_argument("cl2_report_dir", type=str, help="Path to the CL2 report directory") parser_collect.add_argument("cloud_info", type=str, help="Cloud information") @@ -226,6 +254,8 @@ def main(): parser_collect.add_argument("run_url", type=str, help="Run URL") parser_collect.add_argument("service_test", type=eval, choices=[True, False], default=False, help="Whether service test is running. Must be either True or False") + parser_collect.add_argument("network_test", type=eval, choices=[True, False], default=False, + help="Whether network test is running. Must be either True or False") parser_collect.add_argument("result_file", type=str, help="Path to the result file") parser_collect.add_argument("test_type", type=str, nargs='?', default="default-config", help="Description of test type") @@ -233,18 +263,19 @@ def main(): args = parser.parse_args() if args.command == "configure": - configure_clusterloader2(args.cpu_per_node, args.node_count, args.node_per_step, args.max_pods, - args.repeats, args.operation_timeout, args.provider, args.cilium_enabled, - args.service_test, args.cl2_override_file) + configure_clusterloader2(args.cpu_per_node, args.node_count, args.node_per_step, args.max_pods, + args.pods_per_node, args.repeats, args.operation_timeout, args.no_of_namespaces, + args.total_network_policies, args.provider, + args.cilium_enabled, args.service_test, args.network_test, args.cl2_override_file) elif args.command == "validate": validate_clusterloader2(args.node_count, args.operation_timeout) elif args.command == "execute": execute_clusterloader2(args.cl2_image, args.cl2_config_dir, args.cl2_report_dir, args.cl2_config_file, args.kubeconfig, args.provider) elif args.command == "collect": - collect_clusterloader2(args.cpu_per_node, args.node_count, args.max_pods, args.repeats, - args.cl2_report_dir, args.cloud_info, args.run_id, args.run_url, - args.service_test, args.result_file, args.test_type) + collect_clusterloader2(args.cpu_per_node, args.node_count, args.max_pods, args.pods_per_node, + args.repeats, args.cl2_report_dir, args.cloud_info, args.run_id, args.run_url, + args.service_test, args.network_test, args.result_file, args.test_type) if __name__ == "__main__": main() \ No newline at end of file diff --git a/modules/python/csi/csi.py b/modules/python/csi/csi.py new file mode 100644 index 000000000..b6cefc5dd --- /dev/null +++ b/modules/python/csi/csi.py @@ -0,0 +1,269 @@ +import time +import argparse +import os +import json +from datetime import datetime, timezone +from concurrent.futures import ThreadPoolExecutor, as_completed +from clusterloader2.kubernetes_client import KubernetesClient, client + +KUBERNETERS_CLIENT=KubernetesClient() + +# TODO: Move to utils folder later to be shared with other modules +def validate_node_count(node_label, node_count, operation_timeout_in_minutes): + kube_client = KubernetesClient() + ready_node_count = 0 + timeout = time.time() + (operation_timeout_in_minutes * 60) + print(f"Validating {node_count} nodes with label {node_label} are ready.") + while time.time() < timeout: + ready_nodes = kube_client.get_ready_nodes(label_selector=node_label) + ready_node_count = len(ready_nodes) + print(f"Currently {ready_node_count} nodes are ready.") + if ready_node_count == node_count: + break + print(f"Waiting for {node_count} nodes to be ready.") + time.sleep(10) + if ready_node_count != node_count: + raise Exception(f"Only {ready_node_count} nodes are ready, expected {node_count} nodes!") + +def calculate_percentiles(disk_number): + """Calculate percentile values for pods.""" + p50 = disk_number // 2 + p90 = disk_number * 9 // 10 + p99 = disk_number * 99 // 100 + return p50, p90, p99, disk_number + +def create_statefulset(namespace, replicas, storage_class): + """Create a StatefulSet dynamically.""" + statefulset = client.V1StatefulSet( + api_version="apps/v1", + kind="StatefulSet", + metadata=client.V1ObjectMeta(name="statefulset-local"), + spec=client.V1StatefulSetSpec( + pod_management_policy="Parallel", # Default is OrderedReady + replicas=replicas, + selector=client.V1LabelSelector(match_labels={"app": "nginx"}), + service_name="statefulset-local", + template=client.V1PodTemplateSpec( + metadata=client.V1ObjectMeta(labels={"app": "nginx"}), + spec=client.V1PodSpec( + node_selector={"kubernetes.io/os": "linux"}, + containers=[ + client.V1Container( + name="statefulset-local", + image="mcr.microsoft.com/oss/nginx/nginx:1.19.5", + command=[ + "/bin/bash", + "-c", + "set -euo pipefail; while true; do echo $(date) >> /mnt/local/outfile; sleep 1; done", + ], + volume_mounts=[ + client.V1VolumeMount(name="persistent-storage", mount_path="/mnt/local") + ], + ) + ], + ), + ), + volume_claim_templates=[ + client.V1PersistentVolumeClaimTemplate( + metadata=client.V1ObjectMeta( + name="persistent-storage", + annotations={"volume.beta.kubernetes.io/storage-class": storage_class}, + ), + spec=client.V1PersistentVolumeClaimSpec( + access_modes=["ReadWriteOnce"], + resources=client.V1ResourceRequirements(requests={"storage": "1Gi"}), + ), + ) + ], + ), + ) + app_client = KUBERNETERS_CLIENT.get_app_client() + ss = app_client.create_namespaced_stateful_set(namespace, statefulset) + return ss + +def log_duration(description, start_time, log_file): + """Log the time duration of an operation.""" + end_time = datetime.now() + duration = int((end_time - start_time).total_seconds()) + if ":" in description: + raise Exception(f"Description cannot contain a colon ':' character!") + with open(log_file, "a") as f: + f.write(f"{description}: {duration}\n") + print(f"{description}: {duration}s") + +def wait_for_condition(check_function, target, comparison="gte", interval=1): + """ + Wait for a condition using a given check function. + The check function should return a list of items. + The condition is satisfied when the length of the list meets the target. + """ + while True: + current_list = check_function() + current = len(current_list) + print(f"Current: {current}, Target: {target}") + if (comparison == "gte" and current >= target) or (comparison == "lte" and current <= target): + return current + time.sleep(interval) + +def monitor_thresholds(description, monitor_function, thresholds, comparison, start_time, log_file): + """Monitor thresholds and log their completion.""" + for target, threshold_desc in thresholds: + wait_for_condition(monitor_function, target, comparison) + log_duration(f"{description} {threshold_desc}", start_time, log_file) + +def execute_attach_detach(disk_number, storage_class, wait_time, result_dir): + """Execute the attach detach test.""" + print(f"Starting running test with {disk_number} disks and {storage_class} storage class") + + # Create the result directory and log file + if not os.path.exists(result_dir): + os.mkdir(result_dir) + log_file = os.path.join(result_dir, f"attachdetach-{disk_number}.txt") + + namespace = f"test-{time.time_ns()}" + + p50, p90, p99, p100 = calculate_percentiles(disk_number) + print(f"Percentiles: p50={p50}, p90={p90}, p99={p99}, p100={p100}") + attach_thresholds = [(p50, "p50"), (p90, "p90"), (p99, "p99"), (p100, "p100")] + detach_thresholds = [(p100 - p50, "p50"), (p100 - p90, "p90"), (p100 - p99, "p99"), (0, "p100")] + + # Create a namespace + ns = KUBERNETERS_CLIENT.create_namespace(namespace) + print(f"Created namespace {ns.metadata.name}") + + # Start the timer + creation_start_time = datetime.now() + + # Create StatefulSet + ss = create_statefulset(namespace, disk_number, storage_class) + print(f"Created StatefulSet {ss.metadata.name}") + + # Measure PVC creation and attachment + with ThreadPoolExecutor(max_workers=2) as executor: + futures = [] + futures.append( + executor.submit( + monitor_thresholds, + "PV creation", + lambda: KUBERNETERS_CLIENT.get_bound_persistent_volume_claims_by_namespace(namespace), + attach_thresholds, + "gte", + creation_start_time, + log_file + ) + ) + futures.append( + executor.submit( + monitor_thresholds, + "PV attachment", + lambda: KUBERNETERS_CLIENT.get_running_pods_by_namespace(namespace), + attach_thresholds, + "gte", + creation_start_time, + log_file + ) + ) + + # Wait for all threads to complete + for future in as_completed(futures): + future.result() # Blocks until the thread finishes execution + + print(f"Measuring creation and attachment of PVCs completed! Waiting for {wait_time} seconds before starting deletion.") + time.sleep(wait_time) + + # Start the timer + deletion_start_time = datetime.now() + + # Delete StatefulSet + KUBERNETERS_CLIENT.app.delete_namespaced_stateful_set(ss.metadata.name, namespace) + KUBERNETERS_CLIENT.delete_persistent_volume_claim_by_namespace(namespace) + + # Measure PVC detachment + with ThreadPoolExecutor(max_workers=2) as executor: + future = executor.submit( + monitor_thresholds, + "PV detachment", + lambda: KUBERNETERS_CLIENT.get_attached_volume_attachments(), + detach_thresholds, + "lte", + deletion_start_time, + log_file + ) + future.result() + + KUBERNETERS_CLIENT.delete_namespace(namespace) + print("Measuring detachment of PVCs completed.") + +def collect_attach_detach(case_name, node_number, disk_number, storage_class, cloud_info, run_id, run_url, result_dir): + raw_result_file = os.path.join(result_dir, f"attachdetach-{disk_number}.txt") + result_file = os.path.join(result_dir, "results.json") + print(f"Collecting attach detach test results from {raw_result_file} into {result_file}") + + with open(raw_result_file, 'r') as file: + content = file.read() + print(content) + + # Parse metrics from the result file + metrics = {} + for line in content.splitlines(): + if ':' in line: # Only process lines with key-value pairs + key, value = map(str.strip, line.split(':', 1)) + metrics[key.replace(' ', '_')] = value + + print(f"Parsed metrics: {metrics}") + + content = { + "timestamp": datetime.now(timezone.utc).strftime('%Y-%m-%dT%H:%M:%SZ'), + "case_name": case_name, + "node_number": node_number, + "disk_number": disk_number, + "storage_class": storage_class, + "result": metrics, + "cloud_info": cloud_info, + "run_id": run_id, + "run_url": run_url + } + + os.makedirs(os.path.dirname(result_file), exist_ok=True) + with open(result_file, 'w') as f: + f.write(json.dumps(content)) + +def main(): + parser = argparse.ArgumentParser(description="CSI Benchmark.") + subparsers = parser.add_subparsers(dest="command") + + # Sub-command for validate + parser_validate = subparsers.add_parser("validate", help="Validate node count") + parser_validate.add_argument("node_label", type=str, help="Node label selector") + parser_validate.add_argument("node_count", type=int, help="Number of nodes") + parser_validate.add_argument("operation_timeout", type=int, help="Timeout for the operation in seconds") + + # Sub-command for execute_attach_detach + parser_execute = subparsers.add_parser("execute", help="Execute attach detach test") + parser_execute.add_argument("disk_number", type=int, help="Disk number") + parser_execute.add_argument("storage_class", type=str, help="Storage class") + parser_execute.add_argument("wait_time", type=int, help="Wait time before deletion") + parser_execute.add_argument("result_dir", type=str, help="Result directory") + + # Sub-command for collect_attach_detach + parser_collect = subparsers.add_parser("collect", help="Collect attach detach test results") + parser_collect.add_argument("case_name", type=str, help="Case name") + parser_collect.add_argument("node_number", type=int, help="Node number") + parser_collect.add_argument("disk_number", type=int, help="Disk number") + parser_collect.add_argument("storage_class", type=str, help="Storage class") + parser_collect.add_argument("cloud_info", type=str, help="Cloud info") + parser_collect.add_argument("run_id", type=str, help="Run ID") + parser_collect.add_argument("run_url", type=str, help="Run URL") + parser_collect.add_argument("result_dir", type=str, help="Result directory") + + args = parser.parse_args() + if args.command == "validate": + validate_node_count(args.node_label, args.node_count, args.operation_timeout) + elif args.command == "execute": + execute_attach_detach(args.disk_number, args.storage_class, args.wait_time, args.result_dir) + elif args.command == "collect": + collect_attach_detach(args.case_name, args.node_number, args.disk_number, args.storage_class, + args.cloud_info, args.run_id, args.run_url, args.result_dir) + +if __name__ == "__main__": + main() diff --git a/modules/python/tests/test_csi.py b/modules/python/tests/test_csi.py new file mode 100644 index 000000000..2a67b9760 --- /dev/null +++ b/modules/python/tests/test_csi.py @@ -0,0 +1,267 @@ +import unittest +import json +from datetime import datetime, timedelta, timezone +from unittest.mock import MagicMock, patch, mock_open +from kubernetes.client.models import ( + V1StatefulSet, + V1ObjectMeta, + V1StatefulSetSpec, + V1LabelSelector, + V1PodTemplateSpec, + V1PodSpec, + V1Container, + V1VolumeMount, + V1PersistentVolumeClaimTemplate, + V1PersistentVolumeClaimSpec, + V1ResourceRequirements +) + +with patch("clusterloader2.kubernetes_client.config.load_kube_config") as mock_load_kube_config: + # Mock the load_kube_config function to do nothing + mock_load_kube_config.return_value = None + + # Now import the module where the global KUBERNETERS_CLIENT is defined + from csi.csi import ( + wait_for_condition, calculate_percentiles, log_duration, + create_statefulset, collect_attach_detach + ) + +class TestCSI(unittest.TestCase): + + def test_calculate_percentiles(self): + disk_numbers = [300, 1000] + expected_percentiles = { + 300: [150, 270, 297, 300], + 1000: [500, 900, 990, 1000] + } + for disk_number in disk_numbers: + p50, p90, p99, p100 = calculate_percentiles(disk_number) + self.assertEqual(p50, expected_percentiles[disk_number][0]) + self.assertEqual(p90, expected_percentiles[disk_number][1]) + self.assertEqual(p99, expected_percentiles[disk_number][2]) + self.assertEqual(p100, expected_percentiles[disk_number][3]) + + @patch("builtins.open", new_callable=mock_open) + @patch("csi.csi.datetime") + def test_log_duration_success(self, mock_datetime, mock_open_file): + duration = 200 + # Mock start_time and end_time + mock_start_time = datetime(2024, 1, 1, 12, 0, 0) # Fixed start time + mock_end_time = mock_start_time + timedelta(seconds=duration) # Fixed end time + + # Mock datetime.now to return the end_time + mock_datetime.now.return_value = mock_end_time + + # Call the function + log_file = "log.txt" + description = "PV creation p99" + log_duration(description, mock_start_time, log_file) + + # Verify file write + mock_open_file.assert_called_once_with(log_file, "a") + mock_open_file().write.assert_called_once_with(f"{description}: {duration}\n") + + # Verify print output + with patch("builtins.print") as mock_print: + log_duration(description, mock_start_time, log_file) + mock_print.assert_called_with(f"{description}: {duration}s") + + def test_log_duration_failure(self): + # Test that an exception is raised when the description contains ":" + start_time = datetime.now() + log_file = "log.txt" + description = "Invalid:Description" + + with self.assertRaises(Exception) as context: + log_duration(description, start_time, log_file) + + self.assertEqual( + str(context.exception), "Description cannot contain a colon ':' character!" + ) + + def test_wait_for_condition_met_immediately(self): + check_function = MagicMock(return_value=[f"disk-{i}" for i in range(3)]) + result = wait_for_condition(check_function, 3, "gte", 1) + self.assertEqual(result, 3) + check_function.assert_called_once() + + check_function.reset_mock() + check_function.return_value = [f"disk-{i}" for i in range(4)] + result = wait_for_condition(check_function, 5, "lte", 1) + self.assertEqual(result, 4) + check_function.assert_called_once() + + def test_wait_for_condition_met_after_iterations(self): + check_function = MagicMock(side_effect=[ + [f"disk-{i}" for i in range(1)], + [f"disk-{i}" for i in range(2)], + [f"disk-{i}" for i in range(3)] + ]) + result = wait_for_condition(check_function, 3, "gte", 1) + self.assertEqual(result, 3) + self.assertEqual(check_function.call_count, 3) + + check_function.reset_mock() + check_function.side_effect = [ + [f"disk-{i}" for i in range(7)], + [f"disk-{i}" for i in range(6)], + [f"disk-{i}" for i in range(5)] + ] + result = wait_for_condition(check_function, 5, "lte", 1) + self.assertEqual(result, 5) + self.assertEqual(check_function.call_count, 3) + + @patch("clusterloader2.kubernetes_client.KubernetesClient.get_app_client") + def test_create_statefulset_success(self, mock_get_app_client): + namespace = "test" + replicas = 10 + storage_class = "default" + stateful_set = V1StatefulSet( + api_version="apps/v1", + kind="StatefulSet", + metadata=V1ObjectMeta(name="statefulset-local"), + spec=V1StatefulSetSpec( + pod_management_policy="Parallel", + replicas=replicas, + selector=V1LabelSelector(match_labels={"app": "nginx"}), + service_name="statefulset-local", + template=V1PodTemplateSpec( + metadata=V1ObjectMeta(labels={"app": "nginx"}), + spec=V1PodSpec( + node_selector={"kubernetes.io/os": "linux"}, + containers=[ + V1Container( + name="statefulset-local", + image="mcr.microsoft.com/oss/nginx/nginx:1.19.5", + command=[ + "/bin/bash", + "-c", + "set -euo pipefail; while true; do echo $(date) >> /mnt/local/outfile; sleep 1; done", + ], + volume_mounts=[ + V1VolumeMount( + name="persistent-storage", mount_path="/mnt/local" + ) + ], + ) + ], + ), + ), + volume_claim_templates=[ + V1PersistentVolumeClaimTemplate( + metadata=V1ObjectMeta( + name="persistent-storage", + annotations={"volume.beta.kubernetes.io/storage-class": storage_class}, + ), + spec=V1PersistentVolumeClaimSpec( + access_modes=["ReadWriteOnce"], + resources=V1ResourceRequirements(requests={"storage": "1Gi"}), + ), + ) + ], + ), + ) + + mock_app_client = MagicMock() + mock_get_app_client.return_value = mock_app_client + mock_app_client.create_namespaced_stateful_set.return_value = stateful_set + + ss = create_statefulset(namespace, replicas, storage_class) + + mock_get_app_client.assert_called_once() + mock_app_client.create_namespaced_stateful_set.assert_called_once_with( + namespace, stateful_set + ) + self.assertEqual(ss, stateful_set) + + @patch("builtins.open", new_callable=mock_open) + @patch("os.makedirs") + @patch("os.path.join") + @patch("csi.csi.datetime") + def test_collect_attach_detach_results( + self, mock_datetime, mock_path_join, mock_makedirs, mock_open_file + ): + result_dir = "result_dir" + case_name = "Standard_D16s_v3_1000pods_40nodes" + node_number = 40 + disk_number = 1000 + storage_class = "default" + cloud_info = {"cloud": "azure"} + run_id = "12345789" + run_url = "http://example.com/test-run" + + mock_path_join.side_effect = lambda *args: "/".join(args) + raw_result_file = f"{result_dir}/attachdetach-{disk_number}.txt" + result_file = f"{result_dir}/results.json" + + mock_timestamp = datetime(2024, 1, 1, 12, 0, 0, tzinfo=timezone.utc) + mock_datetime.now.return_value = mock_timestamp + + raw_file_content = """ +PV creation p50: 178 +PV creation p90: 240 +PV creation p99: 252 +PV creation p100: 253 +PV attachment p50: 261 +PV attachment p90: 347 +PV attachment p99: 359 +PV attachment p100: 367 +PV detachment p50: 211 +PV detachment p90: 281 +PV detachment p99: 401 +PV detachment p100: 412 +""" + mock_open_file().read.return_value = raw_file_content + + collect_attach_detach( + case_name, + node_number, + disk_number, + storage_class, + cloud_info, + run_id, + run_url, + result_dir + ) + + mock_makedirs.assert_called_once_with(result_dir, exist_ok=True) + + mock_open_file.assert_any_call(raw_result_file, "r") + mock_open_file().read.assert_called_once() + + expected_metrics = { + "PV_creation_p50": "178", + "PV_creation_p90": "240", + "PV_creation_p99": "252", + "PV_creation_p100": "253", + "PV_attachment_p50": "261", + "PV_attachment_p90": "347", + "PV_attachment_p99": "359", + "PV_attachment_p100": "367", + "PV_detachment_p50": "211", + "PV_detachment_p90": "281", + "PV_detachment_p99": "401", + "PV_detachment_p100": "412", + } + + mock_open_file.assert_any_call(result_file, "w") + written_content = mock_open_file().write.call_args[0][0] + written_json = json.loads(written_content) + + expected_content = { + "timestamp": mock_timestamp.strftime('%Y-%m-%dT%H:%M:%SZ'), + "case_name": case_name, + "node_number": node_number, + "disk_number": disk_number, + "storage_class": storage_class, + "result": expected_metrics, + "cloud_info": cloud_info, + "run_id": run_id, + "run_url": run_url, + } + + self.maxDiff = None + self.assertEqual(written_json, expected_content) + +if __name__ == '__main__': + unittest.main() \ No newline at end of file diff --git a/modules/python/tests/test_kubernetes_client.py b/modules/python/tests/test_kubernetes_client.py index 89d019367..023745a8f 100644 --- a/modules/python/tests/test_kubernetes_client.py +++ b/modules/python/tests/test_kubernetes_client.py @@ -1,7 +1,10 @@ import unittest from unittest.mock import patch from kubernetes.client.models import ( - V1Node, V1NodeStatus, V1NodeCondition, V1NodeSpec, V1ObjectMeta, V1Taint + V1Node, V1NodeStatus, V1NodeCondition, V1NodeSpec, V1ObjectMeta, V1Taint, + V1PersistentVolumeClaim, V1PersistentVolumeClaimStatus, + V1VolumeAttachment, V1VolumeAttachmentStatus, V1VolumeAttachmentSpec, V1VolumeAttachmentSource, + V1PodStatus, V1Pod, V1PodSpec, V1Namespace ) from clusterloader2.kubernetes_client import KubernetesClient @@ -54,6 +57,125 @@ def test_get_ready_nodes_with_network_unavailable(self, mock_get_nodes): self.assertCountEqual(ready_nodes, [node_ready_network_available, node_ready_no_network_condition, node_ready_taint_no_effect] ) + + def _create_namespace(self, name): + return V1Namespace(metadata=V1ObjectMeta(name=name)) + + def _create_pod(self, namespace, name, phase, labels=None): + return V1Pod( + metadata=V1ObjectMeta(name=name, namespace=namespace, labels=labels), + status=V1PodStatus(phase=phase), + spec=V1PodSpec(containers=[]) + ) + + def _create_pvc(self, name, namespace, phase): + return V1PersistentVolumeClaim( + metadata=V1ObjectMeta(name=name, namespace=namespace), + status=V1PersistentVolumeClaimStatus(phase=phase) + ) + + def _create_volume_attachment(self, name, namespace, phase, attacher, node_name): + return V1VolumeAttachment( + metadata=V1ObjectMeta(name=name, namespace=namespace), + spec=V1VolumeAttachmentSpec( + attacher=attacher, + node_name=node_name, + source=V1VolumeAttachmentSource(persistent_volume_name=name)), + status=V1VolumeAttachmentStatus(attached=phase) + ) + + @patch("kubernetes.client.CoreV1Api.create_namespace") + @patch("kubernetes.client.CoreV1Api.read_namespace") + def test_create_existing_namespace(self, mock_read_namespace, mock_create_namespace): + name = "test-namespace" + mock_namespace = self._create_namespace(name) + mock_read_namespace.return_value = mock_namespace + + ns = self.client.create_namespace(name) + self.assertEqual(ns.metadata.name, mock_read_namespace.return_value.metadata.name) + mock_create_namespace.assert_not_called() + + @patch('clusterloader2.kubernetes_client.KubernetesClient.create_namespace') + @patch('clusterloader2.kubernetes_client.KubernetesClient.delete_namespace') + def test_create_delete_namespace(self, mock_delete_namespace, mock_create_namespace): + name = "test-namespace" + mock_namespace = self._create_namespace(name) + mock_create_namespace.return_value = mock_namespace + + ns = self.client.create_namespace(name) + + self.assertEqual(ns.metadata.name, mock_create_namespace.return_value.metadata.name) + mock_create_namespace.assert_called_once_with(name) + + mock_delete_namespace.return_value = None + ns = self.client.delete_namespace(name) + self.assertEqual(mock_delete_namespace.return_value, ns) + mock_delete_namespace.assert_called_once_with(name) + + @patch('clusterloader2.kubernetes_client.KubernetesClient.get_pods_by_namespace') + def test_get_running_pods_by_namespace(self, mock_get_pods_by_namespace): + namespace = "test-namespace" + running_pods = 10 + pending_pods = 5 + labels = {"app": "nginx"} + + mock_get_pods_by_namespace.return_value = [ + self._create_pod(namespace=namespace, name=f"pod-{i}", phase="Running", labels=labels) for i in range(running_pods) + ] + mock_get_pods_by_namespace.return_value.extend( + [self._create_pod(namespace=namespace, name=f"pod-{i}", phase="Pending", labels=labels) for i in range(running_pods, pending_pods + running_pods)] + ) + + self.assertEqual(len(mock_get_pods_by_namespace.return_value), running_pods + pending_pods) + + expected_pods = [pod for pod in mock_get_pods_by_namespace.return_value if pod.status.phase == "Running"] + returned_pods = self.client.get_running_pods_by_namespace(namespace=namespace, label_selector="app=nginx") + + for pod in returned_pods: + self.assertEqual(pod.metadata.labels, labels) + self.assertEqual(pod.status.phase, "Running") + + mock_get_pods_by_namespace.assert_called_once_with(namespace=namespace, label_selector="app=nginx", field_selector=None) + self.assertCountEqual(returned_pods, expected_pods) + + @patch('clusterloader2.kubernetes_client.KubernetesClient.get_persistent_volume_claims_by_namespace') + def test_get_bound_persistent_volume_claims_by_namespace(self, mock_get_persistent_volume_claims_by_namespace): + namespace = "test-namespace" + bound_claims = 10 + pending_claims = 5 + mock_get_persistent_volume_claims_by_namespace.return_value = [ + self._create_pvc(name=f"pvc-{i}", namespace=namespace, phase="Bound") for i in range(bound_claims) + ] + mock_get_persistent_volume_claims_by_namespace.return_value.extend( + self._create_pvc(name=f"pvc-{i}", namespace=namespace, phase="Pending") for i in range(bound_claims, pending_claims + bound_claims)) + + self.assertEqual(len(mock_get_persistent_volume_claims_by_namespace.return_value), bound_claims + pending_claims) + + expected_claims = [claim for claim in mock_get_persistent_volume_claims_by_namespace.return_value if claim.status.phase == "Bound"] + returned_claims = self.client.get_bound_persistent_volume_claims_by_namespace(namespace=namespace) + self.assertCountEqual(returned_claims, expected_claims) + mock_get_persistent_volume_claims_by_namespace.assert_called_once_with(namespace=namespace) + + @patch('clusterloader2.kubernetes_client.KubernetesClient.get_volume_attachments') + def test_get_attached_volume_attachments(self, mock_get_volume_attachments): + attached_attachments = 10 + detached_attachments = 5 + mock_get_volume_attachments.return_value = [ + self._create_volume_attachment( + name=f"attachment-{i}", namespace="test-namespace", phase=True, attacher="csi-driver", node_name="node-{i}" + ) for i in range(attached_attachments) + ] + mock_get_volume_attachments.return_value.extend( + self._create_volume_attachment( + name=f"attachment-{i}", namespace="test-namespace", phase=False, attacher="csi-driver", node_name="node-{i}" + ) for i in range(attached_attachments, detached_attachments + attached_attachments)) + + self.assertEqual(len(mock_get_volume_attachments.return_value), attached_attachments + detached_attachments) + + expected_volume_attachments = [attachment for attachment in mock_get_volume_attachments.return_value if attachment.status.attached] + returned_volume_attachments = self.client.get_attached_volume_attachments() + self.assertCountEqual(returned_volume_attachments, expected_volume_attachments) + mock_get_volume_attachments.assert_called_once() if __name__ == '__main__': unittest.main() \ No newline at end of file diff --git a/pipelines/perf-eval/API Server Benchmark/apiserver-benchmark-virtualnodes10-pods100-EZ.yml b/pipelines/perf-eval/API Server Benchmark/apiserver-benchmark-virtualnodes10-pods100-EZ.yml index ec0d6c1ee..ce0a58d03 100644 --- a/pipelines/perf-eval/API Server Benchmark/apiserver-benchmark-virtualnodes10-pods100-EZ.yml +++ b/pipelines/perf-eval/API Server Benchmark/apiserver-benchmark-virtualnodes10-pods100-EZ.yml @@ -32,7 +32,7 @@ stages: flowcontrol: "exempt:5" extra_benchmark_subcmd_args: "" engine_input: - runner_image: telescope.azurecr.io/oss/kperf:v0.1.3 + runner_image: ghcr.io/azure/kperf:0.1.6 benchmark_subcmd: node10_job1_pod100 benchmark_subcmd_args: "--total 1000" max_parallel: 1 diff --git a/pipelines/perf-eval/API Server Benchmark/apiserver-benchmark-virtualnodes10-pods100.yml b/pipelines/perf-eval/API Server Benchmark/apiserver-benchmark-virtualnodes10-pods100.yml index 7e5451e60..53e49f81f 100644 --- a/pipelines/perf-eval/API Server Benchmark/apiserver-benchmark-virtualnodes10-pods100.yml +++ b/pipelines/perf-eval/API Server Benchmark/apiserver-benchmark-virtualnodes10-pods100.yml @@ -33,7 +33,7 @@ stages: extra_benchmark_subcmd_args: "" disable_warmup: "true" engine_input: - runner_image: telescope.azurecr.io/oss/kperf:v0.1.5 + runner_image: ghcr.io/azure/kperf:0.1.5 benchmark_subcmd: node10_job1_pod100 benchmark_subcmd_args: "--total 1000" max_parallel: 2 @@ -60,7 +60,7 @@ stages: extra_benchmark_subcmd_args: "" sku_tier: Free engine_input: - runner_image: telescope.azurecr.io/oss/kperf:v0.1.5 + runner_image: ghcr.io/azure/kperf:0.1.5 benchmark_subcmd: node10_job1_pod100 benchmark_subcmd_args: "--total 1000" max_parallel: 2 @@ -87,7 +87,7 @@ stages: extra_benchmark_subcmd_args: "" sku_tier: Standard engine_input: - runner_image: telescope.azurecr.io/oss/kperf:v0.1.5 + runner_image: ghcr.io/azure/kperf:0.1.5 benchmark_subcmd: node10_job1_pod100 benchmark_subcmd_args: "--total 1000" max_parallel: 2 diff --git a/pipelines/perf-eval/API Server Benchmark/apiserver-benchmark-virtualnodes100-pods10k.yml b/pipelines/perf-eval/API Server Benchmark/apiserver-benchmark-virtualnodes100-pods10k.yml index 85db6cc2a..9981857b5 100644 --- a/pipelines/perf-eval/API Server Benchmark/apiserver-benchmark-virtualnodes100-pods10k.yml +++ b/pipelines/perf-eval/API Server Benchmark/apiserver-benchmark-virtualnodes100-pods10k.yml @@ -37,7 +37,7 @@ stages: flowcontrol: "exempt:5" extra_benchmark_subcmd_args: "--padding-bytes=16384" engine_input: - runner_image: telescope.azurecr.io/oss/kperf:v0.1.6 + runner_image: ghcr.io/azure/kperf:0.1.6 benchmark_subcmd: node100_pod10k benchmark_subcmd_args: "--total 72000 --deployments=10 --interval 24h --cpu 64 --memory 192" max_parallel: 2 @@ -70,7 +70,7 @@ stages: flowcontrol: "exempt:5" extra_benchmark_subcmd_args: "--padding-bytes=16384" engine_input: - runner_image: telescope.azurecr.io/oss/kperf:v0.1.6 + runner_image: ghcr.io/azure/kperf:0.1.6 benchmark_subcmd: node100_pod10k benchmark_subcmd_args: "--total 72000 --deployments=10 --interval 24h --cpu 64 --memory 192" max_parallel: 2 @@ -103,7 +103,7 @@ stages: flowcontrol: "exempt:5" extra_benchmark_subcmd_args: "--padding-bytes=16384" engine_input: - runner_image: telescope.azurecr.io/oss/kperf:v0.1.6 + runner_image: ghcr.io/azure/kperf:0.1.6 benchmark_subcmd: node100_pod10k benchmark_subcmd_args: "--total 72000 --deployments=10 --interval 24h --cpu 64 --memory 192 --content-type json" max_parallel: 2 @@ -136,7 +136,7 @@ stages: flowcontrol: "exempt:5" extra_benchmark_subcmd_args: "--padding-bytes=16384" engine_input: - runner_image: telescope.azurecr.io/oss/kperf:v0.1.6 + runner_image: ghcr.io/azure/kperf:0.1.6 benchmark_subcmd: node100_pod10k benchmark_subcmd_args: "--total 72000 --deployments=10 --interval 24h --cpu 64 --memory 192 --content-type protobuf" max_parallel: 2 diff --git a/pipelines/perf-eval/API Server Benchmark/apiserver-benchmark-virtualnodes100-pods3k.yml b/pipelines/perf-eval/API Server Benchmark/apiserver-benchmark-virtualnodes100-pods3k.yml index 91a414ffd..2240a79b4 100644 --- a/pipelines/perf-eval/API Server Benchmark/apiserver-benchmark-virtualnodes100-pods3k.yml +++ b/pipelines/perf-eval/API Server Benchmark/apiserver-benchmark-virtualnodes100-pods3k.yml @@ -31,7 +31,7 @@ stages: flowcontrol: "exempt:5" extra_benchmark_subcmd_args: "" engine_input: - runner_image: telescope.azurecr.io/oss/kperf:v0.1.5 + runner_image: ghcr.io/azure/kperf:0.1.5 benchmark_subcmd: node100_job1_pod3k benchmark_subcmd_args: "--total 36000" max_parallel: 2 @@ -56,7 +56,7 @@ stages: flowcontrol: "exempt:5" extra_benchmark_subcmd_args: "" engine_input: - runner_image: telescope.azurecr.io/oss/kperf:v0.1.5 + runner_image: ghcr.io/azure/kperf:0.1.5 benchmark_subcmd: node100_job1_pod3k benchmark_subcmd_args: "--total 36000" max_parallel: 2 diff --git a/pipelines/perf-eval/CNI Benchmark/cilium-ab-testing.yml b/pipelines/perf-eval/CNI Benchmark/cilium-ab-testing.yml new file mode 100644 index 000000000..0e485eb88 --- /dev/null +++ b/pipelines/perf-eval/CNI Benchmark/cilium-ab-testing.yml @@ -0,0 +1,37 @@ +trigger: none + +variables: + SCENARIO_TYPE: perf-eval + SCENARIO_NAME: ab-testing-cilium-parameters + SCENARIO_VERSION: main + +stages: + - stage: azure_eastus2 + dependsOn: [] + jobs: + - template: /jobs/competitive-test.yml + parameters: + cloud: azure + regions: + - eastus2 + engine: clusterloader2 + engine_input: + image: "ghcr.io/azure/clusterloader2:v20241022" + topology: cilium-usercluster + matrix: + azure_cilium: + cpu_per_node: 4 + node_count: 1000 + node_per_step: 1000 + max_pods: 110 + repeats: 1 + scale_timeout: "30m" + cilium_enabled: True + network_policy: cilium + network_dataplane: cilium + service_test: False + cl2_config_file: load-config.yaml + max_parallel: 1 + timeout_in_minutes: 720 + credential_type: service_connection + ssh_key_enabled: false diff --git a/pipelines/perf-eval/CNI Benchmark/cilium-cluster-churn-nodes-cilium-nodesubnet.yml b/pipelines/perf-eval/CNI Benchmark/cilium-cluster-churn-nodes-cilium-nodesubnet.yml new file mode 100644 index 000000000..e452d9697 --- /dev/null +++ b/pipelines/perf-eval/CNI Benchmark/cilium-cluster-churn-nodes-cilium-nodesubnet.yml @@ -0,0 +1,44 @@ +trigger: none +schedules: + - cron: "0 10,22 * * *" + displayName: "10:00 AM & PM Daily" + branches: + include: + - main + always: true + +variables: + SCENARIO_TYPE: perf-eval + SCENARIO_NAME: cilium-cluster-churn-cilium-nodesubnet + SCENARIO_VERSION: main + +stages: + - stage: azure_eastus2 + dependsOn: [] + jobs: + - template: /jobs/competitive-test.yml + parameters: + cloud: azure + regions: + - $(LOCATION) + engine: clusterloader2 + engine_input: + image: "ghcr.io/azure/clusterloader2:v20241022" + topology: cilium-usercluster-autoscale + matrix: + azure_cilium: + cpu_per_node: 4 + node_count: 1000 + node_per_step: 100 + max_pods: 110 + repeats: 1 + scale_timeout: "30m" + cilium_enabled: True + network_policy: cilium + network_dataplane: cilium + cl2_config_file: cluster-scale-config.yaml + service_test: False + max_parallel: 2 + timeout_in_minutes: 720 + credential_type: service_connection + ssh_key_enabled: false diff --git a/pipelines/perf-eval/CNI Benchmark/network-churn/cilium-network-churn.yml b/pipelines/perf-eval/CNI Benchmark/network-churn/cilium-network-churn.yml new file mode 100644 index 000000000..70d695e3d --- /dev/null +++ b/pipelines/perf-eval/CNI Benchmark/network-churn/cilium-network-churn.yml @@ -0,0 +1,96 @@ +trigger: none + +parameters: + - name: node_count + type: number + default: 100 + - name: node_per_step + type: number + default: 10 + - name: pods_per_node + type: number + default: 10 + - name: repeats + type: number + default: 1 + - name: scale_timeout + type: string + default: "15m" + - name: no_of_namespaces + type: number + default: 10 + - name: total_nework_policies + type: number + default: 1000 + +variables: + SCENARIO_TYPE: perf-eval + SCENARIO_NAME: cilium-network-churn + SCENARIO_VERSION: main + +stages: + - stage: azure_eastus2 + dependsOn: [] + jobs: + - template: /jobs/competitive-test.yml + parameters: + cloud: azure + regions: + - eastus2 + engine: clusterloader2 + engine_input: + image: "ghcr.io/azure/clusterloader2:v20241022" + topology: network-churn + matrix: + azure_cilium: + cpu_per_node: 4 + node_count: ${{ parameters.node_count }} + node_per_step: ${{ parameters.node_per_step }} + max_pods: 100 + pods_per_node: ${{ parameters.pods_per_node }} + repeats: ${{ parameters.repeats }} + scale_timeout: ${{ parameters.scale_timeout }} + no_of_namespaces: ${{ parameters.no_of_namespaces }} + total_network_policies: ${{ parameters.total_nework_policies }} + cilium_enabled: True + network_policy: cilium + network_dataplane: cilium + service_test: False + network_test: True + cl2_config_file: load-config.yaml + max_parallel: 2 + timeout_in_minutes: 720 + credential_type: service_connection + ssh_key_enabled: false + - stage: azure_npm_eastus2 + dependsOn: [] + jobs: + - template: /jobs/competitive-test.yml + parameters: + cloud: azure + regions: + - eastus2 + engine: clusterloader2 + engine_input: + image: "ghcr.io/azure/clusterloader2:v20241022" + topology: network-churn + matrix: + azure_cni: + cpu_per_node: 4 + node_count: ${{ parameters.node_count }} + node_per_step: ${{ parameters.node_per_step }} + max_pods: 100 + pods_per_node: ${{ parameters.pods_per_node }} + repeats: ${{ parameters.repeats }} + scale_timeout: ${{ parameters.scale_timeout }} + no_of_namespaces: ${{ parameters.no_of_namespaces }} + total_network_policies: ${{ parameters.total_nework_policies }} + cilium_enabled: False + service_test: False + network_test: True + cl2_config_file: load-config.yaml + max_parallel: 2 + timeout_in_minutes: 720 + credential_type: service_connection + ssh_key_enabled: false + use_secondary_cluster: true diff --git a/pipelines/perf-eval/CNI Benchmark/slo-servicediscovery-cilium-nodesubnet.yml b/pipelines/perf-eval/CNI Benchmark/slo-servicediscovery-cilium-nodesubnet.yml new file mode 100644 index 000000000..1d8d42390 --- /dev/null +++ b/pipelines/perf-eval/CNI Benchmark/slo-servicediscovery-cilium-nodesubnet.yml @@ -0,0 +1,44 @@ +trigger: none +schedules: + - cron: "0 4,16 * * *" + displayName: "4:00 AM & PM Daily" + branches: + include: + - main + always: true + +variables: + SCENARIO_TYPE: perf-eval + SCENARIO_NAME: slo-servicediscovery-cilium-nodesubnet + SCENARIO_VERSION: main + +stages: + - stage: azure_eastus2 + dependsOn: [] + jobs: + - template: /jobs/competitive-test.yml + parameters: + cloud: azure + regions: + - $(LOCATION) + engine: clusterloader2 + engine_input: + image: "ghcr.io/azure/clusterloader2:v20241022" + topology: cilium-usercluster + matrix: + azure_cilium: + cpu_per_node: 4 + node_count: 1000 + node_per_step: 1000 + max_pods: 110 + repeats: 10 + scale_timeout: "15m" + cilium_enabled: True + network_policy: cilium + network_dataplane: cilium + service_test: True + cl2_config_file: load-config.yaml + max_parallel: 2 + timeout_in_minutes: 720 + credential_type: service_connection + ssh_key_enabled: false diff --git a/pipelines/perf-eval/CNI Benchmark/slo-servicediscovery-feature.yml b/pipelines/perf-eval/CNI Benchmark/slo-servicediscovery-feature.yml index 32bda1064..870d2680c 100644 --- a/pipelines/perf-eval/CNI Benchmark/slo-servicediscovery-feature.yml +++ b/pipelines/perf-eval/CNI Benchmark/slo-servicediscovery-feature.yml @@ -2,7 +2,7 @@ trigger: none variables: SCENARIO_TYPE: perf-eval - SCENARIO_NAME: slo-servicediscovery-ces + SCENARIO_NAME: slo-servicediscovery-feature SCENARIO_VERSION: main stages: diff --git a/pipelines/perf-eval/CSI Benchmark/csi-attach-detach-1000.yml b/pipelines/perf-eval/CSI Benchmark/csi-attach-detach-1000.yml new file mode 100644 index 000000000..49dcbb826 --- /dev/null +++ b/pipelines/perf-eval/CSI Benchmark/csi-attach-detach-1000.yml @@ -0,0 +1,59 @@ +trigger: none +schedules: + - cron: "0 20 1-31/2 * *" + displayName: "Every Odd Day" + branches: + include: + - main + always: true + +variables: + SCENARIO_TYPE: perf-eval + SCENARIO_NAME: storage-attach-detach-1000 + SCENARIO_VERSION: main + +stages: + - stage: azure_eastus2 + dependsOn: [] + jobs: + - template: /jobs/competitive-test.yml + parameters: + cloud: azure + regions: + - eastus2 + topology: csi-attach-detach + engine: attach + engine_input: + disk_number: 1000 + storage_class: default + wait_time: 300 + matrix: + Standard_D16s_v3_1000pods_40nodes: + case_name: Standard_D16s_v3_1000pods_40nodes + node_number: 40 + max_parallel: 1 + timeout_in_minutes: 180 + credential_type: service_connection + ssh_key_enabled: false + - stage: aws_eastus2 + dependsOn: [] + jobs: + - template: /jobs/competitive-test.yml + parameters: + cloud: aws + regions: + - us-east-2 + topology: csi-attach-detach + engine: attach + engine_input: + disk_number: 1000 + storage_class: ebs-sc + wait_time: 300 + matrix: + m7i_2xlarge_1000pods_40nodes: + case_name: m7i_2xlarge_1000pods_40nodes + node_number: 40 + max_parallel: 1 + timeout_in_minutes: 180 + credential_type: service_connection + ssh_key_enabled: false diff --git a/pipelines/perf-eval/CSI Benchmark/csi-attach-detach-300.yml b/pipelines/perf-eval/CSI Benchmark/csi-attach-detach-300.yml new file mode 100644 index 000000000..a4a4799eb --- /dev/null +++ b/pipelines/perf-eval/CSI Benchmark/csi-attach-detach-300.yml @@ -0,0 +1,59 @@ +trigger: none +schedules: + - cron: "0 16 1-31/2 * *" + displayName: "Every Odd Day" + branches: + include: + - main + always: true + +variables: + SCENARIO_TYPE: perf-eval + SCENARIO_NAME: storage-attach-detach-300 + SCENARIO_VERSION: main + +stages: + - stage: azure_eastus2 + dependsOn: [] + jobs: + - template: /jobs/competitive-test.yml + parameters: + cloud: azure + regions: + - eastus2 + topology: csi-attach-detach + engine: attach + engine_input: + disk_number: 300 + storage_class: default + wait_time: 300 + matrix: + Standard_D2s_v3_300pods_300nodes: + case_name: Standard_D2s_v3_300pods_300nodes + node_number: 300 + max_parallel: 1 + timeout_in_minutes: 180 + credential_type: service_connection + ssh_key_enabled: false + - stage: aws_eastus2 + dependsOn: [] + jobs: + - template: /jobs/competitive-test.yml + parameters: + cloud: aws + regions: + - us-east-2 + topology: csi-attach-detach + engine: attach + engine_input: + disk_number: 300 + storage_class: ebs-sc + wait_time: 300 + matrix: + m7i_large_300pods_300nodes: + case_name: m7i_large_300pods_300nodes + node_number: 300 + max_parallel: 1 + timeout_in_minutes: 180 + credential_type: service_connection + ssh_key_enabled: false diff --git a/scenarios/perf-eval/ab-testing-cilium-parameters/terraform-inputs/azure.tfvars b/scenarios/perf-eval/ab-testing-cilium-parameters/terraform-inputs/azure.tfvars new file mode 100644 index 000000000..3584070af --- /dev/null +++ b/scenarios/perf-eval/ab-testing-cilium-parameters/terraform-inputs/azure.tfvars @@ -0,0 +1,4 @@ +scenario_type = "perf-eval" +scenario_name = "ab-testing-cilium-parameters" +deletion_delay = "12h" +owner = "aks" diff --git a/scenarios/perf-eval/ab-testing-cilium-parameters/terraform-test-inputs/azure.json b/scenarios/perf-eval/ab-testing-cilium-parameters/terraform-test-inputs/azure.json new file mode 100644 index 000000000..6609135b4 --- /dev/null +++ b/scenarios/perf-eval/ab-testing-cilium-parameters/terraform-test-inputs/azure.json @@ -0,0 +1,4 @@ +{ + "run_id": "123456789", + "region": "eastus2" +} \ No newline at end of file diff --git a/scenarios/perf-eval/storage-attach-detach-1000/kubernetes/storageclass.aws.yml b/scenarios/perf-eval/storage-attach-detach-1000/kubernetes/storageclass.aws.yml new file mode 100644 index 000000000..ee61caaf2 --- /dev/null +++ b/scenarios/perf-eval/storage-attach-detach-1000/kubernetes/storageclass.aws.yml @@ -0,0 +1,16 @@ +apiVersion: storage.k8s.io/v1 +kind: StorageClass +metadata: + name: $STORAGE_CLASS + annotations: + storageclass.kubernetes.io/is-default-class: "true" +allowVolumeExpansion: true +provisioner: ebs.csi.aws.com +volumeBindingMode: WaitForFirstConsumer +reclaimPolicy: Delete +parameters: + type: gp2 + tagSpecification_1: "owner=${OWNER}" + tagSpecification_2: "scenario=${SCENARIO}" + tagSpecification_3: "run_id=${RUN_ID}" + tagSpecification_4: "deletion_due_time=${DELETION_DUE_TIME}" diff --git a/scenarios/perf-eval/storage-attach-detach-1000/terraform-inputs/aws.tfvars b/scenarios/perf-eval/storage-attach-detach-1000/terraform-inputs/aws.tfvars new file mode 100644 index 000000000..07844c5f3 --- /dev/null +++ b/scenarios/perf-eval/storage-attach-detach-1000/terraform-inputs/aws.tfvars @@ -0,0 +1,82 @@ +scenario_type = "perf-eval" +scenario_name = "storage-attach-detach-1000" +deletion_delay = "6h" +owner = "aks" +network_config_list = [ + { + role = "client" + vpc_name = "client-vpc" + vpc_cidr_block = "10.0.0.0/16" + subnet = [ + { + name = "client-subnet" + cidr_block = "10.0.0.0/17" + zone_suffix = "a" + map_public_ip_on_launch = true + }, + { + name = "client-subnet-2" + cidr_block = "10.0.128.0/17" + zone_suffix = "b" + map_public_ip_on_launch = true + } + ] + security_group_name = "client-sg" + route_tables = [ + { + name = "internet-rt" + cidr_block = "0.0.0.0/0" + } + ], + route_table_associations = [ + { + name = "client-subnet-rt-assoc" + subnet_name = "client-subnet" + route_table_name = "internet-rt" + }, + { + name = "client-subnet-rt-assoc-2" + subnet_name = "client-subnet-2" + route_table_name = "internet-rt" + } + ] + sg_rules = { + ingress = [] + egress = [ + { + from_port = 0 + to_port = 0 + protocol = "-1" + cidr_block = "0.0.0.0/0" + } + ] + } + } +] + +eks_config_list = [{ + role = "client" + eks_name = "perfevala1000" + vpc_name = "client-vpc" + policy_arns = ["AmazonEKSClusterPolicy", "AmazonEKSVPCResourceController", "AmazonEKSWorkerNodePolicy", "AmazonEKS_CNI_Policy", "AmazonEC2ContainerRegistryReadOnly"] + eks_managed_node_groups = [ + { + name = "user" + ami_type = "AL2_x86_64" + instance_types = ["m7i.2xlarge"] + min_size = 40 + max_size = 40 + desired_size = 40 + labels = { "csi" = "true" } + } + ] + eks_addons = [ + { + name = "aws-ebs-csi-driver" + service_account = "ebs-csi-controller-sa" + policy_arns = ["service-role/AmazonEBSCSIDriverPolicy"] + } + ] + kubernetes_version = "1.30" +}] + diff --git a/scenarios/perf-eval/storage-attach-detach-1000/terraform-inputs/azure.tfvars b/scenarios/perf-eval/storage-attach-detach-1000/terraform-inputs/azure.tfvars new file mode 100644 index 000000000..56b0923f0 --- /dev/null +++ b/scenarios/perf-eval/storage-attach-detach-1000/terraform-inputs/azure.tfvars @@ -0,0 +1,35 @@ +scenario_type = "perf-eval" +scenario_name = "storage-attach-detach-1000" +deletion_delay = "6h" +owner = "aks" +aks_config_list = [ + { + role = "client" + aks_name = "perfevala1000" + dns_prefix = "attach" + sku_tier = "Free" + network_profile = { + network_plugin = "kubenet" + pod_cidr = "125.4.0.0/14" + } + default_node_pool = { + name = "default" + node_count = 3 + subnet_name = "aks-network" + vm_size = "Standard_D2s_v3" + os_disk_type = "Managed" + only_critical_addons_enabled = true + temporary_name_for_rotation = "defaulttmp" + } + extra_node_pool = [ + { + name = "user" + node_count = 40 + subnet_name = "aks-network" + vm_size = "Standard_D16s_v3" + node_labels = { "csi" = "true" } + } + ] + kubernetes_version = "1.30" + } +] diff --git a/scenarios/perf-eval/storage-attach-detach-1000/terraform-test-inputs/aws.json b/scenarios/perf-eval/storage-attach-detach-1000/terraform-test-inputs/aws.json new file mode 100644 index 000000000..cc6fdb472 --- /dev/null +++ b/scenarios/perf-eval/storage-attach-detach-1000/terraform-test-inputs/aws.json @@ -0,0 +1,4 @@ +{ + "run_id" : "123456789", + "region" : "us-east-1" +} \ No newline at end of file diff --git a/scenarios/perf-eval/storage-attach-detach-1000/terraform-test-inputs/azure.json b/scenarios/perf-eval/storage-attach-detach-1000/terraform-test-inputs/azure.json new file mode 100644 index 000000000..5d88ca96b --- /dev/null +++ b/scenarios/perf-eval/storage-attach-detach-1000/terraform-test-inputs/azure.json @@ -0,0 +1,5 @@ +{ + "run_id" : "123456789", + "region" : "eastus", + "accelerated_networking" : true +} \ No newline at end of file diff --git a/scenarios/perf-eval/storage-attach-detach-300/kubernetes/storageclass.aws.yml b/scenarios/perf-eval/storage-attach-detach-300/kubernetes/storageclass.aws.yml new file mode 100644 index 000000000..ee61caaf2 --- /dev/null +++ b/scenarios/perf-eval/storage-attach-detach-300/kubernetes/storageclass.aws.yml @@ -0,0 +1,16 @@ +apiVersion: storage.k8s.io/v1 +kind: StorageClass +metadata: + name: $STORAGE_CLASS + annotations: + storageclass.kubernetes.io/is-default-class: "true" +allowVolumeExpansion: true +provisioner: ebs.csi.aws.com +volumeBindingMode: WaitForFirstConsumer +reclaimPolicy: Delete +parameters: + type: gp2 + tagSpecification_1: "owner=${OWNER}" + tagSpecification_2: "scenario=${SCENARIO}" + tagSpecification_3: "run_id=${RUN_ID}" + tagSpecification_4: "deletion_due_time=${DELETION_DUE_TIME}" diff --git a/scenarios/perf-eval/storage-attach-detach-300/terraform-inputs/aws.tfvars b/scenarios/perf-eval/storage-attach-detach-300/terraform-inputs/aws.tfvars new file mode 100644 index 000000000..684bc29d9 --- /dev/null +++ b/scenarios/perf-eval/storage-attach-detach-300/terraform-inputs/aws.tfvars @@ -0,0 +1,81 @@ +scenario_type = "perf-eval" +scenario_name = "storage-attach-detach-300" +deletion_delay = "6h" +owner = "aks" +network_config_list = [ + { + role = "client" + vpc_name = "client-vpc" + vpc_cidr_block = "10.0.0.0/16" + subnet = [ + { + name = "client-subnet" + cidr_block = "10.0.0.0/17" + zone_suffix = "a" + map_public_ip_on_launch = true + }, + { + name = "client-subnet-2" + cidr_block = "10.0.128.0/17" + zone_suffix = "b" + map_public_ip_on_launch = true + } + ] + security_group_name = "client-sg" + route_tables = [ + { + name = "internet-rt" + cidr_block = "0.0.0.0/0" + } + ], + route_table_associations = [ + { + name = "client-subnet-rt-assoc" + subnet_name = "client-subnet" + route_table_name = "internet-rt" + }, + { + name = "client-subnet-rt-assoc-2" + subnet_name = "client-subnet-2" + route_table_name = "internet-rt" + } + ] + sg_rules = { + ingress = [] + egress = [ + { + from_port = 0 + to_port = 0 + protocol = "-1" + cidr_block = "0.0.0.0/0" + } + ] + } + } +] + +eks_config_list = [{ + role = "client" + eks_name = "perfevala300" + vpc_name = "client-vpc" + policy_arns = ["AmazonEKSClusterPolicy", "AmazonEKSVPCResourceController", "AmazonEKSWorkerNodePolicy", "AmazonEKS_CNI_Policy", "AmazonEC2ContainerRegistryReadOnly"] + eks_managed_node_groups = [ + { + name = "user" + ami_type = "AL2_x86_64" + instance_types = ["t2.large"] + min_size = 300 + max_size = 300 + desired_size = 300 + labels = { "csi" = "true" } + } + ] + eks_addons = [ + { + name = "aws-ebs-csi-driver" + service_account = "ebs-csi-controller-sa" + policy_arns = ["service-role/AmazonEBSCSIDriverPolicy"] + } + ] + kubernetes_version = "1.30" +}] diff --git a/scenarios/perf-eval/storage-attach-detach-300/terraform-inputs/azure.tfvars b/scenarios/perf-eval/storage-attach-detach-300/terraform-inputs/azure.tfvars new file mode 100644 index 000000000..d47b892aa --- /dev/null +++ b/scenarios/perf-eval/storage-attach-detach-300/terraform-inputs/azure.tfvars @@ -0,0 +1,49 @@ +scenario_type = "perf-eval" +scenario_name = "storage-attach-detach-300" +deletion_delay = "6h" +owner = "aks" +network_config_list = [ + { + role = "network" + vnet_name = "aks-network-vnet" + vnet_address_space = "125.0.0.0/8" + subnet = [{ + name = "aks-network-300" + address_prefix = "125.0.0.0/14" + }] + network_security_group_name = "aks-network-nsg" + nic_public_ip_associations = [] + nsr_rules = [] + } +] +aks_config_list = [ + { + role = "client" + aks_name = "perfevala300" + dns_prefix = "attach" + sku_tier = "Free" + network_profile = { + network_plugin = "kubenet" + pod_cidr = "125.4.0.0/14" + } + default_node_pool = { + name = "default" + subnet_name = "aks-network-300" + node_count = 3 + vm_size = "Standard_D2s_v3" + os_disk_type = "Managed" + only_critical_addons_enabled = true + temporary_name_for_rotation = "defaulttmp" + } + extra_node_pool = [ + { + name = "user" + subnet_name = "aks-network-300" + node_count = 300 + vm_size = "Standard_D2s_v3" + node_labels = { "csi" = "true" } + } + ] + kubernetes_version = "1.30" + } +] diff --git a/scenarios/perf-eval/storage-attach-detach-300/terraform-test-inputs/aws.json b/scenarios/perf-eval/storage-attach-detach-300/terraform-test-inputs/aws.json new file mode 100644 index 000000000..cc6fdb472 --- /dev/null +++ b/scenarios/perf-eval/storage-attach-detach-300/terraform-test-inputs/aws.json @@ -0,0 +1,4 @@ +{ + "run_id" : "123456789", + "region" : "us-east-1" +} \ No newline at end of file diff --git a/scenarios/perf-eval/storage-attach-detach-300/terraform-test-inputs/azure.json b/scenarios/perf-eval/storage-attach-detach-300/terraform-test-inputs/azure.json new file mode 100644 index 000000000..5d88ca96b --- /dev/null +++ b/scenarios/perf-eval/storage-attach-detach-300/terraform-test-inputs/azure.json @@ -0,0 +1,5 @@ +{ + "run_id" : "123456789", + "region" : "eastus", + "accelerated_networking" : true +} \ No newline at end of file diff --git a/steps/collect-telescope-metadata.yml b/steps/collect-telescope-metadata.yml index 0c787371f..2f370acdc 100644 --- a/steps/collect-telescope-metadata.yml +++ b/steps/collect-telescope-metadata.yml @@ -17,11 +17,13 @@ steps: --arg run_id $RUN_ID \ --arg run_url $RUN_URL \ --arg code_url $CODE_URL \ + --arg time_stamp "$(date -u +"%Y-%m-%dT%H:%M:%S.000Z")" \ --arg reason "$(Build.Reason)" \ --arg pipeline_branch "$(Build.SourceBranchName)" \ --arg requester "$(Build.RequestedFor)" \ --arg scenario_name "$SCENARIO_NAME" \ --arg scenario_type "$SCENARIO_TYPE" \ + --arg owner $OWNER \ --arg engine "$ENGINE" \ --arg topology "$TOPOLOGY" \ --arg engine_input "$(echo $ENGINE_INPUT | jq -r '.')" \ @@ -42,11 +44,13 @@ steps: pipeline_name: $pipeline_name, cron_schedule_display_name: $cron_schedule_display_name, project_name: $project_name, - code_url: $code_url + code_url: $code_url, + time_stamp: $time_stamp }, scenario_info: { scenario_name: $scenario_name, scenario_type: $scenario_type, + owner: $owner, engine: $engine, topology: $topology, engine_input: $engine_input @@ -69,12 +73,16 @@ steps: REGIONS: ${{ convertToJson(parameters.regions) }} - script: | - # Read the test results file and add the telescope metadata to it for each json record in the file + # Append run_id to the test results file if the file exists set -eux - jq --argfile telescope_metadata $(TELESCOPE_METADATA_FILE) \ - -c '. + {telescope_metadata: $telescope_metadata}' $(TEST_RESULTS_FILE) > temp-$RUN_ID.json \ - && mv temp-$RUN_ID.json $(TEST_RESULTS_FILE) - displayName: "Add Telescope Metadata to Test Results" + if [ -f "$(TEST_RESULTS_FILE)" ]; then + jq --arg telescope_run_id $RUN_ID \ + -c '. + {telescope_run_id: $telescope_run_id}' $(TEST_RESULTS_FILE) > temp-$RUN_ID.json \ + && mv temp-$RUN_ID.json $(TEST_RESULTS_FILE) + else + echo "File $(TEST_RESULTS_FILE) does not exist." + fi + displayName: "Add RUN_ID to Test Results" condition: always() - ${{ if eq(variables['Build.SourceBranchName'], 'main') }}: @@ -83,6 +91,7 @@ steps: source_file_name: $(TELESCOPE_METADATA_FILE) destination_file_name: $(RUN_ID).json subfolder: telescope-metadata/main + container_name: system credential_type: ${{ parameters.credential_type }} cloud: ${{ parameters.cloud }} upload_type: "Telescope Metadata" diff --git a/steps/engine/attach/collect.yml b/steps/engine/attach/collect.yml new file mode 100644 index 000000000..db781db34 --- /dev/null +++ b/steps/engine/attach/collect.yml @@ -0,0 +1,31 @@ +parameters: + - name: cloud + type: string + - name: region + type: string + - name: result_dir + type: string + - name: disk_number + type: number + - name: storage_class + type: string + +steps: + - template: /steps/cloud/${{ parameters.cloud }}/collect-cloud-info.yml + parameters: + region: ${{ parameters.region }} + - script: | + set -eo pipefail + + PYTHONPATH=$PYTHONPATH:$(pwd) python3 $PYTHON_SCRIPT_FILE collect \ + $CASE_NAME $NODE_NUMBER $DISK_NUMBER $STORAGE_CLASS \ + "$CLOUD_INFO" $RUN_ID $RUN_URL $RESULT_DIR + displayName: "Collect Result Attach Detach ${{ parameters.disk_number }}" + workingDirectory: modules/python + env: + CLOUD: ${{ parameters.cloud }} + REGION: ${{ parameters.region }} + RESULT_DIR: ${{ parameters.result_dir }} + DISK_NUMBER: ${{ parameters.disk_number }} + STORAGE_CLASS: ${{ parameters.storage_class }} + PYTHON_SCRIPT_FILE: $(Pipeline.Workspace)/s/modules/python/csi/csi.py diff --git a/steps/engine/attach/execute-aws.yml b/steps/engine/attach/execute-aws.yml new file mode 100644 index 000000000..20f1a7429 --- /dev/null +++ b/steps/engine/attach/execute-aws.yml @@ -0,0 +1,40 @@ +parameters: + cloud: "" + result_dir: "" + +steps: + - script: | + envsubst < "$STORAGE_CLASS_FILE" | kubectl apply -f - + displayName: "Create StorageClass" + env: + STORAGE_CLASS: ${{ parameters.storage_class }} + STORAGE_CLASS_FILE: $(Pipeline.Workspace)/s/scenarios/$(SCENARIO_TYPE)/$(SCENARIO_NAME)/kubernetes/storageclass.${{ parameters.cloud }}.yml + OWNER: $(OWNER) + RUN_ID: $(RUN_ID) + SCENARIO: $(SCENARIO_TYPE)-$(SCENARIO_NAME) + DELETION_DUE_TIME: $(DELETION_DUE_TIME) + + - script: | + set -eo pipefail + + PYTHONPATH=$PYTHONPATH:$(pwd) python3 $PYTHON_SCRIPT_FILE execute \ + $DISK_NUMBER $STORAGE_CLASS $WAIT_TIME $RESULT_DIR + displayName: "Execute Test attach detach ${{ parameters.disk_number }}" + workingDirectory: modules/python + env: + CLOUD: ${{ parameters.cloud }} + DISK_NUMBER: ${{ parameters.disk_number }} + STORAGE_CLASS: ${{ parameters.storage_class }} + WAIT_TIME: ${{ parameters.wait_time }} + RESULT_DIR: ${{ parameters.result_dir }} + PYTHON_SCRIPT_FILE: $(Pipeline.Workspace)/s/modules/python/csi/csi.py + + - script: | + set -eu + + kubectl delete pvc --all + kubectl delete pv --all + kubectl delete sc --all + displayName: "Clean up PVC, PV, StorageClass" + timeoutInMinutes: 20 + condition: always() diff --git a/steps/engine/attach/execute-azure.yml b/steps/engine/attach/execute-azure.yml new file mode 100644 index 000000000..247d80720 --- /dev/null +++ b/steps/engine/attach/execute-azure.yml @@ -0,0 +1,21 @@ +parameters: + cloud: "" + result_dir: "" + disk_number: 1 + storage_class: "" + +steps: + - script: | + set -eo pipefail + + PYTHONPATH=$PYTHONPATH:$(pwd) python3 $PYTHON_SCRIPT_FILE execute \ + $DISK_NUMBER $STORAGE_CLASS $WAIT_TIME $RESULT_DIR + displayName: "Execute Test attach detach ${{ parameters.disk_number }}" + workingDirectory: modules/python + env: + CLOUD: ${{ parameters.cloud }} + DISK_NUMBER: ${{ parameters.disk_number }} + STORAGE_CLASS: ${{ parameters.storage_class }} + WAIT_TIME: ${{ parameters.wait_time }} + RESULT_DIR: ${{ parameters.result_dir }} + PYTHON_SCRIPT_FILE: $(Pipeline.Workspace)/s/modules/python/csi/csi.py diff --git a/steps/engine/attach/validate.yml b/steps/engine/attach/validate.yml new file mode 100644 index 000000000..c2be48f92 --- /dev/null +++ b/steps/engine/attach/validate.yml @@ -0,0 +1,21 @@ +parameters: + - name: node_label + type: string + - name: desired_nodes + type: string + - name: operation_timeout_in_minutes + type: number + +steps: + - script: | + set -eo pipefail + + PYTHONPATH=$PYTHONPATH:$(pwd) python3 $PYTHON_SCRIPT_FILE validate "$NODE_LABEL" $DESIRED_NODES $OPERATION_TIMEOUT + workingDirectory: modules/python + timeoutInMinutes: ${{ parameters.operation_timeout_in_minutes }} + displayName: "Validate node count" + env: + NODE_LABEL: ${{ parameters.node_label }} + DESIRED_NODES: ${{ parameters.desired_nodes }} + OPERATION_TIMEOUT: ${{ parameters.operation_timeout_in_minutes }} + PYTHON_SCRIPT_FILE: $(Pipeline.Workspace)/s/modules/python/csi/csi.py diff --git a/steps/engine/clusterloader2/cilium/scale-cluster.yml b/steps/engine/clusterloader2/cilium/scale-cluster.yml index 6c65930c4..dbfeee9dc 100644 --- a/steps/engine/clusterloader2/cilium/scale-cluster.yml +++ b/steps/engine/clusterloader2/cilium/scale-cluster.yml @@ -43,8 +43,8 @@ steps: if [ "true" = "${{ parameters.enable_autoscale }}" ]; then az aks nodepool update --cluster-name $aks_name --name $np --resource-group $aks_rg --enable-cluster-autoscaler --min-count 0 --max-count 500 fi - az aks nodepool update --cluster-name $aks_name --name $np --resource-group $aks_rg --node-taints "slo=true:NoSchedule" --labels slo=true - sleep 300 + az aks nodepool update --cluster-name $aks_name --name $np --resource-group $aks_rg --labels slo=true test-np=net-policy-client + # sleep 300 done env: ROLE: ${{ parameters.role }} diff --git a/steps/engine/clusterloader2/slo/collect.yml b/steps/engine/clusterloader2/slo/collect.yml index 2cb5d22cf..4eeea15e4 100644 --- a/steps/engine/clusterloader2/slo/collect.yml +++ b/steps/engine/clusterloader2/slo/collect.yml @@ -7,6 +7,9 @@ parameters: default: {} - name: region type: string +- name: pods_per_node + type: number + default: 50 # Default value for PODS_PER_NODE steps: - template: /steps/cloud/${{ parameters.cloud }}/collect-cloud-info.yml @@ -16,8 +19,8 @@ steps: set -eo pipefail PYTHONPATH=$PYTHONPATH:$(pwd) python3 $PYTHON_SCRIPT_FILE collect \ - $CPU_PER_NODE $NODE_COUNT $MAX_PODS $REPEATS \ - $CL2_REPORT_DIR "$CLOUD_INFO" $RUN_ID $RUN_URL $SERVICE_TEST $TEST_RESULTS_FILE \ + $CPU_PER_NODE $NODE_COUNT $MAX_PODS $PODS_PER_NODE $REPEATS \ + $CL2_REPORT_DIR "$CLOUD_INFO" $RUN_ID $RUN_URL $SERVICE_TEST $NETWORK_TEST $TEST_RESULTS_FILE \ $TEST_TYPE workingDirectory: modules/python/clusterloader2 env: @@ -25,4 +28,5 @@ steps: RUN_URL: $(RUN_URL) PYTHON_SCRIPT_FILE: $(Pipeline.Workspace)/s/modules/python/clusterloader2/slo/slo.py CL2_REPORT_DIR: $(Pipeline.Workspace)/s/modules/python/clusterloader2/slo/results + PODS_PER_NODE: ${{ parameters.pods_per_node }} displayName: "Collect Results" diff --git a/steps/engine/clusterloader2/slo/execute.yml b/steps/engine/clusterloader2/slo/execute.yml index b317c8a00..5ec44592e 100644 --- a/steps/engine/clusterloader2/slo/execute.yml +++ b/steps/engine/clusterloader2/slo/execute.yml @@ -7,15 +7,18 @@ parameters: default: {} - name: region type: string + - name: pods_per_node + type: number + default: 50 # Default value for PODS_PER_NODE steps: - script: | set -eo pipefail PYTHONPATH=$PYTHONPATH:$(pwd) python3 $PYTHON_SCRIPT_FILE configure \ - $CPU_PER_NODE $NODE_COUNT $NODE_PER_STEP $MAX_PODS \ - $REPEATS $SCALE_TIMEOUT $CLOUD $CILIUM_ENABLED \ - $SERVICE_TEST ${CL2_CONFIG_DIR}/overrides.yaml + $CPU_PER_NODE $NODE_COUNT $NODE_PER_STEP $MAX_PODS $PODS_PER_NODE \ + $REPEATS $SCALE_TIMEOUT $NO_OF_NAMESPACES $TOTAL_NETWORK_POLICIES $CLOUD $CILIUM_ENABLED \ + $SERVICE_TEST $NETWORK_TEST ${CL2_CONFIG_DIR}/overrides.yaml PYTHONPATH=$PYTHONPATH:$(pwd) python3 $PYTHON_SCRIPT_FILE execute \ ${CL2_IMAGE} ${CL2_CONFIG_DIR} $CL2_REPORT_DIR $CL2_CONFIG_FILE \ ${HOME}/.kube/config $CLOUD @@ -30,4 +33,5 @@ steps: CL2_IMAGE: ${{ parameters.engine_input.image }} CL2_CONFIG_DIR: $(Pipeline.Workspace)/s/modules/python/clusterloader2/slo/config CL2_REPORT_DIR: $(Pipeline.Workspace)/s/modules/python/clusterloader2/slo/results + PODS_PER_NODE: ${{ parameters.pods_per_node }} displayName: "Run Benchmark" diff --git a/steps/provision-resources.yml b/steps/provision-resources.yml index c6f595ac4..81c891a57 100644 --- a/steps/provision-resources.yml +++ b/steps/provision-resources.yml @@ -54,7 +54,7 @@ steps: echo "Deletion Due Time: $deletion_due_time" echo "##vso[task.setvariable variable=DELETION_DUE_TIME]$deletion_due_time" - owner=$(grep "owner" "$terraform_input_file" | awk -F'=' '{gsub(/^[ \t]+|[ \t]+$/, "", $2); print $2}') + owner=$(grep "owner" "$terraform_input_file" | awk -F'=' '{gsub(/^[ \t]+|[ \t]+$/, "", $2); print $2}' | sed 's/^"//;s/"$//') echo "Owner: $owner" echo "##vso[task.setvariable variable=OWNER]$owner" displayName: "Get Deletion Due Time and Owner" diff --git a/steps/setup-tests.yml b/steps/setup-tests.yml index 7237dde48..4b998aa8f 100644 --- a/steps/setup-tests.yml +++ b/steps/setup-tests.yml @@ -9,6 +9,9 @@ parameters: - name: run_id type: string default: '' +- name: run_id_2 + type: string + default: '' - name: retry_attempt_count type: number default: 3 @@ -16,20 +19,30 @@ parameters: type: string - name: ssh_key_enabled type: boolean +- name: use_secondary_cluster + type: boolean + default: false steps: - script: | - if [ -n "$RUN_ID" ]; then + set -eu + + if [ "${{ parameters.use_secondary_cluster }}" == "True" ] && [ -n "${RUN_ID_2:-}" ]; then + echo "Using secondary cluster" + run_id=$RUN_ID_2 + elif [ -n "${RUN_ID:-}" ]; then + echo "Using primary cluster" run_id=$RUN_ID else run_id=$(Build.BuildId)-$(System.JobId) fi + echo "Run ID: $run_id" echo "##vso[task.setvariable variable=RUN_ID]$run_id" displayName: "Set Run ID" env: RUN_ID: ${{ parameters.run_id }} - + RUN_ID_2: ${{ parameters.run_id_2 }} - script: | run_url="$(System.TeamFoundationCollectionUri)$(System.TeamProject)/_build/results?buildId=$(Build.BuildId)&view=logs&j=$(System.JobId)" echo "Run URL: $run_url" diff --git a/steps/topology/csi-attach-detach/collect-attach.yml b/steps/topology/csi-attach-detach/collect-attach.yml new file mode 100644 index 000000000..19fb2716c --- /dev/null +++ b/steps/topology/csi-attach-detach/collect-attach.yml @@ -0,0 +1,20 @@ +parameters: + - name: cloud + type: string + default: "" + - name: regions + type: object + - name: engine_input + type: object + default: + disk_number: 1 + storage_class: "" + +steps: + - template: /steps/engine/attach/collect.yml + parameters: + cloud: ${{ parameters.cloud }} + region: ${{ parameters.regions[0] }} + result_dir: $(TEST_RESULTS_DIR) + disk_number: ${{ parameters.engine_input.disk_number }} + storage_class: ${{ parameters.engine_input.storage_class }} diff --git a/steps/topology/csi-attach-detach/execute-attach.yml b/steps/topology/csi-attach-detach/execute-attach.yml new file mode 100644 index 000000000..d406068f2 --- /dev/null +++ b/steps/topology/csi-attach-detach/execute-attach.yml @@ -0,0 +1,22 @@ +parameters: + - name: cloud + type: string + default: "" + - name: engine_input + type: object + default: + disk_number: 1 + storage_class: "" + wait_time: 1 + - name: regions + type: object + default: {} + +steps: + - template: /steps/engine/attach/execute-${{ parameters.cloud }}.yml + parameters: + cloud: ${{ parameters.cloud }} + result_dir: $(TEST_RESULTS_DIR) + disk_number: ${{ parameters.engine_input.disk_number }} + storage_class: ${{ parameters.engine_input.storage_class }} + wait_time: ${{ parameters.engine_input.wait_time }} diff --git a/steps/topology/csi-attach-detach/validate-resources.yml b/steps/topology/csi-attach-detach/validate-resources.yml new file mode 100644 index 000000000..a65eae1e3 --- /dev/null +++ b/steps/topology/csi-attach-detach/validate-resources.yml @@ -0,0 +1,18 @@ +parameters: + - name: cloud + type: string + - name: engine + type: string + - name: regions + type: object + +steps: + - template: /steps/cloud/${{ parameters.cloud }}/update-kubeconfig.yml + parameters: + role: client + region: ${{ parameters.regions[0] }} + - template: /steps/engine/${{ parameters.engine }}/validate.yml + parameters: + node_label: "csi=true" + desired_nodes: $(NODE_NUMBER) + operation_timeout_in_minutes: 10 diff --git a/steps/topology/network-churn/collect-clusterloader2.yml b/steps/topology/network-churn/collect-clusterloader2.yml new file mode 100644 index 000000000..36170431b --- /dev/null +++ b/steps/topology/network-churn/collect-clusterloader2.yml @@ -0,0 +1,23 @@ +parameters: +- name: cloud + type: string + default: '' +- name: engine_input + type: object + default: {} +- name: regions + type: object + default: {} + +steps: +- template: /steps/engine/clusterloader2/slo/collect.yml + parameters: + cloud: ${{ parameters.cloud }} + engine_input: ${{ parameters.engine_input }} + region: ${{ parameters.regions[0] }} + +- script: | + run_id=$(Build.BuildId)-$(System.JobId) + echo "Run ID: $run_id" + echo "##vso[task.setvariable variable=RUN_ID]$run_id" + displayName: "Set unique Run ID before publish" diff --git a/steps/topology/network-churn/execute-clusterloader2.yml b/steps/topology/network-churn/execute-clusterloader2.yml new file mode 100644 index 000000000..d084b2ef0 --- /dev/null +++ b/steps/topology/network-churn/execute-clusterloader2.yml @@ -0,0 +1,17 @@ +parameters: +- name: cloud + type: string + default: '' +- name: engine_input + type: object + default: {} +- name: regions + type: object + default: {} + +steps: +- template: /steps/engine/clusterloader2/slo/execute.yml + parameters: + cloud: ${{ parameters.cloud }} + engine_input: ${{ parameters.engine_input }} + region: ${{ parameters.regions[0] }} diff --git a/steps/topology/network-churn/validate-resources.yml b/steps/topology/network-churn/validate-resources.yml new file mode 100644 index 000000000..e0bf09252 --- /dev/null +++ b/steps/topology/network-churn/validate-resources.yml @@ -0,0 +1,20 @@ +parameters: + - name: cloud + type: string + - name: engine + type: string + - name: regions + type: object + +steps: + - template: /steps/cloud/${{ parameters.cloud }}/update-kubeconfig.yml + parameters: + role: net + region: ${{ parameters.regions[0] }} + - template: /steps/engine/clusterloader2/cilium/scale-cluster.yml + parameters: + role: net + region: ${{ parameters.regions[0] }} + nodes_per_nodepool: 10 #TODO: Update once node-pools are added. + enable_autoscale: "false" + \ No newline at end of file From a0599f332b0481f0f9463c2cdb4f27cc3b98f9bb Mon Sep 17 00:00:00 2001 From: ItiAgrawal Date: Mon, 30 Dec 2024 15:18:33 +0000 Subject: [PATCH 02/23] Refactor YAML files for network churn: clean up formatting and add OWNER variable --- .../CNI Benchmark/network-churn/cilium-network-churn.yml | 3 ++- steps/engine/clusterloader2/slo/collect.yml | 2 +- steps/engine/clusterloader2/slo/execute.yml | 2 +- steps/topology/network-churn/collect-clusterloader2.yml | 2 +- steps/topology/network-churn/validate-resources.yml | 1 - 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/pipelines/perf-eval/CNI Benchmark/network-churn/cilium-network-churn.yml b/pipelines/perf-eval/CNI Benchmark/network-churn/cilium-network-churn.yml index 70d695e3d..a46484b3a 100644 --- a/pipelines/perf-eval/CNI Benchmark/network-churn/cilium-network-churn.yml +++ b/pipelines/perf-eval/CNI Benchmark/network-churn/cilium-network-churn.yml @@ -12,7 +12,7 @@ parameters: default: 10 - name: repeats type: number - default: 1 + default: 1 - name: scale_timeout type: string default: "15m" @@ -27,6 +27,7 @@ variables: SCENARIO_TYPE: perf-eval SCENARIO_NAME: cilium-network-churn SCENARIO_VERSION: main + OWNER: aks stages: - stage: azure_eastus2 diff --git a/steps/engine/clusterloader2/slo/collect.yml b/steps/engine/clusterloader2/slo/collect.yml index 4eeea15e4..8af2aa7a5 100644 --- a/steps/engine/clusterloader2/slo/collect.yml +++ b/steps/engine/clusterloader2/slo/collect.yml @@ -8,7 +8,7 @@ parameters: - name: region type: string - name: pods_per_node - type: number + type: number default: 50 # Default value for PODS_PER_NODE steps: diff --git a/steps/engine/clusterloader2/slo/execute.yml b/steps/engine/clusterloader2/slo/execute.yml index 5ec44592e..e1a11284d 100644 --- a/steps/engine/clusterloader2/slo/execute.yml +++ b/steps/engine/clusterloader2/slo/execute.yml @@ -8,7 +8,7 @@ parameters: - name: region type: string - name: pods_per_node - type: number + type: number default: 50 # Default value for PODS_PER_NODE steps: diff --git a/steps/topology/network-churn/collect-clusterloader2.yml b/steps/topology/network-churn/collect-clusterloader2.yml index 36170431b..5c5105ada 100644 --- a/steps/topology/network-churn/collect-clusterloader2.yml +++ b/steps/topology/network-churn/collect-clusterloader2.yml @@ -15,7 +15,7 @@ steps: cloud: ${{ parameters.cloud }} engine_input: ${{ parameters.engine_input }} region: ${{ parameters.regions[0] }} - + - script: | run_id=$(Build.BuildId)-$(System.JobId) echo "Run ID: $run_id" diff --git a/steps/topology/network-churn/validate-resources.yml b/steps/topology/network-churn/validate-resources.yml index e0bf09252..b01b313c9 100644 --- a/steps/topology/network-churn/validate-resources.yml +++ b/steps/topology/network-churn/validate-resources.yml @@ -17,4 +17,3 @@ steps: region: ${{ parameters.regions[0] }} nodes_per_nodepool: 10 #TODO: Update once node-pools are added. enable_autoscale: "false" - \ No newline at end of file From b9aa5f6172831143343588deb6819b07ddb2e5fe Mon Sep 17 00:00:00 2001 From: ItiAgrawal Date: Wed, 8 Jan 2025 10:46:28 +0000 Subject: [PATCH 03/23] Update nodes_per_nodepool value in validate-resources.yml to 500 --- .../clusterloader2/slo/config/modules/reconcile-objects.yaml | 2 +- steps/topology/network-churn/validate-resources.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/modules/python/clusterloader2/slo/config/modules/reconcile-objects.yaml b/modules/python/clusterloader2/slo/config/modules/reconcile-objects.yaml index 994808424..196fa136d 100644 --- a/modules/python/clusterloader2/slo/config/modules/reconcile-objects.yaml +++ b/modules/python/clusterloader2/slo/config/modules/reconcile-objects.yaml @@ -49,7 +49,7 @@ steps: - basename: big-deployment objectTemplatePath: deployment_template.yaml templateFillMap: - Replicas: {{$bigDeploymentSize}} + Replicas: {{$bigDeploymentSize}}kube SvcName: big-service Group: {{.Group}} deploymentLabel: {{.deploymentLabel}} diff --git a/steps/topology/network-churn/validate-resources.yml b/steps/topology/network-churn/validate-resources.yml index b01b313c9..f7bd48190 100644 --- a/steps/topology/network-churn/validate-resources.yml +++ b/steps/topology/network-churn/validate-resources.yml @@ -15,5 +15,5 @@ steps: parameters: role: net region: ${{ parameters.regions[0] }} - nodes_per_nodepool: 10 #TODO: Update once node-pools are added. + nodes_per_nodepool: 500 enable_autoscale: "false" From e976e313490c370b4889269e912999f3d8fb9952 Mon Sep 17 00:00:00 2001 From: ItiAgrawal Date: Wed, 8 Jan 2025 10:51:28 +0000 Subject: [PATCH 04/23] Enable existing namespaces in load-config.yaml --- modules/python/clusterloader2/slo/config/load-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/python/clusterloader2/slo/config/load-config.yaml b/modules/python/clusterloader2/slo/config/load-config.yaml index ff496d7b6..b50e79d47 100644 --- a/modules/python/clusterloader2/slo/config/load-config.yaml +++ b/modules/python/clusterloader2/slo/config/load-config.yaml @@ -40,7 +40,7 @@ namespace: prefix: slo deleteStaleNamespaces: true deleteAutomanagedNamespaces: true - enableExistingNamespaces: false + enableExistingNamespaces: true tuningSets: - name: Sequence From 9c20982c484c1815c602c9c26722aca93b94a123 Mon Sep 17 00:00:00 2001 From: ItiAgrawal Date: Thu, 9 Jan 2025 10:01:32 +0000 Subject: [PATCH 05/23] Remove redundant parameters from validate-resources.yml --- .../collect-clusterloader2.yml | 5 +++-- steps/topology/network-churn/validate-resources.yml | 8 +------- 2 files changed, 4 insertions(+), 9 deletions(-) diff --git a/steps/topology/cilium-usercluster-autoscale/collect-clusterloader2.yml b/steps/topology/cilium-usercluster-autoscale/collect-clusterloader2.yml index c5e1d59d6..0594baea3 100644 --- a/steps/topology/cilium-usercluster-autoscale/collect-clusterloader2.yml +++ b/steps/topology/cilium-usercluster-autoscale/collect-clusterloader2.yml @@ -1,7 +1,7 @@ parameters: - name: cloud type: string - default: '' +P[ ; ] default: '' - name: engine_input type: object default: {} @@ -16,7 +16,8 @@ steps: engine_input: ${{ parameters.engine_input }} region: ${{ parameters.regions[0] }} -- template: /steps/engine/clusterloader2/cilium/scale-cluster.yml +- template: /steps/engine/clusterloader2/cilium/ qw +0-+9*5614 0;'01'.yml parameters: role: ces region: ${{ parameters.regions[0] }} diff --git a/steps/topology/network-churn/validate-resources.yml b/steps/topology/network-churn/validate-resources.yml index f7bd48190..e93d1f229 100644 --- a/steps/topology/network-churn/validate-resources.yml +++ b/steps/topology/network-churn/validate-resources.yml @@ -10,10 +10,4 @@ steps: - template: /steps/cloud/${{ parameters.cloud }}/update-kubeconfig.yml parameters: role: net - region: ${{ parameters.regions[0] }} - - template: /steps/engine/clusterloader2/cilium/scale-cluster.yml - parameters: - role: net - region: ${{ parameters.regions[0] }} - nodes_per_nodepool: 500 - enable_autoscale: "false" + region: ${{ parameters.regions[0] }} \ No newline at end of file From 9b5522017181f4e4c9641fe965451e992bed8cd2 Mon Sep 17 00:00:00 2001 From: ItiAgrawal Date: Mon, 13 Jan 2025 16:10:01 +0000 Subject: [PATCH 06/23] Disable existing namespaces in autoscale and slo configurations; adjust deployment template for network policy enforcement and refine cluster scaling parameters. --- .../python/clusterloader2/autoscale/config/config.yaml | 2 +- .../clusterloader2/slo/config/deployment_template.yaml | 4 ++-- .../python/clusterloader2/slo/config/load-config.yaml | 9 ++++++--- .../slo/config/modules/reconcile-objects.yaml | 2 +- modules/python/clusterloader2/slo/slo.py | 2 +- steps/engine/clusterloader2/cilium/scale-cluster.yml | 4 ++-- .../collect-clusterloader2.yml | 5 ++--- steps/topology/network-churn/validate-resources.yml | 2 +- 8 files changed, 16 insertions(+), 14 deletions(-) diff --git a/modules/python/clusterloader2/autoscale/config/config.yaml b/modules/python/clusterloader2/autoscale/config/config.yaml index dfba08cb3..7995a569f 100644 --- a/modules/python/clusterloader2/autoscale/config/config.yaml +++ b/modules/python/clusterloader2/autoscale/config/config.yaml @@ -18,7 +18,7 @@ namespace: prefix: autoscale deleteStaleNamespaces: true deleteAutomanagedNamespaces: true - enableExistingNamespaces: true + enableExistingNamespaces: false tuningSets: - name: Uniform1qps diff --git a/modules/python/clusterloader2/slo/config/deployment_template.yaml b/modules/python/clusterloader2/slo/config/deployment_template.yaml index 8c4c93615..1f43007f0 100644 --- a/modules/python/clusterloader2/slo/config/deployment_template.yaml +++ b/modules/python/clusterloader2/slo/config/deployment_template.yaml @@ -6,8 +6,8 @@ {{$EnableNetworkPolicyEnforcementLatencyTest := DefaultParam .EnableNetworkPolicyEnforcementLatencyTest false}} {{$TargetLabelValue := DefaultParam .TargetLabelValue "enforcement-latency"}} # Run a server pod for network policy enforcement latency test only on every Nth pod. -# Default every third pod. -{{$NetPolServerOnEveryNthPod := 3}} +# Default run on every pod. +{{$NetPolServerOnEveryNthPod := 1}} {{$RunNetPolicyTest := and $EnableNetworkPolicyEnforcementLatencyTest (eq (Mod .Index $NetPolServerOnEveryNthPod) 0)}} apiVersion: apps/v1 diff --git a/modules/python/clusterloader2/slo/config/load-config.yaml b/modules/python/clusterloader2/slo/config/load-config.yaml index b50e79d47..4aea0a00b 100644 --- a/modules/python/clusterloader2/slo/config/load-config.yaml +++ b/modules/python/clusterloader2/slo/config/load-config.yaml @@ -30,9 +30,12 @@ name: load-config # Service test {{$BIG_GROUP_SIZE := DefaultParam .BIG_GROUP_SIZE 4000}} -{{$SMALL_GROUP_SIZE := DefaultParam .CL2_DEPLOYMENT_SIZE 20}} +{{$SMALL_GROUP_SIZE := DefaultParam .SMALL_GROUP_SIZE 20}} {{$bigDeploymentsPerNamespace := DefaultParam .bigDeploymentsPerNamespace 1}} -{{$smallDeploymentPods := DivideInt $totalPods $namespaces}} + +# TODO: maintaining old logic when namespace value is 1, use smallDeploymentPods as calculatedPods count +{{$calculatedPods := SubtractInt $podsPerNamespace (MultiplyInt $bigDeploymentsPerNamespace $BIG_GROUP_SIZE)}} +{{$smallDeploymentPods := if eq $namespaces 1 $calculatedPods $podsPerNamespace}} {{$smallDeploymentsPerNamespace := DivideInt $smallDeploymentPods $SMALL_GROUP_SIZE}} namespace: @@ -40,7 +43,7 @@ namespace: prefix: slo deleteStaleNamespaces: true deleteAutomanagedNamespaces: true - enableExistingNamespaces: true + enableExistingNamespaces: false tuningSets: - name: Sequence diff --git a/modules/python/clusterloader2/slo/config/modules/reconcile-objects.yaml b/modules/python/clusterloader2/slo/config/modules/reconcile-objects.yaml index 196fa136d..994808424 100644 --- a/modules/python/clusterloader2/slo/config/modules/reconcile-objects.yaml +++ b/modules/python/clusterloader2/slo/config/modules/reconcile-objects.yaml @@ -49,7 +49,7 @@ steps: - basename: big-deployment objectTemplatePath: deployment_template.yaml templateFillMap: - Replicas: {{$bigDeploymentSize}}kube + Replicas: {{$bigDeploymentSize}} SvcName: big-service Group: {{.Group}} deploymentLabel: {{.deploymentLabel}} diff --git a/modules/python/clusterloader2/slo/slo.py b/modules/python/clusterloader2/slo/slo.py index 6ef3d53cf..a0d5eedaa 100644 --- a/modules/python/clusterloader2/slo/slo.py +++ b/modules/python/clusterloader2/slo/slo.py @@ -167,7 +167,7 @@ def collect_clusterloader2( "group": None, "measurement": None, "result": None, - # # "test_details": details, + # "test_details": details, "cloud_info": cloud_info, "run_id": run_id, "run_url": run_url, diff --git a/steps/engine/clusterloader2/cilium/scale-cluster.yml b/steps/engine/clusterloader2/cilium/scale-cluster.yml index dbfeee9dc..6c65930c4 100644 --- a/steps/engine/clusterloader2/cilium/scale-cluster.yml +++ b/steps/engine/clusterloader2/cilium/scale-cluster.yml @@ -43,8 +43,8 @@ steps: if [ "true" = "${{ parameters.enable_autoscale }}" ]; then az aks nodepool update --cluster-name $aks_name --name $np --resource-group $aks_rg --enable-cluster-autoscaler --min-count 0 --max-count 500 fi - az aks nodepool update --cluster-name $aks_name --name $np --resource-group $aks_rg --labels slo=true test-np=net-policy-client - # sleep 300 + az aks nodepool update --cluster-name $aks_name --name $np --resource-group $aks_rg --node-taints "slo=true:NoSchedule" --labels slo=true + sleep 300 done env: ROLE: ${{ parameters.role }} diff --git a/steps/topology/cilium-usercluster-autoscale/collect-clusterloader2.yml b/steps/topology/cilium-usercluster-autoscale/collect-clusterloader2.yml index 0594baea3..dc5142f89 100644 --- a/steps/topology/cilium-usercluster-autoscale/collect-clusterloader2.yml +++ b/steps/topology/cilium-usercluster-autoscale/collect-clusterloader2.yml @@ -1,7 +1,7 @@ parameters: - name: cloud type: string -P[ ; ] default: '' + default: '' - name: engine_input type: object default: {} @@ -16,8 +16,7 @@ steps: engine_input: ${{ parameters.engine_input }} region: ${{ parameters.regions[0] }} -- template: /steps/engine/clusterloader2/cilium/ qw -0-+9*5614 0;'01'.yml +template: /steps/engine/clusterloader2/cilium/ parameters: role: ces region: ${{ parameters.regions[0] }} diff --git a/steps/topology/network-churn/validate-resources.yml b/steps/topology/network-churn/validate-resources.yml index e93d1f229..fe97a40cb 100644 --- a/steps/topology/network-churn/validate-resources.yml +++ b/steps/topology/network-churn/validate-resources.yml @@ -10,4 +10,4 @@ steps: - template: /steps/cloud/${{ parameters.cloud }}/update-kubeconfig.yml parameters: role: net - region: ${{ parameters.regions[0] }} \ No newline at end of file + region: ${{ parameters.regions[0] }} From bdb72671aa214149ee35d90f9434f8b5f7465a60 Mon Sep 17 00:00:00 2001 From: ItiAgrawal Date: Mon, 13 Jan 2025 16:14:19 +0000 Subject: [PATCH 07/23] Enable existing namespaces in autoscale configuration and update template path for cluster scaling. --- modules/python/clusterloader2/autoscale/config/config.yaml | 2 +- .../cilium-usercluster-autoscale/collect-clusterloader2.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/modules/python/clusterloader2/autoscale/config/config.yaml b/modules/python/clusterloader2/autoscale/config/config.yaml index 7995a569f..dfba08cb3 100644 --- a/modules/python/clusterloader2/autoscale/config/config.yaml +++ b/modules/python/clusterloader2/autoscale/config/config.yaml @@ -18,7 +18,7 @@ namespace: prefix: autoscale deleteStaleNamespaces: true deleteAutomanagedNamespaces: true - enableExistingNamespaces: false + enableExistingNamespaces: true tuningSets: - name: Uniform1qps diff --git a/steps/topology/cilium-usercluster-autoscale/collect-clusterloader2.yml b/steps/topology/cilium-usercluster-autoscale/collect-clusterloader2.yml index dc5142f89..af5748492 100644 --- a/steps/topology/cilium-usercluster-autoscale/collect-clusterloader2.yml +++ b/steps/topology/cilium-usercluster-autoscale/collect-clusterloader2.yml @@ -16,7 +16,7 @@ steps: engine_input: ${{ parameters.engine_input }} region: ${{ parameters.regions[0] }} -template: /steps/engine/clusterloader2/cilium/ +template: /steps/engine/clusterloader2/cilium/scale-cluster.yml parameters: role: ces region: ${{ parameters.regions[0] }} From 6ced26d9c8b6229f04b7154ffbae0cbfc6f2457c Mon Sep 17 00:00:00 2001 From: ItiAgrawal Date: Mon, 13 Jan 2025 16:59:34 +0000 Subject: [PATCH 08/23] Fix indentation in collect-clusterloader2.yml for template path consistency --- .../cilium-usercluster-autoscale/collect-clusterloader2.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/steps/topology/cilium-usercluster-autoscale/collect-clusterloader2.yml b/steps/topology/cilium-usercluster-autoscale/collect-clusterloader2.yml index af5748492..c5e1d59d6 100644 --- a/steps/topology/cilium-usercluster-autoscale/collect-clusterloader2.yml +++ b/steps/topology/cilium-usercluster-autoscale/collect-clusterloader2.yml @@ -16,7 +16,7 @@ steps: engine_input: ${{ parameters.engine_input }} region: ${{ parameters.regions[0] }} -template: /steps/engine/clusterloader2/cilium/scale-cluster.yml +- template: /steps/engine/clusterloader2/cilium/scale-cluster.yml parameters: role: ces region: ${{ parameters.regions[0] }} From 2e3e8ee97740d6ef50085de503f7392f9c5c829f Mon Sep 17 00:00:00 2001 From: ItiAgrawal Date: Tue, 14 Jan 2025 14:32:47 +0000 Subject: [PATCH 09/23] Update load-config.yaml and slo.py for deployment size adjustments and fix string formatting --- modules/python/clusterloader2/slo/config/load-config.yaml | 4 ++-- modules/python/clusterloader2/slo/slo.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/modules/python/clusterloader2/slo/config/load-config.yaml b/modules/python/clusterloader2/slo/config/load-config.yaml index 4aea0a00b..ffc2cfcce 100644 --- a/modules/python/clusterloader2/slo/config/load-config.yaml +++ b/modules/python/clusterloader2/slo/config/load-config.yaml @@ -30,12 +30,12 @@ name: load-config # Service test {{$BIG_GROUP_SIZE := DefaultParam .BIG_GROUP_SIZE 4000}} -{{$SMALL_GROUP_SIZE := DefaultParam .SMALL_GROUP_SIZE 20}} +{{$SMALL_GROUP_SIZE := DefaultParam .CL2_DEPLOYMENT_SIZE 20}} {{$bigDeploymentsPerNamespace := DefaultParam .bigDeploymentsPerNamespace 1}} # TODO: maintaining old logic when namespace value is 1, use smallDeploymentPods as calculatedPods count {{$calculatedPods := SubtractInt $podsPerNamespace (MultiplyInt $bigDeploymentsPerNamespace $BIG_GROUP_SIZE)}} -{{$smallDeploymentPods := if eq $namespaces 1 $calculatedPods $podsPerNamespace}} +{{$smallDeploymentPods := DivideInt $totalPods $namespaces}} {{$smallDeploymentsPerNamespace := DivideInt $smallDeploymentPods $SMALL_GROUP_SIZE}} namespace: diff --git a/modules/python/clusterloader2/slo/slo.py b/modules/python/clusterloader2/slo/slo.py index a0d5eedaa..d0d60f6f5 100644 --- a/modules/python/clusterloader2/slo/slo.py +++ b/modules/python/clusterloader2/slo/slo.py @@ -101,7 +101,7 @@ def configure_clusterloader2( file.write("CL2_NET_POLICY_ENFORCEMENT_LATENCY_NODE_LABEL_KEY: test\n") file.write("CL2_NET_POLICY_ENFORCEMENT_LATENCY_NODE_LABEL_VALUE: net-policy-client\n") file.write("CL2_NET_POLICY_ENFORCEMENT_LATENCY_MAX_TARGET_PODS_PER_NS: 10\n") - file.write(f"CL2_NET_POLICY_ENFORCEMENT_LOAD_COUNT: {total_network_policies}\n") + file.write("CL2_NET_POLICY_ENFORCEMENT_LOAD_COUNT: {total_network_policies}\n") file.write("CL2_NET_POLICY_ENFORCEMENT_LOAD_QPS: 10\n") file.write("CL2_POLICY_ENFORCEMENT_LOAD_TARGET_NAME: small-deployment\n") From 568b69ce7eaf03ed7f83f8f52ed3ce589a40eaa5 Mon Sep 17 00:00:00 2001 From: ItiAgrawal Date: Wed, 15 Jan 2025 10:56:53 +0000 Subject: [PATCH 10/23] Add new parameters for namespaces, network policies, and service testing in execute.yml --- steps/engine/clusterloader2/slo/execute.yml | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/steps/engine/clusterloader2/slo/execute.yml b/steps/engine/clusterloader2/slo/execute.yml index e1a11284d..e44c24b50 100644 --- a/steps/engine/clusterloader2/slo/execute.yml +++ b/steps/engine/clusterloader2/slo/execute.yml @@ -10,6 +10,15 @@ parameters: - name: pods_per_node type: number default: 50 # Default value for PODS_PER_NODE + - name: no_of_namespaces + type: number + default: 1 # Default value for NO_OF_NAMESPACES + - name: total_network_policies + type: number + default: 0 # Default value for TOTAL_NETWORK_POLICIES + - name: service_test + type: boolean + default: false # Default value for NETWORK_TEST steps: - script: | From 6ddeace5310203877239eef28f75cacc72b3929e Mon Sep 17 00:00:00 2001 From: ItiAgrawal Date: Wed, 15 Jan 2025 11:27:50 +0000 Subject: [PATCH 11/23] To test Commit --- modules/python/clusterloader2/slo/slo.py | 2 +- steps/engine/clusterloader2/slo/execute.yml | 6 +++++- .../cilium-usercluster-autoscale/validate-resources.yml | 7 +------ 3 files changed, 7 insertions(+), 8 deletions(-) diff --git a/modules/python/clusterloader2/slo/slo.py b/modules/python/clusterloader2/slo/slo.py index d0d60f6f5..38c28d498 100644 --- a/modules/python/clusterloader2/slo/slo.py +++ b/modules/python/clusterloader2/slo/slo.py @@ -221,7 +221,7 @@ def main(): parser_configure.add_argument("repeats", type=int, help="Number of times to repeat the deployment churn") parser_configure.add_argument("operation_timeout", type=str, help="Timeout before failing the scale up test") parser_configure.add_argument("no_of_namespaces", type=int, default=1, help="Number of namespaces to create") - parser_configure.add_argument("total_network_policies", type=int, help="Total number of network policies to create", default=1000) + parser_configure.add_argument("total_network_policies", type=int, default=0, help="Total number of network policies to create") parser_configure.add_argument("provider", type=str, help="Cloud provider name") parser_configure.add_argument("cilium_enabled", type=eval, choices=[True, False], default=False, help="Whether cilium is enabled. Must be either True or False") diff --git a/steps/engine/clusterloader2/slo/execute.yml b/steps/engine/clusterloader2/slo/execute.yml index e44c24b50..10f2ee89e 100644 --- a/steps/engine/clusterloader2/slo/execute.yml +++ b/steps/engine/clusterloader2/slo/execute.yml @@ -16,7 +16,7 @@ parameters: - name: total_network_policies type: number default: 0 # Default value for TOTAL_NETWORK_POLICIES - - name: service_test + - name: network_test type: boolean default: false # Default value for NETWORK_TEST @@ -43,4 +43,8 @@ steps: CL2_CONFIG_DIR: $(Pipeline.Workspace)/s/modules/python/clusterloader2/slo/config CL2_REPORT_DIR: $(Pipeline.Workspace)/s/modules/python/clusterloader2/slo/results PODS_PER_NODE: ${{ parameters.pods_per_node }} + NO_OF_NAMESPACES: ${{ parameters.no_of_namespaces }} + TOTAL_NETWORK_POLICIES: ${{ parameters.total_network_policies }} + NETWORK_TEST: ${{ parameters.network_test }} + CL2_CONFIG_FILE: load-config.yaml displayName: "Run Benchmark" diff --git a/steps/topology/cilium-usercluster-autoscale/validate-resources.yml b/steps/topology/cilium-usercluster-autoscale/validate-resources.yml index 3d1ae1b9b..bd389deac 100644 --- a/steps/topology/cilium-usercluster-autoscale/validate-resources.yml +++ b/steps/topology/cilium-usercluster-autoscale/validate-resources.yml @@ -11,9 +11,4 @@ steps: parameters: role: ces region: ${{ parameters.regions[0] }} - - template: /steps/engine/clusterloader2/cilium/scale-cluster.yml - parameters: - role: ces - region: ${{ parameters.regions[0] }} - nodes_per_nodepool: 0 - enable_autoscale: "true" + From a3c8a830a416ea322f2ad71e9c9800c1b10d1018 Mon Sep 17 00:00:00 2001 From: ItiAgrawal Date: Wed, 15 Jan 2025 12:24:28 +0000 Subject: [PATCH 12/23] Revert and update deployment size --- .../python/clusterloader2/slo/config/load-config.yaml | 2 +- steps/engine/clusterloader2/slo/collect.yml | 6 +++++- steps/engine/clusterloader2/slo/execute.yml | 10 +++++----- .../validate-resources.yml | 7 ++++++- 4 files changed, 17 insertions(+), 8 deletions(-) diff --git a/modules/python/clusterloader2/slo/config/load-config.yaml b/modules/python/clusterloader2/slo/config/load-config.yaml index ffc2cfcce..adb1f9ea5 100644 --- a/modules/python/clusterloader2/slo/config/load-config.yaml +++ b/modules/python/clusterloader2/slo/config/load-config.yaml @@ -35,7 +35,7 @@ name: load-config # TODO: maintaining old logic when namespace value is 1, use smallDeploymentPods as calculatedPods count {{$calculatedPods := SubtractInt $podsPerNamespace (MultiplyInt $bigDeploymentsPerNamespace $BIG_GROUP_SIZE)}} -{{$smallDeploymentPods := DivideInt $totalPods $namespaces}} +{{$smallDeploymentPods := (if eq $namespaces 1 $calculatedPods (DivideInt $totalPods $namespaces))}} {{$smallDeploymentsPerNamespace := DivideInt $smallDeploymentPods $SMALL_GROUP_SIZE}} namespace: diff --git a/steps/engine/clusterloader2/slo/collect.yml b/steps/engine/clusterloader2/slo/collect.yml index 8af2aa7a5..f18f24e9b 100644 --- a/steps/engine/clusterloader2/slo/collect.yml +++ b/steps/engine/clusterloader2/slo/collect.yml @@ -9,7 +9,10 @@ parameters: type: string - name: pods_per_node type: number - default: 50 # Default value for PODS_PER_NODE + default: 50 +- name: network_test + type: boolean + default: false steps: - template: /steps/cloud/${{ parameters.cloud }}/collect-cloud-info.yml @@ -29,4 +32,5 @@ steps: PYTHON_SCRIPT_FILE: $(Pipeline.Workspace)/s/modules/python/clusterloader2/slo/slo.py CL2_REPORT_DIR: $(Pipeline.Workspace)/s/modules/python/clusterloader2/slo/results PODS_PER_NODE: ${{ parameters.pods_per_node }} + NETWORK_TEST: ${{ parameters.network_test }} displayName: "Collect Results" diff --git a/steps/engine/clusterloader2/slo/execute.yml b/steps/engine/clusterloader2/slo/execute.yml index 10f2ee89e..d2db812cf 100644 --- a/steps/engine/clusterloader2/slo/execute.yml +++ b/steps/engine/clusterloader2/slo/execute.yml @@ -9,16 +9,16 @@ parameters: type: string - name: pods_per_node type: number - default: 50 # Default value for PODS_PER_NODE + default: 50 - name: no_of_namespaces type: number - default: 1 # Default value for NO_OF_NAMESPACES + default: 1 - name: total_network_policies type: number - default: 0 # Default value for TOTAL_NETWORK_POLICIES + default: 0 - name: network_test type: boolean - default: false # Default value for NETWORK_TEST + default: false steps: - script: | @@ -44,7 +44,7 @@ steps: CL2_REPORT_DIR: $(Pipeline.Workspace)/s/modules/python/clusterloader2/slo/results PODS_PER_NODE: ${{ parameters.pods_per_node }} NO_OF_NAMESPACES: ${{ parameters.no_of_namespaces }} - TOTAL_NETWORK_POLICIES: ${{ parameters.total_network_policies }} + TOTAL_NETWORK_POLICIES: ${{ parameters.total_network_policies }} NETWORK_TEST: ${{ parameters.network_test }} CL2_CONFIG_FILE: load-config.yaml displayName: "Run Benchmark" diff --git a/steps/topology/cilium-usercluster-autoscale/validate-resources.yml b/steps/topology/cilium-usercluster-autoscale/validate-resources.yml index bd389deac..e542cb35c 100644 --- a/steps/topology/cilium-usercluster-autoscale/validate-resources.yml +++ b/steps/topology/cilium-usercluster-autoscale/validate-resources.yml @@ -11,4 +11,9 @@ steps: parameters: role: ces region: ${{ parameters.regions[0] }} - + - template: /steps/engine/clusterloader2/cilium/scale-cluster.yml + parameters: + role: ces + region: ${{ parameters.regions[0] }} + nodes_per_nodepool: 0 + enable_autoscale: "true" \ No newline at end of file From 56df80f97644a0ed0cff2cb1a3c9c0fbf1e02322 Mon Sep 17 00:00:00 2001 From: ItiAgrawal Date: Wed, 15 Jan 2025 12:29:08 +0000 Subject: [PATCH 13/23] test load condition --- modules/python/clusterloader2/slo/config/load-config.yaml | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/modules/python/clusterloader2/slo/config/load-config.yaml b/modules/python/clusterloader2/slo/config/load-config.yaml index adb1f9ea5..a9389b01b 100644 --- a/modules/python/clusterloader2/slo/config/load-config.yaml +++ b/modules/python/clusterloader2/slo/config/load-config.yaml @@ -35,7 +35,12 @@ name: load-config # TODO: maintaining old logic when namespace value is 1, use smallDeploymentPods as calculatedPods count {{$calculatedPods := SubtractInt $podsPerNamespace (MultiplyInt $bigDeploymentsPerNamespace $BIG_GROUP_SIZE)}} -{{$smallDeploymentPods := (if eq $namespaces 1 $calculatedPods (DivideInt $totalPods $namespaces))}} +{{$smallDeploymentPods := 0}} +{{if eq $namespaces 1}} + {{$smallDeploymentPods = $calculatedPods}} +{{else}} + {{$smallDeploymentPods = DivideInt $totalPods $namespaces}} +{{end}} {{$smallDeploymentsPerNamespace := DivideInt $smallDeploymentPods $SMALL_GROUP_SIZE}} namespace: From b7e79fc1687ffa3d445c09664b0a0e0613379fd6 Mon Sep 17 00:00:00 2001 From: ItiAgrawal Date: Wed, 15 Jan 2025 12:36:43 +0000 Subject: [PATCH 14/23] Test valid datatype --- modules/python/clusterloader2/slo/config/load-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/python/clusterloader2/slo/config/load-config.yaml b/modules/python/clusterloader2/slo/config/load-config.yaml index a9389b01b..3f9760897 100644 --- a/modules/python/clusterloader2/slo/config/load-config.yaml +++ b/modules/python/clusterloader2/slo/config/load-config.yaml @@ -36,7 +36,7 @@ name: load-config # TODO: maintaining old logic when namespace value is 1, use smallDeploymentPods as calculatedPods count {{$calculatedPods := SubtractInt $podsPerNamespace (MultiplyInt $bigDeploymentsPerNamespace $BIG_GROUP_SIZE)}} {{$smallDeploymentPods := 0}} -{{if eq $namespaces 1}} +{{if eq (ToInt $namespaces) 1}} {{$smallDeploymentPods = $calculatedPods}} {{else}} {{$smallDeploymentPods = DivideInt $totalPods $namespaces}} From 95e70066e1f3197a23fdd27b2523aa724169413a Mon Sep 17 00:00:00 2001 From: ItiAgrawal Date: Wed, 15 Jan 2025 12:39:29 +0000 Subject: [PATCH 15/23] test string type --- modules/python/clusterloader2/slo/config/load-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/python/clusterloader2/slo/config/load-config.yaml b/modules/python/clusterloader2/slo/config/load-config.yaml index 3f9760897..49345c3f0 100644 --- a/modules/python/clusterloader2/slo/config/load-config.yaml +++ b/modules/python/clusterloader2/slo/config/load-config.yaml @@ -36,7 +36,7 @@ name: load-config # TODO: maintaining old logic when namespace value is 1, use smallDeploymentPods as calculatedPods count {{$calculatedPods := SubtractInt $podsPerNamespace (MultiplyInt $bigDeploymentsPerNamespace $BIG_GROUP_SIZE)}} {{$smallDeploymentPods := 0}} -{{if eq (ToInt $namespaces) 1}} +{{if eq $namespaces "1"}} {{$smallDeploymentPods = $calculatedPods}} {{else}} {{$smallDeploymentPods = DivideInt $totalPods $namespaces}} From ed4c562f963a2f89e27b2439dd1d3d32414136fb Mon Sep 17 00:00:00 2001 From: ItiAgrawal Date: Wed, 15 Jan 2025 13:39:31 +0000 Subject: [PATCH 16/23] test type --- modules/python/clusterloader2/slo/config/load-config.yaml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/modules/python/clusterloader2/slo/config/load-config.yaml b/modules/python/clusterloader2/slo/config/load-config.yaml index 49345c3f0..58c51df68 100644 --- a/modules/python/clusterloader2/slo/config/load-config.yaml +++ b/modules/python/clusterloader2/slo/config/load-config.yaml @@ -36,11 +36,12 @@ name: load-config # TODO: maintaining old logic when namespace value is 1, use smallDeploymentPods as calculatedPods count {{$calculatedPods := SubtractInt $podsPerNamespace (MultiplyInt $bigDeploymentsPerNamespace $BIG_GROUP_SIZE)}} {{$smallDeploymentPods := 0}} -{{if eq $namespaces "1"}} +{{if eq (int $namespaces) 1}} {{$smallDeploymentPods = $calculatedPods}} {{else}} {{$smallDeploymentPods = DivideInt $totalPods $namespaces}} {{end}} +{{printf "Type: %T, Value: %v" $namespaces $namespaces}} {{$smallDeploymentsPerNamespace := DivideInt $smallDeploymentPods $SMALL_GROUP_SIZE}} namespace: From 293b0cb873d28712d21a06b1255c91d3b29a3015 Mon Sep 17 00:00:00 2001 From: ItiAgrawal Date: Wed, 15 Jan 2025 13:40:05 +0000 Subject: [PATCH 17/23] add commnet --- modules/python/clusterloader2/slo/config/load-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/python/clusterloader2/slo/config/load-config.yaml b/modules/python/clusterloader2/slo/config/load-config.yaml index 58c51df68..d04b27651 100644 --- a/modules/python/clusterloader2/slo/config/load-config.yaml +++ b/modules/python/clusterloader2/slo/config/load-config.yaml @@ -36,12 +36,12 @@ name: load-config # TODO: maintaining old logic when namespace value is 1, use smallDeploymentPods as calculatedPods count {{$calculatedPods := SubtractInt $podsPerNamespace (MultiplyInt $bigDeploymentsPerNamespace $BIG_GROUP_SIZE)}} {{$smallDeploymentPods := 0}} +{{printf "Type: %T, Value: %v" $namespaces $namespaces}} {{if eq (int $namespaces) 1}} {{$smallDeploymentPods = $calculatedPods}} {{else}} {{$smallDeploymentPods = DivideInt $totalPods $namespaces}} {{end}} -{{printf "Type: %T, Value: %v" $namespaces $namespaces}} {{$smallDeploymentsPerNamespace := DivideInt $smallDeploymentPods $SMALL_GROUP_SIZE}} namespace: From 6f6950f826eeede1eb6a67f92639a3dcfde2d395 Mon Sep 17 00:00:00 2001 From: ItiAgrawal Date: Wed, 15 Jan 2025 13:47:10 +0000 Subject: [PATCH 18/23] test if --- modules/python/clusterloader2/slo/config/load-config.yaml | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/modules/python/clusterloader2/slo/config/load-config.yaml b/modules/python/clusterloader2/slo/config/load-config.yaml index d04b27651..1d4637968 100644 --- a/modules/python/clusterloader2/slo/config/load-config.yaml +++ b/modules/python/clusterloader2/slo/config/load-config.yaml @@ -35,13 +35,7 @@ name: load-config # TODO: maintaining old logic when namespace value is 1, use smallDeploymentPods as calculatedPods count {{$calculatedPods := SubtractInt $podsPerNamespace (MultiplyInt $bigDeploymentsPerNamespace $BIG_GROUP_SIZE)}} -{{$smallDeploymentPods := 0}} -{{printf "Type: %T, Value: %v" $namespaces $namespaces}} -{{if eq (int $namespaces) 1}} - {{$smallDeploymentPods = $calculatedPods}} -{{else}} - {{$smallDeploymentPods = DivideInt $totalPods $namespaces}} -{{end}} +{{$smallDeploymentPods := (if $NETWORK_TEST (DivideInt $totalPods $namespaces) $calculatedPods)}} {{$smallDeploymentsPerNamespace := DivideInt $smallDeploymentPods $SMALL_GROUP_SIZE}} namespace: From 725c6c2e914eb0af9602156ce9c246f71e1232d5 Mon Sep 17 00:00:00 2001 From: ItiAgrawal Date: Wed, 15 Jan 2025 13:52:21 +0000 Subject: [PATCH 19/23] conditional logic --- .../python/clusterloader2/slo/config/load-config.yaml | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/modules/python/clusterloader2/slo/config/load-config.yaml b/modules/python/clusterloader2/slo/config/load-config.yaml index 1d4637968..5d9d862fc 100644 --- a/modules/python/clusterloader2/slo/config/load-config.yaml +++ b/modules/python/clusterloader2/slo/config/load-config.yaml @@ -35,7 +35,15 @@ name: load-config # TODO: maintaining old logic when namespace value is 1, use smallDeploymentPods as calculatedPods count {{$calculatedPods := SubtractInt $podsPerNamespace (MultiplyInt $bigDeploymentsPerNamespace $BIG_GROUP_SIZE)}} -{{$smallDeploymentPods := (if $NETWORK_TEST (DivideInt $totalPods $namespaces) $calculatedPods)}} + +# Use explicit conditional block to assign smallDeploymentPods +{{$smallDeploymentPods := 0}} +{{if $NETWORK_TEST}} + {{$smallDeploymentPods = DivideInt $totalPods $namespaces}} +{{else}} + {{$smallDeploymentPods = $calculatedPods}} +{{end}} + {{$smallDeploymentsPerNamespace := DivideInt $smallDeploymentPods $SMALL_GROUP_SIZE}} namespace: From b27b8062029723f2cfaac10352e22a38b92193e1 Mon Sep 17 00:00:00 2001 From: ItiAgrawal Date: Wed, 15 Jan 2025 14:44:57 +0000 Subject: [PATCH 20/23] refactor: update group size parameters and maintain backward compatibility in configuration --- modules/python/clusterloader2/slo/config/load-config.yaml | 7 +++---- modules/python/clusterloader2/slo/slo.py | 3 --- .../perf-eval/CNI Benchmark/slo-servicediscovery-ces.yml | 1 + .../slo-servicediscovery-cilium-nodesubnet.yml | 1 + .../CNI Benchmark/slo-servicediscovery-feature.yml | 1 + pipelines/perf-eval/CNI Benchmark/slo-servicediscovery.yml | 3 +++ steps/engine/clusterloader2/slo/execute.yml | 2 +- 7 files changed, 10 insertions(+), 8 deletions(-) diff --git a/modules/python/clusterloader2/slo/config/load-config.yaml b/modules/python/clusterloader2/slo/config/load-config.yaml index 5d9d862fc..59b74681a 100644 --- a/modules/python/clusterloader2/slo/config/load-config.yaml +++ b/modules/python/clusterloader2/slo/config/load-config.yaml @@ -30,16 +30,15 @@ name: load-config # Service test {{$BIG_GROUP_SIZE := DefaultParam .BIG_GROUP_SIZE 4000}} -{{$SMALL_GROUP_SIZE := DefaultParam .CL2_DEPLOYMENT_SIZE 20}} +{{$SMALL_GROUP_SIZE := DefaultParam .SMALL_GROUP_SIZE 20}} {{$bigDeploymentsPerNamespace := DefaultParam .bigDeploymentsPerNamespace 1}} -# TODO: maintaining old logic when namespace value is 1, use smallDeploymentPods as calculatedPods count +# Use explicit conditional block to assign smallDeploymentPods to maintain backward compatibility {{$calculatedPods := SubtractInt $podsPerNamespace (MultiplyInt $bigDeploymentsPerNamespace $BIG_GROUP_SIZE)}} -# Use explicit conditional block to assign smallDeploymentPods {{$smallDeploymentPods := 0}} {{if $NETWORK_TEST}} - {{$smallDeploymentPods = DivideInt $totalPods $namespaces}} + {{$smallDeploymentPods = $podsPerNamespace}} {{else}} {{$smallDeploymentPods = $calculatedPods}} {{end}} diff --git a/modules/python/clusterloader2/slo/slo.py b/modules/python/clusterloader2/slo/slo.py index 38c28d498..eb66fc581 100644 --- a/modules/python/clusterloader2/slo/slo.py +++ b/modules/python/clusterloader2/slo/slo.py @@ -7,9 +7,6 @@ from utils import parse_xml_to_json, run_cl2_command, get_measurement from kubernetes_client import KubernetesClient -DEFAULT_PODS_PER_NODE = 40 -LOAD_PODS_PER_NODE = 20 - DEFAULT_NODES_PER_NAMESPACE = 100 CPU_REQUEST_LIMIT_MILLI = 1 DAEMONSETS_PER_NODE = { diff --git a/pipelines/perf-eval/CNI Benchmark/slo-servicediscovery-ces.yml b/pipelines/perf-eval/CNI Benchmark/slo-servicediscovery-ces.yml index 79a6a5c32..3b0386116 100644 --- a/pipelines/perf-eval/CNI Benchmark/slo-servicediscovery-ces.yml +++ b/pipelines/perf-eval/CNI Benchmark/slo-servicediscovery-ces.yml @@ -33,6 +33,7 @@ stages: node_count: 1000 node_per_step: 1000 max_pods: 110 + pods_per_node: 20 repeats: 10 scale_timeout: "15m" cilium_enabled: True diff --git a/pipelines/perf-eval/CNI Benchmark/slo-servicediscovery-cilium-nodesubnet.yml b/pipelines/perf-eval/CNI Benchmark/slo-servicediscovery-cilium-nodesubnet.yml index 57e67bdd3..99b25bdb8 100644 --- a/pipelines/perf-eval/CNI Benchmark/slo-servicediscovery-cilium-nodesubnet.yml +++ b/pipelines/perf-eval/CNI Benchmark/slo-servicediscovery-cilium-nodesubnet.yml @@ -32,6 +32,7 @@ stages: node_count: 1000 node_per_step: 1000 max_pods: 110 + pods_per_node: 20 repeats: 10 scale_timeout: "15m" cilium_enabled: True diff --git a/pipelines/perf-eval/CNI Benchmark/slo-servicediscovery-feature.yml b/pipelines/perf-eval/CNI Benchmark/slo-servicediscovery-feature.yml index 306edca1e..32d5ad57d 100644 --- a/pipelines/perf-eval/CNI Benchmark/slo-servicediscovery-feature.yml +++ b/pipelines/perf-eval/CNI Benchmark/slo-servicediscovery-feature.yml @@ -25,6 +25,7 @@ stages: node_count: 1000 node_per_step: 1000 max_pods: 110 + pods_per_node: 20 repeats: 10 scale_timeout: "15m" cilium_enabled: True diff --git a/pipelines/perf-eval/CNI Benchmark/slo-servicediscovery.yml b/pipelines/perf-eval/CNI Benchmark/slo-servicediscovery.yml index 4e941aa4d..c61f7aa29 100644 --- a/pipelines/perf-eval/CNI Benchmark/slo-servicediscovery.yml +++ b/pipelines/perf-eval/CNI Benchmark/slo-servicediscovery.yml @@ -31,6 +31,7 @@ stages: node_count: 1000 node_per_step: 1000 max_pods: 110 + pods_per_node: 20 repeats: 10 scale_timeout: "15m" cilium_enabled: False @@ -58,6 +59,7 @@ stages: node_count: 1000 node_per_step: 1000 max_pods: 110 + pods_per_node: 20 repeats: 10 scale_timeout: "15m" cilium_enabled: False @@ -68,6 +70,7 @@ stages: node_count: 1000 node_per_step: 1000 max_pods: 110 + pods_per_node: 20 repeats: 10 scale_timeout: "15m" cilium_enabled: True diff --git a/steps/engine/clusterloader2/slo/execute.yml b/steps/engine/clusterloader2/slo/execute.yml index d2db812cf..36ab05913 100644 --- a/steps/engine/clusterloader2/slo/execute.yml +++ b/steps/engine/clusterloader2/slo/execute.yml @@ -9,7 +9,7 @@ parameters: type: string - name: pods_per_node type: number - default: 50 + default: 40 - name: no_of_namespaces type: number default: 1 From ec21cce35adc0edf47df01a02c93df398a1dcb27 Mon Sep 17 00:00:00 2001 From: ItiAgrawal Date: Wed, 15 Jan 2025 15:56:57 +0000 Subject: [PATCH 21/23] fix: update nginx image to use the latest version from the Azure container registry --- .../python/clusterloader2/slo/config/deployment_template.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/python/clusterloader2/slo/config/deployment_template.yaml b/modules/python/clusterloader2/slo/config/deployment_template.yaml index 1f43007f0..08424527f 100644 --- a/modules/python/clusterloader2/slo/config/deployment_template.yaml +++ b/modules/python/clusterloader2/slo/config/deployment_template.yaml @@ -47,7 +47,7 @@ spec: {{if $RunNetPolicyTest}} hostNetwork: false containers: - - image: nginx + - image: acnpublic.azurecr.io/scaletest/nginx:latest name: nginx-server ports: - containerPort: 80 From c26601d597f3a2a1c62c408a672f0d22dd45baa2 Mon Sep 17 00:00:00 2001 From: ItiAgrawal Date: Wed, 15 Jan 2025 17:56:53 +0000 Subject: [PATCH 22/23] add metrics for npm --- .../network-policy/net-policy-metrics.yaml | 63 +++++++++++++++++++ 1 file changed, 63 insertions(+) diff --git a/modules/python/clusterloader2/slo/config/modules/network-policy/net-policy-metrics.yaml b/modules/python/clusterloader2/slo/config/modules/network-policy/net-policy-metrics.yaml index 5be48be8b..28a73ffd7 100644 --- a/modules/python/clusterloader2/slo/config/modules/network-policy/net-policy-metrics.yaml +++ b/modules/python/clusterloader2/slo/config/modules/network-policy/net-policy-metrics.yaml @@ -119,4 +119,67 @@ steps: query: avg(cilium_policy) - name: Number of endpoints labeled by policy enforcement status query: sum(cilium_policy_endpoint_enforcement_status) + {{end}} + + - Identifier: NetworkPolicyMetrics-NPM + Method: GenericPrometheusQuery + Params: + action: {{$action}} + metricName: "Network Policy Performance" + metricVersion: v1 + unit: s + queries: + # Policy Metrics + - name: Time taken to add network policy - Perc50 + query: histogram_quantile(0.50, sum(rate(npm_add_policy_exec_time_bucket[%v])) by (le)) + - name: Time taken to add network policy - Perc99 + query: histogram_quantile(0.99, sum(rate(npm_add_policy_exec_time_bucket[%v])) by (le)) + - name: Time taken to delete network policy - Perc50 + query: histogram_quantile(0.50, sum(rate(npm_delete_policy_exec_time_bucket[%v])) by (le)) + - name: Time taken to delete network policy - Perc99 + query: histogram_quantile(0.99, sum(rate(npm_delete_policy_exec_time_bucket[%v])) by (le)) + - name: Number of network policies currently active + query: avg(npm_policy_count) + + # Pod and Namespace Metrics + - name: Time taken to add pod IP mapping - Perc50 + query: histogram_quantile(0.50, sum(rate(npm_add_pod_exec_time_bucket[%v])) by (le)) + - name: Time taken to add pod IP mapping - Perc99 + query: histogram_quantile(0.99, sum(rate(npm_add_pod_exec_time_bucket[%v])) by (le)) + - name: Time taken to delete pod IP mapping - Perc50 + query: histogram_quantile(0.50, sum(rate(npm_delete_pod_exec_time_bucket[%v])) by (le)) + - name: Time taken to delete pod IP mapping - Perc99 + query: histogram_quantile(0.99, sum(rate(npm_delete_pod_exec_time_bucket[%v])) by (le)) + - name: Number of pods currently managed + query: avg(npm_pod_count) + - name: Number of namespaces currently managed + query: avg(npm_namespace_count) + + # IPSet Metrics + - name: Time taken to add IPSet - Perc50 + query: histogram_quantile(0.50, sum(rate(npm_add_set_exec_time_bucket[%v])) by (le)) + - name: Time taken to add IPSet - Perc99 + query: histogram_quantile(0.99, sum(rate(npm_add_set_exec_time_bucket[%v])) by (le)) + - name: Time taken to delete IPSet - Perc50 + query: histogram_quantile(0.50, sum(rate(npm_delete_set_exec_time_bucket[%v])) by (le)) + - name: Time taken to delete IPSet - Perc99 + query: histogram_quantile(0.99, sum(rate(npm_delete_set_exec_time_bucket[%v])) by (le)) + - name: Number of IPSets currently active + query: avg(npm_set_count) + + # Performance Metrics + - name: NPM reconciliation time - Perc50 + query: histogram_quantile(0.50, sum(rate(npm_reconcile_exec_time_bucket[%v])) by (le)) + - name: NPM reconciliation time - Perc99 + query: histogram_quantile(0.99, sum(rate(npm_reconcile_exec_time_bucket[%v])) by (le)) + - name: Memory usage by NPM + query: avg(npm_memory_usage_bytes) + + # Error Rates + - name: Policy application error rate + query: sum(rate(npm_add_policy_exec_time_count{status="error"}[%v])) + - name: Pod mapping error rate + query: sum(rate(npm_add_pod_exec_time_count{status="error"}[%v])) + - name: IPSet operation error rate + query: sum(rate(npm_add_set_exec_time_count{status="error"}[%v])) {{end}} \ No newline at end of file From 00c1e73759815c54b7fda122062e5ca33b6c7b23 Mon Sep 17 00:00:00 2001 From: ItiAgrawal Date: Wed, 15 Jan 2025 18:02:03 +0000 Subject: [PATCH 23/23] adding condition --- .../slo/config/modules/network-policy/net-policy-metrics.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/modules/python/clusterloader2/slo/config/modules/network-policy/net-policy-metrics.yaml b/modules/python/clusterloader2/slo/config/modules/network-policy/net-policy-metrics.yaml index 28a73ffd7..60f4d85c4 100644 --- a/modules/python/clusterloader2/slo/config/modules/network-policy/net-policy-metrics.yaml +++ b/modules/python/clusterloader2/slo/config/modules/network-policy/net-policy-metrics.yaml @@ -3,6 +3,7 @@ {{$usePolicyCreationMetrics := DefaultParam .usePolicyCreationMetrics true}} {{$usePodCreationMetrics := DefaultParam .usePodCreationMetrics true}} {{$useCiliumMetrics := DefaultParam .useCiliumMetrics true}} +{{$useNPMMetrics := DefaultParam .useNPMMetrics true}} # CL2 params # Negative default values are used to turn thresholds off if not overridden. Thresholds are only enabled with values of zero or higher. @@ -121,6 +122,7 @@ steps: query: sum(cilium_policy_endpoint_enforcement_status) {{end}} + {{if $useNPMMetrics}} - Identifier: NetworkPolicyMetrics-NPM Method: GenericPrometheusQuery Params: