From 515f99a1a52496160fee7ca331e2f017379db813 Mon Sep 17 00:00:00 2001
From: Antti Kervinen
Date: Mon, 2 Sep 2024 12:00:26 +0300
Subject: [PATCH] e2e: add topology-aware memory allocation tests.

Add the first e2e tests for topology-aware policy memory allocation and
memory type control.

Co-authored-by: Krisztian Litkey
Signed-off-by: Krisztian Litkey
---
 .../topology-aware/helm-config.yaml.in |  1 +
 .../n6-hbm-cxl/py_consts.var.py        |  6 ++
 .../test01-memory-types/code.var.sh    | 65 +++++++++++++++++++
 .../n6-hbm-cxl/topology.var.json       |  7 ++
 4 files changed, 79 insertions(+)
 create mode 100644 test/e2e/policies.test-suite/topology-aware/n6-hbm-cxl/py_consts.var.py
 create mode 100644 test/e2e/policies.test-suite/topology-aware/n6-hbm-cxl/test01-memory-types/code.var.sh
 create mode 100644 test/e2e/policies.test-suite/topology-aware/n6-hbm-cxl/topology.var.json

diff --git a/test/e2e/policies.test-suite/topology-aware/helm-config.yaml.in b/test/e2e/policies.test-suite/topology-aware/helm-config.yaml.in
index 1adb03c63..3f436dd2a 100644
--- a/test/e2e/policies.test-suite/topology-aware/helm-config.yaml.in
+++ b/test/e2e/policies.test-suite/topology-aware/helm-config.yaml.in
@@ -20,6 +20,7 @@ config:
       - nri-resource-policy
       - resource-manager
       - cache
+      - libmem
       - policy
     source: true
   klog:
diff --git a/test/e2e/policies.test-suite/topology-aware/n6-hbm-cxl/py_consts.var.py b/test/e2e/policies.test-suite/topology-aware/n6-hbm-cxl/py_consts.var.py
new file mode 100644
index 000000000..5571e9d1c
--- /dev/null
+++ b/test/e2e/policies.test-suite/topology-aware/n6-hbm-cxl/py_consts.var.py
@@ -0,0 +1,6 @@
+dram0 = "node0"
+dram1 = "node1"
+hbm0 = "node2"
+hbm1 = "node3"
+pmem0 = "node4"
+pmem1 = "node5"
diff --git a/test/e2e/policies.test-suite/topology-aware/n6-hbm-cxl/test01-memory-types/code.var.sh b/test/e2e/policies.test-suite/topology-aware/n6-hbm-cxl/test01-memory-types/code.var.sh
new file mode 100644
index 000000000..8b4644a2b
--- /dev/null
+++ b/test/e2e/policies.test-suite/topology-aware/n6-hbm-cxl/test01-memory-types/code.var.sh
@@ -0,0 +1,65 @@
+cleanup() {
+    vm-command "kubectl delete pods --all --now"
+    helm-terminate
+}
+
+cleanup
+helm_config=$(instantiate helm-config.yaml) helm-launch topology-aware
+
+# container pod0c0 has no annotations, so it gets the default memory types: dram, pmem
+ANN1="memory-type.resource-policy.nri.io/container.pod0c1: dram" \
+    ANN2="memory-type.resource-policy.nri.io/container.pod0c2: hbm" \
+    ANN3="memory-type.resource-policy.nri.io/container.pod0c3: pmem" \
+    CONTCOUNT=4 \
+    CPU=100m \
+    MEM=512M \
+    create guaranteed
+
+report allowed
+verify 'mems["pod0c0"] == {dram0, pmem0} if packages["pod0c0"] == {"package0"} else mems["pod0c0"] == {dram1, pmem1}' \
+       'mems["pod0c1"] == {dram0} if packages["pod0c1"] == {"package0"} else mems["pod0c1"] == {dram1}' \
+       'mems["pod0c2"] == {hbm0} if packages["pod0c2"] == {"package0"} else mems["pod0c2"] == {hbm1}' \
+       'mems["pod0c3"] == {pmem0} if packages["pod0c3"] == {"package0"} else mems["pod0c3"] == {pmem1}'
+
+# Release memory allocated for pod0c*. If something is left behind in
+# hbm or dram, the next test fails. If not, it will pass.
+vm-command "kubectl delete pods pod0 --now"
+
+ANN0="memory-type.resource-policy.nri.io/container.pod1c0: hbm,dram" \
+    ANN1="memory-type.resource-policy.nri.io/container.pod1c1: hbm,dram" \
+    ANN2="memory-type.resource-policy.nri.io/container.pod1c2: pmem" \
+    ANN3="memory-type.resource-policy.nri.io/container.pod1c3: pmem" \
+    CONTCOUNT=4 \
+    CPU=100m \
+    MEM=2816M \
+    create guaranteed
+
+report allowed
+verify 'mems["pod1c0"] == {hbm0, dram0} if packages["pod1c0"] == {"package0"} else mems["pod1c0"] == {hbm1, dram1}' \
+       'mems["pod1c1"] == {hbm0, dram0} if packages["pod1c1"] == {"package0"} else mems["pod1c1"] == {hbm1, dram1}' \
+       'mems["pod1c2"] == {pmem0} if packages["pod1c2"] == {"package0"} else mems["pod1c2"] == {pmem1}' \
+       'mems["pod1c3"] == {pmem0} if packages["pod1c3"] == {"package0"} else mems["pod1c3"] == {pmem1}'
+
+# 2.6G + 2.6G of PMEM is consumed, 1.4G + 1.4G remains. One more 2.0G
+# pmem allocation does not fit into any single PMEM node. libmem first
+# finds an initial placement without considering overflow of any node.
+# Then it attempts to resolve overflows by spreading lower priority
+# requests until no zone overflows. Priority increases with request
+# QoS class, size, and age. Here all containers are in the same
+# (guaranteed) QoS class and the pod1 containers are larger than the
+# pod2 container, so pod2c0 should end up spread over {pmem0, pmem1}.
+
+ANN0="memory-type.resource-policy.nri.io/container.pod2c0: pmem" \
+    CONTCOUNT=1 \
+    CPU=100m \
+    MEM=2G \
+    create guaranteed
+
+report allowed
+verify 'mems["pod1c0"] == {hbm0, dram0} if packages["pod1c0"] == {"package0"} else mems["pod1c0"] == {hbm1, dram1}' \
+       'mems["pod1c1"] == {hbm0, dram0} if packages["pod1c1"] == {"package0"} else mems["pod1c1"] == {hbm1, dram1}' \
+       'mems["pod1c2"] == {pmem0} if packages["pod1c2"] == {"package0"} else mems["pod1c2"] == {pmem1}' \
+       'mems["pod1c3"] == {pmem0} if packages["pod1c3"] == {"package0"} else mems["pod1c3"] == {pmem1}' \
+       'mems["pod2c0"] == {pmem0, pmem1}'
+
+cleanup
diff --git a/test/e2e/policies.test-suite/topology-aware/n6-hbm-cxl/topology.var.json b/test/e2e/policies.test-suite/topology-aware/n6-hbm-cxl/topology.var.json
new file mode 100644
index 000000000..1695b4605
--- /dev/null
+++ b/test/e2e/policies.test-suite/topology-aware/n6-hbm-cxl/topology.var.json
@@ -0,0 +1,7 @@
+[
+    {"mem": "2G", "threads": 2, "cores": 2, "nodes": 1, "packages": 2},
+    {"mem": "1G", "node-dist": {"0": 15, "1": 30, "2": 10, "3": 35}},
+    {"mem": "1G", "node-dist": {"0": 30, "1": 15, "2": 35, "3": 10}},
+    {"mem": "4G", "node-dist": {"0": 60, "1": 70, "2": 62, "3": 72, "4": 10, "5": 75}},
+    {"mem": "4G", "node-dist": {"0": 70, "1": 60, "2": 72, "3": 62, "4": 75, "5": 10}}
+]
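
The overflow-resolution behaviour described in the comment in
test01-memory-types/code.var.sh can be pictured with the minimal Python sketch
below. It is not the libmem implementation: the node capacities, the request
sizes (2816M and 2G expressed in GiB), and the priority key (reduced to size
and age, since every container in the test is in the guaranteed QoS class) are
assumptions made for illustration, and all names in the sketch are hypothetical.

from dataclasses import dataclass

NODE_CAPACITY = {"pmem0": 4.0, "pmem1": 4.0}   # GiB, as in topology.var.json
ALL_NODES = frozenset(NODE_CAPACITY)

@dataclass
class Request:
    name: str
    size: float       # GiB of pmem requested
    age: float        # seconds since the request was made; older = higher priority
    zone: frozenset   # set of nodes the request may currently use

def usage(zone, requests):
    # Demand that must fit inside 'zone': every request whose zone is a subset of it.
    return sum(r.size for r in requests if r.zone <= zone)

def capacity(zone):
    return sum(NODE_CAPACITY[n] for n in zone)

def priority(r):
    # QoS class omitted: all containers in the test are guaranteed.
    # Smaller and newer requests have lower priority and are spread first.
    return (r.size, r.age)

def resolve(requests):
    # 1. Initial placement (the per-request zones below) ignores overflow.
    # 2. Spread the lowest-priority request of an overflowing zone until no zone overflows.
    zones = {ALL_NODES} | {r.zone for r in requests}
    while True:
        overflowing = [z for z in zones if usage(z, requests) > capacity(z)]
        if not overflowing:
            return requests
        zone = min(overflowing, key=len)                        # innermost overflow
        victim = min((r for r in requests if r.zone <= zone), key=priority)
        victim.zone = ALL_NODES                                 # spread the request

requests = [
    # 2816M ~= 2.62 GiB, 2G ~= 1.86 GiB; pod2c0's assumed initial pick is pmem0,
    # next to pod1c2, which makes pmem0 overflow (2.62 + 1.86 > 4.0).
    Request("pod1c2", size=2.62, age=60, zone=frozenset({"pmem0"})),
    Request("pod1c3", size=2.62, age=60, zone=frozenset({"pmem1"})),
    Request("pod2c0", size=1.86, age=0,  zone=frozenset({"pmem0"})),
]
for r in resolve(requests):
    print(r.name, sorted(r.zone))
# pod2c0 is the smallest and newest request, so it is the one that ends up
# spread over {pmem0, pmem1}, matching the last verify in the test above.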