From 1e6050dbd7aa1cc57249e5795322217eb281ccee Mon Sep 17 00:00:00 2001
From: Antti Kervinen
Date: Fri, 14 Jun 2024 13:21:18 +0300
Subject: [PATCH] topology-aware: add support for hide-hyperthreads annotation.

Containers with an effective hide-hyperthreads annotation are allowed
to run only on the first CPU hyperthread of each physical CPU core
allocated to their pool.

Signed-off-by: Antti Kervinen
---
 .../topology-aware/policy/pod-preferences.go  | 18 +++++++
 cmd/plugins/topology-aware/policy/pools.go    | 50 +++++++++++++------
 docs/resource-policy/policy/topology-aware.md | 18 +++++++
 pkg/sysfs/system.go                           | 20 ++++++++
 .../n4c16/test00-basic-placement/code.var.sh  |  5 +-
 .../code.var.sh                               | 39 +++++++++------
 6 files changed, 119 insertions(+), 31 deletions(-)

diff --git a/cmd/plugins/topology-aware/policy/pod-preferences.go b/cmd/plugins/topology-aware/policy/pod-preferences.go
index 37bc7bf4d..b8f8dde7d 100644
--- a/cmd/plugins/topology-aware/policy/pod-preferences.go
+++ b/cmd/plugins/topology-aware/policy/pod-preferences.go
@@ -44,6 +44,8 @@ const (
 	keyReservedCPUsPreference = "prefer-reserved-cpus"
 	// annotation key for CPU Priority preference
 	keyCpuPriorityPreference = "prefer-cpu-priority"
+	// annotation key for hiding hyperthreads from allocated CPU sets
+	keyHideHyperthreads = "hide-hyperthreads"
 
 	// effective annotation key for isolated CPU preference
 	preferIsolatedCPUsKey = keyIsolationPreference + "." + kubernetes.ResmgrKeyNamespace
@@ -57,6 +59,8 @@ const (
 	preferReservedCPUsKey = keyReservedCPUsPreference + "." + kubernetes.ResmgrKeyNamespace
 	// effective annotation key for CPU priority preference
 	preferCpuPriorityKey = keyCpuPriorityPreference + "." + kubernetes.ResmgrKeyNamespace
+	// effective annotation key for hiding hyperthreads
+	hideHyperthreadsKey = keyHideHyperthreads + "." + kubernetes.ResmgrKeyNamespace
 )
 
 // cpuClass is a type of CPU to allocate
@@ -187,6 +191,20 @@ func cpuPrioPreference(pod cache.Pod, container cache.Container, fallback cpuPri
 	return prio
 }
 
+// hideHyperthreadsPreference returns whether a container should run
+// using only a single hyperthread from each physical core.
+func hideHyperthreadsPreference(pod cache.Pod, container cache.Container) bool {
+	value, ok := container.GetEffectiveAnnotation(hideHyperthreadsKey)
+	if !ok {
+		return false
+	}
+	hide, err := strconv.ParseBool(value)
+	if err != nil {
+		return false
+	}
+	return hide
+}
+
 // memoryTypePreference returns what type of memory should be allocated for the container.
 //
 // If the effective annotations are not found, this function falls back to
diff --git a/cmd/plugins/topology-aware/policy/pools.go b/cmd/plugins/topology-aware/policy/pools.go
index 2716f6c53..8fa1a6b1c 100644
--- a/cmd/plugins/topology-aware/policy/pools.go
+++ b/cmd/plugins/topology-aware/policy/pools.go
@@ -15,6 +15,7 @@
 package topologyaware
 
 import (
+	"fmt"
 	"math"
 	"sort"
 
@@ -604,6 +605,25 @@ func (p *policy) allocatePool(container cache.Container, poolHint string) (Grant
 	return grant, nil
 }
 
+// setPreferredCpusetCpus pins the container's CPUs according to what has
+// been allocated for it, taking into account whether the container should
+// run with hyperthreads hidden.
+func (p *policy) setPreferredCpusetCpus(container cache.Container, allocated cpuset.CPUSet, info string) {
+	allow := allocated
+	hidingInfo := ""
+	pod, ok := container.GetPod()
+	if ok && hideHyperthreadsPreference(pod, container) {
+		allow = p.sys.SingleThreadForCPUs(allocated)
+		if allow.Size() != allocated.Size() {
+			hidingInfo = fmt.Sprintf(" (hide %d hyperthreads, remaining cpuset: %s)", allocated.Size()-allow.Size(), allow)
+		} else {
+			hidingInfo = " (no hyperthreads to hide)"
+		}
+	}
+	log.Info("%s%s", info, hidingInfo)
+	container.SetCpusetCpus(allow.String())
+}
+
 // Apply the result of allocation to the requesting container.
 func (p *policy) applyGrant(grant Grant) {
 	log.Info("* applying grant %s", grant)
@@ -615,25 +635,25 @@ func (p *policy) applyGrant(grant Grant) {
 	shared := grant.SharedCPUs()
 	cpuPortion := grant.SharedPortion()
 
-	cpus := ""
+	cpus := cpuset.New()
 	kind := ""
 	switch cpuType {
 	case cpuNormal:
 		if exclusive.IsEmpty() {
-			cpus = shared.String()
+			cpus = shared
 			kind = "shared"
 		} else {
 			kind = "exclusive"
 			if cpuPortion > 0 {
 				kind += "+shared"
-				cpus = exclusive.Union(shared).String()
+				cpus = exclusive.Union(shared)
 			} else {
-				cpus = exclusive.String()
+				cpus = exclusive
 			}
 		}
 	case cpuReserved:
 		kind = "reserved"
-		cpus = reserved.String()
+		cpus = reserved
 		cpuPortion = grant.ReservedPortion()
 	case cpuPreserve:
 		// Will skip CPU pinning, may still pin memory.
@@ -651,12 +671,14 @@ func (p *policy) applyGrant(grant Grant) {
 	if cpuType == cpuPreserve {
 		log.Info(" => preserving %s cpuset %s", container.PrettyName(), container.GetCpusetCpus())
 	} else {
-		if cpus != "" {
-			log.Info(" => pinning %s to (%s) cpuset %s", container.PrettyName(), kind, cpus)
+		if cpus.Size() > 0 {
+			p.setPreferredCpusetCpus(container, cpus,
+				fmt.Sprintf(" => pinning %s to (%s) cpuset %s",
+					container.PrettyName(), kind, cpus))
 		} else {
 			log.Info(" => not pinning %s CPUs, cpuset is empty...", container.PrettyName())
+			container.SetCpusetCpus("")
 		}
-		container.SetCpusetCpus(cpus)
 	}
 
 	// Notes:
@@ -757,13 +779,13 @@ func (p *policy) updateSharedAllocations(grant *Grant) {
 		shared := other.GetCPUNode().FreeSupply().SharableCPUs()
 		exclusive := other.ExclusiveCPUs()
 		if exclusive.IsEmpty() {
-			log.Info(" => updating %s with shared CPUs of %s: %s...",
-				other, other.GetCPUNode().Name(), shared.String())
-			other.GetContainer().SetCpusetCpus(shared.String())
+			p.setPreferredCpusetCpus(other.GetContainer(), shared,
+				fmt.Sprintf(" => updating %s with shared CPUs of %s: %s...",
+					other, other.GetCPUNode().Name(), shared.String()))
 		} else {
-			log.Info(" => updating %s with exclusive+shared CPUs of %s: %s+%s...",
-				other, other.GetCPUNode().Name(), exclusive.String(), shared.String())
-			other.GetContainer().SetCpusetCpus(exclusive.Union(shared).String())
+			p.setPreferredCpusetCpus(other.GetContainer(), exclusive.Union(shared),
+				fmt.Sprintf(" => updating %s with exclusive+shared CPUs of %s: %s+%s...",
+					other, other.GetCPUNode().Name(), exclusive.String(), shared.String()))
 		}
 	}
 }
diff --git a/docs/resource-policy/policy/topology-aware.md b/docs/resource-policy/policy/topology-aware.md
index 5ef308981..4735a41de 100644
--- a/docs/resource-policy/policy/topology-aware.md
+++ b/docs/resource-policy/policy/topology-aware.md
@@ -271,6 +271,24 @@ metadata:
 These Pod annotations have no effect on containers which are not eligible
 for exclusive allocation.
 
+### Selectively Disabling Hyperthreading
+
+If a container opts to hide hyperthreads, it is allowed to use only
+one hyperthread from every physical CPU core allocated to it. Note
+that as a result the container may be allowed to run on only half of
+the CPUs it has requested. For workloads that do not benefit from
+hyperthreading, this can nevertheless result in better performance
+than running on all hyperthreads of the same CPU cores. If the
+container's CPU allocation is exclusive, no other container can run
+on the hidden hyperthreads either.
+
+```yaml
+metadata:
+  annotations:
+    # allow the "LLM" container to use only a single thread per physical CPU core
+    hide-hyperthreads.resource-policy.nri.io/container.LLM: "true"
+```
+
 ### Implicit Hardware Topology Hints
 
 `NRI Resource Policy` automatically generates HW `Topology Hints` for devices
diff --git a/pkg/sysfs/system.go b/pkg/sysfs/system.go
index b3e9d78c4..84ce2da56 100644
--- a/pkg/sysfs/system.go
+++ b/pkg/sysfs/system.go
@@ -116,6 +116,7 @@ type System interface {
 	CoreKindCPUs(CoreKind) cpuset.CPUSet
 	CoreKinds() []CoreKind
 	AllThreadsForCPUs(cpuset.CPUSet) cpuset.CPUSet
+	SingleThreadForCPUs(cpuset.CPUSet) cpuset.CPUSet
 
 	Offlined() cpuset.CPUSet
 	Isolated() cpuset.CPUSet
@@ -685,6 +686,25 @@ func (sys *system) AllThreadsForCPUs(cpus cpuset.CPUSet) cpuset.CPUSet {
 	return all
 }
 
+// SingleThreadForCPUs returns a subset of the input cpus so that
+// from each physical core, only the CPU with the smallest id is
+// included in the set.
+func (sys *system) SingleThreadForCPUs(cpus cpuset.CPUSet) cpuset.CPUSet {
+	result := make([]int, 0, cpus.Size())
+	handled := make(map[int]struct{}, cpus.Size())
+	for _, cpu := range cpus.List() {
+		if _, ok := handled[cpu]; ok {
+			continue
+		}
+		handled[cpu] = struct{}{}
+		result = append(result, cpu)
+		for _, sibling := range sys.CPU(cpu).ThreadCPUSet().UnsortedList() {
+			handled[sibling] = struct{}{}
+		}
+	}
+	return cpuset.New(result...)
+}
+
 // Offlined gets the set of offlined CPUs.
 func (sys *system) Offlined() cpuset.CPUSet {
 	return sys.OfflineCPUs()
diff --git a/test/e2e/policies.test-suite/topology-aware/n4c16/test00-basic-placement/code.var.sh b/test/e2e/policies.test-suite/topology-aware/n4c16/test00-basic-placement/code.var.sh
index 007bb4ac0..ab597cf50 100644
--- a/test/e2e/policies.test-suite/topology-aware/n4c16/test00-basic-placement/code.var.sh
+++ b/test/e2e/policies.test-suite/topology-aware/n4c16/test00-basic-placement/code.var.sh
@@ -38,12 +38,13 @@ vm-command "kubectl delete pods --all --now"
 
 # pod1: Test that 4 guaranteed containers not eligible for isolated CPU allocation
 # gets evenly spread over NUMA nodes.
-CONTCOUNT=4 CPU=3 create guaranteed
+CONTCOUNT=4 CPU=3 ANN0='hide-hyperthreads.resource-policy.nri.io/container.pod1c2: "true"' create guaranteed
 report allowed
 verify \
     'len(cpus["pod1c0"]) == 3' \
     'len(cpus["pod1c1"]) == 3' \
-    'len(cpus["pod1c2"]) == 3' \
+    'len(cpus["pod1c2"]) == 2' \
+    'len(cores["pod1c2"]) == 2' \
     'len(cpus["pod1c3"]) == 3' \
     'disjoint_sets(cpus["pod1c0"], cpus["pod1c1"], cpus["pod1c2"], cpus["pod1c3"])' \
     'disjoint_sets(nodes["pod1c0"], nodes["pod1c1"], nodes["pod1c2"], nodes["pod1c3"])'
diff --git a/test/e2e/policies.test-suite/topology-aware/n4c16/test11-reserved-cpu-annotations/code.var.sh b/test/e2e/policies.test-suite/topology-aware/n4c16/test11-reserved-cpu-annotations/code.var.sh
index b9c20ef79..315b4f884 100644
--- a/test/e2e/policies.test-suite/topology-aware/n4c16/test11-reserved-cpu-annotations/code.var.sh
+++ b/test/e2e/policies.test-suite/topology-aware/n4c16/test11-reserved-cpu-annotations/code.var.sh
@@ -7,6 +7,7 @@ cleanup-test-pods() {
     ( vm-command "kubectl delete pods pod0 --now" ) || true
     ( vm-command "kubectl delete pods pod1 --now" ) || true
     ( vm-command "kubectl delete pods pod2 --now" ) || true
+    ( vm-command "kubectl delete pods pod3 --now" ) || true
 }
 
 cleanup-test-pods
@@ -19,33 +20,41 @@ helm_config=$(instantiate helm-config.yaml)
 helm-launch topology-aware
 
 ANNOTATIONS='prefer-reserved-cpus.resource-policy.nri.io/pod: "true"' CONTCOUNT=1 create reserved-annotated
 report allowed
+verify 'cpus["pod0c0"] == {"cpu10", "cpu11"}'
 
 ANNOTATIONS='prefer-reserved-cpus.resource-policy.nri.io/container.special: "false"' CONTCOUNT=1 create reserved-annotated
 report allowed
-
-verify 'cpus["pod0c0"] == {"cpu10", "cpu11"}'
 verify 'cpus["pod1c0"] == {"cpu08"}'
 
+vm-command "kubectl delete pods pod0 --now"
+ANNOTATIONS=(
+    'prefer-reserved-cpus.resource-policy.nri.io/pod: "true"'
+    'hide-hyperthreads.resource-policy.nri.io/pod: "true"'
+)
+CONTCOUNT=1 create reserved-annotated
+report allowed
+verify 'cpus["pod2c0"] == {"cpu10"}'
+
 ANNOTATIONS=(
     'cpu.preserve.resource-policy.nri.io: "true"'
-    'memory.preserve.resource-policy.nri.io/container.pod2c1: "true"'
-    'memory.preserve.resource-policy.nri.io/container.pod2c2: "true"'
-    'cpu.preserve.resource-policy.nri.io/container.pod2c2: "false"'
-    'cpu.preserve.resource-policy.nri.io/container.pod2c3: "false"'
-    'memory.preserve.resource-policy.nri.io/container.pod2c3: "false"'
+    'memory.preserve.resource-policy.nri.io/container.pod3c1: "true"'
+    'memory.preserve.resource-policy.nri.io/container.pod3c2: "true"'
+    'cpu.preserve.resource-policy.nri.io/container.pod3c2: "false"'
+    'cpu.preserve.resource-policy.nri.io/container.pod3c3: "false"'
+    'memory.preserve.resource-policy.nri.io/container.pod3c3: "false"'
 )
 CONTCOUNT=4 CPU=100m MEM=100M create reserved-annotated
 report allowed
-verify 'len(cpus["pod2c0"]) == 16' \
-       'len(mems["pod2c0"]) == 4' \
-       'len(cpus["pod2c1"]) == 16' \
-       'len(mems["pod2c1"]) == 4' \
-       'len(cpus["pod2c2"]) == 1' \
-       'len(mems["pod2c2"]) == 4' \
-       'len(cpus["pod2c3"]) == 1' \
-       'len(mems["pod2c3"]) == 1'
+verify 'len(cpus["pod3c0"]) == 16' \
+       'len(mems["pod3c0"]) == 4' \
+       'len(cpus["pod3c1"]) == 16' \
+       'len(mems["pod3c1"]) == 4' \
+       'len(cpus["pod3c2"]) == 1' \
+       'len(mems["pod3c2"]) == 4' \
+       'len(cpus["pod3c3"]) == 1' \
+       'len(mems["pod3c3"]) == 1'
 
 cleanup-test-pods
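
The core of the patch is the per-physical-core filtering done by
SingleThreadForCPUs. Below is a minimal standalone sketch of that
selection logic; the `siblings` map and the `k8s.io/utils/cpuset`
import are assumptions standing in for the sysfs topology lookup
(`sys.CPU(cpu).ThreadCPUSet()`) used by the actual implementation.

```go
package main

import (
	"fmt"

	"k8s.io/utils/cpuset"
)

// singleThreadForCPUs keeps only the smallest-id CPU of every physical
// core present in cpus. The siblings map is a stand-in for sysfs: it
// lists all hyperthreads sharing a core with a given CPU, itself included.
func singleThreadForCPUs(cpus cpuset.CPUSet, siblings map[int][]int) cpuset.CPUSet {
	result := make([]int, 0, cpus.Size())
	handled := make(map[int]struct{}, cpus.Size())
	for _, cpu := range cpus.List() { // List() returns CPU ids in ascending order
		if _, ok := handled[cpu]; ok {
			continue // a lower-id sibling of this core was already picked
		}
		handled[cpu] = struct{}{}
		result = append(result, cpu)
		for _, sibling := range siblings[cpu] {
			handled[sibling] = struct{}{}
		}
	}
	return cpuset.New(result...)
}

func main() {
	// Hypothetical topology: CPUs (2,10) and (3,11) are hyperthread pairs.
	siblings := map[int][]int{
		2: {2, 10}, 10: {2, 10},
		3: {3, 11}, 11: {3, 11},
	}
	allocated := cpuset.New(2, 3, 10, 11)
	fmt.Println(singleThreadForCPUs(allocated, siblings)) // prints "2-3"
}
```

Running the sketch prints "2-3": from each two-thread core only the
lower-numbered hyperthread remains, which is what the hide-hyperthreads
annotation enforces for the container's cpuset.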