diff --git a/cmd/plugins/topology-aware/policy/pod-preferences.go b/cmd/plugins/topology-aware/policy/pod-preferences.go
index 37bc7bf4d..b8f8dde7d 100644
--- a/cmd/plugins/topology-aware/policy/pod-preferences.go
+++ b/cmd/plugins/topology-aware/policy/pod-preferences.go
@@ -44,6 +44,8 @@ const (
 	keyReservedCPUsPreference = "prefer-reserved-cpus"
 	// annotation key for CPU Priority preference
 	keyCpuPriorityPreference = "prefer-cpu-priority"
+	// annotation key for hiding hyperthreads from allocated CPU sets
+	keyHideHyperthreads = "hide-hyperthreads"

 	// effective annotation key for isolated CPU preference
 	preferIsolatedCPUsKey = keyIsolationPreference + "." + kubernetes.ResmgrKeyNamespace
@@ -57,6 +59,8 @@ const (
 	preferReservedCPUsKey = keyReservedCPUsPreference + "." + kubernetes.ResmgrKeyNamespace
 	// effective annotation key for CPU priority preference
 	preferCpuPriorityKey = keyCpuPriorityPreference + "." + kubernetes.ResmgrKeyNamespace
+	// effective annotation key for hiding hyperthreads
+	hideHyperthreadsKey = keyHideHyperthreads + "." + kubernetes.ResmgrKeyNamespace
 )

 // cpuClass is a type of CPU to allocate
@@ -187,6 +191,20 @@ func cpuPrioPreference(pod cache.Pod, container cache.Container, fallback cpuPri
 	return prio
 }

+// hideHyperthreadsPreference returns true if a container should run
+// using only a single hyperthread from each physical core.
+func hideHyperthreadsPreference(pod cache.Pod, container cache.Container) bool {
+	value, ok := container.GetEffectiveAnnotation(hideHyperthreadsKey)
+	if !ok {
+		return false
+	}
+	hide, err := strconv.ParseBool(value)
+	if err != nil {
+		return false
+	}
+	return hide
+}
+
 // memoryTypePreference returns what type of memory should be allocated for the container.
 //
 // If the effective annotations are not found, this function falls back to
diff --git a/cmd/plugins/topology-aware/policy/pools.go b/cmd/plugins/topology-aware/policy/pools.go
index 2716f6c53..8fa1a6b1c 100644
--- a/cmd/plugins/topology-aware/policy/pools.go
+++ b/cmd/plugins/topology-aware/policy/pools.go
@@ -15,6 +15,7 @@ package topologyaware

 import (
+	"fmt"
 	"math"
 	"sort"

@@ -604,6 +605,25 @@ func (p *policy) allocatePool(container cache.Container, poolHint string) (Grant
 	return grant, nil
 }

+// setPreferredCpusetCpus pins the container's CPUs according to what has
+// been allocated for it, taking into account whether the container should
+// run with hyperthreads hidden.
+func (p *policy) setPreferredCpusetCpus(container cache.Container, allocated cpuset.CPUSet, info string) {
+	allow := allocated
+	hidingInfo := ""
+	pod, ok := container.GetPod()
+	if ok && hideHyperthreadsPreference(pod, container) {
+		allow = p.sys.SingleThreadForCPUs(allocated)
+		if allow.Size() != allocated.Size() {
+			hidingInfo = fmt.Sprintf(" (hide %d hyperthreads, remaining cpuset: %s)", allocated.Size()-allow.Size(), allow)
+		} else {
+			hidingInfo = " (no hyperthreads to hide)"
+		}
+	}
+	log.Info("%s%s", info, hidingInfo)
+	container.SetCpusetCpus(allow.String())
+}
+
 // Apply the result of allocation to the requesting container.
 func (p *policy) applyGrant(grant Grant) {
 	log.Info("* applying grant %s", grant)
@@ -615,25 +635,25 @@ func (p *policy) applyGrant(grant Grant) {
 	shared := grant.SharedCPUs()
 	cpuPortion := grant.SharedPortion()

-	cpus := ""
+	cpus := cpuset.New()
 	kind := ""
 	switch cpuType {
 	case cpuNormal:
 		if exclusive.IsEmpty() {
-			cpus = shared.String()
+			cpus = shared
 			kind = "shared"
 		} else {
 			kind = "exclusive"
 			if cpuPortion > 0 {
 				kind += "+shared"
-				cpus = exclusive.Union(shared).String()
+				cpus = exclusive.Union(shared)
 			} else {
-				cpus = exclusive.String()
+				cpus = exclusive
 			}
 		}
 	case cpuReserved:
 		kind = "reserved"
-		cpus = reserved.String()
+		cpus = reserved
 		cpuPortion = grant.ReservedPortion()
 	case cpuPreserve:
 		// Will skip CPU pinning, may still pin memory.
@@ -651,12 +671,14 @@ func (p *policy) applyGrant(grant Grant) {
 	if cpuType == cpuPreserve {
 		log.Info(" => preserving %s cpuset %s", container.PrettyName(), container.GetCpusetCpus())
 	} else {
-		if cpus != "" {
-			log.Info(" => pinning %s to (%s) cpuset %s", container.PrettyName(), kind, cpus)
+		if cpus.Size() > 0 {
+			p.setPreferredCpusetCpus(container, cpus,
+				fmt.Sprintf(" => pinning %s to (%s) cpuset %s",
+					container.PrettyName(), kind, cpus))
 		} else {
 			log.Info(" => not pinning %s CPUs, cpuset is empty...", container.PrettyName())
+			container.SetCpusetCpus("")
 		}
-		container.SetCpusetCpus(cpus)
 	}

 	// Notes:
@@ -757,13 +779,13 @@ func (p *policy) updateSharedAllocations(grant *Grant) {
 		shared := other.GetCPUNode().FreeSupply().SharableCPUs()
 		exclusive := other.ExclusiveCPUs()
 		if exclusive.IsEmpty() {
-			log.Info(" => updating %s with shared CPUs of %s: %s...",
-				other, other.GetCPUNode().Name(), shared.String())
-			other.GetContainer().SetCpusetCpus(shared.String())
+			p.setPreferredCpusetCpus(other.GetContainer(), shared,
+				fmt.Sprintf(" => updating %s with shared CPUs of %s: %s...",
+					other, other.GetCPUNode().Name(), shared.String()))
 		} else {
-			log.Info(" => updating %s with exclusive+shared CPUs of %s: %s+%s...",
-				other, other.GetCPUNode().Name(), exclusive.String(), shared.String())
-			other.GetContainer().SetCpusetCpus(exclusive.Union(shared).String())
+			p.setPreferredCpusetCpus(other.GetContainer(), exclusive.Union(shared),
+				fmt.Sprintf(" => updating %s with exclusive+shared CPUs of %s: %s+%s...",
+					other, other.GetCPUNode().Name(), exclusive.String(), shared.String()))
 		}
 	}
 }
diff --git a/docs/resource-policy/policy/topology-aware.md b/docs/resource-policy/policy/topology-aware.md
index 5ef308981..4735a41de 100644
--- a/docs/resource-policy/policy/topology-aware.md
+++ b/docs/resource-policy/policy/topology-aware.md
@@ -271,6 +271,24 @@ metadata:
 These Pod annotations have no effect on containers which are not eligible for
 exclusive allocation.

+### Selectively Disabling Hyperthreading
+
+If a container opts to hide hyperthreads, it is allowed to use only
+one hyperthread from each physical CPU core allocated to it. As a
+result, the container may be allowed to run on only half of the CPUs
+it has requested. For workloads that do not benefit from
+hyperthreading this nevertheless yields better performance than
+running on all hyperthreads of the same CPU cores. If the container's
+CPU allocation is exclusive, no other container can run on the hidden
+hyperthreads either.
+
+```yaml
+metadata:
+  annotations:
+    # allow the "LLM" container to use only a single thread per physical CPU core
+    hide-hyperthreads.resource-policy.nri.io/container.LLM: "true"
+```
+
 ### Implicit Hardware Topology Hints

 `NRI Resource Policy` automatically generates HW `Topology Hints` for devices
diff --git a/pkg/sysfs/system.go b/pkg/sysfs/system.go
index b3e9d78c4..84ce2da56 100644
--- a/pkg/sysfs/system.go
+++ b/pkg/sysfs/system.go
@@ -116,6 +116,7 @@ type System interface {
 	CoreKindCPUs(CoreKind) cpuset.CPUSet
 	CoreKinds() []CoreKind
 	AllThreadsForCPUs(cpuset.CPUSet) cpuset.CPUSet
+	SingleThreadForCPUs(cpuset.CPUSet) cpuset.CPUSet

 	Offlined() cpuset.CPUSet
 	Isolated() cpuset.CPUSet
@@ -685,6 +686,25 @@ func (sys *system) AllThreadsForCPUs(cpus cpuset.CPUSet) cpuset.CPUSet {
 	return all
 }

+// SingleThreadForCPUs returns the subset of the input cpus in which only
+// the CPU with the smallest id from each physical core is included.
+func (sys *system) SingleThreadForCPUs(cpus cpuset.CPUSet) cpuset.CPUSet {
+	result := make([]int, 0, cpus.Size())
+	handled := make(map[int]struct{}, cpus.Size())
+	for _, cpu := range cpus.List() {
+		if _, ok := handled[cpu]; ok {
+			continue
+		}
+		handled[cpu] = struct{}{}
+		result = append(result, cpu)
+		for _, sibling := range sys.CPU(cpu).ThreadCPUSet().UnsortedList() {
+			handled[sibling] = struct{}{}
+		}
+	}
+	return cpuset.New(result...)
+}
+
 // Offlined gets the set of offlined CPUs.
 func (sys *system) Offlined() cpuset.CPUSet {
 	return sys.OfflineCPUs()
diff --git a/test/e2e/policies.test-suite/topology-aware/n4c16/test00-basic-placement/code.var.sh b/test/e2e/policies.test-suite/topology-aware/n4c16/test00-basic-placement/code.var.sh
index 007bb4ac0..ab597cf50 100644
--- a/test/e2e/policies.test-suite/topology-aware/n4c16/test00-basic-placement/code.var.sh
+++ b/test/e2e/policies.test-suite/topology-aware/n4c16/test00-basic-placement/code.var.sh
@@ -38,12 +38,13 @@ vm-command "kubectl delete pods --all --now"

 # pod1: Test that 4 guaranteed containers not eligible for isolated CPU allocation
 # gets evenly spread over NUMA nodes.
-CONTCOUNT=4 CPU=3 create guaranteed
+CONTCOUNT=4 CPU=3 ANN0='hide-hyperthreads.resource-policy.nri.io/container.pod1c2: "true"' create guaranteed
 report allowed
 verify \
     'len(cpus["pod1c0"]) == 3' \
     'len(cpus["pod1c1"]) == 3' \
-    'len(cpus["pod1c2"]) == 3' \
+    'len(cpus["pod1c2"]) == 2' \
+    'len(cores["pod1c2"]) == 2' \
     'len(cpus["pod1c3"]) == 3' \
     'disjoint_sets(cpus["pod1c0"], cpus["pod1c1"], cpus["pod1c2"], cpus["pod1c3"])' \
     'disjoint_sets(nodes["pod1c0"], nodes["pod1c1"], nodes["pod1c2"], nodes["pod1c3"])'
diff --git a/test/e2e/policies.test-suite/topology-aware/n4c16/test11-reserved-cpu-annotations/code.var.sh b/test/e2e/policies.test-suite/topology-aware/n4c16/test11-reserved-cpu-annotations/code.var.sh
index b9c20ef79..315b4f884 100644
--- a/test/e2e/policies.test-suite/topology-aware/n4c16/test11-reserved-cpu-annotations/code.var.sh
+++ b/test/e2e/policies.test-suite/topology-aware/n4c16/test11-reserved-cpu-annotations/code.var.sh
@@ -7,6 +7,7 @@ cleanup-test-pods() {
     ( vm-command "kubectl delete pods pod0 --now" ) || true
     ( vm-command "kubectl delete pods pod1 --now" ) || true
     ( vm-command "kubectl delete pods pod2 --now" ) || true
+    ( vm-command "kubectl delete pods pod3 --now" ) || true
 }

 cleanup-test-pods
@@ -19,33 +20,41 @@ helm_config=$(instantiate helm-config.yaml)
 helm-launch topology-aware

 ANNOTATIONS='prefer-reserved-cpus.resource-policy.nri.io/pod: "true"'
 CONTCOUNT=1 create reserved-annotated
 report allowed
+verify 'cpus["pod0c0"] == {"cpu10", "cpu11"}'

 ANNOTATIONS='prefer-reserved-cpus.resource-policy.nri.io/container.special: "false"'
 CONTCOUNT=1 create reserved-annotated
 report allowed
-
-verify 'cpus["pod0c0"] == {"cpu10", "cpu11"}'
 verify 'cpus["pod1c0"] == {"cpu08"}'

+vm-command "kubectl delete pods pod0 --now"
+ANNOTATIONS=(
+    'prefer-reserved-cpus.resource-policy.nri.io/pod: "true"'
+    'hide-hyperthreads.resource-policy.nri.io/pod: "true"'
+)
+CONTCOUNT=1 create reserved-annotated
+report allowed
+verify 'cpus["pod2c0"] == {"cpu10"}'
+
 ANNOTATIONS=(
     'cpu.preserve.resource-policy.nri.io: "true"'
-    'memory.preserve.resource-policy.nri.io/container.pod2c1: "true"'
-    'memory.preserve.resource-policy.nri.io/container.pod2c2: "true"'
-    'cpu.preserve.resource-policy.nri.io/container.pod2c2: "false"'
-    'cpu.preserve.resource-policy.nri.io/container.pod2c3: "false"'
-    'memory.preserve.resource-policy.nri.io/container.pod2c3: "false"'
+    'memory.preserve.resource-policy.nri.io/container.pod3c1: "true"'
+    'memory.preserve.resource-policy.nri.io/container.pod3c2: "true"'
+    'cpu.preserve.resource-policy.nri.io/container.pod3c2: "false"'
+    'cpu.preserve.resource-policy.nri.io/container.pod3c3: "false"'
+    'memory.preserve.resource-policy.nri.io/container.pod3c3: "false"'
 )
 CONTCOUNT=4 CPU=100m MEM=100M create reserved-annotated
 report allowed
-verify 'len(cpus["pod2c0"]) == 16' \
-       'len(mems["pod2c0"]) == 4' \
-       'len(cpus["pod2c1"]) == 16' \
-       'len(mems["pod2c1"]) == 4' \
-       'len(cpus["pod2c2"]) == 1' \
-       'len(mems["pod2c2"]) == 4' \
-       'len(cpus["pod2c3"]) == 1' \
-       'len(mems["pod2c3"]) == 1'
+verify 'len(cpus["pod3c0"]) == 16' \
+       'len(mems["pod3c0"]) == 4' \
+       'len(cpus["pod3c1"]) == 16' \
+       'len(mems["pod3c1"]) == 4' \
+       'len(cpus["pod3c2"]) == 1' \
+       'len(mems["pod3c2"]) == 4' \
+       'len(cpus["pod3c3"]) == 1' \
+       'len(mems["pod3c3"]) == 1'

 cleanup-test-pods
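
For reference, here is a minimal, self-contained Go sketch of the single-thread-per-core selection that `SingleThreadForCPUs` performs in the patch above. The hard-coded `siblings` map is an assumption for illustration only, modeling a hypothetical 2-core/4-thread machine in place of the sysfs thread-sibling lookup; it is a sketch of the technique, not the plugin's actual implementation.

```go
package main

import (
	"fmt"
	"sort"
)

// siblings models a hypothetical 2-core/4-thread topology:
// core 0 has threads {0, 2}, core 1 has threads {1, 3}.
// In the real plugin this information comes from sysfs, not a map.
var siblings = map[int][]int{
	0: {0, 2},
	2: {0, 2},
	1: {1, 3},
	3: {1, 3},
}

// singleThreadForCPUs keeps only the smallest-id CPU of each physical
// core present in cpus, mirroring the selection logic of the patch.
func singleThreadForCPUs(cpus []int) []int {
	sort.Ints(cpus) // visit CPUs in increasing id order
	result := make([]int, 0, len(cpus))
	handled := make(map[int]struct{}, len(cpus))
	for _, cpu := range cpus {
		if _, ok := handled[cpu]; ok {
			continue
		}
		handled[cpu] = struct{}{}
		result = append(result, cpu)
		// mark all hyperthread siblings of this core as handled
		for _, sibling := range siblings[cpu] {
			handled[sibling] = struct{}{}
		}
	}
	return result
}

func main() {
	// All four hyperthreads allocated; hiding leaves one thread per core.
	fmt.Println(singleThreadForCPUs([]int{0, 1, 2, 3})) // [0 1]
}
```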