topology-aware: add support for hide-hyperthreads annotation.
Containers with an effective hide-hyperthreads annotation are allowed to
run only on the first hyperthread of each physical CPU core
allocated to their pool.
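
For example, using the container-scoped annotation key documented in
topology-aware.md in this commit:

```yaml
metadata:
  annotations:
    # run the "LLM" container on only one hyperthread per physical CPU core
    hide-hyperthreads.resource-policy.nri.io/container.LLM: "true"
```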

Signed-off-by: Antti Kervinen <[email protected]>
askervin committed Jun 20, 2024
1 parent 9143f22 commit 1e6050d
Showing 6 changed files with 119 additions and 31 deletions.
18 changes: 18 additions & 0 deletions cmd/plugins/topology-aware/policy/pod-preferences.go
@@ -44,6 +44,8 @@ const (
keyReservedCPUsPreference = "prefer-reserved-cpus"
// annotation key for CPU Priority preference
keyCpuPriorityPreference = "prefer-cpu-priority"
// annotation key for hiding hyperthreads from allocated CPU sets
keyHideHyperthreads = "hide-hyperthreads"

// effective annotation key for isolated CPU preference
preferIsolatedCPUsKey = keyIsolationPreference + "." + kubernetes.ResmgrKeyNamespace
@@ -57,6 +59,8 @@ const (
preferReservedCPUsKey = keyReservedCPUsPreference + "." + kubernetes.ResmgrKeyNamespace
// effective annotation key for CPU priority preference
preferCpuPriorityKey = keyCpuPriorityPreference + "." + kubernetes.ResmgrKeyNamespace
// effective annotation key for hiding hyperthreads
hideHyperthreadsKey = keyHideHyperthreads + "." + kubernetes.ResmgrKeyNamespace
)

// cpuClass is a type of CPU to allocate
@@ -187,6 +191,20 @@ func cpuPrioPreference(pod cache.Pod, container cache.Container, fallback cpuPri
return prio
}

// hideHyperthreadsPreference returns true if the container should run
// using only a single hyperthread from each physical core.
func hideHyperthreadsPreference(pod cache.Pod, container cache.Container) bool {
value, ok := container.GetEffectiveAnnotation(hideHyperthreadsKey)
if !ok {
return false
}
hide, err := strconv.ParseBool(value)
if err != nil {
return false
}
return hide
}

// memoryTypePreference returns what type of memory should be allocated for the container.
//
// If the effective annotations are not found, this function falls back to
50 changes: 36 additions & 14 deletions cmd/plugins/topology-aware/policy/pools.go
@@ -15,6 +15,7 @@
package topologyaware

import (
"fmt"
"math"
"sort"

@@ -604,6 +605,25 @@ func (p *policy) allocatePool(container cache.Container, poolHint string) (Grant
return grant, nil
}

// setPreferredCpusetCpus pins the container's CPUs according to what has
// been allocated for it, taking into account whether the container should
// run with hyperthreads hidden.
func (p *policy) setPreferredCpusetCpus(container cache.Container, allocated cpuset.CPUSet, info string) {
allow := allocated
hidingInfo := ""
pod, ok := container.GetPod()
if ok && hideHyperthreadsPreference(pod, container) {
allow = p.sys.SingleThreadForCPUs(allocated)
if allow.Size() != allocated.Size() {
hidingInfo = fmt.Sprintf(" (hide %d hyperthreads, remaining cpuset: %s)", allocated.Size()-allow.Size(), allow)
} else {
hidingInfo = " (no hyperthreads to hide)"
}
}
log.Info("%s%s", info, hidingInfo)
container.SetCpusetCpus(allow.String())
}

// Apply the result of allocation to the requesting container.
func (p *policy) applyGrant(grant Grant) {
log.Info("* applying grant %s", grant)
@@ -615,25 +635,25 @@ func (p *policy) applyGrant(grant Grant) {
shared := grant.SharedCPUs()
cpuPortion := grant.SharedPortion()

cpus := ""
cpus := cpuset.New()
kind := ""
switch cpuType {
case cpuNormal:
if exclusive.IsEmpty() {
cpus = shared.String()
cpus = shared
kind = "shared"
} else {
kind = "exclusive"
if cpuPortion > 0 {
kind += "+shared"
cpus = exclusive.Union(shared).String()
cpus = exclusive.Union(shared)
} else {
cpus = exclusive.String()
cpus = exclusive
}
}
case cpuReserved:
kind = "reserved"
cpus = reserved.String()
cpus = reserved
cpuPortion = grant.ReservedPortion()
case cpuPreserve:
// Will skip CPU pinning, may still pin memory.
@@ -651,12 +671,14 @@ func (p *policy) applyGrant(grant Grant) {
if cpuType == cpuPreserve {
log.Info(" => preserving %s cpuset %s", container.PrettyName(), container.GetCpusetCpus())
} else {
if cpus != "" {
log.Info(" => pinning %s to (%s) cpuset %s", container.PrettyName(), kind, cpus)
if cpus.Size() > 0 {
p.setPreferredCpusetCpus(container, cpus,
fmt.Sprintf(" => pinning %s to (%s) cpuset %s",
container.PrettyName(), kind, cpus))
} else {
log.Info(" => not pinning %s CPUs, cpuset is empty...", container.PrettyName())
container.SetCpusetCpus("")
}
container.SetCpusetCpus(cpus)
}

// Notes:
@@ -757,13 +779,13 @@ func (p *policy) updateSharedAllocations(grant *Grant) {
shared := other.GetCPUNode().FreeSupply().SharableCPUs()
exclusive := other.ExclusiveCPUs()
if exclusive.IsEmpty() {
log.Info(" => updating %s with shared CPUs of %s: %s...",
other, other.GetCPUNode().Name(), shared.String())
other.GetContainer().SetCpusetCpus(shared.String())
p.setPreferredCpusetCpus(other.GetContainer(), shared,
fmt.Sprintf(" => updating %s with shared CPUs of %s: %s...",
other, other.GetCPUNode().Name(), shared.String()))
} else {
log.Info(" => updating %s with exclusive+shared CPUs of %s: %s+%s...",
other, other.GetCPUNode().Name(), exclusive.String(), shared.String())
other.GetContainer().SetCpusetCpus(exclusive.Union(shared).String())
p.setPreferredCpusetCpus(other.GetContainer(), exclusive.Union(shared),
fmt.Sprintf(" => updating %s with exclusive+shared CPUs of %s: %s+%s...",
other, other.GetCPUNode().Name(), exclusive.String(), shared.String()))
}
}
}
18 changes: 18 additions & 0 deletions docs/resource-policy/policy/topology-aware.md
@@ -271,6 +271,24 @@ metadata:
These Pod annotations have no effect on containers which are not eligible for
exclusive allocation.
### Selectively Disabling Hyperthreading
If a container opts to hide hyperthreads, it is allowed to use only
one hyperthread from every physical CPU core allocated to it. Note
that as a result the container may be allowed to run on only half of
the CPUs it has requested. For workloads that do not benefit from
hyperthreading, this nevertheless results in better performance than
running on all hyperthreads of the same CPU cores. If the container's
CPU allocation is exclusive, no other container can run on the hidden
hyperthreads either.
```yaml
metadata:
annotations:
# allow the "LLM" container to use only a single thread per physical CPU core
hide-hyperthreads.resource-policy.nri.io/container.LLM: "true"
```
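The preference can also be applied to all containers of a pod using the
pod-scoped annotation key, as exercised by the e2e tests in this commit:

```yaml
metadata:
  annotations:
    # run all containers of the pod on one hyperthread per physical CPU core
    hide-hyperthreads.resource-policy.nri.io/pod: "true"
```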
### Implicit Hardware Topology Hints
`NRI Resource Policy` automatically generates HW `Topology Hints` for devices
20 changes: 20 additions & 0 deletions pkg/sysfs/system.go
@@ -116,6 +116,7 @@ type System interface {
CoreKindCPUs(CoreKind) cpuset.CPUSet
CoreKinds() []CoreKind
AllThreadsForCPUs(cpuset.CPUSet) cpuset.CPUSet
SingleThreadForCPUs(cpuset.CPUSet) cpuset.CPUSet

Offlined() cpuset.CPUSet
Isolated() cpuset.CPUSet
@@ -685,6 +686,25 @@ func (sys *system) AllThreadsForCPUs(cpus cpuset.CPUSet) cpuset.CPUSet {
return all
}

// SingleThreadForCPUs returns a subset of the input cpus that includes
// only the smallest-id CPU of each physical core present in the set.
func (sys *system) SingleThreadForCPUs(cpus cpuset.CPUSet) cpuset.CPUSet {
result := make([]int, 0, cpus.Size())
handled := make(map[int]struct{}, cpus.Size())
for _, cpu := range cpus.List() {
if _, ok := handled[cpu]; ok {
continue
}
handled[cpu] = struct{}{}
result = append(result, cpu)
for _, sibling := range sys.CPU(cpu).ThreadCPUSet().UnsortedList() {
handled[sibling] = struct{}{}
}
}
return cpuset.New(result...)
}

// Offlined gets the set of offlined CPUs.
func (sys *system) Offlined() cpuset.CPUSet {
return sys.OfflineCPUs()
@@ -38,12 +38,13 @@ vm-command "kubectl delete pods --all --now"

# pod1: Test that 4 guaranteed containers not eligible for isolated CPU allocation
# get evenly spread over NUMA nodes.
CONTCOUNT=4 CPU=3 create guaranteed
CONTCOUNT=4 CPU=3 ANN0='hide-hyperthreads.resource-policy.nri.io/container.pod1c2: "true"' create guaranteed
report allowed
verify \
'len(cpus["pod1c0"]) == 3' \
'len(cpus["pod1c1"]) == 3' \
'len(cpus["pod1c2"]) == 3' \
'len(cpus["pod1c2"]) == 2' \
'len(cores["pod1c2"]) == 2' \
'len(cpus["pod1c3"]) == 3' \
'disjoint_sets(cpus["pod1c0"], cpus["pod1c1"], cpus["pod1c2"], cpus["pod1c3"])' \
'disjoint_sets(nodes["pod1c0"], nodes["pod1c1"], nodes["pod1c2"], nodes["pod1c3"])'
@@ -7,6 +7,7 @@ cleanup-test-pods() {
( vm-command "kubectl delete pods pod0 --now" ) || true
( vm-command "kubectl delete pods pod1 --now" ) || true
( vm-command "kubectl delete pods pod2 --now" ) || true
( vm-command "kubectl delete pods pod3 --now" ) || true
}
cleanup-test-pods

@@ -19,33 +20,41 @@ helm_config=$(instantiate helm-config.yaml) helm-launch topology-aware
ANNOTATIONS='prefer-reserved-cpus.resource-policy.nri.io/pod: "true"'
CONTCOUNT=1 create reserved-annotated
report allowed
verify 'cpus["pod0c0"] == {"cpu10", "cpu11"}'

ANNOTATIONS='prefer-reserved-cpus.resource-policy.nri.io/container.special: "false"'
CONTCOUNT=1 create reserved-annotated
report allowed

verify 'cpus["pod0c0"] == {"cpu10", "cpu11"}'
verify 'cpus["pod1c0"] == {"cpu08"}'

vm-command "kubectl delete pods pod0 --now"
ANNOTATIONS=(
'prefer-reserved-cpus.resource-policy.nri.io/pod: "true"'
'hide-hyperthreads.resource-policy.nri.io/pod: "true"'
)
CONTCOUNT=1 create reserved-annotated
report allowed
verify 'cpus["pod2c0"] == {"cpu10"}'

ANNOTATIONS=(
'cpu.preserve.resource-policy.nri.io: "true"'
'memory.preserve.resource-policy.nri.io/container.pod2c1: "true"'
'memory.preserve.resource-policy.nri.io/container.pod2c2: "true"'
'cpu.preserve.resource-policy.nri.io/container.pod2c2: "false"'
'cpu.preserve.resource-policy.nri.io/container.pod2c3: "false"'
'memory.preserve.resource-policy.nri.io/container.pod2c3: "false"'
'memory.preserve.resource-policy.nri.io/container.pod3c1: "true"'
'memory.preserve.resource-policy.nri.io/container.pod3c2: "true"'
'cpu.preserve.resource-policy.nri.io/container.pod3c2: "false"'
'cpu.preserve.resource-policy.nri.io/container.pod3c3: "false"'
'memory.preserve.resource-policy.nri.io/container.pod3c3: "false"'
)
CONTCOUNT=4 CPU=100m MEM=100M create reserved-annotated
report allowed

verify 'len(cpus["pod2c0"]) == 16' \
'len(mems["pod2c0"]) == 4' \
'len(cpus["pod2c1"]) == 16' \
'len(mems["pod2c1"]) == 4' \
'len(cpus["pod2c2"]) == 1' \
'len(mems["pod2c2"]) == 4' \
'len(cpus["pod2c3"]) == 1' \
'len(mems["pod2c3"]) == 1'
verify 'len(cpus["pod3c0"]) == 16' \
'len(mems["pod3c0"]) == 4' \
'len(cpus["pod3c1"]) == 16' \
'len(mems["pod3c1"]) == 4' \
'len(cpus["pod3c2"]) == 1' \
'len(mems["pod3c2"]) == 4' \
'len(cpus["pod3c3"]) == 1' \
'len(mems["pod3c3"]) == 1'

cleanup-test-pods

