Skip to content

Commit

Permalink
Add the ability to set affinities as part of a ComputeDomain
Browse files Browse the repository at this point in the history
Signed-off-by: Kevin Klues <[email protected]>
  • Loading branch information
klueska committed Jan 24, 2025
1 parent 6d6e037 commit 29ba85b
Show file tree
Hide file tree
Showing 7 changed files with 747 additions and 19 deletions.
37 changes: 34 additions & 3 deletions api/nvidia.com/resource/v1beta1/computedomain.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
package v1beta1

import (
corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

Expand Down Expand Up @@ -49,9 +50,39 @@ type ComputeDomainList struct {

// ComputeDomainSpec provides the spec for a ComputeDomain.
type ComputeDomainSpec struct {
NumNodes int `json:"numNodes"`
ResourceClaimName string `json:"resourceClaimName,omitempty"`
DeviceClassName string `json:"deviceClassName,omitempty"`
NumNodes int `json:"numNodes"`
ResourceClaimName string `json:"resourceClaimName,omitempty"`
DeviceClassName string `json:"deviceClassName,omitempty"`
NodeSelector map[string]string `json:"nodeSelector,omitempty"`
NodeAffinity *ComputeDomainNodeAffinity `json:"nodeAffinity,omitempty"`
TopologyAlignment *ComputeDomainTopologyAlignment `json:"topologyAlignment,omitempty"`
TopologyAntiAlignment *ComputeDomainTopologyAlignment `json:"topologyAntiAlignment,omitempty"`
}

// +kubebuilder:validation:XValidation:rule="has(self.preferred) || has(self.required)",message="At least one of 'preferred' or 'required' must be set."

// ComputeDomainNodeAffinity holds the node affinity terms that can be set on
// a ComputeDomain. The accompanying validation rule requires at least one of
// Preferred or Required to be set.
type ComputeDomainNodeAffinity struct {
// Preferred lists weighted node scheduling terms; it is used as the
// preferred-during-scheduling node affinity of the deployed pods.
// +listType=atomic
Preferred []corev1.PreferredSchedulingTerm `json:"preferred,omitempty"`
// Required is the node selector used as the required-during-scheduling
// node affinity of the deployed pods.
Required *corev1.NodeSelector `json:"required,omitempty"`
}

// +kubebuilder:validation:XValidation:rule="has(self.preferred) || has(self.required)",message="At least one of 'preferred' or 'required' must be set."

// ComputeDomainTopologyAlignment describes topology keys along which the pods
// of a ComputeDomain are aligned. The same type is used for both the
// TopologyAlignment and TopologyAntiAlignment fields of ComputeDomainSpec.
// The accompanying validation rule requires at least one of Preferred or
// Required to be set.
type ComputeDomainTopologyAlignment struct {
// Preferred lists weighted topology keys; each entry becomes one weighted
// (preferred) pod affinity term.
// +listType=atomic
Preferred []ComputeDomainWeightedTopologyKey `json:"preferred,omitempty"`
// Required holds the topology keys that each become one required pod
// affinity term.
Required *ComputeDomainTopologyKeys `json:"required,omitempty"`
}

// ComputeDomainTopologyKeys is a list of topology keys used for required
// topology alignment.
type ComputeDomainTopologyKeys struct {
// TopologyKeys lists the topology keys; each one is used as the
// TopologyKey of a separate pod affinity term.
// +listType=atomic
TopologyKeys []string `json:"topologyKeys"`
}

// ComputeDomainWeightedTopologyKey pairs a topology key with a weight for
// preferred topology alignment.
type ComputeDomainWeightedTopologyKey struct {
// Weight is passed through unchanged as the weight of the resulting
// weighted pod affinity term.
// NOTE(review): no range validation is declared here; Kubernetes presumably
// restricts such weights to 1-100 — confirm.
Weight int32 `json:"weight"`
// TopologyKey is the topology key of the resulting pod affinity term.
TopologyKey string `json:"topologyKey"`
}

// ComputeDomainStatus provides the status for a ComputeDomain.
Expand Down
112 changes: 111 additions & 1 deletion api/nvidia.com/resource/v1beta1/zz_generated.deepcopy.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

95 changes: 95 additions & 0 deletions cmd/nvidia-dra-imex-controller/deployment.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ import (
"text/template"

appsv1 "k8s.io/api/apps/v1"
corev1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
Expand Down Expand Up @@ -212,6 +213,8 @@ func (m *DeploymentManager) Create(ctx context.Context, namespace string, replic
return nil, fmt.Errorf("failed to convert unstructured data to typed object: %w", err)
}

m.applyAffinities(&deployment, cd)

d, err = m.config.clientsets.Core.AppsV1().Deployments(deployment.Namespace).Create(ctx, &deployment, metav1.CreateOptions{})
if err != nil {
return nil, fmt.Errorf("error creating Deployment: %w", err)
Expand Down Expand Up @@ -354,3 +357,95 @@ func (m *DeploymentManager) removeAllPodManagers() error {
m.Unlock()
return nil
}

// applyAffinities sets the node selector and the affinity rules on the pod
// template of d from the scheduling-related fields of cd.Spec.
//
// By default the pods get a preferred (weight 100) pod affinity towards pods
// labeled with this ComputeDomain's UID, using CliqueIDLabelKey as the
// topology key. Setting cd.Spec.TopologyAlignment replaces that default with
// the user-provided required and/or preferred terms.
// cd.Spec.TopologyAntiAlignment is translated the same way into pod
// anti-affinity, cd.Spec.NodeAffinity into node affinity, and
// cd.Spec.NodeSelector (when non-nil) becomes the pod template's node
// selector.
//
// d is modified in place; the pod template's Affinity is always overwritten.
func (m *DeploymentManager) applyAffinities(d *appsv1.Deployment, cd *nvapi.ComputeDomain) {
	// Every pod (anti-)affinity term selects the pods carrying the
	// computeDomainLabelKey label whose value is this ComputeDomain's UID.
	labelSelector := &metav1.LabelSelector{
		MatchExpressions: []metav1.LabelSelectorRequirement{
			{
				Key:      computeDomainLabelKey,
				Operator: metav1.LabelSelectorOpIn,
				Values:   []string{string(cd.UID)},
			},
		},
	}

	// podAffinityTerms converts an alignment into its required and preferred
	// pod affinity terms. A list is left nil when the corresponding field of
	// the alignment is unset. Deriving both lists from the same alignment in
	// one place guarantees that the nil checks and the conversions always
	// refer to the same object.
	podAffinityTerms := func(alignment *nvapi.ComputeDomainTopologyAlignment) ([]corev1.PodAffinityTerm, []corev1.WeightedPodAffinityTerm) {
		var required []corev1.PodAffinityTerm
		if alignment.Required != nil {
			required = make([]corev1.PodAffinityTerm, len(alignment.Required.TopologyKeys))
			for i, key := range alignment.Required.TopologyKeys {
				required[i] = corev1.PodAffinityTerm{
					LabelSelector: labelSelector,
					TopologyKey:   key,
				}
			}
		}

		var preferred []corev1.WeightedPodAffinityTerm
		if alignment.Preferred != nil {
			preferred = make([]corev1.WeightedPodAffinityTerm, len(alignment.Preferred))
			for i, term := range alignment.Preferred {
				preferred[i] = corev1.WeightedPodAffinityTerm{
					Weight: term.Weight,
					PodAffinityTerm: corev1.PodAffinityTerm{
						LabelSelector: labelSelector,
						TopologyKey:   term.TopologyKey,
					},
				}
			}
		}

		return required, preferred
	}

	// Default: prefer co-locating the ComputeDomain's pods within the same
	// CliqueIDLabelKey topology domain.
	affinity := &corev1.Affinity{
		PodAffinity: &corev1.PodAffinity{
			PreferredDuringSchedulingIgnoredDuringExecution: []corev1.WeightedPodAffinityTerm{
				{
					Weight: 100,
					PodAffinityTerm: corev1.PodAffinityTerm{
						LabelSelector: labelSelector,
						TopologyKey:   CliqueIDLabelKey,
					},
				},
			},
		},
	}

	if cd.Spec.NodeSelector != nil {
		d.Spec.Template.Spec.NodeSelector = cd.Spec.NodeSelector
	}

	if nodeAffinity := cd.Spec.NodeAffinity; nodeAffinity != nil {
		affinity.NodeAffinity = &corev1.NodeAffinity{
			RequiredDuringSchedulingIgnoredDuringExecution:  nodeAffinity.Required,
			PreferredDuringSchedulingIgnoredDuringExecution: nodeAffinity.Preferred,
		}
	}

	// A user-provided alignment replaces the default pod affinity entirely.
	if alignment := cd.Spec.TopologyAlignment; alignment != nil {
		required, preferred := podAffinityTerms(alignment)
		affinity.PodAffinity = &corev1.PodAffinity{
			RequiredDuringSchedulingIgnoredDuringExecution:  required,
			PreferredDuringSchedulingIgnoredDuringExecution: preferred,
		}
	}

	// The preferred anti-affinity terms are derived from TopologyAntiAlignment
	// itself; consulting TopologyAlignment here would dereference a nil
	// pointer whenever only the anti-alignment is configured.
	if antiAlignment := cd.Spec.TopologyAntiAlignment; antiAlignment != nil {
		required, preferred := podAffinityTerms(antiAlignment)
		affinity.PodAntiAffinity = &corev1.PodAntiAffinity{
			RequiredDuringSchedulingIgnoredDuringExecution:  required,
			PreferredDuringSchedulingIgnoredDuringExecution: preferred,
		}
	}

	d.Spec.Template.Spec.Affinity = affinity
}
Loading

0 comments on commit 29ba85b

Please sign in to comment.