feat: Drop the hostname requirement when handling PV topology #1018

Merged
133 changes: 126 additions & 7 deletions pkg/controllers/disruption/suite_test.go
@@ -157,13 +157,12 @@ var _ = AfterEach(func() {

var _ = Describe("Simulate Scheduling", func() {
var nodePool *v1beta1.NodePool
var nodeClaims []*v1beta1.NodeClaim
var nodes []*v1.Node
var numNodes int
BeforeEach(func() {
numNodes = 10
nodePool = test.NodePool()
nodeClaims, nodes = test.NodeClaimsAndNodes(numNodes, v1beta1.NodeClaim{
})
It("should allow pods on deleting nodes to reschedule to uninitialized nodes", func() {
numNodes := 10
nodeClaims, nodes := test.NodeClaimsAndNodes(numNodes, v1beta1.NodeClaim{
ObjectMeta: metav1.ObjectMeta{
Finalizers: []string{"karpenter.sh/test-finalizer"},
Labels: map[string]string{
@@ -189,8 +188,6 @@ var _ = Describe("Simulate Scheduling", func() {
// inform cluster state about nodes and nodeclaims
ExpectMakeNodesAndNodeClaimsInitializedAndStateUpdated(ctx, env.Client, nodeStateController, nodeClaimStateController, nodes, nodeClaims)

})
It("should allow pods on deleting nodes to reschedule to uninitialized nodes", func() {
pod := test.Pod(test.PodOptions{
ResourceRequirements: v1.ResourceRequirements{
Requests: v1.ResourceList{
@@ -234,6 +231,33 @@ var _ = Describe("Simulate Scheduling", func() {
Expect(results.PodErrors[pod]).To(BeNil())
})
It("should allow multiple replace operations to happen successively", func() {
numNodes := 10
nodeClaims, nodes := test.NodeClaimsAndNodes(numNodes, v1beta1.NodeClaim{
ObjectMeta: metav1.ObjectMeta{
Finalizers: []string{"karpenter.sh/test-finalizer"},
Labels: map[string]string{
v1beta1.NodePoolLabelKey: nodePool.Name,
v1.LabelInstanceTypeStable: mostExpensiveInstance.Name,
v1beta1.CapacityTypeLabelKey: mostExpensiveOffering.CapacityType,
v1.LabelTopologyZone: mostExpensiveOffering.Zone,
},
},
Status: v1beta1.NodeClaimStatus{
Allocatable: map[v1.ResourceName]resource.Quantity{
v1.ResourceCPU: resource.MustParse("3"),
v1.ResourcePods: resource.MustParse("100"),
},
},
})
ExpectApplied(ctx, env.Client, nodePool)

for i := 0; i < numNodes; i++ {
ExpectApplied(ctx, env.Client, nodeClaims[i], nodes[i])
}

// inform cluster state about nodes and nodeclaims
ExpectMakeNodesAndNodeClaimsInitializedAndStateUpdated(ctx, env.Client, nodeStateController, nodeClaimStateController, nodes, nodeClaims)

// Create a pod for each node
pods := test.Pods(10, test.PodOptions{
ResourceRequirements: v1.ResourceRequirements{
@@ -336,6 +360,101 @@ var _ = Describe("Simulate Scheduling", func() {
ncs = ExpectNodeClaims(ctx, env.Client)
Expect(len(ncs)).To(Equal(13))
})
It("can replace node with a local PV (ignoring hostname affinity)", func() {
nodeClaim, node := test.NodeClaimAndNode(v1beta1.NodeClaim{
ObjectMeta: metav1.ObjectMeta{
Labels: map[string]string{
v1beta1.NodePoolLabelKey: nodePool.Name,
v1.LabelInstanceTypeStable: mostExpensiveInstance.Name,
v1beta1.CapacityTypeLabelKey: mostExpensiveOffering.CapacityType,
v1.LabelTopologyZone: mostExpensiveOffering.Zone,
},
},
Status: v1beta1.NodeClaimStatus{
Allocatable: map[v1.ResourceName]resource.Quantity{
v1.ResourceCPU: resource.MustParse("32"),
v1.ResourcePods: resource.MustParse("100"),
},
},
})
labels := map[string]string{
"app": "test",
}
// create our StatefulSet so we can link a pod to it
ss := test.StatefulSet()
ExpectApplied(ctx, env.Client, ss)

// StorageClass that references "no-provisioner" and is used for local volume storage
storageClass := test.StorageClass(test.StorageClassOptions{
ObjectMeta: metav1.ObjectMeta{
Name: "local-path",
},
Provisioner: lo.ToPtr("kubernetes.io/no-provisioner"),
})
persistentVolume := test.PersistentVolume(test.PersistentVolumeOptions{UseLocal: true})
persistentVolume.Spec.NodeAffinity = &v1.VolumeNodeAffinity{
Required: &v1.NodeSelector{
NodeSelectorTerms: []v1.NodeSelectorTerm{
{
// This PV is only valid for use against this node
MatchExpressions: []v1.NodeSelectorRequirement{
{
Key: v1.LabelHostname,
Operator: v1.NodeSelectorOpIn,
Values: []string{node.Name},
},
},
},
},
},
}
persistentVolumeClaim := test.PersistentVolumeClaim(test.PersistentVolumeClaimOptions{VolumeName: persistentVolume.Name, StorageClassName: &storageClass.Name})
pod := test.Pod(test.PodOptions{
ObjectMeta: metav1.ObjectMeta{Labels: labels,
OwnerReferences: []metav1.OwnerReference{
{
APIVersion: "apps/v1",
Kind: "StatefulSet",
Name: ss.Name,
UID: ss.UID,
Controller: ptr.Bool(true),
BlockOwnerDeletion: ptr.Bool(true),
},
},
},
PersistentVolumeClaims: []string{persistentVolumeClaim.Name},
})
ExpectApplied(ctx, env.Client, ss, pod, nodeClaim, node, nodePool, storageClass, persistentVolume, persistentVolumeClaim)

// bind pods to node
ExpectManualBinding(ctx, env.Client, pod, node)

// inform cluster state about nodes and nodeclaims
ExpectMakeNodesAndNodeClaimsInitializedAndStateUpdated(ctx, env.Client, nodeStateController, nodeClaimStateController, []*v1.Node{node}, []*v1beta1.NodeClaim{nodeClaim})

// disruption won't delete the old node until the new node is ready
var wg sync.WaitGroup
ExpectTriggerVerifyAction(&wg)
ExpectMakeNewNodeClaimsReady(ctx, env.Client, &wg, cluster, cloudProvider, 1)
ExpectReconcileSucceeded(ctx, disruptionController, types.NamespacedName{})
wg.Wait()

// Process the item so that the nodes can be deleted.
ExpectReconcileSucceeded(ctx, queue, types.NamespacedName{})
// Cascade any deletion of the nodeClaim to the node
ExpectNodeClaimsCascadeDeletion(ctx, env.Client, nodeClaim)

// Expect that the new nodeClaim was created and that it's different from the original.
// We should succeed in getting a replacement, since we assume that the node affinity requirement will be invalid
// once we spin down the old node
ExpectNotFound(ctx, env.Client, nodeClaim, node)
nodeclaims := ExpectNodeClaims(ctx, env.Client)
nodes := ExpectNodes(ctx, env.Client)
Expect(nodeclaims).To(HaveLen(1))
Expect(nodes).To(HaveLen(1))
Expect(nodeclaims[0].Name).ToNot(Equal(nodeClaim.Name))
Expect(nodes[0].Name).ToNot(Equal(node.Name))
})
})

var _ = Describe("Disruption Taints", func() {
9 changes: 8 additions & 1 deletion pkg/controllers/provisioning/scheduling/volumetopology.go
@@ -134,7 +134,14 @@ func (v *VolumeTopology) getPersistentVolumeRequirements(ctx context.Context, po
var requirements []v1.NodeSelectorRequirement
if len(pv.Spec.NodeAffinity.Required.NodeSelectorTerms) > 0 {
// Terms are ORed, only use the first term
requirements = append(requirements, pv.Spec.NodeAffinity.Required.NodeSelectorTerms[0].MatchExpressions...)
requirements = pv.Spec.NodeAffinity.Required.NodeSelectorTerms[0].MatchExpressions
// If we are using a Local volume or a HostPath volume, then we should ignore the Hostname affinity
// on it because re-scheduling this pod to a new node means not using the same Hostname affinity that we currently have
if pv.Spec.Local != nil || pv.Spec.HostPath != nil {
requirements = lo.Reject(requirements, func(n v1.NodeSelectorRequirement, _ int) bool {
return n.Key == v1.LabelHostname
})
}
}
return requirements, nil
}
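
For readers skimming the hunk above, the following is a minimal, self-contained sketch of the behaviour it introduces. It is not part of the PR: the package layout, the hostnameAgnosticRequirements name, and the example hostname and zone values are illustrative only.

package main

import (
    "fmt"

    "github.com/samber/lo"
    v1 "k8s.io/api/core/v1"
)

// hostnameAgnosticRequirements mirrors the intent of getPersistentVolumeRequirements:
// take the first (ORed) node selector term of the PV and, when the volume is node-local
// (Local or HostPath), drop the kubernetes.io/hostname requirement so that a pod using
// the volume can still be simulated onto a replacement node.
func hostnameAgnosticRequirements(pv *v1.PersistentVolume) []v1.NodeSelectorRequirement {
    if pv.Spec.NodeAffinity == nil || pv.Spec.NodeAffinity.Required == nil ||
        len(pv.Spec.NodeAffinity.Required.NodeSelectorTerms) == 0 {
        return nil
    }
    requirements := pv.Spec.NodeAffinity.Required.NodeSelectorTerms[0].MatchExpressions
    if pv.Spec.Local != nil || pv.Spec.HostPath != nil {
        requirements = lo.Reject(requirements, func(n v1.NodeSelectorRequirement, _ int) bool {
            return n.Key == v1.LabelHostname
        })
    }
    return requirements
}

func main() {
    pv := &v1.PersistentVolume{
        Spec: v1.PersistentVolumeSpec{
            PersistentVolumeSource: v1.PersistentVolumeSource{
                Local: &v1.LocalVolumeSource{Path: "/mnt/local-disks"},
            },
            NodeAffinity: &v1.VolumeNodeAffinity{
                Required: &v1.NodeSelector{
                    NodeSelectorTerms: []v1.NodeSelectorTerm{{
                        MatchExpressions: []v1.NodeSelectorRequirement{
                            {Key: v1.LabelHostname, Operator: v1.NodeSelectorOpIn, Values: []string{"ip-10-0-0-1"}},
                            {Key: v1.LabelTopologyZone, Operator: v1.NodeSelectorOpIn, Values: []string{"test-zone-1"}},
                        },
                    }},
                },
            },
        },
    }
    // Only the zone requirement survives; the hostname pin on the old node is ignored.
    fmt.Println(hostnameAgnosticRequirements(pv))
}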
111 changes: 111 additions & 0 deletions pkg/controllers/provisioning/suite_test.go
@@ -1540,6 +1540,117 @@ var _ = Describe("Provisioning", func() {
node := ExpectScheduled(ctx, env.Client, pod)
Expect(node.Labels).To(HaveKeyWithValue(v1.LabelTopologyZone, "test-zone-3"))
})
DescribeTable("should ignore hostname affinity scheduling when using local path volumes",
func(volumeOptions test.PersistentVolumeOptions) {
// StorageClass that references "no-provisioner" and is used for local volume storage
storageClass = test.StorageClass(test.StorageClassOptions{
ObjectMeta: metav1.ObjectMeta{
Name: "local-path",
},
Provisioner: lo.ToPtr("kubernetes.io/no-provisioner"),
})
// Create a PersistentVolume that is using a random node name for its affinity
persistentVolume := test.PersistentVolume(volumeOptions)
persistentVolume.Spec.NodeAffinity = &v1.VolumeNodeAffinity{
Required: &v1.NodeSelector{
NodeSelectorTerms: []v1.NodeSelectorTerm{
{
MatchExpressions: []v1.NodeSelectorRequirement{
{
Key: v1.LabelHostname,
Operator: v1.NodeSelectorOpIn,
Values: []string{test.RandomName()},
},
},
},
},
},
}
persistentVolumeClaim := test.PersistentVolumeClaim(test.PersistentVolumeClaimOptions{VolumeName: persistentVolume.Name, StorageClassName: &storageClass.Name})
ExpectApplied(ctx, env.Client, test.NodePool(), storageClass, persistentVolumeClaim, persistentVolume)
pod := test.UnschedulablePod(test.PodOptions{
PersistentVolumeClaims: []string{persistentVolumeClaim.Name},
})
ExpectProvisioned(ctx, env.Client, cluster, cloudProvider, prov, pod)
// Expect that we are still able to schedule this pod to a node, even though we had a hostname affinity on it
ExpectScheduled(ctx, env.Client, pod)
},
Entry("when using local volumes", test.PersistentVolumeOptions{UseLocal: true}),
Entry("when using hostpath volumes", test.PersistentVolumeOptions{UseHostPath: true}),
)
DescribeTable("should ignore hostname affinity scheduling when using local path volumes (ephemeral volume)",
func(volumeOptions test.PersistentVolumeOptions) {
// StorageClass that references "no-provisioner" and is used for local volume storage
storageClass = test.StorageClass(test.StorageClassOptions{
ObjectMeta: metav1.ObjectMeta{
Name: "local-path",
},
Provisioner: lo.ToPtr("kubernetes.io/no-provisioner"),
})
pod := test.UnschedulablePod(test.PodOptions{
EphemeralVolumeTemplates: []test.EphemeralVolumeTemplateOptions{
{
StorageClassName: &storageClass.Name,
},
},
})
persistentVolume := test.PersistentVolume(volumeOptions)
persistentVolume.Spec.NodeAffinity = &v1.VolumeNodeAffinity{
Required: &v1.NodeSelector{
NodeSelectorTerms: []v1.NodeSelectorTerm{
{
MatchExpressions: []v1.NodeSelectorRequirement{
{
Key: v1.LabelHostname,
Operator: v1.NodeSelectorOpIn,
Values: []string{test.RandomName()},
},
},
},
},
},
}
persistentVolumeClaim := test.PersistentVolumeClaim(test.PersistentVolumeClaimOptions{
ObjectMeta: metav1.ObjectMeta{
Name: fmt.Sprintf("%s-%s", pod.Name, pod.Spec.Volumes[0].Name),
},
VolumeName: persistentVolume.Name,
StorageClassName: &storageClass.Name,
})
ExpectApplied(ctx, env.Client, test.NodePool(), storageClass, pod, persistentVolumeClaim, persistentVolume)
ExpectProvisioned(ctx, env.Client, cluster, cloudProvider, prov, pod)
ExpectScheduled(ctx, env.Client, pod)
},
Entry("when using local volumes", test.PersistentVolumeOptions{UseLocal: true}),
Entry("when using hostpath volumes", test.PersistentVolumeOptions{UseHostPath: true}),
)
It("should not ignore hostname affinity when using non-local path volumes", func() {
// This PersistentVolume is going to use a standard CSI volume for provisioning
persistentVolume := test.PersistentVolume()
persistentVolume.Spec.NodeAffinity = &v1.VolumeNodeAffinity{
Required: &v1.NodeSelector{
NodeSelectorTerms: []v1.NodeSelectorTerm{
{
MatchExpressions: []v1.NodeSelectorRequirement{
{
Key: v1.LabelHostname,
Operator: v1.NodeSelectorOpIn,
Values: []string{test.RandomName()},
},
},
},
},
},
}
persistentVolumeClaim := test.PersistentVolumeClaim(test.PersistentVolumeClaimOptions{VolumeName: persistentVolume.Name, StorageClassName: &storageClass.Name})
ExpectApplied(ctx, env.Client, test.NodePool(), storageClass, persistentVolumeClaim, persistentVolume)
pod := test.UnschedulablePod(test.PodOptions{
PersistentVolumeClaims: []string{persistentVolumeClaim.Name},
})
ExpectProvisioned(ctx, env.Client, cluster, cloudProvider, prov, pod)
// Expect that this pod can't schedule because we have a hostname affinity, and we don't currently have a node that we can schedule to
ExpectNotScheduled(ctx, env.Client, pod)
})
It("should not schedule if volume zones are incompatible", func() {
persistentVolume := test.PersistentVolume(test.PersistentVolumeOptions{Zones: []string{"test-zone-3"}})
persistentVolumeClaim := test.PersistentVolumeClaim(test.PersistentVolumeClaimOptions{VolumeName: persistentVolume.Name, StorageClassName: &storageClass.Name})
14 changes: 14 additions & 0 deletions pkg/test/storage.go
@@ -34,6 +34,8 @@ type PersistentVolumeOptions struct {
StorageClassName string
Driver string
UseAWSInTreeDriver bool
UseLocal bool
UseHostPath bool
}

func PersistentVolume(overrides ...PersistentVolumeOptions) *v1.PersistentVolume {
@@ -46,6 +48,18 @@ func PersistentVolume(overrides ...PersistentVolumeOptions) *v1.PersistentVolume
// Determine the PersistentVolumeSource based on the options
var source v1.PersistentVolumeSource
switch {
case options.UseLocal:
source = v1.PersistentVolumeSource{
Local: &v1.LocalVolumeSource{
Path: "/mnt/local-disks",
},
}
case options.UseHostPath:
source = v1.PersistentVolumeSource{
HostPath: &v1.HostPathVolumeSource{
Path: "/mnt/local-disks",
},
}
case options.UseAWSInTreeDriver:
source = v1.PersistentVolumeSource{
AWSElasticBlockStore: &v1.AWSElasticBlockStoreVolumeSource{
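
As a rough standalone illustration of the two branches added to the test helper above (not PR code; the persistentVolumeOptions struct and chooseSource function are cut-down stand-ins for the real PersistentVolumeOptions and PersistentVolume helpers):

package main

import (
    "fmt"

    v1 "k8s.io/api/core/v1"
)

// persistentVolumeOptions is a cut-down stand-in for test.PersistentVolumeOptions.
type persistentVolumeOptions struct {
    UseLocal    bool
    UseHostPath bool
}

// chooseSource isolates the new switch branches: UseLocal yields a Local volume
// source and UseHostPath a HostPath volume source, both rooted at /mnt/local-disks.
// The real helper falls through to the existing CSI and in-tree sources otherwise.
func chooseSource(options persistentVolumeOptions) v1.PersistentVolumeSource {
    switch {
    case options.UseLocal:
        return v1.PersistentVolumeSource{
            Local: &v1.LocalVolumeSource{Path: "/mnt/local-disks"},
        }
    case options.UseHostPath:
        return v1.PersistentVolumeSource{
            HostPath: &v1.HostPathVolumeSource{Path: "/mnt/local-disks"},
        }
    default:
        return v1.PersistentVolumeSource{}
    }
}

func main() {
    // Volumes built from either option are node-local, which is exactly what the
    // scheduler-side change above keys on when ignoring hostname affinity.
    fmt.Printf("%+v\n", chooseSource(persistentVolumeOptions{UseLocal: true}))
    fmt.Printf("%+v\n", chooseSource(persistentVolumeOptions{UseHostPath: true}))
}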