From 43cffafa3bf656dc5366369a498e628bc1be50bb Mon Sep 17 00:00:00 2001 From: Rishabh Patel Date: Thu, 19 Dec 2024 14:59:24 +0530 Subject: [PATCH 01/27] [WIP] fix CA marking machines for deletion --- .../cloudprovider/mcm/mcm_manager.go | 53 +++++++++++-------- 1 file changed, 31 insertions(+), 22 deletions(-) diff --git a/cluster-autoscaler/cloudprovider/mcm/mcm_manager.go b/cluster-autoscaler/cloudprovider/mcm/mcm_manager.go index 399862c9e383..2b217f8914a1 100644 --- a/cluster-autoscaler/cloudprovider/mcm/mcm_manager.go +++ b/cluster-autoscaler/cloudprovider/mcm/mcm_manager.go @@ -27,6 +27,7 @@ import ( "errors" "flag" "fmt" + "k8s.io/apimachinery/pkg/util/sets" v1appslister "k8s.io/client-go/listers/apps/v1" "k8s.io/utils/pointer" "maps" @@ -98,6 +99,9 @@ const ( machineDeploymentPausedReason = "DeploymentPaused" // machineDeploymentNameLabel key for Machine Deployment name in machine labels machineDeploymentNameLabel = "name" + // machinesMarkedByCAForDeletion is the annotation set by CA on machine deployment. Its value denotes the machines that + // CA marked for deletion by updating the priority annotation to 1 and scaling down the machine deployment. + machinesMarkedByCAForDeletion = "cluster-autoscaler.kubernetes.io/machines-marked-by-ca-for-deletion" ) var ( @@ -424,7 +428,7 @@ func (m *McmManager) Refresh() error { klog.Infof("[Refresh] machine deployment %s is under rolling update, skipping", machineDeployment.Name) continue } - replicas := machineDeployment.Spec.Replicas + markedMachines := sets.New(strings.Split(machineDeployment.Annotations[machinesMarkedByCAForDeletion], ",")...) // check if number of annotated machine objects is more than desired and correspondingly reset the priority annotation value if needed. machines, err := m.getMachinesForMachineDeployment(machineDeployment.Name) if err != nil { @@ -432,27 +436,17 @@ func (m *McmManager) Refresh() error { collectiveError = errors.Join(collectiveError, err) continue } - var machinesMarkedForDeletion []*v1alpha1.Machine + var incorrectlyMarkedMachines []*Ref for _, machine := range machines { // no need to reset priority for machines already in termination or failed phase if machine.Status.CurrentStatus.Phase == v1alpha1.MachineTerminating || machine.Status.CurrentStatus.Phase == v1alpha1.MachineFailed { continue } - if annotValue, ok := machine.Annotations[machinePriorityAnnotation]; ok && annotValue == priorityValueForCandidateMachines { - machinesMarkedForDeletion = append(machinesMarkedForDeletion, machine) + if annotValue, ok := machine.Annotations[machinePriorityAnnotation]; ok && annotValue == priorityValueForCandidateMachines && !markedMachines.Has(machine.Name) { + incorrectlyMarkedMachines = append(incorrectlyMarkedMachines, &Ref{Name: machine.Name, Namespace: machine.Namespace}) } } - if int(replicas) > len(machines)-len(machinesMarkedForDeletion) { - slices.SortStableFunc(machinesMarkedForDeletion, func(m1, m2 *v1alpha1.Machine) int { - return -m1.CreationTimestamp.Compare(m2.CreationTimestamp.Time) - }) - diff := int(replicas) - len(machines) + len(machinesMarkedForDeletion) - targetRefs := make([]*Ref, 0, diff) - for i := 0; i < min(diff, len(machinesMarkedForDeletion)); i++ { - targetRefs = append(targetRefs, &Ref{Name: machinesMarkedForDeletion[i].Name, Namespace: machinesMarkedForDeletion[i].Namespace}) - } - collectiveError = errors.Join(collectiveError, m.resetPriorityForMachines(targetRefs)) - } + collectiveError = errors.Join(collectiveError, m.resetPriorityForMachines(incorrectlyMarkedMachines)) } 
return collectiveError } @@ -508,18 +502,29 @@ func (m *McmManager) DeleteMachines(targetMachineRefs []*Ref) error { if !isRollingUpdateFinished(md) { return fmt.Errorf("MachineDeployment %s is under rolling update , cannot reduce replica count", commonMachineDeployment.Name) } + markedMachines := sets.New(strings.Split(md.Annotations[machinesMarkedByCAForDeletion], ",")...) + var filteredTargetMachineRefs []*Ref + for _, targetMachineRef := range targetMachineRefs { + if !markedMachines.Has(targetMachineRef.Name) { + filteredTargetMachineRefs = append(filteredTargetMachineRefs, targetMachineRef) + markedMachines.Insert(targetMachineRef.Name) + } else { + klog.Infof("Machine %s is already marked for deletion, skipping", targetMachineRef.Name) + } + } + // update priorities of machines to be deleted except the ones already in termination to 1 - scaleDownAmount, err := m.prioritizeMachinesForDeletion(targetMachineRefs) + err = m.prioritizeMachinesForDeletion(filteredTargetMachineRefs) if err != nil { return err } // Trying to update the machineDeployment till the deadline err = m.retry(func(ctx context.Context) (bool, error) { - return m.scaleDownMachineDeployment(ctx, commonMachineDeployment.Name, scaleDownAmount) + return m.scaleDownMachineDeployment(ctx, commonMachineDeployment.Name, len(filteredTargetMachineRefs), strings.Join(markedMachines.UnsortedList(), ",")) }, "MachineDeployment", "update", commonMachineDeployment.Name) if err != nil { klog.Errorf("unable to scale in machine deployment %s, will reset priority of target machines, Error: %v", commonMachineDeployment.Name, err) - return errors.Join(err, m.resetPriorityForMachines(targetMachineRefs)) + return errors.Join(err, m.resetPriorityForMachines(filteredTargetMachineRefs)) } return nil } @@ -552,7 +557,7 @@ func (m *McmManager) resetPriorityForMachines(mcRefs []*Ref) error { } // prioritizeMachinesForDeletion prioritizes the targeted machines by updating their priority annotation to 1 -func (m *McmManager) prioritizeMachinesForDeletion(targetMachineRefs []*Ref) (int, error) { +func (m *McmManager) prioritizeMachinesForDeletion(targetMachineRefs []*Ref) error { var expectedToTerminateMachineNodePairs = make(map[string]string) for _, machineRef := range targetMachineRefs { // Trying to update the priority of machineRef till m.maxRetryTimeout @@ -573,11 +578,11 @@ func (m *McmManager) prioritizeMachinesForDeletion(targetMachineRefs []*Ref) (in return m.updateAnnotationOnMachine(ctx, mc.Name, machinePriorityAnnotation, priorityValueForCandidateMachines) }, "Machine", "update", machineRef.Name); err != nil { klog.Errorf("could not prioritize machine %s for deletion, aborting scale in of machine deployment, Error: %v", machineRef.Name, err) - return 0, fmt.Errorf("could not prioritize machine %s for deletion, aborting scale in of machine deployment, Error: %v", machineRef.Name, err) + return fmt.Errorf("could not prioritize machine %s for deletion, aborting scale in of machine deployment, Error: %v", machineRef.Name, err) } } klog.V(2).Infof("Expected to remove following {machineRef: corresponding node} pairs %s", expectedToTerminateMachineNodePairs) - return len(expectedToTerminateMachineNodePairs), nil + return nil } // updateAnnotationOnMachine returns error only when updating the annotations on machine has been failing consequently and deadline is crossed @@ -610,7 +615,7 @@ func (m *McmManager) updateAnnotationOnMachine(ctx context.Context, mcName strin } // scaleDownMachineDeployment scales down the machine deployment by the 
provided scaleDownAmount and returns the updated spec.Replicas after scale down. -func (m *McmManager) scaleDownMachineDeployment(ctx context.Context, mdName string, scaleDownAmount int) (bool, error) { +func (m *McmManager) scaleDownMachineDeployment(ctx context.Context, mdName string, scaleDownAmount int, markedMachines string) (bool, error) { md, err := m.machineDeploymentLister.MachineDeployments(m.namespace).Get(mdName) if err != nil { klog.Errorf("Unable to fetch MachineDeployment object %s, Error: %v", mdName, err) @@ -626,6 +631,10 @@ func (m *McmManager) scaleDownMachineDeployment(ctx context.Context, mdName stri return false, fmt.Errorf("cannot delete machines in machine deployment %s, expected decrease in replicas %d is more than current replicas %d", mdName, scaleDownAmount, mdclone.Spec.Replicas) } mdclone.Spec.Replicas = expectedReplicas + if mdclone.Annotations == nil { + mdclone.Annotations = make(map[string]string) + } + mdclone.Annotations[machinesMarkedByCAForDeletion] = markedMachines _, err = m.machineClient.MachineDeployments(mdclone.Namespace).Update(ctx, mdclone, metav1.UpdateOptions{}) if err != nil { return true, fmt.Errorf("unable to scale in machine deployment %s, Error: %w", mdName, err) From 0d939b02ffd303b8b57e06af6c86e5944395dcab Mon Sep 17 00:00:00 2001 From: Rishabh Patel Date: Thu, 19 Dec 2024 16:00:40 +0530 Subject: [PATCH 02/27] [WIP] add mutex for machine deployment --- .../cloudprovider/mcm/mcm_cloud_provider.go | 55 +++++---------- .../cloudprovider/mcm/mcm_manager.go | 70 +++++++++++-------- 2 files changed, 56 insertions(+), 69 deletions(-) diff --git a/cluster-autoscaler/cloudprovider/mcm/mcm_cloud_provider.go b/cluster-autoscaler/cloudprovider/mcm/mcm_cloud_provider.go index e6ff41de6279..41ac61010622 100644 --- a/cluster-autoscaler/cloudprovider/mcm/mcm_cloud_provider.go +++ b/cluster-autoscaler/cloudprovider/mcm/mcm_cloud_provider.go @@ -26,6 +26,7 @@ import ( "fmt" "strconv" "strings" + "sync" "time" apiv1 "k8s.io/api/core/v1" @@ -67,15 +68,14 @@ const ( // MCMCloudProvider implements the cloud provider interface for machine-controller-manager // Reference: https://github.com/gardener/machine-controller-manager type mcmCloudProvider struct { - mcmManager *McmManager - machinedeployments map[types.NamespacedName]*MachineDeployment - resourceLimiter *cloudprovider.ResourceLimiter + mcmManager *McmManager + resourceLimiter *cloudprovider.ResourceLimiter } // BuildMcmCloudProvider builds CloudProvider implementation for machine-controller-manager. 
func BuildMcmCloudProvider(mcmManager *McmManager, resourceLimiter *cloudprovider.ResourceLimiter) (cloudprovider.CloudProvider, error) { if mcmManager.discoveryOpts.StaticDiscoverySpecified() { - return buildStaticallyDiscoveringProvider(mcmManager, mcmManager.discoveryOpts.NodeGroupSpecs, resourceLimiter) + return buildStaticallyDiscoveringProvider(mcmManager, resourceLimiter) } return nil, fmt.Errorf("Failed to build an mcm cloud provider: Either node group specs or node group auto discovery spec must be specified") } @@ -96,16 +96,10 @@ func BuildMCM(opts config.AutoscalingOptions, do cloudprovider.NodeGroupDiscover return provider } -func buildStaticallyDiscoveringProvider(mcmManager *McmManager, specs []string, resourceLimiter *cloudprovider.ResourceLimiter) (*mcmCloudProvider, error) { +func buildStaticallyDiscoveringProvider(mcmManager *McmManager, resourceLimiter *cloudprovider.ResourceLimiter) (*mcmCloudProvider, error) { mcm := &mcmCloudProvider{ - mcmManager: mcmManager, - machinedeployments: make(map[types.NamespacedName]*MachineDeployment), - resourceLimiter: resourceLimiter, - } - for _, spec := range specs { - if err := mcm.addNodeGroup(spec); err != nil { - return nil, err - } + mcmManager: mcmManager, + resourceLimiter: resourceLimiter, } return mcm, nil } @@ -116,31 +110,14 @@ func (mcm *mcmCloudProvider) Cleanup() error { return nil } -// addNodeGroup adds node group defined in string spec. Format: -// minNodes:maxNodes:namespace.machineDeploymentName -func (mcm *mcmCloudProvider) addNodeGroup(spec string) error { - machinedeployment, err := buildMachineDeploymentFromSpec(spec, mcm.mcmManager) - if err != nil { - return err - } - mcm.addMachineDeployment(machinedeployment) - return nil -} - -func (mcm *mcmCloudProvider) addMachineDeployment(machinedeployment *MachineDeployment) { - key := types.NamespacedName{Namespace: machinedeployment.Namespace, Name: machinedeployment.Name} - mcm.machinedeployments[key] = machinedeployment - return -} - func (mcm *mcmCloudProvider) Name() string { return "machine-controller-manager" } // NodeGroups returns all node groups configured for this cloud provider. func (mcm *mcmCloudProvider) NodeGroups() []cloudprovider.NodeGroup { - result := make([]cloudprovider.NodeGroup, 0, len(mcm.machinedeployments)) - for _, machinedeployment := range mcm.machinedeployments { + result := make([]cloudprovider.NodeGroup, 0, len(mcm.mcmManager.machineDeployments)) + for _, machinedeployment := range mcm.mcmManager.machineDeployments { if machinedeployment.maxSize == 0 { continue } @@ -172,7 +149,7 @@ func (mcm *mcmCloudProvider) NodeGroupForNode(node *apiv1.Node) (cloudprovider.N } key := types.NamespacedName{Namespace: md.Namespace, Name: md.Name} - _, isManaged := mcm.machinedeployments[key] + _, isManaged := mcm.mcmManager.machineDeployments[key] if !isManaged { klog.V(4).Infof("Skipped node %v, it's not managed by this controller", node.Spec.ProviderID) return nil, nil @@ -293,8 +270,9 @@ type MachineDeployment struct { mcmManager *McmManager - minSize int - maxSize int + scalingMutex sync.Mutex + minSize int + maxSize int } // MaxSize returns maximum size of the node group. 
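The struct change above gives each MachineDeployment node group its own scalingMutex, so the scale-up path (IncreaseSize), the scale-down paths (DecreaseTargetSize and DeleteMachines via the manager) and the later per-deployment Refresh logic serialize their read-modify-write of the replica count per node group. A minimal, self-contained sketch of the race the mutex prevents — nodeGroup and setReplicas are illustrative stand-ins for this patch's MachineDeployment and its sizing methods, not names from the series:

    package main

    import (
        "fmt"
        "sync"
    )

    // nodeGroup stands in for the patch's MachineDeployment; replicas stands
    // in for the MachineDeployment's Spec.Replicas.
    type nodeGroup struct {
        scalingMutex sync.Mutex
        replicas     int
    }

    // setReplicas performs the read-modify-write pattern used by the sizing
    // methods in this patch. Without the mutex, a concurrent caller could
    // interleave between the read and the write and one update would be lost.
    func (ng *nodeGroup) setReplicas(delta int) {
        ng.scalingMutex.Lock()
        defer ng.scalingMutex.Unlock()
        current := ng.replicas   // read
        ng.replicas = current + delta // modify-write under the lock
    }

    func main() {
        ng := &nodeGroup{replicas: 3}
        var wg sync.WaitGroup
        for _, d := range []int{2, -1} { // concurrent scale-up and scale-down
            wg.Add(1)
            go func(delta int) {
                defer wg.Done()
                ng.setReplicas(delta)
            }(d)
        }
        wg.Wait()
        fmt.Println(ng.replicas) // always 4 (3+2-1): updates are serialized per node group
    }

With the lock held across read and write the result is deterministic; without it, both goroutines could read 3 and the final count could be 5 or 2 depending on which update is lost.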
@@ -541,9 +519,10 @@ func buildMachineDeploymentFromSpec(value string, mcmManager *McmManager) (*Mach func buildMachineDeployment(mcmManager *McmManager, minSize int, maxSize int, namespace string, name string) *MachineDeployment { return &MachineDeployment{ - mcmManager: mcmManager, - minSize: minSize, - maxSize: maxSize, + mcmManager: mcmManager, + minSize: minSize, + maxSize: maxSize, + scalingMutex: sync.Mutex{}, Ref: Ref{ Name: name, Namespace: namespace, diff --git a/cluster-autoscaler/cloudprovider/mcm/mcm_manager.go b/cluster-autoscaler/cloudprovider/mcm/mcm_manager.go index 2b217f8914a1..c9fcd1c1d30d 100644 --- a/cluster-autoscaler/cloudprovider/mcm/mcm_manager.go +++ b/cluster-autoscaler/cloudprovider/mcm/mcm_manager.go @@ -27,6 +27,7 @@ import ( "errors" "flag" "fmt" + "k8s.io/apimachinery/pkg/types" "k8s.io/apimachinery/pkg/util/sets" v1appslister "k8s.io/client-go/listers/apps/v1" "k8s.io/utils/pointer" @@ -57,7 +58,6 @@ import ( utilruntime "k8s.io/apimachinery/pkg/util/runtime" "k8s.io/apimachinery/pkg/util/wait" "k8s.io/autoscaler/cluster-autoscaler/cloudprovider" - "k8s.io/autoscaler/cluster-autoscaler/config/dynamic" "k8s.io/autoscaler/cluster-autoscaler/processors/nodegroupset" "k8s.io/autoscaler/cluster-autoscaler/utils/gpu" "k8s.io/client-go/discovery" @@ -128,6 +128,7 @@ type McmManager struct { namespace string interrupt chan struct{} discoveryOpts cloudprovider.NodeGroupDiscoveryOptions + machineDeployments map[types.NamespacedName]*MachineDeployment deploymentLister v1appslister.DeploymentLister machineClient machineapi.MachineV1alpha1Interface machineDeploymentLister machinelisters.MachineDeploymentLister @@ -264,7 +265,11 @@ func createMCMManagerInternal(discoveryOpts cloudprovider.NodeGroupDiscoveryOpti maxRetryTimeout: maxRetryTimeout, retryInterval: retryInterval, } - + for _, spec := range discoveryOpts.NodeGroupSpecs { + if err := m.addNodeGroup(spec); err != nil { + return nil, err + } + } targetCoreInformerFactory.Start(m.interrupt) controlMachineInformerFactory.Start(m.interrupt) appsInformerFactory.Start(m.interrupt) @@ -287,6 +292,23 @@ func createMCMManagerInternal(discoveryOpts cloudprovider.NodeGroupDiscoveryOpti return nil, fmt.Errorf("Unable to start cloud provider MCM for cluster autoscaler: API GroupVersion %q or %q or %q is not available; \nFound: %#v", machineGVR, machineSetGVR, machineDeploymentGVR, availableResources) } +// addNodeGroup adds node group defined in string spec. Format: +// minNodes:maxNodes:namespace.machineDeploymentName +func (m *McmManager) addNodeGroup(spec string) error { + machineDeployment, err := buildMachineDeploymentFromSpec(spec, m) + if err != nil { + return err + } + m.addMachineDeployment(machineDeployment) + return nil +} + +func (m *McmManager) addMachineDeployment(machineDeployment *MachineDeployment) { + key := types.NamespacedName{Namespace: machineDeployment.Namespace, Name: machineDeployment.Name} + m.machineDeployments[key] = machineDeployment + return +} + // TODO: In general, any controller checking this needs to be dynamic so // users don't have to restart their controller manager if they change the apiserver. // Until we get there, the structure here needs to be exposed for the construction of a proper ControllerContext. 
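The addNodeGroup helper moved onto McmManager above documents the static node-group spec format as minNodes:maxNodes:namespace.machineDeploymentName; the actual parsing is delegated to buildMachineDeploymentFromSpec and dynamic.SpecFromString. As a rough illustration of that format only — parseNodeGroupSpec and the example namespace/name are hypothetical, and the real implementation differs — a hand-rolled equivalent could look like:

    package main

    import (
        "fmt"
        "strconv"
        "strings"
    )

    // parseNodeGroupSpec parses "minNodes:maxNodes:namespace.machineDeploymentName",
    // e.g. "1:5:shoot--prod--a.worker-pool-1" (example values, not from the patch).
    func parseNodeGroupSpec(spec string) (min, max int, namespace, name string, err error) {
        parts := strings.SplitN(spec, ":", 3)
        if len(parts) != 3 {
            return 0, 0, "", "", fmt.Errorf("invalid spec %q: want min:max:namespace.name", spec)
        }
        if min, err = strconv.Atoi(parts[0]); err != nil {
            return 0, 0, "", "", fmt.Errorf("invalid min %q: %w", parts[0], err)
        }
        if max, err = strconv.Atoi(parts[1]); err != nil {
            return 0, 0, "", "", fmt.Errorf("invalid max %q: %w", parts[1], err)
        }
        qualified := strings.SplitN(parts[2], ".", 2)
        if len(qualified) != 2 {
            return 0, 0, "", "", fmt.Errorf("invalid name %q: want namespace.machineDeploymentName", parts[2])
        }
        return min, max, qualified[0], qualified[1], nil
    }

    func main() {
        min, max, ns, name, err := parseNodeGroupSpec("1:5:shoot--prod--a.worker-pool-1")
        if err != nil {
            panic(err)
        }
        fmt.Println(min, max, ns, name) // 1 5 shoot--prod--a worker-pool-1
    }

Splitting the qualified name on the first "." mirrors the strings.Split(s.Name, ".") handling that this patch removes from GetMachineDeploymentForMachine in favor of the machineDeployments map lookup.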
@@ -381,36 +403,11 @@ func (m *McmManager) GetMachineDeploymentForMachine(machine *Ref) (*MachineDeplo return nil, fmt.Errorf("unable to find parent MachineDeployment of given MachineSet object %s %v", machineSetName, err) } - mcmRef := Ref{ - Name: machineDeploymentName, - Namespace: m.namespace, + machineDeployment, ok := m.machineDeployments[types.NamespacedName{Namespace: m.namespace, Name: machineDeploymentName}] + if !ok { + return nil, fmt.Errorf("machineDeployment %s not found in the list of machine deployments", machineDeploymentName) } - - discoveryOpts := m.discoveryOpts - specs := discoveryOpts.NodeGroupSpecs - var min, max int - for _, spec := range specs { - s, err := dynamic.SpecFromString(spec, true) - if err != nil { - return nil, fmt.Errorf("Error occurred while parsing the spec") - } - - str := strings.Split(s.Name, ".") - _, Name := str[0], str[1] - - if Name == machineDeploymentName { - min = s.MinSize - max = s.MaxSize - break - } - } - - return &MachineDeployment{ - mcmRef, - m, - min, - max, - }, nil + return machineDeployment, nil } // Refresh method, for each machine deployment, will reset the priority of the machines if the number of annotated machines is more than desired. @@ -428,12 +425,19 @@ func (m *McmManager) Refresh() error { klog.Infof("[Refresh] machine deployment %s is under rolling update, skipping", machineDeployment.Name) continue } + mcd, ok := m.machineDeployments[types.NamespacedName{Namespace: m.namespace, Name: machineDeployment.Name}] + if !ok { + klog.Errorf("[Refresh] machine deployment %s not found in the list of machine deployments", machineDeployment.Name) + continue + } + mcd.scalingMutex.Lock() markedMachines := sets.New(strings.Split(machineDeployment.Annotations[machinesMarkedByCAForDeletion], ",")...) // check if number of annotated machine objects is more than desired and correspondingly reset the priority annotation value if needed. machines, err := m.getMachinesForMachineDeployment(machineDeployment.Name) if err != nil { klog.Errorf("[Refresh] failed to get machines for machine deployment %s, hence skipping it. 
Err: %v", machineDeployment.Name, err.Error()) collectiveError = errors.Join(collectiveError, err) + mcd.scalingMutex.Unlock() continue } var incorrectlyMarkedMachines []*Ref @@ -447,6 +451,7 @@ func (m *McmManager) Refresh() error { } } collectiveError = errors.Join(collectiveError, m.resetPriorityForMachines(incorrectlyMarkedMachines)) + mcd.scalingMutex.Unlock() } return collectiveError } @@ -493,6 +498,9 @@ func (m *McmManager) DeleteMachines(targetMachineRefs []*Ref) error { if err != nil { return err } + // acquire the mutex + commonMachineDeployment.scalingMutex.Lock() + defer commonMachineDeployment.scalingMutex.Unlock() // get the machine deployment and return if rolling update is not finished md, err := m.machineDeploymentLister.MachineDeployments(m.namespace).Get(commonMachineDeployment.Name) if err != nil { From 98f20d36c819383109b0bb0a42f517310d3ca68c Mon Sep 17 00:00:00 2001 From: Rishabh Patel Date: Fri, 20 Dec 2024 12:45:51 +0530 Subject: [PATCH 03/27] initialise machinedeployment map in mcmManager --- cluster-autoscaler/cloudprovider/mcm/mcm_manager.go | 1 + 1 file changed, 1 insertion(+) diff --git a/cluster-autoscaler/cloudprovider/mcm/mcm_manager.go b/cluster-autoscaler/cloudprovider/mcm/mcm_manager.go index c9fcd1c1d30d..23ee0b5c642d 100644 --- a/cluster-autoscaler/cloudprovider/mcm/mcm_manager.go +++ b/cluster-autoscaler/cloudprovider/mcm/mcm_manager.go @@ -254,6 +254,7 @@ func createMCMManagerInternal(discoveryOpts cloudprovider.NodeGroupDiscoveryOpti m := &McmManager{ namespace: namespace, interrupt: make(chan struct{}), + machineDeployments: make(map[types.NamespacedName]*MachineDeployment), deploymentLister: deploymentLister, machineClient: controlMachineClient, machineClassLister: machineClassLister, From 56d80aca4881c5240d54112a137d216bba7e1ef6 Mon Sep 17 00:00:00 2001 From: Rishabh Patel Date: Fri, 20 Dec 2024 16:36:07 +0530 Subject: [PATCH 04/27] add Refresh method in nodegrp implementation --- .../cloudprovider/mcm/mcm_cloud_provider.go | 56 ++++++++++++ .../cloudprovider/mcm/mcm_manager.go | 89 +++++-------------- 2 files changed, 80 insertions(+), 65 deletions(-) diff --git a/cluster-autoscaler/cloudprovider/mcm/mcm_cloud_provider.go b/cluster-autoscaler/cloudprovider/mcm/mcm_cloud_provider.go index 41ac61010622..3d3febe2c953 100644 --- a/cluster-autoscaler/cloudprovider/mcm/mcm_cloud_provider.go +++ b/cluster-autoscaler/cloudprovider/mcm/mcm_cloud_provider.go @@ -24,6 +24,10 @@ package mcm import ( "context" "fmt" + "github.com/gardener/machine-controller-manager/pkg/apis/machine/v1alpha1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/util/sets" + "slices" "strconv" "strings" "sync" @@ -321,6 +325,8 @@ func (machinedeployment *MachineDeployment) IncreaseSize(delta int) error { if delta <= 0 { return fmt.Errorf("size increase must be positive") } + machinedeployment.scalingMutex.Lock() + defer machinedeployment.scalingMutex.Unlock() size, err := machinedeployment.mcmManager.GetMachineDeploymentSize(machinedeployment) if err != nil { return err @@ -344,6 +350,8 @@ func (machinedeployment *MachineDeployment) DecreaseTargetSize(delta int) error if delta >= 0 { return fmt.Errorf("size decrease size must be negative") } + machinedeployment.scalingMutex.Lock() + defer machinedeployment.scalingMutex.Unlock() size, err := machinedeployment.mcmManager.GetMachineDeploymentSize(machinedeployment) if err != nil { return err @@ -358,6 +366,54 @@ func (machinedeployment *MachineDeployment) DecreaseTargetSize(delta int) error }, 
"MachineDeployment", "update", machinedeployment.Name) } +// Refresh resets the priority annotation for the machines that are not present in machines-marked-by-ca-for-deletion annotation on the machineDeployment +func (machineDeployment *MachineDeployment) Refresh() error { + machineDeployment.scalingMutex.Lock() + defer machineDeployment.scalingMutex.Unlock() + mcd, err := machineDeployment.mcmManager.machineDeploymentLister.MachineDeployments(machineDeployment.Namespace).Get(machineDeployment.Name) + if err != nil { + return fmt.Errorf("failed to get machine deployment %s: %v", machineDeployment.Name, err) + } + // ignore the machine deployment if it is in rolling update + if !isRollingUpdateFinished(mcd) { + klog.Infof("machine deployment %s is under rolling update, skipping", machineDeployment.Name) + return nil + } + markedMachines := sets.New(strings.Split(mcd.Annotations[machinesMarkedByCAForDeletion], ",")...) + machines, err := machineDeployment.mcmManager.getMachinesForMachineDeployment(machineDeployment.Name) + if err != nil { + klog.Errorf("[Refresh] failed to get machines for machine deployment %s, hence skipping it. Err: %v", machineDeployment.Name, err.Error()) + return err + } + var incorrectlyMarkedMachines []*Ref + for _, machine := range machines { + // no need to reset priority for machines already in termination or failed phase + if machine.Status.CurrentStatus.Phase == v1alpha1.MachineTerminating || machine.Status.CurrentStatus.Phase == v1alpha1.MachineFailed { + continue + } + if annotValue, ok := machine.Annotations[machinePriorityAnnotation]; ok && annotValue == priorityValueForCandidateMachines && !markedMachines.Has(machine.Name) { + incorrectlyMarkedMachines = append(incorrectlyMarkedMachines, &Ref{Name: machine.Name, Namespace: machine.Namespace}) + } + } + var updatedMarkedMachines []string + for machineName := range markedMachines { + if slices.ContainsFunc(machines, func(mc *v1alpha1.Machine) bool { + return mc.Name == machineName + }) { + updatedMarkedMachines = append(updatedMarkedMachines, machineName) + } + } + clone := mcd.DeepCopy() + clone.Annotations[machinesMarkedByCAForDeletion] = strings.Join(updatedMarkedMachines, ",") + ctx, cancelFn := context.WithTimeout(context.Background(), machineDeployment.mcmManager.maxRetryTimeout) + defer cancelFn() + _, err = machineDeployment.mcmManager.machineClient.MachineDeployments(machineDeployment.Namespace).Update(ctx, clone, metav1.UpdateOptions{}) + if err != nil { + return err + } + return machineDeployment.mcmManager.resetPriorityForMachines(incorrectlyMarkedMachines) +} + // Belongs returns true if the given node belongs to the NodeGroup. // TODO: Implement this to iterate over machines under machinedeployment, and return true if node exists in list. func (machinedeployment *MachineDeployment) Belongs(node *apiv1.Node) (bool, error) { diff --git a/cluster-autoscaler/cloudprovider/mcm/mcm_manager.go b/cluster-autoscaler/cloudprovider/mcm/mcm_manager.go index 23ee0b5c642d..6ce7b5b70a3c 100644 --- a/cluster-autoscaler/cloudprovider/mcm/mcm_manager.go +++ b/cluster-autoscaler/cloudprovider/mcm/mcm_manager.go @@ -414,45 +414,9 @@ func (m *McmManager) GetMachineDeploymentForMachine(machine *Ref) (*MachineDeplo // Refresh method, for each machine deployment, will reset the priority of the machines if the number of annotated machines is more than desired. // It will select the machines to reset the priority based on the descending order of creation timestamp. 
func (m *McmManager) Refresh() error { - machineDeployments, err := m.machineDeploymentLister.MachineDeployments(m.namespace).List(labels.Everything()) - if err != nil { - klog.Errorf("[Refresh] unable to list machine deployments") - return err - } var collectiveError error - for _, machineDeployment := range machineDeployments { - // ignore the machine deployment if it is in rolling update - if !isRollingUpdateFinished(machineDeployment) { - klog.Infof("[Refresh] machine deployment %s is under rolling update, skipping", machineDeployment.Name) - continue - } - mcd, ok := m.machineDeployments[types.NamespacedName{Namespace: m.namespace, Name: machineDeployment.Name}] - if !ok { - klog.Errorf("[Refresh] machine deployment %s not found in the list of machine deployments", machineDeployment.Name) - continue - } - mcd.scalingMutex.Lock() - markedMachines := sets.New(strings.Split(machineDeployment.Annotations[machinesMarkedByCAForDeletion], ",")...) - // check if number of annotated machine objects is more than desired and correspondingly reset the priority annotation value if needed. - machines, err := m.getMachinesForMachineDeployment(machineDeployment.Name) - if err != nil { - klog.Errorf("[Refresh] failed to get machines for machine deployment %s, hence skipping it. Err: %v", machineDeployment.Name, err.Error()) - collectiveError = errors.Join(collectiveError, err) - mcd.scalingMutex.Unlock() - continue - } - var incorrectlyMarkedMachines []*Ref - for _, machine := range machines { - // no need to reset priority for machines already in termination or failed phase - if machine.Status.CurrentStatus.Phase == v1alpha1.MachineTerminating || machine.Status.CurrentStatus.Phase == v1alpha1.MachineFailed { - continue - } - if annotValue, ok := machine.Annotations[machinePriorityAnnotation]; ok && annotValue == priorityValueForCandidateMachines && !markedMachines.Has(machine.Name) { - incorrectlyMarkedMachines = append(incorrectlyMarkedMachines, &Ref{Name: machine.Name, Namespace: machine.Namespace}) - } - } - collectiveError = errors.Join(collectiveError, m.resetPriorityForMachines(incorrectlyMarkedMachines)) - mcd.scalingMutex.Unlock() + for _, machineDeployment := range m.machineDeployments { + collectiveError = errors.Join(collectiveError, machineDeployment.Refresh()) } return collectiveError } @@ -512,28 +476,18 @@ func (m *McmManager) DeleteMachines(targetMachineRefs []*Ref) error { return fmt.Errorf("MachineDeployment %s is under rolling update , cannot reduce replica count", commonMachineDeployment.Name) } markedMachines := sets.New(strings.Split(md.Annotations[machinesMarkedByCAForDeletion], ",")...) - var filteredTargetMachineRefs []*Ref - for _, targetMachineRef := range targetMachineRefs { - if !markedMachines.Has(targetMachineRef.Name) { - filteredTargetMachineRefs = append(filteredTargetMachineRefs, targetMachineRef) - markedMachines.Insert(targetMachineRef.Name) - } else { - klog.Infof("Machine %s is already marked for deletion, skipping", targetMachineRef.Name) - } - } - // update priorities of machines to be deleted except the ones already in termination to 1 - err = m.prioritizeMachinesForDeletion(filteredTargetMachineRefs) + machinesWithPrio1, err := m.prioritizeMachinesForDeletion(targetMachineRefs) if err != nil { return err } + markedMachines.Insert(machinesWithPrio1...) 
// Trying to update the machineDeployment till the deadline err = m.retry(func(ctx context.Context) (bool, error) { - return m.scaleDownMachineDeployment(ctx, commonMachineDeployment.Name, len(filteredTargetMachineRefs), strings.Join(markedMachines.UnsortedList(), ",")) + return m.scaleDownAndAnnotateMachineDeployment(ctx, commonMachineDeployment.Name, len(machinesWithPrio1), strings.Join(markedMachines.UnsortedList(), ",")) }, "MachineDeployment", "update", commonMachineDeployment.Name) if err != nil { - klog.Errorf("unable to scale in machine deployment %s, will reset priority of target machines, Error: %v", commonMachineDeployment.Name, err) - return errors.Join(err, m.resetPriorityForMachines(filteredTargetMachineRefs)) + klog.Errorf("unable to scale in machine deployment %s, Error: %v", commonMachineDeployment.Name, err) } return nil } @@ -543,6 +497,10 @@ func (m *McmManager) resetPriorityForMachines(mcRefs []*Ref) error { var collectiveError error for _, mcRef := range mcRefs { machine, err := m.machineLister.Machines(m.namespace).Get(mcRef.Name) + if kube_errors.IsNotFound(err) { + klog.Warningf("Machine %s not found, skipping resetting priority annotation", mcRef.Name) + continue + } if err != nil { collectiveError = errors.Join(collectiveError, fmt.Errorf("unable to get Machine object %s, Error: %v", mcRef, err)) continue @@ -566,8 +524,9 @@ func (m *McmManager) resetPriorityForMachines(mcRefs []*Ref) error { } // prioritizeMachinesForDeletion prioritizes the targeted machines by updating their priority annotation to 1 -func (m *McmManager) prioritizeMachinesForDeletion(targetMachineRefs []*Ref) error { +func (m *McmManager) prioritizeMachinesForDeletion(targetMachineRefs []*Ref) ([]string, error) { var expectedToTerminateMachineNodePairs = make(map[string]string) + var machinesMarkedWithPrio1 []string for _, machineRef := range targetMachineRefs { // Trying to update the priority of machineRef till m.maxRetryTimeout if err := m.retry(func(ctx context.Context) (bool, error) { @@ -583,15 +542,20 @@ func (m *McmManager) prioritizeMachinesForDeletion(targetMachineRefs []*Ref) err if isMachineFailedOrTerminating(mc) { return false, nil } + if mc.Annotations[machinePriorityAnnotation] == priorityValueForCandidateMachines { + klog.Infof("Machine %q priority is already set to 1, hence skipping the update", mc.Name) + return false, nil + } + machinesMarkedWithPrio1 = append(machinesMarkedWithPrio1, machineRef.Name) expectedToTerminateMachineNodePairs[mc.Name] = mc.Labels["node"] return m.updateAnnotationOnMachine(ctx, mc.Name, machinePriorityAnnotation, priorityValueForCandidateMachines) }, "Machine", "update", machineRef.Name); err != nil { klog.Errorf("could not prioritize machine %s for deletion, aborting scale in of machine deployment, Error: %v", machineRef.Name, err) - return fmt.Errorf("could not prioritize machine %s for deletion, aborting scale in of machine deployment, Error: %v", machineRef.Name, err) + return nil, fmt.Errorf("could not prioritize machine %s for deletion, aborting scale in of machine deployment, Error: %v", machineRef.Name, err) } } klog.V(2).Infof("Expected to remove following {machineRef: corresponding node} pairs %s", expectedToTerminateMachineNodePairs) - return nil + return machinesMarkedWithPrio1, nil } // updateAnnotationOnMachine returns error only when updating the annotations on machine has been failing consequently and deadline is crossed @@ -606,16 +570,10 @@ func (m *McmManager) updateAnnotationOnMachine(ctx context.Context, mcName strin return 
true, err } clone := machine.DeepCopy() - if clone.Annotations != nil { - if clone.Annotations[key] == val { - klog.Infof("Machine %q priority is already set to 1, hence skipping the update", machine.Name) - return false, nil - } - clone.Annotations[key] = val - } else { + if clone.Annotations == nil { clone.Annotations = make(map[string]string) - clone.Annotations[key] = val } + clone.Annotations[key] = val _, err = m.machineClient.Machines(machine.Namespace).Update(ctx, clone, metav1.UpdateOptions{}) if err == nil { klog.Infof("Machine %s marked with priority %s successfully", mcName, val) @@ -623,8 +581,9 @@ func (m *McmManager) updateAnnotationOnMachine(ctx context.Context, mcName strin return true, err } -// scaleDownMachineDeployment scales down the machine deployment by the provided scaleDownAmount and returns the updated spec.Replicas after scale down. -func (m *McmManager) scaleDownMachineDeployment(ctx context.Context, mdName string, scaleDownAmount int, markedMachines string) (bool, error) { +// scaleDownAndAnnotateMachineDeployment scales down the machine deployment by the provided scaleDownAmount and returns the updated spec.Replicas after scale down. +// It also updates the machines-marked-by-ca-for-deletion annotation on the machine deployment with the list of existing machines marked for deletion. +func (m *McmManager) scaleDownAndAnnotateMachineDeployment(ctx context.Context, mdName string, scaleDownAmount int, markedMachines string) (bool, error) { md, err := m.machineDeploymentLister.MachineDeployments(m.namespace).Get(mdName) if err != nil { klog.Errorf("Unable to fetch MachineDeployment object %s, Error: %v", mdName, err) From f3774f4be4b9027e8a98e2aa53a90fe34389344c Mon Sep 17 00:00:00 2001 From: Rishabh Patel Date: Tue, 24 Dec 2024 17:53:33 +0530 Subject: [PATCH 05/27] address review comments --- .../cloudprovider/mcm/mcm_cloud_provider.go | 138 +++++++++--------- .../cloudprovider/mcm/mcm_manager.go | 11 +- 2 files changed, 73 insertions(+), 76 deletions(-) diff --git a/cluster-autoscaler/cloudprovider/mcm/mcm_cloud_provider.go b/cluster-autoscaler/cloudprovider/mcm/mcm_cloud_provider.go index 3d3febe2c953..eff0e4bfaabf 100644 --- a/cluster-autoscaler/cloudprovider/mcm/mcm_cloud_provider.go +++ b/cluster-autoscaler/cloudprovider/mcm/mcm_cloud_provider.go @@ -229,14 +229,8 @@ func (mcm *mcmCloudProvider) GetNodeGpuConfig(*apiv1.Node) *cloudprovider.GpuCon return nil } -// Ref contains a reference to the name of the machine-deployment. -type Ref struct { - Name string - Namespace string -} - // ReferenceFromProviderID extracts the Ref from providerId. It returns corresponding machine-name to providerid. -func ReferenceFromProviderID(m *McmManager, id string) (*Ref, error) { +func ReferenceFromProviderID(m *McmManager, id string) (*types.NamespacedName, error) { machines, err := m.machineLister.Machines(m.namespace).List(labels.Everything()) if err != nil { return nil, fmt.Errorf("Could not list machines due to error: %s", err) @@ -262,7 +256,7 @@ func ReferenceFromProviderID(m *McmManager, id string) (*Ref, error) { klog.V(4).Infof("No machine found for node ID %q", id) return nil, nil } - return &Ref{ + return &types.NamespacedName{ Name: Name, Namespace: Namespace, }, nil @@ -270,7 +264,7 @@ func ReferenceFromProviderID(m *McmManager, id string) (*Ref, error) { // MachineDeployment implements NodeGroup interface. 
type MachineDeployment struct { - Ref + types.NamespacedName mcmManager *McmManager @@ -280,64 +274,64 @@ type MachineDeployment struct { } // MaxSize returns maximum size of the node group. -func (machinedeployment *MachineDeployment) MaxSize() int { - return machinedeployment.maxSize +func (machineDeployment *MachineDeployment) MaxSize() int { + return machineDeployment.maxSize } // MinSize returns minimum size of the node group. -func (machinedeployment *MachineDeployment) MinSize() int { - return machinedeployment.minSize +func (machineDeployment *MachineDeployment) MinSize() int { + return machineDeployment.minSize } // TargetSize returns the current TARGET size of the node group. It is possible that the // number is different from the number of nodes registered in Kubernetes. -func (machinedeployment *MachineDeployment) TargetSize() (int, error) { - size, err := machinedeployment.mcmManager.GetMachineDeploymentSize(machinedeployment) +func (machineDeployment *MachineDeployment) TargetSize() (int, error) { + size, err := machineDeployment.mcmManager.GetMachineDeploymentSize(machineDeployment) return int(size), err } // Exist checks if the node group really exists on the cloud provider side. Allows to tell the // theoretical node group from the real one. // TODO: Implement this to check if machine-deployment really exists. -func (machinedeployment *MachineDeployment) Exist() bool { +func (machineDeployment *MachineDeployment) Exist() bool { return true } // Create creates the node group on the cloud provider side. -func (machinedeployment *MachineDeployment) Create() (cloudprovider.NodeGroup, error) { +func (machineDeployment *MachineDeployment) Create() (cloudprovider.NodeGroup, error) { return nil, cloudprovider.ErrAlreadyExist } // Autoprovisioned returns true if the node group is autoprovisioned. -func (machinedeployment *MachineDeployment) Autoprovisioned() bool { +func (machineDeployment *MachineDeployment) Autoprovisioned() bool { return false } // Delete deletes the node group on the cloud provider side. // This will be executed only for autoprovisioned node groups, once their size drops to 0. -func (machinedeployment *MachineDeployment) Delete() error { +func (machineDeployment *MachineDeployment) Delete() error { return cloudprovider.ErrNotImplemented } // IncreaseSize of the Machinedeployment. 
-func (machinedeployment *MachineDeployment) IncreaseSize(delta int) error { - klog.V(0).Infof("Received request to increase size of machine deployment %s by %d", machinedeployment.Name, delta) +func (machineDeployment *MachineDeployment) IncreaseSize(delta int) error { + klog.V(0).Infof("Received request to increase size of machine deployment %s by %d", machineDeployment.Name, delta) if delta <= 0 { return fmt.Errorf("size increase must be positive") } - machinedeployment.scalingMutex.Lock() - defer machinedeployment.scalingMutex.Unlock() - size, err := machinedeployment.mcmManager.GetMachineDeploymentSize(machinedeployment) + machineDeployment.scalingMutex.Lock() + defer machineDeployment.scalingMutex.Unlock() + size, err := machineDeployment.mcmManager.GetMachineDeploymentSize(machineDeployment) if err != nil { return err } targetSize := int(size) + delta - if targetSize > machinedeployment.MaxSize() { - return fmt.Errorf("size increase too large - desired:%d max:%d", targetSize, machinedeployment.MaxSize()) + if targetSize > machineDeployment.MaxSize() { + return fmt.Errorf("size increase too large - desired:%d max:%d", targetSize, machineDeployment.MaxSize()) } - return machinedeployment.mcmManager.retry(func(ctx context.Context) (bool, error) { - return machinedeployment.mcmManager.SetMachineDeploymentSize(ctx, machinedeployment, int64(targetSize)) - }, "MachineDeployment", "update", machinedeployment.Name) + return machineDeployment.mcmManager.retry(func(ctx context.Context) (bool, error) { + return machineDeployment.mcmManager.SetMachineDeploymentSize(ctx, machineDeployment, int64(targetSize)) + }, "MachineDeployment", "update", machineDeployment.Name) } // DecreaseTargetSize decreases the target size of the node group. This function @@ -345,25 +339,25 @@ func (machinedeployment *MachineDeployment) IncreaseSize(delta int) error { // request for new nodes that have not been yet fulfilled. Delta should be negative. // It is assumed that cloud provider will not delete the existing nodes if the size // when there is an option to just decrease the target. -func (machinedeployment *MachineDeployment) DecreaseTargetSize(delta int) error { - klog.V(0).Infof("Received request to decrease target size of machine deployment %s by %d", machinedeployment.Name, delta) +func (machineDeployment *MachineDeployment) DecreaseTargetSize(delta int) error { + klog.V(0).Infof("Received request to decrease target size of machine deployment %s by %d", machineDeployment.Name, delta) if delta >= 0 { return fmt.Errorf("size decrease size must be negative") } - machinedeployment.scalingMutex.Lock() - defer machinedeployment.scalingMutex.Unlock() - size, err := machinedeployment.mcmManager.GetMachineDeploymentSize(machinedeployment) + machineDeployment.scalingMutex.Lock() + defer machineDeployment.scalingMutex.Unlock() + size, err := machineDeployment.mcmManager.GetMachineDeploymentSize(machineDeployment) if err != nil { return err } decreaseAmount := int(size) + delta - if decreaseAmount < machinedeployment.minSize { - klog.Warningf("Cannot go below min size= %d for machineDeployment %s, requested target size= %d . Setting target size to min size", machinedeployment.minSize, machinedeployment.Name, size+int64(delta)) - decreaseAmount = machinedeployment.minSize + if decreaseAmount < machineDeployment.minSize { + klog.Warningf("Cannot go below min size= %d for machineDeployment %s, requested target size= %d . 
Setting target size to min size", machineDeployment.minSize, machineDeployment.Name, size+int64(delta)) + decreaseAmount = machineDeployment.minSize } - return machinedeployment.mcmManager.retry(func(ctx context.Context) (bool, error) { - return machinedeployment.mcmManager.SetMachineDeploymentSize(ctx, machinedeployment, int64(decreaseAmount)) - }, "MachineDeployment", "update", machinedeployment.Name) + return machineDeployment.mcmManager.retry(func(ctx context.Context) (bool, error) { + return machineDeployment.mcmManager.SetMachineDeploymentSize(ctx, machineDeployment, int64(decreaseAmount)) + }, "MachineDeployment", "update", machineDeployment.Name) } // Refresh resets the priority annotation for the machines that are not present in machines-marked-by-ca-for-deletion annotation on the machineDeployment @@ -379,20 +373,20 @@ func (machineDeployment *MachineDeployment) Refresh() error { klog.Infof("machine deployment %s is under rolling update, skipping", machineDeployment.Name) return nil } - markedMachines := sets.New(strings.Split(mcd.Annotations[machinesMarkedByCAForDeletion], ",")...) + markedMachines := getMachinesMarkedByCAForDeletion(mcd) machines, err := machineDeployment.mcmManager.getMachinesForMachineDeployment(machineDeployment.Name) if err != nil { klog.Errorf("[Refresh] failed to get machines for machine deployment %s, hence skipping it. Err: %v", machineDeployment.Name, err.Error()) return err } - var incorrectlyMarkedMachines []*Ref + var incorrectlyMarkedMachines []*types.NamespacedName for _, machine := range machines { // no need to reset priority for machines already in termination or failed phase if machine.Status.CurrentStatus.Phase == v1alpha1.MachineTerminating || machine.Status.CurrentStatus.Phase == v1alpha1.MachineFailed { continue } if annotValue, ok := machine.Annotations[machinePriorityAnnotation]; ok && annotValue == priorityValueForCandidateMachines && !markedMachines.Has(machine.Name) { - incorrectlyMarkedMachines = append(incorrectlyMarkedMachines, &Ref{Name: machine.Name, Namespace: machine.Namespace}) + incorrectlyMarkedMachines = append(incorrectlyMarkedMachines, &types.NamespacedName{Name: machine.Name, Namespace: machine.Namespace}) } } var updatedMarkedMachines []string @@ -416,19 +410,19 @@ func (machineDeployment *MachineDeployment) Refresh() error { // Belongs returns true if the given node belongs to the NodeGroup. // TODO: Implement this to iterate over machines under machinedeployment, and return true if node exists in list. -func (machinedeployment *MachineDeployment) Belongs(node *apiv1.Node) (bool, error) { - ref, err := ReferenceFromProviderID(machinedeployment.mcmManager, node.Spec.ProviderID) +func (machineDeployment *MachineDeployment) Belongs(node *apiv1.Node) (bool, error) { + ref, err := ReferenceFromProviderID(machineDeployment.mcmManager, node.Spec.ProviderID) if err != nil { return false, err } - targetMd, err := machinedeployment.mcmManager.GetMachineDeploymentForMachine(ref) + targetMd, err := machineDeployment.mcmManager.GetMachineDeploymentForMachine(ref) if err != nil { return false, err } if targetMd == nil { return false, fmt.Errorf("%s doesn't belong to a known MachinDeployment", node.Name) } - if targetMd.Id() != machinedeployment.Id() { + if targetMd.Id() != machineDeployment.Id() { return false, nil } return true, nil @@ -436,31 +430,31 @@ func (machinedeployment *MachineDeployment) Belongs(node *apiv1.Node) (bool, err // DeleteNodes deletes the nodes from the group. 
It is expected that this method will not be called // for nodes which are not part of ANY machine deployment. -func (machinedeployment *MachineDeployment) DeleteNodes(nodes []*apiv1.Node) error { +func (machineDeployment *MachineDeployment) DeleteNodes(nodes []*apiv1.Node) error { nodeNames := getNodeNames(nodes) klog.V(0).Infof("Received request to delete nodes:- %v", nodeNames) - size, err := machinedeployment.mcmManager.GetMachineDeploymentSize(machinedeployment) + size, err := machineDeployment.mcmManager.GetMachineDeploymentSize(machineDeployment) if err != nil { return err } - if int(size) <= machinedeployment.MinSize() { + if int(size) <= machineDeployment.MinSize() { return fmt.Errorf("min size reached, nodes will not be deleted") } - machines := make([]*Ref, 0, len(nodes)) + machines := make([]*types.NamespacedName, 0, len(nodes)) for _, node := range nodes { - belongs, err := machinedeployment.Belongs(node) + belongs, err := machineDeployment.Belongs(node) if err != nil { return err } else if !belongs { - return fmt.Errorf("%s belongs to a different machinedeployment than %s", node.Name, machinedeployment.Id()) + return fmt.Errorf("%s belongs to a different machinedeployment than %s", node.Name, machineDeployment.Id()) } - ref, err := ReferenceFromProviderID(machinedeployment.mcmManager, node.Spec.ProviderID) + ref, err := ReferenceFromProviderID(machineDeployment.mcmManager, node.Spec.ProviderID) if err != nil { return fmt.Errorf("couldn't find the machine-name from provider-id %s", node.Spec.ProviderID) } machines = append(machines, ref) } - return machinedeployment.mcmManager.DeleteMachines(machines) + return machineDeployment.mcmManager.DeleteMachines(machines) } func getNodeNames(nodes []*apiv1.Node) interface{} { @@ -472,20 +466,20 @@ func getNodeNames(nodes []*apiv1.Node) interface{} { } // Id returns machinedeployment id. -func (machinedeployment *MachineDeployment) Id() string { - return machinedeployment.Name +func (machineDeployment *MachineDeployment) Id() string { + return machineDeployment.Name } // Debug returns a debug string for the Asg. -func (machinedeployment *MachineDeployment) Debug() string { - return fmt.Sprintf("%s (%d:%d)", machinedeployment.Id(), machinedeployment.MinSize(), machinedeployment.MaxSize()) +func (machineDeployment *MachineDeployment) Debug() string { + return fmt.Sprintf("%s (%d:%d)", machineDeployment.Id(), machineDeployment.MinSize(), machineDeployment.MaxSize()) } // Nodes returns a list of all nodes that belong to this node group. -func (machinedeployment *MachineDeployment) Nodes() ([]cloudprovider.Instance, error) { - instances, err := machinedeployment.mcmManager.GetInstancesForMachineDeployment(machinedeployment) +func (machineDeployment *MachineDeployment) Nodes() ([]cloudprovider.Instance, error) { + instances, err := machineDeployment.mcmManager.GetInstancesForMachineDeployment(machineDeployment) if err != nil { - return nil, fmt.Errorf("failed to get the cloudprovider.Instance for machines backed by the machinedeployment %q, error: %v", machinedeployment.Name, err) + return nil, fmt.Errorf("failed to get the cloudprovider.Instance for machines backed by the machinedeployment %q, error: %v", machineDeployment.Name, err) } erroneousInstanceInfos := make([]string, 0, len(instances)) for _, instance := range instances { @@ -502,9 +496,9 @@ func (machinedeployment *MachineDeployment) Nodes() ([]cloudprovider.Instance, e // GetOptions returns NodeGroupAutoscalingOptions that should be used for this particular // NodeGroup. 
Returning a nil will result in using default options. // Implementation optional. -func (machinedeployment *MachineDeployment) GetOptions(defaults config.NodeGroupAutoscalingOptions) (*config.NodeGroupAutoscalingOptions, error) { +func (machineDeployment *MachineDeployment) GetOptions(defaults config.NodeGroupAutoscalingOptions) (*config.NodeGroupAutoscalingOptions, error) { options := defaults - mcdAnnotations, err := machinedeployment.mcmManager.GetMachineDeploymentAnnotations(machinedeployment.Name) + mcdAnnotations, err := machineDeployment.mcmManager.GetMachineDeploymentAnnotations(machineDeployment.Name) if err != nil { return nil, err } @@ -538,25 +532,25 @@ func (machinedeployment *MachineDeployment) GetOptions(defaults config.NodeGroup } // TemplateNodeInfo returns a node template for this node group. -func (machinedeployment *MachineDeployment) TemplateNodeInfo() (*schedulerframework.NodeInfo, error) { +func (machineDeployment *MachineDeployment) TemplateNodeInfo() (*schedulerframework.NodeInfo, error) { - nodeTemplate, err := machinedeployment.mcmManager.GetMachineDeploymentNodeTemplate(machinedeployment) + nodeTemplate, err := machineDeployment.mcmManager.GetMachineDeploymentNodeTemplate(machineDeployment) if err != nil { return nil, err } - node, err := machinedeployment.mcmManager.buildNodeFromTemplate(machinedeployment.Name, nodeTemplate) + node, err := machineDeployment.mcmManager.buildNodeFromTemplate(machineDeployment.Name, nodeTemplate) if err != nil { return nil, err } - nodeInfo := schedulerframework.NewNodeInfo(cloudprovider.BuildKubeProxy(machinedeployment.Name)) + nodeInfo := schedulerframework.NewNodeInfo(cloudprovider.BuildKubeProxy(machineDeployment.Name)) nodeInfo.SetNode(node) return nodeInfo, nil } // AtomicIncreaseSize is not implemented. -func (machinedeployment *MachineDeployment) AtomicIncreaseSize(delta int) error { +func (machineDeployment *MachineDeployment) AtomicIncreaseSize(delta int) error { return cloudprovider.ErrNotImplemented } @@ -573,13 +567,17 @@ func buildMachineDeploymentFromSpec(value string, mcmManager *McmManager) (*Mach return machinedeployment, nil } +func getMachinesMarkedByCAForDeletion(mcd *v1alpha1.MachineDeployment) sets.Set[string] { + return sets.New(strings.Split(mcd.Annotations[machinesMarkedByCAForDeletion], ",")...) +} + func buildMachineDeployment(mcmManager *McmManager, minSize int, maxSize int, namespace string, name string) *MachineDeployment { return &MachineDeployment{ mcmManager: mcmManager, minSize: minSize, maxSize: maxSize, scalingMutex: sync.Mutex{}, - Ref: Ref{ + NamespacedName: types.NamespacedName{ Name: name, Namespace: namespace, }, diff --git a/cluster-autoscaler/cloudprovider/mcm/mcm_manager.go b/cluster-autoscaler/cloudprovider/mcm/mcm_manager.go index 6ce7b5b70a3c..71f682df9c8f 100644 --- a/cluster-autoscaler/cloudprovider/mcm/mcm_manager.go +++ b/cluster-autoscaler/cloudprovider/mcm/mcm_manager.go @@ -28,7 +28,6 @@ import ( "flag" "fmt" "k8s.io/apimachinery/pkg/types" - "k8s.io/apimachinery/pkg/util/sets" v1appslister "k8s.io/client-go/listers/apps/v1" "k8s.io/utils/pointer" "maps" @@ -370,7 +369,7 @@ func CreateMcmManager(discoveryOpts cloudprovider.NodeGroupDiscoveryOptions) (*M } // GetMachineDeploymentForMachine returns the MachineDeployment for the Machine object. 
-func (m *McmManager) GetMachineDeploymentForMachine(machine *Ref) (*MachineDeployment, error) { +func (m *McmManager) GetMachineDeploymentForMachine(machine *types.NamespacedName) (*MachineDeployment, error) { if machine.Name == "" { // Considering the possibility when Machine has been deleted but due to cached Node object it appears here. return nil, fmt.Errorf("Node does not Exists") @@ -455,7 +454,7 @@ func (m *McmManager) SetMachineDeploymentSize(ctx context.Context, machinedeploy } // DeleteMachines annotates the target machines and also reduces the desired replicas of the MachineDeployment. -func (m *McmManager) DeleteMachines(targetMachineRefs []*Ref) error { +func (m *McmManager) DeleteMachines(targetMachineRefs []*types.NamespacedName) error { if len(targetMachineRefs) == 0 { return nil } @@ -475,7 +474,7 @@ func (m *McmManager) DeleteMachines(targetMachineRefs []*Ref) error { if !isRollingUpdateFinished(md) { return fmt.Errorf("MachineDeployment %s is under rolling update , cannot reduce replica count", commonMachineDeployment.Name) } - markedMachines := sets.New(strings.Split(md.Annotations[machinesMarkedByCAForDeletion], ",")...) + markedMachines := getMachinesMarkedByCAForDeletion(md) // update priorities of machines to be deleted except the ones already in termination to 1 machinesWithPrio1, err := m.prioritizeMachinesForDeletion(targetMachineRefs) if err != nil { @@ -493,7 +492,7 @@ func (m *McmManager) DeleteMachines(targetMachineRefs []*Ref) error { } // resetPriorityForMachines resets the priority of machines passed in the argument to defaultPriorityValue -func (m *McmManager) resetPriorityForMachines(mcRefs []*Ref) error { +func (m *McmManager) resetPriorityForMachines(mcRefs []*types.NamespacedName) error { var collectiveError error for _, mcRef := range mcRefs { machine, err := m.machineLister.Machines(m.namespace).Get(mcRef.Name) @@ -524,7 +523,7 @@ func (m *McmManager) resetPriorityForMachines(mcRefs []*Ref) error { } // prioritizeMachinesForDeletion prioritizes the targeted machines by updating their priority annotation to 1 -func (m *McmManager) prioritizeMachinesForDeletion(targetMachineRefs []*Ref) ([]string, error) { +func (m *McmManager) prioritizeMachinesForDeletion(targetMachineRefs []*types.NamespacedName) ([]string, error) { var expectedToTerminateMachineNodePairs = make(map[string]string) var machinesMarkedWithPrio1 []string for _, machineRef := range targetMachineRefs { From 9063248d3af6d37f75834154954584207d586722 Mon Sep 17 00:00:00 2001 From: Rishabh Patel Date: Thu, 26 Dec 2024 15:01:48 +0530 Subject: [PATCH 06/27] address review comments - part 2 --- .../cloudprovider/mcm/mcm_cloud_provider.go | 78 +++++-------- .../cloudprovider/mcm/mcm_manager.go | 104 ++++++++++++------ 2 files changed, 97 insertions(+), 85 deletions(-) diff --git a/cluster-autoscaler/cloudprovider/mcm/mcm_cloud_provider.go b/cluster-autoscaler/cloudprovider/mcm/mcm_cloud_provider.go index eff0e4bfaabf..ed570472e598 100644 --- a/cluster-autoscaler/cloudprovider/mcm/mcm_cloud_provider.go +++ b/cluster-autoscaler/cloudprovider/mcm/mcm_cloud_provider.go @@ -26,7 +26,6 @@ import ( "fmt" "github.com/gardener/machine-controller-manager/pkg/apis/machine/v1alpha1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/apimachinery/pkg/util/sets" "slices" "strconv" "strings" @@ -39,7 +38,6 @@ import ( "k8s.io/apimachinery/pkg/types" "k8s.io/autoscaler/cluster-autoscaler/cloudprovider" "k8s.io/autoscaler/cluster-autoscaler/config" - "k8s.io/autoscaler/cluster-autoscaler/config/dynamic" 
"k8s.io/autoscaler/cluster-autoscaler/utils/errors" "k8s.io/klog/v2" schedulerframework "k8s.io/kubernetes/pkg/scheduler/framework" @@ -364,47 +362,47 @@ func (machineDeployment *MachineDeployment) DecreaseTargetSize(delta int) error func (machineDeployment *MachineDeployment) Refresh() error { machineDeployment.scalingMutex.Lock() defer machineDeployment.scalingMutex.Unlock() - mcd, err := machineDeployment.mcmManager.machineDeploymentLister.MachineDeployments(machineDeployment.Namespace).Get(machineDeployment.Name) + mcd, err := machineDeployment.mcmManager.GetMachineDeploymentResource(machineDeployment.Name) if err != nil { - return fmt.Errorf("failed to get machine deployment %s: %v", machineDeployment.Name, err) - } - // ignore the machine deployment if it is in rolling update - if !isRollingUpdateFinished(mcd) { - klog.Infof("machine deployment %s is under rolling update, skipping", machineDeployment.Name) - return nil + return err } - markedMachines := getMachinesMarkedByCAForDeletion(mcd) + markedMachineNames := getMachineNamesMarkedByCAForDeletion(mcd) machines, err := machineDeployment.mcmManager.getMachinesForMachineDeployment(machineDeployment.Name) if err != nil { klog.Errorf("[Refresh] failed to get machines for machine deployment %s, hence skipping it. Err: %v", machineDeployment.Name, err.Error()) return err } - var incorrectlyMarkedMachines []*types.NamespacedName - for _, machine := range machines { - // no need to reset priority for machines already in termination or failed phase - if machine.Status.CurrentStatus.Phase == v1alpha1.MachineTerminating || machine.Status.CurrentStatus.Phase == v1alpha1.MachineFailed { - continue - } - if annotValue, ok := machine.Annotations[machinePriorityAnnotation]; ok && annotValue == priorityValueForCandidateMachines && !markedMachines.Has(machine.Name) { - incorrectlyMarkedMachines = append(incorrectlyMarkedMachines, &types.NamespacedName{Name: machine.Name, Namespace: machine.Namespace}) - } - } - var updatedMarkedMachines []string - for machineName := range markedMachines { + // update the machines-marked-by-ca-for-deletion annotation with the machines that are still marked for deletion by CA. + // This is done to ensure that the machines that are no longer present are removed from the annotation. 
+ var updatedMarkedMachineNames []string + for _, machineName := range markedMachineNames { if slices.ContainsFunc(machines, func(mc *v1alpha1.Machine) bool { return mc.Name == machineName }) { - updatedMarkedMachines = append(updatedMarkedMachines, machineName) + updatedMarkedMachineNames = append(updatedMarkedMachineNames, machineName) } } clone := mcd.DeepCopy() - clone.Annotations[machinesMarkedByCAForDeletion] = strings.Join(updatedMarkedMachines, ",") + clone.Annotations[machinesMarkedByCAForDeletion] = createMachinesMarkedForDeletionAnnotationValue(updatedMarkedMachineNames) ctx, cancelFn := context.WithTimeout(context.Background(), machineDeployment.mcmManager.maxRetryTimeout) defer cancelFn() _, err = machineDeployment.mcmManager.machineClient.MachineDeployments(machineDeployment.Namespace).Update(ctx, clone, metav1.UpdateOptions{}) if err != nil { return err } + // reset the priority for the machines that are not present in machines-marked-by-ca-for-deletion annotation + var incorrectlyMarkedMachines []types.NamespacedName + for _, machine := range machines { + // no need to reset priority for machines already in termination or failed phase + if isMachineFailedOrTerminating(machine) { + continue + } + // check if the machine is marked for deletion by CA but not present in machines-marked-by-ca-for-deletion annotation. This means that CA was not able to reduce the replicas + // corresponding to this machine and hence the machine should not be marked for deletion. + if annotValue, ok := machine.Annotations[machinePriorityAnnotation]; ok && annotValue == priorityValueForDeletionCandidateMachines && !slices.Contains(markedMachineNames, machine.Name) { + incorrectlyMarkedMachines = append(incorrectlyMarkedMachines, types.NamespacedName{Name: machine.Name, Namespace: machine.Namespace}) + } + } return machineDeployment.mcmManager.resetPriorityForMachines(incorrectlyMarkedMachines) } @@ -554,32 +552,10 @@ func (machineDeployment *MachineDeployment) AtomicIncreaseSize(delta int) error return cloudprovider.ErrNotImplemented } -func buildMachineDeploymentFromSpec(value string, mcmManager *McmManager) (*MachineDeployment, error) { - spec, err := dynamic.SpecFromString(value, true) - - if err != nil { - return nil, fmt.Errorf("failed to parse node group spec: %v", err) - } - s := strings.Split(spec.Name, ".") - Namespace, Name := s[0], s[1] - - machinedeployment := buildMachineDeployment(mcmManager, spec.MinSize, spec.MaxSize, Namespace, Name) - return machinedeployment, nil -} - -func getMachinesMarkedByCAForDeletion(mcd *v1alpha1.MachineDeployment) sets.Set[string] { - return sets.New(strings.Split(mcd.Annotations[machinesMarkedByCAForDeletion], ",")...) -} - -func buildMachineDeployment(mcmManager *McmManager, minSize int, maxSize int, namespace string, name string) *MachineDeployment { - return &MachineDeployment{ - mcmManager: mcmManager, - minSize: minSize, - maxSize: maxSize, - scalingMutex: sync.Mutex{}, - NamespacedName: types.NamespacedName{ - Name: name, - Namespace: namespace, - }, +// getMachineNamesMarkedByCAForDeletion returns the set of machine names marked by CA for deletion. 
+func getMachineNamesMarkedByCAForDeletion(mcd *v1alpha1.MachineDeployment) []string { + if mcd.Annotations == nil || mcd.Annotations[machinesMarkedByCAForDeletion] == "" { + return make([]string, 0) } + return strings.Split(mcd.Annotations[machinesMarkedByCAForDeletion], ",") } diff --git a/cluster-autoscaler/cloudprovider/mcm/mcm_manager.go b/cluster-autoscaler/cloudprovider/mcm/mcm_manager.go index 71f682df9c8f..8d4e3cdce84f 100644 --- a/cluster-autoscaler/cloudprovider/mcm/mcm_manager.go +++ b/cluster-autoscaler/cloudprovider/mcm/mcm_manager.go @@ -28,6 +28,7 @@ import ( "flag" "fmt" "k8s.io/apimachinery/pkg/types" + "k8s.io/autoscaler/cluster-autoscaler/config/dynamic" v1appslister "k8s.io/client-go/listers/apps/v1" "k8s.io/utils/pointer" "maps" @@ -37,6 +38,7 @@ import ( "slices" "strconv" "strings" + "sync" "time" awsapis "github.com/gardener/machine-controller-manager-provider-aws/pkg/aws/apis" @@ -77,9 +79,9 @@ const ( defaultResetAnnotationTimeout = 10 * time.Second // defaultPriorityValue is the default value for the priority annotation used by CA. It is set to 3 because MCM defaults the priority of machine it creates to 3. defaultPriorityValue = "3" - // priorityValueForCandidateMachines is the priority annotation value set on machines that the CA wants to be deleted. Its value is set to 1. - priorityValueForCandidateMachines = "1" - minResyncPeriodDefault = 1 * time.Hour + // priorityValueForDeletionCandidateMachines is the priority annotation value set on machines that the CA wants to be deleted. Its value is set to 1. + priorityValueForDeletionCandidateMachines = "1" + minResyncPeriodDefault = 1 * time.Hour // machinePriorityAnnotation is the annotation to set machine priority while deletion machinePriorityAnnotation = "machinepriority.machine.sapcloud.io" // kindMachineClass is the kind for generic machine class used by the OOT providers @@ -413,11 +415,11 @@ func (m *McmManager) GetMachineDeploymentForMachine(machine *types.NamespacedNam // Refresh method, for each machine deployment, will reset the priority of the machines if the number of annotated machines is more than desired. // It will select the machines to reset the priority based on the descending order of creation timestamp. func (m *McmManager) Refresh() error { - var collectiveError error + var collectiveError []error for _, machineDeployment := range m.machineDeployments { - collectiveError = errors.Join(collectiveError, machineDeployment.Refresh()) + collectiveError = append(collectiveError, machineDeployment.Refresh()) } - return collectiveError + return errors.Join(collectiveError...) } // Cleanup does nothing at the moment. @@ -428,18 +430,17 @@ func (m *McmManager) Cleanup() { // GetMachineDeploymentSize returns the replicas field of the MachineDeployment func (m *McmManager) GetMachineDeploymentSize(machinedeployment *MachineDeployment) (int64, error) { - md, err := m.machineDeploymentLister.MachineDeployments(m.namespace).Get(machinedeployment.Name) + md, err := m.GetMachineDeploymentResource(machinedeployment.Name) if err != nil { - return 0, fmt.Errorf("Unable to fetch MachineDeployment object %s %v", machinedeployment.Name, err) + return 0, err } return int64(md.Spec.Replicas), nil } // SetMachineDeploymentSize sets the desired size for the Machinedeployment. 
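The error-handling change in `Refresh` above (and in `resetPriorityForMachines` further down) collects errors into a slice and joins once, rather than re-joining pairwise on every iteration. This is behavior-preserving: `errors.Join` drops nil entries and returns nil when every argument is nil, so an error-free pass still yields a nil error. For instance:

```go
package main

import (
	"errors"
	"fmt"
)

func main() {
	var errs []error
	fmt.Println(errors.Join(errs...) == nil) // true: nothing collected, nil error

	errs = append(errs, nil, errors.New("boom"))
	joined := errors.Join(errs...)
	fmt.Println(joined) // "boom": nil entries are dropped
	// errors.Is still sees through the joined error:
	fmt.Println(errors.Is(joined, errs[1])) // true
}
```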
func (m *McmManager) SetMachineDeploymentSize(ctx context.Context, machinedeployment *MachineDeployment, size int64) (bool, error) { - md, err := m.machineDeploymentLister.MachineDeployments(m.namespace).Get(machinedeployment.Name) + md, err := m.GetMachineDeploymentResource(machinedeployment.Name) if err != nil { - klog.Errorf("Unable to fetch MachineDeployment object %s, Error: %v", machinedeployment.Name, err) return true, err } // don't scale down during rolling update, as that could remove ready node with workload @@ -466,24 +467,23 @@ func (m *McmManager) DeleteMachines(targetMachineRefs []*types.NamespacedName) e commonMachineDeployment.scalingMutex.Lock() defer commonMachineDeployment.scalingMutex.Unlock() // get the machine deployment and return if rolling update is not finished - md, err := m.machineDeploymentLister.MachineDeployments(m.namespace).Get(commonMachineDeployment.Name) + md, err := m.GetMachineDeploymentResource(commonMachineDeployment.Name) if err != nil { - klog.Errorf("Unable to fetch MachineDeployment object %s, Error: %v", commonMachineDeployment.Name, err) return err } if !isRollingUpdateFinished(md) { return fmt.Errorf("MachineDeployment %s is under rolling update , cannot reduce replica count", commonMachineDeployment.Name) } - markedMachines := getMachinesMarkedByCAForDeletion(md) + machineNamesMarkedByCA := getMachineNamesMarkedByCAForDeletion(md) // update priorities of machines to be deleted except the ones already in termination to 1 - machinesWithPrio1, err := m.prioritizeMachinesForDeletion(targetMachineRefs) + machineNamesWithPrio1, err := m.prioritizeMachinesForDeletion(targetMachineRefs) if err != nil { return err } - markedMachines.Insert(machinesWithPrio1...) + machineNamesMarkedByCA = append(machineNamesMarkedByCA, machineNamesWithPrio1...) 
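Note that `machineNamesWithPrio1` contains only machines newly moved to priority 1; `prioritizeMachinesForDeletion` (below) skips machines already annotated, so the appended union stays duplicate-free and the replica count is reduced only by the newly marked machines. That is what makes repeated `DeleteMachines` calls for the same node idempotent, as `TestIdempotencyOfDeleteNodes` later in this series verifies. A reduced sketch of that bookkeeping, with a hypothetical plain map standing in for machine objects:

```go
package main

import "fmt"

// markForDeletion returns only the names newly marked, skipping ones already
// marked; the caller scales down by the newly marked count alone.
func markForDeletion(priorities map[string]string, targets []string) []string {
	var newlyMarked []string
	for _, name := range targets {
		if priorities[name] == "1" {
			continue // already a deletion candidate, must not be counted again
		}
		priorities[name] = "1"
		newlyMarked = append(newlyMarked, name)
	}
	return newlyMarked
}

func main() {
	prios := map[string]string{"machine-1": "3", "machine-2": "3"}
	replicas := 2
	replicas -= len(markForDeletion(prios, []string{"machine-1"})) // scales down to 1
	replicas -= len(markForDeletion(prios, []string{"machine-1"})) // second call is a no-op
	fmt.Println(replicas)                                          // 1
}
```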
// Trying to update the machineDeployment till the deadline err = m.retry(func(ctx context.Context) (bool, error) { - return m.scaleDownAndAnnotateMachineDeployment(ctx, commonMachineDeployment.Name, len(machinesWithPrio1), strings.Join(markedMachines.UnsortedList(), ",")) + return m.scaleDownAndAnnotateMachineDeployment(ctx, commonMachineDeployment.Name, len(machineNamesWithPrio1), createMachinesMarkedForDeletionAnnotationValue(machineNamesMarkedByCA)) }, "MachineDeployment", "update", commonMachineDeployment.Name) if err != nil { klog.Errorf("unable to scale in machine deployment %s, Error: %v", commonMachineDeployment.Name, err) @@ -492,8 +492,8 @@ func (m *McmManager) DeleteMachines(targetMachineRefs []*types.NamespacedName) e } // resetPriorityForMachines resets the priority of machines passed in the argument to defaultPriorityValue -func (m *McmManager) resetPriorityForMachines(mcRefs []*types.NamespacedName) error { - var collectiveError error +func (m *McmManager) resetPriorityForMachines(mcRefs []types.NamespacedName) error { + var collectiveError []error for _, mcRef := range mcRefs { machine, err := m.machineLister.Machines(m.namespace).Get(mcRef.Name) if kube_errors.IsNotFound(err) { @@ -501,7 +501,7 @@ func (m *McmManager) resetPriorityForMachines(mcRefs []*types.NamespacedName) er continue } if err != nil { - collectiveError = errors.Join(collectiveError, fmt.Errorf("unable to get Machine object %s, Error: %v", mcRef, err)) + collectiveError = append(collectiveError, fmt.Errorf("unable to get Machine object %s, Error: %v", mcRef, err)) continue } ctx, cancelFn := context.WithDeadline(context.Background(), time.Now().Add(defaultResetAnnotationTimeout)) @@ -515,17 +515,18 @@ func (m *McmManager) resetPriorityForMachines(mcRefs []*types.NamespacedName) er return nil }() if err != nil { - collectiveError = errors.Join(collectiveError, fmt.Errorf("could not reset priority annotation on machine %s, Error: %v", machine.Name, err)) + collectiveError = append(collectiveError, fmt.Errorf("could not reset priority annotation on machine %s, Error: %v", machine.Name, err)) continue } } - return collectiveError + return errors.Join(collectiveError...) 
} // prioritizeMachinesForDeletion prioritizes the targeted machines by updating their priority annotation to 1 func (m *McmManager) prioritizeMachinesForDeletion(targetMachineRefs []*types.NamespacedName) ([]string, error) { var expectedToTerminateMachineNodePairs = make(map[string]string) - var machinesMarkedWithPrio1 []string + var prio1MarkedMachineNames []string + for _, machineRef := range targetMachineRefs { // Trying to update the priority of machineRef till m.maxRetryTimeout if err := m.retry(func(ctx context.Context) (bool, error) { @@ -541,20 +542,20 @@ func (m *McmManager) prioritizeMachinesForDeletion(targetMachineRefs []*types.Na if isMachineFailedOrTerminating(mc) { return false, nil } - if mc.Annotations[machinePriorityAnnotation] == priorityValueForCandidateMachines { + if mc.Annotations[machinePriorityAnnotation] == priorityValueForDeletionCandidateMachines { klog.Infof("Machine %q priority is already set to 1, hence skipping the update", mc.Name) return false, nil } - machinesMarkedWithPrio1 = append(machinesMarkedWithPrio1, machineRef.Name) + prio1MarkedMachineNames = append(prio1MarkedMachineNames, machineRef.Name) expectedToTerminateMachineNodePairs[mc.Name] = mc.Labels["node"] - return m.updateAnnotationOnMachine(ctx, mc.Name, machinePriorityAnnotation, priorityValueForCandidateMachines) + return m.updateAnnotationOnMachine(ctx, mc.Name, machinePriorityAnnotation, priorityValueForDeletionCandidateMachines) }, "Machine", "update", machineRef.Name); err != nil { klog.Errorf("could not prioritize machine %s for deletion, aborting scale in of machine deployment, Error: %v", machineRef.Name, err) return nil, fmt.Errorf("could not prioritize machine %s for deletion, aborting scale in of machine deployment, Error: %v", machineRef.Name, err) } } klog.V(2).Infof("Expected to remove following {machineRef: corresponding node} pairs %s", expectedToTerminateMachineNodePairs) - return machinesMarkedWithPrio1, nil + return prio1MarkedMachineNames, nil } // updateAnnotationOnMachine returns error only when updating the annotations on machine has been failing consequently and deadline is crossed @@ -583,9 +584,8 @@ func (m *McmManager) updateAnnotationOnMachine(ctx context.Context, mcName strin // scaleDownAndAnnotateMachineDeployment scales down the machine deployment by the provided scaleDownAmount and returns the updated spec.Replicas after scale down. // It also updates the machines-marked-by-ca-for-deletion annotation on the machine deployment with the list of existing machines marked for deletion. 
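The `m.retry` helper itself is not shown in this diff; from its call sites it evidently re-invokes the closure until success, a non-retriable result, or the configured `maxRetryTimeout`. A generic sketch assuming that (retriable, error) contract, with illustrative names:

```go
package main

import (
	"context"
	"errors"
	"fmt"
	"time"
)

// retryUntil re-runs fn until it succeeds, reports a non-retriable failure,
// or the context deadline expires. It only assumes the (retriable bool, err
// error) shape visible at the call sites above.
func retryUntil(ctx context.Context, interval time.Duration, fn func(context.Context) (bool, error)) error {
	for {
		retriable, err := fn(ctx)
		if err == nil || !retriable {
			return err
		}
		select {
		case <-ctx.Done():
			return errors.Join(err, ctx.Err())
		case <-time.After(interval):
		}
	}
}

func main() {
	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
	defer cancel()
	attempts := 0
	err := retryUntil(ctx, 100*time.Millisecond, func(context.Context) (bool, error) {
		attempts++
		if attempts < 3 {
			return true, fmt.Errorf("transient failure %d", attempts)
		}
		return false, nil
	})
	fmt.Println(attempts, err) // 3 <nil>
}
```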
func (m *McmManager) scaleDownAndAnnotateMachineDeployment(ctx context.Context, mdName string, scaleDownAmount int, markedMachines string) (bool, error) { - md, err := m.machineDeploymentLister.MachineDeployments(m.namespace).Get(mdName) + md, err := m.GetMachineDeploymentResource(mdName) if err != nil { - klog.Errorf("Unable to fetch MachineDeployment object %s, Error: %v", mdName, err) return true, err } mdclone := md.DeepCopy() @@ -732,21 +732,19 @@ func validateNodeTemplate(nodeTemplateAttributes *v1alpha1.NodeTemplate) error { // GetMachineDeploymentAnnotations returns the annotations present on the machine deployment for the provided machine deployment name func (m *McmManager) GetMachineDeploymentAnnotations(machineDeploymentName string) (map[string]string, error) { - md, err := m.machineDeploymentLister.MachineDeployments(m.namespace).Get(machineDeploymentName) + md, err := m.GetMachineDeploymentResource(machineDeploymentName) if err != nil { - return nil, fmt.Errorf("unable to fetch MachineDeployment object %s, Error: %v", machineDeploymentName, err) + return nil, err } - return md.Annotations, nil } // GetMachineDeploymentNodeTemplate returns the NodeTemplate of a node belonging to the same worker pool as the machinedeployment // If no node present then it forms the nodeTemplate using the one present in machineClass func (m *McmManager) GetMachineDeploymentNodeTemplate(machinedeployment *MachineDeployment) (*nodeTemplate, error) { - - md, err := m.machineDeploymentLister.MachineDeployments(m.namespace).Get(machinedeployment.Name) + md, err := m.GetMachineDeploymentResource(machinedeployment.Name) if err != nil { - return nil, fmt.Errorf("unable to fetch MachineDeployment object %s, Error: %v", machinedeployment.Name, err) + return nil, err } var ( @@ -892,6 +890,16 @@ func (m *McmManager) GetMachineDeploymentNodeTemplate(machinedeployment *Machine return nodeTmpl, nil } +// GetMachineDeploymentResource returns the MachineDeployment object for the provided machine deployment name +func (m *McmManager) GetMachineDeploymentResource(mdName string) (*v1alpha1.MachineDeployment, error) { + md, err := m.machineDeploymentLister.MachineDeployments(m.namespace).Get(mdName) + if err != nil { + klog.Errorf("unable to fetch MachineDeployment object %s, Error: %v", mdName, err) + return nil, fmt.Errorf("unable to fetch MachineDeployment object %s, Error: %v", mdName, err) + } + return md, nil +} + func isRollingUpdateFinished(md *v1alpha1.MachineDeployment) bool { for _, cond := range md.Status.Conditions { switch { @@ -1034,6 +1042,30 @@ func buildGenericLabels(template *nodeTemplate, nodeName string) map[string]stri return result } +func buildMachineDeploymentFromSpec(value string, mcmManager *McmManager) (*MachineDeployment, error) { + spec, err := dynamic.SpecFromString(value, true) + if err != nil { + return nil, fmt.Errorf("failed to parse node group spec: %v", err) + } + s := strings.Split(spec.Name, ".") + Namespace, Name := s[0], s[1] + machinedeployment := buildMachineDeployment(mcmManager, spec.MinSize, spec.MaxSize, Namespace, Name) + return machinedeployment, nil +} + +func buildMachineDeployment(mcmManager *McmManager, minSize int, maxSize int, namespace string, name string) *MachineDeployment { + return &MachineDeployment{ + mcmManager: mcmManager, + minSize: minSize, + maxSize: maxSize, + scalingMutex: sync.Mutex{}, + NamespacedName: types.NamespacedName{ + Name: name, + Namespace: namespace, + }, + } +} + // isMachineFailedOrTerminating returns true if machine is already being 
terminated or considered for termination by autoscaler. func isMachineFailedOrTerminating(machine *v1alpha1.Machine) bool { if !machine.GetDeletionTimestamp().IsZero() || machine.Status.CurrentStatus.Phase == v1alpha1.MachineFailed { @@ -1051,3 +1083,7 @@ func filterExtendedResources(allResources v1.ResourceList) (extendedResources v1 }) return } + +func createMachinesMarkedForDeletionAnnotationValue(machineNames []string) string { + return strings.Join(machineNames, ",") +} From 68d2046910072f74a194fa322172f53ce886536b Mon Sep 17 00:00:00 2001 From: Rishabh Patel Date: Sat, 28 Dec 2024 15:45:25 +0530 Subject: [PATCH 07/27] update unit tests, misc code changes --- .../cloudprovider/mcm/mcm_cloud_provider.go | 22 +- .../mcm/mcm_cloud_provider_test.go | 292 ++++++++---------- .../cloudprovider/mcm/mcm_manager.go | 31 +- .../cloudprovider/mcm/test_utils.go | 55 ++-- 4 files changed, 197 insertions(+), 203 deletions(-) diff --git a/cluster-autoscaler/cloudprovider/mcm/mcm_cloud_provider.go b/cluster-autoscaler/cloudprovider/mcm/mcm_cloud_provider.go index ed570472e598..bf6cda949dcc 100644 --- a/cluster-autoscaler/cloudprovider/mcm/mcm_cloud_provider.go +++ b/cluster-autoscaler/cloudprovider/mcm/mcm_cloud_provider.go @@ -383,15 +383,21 @@ func (machineDeployment *MachineDeployment) Refresh() error { } } clone := mcd.DeepCopy() - clone.Annotations[machinesMarkedByCAForDeletion] = createMachinesMarkedForDeletionAnnotationValue(updatedMarkedMachineNames) - ctx, cancelFn := context.WithTimeout(context.Background(), machineDeployment.mcmManager.maxRetryTimeout) - defer cancelFn() - _, err = machineDeployment.mcmManager.machineClient.MachineDeployments(machineDeployment.Namespace).Update(ctx, clone, metav1.UpdateOptions{}) - if err != nil { - return err + if clone.Annotations == nil { + clone.Annotations = map[string]string{} + } + updatedMachinesMarkedByCAForDeletionAnnotationVal := createMachinesMarkedForDeletionAnnotationValue(updatedMarkedMachineNames) + if clone.Annotations[machinesMarkedByCAForDeletion] != updatedMachinesMarkedByCAForDeletionAnnotationVal { + clone.Annotations[machinesMarkedByCAForDeletion] = updatedMachinesMarkedByCAForDeletionAnnotationVal + ctx, cancelFn := context.WithTimeout(context.Background(), machineDeployment.mcmManager.maxRetryTimeout) + defer cancelFn() + _, err = machineDeployment.mcmManager.machineClient.MachineDeployments(machineDeployment.Namespace).Update(ctx, clone, metav1.UpdateOptions{}) + if err != nil { + return err + } } // reset the priority for the machines that are not present in machines-marked-by-ca-for-deletion annotation - var incorrectlyMarkedMachines []types.NamespacedName + var incorrectlyMarkedMachines []string for _, machine := range machines { // no need to reset priority for machines already in termination or failed phase if isMachineFailedOrTerminating(machine) { @@ -400,7 +406,7 @@ func (machineDeployment *MachineDeployment) Refresh() error { // check if the machine is marked for deletion by CA but not present in machines-marked-by-ca-for-deletion annotation. This means that CA was not able to reduce the replicas // corresponding to this machine and hence the machine should not be marked for deletion. 
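Two Go details in the `Refresh` hunk above are easy to miss. First, writing into a nil `Annotations` map would panic at runtime (reads are safe, writes are not), hence the new nil guard, and the equality check avoids a no-op `Update` call on every refresh. Second, the parse/format helper pair introduced in this series round-trips only because the parse side special-cases the empty value: `strings.Split("", ",")` yields `[""]`, not an empty slice. Both in a self-contained sketch:

```go
package main

import (
	"fmt"
	"strings"
)

const annKey = "cluster-autoscaler.kubernetes.io/machines-marked-by-ca-for-deletion"

// parse mirrors getMachineNamesMarkedByCAForDeletion: an absent or empty
// annotation must yield no names, because strings.Split("", ",") == [""].
func parse(annotations map[string]string) []string {
	if annotations == nil || annotations[annKey] == "" {
		return nil
	}
	return strings.Split(annotations[annKey], ",")
}

func main() {
	var annotations map[string]string    // nil, as on objects created without annotations
	fmt.Println(len(parse(annotations))) // 0: reading a nil map is safe

	// annotations[annKey] = "machine-1" would panic: assignment to entry in nil map.
	if annotations == nil {
		annotations = map[string]string{}
	}
	annotations[annKey] = strings.Join([]string{"machine-1", "machine-2"}, ",")
	fmt.Println(parse(annotations)) // [machine-1 machine-2]
}
```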
if annotValue, ok := machine.Annotations[machinePriorityAnnotation]; ok && annotValue == priorityValueForDeletionCandidateMachines && !slices.Contains(markedMachineNames, machine.Name) { - incorrectlyMarkedMachines = append(incorrectlyMarkedMachines, types.NamespacedName{Name: machine.Name, Namespace: machine.Namespace}) + incorrectlyMarkedMachines = append(incorrectlyMarkedMachines, machine.Name) } } return machineDeployment.mcmManager.resetPriorityForMachines(incorrectlyMarkedMachines) diff --git a/cluster-autoscaler/cloudprovider/mcm/mcm_cloud_provider_test.go b/cluster-autoscaler/cloudprovider/mcm/mcm_cloud_provider_test.go index 6752c2497852..995f51834f95 100644 --- a/cluster-autoscaler/cloudprovider/mcm/mcm_cloud_provider_test.go +++ b/cluster-autoscaler/cloudprovider/mcm/mcm_cloud_provider_test.go @@ -9,6 +9,7 @@ import ( "errors" "fmt" "math" + "slices" "strings" "testing" "time" @@ -85,10 +86,11 @@ func TestDeleteNodes(t *testing.T) { node *corev1.Node } type expect struct { - machines []*v1alpha1.Machine - mdName string - mdReplicas int32 - err error + prio1Machines []*v1alpha1.Machine + mdName string + mdReplicas int32 + machinesMarkedByCAAnnotationValue string + err error } type data struct { name string @@ -100,42 +102,44 @@ func TestDeleteNodes(t *testing.T) { { "should scale down machine deployment to remove a node", setup{ - nodes: newNodes(2, "fakeID", []bool{true, false}), - machines: newMachines(2, "fakeID", nil, "machinedeployment-1", "machineset-1", []string{"3", "3"}, []bool{false, false}), + nodes: newNodes(2, "fakeID"), + machines: newMachines(2, "fakeID", nil, "machinedeployment-1", "machineset-1", []string{"3", "3"}), machineSets: newMachineSets(1, "machinedeployment-1"), machineDeployments: newMachineDeployments(1, 2, nil, nil, nil), nodeGroups: []string{nodeGroup1}, }, - action{node: newNodes(1, "fakeID", []bool{true})[0]}, + action{node: newNodes(1, "fakeID")[0]}, expect{ - machines: newMachines(1, "fakeID", nil, "machinedeployment-1", "machineset-1", []string{"1"}, []bool{false}), - mdName: "machinedeployment-1", - mdReplicas: 1, - err: nil, + prio1Machines: newMachines(1, "fakeID", nil, "machinedeployment-1", "machineset-1", []string{"1"}), + mdName: "machinedeployment-1", + machinesMarkedByCAAnnotationValue: createMachinesMarkedForDeletionAnnotationValue(generateNames("machine", 1)), + mdReplicas: 1, + err: nil, }, }, { "should scale down machine deployment to remove a placeholder node", setup{ nodes: nil, - machines: newMachines(1, "fakeID", nil, "machinedeployment-1", "machineset-1", []string{"3"}, []bool{false}), + machines: newMachines(1, "fakeID", nil, "machinedeployment-1", "machineset-1", []string{"3"}), machineSets: newMachineSets(1, "machinedeployment-1"), machineDeployments: newMachineDeployments(1, 1, nil, nil, nil), nodeGroups: []string{nodeGroup2}, }, - action{node: newNode("node-1", "requested://machine-1", true)}, + action{node: newNode("node-1", "requested://machine-1")}, expect{ - machines: newMachines(1, "fakeID", nil, "machinedeployment-1", "machineset-1", []string{"1"}, []bool{false}), - mdName: "machinedeployment-1", - mdReplicas: 0, - err: nil, + prio1Machines: newMachines(1, "fakeID", nil, "machinedeployment-1", "machineset-1", []string{"1"}), + machinesMarkedByCAAnnotationValue: createMachinesMarkedForDeletionAnnotationValue(generateNames("machine", 1)), + mdName: "machinedeployment-1", + mdReplicas: 0, + err: nil, }, }, { "should not scale down a machine deployment when it is under rolling update", setup{ - nodes: newNodes(2, 
"fakeID", []bool{true, false}), - machines: newMachines(2, "fakeID", nil, "machinedeployment-1", "machineset-1", []string{"3", "3"}, []bool{false, false}), + nodes: newNodes(2, "fakeID"), + machines: newMachines(2, "fakeID", nil, "machinedeployment-1", "machineset-1", []string{"3", "3"}), machineSets: newMachineSets(2, "machinedeployment-1"), machineDeployments: newMachineDeployments(1, 2, &v1alpha1.MachineDeploymentStatus{ Conditions: []v1alpha1.MachineDeploymentCondition{ @@ -144,19 +148,19 @@ func TestDeleteNodes(t *testing.T) { }, nil, nil), nodeGroups: []string{nodeGroup1}, }, - action{node: newNodes(1, "fakeID", []bool{true})[0]}, + action{node: newNodes(1, "fakeID")[0]}, expect{ - machines: nil, - mdName: "machinedeployment-1", - mdReplicas: 2, - err: fmt.Errorf("MachineDeployment machinedeployment-1 is under rolling update , cannot reduce replica count"), + prio1Machines: nil, + mdName: "machinedeployment-1", + mdReplicas: 2, + err: fmt.Errorf("MachineDeployment machinedeployment-1 is under rolling update , cannot reduce replica count"), }, }, { "should not scale down when machine deployment update call times out and should reset priority of the corresponding machine", setup{ - nodes: newNodes(2, "fakeID", []bool{true, false}), - machines: newMachines(2, "fakeID", nil, "machinedeployment-1", "machineset-1", []string{"3", "3"}, []bool{false, false}), + nodes: newNodes(2, "fakeID"), + machines: newMachines(2, "fakeID", nil, "machinedeployment-1", "machineset-1", []string{"3", "3"}), machineSets: newMachineSets(1, "machinedeployment-1"), machineDeployments: newMachineDeployments(1, 2, nil, nil, nil), nodeGroups: []string{nodeGroup1}, @@ -166,9 +170,8 @@ func TestDeleteNodes(t *testing.T) { }, }, }, - action{node: newNodes(1, "fakeID", []bool{true})[0]}, + action{node: newNodes(1, "fakeID")[0]}, expect{ - machines: newMachines(1, "fakeID", nil, "machinedeployment-1", "machineset-1", []string{"3"}, []bool{false}), mdName: "machinedeployment-1", mdReplicas: 2, err: errors.Join(nil, fmt.Errorf("unable to scale in machine deployment machinedeployment-1, Error: %w", errors.New(mdUpdateErrorMsg))), @@ -177,8 +180,8 @@ func TestDeleteNodes(t *testing.T) { { "should scale down when machine deployment update call fails but passes within the timeout period", setup{ - nodes: newNodes(2, "fakeID", []bool{true, false}), - machines: newMachines(2, "fakeID", nil, "machinedeployment-1", "machineset-1", []string{"3", "3"}, []bool{false, false}), + nodes: newNodes(2, "fakeID"), + machines: newMachines(2, "fakeID", nil, "machinedeployment-1", "machineset-1", []string{"3", "3"}), machineSets: newMachineSets(1, "machinedeployment-1"), machineDeployments: newMachineDeployments(1, 2, nil, nil, nil), nodeGroups: []string{nodeGroup1}, @@ -188,26 +191,26 @@ func TestDeleteNodes(t *testing.T) { }, }, }, - action{node: newNodes(1, "fakeID", []bool{true})[0]}, + action{node: newNodes(1, "fakeID")[0]}, expect{ - machines: newMachines(1, "fakeID", nil, "machinedeployment-1", "machineset-1", []string{"1"}, []bool{false}), - mdName: "machinedeployment-1", - mdReplicas: 1, - err: nil, + prio1Machines: newMachines(1, "fakeID", nil, "machinedeployment-1", "machineset-1", []string{"1"}), + machinesMarkedByCAAnnotationValue: createMachinesMarkedForDeletionAnnotationValue(generateNames("machine", 1)), + mdName: "machinedeployment-1", + mdReplicas: 1, + err: nil, }, }, { "should not scale down a machine deployment when the corresponding machine is already in terminating state", setup{ - nodes: newNodes(2, "fakeID", 
[]bool{true, false}), - machines: newMachines(2, "fakeID", nil, "machinedeployment-1", "machineset-1", []string{"3", "3"}, []bool{true, false}), + nodes: newNodes(2, "fakeID"), + machines: newMachines(2, "fakeID", &v1alpha1.MachineStatus{CurrentStatus: v1alpha1.CurrentStatus{Phase: v1alpha1.MachineTerminating}}, "machinedeployment-1", "machineset-1", []string{"3", "3"}), machineSets: newMachineSets(1, "machinedeployment-1"), machineDeployments: newMachineDeployments(1, 2, nil, nil, nil), nodeGroups: []string{nodeGroup1}, }, - action{node: newNodes(1, "fakeID", []bool{true})[0]}, + action{node: newNodes(1, "fakeID")[0]}, expect{ - machines: newMachines(1, "fakeID", nil, "machinedeployment-1", "machineset-1", []string{"3"}, []bool{true}), mdName: "machinedeployment-1", mdReplicas: 2, err: nil, @@ -216,15 +219,14 @@ func TestDeleteNodes(t *testing.T) { { "should not scale down a machine deployment when the corresponding machine is already in failed state", setup{ - nodes: newNodes(2, "fakeID", []bool{true, false}), - machines: newMachines(2, "fakeID", &v1alpha1.MachineStatus{CurrentStatus: v1alpha1.CurrentStatus{Phase: v1alpha1.MachineFailed}}, "machinedeployment-1", "machineset-1", []string{"3", "3"}, []bool{false, false}), + nodes: newNodes(2, "fakeID"), + machines: newMachines(2, "fakeID", &v1alpha1.MachineStatus{CurrentStatus: v1alpha1.CurrentStatus{Phase: v1alpha1.MachineFailed}}, "machinedeployment-1", "machineset-1", []string{"3", "3"}), machineSets: newMachineSets(1, "machinedeployment-1"), machineDeployments: newMachineDeployments(1, 2, nil, nil, nil), nodeGroups: []string{nodeGroup1}, }, - action{node: newNodes(1, "fakeID", []bool{false})[0]}, + action{node: newNodes(1, "fakeID")[0]}, expect{ - machines: newMachines(2, "fakeID", &v1alpha1.MachineStatus{CurrentStatus: v1alpha1.CurrentStatus{Phase: v1alpha1.MachineFailed}}, "machinedeployment-1", "machineset-1", []string{"3", "3"}, []bool{false, false}), mdName: "machinedeployment-1", mdReplicas: 2, err: nil, @@ -233,25 +235,25 @@ func TestDeleteNodes(t *testing.T) { { "should not scale down a machine deployment below the minimum", setup{ - nodes: newNodes(1, "fakeID", []bool{true}), - machines: newMachines(1, "fakeID", nil, "machinedeployment-1", "machineset-1", []string{"3"}, []bool{false}), + nodes: newNodes(1, "fakeID"), + machines: newMachines(1, "fakeID", nil, "machinedeployment-1", "machineset-1", []string{"3"}), machineSets: newMachineSets(1, "machinedeployment-1"), machineDeployments: newMachineDeployments(1, 1, nil, nil, nil), nodeGroups: []string{nodeGroup1}, }, - action{node: newNodes(1, "fakeID", []bool{true})[0]}, + action{node: newNodes(1, "fakeID")[0]}, expect{ - machines: nil, - mdName: "machinedeployment-1", - mdReplicas: 1, - err: fmt.Errorf("min size reached, nodes will not be deleted"), + prio1Machines: nil, + mdName: "machinedeployment-1", + mdReplicas: 1, + err: fmt.Errorf("min size reached, nodes will not be deleted"), }, }, { "no scale down of machine deployment if priority of the targeted machine cannot be updated to 1", setup{ - nodes: newNodes(2, "fakeID", []bool{true, false}), - machines: newMachines(2, "fakeID", nil, "machinedeployment-1", "machineset-1", []string{"3", "3"}, []bool{false, false}), + nodes: newNodes(2, "fakeID"), + machines: newMachines(2, "fakeID", nil, "machinedeployment-1", "machineset-1", []string{"3", "3"}), machineSets: newMachineSets(1, "machinedeployment-1"), machineDeployments: newMachineDeployments(1, 2, nil, nil, nil), nodeGroups: []string{nodeGroup1}, @@ -261,29 +263,29 @@ func 
TestDeleteNodes(t *testing.T) { }, }, }, - action{node: newNodes(1, "fakeID", []bool{true})[0]}, + action{node: newNodes(1, "fakeID")[0]}, expect{ - machines: nil, - mdName: "machinedeployment-1", - mdReplicas: 2, - err: fmt.Errorf("could not prioritize machine machine-1 for deletion, aborting scale in of machine deployment, Error: %s", mcUpdateErrorMsg), + prio1Machines: nil, + mdName: "machinedeployment-1", + mdReplicas: 2, + err: fmt.Errorf("could not prioritize machine machine-1 for deletion, aborting scale in of machine deployment, Error: %s", mcUpdateErrorMsg), }, }, { "should not scale down machine deployment if the node belongs to another machine deployment", setup{ - nodes: newNodes(2, "fakeID", []bool{true, false}), - machines: newMachines(2, "fakeID", nil, "machinedeployment-2", "machineset-1", []string{"3", "3"}, []bool{false, false}), + nodes: newNodes(2, "fakeID"), + machines: newMachines(2, "fakeID", nil, "machinedeployment-2", "machineset-1", []string{"3", "3"}), machineSets: newMachineSets(1, "machinedeployment-2"), machineDeployments: newMachineDeployments(2, 2, nil, nil, nil), nodeGroups: []string{nodeGroup2, nodeGroup3}, }, - action{node: newNodes(1, "fakeID", []bool{true})[0]}, + action{node: newNodes(1, "fakeID")[0]}, expect{ - machines: nil, - mdName: "machinedeployment-2", - mdReplicas: 2, - err: fmt.Errorf("node-1 belongs to a different machinedeployment than machinedeployment-1"), + prio1Machines: nil, + mdName: "machinedeployment-2", + mdReplicas: 2, + err: fmt.Errorf("node-1 belongs to a different machinedeployment than machinedeployment-1"), }, }, } @@ -296,7 +298,7 @@ func TestDeleteNodes(t *testing.T) { stop := make(chan struct{}) defer close(stop) controlMachineObjects, targetCoreObjects, _ := setupEnv(&entry.setup) - m, trackers, hasSyncedCacheFns := createMcmManager(t, stop, testNamespace, nil, controlMachineObjects, targetCoreObjects, nil) + m, trackers, hasSyncedCacheFns := createMcmManager(t, stop, testNamespace, entry.setup.nodeGroups, controlMachineObjects, targetCoreObjects, nil) defer trackers.Stop() waitForCacheSync(t, stop, hasSyncedCacheFns) @@ -321,6 +323,7 @@ func TestDeleteNodes(t *testing.T) { machineDeployment, err := m.machineClient.MachineDeployments(m.namespace).Get(context.TODO(), entry.expect.mdName, metav1.GetOptions{}) g.Expect(err).ToNot(HaveOccurred()) g.Expect(machineDeployment.Spec.Replicas).To(BeNumerically("==", entry.expect.mdReplicas)) + g.Expect(machineDeployment.Annotations[machinesMarkedByCAForDeletion]).To(Equal(entry.expect.machinesMarkedByCAAnnotationValue)) machines, err := m.machineClient.Machines(m.namespace).List(context.TODO(), metav1.ListOptions{ LabelSelector: metav1.FormatLabelSelector(&metav1.LabelSelector{ @@ -329,26 +332,52 @@ func TestDeleteNodes(t *testing.T) { }) for _, machine := range machines.Items { - flag := false - for _, entryMachineItem := range entry.expect.machines { - if entryMachineItem.Name == machine.Name { - g.Expect(machine.Annotations[machinePriorityAnnotation]).To(Equal(entryMachineItem.Annotations[machinePriorityAnnotation])) - flag = true - break - } - } - if !flag { - g.Expect(machine.Annotations[machinePriorityAnnotation]).To(Equal("3")) + if slices.ContainsFunc(entry.expect.prio1Machines, func(m *v1alpha1.Machine) bool { + return machine.Name == m.Name + }) { + g.Expect(machine.Annotations[machinePriorityAnnotation]).To(Equal(priorityValueForDeletionCandidateMachines)) + } else { + g.Expect(machine.Annotations[machinePriorityAnnotation]).To(Equal(defaultPriorityValue)) } } }) } } 
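The rewritten assertion above swaps the manual found-flag loop for `slices.ContainsFunc` (standard library since Go 1.21), which reports whether any element satisfies a predicate. In isolation:

```go
package main

import (
	"fmt"
	"slices"
)

type machine struct{ Name string }

func main() {
	expected := []*machine{{Name: "machine-1"}}
	got := machine{Name: "machine-1"}

	// Equivalent to the removed flag-based loop: true if any element matches.
	marked := slices.ContainsFunc(expected, func(m *machine) bool {
		return m.Name == got.Name
	})
	fmt.Println(marked) // true
}
```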
+func TestIdempotencyOfDeleteNodes(t *testing.T) { + setupObj := setup{ + nodes: newNodes(3, "fakeID"), + machines: newMachines(3, "fakeID", nil, "machinedeployment-1", "machineset-1", []string{"3", "3", "3"}), + machineSets: newMachineSets(1, "machinedeployment-1"), + machineDeployments: newMachineDeployments(1, 3, nil, nil, nil), + nodeGroups: []string{nodeGroup1}, + } + g := NewWithT(t) + stop := make(chan struct{}) + defer close(stop) + controlMachineObjects, targetCoreObjects, _ := setupEnv(&setupObj) + m, trackers, hasSyncedCacheFns := createMcmManager(t, stop, testNamespace, setupObj.nodeGroups, controlMachineObjects, targetCoreObjects, nil) + defer trackers.Stop() + waitForCacheSync(t, stop, hasSyncedCacheFns) + md, err := buildMachineDeploymentFromSpec(setupObj.nodeGroups[0], m) + g.Expect(err).To(BeNil()) + + err = md.DeleteNodes(newNodes(1, "fakeID")) + g.Expect(err).To(BeNil()) + err = md.DeleteNodes(newNodes(1, "fakeID")) + g.Expect(err).To(BeNil()) + + machineDeployment, err := m.machineClient.MachineDeployments(m.namespace).Get(context.TODO(), setupObj.machineDeployments[0].Name, metav1.GetOptions{}) + g.Expect(err).ToNot(HaveOccurred()) + g.Expect(machineDeployment.Spec.Replicas).To(BeNumerically("==", 2)) + g.Expect(machineDeployment.Annotations[machinesMarkedByCAForDeletion]).To(Equal(createMachinesMarkedForDeletionAnnotationValue(generateNames("machine", 1)))) +} + func TestRefresh(t *testing.T) { type expect struct { - machines []*v1alpha1.Machine - err error + prio3Machines []string + machinesMarkedByCAForDeletionAnnotationValue string + err error } type data struct { name string @@ -359,8 +388,8 @@ func TestRefresh(t *testing.T) { { "should return an error if MCM has zero available replicas", setup{ - nodes: newNodes(1, "fakeID", []bool{false}), - machines: newMachines(1, "fakeID", nil, "machinedeployment-1", "machineset-1", []string{"1"}, []bool{false}), + nodes: newNodes(1, "fakeID"), + machines: newMachines(1, "fakeID", nil, "machinedeployment-1", "machineset-1", []string{"1"}), machineDeployments: newMachineDeployments(1, 1, nil, nil, nil), nodeGroups: []string{nodeGroup2}, mcmDeployment: newMCMDeployment(0), @@ -372,8 +401,8 @@ func TestRefresh(t *testing.T) { { "should return an error if MCM deployment is not found", setup{ - nodes: newNodes(1, "fakeID", []bool{false}), - machines: newMachines(1, "fakeID", nil, "machinedeployment-1", "machineset-1", []string{"1"}, []bool{false}), + nodes: newNodes(1, "fakeID"), + machines: newMachines(1, "fakeID", nil, "machinedeployment-1", "machineset-1", []string{"1"}), machineDeployments: newMachineDeployments(1, 1, nil, nil, nil), nodeGroups: []string{nodeGroup2}, }, @@ -382,88 +411,38 @@ func TestRefresh(t *testing.T) { }, }, { - "should reset priority of a machine to 3 if machine deployment is not scaled in", + "should reset priority of a machine if it is not present in machines-marked-by-ca-for-deletion annotation on machine deployment", setup{ - nodes: newNodes(1, "fakeID", []bool{false}), - machines: newMachines(1, "fakeID", nil, "machinedeployment-1", "machineset-1", []string{"1"}, []bool{false}), + nodes: newNodes(1, "fakeID"), + machines: newMachines(1, "fakeID", nil, "machinedeployment-1", "machineset-1", []string{"1"}), machineDeployments: newMachineDeployments(1, 1, nil, nil, nil), nodeGroups: []string{nodeGroup2}, mcmDeployment: newMCMDeployment(1), }, expect{ - machines: newMachines(1, "fakeID", nil, "machinedeployment-1", "machineset-1", []string{"3"}, []bool{false}), - err: nil, + prio3Machines: 
generateNames("machine", 1), + err: nil, }, }, { - "should reset priority of a machine to 3 if machine deployment is not scaled in even if ToBeDeletedTaint is present on the corresponding node", + "should update the machines-marked-by-ca-for-deletion annotation and remove non-existing machines", setup{ - nodes: newNodes(1, "fakeID", []bool{true}), - machines: newMachines(1, "fakeID", nil, "machinedeployment-1", "machineset-1", []string{"1"}, []bool{false}), - machineDeployments: newMachineDeployments(1, 1, nil, nil, nil), + nodes: newNodes(1, "fakeID"), + machines: newMachines(1, "fakeID", nil, "machinedeployment-1", "machineset-1", []string{"1"}), + machineDeployments: newMachineDeployments(1, 0, nil, map[string]string{machinesMarkedByCAForDeletion: "machine-1,machine-2"}, nil), nodeGroups: []string{nodeGroup2}, mcmDeployment: newMCMDeployment(1), }, expect{ - machines: newMachines(1, "fakeID", nil, "machinedeployment-1", "machineset-1", []string{"3"}, []bool{false}), - err: nil, - }, - }, - { - "should NOT skip paused machine deployment", - setup{ - nodes: newNodes(1, "fakeID", []bool{false}), - machines: newMachines(1, "fakeID", nil, "machinedeployment-1", "machineset-1", []string{"1"}, []bool{false}), - machineDeployments: newMachineDeployments(1, 1, &v1alpha1.MachineDeploymentStatus{ - Conditions: []v1alpha1.MachineDeploymentCondition{ - {Type: v1alpha1.MachineDeploymentProgressing, Status: v1alpha1.ConditionUnknown, Reason: machineDeploymentPausedReason}, - }, - }, nil, nil), - nodeGroups: []string{nodeGroup2}, - mcmDeployment: newMCMDeployment(1), - }, - expect{ - machines: newMachines(1, "fakeID", nil, "machinedeployment-1", "machineset-1", []string{"3"}, []bool{false}), - err: nil, - }, - }, - { - "should ignore terminating/failed machines in checking if number of annotated machines is more than desired", - setup{ - nodes: newNodes(1, "fakeID", []bool{true}), - machines: newMachines(1, "fakeID", &v1alpha1.MachineStatus{ - CurrentStatus: v1alpha1.CurrentStatus{Phase: v1alpha1.MachineFailed}, - }, "machinedeployment-1", "machineset-1", []string{"1"}, []bool{false}), - machineDeployments: newMachineDeployments(1, 1, nil, nil, nil), - nodeGroups: []string{nodeGroup2}, - mcmDeployment: newMCMDeployment(1), - }, - expect{ - machines: newMachines(1, "fakeID", &v1alpha1.MachineStatus{ - CurrentStatus: v1alpha1.CurrentStatus{Phase: v1alpha1.MachineFailed}, - }, "machinedeployment-1", "machineset-1", []string{"1"}, []bool{false}), + machinesMarkedByCAForDeletionAnnotationValue: createMachinesMarkedForDeletionAnnotationValue(generateNames("machine", 1)), err: nil, }, }, - { - "should not reset priority of a machine to 3 if machine deployment is scaled in", - setup{ - nodes: newNodes(1, "fakeID", []bool{true}), - machines: newMachines(1, "fakeID", nil, "machinedeployment-1", "machineset-1", []string{"1"}, []bool{false}), - machineDeployments: newMachineDeployments(1, 0, nil, nil, nil), - nodeGroups: []string{nodeGroup2}, - mcmDeployment: newMCMDeployment(1), - }, - expect{ - machines: newMachines(1, "fakeID", nil, "machinedeployment-1", "machineset-1", []string{"1"}, []bool{false}), - err: nil, - }, - }, { "priority reset of machine fails", setup{ - nodes: newNodes(1, "fakeID", []bool{false}), - machines: newMachines(1, "fakeID", nil, "machinedeployment-1", "machineset-1", []string{"1"}, []bool{false}), + nodes: newNodes(1, "fakeID"), + machines: newMachines(1, "fakeID", nil, "machinedeployment-1", "machineset-1", []string{"1"}), machineDeployments: newMachineDeployments(1, 1, nil, nil, nil), 
controlMachineFakeResourceActions: &customfake.ResourceActions{ Machine: customfake.Actions{ @@ -474,8 +453,7 @@ func TestRefresh(t *testing.T) { mcmDeployment: newMCMDeployment(1), }, expect{ - machines: []*v1alpha1.Machine{newMachine("machine-1", "fakeID-1", nil, "machinedeployment-1", "machineset-1", "1", false, true)}, - err: errors.Join(nil, errors.Join(fmt.Errorf("could not reset priority annotation on machine machine-1, Error: %v", mcUpdateErrorMsg))), + err: errors.Join(nil, errors.Join(fmt.Errorf("could not reset priority annotation on machine machine-1, Error: %v", mcUpdateErrorMsg))), }, }, } @@ -505,10 +483,14 @@ func TestRefresh(t *testing.T) { } else { g.Expect(err).To(BeNil()) } - for _, mc := range entry.expect.machines { - machine, err := m.machineClient.Machines(m.namespace).Get(context.TODO(), mc.Name, metav1.GetOptions{}) - g.Expect(err).To(BeNil()) - g.Expect(mc.Annotations[machinePriorityAnnotation]).To(Equal(machine.Annotations[machinePriorityAnnotation])) + machines, err := m.machineClient.Machines(m.namespace).List(context.TODO(), metav1.ListOptions{}) + g.Expect(err).To(BeNil()) + for _, mc := range machines.Items { + if slices.Contains(entry.expect.prio3Machines, mc.Name) { + g.Expect(mc.Annotations[machinePriorityAnnotation]).To(Equal(defaultPriorityValue)) + } else { + g.Expect(mc.Annotations[machinePriorityAnnotation]).To(Equal(priorityValueForDeletionCandidateMachines)) + } } }) } @@ -554,14 +536,14 @@ func TestNodes(t *testing.T) { { "Correct instances should be returned for machine objects under the machinedeployment", setup{ - nodes: []*corev1.Node{newNode("node-1", "fakeID-1", false)}, + nodes: []*corev1.Node{newNode("node-1", "fakeID-1")}, machines: func() []*v1alpha1.Machine { allMachines := make([]*v1alpha1.Machine, 0, 5) - allMachines = append(allMachines, newMachine("machine-with-registered-node", "fakeID-1", nil, "machinedeployment-1", "", "", false, true)) - allMachines = append(allMachines, newMachine("machine-with-vm-but-no-node", "fakeID-2", nil, "machinedeployment-1", "", "", false, false)) - allMachines = append(allMachines, newMachine("machine-with-vm-creating", "", nil, "machinedeployment-1", "", "", false, false)) - allMachines = append(allMachines, newMachine("machine-with-vm-create-error-out-of-quota", "", &v1alpha1.MachineStatus{LastOperation: v1alpha1.LastOperation{Type: v1alpha1.MachineOperationCreate, State: v1alpha1.MachineStateFailed, ErrorCode: machinecodes.ResourceExhausted.String(), Description: outOfQuotaMachineStatusErrorDescription}}, "machinedeployment-1", "", "", false, false)) - allMachines = append(allMachines, newMachine("machine-with-vm-create-error-invalid-credentials", "", &v1alpha1.MachineStatus{LastOperation: v1alpha1.LastOperation{Type: v1alpha1.MachineOperationCreate, State: v1alpha1.MachineStateFailed, ErrorCode: machinecodes.Internal.String(), Description: invalidCredentialsMachineStatusErrorDescription}}, "machinedeployment-1", "", "", false, false)) + allMachines = append(allMachines, newMachine("machine-with-registered-node", "fakeID-1", nil, "machinedeployment-1", "", "", true)) + allMachines = append(allMachines, newMachine("machine-with-vm-but-no-node", "fakeID-2", nil, "machinedeployment-1", "", "", false)) + allMachines = append(allMachines, newMachine("machine-with-vm-creating", "", nil, "machinedeployment-1", "", "", false)) + allMachines = append(allMachines, newMachine("machine-with-vm-create-error-out-of-quota", "", &v1alpha1.MachineStatus{LastOperation: v1alpha1.LastOperation{Type: 
v1alpha1.MachineOperationCreate, State: v1alpha1.MachineStateFailed, ErrorCode: machinecodes.ResourceExhausted.String(), Description: outOfQuotaMachineStatusErrorDescription}}, "machinedeployment-1", "", "", false)) + allMachines = append(allMachines, newMachine("machine-with-vm-create-error-invalid-credentials", "", &v1alpha1.MachineStatus{LastOperation: v1alpha1.LastOperation{Type: v1alpha1.MachineOperationCreate, State: v1alpha1.MachineStateFailed, ErrorCode: machinecodes.Internal.String(), Description: invalidCredentialsMachineStatusErrorDescription}}, "machinedeployment-1", "", "", false)) return allMachines }(), machineDeployments: newMachineDeployments(1, 2, nil, nil, nil), diff --git a/cluster-autoscaler/cloudprovider/mcm/mcm_manager.go b/cluster-autoscaler/cloudprovider/mcm/mcm_manager.go index 8d4e3cdce84f..c9cba993775b 100644 --- a/cluster-autoscaler/cloudprovider/mcm/mcm_manager.go +++ b/cluster-autoscaler/cloudprovider/mcm/mcm_manager.go @@ -267,10 +267,9 @@ func createMCMManagerInternal(discoveryOpts cloudprovider.NodeGroupDiscoveryOpti maxRetryTimeout: maxRetryTimeout, retryInterval: retryInterval, } - for _, spec := range discoveryOpts.NodeGroupSpecs { - if err := m.addNodeGroup(spec); err != nil { - return nil, err - } + err = m.generateMachineDeploymentMap() + if err != nil { + return nil, err } targetCoreInformerFactory.Start(m.interrupt) controlMachineInformerFactory.Start(m.interrupt) @@ -294,6 +293,15 @@ func createMCMManagerInternal(discoveryOpts cloudprovider.NodeGroupDiscoveryOpti return nil, fmt.Errorf("Unable to start cloud provider MCM for cluster autoscaler: API GroupVersion %q or %q or %q is not available; \nFound: %#v", machineGVR, machineSetGVR, machineDeploymentGVR, availableResources) } +func (m *McmManager) generateMachineDeploymentMap() error { + for _, spec := range m.discoveryOpts.NodeGroupSpecs { + if err := m.addNodeGroup(spec); err != nil { + return err + } + } + return nil +} + // addNodeGroup adds node group defined in string spec. 
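`generateMachineDeploymentMap` simply replays `addNodeGroup` over the configured specs, which also lets the test fixture build the same map without going through `createMCMManagerInternal` (see `createMcmManager` in test_utils.go below). For reference, a simplified stand-in for what `dynamic.SpecFromString` plus the `namespace.name` split performs, assuming the min:max:name format documented next:

```go
package main

import (
	"fmt"
	"strconv"
	"strings"
)

// parseNodeGroupSpec is an illustrative stand-in, not the real parser,
// handling the "minNodes:maxNodes:namespace.machineDeploymentName" format.
func parseNodeGroupSpec(spec string) (min, max int, namespace, name string, err error) {
	parts := strings.SplitN(spec, ":", 3)
	if len(parts) != 3 {
		return 0, 0, "", "", fmt.Errorf("invalid spec %q", spec)
	}
	if min, err = strconv.Atoi(parts[0]); err != nil {
		return
	}
	if max, err = strconv.Atoi(parts[1]); err != nil {
		return
	}
	qualified := strings.SplitN(parts[2], ".", 2)
	if len(qualified) != 2 {
		return 0, 0, "", "", fmt.Errorf("name %q is not namespace-qualified", parts[2])
	}
	return min, max, qualified[0], qualified[1], nil
}

func main() {
	fmt.Println(parseNodeGroupSpec("1:3:default.machinedeployment-1"))
	// 1 3 default machinedeployment-1 <nil>
}
```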
Format: // minNodes:maxNodes:namespace.machineDeploymentName func (m *McmManager) addNodeGroup(spec string) error { @@ -486,22 +494,23 @@ func (m *McmManager) DeleteMachines(targetMachineRefs []*types.NamespacedName) e return m.scaleDownAndAnnotateMachineDeployment(ctx, commonMachineDeployment.Name, len(machineNamesWithPrio1), createMachinesMarkedForDeletionAnnotationValue(machineNamesMarkedByCA)) }, "MachineDeployment", "update", commonMachineDeployment.Name) if err != nil { - klog.Errorf("unable to scale in machine deployment %s, Error: %v", commonMachineDeployment.Name, err) + klog.Errorf("unable to scale in machine deployment %s, will reset priority of target machines, Error: %v", commonMachineDeployment.Name, err) + return errors.Join(err, m.resetPriorityForMachines(machineNamesWithPrio1)) } - return nil + return err } // resetPriorityForMachines resets the priority of machines passed in the argument to defaultPriorityValue -func (m *McmManager) resetPriorityForMachines(mcRefs []types.NamespacedName) error { +func (m *McmManager) resetPriorityForMachines(mcNames []string) error { var collectiveError []error - for _, mcRef := range mcRefs { - machine, err := m.machineLister.Machines(m.namespace).Get(mcRef.Name) + for _, mcName := range mcNames { + machine, err := m.machineLister.Machines(m.namespace).Get(mcName) if kube_errors.IsNotFound(err) { - klog.Warningf("Machine %s not found, skipping resetting priority annotation", mcRef.Name) + klog.Warningf("Machine %s not found, skipping resetting priority annotation", mcName) continue } if err != nil { - collectiveError = append(collectiveError, fmt.Errorf("unable to get Machine object %s, Error: %v", mcRef, err)) + collectiveError = append(collectiveError, fmt.Errorf("unable to get Machine object %s, Error: %v", mcName, err)) continue } ctx, cancelFn := context.WithDeadline(context.Background(), time.Now().Add(defaultResetAnnotationTimeout)) diff --git a/cluster-autoscaler/cloudprovider/mcm/test_utils.go b/cluster-autoscaler/cloudprovider/mcm/test_utils.go index 3c3f55e4c696..4822f30f1fc3 100644 --- a/cluster-autoscaler/cloudprovider/mcm/test_utils.go +++ b/cluster-autoscaler/cloudprovider/mcm/test_utils.go @@ -7,6 +7,7 @@ package mcm import ( "fmt" appsv1 "k8s.io/api/apps/v1" + types "k8s.io/apimachinery/pkg/types" "k8s.io/utils/pointer" "testing" "time" @@ -24,7 +25,6 @@ import ( metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" customfake "k8s.io/autoscaler/cluster-autoscaler/cloudprovider/mcm/fakeclient" - deletetaint "k8s.io/autoscaler/cluster-autoscaler/utils/taints" appsv1informers "k8s.io/client-go/informers" coreinformers "k8s.io/client-go/informers" ) @@ -42,7 +42,7 @@ func newMachineDeployments( labels map[string]string, ) []*v1alpha1.MachineDeployment { machineDeployments := make([]*v1alpha1.MachineDeployment, machineDeploymentCount) - for i := range machineDeployments { + for i := 0; i < machineDeploymentCount; i++ { machineDeployment := &v1alpha1.MachineDeployment{ TypeMeta: metav1.TypeMeta{ APIVersion: "machine.sapcloud.io", @@ -74,7 +74,7 @@ func newMachineSets( ) []*v1alpha1.MachineSet { machineSets := make([]*v1alpha1.MachineSet, machineSetCount) - for i := range machineSets { + for i := 0; i < machineSetCount; i++ { ms := &v1alpha1.MachineSet{ TypeMeta: metav1.TypeMeta{ APIVersion: "machine.sapcloud.io", @@ -97,10 +97,9 @@ func newMachine( statusTemplate *v1alpha1.MachineStatus, mdName, msName string, priorityAnnotationValue string, - setDeletionTimeStamp, setNodeLabel bool, ) 
*v1alpha1.Machine { - m := newMachines(1, providerId, statusTemplate, mdName, msName, []string{priorityAnnotationValue}, []bool{setDeletionTimeStamp})[0] + m := newMachines(1, providerId, statusTemplate, mdName, msName, []string{priorityAnnotationValue})[0] m.Name = name m.Spec.ProviderID = providerId if !setNodeLabel { @@ -109,26 +108,34 @@ func newMachine( return m } +func generateNames(prefix string, count int) []string { + names := make([]string, count) + for i := 0; i < count; i++ { + names[i] = fmt.Sprintf("%s-%d", prefix, i+1) + } + return names +} + func newMachines( machineCount int, providerIdGenerateName string, statusTemplate *v1alpha1.MachineStatus, mdName, msName string, priorityAnnotationValues []string, - setDeletionTimeStamp []bool, ) []*v1alpha1.Machine { machines := make([]*v1alpha1.Machine, machineCount) - + machineNames := generateNames("machine", machineCount) + nodeNames := generateNames("node", machineCount) currentTime := metav1.Now() - for i := range machines { + for i := 0; i < machineCount; i++ { m := &v1alpha1.Machine{ TypeMeta: metav1.TypeMeta{ APIVersion: "machine.sapcloud.io", Kind: "Machine", }, ObjectMeta: metav1.ObjectMeta{ - Name: fmt.Sprintf("machine-%d", i+1), + Name: machineNames[i], Namespace: testNamespace, OwnerReferences: []metav1.OwnerReference{ {Name: msName}, @@ -143,12 +150,12 @@ func newMachines( m.Spec = v1alpha1.MachineSpec{ProviderID: fmt.Sprintf("%s/i%d", providerIdGenerateName, i+1)} } - m.Labels["node"] = fmt.Sprintf("node-%d", i+1) - if setDeletionTimeStamp[i] { - m.ObjectMeta.DeletionTimestamp = ¤tTime - } + m.Labels["node"] = nodeNames[i] if statusTemplate != nil { m.Status = *newMachineStatus(statusTemplate) + if m.Status.CurrentStatus.Phase == v1alpha1.MachineTerminating { + m.DeletionTimestamp = ¤tTime + } } machines[i] = m } @@ -158,9 +165,8 @@ func newMachines( func newNode( nodeName, providerId string, - addToBeDeletedTaint bool, ) *corev1.Node { - node := newNodes(1, providerId, []bool{addToBeDeletedTaint})[0] + node := newNodes(1, providerId)[0] clone := node.DeepCopy() clone.Name = nodeName clone.Spec.ProviderID = providerId @@ -170,30 +176,20 @@ func newNode( func newNodes( nodeCount int, providerIdGenerateName string, - addToBeDeletedTaint []bool, ) []*corev1.Node { - nodes := make([]*corev1.Node, nodeCount) - for i := range nodes { - var taints []corev1.Taint - if addToBeDeletedTaint[i] { - taints = append(taints, corev1.Taint{ - Key: deletetaint.ToBeDeletedTaint, - Value: testTaintValue, - Effect: corev1.TaintEffectNoSchedule, - }) - } + nodeNames := generateNames("node", nodeCount) + for i := 0; i < nodeCount; i++ { node := &corev1.Node{ TypeMeta: metav1.TypeMeta{ APIVersion: "appsv1", Kind: "Node", }, ObjectMeta: metav1.ObjectMeta{ - Name: fmt.Sprintf("node-%d", i+1), + Name: nodeNames[i], }, Spec: corev1.NodeSpec{ ProviderID: fmt.Sprintf("%s/i%d", providerIdGenerateName, i+1), - Taints: taints, }, } @@ -287,6 +283,7 @@ func createMcmManager( discoveryOpts: cloudprovider.NodeGroupDiscoveryOptions{ NodeGroupSpecs: nodeGroups, }, + machineDeployments: make(map[types.NamespacedName]*MachineDeployment), deploymentLister: appsControlSharedInformers.Deployments().Lister(), machineClient: fakeTypedMachineClient, machineDeploymentLister: machineDeployments.Lister(), @@ -297,7 +294,7 @@ func createMcmManager( maxRetryTimeout: 5 * time.Second, retryInterval: 1 * time.Second, } - + g.Expect(mcmManager.generateMachineDeploymentMap()).To(gomega.Succeed()) hasSyncedCachesFns := []cache.InformerSynced{ nodes.Informer().HasSynced, 
machines.Informer().HasSynced, From b2e1c3cd6d51799a969e176e060c0bb75db01eb4 Mon Sep 17 00:00:00 2001 From: elankath Date: Thu, 16 Jan 2025 08:10:01 +0530 Subject: [PATCH 08/27] review comments addressed --- .../cloudprovider/mcm/mcm_cloud_provider.go | 154 +++++++++--------- .../mcm/mcm_cloud_provider_test.go | 18 +- .../cloudprovider/mcm/mcm_manager.go | 108 ++++++------ .../cloudprovider/mcm/test_utils.go | 4 +- 4 files changed, 141 insertions(+), 143 deletions(-) diff --git a/cluster-autoscaler/cloudprovider/mcm/mcm_cloud_provider.go b/cluster-autoscaler/cloudprovider/mcm/mcm_cloud_provider.go index bf6cda949dcc..6ea6a3660787 100644 --- a/cluster-autoscaler/cloudprovider/mcm/mcm_cloud_provider.go +++ b/cluster-autoscaler/cloudprovider/mcm/mcm_cloud_provider.go @@ -106,7 +106,7 @@ func buildStaticallyDiscoveringProvider(mcmManager *McmManager, resourceLimiter return mcm, nil } -// Cleanup stops the go routine that is handling the current view of the MachineDeployment in the form of a cache +// Cleanup stops the go routine that is handling the current view of the NodeGroupImpl in the form of a cache func (mcm *mcmCloudProvider) Cleanup() error { mcm.mcmManager.Cleanup() return nil @@ -118,8 +118,8 @@ func (mcm *mcmCloudProvider) Name() string { // NodeGroups returns all node groups configured for this cloud provider. func (mcm *mcmCloudProvider) NodeGroups() []cloudprovider.NodeGroup { - result := make([]cloudprovider.NodeGroup, 0, len(mcm.mcmManager.machineDeployments)) - for _, machinedeployment := range mcm.mcmManager.machineDeployments { + result := make([]cloudprovider.NodeGroup, 0, len(mcm.mcmManager.nodeGroups)) + for _, machinedeployment := range mcm.mcmManager.nodeGroups { if machinedeployment.maxSize == 0 { continue } @@ -145,13 +145,13 @@ func (mcm *mcmCloudProvider) NodeGroupForNode(node *apiv1.Node) (cloudprovider.N return nil, nil } - md, err := mcm.mcmManager.GetMachineDeploymentForMachine(ref) + md, err := mcm.mcmManager.GetNodeGroupImpl(ref) if err != nil { return nil, err } key := types.NamespacedName{Namespace: md.Namespace, Name: md.Name} - _, isManaged := mcm.mcmManager.machineDeployments[key] + _, isManaged := mcm.mcmManager.nodeGroups[key] if !isManaged { klog.V(4).Infof("Skipped node %v, it's not managed by this controller", node.Spec.ProviderID) return nil, nil @@ -260,8 +260,8 @@ func ReferenceFromProviderID(m *McmManager, id string) (*types.NamespacedName, e }, nil } -// MachineDeployment implements NodeGroup interface. -type MachineDeployment struct { +// NodeGroupImpl implements NodeGroup interface. +type NodeGroupImpl struct { types.NamespacedName mcmManager *McmManager @@ -272,64 +272,64 @@ type MachineDeployment struct { } // MaxSize returns maximum size of the node group. -func (machineDeployment *MachineDeployment) MaxSize() int { - return machineDeployment.maxSize +func (ngImpl *NodeGroupImpl) MaxSize() int { + return ngImpl.maxSize } // MinSize returns minimum size of the node group. -func (machineDeployment *MachineDeployment) MinSize() int { - return machineDeployment.minSize +func (ngImpl *NodeGroupImpl) MinSize() int { + return ngImpl.minSize } // TargetSize returns the current TARGET size of the node group. It is possible that the // number is different from the number of nodes registered in Kubernetes. 
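With the rename, `NodeGroupImpl` is explicitly the provider-side implementation of the `cloudprovider.NodeGroup` interface, with `types.NamespacedName` embedded to supply `Name` and `Namespace`. The patch does not add one, but the usual Go idiom for letting the compiler enforce such a contract is a blank-identifier assertion, shown here against a stand-in interface so the sketch stays self-contained:

```go
package main

import "fmt"

// Sizer is a stand-in for the cloudprovider.NodeGroup contract.
type Sizer interface {
	MinSize() int
	MaxSize() int
}

type NodeGroupImpl struct {
	minSize, maxSize int
}

func (n *NodeGroupImpl) MinSize() int { return n.minSize }
func (n *NodeGroupImpl) MaxSize() int { return n.maxSize }

// Compile-time assertion: the build fails if *NodeGroupImpl stops satisfying Sizer.
var _ Sizer = (*NodeGroupImpl)(nil)

func main() {
	fmt.Println((&NodeGroupImpl{1, 3}).MaxSize()) // 3
}
```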
-func (machineDeployment *MachineDeployment) TargetSize() (int, error) { - size, err := machineDeployment.mcmManager.GetMachineDeploymentSize(machineDeployment) +func (ngImpl *NodeGroupImpl) TargetSize() (int, error) { + size, err := ngImpl.mcmManager.GetMachineDeploymentSize(ngImpl.Name) return int(size), err } // Exist checks if the node group really exists on the cloud provider side. Allows to tell the // theoretical node group from the real one. // TODO: Implement this to check if machine-deployment really exists. -func (machineDeployment *MachineDeployment) Exist() bool { +func (ngImpl *NodeGroupImpl) Exist() bool { return true } // Create creates the node group on the cloud provider side. -func (machineDeployment *MachineDeployment) Create() (cloudprovider.NodeGroup, error) { +func (ngImpl *NodeGroupImpl) Create() (cloudprovider.NodeGroup, error) { return nil, cloudprovider.ErrAlreadyExist } // Autoprovisioned returns true if the node group is autoprovisioned. -func (machineDeployment *MachineDeployment) Autoprovisioned() bool { +func (ngImpl *NodeGroupImpl) Autoprovisioned() bool { return false } // Delete deletes the node group on the cloud provider side. // This will be executed only for autoprovisioned node groups, once their size drops to 0. -func (machineDeployment *MachineDeployment) Delete() error { +func (ngImpl *NodeGroupImpl) Delete() error { return cloudprovider.ErrNotImplemented } // IncreaseSize of the Machinedeployment. -func (machineDeployment *MachineDeployment) IncreaseSize(delta int) error { - klog.V(0).Infof("Received request to increase size of machine deployment %s by %d", machineDeployment.Name, delta) +func (ngImpl *NodeGroupImpl) IncreaseSize(delta int) error { + klog.V(0).Infof("Received request to increase size of machine deployment %s by %d", ngImpl.Name, delta) if delta <= 0 { return fmt.Errorf("size increase must be positive") } - machineDeployment.scalingMutex.Lock() - defer machineDeployment.scalingMutex.Unlock() - size, err := machineDeployment.mcmManager.GetMachineDeploymentSize(machineDeployment) + ngImpl.scalingMutex.Lock() + defer ngImpl.scalingMutex.Unlock() + size, err := ngImpl.mcmManager.GetMachineDeploymentSize(ngImpl.Name) if err != nil { return err } targetSize := int(size) + delta - if targetSize > machineDeployment.MaxSize() { - return fmt.Errorf("size increase too large - desired:%d max:%d", targetSize, machineDeployment.MaxSize()) + if targetSize > ngImpl.MaxSize() { + return fmt.Errorf("size increase too large - desired:%d max:%d", targetSize, ngImpl.MaxSize()) } - return machineDeployment.mcmManager.retry(func(ctx context.Context) (bool, error) { - return machineDeployment.mcmManager.SetMachineDeploymentSize(ctx, machineDeployment, int64(targetSize)) - }, "MachineDeployment", "update", machineDeployment.Name) + return ngImpl.mcmManager.retry(func(ctx context.Context) (bool, error) { + return ngImpl.mcmManager.SetMachineDeploymentSize(ctx, ngImpl, int64(targetSize)) + }, "MachineDeployment", "update", ngImpl.Name) } // DecreaseTargetSize decreases the target size of the node group. This function @@ -337,39 +337,41 @@ func (machineDeployment *MachineDeployment) IncreaseSize(delta int) error { // request for new nodes that have not been yet fulfilled. Delta should be negative. // It is assumed that cloud provider will not delete the existing nodes if the size // when there is an option to just decrease the target. 
-func (machineDeployment *MachineDeployment) DecreaseTargetSize(delta int) error { - klog.V(0).Infof("Received request to decrease target size of machine deployment %s by %d", machineDeployment.Name, delta) +func (ngImpl *NodeGroupImpl) DecreaseTargetSize(delta int) error { + klog.V(0).Infof("Received request to decrease target size of machine deployment %s by %d", ngImpl.Name, delta) if delta >= 0 { return fmt.Errorf("size decrease size must be negative") } - machineDeployment.scalingMutex.Lock() - defer machineDeployment.scalingMutex.Unlock() - size, err := machineDeployment.mcmManager.GetMachineDeploymentSize(machineDeployment) + ngImpl.scalingMutex.Lock() + defer ngImpl.scalingMutex.Unlock() + size, err := ngImpl.mcmManager.GetMachineDeploymentSize(ngImpl.Name) if err != nil { return err } decreaseAmount := int(size) + delta - if decreaseAmount < machineDeployment.minSize { - klog.Warningf("Cannot go below min size= %d for machineDeployment %s, requested target size= %d . Setting target size to min size", machineDeployment.minSize, machineDeployment.Name, size+int64(delta)) - decreaseAmount = machineDeployment.minSize + if decreaseAmount < ngImpl.minSize { + klog.Warningf("Cannot go below min size= %d for ngImpl %s, requested target size= %d . Setting target size to min size", ngImpl.minSize, ngImpl.Name, size+int64(delta)) + decreaseAmount = ngImpl.minSize } - return machineDeployment.mcmManager.retry(func(ctx context.Context) (bool, error) { - return machineDeployment.mcmManager.SetMachineDeploymentSize(ctx, machineDeployment, int64(decreaseAmount)) - }, "MachineDeployment", "update", machineDeployment.Name) + return ngImpl.mcmManager.retry(func(ctx context.Context) (bool, error) { + return ngImpl.mcmManager.SetMachineDeploymentSize(ctx, ngImpl, int64(decreaseAmount)) + }, "MachineDeployment", "update", ngImpl.Name) } // Refresh resets the priority annotation for the machines that are not present in machines-marked-by-ca-for-deletion annotation on the machineDeployment -func (machineDeployment *MachineDeployment) Refresh() error { - machineDeployment.scalingMutex.Lock() - defer machineDeployment.scalingMutex.Unlock() - mcd, err := machineDeployment.mcmManager.GetMachineDeploymentResource(machineDeployment.Name) +func (ngImpl *NodeGroupImpl) Refresh() error { + if !ngImpl.scalingMutex.TryLock() { + return fmt.Errorf("cannot Refresh() since scalingMutex currently acquired for %q", ngImpl.Name) + } + defer ngImpl.scalingMutex.Unlock() + mcd, err := ngImpl.mcmManager.GetMachineDeploymentObject(ngImpl.Name) if err != nil { return err } markedMachineNames := getMachineNamesMarkedByCAForDeletion(mcd) - machines, err := machineDeployment.mcmManager.getMachinesForMachineDeployment(machineDeployment.Name) + machines, err := ngImpl.mcmManager.getMachinesForMachineDeployment(ngImpl.Name) if err != nil { - klog.Errorf("[Refresh] failed to get machines for machine deployment %s, hence skipping it. Err: %v", machineDeployment.Name, err.Error()) + klog.Errorf("[Refresh] failed to get machines for machine deployment %s, hence skipping it. Err: %v", ngImpl.Name, err.Error()) return err } // update the machines-marked-by-ca-for-deletion annotation with the machines that are still marked for deletion by CA. 
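// Refresh takes the scalingMutex with TryLock so it backs off (to be retried
// on the next refresh loop) rather than block behind an in-flight scale
// operation. The annotation update elided from this hunk rebuilds the
// marked-names list from live Machine objects, dropping entries whose
// machines no longer exist, before writing the value back. In the shape the
// implementation uses:
//
//	var updatedMarkedMachineNames []string
//	for _, machineName := range markedMachineNames {
//		if slices.ContainsFunc(machines, func(mc *v1alpha1.Machine) bool {
//			return mc.Name == machineName
//		}) {
//			updatedMarkedMachineNames = append(updatedMarkedMachineNames, machineName)
//		}
//	}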
@@ -387,11 +389,11 @@ func (machineDeployment *MachineDeployment) Refresh() error { clone.Annotations = map[string]string{} } updatedMachinesMarkedByCAForDeletionAnnotationVal := createMachinesMarkedForDeletionAnnotationValue(updatedMarkedMachineNames) - if clone.Annotations[machinesMarkedByCAForDeletion] != updatedMachinesMarkedByCAForDeletionAnnotationVal { - clone.Annotations[machinesMarkedByCAForDeletion] = updatedMachinesMarkedByCAForDeletionAnnotationVal - ctx, cancelFn := context.WithTimeout(context.Background(), machineDeployment.mcmManager.maxRetryTimeout) + if clone.Annotations[machinesMarkedByCAForDeletionAnnotation] != updatedMachinesMarkedByCAForDeletionAnnotationVal { + clone.Annotations[machinesMarkedByCAForDeletionAnnotation] = updatedMachinesMarkedByCAForDeletionAnnotationVal + ctx, cancelFn := context.WithTimeout(context.Background(), ngImpl.mcmManager.maxRetryTimeout) defer cancelFn() - _, err = machineDeployment.mcmManager.machineClient.MachineDeployments(machineDeployment.Namespace).Update(ctx, clone, metav1.UpdateOptions{}) + _, err = ngImpl.mcmManager.machineClient.MachineDeployments(ngImpl.Namespace).Update(ctx, clone, metav1.UpdateOptions{}) if err != nil { return err } @@ -409,24 +411,24 @@ func (machineDeployment *MachineDeployment) Refresh() error { incorrectlyMarkedMachines = append(incorrectlyMarkedMachines, machine.Name) } } - return machineDeployment.mcmManager.resetPriorityForMachines(incorrectlyMarkedMachines) + return ngImpl.mcmManager.resetPriorityForMachines(incorrectlyMarkedMachines) } // Belongs returns true if the given node belongs to the NodeGroup. // TODO: Implement this to iterate over machines under machinedeployment, and return true if node exists in list. -func (machineDeployment *MachineDeployment) Belongs(node *apiv1.Node) (bool, error) { - ref, err := ReferenceFromProviderID(machineDeployment.mcmManager, node.Spec.ProviderID) +func (ngImpl *NodeGroupImpl) Belongs(node *apiv1.Node) (bool, error) { + ref, err := ReferenceFromProviderID(ngImpl.mcmManager, node.Spec.ProviderID) if err != nil { return false, err } - targetMd, err := machineDeployment.mcmManager.GetMachineDeploymentForMachine(ref) + targetMd, err := ngImpl.mcmManager.GetNodeGroupImpl(ref) if err != nil { return false, err } if targetMd == nil { return false, fmt.Errorf("%s doesn't belong to a known MachinDeployment", node.Name) } - if targetMd.Id() != machineDeployment.Id() { + if targetMd.Id() != ngImpl.Id() { return false, nil } return true, nil @@ -434,31 +436,31 @@ func (machineDeployment *MachineDeployment) Belongs(node *apiv1.Node) (bool, err // DeleteNodes deletes the nodes from the group. It is expected that this method will not be called // for nodes which are not part of ANY machine deployment. 
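// DeleteNodes (below) drives MCM's priority-based scale-in: each victim
// Machine has its machinepriority.machine.sapcloud.io annotation set to "1"
// (updateAnnotationOnMachine in mcm_manager.go performs the write) and the
// MachineDeployment replicas are reduced by the same amount, so MCM removes
// exactly the marked machines first. A sketch of one marking update:
//
//	clone := machine.DeepCopy()
//	if clone.Annotations == nil {
//		clone.Annotations = map[string]string{}
//	}
//	clone.Annotations[machinePriorityAnnotation] = priorityValueForDeletionCandidateMachines // "1"
//	_, err := m.machineClient.Machines(clone.Namespace).Update(ctx, clone, metav1.UpdateOptions{})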
-func (machineDeployment *MachineDeployment) DeleteNodes(nodes []*apiv1.Node) error { +func (ngImpl *NodeGroupImpl) DeleteNodes(nodes []*apiv1.Node) error { nodeNames := getNodeNames(nodes) klog.V(0).Infof("Received request to delete nodes:- %v", nodeNames) - size, err := machineDeployment.mcmManager.GetMachineDeploymentSize(machineDeployment) + size, err := ngImpl.mcmManager.GetMachineDeploymentSize(ngImpl.Name) if err != nil { return err } - if int(size) <= machineDeployment.MinSize() { + if int(size) <= ngImpl.MinSize() { return fmt.Errorf("min size reached, nodes will not be deleted") } machines := make([]*types.NamespacedName, 0, len(nodes)) for _, node := range nodes { - belongs, err := machineDeployment.Belongs(node) + belongs, err := ngImpl.Belongs(node) if err != nil { return err } else if !belongs { - return fmt.Errorf("%s belongs to a different machinedeployment than %s", node.Name, machineDeployment.Id()) + return fmt.Errorf("%s belongs to a different machinedeployment than %s", node.Name, ngImpl.Id()) } - ref, err := ReferenceFromProviderID(machineDeployment.mcmManager, node.Spec.ProviderID) + ref, err := ReferenceFromProviderID(ngImpl.mcmManager, node.Spec.ProviderID) if err != nil { return fmt.Errorf("couldn't find the machine-name from provider-id %s", node.Spec.ProviderID) } machines = append(machines, ref) } - return machineDeployment.mcmManager.DeleteMachines(machines) + return ngImpl.mcmManager.DeleteMachines(machines) } func getNodeNames(nodes []*apiv1.Node) interface{} { @@ -470,20 +472,20 @@ func getNodeNames(nodes []*apiv1.Node) interface{} { } // Id returns machinedeployment id. -func (machineDeployment *MachineDeployment) Id() string { - return machineDeployment.Name +func (ngImpl *NodeGroupImpl) Id() string { + return ngImpl.Name } // Debug returns a debug string for the Asg. -func (machineDeployment *MachineDeployment) Debug() string { - return fmt.Sprintf("%s (%d:%d)", machineDeployment.Id(), machineDeployment.MinSize(), machineDeployment.MaxSize()) +func (ngImpl *NodeGroupImpl) Debug() string { + return fmt.Sprintf("%s (%d:%d)", ngImpl.Id(), ngImpl.MinSize(), ngImpl.MaxSize()) } // Nodes returns a list of all nodes that belong to this node group. -func (machineDeployment *MachineDeployment) Nodes() ([]cloudprovider.Instance, error) { - instances, err := machineDeployment.mcmManager.GetInstancesForMachineDeployment(machineDeployment) +func (ngImpl *NodeGroupImpl) Nodes() ([]cloudprovider.Instance, error) { + instances, err := ngImpl.mcmManager.GetInstancesForMachineDeployment(ngImpl.Name) if err != nil { - return nil, fmt.Errorf("failed to get the cloudprovider.Instance for machines backed by the machinedeployment %q, error: %v", machineDeployment.Name, err) + return nil, fmt.Errorf("failed to get the cloudprovider.Instance for machines backed by the machinedeployment %q, error: %v", ngImpl.Name, err) } erroneousInstanceInfos := make([]string, 0, len(instances)) for _, instance := range instances { @@ -500,9 +502,9 @@ func (machineDeployment *MachineDeployment) Nodes() ([]cloudprovider.Instance, e // GetOptions returns NodeGroupAutoscalingOptions that should be used for this particular // NodeGroup. Returning a nil will result in using default options. // Implementation optional. 
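// GetOptions (below) overlays per-MachineDeployment annotations onto the
// autoscaler-wide defaults. The shape of one such override; the annotation
// key here is illustrative only, not necessarily one this provider defines:
//
//	options := defaults
//	if v, ok := mcdAnnotations["autoscaler.gardener.cloud/scale-down-utilization-threshold"]; ok {
//		if f, err := strconv.ParseFloat(v, 64); err == nil {
//			options.ScaleDownUtilizationThreshold = f
//		}
//	}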
-func (machineDeployment *MachineDeployment) GetOptions(defaults config.NodeGroupAutoscalingOptions) (*config.NodeGroupAutoscalingOptions, error) { +func (ngImpl *NodeGroupImpl) GetOptions(defaults config.NodeGroupAutoscalingOptions) (*config.NodeGroupAutoscalingOptions, error) { options := defaults - mcdAnnotations, err := machineDeployment.mcmManager.GetMachineDeploymentAnnotations(machineDeployment.Name) + mcdAnnotations, err := ngImpl.mcmManager.GetMachineDeploymentAnnotations(ngImpl.Name) if err != nil { return nil, err } @@ -536,32 +538,32 @@ func (machineDeployment *MachineDeployment) GetOptions(defaults config.NodeGroup } // TemplateNodeInfo returns a node template for this node group. -func (machineDeployment *MachineDeployment) TemplateNodeInfo() (*schedulerframework.NodeInfo, error) { +func (ngImpl *NodeGroupImpl) TemplateNodeInfo() (*schedulerframework.NodeInfo, error) { - nodeTemplate, err := machineDeployment.mcmManager.GetMachineDeploymentNodeTemplate(machineDeployment) + nodeTemplate, err := ngImpl.mcmManager.GetMachineDeploymentNodeTemplate(ngImpl.Name) if err != nil { return nil, err } - node, err := machineDeployment.mcmManager.buildNodeFromTemplate(machineDeployment.Name, nodeTemplate) + node, err := ngImpl.mcmManager.buildNodeFromTemplate(ngImpl.Name, nodeTemplate) if err != nil { return nil, err } - nodeInfo := schedulerframework.NewNodeInfo(cloudprovider.BuildKubeProxy(machineDeployment.Name)) + nodeInfo := schedulerframework.NewNodeInfo(cloudprovider.BuildKubeProxy(ngImpl.Name)) nodeInfo.SetNode(node) return nodeInfo, nil } // AtomicIncreaseSize is not implemented. -func (machineDeployment *MachineDeployment) AtomicIncreaseSize(delta int) error { +func (ngImpl *NodeGroupImpl) AtomicIncreaseSize(delta int) error { return cloudprovider.ErrNotImplemented } // getMachineNamesMarkedByCAForDeletion returns the set of machine names marked by CA for deletion. 
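// The annotation value is a flat comma-separated list of Machine names, so
// reading and writing it is a Split/Join round-trip (the writer,
// createMachinesMarkedForDeletionAnnotationValue, presumably joins with ","):
//
//	names := getMachineNamesMarkedByCAForDeletion(md) // "m-1,m-2" -> ["m-1" "m-2"]
//	value := createMachinesMarkedForDeletionAnnotationValue(names)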
func getMachineNamesMarkedByCAForDeletion(mcd *v1alpha1.MachineDeployment) []string { - if mcd.Annotations == nil || mcd.Annotations[machinesMarkedByCAForDeletion] == "" { + if mcd.Annotations == nil || mcd.Annotations[machinesMarkedByCAForDeletionAnnotation] == "" { return make([]string, 0) } - return strings.Split(mcd.Annotations[machinesMarkedByCAForDeletion], ",") + return strings.Split(mcd.Annotations[machinesMarkedByCAForDeletionAnnotation], ",") } diff --git a/cluster-autoscaler/cloudprovider/mcm/mcm_cloud_provider_test.go b/cluster-autoscaler/cloudprovider/mcm/mcm_cloud_provider_test.go index 995f51834f95..c8a4e584544a 100644 --- a/cluster-autoscaler/cloudprovider/mcm/mcm_cloud_provider_test.go +++ b/cluster-autoscaler/cloudprovider/mcm/mcm_cloud_provider_test.go @@ -153,7 +153,7 @@ func TestDeleteNodes(t *testing.T) { prio1Machines: nil, mdName: "machinedeployment-1", mdReplicas: 2, - err: fmt.Errorf("MachineDeployment machinedeployment-1 is under rolling update , cannot reduce replica count"), + err: fmt.Errorf("NodeGroupImpl machinedeployment-1 is under rolling update , cannot reduce replica count"), }, }, { @@ -309,7 +309,7 @@ func TestDeleteNodes(t *testing.T) { trackers.ControlMachine.SetFailAtFakeResourceActions(entry.setup.controlMachineFakeResourceActions) } - md, err := buildMachineDeploymentFromSpec(entry.setup.nodeGroups[0], m) + md, err := buildNodeGroupImplFromSpec(entry.setup.nodeGroups[0], m) g.Expect(err).To(BeNil()) err = md.DeleteNodes([]*corev1.Node{entry.action.node}) @@ -323,7 +323,7 @@ func TestDeleteNodes(t *testing.T) { machineDeployment, err := m.machineClient.MachineDeployments(m.namespace).Get(context.TODO(), entry.expect.mdName, metav1.GetOptions{}) g.Expect(err).ToNot(HaveOccurred()) g.Expect(machineDeployment.Spec.Replicas).To(BeNumerically("==", entry.expect.mdReplicas)) - g.Expect(machineDeployment.Annotations[machinesMarkedByCAForDeletion]).To(Equal(entry.expect.machinesMarkedByCAAnnotationValue)) + g.Expect(machineDeployment.Annotations[machinesMarkedByCAForDeletionAnnotation]).To(Equal(entry.expect.machinesMarkedByCAAnnotationValue)) machines, err := m.machineClient.Machines(m.namespace).List(context.TODO(), metav1.ListOptions{ LabelSelector: metav1.FormatLabelSelector(&metav1.LabelSelector{ @@ -359,7 +359,7 @@ func TestIdempotencyOfDeleteNodes(t *testing.T) { m, trackers, hasSyncedCacheFns := createMcmManager(t, stop, testNamespace, setupObj.nodeGroups, controlMachineObjects, targetCoreObjects, nil) defer trackers.Stop() waitForCacheSync(t, stop, hasSyncedCacheFns) - md, err := buildMachineDeploymentFromSpec(setupObj.nodeGroups[0], m) + md, err := buildNodeGroupImplFromSpec(setupObj.nodeGroups[0], m) g.Expect(err).To(BeNil()) err = md.DeleteNodes(newNodes(1, "fakeID")) @@ -370,7 +370,7 @@ func TestIdempotencyOfDeleteNodes(t *testing.T) { machineDeployment, err := m.machineClient.MachineDeployments(m.namespace).Get(context.TODO(), setupObj.machineDeployments[0].Name, metav1.GetOptions{}) g.Expect(err).ToNot(HaveOccurred()) g.Expect(machineDeployment.Spec.Replicas).To(BeNumerically("==", 2)) - g.Expect(machineDeployment.Annotations[machinesMarkedByCAForDeletion]).To(Equal(createMachinesMarkedForDeletionAnnotationValue(generateNames("machine", 1)))) + g.Expect(machineDeployment.Annotations[machinesMarkedByCAForDeletionAnnotation]).To(Equal(createMachinesMarkedForDeletionAnnotationValue(generateNames("machine", 1)))) } func TestRefresh(t *testing.T) { @@ -429,7 +429,7 @@ func TestRefresh(t *testing.T) { setup{ nodes: newNodes(1, "fakeID"), machines: 
newMachines(1, "fakeID", nil, "machinedeployment-1", "machineset-1", []string{"1"}), - machineDeployments: newMachineDeployments(1, 0, nil, map[string]string{machinesMarkedByCAForDeletion: "machine-1,machine-2"}, nil), + machineDeployments: newMachineDeployments(1, 0, nil, map[string]string{machinesMarkedByCAForDeletionAnnotation: "machine-1,machine-2"}, nil), nodeGroups: []string{nodeGroup2}, mcmDeployment: newMCMDeployment(1), }, @@ -581,7 +581,7 @@ func TestNodes(t *testing.T) { trackers.ControlMachine.SetFailAtFakeResourceActions(entry.setup.controlMachineFakeResourceActions) } - md, err := buildMachineDeploymentFromSpec(entry.setup.nodeGroups[0], m) + md, err := buildNodeGroupImplFromSpec(entry.setup.nodeGroups[0], m) g.Expect(err).To(BeNil()) returnedInstances, err := md.Nodes() @@ -645,7 +645,7 @@ func TestGetOptions(t *testing.T) { nodeGroups: []string{nodeGroup1}, }, expect{ - err: fmt.Errorf("unable to fetch MachineDeployment object machinedeployment-1, Error: machinedeployment.machine.sapcloud.io \"machinedeployment-1\" not found"), + err: fmt.Errorf("unable to fetch NodeGroupImpl object machinedeployment-1, Error: machinedeployment.machine.sapcloud.io \"machinedeployment-1\" not found"), }, }, { @@ -733,7 +733,7 @@ func TestGetOptions(t *testing.T) { defer trackers.Stop() waitForCacheSync(t, stop, hasSyncedCacheFns) - md, err := buildMachineDeploymentFromSpec(entry.setup.nodeGroups[0], m) + md, err := buildNodeGroupImplFromSpec(entry.setup.nodeGroups[0], m) g.Expect(err).To(BeNil()) options, err := md.GetOptions(ngAutoScalingOpDefaults) diff --git a/cluster-autoscaler/cloudprovider/mcm/mcm_manager.go b/cluster-autoscaler/cloudprovider/mcm/mcm_manager.go index c9cba993775b..17e8845ed598 100644 --- a/cluster-autoscaler/cloudprovider/mcm/mcm_manager.go +++ b/cluster-autoscaler/cloudprovider/mcm/mcm_manager.go @@ -100,9 +100,9 @@ const ( machineDeploymentPausedReason = "DeploymentPaused" // machineDeploymentNameLabel key for Machine Deployment name in machine labels machineDeploymentNameLabel = "name" - // machinesMarkedByCAForDeletion is the annotation set by CA on machine deployment. Its value denotes the machines that + // machinesMarkedByCAForDeletionAnnotation is the annotation set by CA on machine deployment. Its value denotes the machines that // CA marked for deletion by updating the priority annotation to 1 and scaling down the machine deployment. 
- machinesMarkedByCAForDeletion = "cluster-autoscaler.kubernetes.io/machines-marked-by-ca-for-deletion" + machinesMarkedByCAForDeletionAnnotation = "cluster-autoscaler.kubernetes.io/machines-marked-by-ca-for-deletion" ) var ( @@ -129,7 +129,7 @@ type McmManager struct { namespace string interrupt chan struct{} discoveryOpts cloudprovider.NodeGroupDiscoveryOptions - machineDeployments map[types.NamespacedName]*MachineDeployment + nodeGroups map[types.NamespacedName]*NodeGroupImpl deploymentLister v1appslister.DeploymentLister machineClient machineapi.MachineV1alpha1Interface machineDeploymentLister machinelisters.MachineDeploymentLister @@ -255,7 +255,7 @@ func createMCMManagerInternal(discoveryOpts cloudprovider.NodeGroupDiscoveryOpti m := &McmManager{ namespace: namespace, interrupt: make(chan struct{}), - machineDeployments: make(map[types.NamespacedName]*MachineDeployment), + nodeGroups: make(map[types.NamespacedName]*NodeGroupImpl), deploymentLister: deploymentLister, machineClient: controlMachineClient, machineClassLister: machineClassLister, @@ -305,20 +305,15 @@ func (m *McmManager) generateMachineDeploymentMap() error { // addNodeGroup adds node group defined in string spec. Format: // minNodes:maxNodes:namespace.machineDeploymentName func (m *McmManager) addNodeGroup(spec string) error { - machineDeployment, err := buildMachineDeploymentFromSpec(spec, m) + nodeGroup, err := buildNodeGroupImplFromSpec(spec, m) if err != nil { return err } - m.addMachineDeployment(machineDeployment) + key := types.NamespacedName{Namespace: nodeGroup.Namespace, Name: nodeGroup.Name} + m.nodeGroups[key] = nodeGroup return nil } -func (m *McmManager) addMachineDeployment(machineDeployment *MachineDeployment) { - key := types.NamespacedName{Namespace: machineDeployment.Namespace, Name: machineDeployment.Name} - m.machineDeployments[key] = machineDeployment - return -} - // TODO: In general, any controller checking this needs to be dynamic so // users don't have to restart their controller manager if they change the apiserver. // Until we get there, the structure here needs to be exposed for the construction of a proper ControllerContext. @@ -378,11 +373,11 @@ func CreateMcmManager(discoveryOpts cloudprovider.NodeGroupDiscoveryOptions) (*M return createMCMManagerInternal(discoveryOpts, defaultRetryInterval, defaultMaxRetryTimeout) } -// GetMachineDeploymentForMachine returns the MachineDeployment for the Machine object. -func (m *McmManager) GetMachineDeploymentForMachine(machine *types.NamespacedName) (*MachineDeployment, error) { +// GetNodeGroupImpl returns the NodeGroupImpl for the given fully-qualified machine name. +func (m *McmManager) GetNodeGroupImpl(machine *types.NamespacedName) (*NodeGroupImpl, error) { if machine.Name == "" { // Considering the possibility when Machine has been deleted but due to cached Node object it appears here. 
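// The lookup below resolves ownership upward through two cached listers,
// Machine -> MachineSet -> MachineDeployment, then indexes the node-group
// map by the deployment's name. Roughly (error handling elided, owner
// extraction simplified to the first owner reference):
//
//	mc, _ := m.machineLister.Machines(m.namespace).Get(machine.Name)
//	msName := mc.OwnerReferences[0].Name // owning MachineSet
//	ms, _ := m.machineSetLister.MachineSets(m.namespace).Get(msName)
//	mdName := ms.OwnerReferences[0].Name // owning MachineDeployment
//	ng, ok := m.nodeGroups[types.NamespacedName{Namespace: m.namespace, Name: mdName}]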
- return nil, fmt.Errorf("Node does not Exists") + return nil, fmt.Errorf("node does not Exists") } machineObject, err := m.machineLister.Machines(m.namespace).Get(machine.Name) @@ -413,7 +408,7 @@ func (m *McmManager) GetMachineDeploymentForMachine(machine *types.NamespacedNam return nil, fmt.Errorf("unable to find parent MachineDeployment of given MachineSet object %s %v", machineSetName, err) } - machineDeployment, ok := m.machineDeployments[types.NamespacedName{Namespace: m.namespace, Name: machineDeploymentName}] + machineDeployment, ok := m.nodeGroups[types.NamespacedName{Namespace: m.namespace, Name: machineDeploymentName}] if !ok { return nil, fmt.Errorf("machineDeployment %s not found in the list of machine deployments", machineDeploymentName) } @@ -424,8 +419,8 @@ func (m *McmManager) GetMachineDeploymentForMachine(machine *types.NamespacedNam // It will select the machines to reset the priority based on the descending order of creation timestamp. func (m *McmManager) Refresh() error { var collectiveError []error - for _, machineDeployment := range m.machineDeployments { - collectiveError = append(collectiveError, machineDeployment.Refresh()) + for _, nodeGroup := range m.nodeGroups { + collectiveError = append(collectiveError, nodeGroup.Refresh()) } return errors.Join(collectiveError...) } @@ -436,18 +431,18 @@ func (m *McmManager) Cleanup() { return } -// GetMachineDeploymentSize returns the replicas field of the MachineDeployment -func (m *McmManager) GetMachineDeploymentSize(machinedeployment *MachineDeployment) (int64, error) { - md, err := m.GetMachineDeploymentResource(machinedeployment.Name) +// GetMachineDeploymentSize returns the replicas field of the MachineDeployment corresponding to the given node group. +func (m *McmManager) GetMachineDeploymentSize(nodeGroupName string) (int64, error) { + md, err := m.GetMachineDeploymentObject(nodeGroupName) if err != nil { return 0, err } return int64(md.Spec.Replicas), nil } -// SetMachineDeploymentSize sets the desired size for the Machinedeployment. -func (m *McmManager) SetMachineDeploymentSize(ctx context.Context, machinedeployment *MachineDeployment, size int64) (bool, error) { - md, err := m.GetMachineDeploymentResource(machinedeployment.Name) +// SetMachineDeploymentSize sets the desired size for the backing MachineDeployment of the given nodeGroup. +func (m *McmManager) SetMachineDeploymentSize(ctx context.Context, nodeGroup *NodeGroupImpl, size int64) (bool, error) { + md, err := m.GetMachineDeploymentObject(nodeGroup.Name) if err != nil { return true, err } @@ -458,29 +453,29 @@ func (m *McmManager) SetMachineDeploymentSize(ctx context.Context, machinedeploy clone := md.DeepCopy() clone.Spec.Replicas = int32(size) - _, err = m.machineClient.MachineDeployments(machinedeployment.Namespace).Update(ctx, clone, metav1.UpdateOptions{}) + _, err = m.machineClient.MachineDeployments(nodeGroup.Namespace).Update(ctx, clone, metav1.UpdateOptions{}) return true, err } -// DeleteMachines annotates the target machines and also reduces the desired replicas of the MachineDeployment. +// DeleteMachines annotates the target machines and also reduces the desired replicas of the corresponding MachineDeployment. 
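// DeleteMachines (below) fails safe: if the MachineDeployment scale-down
// cannot be persisted within the retry deadline, the priorities already
// bumped to "1" are reset so no machine stays marked without a matching
// replica reduction (scaleDownWithRetry stands in for the retry call below):
//
//	if err := scaleDownWithRetry(); err != nil {
//		// errors.Join keeps both failure causes visible to the caller
//		return errors.Join(err, m.resetPriorityForMachines(machineNamesWithPrio1))
//	}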
func (m *McmManager) DeleteMachines(targetMachineRefs []*types.NamespacedName) error { if len(targetMachineRefs) == 0 { return nil } - commonMachineDeployment, err := m.GetMachineDeploymentForMachine(targetMachineRefs[0]) + commonNodeGroup, err := m.GetNodeGroupImpl(targetMachineRefs[0]) if err != nil { return err } // acquire the mutex - commonMachineDeployment.scalingMutex.Lock() - defer commonMachineDeployment.scalingMutex.Unlock() + commonNodeGroup.scalingMutex.Lock() + defer commonNodeGroup.scalingMutex.Unlock() // get the machine deployment and return if rolling update is not finished - md, err := m.GetMachineDeploymentResource(commonMachineDeployment.Name) + md, err := m.GetMachineDeploymentObject(commonNodeGroup.Name) if err != nil { return err } if !isRollingUpdateFinished(md) { - return fmt.Errorf("MachineDeployment %s is under rolling update , cannot reduce replica count", commonMachineDeployment.Name) + return fmt.Errorf("MachineDeployment %s is under rolling update , cannot reduce replica count", commonNodeGroup.Name) } machineNamesMarkedByCA := getMachineNamesMarkedByCAForDeletion(md) // update priorities of machines to be deleted except the ones already in termination to 1 @@ -491,10 +486,10 @@ func (m *McmManager) DeleteMachines(targetMachineRefs []*types.NamespacedName) e machineNamesMarkedByCA = append(machineNamesMarkedByCA, machineNamesWithPrio1...) // Trying to update the machineDeployment till the deadline err = m.retry(func(ctx context.Context) (bool, error) { - return m.scaleDownAndAnnotateMachineDeployment(ctx, commonMachineDeployment.Name, len(machineNamesWithPrio1), createMachinesMarkedForDeletionAnnotationValue(machineNamesMarkedByCA)) - }, "MachineDeployment", "update", commonMachineDeployment.Name) + return m.scaleDownAndAnnotateMachineDeployment(ctx, commonNodeGroup.Name, len(machineNamesWithPrio1), createMachinesMarkedForDeletionAnnotationValue(machineNamesMarkedByCA)) + }, "MachineDeployment", "update", commonNodeGroup.Name) if err != nil { - klog.Errorf("unable to scale in machine deployment %s, will reset priority of target machines, Error: %v", commonMachineDeployment.Name, err) + klog.Errorf("unable to scale in machine deployment %s, will reset priority of target machines, Error: %v", commonNodeGroup.Name, err) return errors.Join(err, m.resetPriorityForMachines(machineNamesWithPrio1)) } return err @@ -592,8 +587,9 @@ func (m *McmManager) updateAnnotationOnMachine(ctx context.Context, mcName strin // scaleDownAndAnnotateMachineDeployment scales down the machine deployment by the provided scaleDownAmount and returns the updated spec.Replicas after scale down. // It also updates the machines-marked-by-ca-for-deletion annotation on the machine deployment with the list of existing machines marked for deletion. +// NOTE: Callers are expected to take the NodeGroup scalingMutex before invoking this method. 
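// Replica count and marked-machine list are persisted in a single Update on
// a deep copy, so the two can never diverge on the API server. In outline:
//
//	mdclone := md.DeepCopy()
//	mdclone.Spec.Replicas = md.Spec.Replicas - int32(scaleDownAmount)
//	if mdclone.Annotations == nil {
//		mdclone.Annotations = make(map[string]string)
//	}
//	mdclone.Annotations[machinesMarkedByCAForDeletionAnnotation] = markedMachines
//	_, err = m.machineClient.MachineDeployments(mdclone.Namespace).Update(ctx, mdclone, metav1.UpdateOptions{})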
func (m *McmManager) scaleDownAndAnnotateMachineDeployment(ctx context.Context, mdName string, scaleDownAmount int, markedMachines string) (bool, error) { - md, err := m.GetMachineDeploymentResource(mdName) + md, err := m.GetMachineDeploymentObject(mdName) if err != nil { return true, err } @@ -610,7 +606,7 @@ func (m *McmManager) scaleDownAndAnnotateMachineDeployment(ctx context.Context, if mdclone.Annotations == nil { mdclone.Annotations = make(map[string]string) } - mdclone.Annotations[machinesMarkedByCAForDeletion] = markedMachines + mdclone.Annotations[machinesMarkedByCAForDeletionAnnotation] = markedMachines _, err = m.machineClient.MachineDeployments(mdclone.Namespace).Update(ctx, mdclone, metav1.UpdateOptions{}) if err != nil { return true, fmt.Errorf("unable to scale in machine deployment %s, Error: %w", mdName, err) @@ -644,10 +640,10 @@ func (m *McmManager) retry(fn func(ctx context.Context) (bool, error), resourceT } } -// GetInstancesForMachineDeployment returns list of cloudprovider.Instance for machines which belongs to the MachineDeployment. -func (m *McmManager) GetInstancesForMachineDeployment(machinedeployment *MachineDeployment) ([]cloudprovider.Instance, error) { +// GetInstancesForMachineDeployment returns list of cloudprovider.Instance for machines with the given nodeGroupName. +func (m *McmManager) GetInstancesForMachineDeployment(nodeGroupName string) ([]cloudprovider.Instance, error) { var ( - list = []string{machinedeployment.Name} + list = []string{nodeGroupName} selector = labels.NewSelector() req, _ = labels.NewRequirement("name", selection.Equals, list) ) @@ -655,7 +651,7 @@ func (m *McmManager) GetInstancesForMachineDeployment(machinedeployment *Machine selector = selector.Add(*req) machineList, err := m.machineLister.Machines(m.namespace).List(selector) if err != nil { - return nil, fmt.Errorf("unable to fetch list of Machine objects %v for machinedeployment %q", err, machinedeployment.Name) + return nil, fmt.Errorf("unable to fetch list of Machine objects %v for MachineDeployment %q", err, nodeGroupName) } nodeList, err := m.nodeLister.List(labels.Everything()) @@ -741,7 +737,7 @@ func validateNodeTemplate(nodeTemplateAttributes *v1alpha1.NodeTemplate) error { // GetMachineDeploymentAnnotations returns the annotations present on the machine deployment for the provided machine deployment name func (m *McmManager) GetMachineDeploymentAnnotations(machineDeploymentName string) (map[string]string, error) { - md, err := m.GetMachineDeploymentResource(machineDeploymentName) + md, err := m.GetMachineDeploymentObject(machineDeploymentName) if err != nil { return nil, err } @@ -750,8 +746,8 @@ func (m *McmManager) GetMachineDeploymentAnnotations(machineDeploymentName strin // GetMachineDeploymentNodeTemplate returns the NodeTemplate of a node belonging to the same worker pool as the machinedeployment // If no node present then it forms the nodeTemplate using the one present in machineClass -func (m *McmManager) GetMachineDeploymentNodeTemplate(machinedeployment *MachineDeployment) (*nodeTemplate, error) { - md, err := m.GetMachineDeploymentResource(machinedeployment.Name) +func (m *McmManager) GetMachineDeploymentNodeTemplate(nodeGroupName string) (*nodeTemplate, error) { + md, err := m.GetMachineDeploymentObject(nodeGroupName) if err != nil { return nil, err } @@ -829,12 +825,12 @@ func (m *McmManager) GetMachineDeploymentNodeTemplate(machinedeployment *Machine var providerSpec *awsapis.AWSProviderSpec err = json.Unmarshal(mc.ProviderSpec.Raw, &providerSpec) if err 
!= nil { - return nil, fmt.Errorf("Unable to convert from %s to %s for %s, Error: %v", kindMachineClass, providerAWS, machinedeployment.Name, err) + return nil, fmt.Errorf("unable to convert from %s to %s for %s, Error: %v", kindMachineClass, providerAWS, nodeGroupName, err) } awsInstance, exists := AWSInstanceTypes[providerSpec.MachineType] if !exists { - return nil, fmt.Errorf("Unable to fetch details for VM type %s", providerSpec.MachineType) + return nil, fmt.Errorf("unable to fetch details for VM type %s", providerSpec.MachineType) } instance = instanceType{ InstanceType: awsInstance.InstanceType, @@ -851,11 +847,11 @@ func (m *McmManager) GetMachineDeploymentNodeTemplate(machinedeployment *Machine var providerSpec *azureapis.AzureProviderSpec err = json.Unmarshal(mc.ProviderSpec.Raw, &providerSpec) if err != nil { - return nil, fmt.Errorf("Unable to convert from %s to %s for %s, Error: %v", kindMachineClass, providerAzure, machinedeployment.Name, err) + return nil, fmt.Errorf("unable to convert from %s to %s for %s, Error: %v", kindMachineClass, providerAzure, nodeGroupName, err) } azureInstance, exists := AzureInstanceTypes[providerSpec.Properties.HardwareProfile.VMSize] if !exists { - return nil, fmt.Errorf("Unable to fetch details for VM type %s", providerSpec.Properties.HardwareProfile.VMSize) + return nil, fmt.Errorf("unable to fetch details for VM type %s", providerSpec.Properties.HardwareProfile.VMSize) } instance = instanceType{ InstanceType: azureInstance.InstanceType, @@ -899,12 +895,12 @@ func (m *McmManager) GetMachineDeploymentNodeTemplate(machinedeployment *Machine return nodeTmpl, nil } -// GetMachineDeploymentResource returns the MachineDeployment object for the provided machine deployment name -func (m *McmManager) GetMachineDeploymentResource(mdName string) (*v1alpha1.MachineDeployment, error) { +// GetMachineDeploymentObject returns the MachineDeployment object for the provided machine deployment name +func (m *McmManager) GetMachineDeploymentObject(mdName string) (*v1alpha1.MachineDeployment, error) { md, err := m.machineDeploymentLister.MachineDeployments(m.namespace).Get(mdName) if err != nil { - klog.Errorf("unable to fetch MachineDeployment object %s, Error: %v", mdName, err) - return nil, fmt.Errorf("unable to fetch MachineDeployment object %s, Error: %v", mdName, err) + klog.Errorf("unable to fetch MachineDeployments object %s, Error: %v", mdName, err) + return nil, fmt.Errorf("unable to fetch MachineDeployments object %s, Error: %v", mdName, err) } return md, nil } @@ -1051,19 +1047,19 @@ func buildGenericLabels(template *nodeTemplate, nodeName string) map[string]stri return result } -func buildMachineDeploymentFromSpec(value string, mcmManager *McmManager) (*MachineDeployment, error) { +func buildNodeGroupImplFromSpec(value string, mcmManager *McmManager) (*NodeGroupImpl, error) { spec, err := dynamic.SpecFromString(value, true) if err != nil { return nil, fmt.Errorf("failed to parse node group spec: %v", err) } s := strings.Split(spec.Name, ".") Namespace, Name := s[0], s[1] - machinedeployment := buildMachineDeployment(mcmManager, spec.MinSize, spec.MaxSize, Namespace, Name) - return machinedeployment, nil + nodeGroup := buildNodeGroupImpl(mcmManager, spec.MinSize, spec.MaxSize, Namespace, Name) + return nodeGroup, nil } -func buildMachineDeployment(mcmManager *McmManager, minSize int, maxSize int, namespace string, name string) *MachineDeployment { - return &MachineDeployment{ +func buildNodeGroupImpl(mcmManager *McmManager, minSize int, maxSize int, 
namespace string, name string) *NodeGroupImpl { + return &NodeGroupImpl{ mcmManager: mcmManager, minSize: minSize, maxSize: maxSize, diff --git a/cluster-autoscaler/cloudprovider/mcm/test_utils.go b/cluster-autoscaler/cloudprovider/mcm/test_utils.go index 4822f30f1fc3..65c78c553e50 100644 --- a/cluster-autoscaler/cloudprovider/mcm/test_utils.go +++ b/cluster-autoscaler/cloudprovider/mcm/test_utils.go @@ -46,7 +46,7 @@ func newMachineDeployments( machineDeployment := &v1alpha1.MachineDeployment{ TypeMeta: metav1.TypeMeta{ APIVersion: "machine.sapcloud.io", - Kind: "MachineDeployment", + Kind: "NodeGroupImpl", }, ObjectMeta: metav1.ObjectMeta{ Name: fmt.Sprintf("machinedeployment-%d", i+1), @@ -283,7 +283,7 @@ func createMcmManager( discoveryOpts: cloudprovider.NodeGroupDiscoveryOptions{ NodeGroupSpecs: nodeGroups, }, - machineDeployments: make(map[types.NamespacedName]*MachineDeployment), + nodeGroups: make(map[types.NamespacedName]*NodeGroupImpl), deploymentLister: appsControlSharedInformers.Deployments().Lister(), machineClient: fakeTypedMachineClient, machineDeploymentLister: machineDeployments.Lister(), From c515f39718df0a1c17aeff8987837e19a773cacd Mon Sep 17 00:00:00 2001 From: elankath Date: Thu, 16 Jan 2025 08:55:12 +0530 Subject: [PATCH 09/27] fixed broken test after refactor --- .../cloudprovider/mcm/mcm_cloud_provider_test.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cluster-autoscaler/cloudprovider/mcm/mcm_cloud_provider_test.go b/cluster-autoscaler/cloudprovider/mcm/mcm_cloud_provider_test.go index c8a4e584544a..3467b48afcac 100644 --- a/cluster-autoscaler/cloudprovider/mcm/mcm_cloud_provider_test.go +++ b/cluster-autoscaler/cloudprovider/mcm/mcm_cloud_provider_test.go @@ -153,7 +153,7 @@ func TestDeleteNodes(t *testing.T) { prio1Machines: nil, mdName: "machinedeployment-1", mdReplicas: 2, - err: fmt.Errorf("NodeGroupImpl machinedeployment-1 is under rolling update , cannot reduce replica count"), + err: fmt.Errorf("MachineDeployment machinedeployment-1 is under rolling update , cannot reduce replica count"), }, }, { @@ -645,7 +645,7 @@ func TestGetOptions(t *testing.T) { nodeGroups: []string{nodeGroup1}, }, expect{ - err: fmt.Errorf("unable to fetch NodeGroupImpl object machinedeployment-1, Error: machinedeployment.machine.sapcloud.io \"machinedeployment-1\" not found"), + err: fmt.Errorf("unable to fetch MachineDeployment object machinedeployment-1, Error: machinedeployment.machine.sapcloud.io \"machinedeployment-1\" not found"), }, }, { From 063f07a49d32839a81013be1fda17767bc04e4e0 Mon Sep 17 00:00:00 2001 From: elankath Date: Thu, 16 Jan 2025 09:11:00 +0530 Subject: [PATCH 10/27] fixed broken test --- cluster-autoscaler/cloudprovider/mcm/mcm_manager.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cluster-autoscaler/cloudprovider/mcm/mcm_manager.go b/cluster-autoscaler/cloudprovider/mcm/mcm_manager.go index 17e8845ed598..e0b807c605ac 100644 --- a/cluster-autoscaler/cloudprovider/mcm/mcm_manager.go +++ b/cluster-autoscaler/cloudprovider/mcm/mcm_manager.go @@ -899,8 +899,8 @@ func (m *McmManager) GetMachineDeploymentNodeTemplate(nodeGroupName string) (*no func (m *McmManager) GetMachineDeploymentObject(mdName string) (*v1alpha1.MachineDeployment, error) { md, err := m.machineDeploymentLister.MachineDeployments(m.namespace).Get(mdName) if err != nil { - klog.Errorf("unable to fetch MachineDeployments object %s, Error: %v", mdName, err) - return nil, fmt.Errorf("unable to fetch MachineDeployments object %s, Error: %v", mdName, 
err) + klog.Errorf("unable to fetch MachineDeployment object %s, Error: %v", mdName, err) + return nil, fmt.Errorf("unable to fetch MachineDeployment object %s, Error: %v", mdName, err) } return md, nil } From 945c74c4bcbc16c6f900732921eeea5edaf1ea95 Mon Sep 17 00:00:00 2001 From: elankath Date: Tue, 28 Jan 2025 19:59:32 +0530 Subject: [PATCH 11/27] alternate solution using single annotation for deletion by CA --- .../cloudprovider/mcm/mcm_cloud_provider.go | 234 ++++++++-------- .../mcm/mcm_cloud_provider_test.go | 54 +--- .../cloudprovider/mcm/mcm_manager.go | 252 ++++++++---------- .../cloudprovider/mcm/test_utils.go | 1 - 4 files changed, 242 insertions(+), 299 deletions(-) diff --git a/cluster-autoscaler/cloudprovider/mcm/mcm_cloud_provider.go b/cluster-autoscaler/cloudprovider/mcm/mcm_cloud_provider.go index 6ea6a3660787..ee37265c3442 100644 --- a/cluster-autoscaler/cloudprovider/mcm/mcm_cloud_provider.go +++ b/cluster-autoscaler/cloudprovider/mcm/mcm_cloud_provider.go @@ -25,7 +25,7 @@ import ( "context" "fmt" "github.com/gardener/machine-controller-manager/pkg/apis/machine/v1alpha1" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "maps" "slices" "strconv" "strings" @@ -34,7 +34,6 @@ import ( apiv1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/api/resource" - "k8s.io/apimachinery/pkg/labels" "k8s.io/apimachinery/pkg/types" "k8s.io/autoscaler/cluster-autoscaler/cloudprovider" "k8s.io/autoscaler/cluster-autoscaler/config" @@ -119,11 +118,11 @@ func (mcm *mcmCloudProvider) Name() string { // NodeGroups returns all node groups configured for this cloud provider. func (mcm *mcmCloudProvider) NodeGroups() []cloudprovider.NodeGroup { result := make([]cloudprovider.NodeGroup, 0, len(mcm.mcmManager.nodeGroups)) - for _, machinedeployment := range mcm.mcmManager.nodeGroups { - if machinedeployment.maxSize == 0 { + for _, nodeGroup := range mcm.mcmManager.nodeGroups { + if nodeGroup.maxSize == 0 { continue } - result = append(result, machinedeployment) + result = append(result, nodeGroup) } return result } @@ -135,17 +134,17 @@ func (mcm *mcmCloudProvider) NodeGroupForNode(node *apiv1.Node) (cloudprovider.N return nil, nil } - ref, err := ReferenceFromProviderID(mcm.mcmManager, node.Spec.ProviderID) + machineInfo, err := mcm.mcmManager.GetMachineInfo(node) if err != nil { return nil, err } - if ref == nil { + if machineInfo == nil { klog.V(4).Infof("Skipped node %v, it's either been removed or it's not managed by this controller", node.Spec.ProviderID) return nil, nil } - md, err := mcm.mcmManager.GetNodeGroupImpl(ref) + md, err := mcm.mcmManager.GetNodeGroupImpl(machineInfo.Key) if err != nil { return nil, err } @@ -227,39 +226,6 @@ func (mcm *mcmCloudProvider) GetNodeGpuConfig(*apiv1.Node) *cloudprovider.GpuCon return nil } -// ReferenceFromProviderID extracts the Ref from providerId. It returns corresponding machine-name to providerid. -func ReferenceFromProviderID(m *McmManager, id string) (*types.NamespacedName, error) { - machines, err := m.machineLister.Machines(m.namespace).List(labels.Everything()) - if err != nil { - return nil, fmt.Errorf("Could not list machines due to error: %s", err) - } - - var Name, Namespace string - for _, machine := range machines { - machineID := strings.Split(machine.Spec.ProviderID, "/") - nodeID := strings.Split(id, "/") - // If registered, the ID will match the cloudprovider instance ID. - // If unregistered, the ID will match the machine name. 
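// The segment comparison in the lines removed here matched only the last
// "/"-separated token of each Machine's Spec.ProviderID against the node's
// provider ID, falling back to the Machine name for instances that never
// registered. Its replacement, GetMachineInfo, folds that matching together
// with the failed/terminating check into one lookup returning a MachineInfo.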
- if machineID[len(machineID)-1] == nodeID[len(nodeID)-1] || - nodeID[len(nodeID)-1] == machine.Name { - - Name = machine.Name - Namespace = machine.Namespace - break - } - } - - if Name == "" { - // Could not find any machine corresponds to node %+v", id - klog.V(4).Infof("No machine found for node ID %q", id) - return nil, nil - } - return &types.NamespacedName{ - Name: Name, - Namespace: Namespace, - }, nil -} - // NodeGroupImpl implements NodeGroup interface. type NodeGroupImpl struct { types.NamespacedName @@ -317,8 +283,8 @@ func (ngImpl *NodeGroupImpl) IncreaseSize(delta int) error { if delta <= 0 { return fmt.Errorf("size increase must be positive") } - ngImpl.scalingMutex.Lock() - defer ngImpl.scalingMutex.Unlock() + release := ngImpl.AcquireScalingMutex("IncreaseSize") + defer release() size, err := ngImpl.mcmManager.GetMachineDeploymentSize(ngImpl.Name) if err != nil { return err @@ -342,8 +308,8 @@ func (ngImpl *NodeGroupImpl) DecreaseTargetSize(delta int) error { if delta >= 0 { return fmt.Errorf("size decrease size must be negative") } - ngImpl.scalingMutex.Lock() - defer ngImpl.scalingMutex.Unlock() + release := ngImpl.AcquireScalingMutex("DecreaseTargetSize") + defer release() size, err := ngImpl.mcmManager.GetMachineDeploymentSize(ngImpl.Name) if err != nil { return err @@ -360,85 +326,56 @@ func (ngImpl *NodeGroupImpl) DecreaseTargetSize(delta int) error { // Refresh resets the priority annotation for the machines that are not present in machines-marked-by-ca-for-deletion annotation on the machineDeployment func (ngImpl *NodeGroupImpl) Refresh() error { - if !ngImpl.scalingMutex.TryLock() { - return fmt.Errorf("cannot Refresh() since scalingMutex currently acquired for %q", ngImpl.Name) - } - defer ngImpl.scalingMutex.Unlock() mcd, err := ngImpl.mcmManager.GetMachineDeploymentObject(ngImpl.Name) if err != nil { return err } markedMachineNames := getMachineNamesMarkedByCAForDeletion(mcd) - machines, err := ngImpl.mcmManager.getMachinesForMachineDeployment(ngImpl.Name) - if err != nil { - klog.Errorf("[Refresh] failed to get machines for machine deployment %s, hence skipping it. Err: %v", ngImpl.Name, err.Error()) - return err - } - // update the machines-marked-by-ca-for-deletion annotation with the machines that are still marked for deletion by CA. - // This is done to ensure that the machines that are no longer present are removed from the annotation. 
- var updatedMarkedMachineNames []string - for _, machineName := range markedMachineNames { - if slices.ContainsFunc(machines, func(mc *v1alpha1.Machine) bool { - return mc.Name == machineName - }) { - updatedMarkedMachineNames = append(updatedMarkedMachineNames, machineName) - } + if len(markedMachineNames) == 0 { + return nil } - clone := mcd.DeepCopy() - if clone.Annotations == nil { - clone.Annotations = map[string]string{} + markedMachines, err := ngImpl.mcmManager.getMachinesForMachineDeployment(ngImpl.Name) + if err != nil { + klog.Errorf("NodeGroup.Refresh() of %q failed to get machines for MachineDeployment due to: %v", ngImpl.Name, err) + return fmt.Errorf("failed refresh of NodeGroup %q due to: %v", ngImpl.Name, err) } - updatedMachinesMarkedByCAForDeletionAnnotationVal := createMachinesMarkedForDeletionAnnotationValue(updatedMarkedMachineNames) - if clone.Annotations[machinesMarkedByCAForDeletionAnnotation] != updatedMachinesMarkedByCAForDeletionAnnotationVal { - clone.Annotations[machinesMarkedByCAForDeletionAnnotation] = updatedMachinesMarkedByCAForDeletionAnnotationVal - ctx, cancelFn := context.WithTimeout(context.Background(), ngImpl.mcmManager.maxRetryTimeout) - defer cancelFn() - _, err = ngImpl.mcmManager.machineClient.MachineDeployments(ngImpl.Namespace).Update(ctx, clone, metav1.UpdateOptions{}) - if err != nil { - return err - } + correspondingNodeNames := getNodeNamesFromMachines(markedMachines) + if len(correspondingNodeNames) == 0 { + klog.Warningf("NodeGroup.Refresh() of %q could not find correspondingNodeNames for markedMachines %q of MachineDeployment", ngImpl.Name, markedMachineNames) + return nil } - // reset the priority for the machines that are not present in machines-marked-by-ca-for-deletion annotation - var incorrectlyMarkedMachines []string - for _, machine := range machines { - // no need to reset priority for machines already in termination or failed phase - if isMachineFailedOrTerminating(machine) { - continue - } - // check if the machine is marked for deletion by CA but not present in machines-marked-by-ca-for-deletion annotation. This means that CA was not able to reduce the replicas - // corresponding to this machine and hence the machine should not be marked for deletion. - if annotValue, ok := machine.Annotations[machinePriorityAnnotation]; ok && annotValue == priorityValueForDeletionCandidateMachines && !slices.Contains(markedMachineNames, machine.Name) { - incorrectlyMarkedMachines = append(incorrectlyMarkedMachines, machine.Name) - } + err = ngImpl.mcmManager.cordonNodes(correspondingNodeNames) + if err != nil { + // we do not return error since we don't want this to block CA operation. + klog.Warningf("NodeGroup.Refresh() of %q ran into error cordoning nodes: %v", ngImpl.Name, err) } - return ngImpl.mcmManager.resetPriorityForMachines(incorrectlyMarkedMachines) + return nil } -// Belongs returns true if the given node belongs to the NodeGroup. -// TODO: Implement this to iterate over machines under machinedeployment, and return true if node exists in list. 
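// The rewrite below widens Belongs to also return the MachineInfo it has
// already resolved, so DeleteNodes can reuse the lookup instead of mapping
// the node to its Machine a second time. Call-site shape:
//
//	belongs, machineInfo, err := ngImpl.Belongs(node)
//	if err != nil {
//		return err
//	}
//	if belongs && !machineInfo.FailedOrTerminating {
//		toDelete = append(toDelete, *machineInfo)
//	}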
-func (ngImpl *NodeGroupImpl) Belongs(node *apiv1.Node) (bool, error) { - ref, err := ReferenceFromProviderID(ngImpl.mcmManager, node.Spec.ProviderID) - if err != nil { - return false, err +// Belongs checks if the given node belongs to this NodeGroup and also returns its MachineInfo for its corresponding Machine +func (ngImpl *NodeGroupImpl) Belongs(node *apiv1.Node) (belongs bool, machineInfo *MachineInfo, err error) { + machineInfo, err = ngImpl.mcmManager.GetMachineInfo(node) + if err != nil || machineInfo == nil { + return } - targetMd, err := ngImpl.mcmManager.GetNodeGroupImpl(ref) + targetMd, err := ngImpl.mcmManager.GetNodeGroupImpl(machineInfo.Key) if err != nil { - return false, err + return } if targetMd == nil { - return false, fmt.Errorf("%s doesn't belong to a known MachinDeployment", node.Name) + err = fmt.Errorf("%s doesn't belong to a known MachinDeployment", node.Name) + return } - if targetMd.Id() != ngImpl.Id() { - return false, nil + if targetMd.Id() == ngImpl.Id() { + belongs = true } - return true, nil + return } // DeleteNodes deletes the nodes from the group. It is expected that this method will not be called // for nodes which are not part of ANY machine deployment. func (ngImpl *NodeGroupImpl) DeleteNodes(nodes []*apiv1.Node) error { - nodeNames := getNodeNames(nodes) - klog.V(0).Infof("Received request to delete nodes:- %v", nodeNames) + klog.V(0).Infof("for NodeGroup %q, Received request to delete nodes:- %v", ngImpl.Name, getNodeNames(nodes)) size, err := ngImpl.mcmManager.GetMachineDeploymentSize(ngImpl.Name) if err != nil { return err @@ -446,24 +383,72 @@ func (ngImpl *NodeGroupImpl) DeleteNodes(nodes []*apiv1.Node) error { if int(size) <= ngImpl.MinSize() { return fmt.Errorf("min size reached, nodes will not be deleted") } - machines := make([]*types.NamespacedName, 0, len(nodes)) + var toDeleteMachineInfos []MachineInfo for _, node := range nodes { - belongs, err := ngImpl.Belongs(node) + belongs, machineInfo, err := ngImpl.Belongs(node) if err != nil { return err } else if !belongs { - return fmt.Errorf("%s belongs to a different machinedeployment than %s", node.Name, ngImpl.Id()) + return fmt.Errorf("%s belongs to a different MachineDeployment than %q", node.Name, ngImpl.Name) } - ref, err := ReferenceFromProviderID(ngImpl.mcmManager, node.Spec.ProviderID) - if err != nil { - return fmt.Errorf("couldn't find the machine-name from provider-id %s", node.Spec.ProviderID) + if machineInfo.FailedOrTerminating { + klog.V(3).Infof("for NodeGroup %q, Machine %q is already marked as terminating - skipping deletion", ngImpl.Name, machineInfo.Key.Name) + continue } - machines = append(machines, ref) + toDeleteMachineInfos = append(toDeleteMachineInfos, *machineInfo) } - return ngImpl.mcmManager.DeleteMachines(machines) + return ngImpl.deleteMachines(toDeleteMachineInfos) } -func getNodeNames(nodes []*apiv1.Node) interface{} { +// deleteMachines annotates the corresponding MachineDeployment with machine names of toDeleteMachineInfos, reduces the desired replicas of the corresponding MachineDeployment and cordons corresponding nodes belonging to toDeleteMachineInfos +func (ngImpl *NodeGroupImpl) deleteMachines(toDeleteMachineInfos []MachineInfo) error { + if len(toDeleteMachineInfos) == 0 { + return nil + } + release := ngImpl.AcquireScalingMutex("deleteMachines") + defer release() + // get the machine deployment and return if rolling update is not finished + md, err := ngImpl.mcmManager.GetMachineDeploymentObject(ngImpl.Name) + if err != nil { + return err + } + if 
!isRollingUpdateFinished(md) { + return fmt.Errorf("MachineDeployment %s is under rolling update , cannot reduce replica count", ngImpl.Name) + } + + var toDeleteMachineNames, toDeleteNodeNames []string + for _, machineInfo := range toDeleteMachineInfos { + toDeleteMachineNames = append(toDeleteMachineNames, machineInfo.Key.Name) + toDeleteNodeNames = append(toDeleteNodeNames, machineInfo.NodeName) + } + + // Trying to update the machineDeployment till the deadline + err = ngImpl.mcmManager.retry(func(ctx context.Context) (bool, error) { + return ngImpl.mcmManager.scaleDownMachineDeployment(ctx, ngImpl.Name, toDeleteMachineNames) + }, "MachineDeployment", "update", ngImpl.Name) + if err != nil { + klog.Errorf("Unable to scale down MachineDeployment %s by %d and delete machines %q due to: %v", ngImpl.Name, len(toDeleteMachineNames), toDeleteMachineNames, err) + return fmt.Errorf("for NodeGroup %q, cannot scale down due to: %w", ngImpl.Name, err) + } + err = ngImpl.mcmManager.cordonNodes(toDeleteNodeNames) + if err != nil { + // Do not return error as cordoning is best-effort + klog.Warningf("NodeGroup.deleteMachines() of %q ran into error cordoning nodes: %v", ngImpl.Name, err) + } + return nil +} + +func (ngImpl *NodeGroupImpl) AcquireScalingMutex(operation string) (releaseFn func()) { + klog.V(3).Infof("%s is acquired scalingMutex for NodeGroup %q", operation, ngImpl.Name) + ngImpl.scalingMutex.Lock() + klog.V(3).Infof("%s has acquired scalingMutex for %q", operation, ngImpl.Name) + releaseFn = func() { + ngImpl.scalingMutex.Unlock() + } + return +} + +func getNodeNames(nodes []*apiv1.Node) []string { nodeNames := make([]string, 0, len(nodes)) for _, node := range nodes { nodeNames = append(nodeNames, node.Name) @@ -471,7 +456,18 @@ func getNodeNames(nodes []*apiv1.Node) interface{} { return nodeNames } -// Id returns machinedeployment id. +func getNodeNamesFromMachines(machines []*v1alpha1.Machine) []string { + var nodeNames []string + for _, m := range machines { + nodeName := m.Labels["node"] + if nodeName != "" { + nodeNames = append(nodeNames, nodeName) + } + } + return nodeNames +} + +// Id returns MachineDeployment id. func (ngImpl *NodeGroupImpl) Id() string { return ngImpl.Name } @@ -485,7 +481,7 @@ func (ngImpl *NodeGroupImpl) Debug() string { func (ngImpl *NodeGroupImpl) Nodes() ([]cloudprovider.Instance, error) { instances, err := ngImpl.mcmManager.GetInstancesForMachineDeployment(ngImpl.Name) if err != nil { - return nil, fmt.Errorf("failed to get the cloudprovider.Instance for machines backed by the machinedeployment %q, error: %v", ngImpl.Name, err) + return nil, fmt.Errorf("failed to get the cloudprovider.Instance for machines backed by the MachineDeployment %q, error: %v", ngImpl.Name, err) } erroneousInstanceInfos := make([]string, 0, len(instances)) for _, instance := range instances { @@ -563,7 +559,17 @@ func (ngImpl *NodeGroupImpl) AtomicIncreaseSize(delta int) error { // getMachineNamesMarkedByCAForDeletion returns the set of machine names marked by CA for deletion. 
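// Two conventions in the helpers below: an absent or empty annotation now
// yields a nil slice (callers only range over or measure it), and merged
// name lists are de-duplicated and sorted so the annotation value stays
// deterministic across updates. For example:
//
//	mergeStringSlicesUnique([]string{"m-2", "m-1"}, []string{"m-2", "m-3"})
//	// -> ["m-1", "m-2", "m-3"]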
func getMachineNamesMarkedByCAForDeletion(mcd *v1alpha1.MachineDeployment) []string { if mcd.Annotations == nil || mcd.Annotations[machinesMarkedByCAForDeletionAnnotation] == "" { - return make([]string, 0) + return nil } return strings.Split(mcd.Annotations[machinesMarkedByCAForDeletionAnnotation], ",") } + +func mergeStringSlicesUnique(slice1, slice2 []string) []string { + seen := make(map[string]struct{}, len(slice1)+len(slice2)) + for _, s := range slices.Concat(slice1, slice2) { + seen[s] = struct{}{} + } + concatenated := slices.Collect(maps.Keys(seen)) + slices.Sort(concatenated) + return concatenated +} diff --git a/cluster-autoscaler/cloudprovider/mcm/mcm_cloud_provider_test.go b/cluster-autoscaler/cloudprovider/mcm/mcm_cloud_provider_test.go index 3467b48afcac..76f5fd609b65 100644 --- a/cluster-autoscaler/cloudprovider/mcm/mcm_cloud_provider_test.go +++ b/cluster-autoscaler/cloudprovider/mcm/mcm_cloud_provider_test.go @@ -9,7 +9,6 @@ import ( "errors" "fmt" "math" - "slices" "strings" "testing" "time" @@ -157,7 +156,7 @@ func TestDeleteNodes(t *testing.T) { }, }, { - "should not scale down when machine deployment update call times out and should reset priority of the corresponding machine", + "should not scale down when machine deployment update call times out", setup{ nodes: newNodes(2, "fakeID"), machines: newMachines(2, "fakeID", nil, "machinedeployment-1", "machineset-1", []string{"3", "3"}), @@ -171,10 +170,11 @@ func TestDeleteNodes(t *testing.T) { }, }, action{node: newNodes(1, "fakeID")[0]}, + //return fmt.Errorf("for NodeGroup %q, cannot scale down due to: %v", ngImpl.Name, toDeleteMachineNames, err) expect{ mdName: "machinedeployment-1", mdReplicas: 2, - err: errors.Join(nil, fmt.Errorf("unable to scale in machine deployment machinedeployment-1, Error: %w", errors.New(mdUpdateErrorMsg))), + err: fmt.Errorf("for NodeGroup %q, cannot scale down due to: %w", "machinedeployment-1", errors.New(mdUpdateErrorMsg)), }, }, { @@ -249,28 +249,6 @@ func TestDeleteNodes(t *testing.T) { err: fmt.Errorf("min size reached, nodes will not be deleted"), }, }, - { - "no scale down of machine deployment if priority of the targeted machine cannot be updated to 1", - setup{ - nodes: newNodes(2, "fakeID"), - machines: newMachines(2, "fakeID", nil, "machinedeployment-1", "machineset-1", []string{"3", "3"}), - machineSets: newMachineSets(1, "machinedeployment-1"), - machineDeployments: newMachineDeployments(1, 2, nil, nil, nil), - nodeGroups: []string{nodeGroup1}, - controlMachineFakeResourceActions: &customfake.ResourceActions{ - Machine: customfake.Actions{ - Update: customfake.CreateFakeResponse(math.MaxInt32, mcUpdateErrorMsg, 0), - }, - }, - }, - action{node: newNodes(1, "fakeID")[0]}, - expect{ - prio1Machines: nil, - mdName: "machinedeployment-1", - mdReplicas: 2, - err: fmt.Errorf("could not prioritize machine machine-1 for deletion, aborting scale in of machine deployment, Error: %s", mcUpdateErrorMsg), - }, - }, { "should not scale down machine deployment if the node belongs to another machine deployment", setup{ @@ -285,7 +263,7 @@ func TestDeleteNodes(t *testing.T) { prio1Machines: nil, mdName: "machinedeployment-2", mdReplicas: 2, - err: fmt.Errorf("node-1 belongs to a different machinedeployment than machinedeployment-1"), + err: fmt.Errorf("node-1 belongs to a different MachineDeployment than %q", "machinedeployment-1"), }, }, } @@ -325,21 +303,6 @@ func TestDeleteNodes(t *testing.T) { g.Expect(machineDeployment.Spec.Replicas).To(BeNumerically("==", entry.expect.mdReplicas)) 
g.Expect(machineDeployment.Annotations[machinesMarkedByCAForDeletionAnnotation]).To(Equal(entry.expect.machinesMarkedByCAAnnotationValue)) - machines, err := m.machineClient.Machines(m.namespace).List(context.TODO(), metav1.ListOptions{ - LabelSelector: metav1.FormatLabelSelector(&metav1.LabelSelector{ - MatchLabels: map[string]string{"name": md.Name}, - }), - }) - - for _, machine := range machines.Items { - if slices.ContainsFunc(entry.expect.prio1Machines, func(m *v1alpha1.Machine) bool { - return machine.Name == m.Name - }) { - g.Expect(machine.Annotations[machinePriorityAnnotation]).To(Equal(priorityValueForDeletionCandidateMachines)) - } else { - g.Expect(machine.Annotations[machinePriorityAnnotation]).To(Equal(defaultPriorityValue)) - } - } }) } } @@ -483,15 +446,6 @@ func TestRefresh(t *testing.T) { } else { g.Expect(err).To(BeNil()) } - machines, err := m.machineClient.Machines(m.namespace).List(context.TODO(), metav1.ListOptions{}) - g.Expect(err).To(BeNil()) - for _, mc := range machines.Items { - if slices.Contains(entry.expect.prio3Machines, mc.Name) { - g.Expect(mc.Annotations[machinePriorityAnnotation]).To(Equal(defaultPriorityValue)) - } else { - g.Expect(mc.Annotations[machinePriorityAnnotation]).To(Equal(priorityValueForDeletionCandidateMachines)) - } - } }) } } diff --git a/cluster-autoscaler/cloudprovider/mcm/mcm_manager.go b/cluster-autoscaler/cloudprovider/mcm/mcm_manager.go index e0b807c605ac..c6e3e58a2f8c 100644 --- a/cluster-autoscaler/cloudprovider/mcm/mcm_manager.go +++ b/cluster-autoscaler/cloudprovider/mcm/mcm_manager.go @@ -28,6 +28,7 @@ import ( "flag" "fmt" "k8s.io/apimachinery/pkg/types" + utilerrors "k8s.io/apimachinery/pkg/util/errors" "k8s.io/autoscaler/cluster-autoscaler/config/dynamic" v1appslister "k8s.io/client-go/listers/apps/v1" "k8s.io/utils/pointer" @@ -64,6 +65,7 @@ import ( "k8s.io/client-go/discovery" appsinformers "k8s.io/client-go/informers" coreinformers "k8s.io/client-go/informers" + corev1 "k8s.io/client-go/kubernetes/typed/core/v1" corelisters "k8s.io/client-go/listers/core/v1" "k8s.io/client-go/rest" "k8s.io/client-go/tools/cache" @@ -82,8 +84,6 @@ const ( // priorityValueForDeletionCandidateMachines is the priority annotation value set on machines that the CA wants to be deleted. Its value is set to 1. priorityValueForDeletionCandidateMachines = "1" minResyncPeriodDefault = 1 * time.Hour - // machinePriorityAnnotation is the annotation to set machine priority while deletion - machinePriorityAnnotation = "machinepriority.machine.sapcloud.io" // kindMachineClass is the kind for generic machine class used by the OOT providers kindMachineClass = "MachineClass" // providerAWS is the provider type for AWS machine class objects @@ -103,6 +103,8 @@ const ( // machinesMarkedByCAForDeletionAnnotation is the annotation set by CA on machine deployment. Its value denotes the machines that // CA marked for deletion by updating the priority annotation to 1 and scaling down the machine deployment. 
machinesMarkedByCAForDeletionAnnotation = "cluster-autoscaler.kubernetes.io/machines-marked-by-ca-for-deletion" + // poolNameLabel is the name of the label for gardener worker pool + poolNameLabel = "worker.gardener.cloud/pool" ) var ( @@ -137,6 +139,7 @@ type McmManager struct { machineLister machinelisters.MachineLister machineClassLister machinelisters.MachineClassLister nodeLister corelisters.NodeLister + nodeInterface corev1.NodeInterface maxRetryTimeout time.Duration retryInterval time.Duration } @@ -243,8 +246,9 @@ func createMCMManagerInternal(discoveryOpts cloudprovider.NodeGroupDiscoveryOpti targetCoreClientBuilder := ClientBuilder{ ClientConfig: targetKubeconfig, } + targetCoreClient := targetCoreClientBuilder.ClientOrDie("target-core-shared-informers") targetCoreInformerFactory := coreinformers.NewSharedInformerFactory( - targetCoreClientBuilder.ClientOrDie("target-core-shared-informers"), + targetCoreClient, *minResyncPeriod, ) @@ -263,6 +267,7 @@ func createMCMManagerInternal(discoveryOpts cloudprovider.NodeGroupDiscoveryOpti machineSetLister: machineSharedInformers.MachineSets().Lister(), machineDeploymentLister: machineSharedInformers.MachineDeployments().Lister(), nodeLister: coreSharedInformers.Nodes().Lister(), + nodeInterface: targetCoreClient.CoreV1().Nodes(), discoveryOpts: discoveryOpts, maxRetryTimeout: maxRetryTimeout, retryInterval: retryInterval, @@ -374,43 +379,43 @@ func CreateMcmManager(discoveryOpts cloudprovider.NodeGroupDiscoveryOptions) (*M } // GetNodeGroupImpl returns the NodeGroupImpl for the given fully-qualified machine name. -func (m *McmManager) GetNodeGroupImpl(machine *types.NamespacedName) (*NodeGroupImpl, error) { - if machine.Name == "" { +func (m *McmManager) GetNodeGroupImpl(machineKey types.NamespacedName) (*NodeGroupImpl, error) { + if machineKey.Name == "" { // Considering the possibility when Machine has been deleted but due to cached Node object it appears here. return nil, fmt.Errorf("node does not Exists") } - machineObject, err := m.machineLister.Machines(m.namespace).Get(machine.Name) + machineObject, err := m.machineLister.Machines(m.namespace).Get(machineKey.Name) if err != nil { if kube_errors.IsNotFound(err) { // Machine has been removed. 
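		// Editor's note (observation, not a line added by any patch): the (nil, nil)
		// return below is deliberate — a Machine that vanished between listing and Get
		// is not an error, and callers such as NodeGroupForNode translate a nil result
		// into "this node is not managed by MCM" and simply skip the node.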
klog.V(4).Infof("Machine was removed before it could be retrieved: %v", err) return nil, nil } - return nil, fmt.Errorf("Unable to fetch Machine object %s %v", machine.Name, err) + return nil, fmt.Errorf("unable to fetch Machine object for given Machine name %q due to %w", machineKey.Name, err) } var machineSetName, machineDeploymentName string if len(machineObject.OwnerReferences) > 0 { machineSetName = machineObject.OwnerReferences[0].Name } else { - return nil, fmt.Errorf("Unable to find parent MachineSet of given Machine object %s %v", machine.Name, err) + return nil, fmt.Errorf("unable to find parent MachineSet for given Machine name %q due to: %w", machineKey.Name, err) } machineSetObject, err := m.machineSetLister.MachineSets(m.namespace).Get(machineSetName) if err != nil { - return nil, fmt.Errorf("Unable to fetch MachineSet object %s %v", machineSetName, err) + return nil, fmt.Errorf("unable to fetch MachineSet object for name %q due to: %w", machineSetName, err) } if len(machineSetObject.OwnerReferences) > 0 { machineDeploymentName = machineSetObject.OwnerReferences[0].Name } else { - return nil, fmt.Errorf("unable to find parent MachineDeployment of given MachineSet object %s %v", machineSetName, err) + return nil, fmt.Errorf("unable to find parent MachineDeployment of given MachineSet name %q due to: %w", machineSetName, err) } machineDeployment, ok := m.nodeGroups[types.NamespacedName{Namespace: m.namespace, Name: machineDeploymentName}] if !ok { - return nil, fmt.Errorf("machineDeployment %s not found in the list of machine deployments", machineDeploymentName) + return nil, fmt.Errorf("could not find MachineDeployment %q in the managed nodeGroups", machineDeploymentName) } return machineDeployment, nil } @@ -457,111 +462,6 @@ func (m *McmManager) SetMachineDeploymentSize(ctx context.Context, nodeGroup *No return true, err } -// DeleteMachines annotates the target machines and also reduces the desired replicas of the corresponding MachineDeployment. -func (m *McmManager) DeleteMachines(targetMachineRefs []*types.NamespacedName) error { - if len(targetMachineRefs) == 0 { - return nil - } - commonNodeGroup, err := m.GetNodeGroupImpl(targetMachineRefs[0]) - if err != nil { - return err - } - // acquire the mutex - commonNodeGroup.scalingMutex.Lock() - defer commonNodeGroup.scalingMutex.Unlock() - // get the machine deployment and return if rolling update is not finished - md, err := m.GetMachineDeploymentObject(commonNodeGroup.Name) - if err != nil { - return err - } - if !isRollingUpdateFinished(md) { - return fmt.Errorf("MachineDeployment %s is under rolling update , cannot reduce replica count", commonNodeGroup.Name) - } - machineNamesMarkedByCA := getMachineNamesMarkedByCAForDeletion(md) - // update priorities of machines to be deleted except the ones already in termination to 1 - machineNamesWithPrio1, err := m.prioritizeMachinesForDeletion(targetMachineRefs) - if err != nil { - return err - } - machineNamesMarkedByCA = append(machineNamesMarkedByCA, machineNamesWithPrio1...) 
- // Trying to update the machineDeployment till the deadline - err = m.retry(func(ctx context.Context) (bool, error) { - return m.scaleDownAndAnnotateMachineDeployment(ctx, commonNodeGroup.Name, len(machineNamesWithPrio1), createMachinesMarkedForDeletionAnnotationValue(machineNamesMarkedByCA)) - }, "MachineDeployment", "update", commonNodeGroup.Name) - if err != nil { - klog.Errorf("unable to scale in machine deployment %s, will reset priority of target machines, Error: %v", commonNodeGroup.Name, err) - return errors.Join(err, m.resetPriorityForMachines(machineNamesWithPrio1)) - } - return err -} - -// resetPriorityForMachines resets the priority of machines passed in the argument to defaultPriorityValue -func (m *McmManager) resetPriorityForMachines(mcNames []string) error { - var collectiveError []error - for _, mcName := range mcNames { - machine, err := m.machineLister.Machines(m.namespace).Get(mcName) - if kube_errors.IsNotFound(err) { - klog.Warningf("Machine %s not found, skipping resetting priority annotation", mcName) - continue - } - if err != nil { - collectiveError = append(collectiveError, fmt.Errorf("unable to get Machine object %s, Error: %v", mcName, err)) - continue - } - ctx, cancelFn := context.WithDeadline(context.Background(), time.Now().Add(defaultResetAnnotationTimeout)) - err = func() error { - defer cancelFn() - val, ok := machine.Annotations[machinePriorityAnnotation] - if ok && val != defaultPriorityValue { - _, err = m.updateAnnotationOnMachine(ctx, machine.Name, machinePriorityAnnotation, defaultPriorityValue) - return err - } - return nil - }() - if err != nil { - collectiveError = append(collectiveError, fmt.Errorf("could not reset priority annotation on machine %s, Error: %v", machine.Name, err)) - continue - } - } - return errors.Join(collectiveError...) 
-} - -// prioritizeMachinesForDeletion prioritizes the targeted machines by updating their priority annotation to 1 -func (m *McmManager) prioritizeMachinesForDeletion(targetMachineRefs []*types.NamespacedName) ([]string, error) { - var expectedToTerminateMachineNodePairs = make(map[string]string) - var prio1MarkedMachineNames []string - - for _, machineRef := range targetMachineRefs { - // Trying to update the priority of machineRef till m.maxRetryTimeout - if err := m.retry(func(ctx context.Context) (bool, error) { - mc, err := m.machineLister.Machines(m.namespace).Get(machineRef.Name) - if err != nil { - if kube_errors.IsNotFound(err) { - klog.Warningf("Machine %s not found, skipping prioritizing it for deletion", machineRef.Name) - return false, nil - } - klog.Errorf("Unable to fetch Machine object %s, Error: %v", machineRef.Name, err) - return true, err - } - if isMachineFailedOrTerminating(mc) { - return false, nil - } - if mc.Annotations[machinePriorityAnnotation] == priorityValueForDeletionCandidateMachines { - klog.Infof("Machine %q priority is already set to 1, hence skipping the update", mc.Name) - return false, nil - } - prio1MarkedMachineNames = append(prio1MarkedMachineNames, machineRef.Name) - expectedToTerminateMachineNodePairs[mc.Name] = mc.Labels["node"] - return m.updateAnnotationOnMachine(ctx, mc.Name, machinePriorityAnnotation, priorityValueForDeletionCandidateMachines) - }, "Machine", "update", machineRef.Name); err != nil { - klog.Errorf("could not prioritize machine %s for deletion, aborting scale in of machine deployment, Error: %v", machineRef.Name, err) - return nil, fmt.Errorf("could not prioritize machine %s for deletion, aborting scale in of machine deployment, Error: %v", machineRef.Name, err) - } - } - klog.V(2).Infof("Expected to remove following {machineRef: corresponding node} pairs %s", expectedToTerminateMachineNodePairs) - return prio1MarkedMachineNames, nil -} - // updateAnnotationOnMachine returns error only when updating the annotations on machine has been failing consequently and deadline is crossed func (m *McmManager) updateAnnotationOnMachine(ctx context.Context, mcName string, key, val string) (bool, error) { machine, err := m.machineLister.Machines(m.namespace).Get(mcName) @@ -585,33 +485,39 @@ func (m *McmManager) updateAnnotationOnMachine(ctx context.Context, mcName strin return true, err } -// scaleDownAndAnnotateMachineDeployment scales down the machine deployment by the provided scaleDownAmount and returns the updated spec.Replicas after scale down. -// It also updates the machines-marked-by-ca-for-deletion annotation on the machine deployment with the list of existing machines marked for deletion. -// NOTE: Callers are expected to take the NodeGroup scalingMutex before invoking this method. -func (m *McmManager) scaleDownAndAnnotateMachineDeployment(ctx context.Context, mdName string, scaleDownAmount int, markedMachines string) (bool, error) { +// scaleDownMachineDeployment scales down the MachineDeployment for given name by the length of toDeleteMachineNames +// It also updates the machines-marked-by-ca-for-deletion annotation on the machine deployment with the list of toDeleteMachineNames +// NOTE: Callers MUST take the NodeGroup scalingMutex before invoking this method. 
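(Editor's aside on the method introduced below — a condensed sketch, under the assumption taken from this series that the replica count and the marked-machine annotation must change together; it omits the merge-with-existing-names step the real method performs:)

	// mdCopy carries both the reduced replica count and the marked machine names,
	// so a single Update either lands both changes or neither; on failure the retry
	// wrapper re-reads the MachineDeployment and tries again.
	mdCopy := md.DeepCopy()
	mdCopy.Spec.Replicas = md.Spec.Replicas - int32(len(toDeleteMachineNames))
	mdCopy.Annotations[machinesMarkedByCAForDeletionAnnotation] = strings.Join(toDeleteMachineNames, ",")
	_, err = m.machineClient.MachineDeployments(mdCopy.Namespace).Update(ctx, mdCopy, metav1.UpdateOptions{})
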
+func (m *McmManager) scaleDownMachineDeployment(ctx context.Context, mdName string, toDeleteMachineNames []string) (bool, error) { md, err := m.GetMachineDeploymentObject(mdName) if err != nil { return true, err } - mdclone := md.DeepCopy() - expectedReplicas := mdclone.Spec.Replicas - int32(scaleDownAmount) - if expectedReplicas == mdclone.Spec.Replicas { - klog.Infof("MachineDeployment %q is already set to %d, skipping the update", mdclone.Name, expectedReplicas) + + scaleDownAmount := len(toDeleteMachineNames) + expectedReplicas := md.Spec.Replicas - int32(scaleDownAmount) + if expectedReplicas == md.Spec.Replicas { + klog.Infof("MachineDeployment %q is already set to %d, skipping the update", md.Name, expectedReplicas) return false, nil } else if expectedReplicas < 0 { - klog.Errorf("Cannot delete machines in machine deployment %s, expected decrease in replicas %d is more than current replicas %d", mdName, scaleDownAmount, mdclone.Spec.Replicas) - return false, fmt.Errorf("cannot delete machines in machine deployment %s, expected decrease in replicas %d is more than current replicas %d", mdName, scaleDownAmount, mdclone.Spec.Replicas) + klog.Errorf("Cannot delete machines in MachineDeployment %s, expected decrease in replicas %d is more than current replicas %d", mdName, scaleDownAmount, md.Spec.Replicas) + return false, fmt.Errorf("cannot delete machines in MachineDeployment %s, expected decrease in replicas %d is more than current replicas %d", mdName, scaleDownAmount, md.Spec.Replicas) } - mdclone.Spec.Replicas = expectedReplicas - if mdclone.Annotations == nil { - mdclone.Annotations = make(map[string]string) + + mdCopy := md.DeepCopy() + mdCopy.Spec.Replicas = expectedReplicas + if mdCopy.Annotations == nil { + mdCopy.Annotations = make(map[string]string) } - mdclone.Annotations[machinesMarkedByCAForDeletionAnnotation] = markedMachines - _, err = m.machineClient.MachineDeployments(mdclone.Namespace).Update(ctx, mdclone, metav1.UpdateOptions{}) + alreadyMarkedMachineNames := getMachineNamesMarkedByCAForDeletion(md) + toMarkMachineNames := mergeStringSlicesUnique(alreadyMarkedMachineNames, toDeleteMachineNames) + markedAnnotValue := createMachinesMarkedForDeletionAnnotationValue(toMarkMachineNames) + mdCopy.Annotations[machinesMarkedByCAForDeletionAnnotation] = markedAnnotValue + _, err = m.machineClient.MachineDeployments(mdCopy.Namespace).Update(ctx, mdCopy, metav1.UpdateOptions{}) if err != nil { - return true, fmt.Errorf("unable to scale in machine deployment %s, Error: %w", mdName, err) + return true, err } - klog.V(2).Infof("MachineDeployment %s size decreased to %d ", mdclone.Name, mdclone.Spec.Replicas) + klog.V(2).Infof("MachineDeployment %s size decreased to %d ", mdCopy.Name, mdCopy.Spec.Replicas) return false, nil } @@ -744,7 +650,7 @@ func (m *McmManager) GetMachineDeploymentAnnotations(machineDeploymentName strin return md.Annotations, nil } -// GetMachineDeploymentNodeTemplate returns the NodeTemplate of a node belonging to the same worker pool as the machinedeployment +// GetMachineDeploymentNodeTemplate returns the NodeTemplate of a node belonging to the same worker pool as the MachineDeployment // If no node present then it forms the nodeTemplate using the one present in machineClass func (m *McmManager) GetMachineDeploymentNodeTemplate(nodeGroupName string) (*nodeTemplate, error) { md, err := m.GetMachineDeploymentObject(nodeGroupName) @@ -1021,6 +927,84 @@ func (m *McmManager) buildNodeFromTemplate(name string, template *nodeTemplate) return &node, nil } +func 
(m *McmManager) cordonNodes(nodeNames []string) error { + if len(nodeNames) == 0 { + return nil + } + if m.nodeInterface == nil { + return nil + } + ctx, cancelFn := context.WithDeadline(context.Background(), time.Now().Add(m.maxRetryTimeout)) + defer cancelFn() + var errs []error + for _, nodeName := range nodeNames { + node, err := m.nodeLister.Get(nodeName) + if err != nil { + errs = append(errs, err) + continue + } + if node.Spec.Unschedulable { + klog.V(4).Infof("Node %q is already cordoned", nodeName) + continue + } + adjustNode := node.DeepCopy() + adjustNode.Spec.Unschedulable = true + _, err = m.nodeInterface.Update(ctx, adjustNode, metav1.UpdateOptions{}) + if err != nil { + errs = append(errs, fmt.Errorf("failed to cordon Node %q: %w", nodeName, err)) + } + klog.V(3).Infof("Node %q has been cordoned successfully", nodeName) + } + if len(errs) > 0 { + return utilerrors.NewAggregate(errs) + } + return nil +} + +type MachineInfo struct { + Key types.NamespacedName + NodeName string + FailedOrTerminating bool +} + +// GetMachineInfo extracts the machine Key from the given node's providerID if found and checks whether it is failed or terminating and returns the MachineInfo or an error +func (m *McmManager) GetMachineInfo(node *apiv1.Node) (*MachineInfo, error) { + machines, err := m.machineLister.Machines(m.namespace).List(labels.Everything()) + if err != nil { + return nil, fmt.Errorf("cannot list machines in namespace %q due to: %s", m.namespace, err) + } + + providerID := node.Spec.ProviderID + var machineName, machineNamespace string + var isFailedOrTerminating bool + for _, machine := range machines { + machineID := strings.Split(machine.Spec.ProviderID, "/") + nodeID := strings.Split(node.Spec.ProviderID, "/") + // If registered, the ID will match the cloudprovider instance ID. + // If unregistered, the ID will match the machine name. 
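// Editor's note (illustrative, not part of the diff): a registered node's providerID
// looks roughly like "aws:///eu-west-1/i-0abc123def", while an unregistered node's ID
// ends in the machine name itself (the tests use "requested://machine-1"), so comparing
// only the final "/"-separated segment lets one check cover both cases described above.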
+ if machineID[len(machineID)-1] == nodeID[len(nodeID)-1] || + nodeID[len(nodeID)-1] == machine.Name { + machineName = machine.Name + machineNamespace = machine.Namespace + isFailedOrTerminating = isMachineFailedOrTerminating(machine) + break + } + } + + if machineName == "" { + klog.V(3).Infof("No Machine found for node providerID %q", providerID) + return nil, nil + } + return &MachineInfo{ + Key: types.NamespacedName{ + Name: machineName, + Namespace: machineNamespace, + }, + NodeName: node.Name, + FailedOrTerminating: isFailedOrTerminating, + }, nil +} + func buildGenericLabels(template *nodeTemplate, nodeName string) map[string]string { result := make(map[string]string) // TODO: extract from MCM diff --git a/cluster-autoscaler/cloudprovider/mcm/test_utils.go b/cluster-autoscaler/cloudprovider/mcm/test_utils.go index 65c78c553e50..188eb3db70f5 100644 --- a/cluster-autoscaler/cloudprovider/mcm/test_utils.go +++ b/cluster-autoscaler/cloudprovider/mcm/test_utils.go @@ -141,7 +141,6 @@ func newMachines( {Name: msName}, }, Labels: map[string]string{machineDeploymentNameLabel: mdName}, - Annotations: map[string]string{machinePriorityAnnotation: priorityAnnotationValues[i]}, CreationTimestamp: metav1.Now(), }, } From b021a3c859132b235fe59be03aee271b3eb9c3ff Mon Sep 17 00:00:00 2001 From: elankath Date: Tue, 28 Jan 2025 20:16:01 +0530 Subject: [PATCH 12/27] fixed use of Go 1.23 functions --- .../cloudprovider/mcm/mcm_cloud_provider.go | 6 ++++-- .../mcm/mcm_cloud_provider_test.go | 18 ------------------ 2 files changed, 4 insertions(+), 20 deletions(-) diff --git a/cluster-autoscaler/cloudprovider/mcm/mcm_cloud_provider.go b/cluster-autoscaler/cloudprovider/mcm/mcm_cloud_provider.go index ee37265c3442..07e4ff19efaf 100644 --- a/cluster-autoscaler/cloudprovider/mcm/mcm_cloud_provider.go +++ b/cluster-autoscaler/cloudprovider/mcm/mcm_cloud_provider.go @@ -25,7 +25,6 @@ import ( "context" "fmt" "github.com/gardener/machine-controller-manager/pkg/apis/machine/v1alpha1" - "maps" "slices" "strconv" "strings" @@ -569,7 +568,10 @@ func mergeStringSlicesUnique(slice1, slice2 []string) []string { for _, s := range slices.Concat(slice1, slice2) { seen[s] = struct{}{} } - concatenated := slices.Collect(maps.Keys(seen)) + concatenated := make([]string, 0, len(seen)) // TODO: Change to slices.Collect(maps.Keys(seen)) from Go 1.23 + for s := range seen { + concatenated = append(concatenated, s) + } slices.Sort(concatenated) return concatenated } diff --git a/cluster-autoscaler/cloudprovider/mcm/mcm_cloud_provider_test.go b/cluster-autoscaler/cloudprovider/mcm/mcm_cloud_provider_test.go index 76f5fd609b65..f1d052844472 100644 --- a/cluster-autoscaler/cloudprovider/mcm/mcm_cloud_provider_test.go +++ b/cluster-autoscaler/cloudprovider/mcm/mcm_cloud_provider_test.go @@ -401,24 +401,6 @@ func TestRefresh(t *testing.T) { err: nil, }, }, - { - "priority reset of machine fails", - setup{ - nodes: newNodes(1, "fakeID"), - machines: newMachines(1, "fakeID", nil, "machinedeployment-1", "machineset-1", []string{"1"}), - machineDeployments: newMachineDeployments(1, 1, nil, nil, nil), - controlMachineFakeResourceActions: &customfake.ResourceActions{ - Machine: customfake.Actions{ - Update: customfake.CreateFakeResponse(math.MaxInt32, mcUpdateErrorMsg, 0), - }, - }, - nodeGroups: []string{nodeGroup2}, - mcmDeployment: newMCMDeployment(1), - }, - expect{ - err: errors.Join(nil, errors.Join(fmt.Errorf("could not reset priority annotation on machine machine-1, Error: %v", mcUpdateErrorMsg))), - }, - }, } for _, entry := range 
table { entry := entry From 3b02417da70abe800cb785c28a438d8f3fd0067e Mon Sep 17 00:00:00 2001 From: elankath Date: Tue, 28 Jan 2025 20:40:15 +0530 Subject: [PATCH 13/27] fixed test --- .../cloudprovider/mcm/mcm_cloud_provider.go | 12 ++++++------ .../cloudprovider/mcm/mcm_manager.go | 16 ++++++++-------- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/cluster-autoscaler/cloudprovider/mcm/mcm_cloud_provider.go b/cluster-autoscaler/cloudprovider/mcm/mcm_cloud_provider.go index 07e4ff19efaf..ac5c91bfefa9 100644 --- a/cluster-autoscaler/cloudprovider/mcm/mcm_cloud_provider.go +++ b/cluster-autoscaler/cloudprovider/mcm/mcm_cloud_provider.go @@ -352,12 +352,12 @@ func (ngImpl *NodeGroupImpl) Refresh() error { } // Belongs checks if the given node belongs to this NodeGroup and also returns its MachineInfo for its corresponding Machine -func (ngImpl *NodeGroupImpl) Belongs(node *apiv1.Node) (belongs bool, machineInfo *MachineInfo, err error) { - machineInfo, err = ngImpl.mcmManager.GetMachineInfo(node) - if err != nil || machineInfo == nil { +func (ngImpl *NodeGroupImpl) Belongs(node *apiv1.Node) (belongs bool, mInfo *machineInfo, err error) { + mInfo, err = ngImpl.mcmManager.GetMachineInfo(node) + if err != nil || mInfo == nil { return } - targetMd, err := ngImpl.mcmManager.GetNodeGroupImpl(machineInfo.Key) + targetMd, err := ngImpl.mcmManager.GetNodeGroupImpl(mInfo.Key) if err != nil { return } @@ -382,7 +382,7 @@ func (ngImpl *NodeGroupImpl) DeleteNodes(nodes []*apiv1.Node) error { if int(size) <= ngImpl.MinSize() { return fmt.Errorf("min size reached, nodes will not be deleted") } - var toDeleteMachineInfos []MachineInfo + var toDeleteMachineInfos []machineInfo for _, node := range nodes { belongs, machineInfo, err := ngImpl.Belongs(node) if err != nil { @@ -400,7 +400,7 @@ func (ngImpl *NodeGroupImpl) DeleteNodes(nodes []*apiv1.Node) error { } // deleteMachines annotates the corresponding MachineDeployment with machine names of toDeleteMachineInfos, reduces the desired replicas of the corresponding MachineDeployment and cordons corresponding nodes belonging to toDeleteMachineInfos -func (ngImpl *NodeGroupImpl) deleteMachines(toDeleteMachineInfos []MachineInfo) error { +func (ngImpl *NodeGroupImpl) deleteMachines(toDeleteMachineInfos []machineInfo) error { if len(toDeleteMachineInfos) == 0 { return nil } diff --git a/cluster-autoscaler/cloudprovider/mcm/mcm_manager.go b/cluster-autoscaler/cloudprovider/mcm/mcm_manager.go index c6e3e58a2f8c..593e1aff2496 100644 --- a/cluster-autoscaler/cloudprovider/mcm/mcm_manager.go +++ b/cluster-autoscaler/cloudprovider/mcm/mcm_manager.go @@ -163,6 +163,12 @@ type nodeTemplate struct { Taints []apiv1.Taint } +type machineInfo struct { + Key types.NamespacedName + NodeName string + FailedOrTerminating bool +} + func init() { controlBurst = flag.Int("control-apiserver-burst", rest.DefaultBurst, "Throttling burst configuration for the client to control cluster's apiserver.") controlQPS = flag.Float64("control-apiserver-qps", float64(rest.DefaultQPS), "Throttling QPS configuration for the client to control cluster's apiserver.") @@ -961,14 +967,8 @@ func (m *McmManager) cordonNodes(nodeNames []string) error { return nil } -type MachineInfo struct { - Key types.NamespacedName - NodeName string - FailedOrTerminating bool -} - // GetMachineInfo extracts the machine Key from the given node's providerID if found and checks whether it is failed or terminating and returns the MachineInfo or an error -func (m *McmManager) GetMachineInfo(node *apiv1.Node) 
(*MachineInfo, error) { +func (m *McmManager) GetMachineInfo(node *apiv1.Node) (*machineInfo, error) { machines, err := m.machineLister.Machines(m.namespace).List(labels.Everything()) if err != nil { return nil, fmt.Errorf("cannot list machines in namespace %q due to: %s", m.namespace, err) @@ -995,7 +995,7 @@ func (m *McmManager) GetMachineInfo(node *apiv1.Node) (*MachineInfo, error) { klog.V(3).Infof("No Machine found for node providerID %q", providerID) return nil, nil } - return &MachineInfo{ + return &machineInfo{ Key: types.NamespacedName{ Name: machineName, Namespace: machineNamespace, From 22ced54528c0d3e58435042f5364feabfe116dfb Mon Sep 17 00:00:00 2001 From: elankath Date: Tue, 28 Jan 2025 20:48:02 +0530 Subject: [PATCH 14/27] added godoc for AcquireScalingMutex --- cluster-autoscaler/cloudprovider/mcm/mcm_cloud_provider.go | 1 + 1 file changed, 1 insertion(+) diff --git a/cluster-autoscaler/cloudprovider/mcm/mcm_cloud_provider.go b/cluster-autoscaler/cloudprovider/mcm/mcm_cloud_provider.go index ac5c91bfefa9..2041033652dc 100644 --- a/cluster-autoscaler/cloudprovider/mcm/mcm_cloud_provider.go +++ b/cluster-autoscaler/cloudprovider/mcm/mcm_cloud_provider.go @@ -437,6 +437,7 @@ func (ngImpl *NodeGroupImpl) deleteMachines(toDeleteMachineInfos []machineInfo) return nil } +// AcquireScalingMutex acquires the scalingMutex associated with this NodeGroup and returns a function that releases the scalingMutex that is expected to be deferred by the caller. func (ngImpl *NodeGroupImpl) AcquireScalingMutex(operation string) (releaseFn func()) { klog.V(3).Infof("%s is acquired scalingMutex for NodeGroup %q", operation, ngImpl.Name) ngImpl.scalingMutex.Lock() From bc85986902271b826704e013b32567c699bb0cb8 Mon Sep 17 00:00:00 2001 From: elankath Date: Wed, 29 Jan 2025 11:03:05 +0530 Subject: [PATCH 15/27] correct godoc for machinesMarkedByCAForDeletionAnnotation --- cluster-autoscaler/cloudprovider/mcm/mcm_manager.go | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/cluster-autoscaler/cloudprovider/mcm/mcm_manager.go b/cluster-autoscaler/cloudprovider/mcm/mcm_manager.go index 593e1aff2496..37bc251fddce 100644 --- a/cluster-autoscaler/cloudprovider/mcm/mcm_manager.go +++ b/cluster-autoscaler/cloudprovider/mcm/mcm_manager.go @@ -100,8 +100,7 @@ const ( machineDeploymentPausedReason = "DeploymentPaused" // machineDeploymentNameLabel key for Machine Deployment name in machine labels machineDeploymentNameLabel = "name" - // machinesMarkedByCAForDeletionAnnotation is the annotation set by CA on machine deployment. Its value denotes the machines that - // CA marked for deletion by updating the priority annotation to 1 and scaling down the machine deployment. + // machinesMarkedByCAForDeletionAnnotation is the annotation set by CA on the MachineDeployment and represents machines that must be drained and deleted. 
machinesMarkedByCAForDeletionAnnotation = "cluster-autoscaler.kubernetes.io/machines-marked-by-ca-for-deletion" // poolNameLabel is the name of the label for gardener worker pool poolNameLabel = "worker.gardener.cloud/pool" From 4d33bef7df999fc7df5059ab25436283f9b9bc7f Mon Sep 17 00:00:00 2001 From: elankath Date: Wed, 29 Jan 2025 12:41:29 +0530 Subject: [PATCH 16/27] changed to machineutils.TriggerDeletionByMCM --- .../cloudprovider/mcm/mcm_cloud_provider.go | 37 +++++--- .../mcm/mcm_cloud_provider_test.go | 61 ++++++------- .../cloudprovider/mcm/mcm_manager.go | 27 +++--- .../pkg/util/provider/machineutils/utils.go | 85 +++++++++++++++++++ cluster-autoscaler/vendor/modules.txt | 1 + 5 files changed, 157 insertions(+), 54 deletions(-) create mode 100644 cluster-autoscaler/vendor/github.com/gardener/machine-controller-manager/pkg/util/provider/machineutils/utils.go diff --git a/cluster-autoscaler/cloudprovider/mcm/mcm_cloud_provider.go b/cluster-autoscaler/cloudprovider/mcm/mcm_cloud_provider.go index 2041033652dc..922549d95642 100644 --- a/cluster-autoscaler/cloudprovider/mcm/mcm_cloud_provider.go +++ b/cluster-autoscaler/cloudprovider/mcm/mcm_cloud_provider.go @@ -25,12 +25,14 @@ import ( "context" "fmt" "github.com/gardener/machine-controller-manager/pkg/apis/machine/v1alpha1" + "k8s.io/apimachinery/pkg/util/sets" "slices" "strconv" "strings" "sync" "time" + "github.com/gardener/machine-controller-manager/pkg/util/provider/machineutils" apiv1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/api/resource" "k8s.io/apimachinery/pkg/types" @@ -323,29 +325,34 @@ func (ngImpl *NodeGroupImpl) DecreaseTargetSize(delta int) error { }, "MachineDeployment", "update", ngImpl.Name) } -// Refresh resets the priority annotation for the machines that are not present in machines-marked-by-ca-for-deletion annotation on the machineDeployment +// Refresh cordons the Nodes corresponding to the machines that have been marked for deletion in the TriggerDeletionByMCM annotation on the MachineDeployment func (ngImpl *NodeGroupImpl) Refresh() error { mcd, err := ngImpl.mcmManager.GetMachineDeploymentObject(ngImpl.Name) if err != nil { return err } - markedMachineNames := getMachineNamesMarkedByCAForDeletion(mcd) - if len(markedMachineNames) == 0 { + toDeleteMachineNames := getMachineNamesTriggeredForDeletion(mcd) + if len(toDeleteMachineNames) == 0 { return nil } - markedMachines, err := ngImpl.mcmManager.getMachinesForMachineDeployment(ngImpl.Name) + machinesOfNodeGroup, err := ngImpl.mcmManager.getMachinesForMachineDeployment(ngImpl.Name) if err != nil { klog.Errorf("NodeGroup.Refresh() of %q failed to get machines for MachineDeployment due to: %v", ngImpl.Name, err) return fmt.Errorf("failed refresh of NodeGroup %q due to: %v", ngImpl.Name, err) } - correspondingNodeNames := getNodeNamesFromMachines(markedMachines) - if len(correspondingNodeNames) == 0 { - klog.Warningf("NodeGroup.Refresh() of %q could not find correspondingNodeNames for markedMachines %q of MachineDeployment", ngImpl.Name, markedMachineNames) + toDeleteMachines := filterMachinesMatchingNames(machinesOfNodeGroup, sets.New(toDeleteMachineNames...)) + if len(toDeleteMachines) == 0 { + klog.Warningf("NodeGroup.Refresh() of %q could not find Machine objects for toDeleteMachineNames %q", ngImpl.Name, toDeleteMachineNames) return nil } - err = ngImpl.mcmManager.cordonNodes(correspondingNodeNames) + toDeleteNodeNames := getNodeNamesFromMachines(toDeleteMachines) + if len(toDeleteNodeNames) == 0 { + klog.Warningf("NodeGroup.Refresh() of %q could not 
find toDeleteNodeNames for toDeleteMachineNames %q of MachineDeployment", ngImpl.Name, toDeleteMachineNames) + return nil + } + err = ngImpl.mcmManager.cordonNodes(toDeleteNodeNames) if err != nil { - // we do not return error since we don't want this to block CA operation. + // we do not return error since we don't want this to block CA operation. This is best-effort. klog.Warningf("NodeGroup.Refresh() of %q ran into error cordoning nodes: %v", ngImpl.Name, err) } return nil @@ -556,12 +563,16 @@ func (ngImpl *NodeGroupImpl) AtomicIncreaseSize(delta int) error { return cloudprovider.ErrNotImplemented } -// getMachineNamesMarkedByCAForDeletion returns the set of machine names marked by CA for deletion. -func getMachineNamesMarkedByCAForDeletion(mcd *v1alpha1.MachineDeployment) []string { - if mcd.Annotations == nil || mcd.Annotations[machinesMarkedByCAForDeletionAnnotation] == "" { +// getMachineNamesTriggeredForDeletion returns the set of machine names contained within the machineutils.TriggerDeletionByMCM annotation on the given MachineDeployment +func getMachineNamesTriggeredForDeletion(mcd *v1alpha1.MachineDeployment) []string { + if mcd.Annotations == nil || mcd.Annotations[machineutils.TriggerDeletionByMCM] == "" { return nil } - return strings.Split(mcd.Annotations[machinesMarkedByCAForDeletionAnnotation], ",") + return strings.Split(mcd.Annotations[machineutils.TriggerDeletionByMCM], ",") +} + +func createMachinesTriggeredForDeletionAnnotValue(machineNames []string) string { + return strings.Join(machineNames, ",") } func mergeStringSlicesUnique(slice1, slice2 []string) []string { diff --git a/cluster-autoscaler/cloudprovider/mcm/mcm_cloud_provider_test.go b/cluster-autoscaler/cloudprovider/mcm/mcm_cloud_provider_test.go index f1d052844472..325347196209 100644 --- a/cluster-autoscaler/cloudprovider/mcm/mcm_cloud_provider_test.go +++ b/cluster-autoscaler/cloudprovider/mcm/mcm_cloud_provider_test.go @@ -8,6 +8,7 @@ import ( "context" "errors" "fmt" + "github.com/gardener/machine-controller-manager/pkg/util/provider/machineutils" "math" "strings" "testing" @@ -85,11 +86,11 @@ func TestDeleteNodes(t *testing.T) { node *corev1.Node } type expect struct { - prio1Machines []*v1alpha1.Machine - mdName string - mdReplicas int32 - machinesMarkedByCAAnnotationValue string - err error + prio1Machines []*v1alpha1.Machine + mdName string + mdReplicas int32 + machinesTriggerDeletionAnnotationValue string + err error } type data struct { name string @@ -109,11 +110,11 @@ func TestDeleteNodes(t *testing.T) { }, action{node: newNodes(1, "fakeID")[0]}, expect{ - prio1Machines: newMachines(1, "fakeID", nil, "machinedeployment-1", "machineset-1", []string{"1"}), - mdName: "machinedeployment-1", - machinesMarkedByCAAnnotationValue: createMachinesMarkedForDeletionAnnotationValue(generateNames("machine", 1)), - mdReplicas: 1, - err: nil, + prio1Machines: newMachines(1, "fakeID", nil, "machinedeployment-1", "machineset-1", []string{"1"}), + mdName: "machinedeployment-1", + machinesTriggerDeletionAnnotationValue: createMachinesTriggeredForDeletionAnnotValue(generateNames("machine", 1)), + mdReplicas: 1, + err: nil, }, }, { @@ -127,11 +128,11 @@ func TestDeleteNodes(t *testing.T) { }, action{node: newNode("node-1", "requested://machine-1")}, expect{ - prio1Machines: newMachines(1, "fakeID", nil, "machinedeployment-1", "machineset-1", []string{"1"}), - machinesMarkedByCAAnnotationValue: createMachinesMarkedForDeletionAnnotationValue(generateNames("machine", 1)), - mdName: "machinedeployment-1", - mdReplicas: 
0, - err: nil, + prio1Machines: newMachines(1, "fakeID", nil, "machinedeployment-1", "machineset-1", []string{"1"}), + machinesTriggerDeletionAnnotationValue: createMachinesTriggeredForDeletionAnnotValue(generateNames("machine", 1)), + mdName: "machinedeployment-1", + mdReplicas: 0, + err: nil, }, }, { @@ -193,11 +194,11 @@ func TestDeleteNodes(t *testing.T) { }, action{node: newNodes(1, "fakeID")[0]}, expect{ - prio1Machines: newMachines(1, "fakeID", nil, "machinedeployment-1", "machineset-1", []string{"1"}), - machinesMarkedByCAAnnotationValue: createMachinesMarkedForDeletionAnnotationValue(generateNames("machine", 1)), - mdName: "machinedeployment-1", - mdReplicas: 1, - err: nil, + prio1Machines: newMachines(1, "fakeID", nil, "machinedeployment-1", "machineset-1", []string{"1"}), + machinesTriggerDeletionAnnotationValue: createMachinesTriggeredForDeletionAnnotValue(generateNames("machine", 1)), + mdName: "machinedeployment-1", + mdReplicas: 1, + err: nil, }, }, { @@ -301,7 +302,7 @@ func TestDeleteNodes(t *testing.T) { machineDeployment, err := m.machineClient.MachineDeployments(m.namespace).Get(context.TODO(), entry.expect.mdName, metav1.GetOptions{}) g.Expect(err).ToNot(HaveOccurred()) g.Expect(machineDeployment.Spec.Replicas).To(BeNumerically("==", entry.expect.mdReplicas)) - g.Expect(machineDeployment.Annotations[machinesMarkedByCAForDeletionAnnotation]).To(Equal(entry.expect.machinesMarkedByCAAnnotationValue)) + g.Expect(machineDeployment.Annotations[machineutils.TriggerDeletionByMCM]).To(Equal(entry.expect.machinesTriggerDeletionAnnotationValue)) }) } @@ -333,14 +334,14 @@ func TestIdempotencyOfDeleteNodes(t *testing.T) { machineDeployment, err := m.machineClient.MachineDeployments(m.namespace).Get(context.TODO(), setupObj.machineDeployments[0].Name, metav1.GetOptions{}) g.Expect(err).ToNot(HaveOccurred()) g.Expect(machineDeployment.Spec.Replicas).To(BeNumerically("==", 2)) - g.Expect(machineDeployment.Annotations[machinesMarkedByCAForDeletionAnnotation]).To(Equal(createMachinesMarkedForDeletionAnnotationValue(generateNames("machine", 1)))) + g.Expect(machineDeployment.Annotations[machineutils.TriggerDeletionByMCM]).To(Equal(createMachinesTriggeredForDeletionAnnotValue(generateNames("machine", 1)))) } func TestRefresh(t *testing.T) { type expect struct { - prio3Machines []string - machinesMarkedByCAForDeletionAnnotationValue string - err error + prio3Machines []string + machinesTriggerDeletionAnnotationValue string + err error } type data struct { name string @@ -374,7 +375,7 @@ func TestRefresh(t *testing.T) { }, }, { - "should reset priority of a machine if it is not present in machines-marked-by-ca-for-deletion annotation on machine deployment", + "should reset priority of a machine if it is not present in trigger deletion annotation on machine deployment", setup{ nodes: newNodes(1, "fakeID"), machines: newMachines(1, "fakeID", nil, "machinedeployment-1", "machineset-1", []string{"1"}), @@ -388,17 +389,17 @@ func TestRefresh(t *testing.T) { }, }, { - "should update the machines-marked-by-ca-for-deletion annotation and remove non-existing machines", + "should update the trigger deletion annotation and remove non-existing machines", setup{ nodes: newNodes(1, "fakeID"), machines: newMachines(1, "fakeID", nil, "machinedeployment-1", "machineset-1", []string{"1"}), - machineDeployments: newMachineDeployments(1, 0, nil, map[string]string{machinesMarkedByCAForDeletionAnnotation: "machine-1,machine-2"}, nil), + machineDeployments: newMachineDeployments(1, 0, nil, 
map[string]string{machineutils.TriggerDeletionByMCM: "machine-1,machine-2"}, nil), nodeGroups: []string{nodeGroup2}, mcmDeployment: newMCMDeployment(1), }, expect{ - machinesMarkedByCAForDeletionAnnotationValue: createMachinesMarkedForDeletionAnnotationValue(generateNames("machine", 1)), - err: nil, + machinesTriggerDeletionAnnotationValue: createMachinesTriggeredForDeletionAnnotValue(generateNames("machine", 1)), + err: nil, }, }, } diff --git a/cluster-autoscaler/cloudprovider/mcm/mcm_manager.go b/cluster-autoscaler/cloudprovider/mcm/mcm_manager.go index 37bc251fddce..28175a2c6ee9 100644 --- a/cluster-autoscaler/cloudprovider/mcm/mcm_manager.go +++ b/cluster-autoscaler/cloudprovider/mcm/mcm_manager.go @@ -27,8 +27,10 @@ import ( "errors" "flag" "fmt" + "github.com/gardener/machine-controller-manager/pkg/util/provider/machineutils" "k8s.io/apimachinery/pkg/types" utilerrors "k8s.io/apimachinery/pkg/util/errors" + "k8s.io/apimachinery/pkg/util/sets" "k8s.io/autoscaler/cluster-autoscaler/config/dynamic" v1appslister "k8s.io/client-go/listers/apps/v1" "k8s.io/utils/pointer" @@ -72,6 +74,7 @@ import ( "k8s.io/client-go/tools/clientcmd" "k8s.io/klog/v2" kubeletapis "k8s.io/kubelet/pkg/apis" + // "github.com/gardener/machine-controller-manager/pkg/util/provider/" ) const ( @@ -100,8 +103,6 @@ const ( machineDeploymentPausedReason = "DeploymentPaused" // machineDeploymentNameLabel key for Machine Deployment name in machine labels machineDeploymentNameLabel = "name" - // machinesMarkedByCAForDeletionAnnotation is the annotation set by CA on the MachineDeployment and represents machines that must be drained and deleted. - machinesMarkedByCAForDeletionAnnotation = "cluster-autoscaler.kubernetes.io/machines-marked-by-ca-for-deletion" // poolNameLabel is the name of the label for gardener worker pool poolNameLabel = "worker.gardener.cloud/pool" ) @@ -514,10 +515,10 @@ func (m *McmManager) scaleDownMachineDeployment(ctx context.Context, mdName stri if mdCopy.Annotations == nil { mdCopy.Annotations = make(map[string]string) } - alreadyMarkedMachineNames := getMachineNamesMarkedByCAForDeletion(md) - toMarkMachineNames := mergeStringSlicesUnique(alreadyMarkedMachineNames, toDeleteMachineNames) - markedAnnotValue := createMachinesMarkedForDeletionAnnotationValue(toMarkMachineNames) - mdCopy.Annotations[machinesMarkedByCAForDeletionAnnotation] = markedAnnotValue + alreadyMarkedMachineNames := getMachineNamesTriggeredForDeletion(md) + toDeleteMachineNames = mergeStringSlicesUnique(alreadyMarkedMachineNames, toDeleteMachineNames) + markedAnnotValue := createMachinesTriggeredForDeletionAnnotValue(toDeleteMachineNames) + mdCopy.Annotations[machineutils.TriggerDeletionByMCM] = markedAnnotValue _, err = m.machineClient.MachineDeployments(mdCopy.Namespace).Update(ctx, mdCopy, metav1.UpdateOptions{}) if err != nil { return true, err @@ -845,10 +846,18 @@ func filterOutNodes(nodes []*v1.Node, instanceType string) []*v1.Node { filteredNodes = append(filteredNodes, node) } } - return filteredNodes } +func filterMachinesMatchingNames(machines []*v1alpha1.Machine, matchingNames sets.Set[string]) (filteredMachines []*v1alpha1.Machine) { + for _, m := range machines { + if matchingNames.Has(m.Name) { + filteredMachines = append(filteredMachines, m) + } + } + return +} + func getInstanceTypeForNode(node *v1.Node) string { var instanceTypeLabelValue string if node.Labels != nil { @@ -1071,7 +1080,3 @@ func filterExtendedResources(allResources v1.ResourceList) (extendedResources v1 }) return } - -func 
createMachinesMarkedForDeletionAnnotationValue(machineNames []string) string { - return strings.Join(machineNames, ",") -} diff --git a/cluster-autoscaler/vendor/github.com/gardener/machine-controller-manager/pkg/util/provider/machineutils/utils.go b/cluster-autoscaler/vendor/github.com/gardener/machine-controller-manager/pkg/util/provider/machineutils/utils.go new file mode 100644 index 000000000000..6be7c4cb5419 --- /dev/null +++ b/cluster-autoscaler/vendor/github.com/gardener/machine-controller-manager/pkg/util/provider/machineutils/utils.go @@ -0,0 +1,85 @@ +// SPDX-FileCopyrightText: 2024 SAP SE or an SAP affiliate company and Gardener contributors +// +// SPDX-License-Identifier: Apache-2.0 + +// Package machineutils contains the consts and global vaariables for machine operation +package machineutils + +import ( + "time" + + v1 "k8s.io/api/core/v1" +) + +const ( + // GetVMStatus sets machine status to terminating and specifies next step as getting VMs + GetVMStatus = "Set machine status to termination. Now, getting VM Status" + + // InstanceInitialization is a step that represents initialization of a VM instance (post-creation). + InstanceInitialization = "Initialize VM Instance" + + // InitiateDrain specifies next step as initiate node drain + InitiateDrain = "Initiate node drain" + + // DelVolumesAttachments specifies next step as deleting volume attachments + DelVolumesAttachments = "Delete Volume Attachments" + + // InitiateVMDeletion specifies next step as initiate VM deletion + InitiateVMDeletion = "Initiate VM deletion" + + // InitiateNodeDeletion specifies next step as node object deletion + InitiateNodeDeletion = "Initiate node object deletion" + + // InitiateFinalizerRemoval specifies next step as machine finalizer removal + InitiateFinalizerRemoval = "Initiate machine object finalizer removal" + + // LastAppliedALTAnnotation contains the last configuration of annotations, labels & taints applied on the node object + LastAppliedALTAnnotation = "node.machine.sapcloud.io/last-applied-anno-labels-taints" + + // MachinePriority is the annotation used to specify priority + // associated with a machine while deleting it. 
The less its + // priority the more likely it is to be deleted first + // Default priority for a machine is set to 3 + MachinePriority = "machinepriority.machine.sapcloud.io" + + // MachineClassKind is used to identify the machineClassKind for generic machineClasses + MachineClassKind = "MachineClass" + + // NotManagedByMCM annotation helps in identifying the nodes which are not handled by MCM + NotManagedByMCM = "node.machine.sapcloud.io/not-managed-by-mcm" + + // TriggerDeletionByMCM annotation on the node would trigger the deletion of the corresponding machine object in the control cluster + TriggerDeletionByMCM = "node.machine.sapcloud.io/trigger-deletion-by-mcm" + + // NodeUnhealthy is a node termination reason for failed machines + NodeUnhealthy = "Unhealthy" + + // NodeScaledDown is a node termination reason for healthy deleted machines + NodeScaledDown = "ScaleDown" + + // NodeTerminationCondition describes nodes that are terminating + NodeTerminationCondition v1.NodeConditionType = "Terminating" + + // TaintNodeCriticalComponentsNotReady is the name of a gardener taint + // indicating that a node is not yet ready to have user workload scheduled + TaintNodeCriticalComponentsNotReady = "node.gardener.cloud/critical-components-not-ready" +) + +// RetryPeriod is an alias for specifying the retry period +type RetryPeriod time.Duration + +// These are the valid values for RetryPeriod +const ( + // ConflictRetry tells the controller to retry quickly - 200 milliseconds + ConflictRetry RetryPeriod = RetryPeriod(200 * time.Millisecond) + // ShortRetry tells the controller to retry after a short duration - 15 seconds + ShortRetry RetryPeriod = RetryPeriod(5 * time.Second) + // MediumRetry tells the controller to retry after a medium duration - 2 minutes + MediumRetry RetryPeriod = RetryPeriod(3 * time.Minute) + // LongRetry tells the controller to retry after a long duration - 10 minutes + LongRetry RetryPeriod = RetryPeriod(10 * time.Minute) +) + +// EssentialTaints are taints on node object which if added/removed, require an immediate reconcile by machine controller +// TODO: update this when taints for ALT updation and PostCreate operations is introduced. 
+var EssentialTaints = []string{TaintNodeCriticalComponentsNotReady} diff --git a/cluster-autoscaler/vendor/modules.txt b/cluster-autoscaler/vendor/modules.txt index 2b8c030a1ad3..ed951d75775e 100644 --- a/cluster-autoscaler/vendor/modules.txt +++ b/cluster-autoscaler/vendor/modules.txt @@ -252,6 +252,7 @@ github.com/gardener/machine-controller-manager/pkg/client/informers/externalvers github.com/gardener/machine-controller-manager/pkg/client/listers/machine/v1alpha1 github.com/gardener/machine-controller-manager/pkg/util/provider/cache github.com/gardener/machine-controller-manager/pkg/util/provider/machinecodes/codes +github.com/gardener/machine-controller-manager/pkg/util/provider/machineutils # github.com/gardener/machine-controller-manager-provider-aws v0.20.0 ## explicit; go 1.21 github.com/gardener/machine-controller-manager-provider-aws/pkg/aws/apis From 6ea649b85c9b6349c98849bf4a498b81f819c183 Mon Sep 17 00:00:00 2001 From: elankath Date: Wed, 29 Jan 2025 14:48:41 +0530 Subject: [PATCH 17/27] removed var shadowing, added TODO to godoc for util fns --- .../cloudprovider/mcm/mcm_cloud_provider.go | 15 ++++++++------- .../cloudprovider/mcm/mcm_manager.go | 2 +- 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/cluster-autoscaler/cloudprovider/mcm/mcm_cloud_provider.go b/cluster-autoscaler/cloudprovider/mcm/mcm_cloud_provider.go index 922549d95642..e661e8adec5a 100644 --- a/cluster-autoscaler/cloudprovider/mcm/mcm_cloud_provider.go +++ b/cluster-autoscaler/cloudprovider/mcm/mcm_cloud_provider.go @@ -135,17 +135,17 @@ func (mcm *mcmCloudProvider) NodeGroupForNode(node *apiv1.Node) (cloudprovider.N return nil, nil } - machineInfo, err := mcm.mcmManager.GetMachineInfo(node) + mInfo, err := mcm.mcmManager.GetMachineInfo(node) if err != nil { return nil, err } - if machineInfo == nil { + if mInfo == nil { klog.V(4).Infof("Skipped node %v, it's either been removed or it's not managed by this controller", node.Spec.ProviderID) return nil, nil } - md, err := mcm.mcmManager.GetNodeGroupImpl(machineInfo.Key) + md, err := mcm.mcmManager.GetNodeGroupImpl(mInfo.Key) if err != nil { return nil, err } @@ -391,17 +391,17 @@ func (ngImpl *NodeGroupImpl) DeleteNodes(nodes []*apiv1.Node) error { } var toDeleteMachineInfos []machineInfo for _, node := range nodes { - belongs, machineInfo, err := ngImpl.Belongs(node) + belongs, mInfo, err := ngImpl.Belongs(node) if err != nil { return err } else if !belongs { return fmt.Errorf("%s belongs to a different MachineDeployment than %q", node.Name, ngImpl.Name) } - if machineInfo.FailedOrTerminating { - klog.V(3).Infof("for NodeGroup %q, Machine %q is already marked as terminating - skipping deletion", ngImpl.Name, machineInfo.Key.Name) + if mInfo.FailedOrTerminating { + klog.V(3).Infof("for NodeGroup %q, Machine %q is already marked as terminating - skipping deletion", ngImpl.Name, mInfo.Key.Name) continue } - toDeleteMachineInfos = append(toDeleteMachineInfos, *machineInfo) + toDeleteMachineInfos = append(toDeleteMachineInfos, *mInfo) } return ngImpl.deleteMachines(toDeleteMachineInfos) } @@ -564,6 +564,7 @@ func (ngImpl *NodeGroupImpl) AtomicIncreaseSize(delta int) error { } // getMachineNamesTriggeredForDeletion returns the set of machine names contained within the machineutils.TriggerDeletionByMCM annotation on the given MachineDeployment +// TODO: Move to using MCM annotations.GetMachineNamesTriggeredForDeletion after MCM release. 
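(Editor's aside on the helper below and its counterpart createMachinesTriggeredForDeletionAnnotValue — a sketch of the annotation round trip using only names defined in this series; the literal values are hypothetical:)

	already := getMachineNamesTriggeredForDeletion(md) // e.g. ["machine-1"]
	merged := mergeStringSlicesUnique(already, []string{"machine-1", "machine-2"})
	// merged == ["machine-1", "machine-2"]: sorted, duplicates dropped, so a repeated
	// DeleteNodes call for the same machine leaves the annotation value unchanged
	md.Annotations[machineutils.TriggerDeletionByMCM] = createMachinesTriggeredForDeletionAnnotValue(merged)
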
func getMachineNamesTriggeredForDeletion(mcd *v1alpha1.MachineDeployment) []string { if mcd.Annotations == nil || mcd.Annotations[machineutils.TriggerDeletionByMCM] == "" { return nil diff --git a/cluster-autoscaler/cloudprovider/mcm/mcm_manager.go b/cluster-autoscaler/cloudprovider/mcm/mcm_manager.go index 28175a2c6ee9..cebf0eb2b4a3 100644 --- a/cluster-autoscaler/cloudprovider/mcm/mcm_manager.go +++ b/cluster-autoscaler/cloudprovider/mcm/mcm_manager.go @@ -1064,9 +1064,9 @@ func buildNodeGroupImpl(mcmManager *McmManager, minSize int, maxSize int, namesp } // isMachineFailedOrTerminating returns true if machine is already being terminated or considered for termination by autoscaler. +// TODO: Move to MCM machineutils.IsMachineFailedOrTerminating after MCM release. func isMachineFailedOrTerminating(machine *v1alpha1.Machine) bool { if !machine.GetDeletionTimestamp().IsZero() || machine.Status.CurrentStatus.Phase == v1alpha1.MachineFailed { - klog.Infof("Machine %q is already being terminated or in a failed phase, and hence skipping the deletion", machine.Name) return true } return false From f890502331ee0df33e71b1392270cbbafe85beb9 Mon Sep 17 00:00:00 2001 From: elankath Date: Thu, 30 Jan 2025 22:35:46 +0530 Subject: [PATCH 18/27] corr var names, clear mutex acquire, releasing logs --- .../cloudprovider/mcm/mcm_cloud_provider.go | 56 ++++++++++--------- .../cloudprovider/mcm/mcm_manager.go | 12 ++-- 2 files changed, 36 insertions(+), 32 deletions(-) diff --git a/cluster-autoscaler/cloudprovider/mcm/mcm_cloud_provider.go b/cluster-autoscaler/cloudprovider/mcm/mcm_cloud_provider.go index e661e8adec5a..702af0a73c28 100644 --- a/cluster-autoscaler/cloudprovider/mcm/mcm_cloud_provider.go +++ b/cluster-autoscaler/cloudprovider/mcm/mcm_cloud_provider.go @@ -331,8 +331,8 @@ func (ngImpl *NodeGroupImpl) Refresh() error { if err != nil { return err } - toDeleteMachineNames := getMachineNamesTriggeredForDeletion(mcd) - if len(toDeleteMachineNames) == 0 { + toBeDeletedMachineNames := getMachineNamesTriggeredForDeletion(mcd) + if len(toBeDeletedMachineNames) == 0 { return nil } machinesOfNodeGroup, err := ngImpl.mcmManager.getMachinesForMachineDeployment(ngImpl.Name) @@ -340,17 +340,17 @@ func (ngImpl *NodeGroupImpl) Refresh() error { klog.Errorf("NodeGroup.Refresh() of %q failed to get machines for MachineDeployment due to: %v", ngImpl.Name, err) return fmt.Errorf("failed refresh of NodeGroup %q due to: %v", ngImpl.Name, err) } - toDeleteMachines := filterMachinesMatchingNames(machinesOfNodeGroup, sets.New(toDeleteMachineNames...)) - if len(toDeleteMachines) == 0 { - klog.Warningf("NodeGroup.Refresh() of %q could not find Machine objects for toDeleteMachineNames %q", ngImpl.Name, toDeleteMachineNames) + toBeDeletedMachines := filterMachinesMatchingNames(machinesOfNodeGroup, sets.New(toBeDeletedMachineNames...)) + if len(toBeDeletedMachines) == 0 { + klog.Warningf("NodeGroup.Refresh() of %q could not find Machine objects for toBeDeletedMachineNames %q", ngImpl.Name, toBeDeletedMachineNames) return nil } - toDeleteNodeNames := getNodeNamesFromMachines(toDeleteMachines) - if len(toDeleteNodeNames) == 0 { - klog.Warningf("NodeGroup.Refresh() of %q could not find toDeleteNodeNames for toDeleteMachineNames %q of MachineDeployment", ngImpl.Name, toDeleteMachineNames) + toBeDeletedNodeNames := getNodeNamesFromMachines(toBeDeletedMachines) + if len(toBeDeletedNodeNames) == 0 { + klog.Warningf("NodeGroup.Refresh() of %q could not find toBeDeletedNodeNames for toBeDeletedMachineNames %q of MachineDeployment", 
ngImpl.Name, toBeDeletedMachineNames) return nil } - err = ngImpl.mcmManager.cordonNodes(toDeleteNodeNames) + err = ngImpl.mcmManager.cordonNodes(toBeDeletedNodeNames) if err != nil { // we do not return error since we don't want this to block CA operation. This is best-effort. klog.Warningf("NodeGroup.Refresh() of %q ran into error cordoning nodes: %v", ngImpl.Name, err) @@ -389,7 +389,7 @@ func (ngImpl *NodeGroupImpl) DeleteNodes(nodes []*apiv1.Node) error { if int(size) <= ngImpl.MinSize() { return fmt.Errorf("min size reached, nodes will not be deleted") } - var toDeleteMachineInfos []machineInfo + var toBeDeletedMachineInfos []machineInfo for _, node := range nodes { belongs, mInfo, err := ngImpl.Belongs(node) if err != nil { @@ -401,18 +401,25 @@ func (ngImpl *NodeGroupImpl) DeleteNodes(nodes []*apiv1.Node) error { klog.V(3).Infof("for NodeGroup %q, Machine %q is already marked as terminating - skipping deletion", ngImpl.Name, mInfo.Key.Name) continue } - toDeleteMachineInfos = append(toDeleteMachineInfos, *mInfo) + toBeDeletedMachineInfos = append(toBeDeletedMachineInfos, *mInfo) } - return ngImpl.deleteMachines(toDeleteMachineInfos) + return ngImpl.deleteMachines(toBeDeletedMachineInfos) } -// deleteMachines annotates the corresponding MachineDeployment with machine names of toDeleteMachineInfos, reduces the desired replicas of the corresponding MachineDeployment and cordons corresponding nodes belonging to toDeleteMachineInfos -func (ngImpl *NodeGroupImpl) deleteMachines(toDeleteMachineInfos []machineInfo) error { - if len(toDeleteMachineInfos) == 0 { +// deleteMachines annotates the corresponding MachineDeployment with machine names of toBeDeletedMachineInfos, reduces the desired replicas of the corresponding MachineDeployment and cordons corresponding nodes belonging to toBeDeletedMachineInfos +func (ngImpl *NodeGroupImpl) deleteMachines(toBeDeletedMachineInfos []machineInfo) error { + if len(toBeDeletedMachineInfos) == 0 { return nil } - release := ngImpl.AcquireScalingMutex("deleteMachines") + var toBeDeletedMachineNames, toBeDeletedNodeNames []string + for _, mInfo := range toBeDeletedMachineInfos { + toBeDeletedMachineNames = append(toBeDeletedMachineNames, mInfo.Key.Name) + toBeDeletedNodeNames = append(toBeDeletedNodeNames, mInfo.NodeName) + } + + release := ngImpl.AcquireScalingMutex(fmt.Sprintf("deleteMachines for %q", toBeDeletedMachineNames)) defer release() + // get the machine deployment and return if rolling update is not finished md, err := ngImpl.mcmManager.GetMachineDeploymentObject(ngImpl.Name) if err != nil { @@ -422,21 +429,15 @@ func (ngImpl *NodeGroupImpl) deleteMachines(toDeleteMachineInfos []machineInfo) return fmt.Errorf("MachineDeployment %s is under rolling update , cannot reduce replica count", ngImpl.Name) } - var toDeleteMachineNames, toDeleteNodeNames []string - for _, machineInfo := range toDeleteMachineInfos { - toDeleteMachineNames = append(toDeleteMachineNames, machineInfo.Key.Name) - toDeleteNodeNames = append(toDeleteNodeNames, machineInfo.NodeName) - } - // Trying to update the machineDeployment till the deadline err = ngImpl.mcmManager.retry(func(ctx context.Context) (bool, error) { - return ngImpl.mcmManager.scaleDownMachineDeployment(ctx, ngImpl.Name, toDeleteMachineNames) + return ngImpl.mcmManager.scaleDownMachineDeployment(ctx, ngImpl.Name, toBeDeletedMachineNames) }, "MachineDeployment", "update", ngImpl.Name) if err != nil { - klog.Errorf("Unable to scale down MachineDeployment %s by %d and delete machines %q due to: %v", ngImpl.Name, 
len(toDeleteMachineNames), toDeleteMachineNames, err) + klog.Errorf("Unable to scale down MachineDeployment %s by %d and delete machines %q due to: %v", ngImpl.Name, len(toBeDeletedMachineNames), toBeDeletedMachineNames, err) return fmt.Errorf("for NodeGroup %q, cannot scale down due to: %w", ngImpl.Name, err) } - err = ngImpl.mcmManager.cordonNodes(toDeleteNodeNames) + err = ngImpl.mcmManager.cordonNodes(toBeDeletedNodeNames) if err != nil { // Do not return error as cordoning is best-effort klog.Warningf("NodeGroup.deleteMachines() of %q ran into error cordoning nodes: %v", ngImpl.Name, err) @@ -446,10 +447,10 @@ func (ngImpl *NodeGroupImpl) deleteMachines(toDeleteMachineInfos []machineInfo) // AcquireScalingMutex acquires the scalingMutex associated with this NodeGroup and returns a function that releases the scalingMutex that is expected to be deferred by the caller. func (ngImpl *NodeGroupImpl) AcquireScalingMutex(operation string) (releaseFn func()) { - klog.V(3).Infof("%s is acquired scalingMutex for NodeGroup %q", operation, ngImpl.Name) ngImpl.scalingMutex.Lock() - klog.V(3).Infof("%s has acquired scalingMutex for %q", operation, ngImpl.Name) + klog.V(3).Infof("%s has acquired scalingMutex of NodeGroup %q", operation, ngImpl.Name) releaseFn = func() { + klog.V(3).Infof("%s is releasing scalingMutex of NodeGroup %q", operation, ngImpl.Name) ngImpl.scalingMutex.Unlock() } return @@ -572,6 +573,7 @@ func getMachineNamesTriggeredForDeletion(mcd *v1alpha1.MachineDeployment) []stri return strings.Split(mcd.Annotations[machineutils.TriggerDeletionByMCM], ",") } +// TODO: Move to using MCM annotations.CreateMachinesTriggeredForDeletionAnnotValue after MCM release func createMachinesTriggeredForDeletionAnnotValue(machineNames []string) string { return strings.Join(machineNames, ",") } diff --git a/cluster-autoscaler/cloudprovider/mcm/mcm_manager.go b/cluster-autoscaler/cloudprovider/mcm/mcm_manager.go index cebf0eb2b4a3..dd14f06df023 100644 --- a/cluster-autoscaler/cloudprovider/mcm/mcm_manager.go +++ b/cluster-autoscaler/cloudprovider/mcm/mcm_manager.go @@ -494,13 +494,13 @@ func (m *McmManager) updateAnnotationOnMachine(ctx context.Context, mcName strin // scaleDownMachineDeployment scales down the MachineDeployment for given name by the length of toDeleteMachineNames // It also updates the machines-marked-by-ca-for-deletion annotation on the machine deployment with the list of toDeleteMachineNames // NOTE: Callers MUST take the NodeGroup scalingMutex before invoking this method. 
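// Illustrative caller pattern (a sketch assuming the names used by deleteMachines above):
//
//	release := ngImpl.AcquireScalingMutex(fmt.Sprintf("deleteMachines for %q", toBeDeletedMachineNames))
//	defer release()
//	err := ngImpl.mcmManager.retry(func(ctx context.Context) (bool, error) {
//		return ngImpl.mcmManager.scaleDownMachineDeployment(ctx, ngImpl.Name, toBeDeletedMachineNames)
//	}, "MachineDeployment", "update", ngImpl.Name)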
-func (m *McmManager) scaleDownMachineDeployment(ctx context.Context, mdName string, toDeleteMachineNames []string) (bool, error) { +func (m *McmManager) scaleDownMachineDeployment(ctx context.Context, mdName string, toBeDeletedMachineNames []string) (bool, error) { md, err := m.GetMachineDeploymentObject(mdName) if err != nil { return true, err } - scaleDownAmount := len(toDeleteMachineNames) + scaleDownAmount := len(toBeDeletedMachineNames) expectedReplicas := md.Spec.Replicas - int32(scaleDownAmount) if expectedReplicas == md.Spec.Replicas { klog.Infof("MachineDeployment %q is already set to %d, skipping the update", md.Name, expectedReplicas) @@ -516,9 +516,11 @@ func (m *McmManager) scaleDownMachineDeployment(ctx context.Context, mdName stri mdCopy.Annotations = make(map[string]string) } alreadyMarkedMachineNames := getMachineNamesTriggeredForDeletion(md) - toDeleteMachineNames = mergeStringSlicesUnique(alreadyMarkedMachineNames, toDeleteMachineNames) - markedAnnotValue := createMachinesTriggeredForDeletionAnnotValue(toDeleteMachineNames) - mdCopy.Annotations[machineutils.TriggerDeletionByMCM] = markedAnnotValue + toBeDeletedMachineNames = mergeStringSlicesUnique(alreadyMarkedMachineNames, toBeDeletedMachineNames) + triggerDeletionAnnotValue := createMachinesTriggeredForDeletionAnnotValue(toBeDeletedMachineNames) + if mdCopy.Annotations[machineutils.TriggerDeletionByMCM] != triggerDeletionAnnotValue { + mdCopy.Annotations[machineutils.TriggerDeletionByMCM] = triggerDeletionAnnotValue + } _, err = m.machineClient.MachineDeployments(mdCopy.Namespace).Update(ctx, mdCopy, metav1.UpdateOptions{}) if err != nil { return true, err From 9d1d313b466a90ad7525a4fc14c41373add46b5e Mon Sep 17 00:00:00 2001 From: elankath Date: Thu, 30 Jan 2025 23:05:32 +0530 Subject: [PATCH 19/27] ensured IncreaseSize/DecreaseTargetSize logged delta in mutex acquire/release log --- cluster-autoscaler/cloudprovider/mcm/mcm_cloud_provider.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cluster-autoscaler/cloudprovider/mcm/mcm_cloud_provider.go b/cluster-autoscaler/cloudprovider/mcm/mcm_cloud_provider.go index 702af0a73c28..6bfc03a3492b 100644 --- a/cluster-autoscaler/cloudprovider/mcm/mcm_cloud_provider.go +++ b/cluster-autoscaler/cloudprovider/mcm/mcm_cloud_provider.go @@ -284,7 +284,7 @@ func (ngImpl *NodeGroupImpl) IncreaseSize(delta int) error { if delta <= 0 { return fmt.Errorf("size increase must be positive") } - release := ngImpl.AcquireScalingMutex("IncreaseSize") + release := ngImpl.AcquireScalingMutex(fmt.Sprintf("IncreaseSize by #%d", delta)) defer release() size, err := ngImpl.mcmManager.GetMachineDeploymentSize(ngImpl.Name) if err != nil { @@ -309,7 +309,7 @@ func (ngImpl *NodeGroupImpl) DecreaseTargetSize(delta int) error { if delta >= 0 { return fmt.Errorf("size decrease size must be negative") } - release := ngImpl.AcquireScalingMutex("DecreaseTargetSize") + release := ngImpl.AcquireScalingMutex(fmt.Sprintf("DecreaseTargetSize by #%d", delta)) defer release() size, err := ngImpl.mcmManager.GetMachineDeploymentSize(ngImpl.Name) if err != nil { From b3f916733629b93ba28f1aa1ff896d8713adafa0 Mon Sep 17 00:00:00 2001 From: elankath Date: Mon, 3 Feb 2025 16:20:24 +0530 Subject: [PATCH 20/27] review comments addressed: fixed unit test, adjusted log, removed redundant fn --- .../cloudprovider/mcm/mcm_cloud_provider.go | 18 +++--------------- .../mcm/mcm_cloud_provider_test.go | 1 - .../cloudprovider/mcm/mcm_manager.go | 16 +++++++--------- .../cloudprovider/mcm/test_utils.go | 1 + 4 
files changed, 11 insertions(+), 25 deletions(-) diff --git a/cluster-autoscaler/cloudprovider/mcm/mcm_cloud_provider.go b/cluster-autoscaler/cloudprovider/mcm/mcm_cloud_provider.go index 6bfc03a3492b..8265015cd6d9 100644 --- a/cluster-autoscaler/cloudprovider/mcm/mcm_cloud_provider.go +++ b/cluster-autoscaler/cloudprovider/mcm/mcm_cloud_provider.go @@ -135,7 +135,7 @@ func (mcm *mcmCloudProvider) NodeGroupForNode(node *apiv1.Node) (cloudprovider.N return nil, nil } - mInfo, err := mcm.mcmManager.GetMachineInfo(node) + mInfo, err := mcm.mcmManager.getMachineInfo(node) if err != nil { return nil, err } @@ -360,7 +360,7 @@ func (ngImpl *NodeGroupImpl) Refresh() error { // Belongs checks if the given node belongs to this NodeGroup and also returns its MachineInfo for its corresponding Machine func (ngImpl *NodeGroupImpl) Belongs(node *apiv1.Node) (belongs bool, mInfo *machineInfo, err error) { - mInfo, err = ngImpl.mcmManager.GetMachineInfo(node) + mInfo, err = ngImpl.mcmManager.getMachineInfo(node) if err != nil || mInfo == nil { return } @@ -575,18 +575,6 @@ func getMachineNamesTriggeredForDeletion(mcd *v1alpha1.MachineDeployment) []stri // TODO: Move to using MCM annotations.CreateMachinesTriggeredForDeletionAnnotValue after MCM release func createMachinesTriggeredForDeletionAnnotValue(machineNames []string) string { + slices.Sort(machineNames) return strings.Join(machineNames, ",") } - -func mergeStringSlicesUnique(slice1, slice2 []string) []string { - seen := make(map[string]struct{}, len(slice1)+len(slice2)) - for _, s := range slices.Concat(slice1, slice2) { - seen[s] = struct{}{} - } - concatenated := make([]string, 0, len(seen)) // TODO: Change to slices.Collect(maps.Keys(seen)) from Go 1.23 - for s := range seen { - concatenated = append(concatenated, s) - } - slices.Sort(concatenated) - return concatenated -} diff --git a/cluster-autoscaler/cloudprovider/mcm/mcm_cloud_provider_test.go b/cluster-autoscaler/cloudprovider/mcm/mcm_cloud_provider_test.go index 325347196209..0e13ec775775 100644 --- a/cluster-autoscaler/cloudprovider/mcm/mcm_cloud_provider_test.go +++ b/cluster-autoscaler/cloudprovider/mcm/mcm_cloud_provider_test.go @@ -171,7 +171,6 @@ func TestDeleteNodes(t *testing.T) { }, }, action{node: newNodes(1, "fakeID")[0]}, - //return fmt.Errorf("for NodeGroup %q, cannot scale down due to: %v", ngImpl.Name, toDeleteMachineNames, err) expect{ mdName: "machinedeployment-1", mdReplicas: 2, diff --git a/cluster-autoscaler/cloudprovider/mcm/mcm_manager.go b/cluster-autoscaler/cloudprovider/mcm/mcm_manager.go index dd14f06df023..b1766463a1d9 100644 --- a/cluster-autoscaler/cloudprovider/mcm/mcm_manager.go +++ b/cluster-autoscaler/cloudprovider/mcm/mcm_manager.go @@ -510,14 +510,15 @@ func (m *McmManager) scaleDownMachineDeployment(ctx context.Context, mdName stri return false, fmt.Errorf("cannot delete machines in MachineDeployment %s, expected decrease in replicas %d is more than current replicas %d", mdName, scaleDownAmount, md.Spec.Replicas) } + alreadyMarkedMachineNames := getMachineNamesTriggeredForDeletion(md) + toBeMarkedMachineNamesSet := sets.NewString(toBeDeletedMachineNames...).Insert(alreadyMarkedMachineNames...) 
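+	// Union semantics keep this merge idempotent: names already present in the
+	// TriggerDeletionByMCM annotation are preserved, and sets.String.List() returns the
+	// union sorted, so the annotation value stays stable across repeated calls.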
+ triggerDeletionAnnotValue := createMachinesTriggeredForDeletionAnnotValue(toBeMarkedMachineNamesSet.List()) + mdCopy := md.DeepCopy() mdCopy.Spec.Replicas = expectedReplicas if mdCopy.Annotations == nil { mdCopy.Annotations = make(map[string]string) } - alreadyMarkedMachineNames := getMachineNamesTriggeredForDeletion(md) - toBeDeletedMachineNames = mergeStringSlicesUnique(alreadyMarkedMachineNames, toBeDeletedMachineNames) - triggerDeletionAnnotValue := createMachinesTriggeredForDeletionAnnotValue(toBeDeletedMachineNames) if mdCopy.Annotations[machineutils.TriggerDeletionByMCM] != triggerDeletionAnnotValue { mdCopy.Annotations[machineutils.TriggerDeletionByMCM] = triggerDeletionAnnotValue } @@ -525,7 +526,7 @@ func (m *McmManager) scaleDownMachineDeployment(ctx context.Context, mdName stri if err != nil { return true, err } - klog.V(2).Infof("MachineDeployment %s size decreased to %d ", mdCopy.Name, mdCopy.Spec.Replicas) + klog.V(2).Infof("MachineDeployment %s size decreased to %d, triggerDeletionAnnotValue: %q", mdCopy.Name, mdCopy.Spec.Replicas, triggerDeletionAnnotValue) return false, nil } @@ -947,9 +948,6 @@ func (m *McmManager) cordonNodes(nodeNames []string) error { if len(nodeNames) == 0 { return nil } - if m.nodeInterface == nil { - return nil - } ctx, cancelFn := context.WithDeadline(context.Background(), time.Now().Add(m.maxRetryTimeout)) defer cancelFn() var errs []error @@ -977,8 +975,8 @@ func (m *McmManager) cordonNodes(nodeNames []string) error { return nil } -// GetMachineInfo extracts the machine Key from the given node's providerID if found and checks whether it is failed or terminating and returns the MachineInfo or an error -func (m *McmManager) GetMachineInfo(node *apiv1.Node) (*machineInfo, error) { +// getMachineInfo extracts the machine Key from the given node's providerID if found and checks whether it is failed or terminating and returns the MachineInfo or an error +func (m *McmManager) getMachineInfo(node *apiv1.Node) (*machineInfo, error) { machines, err := m.machineLister.Machines(m.namespace).List(labels.Everything()) if err != nil { return nil, fmt.Errorf("cannot list machines in namespace %q due to: %s", m.namespace, err) diff --git a/cluster-autoscaler/cloudprovider/mcm/test_utils.go b/cluster-autoscaler/cloudprovider/mcm/test_utils.go index 188eb3db70f5..88d11066acdb 100644 --- a/cluster-autoscaler/cloudprovider/mcm/test_utils.go +++ b/cluster-autoscaler/cloudprovider/mcm/test_utils.go @@ -290,6 +290,7 @@ func createMcmManager( machineLister: machines.Lister(), machineClassLister: machineClasses.Lister(), nodeLister: nodes.Lister(), + nodeInterface: fakeTargetCoreClient.CoreV1().Nodes(), maxRetryTimeout: 5 * time.Second, retryInterval: 1 * time.Second, } From 7d1d0d7765feb354e752d48fae7aac4220d8cdc5 Mon Sep 17 00:00:00 2001 From: elankath Date: Mon, 3 Feb 2025 16:44:59 +0530 Subject: [PATCH 21/27] all code review comments addressed --- .../cloudprovider/mcm/mcm_cloud_provider.go | 60 +++++++++---------- .../mcm/mcm_cloud_provider_test.go | 8 +-- .../cloudprovider/mcm/mcm_manager.go | 20 +++---- .../cloudprovider/mcm/test_utils.go | 4 +- 4 files changed, 46 insertions(+), 46 deletions(-) diff --git a/cluster-autoscaler/cloudprovider/mcm/mcm_cloud_provider.go b/cluster-autoscaler/cloudprovider/mcm/mcm_cloud_provider.go index 8265015cd6d9..61cf708708ad 100644 --- a/cluster-autoscaler/cloudprovider/mcm/mcm_cloud_provider.go +++ b/cluster-autoscaler/cloudprovider/mcm/mcm_cloud_provider.go @@ -106,7 +106,7 @@ func buildStaticallyDiscoveringProvider(mcmManager 
*McmManager, resourceLimiter return mcm, nil } -// Cleanup stops the go routine that is handling the current view of the NodeGroupImpl in the form of a cache +// Cleanup stops the go routine that is handling the current view of the nodeGroup in the form of a cache func (mcm *mcmCloudProvider) Cleanup() error { mcm.mcmManager.Cleanup() return nil @@ -119,11 +119,11 @@ func (mcm *mcmCloudProvider) Name() string { // NodeGroups returns all node groups configured for this cloud provider. func (mcm *mcmCloudProvider) NodeGroups() []cloudprovider.NodeGroup { result := make([]cloudprovider.NodeGroup, 0, len(mcm.mcmManager.nodeGroups)) - for _, nodeGroup := range mcm.mcmManager.nodeGroups { - if nodeGroup.maxSize == 0 { + for _, ng := range mcm.mcmManager.nodeGroups { + if ng.maxSize == 0 { continue } - result = append(result, nodeGroup) + result = append(result, ng) } return result } @@ -145,19 +145,19 @@ func (mcm *mcmCloudProvider) NodeGroupForNode(node *apiv1.Node) (cloudprovider.N return nil, nil } - md, err := mcm.mcmManager.GetNodeGroupImpl(mInfo.Key) + ng, err := mcm.mcmManager.getNodeGroup(mInfo.Key) if err != nil { return nil, err } - key := types.NamespacedName{Namespace: md.Namespace, Name: md.Name} + key := types.NamespacedName{Namespace: ng.Namespace, Name: ng.Name} _, isManaged := mcm.mcmManager.nodeGroups[key] if !isManaged { klog.V(4).Infof("Skipped node %v, it's not managed by this controller", node.Spec.ProviderID) return nil, nil } - return md, nil + return ng, nil } // HasInstance returns whether a given node has a corresponding instance in this cloud provider @@ -227,8 +227,8 @@ func (mcm *mcmCloudProvider) GetNodeGpuConfig(*apiv1.Node) *cloudprovider.GpuCon return nil } -// NodeGroupImpl implements NodeGroup interface. -type NodeGroupImpl struct { +// nodeGroup implements NodeGroup interface. +type nodeGroup struct { types.NamespacedName mcmManager *McmManager @@ -239,18 +239,18 @@ type NodeGroupImpl struct { } // MaxSize returns maximum size of the node group. -func (ngImpl *NodeGroupImpl) MaxSize() int { +func (ngImpl *nodeGroup) MaxSize() int { return ngImpl.maxSize } // MinSize returns minimum size of the node group. -func (ngImpl *NodeGroupImpl) MinSize() int { +func (ngImpl *nodeGroup) MinSize() int { return ngImpl.minSize } // TargetSize returns the current TARGET size of the node group. It is possible that the // number is different from the number of nodes registered in Kubernetes. -func (ngImpl *NodeGroupImpl) TargetSize() (int, error) { +func (ngImpl *nodeGroup) TargetSize() (int, error) { size, err := ngImpl.mcmManager.GetMachineDeploymentSize(ngImpl.Name) return int(size), err } @@ -258,28 +258,28 @@ func (ngImpl *NodeGroupImpl) TargetSize() (int, error) { // Exist checks if the node group really exists on the cloud provider side. Allows to tell the // theoretical node group from the real one. // TODO: Implement this to check if machine-deployment really exists. -func (ngImpl *NodeGroupImpl) Exist() bool { +func (ngImpl *nodeGroup) Exist() bool { return true } // Create creates the node group on the cloud provider side. -func (ngImpl *NodeGroupImpl) Create() (cloudprovider.NodeGroup, error) { +func (ngImpl *nodeGroup) Create() (cloudprovider.NodeGroup, error) { return nil, cloudprovider.ErrAlreadyExist } // Autoprovisioned returns true if the node group is autoprovisioned. -func (ngImpl *NodeGroupImpl) Autoprovisioned() bool { +func (ngImpl *nodeGroup) Autoprovisioned() bool { return false } // Delete deletes the node group on the cloud provider side. 
// This will be executed only for autoprovisioned node groups, once their size drops to 0. -func (ngImpl *NodeGroupImpl) Delete() error { +func (ngImpl *nodeGroup) Delete() error { return cloudprovider.ErrNotImplemented } // IncreaseSize of the Machinedeployment. -func (ngImpl *NodeGroupImpl) IncreaseSize(delta int) error { +func (ngImpl *nodeGroup) IncreaseSize(delta int) error { klog.V(0).Infof("Received request to increase size of machine deployment %s by %d", ngImpl.Name, delta) if delta <= 0 { return fmt.Errorf("size increase must be positive") @@ -304,7 +304,7 @@ func (ngImpl *NodeGroupImpl) IncreaseSize(delta int) error { // request for new nodes that have not been yet fulfilled. Delta should be negative. // It is assumed that cloud provider will not delete the existing nodes if the size // when there is an option to just decrease the target. -func (ngImpl *NodeGroupImpl) DecreaseTargetSize(delta int) error { +func (ngImpl *nodeGroup) DecreaseTargetSize(delta int) error { klog.V(0).Infof("Received request to decrease target size of machine deployment %s by %d", ngImpl.Name, delta) if delta >= 0 { return fmt.Errorf("size decrease size must be negative") @@ -326,7 +326,7 @@ func (ngImpl *NodeGroupImpl) DecreaseTargetSize(delta int) error { } // Refresh cordons the Nodes corresponding to the machines that have been marked for deletion in the TriggerDeletionByMCM annotation on the MachineDeployment -func (ngImpl *NodeGroupImpl) Refresh() error { +func (ngImpl *nodeGroup) Refresh() error { mcd, err := ngImpl.mcmManager.GetMachineDeploymentObject(ngImpl.Name) if err != nil { return err @@ -359,12 +359,12 @@ func (ngImpl *NodeGroupImpl) Refresh() error { } // Belongs checks if the given node belongs to this NodeGroup and also returns its MachineInfo for its corresponding Machine -func (ngImpl *NodeGroupImpl) Belongs(node *apiv1.Node) (belongs bool, mInfo *machineInfo, err error) { +func (ngImpl *nodeGroup) Belongs(node *apiv1.Node) (belongs bool, mInfo *machineInfo, err error) { mInfo, err = ngImpl.mcmManager.getMachineInfo(node) if err != nil || mInfo == nil { return } - targetMd, err := ngImpl.mcmManager.GetNodeGroupImpl(mInfo.Key) + targetMd, err := ngImpl.mcmManager.getNodeGroup(mInfo.Key) if err != nil { return } @@ -380,7 +380,7 @@ func (ngImpl *NodeGroupImpl) Belongs(node *apiv1.Node) (belongs bool, mInfo *mac // DeleteNodes deletes the nodes from the group. It is expected that this method will not be called // for nodes which are not part of ANY machine deployment. 
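// Illustrative usage (a sketch; ng and the node values are assumed to come from the caller):
//
//	if err := ng.DeleteNodes([]*apiv1.Node{n1, n2}); err != nil {
//		klog.Errorf("scale-down of NodeGroup failed: %v", err)
//	}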
-func (ngImpl *NodeGroupImpl) DeleteNodes(nodes []*apiv1.Node) error { +func (ngImpl *nodeGroup) DeleteNodes(nodes []*apiv1.Node) error { klog.V(0).Infof("for NodeGroup %q, Received request to delete nodes:- %v", ngImpl.Name, getNodeNames(nodes)) size, err := ngImpl.mcmManager.GetMachineDeploymentSize(ngImpl.Name) if err != nil { @@ -407,7 +407,7 @@ func (ngImpl *NodeGroupImpl) DeleteNodes(nodes []*apiv1.Node) error { } // deleteMachines annotates the corresponding MachineDeployment with machine names of toBeDeletedMachineInfos, reduces the desired replicas of the corresponding MachineDeployment and cordons corresponding nodes belonging to toBeDeletedMachineInfos -func (ngImpl *NodeGroupImpl) deleteMachines(toBeDeletedMachineInfos []machineInfo) error { +func (ngImpl *nodeGroup) deleteMachines(toBeDeletedMachineInfos []machineInfo) error { if len(toBeDeletedMachineInfos) == 0 { return nil } @@ -446,7 +446,7 @@ func (ngImpl *NodeGroupImpl) deleteMachines(toBeDeletedMachineInfos []machineInf } // AcquireScalingMutex acquires the scalingMutex associated with this NodeGroup and returns a function that releases the scalingMutex that is expected to be deferred by the caller. -func (ngImpl *NodeGroupImpl) AcquireScalingMutex(operation string) (releaseFn func()) { +func (ngImpl *nodeGroup) AcquireScalingMutex(operation string) (releaseFn func()) { ngImpl.scalingMutex.Lock() klog.V(3).Infof("%s has acquired scalingMutex of NodeGroup %q", operation, ngImpl.Name) releaseFn = func() { @@ -476,17 +476,17 @@ func getNodeNamesFromMachines(machines []*v1alpha1.Machine) []string { } // Id returns MachineDeployment id. -func (ngImpl *NodeGroupImpl) Id() string { +func (ngImpl *nodeGroup) Id() string { return ngImpl.Name } // Debug returns a debug string for the Asg. -func (ngImpl *NodeGroupImpl) Debug() string { +func (ngImpl *nodeGroup) Debug() string { return fmt.Sprintf("%s (%d:%d)", ngImpl.Id(), ngImpl.MinSize(), ngImpl.MaxSize()) } // Nodes returns a list of all nodes that belong to this node group. -func (ngImpl *NodeGroupImpl) Nodes() ([]cloudprovider.Instance, error) { +func (ngImpl *nodeGroup) Nodes() ([]cloudprovider.Instance, error) { instances, err := ngImpl.mcmManager.GetInstancesForMachineDeployment(ngImpl.Name) if err != nil { return nil, fmt.Errorf("failed to get the cloudprovider.Instance for machines backed by the MachineDeployment %q, error: %v", ngImpl.Name, err) @@ -506,7 +506,7 @@ func (ngImpl *NodeGroupImpl) Nodes() ([]cloudprovider.Instance, error) { // GetOptions returns NodeGroupAutoscalingOptions that should be used for this particular // NodeGroup. Returning a nil will result in using default options. // Implementation optional. -func (ngImpl *NodeGroupImpl) GetOptions(defaults config.NodeGroupAutoscalingOptions) (*config.NodeGroupAutoscalingOptions, error) { +func (ngImpl *nodeGroup) GetOptions(defaults config.NodeGroupAutoscalingOptions) (*config.NodeGroupAutoscalingOptions, error) { options := defaults mcdAnnotations, err := ngImpl.mcmManager.GetMachineDeploymentAnnotations(ngImpl.Name) if err != nil { @@ -542,7 +542,7 @@ func (ngImpl *NodeGroupImpl) GetOptions(defaults config.NodeGroupAutoscalingOpti } // TemplateNodeInfo returns a node template for this node group. 
-func (ngImpl *NodeGroupImpl) TemplateNodeInfo() (*schedulerframework.NodeInfo, error) { +func (ngImpl *nodeGroup) TemplateNodeInfo() (*schedulerframework.NodeInfo, error) { nodeTemplate, err := ngImpl.mcmManager.GetMachineDeploymentNodeTemplate(ngImpl.Name) if err != nil { @@ -560,7 +560,7 @@ func (ngImpl *NodeGroupImpl) TemplateNodeInfo() (*schedulerframework.NodeInfo, e } // AtomicIncreaseSize is not implemented. -func (ngImpl *NodeGroupImpl) AtomicIncreaseSize(delta int) error { +func (ngImpl *nodeGroup) AtomicIncreaseSize(delta int) error { return cloudprovider.ErrNotImplemented } diff --git a/cluster-autoscaler/cloudprovider/mcm/mcm_cloud_provider_test.go b/cluster-autoscaler/cloudprovider/mcm/mcm_cloud_provider_test.go index 0e13ec775775..079c65a04eae 100644 --- a/cluster-autoscaler/cloudprovider/mcm/mcm_cloud_provider_test.go +++ b/cluster-autoscaler/cloudprovider/mcm/mcm_cloud_provider_test.go @@ -287,7 +287,7 @@ func TestDeleteNodes(t *testing.T) { trackers.ControlMachine.SetFailAtFakeResourceActions(entry.setup.controlMachineFakeResourceActions) } - md, err := buildNodeGroupImplFromSpec(entry.setup.nodeGroups[0], m) + md, err := buildNodeGroupFromSpec(entry.setup.nodeGroups[0], m) g.Expect(err).To(BeNil()) err = md.DeleteNodes([]*corev1.Node{entry.action.node}) @@ -322,7 +322,7 @@ func TestIdempotencyOfDeleteNodes(t *testing.T) { m, trackers, hasSyncedCacheFns := createMcmManager(t, stop, testNamespace, setupObj.nodeGroups, controlMachineObjects, targetCoreObjects, nil) defer trackers.Stop() waitForCacheSync(t, stop, hasSyncedCacheFns) - md, err := buildNodeGroupImplFromSpec(setupObj.nodeGroups[0], m) + md, err := buildNodeGroupFromSpec(setupObj.nodeGroups[0], m) g.Expect(err).To(BeNil()) err = md.DeleteNodes(newNodes(1, "fakeID")) @@ -517,7 +517,7 @@ func TestNodes(t *testing.T) { trackers.ControlMachine.SetFailAtFakeResourceActions(entry.setup.controlMachineFakeResourceActions) } - md, err := buildNodeGroupImplFromSpec(entry.setup.nodeGroups[0], m) + md, err := buildNodeGroupFromSpec(entry.setup.nodeGroups[0], m) g.Expect(err).To(BeNil()) returnedInstances, err := md.Nodes() @@ -669,7 +669,7 @@ func TestGetOptions(t *testing.T) { defer trackers.Stop() waitForCacheSync(t, stop, hasSyncedCacheFns) - md, err := buildNodeGroupImplFromSpec(entry.setup.nodeGroups[0], m) + md, err := buildNodeGroupFromSpec(entry.setup.nodeGroups[0], m) g.Expect(err).To(BeNil()) options, err := md.GetOptions(ngAutoScalingOpDefaults) diff --git a/cluster-autoscaler/cloudprovider/mcm/mcm_manager.go b/cluster-autoscaler/cloudprovider/mcm/mcm_manager.go index b1766463a1d9..3ffd655d884f 100644 --- a/cluster-autoscaler/cloudprovider/mcm/mcm_manager.go +++ b/cluster-autoscaler/cloudprovider/mcm/mcm_manager.go @@ -131,7 +131,7 @@ type McmManager struct { namespace string interrupt chan struct{} discoveryOpts cloudprovider.NodeGroupDiscoveryOptions - nodeGroups map[types.NamespacedName]*NodeGroupImpl + nodeGroups map[types.NamespacedName]*nodeGroup deploymentLister v1appslister.DeploymentLister machineClient machineapi.MachineV1alpha1Interface machineDeploymentLister machinelisters.MachineDeploymentLister @@ -265,7 +265,7 @@ func createMCMManagerInternal(discoveryOpts cloudprovider.NodeGroupDiscoveryOpti m := &McmManager{ namespace: namespace, interrupt: make(chan struct{}), - nodeGroups: make(map[types.NamespacedName]*NodeGroupImpl), + nodeGroups: make(map[types.NamespacedName]*nodeGroup), deploymentLister: deploymentLister, machineClient: controlMachineClient, machineClassLister: machineClassLister, @@ 
-316,7 +316,7 @@ func (m *McmManager) generateMachineDeploymentMap() error { // addNodeGroup adds node group defined in string spec. Format: // minNodes:maxNodes:namespace.machineDeploymentName func (m *McmManager) addNodeGroup(spec string) error { - nodeGroup, err := buildNodeGroupImplFromSpec(spec, m) + nodeGroup, err := buildNodeGroupFromSpec(spec, m) if err != nil { return err } @@ -384,8 +384,8 @@ func CreateMcmManager(discoveryOpts cloudprovider.NodeGroupDiscoveryOptions) (*M return createMCMManagerInternal(discoveryOpts, defaultRetryInterval, defaultMaxRetryTimeout) } -// GetNodeGroupImpl returns the NodeGroupImpl for the given fully-qualified machine name. -func (m *McmManager) GetNodeGroupImpl(machineKey types.NamespacedName) (*NodeGroupImpl, error) { +// getNodeGroup returns the NodeGroup for the given fully-qualified machine name. +func (m *McmManager) getNodeGroup(machineKey types.NamespacedName) (*nodeGroup, error) { if machineKey.Name == "" { // Considering the possibility when Machine has been deleted but due to cached Node object it appears here. return nil, fmt.Errorf("node does not Exists") @@ -452,7 +452,7 @@ func (m *McmManager) GetMachineDeploymentSize(nodeGroupName string) (int64, erro } // SetMachineDeploymentSize sets the desired size for the backing MachineDeployment of the given nodeGroup. -func (m *McmManager) SetMachineDeploymentSize(ctx context.Context, nodeGroup *NodeGroupImpl, size int64) (bool, error) { +func (m *McmManager) SetMachineDeploymentSize(ctx context.Context, nodeGroup *nodeGroup, size int64) (bool, error) { md, err := m.GetMachineDeploymentObject(nodeGroup.Name) if err != nil { return true, err @@ -1039,19 +1039,19 @@ func buildGenericLabels(template *nodeTemplate, nodeName string) map[string]stri return result } -func buildNodeGroupImplFromSpec(value string, mcmManager *McmManager) (*NodeGroupImpl, error) { +func buildNodeGroupFromSpec(value string, mcmManager *McmManager) (*nodeGroup, error) { spec, err := dynamic.SpecFromString(value, true) if err != nil { return nil, fmt.Errorf("failed to parse node group spec: %v", err) } s := strings.Split(spec.Name, ".") Namespace, Name := s[0], s[1] - nodeGroup := buildNodeGroupImpl(mcmManager, spec.MinSize, spec.MaxSize, Namespace, Name) + nodeGroup := buildNodeGroup(mcmManager, spec.MinSize, spec.MaxSize, Namespace, Name) return nodeGroup, nil } -func buildNodeGroupImpl(mcmManager *McmManager, minSize int, maxSize int, namespace string, name string) *NodeGroupImpl { - return &NodeGroupImpl{ +func buildNodeGroup(mcmManager *McmManager, minSize int, maxSize int, namespace string, name string) *nodeGroup { + return &nodeGroup{ mcmManager: mcmManager, minSize: minSize, maxSize: maxSize, diff --git a/cluster-autoscaler/cloudprovider/mcm/test_utils.go b/cluster-autoscaler/cloudprovider/mcm/test_utils.go index 88d11066acdb..58c884625990 100644 --- a/cluster-autoscaler/cloudprovider/mcm/test_utils.go +++ b/cluster-autoscaler/cloudprovider/mcm/test_utils.go @@ -46,7 +46,7 @@ func newMachineDeployments( machineDeployment := &v1alpha1.MachineDeployment{ TypeMeta: metav1.TypeMeta{ APIVersion: "machine.sapcloud.io", - Kind: "NodeGroupImpl", + Kind: "MachineDeployment", }, ObjectMeta: metav1.ObjectMeta{ Name: fmt.Sprintf("machinedeployment-%d", i+1), @@ -282,7 +282,7 @@ func createMcmManager( discoveryOpts: cloudprovider.NodeGroupDiscoveryOptions{ NodeGroupSpecs: nodeGroups, }, - nodeGroups: make(map[types.NamespacedName]*NodeGroupImpl), + nodeGroups: make(map[types.NamespacedName]*nodeGroup), deploymentLister: 
appsControlSharedInformers.Deployments().Lister(), machineClient: fakeTypedMachineClient, machineDeploymentLister: machineDeployments.Lister(), From 194b65a43a154a26a708890e285ebaec65a5ef22 Mon Sep 17 00:00:00 2001 From: elankath Date: Mon, 3 Feb 2025 16:57:10 +0530 Subject: [PATCH 22/27] review feedback: unexport belongs, enforce interface --- .../cloudprovider/mcm/mcm_cloud_provider.go | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/cluster-autoscaler/cloudprovider/mcm/mcm_cloud_provider.go b/cluster-autoscaler/cloudprovider/mcm/mcm_cloud_provider.go index 61cf708708ad..7c6a1246caf4 100644 --- a/cluster-autoscaler/cloudprovider/mcm/mcm_cloud_provider.go +++ b/cluster-autoscaler/cloudprovider/mcm/mcm_cloud_provider.go @@ -74,6 +74,8 @@ type mcmCloudProvider struct { resourceLimiter *cloudprovider.ResourceLimiter } +var _ cloudprovider.CloudProvider = (*mcmCloudProvider)(nil) + // BuildMcmCloudProvider builds CloudProvider implementation for machine-controller-manager. func BuildMcmCloudProvider(mcmManager *McmManager, resourceLimiter *cloudprovider.ResourceLimiter) (cloudprovider.CloudProvider, error) { if mcmManager.discoveryOpts.StaticDiscoverySpecified() { @@ -238,6 +240,8 @@ type nodeGroup struct { maxSize int } +var _ cloudprovider.NodeGroup = (*nodeGroup)(nil) + // MaxSize returns maximum size of the node group. func (ngImpl *nodeGroup) MaxSize() int { return ngImpl.maxSize @@ -358,21 +362,21 @@ func (ngImpl *nodeGroup) Refresh() error { return nil } -// Belongs checks if the given node belongs to this NodeGroup and also returns its MachineInfo for its corresponding Machine -func (ngImpl *nodeGroup) Belongs(node *apiv1.Node) (belongs bool, mInfo *machineInfo, err error) { +// belongs checks if the given node belongs to this NodeGroup and also returns its MachineInfo for its corresponding Machine +func (ngImpl *nodeGroup) belongs(node *apiv1.Node) (belongs bool, mInfo *machineInfo, err error) { mInfo, err = ngImpl.mcmManager.getMachineInfo(node) if err != nil || mInfo == nil { return } - targetMd, err := ngImpl.mcmManager.getNodeGroup(mInfo.Key) + targetNg, err := ngImpl.mcmManager.getNodeGroup(mInfo.Key) if err != nil { return } - if targetMd == nil { + if targetNg == nil { err = fmt.Errorf("%s doesn't belong to a known MachineDeployment", node.Name) return } - if targetMd.Id() == ngImpl.Id() { + if targetNg.Id() == ngImpl.Id() { belongs = true } return @@ -391,7 +395,7 @@ func (ngImpl *nodeGroup) DeleteNodes(nodes []*apiv1.Node) error { } var toBeDeletedMachineInfos []machineInfo for _, node := range nodes { - belongs, mInfo, err := ngImpl.Belongs(node) + belongs, mInfo, err := ngImpl.belongs(node) if err != nil { return err } else if !belongs { From 136ea2780ad8f2f145b151f67391e47cea4fc2d8 Mon Sep 17 00:00:00 2001 From: elankath Date: Mon, 3 Feb 2025 20:29:25 +0530 Subject: [PATCH 23/27] addressed review comment, refactored, added test for computeScaleDownData --- .../cloudprovider/mcm/mcm_cloud_provider.go | 6 +- .../cloudprovider/mcm/mcm_manager.go | 71 +++++++++++----- .../cloudprovider/mcm/mcm_manager_test.go | 83 +++++++++++++++++++ 3 files changed, 135 insertions(+), 25 deletions(-) diff --git a/cluster-autoscaler/cloudprovider/mcm/mcm_cloud_provider.go b/cluster-autoscaler/cloudprovider/mcm/mcm_cloud_provider.go index 7c6a1246caf4..c9a64b5c41dd 100644 --- a/cluster-autoscaler/cloudprovider/mcm/mcm_cloud_provider.go +++ b/cluster-autoscaler/cloudprovider/mcm/mcm_cloud_provider.go @@ -341,8 +341,8 @@ func (ngImpl *nodeGroup) Refresh() error {
} machinesOfNodeGroup, err := ngImpl.mcmManager.getMachinesForMachineDeployment(ngImpl.Name) if err != nil { - klog.Errorf("NodeGroup.Refresh() of %q failed to get machines for MachineDeployment due to: %v", ngImpl.Name, err) - return fmt.Errorf("failed refresh of NodeGroup %q due to: %v", ngImpl.Name, err) + klog.Warningf("NodeGroup.Refresh() of %q failed to get machines for MachineDeployment due to: %v", ngImpl.Name, err) + return nil } toBeDeletedMachines := filterMachinesMatchingNames(machinesOfNodeGroup, sets.New(toBeDeletedMachineNames...)) if len(toBeDeletedMachines) == 0 { @@ -479,7 +479,7 @@ func getNodeNamesFromMachines(machines []*v1alpha1.Machine) []string { return nodeNames } -// Id returns MachineDeployment id. +// Id returns the NodeGroup name. func (ngImpl *nodeGroup) Id() string { return ngImpl.Name } diff --git a/cluster-autoscaler/cloudprovider/mcm/mcm_manager.go b/cluster-autoscaler/cloudprovider/mcm/mcm_manager.go index 3ffd655d884f..b45ff92244ca 100644 --- a/cluster-autoscaler/cloudprovider/mcm/mcm_manager.go +++ b/cluster-autoscaler/cloudprovider/mcm/mcm_manager.go @@ -169,6 +169,12 @@ type machineInfo struct { FailedOrTerminating bool } +type scaleDownData struct { + RevisedToBeDeletedNames sets.Set[string] + RevisedScaledownAmount int + RevisedMachineDeployment *v1alpha1.MachineDeployment +} + func init() { controlBurst = flag.Int("control-apiserver-burst", rest.DefaultBurst, "Throttling burst configuration for the client to control cluster's apiserver.") controlQPS = flag.Float64("control-apiserver-qps", float64(rest.DefaultQPS), "Throttling QPS configuration for the client to control cluster's apiserver.") @@ -491,8 +497,9 @@ func (m *McmManager) updateAnnotationOnMachine(ctx context.Context, mcName strin return true, err } -// scaleDownMachineDeployment scales down the MachineDeployment for given name by the length of toDeleteMachineNames -// It also updates the machines-marked-by-ca-for-deletion annotation on the machine deployment with the list of toDeleteMachineNames +// scaleDownMachineDeployment scales down the MachineDeployment for given name by the length of toBeDeletedMachineNames after removing machine names that +// are already marked for deletion in the machineutils.TriggerDeletionByMCM of the MachineDeployment. +// It then updates the machineutils.TriggerDeletionByMCM annotation with revised toBeDeletedMachineNames along with the replica count as an atomic operation. // NOTE: Callers MUST take the NodeGroup scalingMutex before invoking this method.
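// Worked example of the revised contract (values illustrative): with Spec.Replicas=3 and a
// TriggerDeletionByMCM annotation of "n1", a call with toBeDeletedMachineNames ["n1","n2"]
// counts only "n2" as new, so the replica count drops to 2 and the annotation becomes "n1,n2".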
func (m *McmManager) scaleDownMachineDeployment(ctx context.Context, mdName string, toBeDeletedMachineNames []string) (bool, error) { md, err := m.GetMachineDeploymentObject(mdName) @@ -500,33 +507,21 @@ func (m *McmManager) scaleDownMachineDeployment(ctx context.Context, mdName stri return true, err } - scaleDownAmount := len(toBeDeletedMachineNames) - expectedReplicas := md.Spec.Replicas - int32(scaleDownAmount) - if expectedReplicas == md.Spec.Replicas { - klog.Infof("MachineDeployment %q is already set to %d, skipping the update", md.Name, expectedReplicas) + data := computeScaledownData(md, toBeDeletedMachineNames) + if data.RevisedScaledownAmount == 0 { + klog.V(3).Infof("Skipping scaledown since MachineDeployment %q has already marked %v for deletion by MCM, skipping the scale-down", md.Name, toBeDeletedMachineNames) return false, nil - } else if expectedReplicas < 0 { - klog.Errorf("Cannot delete machines in MachineDeployment %s, expected decrease in replicas %d is more than current replicas %d", mdName, scaleDownAmount, md.Spec.Replicas) - return false, fmt.Errorf("cannot delete machines in MachineDeployment %s, expected decrease in replicas %d is more than current replicas %d", mdName, scaleDownAmount, md.Spec.Replicas) } - alreadyMarkedMachineNames := getMachineNamesTriggeredForDeletion(md) - toBeMarkedMachineNamesSet := sets.NewString(toBeDeletedMachineNames...).Insert(alreadyMarkedMachineNames...) - triggerDeletionAnnotValue := createMachinesTriggeredForDeletionAnnotValue(toBeMarkedMachineNamesSet.List()) - - mdCopy := md.DeepCopy() - mdCopy.Spec.Replicas = expectedReplicas - if mdCopy.Annotations == nil { - mdCopy.Annotations = make(map[string]string) - } - if mdCopy.Annotations[machineutils.TriggerDeletionByMCM] != triggerDeletionAnnotValue { - mdCopy.Annotations[machineutils.TriggerDeletionByMCM] = triggerDeletionAnnotValue + if data.RevisedMachineDeployment == nil { + klog.V(3).Infof("Skipping scaledown for MachineDeployment %q for toBeDeletedMachineNames: %v", md.Name, toBeDeletedMachineNames) + return false, nil } - _, err = m.machineClient.MachineDeployments(mdCopy.Namespace).Update(ctx, mdCopy, metav1.UpdateOptions{}) + updatedMd, err := m.machineClient.MachineDeployments(data.RevisedMachineDeployment.Namespace).Update(ctx, data.RevisedMachineDeployment, metav1.UpdateOptions{}) if err != nil { return true, err } - klog.V(2).Infof("MachineDeployment %s size decreased to %d, triggerDeletionAnnotValue: %q", mdCopy.Name, mdCopy.Spec.Replicas, triggerDeletionAnnotValue) + klog.V(2).Infof("MachineDeployment %s size decreased from %d to %d, TriggerDeletionByMCM Annotation Value: %q", md.Name, md.Spec.Replicas, updatedMd.Spec.Replicas, updatedMd.Annotations[machineutils.TriggerDeletionByMCM]) return false, nil } @@ -1080,3 +1075,35 @@ func filterExtendedResources(allResources v1.ResourceList) (extendedResources v1 }) return } + +// computeScaledownData computes fresh scaleDownData for the given MachineDeployment given the machineNamesForDeletion +func computeScaledownData(md *v1alpha1.MachineDeployment, machineNamesForDeletion []string) (data scaleDownData) { + forDeletionSet := sets.New(machineNamesForDeletion...) + alreadyMarkedSet := sets.New(getMachineNamesTriggeredForDeletion(md)...) 
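+	// Set algebra below: uniqueForDeletionSet = requested minus alreadyMarked, so only
+	// genuinely new names count towards the scale-down amount; toBeMarkedSet is the union
+	// of both and is what gets written back into the TriggerDeletionByMCM annotation.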
+ + uniqueForDeletionSet := forDeletionSet.Difference(alreadyMarkedSet) + toBeMarkedSet := alreadyMarkedSet.Union(forDeletionSet) + + data.RevisedToBeDeletedNames = uniqueForDeletionSet + data.RevisedScaledownAmount = uniqueForDeletionSet.Len() + data.RevisedMachineDeployment = nil + + expectedReplicas := md.Spec.Replicas - int32(data.RevisedScaledownAmount) + if expectedReplicas == md.Spec.Replicas { + klog.Infof("MachineDeployment %q is already set to %d, no need to scale-down", md.Name, expectedReplicas) + } else if expectedReplicas < 0 { + klog.Errorf("Cannot delete machines in MachineDeployment %q, expected decrease in replicas: %d is more than current replicas: %d", md.Name, data.RevisedScaledownAmount, md.Spec.Replicas) + } else { + mdCopy := md.DeepCopy() + if mdCopy.Annotations == nil { + mdCopy.Annotations = make(map[string]string) + } + triggerDeletionAnnotValue := createMachinesTriggeredForDeletionAnnotValue(toBeMarkedSet.UnsortedList()) + if mdCopy.Annotations[machineutils.TriggerDeletionByMCM] != triggerDeletionAnnotValue { + mdCopy.Annotations[machineutils.TriggerDeletionByMCM] = triggerDeletionAnnotValue + } + mdCopy.Spec.Replicas = expectedReplicas + data.RevisedMachineDeployment = mdCopy + } + return +} diff --git a/cluster-autoscaler/cloudprovider/mcm/mcm_manager_test.go b/cluster-autoscaler/cloudprovider/mcm/mcm_manager_test.go index eff9df3d7ce7..08190f112a2b 100644 --- a/cluster-autoscaler/cloudprovider/mcm/mcm_manager_test.go +++ b/cluster-autoscaler/cloudprovider/mcm/mcm_manager_test.go @@ -19,6 +19,8 @@ package mcm import ( "errors" "github.com/gardener/machine-controller-manager/pkg/apis/machine/v1alpha1" + "github.com/gardener/machine-controller-manager/pkg/util/provider/machineutils" + "k8s.io/apimachinery/pkg/util/sets" "k8s.io/autoscaler/cluster-autoscaler/utils/gpu" "k8s.io/utils/ptr" "maps" @@ -222,6 +224,87 @@ func TestFilterExtendedResources(t *testing.T) { assert.Equal(t, customResources, extendedResources) } +func TestComputeScaledownData(t *testing.T) { + t.Run("simple", func(t *testing.T) { + initialReplicas := int32(2) + md := newMachineDeployments(1, initialReplicas, nil, nil, nil)[0] + md.Annotations = map[string]string{} + + machineNamesForDeletion := []string{"n1"} + data := computeScaledownData(md, machineNamesForDeletion) + assert.Equal(t, createMachinesTriggeredForDeletionAnnotValue(machineNamesForDeletion), data.RevisedMachineDeployment.Annotations[machineutils.TriggerDeletionByMCM]) + assert.Equal(t, len(machineNamesForDeletion), data.RevisedScaledownAmount) + assert.Equal(t, int32(2-len(machineNamesForDeletion)), data.RevisedMachineDeployment.Spec.Replicas) + }) + + t.Run("single-duplicate", func(t *testing.T) { + initialReplicas := 2 + md := newMachineDeployments(1, int32(initialReplicas), nil, nil, nil)[0] + md.Annotations = map[string]string{} + + machineNamesForDeletion := []string{"n1"} + data := computeScaledownData(md, machineNamesForDeletion) + assert.Equal(t, createMachinesTriggeredForDeletionAnnotValue(machineNamesForDeletion), data.RevisedMachineDeployment.Annotations[machineutils.TriggerDeletionByMCM]) + assert.Equal(t, len(machineNamesForDeletion), data.RevisedScaledownAmount) + + expectedReplicas := int32(initialReplicas - len(machineNamesForDeletion)) + assert.Equal(t, expectedReplicas, data.RevisedMachineDeployment.Spec.Replicas) + + md = data.RevisedMachineDeployment + // repeating computeScaledownData for same machineNamesForDeletion should have 0 RevisedScaledownAmount, empty RevisedToBeDeletedNames, and nil 
RevisedMachineDeployment + data = computeScaledownData(md, machineNamesForDeletion) + assert.Equal(t, 0, data.RevisedScaledownAmount) + assert.Empty(t, data.RevisedToBeDeletedNames) + assert.Nil(t, data.RevisedMachineDeployment) + + }) + + t.Run("multi-duplicates", func(t *testing.T) { + initialReplicas := 3 + md := newMachineDeployments(1, int32(initialReplicas), nil, nil, nil)[0] + md.Annotations = map[string]string{} + + machineNamesForDeletion := []string{"n1", "n2"} + data := computeScaledownData(md, machineNamesForDeletion) + assert.Equal(t, createMachinesTriggeredForDeletionAnnotValue(machineNamesForDeletion), data.RevisedMachineDeployment.Annotations[machineutils.TriggerDeletionByMCM]) + assert.Equal(t, len(machineNamesForDeletion), data.RevisedScaledownAmount) + expectedReplicas := int32(initialReplicas - len(machineNamesForDeletion)) + assert.Equal(t, expectedReplicas, data.RevisedMachineDeployment.Spec.Replicas) + + md = data.RevisedMachineDeployment + // repeating computeScaledownData for same machineNamesForDeletion should have 0 RevisedScaledownAmount, empty RevisedToBeDeletedNames, and nil RevisedMachineDeployment + data = computeScaledownData(md, machineNamesForDeletion) + assert.Equal(t, 0, data.RevisedScaledownAmount) + assert.Empty(t, data.RevisedToBeDeletedNames) + assert.Nil(t, data.RevisedMachineDeployment) + + }) + + t.Run("overlapping", func(t *testing.T) { + initialReplicas := 5 + md := newMachineDeployments(1, int32(initialReplicas), nil, nil, nil)[0] + md.Annotations = map[string]string{} + + machineNamesForDeletion := sets.New("n1", "n2") + data := computeScaledownData(md, machineNamesForDeletion.UnsortedList()) + assert.Equal(t, createMachinesTriggeredForDeletionAnnotValue(machineNamesForDeletion.UnsortedList()), data.RevisedMachineDeployment.Annotations[machineutils.TriggerDeletionByMCM]) + assert.Equal(t, len(machineNamesForDeletion), data.RevisedScaledownAmount) + expectedReplicas := int32(initialReplicas - len(machineNamesForDeletion)) + assert.Equal(t, expectedReplicas, data.RevisedMachineDeployment.Spec.Replicas) + + newMachineNamesForDeletion := sets.New("n2", "n3", "n4") + md = data.RevisedMachineDeployment + data = computeScaledownData(md, newMachineNamesForDeletion.UnsortedList()) + assert.NotNil(t, data.RevisedMachineDeployment) + uniqueMachinesNamesForDeletion := newMachineNamesForDeletion.Difference(machineNamesForDeletion) + assert.Equal(t, uniqueMachinesNamesForDeletion.Len(), data.RevisedScaledownAmount) + assert.Equal(t, uniqueMachinesNamesForDeletion, data.RevisedToBeDeletedNames) + expectedReplicas = int32(initialReplicas - machineNamesForDeletion.Union(newMachineNamesForDeletion).Len()) + assert.Equal(t, expectedReplicas, data.RevisedMachineDeployment.Spec.Replicas) + + }) +} + func createSampleInstanceType(instanceTypeName string, customResourceName apiv1.ResourceName, customResourceQuantity resource.Quantity) *instanceType { awsM5Large := AWSInstanceTypes[instanceTypeName] extendedResources := make(apiv1.ResourceList) From f31884dbec2cdc6540e32edcdd696614c869c555 Mon Sep 17 00:00:00 2001 From: elankath Date: Mon, 3 Feb 2025 20:39:17 +0530 Subject: [PATCH 24/27] review comment: preset capacity for slices --- .../cloudprovider/mcm/mcm_cloud_provider.go | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/cluster-autoscaler/cloudprovider/mcm/mcm_cloud_provider.go b/cluster-autoscaler/cloudprovider/mcm/mcm_cloud_provider.go index c9a64b5c41dd..386eb31baaf1 100644 --- 
a/cluster-autoscaler/cloudprovider/mcm/mcm_cloud_provider.go +++ b/cluster-autoscaler/cloudprovider/mcm/mcm_cloud_provider.go @@ -393,7 +393,7 @@ func (ngImpl *nodeGroup) DeleteNodes(nodes []*apiv1.Node) error { if int(size) <= ngImpl.MinSize() { return fmt.Errorf("min size reached, nodes will not be deleted") } - var toBeDeletedMachineInfos []machineInfo + var toBeDeletedMachineInfos = make([]machineInfo, 0, len(nodes)) for _, node := range nodes { belongs, mInfo, err := ngImpl.belongs(node) if err != nil { @@ -412,10 +412,12 @@ func (ngImpl *nodeGroup) DeleteNodes(nodes []*apiv1.Node) error { // deleteMachines annotates the corresponding MachineDeployment with machine names of toBeDeletedMachineInfos, reduces the desired replicas of the corresponding MachineDeployment and cordons corresponding nodes belonging to toBeDeletedMachineInfos func (ngImpl *nodeGroup) deleteMachines(toBeDeletedMachineInfos []machineInfo) error { - if len(toBeDeletedMachineInfos) == 0 { + numDeletionCandidates := len(toBeDeletedMachineInfos) + if numDeletionCandidates == 0 { return nil } - var toBeDeletedMachineNames, toBeDeletedNodeNames []string + toBeDeletedMachineNames := make([]string, 0, numDeletionCandidates) + toBeDeletedNodeNames := make([]string, 0, numDeletionCandidates) for _, mInfo := range toBeDeletedMachineInfos { toBeDeletedMachineNames = append(toBeDeletedMachineNames, mInfo.Key.Name) toBeDeletedNodeNames = append(toBeDeletedNodeNames, mInfo.NodeName) @@ -438,7 +440,7 @@ func (ngImpl *nodeGroup) deleteMachines(toBeDeletedMachineInfos []machineInfo) e return ngImpl.mcmManager.scaleDownMachineDeployment(ctx, ngImpl.Name, toBeDeletedMachineNames) }, "MachineDeployment", "update", ngImpl.Name) if err != nil { - klog.Errorf("Unable to scale down MachineDeployment %s by %d and delete machines %q due to: %v", ngImpl.Name, len(toBeDeletedMachineNames), toBeDeletedMachineNames, err) + klog.Errorf("Unable to scale down MachineDeployment %s by %d and delete machines %q due to: %v", ngImpl.Name, numDeletionCandidates, toBeDeletedMachineNames, err) return fmt.Errorf("for NodeGroup %q, cannot scale down due to: %w", ngImpl.Name, err) } err = ngImpl.mcmManager.cordonNodes(toBeDeletedNodeNames) From 36b59f1083728ad5ca5b403980956ae6edef2bd3 Mon Sep 17 00:00:00 2001 From: elankath Date: Tue, 4 Feb 2025 10:27:42 +0530 Subject: [PATCH 25/27] review feedback: ->computeScaleDownData and revised godoc --- .../cloudprovider/mcm/mcm_manager.go | 7 ++++--- .../cloudprovider/mcm/mcm_manager_test.go | 18 +++++++++--------- 2 files changed, 13 insertions(+), 12 deletions(-) diff --git a/cluster-autoscaler/cloudprovider/mcm/mcm_manager.go b/cluster-autoscaler/cloudprovider/mcm/mcm_manager.go index b45ff92244ca..0865596d894e 100644 --- a/cluster-autoscaler/cloudprovider/mcm/mcm_manager.go +++ b/cluster-autoscaler/cloudprovider/mcm/mcm_manager.go @@ -507,7 +507,7 @@ func (m *McmManager) scaleDownMachineDeployment(ctx context.Context, mdName stri return true, err } - data := computeScaledownData(md, toBeDeletedMachineNames) + data := computeScaleDownData(md, toBeDeletedMachineNames) if data.RevisedScaledownAmount == 0 { klog.V(3).Infof("Skipping scaledown since MachineDeployment %q has already marked %v for deletion by MCM, skipping the scale-down", md.Name, toBeDeletedMachineNames) return false, nil @@ -1076,8 +1076,9 @@ func filterExtendedResources(allResources v1.ResourceList) (extendedResources v1 return } -// computeScaledownData computes fresh scaleDownData for the given MachineDeployment given the machineNamesForDeletion 
-func computeScaledownData(md *v1alpha1.MachineDeployment, machineNamesForDeletion []string) (data scaleDownData) { +// computeScaleDownData computes fresh scaleDownData for the given input MachineDeployment and the machineNamesForDeletion. +// The output scaleDownData encapsulates the scale-down amount and an updated, non-nil MachineDeployment. +func computeScaleDownData(md *v1alpha1.MachineDeployment, machineNamesForDeletion []string) (data scaleDownData) { forDeletionSet := sets.New(machineNamesForDeletion...) alreadyMarkedSet := sets.New(getMachineNamesTriggeredForDeletion(md)...) diff --git a/cluster-autoscaler/cloudprovider/mcm/mcm_manager_test.go b/cluster-autoscaler/cloudprovider/mcm/mcm_manager_test.go index 08190f112a2b..a9c839993d55 100644 --- a/cluster-autoscaler/cloudprovider/mcm/mcm_manager_test.go +++ b/cluster-autoscaler/cloudprovider/mcm/mcm_manager_test.go @@ -231,7 +231,7 @@ func TestComputeScaledownData(t *testing.T) { md.Annotations = map[string]string{} machineNamesForDeletion := []string{"n1"} - data := computeScaledownData(md, machineNamesForDeletion) + data := computeScaleDownData(md, machineNamesForDeletion) assert.Equal(t, createMachinesTriggeredForDeletionAnnotValue(machineNamesForDeletion), data.RevisedMachineDeployment.Annotations[machineutils.TriggerDeletionByMCM]) assert.Equal(t, len(machineNamesForDeletion), data.RevisedScaledownAmount) assert.Equal(t, int32(2-len(machineNamesForDeletion)), data.RevisedMachineDeployment.Spec.Replicas) @@ -243,7 +243,7 @@ func TestComputeScaledownData(t *testing.T) { md.Annotations = map[string]string{} machineNamesForDeletion := []string{"n1"} - data := computeScaledownData(md, machineNamesForDeletion) + data := computeScaleDownData(md, machineNamesForDeletion) assert.Equal(t, createMachinesTriggeredForDeletionAnnotValue(machineNamesForDeletion), data.RevisedMachineDeployment.Annotations[machineutils.TriggerDeletionByMCM]) assert.Equal(t, len(machineNamesForDeletion), data.RevisedScaledownAmount) @@ -251,8 +251,8 @@ func TestComputeScaledownData(t *testing.T) { assert.Equal(t, expectedReplicas, data.RevisedMachineDeployment.Spec.Replicas) md = data.RevisedMachineDeployment - // repeating computeScaledownData for same machineNamesForDeletion should have 0 RevisedScaledownAmount, empty RevisedToBeDeletedNames, and nil RevisedMachineDeployment - data = computeScaledownData(md, machineNamesForDeletion) + // repeating computeScaleDownData for same machineNamesForDeletion should have 0 RevisedScaledownAmount, empty RevisedToBeDeletedNames, and nil RevisedMachineDeployment + data = computeScaleDownData(md, machineNamesForDeletion) assert.Equal(t, 0, data.RevisedScaledownAmount) assert.Empty(t, data.RevisedToBeDeletedNames) assert.Nil(t, data.RevisedMachineDeployment) @@ -265,15 +265,15 @@ func TestComputeScaledownData(t *testing.T) { md.Annotations = map[string]string{} machineNamesForDeletion := []string{"n1", "n2"} - data := computeScaledownData(md, machineNamesForDeletion) + data := computeScaleDownData(md, machineNamesForDeletion) assert.Equal(t, createMachinesTriggeredForDeletionAnnotValue(machineNamesForDeletion), data.RevisedMachineDeployment.Annotations[machineutils.TriggerDeletionByMCM]) assert.Equal(t, len(machineNamesForDeletion), data.RevisedScaledownAmount) expectedReplicas := int32(initialReplicas - len(machineNamesForDeletion)) assert.Equal(t, expectedReplicas, data.RevisedMachineDeployment.Spec.Replicas) md = data.RevisedMachineDeployment - // repeating computeScaledownData for same machineNamesForDeletion should 
have 0 RevisedScaledownAmount, empty RevisedToBeDeletedNames, and nil RevisedMachineDeployment - data = computeScaledownData(md, machineNamesForDeletion) + // repeating computeScaleDownData for same machineNamesForDeletion should have 0 RevisedScaledownAmount, empty RevisedToBeDeletedNames, and nil RevisedMachineDeployment + data = computeScaleDownData(md, machineNamesForDeletion) assert.Equal(t, 0, data.RevisedScaledownAmount) assert.Empty(t, data.RevisedToBeDeletedNames) assert.Nil(t, data.RevisedMachineDeployment) @@ -286,7 +286,7 @@ func TestComputeScaledownData(t *testing.T) { md.Annotations = map[string]string{} machineNamesForDeletion := sets.New("n1", "n2") - data := computeScaledownData(md, machineNamesForDeletion.UnsortedList()) + data := computeScaleDownData(md, machineNamesForDeletion.UnsortedList()) assert.Equal(t, createMachinesTriggeredForDeletionAnnotValue(machineNamesForDeletion.UnsortedList()), data.RevisedMachineDeployment.Annotations[machineutils.TriggerDeletionByMCM]) assert.Equal(t, len(machineNamesForDeletion), data.RevisedScaledownAmount) expectedReplicas := int32(initialReplicas - len(machineNamesForDeletion)) @@ -294,7 +294,7 @@ func TestComputeScaledownData(t *testing.T) { newMachineNamesForDeletion := sets.New("n2", "n3", "n4") md = data.RevisedMachineDeployment - data = computeScaledownData(md, newMachineNamesForDeletion.UnsortedList()) + data = computeScaleDownData(md, newMachineNamesForDeletion.UnsortedList()) assert.NotNil(t, data.RevisedMachineDeployment) uniqueMachinesNamesForDeletion := newMachineNamesForDeletion.Difference(machineNamesForDeletion) assert.Equal(t, uniqueMachinesNamesForDeletion.Len(), data.RevisedScaledownAmount) From 0e21faa36f3ff37ffc6127890d2edc8a0b5c669e Mon Sep 17 00:00:00 2001 From: elankath Date: Wed, 5 Feb 2025 01:04:14 +0530 Subject: [PATCH 26/27] cordon nodes fix for when node is disabled for scaledown --- .../cloudprovider/mcm/mcm_cloud_provider.go | 30 ++++---- .../cloudprovider/mcm/mcm_manager.go | 69 +++++++++++++------ .../cloudprovider/mcm/mcm_manager_test.go | 10 +-- 3 files changed, 66 insertions(+), 43 deletions(-) diff --git a/cluster-autoscaler/cloudprovider/mcm/mcm_cloud_provider.go b/cluster-autoscaler/cloudprovider/mcm/mcm_cloud_provider.go index 386eb31baaf1..e5e8e1b91794 100644 --- a/cluster-autoscaler/cloudprovider/mcm/mcm_cloud_provider.go +++ b/cluster-autoscaler/cloudprovider/mcm/mcm_cloud_provider.go @@ -26,6 +26,7 @@ import ( "fmt" "github.com/gardener/machine-controller-manager/pkg/apis/machine/v1alpha1" "k8s.io/apimachinery/pkg/util/sets" + "k8s.io/autoscaler/cluster-autoscaler/core/scaledown/eligibility" "slices" "strconv" "strings" @@ -206,10 +207,10 @@ func (mcm *mcmCloudProvider) checkMCMAvailableReplicas() error { // Refresh is called before every main loop and can be used to dynamically update cloud provider state. // In particular the list of node groups returned by NodeGroups can change as a result of CloudProvider.Refresh(). 
func (mcm *mcmCloudProvider) Refresh() error { - err := mcm.checkMCMAvailableReplicas() - if err != nil { - return err - } + //err := mcm.checkMCMAvailableReplicas() + //if err != nil { + // return err + //} return mcm.mcmManager.Refresh() } @@ -405,6 +406,10 @@ func (ngImpl *nodeGroup) DeleteNodes(nodes []*apiv1.Node) error { klog.V(3).Infof("for NodeGroup %q, Machine %q is already marked as terminating - skipping deletion", ngImpl.Name, mInfo.Key.Name) continue } + if eligibility.HasNoScaleDownAnnotation(node) { + klog.V(4).Infof("for NodeGroup %q, Node %q corresponding to Machine %q is marked with ScaleDownDisabledAnnotation %q - skipping deletion", ngImpl.Name, node.Name, mInfo.Key.Name, eligibility.ScaleDownDisabledKey) + continue + } toBeDeletedMachineInfos = append(toBeDeletedMachineInfos, *mInfo) } return ngImpl.deleteMachines(toBeDeletedMachineInfos) @@ -416,14 +421,8 @@ func (ngImpl *nodeGroup) deleteMachines(toBeDeletedMachineInfos []machineInfo) e if numDeletionCandidates == 0 { return nil } - toBeDeletedMachineNames := make([]string, 0, numDeletionCandidates) - toBeDeletedNodeNames := make([]string, 0, numDeletionCandidates) - for _, mInfo := range toBeDeletedMachineInfos { - toBeDeletedMachineNames = append(toBeDeletedMachineNames, mInfo.Key.Name) - toBeDeletedNodeNames = append(toBeDeletedNodeNames, mInfo.NodeName) - } - release := ngImpl.AcquireScalingMutex(fmt.Sprintf("deleteMachines for %q", toBeDeletedMachineNames)) + release := ngImpl.AcquireScalingMutex(fmt.Sprintf("deleteMachines for %s", toBeDeletedMachineInfos)) defer release() // get the machine deployment and return if rolling update is not finished @@ -437,17 +436,12 @@ func (ngImpl *nodeGroup) deleteMachines(toBeDeletedMachineInfos []machineInfo) e // Trying to update the machineDeployment till the deadline err = ngImpl.mcmManager.retry(func(ctx context.Context) (bool, error) { - return ngImpl.mcmManager.scaleDownMachineDeployment(ctx, ngImpl.Name, toBeDeletedMachineNames) + return ngImpl.mcmManager.scaleDownMachineDeployment(ctx, ngImpl.Name, toBeDeletedMachineInfos) }, "MachineDeployment", "update", ngImpl.Name) if err != nil { - klog.Errorf("Unable to scale down MachineDeployment %s by %d and delete machines %q due to: %v", ngImpl.Name, numDeletionCandidates, toBeDeletedMachineNames, err) + klog.Errorf("Unable to scale down MachineDeployment %s by %d and delete machines %q due to: %v", ngImpl.Name, numDeletionCandidates, toBeDeletedMachineInfos, err) return fmt.Errorf("for NodeGroup %q, cannot scale down due to: %w", ngImpl.Name, err) } - err = ngImpl.mcmManager.cordonNodes(toBeDeletedNodeNames) - if err != nil { - // Do not return error as cordoning is best-effort - klog.Warningf("NodeGroup.deleteMachines() of %q ran into error cordoning nodes: %v", ngImpl.Name, err) - } return nil } diff --git a/cluster-autoscaler/cloudprovider/mcm/mcm_manager.go b/cluster-autoscaler/cloudprovider/mcm/mcm_manager.go index 0865596d894e..00b021352e12 100644 --- a/cluster-autoscaler/cloudprovider/mcm/mcm_manager.go +++ b/cluster-autoscaler/cloudprovider/mcm/mcm_manager.go @@ -32,6 +32,7 @@ import ( utilerrors "k8s.io/apimachinery/pkg/util/errors" "k8s.io/apimachinery/pkg/util/sets" "k8s.io/autoscaler/cluster-autoscaler/config/dynamic" + "k8s.io/autoscaler/cluster-autoscaler/core/scaledown/eligibility" v1appslister "k8s.io/client-go/listers/apps/v1" "k8s.io/utils/pointer" "maps" @@ -169,10 +170,14 @@ type machineInfo struct { FailedOrTerminating bool } +func (m machineInfo) String() string { + return fmt.Sprintf("(%s|%s)", 
m.Key, m.NodeName)
+}
+
 type scaleDownData struct {
-	RevisedToBeDeletedNames  sets.Set[string]
-	RevisedScaledownAmount   int
-	RevisedMachineDeployment *v1alpha1.MachineDeployment
+	RevisedToBeDeletedMachineNames sets.Set[string]
+	RevisedScaledownAmount         int
+	RevisedMachineDeployment       *v1alpha1.MachineDeployment
 }
 
 func init() {
@@ -322,12 +327,12 @@ func (m *McmManager) generateMachineDeploymentMap() error {
 // addNodeGroup adds node group defined in string spec. Format:
 // minNodes:maxNodes:namespace.machineDeploymentName
 func (m *McmManager) addNodeGroup(spec string) error {
-	nodeGroup, err := buildNodeGroupFromSpec(spec, m)
+	ng, err := buildNodeGroupFromSpec(spec, m)
 	if err != nil {
 		return err
 	}
-	key := types.NamespacedName{Namespace: nodeGroup.Namespace, Name: nodeGroup.Name}
-	m.nodeGroups[key] = nodeGroup
+	key := types.NamespacedName{Namespace: ng.Namespace, Name: ng.Name}
+	m.nodeGroups[key] = ng
 	return nil
 }
 
@@ -425,19 +430,19 @@ func (m *McmManager) getNodeGroup(machineKey types.NamespacedName) (*nodeGroup,
 		return nil, fmt.Errorf("unable to find parent MachineDeployment of given MachineSet name %q due to: %w", machineSetName, err)
 	}
 
-	machineDeployment, ok := m.nodeGroups[types.NamespacedName{Namespace: m.namespace, Name: machineDeploymentName}]
+	lookupKey := types.NamespacedName{Namespace: m.namespace, Name: machineDeploymentName}
+	ng, ok := m.nodeGroups[lookupKey]
 	if !ok {
-		return nil, fmt.Errorf("could not find MachineDeployment %q in the managed nodeGroups", machineDeploymentName)
+		return nil, fmt.Errorf("could not find NodeGroup for MachineDeployment %q in the managed nodeGroups", machineDeploymentName)
 	}
-	return machineDeployment, nil
+	return ng, nil
 }
 
-// Refresh method, for each machine deployment, will reset the priority of the machines if the number of annotated machines is more than desired.
-// It will select the machines to reset the priority based on the descending order of creation timestamp.
+// Refresh invokes NodeGroup.Refresh for each node group managed by the McmManager and returns the collected errors.
 func (m *McmManager) Refresh() error {
 	var collectiveError []error
-	for _, nodeGroup := range m.nodeGroups {
-		collectiveError = append(collectiveError, nodeGroup.Refresh())
+	for _, ng := range m.nodeGroups {
+		collectiveError = append(collectiveError, ng.Refresh())
 	}
 	return errors.Join(collectiveError...)
 }
@@ -501,12 +506,19 @@ func (m *McmManager) updateAnnotationOnMachine(ctx context.Context, mcName strin
 // are already marked for deletion in the machineutils.TriggerDeletionByMCM annotation of the MachineDeployment.
 // It then updates the machineutils.TriggerDeletionByMCM annotation with revised toBeDeletedMachineNames along with the replica count as an atomic operation.
 // NOTE: Callers MUST take the NodeGroup scalingMutex before invoking this method.
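// For reference, a minimal sketch of the set arithmetic that computeScaleDownData
// (defined further below) performs, assuming the annotation value is a comma-separated
// list of machine names (an illustration, not part of the patch):
//
//	alreadyMarked := sets.New(strings.Split(md.Annotations[machineutils.TriggerDeletionByMCM], ",")...)
//	unique := sets.New(toBeDeletedMachineNames...).Difference(alreadyMarked) // only these reduce replicas further
//	revised := alreadyMarked.Union(sets.New(toBeDeletedMachineNames...))     // becomes the new annotation value
//	// Spec.Replicas is decreased by unique.Len(); if unique is empty there is nothing to do.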
-func (m *McmManager) scaleDownMachineDeployment(ctx context.Context, mdName string, toBeDeletedMachineNames []string) (bool, error) {
+func (m *McmManager) scaleDownMachineDeployment(ctx context.Context, mdName string, toBeDeletedMachineInfos []machineInfo) (bool, error) {
 	md, err := m.GetMachineDeploymentObject(mdName)
 	if err != nil {
 		return true, err
 	}
 
+	numDeletionCandidates := len(toBeDeletedMachineInfos)
+	toBeDeletedMachineNames := make([]string, 0, numDeletionCandidates)
+
+	for _, mInfo := range toBeDeletedMachineInfos {
+		toBeDeletedMachineNames = append(toBeDeletedMachineNames, mInfo.Key.Name)
+	}
+
 	data := computeScaleDownData(md, toBeDeletedMachineNames)
 	if data.RevisedScaledownAmount == 0 {
 		klog.V(3).Infof("Skipping scale-down since MachineDeployment %q has already marked %v for deletion by MCM", md.Name, toBeDeletedMachineNames)
@@ -521,7 +533,20 @@ func (m *McmManager) scaleDownMachineDeployment(ctx context.Context, mdName stri
 	if err != nil {
 		return true, err
 	}
-	klog.V(2).Infof("MachineDeployment %s size decreased from %d to %d, TriggerDeletionByMCM Annotation Value: %q", md.Name, md.Spec.Replicas, updatedMd.Spec.Replicas, updatedMd.Annotations[machineutils.TriggerDeletionByMCM])
+	klog.V(2).Infof("MachineDeployment %q size decreased from %d to %d, TriggerDeletionByMCM Annotation Value: %q", md.Name, md.Spec.Replicas, updatedMd.Spec.Replicas, updatedMd.Annotations[machineutils.TriggerDeletionByMCM])
+
+	toBeCordonedNodeNames := make([]string, 0, len(data.RevisedToBeDeletedMachineNames))
+	for _, mInfo := range toBeDeletedMachineInfos {
+		if data.RevisedToBeDeletedMachineNames.Has(mInfo.Key.Name) {
+			toBeCordonedNodeNames = append(toBeCordonedNodeNames, mInfo.NodeName)
+			klog.V(2).Infof("For MachineDeployment %q, will cordon node: %q corresponding to machine %q", md.Name, mInfo.NodeName, mInfo.Key.Name)
+		}
+	}
+	err = m.cordonNodes(toBeCordonedNodeNames)
+	if err != nil {
+		// Do not return error as cordoning is best-effort
+		klog.Warningf("scaleDownMachineDeployment for MachineDeployment %q ran into error cordoning nodes: %v", md.Name, err)
+	}
 	return false, nil
 }
 
@@ -809,8 +834,8 @@ func (m *McmManager) GetMachineDeploymentNodeTemplate(nodeGroupName string) (*no
 func (m *McmManager) GetMachineDeploymentObject(mdName string) (*v1alpha1.MachineDeployment, error) {
 	md, err := m.machineDeploymentLister.MachineDeployments(m.namespace).Get(mdName)
 	if err != nil {
-		klog.Errorf("unable to fetch MachineDeployment object %s, Error: %v", mdName, err)
-		return nil, fmt.Errorf("unable to fetch MachineDeployment object %s, Error: %v", mdName, err)
+		klog.Errorf("unable to fetch MachineDeployment object %q, Error: %v", mdName, err)
+		return nil, fmt.Errorf("unable to fetch MachineDeployment object %q, Error: %v", mdName, err)
 	}
 	return md, nil
 }
@@ -956,6 +981,10 @@ func (m *McmManager) cordonNodes(nodeNames []string) error {
 			klog.V(4).Infof("Node %q is already cordoned", nodeName)
 			continue
 		}
+		if eligibility.HasNoScaleDownAnnotation(node) {
+			klog.V(4).Infof("Node %q is marked with ScaleDownDisabledAnnotation %q", nodeName, eligibility.ScaleDownDisabledKey)
+			continue
+		}
 		adjustNode := node.DeepCopy()
 		adjustNode.Spec.Unschedulable = true
 		_, err = m.nodeInterface.Update(ctx, adjustNode, metav1.UpdateOptions{})
@@ -1041,8 +1070,8 @@ func buildNodeGroupFromSpec(value string, mcmManager *McmManager) (*nodeGroup, e
 	}
 	s := strings.Split(spec.Name, ".")
 	Namespace, Name := s[0], s[1]
-	nodeGroup := buildNodeGroup(mcmManager, spec.MinSize, spec.MaxSize, Namespace, Name)
-	return
nodeGroup, nil + ng := buildNodeGroup(mcmManager, spec.MinSize, spec.MaxSize, Namespace, Name) + return ng, nil } func buildNodeGroup(mcmManager *McmManager, minSize int, maxSize int, namespace string, name string) *nodeGroup { @@ -1085,7 +1114,7 @@ func computeScaleDownData(md *v1alpha1.MachineDeployment, machineNamesForDeletio uniqueForDeletionSet := forDeletionSet.Difference(alreadyMarkedSet) toBeMarkedSet := alreadyMarkedSet.Union(forDeletionSet) - data.RevisedToBeDeletedNames = uniqueForDeletionSet + data.RevisedToBeDeletedMachineNames = uniqueForDeletionSet data.RevisedScaledownAmount = uniqueForDeletionSet.Len() data.RevisedMachineDeployment = nil diff --git a/cluster-autoscaler/cloudprovider/mcm/mcm_manager_test.go b/cluster-autoscaler/cloudprovider/mcm/mcm_manager_test.go index a9c839993d55..3e436443da8d 100644 --- a/cluster-autoscaler/cloudprovider/mcm/mcm_manager_test.go +++ b/cluster-autoscaler/cloudprovider/mcm/mcm_manager_test.go @@ -251,10 +251,10 @@ func TestComputeScaledownData(t *testing.T) { assert.Equal(t, expectedReplicas, data.RevisedMachineDeployment.Spec.Replicas) md = data.RevisedMachineDeployment - // repeating computeScaleDownData for same machineNamesForDeletion should have 0 RevisedScaledownAmount, empty RevisedToBeDeletedNames, and nil RevisedMachineDeployment + // repeating computeScaleDownData for same machineNamesForDeletion should have 0 RevisedScaledownAmount, empty RevisedToBeDeletedMachineNames, and nil RevisedMachineDeployment data = computeScaleDownData(md, machineNamesForDeletion) assert.Equal(t, 0, data.RevisedScaledownAmount) - assert.Empty(t, data.RevisedToBeDeletedNames) + assert.Empty(t, data.RevisedToBeDeletedMachineNames) assert.Nil(t, data.RevisedMachineDeployment) }) @@ -272,10 +272,10 @@ func TestComputeScaledownData(t *testing.T) { assert.Equal(t, expectedReplicas, data.RevisedMachineDeployment.Spec.Replicas) md = data.RevisedMachineDeployment - // repeating computeScaleDownData for same machineNamesForDeletion should have 0 RevisedScaledownAmount, empty RevisedToBeDeletedNames, and nil RevisedMachineDeployment + // repeating computeScaleDownData for same machineNamesForDeletion should have 0 RevisedScaledownAmount, empty RevisedToBeDeletedMachineNames, and nil RevisedMachineDeployment data = computeScaleDownData(md, machineNamesForDeletion) assert.Equal(t, 0, data.RevisedScaledownAmount) - assert.Empty(t, data.RevisedToBeDeletedNames) + assert.Empty(t, data.RevisedToBeDeletedMachineNames) assert.Nil(t, data.RevisedMachineDeployment) }) @@ -298,7 +298,7 @@ func TestComputeScaledownData(t *testing.T) { assert.NotNil(t, data.RevisedMachineDeployment) uniqueMachinesNamesForDeletion := newMachineNamesForDeletion.Difference(machineNamesForDeletion) assert.Equal(t, uniqueMachinesNamesForDeletion.Len(), data.RevisedScaledownAmount) - assert.Equal(t, uniqueMachinesNamesForDeletion, data.RevisedToBeDeletedNames) + assert.Equal(t, uniqueMachinesNamesForDeletion, data.RevisedToBeDeletedMachineNames) expectedReplicas = int32(initialReplicas - machineNamesForDeletion.Union(newMachineNamesForDeletion).Len()) assert.Equal(t, expectedReplicas, data.RevisedMachineDeployment.Spec.Replicas) From 12e747f9fbcdeb7c8b7d8477717c563f731e570f Mon Sep 17 00:00:00 2001 From: elankath Date: Wed, 5 Feb 2025 01:20:09 +0530 Subject: [PATCH 27/27] fixed unit test string quoting issue --- .../cloudprovider/mcm/mcm_cloud_provider.go | 8 ++++---- .../cloudprovider/mcm/mcm_cloud_provider_test.go | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git 
a/cluster-autoscaler/cloudprovider/mcm/mcm_cloud_provider.go b/cluster-autoscaler/cloudprovider/mcm/mcm_cloud_provider.go index e5e8e1b91794..b9b5d6c32607 100644 --- a/cluster-autoscaler/cloudprovider/mcm/mcm_cloud_provider.go +++ b/cluster-autoscaler/cloudprovider/mcm/mcm_cloud_provider.go @@ -207,10 +207,10 @@ func (mcm *mcmCloudProvider) checkMCMAvailableReplicas() error { // Refresh is called before every main loop and can be used to dynamically update cloud provider state. // In particular the list of node groups returned by NodeGroups can change as a result of CloudProvider.Refresh(). func (mcm *mcmCloudProvider) Refresh() error { - //err := mcm.checkMCMAvailableReplicas() - //if err != nil { - // return err - //} + err := mcm.checkMCMAvailableReplicas() + if err != nil { + return err + } return mcm.mcmManager.Refresh() } diff --git a/cluster-autoscaler/cloudprovider/mcm/mcm_cloud_provider_test.go b/cluster-autoscaler/cloudprovider/mcm/mcm_cloud_provider_test.go index 079c65a04eae..e19c20a748fd 100644 --- a/cluster-autoscaler/cloudprovider/mcm/mcm_cloud_provider_test.go +++ b/cluster-autoscaler/cloudprovider/mcm/mcm_cloud_provider_test.go @@ -581,7 +581,7 @@ func TestGetOptions(t *testing.T) { nodeGroups: []string{nodeGroup1}, }, expect{ - err: fmt.Errorf("unable to fetch MachineDeployment object machinedeployment-1, Error: machinedeployment.machine.sapcloud.io \"machinedeployment-1\" not found"), + err: fmt.Errorf("unable to fetch MachineDeployment object \"machinedeployment-1\", Error: machinedeployment.machine.sapcloud.io \"machinedeployment-1\" not found"), }, }, {
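// The revised expectation above follows from fmt verb semantics: %q wraps its
// string operand in double quotes, while %s does not. A standalone illustration
// (not part of the patch):
//
//	fmt.Errorf("unable to fetch MachineDeployment object %s", "machinedeployment-1")
//	// => unable to fetch MachineDeployment object machinedeployment-1
//	fmt.Errorf("unable to fetch MachineDeployment object %q", "machinedeployment-1")
//	// => unable to fetch MachineDeployment object "machinedeployment-1"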