Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

WIP: Implement DRA support in Cluster Autoscaler #7350

Closed
wants to merge 21 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
4992123
DRA: extract interacting with the scheduler framework out of Predicat…
towca Sep 26, 2024
2d55ff2
DRA: introduce internal NodeInfo/PodInfo with DRA objects attached
towca Sep 27, 2024
7c1f8d5
DRA: migrate all of CA to use the new internal NodeInfo/PodInfo
towca Sep 27, 2024
dfd0234
DRA: remove AddNodeWithPods from ClusterSnapshot, replace uses with A…
towca Sep 30, 2024
fafb78a
DRA: add Initialize to ClusterSnapshot, remove AddNodes
towca Sep 30, 2024
c249f46
DRA: remove redundant IsPVCUsedByPods from ClusterSnapshot
towca Sep 30, 2024
f876a51
DRA: remove AddNode from ClusterSnapshot
towca Sep 30, 2024
bb87555
DRA: refactor utils related to NodeInfos
towca Sep 30, 2024
fad6868
DRA: propagate schedulerframework handle and DRA feature flag to Clus…
towca Sep 30, 2024
26e4787
DRA: Implement a Snapshot of DRA objects, its Provider, and utils
towca Sep 26, 2024
9e32e07
DRA: grab a snapshot of DRA objects and plumb to ClusterSnapshot.Init…
towca Sep 30, 2024
006685c
DRA: propagate DRA objects through NodeInfos in node_info utils
towca Sep 30, 2024
c5edd3b
DRA: rename ClusterSnapshot methods to better reflect their purpose
towca Oct 1, 2024
bdef0a7
DRA: extend ClusterSnapshot.SchedulePod, propagate scheduling state f…
towca Oct 1, 2024
0e055c4
DRA: plumb the DRA snapshot into scheduler framework through ClusterS…
towca Oct 1, 2024
0a11e9c
DRA: implement calculating utilization for DRA resources
towca Oct 1, 2024
ef9d420
DRA: integrate BasicClusterSnapshot with the DRA snapshot
towca Oct 1, 2024
38fb034
DRA: add integration tests
towca Sep 26, 2024
7e70b41
DRA: handle expendable pods using DRA
towca Oct 3, 2024
3544bb4
DRA: handle duplicating unschedulable pods using DRA
towca Oct 4, 2024
2e7eeea
DRA TMP: vendor in the required scheduler framework channges
towca Oct 7, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
The table of contents is too big for display.
Diff view
Diff view
  •  
  •  
  •  
The diff you're trying to view is too large. We only load the first 3000 changed files.
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,8 @@ import (
apiv1 "k8s.io/api/core/v1"
"k8s.io/autoscaler/cluster-autoscaler/cloudprovider"
"k8s.io/autoscaler/cluster-autoscaler/config"
"k8s.io/autoscaler/cluster-autoscaler/simulator/framework"
klog "k8s.io/klog/v2"
schedulerframework "k8s.io/kubernetes/pkg/scheduler/framework"
)

// Asg implements NodeGroup interface.
Expand Down Expand Up @@ -179,7 +179,7 @@ func (asg *Asg) Nodes() ([]cloudprovider.Instance, error) {
}

// TemplateNodeInfo returns a node template for this node group.
func (asg *Asg) TemplateNodeInfo() (*schedulerframework.NodeInfo, error) {
func (asg *Asg) TemplateNodeInfo() (*framework.NodeInfo, error) {
template, err := asg.manager.getAsgTemplate(asg.id)
if err != nil {
return nil, err
Expand All @@ -191,8 +191,7 @@ func (asg *Asg) TemplateNodeInfo() (*schedulerframework.NodeInfo, error) {
return nil, err
}

nodeInfo := schedulerframework.NewNodeInfo(cloudprovider.BuildKubeProxy(asg.id))
nodeInfo.SetNode(node)
nodeInfo := framework.NewNodeInfo(node, nil, &framework.PodInfo{Pod: cloudprovider.BuildKubeProxy(asg.id)})
return nodeInfo, nil
}

Expand Down
7 changes: 3 additions & 4 deletions cluster-autoscaler/cloudprovider/aws/aws_cloud_provider.go
Original file line number Diff line number Diff line change
Expand Up @@ -27,10 +27,10 @@ import (
"k8s.io/apimachinery/pkg/api/resource"
"k8s.io/autoscaler/cluster-autoscaler/cloudprovider"
"k8s.io/autoscaler/cluster-autoscaler/config"
"k8s.io/autoscaler/cluster-autoscaler/simulator/framework"
"k8s.io/autoscaler/cluster-autoscaler/utils/errors"
"k8s.io/autoscaler/cluster-autoscaler/utils/gpu"
klog "k8s.io/klog/v2"
schedulerframework "k8s.io/kubernetes/pkg/scheduler/framework"
)

const (
Expand Down Expand Up @@ -392,7 +392,7 @@ func (ng *AwsNodeGroup) Nodes() ([]cloudprovider.Instance, error) {
}

// TemplateNodeInfo returns a node template for this node group.
func (ng *AwsNodeGroup) TemplateNodeInfo() (*schedulerframework.NodeInfo, error) {
func (ng *AwsNodeGroup) TemplateNodeInfo() (*framework.NodeInfo, error) {
template, err := ng.awsManager.getAsgTemplate(ng.asg)
if err != nil {
return nil, err
Expand All @@ -403,8 +403,7 @@ func (ng *AwsNodeGroup) TemplateNodeInfo() (*schedulerframework.NodeInfo, error)
return nil, err
}

nodeInfo := schedulerframework.NewNodeInfo(cloudprovider.BuildKubeProxy(ng.asg.Name))
nodeInfo.SetNode(node)
nodeInfo := framework.NewNodeInfo(node, nil, &framework.PodInfo{Pod: cloudprovider.BuildKubeProxy(ng.asg.Name)})
return nodeInfo, nil
}

Expand Down
4 changes: 2 additions & 2 deletions cluster-autoscaler/cloudprovider/azure/azure_agent_pool.go
Original file line number Diff line number Diff line change
Expand Up @@ -34,8 +34,8 @@ import (
"k8s.io/autoscaler/cluster-autoscaler/cloudprovider"
"k8s.io/autoscaler/cluster-autoscaler/config"
"k8s.io/autoscaler/cluster-autoscaler/config/dynamic"
"k8s.io/autoscaler/cluster-autoscaler/simulator/framework"
klog "k8s.io/klog/v2"
schedulerframework "k8s.io/kubernetes/pkg/scheduler/framework"
)

const (
Expand Down Expand Up @@ -477,7 +477,7 @@ func (as *AgentPool) Debug() string {
}

// TemplateNodeInfo returns a node template for this agent pool.
func (as *AgentPool) TemplateNodeInfo() (*schedulerframework.NodeInfo, error) {
func (as *AgentPool) TemplateNodeInfo() (*framework.NodeInfo, error) {
return nil, cloudprovider.ErrNotImplemented
}

Expand Down
7 changes: 3 additions & 4 deletions cluster-autoscaler/cloudprovider/azure/azure_scale_set.go
Original file line number Diff line number Diff line change
Expand Up @@ -27,8 +27,8 @@ import (
"k8s.io/autoscaler/cluster-autoscaler/cloudprovider"
"k8s.io/autoscaler/cluster-autoscaler/config"
"k8s.io/autoscaler/cluster-autoscaler/config/dynamic"
"k8s.io/autoscaler/cluster-autoscaler/simulator/framework"
klog "k8s.io/klog/v2"
schedulerframework "k8s.io/kubernetes/pkg/scheduler/framework"
"sigs.k8s.io/cloud-provider-azure/pkg/retry"

"github.com/Azure/azure-sdk-for-go/services/compute/mgmt/2022-08-01/compute"
Expand Down Expand Up @@ -627,7 +627,7 @@ func (scaleSet *ScaleSet) Debug() string {
}

// TemplateNodeInfo returns a node template for this scale set.
func (scaleSet *ScaleSet) TemplateNodeInfo() (*schedulerframework.NodeInfo, error) {
func (scaleSet *ScaleSet) TemplateNodeInfo() (*framework.NodeInfo, error) {
template, err := scaleSet.getVMSSFromCache()
if err != nil {
return nil, err
Expand All @@ -641,8 +641,7 @@ func (scaleSet *ScaleSet) TemplateNodeInfo() (*schedulerframework.NodeInfo, erro
return nil, err
}

nodeInfo := schedulerframework.NewNodeInfo(cloudprovider.BuildKubeProxy(scaleSet.Name))
nodeInfo.SetNode(node)
nodeInfo := framework.NewNodeInfo(node, nil, &framework.PodInfo{Pod: cloudprovider.BuildKubeProxy(scaleSet.Name)})
return nodeInfo, nil
}

Expand Down
4 changes: 2 additions & 2 deletions cluster-autoscaler/cloudprovider/azure/azure_vms_pool.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ import (
"k8s.io/autoscaler/cluster-autoscaler/cloudprovider"
"k8s.io/autoscaler/cluster-autoscaler/config"
"k8s.io/autoscaler/cluster-autoscaler/config/dynamic"
schedulerframework "k8s.io/kubernetes/pkg/scheduler/framework"
"k8s.io/autoscaler/cluster-autoscaler/simulator/framework"
)

// VMsPool is single instance VM pool
Expand Down Expand Up @@ -169,7 +169,7 @@ func (agentPool *VMsPool) Nodes() ([]cloudprovider.Instance, error) {
}

// TemplateNodeInfo is not implemented.
func (agentPool *VMsPool) TemplateNodeInfo() (*schedulerframework.NodeInfo, error) {
func (agentPool *VMsPool) TemplateNodeInfo() (*framework.NodeInfo, error) {
return nil, cloudprovider.ErrNotImplemented
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -27,10 +27,10 @@ import (
"k8s.io/autoscaler/cluster-autoscaler/cloudprovider"
"k8s.io/autoscaler/cluster-autoscaler/config"
"k8s.io/autoscaler/cluster-autoscaler/config/dynamic"
"k8s.io/autoscaler/cluster-autoscaler/simulator/framework"
"k8s.io/autoscaler/cluster-autoscaler/utils/errors"
"k8s.io/autoscaler/cluster-autoscaler/utils/gpu"
klog "k8s.io/klog/v2"
schedulerframework "k8s.io/kubernetes/pkg/scheduler/framework"
)

const (
Expand Down Expand Up @@ -365,13 +365,13 @@ func (asg *Asg) Nodes() ([]cloudprovider.Instance, error) {
return instances, nil
}

// TemplateNodeInfo returns a schedulerframework.NodeInfo structure of an empty
// TemplateNodeInfo returns a framework.NodeInfo structure of an empty
// (as if just started) node. This will be used in scale-up simulations to
// predict what would a new node look like if a node group was expanded. The returned
// NodeInfo is expected to have a fully populated Node object, with all of the labels,
// capacity and allocatable information as well as all pods that are started on
// the node by default, using manifest (most likely only kube-proxy). Implementation optional.
func (asg *Asg) TemplateNodeInfo() (*schedulerframework.NodeInfo, error) {
func (asg *Asg) TemplateNodeInfo() (*framework.NodeInfo, error) {
template, err := asg.baiducloudManager.getAsgTemplate(asg.Name)
if err != nil {
return nil, err
Expand All @@ -380,8 +380,7 @@ func (asg *Asg) TemplateNodeInfo() (*schedulerframework.NodeInfo, error) {
if err != nil {
return nil, err
}
nodeInfo := schedulerframework.NewNodeInfo(cloudprovider.BuildKubeProxy(asg.Name))
nodeInfo.SetNode(node)
nodeInfo := framework.NewNodeInfo(node, nil, &framework.PodInfo{Pod: cloudprovider.BuildKubeProxy(asg.Name)})
return nodeInfo, nil
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ import (

"k8s.io/autoscaler/cluster-autoscaler/cloudprovider"
"k8s.io/autoscaler/cluster-autoscaler/config"
schedulerframework "k8s.io/kubernetes/pkg/scheduler/framework"
"k8s.io/autoscaler/cluster-autoscaler/simulator/framework"
)

const (
Expand Down Expand Up @@ -183,14 +183,14 @@ func (n *NodeGroup) Nodes() ([]cloudprovider.Instance, error) {
return toInstances(n.nodePool.Nodes), nil
}

// TemplateNodeInfo returns a schedulerframework.NodeInfo structure of an empty
// TemplateNodeInfo returns a framework.NodeInfo structure of an empty
// (as if just started) node. This will be used in scale-up simulations to
// predict what would a new node look like if a node group was expanded. The
// returned NodeInfo is expected to have a fully populated Node object, with
// all of the labels, capacity and allocatable information as well as all pods
// that are started on the node by default, using manifest (most likely only
// kube-proxy). Implementation optional.
func (n *NodeGroup) TemplateNodeInfo() (*schedulerframework.NodeInfo, error) {
func (n *NodeGroup) TemplateNodeInfo() (*framework.NodeInfo, error) {
return nil, cloudprovider.ErrNotImplemented
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ import (
"k8s.io/autoscaler/cluster-autoscaler/cloudprovider/brightbox/gobrightbox/status"
"k8s.io/autoscaler/cluster-autoscaler/cloudprovider/brightbox/k8ssdk"
"k8s.io/autoscaler/cluster-autoscaler/config"
"k8s.io/autoscaler/cluster-autoscaler/simulator/framework"
klog "k8s.io/klog/v2"
v1helper "k8s.io/kubernetes/pkg/apis/core/v1/helper"
schedulerframework "k8s.io/kubernetes/pkg/scheduler/framework"
Expand Down Expand Up @@ -239,13 +240,13 @@ func (ng *brightboxNodeGroup) Exist() bool {
return err == nil
}

// TemplateNodeInfo returns a schedulerframework.NodeInfo structure of an empty
// TemplateNodeInfo returns a framework.NodeInfo structure of an empty
// (as if just started) node. This will be used in scale-up simulations to
// predict what would a new node look like if a node group was expanded. The returned
// NodeInfo is expected to have a fully populated Node object, with all of the labels,
// capacity and allocatable information as well as all pods that are started on
// the node by default, using manifest (most likely only kube-proxy). Implementation optional.
func (ng *brightboxNodeGroup) TemplateNodeInfo() (*schedulerframework.NodeInfo, error) {
func (ng *brightboxNodeGroup) TemplateNodeInfo() (*framework.NodeInfo, error) {
klog.V(4).Info("TemplateNodeInfo")
klog.V(4).Infof("Looking for server type %q", ng.serverOptions.ServerType)
serverType, err := ng.findServerType()
Expand All @@ -268,8 +269,7 @@ func (ng *brightboxNodeGroup) TemplateNodeInfo() (*schedulerframework.NodeInfo,
Conditions: cloudprovider.BuildReadyConditions(),
},
}
nodeInfo := schedulerframework.NewNodeInfo(cloudprovider.BuildKubeProxy(ng.Id()))
nodeInfo.SetNode(&node)
nodeInfo := framework.NewNodeInfo(&node, nil, &framework.PodInfo{Pod: cloudprovider.BuildKubeProxy(ng.Id())})
return nodeInfo, nil
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ import (

"k8s.io/autoscaler/cluster-autoscaler/cloudprovider"
"k8s.io/autoscaler/cluster-autoscaler/config"
schedulerframework "k8s.io/kubernetes/pkg/scheduler/framework"
"k8s.io/autoscaler/cluster-autoscaler/simulator/framework"
)

const (
Expand All @@ -45,7 +45,7 @@ type cherryManager interface {
getNodes(nodegroup string) ([]string, error)
getNodeNames(nodegroup string) ([]string, error)
deleteNodes(nodegroup string, nodes []NodeRef, updatedNodeCount int) error
templateNodeInfo(nodegroup string) (*schedulerframework.NodeInfo, error)
templateNodeInfo(nodegroup string) (*framework.NodeInfo, error)
NodeGroupForNode(labels map[string]string, nodeId string) (string, error)
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -42,10 +42,10 @@ import (
utilerrors "k8s.io/apimachinery/pkg/util/errors"
"k8s.io/autoscaler/cluster-autoscaler/cloudprovider"
"k8s.io/autoscaler/cluster-autoscaler/config"
"k8s.io/autoscaler/cluster-autoscaler/simulator/framework"
"k8s.io/autoscaler/cluster-autoscaler/utils/gpu"
"k8s.io/autoscaler/cluster-autoscaler/version"
klog "k8s.io/klog/v2"
schedulerframework "k8s.io/kubernetes/pkg/scheduler/framework"
)

const (
Expand Down Expand Up @@ -618,7 +618,7 @@ func BuildGenericLabels(nodegroup string, plan *Plan) map[string]string {

// templateNodeInfo returns a NodeInfo with a node template based on the Cherry Servers plan
// that is used to create nodes in a given node group.
func (mgr *cherryManagerRest) templateNodeInfo(nodegroup string) (*schedulerframework.NodeInfo, error) {
func (mgr *cherryManagerRest) templateNodeInfo(nodegroup string) (*framework.NodeInfo, error) {
node := apiv1.Node{}
nodeName := fmt.Sprintf("%s-asg-%d", nodegroup, rand.Int63())
node.ObjectMeta = metav1.ObjectMeta{
Expand Down Expand Up @@ -664,8 +664,7 @@ func (mgr *cherryManagerRest) templateNodeInfo(nodegroup string) (*schedulerfram
// GenericLabels
node.Labels = cloudprovider.JoinStringMaps(node.Labels, BuildGenericLabels(nodegroup, cherryPlan))

nodeInfo := schedulerframework.NewNodeInfo(cloudprovider.BuildKubeProxy(nodegroup))
nodeInfo.SetNode(&node)
nodeInfo := framework.NewNodeInfo(&node, nil, &framework.PodInfo{Pod: cloudprovider.BuildKubeProxy(nodegroup)})
return nodeInfo, nil
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,8 +25,8 @@ import (
v1 "k8s.io/api/core/v1"
"k8s.io/autoscaler/cluster-autoscaler/cloudprovider"
"k8s.io/autoscaler/cluster-autoscaler/config"
"k8s.io/autoscaler/cluster-autoscaler/simulator/framework"
klog "k8s.io/klog/v2"
schedulerframework "k8s.io/kubernetes/pkg/scheduler/framework"
)

const (
Expand Down Expand Up @@ -269,7 +269,7 @@ func (ng *cherryNodeGroup) Nodes() ([]cloudprovider.Instance, error) {
}

// TemplateNodeInfo returns a node template for this node group.
func (ng *cherryNodeGroup) TemplateNodeInfo() (*schedulerframework.NodeInfo, error) {
func (ng *cherryNodeGroup) TemplateNodeInfo() (*framework.NodeInfo, error) {
return ng.cherryManager.templateNodeInfo(ng.id)
}

Expand Down
8 changes: 3 additions & 5 deletions cluster-autoscaler/cloudprovider/civo/civo_node_group.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,9 +28,9 @@ import (
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/autoscaler/cluster-autoscaler/cloudprovider"
autoscaler "k8s.io/autoscaler/cluster-autoscaler/config"
"k8s.io/autoscaler/cluster-autoscaler/simulator/framework"
"k8s.io/autoscaler/cluster-autoscaler/utils/gpu"
"k8s.io/klog/v2"
schedulerframework "k8s.io/kubernetes/pkg/scheduler/framework"
)

// NodeGroup implements cloudprovider.NodeGroup interface. NodeGroup contains
Expand Down Expand Up @@ -208,15 +208,13 @@ func (n *NodeGroup) Nodes() ([]cloudprovider.Instance, error) {
// all of the labels, capacity and allocatable information as well as all pods
// that are started on the node by default, using manifest (most likely only
// kube-proxy). Implementation optional.
func (n *NodeGroup) TemplateNodeInfo() (*schedulerframework.NodeInfo, error) {
func (n *NodeGroup) TemplateNodeInfo() (*framework.NodeInfo, error) {
node, err := n.buildNodeFromTemplate(n.Id(), n.nodeTemplate)
if err != nil {
return nil, fmt.Errorf("failed to build node from template")
}

nodeInfo := schedulerframework.NewNodeInfo(cloudprovider.BuildKubeProxy(n.Id()))
nodeInfo.SetNode(node)

nodeInfo := framework.NewNodeInfo(node, nil, &framework.PodInfo{Pod: cloudprovider.BuildKubeProxy(n.Id())})
return nodeInfo, nil
}

Expand Down
6 changes: 3 additions & 3 deletions cluster-autoscaler/cloudprovider/cloud_provider.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,8 @@ import (
apiv1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/resource"
"k8s.io/autoscaler/cluster-autoscaler/config"
"k8s.io/autoscaler/cluster-autoscaler/simulator/framework"
"k8s.io/autoscaler/cluster-autoscaler/utils/errors"
schedulerframework "k8s.io/kubernetes/pkg/scheduler/framework"
)

const (
Expand Down Expand Up @@ -214,13 +214,13 @@ type NodeGroup interface {
// This list should include also instances that might have not become a kubernetes node yet.
Nodes() ([]Instance, error)

// TemplateNodeInfo returns a schedulerframework.NodeInfo structure of an empty
// TemplateNodeInfo returns a framework.NodeInfo structure of an empty
// (as if just started) node. This will be used in scale-up simulations to
// predict what would a new node look like if a node group was expanded. The returned
// NodeInfo is expected to have a fully populated Node object, with all of the labels,
// capacity and allocatable information as well as all pods that are started on
// the node by default, using manifest (most likely only kube-proxy). Implementation optional.
TemplateNodeInfo() (*schedulerframework.NodeInfo, error)
TemplateNodeInfo() (*framework.NodeInfo, error)

// Exist checks if the node group really exists on the cloud provider side. Allows to tell the
// theoretical node group from the real one. Implementation required.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,8 +26,8 @@ import (
"k8s.io/autoscaler/cluster-autoscaler/utils/errors"

apiv1 "k8s.io/api/core/v1"
"k8s.io/autoscaler/cluster-autoscaler/simulator/framework"
klog "k8s.io/klog/v2"
schedulerframework "k8s.io/kubernetes/pkg/scheduler/framework"
)

// asg implements NodeGroup interface.
Expand Down Expand Up @@ -168,7 +168,7 @@ func (asg *asg) Delete() error {
}

// TemplateNodeInfo returns a node template for this node group.
func (asg *asg) TemplateNodeInfo() (*schedulerframework.NodeInfo, error) {
func (asg *asg) TemplateNodeInfo() (*framework.NodeInfo, error) {
return nil, cloudprovider.ErrNotImplemented
}

Expand Down
Loading
Loading