Skip to content

Commit

Permalink
logic for getvmss call for spot, uniform vmss
Browse files Browse the repository at this point in the history
  • Loading branch information
gandhipr committed Jan 24, 2024
1 parent e427845 commit 9bd43a0
Show file tree
Hide file tree
Showing 4 changed files with 58 additions and 16 deletions.
4 changes: 2 additions & 2 deletions cluster-autoscaler/cloudprovider/azure/azure_config.go
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ const (
rateLimitWriteBucketsEnvVar = "RATE_LIMIT_WRITE_BUCKETS"

// refresh period in seconds
vmssSizeRefreshPeriodDefault = 30
VmssSizeRefreshPeriodDefault = 30

// auth methods
authMethodPrincipal = "principal"
Expand Down Expand Up @@ -257,7 +257,7 @@ func BuildAzureConfig(configReader io.Reader) (*Config, error) {
return nil, fmt.Errorf("failed to parse AZURE_GET_VMSS_SIZE_REFRESH_PERIOD %q: %v", getVmssSizeRefreshPeriod, err)
}
} else {
cfg.GetVmssSizeRefreshPeriod = vmssSizeRefreshPeriodDefault
cfg.GetVmssSizeRefreshPeriod = VmssSizeRefreshPeriodDefault
}

if enableVmssFlex := os.Getenv("AZURE_ENABLE_VMSS_FLEX"); enableVmssFlex != "" {
Expand Down
13 changes: 7 additions & 6 deletions cluster-autoscaler/cloudprovider/azure/azure_manager_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -685,12 +685,13 @@ func TestGetFilteredAutoscalingGroupsVmss(t *testing.T) {
azureRef: azureRef{
Name: vmssName,
},
minSize: minVal,
maxSize: maxVal,
manager: manager,
curSize: 3,
sizeRefreshPeriod: manager.azureCache.refreshInterval,
instancesRefreshPeriod: defaultVmssInstancesRefreshPeriod,
minSize: minVal,
maxSize: maxVal,
manager: manager,
curSize: 3,
sizeRefreshPeriod: manager.azureCache.refreshInterval,
instancesRefreshPeriod: defaultVmssInstancesRefreshPeriod,
getVmssSizeRefreshPeriod: time.Duration(VmssSizeRefreshPeriodDefault) * time.Second,
}}
assert.True(t, assert.ObjectsAreEqualValues(expectedAsgs, asgs), "expected %#v, but found: %#v", expectedAsgs, asgs)
}
Expand Down
56 changes: 49 additions & 7 deletions cluster-autoscaler/cloudprovider/azure/azure_scale_set.go
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ var (
defaultVmssInstancesRefreshPeriod = 5 * time.Minute
vmssContextTimeout = 3 * time.Minute
vmssSizeMutex sync.Mutex
defaultGetVmssSizeRefreshPeriod = VmssSizeRefreshPeriodDefault * time.Second
)

const (
Expand All @@ -63,8 +64,9 @@ type ScaleSet struct {

enableDynamicInstanceList bool

lastSizeRefresh time.Time
sizeRefreshPeriod time.Duration
lastSizeRefresh time.Time
sizeRefreshPeriod time.Duration
getVmssSizeRefreshPeriod time.Duration

instancesRefreshPeriod time.Duration
instancesRefreshJitter int
Expand Down Expand Up @@ -95,6 +97,12 @@ func NewScaleSet(spec *dynamic.NodeGroupSpec, az *AzureManager, curSize int64) (
scaleSet.instancesRefreshPeriod = defaultVmssInstancesRefreshPeriod
}

if az.config.GetVmssSizeRefreshPeriod != 0 {
scaleSet.getVmssSizeRefreshPeriod = time.Duration(az.config.GetVmssSizeRefreshPeriod) * time.Second
} else {
scaleSet.getVmssSizeRefreshPeriod = time.Duration(VmssSizeRefreshPeriodDefault) * time.Second
}

return scaleSet, nil
}

Expand Down Expand Up @@ -154,17 +162,44 @@ func (scaleSet *ScaleSet) getCurSize() (int64, error) {
scaleSet.sizeMutex.Lock()
defer scaleSet.sizeMutex.Unlock()

if scaleSet.lastSizeRefresh.Add(scaleSet.sizeRefreshPeriod).After(time.Now()) {
klog.V(3).Infof("VMSS: %s, returning in-memory size: %d", scaleSet.Name, scaleSet.curSize)
return scaleSet.curSize, nil
}

set, err := scaleSet.getVMSSFromCache()
if err != nil {
klog.Errorf("failed to get information for VMSS: %s, error: %v", scaleSet.Name, err)
return -1, err
}

effectiveSizeRefreshPeriod := scaleSet.sizeRefreshPeriod

// If the scale set is Spot, we want a fresher view of the Sku.Capacity field.
// Evictions can happen at any point in time, even before VMs are
// materialized as nodes. We should be able to react to them and have the
// autoscaler readjust the goal again to force restoration.
// This applies only when orchestrationMode == Uniform, because Flex mode can
// contain a combination of Spot and regular VMs.
if isSpotAndUniform(&set) {
effectiveSizeRefreshPeriod = scaleSet.getVmssSizeRefreshPeriod
}

if scaleSet.lastSizeRefresh.Add(effectiveSizeRefreshPeriod).After(time.Now()) {
klog.V(3).Infof("VMSS: %s, returning in-memory size: %d", scaleSet.Name, scaleSet.curSize)
return scaleSet.curSize, nil
}

// For a Spot scale set in Uniform orchestration mode, replace the cached VMSS
// object with the result of a direct GET VMSS call so that the capacity read
// below comes from a fresh response rather than from the cache.
if isSpotAndUniform(&set) {
	ctx, cancel := getContextWithCancel()
	defer cancel()

	var rerr *retry.Error
	set, rerr = scaleSet.manager.azClient.virtualMachineScaleSetsClient.Get(ctx, scaleSet.manager.config.ResourceGroup,
		scaleSet.Name)
	if rerr != nil {
		klog.Errorf("failed to get information for VMSS: %s, error: %v", scaleSet.Name, rerr)
		// err is nil at this point (the cache lookup above succeeded), so
		// returning it would report success together with a size of -1.
		// Surface the failure of the GET call instead.
		return -1, rerr.Error()
	}
}

vmssSizeMutex.Lock()
curSize := *set.Sku.Capacity
vmssSizeMutex.Unlock()
Expand All @@ -181,6 +216,13 @@ func (scaleSet *ScaleSet) getCurSize() (int64, error) {
return scaleSet.curSize, nil
}

// isSpotAndUniform reports whether vmss has a virtual machine profile with
// Spot priority and uses the Uniform orchestration mode. It returns false when
// vmss, its properties, or its virtual machine profile is nil.
func isSpotAndUniform(vmss *compute.VirtualMachineScaleSet) bool {
	if vmss == nil {
		return false
	}

	props := vmss.VirtualMachineScaleSetProperties
	if props == nil || props.VirtualMachineProfile == nil {
		return false
	}

	if props.VirtualMachineProfile.Priority != compute.Spot {
		return false
	}
	return vmss.OrchestrationMode == compute.Uniform
}

// GetScaleSetSize gets Scale Set size.
func (scaleSet *ScaleSet) GetScaleSetSize() (int64, error) {
return scaleSet.getCurSize()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -138,7 +138,6 @@ func TestTargetSize(t *testing.T) {
provider.azureManager.azClient.virtualMachineScaleSetsClient = mockVMSSClient

if orchMode == compute.Uniform {

mockVMSSVMClient := mockvmssvmclient.NewMockInterface(ctrl)
mockVMSSVMClient.EXPECT().List(gomock.Any(), provider.azureManager.config.ResourceGroup, "test-asg", gomock.Any()).Return(expectedVMSSVMs, nil).AnyTimes()
provider.azureManager.azClient.virtualMachineScaleSetVMsClient = mockVMSSVMClient
Expand Down

0 comments on commit 9bd43a0

Please sign in to comment.