Skip to content

Commit

Permalink
Break out Scale into UpScaler/DownScaler actions
Browse files Browse the repository at this point in the history
* also move logic into instance_reporters
  • Loading branch information
sethboyles committed Feb 12, 2025
1 parent c778add commit 8de2953
Show file tree
Hide file tree
Showing 12 changed files with 539 additions and 133 deletions.
2 changes: 1 addition & 1 deletion lib/cloud_controller/backends/instances_reporters.rb
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ def stats_for_app(app)
raise CloudController::Errors::ApiError.new_from_details('StatsUnavailable', 'Stats server temporarily unavailable.')
end

delegate :number_of_starting_and_running_instances_for_processes, to: :diego_reporter
delegate :number_of_starting_and_running_instances_for_processes, :instance_count_summary, to: :diego_reporter

private

Expand Down
48 changes: 48 additions & 0 deletions lib/cloud_controller/deployment_updater/actions/down_scaler.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
require 'cloud_controller/deployment_updater/actions/scale_down_old_process'
module VCAP::CloudController
  module DeploymentUpdater
    module Actions
      # Reduces the instance counts of an app's web processes other than the
      # deployment's deploying web process, oldest first, until their combined
      # total no longer exceeds what is still required to reach the target.
      class DownScaler
        attr_reader :deployment, :logger, :app, :target_total_instance_count

        # deployment                  - must respond to #app and #deploying_web_process.
        # logger                      - stored and exposed through #logger; this class does not call it.
        # target_total_instance_count - instance total that the routable count plus the
        #                               remaining non-deploying instances should add up to.
        # routable_instance_count     - routable instance count supplied by the caller.
        def initialize(deployment, logger, target_total_instance_count, routable_instance_count)
          @deployment = deployment
          @logger = logger
          @app = deployment.app
          @routable_instance_count = routable_instance_count
          @target_total_instance_count = target_total_instance_count
        end

        # Removes surplus instances from the non-deploying web processes.
        # Processes are visited in (created_at, id) order: each one whose
        # instances are entirely surplus is scaled to 0 via ScaleDownOldProcess;
        # the first one holding more instances than the remaining surplus is
        # scaled to the difference and iteration stops there.
        def scale_down
          surplus = non_deploying_web_processes.sum(&:instances) - desired_non_deploying_instances
          return unless surplus > 0

          non_deploying_web_processes.each do |old_process|
            if old_process.instances > surplus
              # Only part of this process is surplus; trim it and stop.
              ScaleDownOldProcess.new(deployment, old_process, old_process.instances - surplus).call
              break
            end

            surplus -= old_process.instances
            ScaleDownOldProcess.new(deployment, old_process, 0).call
          end
        end

        # True when the non-deploying web processes hold more instances in
        # total than desired_non_deploying_instances.
        def can_downscale?
          current_total = non_deploying_web_processes.sum(&:instances)
          current_total > desired_non_deploying_instances
        end

        # Target total minus the routable instance count, never below zero.
        def desired_non_deploying_instances
          still_required = target_total_instance_count - @routable_instance_count
          [still_required, 0].max
        end

        private

        # The app's web processes except the deploying one, ordered by
        # created_at and then id.
        def non_deploying_web_processes
          candidates = app.web_processes.reject { |web_process| web_process.guid == deployment.deploying_web_process.guid }
          candidates.sort_by { |web_process| [web_process.created_at, web_process.id] }
        end
      end
    end
  end
end
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ def call

update_non_web_processes
restart_non_web_processes

deployment.update(
state: DeploymentModel::DEPLOYED_STATE,
status_value: DeploymentModel::FINALIZED_STATUS_VALUE,
Expand Down
88 changes: 15 additions & 73 deletions lib/cloud_controller/deployment_updater/actions/scale.rb
Original file line number Diff line number Diff line change
@@ -1,16 +1,16 @@
require 'cloud_controller/deployment_updater/actions/scale_down_canceled_processes'
require 'cloud_controller/deployment_updater/actions/scale_down_old_process'
require 'cloud_controller/deployment_updater/actions/finalize'
require 'cloud_controller/deployment_updater/actions/down_scaler'
require 'cloud_controller/deployment_updater/actions/up_scaler'
require 'cloud_controller/diego/constants'

module VCAP::CloudController
module DeploymentUpdater
module Actions
class Scale
HEALTHY_STATES = [VCAP::CloudController::Diego::LRP_RUNNING, VCAP::CloudController::Diego::LRP_STARTING].freeze
attr_reader :deployment, :logger, :app, :target_total_instance_count, :interim_desired_instance_count

def initialize(deployment, logger, target_total_instance_count, interim_desired_instance_count = nil)
def initialize(deployment, logger, target_total_instance_count, interim_desired_instance_count=nil)
@deployment = deployment
@logger = logger
@app = deployment.app
Expand All @@ -19,110 +19,52 @@ def initialize(deployment, logger, target_total_instance_count, interim_desired_
end

def call
down_scaler = DownScaler.new(deployment, logger, target_total_instance_count, instance_count_summary.routable_instances_count)
up_scaler = UpScaler.new(deployment, logger, interim_desired_instance_count, instance_count_summary)

deployment.db.transaction do
return unless deployment.lock!.state == DeploymentModel::DEPLOYING_STATE

return unless can_scale? || can_downscale?
return unless up_scaler.can_scale? || down_scaler.can_downscale?

app.lock!

oldest_web_process_with_instances.lock!
deploying_web_process.lock!

deployment.update(
state: DeploymentModel::DEPLOYING_STATE,
status_value: DeploymentModel::ACTIVE_STATUS_VALUE,
status_reason: DeploymentModel::DEPLOYING_STATUS_REASON
)

ScaleDownCanceledProcesses.new(deployment).call

scale_down_old_processes if can_downscale?

return true if deploying_web_process.instances >= interim_desired_instance_count

if can_scale?
deploying_web_process.update(instances: desired_new_instances)
deployment.update(last_healthy_at: Time.now)
end
end
false
end

private

def scale_down_old_processes
instances_to_reduce = non_deploying_web_processes.map(&:instances).sum - desired_non_deploying_instances
ScaleDownCanceledProcesses.new(deployment).call

return if instances_to_reduce <= 0
down_scaler.scale_down if down_scaler.can_downscale?

non_deploying_web_processes.each do |process|
if instances_to_reduce < process.instances
ScaleDownOldProcess.new(deployment, process, process.instances - instances_to_reduce).call
break
end
return true if up_scaler.finished_scaling?

instances_to_reduce -= process.instances
ScaleDownOldProcess.new(deployment, process, 0).call
up_scaler.scale_up if up_scaler.can_scale?
end
end

# True when fewer instances are starting than max_in_flight, none are
# unhealthy, and the routable count is at least
# deploying_web_process.instances - max_in_flight.
# Logs and returns false if an ApiError is raised while evaluating this.
def can_scale?
  return false unless starting_instances.count < deployment.max_in_flight
  return false unless unhealthy_instances.count == 0

  minimum_routable = deploying_web_process.instances - deployment.max_in_flight
  routable_instances.count >= minimum_routable
rescue CloudController::Errors::ApiError # the instances_reporter re-raises InstancesUnavailable as ApiError
  logger.info("skipping-deployment-update-for-#{deployment.guid}")
  false
end

# True when the non-deploying web processes hold more instances in total
# than desired_non_deploying_instances.
# Logs and returns false if an ApiError is raised while evaluating this.
def can_downscale?
  current_total = non_deploying_web_processes.sum(&:instances)
  current_total > desired_non_deploying_instances
rescue CloudController::Errors::ApiError # the instances_reporter re-raises InstancesUnavailable as ApiError
  logger.info("skipping-deployment-update-for-#{deployment.guid}")
  false
end

# Target total minus the number of routable instances, never below zero.
def desired_non_deploying_instances
  still_required = target_total_instance_count - routable_instances.count
  still_required.negative? ? 0 : still_required
end

# Routable instance count plus max_in_flight, capped at
# interim_desired_instance_count.
def desired_new_instances
  next_step = routable_instances.count + deployment.max_in_flight
  [next_step, interim_desired_instance_count].min
end
private

# Memoized: among the app's web processes with more than zero instances,
# the one that sorts first by (created_at, id).
def oldest_web_process_with_instances
  # TODO: lock all web processes? We might alter all of them, depending on max-in-flight size
  @oldest_web_process_with_instances ||= begin
    populated = app.web_processes.select { |web_process| web_process.instances > 0 }
    populated.min_by { |web_process| [web_process.created_at, web_process.id] }
  end
end

def non_deploying_web_processes
app.web_processes.reject { |process| process.guid == deploying_web_process.guid }.sort_by { |p| [p.created_at, p.id] }
def instance_count_summary
@instance_count_summary ||= instance_reporters.instance_count_summary(deploying_web_process)
end

# Memoized lookup of deployment.deploying_web_process.
def deploying_web_process
  return @deploying_web_process if @deploying_web_process

  @deploying_web_process = deployment.deploying_web_process
end

# Healthy instances that are not both in the LRP_RUNNING state and routable.
def starting_instances
  healthy_instances.select do |_, info|
    info[:state] != VCAP::CloudController::Diego::LRP_RUNNING || !info[:routable]
  end
end

# Reported instances that are in the LRP_RUNNING state and flagged routable.
def routable_instances
  reported_instances.select do |_, info|
    running = info[:state] == VCAP::CloudController::Diego::LRP_RUNNING
    running && info[:routable]
  end
end

# Reported instances whose state is listed in HEALTHY_STATES.
def healthy_instances
  reported_instances.reject { |_, info| !HEALTHY_STATES.include?(info[:state]) }
end

# Reported instances whose state is not listed in HEALTHY_STATES.
def unhealthy_instances
  reported_instances.select { |_, info| !HEALTHY_STATES.include?(info[:state]) }
end

# Instance data for the deploying web process, as returned by
# instance_reporters.all_instances_for_app.
#
# The result is memoized so that every helper built on top of this method
# works from the same snapshot instead of issuing a fresh reporter call
# each time. Errors raised by the reporter propagate to the caller and
# leave nothing cached.
def reported_instances
  @reported_instances ||= instance_reporters.all_instances_for_app(deploying_web_process)
end

# The instances reporters object obtained from the DependencyLocator singleton.
def instance_reporters
  locator = CloudController::DependencyLocator.instance
  locator.instances_reporters
end
Expand Down
48 changes: 48 additions & 0 deletions lib/cloud_controller/deployment_updater/actions/up_scaler.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
module VCAP::CloudController
  module DeploymentUpdater
    module Actions
      # Raises the instance count of a deployment's deploying web process in
      # steps bounded by the deployment's max_in_flight, using instance counts
      # captured from the summary passed to the constructor.
      class UpScaler
        attr_reader :deployment, :logger, :app, :interim_desired_instance_count

        # deployment                     - must respond to #app, #max_in_flight, #deploying_web_process and #update.
        # logger                         - stored and exposed through #logger; this class does not call it.
        # interim_desired_instance_count - upper bound for the deploying web process's instance count.
        # instance_count_summary         - must respond to #starting_instances_count,
        #                                  #unhealthy_instances_count and #routable_instances_count;
        #                                  the three values are read once, here.
        def initialize(deployment, logger, interim_desired_instance_count, instance_count_summary)
          @deployment = deployment
          @logger = logger
          @app = deployment.app
          @interim_desired_instance_count = interim_desired_instance_count
          @routable_instances_count = instance_count_summary.routable_instances_count
          @starting_instances_count = instance_count_summary.starting_instances_count
          @unhealthy_instances_count = instance_count_summary.unhealthy_instances_count
        end

        # When can_scale? holds, sets the deploying web process's instances to
        # desired_new_instances and stamps the deployment's last_healthy_at
        # with the current time. Does nothing otherwise.
        def scale_up
          if can_scale?
            deploying_web_process.update(instances: desired_new_instances)
            deployment.update(last_healthy_at: Time.now)
          end
        end

        # True when fewer instances are starting than max_in_flight, none are
        # unhealthy, and enough instances are routable.
        def can_scale?
          return false unless @starting_instances_count < deployment.max_in_flight
          return false unless @unhealthy_instances_count == 0

          # Fewer routable instances than deploying_web_process.instances minus
          # max_in_flight indicates that Diego isn't in sync with CAPI yet.
          minimum_routable = deploying_web_process.instances - deployment.max_in_flight
          @routable_instances_count >= minimum_routable
        end

        # True once the deploying web process has at least
        # interim_desired_instance_count instances.
        def finished_scaling?
          current_instances = deploying_web_process.instances
          current_instances >= interim_desired_instance_count
        end

        private

        # Routable count plus max_in_flight, capped at interim_desired_instance_count.
        def desired_new_instances
          next_step = @routable_instances_count + deployment.max_in_flight
          [next_step, interim_desired_instance_count].min
        end

        # Memoized lookup of deployment.deploying_web_process.
        def deploying_web_process
          return @deploying_web_process if @deploying_web_process

          @deploying_web_process = deployment.deploying_web_process
        end
      end
    end
  end
end
14 changes: 13 additions & 1 deletion lib/cloud_controller/diego/reporters/instances_reporter.rb
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,8 @@ module VCAP::CloudController
module Diego
class InstancesReporter
include ReporterMixins

InstanceCountSummary = Struct.new(:starting_instances_count, :routable_instances_count, :healthy_instances_count, :unhealthy_instances_count)
HEALTHY_STATES = [VCAP::CloudController::Diego::LRP_RUNNING, VCAP::CloudController::Diego::LRP_STARTING].freeze
UNKNOWN_INSTANCE_COUNT = -1

def initialize(bbs_instances_client)
Expand Down Expand Up @@ -105,6 +106,17 @@ def crashed_instances_for_app(process)
raise CloudController::Errors::InstancesUnavailable.new(e)
end

# Builds an InstanceCountSummary for +process+ from the data returned by
# all_instances_for_app(process):
#   healthy   - state is listed in HEALTHY_STATES
#   unhealthy - state is not listed in HEALTHY_STATES
#   routable  - state is LRP_RUNNING and the :routable flag is truthy
#   starting  - healthy but not routable
def instance_count_summary(process)
  reported = all_instances_for_app(process)

  healthy = ->(info) { HEALTHY_STATES.include?(info[:state]) }
  routable = ->(info) { info[:state] == VCAP::CloudController::Diego::LRP_RUNNING && info[:routable] }

  starting_count = reported.count { |_, info| healthy.call(info) && !routable.call(info) }
  routable_count = reported.count { |_, info| routable.call(info) }
  healthy_count = reported.count { |_, info| healthy.call(info) }
  unhealthy_count = reported.count { |_, info| !healthy.call(info) }

  InstanceCountSummary.new(starting_count, routable_count, healthy_count, unhealthy_count)
end

private

attr_reader :bbs_instances_client
Expand Down
1 change: 1 addition & 0 deletions lib/cloud_controller/metrics/prometheus_updater.rb
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
require 'prometheus/client'
require 'prometheus/client/data_stores/direct_file_store'

module VCAP::CloudController::Metrics
class PrometheusUpdater
Expand Down
Loading

0 comments on commit 8de2953

Please sign in to comment.