Skip to content

Commit

Permalink
Complete refactor
Browse files Browse the repository at this point in the history
This is a complete refactor while maintaining the current behaviour.

We move all the logic into a number of classes:

`Deployment::Deployment`

This is responsible for checking the status of a state machine
execution. If it has failed for some reason then we find the fail event
and then pass the event onto the correct class for further parsing.
If no class exists then we return a generic error message.

`Deployment::Events::FailStateEntered`

This handles the case where we hit the Fail state.
It then extracts the error message and attempts to determine whether it was
Sidekiq that failed, returning the appropriate message in either case.

`Deployment::Events::LambdaFunctionFailed`

This handles the case where a lambda error causes the state machine
execution to terminate but doesn't actually reach the fail state.
Currently the only time this happens is during the Pre Flight Checks.

This happens by accident: according to the state machine definition we are
supposed to go to the Fail state, but we are catching the error
incorrectly.
  • Loading branch information
cornet committed Mar 1, 2024
1 parent 105127e commit 02ce214
Show file tree
Hide file tree
Showing 13 changed files with 656 additions and 50 deletions.
2 changes: 2 additions & 0 deletions .rubocop.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
Metrics/BlockLength:
Enabled: false
8 changes: 7 additions & 1 deletion Gemfile
Original file line number Diff line number Diff line change
@@ -1,3 +1,9 @@
source 'https://rubygems.org' do
gem 'aws-sdk-states'
end

group :test do
gem 'nokogiri'
gem 'rspec'
gem 'rspec-mocks'
end
end
21 changes: 21 additions & 0 deletions Gemfile.lock
Original file line number Diff line number Diff line change
Expand Up @@ -16,13 +16,34 @@ GEM
aws-sigv4 (~> 1.1)
aws-sigv4 (1.5.2)
aws-eventstream (~> 1, >= 1.0.2)
diff-lcs (1.5.1)
jmespath (1.6.2)
nokogiri (1.16.2-arm64-darwin)
racc (~> 1.4)
racc (1.7.3)
rspec (3.13.0)
rspec-core (~> 3.13.0)
rspec-expectations (~> 3.13.0)
rspec-mocks (~> 3.13.0)
rspec-core (3.13.0)
rspec-support (~> 3.13.0)
rspec-expectations (3.13.0)
diff-lcs (>= 1.2.0, < 2.0)
rspec-support (~> 3.13.0)
rspec-mocks (3.13.0)
diff-lcs (>= 1.2.0, < 2.0)
rspec-support (~> 3.13.0)
rspec-support (3.13.1)

PLATFORMS
arm64-darwin-21
arm64-darwin-23

DEPENDENCIES
aws-sdk-states!
nokogiri!
rspec!
rspec-mocks!

BUNDLED WITH
2.3.7
65 changes: 16 additions & 49 deletions app.rb
Original file line number Diff line number Diff line change
@@ -1,6 +1,12 @@
# frozen_string_literal: true

# Resolve lib/ relative to this file rather than the current working
# directory, so `require 'deployment'` also works when the script is started
# from outside the repository root.
$LOAD_PATH.unshift File.expand_path('lib', __dir__)

require 'aws-sdk-states'
require 'json'

require 'deployment'

# Write output immediately instead of holding it in a buffer.
STDOUT.sync = true

# ARN of the state machine execution to monitor, supplied via the environment.
execution_arn = ENV['EXECUTION_ARN']
Expand All @@ -12,60 +18,21 @@

aws_client = Aws::States::Client.new(region: 'eu-west-1')

# Asks the client to describe the given execution and returns the status
# field of the response.
def deployment_status(aws_client, execution_arn)
  execution = aws_client.describe_execution({ execution_arn: execution_arn })
  execution.status
end

# Builds a human-readable explanation of why an execution failed.
#
# Requests the two most recent history events (newest first) and inspects the
# second of them. Unknown event types, or a history with fewer than two
# events, produce a generic message pointing at the AWS console.
#
# @param aws_client [#get_execution_history] client used to read the history
# @param execution_arn [String] ARN of the execution to inspect
# @return [String] description of the failure
def failure_reason(aws_client, execution_arn)
  console_url = "https://eu-west-1.console.aws.amazon.com/states/home?region=eu-west-1#/executions/details/#{execution_arn}"

  resp = aws_client.get_execution_history({
    execution_arn: execution_arn,
    max_results: 2,
    reverse_order: true
  })

  # May be nil when the history holds fewer than two events.
  event = resp.events[1]

  case event&.type
  when 'LambdaFunctionFailed'
    error_message = JSON.parse(event.lambda_function_failed_event_details.cause)['errorMessage']
    if error_message.include?('Pre flight checks failed')
      # The per-check results sit on the second line of the message; tolerate
      # a missing line or a missing ForwardDeployCheck entry.
      forward_deploy_check_result = error_message.lines[1].to_s[/ForwardDeployCheck=>"(.*?)"/i, 1]
      if forward_deploy_check_result == 'FAILED'
        return 'Forward deploy check FAILED. No need to panic! '\
               'This likely means your commit has already been deployed as part of a previous deploy. '\
               'To confirm you can check whether your SHA is a parent commit to the currently deployed SHA. '\
               'You can figure out the currently deployed SHA by following this guide https://www.notion.so/freeagent/Deployment-Runbooks-29796221387e40b7abbb217d7d33c4ac?pvs=4#3bfa2ab5d3ab4c33b7a46522027f94bb'
      end
    end
    error_message
  when 'FailStateEntered'
    # The state input is JSON whose Error.Cause field is itself a JSON document.
    error_message = JSON.parse(JSON.parse(event.state_entered_event_details.input)['Error']['Cause'])['errorMessage']
    if error_message.include?('ECS') && error_message.include?('IN_PROGRESS')
      'Sidekiq workers failed to start or failed to stabilise.'
    else
      "Failure message: #{error_message}. Please investigate further here if required #{console_url}"
    end
  else
    "Uncaught failure. Please investigate here #{console_url}"
  end
end
deploy = Deployment::Deployment.new(aws_client, execution_arn)

# One of: "RUNNING", "SUCCEEDED", "FAILED", "TIMED_OUT", "ABORTED"
if deployment_status(aws_client, execution_arn) == 'RUNNING'
if deploy.running?
puts 'Deployment in progress...🔄'
puts "Monitor at https://eu-west-1.console.aws.amazon.com/states/home?region=eu-west-1#/executions/details/#{execution_arn}"
sleep 15 until deployment_status(aws_client, execution_arn) != 'RUNNING'
end

deploy_status = deployment_status(aws_client, execution_arn)
if %w[FAILED TIMED_OUT ABORTED].include?(deploy_status)
puts "Deployment Failure Status: #{deploy_status} ❌"
sleep 10 while deploy.running?

if deploy.succeeded?
puts 'Deployment Successful 🎉'
else
puts "Deployment Failure Status: #{deploy.status} ❌"
File.open(ENV['GITHUB_OUTPUT'], 'a') do |f|
f.puts "deployment_failed=true"
f.puts "deployment_failure_reason=#{failure_reason(aws_client, execution_arn)}"
f.puts 'deployment_failed=true'
f.puts "deployment_failure_reason=#{deploy.failure_reason}"
end
elsif deploy_status == 'SUCCEEDED'
puts 'Deployment Successful 🎉'
end
3 changes: 3 additions & 0 deletions lib/deployment.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
require 'deployment/deployment'
require 'deployment/events/fail_state_entered'
require 'deployment/events/lambda_function_failed'
61 changes: 61 additions & 0 deletions lib/deployment/deployment.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
module Deployment
  # Wraps a single state machine execution: reports its status and, once it
  # has stopped without succeeding, produces a description of the failure.
  class Deployment
    # @param aws_client [#describe_execution, #get_execution_history] client
    #   used to query the execution
    # @param execution_arn [String] ARN of the execution to inspect
    def initialize(aws_client, execution_arn)
      @aws_client = aws_client
      @execution_arn = execution_arn
    end

    # Current status of the execution. Every call issues a fresh
    # describe_execution request, so it can be used for polling.
    #
    # @return [String] status reported by the API
    def status
      @aws_client.describe_execution({ execution_arn: @execution_arn }).status
    end

    def running?
      status == 'RUNNING'
    end

    def succeeded?
      status == 'SUCCEEDED'
    end

    def aborted?
      status == 'ABORTED'
    end

    def timed_out?
      status == 'TIMED_OUT'
    end

    def failed?
      status == 'FAILED'
    end

    # Describes why the execution did not succeed.
    #
    # Delegates to the matching Events class for known failure event types and
    # falls back to a generic message for anything else, including a history
    # that contains no events.
    #
    # @return [String] description of the failure
    # @raise [RuntimeError] if the execution is still running or has succeeded
    def failure_reason
      # Look the status up once instead of issuing one API request per check.
      current_status = status
      raise 'Deploy still runnning' if current_status == 'RUNNING'
      raise 'Deploy succeeded' if current_status == 'SUCCEEDED'

      # Fetch the history once and reuse the event for both the type check and
      # the handler, rather than requesting it a second time.
      event = fail_event

      case event&.type
      when 'LambdaFunctionFailed'
        Events::LambdaFunctionFailed.new(event).error
      when 'FailStateEntered'
        Events::FailStateEntered.new(event, @execution_arn).error
      else
        "Uncaught failure. Please investigate here https://eu-west-1.console.aws.amazon.com/states/home?region=eu-west-1#/executions/details/#{@execution_arn}"
      end
    end

    private

    # Requests the two most recent history events, newest first, and returns
    # the last of those, i.e. the penultimate event of the execution, which
    # holds the failure reason. Returns nil when no events come back.
    def fail_event
      @aws_client.get_execution_history(
        {
          execution_arn: @execution_arn,
          max_results: 2,
          reverse_order: true
        }
      ).events.last
    end
  end
end
36 changes: 36 additions & 0 deletions lib/deployment/events/fail_state_entered.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
require 'json'

module Deployment
  module Events
    # Turns a FailStateEntered history event into a failure description.
    class FailStateEntered
      # Used when no message can be extracted from the event at all.
      UNKNOWN_ERROR = 'Unknown error'

      # @param event [#state_entered_event_details] history event whose
      #   details expose the state input as a JSON string
      # @param execution_arn [String] ARN used to build the console link
      def initialize(event, execution_arn)
        @event = event
        @execution_arn = execution_arn
      end

      # @return [String] a fixed Sidekiq message when the error mentions both
      #   "ECS" and "IN_PROGRESS", otherwise the extracted error message
      #   followed by a link to the execution in the AWS console
      def error
        if sidekiq_error?
          'Sidekiq workers failed to start or failed to stabilise.'
        else
          "Failure message: #{error_message}. Please investigate further here if required https://eu-west-1.console.aws.amazon.com/states/home?region=eu-west-1#/executions/details/#{@execution_arn}"
        end
      end

      private

      def error_message
        @error_message ||= extract_error_message
      end

      # The state input is JSON whose Error.Cause field is normally itself a
      # JSON document carrying errorMessage. When Cause is not JSON or has no
      # errorMessage the raw Cause is used, and when there is no Cause at all
      # UNKNOWN_ERROR is used, so that a failure report is always produced.
      def extract_error_message
        input = parse_json(@event.state_entered_event_details.input)
        error = input.is_a?(Hash) ? input['Error'] : nil
        cause = error.is_a?(Hash) ? error['Cause'] : nil
        details = parse_json(cause)
        message = details.is_a?(Hash) ? details['errorMessage'] : nil

        (message || cause || UNKNOWN_ERROR).to_s
      end

      # Parses text as JSON, returning nil when it is not a JSON string.
      def parse_json(text)
        JSON.parse(text)
      rescue JSON::ParserError, TypeError
        nil
      end

      def in_progress?
        error_message.include? 'IN_PROGRESS'
      end

      def ecs_error?
        error_message.include? 'ECS'
      end

      def sidekiq_error?
        ecs_error? && in_progress?
      end
    end
  end
end
42 changes: 42 additions & 0 deletions lib/deployment/events/lambda_function_failed.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
require 'json'

module Deployment
  module Events
    # Turns a LambdaFunctionFailed history event into a failure description.
    class LambdaFunctionFailed
      # @param event [#lambda_function_failed_event_details] history event
      #   whose details expose the failure cause as a string
      def initialize(event)
        @event = event
      end

      # @return [String] an explanatory message when the pre flight checks
      #   failed because of the forward deploy check, otherwise the error
      #   message taken from the event
      def error
        if preflight_checks_failed? && forward_deploy_check_failed?
          'Forward deploy check FAILED. No need to panic! '\
          'This likely means your commit has already been deployed as part of a previous deploy. '\
          'To confirm you can check whether your SHA is a parent commit to the currently deployed SHA. '\
          'You can figure out the currently deployed SHA by following this guide https://www.notion.so/freeagent/Deployment-Runbooks-29796221387e40b7abbb217d7d33c4ac?pvs=4#3bfa2ab5d3ab4c33b7a46522027f94bb'
        else
          error_message
        end
      end

      private

      def error_message
        @error_message ||= extract_error_message
      end

      # The cause is normally a JSON document with an errorMessage field. When
      # it is not JSON, or has no errorMessage, the raw cause is reported
      # instead of raising.
      def extract_error_message
        cause = @event.lambda_function_failed_event_details.cause
        details = begin
          JSON.parse(cause)
        rescue JSON::ParserError, TypeError
          nil
        end
        message = details.is_a?(Hash) ? details['errorMessage'] : nil

        (message || cause).to_s
      end

      # Second line of the error message, which carries the per-check results.
      # nil when the message has only one line.
      def preflight_checks_output
        error_message.lines[1]
      end

      def preflight_checks_failed?
        error_message.include? 'Pre flight checks failed'
      end

      # Result recorded for ForwardDeployCheck, or nil when the output line or
      # the entry is missing.
      def forward_deploy_check_result
        preflight_checks_output.to_s[/ForwardDeployCheck=>"(.*?)"/i, 1]
      end

      def forward_deploy_check_failed?
        forward_deploy_check_result == 'FAILED'
      end
    end
  end
end
117 changes: 117 additions & 0 deletions spec/app_spec.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,117 @@
require 'rspec'
require 'stringio'
require 'tempfile'

require 'aws-sdk-states'

# End-to-end examples for app.rb: the AWS client is replaced with a double,
# the script is loaded, and what it prints and appends to the GITHUB_OUTPUT
# file is checked.
RSpec.describe 'app' do
  let(:mock_client) { double('Aws::States::Client') }
  let(:execution_arn) { 'arn:aws:states:eu-west-1:123456789012:execution:my-execution-flow' }
  let(:test_github_log) { Tempfile.new('github_output') }

  # Executes the script under test; `load` re-runs it on every call.
  def run_app
    load "#{__dir__}/../app.rb"
  end

  before(:each) do
    # Remember the variables we are about to overwrite so they can be restored.
    @saved_env = {
      'EXECUTION_ARN' => ENV['EXECUTION_ARN'],
      'GITHUB_OUTPUT' => ENV['GITHUB_OUTPUT']
    }
    ENV['EXECUTION_ARN'] = execution_arn
    ENV['GITHUB_OUTPUT'] = test_github_log.path
    allow(Aws::States::Client).to receive(:new).and_return(mock_client)
    allow(mock_client).to receive(:describe_execution).with(any_args).and_return(desc_exec_resp)
    allow(mock_client).to receive(:get_execution_history).with(any_args).and_return(get_exec_history)
  end

  after(:each) do
    # Assigning nil removes a variable that was not set before the example.
    @saved_env.each { |key, value| ENV[key] = value }
    test_github_log.close!
  end

  context 'when deployment was successful' do
    let(:desc_exec_resp) { Aws::States::Types::DescribeExecutionOutput.new(status: 'SUCCEEDED') }
    let(:get_exec_history) { nil }

    it 'outputs "Deployment Successful 🎉"' do
      # The output matcher swaps $stdout only for the duration of the block,
      # so the global is not left pointing at a StringIO afterwards.
      expect { run_app }.to output("Deployment Successful 🎉\n").to_stdout
    end
  end

  context 'when forward deploy check failed' do
    let(:desc_exec_resp) { Aws::States::Types::DescribeExecutionOutput.new(status: 'FAILED') }
    let(:get_exec_history) do
      Aws::States::Types::GetExecutionHistoryOutput.new(
        events: [
          '',
          Aws::States::Types::HistoryEvent.new(
            type: 'LambdaFunctionFailed',
            lambda_function_failed_event_details: Aws::States::Types::LambdaFunctionFailedEventDetails.new(
              cause: '
              {
                "errorMessage": "Pre flight checks failed:\n{\"Checks\"=>{:RequiredParameters=>\"PASSED\", :CommitCheck=>\"PASSED\", :ScheduleCheck=>\"PASSED\", :ForwardDeployCheck=>\"FAILED\"}, \"Status\"=>\"FAILED\"}",
                "errorType": "Function<StandardError>",
                "stackTrace": [
                  "/var/task/pre_flight_checks.rb:145:in `handler"
                ]
              }'
            )
          )
        ]
      )
    end

    it 'outputs that it failed' do
      expect { run_app }.to output("Deployment Failure Status: FAILED ❌\n").to_stdout
      expect(File.readlines(test_github_log.path)).to eq(
        [
          "deployment_failed=true\n",
          'deployment_failure_reason=' \
          'Forward deploy check FAILED. No need to panic! '\
          'This likely means your commit has already been deployed as part of a previous deploy. '\
          'To confirm you can check whether your SHA is a parent commit to the currently deployed SHA. '\
          "You can figure out the currently deployed SHA by following this guide https://www.notion.so/freeagent/Deployment-Runbooks-29796221387e40b7abbb217d7d33c4ac?pvs=4#3bfa2ab5d3ab4c33b7a46522027f94bb\n"
        ]
      )
    end
  end

  context 'when sidekiq failed to start' do
    let(:desc_exec_resp) { Aws::States::Types::DescribeExecutionOutput.new(status: 'FAILED') }
    let(:get_exec_history) do
      Aws::States::Types::GetExecutionHistoryOutput.new(
        events: [
          '',
          Aws::States::Types::HistoryEvent.new(
            type: 'FailStateEntered',
            state_entered_event_details: Aws::States::Types::StateEnteredEventDetails.new(
              input: '
              {
                "Error": {
                  "Cause":"{\"errorMessage\":\"ECS deployment status: IN_PROGRESS\",\"errorType\":\"Function<DeployInProgress>\",\"stackTrace\":[\"/var/task/ecs_deployment_handler.rb:49:in `handler\"]}",
                  "error":"Function<DeployInProgress>","resource":"invoke","resourceType":"lambda}"
                }
              }
              '
            )
          )
        ]
      )
    end

    it 'outputs that it failed' do
      expect { run_app }.to output("Deployment Failure Status: FAILED ❌\n").to_stdout
      expect(File.readlines(test_github_log.path)).to eq(
        [
          "deployment_failed=true\n",
          "deployment_failure_reason=Sidekiq workers failed to start or failed to stabilise.\n"
        ]
      )
    end
  end
end
Loading

0 comments on commit 02ce214

Please sign in to comment.