Skip to content

Commit

Permalink
feat: Add basic metrics (#224)
Browse files Browse the repository at this point in the history
Mostly we expose the built-in metrics of the step function, but the main addition is the jobs completed metric. It looks into the runner logs and extracts whether the job itself was successful. This is done using metric filters. The status of the job is exported as a dimension. The provider labels are also exported as a dimension so that the data can be easily sliced.

To make the step-function-based metrics more meaningful, we no longer start the step function for jobs that do not have the self-hosted label. The webhook handler now checks for the self-hosted label itself, instead of the check being the first step of the step function.

Fixes #59 

BREAKING CHANGE: IRunnerProvider has a new logGroup field
  • Loading branch information
kichik authored Jan 17, 2023
1 parent 7aaf692 commit 746fc41
Show file tree
Hide file tree
Showing 16 changed files with 1,476 additions and 95 deletions.
147 changes: 147 additions & 0 deletions API.md

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

8 changes: 8 additions & 0 deletions src/lambdas/webhook-handler/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,14 @@ exports.handler = async function (event: AWSLambda.APIGatewayProxyEventV2): Prom
};
}

if (!payload.workflow_job.labels.includes('self-hosted')) {
console.log(`Ignoring labels "${payload.workflow_job.labels}", expecting "self-hosted"`);
return {
statusCode: 200,
body: 'OK. No runner started.',
};
}

// it's easier to deal with maps in step functions
let labels: any = {};
payload.workflow_job.labels.forEach((l: string) => labels[l] = true);
Expand Down
12 changes: 11 additions & 1 deletion src/providers/codebuild.ts
Original file line number Diff line number Diff line change
Expand Up @@ -164,10 +164,16 @@ export class CodeBuildRunner extends BaseProvider implements IRunnerProvider {
*/
readonly image: RunnerImage;

/**
* Log group where provided runners will save their logs.
*
* Note that this is not the job log, but the log of the runner itself. It will not contain output from the GitHub Action but only metadata on its execution.
*/
readonly logGroup: logs.ILogGroup;

private readonly vpc?: ec2.IVpc;
private readonly securityGroups?: ec2.ISecurityGroup[];
private readonly dind: boolean;
private readonly logGroup: logs.LogGroup;

constructor(scope: Construct, id: string, props?: CodeBuildRunnerProps) {
super(scope, id, props);
Expand Down Expand Up @@ -212,6 +218,8 @@ export class CodeBuildRunner extends BaseProvider implements IRunnerProvider {
build: {
commands: [
'sudo --preserve-env=AWS_CONTAINER_CREDENTIALS_RELATIVE_URI,AWS_DEFAULT_REGION,AWS_REGION -Hu runner /home/runner/run.sh',
'STATUS=$(grep -Phors "finish job request for job [0-9a-f\\-]+ with result: \\K.*" /home/runner/_diag/ | tail -n1)',
'[ -n "$STATUS" ] && echo CDKGHA JOB DONE "$RUNNER_LABEL" "$STATUS"',
],
},
},
Expand All @@ -231,6 +239,8 @@ export class CodeBuildRunner extends BaseProvider implements IRunnerProvider {
buildSpec.phases.build.commands = [
'cd \\actions',
'./run.cmd',
'$STATUS = Select-String -Path \'./_diag/*.log\' -Pattern \'finish job request for job [0-9a-f\\-]+ with result: (.*)\' | %{$_.Matches.Groups[1].Value} | Select-Object -Last 1',
'if ($STATUS) { echo "CDKGHA JOB DONE $\{Env:RUNNER_LABEL\} $STATUS" }',
];
}

Expand Down
7 changes: 7 additions & 0 deletions src/providers/common.ts
Original file line number Diff line number Diff line change
Expand Up @@ -427,6 +427,13 @@ export interface IRunnerProvider extends ec2.IConnectable, iam.IGrantable, ICons
*/
readonly labels: string[];

/**
* Log group where provided runners will save their logs.
*
* Note that this is not the job log, but the log of the runner itself. It will not contain output from the GitHub Action but only metadata on its execution.
*/
readonly logGroup: logs.ILogGroup;

/**
* Generate step function tasks that execute the runner.
*
Expand Down
3 changes: 3 additions & 0 deletions src/providers/docker-images/lambda/linux-arm64/runner.sh
Original file line number Diff line number Diff line change
Expand Up @@ -12,3 +12,6 @@ if [ "${RUNNER_VERSION}" = "latest" ]; then RUNNER_FLAGS=""; else RUNNER_FLAGS="
echo Config done
./run.sh
echo Run done

STATUS=$(grep -Phors "finish job request for job [0-9a-f\-]+ with result: \K.*" _diag/ | tail -n1)
[ -n "$STATUS" ] && echo CDKGHA JOB DONE "$RUNNER_LABEL" "$STATUS"
3 changes: 3 additions & 0 deletions src/providers/docker-images/lambda/linux-x64/runner.sh
Original file line number Diff line number Diff line change
Expand Up @@ -12,3 +12,6 @@ if [ "${RUNNER_VERSION}" = "latest" ]; then RUNNER_FLAGS=""; else RUNNER_FLAGS="
echo Config done
./run.sh
echo Run done

STATUS=$(grep -Phors "finish job request for job [0-9a-f\-]+ with result: \K.*" _diag/ | tail -n1)
[ -n "$STATUS" ] && echo CDKGHA JOB DONE "$RUNNER_LABEL" "$STATUS"
15 changes: 14 additions & 1 deletion src/providers/ec2.ts
Original file line number Diff line number Diff line change
Expand Up @@ -60,13 +60,16 @@ EOF
action () {
sudo -Hu runner /home/runner/config.sh --unattended --url "https://{}/{}/{}" --token "{}" --ephemeral --work _work --labels "{}" {} --name "{}" || exit 1
sudo --preserve-env=AWS_REGION -Hu runner /home/runner/run.sh || exit 2
STATUS=$(grep -Phors "finish job request for job [0-9a-f\\\\-]+ with result: \\\\K.*" /home/runner/_diag/ | tail -n1)
[ -n "$STATUS" ] && echo CDKGHA JOB DONE "{}" "$STATUS"
}
heartbeat &
if setup_logs && action | tee /var/log/runner.log 2>&1; then
aws stepfunctions send-task-success --task-token "$TASK_TOKEN" --task-output '{"ok": true}'
else
aws stepfunctions send-task-failure --task-token "$TASK_TOKEN"
fi
sleep 10 # give cloudwatch agent its default 5 seconds buffer duration to upload logs
poweroff
`.replace(/{/g, '\\{').replace(/}/g, '\\}').replace(/\\{\\}/g, '{}');

Expand Down Expand Up @@ -106,6 +109,8 @@ function action () {
if ($LASTEXITCODE -ne 0) { return 1 }
./run.cmd 2>&1 | Out-File -Encoding ASCII -Append /actions/runner.log
if ($LASTEXITCODE -ne 0) { return 2 }
$STATUS = Select-String -Path './_diag/*.log' -Pattern 'finish job request for job [0-9a-f\\\\-]+ with result: (.*)' | %{$_.Matches.Groups[1].Value} | Select-Object -Last 1
if ($STATUS) { echo "CDKGHA JOB DONE {} $STATUS" | Out-File -Encoding ASCII -Append /actions/runner.log }
return 0
}
setup_logs
Expand All @@ -115,6 +120,7 @@ if ($r -eq 0) {
} else {
aws stepfunctions send-task-failure --task-token "$TASK_TOKEN"
}
Start-Sleep -Seconds 10 # give cloudwatch agent its default 5 seconds buffer duration to upload logs
Stop-Computer -ComputerName localhost -Force
</powershell>
`.replace(/{/g, '\\{').replace(/}/g, '\\}').replace(/\\{\\}/g, '{}');
Expand Down Expand Up @@ -226,8 +232,14 @@ export class Ec2Runner extends BaseProvider implements IRunnerProvider {
*/
readonly grantPrincipal: iam.IPrincipal;

/**
* Log group where provided runners will save their logs.
*
* Note that this is not the job log, but the log of the runner itself. It will not contain output from the GitHub Action but only metadata on its execution.
*/
readonly logGroup: logs.ILogGroup;

private readonly ami: RunnerAmi;
private readonly logGroup: logs.LogGroup;
private readonly role: iam.Role;
private readonly instanceType: ec2.InstanceType;
private readonly storageSize: cdk.Size;
Expand Down Expand Up @@ -303,6 +315,7 @@ export class Ec2Runner extends BaseProvider implements IRunnerProvider {
this.labels.join(','),
this.ami.runnerVersion.is(RunnerVersion.latest()) ? '' : '--disableupdate',
parameters.runnerNamePath,
this.labels.join(','),
];

const passUserData = new stepfunctions.Pass(this, `${this.labels.join(', ')} data`, {
Expand Down
Loading

0 comments on commit 746fc41

Please sign in to comment.