
Commit 51806ab

Add CI/CD based on GitHub actions (#102)
This is copying the CI/CD setup from insrc with minimal changes (adapting some paths, using a different cloud project for RBE). I had to copy the `deployments/` folders for robco-integration-test and robco-navtest as they are needed by the CI. However, I removed the OAuth secrets from config.sh, which don't seem to be needed by the test. My plan for now is to keep the duplicate setup, which I expect to change very rarely.
1 parent 9c1a523 commit 51806ab

17 files changed: +761 -5 lines changed

.github/ci/.bazelrc

Lines changed: 18 additions & 0 deletions
@@ -0,0 +1,18 @@
# Bazel config for CI/CD builds.
# This expects robco_integration_test_credentials.json to be available locally for AuthZ.

# Use RBE remote execution and caching on robco-integration-test.
build --config=remote
build --remote_instance_name=projects/robco-integration-test/instances/default_instance
build --google_credentials=robco_integration_test_credentials.json
# Slightly higher than the number of available remote workers (10 in default_instance).
# This has not been tuned a lot.
build --jobs=12
# No need to download every intermediate output to the local runner.
build --remote_download_toplevel

# Use Result Store to store build and test logs.
build --bes_backend=buildeventservice.googleapis.com
build --bes_results_url=https://source.cloud.google.com/results/invocations
build --bes_timeout=600s
build --bes_instance_name=robco-integration-test
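
For reference, the CI scripts below pass this file to Bazel explicitly rather than relying on an implicit lookup (see bazel_ci in common.sh). A minimal sketch of an equivalent manual invocation, assuming robco_integration_test_credentials.json is already present in the workspace root:

# Run a CI-equivalent build by pointing Bazel at the checked-in rc file.
bazel --bazelrc=.github/ci/.bazelrc build //...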
Lines changed: 30 additions & 0 deletions
@@ -0,0 +1,30 @@
# Image used for integration_test.sh on Cloud Build.
# Allows access to GKE and to run Bazel commands.
FROM gcr.io/cloud-builders/kubectl

# https://bazel.build/install/ubuntu#install-on-ubuntu
RUN \
    apt-get update && \
    apt-get install apt-transport-https curl gnupg -y && \
    curl -fsSL https://bazel.build/bazel-release.pub.gpg | gpg --dearmor >bazel-archive-keyring.gpg && \
    mv bazel-archive-keyring.gpg /usr/share/keyrings && \
    echo "deb [arch=amd64 signed-by=/usr/share/keyrings/bazel-archive-keyring.gpg] https://storage.googleapis.com/bazel-apt stable jdk1.8" | tee /etc/apt/sources.list.d/bazel.list && \

    apt-get update && \
    apt-get install -y \
        bazel-5.4.0 \
        git \
        jq && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/* && \

    ln -s /usr/bin/bazel-5.4.0 /usr/bin/bazel && \
    # Unpack Bazel for future use.
    bazel version

# rules_python is not happy if bazel runs as root so create a new user
# https://github.com/bazelbuild/rules_python/pull/713
# https://github.com/GoogleCloudPlatform/cloud-builders/issues/641
RUN mkdir -p /builder /output /workspace && chmod -R 777 /output
RUN adduser builder
USER builder
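
For local experimentation, the builder image above can also be built and smoke-tested outside of Cloud Build. A minimal sketch, assuming it is run from the directory containing this Dockerfile; the local tag is a hypothetical placeholder, not defined by this commit:

# Hypothetical local tag; the CI uses this image as its Cloud Build builder.
docker build -t robco-ci-builder .
# Override the entrypoint so the command runs bazel directly and verifies the install.
docker run --rm --entrypoint bazel robco-ci-builder version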

.github/ci/common.sh

Lines changed: 121 additions & 0 deletions
@@ -0,0 +1,121 @@
#!/bin/bash

# Format for the xtrace lines
export 'PS4=+$(date --rfc-3339=seconds):${BASH_SOURCE}:${LINENO}: '
set -o errexit   # exit immediately, if a pipeline command fails
set -o pipefail  # returns the last command to exit with a non-zero status
set -o xtrace    # print command traces before executing command

# Wraps the common Bazel flags for CI for brevity.
function bazel_ci {
  bazel --bazelrc="${DIR}/.bazelrc" "$@"
}

function generate_build_id() {
  # Considerations for a build identifier: It must be unique, it shouldn't break
  # if we try multiple dailies in a day, and it would be nice if a textual sort
  # would put newest releases last.
  git_hash=$(echo "$GITHUB_SHA" | cut -c1-6)
  date "+daily-%Y-%m-%d-${git_hash}"
}

function run_on_robot_sim() {
  local SIM_HOST="$1"
  shift
  # We don't know if this was executed with errexit on or off. Make sure that we
  # print the status and return the correct code either way.
  rc=0
  ssh -o "StrictHostKeyChecking=no" -i ~/.ssh/google_compute_engine builder@${SIM_HOST} "$@" || rc=$?
  echo "Done executing remote command: $* : ${rc}"
  return "${rc}"
}

function init_robot_sim() {
  local SIM_HOST="$1"
  local DEPLOY_FILES="$2"

  run_on_robot_sim ${SIM_HOST} 'rm -fr ~/robco/'

  echo "Uploading setup files"
  run_on_robot_sim ${SIM_HOST} "mkdir -p ~/robco"
  scp -o "StrictHostKeyChecking=no" -i ~/.ssh/google_compute_engine ${DEPLOY_FILES} ${SIM_HOST}:~/robco/

  # Terraform creates the robot-sim VM, but doesn't install the local cluster.
  # Since this script is idempotent, we run it on every test.
  # shellcheck disable=2088
  run_on_robot_sim ${SIM_HOST} "~/robco/install_k8s_on_robot.sh"
}

function cleanup_old_vm_instances() {
  # Aborted CI runs might leak VM instances, so we delete old tagged instances.
  local instances
  instances="$(gcloud compute instances list \
    --filter "tags.items=delete-after-one-day AND creationTimestamp<-P1D" \
    --project=${GCP_PROJECT_ID} --format='value(name)')"

  if [[ -n "$instances" ]] ; then
    gcloud compute instances delete $instances \
      --quiet --project=${GCP_PROJECT_ID} --zone=${GCP_ZONE}
  fi
}

function cleanup_old_ssh_keys() {
  # Work around overflowing the VM metadata store (b/113859328) - delete all past builder keys.
  local keys
  keys="$(mktemp /tmp/keys.XXXXXX)"

  gcloud compute project-info describe --format=json --project=${GCP_PROJECT_ID} | jq -r '.commonInstanceMetadata.items[] | select (.key == "ssh-keys") | .value' | egrep -v "^builder:" >${keys}
  gcloud compute project-info add-metadata --no-user-output-enabled --metadata-from-file ssh-keys=${keys} --project=${GCP_PROJECT_ID}
  rm -f ${keys}
}

# Pushes images and releases a binary to a specified bucket.
# bucket: target GCS bucket to release to
# name: name of the release tarball
# labels: optional list of filename aliases for the release, these are one-line
#   text files with the release name as a bucket-local path
function release_binary {
  local bucket="$1"
  local name="$2"

  # This function is called from test and release pipelines. We (re)build the binary and push the
  # app images here to ensure the app images which are referenced in the binary exist in the
  # registry.
  bazel_ci build \
    //src/bootstrap/cloud:crc-binary \
    //src/app_charts:push \
    //src/go/cmd/setup-robot:setup-robot.push

  # The tag variable must be called 'TAG', see cloud-robotics/bazel/container_push.bzl
  for t in latest ${DOCKER_TAG}; do
    bazel-bin/src/go/cmd/setup-robot/setup-robot.push \
      --dst="${CLOUD_ROBOTICS_CONTAINER_REGISTRY}/setup-robot:${t}"
    TAG="$t" bazel-bin/src/app_charts/push "${CLOUD_ROBOTICS_CONTAINER_REGISTRY}"
  done

  gsutil cp -a public-read \
    bazel-bin/src/bootstrap/cloud/crc-binary.tar.gz \
    "gs://${bucket}/${name}.tar.gz"

  # Overwrite cache control as we want changes to run-install.sh and version files to be visible
  # right away.
  gsutil -h "Cache-Control:private, max-age=0, no-transform" \
    cp -a public-read \
    src/bootstrap/cloud/run-install.sh \
    "gs://${bucket}/"

  # The remaining arguments are version labels. gsutil does not support symlinks, so we use version
  # files instead.
  local vfile
  vfile=$(mktemp)
  echo "${name}.tar.gz" >${vfile}
  shift 2
  # Loop over remaining args in $* and create alias files.
  for label; do
    gsutil -h "Cache-Control:private, max-age=0, no-transform" \
      cp -a public-read \
      ${vfile} "gs://${bucket}/${label}"
  done
}
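
As a usage sketch, integration_test.sh below calls this function with the CI bucket, a build-specific name, and the "latest" label; the build id shown here is only an example of the format produced by generate_build_id:

# Example call (as made by integration_test.sh after a successful run on main):
release_binary "robco-ci-binary-builds" "crc-daily-2024-01-01-abc123" "latest"
# Resulting objects, per the function above:
#   gs://robco-ci-binary-builds/crc-daily-2024-01-01-abc123.tar.gz   (release tarball)
#   gs://robco-ci-binary-builds/run-install.sh                       (copied with no-cache headers)
#   gs://robco-ci-binary-builds/latest                               (one line: crc-daily-2024-01-01-abc123.tar.gz)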

.github/ci/deploy_navtest.sh

Lines changed: 20 additions & 0 deletions
@@ -0,0 +1,20 @@
#!/bin/bash

DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
source "${DIR}/common.sh"

PROJECT_DIR="${DIR}/deployments/robco-navtest"
source "${PROJECT_DIR}/config.sh"

gcloud auth activate-service-account --key-file robco_navtest_credentials.json
gcloud auth configure-docker --quiet
export GOOGLE_APPLICATION_CREDENTIALS=$(pwd)/robco_navtest_credentials.json

# TODO(skopecki) These variables should be declared in the run-install.sh and removed from this script.
export BUCKET_URI="https://storage.googleapis.com/robco-ci-binary-builds"
export SOURCE_CONTAINER_REGISTRY="gcr.io/robco-team"

# Deploy the binary release that was pushed by the last successful integration test.
curl --silent --show-error --fail "${BUCKET_URI}/run-install.sh" \
  | bash -x -s -- ${GCP_PROJECT_ID}
Lines changed: 15 additions & 0 deletions
@@ -0,0 +1,15 @@
#!/usr/bin/env bash

# Enable cloud robotics layer 2
APP_MANAGEMENT=true

GCP_PROJECT_ID=robco-integration-test
GCP_REGION=europe-west1
GCP_ZONE=europe-west1-c

CLOUD_ROBOTICS_DEPLOY_ENVIRONMENT=GCP-testing
TERRAFORM_GCS_BUCKET="robco-team-terraform-state"
TERRAFORM_GCS_PREFIX="state/${GCP_PROJECT_ID}"
CLOUD_ROBOTICS_CONTAINER_REGISTRY=gcr.io/robco-team
PRIVATE_DOCKER_PROJECTS=robco-team
CR_SYNCER_RBAC=true
Lines changed: 12 additions & 0 deletions
@@ -0,0 +1,12 @@
apiVersion: apps.cloudrobotics.com/v1alpha1
kind: AppRollout
metadata:
  name: k8s-relay
  labels:
    app: k8s-relay
spec:
  appName: k8s-relay-dev
  cloud: {}
  robots:
  - selector:
      any: true
Lines changed: 14 additions & 0 deletions
@@ -0,0 +1,14 @@
#!/usr/bin/env bash

# Enable google cloud robotics layer 2
APP_MANAGEMENT=true

GCP_PROJECT_ID=robco-navtest
GCP_REGION=europe-west1
GCP_ZONE=europe-west1-c

TERRAFORM_GCS_BUCKET="robco-team-terraform-state"
TERRAFORM_GCS_PREFIX="state/${GCP_PROJECT_ID}"
CLOUD_ROBOTICS_CONTAINER_REGISTRY=gcr.io/robco-team
PRIVATE_DOCKER_PROJECTS=robco-team
CR_SYNCER_RBAC=true

.github/ci/integration_test.sh

Lines changed: 149 additions & 0 deletions
@@ -0,0 +1,149 @@
#!/bin/bash

DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
source "${DIR}/common.sh"

# Because the format from common.sh is not recognized by Cloud Build.
export 'PS4='

LOCK_OBJECT=gs://robco-integration-test-lock/lock
LOCK_BACKOFF_SECONDS=60

lock() {
  # Take the lock by creating the lock object. x-goog-if-generation-match:0 is a
  # GCS precondition that causes `cp` to fail if the lock object already exists.
  while ! echo "lock" | gsutil -q -h "x-goog-if-generation-match:0" cp - $LOCK_OBJECT
  do
    : "lock: failed to obtain lock, retrying in $LOCK_BACKOFF_SECONDS seconds"
    : "Note to build cop: if you think there is a stale lock, run:"
    : "    gsutil rm $LOCK_OBJECT"
    : "This can occur when a previous job timed out or was canceled while"
    : "holding the lock."
    sleep $LOCK_BACKOFF_SECONDS
  done
  # TODO(rodrigoq): if the build is cancelled by GitHub, the lock is not
  # released. The GCS lifecycle will delete the lock after a day, if the build
  # cop doesn't delete it sooner. We could add a check here to delete the lock
  # if it's too old, but I don't know how to do that safely - maybe a second
  # lock would prevent races between deletion checks, but maybe it would just
  # introduce other failure modes.
}

finalize_and_unlock() {
  # Clean up CR of test robot.
  kubectl delete robots.registry.cloudrobotics.com "${NEW_ROBOT_NAME}" &> /dev/null || true

  cleanup_old_ssh_keys || true
  cleanup_old_vm_instances || true

  local sleep_time=1
  while ! gsutil -q rm $LOCK_OBJECT
  do
    echo "unlock: failed to relinquish lock, retrying in $sleep_time seconds"
    sleep $sleep_time
    sleep_time=$(expr $sleep_time '*' 2)
  done
}

# Need to source the project config from here
PROJECT_DIR="${DIR}/deployments/robco-integration-test"
source "${PROJECT_DIR}/config.sh"
gcloud config set project ${GCP_PROJECT_ID}
gcloud container clusters get-credentials cloud-robotics --zone=${GCP_ZONE}

BUILD_IDENTIFIER=$(generate_build_id)
echo "INFO: Build identifier is $BUILD_IDENTIFIER"

bazel_ci build //...

# Get the lock before deploying to the project. This ensures that other runs
# will not change our deployment until we finish testing.
lock

# `set +x` avoids log spam and makes error messages more obvious.
trap 'set +x; finalize_and_unlock' EXIT

export BAZEL_FLAGS="--bazelrc=${DIR}/.bazelrc"
bash -x ./deploy.sh update robco-integration-test

DOMAIN=${CLOUD_ROBOTICS_DOMAIN:-"www.endpoints.${GCP_PROJECT_ID}.cloud.goog"}
CLOUD_CONTEXT="gke_${GCP_PROJECT_ID}_${GCP_ZONE}_cloud-robotics"
SETUP_DEV_BINARY=./bazel-bin/src/go/cmd/setup-dev/setup-dev_/setup-dev

# This generates a .ssh/config for the sim-host
gcloud compute config-ssh

# The `name` here should match the instance name in
# ci/terraform/robco-integration-test.sh.
# The `|| true` and `if [[ -z ...` bits work around a gcloud issue (b/147795223).
SIM_HOST="$(gcloud compute instances list --project ${GCP_PROJECT_ID} --filter='name=("robot-sim")' --format='value(networkInterfaces.networkIP)' || true)"
if [[ -z "$SIM_HOST" ]] ; then
  echo "Failed to get IP of robot-sim VM instance." >&2
  exit 1
fi

DEPLOY_FILES="src/bootstrap/robot/setup_robot.sh \
  src/bootstrap/robot/install_k8s_on_robot.sh \
  ./bazel-out/../../../external/kubernetes_helm/helm"
init_robot_sim ${SIM_HOST} "${DEPLOY_FILES}"

# Setup new robot
NEW_ROBOT_NAME="test-robot"
NEW_ROBOT_TYPE="test-robot-type"

# Pre-create metadata-server firewall rule to avoid race (b/121175402).
METADATA_SERVER_RULE="-p tcp -d 169.254.169.254 --dport 80 -j DNAT --to-destination 127.0.0.1:8965 -m comment --comment 'from ci/integration_test.sh'"
run_on_robot_sim ${SIM_HOST} \
  "sudo iptables --table nat --wait --verbose --check PREROUTING ${METADATA_SERVER_RULE} \
   || sudo iptables --table nat --wait --verbose --append PREROUTING ${METADATA_SERVER_RULE}"

gcloud auth application-default print-access-token --project ${GCP_PROJECT_ID} | \
  run_on_robot_sim ${SIM_HOST} "cat > ~/access_token"
run_on_robot_sim ${SIM_HOST} "ACCESS_TOKEN_FILE=~/access_token ~/robco/setup_robot.sh ${NEW_ROBOT_NAME} --project ${GCP_PROJECT_ID} --robot-type ${NEW_ROBOT_TYPE}" || {
  : "setup_robot failed."
  : "If you see 'certificate has expired or is not yet valid' above (b/178455122), try:"
  : "    gcloud compute config-ssh --project=robco-integration-test"
  : "    ssh robot-sim.europe-west1-c.robco-integration-test"
  : "    sudo kubeadm reset --force"
  exit 1
}
run_on_robot_sim ${SIM_HOST} "rm ~/access_token"

# TODO(b/121119919): remove this workaround
run_on_robot_sim ${SIM_HOST} "kubectl delete pod -l name=metadata-server"
# TODO(b/153142491): remove this workaround
run_on_robot_sim ${SIM_HOST} "kubectl delete pod -l app=gcr-credential-refresher"

"${SETUP_DEV_BINARY}" --project="${GCP_PROJECT_ID}" --robot-name="${NEW_ROBOT_NAME}"

# Deploy the k8s relay rollout.
kubectl apply -f "${DIR}/deployments/robco-integration-test/kubernetes/"

# Output state of cloud and robot k8s context to inspect the health of pods.
kubectl config get-contexts || true
kubectl --context ${CLOUD_CONTEXT} get pods || true
kubectl --context ${GCP_PROJECT_ID}-robot get pods || true

# For some reason //src/go/tests:go_default_test is expecting
# the kubeconfig in /home/builder/.kube/config, i.e. it does not use $HOME
# (which is /builder/home). alexanderfaxa@ could not figure out why so just
# copy the config there.
mkdir -p /home/builder/.kube
cp /builder/home/.kube/config /home/builder/.kube/config

bazel_ci test \
  --test_env GCP_PROJECT_ID=${GCP_PROJECT_ID} \
  --test_env GCP_REGION=${GCP_REGION} \
  --test_env GCP_ZONE=${GCP_ZONE} \
  --test_env PATH=$PATH \
  --jvmopt="-DCLOUD_ROBOTICS_DOMAIN=${DOMAIN}" \
  --test_output=streamed \
  --test_tag_filters="external" \
  --strategy=TestRunner=standalone \
  //...

# If this is running on main (i.e., not a manual run) then update the `latest`
# binary.
if [[ "$MANUAL_RUN" == "false" ]] ; then
  release_binary "robco-ci-binary-builds" "crc-${BUILD_IDENTIFIER}" "latest"
fi
