Skip to content

Commit

Permalink
Add CI/CD based on GitHub actions (#102)
Browse files Browse the repository at this point in the history
This is copying the CI/CD setup from insrc with minimal changes
(adapting some paths, using a different cloud project for RBE).

I had to copy the `deployments/` folders for robco-integration-test and
robco-navtest as they are needed by the CI. However, I removed the OAuth
secrets from config.sh, which don't seem needed by the test. My plan for
now is to keep the duplicate setup, which I expect to change very
rarely.
  • Loading branch information
faximan authored Feb 24, 2023
1 parent 9c1a523 commit 51806ab
Show file tree
Hide file tree
Showing 17 changed files with 761 additions and 5 deletions.
18 changes: 18 additions & 0 deletions .github/ci/.bazelrc
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# Bazel config for CI/CD builds.
# This expects robco_integration_test_credentials.json to be available locally for AuthZ.

# Use rbe remote execution and caching on robco-integration-test.
build --config=remote
build --remote_instance_name=projects/robco-integration-test/instances/default_instance
build --google_credentials=robco_integration_test_credentials.json
# Slightly higher than the number of available remote workers (10 in default_instance).
# This has not been tuned a lot.
build --jobs=12
# No need to download every intermediate output to the local runner.
build --remote_download_toplevel

# Use Result Store to store Build and Test logs.
build --bes_backend=buildeventservice.googleapis.com
build --bes_results_url=https://source.cloud.google.com/results/invocations
build --bes_timeout=600s
build --bes_instance_name=robco-integration-test
30 changes: 30 additions & 0 deletions .github/ci/Dockerfile.integration-test-image
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
# Image used for integration_test.sh on Cloud Build.
# Allows access to GKE and to run Bazel commands.
FROM gcr.io/cloud-builders/kubectl

# Install Bazel from the official apt repository:
# https://bazel.build/install/ubuntu#install-on-ubuntu
# NOTE(review): the blank lines inside this RUN rely on Docker tolerating empty
# continuation lines, which is deprecated — consider removing them.
RUN \
apt-get update && \
apt-get install apt-transport-https curl gnupg -y && \
curl -fsSL https://bazel.build/bazel-release.pub.gpg | gpg --dearmor >bazel-archive-keyring.gpg && \
mv bazel-archive-keyring.gpg /usr/share/keyrings && \
echo "deb [arch=amd64 signed-by=/usr/share/keyrings/bazel-archive-keyring.gpg] https://storage.googleapis.com/bazel-apt stable jdk1.8" | tee /etc/apt/sources.list.d/bazel.list && \

apt-get update && \
apt-get install -y \
bazel-5.4.0 \
git \
jq && \
apt-get clean && \
rm -rf /var/lib/apt/lists/* && \

ln -s /usr/bin/bazel-5.4.0 /usr/bin/bazel && \
# Unpack Bazel for future use.
bazel version

# rules_python is not happy if bazel runs as root so create a new user
# https://github.com/bazelbuild/rules_python/pull/713
# https://github.com/GoogleCloudPlatform/cloud-builders/issues/641
RUN mkdir -p /builder /output /workspace && chmod -R 777 /output
RUN adduser builder
USER builder
121 changes: 121 additions & 0 deletions .github/ci/common.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,121 @@
#!/bin/bash
# Shared helpers for the CI scripts. Meant to be sourced; callers must set DIR
# to the directory containing the CI .bazelrc (used by bazel_ci below).

# Format for the xtrace lines. The single quotes keep $(date ...) unexpanded
# here; PS4 is re-expanded for every traced command, giving fresh timestamps.
export 'PS4=+$(date --rfc-3339=seconds):${BASH_SOURCE}:${LINENO}: '
set -o errexit # exit immediately if a command exits with a non-zero status
set -o pipefail # a pipeline fails if any command in it fails, not just the last
set -o xtrace # print command traces before executing command

# Invokes bazel with the shared CI bazelrc so call sites stay short.
# Usage: bazel_ci <subcommand> [args...]. Requires DIR to point at the
# directory that holds the CI .bazelrc.
bazel_ci() {
  bazel --bazelrc="${DIR}/.bazelrc" "$@"
}

function generate_build_id() {
  # Considerations for a build identifier: It must be unique, it shouldn't break
  # if we try multiple dailies in a day, and it would be nice if a textual sort
  # would put newest releases last.
  # Uses the first 6 hex digits of the commit that triggered this run.
  local git_hash
  git_hash="${GITHUB_SHA:0:6}"
  date "+daily-%Y-%m-%d-${git_hash}"
}

function run_on_robot_sim() {
  # Runs a command on the sim-host VM as user "builder" via ssh.
  # $1: hostname/IP of the sim host; remaining args: the remote command.
  # Echoes the remote status and returns it to the caller.
  local SIM_HOST="$1"
  shift
  # We don't know if this was executed with errexit on or off. Make sure that we
  # print the status and return the correct code either way.
  local rc=0
  ssh -o "StrictHostKeyChecking=no" -i ~/.ssh/google_compute_engine "builder@${SIM_HOST}" "$@" || rc=$?
  echo "Done executing remote command: $* : ${rc}"
  return "${rc}"
}

function init_robot_sim() {
  # Wipes ~/robco on the sim host, uploads the bootstrap files, and (re)installs
  # the local k8s cluster there.
  # $1: sim host address; $2: whitespace-separated list of local files to copy.
  local SIM_HOST="$1"
  local DEPLOY_FILES="$2"

  run_on_robot_sim "${SIM_HOST}" 'rm -fr ~/robco/'

  echo "Uploading setup files"
  run_on_robot_sim "${SIM_HOST}" "mkdir -p ~/robco"
  # DEPLOY_FILES is deliberately unquoted: it holds several paths that must
  # word-split into separate scp arguments.
  # shellcheck disable=2086
  scp -o "StrictHostKeyChecking=no" -i ~/.ssh/google_compute_engine ${DEPLOY_FILES} "${SIM_HOST}:~/robco/"

  # Terraform creates the robot-sim VM, but doesn't install the local cluster.
  # Since this script is idempotent, we run it on every test.
  # shellcheck disable=2088
  run_on_robot_sim "${SIM_HOST}" "~/robco/install_k8s_on_robot.sh"
}

function cleanup_old_vm_instances() {
  # Aborted CI runs might leak VM instances, so we delete old tagged instances.
  # Requires GCP_PROJECT_ID and GCP_ZONE (set by the sourced project config.sh).
  local instances
  instances="$(gcloud compute instances list \
    --filter "tags.items=delete-after-one-day AND creationTimestamp<-P1D" \
    --project="${GCP_PROJECT_ID}" --format='value(name)')"

  if [[ -n "$instances" ]] ; then
    # $instances is deliberately unquoted: it is a newline-separated list that
    # must word-split into one argument per instance name.
    # shellcheck disable=2086
    gcloud compute instances delete $instances \
      --quiet --project="${GCP_PROJECT_ID}" --zone="${GCP_ZONE}"
  fi
}

function cleanup_old_ssh_keys() {
  # Work around overflowing the VM metadata store (b/113859328) - delete all past builder keys.
  # Rewrites the project-wide ssh-keys metadata with every "builder:" entry removed.
  local keys
  keys="$(mktemp /tmp/keys.XXXXXX)"

  # NOTE: under pipefail/errexit this aborts the caller if no non-builder keys
  # remain, because grep -v then exits non-zero (same as the original egrep).
  gcloud compute project-info describe --format=json --project="${GCP_PROJECT_ID}" \
    | jq -r '.commonInstanceMetadata.items[] | select (.key == "ssh-keys") | .value' \
    | grep -E -v "^builder:" >"${keys}"
  gcloud compute project-info add-metadata --no-user-output-enabled --metadata-from-file "ssh-keys=${keys}" --project="${GCP_PROJECT_ID}"
  rm -f "${keys}"
}

# Pushes images and releases a binary to a specified bucket.
# bucket: target GCS bucket to release to
# name: name of the release tar ball
# labels: optional list of filename aliases for the release, these are one-line
#   text files with the release name as a bucket local path
function release_binary {
  local bucket="$1"
  local name="$2"

  # This function is called from test and release pipelines. We (re)build the binary and push the
  # app images here to ensure the app images which are referenced in the binary exist in the
  # registry.
  bazel_ci build \
    //src/bootstrap/cloud:crc-binary \
    //src/app_charts:push \
    //src/go/cmd/setup-robot:setup-robot.push

  # The tag variable must be called 'TAG', see cloud-robotics/bazel/container_push.bzl
  # DOCKER_TAG is deliberately unquoted: when unset/empty we only push "latest".
  for t in latest ${DOCKER_TAG}; do
    bazel-bin/src/go/cmd/setup-robot/setup-robot.push \
      --dst="${CLOUD_ROBOTICS_CONTAINER_REGISTRY}/setup-robot:${t}"
    TAG="$t" bazel-bin/src/app_charts/push "${CLOUD_ROBOTICS_CONTAINER_REGISTRY}"
  done

  gsutil cp -a public-read \
    bazel-bin/src/bootstrap/cloud/crc-binary.tar.gz \
    "gs://${bucket}/${name}.tar.gz"

  # Overwrite cache control as we want changes to run-install.sh and version files to be visible
  # right away.
  gsutil -h "Cache-Control:private, max-age=0, no-transform" \
    cp -a public-read \
    src/bootstrap/cloud/run-install.sh \
    "gs://${bucket}/"

  # The remaining arguments are version labels. gsutil does not support symlinks, so we use version
  # files instead.
  local vfile
  vfile=$(mktemp)
  echo "${name}.tar.gz" >"${vfile}"
  shift 2
  # Loop over remaining args in $* and create alias files.
  for label; do
    gsutil -h "Cache-Control:private, max-age=0, no-transform" \
      cp -a public-read \
      "${vfile}" "gs://${bucket}/${label}"
  done
  # Clean up the temp version file (was leaked by the original).
  rm -f "${vfile}"
}


20 changes: 20 additions & 0 deletions .github/ci/deploy_navtest.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
#!/bin/bash
#
# Deploys the latest released binary to the robco-navtest project.
# Expects robco_navtest_credentials.json in the current working directory.

DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
source "${DIR}/common.sh"

PROJECT_DIR="${DIR}/deployments/robco-navtest"
source "${PROJECT_DIR}/config.sh"

gcloud auth activate-service-account --key-file robco_navtest_credentials.json
gcloud auth configure-docker --quiet
export GOOGLE_APPLICATION_CREDENTIALS="$(pwd)/robco_navtest_credentials.json"

# TODO(skopecki) These variables should be declared in the run-install.sh and removed from this script.
export BUCKET_URI="https://storage.googleapis.com/robco-ci-binary-builds"
export SOURCE_CONTAINER_REGISTRY="gcr.io/robco-team"

# Deploy the binary release that was pushed by the last successful integration test.
curl --silent --show-error --fail "${BUCKET_URI}/run-install.sh" \
  | bash -x -s -- "${GCP_PROJECT_ID}"

15 changes: 15 additions & 0 deletions .github/ci/deployments/robco-integration-test/config.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
#!/usr/bin/env bash
# Project configuration for the robco-integration-test cloud project.
# Sourced by the CI scripts; defines shell variables only, no side effects.

# Enable cloud robotics layer 2
APP_MANAGEMENT=true

GCP_PROJECT_ID=robco-integration-test
GCP_REGION=europe-west1
GCP_ZONE=europe-west1-c
[email protected]
CLOUD_ROBOTICS_DEPLOY_ENVIRONMENT=GCP-testing
TERRAFORM_GCS_BUCKET="robco-team-terraform-state"
TERRAFORM_GCS_PREFIX="state/${GCP_PROJECT_ID}"
CLOUD_ROBOTICS_CONTAINER_REGISTRY=gcr.io/robco-team
PRIVATE_DOCKER_PROJECTS=robco-team
CR_SYNCER_RBAC=true
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# AppRollout that rolls out the k8s-relay-dev app to every registered robot
# (the selector matches any robot; no cloud-side overrides are configured).
apiVersion: apps.cloudrobotics.com/v1alpha1
kind: AppRollout
metadata:
  name: k8s-relay
  labels:
    app: k8s-relay
spec:
  appName: k8s-relay-dev
  cloud: {}
  robots:
  - selector:
      any: true
14 changes: 14 additions & 0 deletions .github/ci/deployments/robco-navtest/config.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
#!/usr/bin/env bash
# Project configuration for the robco-navtest cloud project.
# Sourced by the CI scripts; defines shell variables only, no side effects.

# Enable google cloud robotics layer 2
APP_MANAGEMENT=true

GCP_PROJECT_ID=robco-navtest
GCP_REGION=europe-west1
GCP_ZONE=europe-west1-c
[email protected]
TERRAFORM_GCS_BUCKET="robco-team-terraform-state"
TERRAFORM_GCS_PREFIX="state/${GCP_PROJECT_ID}"
CLOUD_ROBOTICS_CONTAINER_REGISTRY=gcr.io/robco-team
PRIVATE_DOCKER_PROJECTS=robco-team
CR_SYNCER_RBAC=true
149 changes: 149 additions & 0 deletions .github/ci/integration_test.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,149 @@
#!/bin/bash
# Integration test: deploys to robco-integration-test, provisions a simulated
# robot on the robot-sim VM, and runs the Bazel tests tagged "external".

DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
source "${DIR}/common.sh"

# Because the format from common.sh is not recognized by Cloud Build.
export 'PS4='

# GCS object used as a mutex so only one run at a time touches the shared
# project; see lock()/finalize_and_unlock() below.
LOCK_OBJECT=gs://robco-integration-test-lock/lock
LOCK_BACKOFF_SECONDS=60

lock() {
  # Take the lock by creating the lock object. x-goog-if-generation-match:0 is a
  # GCS precondition that causes `cp` to fail if the lock object already exists.
  until echo "lock" | gsutil -q -h "x-goog-if-generation-match:0" cp - "$LOCK_OBJECT"; do
    : "lock: failed to obtain lock, retrying in $LOCK_BACKOFF_SECONDS seconds"
    : "Note to build cop: if you think there is a stale lock, run:"
    : "  gsutil rm $LOCK_OBJECT"
    : "This can occur when a previous job timed out or was canceled while"
    : "holding the lock."
    sleep "$LOCK_BACKOFF_SECONDS"
  done
  # TODO(rodrigoq): if the build is cancelled by GitHub, the lock is not
  # released. The GCS lifecycle will delete the lock after a day, if the build
  # cop doesn't delete it sooner. We could add a check here to delete the lock
  # if it's too old, but I don't know how to do that safely - maybe a second
  # lock would prevent races between deletion checks, but maybe it would just
  # introduce other failure modes.
}

finalize_and_unlock() {
  # Clean up CR of test robot.
  kubectl delete robots.registry.cloudrobotics.com "${NEW_ROBOT_NAME}" &> /dev/null || true

  cleanup_old_ssh_keys || true
  cleanup_old_vm_instances || true

  # Release the lock, retrying with exponential backoff.
  local sleep_time=1
  while ! gsutil -q rm "$LOCK_OBJECT"
  do
    echo "unlock: failed to relinquish lock, retrying in $sleep_time seconds"
    sleep "$sleep_time"
    sleep_time=$((sleep_time * 2))
  done
}

# Need to source the project config from here
PROJECT_DIR="${DIR}/deployments/robco-integration-test"
source "${PROJECT_DIR}/config.sh"
gcloud config set project ${GCP_PROJECT_ID}
gcloud container clusters get-credentials cloud-robotics --zone=${GCP_ZONE}

BUILD_IDENTIFIER=$(generate_build_id)
echo "INFO: Build identifier is $BUILD_IDENTIFIER"

# Build everything up front so the deploy and test steps reuse cached outputs.
bazel_ci build //...

# Get the lock before deploying to the project. This ensures that other runs
# will not change our deployment until we finish testing.
lock

# `set +x` avoids log spam and makes error messages more obvious.
trap 'set +x; finalize_and_unlock' EXIT

export BAZEL_FLAGS="--bazelrc=${DIR}/.bazelrc"
bash -x .//deploy.sh update robco-integration-test

DOMAIN=${CLOUD_ROBOTICS_DOMAIN:-"www.endpoints.${GCP_PROJECT_ID}.cloud.goog"}
CLOUD_CONTEXT="gke_${GCP_PROJECT_ID}_${GCP_ZONE}_cloud-robotics"
SETUP_DEV_BINARY=./bazel-bin/src/go/cmd/setup-dev/setup-dev_/setup-dev

# This generates a .ssh/config for the sim-host
gcloud compute config-ssh

# The `name` here should match the instance name in
# ci/terraform/robco-integration-test.sh.
# The `|| true` and `if [[ -z ...` bits work around a gcloud issue (b/147795223).
SIM_HOST="$(gcloud compute instances list --project ${GCP_PROJECT_ID} --filter='name=("robot-sim")' --format='value(networkInterfaces.networkIP)' || true)"
if [[ -z "$SIM_HOST" ]] ; then
echo "Failed to get IP of robot-sim VM instance." >&2
exit 1
fi

# Files needed on the sim host to bootstrap the robot cluster; the helm binary
# is taken from Bazel's external repository via bazel-out.
DEPLOY_FILES="src/bootstrap/robot/setup_robot.sh \
src/bootstrap/robot/install_k8s_on_robot.sh \
./bazel-out/../../../external/kubernetes_helm/helm"
init_robot_sim ${SIM_HOST} "${DEPLOY_FILES}"

# Setup new robot
NEW_ROBOT_NAME="test-robot"
NEW_ROBOT_TYPE="test-robot-type"

# Pre-create metadata-server firewall rule to avoid race (b/121175402).
METADATA_SERVER_RULE="-p tcp -d 169.254.169.254 --dport 80 -j DNAT --to-destination 127.0.0.1:8965 -m comment --comment 'from ci/integration_test.sh'"
run_on_robot_sim ${SIM_HOST} \
"sudo iptables --table nat --wait --verbose --check PREROUTING ${METADATA_SERVER_RULE} \
|| sudo iptables --table nat --wait --verbose --append PREROUTING ${METADATA_SERVER_RULE}"

# Hand a short-lived access token to the sim host so setup_robot.sh can
# authenticate; the token file is removed again right after setup.
gcloud auth application-default print-access-token --project ${GCP_PROJECT_ID} | \
run_on_robot_sim ${SIM_HOST} "cat > ~/access_token"
run_on_robot_sim ${SIM_HOST} "ACCESS_TOKEN_FILE=~/access_token ~/robco/setup_robot.sh ${NEW_ROBOT_NAME} --project ${GCP_PROJECT_ID} --robot-type ${NEW_ROBOT_TYPE}" || {
: "setup_robot failed."
: "If you see 'certificate has expired or is not yet valid' above (b/178455122), try:"
: "  gcloud compute config-ssh --project=robco-integration-test"
: "  ssh robot-sim.europe-west1-c.robco-integration-test"
: "  sudo kubeadm reset --force"
exit 1
}
run_on_robot_sim ${SIM_HOST} "rm ~/access_token"

# TODO(b/121119919): remove this workaround
run_on_robot_sim ${SIM_HOST} "kubectl delete pod -l name=metadata-server"
# TODO(b/153142491): remove this workaround
run_on_robot_sim ${SIM_HOST} "kubectl delete pod -l app=gcr-credential-refresher"

"${SETUP_DEV_BINARY}" --project="${GCP_PROJECT_ID}" --robot-name="${NEW_ROBOT_NAME}"

# Deploy the k8s relay rollout.
kubectl apply -f "${DIR}/deployments/robco-integration-test/kubernetes/"

# Output state of cloud and robot k8s context to inspect the health of pods.
kubectl config get-contexts || true
kubectl --context ${CLOUD_CONTEXT} get pods || true
kubectl --context ${GCP_PROJECT_ID}-robot get pods || true

# For some reason //src/go/tests:go_default_test is expecting
# the kubeconfig in /home/builder/.kube/config, i.e. it does not use $HOME
# (which is /builder/home). alexanderfaxa@ could not figure out why so just
# copy the config there.
mkdir -p /home/builder/.kube
cp /builder/home/.kube/config /home/builder/.kube/config

bazel_ci test \
--test_env GCP_PROJECT_ID=${GCP_PROJECT_ID} \
--test_env GCP_REGION=${GCP_REGION} \
--test_env GCP_ZONE=${GCP_ZONE} \
--test_env PATH=$PATH \
--jvmopt="-DCLOUD_ROBOTICS_DOMAIN=${DOMAIN}" \
--test_output=streamed \
--test_tag_filters="external" \
--strategy=TestRunner=standalone \
//...

# If this is running on main (ie, not a manual run) then update the `latest`
# binary.
# NOTE(review): MANUAL_RUN is presumably set by the GitHub workflow — verify
# against the workflow definition.
if [[ "$MANUAL_RUN" == "false" ]] ; then
release_binary "robco-ci-binary-builds" "crc-${BUILD_IDENTIFIER}" "latest"
fi
Loading

0 comments on commit 51806ab

Please sign in to comment.