|
#!/bin/bash

# Resolve the directory containing this script so it works from any cwd.
DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
source "${DIR}/common.sh"

# Because the format from common.sh is not recognized by Cloud Build.
export PS4=

# GCS object used as a mutex between concurrent integration-test runs, and
# how long to wait between attempts to take it.
LOCK_OBJECT=gs://robco-integration-test-lock/lock
LOCK_BACKOFF_SECONDS=60
| 11 | + |
# Acquire the global integration-test lock by creating $LOCK_OBJECT in GCS.
# Blocks, polling every $LOCK_BACKOFF_SECONDS seconds, until it succeeds.
lock() {
  # Take the lock by creating the lock object. x-goog-if-generation-match:0 is a
  # GCS precondition that causes `cp` to fail if the lock object already exists,
  # making create-if-absent an atomic test-and-set.
  while ! echo "lock" | gsutil -q -h "x-goog-if-generation-match:0" cp - "$LOCK_OBJECT"
  do
    # The `:` no-ops surface these messages through the xtrace (set -x) log.
    : "lock: failed to obtain lock, retrying in $LOCK_BACKOFF_SECONDS seconds"
    : "Note to build cop: if you think there is a stale lock, run:"
    : "  gsutil rm $LOCK_OBJECT"
    : "This can occur when a previous job timed out or was canceled while"
    : "holding the lock."
    sleep "$LOCK_BACKOFF_SECONDS"
  done
  # TODO(rodrigoq): if the build is cancelled by GitHub, the lock is not
  # released. The GCS lifecycle will delete the lock after a day, if the build
  # cop doesn't delete it sooner. We could add a check here to delete the lock
  # if it's too old, but I don't know how to do that safely - maybe a second
  # lock would prevent races between deletion checks, but maybe it would just
  # introduce other failure modes.
}
| 31 | + |
# Best-effort cleanup of test resources, then release the GCS lock.
# Runs from the EXIT trap, so it must tolerate partially-completed setup:
# each cleanup step is individually allowed to fail (`|| true`).
finalize_and_unlock() {
  # Clean up CR of test robot.
  kubectl delete robots.registry.cloudrobotics.com "${NEW_ROBOT_NAME}" &> /dev/null || true

  cleanup_old_ssh_keys || true
  cleanup_old_vm_instances || true

  # Retry the unlock with exponential backoff: failing to remove the lock
  # object would block other runs until GCS lifecycle expiry (see lock()).
  local sleep_time=1
  while ! gsutil -q rm "$LOCK_OBJECT"
  do
    echo "unlock: failed to relinquish lock, retrying in $sleep_time seconds"
    sleep "$sleep_time"
    # Arithmetic expansion instead of the deprecated external `expr`.
    sleep_time=$(( sleep_time * 2 ))
  done
}
| 47 | + |
# Need to source the project config from here
PROJECT_DIR="${DIR}/deployments/robco-integration-test"
source "${PROJECT_DIR}/config.sh"
# Point gcloud and kubectl at the integration-test project's cloud cluster.
# GCP_PROJECT_ID / GCP_ZONE are presumably set by config.sh — TODO confirm.
gcloud config set project ${GCP_PROJECT_ID}
gcloud container clusters get-credentials cloud-robotics --zone=${GCP_ZONE}

# generate_build_id comes from the sourced common.sh; the identifier later
# names the released binaries (see release_binary at the bottom).
BUILD_IDENTIFIER=$(generate_build_id)
echo "INFO: Build identifier is $BUILD_IDENTIFIER"

# Build everything before taking the lock below, so build breakage does not
# hold the shared deployment lock.
bazel_ci build //...
| 58 | + |
# Get the lock before deploying to the project. This ensures that other runs
# will not change our deployment until we finish testing.
lock

# Release the lock and clean up on every exit path from here on.
# `set +x` avoids log spam and makes error messages more obvious.
trap 'set +x; finalize_and_unlock' EXIT

# Make deploy.sh pick up the CI bazelrc.
export BAZEL_FLAGS="--bazelrc=${DIR}/.bazelrc"
# (Path normalized from `.//deploy.sh`; both resolve identically.)
bash -x ./deploy.sh update robco-integration-test
| 68 | + |
# Public domain of the cloud deployment (defaults to the project's Cloud
# Endpoints domain when CLOUD_ROBOTICS_DOMAIN is not set).
DOMAIN=${CLOUD_ROBOTICS_DOMAIN:-"www.endpoints.${GCP_PROJECT_ID}.cloud.goog"}
# kubectl context name of the cloud GKE cluster, as created by the
# `gcloud container clusters get-credentials` call above.
CLOUD_CONTEXT="gke_${GCP_PROJECT_ID}_${GCP_ZONE}_cloud-robotics"
SETUP_DEV_BINARY=./bazel-bin/src/go/cmd/setup-dev/setup-dev_/setup-dev

# This generates a .ssh/config for the sim-host
gcloud compute config-ssh

# The `name` here should match the instance name in
# ci/terraform/robco-integration-test.sh.
# The `|| true` and `if [[ -z ...` bits work around a gcloud issue (b/147795223).
SIM_HOST="$(gcloud compute instances list --project ${GCP_PROJECT_ID} --filter='name=("robot-sim")' --format='value(networkInterfaces.networkIP)' || true)"
if [[ -z "$SIM_HOST" ]] ; then
  echo "Failed to get IP of robot-sim VM instance." >&2
  exit 1
fi

# Files shipped to the sim host. The helm binary is addressed via
# bazel-out/../../.. because it lives under external/, outside bazel-bin.
# NOTE(review): DEPLOY_FILES is passed as one whitespace-separated string;
# init_robot_sim (from common.sh) presumably splits it — confirm before
# adding paths containing spaces.
DEPLOY_FILES="src/bootstrap/robot/setup_robot.sh \
  src/bootstrap/robot/install_k8s_on_robot.sh \
  ./bazel-out/../../../external/kubernetes_helm/helm"
init_robot_sim ${SIM_HOST} "${DEPLOY_FILES}"
| 89 | + |
# Setup new robot
NEW_ROBOT_NAME="test-robot"
NEW_ROBOT_TYPE="test-robot-type"

# Pre-create metadata-server firewall rule to avoid race (b/121175402).
# The rule DNATs GCE metadata traffic (169.254.169.254:80) to the local
# metadata-server proxy on 127.0.0.1:8965; `--check || --append` keeps the
# insertion idempotent across runs.
METADATA_SERVER_RULE="-p tcp -d 169.254.169.254 --dport 80 -j DNAT --to-destination 127.0.0.1:8965 -m comment --comment 'from ci/integration_test.sh'"
run_on_robot_sim ${SIM_HOST} \
  "sudo iptables --table nat --wait --verbose --check PREROUTING ${METADATA_SERVER_RULE} \
  || sudo iptables --table nat --wait --verbose --append PREROUTING ${METADATA_SERVER_RULE}"

# Hand the sim host a short-lived access token via a file — presumably via
# stdin/file rather than argv to keep it out of the xtrace log (TODO confirm) —
# run the robot setup with it, then delete the token file.
gcloud auth application-default print-access-token --project ${GCP_PROJECT_ID} | \
  run_on_robot_sim ${SIM_HOST} "cat > ~/access_token"
run_on_robot_sim ${SIM_HOST} "ACCESS_TOKEN_FILE=~/access_token ~/robco/setup_robot.sh ${NEW_ROBOT_NAME} --project ${GCP_PROJECT_ID} --robot-type ${NEW_ROBOT_TYPE}" || {
  # The `:` no-ops surface these hints through the xtrace (set -x) log.
  : "setup_robot failed."
  : "If you see 'certificate has expired or is not yet valid' above (b/178455122), try:"
  : "  gcloud compute config-ssh --project=robco-integration-test"
  : "  ssh robot-sim.europe-west1-c.robco-integration-test"
  : "  sudo kubeadm reset --force"
  exit 1
}
run_on_robot_sim ${SIM_HOST} "rm ~/access_token"

# TODO(b/121119919): remove this workaround
run_on_robot_sim ${SIM_HOST} "kubectl delete pod -l name=metadata-server"
# TODO(b/153142491): remove this workaround
run_on_robot_sim ${SIM_HOST} "kubectl delete pod -l app=gcr-credential-refresher"
| 116 | + |
# Run the setup-dev binary built above against the new robot — presumably it
# configures this environment's access to the robot; see
# //src/go/cmd/setup-dev for details.
"${SETUP_DEV_BINARY}" --project="${GCP_PROJECT_ID}" --robot-name="${NEW_ROBOT_NAME}"

# Deploy the k8s relay rollout.
kubectl apply -f "${DIR}/deployments/robco-integration-test/kubernetes/"

# Output state of cloud and robot k8s context to inspect the health of pods.
kubectl config get-contexts || true
kubectl --context ${CLOUD_CONTEXT} get pods || true
kubectl --context ${GCP_PROJECT_ID}-robot get pods || true

# For some reason //src/go/tests:go_default_test is expecting
# the kubeconfig in /home/builder/.kube/config, i.e. it does not use $HOME
# (which is /builder/home). alexanderfaxa@ could not figure out why so just
# copy the config there.
mkdir -p /home/builder/.kube
cp /builder/home/.kube/config /home/builder/.kube/config
| 133 | + |
# Run only the tests tagged "external" (the integration tests), streaming
# their output. NOTE(review): --strategy=TestRunner=standalone presumably
# keeps the tests out of the sandbox so they can reach the cluster and the
# kubeconfig copied above — confirm.
bazel_ci test \
  --test_env GCP_PROJECT_ID=${GCP_PROJECT_ID} \
  --test_env GCP_REGION=${GCP_REGION} \
  --test_env GCP_ZONE=${GCP_ZONE} \
  --test_env PATH=$PATH \
  --jvmopt="-DCLOUD_ROBOTICS_DOMAIN=${DOMAIN}" \
  --test_output=streamed \
  --test_tag_filters="external" \
  --strategy=TestRunner=standalone \
  //...

# If this is running on main (ie, not a manual run) then update the `latest`
# binary. MANUAL_RUN is expected from the CI environment — TODO confirm it is
# always set (an unset value skips the release, same as "true").
if [[ "$MANUAL_RUN" == "false" ]] ; then
  release_binary "robco-ci-binary-builds" "crc-${BUILD_IDENTIFIER}" "latest"
fi
0 commit comments