Skip to content

Commit

Permalink
[CI] Add docker retries (elastic#191824)
Browse files Browse the repository at this point in the history
## Summary
We've seen several issues stemming from docker failures, like:
```
Error response from daemon: Get "https://docker.elastic.co/v2/": ...
```

This is happening because of rolling docker updates, and should affect
each node for a short time, while connections are draining. The
suggestion was to implement retry logic for docker operations. This PR
tries to cover much of it.
  • Loading branch information
delanni authored Sep 2, 2024
1 parent 0017463 commit 8b3b314
Show file tree
Hide file tree
Showing 12 changed files with 68 additions and 24 deletions.
11 changes: 8 additions & 3 deletions .buildkite/scripts/common/setup_job_env.sh
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@ if [[ "$(type -t vault_get)" != "function" ]]; then
source .buildkite/scripts/common/vault_fns.sh
fi

source .buildkite/scripts/common/util.sh

# Set up general-purpose tokens and credentials
{
BUILDKITE_TOKEN="$(vault_get buildkite-ci buildkite_token_all_jobs)"
Expand All @@ -18,9 +20,12 @@ fi

KIBANA_DOCKER_USERNAME="$(vault_get container-registry username)"
KIBANA_DOCKER_PASSWORD="$(vault_get container-registry password)"
if (command -v docker && docker version) &> /dev/null; then
echo "$KIBANA_DOCKER_PASSWORD" | docker login -u "$KIBANA_DOCKER_USERNAME" --password-stdin docker.elastic.co
fi
function docker_login() {
if (command -v docker && docker version) &> /dev/null; then
echo "$KIBANA_DOCKER_PASSWORD" | docker login -u "$KIBANA_DOCKER_USERNAME" --password-stdin docker.elastic.co
fi
}
retry 5 15 docker_login
}

# Set up a custom ES Snapshot Manifest if one has been specified for this build
Expand Down
29 changes: 29 additions & 0 deletions .buildkite/scripts/common/util.sh
Original file line number Diff line number Diff line change
Expand Up @@ -178,3 +178,32 @@ print_if_dry_run() {
echo "DRY_RUN is enabled."
fi
}

docker_with_retry () {
cmd=$1
shift
args=("$@")
attempt=0
max_retries=5
sleep_time=15

while true
do
attempt=$((attempt+1))

if [ $attempt -gt $max_retries ]
then
echo "Docker $cmd retries exceeded, aborting."
exit 1
fi

if docker "$cmd" "${args[@]}"
then
echo "Docker $cmd successful."
break
else
echo "Docker $cmd unsuccessful, attempt '$attempt'... Retrying in $sleep_time"
sleep $sleep_time
fi
done
}
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ fi
.buildkite/scripts/bootstrap.sh

source .buildkite/scripts/steps/artifacts/env.sh
source .buildkite/scripts/common/util.sh

GIT_ABBREV_COMMIT=${BUILDKITE_COMMIT:0:12}
KIBANA_IMAGE_TAG="sec-sol-qg-$GIT_ABBREV_COMMIT"
Expand All @@ -23,7 +24,7 @@ if docker manifest inspect $KIBANA_IMAGE &> /dev/null; then
exit 0
fi

docker pull $KIBANA_BASE_IMAGE:latest
docker_with_retry pull $KIBANA_BASE_IMAGE:latest

echo "--- Build images"
node scripts/build \
Expand All @@ -50,8 +51,8 @@ docker load < "target/kibana-serverless-$BASE_VERSION-docker-image-aarch64.tar.g
docker tag "$KIBANA_IMAGE" "$KIBANA_IMAGE-arm64"

echo "--- Push images"
docker image push "$KIBANA_IMAGE-arm64"
docker image push "$KIBANA_IMAGE-amd64"
docker_with_retry push "$KIBANA_IMAGE-arm64"
docker_with_retry push "$KIBANA_IMAGE-amd64"

echo "--- Create and push manifests"
docker manifest create \
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
#!/bin/bash

source .buildkite/scripts/common/util.sh

if [ "$KIBANA_MKI_QUALITY_GATE" == "1" ]; then
echo "Triggered by quality gate!"
triggered_by="Serverless Quality Gate."
Expand All @@ -17,11 +19,11 @@ else
KBN_IMAGE=${KIBANA_LATEST}
fi

docker pull ${KBN_IMAGE}
docker_with_retry pull ${KBN_IMAGE}
build_date=$(docker inspect ${KBN_IMAGE} | jq -r '.[0].Config.Labels."org.label-schema.build-date"')
vcs_ref=$(docker inspect ${KBN_IMAGE} | jq -r '.[0].Config.Labels."org.label-schema.vcs-ref"')
vcs_url=$(docker inspect ${KBN_IMAGE} | jq -r '.[0].Config.Labels."org.label-schema.vcs-url"')
version=$(docker inspect ${KBN_IMAGE} | jq -r '.[0].Config.Labels."org.label-schema.version"')
version=$(docker inspect ${KBN_IMAGE} | jq -r '.[0].Config.Labels."org.label-schema.version"')

markdown_text="""
#### $triggered_by
Expand All @@ -32,9 +34,9 @@ markdown_text="""
---
#### Kibana Container Metadata
- Build Date : $build_date
- Github Commit Hash : $vcs_ref
- Github Repo : $vcs_url
- Build Date : $build_date
- Github Commit Hash : $vcs_ref
- Github Repo : $vcs_url
- Version : $version
"""
echo "${markdown_text//[*\\_]/\\&}" | buildkite-agent annotate --style "info"
echo "${markdown_text//[*\\_]/\\&}" | buildkite-agent annotate --style "info"
2 changes: 1 addition & 1 deletion .buildkite/scripts/steps/artifacts/cloud.sh
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ docker tag "$KIBANA_BASE_IMAGE" "$KIBANA_TEST_IMAGE"
if docker manifest inspect $KIBANA_TEST_IMAGE &> /dev/null; then
echo "Cloud image already exists, skipping docker push"
else
docker image push "$KIBANA_TEST_IMAGE"
docker_with_retry push "$KIBANA_TEST_IMAGE"
fi

echo "--- Create deployment"
Expand Down
6 changes: 4 additions & 2 deletions .buildkite/scripts/steps/artifacts/docker_image.sh
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@ set -euo pipefail

source .buildkite/scripts/steps/artifacts/env.sh

source .buildkite/scripts/common/util.sh

GIT_ABBREV_COMMIT=${BUILDKITE_COMMIT:0:12}
if [[ "${BUILDKITE_PULL_REQUEST:-false}" == "false" ]]; then
KIBANA_IMAGE_TAG="git-$GIT_ABBREV_COMMIT"
Expand Down Expand Up @@ -50,8 +52,8 @@ if [[ "$SKIP_BUILD" == "false" ]]; then
docker tag "$KIBANA_IMAGE" "$KIBANA_IMAGE-arm64"

echo "--- Push images"
docker image push "$KIBANA_IMAGE-arm64"
docker image push "$KIBANA_IMAGE-amd64"
docker_with_retry push "$KIBANA_IMAGE-arm64"
docker_with_retry push "$KIBANA_IMAGE-amd64"

echo "--- Create and push manifests"
docker manifest create \
Expand Down
2 changes: 1 addition & 1 deletion .buildkite/scripts/steps/artifacts/publish.sh
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ chmod -R a+r target/*
chmod -R a+w target

echo "--- Pull latest Release Manager CLI"
docker pull docker.elastic.co/infra/release-manager:latest
docker_with_retry pull docker.elastic.co/infra/release-manager:latest

echo "--- Publish artifacts"
if [[ "$BUILDKITE_BRANCH" == "$KIBANA_BASE_BRANCH" ]] || [[ "${DRY_RUN:-}" =~ ^(1|true)$ ]]; then
Expand Down
3 changes: 2 additions & 1 deletion .buildkite/scripts/steps/demo_env/es_and_init.sh
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
set -euo pipefail

source "$(dirname "${0}")/config.sh"
source "$(dirname "${0}")/../../common/util.sh"

"$(dirname "${0}")/auth.sh"

Expand All @@ -16,7 +17,7 @@ DOCKER_EXPORT_URL=$(curl https://storage.googleapis.com/kibana-ci-es-snapshots-d
curl "$DOCKER_EXPORT_URL" > target/elasticsearch-docker.tar.gz
docker load < target/elasticsearch-docker.tar.gz
docker tag "docker.elastic.co/elasticsearch/elasticsearch:$DEPLOYMENT_VERSION-SNAPSHOT" "$ES_IMAGE"
docker push "$ES_IMAGE"
docker_with_retry push "$ES_IMAGE"

echo '--- Prepare yaml'

Expand Down
4 changes: 3 additions & 1 deletion .buildkite/scripts/steps/demo_env/kibana.sh
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@ set -euo pipefail

.buildkite/scripts/bootstrap.sh

source .buildkite/scripts/common/util.sh

source "$(dirname "${0}")/config.sh"

export KIBANA_IMAGE="gcr.io/elastic-kibana-184716/demo/kibana:$DEPLOYMENT_NAME-$(git rev-parse HEAD)"
Expand All @@ -15,7 +17,7 @@ echo '--- Build Docker image with example plugins'
cd target/example_plugins
BUILT_IMAGE="docker.elastic.co/kibana/kibana:$DEPLOYMENT_VERSION-SNAPSHOT"
docker build --build-arg BASE_IMAGE="$BUILT_IMAGE" -t "$KIBANA_IMAGE" -f "$KIBANA_DIR/.buildkite/scripts/steps/demo_env/Dockerfile" .
docker push "$KIBANA_IMAGE"
docker_with_retry push "$KIBANA_IMAGE"
cd -

"$(dirname "${0}")/auth.sh"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@

set -euo pipefail

source .buildkite/scripts/common/util.sh

KIBANA_GITHUB_URL="https://github.com/elastic/kibana"
ES_SERVERLESS_GITHUB_URL="https://github.com/elastic/elasticsearch-serverless"

Expand All @@ -15,7 +17,7 @@ fi

# Pull the target image
if [[ $ES_SERVERLESS_IMAGE != *":git-"* ]]; then
docker pull "$ES_SERVERLESS_IMAGE"
docker_with_retry pull "$ES_SERVERLESS_IMAGE"
ES_SERVERLESS_VERSION=$(docker inspect --format='{{json .Config.Labels}}' "$ES_SERVERLESS_IMAGE" | jq -r '.["org.opencontainers.image.revision"]' | cut -c1-12)

IMAGE_WITHOUT_TAG=$(echo "$ES_SERVERLESS_IMAGE" | cut -d: -f1)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -31,18 +31,18 @@ ARM_64_DIGEST=$(jq -r '.manifests[] | select(.platform.architecture == "arm64")
AMD_64_DIGEST=$(jq -r '.manifests[] | select(.platform.architecture == "amd64") | .digest' manifests.json)

echo docker pull --platform linux/arm64 "$SOURCE_IMAGE@$ARM_64_DIGEST"
docker pull --platform linux/arm64 "$SOURCE_IMAGE@$ARM_64_DIGEST"
docker_with_retry pull --platform linux/arm64 "$SOURCE_IMAGE@$ARM_64_DIGEST"
echo linux/arm64 image pulled, with digest: $ARM_64_DIGEST

echo docker pull --platform linux/amd64 "$SOURCE_IMAGE@$AMD_64_DIGEST"
docker pull --platform linux/amd64 "$SOURCE_IMAGE@$AMD_64_DIGEST"
docker_with_retry pull --platform linux/amd64 "$SOURCE_IMAGE@$AMD_64_DIGEST"
echo linux/amd64 image pulled, with digest: $AMD_64_DIGEST

docker tag "$SOURCE_IMAGE@$ARM_64_DIGEST" "$TARGET_IMAGE-arm64"
docker tag "$SOURCE_IMAGE@$AMD_64_DIGEST" "$TARGET_IMAGE-amd64"

docker push "$TARGET_IMAGE-arm64"
docker push "$TARGET_IMAGE-amd64"
docker_with_retry push "$TARGET_IMAGE-arm64"
docker_with_retry push "$TARGET_IMAGE-amd64"

docker manifest rm "$TARGET_IMAGE" || echo "Nothing to delete"

Expand Down
2 changes: 1 addition & 1 deletion .buildkite/scripts/steps/es_snapshots/build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,7 @@ set +e
echo $ES_CLOUD_ID $ES_CLOUD_VERSION $KIBANA_ES_CLOUD_VERSION $KIBANA_ES_CLOUD_IMAGE
docker tag "$ES_CLOUD_ID" "$KIBANA_ES_CLOUD_IMAGE"

docker image push "$KIBANA_ES_CLOUD_IMAGE"
docker_with_retry push "$KIBANA_ES_CLOUD_IMAGE"

export ELASTICSEARCH_CLOUD_IMAGE="$KIBANA_ES_CLOUD_IMAGE"
export ELASTICSEARCH_CLOUD_IMAGE_CHECKSUM="$(docker images "$KIBANA_ES_CLOUD_IMAGE" --format "{{.Digest}}")"
Expand Down

0 comments on commit 8b3b314

Please sign in to comment.