diff --git a/.github/actions/spelling/expect.txt b/.github/actions/spelling/expect.txt
index d6199372a0a..3b21d9a78d6 100644
--- a/.github/actions/spelling/expect.txt
+++ b/.github/actions/spelling/expect.txt
@@ -269,6 +269,7 @@ gcs
 GENERALIZEDTIME
 getwindowid
 ghp
+gitjob
 gitmodules
 gitrepo
 gke
diff --git a/bats/Makefile b/bats/Makefile
index d1f6801b8fd..6eaafc78141 100644
--- a/bats/Makefile
+++ b/bats/Makefile
@@ -16,12 +16,12 @@ SC_EXCLUDES ?= SC1091,SC2034,SC2154
 lint:
 	find tests -name '*.bash' | xargs ./scripts/bats-lint.pl
 	find tests -name '*.bats' | xargs ./scripts/bats-lint.pl
-	find tests -name '*.bash' | xargs shellcheck -s bash -e $(SC_EXCLUDES)
-	find tests -name '*.bats' | xargs shellcheck -s bash -e $(SC_EXCLUDES)
-	find scripts -name '*.sh' | xargs shellcheck -s bash -e $(SC_EXCLUDES)
-	find tests -name '*.bash' | xargs shfmt -s -d
-	find tests -name '*.bats' | xargs shfmt -s -d
-	find scripts -name '*.sh' | xargs shfmt -s -d
+	find tests -name '*.bash' | xargs shellcheck --shell=bash --exclude=$(SC_EXCLUDES)
+	find tests -name '*.bats' | xargs shellcheck --shell=bash --exclude=$(SC_EXCLUDES)
+	find scripts -name '*.sh' | xargs shellcheck --shell=bash --exclude=$(SC_EXCLUDES)
+	find tests -name '*.bash' | xargs shfmt --simplify --diff --language-dialect bats --indent 4
+	find tests -name '*.bats' | xargs shfmt --simplify --diff --language-dialect bats --indent 4
+	find scripts -name '*.sh' | xargs shfmt --simplify --diff
 
 DEPS = bin/darwin/jq bin/linux/jq
 
diff --git a/bats/tests/helpers/utils.bash b/bats/tests/helpers/utils.bash
index 7ea70fbbbb6..78f6c358049 100644
--- a/bats/tests/helpers/utils.bash
+++ b/bats/tests/helpers/utils.bash
@@ -388,7 +388,9 @@ capture_logs() {
         cp -LR "${PATH_LOGS}/" "$logdir"
         echo "${BATS_TEST_DESCRIPTION:-teardown}" >"${logdir}/test_description"
         # Capture settings.json
-        cp "$PATH_CONFIG_FILE" "$logdir"
+        if [[ -f $PATH_CONFIG_FILE ]]; then
+            cp "$PATH_CONFIG_FILE" "$logdir"
+        fi
         foreach_profile export_profile "$logdir"
     fi
 }
diff --git a/bats/tests/k8s/helm-install-rancher.bats b/bats/tests/k8s/helm-install-rancher.bats
index 8d983699674..e1478252101 100644
--- a/bats/tests/k8s/helm-install-rancher.bats
+++ b/bats/tests/k8s/helm-install-rancher.bats
@@ -1,7 +1,10 @@
 # Test case 11 & 12
 
 load '../helpers/load'
-RD_FILE_RAMDISK_SIZE=12 # We need more disk to run the Rancher image.
+
+local_setup_file() {
+    RD_USE_RAMDISK=false
+}
 
 local_setup() {
     needs_port 443
@@ -84,6 +87,76 @@ determine_chart_version() {
     fail || return
 }
 
+# Run the given command and assert that it succeeds and that the result of
+# `jq_output length` on its output is not 0.
+assert_not_empty_list() {
+    run "$@"
+    assert_success || return
+    run jq_output length
+    assert_success || return
+    refute_output 0 || return
+}
+
+# Run the given command and assert that it succeeds and that its standard
+# output is exactly "True", "true", or "1".
+assert_true() {
+    run --separate-stderr "$@"
+    assert_success || return
+    assert_output --regexp '^([Tt]rue|1)$' || return
+}
+
+# Given namespace and app name, assert that the logs of the first pod matching
+# the selector "app=<name>" contain the given string.  All remaining arguments
+# are joined (via "$*") into the single expected string.
+assert_pod_log_line() {
+    local namespace="$1"
+    local selector="app=$2"
+    shift 2
+    local expect="$*"
+    run kubectl get pod --namespace "$namespace" --selector "$selector" --output=jsonpath='{.items[0].metadata.name}'
+    assert_success || return
+    assert_output || return
+    local name="$output"
+
+    run kubectl logs --namespace "$namespace" "$name"
+    assert_success || return
+    assert_output --partial "$expect" || return
+}
+
+# Pull down the image manually first so we are less likely to time out when
+# deploying rancher
+pull_rancher_image() {
+    local rancher_chart_version
+    if ! load_var rancher_chart_version; then
+        fail "Could not restore Rancher chart version" || return
+    fi
+    # NOTE(review): presumably read by ctrctl to select the containerd namespace; confirm.
+    local CONTAINERD_NAMESPACE=k8s.io
+    try ctrctl pull --quiet "rancher/rancher:v$rancher_chart_version"
+}
+
+# Wait until the rancher pod in cattle-system has logged each of the expected
+# startup messages; each message is retried independently via `try`.
+wait_for_rancher_pod() {
+    try assert_pod_log_line cattle-system rancher Listening on :443
+    try assert_pod_log_line cattle-system rancher Starting catalog controller
+    try --max 60 --delay 10 assert_pod_log_line cattle-system rancher Watching metadata for rke-machine-config.cattle.io/v1
+    try --max 60 --delay 10 assert_pod_log_line cattle-system rancher 'Creating clusterRole for roleTemplate Cluster Owner (cluster-owner).'
+    try assert_pod_log_line cattle-system rancher Rancher startup complete
+    try assert_pod_log_line cattle-system rancher Created machine for node
+}
+
+# Wait until the rancher-webhook pod in cattle-system has logged each of the
+# expected startup messages; each message is retried independently via `try`.
+wait_for_webhook_pod() {
+    try assert_pod_log_line cattle-system rancher-webhook Rancher-webhook version
+    try assert_pod_log_line cattle-system rancher-webhook Listening on :9443
+    # Depending on version, this is either "cattle-webhook-tls" or "cattle-system/cattle-webhook-tls"
+    try assert_pod_log_line cattle-system rancher-webhook Creating new TLS secret for cattle-
+    try assert_pod_log_line cattle-system rancher-webhook Active TLS secret cattle-
+    try assert_pod_log_line cattle-system rancher-webhook 'Sleeping for 15 seconds then applying webhook config'
+}
+
 deploy_rancher() {
     # TODO remove `skip_unless_host_ip` once `traefik_hostname` no longer needs it
     if is_windows; then
@@ -98,22 +171,70 @@ deploy_rancher() {
     helm upgrade \
         --install cert-manager jetstack/cert-manager \
         --namespace cert-manager \
-        --set installCRDs=true \
+        --set crds.enabled=true \
+        --set crds.keep=true \
+        --set prometheus.enabled=false \
         --set "extraArgs[0]=--enable-certificate-owner-ref=true" \
         --create-namespace
+    try assert_not_empty_list helm list --namespace cert-manager --deployed --output json --selector name=cert-manager
+    wait_for_kube_deployment_available --namespace cert-manager cert-manager
 
     local host
     host=$(traefik_hostname) || return
 
     comment "Installing rancher $rancher_chart_version"
+    # The helm install can take a long time, especially on CI. Therefore we
+    # avoid using --wait / --timeout, and instead check for forward progress
+    # at each step.
     helm upgrade \
         --install rancher rancher-latest/rancher \
         --version "$rancher_chart_version" \
         --namespace cattle-system \
         --set hostname="$host" \
-        --wait \
-        --timeout=10m \
+        --set replicas=1 \
         --create-namespace
+
+    try assert_not_empty_list helm list --all --output json --namespace cattle-system --selector name=rancher
+    try assert_not_empty_list helm list --deployed --output json --namespace cattle-system --selector name=rancher
+    try kubectl get ingress --namespace cattle-system rancher
+    try assert_not_empty_list kubectl get ingress --namespace cattle-system rancher --output jsonpath='{.status.loadBalancer.ingress}'
+
+    try --max 60 --delay 10 kubectl get namespace fleet-local
+    try --max 60 --delay 10 kubectl get namespace local
+    try --max 60 --delay 10 kubectl get namespace cattle-global-data
+    try --max 60 --delay 10 kubectl get namespace fleet-default
+
+    try assert_not_empty_list kubectl get pods --namespace cattle-system --selector app=rancher --output jsonpath='{.items}'
+
+    # Unfortunately, the Rancher pod could get restarted; this may cause the
+    # wait steps to fail, in which case we need to start again from the top.
+    try wait_for_rancher_pod
+
+    try assert_true kubectl get APIServices v3.project.cattle.io --output=jsonpath='{.status.conditions[?(@.type=="Available")].status}'
+
+    try kubectl get namespace cattle-fleet-system
+    try kubectl get namespace cattle-system
+
+    try --max 48 kubectl get deployment --namespace cattle-fleet-system fleet-controller
+    try assert_kube_deployment_available --namespace cattle-fleet-system gitjob
+    try assert_kube_deployment_available --namespace cattle-fleet-system fleet-controller
+
+    try --max 60 --delay 10 assert_not_empty_list kubectl get pods --namespace cattle-system --selector app=rancher-webhook --output jsonpath='{.items}'
+
+    # Unfortunately, the webhook pod might restart too :(
+    try wait_for_webhook_pod
+
+    try --max 120 assert_kube_deployment_available --namespace cattle-system rancher
+    try --max 120 assert_kube_deployment_available --namespace cattle-fleet-local-system fleet-agent
+    try --max 60 assert_kube_deployment_available --namespace cattle-system rancher-webhook
+
+    # The rancher pod sometimes falls over on its own; retry in a loop to
+    # detect flapping.
+    local i
+    for i in {1..10}; do
+        sleep 1
+        try --max 60 --delay 10 assert_kube_deployment_available --namespace cattle-system rancher
+    done
 }
 
 verify_rancher() {
@@ -122,24 +243,32 @@ verify_rancher() {
         skip_unless_host_ip
     fi
 
+    # Get k3s logs if possible before things fail
+    kubectl get deployments --all-namespaces || :
+    kubectl get pods --all-namespaces || :
+
+    local name
+    name="$(kubectl get pod -n cattle-system --selector app=rancher --output=jsonpath='{.items[].metadata.name}' || echo '')"
+    if [[ -n $name ]]; then
+        kubectl logs -n cattle-system "$name" || :
+    fi
+
+    name="$(kubectl get pod -n cattle-system --selector app=rancher-webhook --output=jsonpath='{.items[].metadata.name}' || echo '')"
+    if [[ -n $name ]]; then
+        kubectl logs -n cattle-system "$name" || :
+    fi
+
     local host
     host=$(traefik_hostname) || return
 
-    run try --max 9 --delay 10 curl --insecure --silent --show-error "https://${host}/dashboard/auth/login"
+    run try --max 9 --delay 10 curl --insecure --show-error "https://${host}/dashboard/auth/login"
     assert_success
     assert_output --partial 'href="/dashboard/'
-    run kubectl get secret --namespace cattle-system bootstrap-secret -o json
+    run try kubectl get secret --namespace cattle-system bootstrap-secret -o json
     assert_success
     assert_output --partial "bootstrapPassword"
 }
 
-uninstall_rancher() {
-    run helm uninstall rancher --namespace cattle-system --wait
-    assert_nothing
-    run helm uninstall cert-manager --namespace cert-manager --wait
-    assert_nothing
-}
-
 @test 'add helm repo' {
     helm repo add jetstack https://charts.jetstack.io
     helm repo add rancher-latest https://releases.rancher.com/server-charts/latest
@@ -152,6 +281,6 @@ foreach_k3s_version \
     start_kubernetes \
     wait_for_kubelet \
     wait_for_traefik \
+    pull_rancher_image \
     deploy_rancher \
-    verify_rancher \
-    uninstall_rancher
+    verify_rancher