Skip to content

100 Nodes Scale Test (scale-100) #252

100 Nodes Scale Test (scale-100)

100 Nodes Scale Test (scale-100) #252

name: 100 Nodes Scale Test (scale-100)

on:
  # Scheduled run: minute 39 of hour 0, Monday through Friday.
  schedule:
    - cron: '39 0 * * 1-5'

  # On-demand run against a pull request. Every input except extra-args is
  # mandatory.
  workflow_dispatch:
    inputs:
      PR-number:
        description: 'Pull request number.'
        required: true
      context-ref:
        description: >-
          Context in which the workflow runs. If PR is from a fork, will be
          the PR target branch (general case). If PR is NOT from a fork, will
          be the PR branch itself (this allows committers to test changes to
          workflows directly from PRs).
        required: true
      SHA:
        description: 'SHA under test (head of the PR branch).'
        required: true
      extra-args:
        description: >-
          [JSON object] Arbitrary arguments passed from the trigger comment
          via regex capture group. Parse with
          'fromJson(inputs.extra-args).argName' in workflow.
        required: false
        default: '{}'

  # For testing uncomment following lines:
  # push:
  #   branches:
  #     - your_branch_name
# Token scopes granted to every job in this workflow.
permissions:
  contents: read        # access the repository with actions/checkout
  id-token: write       # request the JWT from GitHub's OIDC provider
  pull-requests: read   # retrieve information from the PR API
  statuses: write       # set commit status
concurrency:
  # A new run in the same group cancels the one still in progress.
  cancel-in-progress: true
  # The group name has three parts, one per line of the block scalar below:
  #   1. workflow name
  #   2. event type
  #   3. an identifier that depends on the event type:
  #        - schedule:          SHA
  #        - workflow_dispatch: PR number
  #
  # This gives each type of testing its own group name, so that a re-run
  # cancels the previous run of the same kind.
  group: |
    ${{ github.workflow }}
    ${{ github.event_name }}
    ${{
      (github.event_name == 'schedule' && github.sha) ||
      (github.event_name == 'workflow_dispatch' && github.event.inputs.PR-number)
    }}
# Workflow-wide settings, available to every job through the `env` context.
env:
  # test_name and cluster_name are joined by the "Set up job variables" step,
  # with ".k8s.local" appended, to form the kops cluster name. The suffix
  # keeps kops happy: it has stricter DNS naming requirements.
  test_name: scale-100
  # Unique per run and per re-run attempt.
  cluster_name: ${{ github.run_id }}-${{ github.run_attempt }}

  # Tool versions. Each renovate marker must stay directly above its value.
  # renovate: datasource=golang-version depName=go
  go_version: 1.23.3
  # renovate: datasource=docker depName=google/cloud-sdk
  gcloud_version: 499.0.0
jobs:
# Prints the workflow_dispatch inputs to the job log. Only runs for
# workflow_dispatch events.
echo-inputs:
  if: ${{ github.event_name == 'workflow_dispatch' }}
  name: Echo Workflow Dispatch Inputs
  runs-on: ubuntu-24.04
  steps:
    - name: Echo Workflow Dispatch Inputs
      env:
        # The inputs are caller-supplied text. Handing them to the script
        # through the environment, instead of splicing the expression into
        # the script source, keeps a quote character inside any input (for
        # example in the extra-args JSON) from terminating the shell string
        # and being executed as code.
        INPUTS_JSON: ${{ toJSON(inputs) }}
      run: |
        printf '%s\n' "${INPUTS_JSON}"
# Sets the initial commit status on the commit under test. This job has no
# `needs`, so it does not wait for any other job.
commit-status-start:
  runs-on: ubuntu-latest
  name: Commit Status Start
  steps:
    - name: Set initial commit status
      uses: myrotvorets/set-commit-status-action@3730c0a348a2ace3c110851bed53331bc6406e9f # v2.0.1
      with:
        # Dispatched runs supply the SHA under test as an input; other events
        # fall back to the SHA of the triggering event.
        # NOTE(review): no `status` is passed, so the action's default
        # applies (presumably "pending"); confirm against the action.
        sha: ${{ inputs.SHA || github.sha }}
# Main job: prepares the tooling, creates a kops cluster on GCE, installs
# Cilium, runs ClusterLoader2 against 100 workload nodes, and exports the
# results. The whole job is capped at 150 minutes.
install-and-scaletest:
  name: Install and Scale Test
  runs-on: ubuntu-latest
  timeout-minutes: 150
  steps:
    - name: Checkout context ref (trusted)
      uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
      with:
        # Do not leave the token in the local git config.
        persist-credentials: false
        # Dispatched runs check out the context ref input; other events use
        # the SHA of the triggering event.
        ref: ${{ inputs.context-ref || github.sha }}

    # Local composite action from the checked-out tree.
    # NOTE(review): presumably the source of the env.QUAY_ORGANIZATION_DEV,
    # env.CILIUM_CLI_* and env.GCP_PERF_RESULTS_BUCKET values that later
    # steps read but this workflow never defines; confirm.
    - name: Set Environment Variables
      uses: ./.github/actions/set-env-variables
# Computes the values shared by later steps and publishes them as step
# outputs: the SHA under test, the kops cluster name, and the flag list for
# `cilium install`.
- name: Set up job variables
  id: vars
  env:
    # The dispatch input is caller-supplied text, so it reaches the script
    # through the environment instead of being spliced into the script
    # source. github.event_name and github.sha are set by GitHub and are
    # still interpolated directly.
    DISPATCH_SHA: ${{ inputs.SHA }}
  run: |
    if [ "${{ github.event_name }}" = "workflow_dispatch" ] ; then
      SHA="${DISPATCH_SHA}"
    else
      SHA="${{ github.sha }}"
    fi

    # Appending k8s.local to the end keeps kops happy: it has stricter
    # DNS naming requirements.
    CLUSTER_NAME="${{ env.test_name }}-${{ env.cluster_name }}.k8s.local"

    CILIUM_INSTALL_DEFAULTS="--chart-directory=install/kubernetes/cilium \
      --set pprof.enabled=true \
      --helm-set=prometheus.enabled=true \
      --helm-set=cluster.name=${{ env.cluster_name }} \
      --helm-set=k8sServiceHost=api.internal.${CLUSTER_NAME} \
      --helm-set=k8sServicePort=443 \
      --helm-set=kubeProxyReplacement=true \
      --helm-set=operator.replicas=1 \
      --wait=false"

    # only add SHA to the image tags if it was set
    if [ -n "${SHA}" ]; then
      echo sha=${SHA} >> "$GITHUB_OUTPUT"
      CILIUM_INSTALL_DEFAULTS+=" --helm-set=image.repository=quay.io/${{ env.QUAY_ORGANIZATION_DEV }}/cilium-ci \
        --helm-set=image.useDigest=false \
        --helm-set=image.tag=${SHA} \
        --helm-set=operator.image.repository=quay.io/${{ env.QUAY_ORGANIZATION_DEV }}/operator \
        --helm-set=operator.image.suffix=-ci \
        --helm-set=operator.image.tag=${SHA} \
        --helm-set=operator.image.useDigest=false \
        --helm-set=clustermesh.apiserver.image.repository=quay.io/${{ env.QUAY_ORGANIZATION_DEV }}/clustermesh-apiserver-ci \
        --helm-set=clustermesh.apiserver.image.tag=${SHA} \
        --helm-set=clustermesh.apiserver.image.useDigest=false \
        --helm-set=hubble.relay.image.repository=quay.io/${{ env.QUAY_ORGANIZATION_DEV }}/hubble-relay-ci \
        --helm-set=hubble.relay.image.tag=${SHA} \
        --helm-set=hubble.relay.image.useDigest=false"
    fi

    echo SHA=${SHA} >> "$GITHUB_OUTPUT"
    # The variable is left unquoted on purpose: word splitting collapses the
    # runs of spaces left by the line continuations above, so the output is
    # a single line with single-space-separated flags.
    echo cilium_install_defaults=${CILIUM_INSTALL_DEFAULTS} >> "$GITHUB_OUTPUT"
    echo CLUSTER_NAME=${CLUSTER_NAME} >> "$GITHUB_OUTPUT"
# Polls quay.io every 45 seconds until each CI image tagged with the SHA
# under test has a manifest. The step is abandoned after 30 minutes.
- name: Wait for images to be available
  timeout-minutes: 30
  shell: bash
  env:
    # The tag comes from the vars step and, for dispatched runs, from a
    # caller-supplied input. It is passed through the environment so it is
    # never parsed as part of the script.
    IMAGE_TAG: ${{ steps.vars.outputs.SHA }}
  run: |
    for image in cilium-ci operator-generic-ci hubble-relay-ci ; do
      until docker manifest inspect "quay.io/${{ env.QUAY_ORGANIZATION_DEV }}/${image}:${IMAGE_TAG}" &> /dev/null; do sleep 45s; done
    done
# Toolchain and cloud access: Go, kops, and the gcloud credentials and CLI.
- name: Install Go
  uses: actions/setup-go@41dfa10bad2bb2ae585af6ee5bb4d7d973ad74ed # v5.1.0
  with:
    # Pinned in the workflow-level env block.
    go-version: ${{ env.go_version }}

- name: Install Kops
  uses: cilium/scale-tests-action/install-kops@d3ecfd83003f3e9c98ba125ca14933401d44918f # main

# NOTE(review): presumably the consumer of the `id-token: write` permission
# (workload identity federation); confirm.
- name: Setup gcloud credentials
  uses: google-github-actions/auth@6fc4af4b145ae7821d527454aa9bd537d1f2dc5f # v2.1.7
  with:
    service_account: ${{ secrets.GCP_PERF_SA }}
    workload_identity_provider: ${{ secrets.GCP_PERF_WORKLOAD_IDENTITY_PROVIDER }}
    create_credentials_file: true
    export_environment_variables: true

- name: Setup gcloud CLI
  uses: google-github-actions/setup-gcloud@6189d56e4096ee891640bb02ac264be376592d6a # v2.1.2
  with:
    # Pinned in the workflow-level env block.
    version: ${{ env.gcloud_version }}
    project_id: ${{ secrets.GCP_PERF_PROJECT_ID }}
# Fetches only the clusterloader2 directory of kubernetes/perf-tests into
# ./perf-tests, at a fixed commit.
- name: Clone ClusterLoader2
  uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
  with:
    repository: kubernetes/perf-tests
    path: perf-tests
    sparse-checkout: clusterloader2
    persist-credentials: false
    # Avoid using renovate to update this dependency because: (1)
    # perf-tests does not tag or release, so renovate will pull
    # all updates to the default branch and (2) continually
    # updating CL2 may impact the stability of the scale test
    # results.
    ref: 6eb52ac89d5de15a0ad13cfeb2b2026e57ce4f64

# Adds this repository's CL2 modules to the perf-tests checkout, writes the
# override file that enables the extra measurement modules, and builds the
# clusterloader binary.
- name: Setup CL2
  run: |
    # CL2 needs ssh access to control plane nodes
    gcloud compute config-ssh

    # Copy the custom configs to the folder where CL2 expects them.
    cp -r .github/actions/cl2-modules ./perf-tests/clusterloader2/testing/custom

    cd ./perf-tests/clusterloader2

    # CL2 hardcodes module paths to live in ./testing/load, even
    # if the path given is relative.
    cp ../../.github/actions/cl2-modules/cilium-agent-pprofs.yaml ./testing/load/
    cp ../../.github/actions/cl2-modules/cilium-metrics.yaml ./testing/load/

    # Passed to the main CL2 run via --testoverrides=./modules.yaml.
    echo '{"CL2_ADDITIONAL_MEASUREMENT_MODULES": ["./cilium-agent-pprofs.yaml", "./cilium-metrics.yaml"]}' > modules.yaml

    go build ./cmd/clusterloader.go
# Creates the initial cluster with one control-plane node and one worker.
# The 100 workload nodes are added later as a separate instance group.
# The step id is referenced by the "Cleanup cluster" condition.
- name: Deploy cluster
  id: deploy-cluster
  timeout-minutes: 30
  uses: cilium/scale-tests-action/create-cluster@d3ecfd83003f3e9c98ba125ca14933401d44918f # main
  with:
    # NOTE(review): the vars step writes this output as CLUSTER_NAME; the
    # lowercase lookup relies on case-insensitive property access. Confirm.
    cluster_name: ${{ steps.vars.outputs.cluster_name }}
    project_id: ${{ secrets.GCP_PERF_PROJECT_ID }}
    kops_state: ${{ secrets.GCP_PERF_KOPS_STATE_STORE }}
    control_plane_count: 1
    control_plane_size: n1-standard-8
    node_count: 1
    node_size: e2-standard-8
    # Cilium is installed with kubeProxyReplacement=true.
    kube_proxy_enabled: false

- name: Setup firewall rules
  uses: cilium/scale-tests-action/setup-firewall@d3ecfd83003f3e9c98ba125ca14933401d44918f # main
  with:
    cluster_name: ${{ steps.vars.outputs.cluster_name }}
# Installs the cilium CLI for the SHA under test.
# NOTE(review): the CILIUM_CLI_* env values are not defined in this
# workflow; presumably the "Set Environment Variables" step exports them.
# Confirm.
- name: Install Cilium CLI
  uses: cilium/cilium-cli@3286926bbf80fdd0103a372256459e577224f9f6 # v0.16.20
  with:
    image-tag: ${{ inputs.SHA || github.sha }}
    image-repo: ${{ env.CILIUM_CLI_IMAGE_REPO }}
    skip-build: ${{ env.CILIUM_CLI_SKIP_BUILD }}

# Records the exact tool versions in the job log.
- name: Display version info of installed tools
  run: |
    echo "--- go ---"
    go version

    echo "--- cilium-cli ---"
    cilium version --client

    echo "--- kops ---"
    ./kops version

    echo "--- gcloud ---"
    gcloud version
# Installs Cilium with the flag list built by the vars step. The first
# invocation only prints the resulting Helm values to the log; the second
# performs the install.
- name: Install Cilium
  env:
    # The flag list embeds the SHA under test, which is caller-supplied for
    # dispatched runs. It is passed through the environment so it is never
    # parsed as script source.
    INSTALL_FLAGS: ${{ steps.vars.outputs.cilium_install_defaults }}
  run: |
    # Unquoted on purpose: the value is a space-separated list of flags and
    # must be word-split into separate arguments.
    cilium install --dry-run-helm-values ${INSTALL_FLAGS}
    cilium install ${INSTALL_FLAGS}
# First validation pass, after Cilium is installed on the initial cluster.
# Checks run at 10s intervals; the step is capped at 20 minutes.
- name: Wait for cluster to be ready
  timeout-minutes: 20
  uses: cilium/scale-tests-action/validate-cluster@d3ecfd83003f3e9c98ba125ca14933401d44918f # main
  with:
    cluster_name: ${{ steps.vars.outputs.cluster_name }}
    kops_state: ${{ secrets.GCP_PERF_KOPS_STATE_STORE }}
    interval: 10s
# Brings up the Prometheus monitoring stack through CL2 before the workload
# nodes exist. Prometheus is left running (tear-down disabled) and the
# output is saved to cl2-setup.txt in the CL2 directory.
- name: Run CL2 to setup prometheus
  timeout-minutes: 10
  shell: bash
  working-directory: ./perf-tests/clusterloader2
  env:
    CL2_PROMETHEUS_MEMORY_SCALE_FACTOR: 2.0
    CL2_PROMETHEUS_PVC_ENABLED: "false"
    CL2_PROMETHEUS_SCRAPE_CILIUM_AGENT: "true"
    CL2_PROMETHEUS_SCRAPE_CILIUM_OPERATOR: "true"
  run: |
    # Don't run any tasks at this point, just setup the monitoring stack
    ./clusterloader \
      -v=2 \
      --testconfig=./testing/custom/common/setup.yaml \
      --testoverrides=./testing/prometheus/not-scrape-kube-proxy.yaml \
      --provider=gce \
      --enable-exec-service=false \
      --enable-prometheus-server \
      --tear-down-prometheus-server=false \
      --kubeconfig=$HOME/.kube/config \
      2>&1 | tee cl2-setup.txt
# Adds the "workloads" instance group of 100 e2-medium nodes that the scale
# test runs against.
- name: Create Instance Group for workload deployments
  timeout-minutes: 30
  uses: cilium/scale-tests-action/create-instance-group@d3ecfd83003f3e9c98ba125ca14933401d44918f # main
  with:
    cluster_name: ${{ steps.vars.outputs.cluster_name }}
    kops_state: ${{ secrets.GCP_PERF_KOPS_STATE_STORE }}
    ig_name: workloads
    node_count: 100
    node_size: e2-medium

# Second validation pass, now including the workload nodes.
- name: Wait for cluster to be ready
  timeout-minutes: 20
  uses: cilium/scale-tests-action/validate-cluster@d3ecfd83003f3e9c98ba125ca14933401d44918f # main
  with:
    cluster_name: ${{ steps.vars.outputs.cluster_name }}
    kops_state: ${{ secrets.GCP_PERF_KOPS_STATE_STORE }}
    interval: 10s

# Second firewall pass; unlike the first one, it sets
# create_native_routing_firewall to 'false'.
- name: Setup firewall rules
  uses: cilium/scale-tests-action/setup-firewall@d3ecfd83003f3e9c98ba125ca14933401d44918f # main
  with:
    cluster_name: ${{ steps.vars.outputs.cluster_name }}
    create_native_routing_firewall: 'false'

# Cilium was installed with --wait=false, so block here until it reports
# ready.
- name: Wait for Cilium status to be ready
  run: |
    cilium status --wait
# The scale test itself: runs the CL2 load config against 100 nodes, with
# the report written to ./report and the console output saved to
# cl2-output.txt. The step id is referenced by the sysdump and export
# conditions. The --testoverrides flags are kept in their original order.
- name: Run CL2
  id: run-cl2
  timeout-minutes: 40
  shell: bash
  working-directory: ./perf-tests/clusterloader2
  env:
    CL2_ALLOWED_SLOW_API_CALLS: 1
    CL2_ENABLE_NETWORKPOLICIES: "true"
    CL2_ENABLE_PVS: "false"
    CL2_PROMETHEUS_MEMORY_SCALE_FACTOR: 2.0
    CL2_PROMETHEUS_PVC_ENABLED: "false"
    CL2_PROMETHEUS_SCRAPE_CILIUM_AGENT: "true"
    CL2_PROMETHEUS_SCRAPE_CILIUM_OPERATOR: "true"
    CL2_SCHEDULER_THROUGHPUT_THRESHOLD: 0
  run: |
    ./clusterloader \
      -v=2 \
      --testconfig=./testing/load/config.yaml \
      --provider=gce \
      --enable-prometheus-server \
      --tear-down-prometheus-server=false \
      --nodes=100 \
      --report-dir=./report \
      --experimental-prometheus-snapshot-to-report-dir=true \
      --kubeconfig=$HOME/.kube/config \
      --testoverrides=./testing/overrides/load_throughput.yaml \
      --testoverrides=./testing/experiments/use_simple_latency_query.yaml \
      --testoverrides=./testing/prometheus/not-scrape-kube-proxy.yaml \
      --testoverrides=./modules.yaml \
      2>&1 | tee cl2-output.txt
# Post-run steps. All of them use always(), so they also run after a failed
# test.

# Runs whenever the CL2 step actually ran (neither skipped nor cancelled).
- name: Get sysdump
  if: ${{ always() && steps.run-cl2.outcome != 'skipped' && steps.run-cl2.outcome != 'cancelled' }}
  run: |
    cilium status
    cilium sysdump --output-filename cilium-sysdump-final
    sudo chmod +r cilium-sysdump-final.zip

# Tears the cluster down whenever the deploy step was attempted, even if a
# later step failed.
- name: Cleanup cluster
  if: ${{ always() && steps.deploy-cluster.outcome != 'skipped' }}
  uses: cilium/scale-tests-action/cleanup-cluster@d3ecfd83003f3e9c98ba125ca14933401d44918f # main
  with:
    cluster_name: ${{ steps.vars.outputs.cluster_name }}
    kops_state: ${{ secrets.GCP_PERF_KOPS_STATE_STORE }}

# Same condition as the sysdump step. Exports the CL2 report directory plus
# the sysdump archive and the CL2 console log.
- name: Export results and sysdump to GS bucket
  if: ${{ always() && steps.run-cl2.outcome != 'skipped' && steps.run-cl2.outcome != 'cancelled' }}
  uses: cilium/scale-tests-action/export-results@d3ecfd83003f3e9c98ba125ca14933401d44918f # main
  with:
    test_name: ${{ env.test_name }}
    results_bucket: ${{ env.GCP_PERF_RESULTS_BUCKET }}
    artifacts: ./perf-tests/clusterloader2/report/*
    other_files: cilium-sysdump-final.zip ./perf-tests/clusterloader2/cl2-output.txt
# Reports the result of the scale-test job on the commit under test. Runs
# after install-and-scaletest whatever its outcome.
commit-status-final:
  name: Commit Status Final
  needs: install-and-scaletest
  if: ${{ always() }}
  runs-on: ubuntu-latest
  steps:
    - name: Set final commit status
      uses: myrotvorets/set-commit-status-action@3730c0a348a2ace3c110851bed53331bc6406e9f # v2.0.1
      with:
        # Same SHA selection as the initial status job.
        sha: ${{ inputs.SHA || github.sha }}
        # The scale-test job's result becomes the commit status.
        status: ${{ needs.install-and-scaletest.result }}