Cluster Mesh Scale Test (scale-clustermesh) #33
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Title under which this workflow is listed.
name: Cluster Mesh Scale Test (scale-clustermesh)

on:
  # Weekdays (Monday to Friday) at minute 39 of hour 12.
  schedule:
    - cron: '39 12 * * 1-5'

  # Dispatched runs that target a specific pull request.
  workflow_dispatch:
    inputs:
      PR-number:
        description: "Pull request number."
        required: true
      context-ref:
        description: "Context in which the workflow runs. If PR is from a fork, will be the PR target branch (general case). If PR is NOT from a fork, will be the PR branch itself (this allows committers to test changes to workflows directly from PRs)."
        required: true
      SHA:
        description: "SHA under test (head of the PR branch)."
        required: true
      extra-args:
        description: "[JSON object] Arbitrary arguments passed from the trigger comment via regex capture group. Parse with 'fromJson(inputs.extra-args).argName' in workflow."
        required: false
        default: '{}'

  # To test changes to this workflow, uncomment the lines below:
  # push:
  #   branches:
  #     - your_branch_name
# Token scopes granted to the jobs of this workflow.
permissions:
  # Lets actions/checkout read the repository.
  contents: read
  # Lets the workflow request a JWT from GitHub's OIDC provider.
  id-token: write
  # Lets the workflow retrieve information from the PR API.
  pull-requests: read
  # Lets the workflow set commit statuses.
  statuses: write
concurrency:
  # The group name is built from three parts:
  #   1. the workflow name
  #   2. the event type
  #   3. an identifier that depends on the event type:
  #        - schedule:          the commit SHA
  #        - workflow_dispatch: the PR number
  #
  # Each kind of run therefore gets its own group, and a re-run of the same
  # kind cancels the run that is still in progress.
  group: |
    ${{ github.workflow }}
    ${{ github.event_name }}
    ${{
      (github.event_name == 'schedule' && github.sha) ||
      (github.event_name == 'workflow_dispatch' && github.event.inputs.PR-number)
    }}
  cancel-in-progress: true
# Workflow-wide settings, referenced below as `env.<name>`.
env:
  # Go toolchain version handed to actions/setup-go.
  # renovate: datasource=golang-version depName=go
  go_version: 1.22.5
  # Google Cloud SDK version handed to setup-gcloud.
  # renovate: datasource=docker depName=google/cloud-sdk
  gcloud_version: 483.0.0
  # cilium/scaffolding commit: used both as the checkout ref and as the
  # image tag of the Cluster Mesh API Server mock.
  # renovate: datasource=git-refs depName=https://github.com/cilium/scaffolding branch=main
  cmapisrv_mock_ref: 40f01dfc4b5781d232bc2027ae50017cfb291a01
  # Test identifier: prefixes the cluster name and labels the exported results.
  test_name: scale-clustermesh
  # Differs for every run and every re-run attempt.
  cluster_name: ${{ github.run_id }}-${{ github.run_attempt }}
  # Number of clusters configured on the mock and in the Cilium
  # clustermesh configuration.
  mock_clusters: 250
jobs:
  # Prints the dispatch inputs to the job log; only runs for
  # workflow_dispatch events.
  echo-inputs:
    if: ${{ github.event_name == 'workflow_dispatch' }}
    name: Echo Workflow Dispatch Inputs
    runs-on: ubuntu-latest
    steps:
      - name: Echo Workflow Dispatch Inputs
        env:
          # The inputs are handed over through the environment instead of
          # being expanded inside the script text: expanding them inline
          # would let a quote character in any input terminate the shell
          # string and run as code.
          DISPATCH_INPUTS: ${{ toJSON(inputs) }}
        run: |
          printf '%s\n' "${DISPATCH_INPUTS}"
# Publishes a commit status for the commit under test when the workflow starts.
commit-status-start:
  runs-on: ubuntu-latest
  name: Commit Status Start
  steps:
    - name: Set initial commit status
      uses: myrotvorets/set-commit-status-action@3730c0a348a2ace3c110851bed53331bc6406e9f # v2.0.1
      with:
        # Use the SHA input when one was provided, otherwise the commit
        # that triggered the workflow.
        sha: ${{ inputs.SHA || github.sha }}
# Main job: deploys a cluster, installs Cilium and runs the scale test.
install-and-test:
  name: Install and Cluster Mesh Scale Test
  runs-on: ubuntu-latest
  timeout-minutes: 60
  steps:
    # Checks out the context ref input when provided, otherwise the
    # triggering commit. Credentials are not kept in the local git config.
    - name: Checkout context ref (trusted)
      uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7
      with:
        persist-credentials: false
        ref: ${{ inputs.context-ref || github.sha }}

    # Local action from the checked-out tree.
    # NOTE(review): presumably the source of env.QUAY_ORGANIZATION_DEV,
    # env.CILIUM_CLI_* and env.GCP_PERF_RESULTS_BUCKET used later — confirm.
    - name: Set Environment Variables
      uses: ./.github/actions/set-env-variables
# Computes the SHA under test and the cluster name, and exposes both as
# step outputs (steps.vars.outputs.SHA / steps.vars.outputs.CLUSTER_NAME).
- name: Set up job variables
  id: vars
  env:
    # Expression values reach the script through the environment instead of
    # being pasted into its text, so a crafted dispatch input cannot be
    # interpreted as shell code.
    EVENT_NAME: ${{ github.event_name }}
    DISPATCH_SHA: ${{ inputs.SHA }}
    TRIGGER_SHA: ${{ github.sha }}
  run: |
    # Dispatched runs test the SHA given as input; every other event tests
    # the commit that triggered the workflow.
    if [ "${EVENT_NAME}" = "workflow_dispatch" ]; then
      SHA="${DISPATCH_SHA}"
    else
      SHA="${TRIGGER_SHA}"
    fi

    # The k8s.local suffix keeps kops happy: it has stricter DNS naming
    # requirements for cluster names.
    CLUSTER_NAME="${{ env.test_name }}-${{ env.cluster_name }}.k8s.local"

    echo "SHA=${SHA}" >> "$GITHUB_OUTPUT"
    echo "CLUSTER_NAME=${CLUSTER_NAME}" >> "$GITHUB_OUTPUT"
# Blocks until the three CI images tagged with the SHA under test can be
# inspected in the registry; the step timeout bounds the total wait.
- name: Wait for images to be available
  shell: bash
  timeout-minutes: 30
  run: |
    for img in cilium-ci operator-generic-ci clustermesh-apiserver-ci; do
      # Retry every 45 seconds until the manifest can be fetched.
      while ! docker manifest inspect quay.io/${{ env.QUAY_ORGANIZATION_DEV }}/$img:${{ steps.vars.outputs.SHA }} > /dev/null 2>&1; do
        sleep 45s
      done
    done
# Go is needed later to run ClusterLoader2 through `go run`.
- name: Install Go
  uses: actions/setup-go@0a12ed9d6a96ab950c8f026ed9f722fe0da7ef32 # v5.0.2
  with:
    go-version: ${{ env.go_version }}

# A later step invokes the resulting binary as ./kops.
- name: Install Kops
  uses: cilium/scale-tests-action/install-kops@c3a2f10946bf94430b40fd9e3ad40834add6d555 # main
# Authenticates to Google Cloud with the configured workload identity
# provider and service account; a credentials file is created and the
# related environment variables are exported.
- name: Setup gcloud credentials
  uses: google-github-actions/auth@71fee32a0bb7e97b4d33d548e7d957010649d8fa # v2.1.3
  with:
    service_account: ${{ secrets.GCP_PERF_SA }}
    workload_identity_provider: ${{ secrets.GCP_PERF_WORKLOAD_IDENTITY_PROVIDER }}
    create_credentials_file: true
    export_environment_variables: true

# Installs the gcloud CLI at the version pinned in env.gcloud_version.
- name: Setup gcloud CLI
  uses: google-github-actions/setup-gcloud@98ddc00a17442e89a24bbf282954a3b65ce6d200 # v2.1.0
  with:
    version: ${{ env.gcloud_version }}
    project_id: ${{ secrets.GCP_PERF_PROJECT_ID }}
# Fetches only the clusterloader2 directory of kubernetes/perf-tests into
# ./perf-tests.
- name: Clone ClusterLoader2
  uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7
  with:
    repository: kubernetes/perf-tests
    path: perf-tests
    sparse-checkout: clusterloader2
    persist-credentials: false
    # This ref is deliberately not managed by renovate:
    #   1. perf-tests has no tags or releases, so renovate would pull in
    #      every update to the default branch;
    #   2. continually updating CL2 may impact the stability of the scale
    #      test results.
    ref: 6eb52ac89d5de15a0ad13cfeb2b2026e57ce4f64
# Fetches only the cmapisrv-mock directory of cilium/scaffolding into
# ./scaffolding, at the commit pinned in env.cmapisrv_mock_ref.
- name: Clone the Cluster Mesh API Server Mock
  uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7
  with:
    repository: cilium/scaffolding
    path: scaffolding
    sparse-checkout: cmapisrv-mock
    persist-credentials: false
    ref: ${{ env.cmapisrv_mock_ref }}
# Creates the test cluster: one control-plane node and one worker node,
# both n2-standard-8, with kube-proxy disabled. The step id is referenced
# by the cleanup step's condition.
- name: Deploy cluster
  id: deploy-cluster
  timeout-minutes: 30
  uses: cilium/scale-tests-action/create-cluster@c3a2f10946bf94430b40fd9e3ad40834add6d555 # main
  with:
    cluster_name: ${{ steps.vars.outputs.CLUSTER_NAME }}
    project_id: ${{ secrets.GCP_PERF_PROJECT_ID }}
    kops_state: ${{ secrets.GCP_PERF_KOPS_STATE_STORE }}
    control_plane_count: 1
    control_plane_size: n2-standard-8
    node_count: 1
    node_size: n2-standard-8
    node_cidr: 100.0.0.0/16
    kube_proxy_enabled: false
# Installs the `cilium` CLI used by the following steps; all three
# parameters are read from the CILIUM_CLI_* environment variables.
- name: Install Cilium CLI
  uses: cilium/cilium-cli@e386af2b9f500e4c40436ac660cd6602da104fc7 # v0.16.14
  with:
    image-repo: ${{ env.CILIUM_CLI_IMAGE_REPO }}
    image-tag: ${{ env.CILIUM_CLI_VERSION }}
    skip-build: ${{ env.CILIUM_CLI_SKIP_BUILD }}
# Records the versions of the installed tools in the job log.
- name: Display version info of installed tools
  run: |
    # Print a separator line naming the tool whose version follows.
    banner() { echo "--- $1 ---"; }

    banner go
    go version

    banner cilium-cli
    cilium version --client

    banner kops
    ./kops version

    banner gcloud
    gcloud version
# Configures firewall rules for the cluster created above.
- name: Setup firewall rules
  uses: cilium/scale-tests-action/setup-firewall@c3a2f10946bf94430b40fd9e3ad40834add6d555 # main
  with:
    cluster_name: ${{ steps.vars.outputs.CLUSTER_NAME }}
# WARNING: this workflow is privileged. Every step from here on must take
# care not to execute untrusted code.
#
# Only the Helm chart directory of the SHA under test is fetched, into
# ./untrusted.
- name: Checkout pull request branch (NOT TRUSTED)
  uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7
  with:
    path: untrusted
    ref: ${{ steps.vars.outputs.SHA }}
    persist-credentials: false
    sparse-checkout: |
      install/kubernetes/cilium
# Installs Cilium from the chart of the SHA under test, using the CI images
# built for that same SHA.
- name: Install Cilium
  run: |
    # Notes on the non-default settings below:
    # * bpf.nodeMapMax is raised so that the node BPF map accounts for the
    #   total number of nodes.
    # * Health checking is disabled, as mocked nodes are unreachable.
    images="quay.io/${{ env.QUAY_ORGANIZATION_DEV }}"
    tag="${{ steps.vars.outputs.SHA }}"

    cilium install \
      --chart-directory=untrusted/install/kubernetes/cilium \
      --set image.override="${images}/cilium-ci:${tag}" \
      --set operator.image.override="${images}/operator-generic-ci:${tag}" \
      --set clustermesh.apiserver.image.override="${images}/clustermesh-apiserver-ci:${tag}" \
      --set ipam.mode=kubernetes \
      --set kubeProxyReplacement=true \
      --set k8sServiceHost=api.internal.${{ steps.vars.outputs.CLUSTER_NAME }} \
      --set k8sServicePort=443 \
      --set pprof.enabled=true \
      --set prometheus.enabled=true \
      --set cluster.name=${{ env.test_name }}-${{ env.cluster_name }} \
      --set cluster.id=255 \
      --set operator.replicas=1 \
      --set operator.nodeSelector.node-role\\.kubernetes\\.io/control-plane= \
      --set bpf.nodeMapMax=65536 \
      --set healthChecking=false \
      --set endpointHealthChecking.enabled=false
# Cluster validation has to come after the Cilium installation, because it
# requires system pods (e.g., coredns) to be running.
- name: Wait for cluster to be ready
  timeout-minutes: 20
  uses: cilium/scale-tests-action/validate-cluster@c3a2f10946bf94430b40fd9e3ad40834add6d555 # main
  with:
    kops_state: ${{ secrets.GCP_PERF_KOPS_STATE_STORE }}
    cluster_name: ${{ steps.vars.outputs.CLUSTER_NAME }}

# Blocks until `cilium status` reports ready.
- name: Wait for Cilium status to be ready
  run: cilium status --wait
# Prepares the runner for ClusterLoader2 (CL2).
- name: Setup CL2
  run: |
    # Give CL2 ssh access to the control plane nodes.
    gcloud compute config-ssh

    # Place the custom modules in the directory where CL2 looks for them.
    cp -r .github/actions/cl2-modules ./perf-tests/clusterloader2/testing/custom
# Runs ClusterLoader2 with the setup-only test config to deploy the
# monitoring stack; the Prometheus server is left running afterwards.
- name: Run CL2 to setup prometheus
  # Requesting bash explicitly makes the runner start it with
  # `-eo pipefail`; with the implicit default (`bash -e`) the exit status of
  # the pipeline below would be that of `tee`, hiding a clusterloader failure.
  shell: bash
  working-directory: ./perf-tests/clusterloader2
  env:
    CL2_PROMETHEUS_PVC_ENABLED: "false"
    CL2_PROMETHEUS_SCRAPE_CILIUM_OPERATOR: "true"
    CL2_PROMETHEUS_SCRAPE_CILIUM_AGENT: "true"
    # Quoted like the other values so that it is passed as a string rather
    # than as a YAML float.
    CL2_PROMETHEUS_MEMORY_SCALE_FACTOR: "4.0"
  timeout-minutes: 10
  run: |
    # Don't run any tasks at this point, just setup the monitoring stack
    go run ./cmd/clusterloader.go \
      -v=2 \
      --testconfig=./testing/custom/clustermesh/setup.yaml \
      --testoverrides=./testing/prometheus/not-scrape-kube-proxy.yaml \
      --provider=gce \
      --enable-prometheus-server \
      --tear-down-prometheus-server=false \
      --kubeconfig="$HOME/.kube/config" \
      2>&1 | tee cl2-setup.txt
# Installs the mock from the chart checked out under ./scaffolding and waits
# up to 300s for its pod to become Ready. The node selector and toleration
# place it on a control-plane node.
- name: Deploy the Cluster Mesh API Server Mock
  run: |
    chart=./scaffolding/cmapisrv-mock/deploy/cmapisrv-mock

    helm install cmapisrv-mock \
      "${chart}" \
      --namespace kube-system \
      --set image.repository=quay.io/cilium/cmapisrv-mock \
      --set image.tag=${{ env.cmapisrv_mock_ref }} \
      --set nodeSelector.node-role\\.kubernetes\\.io/control-plane= \
      --set tolerations[0].key=node-role.kubernetes.io/control-plane \
      --set tolerations[0].operator=Exists \
      --set config.ipv6=false \
      --set config.clusters=${{ env.mock_clusters }} \
      --set config.nodes=100 \
      --set config.nodesQPS=0.1 \
      --set config.identities=100 \
      --set config.identitiesQPS=0.2 \
      --set config.endpoints=1000 \
      --set config.endpointsQPS=1 \
      --set config.services=0 \
      --set config.servicesQPS=0 \
      --set serviceMonitor=true

    kubectl --namespace kube-system wait pod \
      --selector app.kubernetes.io/name=cmapisrv-mock \
      --for=condition=Ready --timeout=300s
# Generates a Helm values file with one clustermesh entry per mocked
# cluster, upgrades Cilium with it and waits for the mesh to become ready.
- name: Enable KVStoreMesh and configure Cilium to connect to the Cluster Mesh API Server Mock
  run: |
    # Header of the values file. The nesting matters: `clusters` must sit
    # under clustermesh.config for the list entries below to land there.
    cat <<EOF > values-clustermesh-config.yaml
    clustermesh:
      config:
        enabled: true
        clusters:
    EOF

    # Append one entry per mocked cluster (cluster-001, cluster-002, ...),
    # all pointing at the mock service. The leading spaces keep `address`
    # and `port` aligned with `name` inside each list item.
    for i in $(seq 1 ${{ env.mock_clusters }}); do
      printf "    - name: cluster-%03d\n" "${i}"
      printf "      address: cmapisrv-mock.kube-system.svc\n"
      printf "      port: 2379\n"
    done >> values-clustermesh-config.yaml

    # * We enable KVStoreMesh only at this point to leverage the bootstrap QPS
    #   and speed-up the overall bootstrap process.
    # * Increase the KVStoreMesh QPS to match the ones of the cmapisrv-mock;
    #   this is not a problem considering the limited number of watchers.
    # * Store etcd data directly in memory, for improved performance.
    cilium upgrade --reuse-values \
      --chart-directory=untrusted/install/kubernetes/cilium \
      --set clustermesh.useAPIServer=true \
      --set clustermesh.apiserver.etcd.storageMedium=Memory \
      --set clustermesh.apiserver.kvstoremesh.enabled=true \
      --set clustermesh.apiserver.kvstoremesh.extraArgs[0]=--kvstore-opt=etcd.qps=1000 \
      --set clustermesh.apiserver.nodeSelector.node-role\\.kubernetes\\.io/control-plane= \
      --set clustermesh.apiserver.tolerations[0].key=node-role.kubernetes.io/control-plane \
      --set clustermesh.apiserver.tolerations[0].operator=Exists \
      --set clustermesh.apiserver.metrics.serviceMonitor.enabled=true \
      --values values-clustermesh-config.yaml

    cilium status --wait
    cilium clustermesh status --wait --wait-duration=5m
# Runs the actual scale test. The report is written to ./report and the
# console output is kept in cl2-output.txt; both are exported later. The
# step id is referenced by the conditions of the sysdump and export steps.
- name: Run CL2
  id: run-cl2
  # Requesting bash explicitly makes the runner start it with
  # `-eo pipefail`; with the implicit default (`bash -e`) the exit status of
  # the pipeline below would be that of `tee`, and a failed test run would
  # be reported as a success.
  shell: bash
  working-directory: ./perf-tests/clusterloader2
  env:
    CL2_PROMETHEUS_PVC_ENABLED: "false"
    CL2_PROMETHEUS_SCRAPE_CILIUM_OPERATOR: "true"
    CL2_PROMETHEUS_SCRAPE_CILIUM_AGENT: "true"
    # Quoted like the other values so that it is passed as a string rather
    # than as a YAML float.
    CL2_PROMETHEUS_MEMORY_SCALE_FACTOR: "4.0"
  timeout-minutes: 30
  run: |
    go run ./cmd/clusterloader.go \
      -v=2 \
      --testconfig=./testing/custom/clustermesh/config.yaml \
      --testoverrides=./testing/prometheus/not-scrape-kube-proxy.yaml \
      --provider=gce \
      --nodes=1 \
      --enable-prometheus-server \
      --tear-down-prometheus-server=false \
      --report-dir=./report \
      --experimental-prometheus-snapshot-to-report-dir=true \
      --kubeconfig="$HOME/.kube/config" \
      2>&1 | tee cl2-output.txt
# Collects a Cilium sysdump whenever the CL2 step actually ran, whether or
# not it (or an earlier step) failed.
# NOTE(review): if `cilium status` exits non-zero here, the sysdump command
# after it would not run — confirm that this is acceptable.
- name: Get sysdump
  if: ${{ always() && !(steps.run-cl2.outcome == 'skipped' || steps.run-cl2.outcome == 'cancelled') }}
  run: |
    cilium status
    cilium sysdump --output-filename cilium-sysdump-final
# Runs regardless of earlier failures, as long as the deploy step was not
# skipped.
- name: Cleanup cluster
  if: ${{ always() && steps.deploy-cluster.outcome != 'skipped' }}
  uses: cilium/scale-tests-action/cleanup-cluster@c3a2f10946bf94430b40fd9e3ad40834add6d555 # main
  with:
    kops_state: ${{ secrets.GCP_PERF_KOPS_STATE_STORE }}
    cluster_name: ${{ steps.vars.outputs.CLUSTER_NAME }}
# Uploads the CL2 report, the CL2 console output and the sysdump archive
# whenever the CL2 step actually ran.
- name: Export results and sysdump to GS bucket
  if: ${{ always() && !(steps.run-cl2.outcome == 'skipped' || steps.run-cl2.outcome == 'cancelled') }}
  uses: cilium/scale-tests-action/export-results@c3a2f10946bf94430b40fd9e3ad40834add6d555 # main
  with:
    results_bucket: ${{ env.GCP_PERF_RESULTS_BUCKET }}
    test_name: ${{ env.test_name }}
    artifacts: ./perf-tests/clusterloader2/report/*
    other_files: cilium-sysdump-final.zip ./perf-tests/clusterloader2/cl2-output.txt
# Always runs once install-and-test has finished and reports that job's
# result as the commit status of the commit under test.
commit-status-final:
  name: Commit Status Final
  runs-on: ubuntu-latest
  needs: install-and-test
  if: ${{ always() }}
  steps:
    - name: Set final commit status
      uses: myrotvorets/set-commit-status-action@3730c0a348a2ace3c110851bed53331bc6406e9f # v2.0.1
      with:
        # Use the SHA input when one was provided, otherwise the commit
        # that triggered the workflow.
        sha: ${{ inputs.SHA || github.sha }}
        status: ${{ needs.install-and-test.result }}