Skip to content

Cluster Mesh Scale Test (scale-clustermesh) #33

Cluster Mesh Scale Test (scale-clustermesh)

Cluster Mesh Scale Test (scale-clustermesh) #33

name: Cluster Mesh Scale Test (scale-clustermesh)

# Two triggers:
#   - schedule: 12:39 UTC, Monday through Friday.
#   - workflow_dispatch: manual/programmatic runs against a pull request.
on:
  schedule:
    - cron: "39 12 * * 1-5"
  workflow_dispatch:
    inputs:
      PR-number:
        description: 'Pull request number.'
        required: true
      context-ref:
        description: >-
          Context in which the workflow runs. If PR is from a fork, will be the
          PR target branch (general case). If PR is NOT from a fork, will be the
          PR branch itself (this allows committers to test changes to workflows
          directly from PRs).
        required: true
      SHA:
        description: 'SHA under test (head of the PR branch).'
        required: true
      extra-args:
        description: >-
          [JSON object] Arbitrary arguments passed from the trigger comment via
          regex capture group. Parse with 'fromJson(inputs.extra-args).argName'
          in workflow.
        required: false
        default: "{}"
  # To exercise this workflow from a branch while developing it, uncomment
  # the trigger below and substitute the branch name:
  # push:
  #   branches:
  #     - your_branch_name
# Token scopes granted to every job in this workflow.
permissions:
  # Read the repository contents (needed by actions/checkout).
  contents: read
  # Request a JWT from GitHub's OIDC provider.
  id-token: write
  # Query pull request information through the PR API.
  pull-requests: read
  # Set commit statuses.
  statuses: write
concurrency:
  # A newer run in the same group cancels the one still in progress.
  cancel-in-progress: true
  # The group name is built from three parts:
  #   1. the workflow name,
  #   2. the event type,
  #   3. an identifier that depends on the event type:
  #        - schedule:          the commit SHA
  #        - workflow_dispatch: the PR number
  #
  # Each type of testing therefore gets its own group, and re-running the
  # same test cancels the previous run.
  group: |
    ${{ github.workflow }}
    ${{ github.event_name }}
    ${{
      (github.event_name == 'schedule' && github.sha) ||
      (github.event_name == 'workflow_dispatch' && github.event.inputs.PR-number)
    }}
# Workflow-wide settings, read by the steps below through `env.<name>`.
# Each `# renovate:` marker is kept directly above the value it annotates.
env:
  # Go toolchain version passed to actions/setup-go.
  # renovate: datasource=golang-version depName=go
  go_version: 1.22.5
  # gcloud CLI version passed to setup-gcloud.
  # renovate: datasource=docker depName=google/cloud-sdk
  gcloud_version: 483.0.0
  # cilium/scaffolding commit: used both as the checkout ref and as the
  # cmapisrv-mock image tag.
  # renovate: datasource=git-refs depName=https://github.com/cilium/scaffolding branch=main
  cmapisrv_mock_ref: 40f01dfc4b5781d232bc2027ae50017cfb291a01
  # Test identifier: prefixes the cluster name and labels the exported results.
  test_name: scale-clustermesh
  # Per-run suffix, so that every run (and re-run attempt) gets its own cluster.
  cluster_name: ${{ github.run_id }}-${{ github.run_attempt }}
  # Number of remote clusters simulated by the Cluster Mesh API Server Mock.
  mock_clusters: 250
jobs:
  # Prints the workflow_dispatch inputs to the job log; skipped for any other
  # event type.
  echo-inputs:
    if: ${{ github.event_name == 'workflow_dispatch' }}
    name: Echo Workflow Dispatch Inputs
    runs-on: ubuntu-latest
    steps:
      - name: Echo Workflow Dispatch Inputs
        # The inputs reach the shell through an environment variable instead
        # of being expanded into the script text, so that a quote inside an
        # input value cannot end the string and be executed as shell code.
        env:
          DISPATCH_INPUTS: ${{ toJSON(inputs) }}
        run: |
          echo "${DISPATCH_INPUTS}"

  # Sets the initial commit status on the SHA under test.
  commit-status-start:
    name: Commit Status Start
    runs-on: ubuntu-latest
    steps:
      - name: Set initial commit status
        uses: myrotvorets/set-commit-status-action@3730c0a348a2ace3c110851bed53331bc6406e9f # v2.0.1
        with:
          sha: ${{ inputs.SHA || github.sha }}

  # Creates a kops cluster on GCE, installs Cilium built from the SHA under
  # test, connects it to the mocked remote clusters, runs ClusterLoader2 and
  # finally exports the results and tears the cluster down.
  install-and-test:
    runs-on: ubuntu-latest
    name: Install and Cluster Mesh Scale Test
    timeout-minutes: 60
    steps:
      - name: Checkout context ref (trusted)
        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7
        with:
          ref: ${{ inputs.context-ref || github.sha }}
          persist-credentials: false

      - name: Set Environment Variables
        uses: ./.github/actions/set-env-variables

      - name: Set up job variables
        id: vars
        # The dispatch input is passed through the environment rather than
        # expanded into the script, so its content is never parsed as shell
        # code. GITHUB_EVENT_NAME and GITHUB_SHA are provided by the runner.
        env:
          DISPATCH_SHA: ${{ inputs.SHA }}
        run: |
          if [ "${GITHUB_EVENT_NAME}" = "workflow_dispatch" ] ; then
            SHA="${DISPATCH_SHA}"
          else
            SHA="${GITHUB_SHA}"
          fi

          # Adding k8s.local to the end makes kops happy, as it has
          # stricter DNS naming requirements.
          CLUSTER_NAME="${{ env.test_name }}-${{ env.cluster_name }}.k8s.local"

          echo "SHA=${SHA}" >> "$GITHUB_OUTPUT"
          echo "CLUSTER_NAME=${CLUSTER_NAME}" >> "$GITHUB_OUTPUT"

      # Polls the registry every 45 seconds until all three CI images tagged
      # with the SHA under test can be inspected.
      - name: Wait for images to be available
        timeout-minutes: 30
        shell: bash
        run: |
          for image in cilium-ci operator-generic-ci clustermesh-apiserver-ci ; do
            until docker manifest inspect quay.io/${{ env.QUAY_ORGANIZATION_DEV }}/$image:${{ steps.vars.outputs.SHA }} &> /dev/null; do sleep 45s; done
          done

      - name: Install Go
        uses: actions/setup-go@0a12ed9d6a96ab950c8f026ed9f722fe0da7ef32 # v5.0.2
        with:
          go-version: ${{ env.go_version }}

      - name: Install Kops
        uses: cilium/scale-tests-action/install-kops@c3a2f10946bf94430b40fd9e3ad40834add6d555 # main

      - name: Setup gcloud credentials
        uses: google-github-actions/auth@71fee32a0bb7e97b4d33d548e7d957010649d8fa # v2.1.3
        with:
          workload_identity_provider: ${{ secrets.GCP_PERF_WORKLOAD_IDENTITY_PROVIDER }}
          service_account: ${{ secrets.GCP_PERF_SA }}
          create_credentials_file: true
          export_environment_variables: true

      - name: Setup gcloud CLI
        uses: google-github-actions/setup-gcloud@98ddc00a17442e89a24bbf282954a3b65ce6d200 # v2.1.0
        with:
          project_id: ${{ secrets.GCP_PERF_PROJECT_ID }}
          version: ${{ env.gcloud_version }}

      - name: Clone ClusterLoader2
        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7
        with:
          repository: kubernetes/perf-tests
          # Avoid using renovate to update this dependency because: (1)
          # perf-tests does not tag or release, so renovate will pull
          # all updates to the default branch and (2) continually
          # updating CL2 may impact the stability of the scale test
          # results.
          ref: 6eb52ac89d5de15a0ad13cfeb2b2026e57ce4f64
          persist-credentials: false
          sparse-checkout: clusterloader2
          path: perf-tests

      - name: Clone the Cluster Mesh API Server Mock
        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7
        with:
          repository: cilium/scaffolding
          ref: ${{ env.cmapisrv_mock_ref }}
          persist-credentials: false
          sparse-checkout: cmapisrv-mock
          path: scaffolding

      - name: Deploy cluster
        id: deploy-cluster
        uses: cilium/scale-tests-action/create-cluster@c3a2f10946bf94430b40fd9e3ad40834add6d555 # main
        timeout-minutes: 30
        with:
          cluster_name: ${{ steps.vars.outputs.CLUSTER_NAME }}
          control_plane_size: n2-standard-8
          control_plane_count: 1
          node_size: n2-standard-8
          node_count: 1
          node_cidr: 100.0.0.0/16
          kops_state: ${{ secrets.GCP_PERF_KOPS_STATE_STORE }}
          project_id: ${{ secrets.GCP_PERF_PROJECT_ID }}
          kube_proxy_enabled: false

      - name: Install Cilium CLI
        uses: cilium/cilium-cli@e386af2b9f500e4c40436ac660cd6602da104fc7 # v0.16.14
        with:
          skip-build: ${{ env.CILIUM_CLI_SKIP_BUILD }}
          image-repo: ${{ env.CILIUM_CLI_IMAGE_REPO }}
          image-tag: ${{ env.CILIUM_CLI_VERSION }}

      - name: Display version info of installed tools
        run: |
          echo "--- go ---"
          go version
          echo "--- cilium-cli ---"
          cilium version --client
          echo "--- kops ---"
          ./kops version
          echo "--- gcloud ---"
          gcloud version

      - name: Setup firewall rules
        uses: cilium/scale-tests-action/setup-firewall@c3a2f10946bf94430b40fd9e3ad40834add6d555 # main
        with:
          cluster_name: ${{ steps.vars.outputs.CLUSTER_NAME }}

      # Warning: since this is a privileged workflow, subsequent workflow job
      # steps must take care not to execute untrusted code.
      - name: Checkout pull request branch (NOT TRUSTED)
        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7
        with:
          ref: ${{ steps.vars.outputs.SHA }}
          persist-credentials: false
          path: untrusted
          sparse-checkout: |
            install/kubernetes/cilium

      - name: Install Cilium
        run: |
          # * Increase the node BPF map size to account for the total number of nodes.
          # * Disable health checking, as mocked nodes are unreachable.
          cilium install \
            --chart-directory=untrusted/install/kubernetes/cilium \
            --set image.override=quay.io/${{ env.QUAY_ORGANIZATION_DEV }}/cilium-ci:${{ steps.vars.outputs.SHA }} \
            --set operator.image.override=quay.io/${{ env.QUAY_ORGANIZATION_DEV }}/operator-generic-ci:${{ steps.vars.outputs.SHA }} \
            --set clustermesh.apiserver.image.override=quay.io/${{ env.QUAY_ORGANIZATION_DEV }}/clustermesh-apiserver-ci:${{ steps.vars.outputs.SHA }} \
            --set ipam.mode=kubernetes \
            --set kubeProxyReplacement=true \
            --set k8sServiceHost=api.internal.${{ steps.vars.outputs.CLUSTER_NAME }} \
            --set k8sServicePort=443 \
            --set pprof.enabled=true \
            --set prometheus.enabled=true \
            --set cluster.name=${{ env.test_name }}-${{ env.cluster_name }} \
            --set cluster.id=255 \
            --set operator.replicas=1 \
            --set operator.nodeSelector.node-role\\.kubernetes\\.io/control-plane= \
            --set bpf.nodeMapMax=65536 \
            --set healthChecking=false \
            --set endpointHealthChecking.enabled=false

      # This step must be run after installing Cilium, as it requires
      # system pods (e.g., coredns) to be running.
      - name: Wait for cluster to be ready
        uses: cilium/scale-tests-action/validate-cluster@c3a2f10946bf94430b40fd9e3ad40834add6d555 # main
        timeout-minutes: 20
        with:
          cluster_name: ${{ steps.vars.outputs.CLUSTER_NAME }}
          kops_state: ${{ secrets.GCP_PERF_KOPS_STATE_STORE }}

      - name: Wait for Cilium status to be ready
        run: |
          cilium status --wait

      - name: Setup CL2
        run: |
          # CL2 needs ssh access to control plane nodes
          gcloud compute config-ssh

          # Copy the custom configs to the folder where CL2 expects them.
          cp -r .github/actions/cl2-modules ./perf-tests/clusterloader2/testing/custom

      - name: Run CL2 to setup prometheus
        working-directory: ./perf-tests/clusterloader2
        # Selecting bash explicitly makes GitHub run the script with
        # `-o pipefail`; otherwise the pipeline below would report the exit
        # status of `tee` and a clusterloader failure would go unnoticed.
        shell: bash
        env:
          CL2_PROMETHEUS_PVC_ENABLED: "false"
          CL2_PROMETHEUS_SCRAPE_CILIUM_OPERATOR: "true"
          CL2_PROMETHEUS_SCRAPE_CILIUM_AGENT: "true"
          # Quoted like the other values, so that the variable holds exactly
          # this text instead of a re-serialized number.
          CL2_PROMETHEUS_MEMORY_SCALE_FACTOR: "4.0"
        timeout-minutes: 10
        run: |
          # Don't run any tasks at this point, just setup the monitoring stack
          go run ./cmd/clusterloader.go \
            -v=2 \
            --testconfig=./testing/custom/clustermesh/setup.yaml \
            --testoverrides=./testing/prometheus/not-scrape-kube-proxy.yaml \
            --provider=gce \
            --enable-prometheus-server \
            --tear-down-prometheus-server=false \
            --kubeconfig=$HOME/.kube/config \
            2>&1 | tee cl2-setup.txt

      - name: Deploy the Cluster Mesh API Server Mock
        run: |
          # Arguments containing [0] are quoted so that the shell never treats
          # them as glob patterns.
          helm install cmapisrv-mock \
            ./scaffolding/cmapisrv-mock/deploy/cmapisrv-mock \
            --namespace kube-system \
            --set image.repository=quay.io/cilium/cmapisrv-mock \
            --set image.tag=${{ env.cmapisrv_mock_ref }} \
            --set nodeSelector.node-role\\.kubernetes\\.io/control-plane= \
            --set 'tolerations[0].key=node-role.kubernetes.io/control-plane' \
            --set 'tolerations[0].operator=Exists' \
            --set config.ipv6=false \
            --set config.clusters=${{ env.mock_clusters }} \
            --set config.nodes=100 \
            --set config.nodesQPS=0.1 \
            --set config.identities=100 \
            --set config.identitiesQPS=0.2 \
            --set config.endpoints=1000 \
            --set config.endpointsQPS=1 \
            --set config.services=0 \
            --set config.servicesQPS=0 \
            --set serviceMonitor=true

          kubectl -n kube-system wait --for=condition=Ready pod \
            -l app.kubernetes.io/name=cmapisrv-mock --timeout=300s

      - name: Enable KVStoreMesh and configure Cilium to connect to the Cluster Mesh API Server Mock
        run: |
          # Generate a Helm values file with one remote cluster entry per
          # mocked cluster, all pointing at the cmapisrv-mock service. The
          # leading spaces emitted by printf nest every entry under
          # clustermesh.config.clusters.
          cat<<EOF > values-clustermesh-config.yaml
          clustermesh:
            config:
              enabled: true
              clusters:
          EOF

          for i in $(seq 1 ${{ env.mock_clusters }}); do
            printf "      - name: cluster-%03d\n" "${i}"
            printf "        address: cmapisrv-mock.kube-system.svc\n"
            printf "        port: 2379\n"
          done >> values-clustermesh-config.yaml

          # * We enable KVStoreMesh only at this point to leverage the bootstrap QPS
          #   and speed-up the overall bootstrap process.
          # * Increase the KVStoreMesh QPS to match that of the cmapisrv-mock;
          #   this is not a problem considering the limited number of watchers.
          # * Store etcd data directly in memory, for improved performance.
          cilium upgrade --reuse-values \
            --chart-directory=untrusted/install/kubernetes/cilium \
            --set clustermesh.useAPIServer=true \
            --set clustermesh.apiserver.etcd.storageMedium=Memory \
            --set clustermesh.apiserver.kvstoremesh.enabled=true \
            --set 'clustermesh.apiserver.kvstoremesh.extraArgs[0]=--kvstore-opt=etcd.qps=1000' \
            --set clustermesh.apiserver.nodeSelector.node-role\\.kubernetes\\.io/control-plane= \
            --set 'clustermesh.apiserver.tolerations[0].key=node-role.kubernetes.io/control-plane' \
            --set 'clustermesh.apiserver.tolerations[0].operator=Exists' \
            --set clustermesh.apiserver.metrics.serviceMonitor.enabled=true \
            --values values-clustermesh-config.yaml

          cilium status --wait
          cilium clustermesh status --wait --wait-duration=5m

      - name: Run CL2
        id: run-cl2
        working-directory: ./perf-tests/clusterloader2
        # Selecting bash explicitly makes GitHub run the script with
        # `-o pipefail`, so that a failing clusterloader run fails this step
        # even though its output is piped through `tee`. The sysdump, cleanup
        # and export steps below still run, as they are guarded by always().
        shell: bash
        env:
          CL2_PROMETHEUS_PVC_ENABLED: "false"
          CL2_PROMETHEUS_SCRAPE_CILIUM_OPERATOR: "true"
          CL2_PROMETHEUS_SCRAPE_CILIUM_AGENT: "true"
          CL2_PROMETHEUS_MEMORY_SCALE_FACTOR: "4.0"
        timeout-minutes: 30
        run: |
          go run ./cmd/clusterloader.go \
            -v=2 \
            --testconfig=./testing/custom/clustermesh/config.yaml \
            --testoverrides=./testing/prometheus/not-scrape-kube-proxy.yaml \
            --provider=gce \
            --nodes=1 \
            --enable-prometheus-server \
            --tear-down-prometheus-server=false \
            --report-dir=./report \
            --experimental-prometheus-snapshot-to-report-dir=true \
            --kubeconfig=$HOME/.kube/config \
            2>&1 | tee cl2-output.txt

      # Collected whenever the CL2 step actually ran, whether or not it passed.
      - name: Get sysdump
        if: ${{ always() && steps.run-cl2.outcome != 'skipped' && steps.run-cl2.outcome != 'cancelled' }}
        run: |
          cilium status
          cilium sysdump --output-filename cilium-sysdump-final

      # Runs whenever the deploy step was attempted, so that a cluster is not
      # left behind after a failure.
      - name: Cleanup cluster
        if: ${{ always() && steps.deploy-cluster.outcome != 'skipped' }}
        uses: cilium/scale-tests-action/cleanup-cluster@c3a2f10946bf94430b40fd9e3ad40834add6d555 # main
        with:
          cluster_name: ${{ steps.vars.outputs.CLUSTER_NAME }}
          kops_state: ${{ secrets.GCP_PERF_KOPS_STATE_STORE }}

      - name: Export results and sysdump to GS bucket
        if: ${{ always() && steps.run-cl2.outcome != 'skipped' && steps.run-cl2.outcome != 'cancelled' }}
        uses: cilium/scale-tests-action/export-results@c3a2f10946bf94430b40fd9e3ad40834add6d555 # main
        with:
          test_name: ${{ env.test_name }}
          results_bucket: ${{ env.GCP_PERF_RESULTS_BUCKET }}
          artifacts: ./perf-tests/clusterloader2/report/*
          other_files: cilium-sysdump-final.zip ./perf-tests/clusterloader2/cl2-output.txt

  # Publishes the result of install-and-test as the commit status. always()
  # keeps this job running when install-and-test failed or was cancelled.
  commit-status-final:
    if: ${{ always() }}
    name: Commit Status Final
    needs: install-and-test
    runs-on: ubuntu-latest
    steps:
      - name: Set final commit status
        uses: myrotvorets/set-commit-status-action@3730c0a348a2ace3c110851bed53331bc6406e9f # v2.0.1
        with:
          sha: ${{ inputs.SHA || github.sha }}
          status: ${{ needs.install-and-test.result }}