Skip to content

Commit

Permalink
test: performance test CI work (#8761)
Browse files Browse the repository at this point in the history
  • Loading branch information
NicholasBlaskey authored Feb 1, 2024
1 parent 36a2e29 commit 40a70cf
Show file tree
Hide file tree
Showing 8 changed files with 571 additions and 211 deletions.
31 changes: 31 additions & 0 deletions .circleci/devcluster/perftest.devcluster.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
# Devcluster configuration used by the CI performance-test job.
# It defines a single `master` stage; no agent stage is declared in this file.
stages:
  - master:
      # Commands run before the master stage starts.
      pre:
        - sh: make -C tools prep-root

      # Master configuration file contents.
      config_file:
        # Database connection. Credentials are written as $VARS and are
        # presumably substituted from the environment -- TODO confirm how
        # devcluster expands them.
        db:
          host: localhost # Host is localhost since we connect through ssh forwarding.
          port: 5432
          user: $PERF_DB_USER
          password: $PERF_DB_PASS
          name: postgres
          ssl_mode: require

        # Checkpoints go to a shared filesystem rooted at /tmp.
        checkpoint_storage:
          type: shared_fs
          host_path: /tmp
          storage_path: determined-cp

        log:
          level: debug

        root: tools/build

        cache:
          cache_dir: /tmp/determined-cache

        launch_error: false

        security:
          authz:
            rbac_ui_enabled: true

        # Agent resource manager with one pool, `default`, used for both
        # aux and compute workloads.
        resource_manager:
          type: agent
          default_aux_resource_pool: default
          default_compute_resource_pool: default

        resource_pools:
          - pool_name: default
201 changes: 201 additions & 0 deletions .circleci/real_config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2132,6 +2132,172 @@ jobs:
path: /tmp/priority_scheduler
destination: devcluster-priority_scheduler-logs

# Performance-test job.
#
# Flow: optionally restore a throwaway RDS instance from a snapshot, tunnel to
# the database through a bastion host, start a devcluster master against it,
# optionally snapshot the database after migrations, run the performance tests,
# upload the results, and finally delete the throwaway instance.
test-perf:
  parameters:
    # Gates the "Take and wait for RDS snapshot" step.
    snapshot-after-migrations:
      type: boolean
      default: false
    # Gates the RDS restore steps and the final "Delete RDS instance" step.
    deploy-db:
      type: boolean
      default: false
  machine:
    image: <<pipeline.parameters.machine-image>>
  resource_class: xlarge
  steps:
    - queue/until_front_of_line:
        only-on-branch: main
        time: "120" # Wait two hours at most. Adjust this over time.
    - checkout
    - attach_workspace:
        at: .
    - setup-python-venv:
        executor: <<pipeline.parameters.machine-image>>
    - install-devcluster
    - run:
        name: Install upload deps
        command: tools/scripts/retry.sh pip install requests determined psycopg2-binary

    - when:
        condition: <<parameters.deploy-db>>
        steps:
          - run:
              name: Select snapshot to use
              command: |
                # Default to the base snapshot; a later export of the same
                # variable in $BASH_ENV overrides this one.
                echo 'export PERF_SNAPSHOT_TO_USE="perf-test-base-snapshot"' >> "$BASH_ENV"
                SNAPSHOT_COMMITS=$(aws rds describe-db-snapshots \
                  --region="us-west-2" \
                  --query "DBSnapshots[?TagList[?Key=='ci-snapshot']].DBSnapshotIdentifier" \
                  --output json | jq -r '.[] | split("-")[3]')
                echo "Snapshot commits (${SNAPSHOT_COMMITS})"
                # Walk the 1001 most recent commits, newest first, and pick the
                # first one that has a snapshot. A single `git log` call avoids
                # an empty COMMIT (which would match anything) when the history
                # is shorter than the search window.
                for COMMIT in $(git log --format="%H" -n 1001); do
                  if [[ " $SNAPSHOT_COMMITS " =~ .*"$COMMIT".* ]]; then
                    echo "export PERF_SNAPSHOT_TO_USE=\"ci-snapshot-commit-${COMMIT}\"" >> "$BASH_ENV"
                    break
                  fi
                done
                source "$BASH_ENV"
                echo "Deciding to use $PERF_SNAPSHOT_TO_USE"
          - run:
              name: Wait for snapshot to be available
              command: |
                aws rds wait db-snapshot-available \
                  --region "us-west-2" \
                  --db-snapshot-identifier "${PERF_SNAPSHOT_TO_USE}"
          - run:
              name: Deploy database
              command: |
                aws rds restore-db-instance-from-db-snapshot \
                  --region="us-west-2" \
                  --db-snapshot-identifier="${PERF_SNAPSHOT_TO_USE}" \
                  --db-instance-identifier="ci-perf-db-${CIRCLE_BUILD_NUM}" \
                  --no-multi-az \
                  --no-publicly-accessible \
                  --no-auto-minor-version-upgrade \
                  --db-parameter-group-name="logquerieslong" \
                  --tags "Key=ci-snapshot" \
                  --vpc-security-group-ids="${PERF_DB_SECURITY_GROUP_ID}"
              no_output_timeout: 30m
          # Wait BEFORE reading the endpoint: an instance that is still being
          # created may not report an endpoint address yet.
          - run:
              name: Wait for database to be ready
              command: |
                aws rds wait db-instance-available \
                  --region="us-west-2" \
                  --db-instance-identifier="ci-perf-db-${CIRCLE_BUILD_NUM}"
              # The waiter prints nothing while it polls, so raise the limit
              # above CircleCI's default no-output timeout.
              no_output_timeout: 30m
          - run:
              name: Get db instance host
              command: |
                DB_HOST=$(aws rds describe-db-instances \
                  --region us-west-2 \
                  --db-instance-identifier "ci-perf-db-${CIRCLE_BUILD_NUM}" \
                  --query "DBInstances[0].Endpoint.Address" \
                  --output text)
                # `--output text` prints "None" when the queried field is missing.
                if [[ -z "${DB_HOST}" || "${DB_HOST}" == "None" ]]; then
                  echo "Could not determine endpoint for ci-perf-db-${CIRCLE_BUILD_NUM}" >&2
                  exit 1
                fi
                # PERF_DB_HOST is what the port-forward step below reads, so it
                # must point at the freshly restored instance. RDS_HOST is still
                # exported for anything that relied on the previous name.
                echo "export PERF_DB_HOST=\"${DB_HOST}\"" >> "$BASH_ENV"
                echo "export RDS_HOST=\"${DB_HOST}\"" >> "$BASH_ENV"
                source "$BASH_ENV"
                echo "perf db host ${PERF_DB_HOST}"

    - run:
        name: Add SSH key
        command: echo "${PERF_DB_BASTION_SSH_KEY}" | base64 --decode | ssh-add -
    - run:
        name: Port forward to bastion instance
        # Forwards local port 5432 to the database through the bastion, in the
        # background (-f) and without running a remote command (-N).
        command: ssh -L "5432:${PERF_DB_HOST}:5432" -N -f "ubuntu@${PERF_DB_BASTION_HOST}"
    - start-devcluster:
        target-stage: master
        devcluster-config: perftest.devcluster.yaml
    - run:
        name: Wait and record any migrations ran
        command: python .circleci/scripts/wait_for_perf_migration_upload_results.py

    - when:
        condition: <<parameters.snapshot-after-migrations>>
        steps:
          - run:
              name: Take and wait for RDS snapshot, only on main and when migrations were applied
              command: |
                # The marker file is presumably written by the migration wait
                # script above when nothing was migrated -- TODO confirm.
                if [ -f /tmp/no-migrations-needed ]; then
                  echo "/tmp/no-migrations-needed exists, no need to take a snapshot"
                  exit 0
                fi
                COMMIT=$(git log -1 --pretty=format:%H)
                echo "Taking snapshot"
                aws rds create-db-snapshot \
                  --region="us-west-2" \
                  --db-instance-identifier="${PERF_DB_AWS_NAME}" \
                  --db-snapshot-identifier="ci-snapshot-commit-${COMMIT}" \
                  --tags "Key=ci-snapshot"
                echo "Snapshot taken now waiting for it to become completed"
                aws rds wait db-snapshot-completed \
                  --region="us-west-2" \
                  --db-snapshot-identifier="ci-snapshot-commit-${COMMIT}"
                echo "Snapshot completed"

    - run:
        name: Build performance test Docker image
        command: make -C performance build
    - run:
        name: Run performance test
        command: |
          export PERF_DOCKER_FLAGS="--network=host"
          export PERF_K6_FLAGS='-e DET_ADMIN_USERNAME="admin" \
            -e DET_ADMIN_PASSWORD="" \
            -e model_name="tnjpuojqzbluqiyyqilftulsw" \
            -e model_version_number="1" \
            -e trial_id="8282" \
            -e experiment_id="100" \
            -e task_id="backported.8282" \
            -e metric_name="85c9" \
            -e metric_type="METRIC_TYPE_TRAINING" \
            -e batches="1800" \
            -e batches_margin="99" \
            -e resource_pool="default"'
          make -C performance run
    - run:
        name: Upload result of performance test to Postgres result db
        command: python .circleci/scripts/upload_perf_results.py ./performance/reports/latest.results.json

    - when:
        condition: <<parameters.deploy-db>>
        steps:
          - run:
              name: Delete RDS instance
              # `when: always` belongs on the run step, not on the conditional
              # step: this makes the cleanup run even after an earlier step
              # failed, so the throwaway instance is not leaked.
              when: always
              command: |
                aws rds delete-db-instance \
                  --region="us-west-2" \
                  --db-instance-identifier="ci-perf-db-${CIRCLE_BUILD_NUM}" \
                  --skip-final-snapshot
    # NOTE(review): placed as a job-level step rather than inside the deploy-db
    # condition, since it only reports for `main` -- confirm against the
    # original nesting.
    - slack/status:
        fail_only: false
        only_for_branches: main
        failure_message: ':thisisfine: A \`${CIRCLE_JOB}\` job on branch \`${CIRCLE_BRANCH}\` has failed!'
        mentions: "U03CP4ZKY2D" # Ping Nick Blaskey for now. Eventually switch this to perf team.

deploy:
parameters:
compute-agent-instance-type:
Expand Down Expand Up @@ -2752,6 +2918,20 @@ workflows:
target-stage: agent
wait-for-master: false

# Perf test run for `main`: reuses an existing database (deploy-db is false)
# and requests a snapshot after migrations.
- test-perf:
    name: test-perf
    snapshot-after-migrations: true
    deploy-db: false
    requires:
      - build-go
    context:
      - perf-tests
      - aws
    filters:
      branches:
        only:
          - main

- deploy:
name: deploy-latest-master-cluster
enable-cors: true
Expand Down Expand Up @@ -3125,6 +3305,27 @@ workflows:
aux-agent-instance-type: ["m5.large"]
max-dynamic-agents: [2]

# Perf tests.
# Manual approval gate; the jobs below list it in `requires`.
- request-perf-tests:
    type: approval
    filters: *upstream-feature-branch

- build-go:
    requires:
      - request-perf-tests

# Feature-branch perf run: restores its own database (deploy-db is true) and
# does not snapshot it afterwards.
- test-perf:
    name: test-perf-feature-branch
    snapshot-after-migrations: false
    deploy-db: true
    requires:
      - build-go
      - request-perf-tests
    context:
      - perf-tests
      - aws
    filters: *upstream-feature-branch

# Nightly tests
- request-gpu-nightly:
type: approval
Expand Down
Loading

0 comments on commit 40a70cf

Please sign in to comment.