# .github/workflows/workflow-run-forge.yaml

# Display name of the workflow. Quoted because a plain scalar starting with `*`
# would be read as a YAML alias.
name: '*run Forge reusable workflow'

on:
  # Invoked from the build-images.yaml workflow, because the images must have been built before this runs.
  workflow_call:
    # Every input is optional. Keys are listed in a uniform order:
    # description, required, type, default.
    inputs:
      GIT_SHA:
        description: The git SHA1 to test. If not specified, Forge will check the latest commits on the current branch
        required: false
        type: string
      IMAGE_TAG:
        description: The docker image tag to test. If not specified, falls back on GIT_SHA, and then to the latest commits on the current branch
        required: false
        type: string
      FORGE_IMAGE_TAG:
        description: The docker image tag to use for forge runner. If not specified, falls back on GIT_SHA, and then to the latest commits on the current branch
        required: false
        type: string
      FORGE_NAMESPACE:
        description: The Forge k8s namespace to be used for test. This value should manage Forge test concurrency. It may be truncated.
        required: false
        type: string
      FORGE_CLUSTER_NAME:
        description: The Forge k8s cluster to be used for test
        required: false
        type: string
      FORGE_RUNNER_DURATION_SECS:
        description: Duration of the forge test run
        required: false
        type: number
        default: 480
      FORGE_TEST_SUITE:
        description: Test suite to run
        required: false
        type: string
        default: land_blocking
      POST_TO_SLACK:
        description: Whether to post the test results comment to Slack
        required: false
        type: boolean
        default: false
      TIMEOUT_MINUTES:
        description: Github job timeout in minutes
        required: false
        type: number
        default: 360
      FORGE_ENABLE_FAILPOINTS:
        description: Whether to use failpoints images
        required: false
        type: string
      FORGE_ENABLE_HAPROXY:
        description: Whether to use HAPRoxy
        required: false
        type: string
      FORGE_ENABLE_PERFORMANCE:
        description: Whether to use performance images
        required: false
        type: string
      FORGE_RETAIN_DEBUG_LOGS:
        description: Retain debug logs for all nodes
        required: false
        type: boolean
      COMMENT_HEADER:
        # Folded block scalar: the two lines below are joined with a single space.
        description: >-
          A unique ID for Forge sticky comment on your PR. See
          https://github.com/marocchino/sticky-pull-request-comment#keep-more-than-one-comment
        required: false
        type: string
        default: forge
      SKIP_JOB:
        description: Set to true to skip this job. Useful for PRs that don't require this workflow.
        required: false
        type: boolean
        default: false
      FORGE_NUM_VALIDATORS:
        description: Number of validators to use for the forge test
        required: false
        type: string
      FORGE_NUM_VALIDATOR_FULLNODES:
        description: Number of validator fullnodes to use for the forge test
        required: false
        type: string
      FORGE_ENABLE_INDEXER:
        description: Whether to use indexer
        required: false
        type: boolean
      FORGE_DEPLOYER_PROFILE:
        description: The deployer profile used to spin up and configure forge infrastructure
        required: false
        type: string
      SEND_RESULTS_TO_TRUNK:
        description: Send forge results to trunk.io
        required: false
        type: boolean

# Workflow-wide environment, visible to every step of every job below.
env:
  # Values sourced from repository/organisation secrets.
  AWS_ACCOUNT_NUM: ${{ secrets.ENV_ECR_AWS_ACCOUNT_NUM }}
  AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
  AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
  GCP_WORKLOAD_IDENTITY_PROVIDER: ${{ secrets.GCP_WORKLOAD_IDENTITY_PROVIDER }}
  GCP_SERVICE_ACCOUNT_EMAIL: ${{ secrets.GCP_SERVICE_ACCOUNT_EMAIL }}
  FORGE_BLOCKING: ${{ secrets.FORGE_BLOCKING }}

  # Fixed values.
  AWS_REGION: us-west-2
  FORGE_RUNNER_MODE: k8s
  FORGE_OUTPUT: forge_output.txt
  FORGE_REPORT: forge_report.json
  FORGE_COMMENT: forge_comment.txt
  FORGE_PRE_COMMENT: forge_pre_comment.txt
  VERBOSE: true

  # Values forwarded unchanged from the workflow_call inputs.
  IMAGE_TAG: ${{ inputs.IMAGE_TAG }}
  FORGE_IMAGE_TAG: ${{ inputs.FORGE_IMAGE_TAG }}
  FORGE_CLUSTER_NAME: ${{ inputs.FORGE_CLUSTER_NAME }}
  FORGE_NAMESPACE: ${{ inputs.FORGE_NAMESPACE }}
  FORGE_DEPLOYER_PROFILE: ${{ inputs.FORGE_DEPLOYER_PROFILE }}
  FORGE_TEST_SUITE: ${{ inputs.FORGE_TEST_SUITE }}
  FORGE_RUNNER_DURATION_SECS: ${{ inputs.FORGE_RUNNER_DURATION_SECS }}
  FORGE_NUM_VALIDATORS: ${{ inputs.FORGE_NUM_VALIDATORS }}
  FORGE_NUM_VALIDATOR_FULLNODES: ${{ inputs.FORGE_NUM_VALIDATOR_FULLNODES }}
  FORGE_ENABLE_HAPROXY: ${{ inputs.FORGE_ENABLE_HAPROXY }}
  FORGE_ENABLE_INDEXER: ${{ inputs.FORGE_ENABLE_INDEXER }}
  FORGE_ENABLE_FAILPOINTS: ${{ inputs.FORGE_ENABLE_FAILPOINTS }}
  FORGE_ENABLE_PERFORMANCE: ${{ inputs.FORGE_ENABLE_PERFORMANCE }}
  FORGE_RETAIN_DEBUG_LOGS: ${{ inputs.FORGE_RETAIN_DEBUG_LOGS }}
  POST_TO_SLACK: ${{ inputs.POST_TO_SLACK }}
  COMMENT_HEADER: ${{ inputs.COMMENT_HEADER }}
  # Disabled together with the commented-out "Upload results" step at the end of this file.
  # FORGE_JUNIT_XML_PATH: ${{ inputs.SEND_RESULTS_TO_TRUNK && '/tmp/test.xml' || '' }}

# TODO: should we migrate this to a composite action, so that we can skip it
# at the call site, and don't need to wrap each step in an if statement?
jobs:
  forge:
    # Runner label carrying a comma-separated key=value spec (cpu, ram, family, image, ...).
    # NOTE(review): presumably interpreted by the self-hosted runner provider — confirm.
    # Quoted because the value embeds a templated expression.
    runs-on: 'runs-on,cpu=4,ram=16,family=m7a+m7i-flex,image=aptos-ubuntu-x64,run-id=${{ github.run_id }},spot=co'
    # Job timeout comes from the TIMEOUT_MINUTES input (default 360).
    timeout-minutes: ${{ inputs.TIMEOUT_MINUTES }}
    # Every step carries its own SKIP_JOB guard; see the TODO above this job.
    steps:
      # Check out the commit under test (the default ref when GIT_SHA is empty).
      - uses: actions/checkout@v4
        if: ${{ !inputs.SKIP_JOB }}
        with:
          ref: ${{ inputs.GIT_SHA }}
          # Fetch the full history (depth 0) when GIT_SHA is specified, otherwise
          # only the last 10 commits.
          # The value must be wrapped in an expression to be evaluated at all, and
          # the two branches are quoted strings because a bare `0` is falsy in
          # GitHub expressions, which would make `&& 0 || 10` always yield 10.
          fetch-depth: ${{ inputs.GIT_SHA != '' && '0' || '10' }}

      # Make a Python interpreter available for the steps below.
      - if: ${{ !inputs.SKIP_JOB }}
        uses: actions/setup-python@v4

      # Python packages installed at pinned versions.
      - name: Install python deps
        if: ${{ !inputs.SKIP_JOB }}
        run: pip3 install click==8.1.3 psutil==5.9.1

      # Derive how long the cloud auth has to stay valid from the Forge run length:
      # runs of at most 5400 seconds (90 minutes) use 5400 seconds; longer runs get
      # the run length plus a 30 minute buffer. The result is published as the
      # `auth_duration` step output.
      - name: Calculate Forge Auth Duration
        id: calculate-auth-duration
        if: ${{ !inputs.SKIP_JOB }}
        run: |
          default_auth_secs=5400
          buffer_secs=$(( 30 * 60 ))
          auth_duration=$(( $FORGE_RUNNER_DURATION_SECS > default_auth_secs ? $FORGE_RUNNER_DURATION_SECS + buffer_secs : default_auth_secs ))
          echo "auth_duration=${auth_duration}" >> $GITHUB_OUTPUT

      # Shared setup action; its CLOUDSDK_AUTH_ACCESS_TOKEN output is consumed by
      # the "Export GCloud auth token" step.
      - id: docker-setup
        if: ${{ !inputs.SKIP_JOB }}
        uses: aptos-labs/aptos-core/.github/actions/docker-setup@main
        with:
          # --- GCP ---
          GCP_WORKLOAD_IDENTITY_PROVIDER: ${{ secrets.GCP_WORKLOAD_IDENTITY_PROVIDER }}
          GCP_SERVICE_ACCOUNT_EMAIL: ${{ secrets.GCP_SERVICE_ACCOUNT_EMAIL }}
          # Lifetime computed by the calculate-auth-duration step.
          GCP_AUTH_DURATION: ${{ steps.calculate-auth-duration.outputs.auth_duration }}
          # docker-setup authenticates against the "aptos-ci" GCP project, while
          # kubectl in this workflow runs against "aptos-forge-gcp-0", to which the
          # "aptos-ci" service account has delegated access. Exported project
          # variables would keep pointing at "aptos-ci" and confuse the gcloud CLI,
          # so they are kept out of the environment; gcloud reads its configuration
          # from the file-system anyway.
          EXPORT_GCP_PROJECT_VARIABLES: "false"
          # --- AWS ---
          AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
          AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
          AWS_DOCKER_ARTIFACT_REPO: ${{ secrets.AWS_DOCKER_ARTIFACT_REPO }}
          # --- Git ---
          GIT_CREDENTIALS: ${{ secrets.GIT_CREDENTIALS }}

      # Persist the access token produced by docker-setup into GITHUB_ENV so that
      # later steps see it as CLOUDSDK_AUTH_ACCESS_TOKEN.
      - name: "Export GCloud auth token"
        if: ${{ !inputs.SKIP_JOB }}
        id: gcloud-auth
        shell: bash
        env:
          # The token reaches the script through the environment instead of being
          # spliced into the script text, so shell-special characters in the value
          # cannot alter the command.
          DOCKER_SETUP_ACCESS_TOKEN: ${{ steps.docker-setup.outputs.CLOUDSDK_AUTH_ACCESS_TOKEN }}
        run: echo "CLOUDSDK_AUTH_ACCESS_TOKEN=${DOCKER_SETUP_ACCESS_TOKEN}" >> "$GITHUB_ENV"

      # Point the gcloud CLI at the project the Forge cluster commands run against.
      - name: Setup GCloud project
        if: ${{ !inputs.SKIP_JOB }}
        run: gcloud config set project aptos-forge-gcp-0
        shell: bash

      # Report, in the step summary and as step outputs, whether validator/forge
      # images exist for each tag that was supplied by the caller.
      - name: Check for Forge images
        if: ${{ !inputs.SKIP_JOB }}
        id: check-forge-images
        shell: bash
        env:
          # Caller-supplied inputs are handed to the script through the environment
          # instead of being spliced into the script text, so a value containing
          # quotes or `$(...)` cannot be executed by the shell.
          GIT_SHA: ${{ inputs.GIT_SHA }}
          IMAGE_TAG: ${{ inputs.IMAGE_TAG }}
          FORGE_IMAGE_TAG: ${{ inputs.FORGE_IMAGE_TAG }}
        run: |
          # check_image_exists <tag> <image_name> <output_prefix>
          # Does nothing when <tag> is empty. Otherwise probes the registry for
          # <image_name>:<tag>, writes <output_prefix>_EXISTS (and, when found,
          # <output_prefix>_TAG) to the step outputs and adds a line to the summary.
          check_image_exists() {
            local tag=$1
            local image_name=$2
            local output_prefix=$3
            if [[ -n "$tag" ]]; then
              if docker manifest inspect "us-docker.pkg.dev/aptos-registry/docker/${image_name}:${tag}" &> /dev/null; then
                echo "${output_prefix}_EXISTS=true" >> "$GITHUB_OUTPUT"
                echo "${output_prefix}_TAG=$tag" >> "$GITHUB_OUTPUT"
                echo "✅ $image_name image found for tag: $tag" >> "$GITHUB_STEP_SUMMARY"
              else
                echo "${output_prefix}_EXISTS=false" >> "$GITHUB_OUTPUT"
                echo "⚠️ No $image_name image found for tag: $tag. The latest available image will be used instead." >> "$GITHUB_STEP_SUMMARY"
              fi
            fi
          }

          check_image_exists "$GIT_SHA" "validator" "VALIDATOR_GIT_SHA"
          check_image_exists "$IMAGE_TAG" "validator" "VALIDATOR_IMAGE_TAG"
          check_image_exists "$GIT_SHA" "forge" "FORGE_GIT_SHA"
          check_image_exists "$FORGE_IMAGE_TAG" "forge" "FORGE_IMAGE_TAG"

          echo "ℹ️ If testing Forge changes, ensure the 'CICD: build-images' label is added to your PR and the image build workflow has completed successfully." >> "$GITHUB_STEP_SUMMARY"

      # Same entry point as the main Forge run, with the runner mode overridden
      # to `pre-forge` for this step only (the workflow-wide value is `k8s`).
      - name: Run pre-Forge checks
        if: ${{ !inputs.SKIP_JOB }}
        env:
          FORGE_RUNNER_MODE: pre-forge
        run: testsuite/run_forge.sh
        shell: bash

      # Publish the file named by FORGE_PRE_COMMENT as a sticky comment. Skipped
      # when the triggering event carries no number (github.event.number is null).
      - name: Post pre-Forge comment
        if: ${{ !inputs.SKIP_JOB && github.event.number != null }}
        uses: marocchino/sticky-pull-request-comment@39c5b5dc7717447d0cba270cd115037d32d28443 # pinned to a full commit SHA
        with:
          header: ${{ env.COMMENT_HEADER }}
          path: ${{ env.FORGE_PRE_COMMENT }}
          # Hide the previous comment and add a comment at the end
          hide_and_recreate: true
          hide_classify: OUTDATED

      # Main Forge invocation; uses the workflow-wide environment as-is.
      - name: Run Forge
        if: ${{ !inputs.SKIP_JOB }}
        run: testsuite/run_forge.sh
        shell: bash

      # Publish the file named by FORGE_COMMENT as a sticky comment, provided the
      # run was not cancelled and the triggering event carries a number (i.e. we
      # are running on a PR).
      - name: Post forge result comment
        if: ${{ !inputs.SKIP_JOB && github.event.number != null && !cancelled() }}
        uses: marocchino/sticky-pull-request-comment@39c5b5dc7717447d0cba270cd115037d32d28443 # pinned to a full commit SHA
        with:
          header: ${{ env.COMMENT_HEADER }}
          path: ${{ env.FORGE_COMMENT }}
          hide_and_recreate: true
          hide_classify: OUTDATED

      - name: Post to a Slack channel on failure
        id: slack
        # Runs only when an earlier step has failed, the job is not skipped, and
        # POST_TO_SLACK is 'true'.
        if: ${{ !inputs.SKIP_JOB && env.POST_TO_SLACK == 'true' && failure() }}
        uses: slackapi/slack-github-action@936158bbe252e9a6062e793ea4609642c966e302 # pin@v1.21.0
        env:
          SLACK_WEBHOOK_URL: ${{ secrets.FORGE_SLACK_WEBHOOK_URL }}
        with:
          # JSON payload: a status emoji, the job id, the suite and namespace
          # inputs, and a link to this workflow run.
          payload: |
            {
              "text": "${{ job.status == 'success' && ':white_check_mark:' || ':x:' }} ${{ github.job }}(suite: `${{ inputs.FORGE_TEST_SUITE }}`, namespace: `${{ inputs.FORGE_NAMESPACE }}`): <${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|link>"
            }

      # Leave a log line when the job was skipped via SKIP_JOB.
      - if: ${{ inputs.SKIP_JOB }}
        run: echo "Skipping forge test!"

      # TEMP disable till fixed
      # - name: Upload results
      #   # Run this step even if the test step ahead fails
      #   if: ${{ !inputs.SKIP_JOB && inputs.SEND_RESULTS_TO_TRUNK && !cancelled() }}
      #   uses: trunk-io/analytics-uploader@main
      #   with:
      #     # Configured in the nextest.toml file
      #     junit-paths: ${{ env.FORGE_JUNIT_XML_PATH }}
      #     org-slug: aptoslabs
      #     token: ${{ secrets.TRUNK_API_TOKEN }}
      #   continue-on-error: true