# .github/workflows/adhoc-matrix.yml

# Workflow display name.
name: Execute ZKVM-Perf (Matrix)

# Started by hand only; every input is an optional string with a default.
on:
  workflow_dispatch:
    inputs:
      provers:
        type: string
        required: false
        default: "sp1"
        description: "Provers to use (comma-separated)"
      programs:
        type: string
        required: false
        default: "loop,fibonacci,tendermint,reth1,reth2"
        description: "Programs to benchmark (comma-separated)"
      filename:
        type: string
        required: false
        default: "benchmark"
        description: "Filename for the benchmark"
      trials:
        type: string
        required: false
        # Quoted so the default stays a string, matching `type: string`.
        default: "1"
        description: "Number of trials to run"
      sp1_ref:
        type: string
        required: false
        default: "dev"
        description: "SP1 reference (commit hash or branch name)"
      additional_params:
        type: string
        required: false
        # Single-quoted because the JSON payload itself contains double quotes.
        default: '{"hashfns":"poseidon","shard_sizes":"22"}'
        description: "Additional parameters as JSON"

jobs:
  run-benchmarks:
    name: Run on ${{ matrix.instance_type }}
    runs-on: ubuntu-latest
    # A failure of this job is tolerated and does not fail the workflow run.
    continue-on-error: true

    strategy:
      # Keep the other matrix entries running when one of them fails.
      fail-fast: false
      matrix:
        include:
          # GPU instance.
          - ami_id: ami-079a6a210557ef0e4
            instance_type: g6.16xlarge
            enable_gpu: true
          # CPU-only instance, same AMI.
          - ami_id: ami-079a6a210557ef0e4
            instance_type: r7i.16xlarge
            enable_gpu: false
    
    steps:
      # Prints the dispatch inputs to the job log for later reference.
      - name: Echo Workflow Inputs
        # The inputs reach the shell through environment variables instead of
        # being expanded into the script text. Expanded inline, the double
        # quotes in the JSON default of additional_params would terminate the
        # surrounding "..." and be stripped from the output, and any shell
        # metacharacters in an input would be executed.
        env:
          BENCH_PROVERS: ${{ inputs.provers }}
          BENCH_PROGRAMS: ${{ inputs.programs }}
          BENCH_FILENAME: ${{ inputs.filename }}
          BENCH_TRIALS: ${{ inputs.trials }}
          BENCH_SP1_REF: ${{ inputs.sp1_ref }}
          BENCH_ADDITIONAL_PARAMS: ${{ inputs.additional_params }}
        run: |
          echo "Provers: $BENCH_PROVERS"
          echo "Programs: $BENCH_PROGRAMS"
          echo "Filename: $BENCH_FILENAME"
          echo "Trials: $BENCH_TRIALS"
          echo "SP1 Reference: $BENCH_SP1_REF"
          echo "Additional Parameters: $BENCH_ADDITIONAL_PARAMS"
          
      # Makes AWS credentials available to the following steps (the EC2 runner
      # start/stop actions). v1 of this action is a superseded major version;
      # v4 accepts the same three inputs used here.
      - name: Configure AWS credentials
        uses: aws-actions/configure-aws-credentials@v4
        with:
          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
          aws-region: ${{ secrets.AWS_REGION }}

      # Launches the EC2 instance for this matrix entry. Later steps read this
      # step's `label` and `ec2-instance-id` outputs.
      # NOTE(review): the action is referenced by the mutable `main` branch and
      # is given GH_PAT; consider pinning it to a reviewed commit SHA.
      - id: start-ec2-runner
        name: Start EC2 runner
        uses: xJonathanLEI/ec2-github-runner@main
        with:
          mode: start
          ec2-instance-type: ${{ matrix.instance_type }}
          ec2-image-id: ${{ matrix.ami_id }}
          security-group-id: ${{ secrets.AWS_SG_ID }}
          subnet-id: ${{ secrets.AWS_SUBNET_ID }}
          # Unit is defined by the action (presumably GiB; confirm).
          storage-size: 1024
          github-token: ${{ secrets.GH_PAT }}
          aws-resource-tags: >
            [
              {"Key": "Name", "Value": "ec2-github-runner"},
              {"Key": "GitHubRepository", "Value": "${{ github.repository }}"}
            ]

      # Dispatches run-on-runner.yml for this matrix entry, then polls the
      # in-progress runs of that workflow until one contains a job named
      # "Run Benchmark on <runner label>". The run ID is published both as the
      # step output `triggered_run_id` and as the env variable TRIGGERED_RUN_ID.
      - name: Run benchmarks
        id: run-benchmarks
        uses: actions/github-script@v6
        # Values are passed through the environment and read via process.env
        # rather than being expanded into JavaScript string literals, so a
        # quote or backslash in an input cannot break or alter the script.
        env:
          EC2_RUNNER_LABEL: ${{ steps.start-ec2-runner.outputs.label }}
          BENCH_INSTANCE_TYPE: ${{ matrix.instance_type }}
          BENCH_ENABLE_GPU: ${{ matrix.enable_gpu }}
          BENCH_PROVERS: ${{ inputs.provers }}
          BENCH_PROGRAMS: ${{ inputs.programs }}
          BENCH_FILENAME: ${{ inputs.filename }}_${{ matrix.instance_type }}
          BENCH_TRIALS: ${{ inputs.trials }}
          BENCH_SP1_REF: ${{ inputs.sp1_ref }}
          BENCH_ADDITIONAL_PARAMS: ${{ inputs.additional_params }}
        with:
          github-token: ${{ secrets.GH_PAT }}
          script: |
            // Environment values are always strings; fall back to '' if unset.
            const fromEnv = (name) => process.env[name] || '';

            const runnerName = fromEnv('EC2_RUNNER_LABEL');
            const maxAttempts = 30;
            const pollInterval = 10000; // 10 seconds
            let triggeredRunId = null;

            console.log('Triggering benchmark workflow');

            try {
              await github.rest.actions.createWorkflowDispatch({
                owner: context.repo.owner,
                repo: context.repo.repo,
                workflow_id: 'run-on-runner.yml',
                ref: context.ref,
                inputs: {
                  runner_name: runnerName,
                  instance_type: fromEnv('BENCH_INSTANCE_TYPE'),
                  enable_gpu: fromEnv('BENCH_ENABLE_GPU'),
                  provers: fromEnv('BENCH_PROVERS'),
                  programs: fromEnv('BENCH_PROGRAMS'),
                  filename: fromEnv('BENCH_FILENAME'),
                  trials: fromEnv('BENCH_TRIALS'),
                  sp1_ref: fromEnv('BENCH_SP1_REF'),
                  additional_params: fromEnv('BENCH_ADDITIONAL_PARAMS')
                }
              });
              console.log('Benchmark workflow triggered successfully');
            } catch (error) {
              // Mark the step as failed instead of returning silently, so a
              // dispatch error is visible in the job result.
              core.setFailed(`Failed to trigger workflow: ${error.message}`);
              return;
            }

            console.log('Polling for the triggered run');
            for (let attempt = 1; attempt <= maxAttempts; attempt++) {
              await new Promise(resolve => setTimeout(resolve, pollInterval));

              try {
                const runs = await github.rest.actions.listWorkflowRuns({
                  owner: context.repo.owner,
                  repo: context.repo.repo,
                  workflow_id: 'run-on-runner.yml',
                  status: 'in_progress'
                });

                console.log(`Found ${runs.data.workflow_runs.length} in-progress runs`);

                for (const run of runs.data.workflow_runs) {
                  if (new Date(run.created_at).getTime() > Date.now() - 300000) { // Within last 5 minutes
                    console.log(`Checking run ${run.id} created at ${run.created_at}`);
                    try {
                      const jobs = await github.rest.actions.listJobsForWorkflowRun({
                        owner: context.repo.owner,
                        repo: context.repo.repo,
                        run_id: run.id
                      });

                      console.log(`Run ${run.id} has ${jobs.data.jobs.length} jobs`);
                      for (const job of jobs.data.jobs) {
                        console.log(`  Job: ${job.name}`);
                      }

                      // The dispatch API call above yields no run ID here, so
                      // the run is identified by a job name that embeds the
                      // runner label.
                      const matchingJob = jobs.data.jobs.find(job =>
                        job.name === `Run Benchmark on ${runnerName}`
                      );

                      if (matchingJob) {
                        triggeredRunId = run.id;
                        console.log(`Found matching run. Triggered run ID: ${triggeredRunId}`);
                        break;
                      } else {
                        console.log(`No matching job found for run ${run.id}`);
                      }
                    } catch (error) {
                      console.log(`Error checking jobs for run ${run.id}: ${error.message}`);
                      continue;
                    }
                  } else {
                    console.log(`Skipping run ${run.id} as it's older than 5 minutes`);
                  }
                }

                if (triggeredRunId) break;

                console.log(`Attempt ${attempt}: Matching run not found yet. Continuing to poll...`);
              } catch (error) {
                // A failed poll is not fatal; the next attempt retries.
                console.log(`Error while polling: ${error.message}`);
              }
            }

            if (!triggeredRunId) {
              // Fail the step so the missing run is visible in the job result.
              core.setFailed('Failed to find the triggered workflow run with matching job after maximum attempts');
              return;
            }

            core.setOutput('triggered_run_id', triggeredRunId);
            core.exportVariable('TRIGGERED_RUN_ID', triggeredRunId);
            console.log(`Triggered run ID: ${triggeredRunId}`);

      # Polls the run whose ID is in the TRIGGERED_RUN_ID env variable until it
      # completes or the wait budget is exhausted. The step is marked failed
      # when there is no run ID, when the run does not conclude with "success",
      # or when the wait times out, so the job result reflects the benchmark.
      - name: Wait for benchmark completion
        uses: actions/github-script@v6
        with:
          github-token: ${{ secrets.GH_PAT }}
          script: |
            const triggeredRunId = process.env.TRIGGERED_RUN_ID;
            if (!triggeredRunId) {
              core.setFailed('No triggered run ID found');
              return;
            }

            // 10 hours in milliseconds
            // NOTE(review): jobs on GitHub-hosted runners are documented to be
            // limited to roughly 6 hours, so this budget may never be reached
            // on ubuntu-latest; confirm and lower it if so.
            const maxWaitTime = 36000000;
            // 3 minutes in milliseconds
            const checkInterval = 180000;
            const startTime = Date.now();

            const runUrl = `https://github.com/${context.repo.owner}/${context.repo.repo}/actions/runs/${triggeredRunId}`;
            console.log(`Waiting for benchmark job to complete. Job URL: ${runUrl}`);

            while (true) {
              const run = await github.rest.actions.getWorkflowRun({
                owner: context.repo.owner,
                repo: context.repo.repo,
                run_id: triggeredRunId
              });

              if (run.data.status === 'completed') {
                console.log(`Benchmark workflow completed with conclusion: ${run.data.conclusion}`);
                if (run.data.conclusion !== 'success') {
                  // Surface the benchmark failure instead of only warning.
                  core.setFailed(`Benchmark workflow failed with conclusion: ${run.data.conclusion}. Job URL: ${runUrl}`);
                }
                break;
              }

              if (Date.now() - startTime > maxWaitTime) {
                core.setFailed(`Benchmark workflow did not complete within the maximum wait time. Job URL: ${runUrl}`);
                break;
              }

              console.log(`Waiting for benchmark to complete... Current status: ${run.data.status}. Job URL: ${runUrl}`);
              await new Promise(resolve => setTimeout(resolve, checkInterval));
            }

      # Stops the instance started by the start-ec2-runner step.
      - name: Stop EC2 runner
        uses: xJonathanLEI/ec2-github-runner@main
        # Runs regardless of the outcome of the earlier steps.
        if: always()
        with:
          mode: stop
          ec2-instance-id: ${{ steps.start-ec2-runner.outputs.ec2-instance-id }}
          label: ${{ steps.start-ec2-runner.outputs.label }}
          github-token: ${{ secrets.GH_PAT }}

    # Exposes the run ID found by the run-benchmarks step as a job output.
    # NOTE(review): this job runs once per matrix entry but declares a single
    # output name, so presumably only one entry's value is visible to
    # dependent jobs; confirm if anything consumes it.
    outputs:
      triggered_run_id: ${{ steps.run-benchmarks.outputs.triggered_run_id }}