From eac768ce28a94a962894693289d7b7507f14e4fc Mon Sep 17 00:00:00 2001
From: Abdul Fatir <Abdulfatirs@gmail.com>
Date: Mon, 2 Dec 2024 10:05:57 +0100
Subject: [PATCH] Add workflow to run evaluation on a subset of datasets (#222)

*Issue #, if available:*

*Description of changes:* This PR adds a workflow that will run the
evaluation script on `chronos-bolt-small` for a subset of datasets
specified in `ci/evaluate/backtest_config.yaml`. After evaluation, a
comment will be made on the PR. The workflow will only run if the
`run-eval` label is present on a PR. The end-to-end workflow has been
split into two workflows:

- `eval-model.yml`: only has read access (can be run from forks). This
will evaluate the model and upload the metrics CSV file as a GitHub
artifact.
- `eval-pr-comment.yml`: has read and write access (can only be run when
in the `main` branch). This will be triggered when the first job
finishes, will download the CSV from the eval job and make the comment.
According to [this
post](https://securitylab.github.com/resources/github-actions-preventing-pwn-requests/),
splitting into two jobs as done here is the recommended and secure way
to do this.

**NOTE**: The first step works as expected, but we can only test the
second step after merging, because this workflow needs to be part of
the `main` branch for this to work.

By submitting this pull request, I confirm that you can use, modify,
copy, and redistribute this contribution, under the terms of your
choice.

---------

Co-authored-by: Abdul Fatir Ansari <ansarnd@amazon.de>
---
 .github/workflows/eval-model.yml      | 35 +++++++++++++++++
 .github/workflows/eval-pr-comment.yml | 54 +++++++++++++++++++++++++++
 ci/evaluate/backtest_config.yaml      | 37 ++++++++++++++++++
 3 files changed, 126 insertions(+)
 create mode 100644 .github/workflows/eval-model.yml
 create mode 100644 .github/workflows/eval-pr-comment.yml
 create mode 100644 ci/evaluate/backtest_config.yaml

diff --git a/.github/workflows/eval-model.yml b/.github/workflows/eval-model.yml
new file mode 100644
index 0000000..0446870
--- /dev/null
+++ b/.github/workflows/eval-model.yml
@@ -0,0 +1,35 @@
+# Evaluates Chronos-Bolt (Small) model on selected datasets
+name: Evaluate
+
+on:
+  pull_request:  # Runs only with read privileges for the GITHUB_TOKEN
+    branches: ["main"] # Run on PRs to main branch
+    types: [opened, reopened, synchronize, labeled]  # defaults plus 'labeled', so adding 'run-eval' starts a run
+
+jobs:
+  evaluate-and-post:  # Evaluates and uploads only; the PR comment is posted by the "Post Eval Metrics" workflow
+    if: contains(github.event.pull_request.labels.*.name, 'run-eval')  # Only run if 'run-eval' label is added
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Checkout Repository
+        uses: actions/checkout@v4
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.11'  # quoted so YAML keeps the version a string
+
+      - name: Install Dependencies
+        run: pip install ".[evaluation]" -f https://download.pytorch.org/whl/cpu/torch_stable.html
+
+      - name: Run Eval Script
+        run: python scripts/evaluation/evaluate.py ci/evaluate/backtest_config.yaml eval-ci-metrics.csv --chronos-model-id=amazon/chronos-bolt-small --device=cpu --torch-dtype=float32
+
+      - name: Upload CSV
+        uses: actions/upload-artifact@v4
+        with:
+          name: eval-metrics  # the comment workflow downloads the artifact by this name
+          path: eval-ci-metrics.csv  # CSV passed to the eval script in the previous step
+          retention-days: 1
+          overwrite: true
diff --git a/.github/workflows/eval-pr-comment.yml b/.github/workflows/eval-pr-comment.yml
new file mode 100644
index 0000000..1f0d18c
--- /dev/null
+++ b/.github/workflows/eval-pr-comment.yml
@@ -0,0 +1,54 @@
+# Post evaluation results from the "Evaluate" workflow as a PR comment
+name: Post Eval Metrics
+
+on:
+  # Runs with read & write privileges for the GITHUB_TOKEN
+  workflow_run:
+    workflows: ["Evaluate"]  # matched against the `name:` of the evaluation workflow
+    types:
+      - completed  # fires for every conclusion; the job-level `if` keeps only successful runs
+
+jobs:
+  comment-eval-results:  # Downloads the metrics CSV from the triggering run and posts it on the PR
+    if: >
+      github.event.workflow_run.event == 'pull_request' &&
+      github.event.workflow_run.conclusion == 'success'
+    runs-on: ubuntu-latest
+    permissions:
+      actions: read # for downloading artifacts
+      pull-requests: write # for posting PR comment
+
+    steps:
+      - name: Download Eval Metrics
+        uses: actions/download-artifact@v4
+        with:
+          name: eval-metrics
+          path: eval-metrics-artifact/
+          github-token: ${{ secrets.GITHUB_TOKEN }}
+          run-id: ${{ github.event.workflow_run.id }}  # fetch from the triggering "Evaluate" run, not this run
+
+      - name: Display structure of downloaded files
+        run: ls -R
+
+      - name: Read CSV
+        id: csv
+        uses: juliangruber/read-file-action@v1
+        with:
+          path: eval-metrics-artifact/eval-ci-metrics.csv
+
+      - name: Create Markdown Table
+        uses: petems/csv-to-md-table-action@master  # NOTE(review): unpinned ref in a job holding a write token; consider pinning to a tag or SHA
+        id: csv-table-output
+        with:
+          csvinput: ${{ steps.csv.outputs.content }}
+
+      - name: Post Table as a Comment
+        uses: peter-evans/create-or-update-comment@v4
+        with:  # NOTE(review): workflow_run.pull_requests can be empty for PRs from forks - confirm before relying on it
+          token: ${{ secrets.GITHUB_TOKEN }}
+          repository: ${{ github.repository }}
+          issue-number: ${{ github.event.workflow_run.pull_requests[0].number }}  # workflow_run events carry no top-level pull_request
+          body: |
+            ### Evaluation Metrics
+            ${{steps.csv-table-output.outputs.markdown-table}}
+          reactions: rocket
\ No newline at end of file
diff --git a/ci/evaluate/backtest_config.yaml b/ci/evaluate/backtest_config.yaml
new file mode 100644
index 0000000..8843908
--- /dev/null
+++ b/ci/evaluate/backtest_config.yaml
@@ -0,0 +1,37 @@
+# Dataset taken from the in-domain set
+- name: taxi_30min # 30-minute frequency
+  hf_repo: autogluon/chronos_datasets
+  offset: -48  # NOTE(review): presumably counted from the end of each series; confirm in evaluate.py
+  prediction_length: 48
+  num_rolls: 1
+# Datasets taken from the zero-shot set
+- name: ETTh # Hourly
+  hf_repo: autogluon/chronos_datasets_extra
+  offset: -24
+  prediction_length: 24
+  num_rolls: 1
+- name: monash_covid_deaths # Daily
+  hf_repo: autogluon/chronos_datasets
+  offset: -30
+  prediction_length: 30
+  num_rolls: 1
+- name: monash_nn5_weekly # Weekly
+  hf_repo: autogluon/chronos_datasets
+  offset: -8
+  prediction_length: 8
+  num_rolls: 1
+- name: monash_fred_md # Monthly
+  hf_repo: autogluon/chronos_datasets
+  offset: -12
+  prediction_length: 12
+  num_rolls: 1
+- name: monash_m3_quarterly # Quarterly
+  hf_repo: autogluon/chronos_datasets
+  offset: -8
+  prediction_length: 8
+  num_rolls: 1
+- name: monash_tourism_yearly # Yearly
+  hf_repo: autogluon/chronos_datasets
+  offset: -4
+  prediction_length: 4
+  num_rolls: 1
\ No newline at end of file