ci: replace build-artifact-s3 with new workflow, add local tpch benches (#3864)

This PR adds a few things:
- `publish-dev-s3.yml`, a workflow that replaces `build-artifact-s3.yml`, uses the unified build process in `build-wheel.yml`, and serves a simplified Python repository API through CloudFront (see the install example below)
- `benchmark-local-tpch.yml`, which runs benchmarks for local TPC-H at 100SF. It automatically builds and publishes wheels if necessary, and runs both manually and as a nightly test
- fixed index URLs in `nightlies-tests.yml`, plus a new local TPC-H benchmark job there
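
For reference, a dev build published by the new workflow can be installed straight from the CloudFront index. A sketch (the `builds/dev/<commit-sha>` path follows the `DAFT_INDEX_URL` pattern in `benchmark-local-tpch.yml` below; `<commit-sha>` is a placeholder for a real commit SHA):

uv pip install getdaft --pre --extra-index-url https://d1p3klp2t5517h.cloudfront.net/builds/dev/<commit-sha>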

Eventually I plan on moving all of our current nightly benchmarks to our main repo and getting them publishing to Google Sheets. Then we can deprecate our Postgres/Looker setup as well as our K8s Ray cluster.
kevinzwang authored Mar 3, 2025
1 parent 7ee2b35 commit 7bcec18
Showing 8 changed files with 303 additions and 221 deletions.
94 changes: 94 additions & 0 deletions .github/ci-scripts/local_tpch.py
@@ -0,0 +1,94 @@
"""Run TPC-H benchmarks with native runner on local Parquet data and upload results to Google sheets.
Expects tables as Parquet files in "/tmp/tpch-data/"
"""

import os
import time
from datetime import datetime, timezone

import gspread

import daft
import daft.context
from benchmarking.tpch import answers
from daft.sql import SQLCatalog


def get_df(name):
return daft.read_parquet(f"/tmp/tpch-data/{name}/*")


def run_benchmark():
table_names = [
"part",
"supplier",
"partsupp",
"customer",
"orders",
"lineitem",
"nation",
"region",
]

def lowercase_column_names(df):
return df.select(*[daft.col(name).alias(name.lower()) for name in df.column_names])

catalog = SQLCatalog({tbl: lowercase_column_names(get_df(tbl)) for tbl in table_names})

results = {}

for q in range(1, 23):
if q == 21:
# TODO: remove this once we support q21
daft_df = answers.q21(get_df)
else:
with open(f"benchmarking/tpch/queries/{q:02}.sql") as query_file:
query = query_file.read()
daft_df = daft.sql(query, catalog=catalog)

start = time.perf_counter()
daft_df.collect()
end = time.perf_counter()

results[q] = end - start

return results


def get_run_metadata():
return {
"started at": datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S.%f"),
"daft version": daft.__version__,
"github ref": os.getenv("GITHUB_REF"),
"github sha": os.getenv("GITHUB_SHA"),
}


def upload_to_google_sheets(data):
gc = gspread.service_account()

sh = gc.open_by_url(
"https://docs.google.com/spreadsheets/d/1d6pXsIsBkjjM93GYtoiF83WXvJXR4vFgFQdmG05u8eE/edit?gid=0#gid=0"
)
ws = sh.worksheet("Local TPC-H")
ws.append_row(data)


def main():
daft.context.set_runner_native()

metadata = get_run_metadata()

results = run_benchmark()

data_dict = {**metadata, **results}

print("Results:")
print(data_dict)

upload_to_google_sheets(list(data_dict.values()))


if __name__ == "__main__":
main()
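
The core pattern above: register lazy DataFrames in a `SQLCatalog`, then run the stock TPC-H SQL against it, so each query only executes (and is timed) at `.collect()`. A minimal standalone sketch of that pattern, assuming a `lineitem` Parquet directory at the path the script expects:

import daft
from daft.sql import SQLCatalog

# Register a lazy Parquet scan under the name the SQL text refers to.
lineitem = daft.read_parquet("/tmp/tpch-data/lineitem/*")
catalog = SQLCatalog({"lineitem": lineitem})

# Building the plan is cheap; execution happens at .collect()/.show().
daft.sql("SELECT COUNT(*) AS cnt FROM lineitem", catalog=catalog).show()

Note also that `gspread.service_account()` with no arguments reads credentials from `~/.config/gspread/service_account.json`, which is the file the workflow below writes from the `GOOGLE_SHEETS_SERVICE_ACCOUNT` secret.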
72 changes: 72 additions & 0 deletions .github/workflows/benchmark-local-tpch.yml
@@ -0,0 +1,72 @@
name: benchmark-local-tpch

on:
  workflow_dispatch:
  workflow_call:
    inputs:
      daft_index_url:
        description: The index URL of the Daft build to benchmark. If not provided, builds Daft off of the provided branch.
        type: string
        default: ''
        required: false

env:
  # Use the provided index URL, or fall back to the dev build published for this commit
  DAFT_INDEX_URL: ${{ inputs.daft_index_url != '' && inputs.daft_index_url || format('https://d1p3klp2t5517h.cloudfront.net/builds/dev/{0}', github.sha) }}
  AWS_REGION: us-west-2

  DAFT_ANALYTICS_ENABLED: '0'
  UV_SYSTEM_PYTHON: 1
  PYTHON_VERSION: '3.9'

jobs:
  build:
    name: Build and publish wheels if necessary
    if: ${{ inputs.daft_index_url == '' }}
    uses: ./.github/workflows/publish-dev-s3.yml

  benchmark:
    needs: build
    if: ${{ !failure() && !cancelled() }}
    runs-on: [self-hosted, linux, arm64, bench-tpch]
    permissions:
      id-token: write
      contents: read

    steps:
      - name: Mount local SSD to /tmp
        run: |
          # Format and mount the NVMe drive only if /tmp is not already a mountpoint.
          # (Guarded with `if !` so a non-zero findmnt exit doesn't abort the step under bash -e.)
          if ! findmnt /tmp 1> /dev/null; then
            sudo mkfs.ext4 /dev/nvme0n1
            sudo mount -t ext4 /dev/nvme0n1 /tmp
            sudo chmod 777 /tmp
          fi
      - uses: actions/checkout@v4
        with:
          submodules: true
      - name: Assume GitHub Actions AWS Credentials
        uses: aws-actions/configure-aws-credentials@v4
        with:
          aws-region: ${{ env.AWS_REGION }}
          role-to-assume: ${{ secrets.ACTIONS_AWS_ROLE_ARN }}
          role-session-name: DaftLocalTpchGitHubWorkflow
      - name: Download TPC-H data (100SF)
        run: aws s3 cp s3://eventual-dev-benchmarking-fixtures/uncompressed/tpch-dbgen/100_0/32/parquet /tmp/tpch-data --recursive

      - name: Install uv
        uses: astral-sh/setup-uv@v4
        with:
          python-version: ${{ env.PYTHON_VERSION }}
      - name: Install Daft and dev dependencies
        run: |
          rm -rf daft
          uv pip install getdaft --pre --extra-index-url ${{ env.DAFT_INDEX_URL }}
          uv pip install gspread
      - name: Write service account secret file
        run: |
          cat << EOF > ~/.config/gspread/service_account.json
          ${{ secrets.GOOGLE_SHEETS_SERVICE_ACCOUNT }}
          EOF
      - name: Run benchmark and upload results to Google Sheets
        run: python .github/ci-scripts/local_tpch.py
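
Since the workflow also has a `workflow_dispatch` trigger, it can be kicked off by hand; a hypothetical invocation with the GitHub CLI (not part of this PR):

gh workflow run benchmark-local-tpch.yml

With no `daft_index_url` supplied, the `if: ${{ inputs.daft_index_url == '' }}` guard means the `build` job first publishes a fresh dev build for the current commit before the benchmark runs.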
102 changes: 0 additions & 102 deletions .github/workflows/build-artifact-s3.yml

This file was deleted.

42 changes: 33 additions & 9 deletions .github/workflows/nightlies-tests.yml
@@ -1,13 +1,8 @@
-name: Verify platforms nightly wheels
+name: Test and benchmark suite on nightly build
 
 on:
-  schedule:
-  - cron: 0 13 * * *
   workflow_dispatch:
-  workflow_run:
-    workflows: [daft-publish]
-    types:
-    - completed
+  workflow_call:
 
 env:
   DAFT_ANALYTICS_ENABLED: '0'
@@ -44,7 +39,7 @@ jobs:
       - name: Install Daft and dev dependencies
         run: |
-          uv pip install -r requirements-dev.txt getdaft --pre --extra-index-url https://pypi.anaconda.org/daft-nightly/simple --force-reinstall
+          uv pip install -r requirements-dev.txt getdaft --pre --extra-index-url https://d1p3klp2t5517h.cloudfront.net/builds/nightly --force-reinstall
           rm -rf daft
       - uses: actions/cache@v4
         env:
@@ -114,7 +109,7 @@
           echo "$GITHUB_WORKSPACE/venv/bin" >> $GITHUB_PATH
       - name: Install Daft and dev dependencies
         run: |
-          uv pip install -r requirements-dev.txt getdaft --pre --extra-index-url https://pypi.anaconda.org/daft-nightly/simple --force-reinstall
+          uv pip install -r requirements-dev.txt getdaft --pre --extra-index-url https://d1p3klp2t5517h.cloudfront.net/builds/nightly --force-reinstall
           rm -rf daft
       - name: Prepare tmpdirs for IO services
         run: |
@@ -155,3 +150,32 @@
         env:
           SLACK_WEBHOOK_URL: ${{ secrets.SLACK_WEBHOOK_URL }}
           SLACK_WEBHOOK_TYPE: INCOMING_WEBHOOK
+
+  benchmark-local-tpch:
+    uses: ./.github/workflows/benchmark-local-tpch.yml
+    with:
+      daft_index_url: https://d1p3klp2t5517h.cloudfront.net/builds/nightly
+
+  on-local-tpch-failure:
+    name: Send Slack notification on failure
+    runs-on: ubuntu-latest
+    needs: benchmark-local-tpch
+    if: ${{ failure() }}
+
+    steps:
+      - uses: slackapi/[email protected]
+        with:
+          payload: |
+            {
+              "blocks": [
+                {
+                  "type": "section",
+                  "text": {
+                    "type": "mrkdwn",
+                    "text": ":rotating_light: [CI] Local TPC-H benchmarks on nightly wheel <${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|workflow> *FAILED* :rotating_light:"
+                  }
+                }
+              ]
+            }
+          webhook: ${{ secrets.SLACK_WEBHOOK_URL }}
+          webhook-type: incoming-webhook
5 changes: 5 additions & 0 deletions .github/workflows/nightly-publish-s3.yml
@@ -103,3 +103,8 @@ jobs:
             }
           webhook: ${{ secrets.SLACK_WEBHOOK_URL }}
           webhook-type: incoming-webhook
+
+  tests:
+    name: Run tests on nightly build
+    needs: publish
+    uses: ./.github/workflows/nightlies-tests.yml