# .github/workflows/push-dataset.yml

# Display name of this workflow. Its single job runs `main.py generate_jsonl_files`
# for the selected datasource and then `upload_to_huggingface.py`.
name: Synch uploaded jsonl files to HuggingFace

on:
  # Reusable entry point: lets another workflow run this one via `uses:`.
  workflow_call:
    inputs:
      datasource:
        description: 'The datasource to process'
        type: string
        required: true
      # The job reads each db_* input only when the matching ARD_DB_* secret is
      # empty. Inputs are not masked in logs, so prefer passing the secrets.
      db_user:
        description: 'Database user (fallback when the ARD_DB_USER secret is unset)'
        type: string
        required: true
      db_password:
        description: 'Database password (fallback when the ARD_DB_PASSWORD secret is unset)'
        type: string
        required: true
      db_host:
        description: 'Database host (fallback when the ARD_DB_HOST secret is unset)'
        type: string
        required: true
    # Declaring the secrets the job reads lets callers pass them explicitly
    # instead of relying on `secrets: inherit`. All are optional so existing
    # callers keep working unchanged.
    secrets:
      ARD_DB_USER:
        description: 'Database user; takes precedence over the db_user input'
        required: false
      ARD_DB_PASSWORD:
        description: 'Database password; takes precedence over the db_password input'
        required: false
      ARD_DB_HOST:
        description: 'Database host; takes precedence over the db_host input'
        required: false
      HUGGINGFACE_TOKEN:
        description: 'Token handed to upload_to_huggingface.py'
        required: false
  workflow_dispatch: # allow manual triggering
    inputs:
      datasource:
        description: 'The datasource to process'
        type: choice
        default: all
        options:
          - all
          - agentmodels
          - agisf
          - aisafety.info
          - alignmentforum
          - arbital
          - arxiv
          - blogs
          - distill
          - eaforum
          - lesswrong
          - special_docs
          - youtube

jobs:
  # Generates the JSONL files for the requested datasource and uploads them.
  generate-dataset:
    runs-on: ubuntu-latest

    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Setup Python environment
        uses: actions/setup-python@v5
        with:
          python-version: '3.11'

      - name: Install dependencies
        run: pip install -r requirements.txt

      - name: Generate dataset file
        env:
          # A non-empty secret wins; otherwise fall back to the workflow_call input.
          ARD_DB_USER: ${{ secrets.ARD_DB_USER || inputs.db_user }}
          ARD_DB_PASSWORD: ${{ secrets.ARD_DB_PASSWORD || inputs.db_password }}
          ARD_DB_HOST: ${{ secrets.ARD_DB_HOST || inputs.db_host }}
          ARD_DB_NAME: alignment_research_dataset
          # Passed through the environment rather than expanded into the script
          # text, so the value cannot inject shell commands.
          DATASOURCE: ${{ inputs.datasource }}
        # $DATASOURCE is intentionally unquoted: a space-separated value still
        # splits into several arguments, exactly as the inline expansion did.
        run: python main.py generate_jsonl_files $DATASOURCE

      - name: Setup Huggingface client
        run: pip install huggingface_hub gdown jsonlines datasets

      - name: Upload files
        env:
          # Keep the token out of the rendered script body; the shell expands it
          # at run time from the environment.
          HUGGINGFACE_TOKEN: ${{ secrets.HUGGINGFACE_TOKEN }}
          DATASOURCE: ${{ inputs.datasource }}
        # Token is quoted so an empty value stays in the first argument slot;
        # $DATASOURCE is left unquoted for the same reason as above.
        run: python upload_to_huggingface.py "$HUGGINGFACE_TOKEN" $DATASOURCE