Commit

Draft push action
haroldrubio committed Nov 13, 2024
1 parent 0304738 commit 0088d37
Showing 3 changed files with 87 additions and 1 deletion.
42 changes: 42 additions & 0 deletions .github/workflows/push-image.yml
@@ -0,0 +1,42 @@
# This workflow builds and pushes the expertise image to the Artifact Registry

name: dev-deployment

# Controls when the workflow will run
on:
  # Allows you to run this workflow manually from the Actions tab
  workflow_dispatch:
    # Declare the inputs referenced by the deploy step below
    inputs:
      branch:
        description: 'Branch passed to the build script as -b'
      py_branch:
        description: 'Branch passed to the build script as -p'
  # Triggers the workflow on push events, but only for the feature/containerize branch
  push:
    branches: [ feature/containerize ]
jobs:
  deploy:
    # Allow the job to fetch a GitHub ID token
    permissions:
      id-token: write
      contents: read
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v2
      - name: Add SSH key
        run: |
          mkdir -p /home/runner/.ssh
          echo "${{ secrets.GCLOUD_SSH_KEY }}" > /home/runner/.ssh/google_compute_engine
          echo "${{ secrets.GCLOUD_SSH_KEY_PUB }}" > /home/runner/.ssh/google_compute_engine.pub
          chmod 600 /home/runner/.ssh/google_compute_engine
          chmod 600 /home/runner/.ssh/google_compute_engine.pub
      - name: Authenticate with Google Cloud
        id: auth
        uses: google-github-actions/auth@v1
        with:
          workload_identity_provider: ${{ secrets.WORKLOAD_IDENTITY_PROVIDER }}
          service_account: ${{ secrets.GCP_SERVICE_ACCOUNT }}
          create_credentials_file: true
          cleanup_credentials: true
          export_environment_variables: true
      - name: Setup gcloud
        uses: google-github-actions/setup-gcloud@v1
      - name: Run deploy script
        run: |
          gcloud config set compute/zone us-central1-c
          gcloud compute ssh openreview@instance-matching-server --command '/bin/expertise_build_dev.sh -b ${{ github.event.inputs.branch }} -p ${{ github.event.inputs.py_branch }}' --quiet
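
With the dispatch inputs declared above, a run can also be started outside the Actions tab. A minimal sketch using the GitHub REST API workflow-dispatch endpoint; the repository slug, token variable, and branch values are illustrative assumptions, not part of this commit:

import os
import requests

# Assumed repository slug, for illustration only
url = ('https://api.github.com/repos/openreview/openreview-expertise'
       '/actions/workflows/push-image.yml/dispatches')
resp = requests.post(
    url,
    headers={
        'Accept': 'application/vnd.github+json',
        'Authorization': f"Bearer {os.environ['GITHUB_TOKEN']}",  # assumed token env var
    },
    json={
        'ref': 'feature/containerize',
        'inputs': {'branch': 'feature/containerize', 'py_branch': 'master'},  # example values
    },
)
resp.raise_for_status()  # GitHub responds 204 No Content on success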
5 changes: 5 additions & 0 deletions Dockerfile
@@ -11,6 +11,10 @@ ARG PATH="/app/miniconda/bin:${PATH}"

# Set the environment variable
ENV FLASK_ENV=production
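
# Locations of the model artifacts: the GCS source (AIP_STORAGE_URI) and the
# SPECTER / multifacet recommender copies on the container filesystem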
ENV AIP_STORAGE_URI="gs://openreview-expertise/expertise-utils/"
ENV SPECTER_DIR="/app/expertise-utils/specter/"
ENV MFR_VOCAB_DIR="/app/expertise-utils/multifacet_recommender/feature_vocab_file"
ENV MFR_CHECKPOINT_DIR="/app/expertise-utils/multifacet_recommender/mfr_model_checkpoint/"

COPY . /app/openreview-expertise

@@ -21,6 +25,7 @@ RUN apt update \
    && apt install -y build-essential \
    && apt install -y git \
    && apt install -y sudo \
    && apt install -y vim \
    && DEBIAN_FRONTEND=noninteractive TZ=Etc/UTC apt install -y tzdata \
    && cd $HOME \
    && wget "https://repo.anaconda.com/miniconda/Miniconda3-py38_22.11.1-1-Linux-x86_64.sh" -O miniconda.sh \
41 changes: 40 additions & 1 deletion expertise/execute_pipeline.py
@@ -37,6 +37,8 @@
baseurl_v1 = raw_request.pop('baseurl_v1')
baseurl_v2 = raw_request.pop('baseurl_v2')
destination_prefix = raw_request.pop('gcs_folder')
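# Optional debug flags: when set in the request, also upload the raw
# archives and the computed embeddings alongside the scores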
dump_embs = raw_request.pop('dump_embs', False)
dump_archives = raw_request.pop('dump_archives', False)
specter_dir = os.getenv('SPECTER_DIR')
mfr_vocab_dir = os.getenv('MFR_VOCAB_DIR')
mfr_checkpoint_dir = os.getenv('MFR_CHECKPOINT_DIR')
@@ -88,4 +90,41 @@
        })
blob = bucket.blob(destination_blob)
contents = '\n'.join([json.dumps(r) for r in result])
blob.upload_from_string(contents)

# Dump config
destination_blob = f"{blob_prefix}/job_config.json"
blob = bucket.blob(destination_blob)
blob.upload_from_string(json.dumps(config.to_json()))

# Dump archives
if dump_archives:
    for jsonl_file in os.listdir(os.path.join(config.job_dir, 'archives')):
        result = []
        destination_blob = f"{blob_prefix}/archives/{jsonl_file}"
        with open(os.path.join(config.job_dir, 'archives', jsonl_file), 'r') as f:
            for line in f:
                data = json.loads(line)
                result.append({
                    'id': data['id'],
                    'content': data['content']
                })
        blob = bucket.blob(destination_blob)
        contents = '\n'.join([json.dumps(r) for r in result])
        blob.upload_from_string(contents)

# Dump embeddings
if dump_embs:
    for emb_file in [d for d in os.listdir(config.job_dir) if d.endswith('.jsonl')]:
        result = []
        destination_blob = f"{blob_prefix}/{emb_file}"
        with open(os.path.join(config.job_dir, emb_file), 'r') as f:
            for line in f:
                data = json.loads(line)
                result.append({
                    'paper_id': data['paper_id'],
                    'embedding': data['embedding']
                })
        blob = bucket.blob(destination_blob)
        contents = '\n'.join([json.dumps(r) for r in result])
        blob.upload_from_string(contents)
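
Each embeddings file is written one JSON object per line, so a consumer can stream it back out of the bucket. A minimal sketch using the same google-cloud-storage client the pipeline relies on; the bucket name and blob path are illustrative assumptions:

import json
from google.cloud import storage

client = storage.Client()
bucket = client.bucket('openreview-expertise')  # assumed bucket name
blob = bucket.blob('jobs/example-job/embeddings.jsonl')  # hypothetical gcs_folder path
embeddings = {}
for line in blob.download_as_text().splitlines():
    record = json.loads(line)
    embeddings[record['paper_id']] = record['embedding']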
