Prepare Expertise API for Scalable Deployment #180

Draft: wants to merge 66 commits into base: master from feature/containerize

Changes from all commits (66 commits, all by haroldrubio):
0fb9477  Initial Dockerfile draft (Feb 17, 2024)
21f15a7  Add new endpoints (Feb 17, 2024)
eeacdcb  Build predict expertise function (Feb 17, 2024)
bbf1041  Expect rawPredict format (Feb 17, 2024)
398a743  Use entrypoint instead of cmd (Feb 17, 2024)
e959335  Fix entrypoint (Feb 18, 2024)
f0775eb  Changes to config (Feb 18, 2024)
c89cd58  Add production config (Feb 18, 2024)
72b41dc  Set containerized (Feb 18, 2024)
cc8e602  Remove Redis interaction if container (Feb 18, 2024)
2f91392  Adjust model directories (Feb 18, 2024)
eda9927  From NVIDIA container (Feb 18, 2024)
e8a96f4  Fetch expertise utils (Feb 18, 2024)
18cd0e3  Avoid artifact download (Feb 18, 2024)
9f17f6b  Move artifact copying to entrypoint (Feb 18, 2024)
28bcee5  Add container flag (Feb 18, 2024)
4b1e2bf  Update container flag (Feb 18, 2024)
490fc2e  Remove type (Feb 19, 2024)
006f70b  Add support for instances in request (Feb 19, 2024)
6ea81e8  Move artifact loading (Feb 19, 2024)
545e1f0  Async load artifacts (Feb 19, 2024)
145f6ab  Add startup route to check for artifact loading (Feb 19, 2024)
96224f3  Don't call artifacts (Feb 19, 2024)
cebd266  Allow token pass in body (Feb 19, 2024)
0e57ca0  Pass token (Feb 19, 2024)
99aa6a5  Properly handle no token (Feb 19, 2024)
0370dff  Isolate token (Feb 19, 2024)
c64d273  Load into dict if not a dict (Feb 20, 2024)
58d8a69  Add flag to block on artifacts (Feb 20, 2024)
e6ad0bf  Rollback blocking (Feb 20, 2024)
04b31fe  Block on loading artifacts (Feb 20, 2024)
b934318  Log model ready (Feb 20, 2024)
bd0acb9  Log URI and bucket (Feb 20, 2024)
bd129e1  Point to /app (Feb 20, 2024)
6b2cabb  Fix return schema (Feb 20, 2024)
2fc2c92  Index into predictions list (Feb 20, 2024)
fd2b13b  Fix blob prefix (Feb 20, 2024)
f498173  Remove unused script (Feb 20, 2024)
bec5001  Fix prefix parsing (Feb 20, 2024)
d036d6f  Support reviewer_ids (Feb 21, 2024)
21b585d  Fix reviewer IDs bug (Feb 21, 2024)
291bbf5  Fix bug in expertise invitation for reviewerIds (Feb 21, 2024)
394dfe4  Merge instances on reviewerIds (Feb 21, 2024)
16b1e00  Return list of predictions (Feb 21, 2024)
f4dad73  Parsing must happen in routes (Feb 21, 2024)
7863a1c  Check correctly formed dataset (Feb 22, 2024)
c420f22  Fix subscriptable bug (Feb 22, 2024)
872c7a7  Remove prod config (Feb 22, 2024)
ad339a3  Add retry safety (Feb 22, 2024)
982792c  Validate dataset creation (Feb 22, 2024)
3cb5fd5  Support count in validation (Feb 22, 2024)
61b8078  Get entityA properly (Feb 22, 2024)
b17f17d  Move statements (Feb 22, 2024)
c2a7cfd  Fix Path bug (Feb 22, 2024)
c24b638  Use sub IDs for validation (Feb 22, 2024)
16653c8  Fix convert field to path (Feb 22, 2024)
559fe45  Add failure explanation (Feb 22, 2024)
4fe6965  Create execute_pipeline.py (Feb 23, 2024)
1026ea7  Absolute import (Feb 23, 2024)
044b3e0  Fix script (Feb 23, 2024)
74154c8  Upload results to bucket (Feb 23, 2024)
8b9f03d  Fix prefix (Feb 23, 2024)
79c3e08  Merge branch 'master' into feature/containerize (Nov 5, 2024)
32ed960  Avoid installing SPECTER deps (Nov 5, 2024)
0304738  Remove cd into specter (Nov 5, 2024)
0088d37  Draft push action (Nov 13, 2024)
42 changes: 42 additions & 0 deletions .github/workflows/push-image.yml
@@ -0,0 +1,42 @@
# This workflow builds and pushes the expertise image to the Artifact Registry

name: dev-deployment

# Controls when the workflow will run
on:
  # Allows you to run this workflow manually from the Actions tab
  workflow_dispatch:
  # Triggers the workflow on push events, but only for the feature/containerize branch
  push:
    branches: [ feature/containerize ]
jobs:
  deploy:
    # Allow the job to fetch a GitHub ID token
    permissions:
      id-token: write
      contents: read
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v2
      - name: Add SSH key
        run: |
          mkdir -p /home/runner/.ssh
          echo "${{ secrets.GCLOUD_SSH_KEY }}" > /home/runner/.ssh/google_compute_engine
          echo "${{ secrets.GCLOUD_SSH_KEY_PUB }}" > /home/runner/.ssh/google_compute_engine.pub
          chmod 600 /home/runner/.ssh/google_compute_engine
          chmod 600 /home/runner/.ssh/google_compute_engine.pub
      - name: Authenticate with Google Cloud
        id: auth
        uses: google-github-actions/auth@v1
        with:
          workload_identity_provider: ${{ secrets.WORKLOAD_IDENTITY_PROVIDER }}
          service_account: ${{ secrets.GCP_SERVICE_ACCOUNT }}
          create_credentials_file: true
          cleanup_credentials: true
          export_environment_variables: true
      - name: Setup gcloud
        uses: google-github-actions/setup-gcloud@v1
      - name: Run deploy script
        run: |
          gcloud config set compute/zone us-central1-c
          gcloud compute ssh openreview@instance-matching-server --command '/bin/expertise_build_dev.sh -b ${{ github.event.inputs.branch }} -p ${{ github.event.inputs.py_branch }}' --quiet
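
Note: because the workflow also declares workflow_dispatch, a dev build can be started without pushing to the branch. The sketch below does this through the GitHub REST API; the repository path is assumed, and it further assumes the workflow_dispatch trigger is extended to declare the branch and py_branch inputs that the deploy command references (the current draft declares none, so those expressions expand to empty strings on a plain push).

# Hypothetical manual trigger for the dev-deployment workflow via the GitHub REST API.
# Assumes the workflow_dispatch trigger is extended to declare `branch` and `py_branch`
# inputs (the current draft references github.event.inputs.* but declares none).
import os
import requests

OWNER_REPO = "openreview/openreview-expertise"  # assumed repository path
WORKFLOW_FILE = "push-image.yml"

resp = requests.post(
    f"https://api.github.com/repos/{OWNER_REPO}/actions/workflows/{WORKFLOW_FILE}/dispatches",
    headers={
        "Authorization": f"Bearer {os.environ['GITHUB_TOKEN']}",
        "Accept": "application/vnd.github+json",
    },
    json={
        "ref": "feature/containerize",
        "inputs": {"branch": "feature/containerize", "py_branch": "master"},
    },
)
resp.raise_for_status()  # GitHub returns 204 No Content on success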
55 changes: 55 additions & 0 deletions Dockerfile
@@ -0,0 +1,55 @@
FROM nvidia/cuda:11.6.1-base-ubuntu20.04

WORKDIR /app

ENV PYTHON_VERSION=3.8

ENV HOME="/app"

ENV PATH="/app/miniconda/bin:${PATH}"
ARG PATH="/app/miniconda/bin:${PATH}"

# Set the environment variables
ENV FLASK_ENV=production
ENV AIP_STORAGE_URI="gs://openreview-expertise/expertise-utils/"
ENV SPECTER_DIR="/app/expertise-utils/specter/"
ENV MFR_VOCAB_DIR="/app/expertise-utils/multifacet_recommender/feature_vocab_file"
ENV MFR_CHECKPOINT_DIR="/app/expertise-utils/multifacet_recommender/mfr_model_checkpoint/"

COPY . /app/openreview-expertise

RUN apt update \
    && apt install -y wget \
    && apt install -y make gcc \
    && apt install -y curl \
    && apt install -y build-essential \
    && apt install -y git \
    && apt install -y sudo \
    && apt install -y vim \
    && DEBIAN_FRONTEND=noninteractive TZ=Etc/UTC apt install -y tzdata \
    && cd $HOME \
    && wget "https://repo.anaconda.com/miniconda/Miniconda3-py38_22.11.1-1-Linux-x86_64.sh" -O miniconda.sh \
    && printf '%s' "473e5ecc8e078e9ef89355fbca21f8eefa5f9081544befca99867c7beac3150d  miniconda.sh" | sha256sum -c \
    && bash miniconda.sh -b -p $HOME/miniconda \
    && conda update -y conda \
    && conda create -n expertise python=$PYTHON_VERSION -c conda-forge

RUN echo "source ${HOME}/miniconda/etc/profile.d/conda.sh" >> ${HOME}/.bashrc \
    && echo "conda activate expertise" >> ${HOME}/.bashrc \
    && /bin/bash -c "source ${HOME}/miniconda/etc/profile.d/conda.sh && conda activate expertise" \
    && python --version \
    && conda install -y pytorch pytorch-cuda=11 -c pytorch -c nvidia \
    && mkdir ${HOME}/expertise-utils \
    && conda install -y filelock \
    && cd ${HOME}/openreview-expertise \
    && cp ${HOME}/openreview-expertise/expertise/service/config/default_container.cfg ${HOME}/openreview-expertise/expertise/service/config/production.cfg \
    && pip install -e . \
    && conda install -y intel-openmp \
    && conda install -y faiss-cpu -c pytorch \
    && pip install -I protobuf==3.20.1 \
    && pip install numpy==1.24.4 --force-reinstall

EXPOSE 8080

# Define the entry point and pass arguments separately
ENTRYPOINT ["python", "-m", "expertise.service", "--host", "0.0.0.0", "--port", "8080", "--container"]
130 changes: 130 additions & 0 deletions expertise/execute_pipeline.py
@@ -0,0 +1,130 @@
import argparse
import os
import openreview
import shortuuid
import json
import csv
from expertise.execute_expertise import execute_create_dataset, execute_expertise
from expertise.service import load_model_artifacts
from expertise.service.utils import APIRequest, JobConfig
from google.cloud import storage

DEFAULT_CONFIG = {
    "dataset": {},
    "model": "specter+mfr",
    "model_params": {
        "use_title": True,
        "sparse_value": 600,
        "specter_batch_size": 16,
        "mfr_batch_size": 50,
        "use_abstract": True,
        "average_score": False,
        "max_score": True,
        "skip_specter": False,
        "use_cuda": True,
        "use_redis": False
    }
}

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('api_request_str', help='a JSON string containing all other arguments')
    args = parser.parse_args()
    raw_request: dict = json.loads(args.api_request_str)

    # Pop token, base URLs and other expected variables
    token = raw_request.pop('token')
    baseurl_v1 = raw_request.pop('baseurl_v1')
    baseurl_v2 = raw_request.pop('baseurl_v2')
    destination_prefix = raw_request.pop('gcs_folder')
    dump_embs = raw_request.pop('dump_embs', False)
    dump_archives = raw_request.pop('dump_archives', False)
    specter_dir = os.getenv('SPECTER_DIR')
    mfr_vocab_dir = os.getenv('MFR_VOCAB_DIR')
    mfr_checkpoint_dir = os.getenv('MFR_CHECKPOINT_DIR')
    server_config = {
        'OPENREVIEW_BASEURL': baseurl_v1,
        'OPENREVIEW_BASEURL_V2': baseurl_v2,
        'SPECTER_DIR': specter_dir,
        'MFR_VOCAB_DIR': mfr_vocab_dir,
        'MFR_CHECKPOINT_DIR': mfr_checkpoint_dir,
    }
    load_model_artifacts()

    client_v1 = openreview.Client(baseurl=baseurl_v1, token=token)
    client_v2 = openreview.api.OpenReviewClient(baseurl_v2, token=token)

    job_id = shortuuid.ShortUUID().random(length=5)
    working_dir = f"/app/{job_id}"
    os.makedirs(working_dir, exist_ok=True)

    validated_request = APIRequest(raw_request)
    config = JobConfig.from_request(
        api_request=validated_request,
        starting_config=DEFAULT_CONFIG,
        openreview_client=client_v1,
        openreview_client_v2=client_v2,
        server_config=server_config,
        working_dir=working_dir
    )

    # Create dataset and execute expertise
    execute_create_dataset(client_v1, client_v2, config.to_json())
    execute_expertise(config.to_json())

    # Fetch results and write them to the GCS bucket
    bucket_name = destination_prefix.split('/')[2]
    blob_prefix = '/'.join(destination_prefix.split('/')[3:])
    gcs_client = storage.Client()
    bucket = gcs_client.bucket(bucket_name)
    for csv_file in [d for d in os.listdir(config.job_dir) if '.csv' in d]:
        result = []
        destination_blob = f"{blob_prefix}/{csv_file.replace('.csv', '.jsonl')}"
        with open(os.path.join(config.job_dir, csv_file), 'r') as f:
            reader = csv.reader(f)
            for row in reader:
                result.append({
                    'submission': row[0],
                    'user': row[1],
                    'score': float(row[2])
                })
        blob = bucket.blob(destination_blob)
        contents = '\n'.join([json.dumps(r) for r in result])
        blob.upload_from_string(contents)

    # Dump config
    destination_blob = f"{blob_prefix}/job_config.json"
    blob = bucket.blob(destination_blob)
    blob.upload_from_string(json.dumps(config.to_json()))

    # Dump archives
    if dump_archives:
        for jsonl_file in os.listdir(os.path.join(config.job_dir, 'archives')):
            result = []
            destination_blob = f"{blob_prefix}/archives/{jsonl_file}"
            with open(os.path.join(config.job_dir, 'archives', jsonl_file), 'r') as f:
                for line in f:
                    data = json.loads(line)
                    result.append({
                        'id': data['id'],
                        'content': data['content']
                    })
            blob = bucket.blob(destination_blob)
            contents = '\n'.join([json.dumps(r) for r in result])
            blob.upload_from_string(contents)

    # Dump embeddings
    if dump_embs:
        for emb_file in [d for d in os.listdir(config.job_dir) if '.jsonl' in d]:
            result = []
            destination_blob = f"{blob_prefix}/{emb_file}"
            with open(os.path.join(config.job_dir, emb_file), 'r') as f:
                for line in f:
                    data = json.loads(line)
                    result.append({
                        'paper_id': data['paper_id'],
                        'embedding': data['embedding']
                    })
            blob = bucket.blob(destination_blob)
            contents = '\n'.join([json.dumps(r) for r in result])
            blob.upload_from_string(contents)
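
Note: for local testing, the pipeline entry point can be driven directly by serializing the request to a JSON string. A minimal sketch is below. The keys token, baseurl_v1, baseurl_v2, gcs_folder, dump_embs, and dump_archives come from the script above; the entityA/entityB fields, the gs:// job prefix, and the choice to launch via -m are illustrative assumptions.

# Hypothetical driver for expertise/execute_pipeline.py. Popped keys are taken from the
# script above; the remaining request body and the invocation style are assumed examples.
import json
import subprocess

request = {
    "token": "<openreview-api-token>",
    "baseurl_v1": "https://api.openreview.net",
    "baseurl_v2": "https://api2.openreview.net",
    "gcs_folder": "gs://openreview-expertise/jobs/example-job",  # results land under this prefix
    "dump_embs": False,
    "dump_archives": False,
    # Everything left after the pops is validated by APIRequest; fields below are illustrative.
    "entityA": {"type": "Note", "id": "rXyz123"},
    "entityB": {"type": "Group", "memberOf": "ICLR.cc/2025/Conference/Reviewers"},
}

subprocess.run(
    ["python", "-m", "expertise.execute_pipeline", json.dumps(request)],
    check=True,
)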
36 changes: 35 additions & 1 deletion expertise/service/__init__.py
@@ -2,9 +2,14 @@
import flask
import logging, logging.handlers
import redis
from threading import Event
from google.cloud import storage

from celery import Celery

artifact_loading_started = Event()
model_ready = Event()

def configure_logger(app):
    '''
    Configures the app's logger object.
@@ -93,4 +98,33 @@ def create_redis(app):
        port=app.config['REDIS_PORT'],
        db=app.config['REDIS_EMBEDDINGS_DB']
    )
    return config_pool, embedding_pool

def load_model_artifacts():
    """Copy all files from a GCS bucket directory to a local directory."""
    artifact_loading_started.set()

    # Extract the bucket name and path from the environment variable
    aip_storage_uri = os.getenv('AIP_STORAGE_URI')
    print(f"Loading from... {aip_storage_uri}")
    if not aip_storage_uri:
        raise ValueError("AIP_STORAGE_URI environment variable is not set")

    # Assuming AIP_STORAGE_URI is in the format gs://bucket_name/path_to_directory
    bucket_name = aip_storage_uri.split('/')[2]
    print(f"Bucket={bucket_name}")

    # The directory to copy the artifacts to, and the subdirectory name you want
    destination_dir = "/app/expertise-utils"  # TODO: Parameterize this
    source_blob_prefix = '/'.join(aip_storage_uri.split('/')[3:])

    storage_client = storage.Client()
    bucket = storage_client.bucket(bucket_name)
    blobs = bucket.list_blobs(prefix=source_blob_prefix)
    for blob in blobs:
        destination_path = os.path.join(destination_dir, os.path.relpath(blob.name, start=source_blob_prefix))
        os.makedirs(os.path.dirname(destination_path), exist_ok=True)
        blob.download_to_filename(destination_path)
        print(f"Copied {blob.name} to {destination_path}")

    model_ready.set()
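
Note: a small worked example of the blob-to-path mapping performed above, using the AIP_STORAGE_URI default baked into the Dockerfile; the blob name itself is illustrative.

# Worked example of the mapping in load_model_artifacts(). The blob name is hypothetical;
# the URI matches the AIP_STORAGE_URI default set in the Dockerfile.
import os

aip_storage_uri = "gs://openreview-expertise/expertise-utils/"
bucket_name = aip_storage_uri.split('/')[2]                    # "openreview-expertise"
source_blob_prefix = '/'.join(aip_storage_uri.split('/')[3:])  # "expertise-utils/"

blob_name = "expertise-utils/specter/pytorch_model.bin"        # hypothetical object in the bucket
destination_path = os.path.join("/app/expertise-utils",
                                os.path.relpath(blob_name, start=source_blob_prefix))
print(destination_path)  # /app/expertise-utils/specter/pytorch_model.bin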
9 changes: 9 additions & 0 deletions expertise/service/__main__.py
@@ -1,9 +1,18 @@
import argparse
from expertise.service.server import app
import os
from expertise.service import load_model_artifacts
import threading

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('--host', default='localhost')
    parser.add_argument('--port', default=5000, type=int)
    parser.add_argument('--container', action='store_true')
    args = parser.parse_args()

    if args.container:
        threading.Thread(target=load_model_artifacts).start()

    app.run(host=args.host, port=args.port)
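
Note: with --container, artifact download runs on a background thread, so the Flask app starts serving before the models are on disk; the commit history adds a startup route for exactly this reason. A minimal sketch of how in-process code could block on readiness using the events defined in expertise/service/__init__.py follows; the timeout value is an arbitrary choice, not taken from this PR.

# Minimal readiness check built on the module-level events from expertise/service/__init__.py.
# The 30-minute timeout is an arbitrary assumption.
from expertise.service import artifact_loading_started, model_ready

def wait_until_ready(timeout_seconds=1800):
    """Return True once load_model_artifacts() has finished, False on timeout."""
    if not artifact_loading_started.is_set():
        print("Artifact loading has not started; was the service launched with --container?")
    return model_ready.wait(timeout=timeout_seconds)

if wait_until_ready():
    print("Model artifacts are on disk; the service can accept predict requests.")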
28 changes: 28 additions & 0 deletions expertise/service/config/default_container.cfg
@@ -0,0 +1,28 @@
LOG_FILE='default_container.log'
CREATE_DATASET_RETRIES=9
OPENREVIEW_BASEURL='https://api.openreview.net'
OPENREVIEW_BASEURL_V2='https://api2.openreview.net'
WORKING_DIR = './jobs'
SPECTER_DIR = '/app/expertise-utils/specter/'
MFR_VOCAB_DIR = '/app/expertise-utils/multifacet_recommender/feature_vocab_file'
MFR_CHECKPOINT_DIR = '/app/expertise-utils/multifacet_recommender/mfr_model_checkpoint/'
CELERY_CONFIG = 'expertise.service.config.celery_config'
REDIS_ADDR = 'localhost'
REDIS_PORT = 6379
REDIS_CONFIG_DB = 10
REDIS_EMBEDDINGS_DB = 11
DEFAULT_CONFIG = {
    "dataset": {},
    "model": "specter+mfr",
    "model_params": {
        "use_title": True,
        "specter_batch_size": 16,
        "mfr_batch_size": 50,
        "use_abstract": True,
        "average_score": False,
        "max_score": True,
        "skip_specter": False,
        "use_cuda": True,
        "use_redis": False
    }
}
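
Note: the Dockerfile copies default_container.cfg to production.cfg at build time. A sketch of how a Flask app typically consumes such a .cfg file is below; selecting the filename from FLASK_ENV is an assumption about how the service picks its config, not something shown in this diff.

# Sketch of loading a config file such as default_container.cfg into Flask.
# The FLASK_ENV-based filename selection is an assumption; the Dockerfile only shows
# default_container.cfg being copied to production.cfg and FLASK_ENV=production being set.
import os
from flask import Flask

app = Flask(__name__)
env = os.getenv('FLASK_ENV', 'default')          # the Dockerfile sets FLASK_ENV=production
app.config.from_pyfile(f"config/{env}.cfg")      # path relative to the app's root_path

print(app.config['SPECTER_DIR'])  # '/app/expertise-utils/specter/'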