Add Dockerfile to run the scraper in a container (#201)
* Dockerfile: Add initial Dockerfile for running ARD scraper as job.

* Dockerfile: Exclude data files from image, force CPU-only torch.

Pulling CUDA into a small job that has no GPU seems wasteful.

* GCP Credentials: Move to subdirectory, so we can mount them as secret.

* DB: Take full connection URI from env, rather than components.

This allows us to use UNIX sockets to connect to the DB,
which plays nicely with some Google Cloud SQL Auth Proxy setups.
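For illustration, a full connection URI pointed at the Auth Proxy's UNIX socket might look like the following (a sketch; the project, region, instance, and credential values are placeholders, and the `unix_socket` query parameter is passed through to the MySQL driver):

```shell
# Hypothetical example: connect through a Cloud SQL Auth Proxy UNIX socket
# instead of a host/port pair. All names here are placeholders.
export ARD_DB_CONNECTION_URI="mysql+mysqlconnector://user:password@/alignment_research_dataset?unix_socket=/cloudsql/my-project:europe-west1:my-instance"
```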

* Dockerfile: Include required data files, pandoc dependency.

* DB: Update README and GitHub workflows for taking full connection URI

* Revert "DB: Update README and GitHub workflows for taking full connection URI"

This reverts commit a168894.

* DB: Allow specifying either full connection URI or components.

* GCP Credentials: Ignore credentials file in repository root.

We now look for this file in a subdirectory, but should still ignore it at the old path.
jbeshir authored Apr 24, 2024
1 parent 2bc78e9 commit 51206b4
Showing 4 changed files with 33 additions and 7 deletions.
2 changes: 2 additions & 0 deletions .gitignore
@@ -122,5 +122,7 @@ carado.moe/
!requirements.txt
*.epub

secrets/
credentials.json

data/raw/
22 changes: 22 additions & 0 deletions Dockerfile
@@ -0,0 +1,22 @@
FROM python:3.11-slim-bookworm

COPY align_data /source/align_data
COPY main.py /source/main.py
COPY requirements.txt /source/requirements.txt
COPY data/raw/agentmodels.org /source/data/raw/agentmodels.org
COPY data/raw/ai-alignment-papers.csv /source/data/raw/ai-alignment-papers.csv
COPY data/raw/alignment_newsletter.xlsx /source/data/raw/alignment_newsletter.xlsx
WORKDIR /source

RUN apt-get update
RUN apt-get -y install git pandoc

RUN useradd --create-home --shell /bin/bash ard
RUN chown ard:ard -R /source
USER ard:ard

RUN python -m pip install --upgrade pip
RUN pip3 install torch --index-url https://download.pytorch.org/whl/cpu
RUN pip install -r requirements.txt

CMD ["python", "main.py", "fetch-all"]
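With the Dockerfile above, the job might be built and run along these lines (a sketch, not part of the commit; the image tag, DB values, and host-side secret path are assumptions — the mount target matches the `secrets/gcp_credentials` directory the code now reads credentials from):

```shell
# Build the image (the tag is arbitrary).
docker build -t ard-scraper .

# Run as a one-off job. The service-account key directory is mounted
# read-only at the path get_credentials() now defaults to; the DB
# connection URI is taken from the environment.
docker run --rm \
  -e ARD_DB_CONNECTION_URI="mysql+mysqlconnector://user:password@db-host:3306/alignment_research_dataset" \
  -v "$(pwd)/secrets/gcp_credentials:/source/secrets/gcp_credentials:ro" \
  ard-scraper
```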
14 changes: 8 additions & 6 deletions align_data/settings.py
@@ -36,12 +36,14 @@
AGISF_AIRTABLE_TABLE_ID = os.environ.get("AGISF_AIRTABLE_TABLE_ID")

### MYSQL ###
-user = os.environ.get("ARD_DB_USER", "user")
-password = os.environ.get("ARD_DB_PASSWORD", "we all live in a yellow submarine")
-host = os.environ.get("ARD_DB_HOST", "127.0.0.1")
-port = os.environ.get("ARD_DB_PORT", "3306")
-db_name = os.environ.get("ARD_DB_NAME", "alignment_research_dataset")
-DB_CONNECTION_URI = f"mysql+mysqlconnector://{user}:{password}@{host}:{port}/{db_name}"
+if not (DB_CONNECTION_URI := os.environ.get("ARD_DB_CONNECTION_URI")):
+    user = os.environ.get("ARD_DB_USER", "user")
+    password = os.environ.get("ARD_DB_PASSWORD", "we all live in a yellow submarine")
+    host = os.environ.get("ARD_DB_HOST", "127.0.0.1")
+    port = os.environ.get("ARD_DB_PORT", "3306")
+    db_name = os.environ.get("ARD_DB_NAME", "alignment_research_dataset")
+    DB_CONNECTION_URI = f"mysql+mysqlconnector://{user}:{password}@{host}:{port}/{db_name}"
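Read in isolation, the new fallback logic in settings.py can be sketched as a pure function (for illustration only; `build_db_uri` is a hypothetical helper, but the env var names and defaults are the ones in the diff):

```python
def build_db_uri(env: dict) -> str:
    # Prefer a full connection URI when one is provided...
    if uri := env.get("ARD_DB_CONNECTION_URI"):
        return uri
    # ...otherwise assemble one from individual components, with defaults.
    user = env.get("ARD_DB_USER", "user")
    password = env.get("ARD_DB_PASSWORD", "we all live in a yellow submarine")
    host = env.get("ARD_DB_HOST", "127.0.0.1")
    port = env.get("ARD_DB_PORT", "3306")
    db_name = env.get("ARD_DB_NAME", "alignment_research_dataset")
    return f"mysql+mysqlconnector://{user}:{password}@{host}:{port}/{db_name}"
```

A full URI wins even when component variables are also set, which is what lets a UNIX-socket URI override the default host/port scheme.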

ARTICLE_MAIN_KEYS = [
"id",
"source",
2 changes: 1 addition & 1 deletion align_data/sources/articles/google_cloud.py
@@ -33,7 +33,7 @@
sheet_name = "Sheet1" # TODO: remove this


-def get_credentials(credentials_file: Union[Path, str] = "credentials.json") -> Credentials:
+def get_credentials(credentials_file: Union[Path, str] = "secrets/gcp_credentials/credentials.json") -> Credentials:
return Credentials.from_service_account_file(credentials_file, scopes=SCOPES)

