Skip to content

Commit

Permalink
Merge pull request #293 from georgetown-cset/292-automate-data-prs
Browse files Browse the repository at this point in the history
Add data pr workflow; download new data programmatically from gcs
  • Loading branch information
brianlove authored Jan 8, 2024
2 parents 6e94932 + 90130f2 commit b8ac04b
Show file tree
Hide file tree
Showing 10 changed files with 265 additions and 198 deletions.
41 changes: 41 additions & 0 deletions .github/workflows/update-data.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
# Monthly data-refresh workflow: regenerates the website data files from GCS
# and opens a pull request with the result.
name: Update Data

on:
  # Run at 00:00 UTC on the 15th of every month.
  schedule:
    - cron: "0 0 15 * *"
  # Allow manual runs from the Actions tab for out-of-cycle refreshes.
  workflow_dispatch:

jobs:
  scheduled-job:
    runs-on: ubuntu-latest

    # Least-privilege grants: write repo contents and open PRs; id-token is
    # required for keyless (OIDC) authentication to Google Cloud.
    permissions:
      contents: 'write'
      id-token: 'write'
      pull-requests: 'write'

    steps:
      - uses: actions/checkout@v4
      # Pin the interpreter instead of relying on the runner's default python3.
      # TODO(review): confirm 3.11 matches the version the project targets.
      - uses: actions/setup-python@v5
        with:
          python-version: '3.11'
      - name: 'Authenticate to Google Cloud'
        id: 'auth'
        uses: 'google-github-actions/auth@v2'
        with:
          workload_identity_provider: 'projects/855475113448/locations/global/workloadIdentityPools/eto-github/providers/eto-github'
          service_account: '[email protected]'
          # Write a credentials file so google-cloud-storage picks it up via
          # GOOGLE_APPLICATION_CREDENTIALS in the next step.
          create_credentials_file: 'true'
          token_format: 'access_token'
      - name: Update data
        run: |
          python3 -m pip install --upgrade pip
          pip install -r requirements.txt
          PYTHONPATH='.' python3 scripts/preprocess_for_website.py
      - name: Create Pull Request
        id: cpr
        uses: peter-evans/create-pull-request@v4
        with:
          commit-message: Update datasets
          # Only the regenerated data artifacts go into the PR.
          add-paths: |
            github-metrics/src/data/*
            github-metrics/static/*
          # NOTE(review): PRs created with GITHUB_TOKEN do not trigger other
          # workflows (e.g. CI) — use a PAT/app token if CI must run on the PR.
          token: ${{ secrets.GITHUB_TOKEN }}
          title: Data update
          reviewers: jmelot
2 changes: 1 addition & 1 deletion github-metrics/src/data/config.json
Original file line number Diff line number Diff line change
@@ -1 +1 @@
{"start_year": 2017, "end_year": 2023, "last_updated": "December 05, 2023"}
{"start_year": 2017, "end_year": 2023, "last_updated": "December 19, 2023"}
2 changes: 1 addition & 1 deletion github-metrics/src/data/field_to_repos.json

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion github-metrics/src/data/fields.json
Original file line number Diff line number Diff line change
@@ -1 +1 @@
["Knowledge management", "Bioinformatics", "World Wide Web", "Automotive engineering", "Social psychology", "riscv", "Social science", "Mathematical optimization", "Machine learning", "Parallel computing", "Organic chemistry", "Microbiology", "Transport engineering", "Molecular biology", "Atmospheric sciences", "Epistemology", "Oceanography", "Advertising", "Mathematical analysis", "Artificial intelligence", "Optics", "Computational chemistry", "Cardiology", "Software engineering", "Simulation", "Agronomy", "Cancer research", "Particle physics", "Theoretical computer science", "Hydrology", "Radiology", "Renewable Energy", "Botany", "Gender studies", "Control theory", "Anatomy", "Medical education", "Finance", "Astrophysics", "Cell biology", "Financial system", "Algebra", "Reliability engineering", "International economics", "Linguistics", "Econometrics", "Quantum mechanics", "Topology", "Zoology", "Neuroscience", "Geophysics", "Genetics", "Quantum electrodynamics", "Information retrieval", "Sustainable Development", "Cognitive science", "Food science", "Thermodynamics", "Condensed matter physics", "Energy Storage", "Data science", "Pattern recognition", "Embedded system", "Acoustics", "Nuclear physics", "Computational biology", "Astrobiology", "Geometry", "Water resource management", "Climate and Earth Science", "Surgery", "Emissions", "Pharmacology", "Computer vision", "Computer security", "Financial economics", "Visual arts", "Energy Systems", "Ecology", "Virology", "Natural language processing", "Pathology", "Consumption of Energy and Resources", "Remote sensing", "Evolutionary biology", "Natural Resources", "Media studies", "Speech recognition", "Meteorology", "Immunology", "Classics", "Industrial Ecology", "Computer graphics (images)", "ai_safety", "Nuclear magnetic resonance", "Computational physics", "Astronomy", "Distributed computing", "Multimedia", "Earth Systems", "weto", "Operating system"]
["Computer graphics (images)", "Botany", "Financial system", "Radiology", "Condensed matter physics", "Computer security", "Artificial intelligence", "Cardiology", "Geophysics", "Surgery", "Acoustics", "Consumption of Energy and Resources", "Epistemology", "Natural Resources", "Econometrics", "Quantum electrodynamics", "Finance", "Nuclear magnetic resonance", "Pattern recognition", "Knowledge management", "Climate and Earth Science", "Simulation", "Control theory", "International economics", "Cognitive science", "Computational biology", "Immunology", "Reliability engineering", "ai_safety", "Anatomy", "Linguistics", "Meteorology", "Machine learning", "Quantum mechanics", "Financial economics", "Ecology", "Zoology", "Optics", "Sustainable Development", "Speech recognition", "Bioinformatics", "Mathematical analysis", "Computational chemistry", "Atmospheric sciences", "Organic chemistry", "Topology", "Pharmacology", "Operating system", "Natural language processing", "Hydrology", "Software engineering", "Embedded system", "Computational physics", "Nuclear physics", "Earth Systems", "Algebra", "Distributed computing", "Parallel computing", "Astrobiology", "Genetics", "Industrial Ecology", "weto", "Food science", "Oceanography", "Virology", "Astrophysics", "Cell biology", "World Wide Web", "Multimedia", "Gender studies", "Thermodynamics", "Water resource management", "Neuroscience", "Visual arts", "Computer vision", "Energy Storage", "Renewable Energy", "Information retrieval", "Cancer research", "Microbiology", "Mathematical optimization", "Molecular biology", "Energy Systems", "Social science", "Transport engineering", "Pathology", "riscv", "Evolutionary biology", "Automotive engineering", "Advertising", "Classics", "Media studies", "Geometry", "Social psychology", "Particle physics", "Data science", "Agronomy", "Remote sensing", "Astronomy", "Medical education", "Emissions", "Theoretical computer science"]
2 changes: 1 addition & 1 deletion github-metrics/src/data/id_to_repo.json

Large diffs are not rendered by default.

370 changes: 185 additions & 185 deletions github-metrics/static/orca_download.jsonl

Large diffs are not rendered by default.

18 changes: 13 additions & 5 deletions orca_data_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,9 @@
from airflow.providers.google.cloud.transfers.bigquery_to_bigquery import (
BigQueryToBigQueryOperator,
)
from airflow.providers.google.cloud.transfers.bigquery_to_gcs import (
BigQueryToGCSOperator,
)
from airflow.providers.google.cloud.transfers.gcs_to_bigquery import (
GCSToBigQueryOperator,
)
Expand Down Expand Up @@ -186,7 +189,12 @@
curr >> last
curr = last

wait_for_checks = DummyOperator(task_id="wait_for_checks")
export_website_stats = BigQueryToGCSOperator(
task_id="export_website_stats",
source_project_dataset_table=f"{staging_dataset}.website_stats",
destination_cloud_storage_uris=f"gs://{DATA_BUCKET}/{tmp_dir}/website_stats/data*",
export_format="NEWLINE_DELIMITED_JSON",
)

checks = [
BigQueryCheckOperator(
Expand All @@ -206,7 +214,7 @@
),
]

last >> checks >> wait_for_checks
last >> checks >> export_website_stats

msg_success = get_post_success("ORCA data updated!", dag)
curr_time = datetime.strftime(datetime.now(), "%Y_%m_%d")
Expand All @@ -232,7 +240,7 @@
},
python_callable=update_table_descriptions,
)
if table != "website_stats"
if table == "repos_with_full_meta"
else None
)

Expand All @@ -243,8 +251,8 @@
create_disposition="CREATE_IF_NEEDED",
write_disposition="WRITE_TRUNCATE",
)
wait_for_checks >> copy_to_prod
if table != "website_stats":
export_website_stats >> copy_to_prod
if table == "repos_with_full_meta":
copy_to_prod >> pop_descriptions >> take_snapshot
else:
copy_to_prod >> take_snapshot
Expand Down
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -10,4 +10,5 @@ pycountry
tqdm
coverage
google-cloud-secret-manager
google-cloud-storage
linkchecker
2 changes: 1 addition & 1 deletion scripts/get_owners.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,8 @@
import time

import requests

from geolocation.geolocation_base import get_country_from_location

from scripts.github_config import RATE_LIMIT_INTERVAL, mk_auth

AUTH = mk_auth()
Expand Down
23 changes: 20 additions & 3 deletions scripts/preprocess_for_website.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,12 @@
import json
import math
import os
import shutil
from datetime import datetime
from typing import Tuple

import pycountry
from google.cloud import storage
from tqdm import tqdm

from scripts.constants import (
Expand Down Expand Up @@ -381,6 +383,7 @@ def write_download_data(ids_to_repos: dict) -> None:
parent_dir = os.path.join("github-metrics", "static")
os.makedirs(parent_dir, exist_ok=True)
with open(os.path.join(parent_dir, "orca_download.jsonl"), mode="w") as f:
first_line = True
for id, meta in ids_to_repos.items():
meta["github_id"] = id
meta.pop("top_articles")
Expand Down Expand Up @@ -410,7 +413,8 @@ def write_download_data(ids_to_repos: dict) -> None:
{"year": year, "country": country, "count": count}
for year, country, count in meta["downloads"]
]
f.write(json.dumps(meta) + "\n")
f.write(("" if first_line else "\n") + json.dumps(meta))
first_line = False


def write_data(input_dir: str, output_dir: str) -> None:
Expand Down Expand Up @@ -504,9 +508,22 @@ def write_config(config_fi: str) -> None:
if __name__ == "__main__":
    default_data_dir = os.path.join("github-metrics", "src", "data")
    parser = argparse.ArgumentParser()
    parser.add_argument("--data_dir", default=default_data_dir)
    args = parser.parse_args()

    # We have to pull the data down from GCS like this rather than reading
    # directly from BigQuery because the latter is prohibitively slow.
    input_dir = "website_stats"
    # Start from a clean local directory so stale files from a previous run
    # cannot leak into the output.
    if os.path.exists(input_dir):
        shutil.rmtree(input_dir)
    os.makedirs(input_dir)
    client = storage.Client()
    for blob in client.list_blobs(
        "airflow-data-exchange", prefix="orca/tmp/website_stats"
    ):
        # Skip zero-byte "directory" placeholder objects; downloading one
        # would overwrite a real file with an empty one.
        if blob.name.endswith("/"):
            continue
        # Flatten the GCS path to just the object's basename.
        blob.download_to_filename(
            os.path.join(input_dir, blob.name.rsplit("/", 1)[-1])
        )
    write_data(input_dir, args.data_dir)
    write_config(os.path.join(args.data_dir, "config.json"))

0 comments on commit b8ac04b

Please sign in to comment.