Skip to content

Commit

Permalink
Merge pull request #293 from georgetown-cset/292-automate-data-prs
Browse files Browse the repository at this point in the history
Add data pr workflow; download new data programmatically from gcs
  • Loading branch information
brianlove authored Jan 8, 2024
2 parents 6e94932 + 90130f2 commit b8ac04b
Show file tree
Hide file tree
Showing 10 changed files with 265 additions and 198 deletions.
41 changes: 41 additions & 0 deletions .github/workflows/update-data.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
# Monthly data-refresh workflow: regenerates the website data files from GCS
# and opens a pull request with the result.
name: Update Data

on:
  # Run at 00:00 UTC on the 15th of every month.
  schedule:
    - cron: "0 0 15 * *"
  # Allow manual runs from the Actions tab for out-of-cycle refreshes.
  workflow_dispatch:

jobs:
  scheduled-job:
    runs-on: ubuntu-latest

    # Least-privilege grants: write repo contents and open PRs; id-token is
    # required for keyless (OIDC) authentication to Google Cloud.
    permissions:
      contents: 'write'
      id-token: 'write'
      pull-requests: 'write'

    steps:
      - uses: actions/checkout@v4
      # Pin the interpreter instead of relying on the runner's default python3.
      # TODO(review): confirm 3.11 matches the version the project targets.
      - uses: actions/setup-python@v5
        with:
          python-version: '3.11'
      - name: 'Authenticate to Google Cloud'
        id: 'auth'
        uses: 'google-github-actions/auth@v2'
        with:
          workload_identity_provider: 'projects/855475113448/locations/global/workloadIdentityPools/eto-github/providers/eto-github'
          service_account: '[email protected]'
          # Write a credentials file so google-cloud-storage picks it up via
          # GOOGLE_APPLICATION_CREDENTIALS in the next step.
          create_credentials_file: 'true'
          token_format: 'access_token'
      - name: Update data
        run: |
          python3 -m pip install --upgrade pip
          pip install -r requirements.txt
          PYTHONPATH='.' python3 scripts/preprocess_for_website.py
      - name: Create Pull Request
        id: cpr
        uses: peter-evans/create-pull-request@v4
        with:
          commit-message: Update datasets
          # Only the regenerated data artifacts go into the PR.
          add-paths: |
            github-metrics/src/data/*
            github-metrics/static/*
          # NOTE(review): PRs created with GITHUB_TOKEN do not trigger other
          # workflows (e.g. CI) — use a PAT/app token if CI must run on the PR.
          token: ${{ secrets.GITHUB_TOKEN }}
          title: Data update
          reviewers: jmelot
2 changes: 1 addition & 1 deletion github-metrics/src/data/config.json
Original file line number Diff line number Diff line change
@@ -1 +1 @@
{"start_year": 2017, "end_year": 2023, "last_updated": "December 05, 2023"}
{"start_year": 2017, "end_year": 2023, "last_updated": "December 19, 2023"}
2 changes: 1 addition & 1 deletion github-metrics/src/data/field_to_repos.json

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion github-metrics/src/data/fields.json
Original file line number Diff line number Diff line change
@@ -1 +1 @@
["Knowledge management", "Bioinformatics", "World Wide Web", "Automotive engineering", "Social psychology", "riscv", "Social science", "Mathematical optimization", "Machine learning", "Parallel computing", "Organic chemistry", "Microbiology", "Transport engineering", "Molecular biology", "Atmospheric sciences", "Epistemology", "Oceanography", "Advertising", "Mathematical analysis", "Artificial intelligence", "Optics", "Computational chemistry", "Cardiology", "Software engineering", "Simulation", "Agronomy", "Cancer research", "Particle physics", "Theoretical computer science", "Hydrology", "Radiology", "Renewable Energy", "Botany", "Gender studies", "Control theory", "Anatomy", "Medical education", "Finance", "Astrophysics", "Cell biology", "Financial system", "Algebra", "Reliability engineering", "International economics", "Linguistics", "Econometrics", "Quantum mechanics", "Topology", "Zoology", "Neuroscience", "Geophysics", "Genetics", "Quantum electrodynamics", "Information retrieval", "Sustainable Development", "Cognitive science", "Food science", "Thermodynamics", "Condensed matter physics", "Energy Storage", "Data science", "Pattern recognition", "Embedded system", "Acoustics", "Nuclear physics", "Computational biology", "Astrobiology", "Geometry", "Water resource management", "Climate and Earth Science", "Surgery", "Emissions", "Pharmacology", "Computer vision", "Computer security", "Financial economics", "Visual arts", "Energy Systems", "Ecology", "Virology", "Natural language processing", "Pathology", "Consumption of Energy and Resources", "Remote sensing", "Evolutionary biology", "Natural Resources", "Media studies", "Speech recognition", "Meteorology", "Immunology", "Classics", "Industrial Ecology", "Computer graphics (images)", "ai_safety", "Nuclear magnetic resonance", "Computational physics", "Astronomy", "Distributed computing", "Multimedia", "Earth Systems", "weto", "Operating system"]
["Computer graphics (images)", "Botany", "Financial system", "Radiology", "Condensed matter physics", "Computer security", "Artificial intelligence", "Cardiology", "Geophysics", "Surgery", "Acoustics", "Consumption of Energy and Resources", "Epistemology", "Natural Resources", "Econometrics", "Quantum electrodynamics", "Finance", "Nuclear magnetic resonance", "Pattern recognition", "Knowledge management", "Climate and Earth Science", "Simulation", "Control theory", "International economics", "Cognitive science", "Computational biology", "Immunology", "Reliability engineering", "ai_safety", "Anatomy", "Linguistics", "Meteorology", "Machine learning", "Quantum mechanics", "Financial economics", "Ecology", "Zoology", "Optics", "Sustainable Development", "Speech recognition", "Bioinformatics", "Mathematical analysis", "Computational chemistry", "Atmospheric sciences", "Organic chemistry", "Topology", "Pharmacology", "Operating system", "Natural language processing", "Hydrology", "Software engineering", "Embedded system", "Computational physics", "Nuclear physics", "Earth Systems", "Algebra", "Distributed computing", "Parallel computing", "Astrobiology", "Genetics", "Industrial Ecology", "weto", "Food science", "Oceanography", "Virology", "Astrophysics", "Cell biology", "World Wide Web", "Multimedia", "Gender studies", "Thermodynamics", "Water resource management", "Neuroscience", "Visual arts", "Computer vision", "Energy Storage", "Renewable Energy", "Information retrieval", "Cancer research", "Microbiology", "Mathematical optimization", "Molecular biology", "Energy Systems", "Social science", "Transport engineering", "Pathology", "riscv", "Evolutionary biology", "Automotive engineering", "Advertising", "Classics", "Media studies", "Geometry", "Social psychology", "Particle physics", "Data science", "Agronomy", "Remote sensing", "Astronomy", "Medical education", "Emissions", "Theoretical computer science"]
2 changes: 1 addition & 1 deletion github-metrics/src/data/id_to_repo.json

Large diffs are not rendered by default.

370 changes: 185 additions & 185 deletions github-metrics/static/orca_download.jsonl

Large diffs are not rendered by default.

18 changes: 13 additions & 5 deletions orca_data_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,9 @@
from airflow.providers.google.cloud.transfers.bigquery_to_bigquery import (
BigQueryToBigQueryOperator,
)
from airflow.providers.google.cloud.transfers.bigquery_to_gcs import (
BigQueryToGCSOperator,
)
from airflow.providers.google.cloud.transfers.gcs_to_bigquery import (
GCSToBigQueryOperator,
)
Expand Down Expand Up @@ -186,7 +189,12 @@
curr >> last
curr = last

wait_for_checks = DummyOperator(task_id="wait_for_checks")
export_website_stats = BigQueryToGCSOperator(
task_id="export_website_stats",
source_project_dataset_table=f"{staging_dataset}.website_stats",
destination_cloud_storage_uris=f"gs://{DATA_BUCKET}/{tmp_dir}/website_stats/data*",
export_format="NEWLINE_DELIMITED_JSON",
)

checks = [
BigQueryCheckOperator(
Expand All @@ -206,7 +214,7 @@
),
]

last >> checks >> wait_for_checks
last >> checks >> export_website_stats

msg_success = get_post_success("ORCA data updated!", dag)
curr_time = datetime.strftime(datetime.now(), "%Y_%m_%d")
Expand All @@ -232,7 +240,7 @@
},
python_callable=update_table_descriptions,
)
if table != "website_stats"
if table == "repos_with_full_meta"
else None
)

Expand All @@ -243,8 +251,8 @@
create_disposition="CREATE_IF_NEEDED",
write_disposition="WRITE_TRUNCATE",
)
wait_for_checks >> copy_to_prod
if table != "website_stats":
export_website_stats >> copy_to_prod
if table == "repos_with_full_meta":
copy_to_prod >> pop_descriptions >> take_snapshot
else:
copy_to_prod >> take_snapshot
Expand Down
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -10,4 +10,5 @@ pycountry
tqdm
coverage
google-cloud-secret-manager
google-cloud-storage
linkchecker
2 changes: 1 addition & 1 deletion scripts/get_owners.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,8 @@
import time

import requests

from geolocation.geolocation_base import get_country_from_location

from scripts.github_config import RATE_LIMIT_INTERVAL, mk_auth

AUTH = mk_auth()
Expand Down
23 changes: 20 additions & 3 deletions scripts/preprocess_for_website.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,12 @@
import json
import math
import os
import shutil
from datetime import datetime
from typing import Tuple

import pycountry
from google.cloud import storage
from tqdm import tqdm

from scripts.constants import (
Expand Down Expand Up @@ -381,6 +383,7 @@ def write_download_data(ids_to_repos: dict) -> None:
parent_dir = os.path.join("github-metrics", "static")
os.makedirs(parent_dir, exist_ok=True)
with open(os.path.join(parent_dir, "orca_download.jsonl"), mode="w") as f:
first_line = True
for id, meta in ids_to_repos.items():
meta["github_id"] = id
meta.pop("top_articles")
Expand Down Expand Up @@ -410,7 +413,8 @@ def write_download_data(ids_to_repos: dict) -> None:
{"year": year, "country": country, "count": count}
for year, country, count in meta["downloads"]
]
f.write(json.dumps(meta) + "\n")
f.write(("" if first_line else "\n") + json.dumps(meta))
first_line = False


def write_data(input_dir: str, output_dir: str) -> None:
Expand Down Expand Up @@ -504,9 +508,22 @@ def write_config(config_fi: str) -> None:
if __name__ == "__main__":
    default_data_dir = os.path.join("github-metrics", "src", "data")
    parser = argparse.ArgumentParser()
    parser.add_argument("--data_dir", default=default_data_dir)
    args = parser.parse_args()

    # We have to pull the data down from GCS like this rather than reading
    # directly from BigQuery because the latter is prohibitively slow.
    input_dir = "website_stats"
    # Start from a clean local directory so stale files from a previous run
    # cannot leak into the output.
    if os.path.exists(input_dir):
        shutil.rmtree(input_dir)
    os.makedirs(input_dir)
    client = storage.Client()
    for blob in client.list_blobs(
        "airflow-data-exchange", prefix="orca/tmp/website_stats"
    ):
        # Skip zero-byte "directory" placeholder objects; downloading one
        # would overwrite a real file with an empty one.
        if blob.name.endswith("/"):
            continue
        # Flatten the GCS path to just the object's basename.
        blob.download_to_filename(
            os.path.join(input_dir, blob.name.rsplit("/", 1)[-1])
        )
    write_data(input_dir, args.data_dir)
    write_config(os.path.join(args.data_dir, "config.json"))

0 comments on commit b8ac04b

Please sign in to comment.