-
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #162 from catalyst-cooperative/kaggle-pipeline
Archive raw Kaggle and Github metrics daily
- Loading branch information
Showing
8 changed files
with
281 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,86 @@ | ||
# Archives raw GitHub and Kaggle usage metrics once a day (or on manual
# dispatch) and reports the outcome of the job to Slack.
name: save-daily-metrics
on:
  workflow_dispatch:
  schedule:
    # Run every day at 8:00 PM UTC
    # https://crontab.guru/#0_20_*_*_*
    - cron: "0 20 * * *"

jobs:
  build:
    runs-on: ubuntu-latest
    permissions:
      contents: read
      # Lets the job request an OIDC token for the gcloud auth step below.
      id-token: write
    steps:
      - uses: actions/checkout@v4
        with:
          fetch-depth: 2

      - name: Set up conda environment for testing
        uses: mamba-org/setup-micromamba@v1
        with:
          environment-file: environment.yml
          cache-environment: true
          # The action input is spelled `condarc`; a misspelled key is not
          # applied, which would leave the channel settings below unused.
          condarc: |
            channels:
            - conda-forge
            - defaults
            channel_priority: strict
      - name: Log conda environment information
        run: |
          conda info
          conda list
          conda config --show-sources
          conda config --show
          printenv | sort
      - name: Authenticate gcloud
        id: gcloud-auth
        # A failed authentication does not stop the remaining steps.
        continue-on-error: true
        uses: "google-github-actions/auth@v2"
        with:
          workload_identity_provider: "projects/345950277072/locations/global/workloadIdentityPools/gh-actions-pool/providers/gh-actions-provider"
          service_account: "pudl-usage-metrics-etl@catalyst-cooperative-pudl.iam.gserviceaccount.com"
          create_credentials_file: true

      - name: "Set up Cloud SDK"
        uses: "google-github-actions/setup-gcloud@v2"
        with:
          version: ">= 363.0.0"

      - shell: bash -l {0}
        name: Save Github Metrics
        env:
          API_TOKEN_GITHUB: ${{ secrets.API_TOKEN_GITHUB }}
        run: |
          python src/usage_metrics/scripts/save_github_metrics.py
      - shell: bash -l {0}
        name: Save Kaggle Metrics
        env:
          KAGGLE_KEY: ${{ secrets.KAGGLE_KEY }}
          KAGGLE_USERNAME: ${{ secrets.KAGGLE_USERNAME }}
        run: |
          python src/usage_metrics/scripts/save_kaggle_metrics.py
      - name: Inform the Codemonkeys
        uses: 8398a7/action-slack@v3
        with:
          status: custom
          fields: workflow,job,commit,repo,ref,author,took
          custom_payload: |
            {
              username: 'action-slack',
              icon_emoji: ':octocat:',
              attachments: [{
                color: '${{ job.status }}' === 'success' ? 'good' : '${{ job.status }}' === 'failure' ? 'danger' : 'warning',
                text: `${process.env.AS_WORKFLOW}\n${process.env.AS_JOB} (${process.env.AS_COMMIT}) of ${process.env.AS_REPO}@${process.env.AS_REF} by ${process.env.AS_AUTHOR} ${{ job.status }} in ${process.env.AS_TOOK}`,
              }]
            }
        env:
          GITHUB_TOKEN: ${{ github.token }} # required
          SLACK_WEBHOOK_URL: ${{ secrets.SLACK_WEBHOOK_URL }} # required
          MATRIX_CONTEXT: ${{ toJson(matrix) }} # required
        if: always() # Pick up events even if the job fails or is canceled.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
"""Module contains assets that extract raw data.""" | ||
|
||
from . import save_github_metrics, save_kaggle_metrics |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,137 @@ | ||
"""This script pull github traffic metrics and saves them to a GC Bucket.""" | ||
|
||
import json | ||
import logging | ||
import os | ||
import sys | ||
import time | ||
from dataclasses import dataclass | ||
from datetime import date | ||
|
||
import requests | ||
from google.cloud import storage | ||
from requests.exceptions import HTTPError | ||
|
||
# Configure root logging at INFO level; the script logs through the root logger.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger()
|
||
|
||
@dataclass
class Metric:
    """Pair a GitHub metric's API name with the folder it is archived under."""

    # Metric name as it appears in the GitHub API path (e.g. "popular/paths").
    name: str
    # Folder segment used when building the archive blob name.
    folder: str
|
||
|
||
def get_biweekly_metrics(owner: str, repo: str, token: str, metric: str) -> str:
    """Fetch one GitHub traffic metric for a repository as JSON text.

    Args:
        owner (str): Account or organization that owns the repository.
        repo (str): The repository name.
        token (str): GitHub API token sent in the Authorization header.
        metric (str): The metric name appended to the ``traffic/`` endpoint.

    Returns:
        json (str): The decoded response body re-serialized as JSON text.
    """
    auth_headers = {
        "Authorization": f"token {token}",
        "Accept": "application/vnd.github.v3+json",
    }
    traffic_url = f"https://api.github.com/repos/{owner}/{repo}/traffic/{metric}"

    payload = make_github_request(traffic_url, auth_headers).json()
    return json.dumps(payload)
|
||
|
||
def get_persistent_metrics(owner: str, repo: str, token: str, metric: str) -> str:
    """Get GitHub's persistent metrics: forks and stargazers.

    The endpoint is read page by page until an empty page is returned or the
    overall time budget runs out.

    Args:
        owner (str): Account or organization that owns the repository.
        repo (str): The repository name.
        token (str): GitHub API token sent in the Authorization header.
        metric (str): The metric to retrieve (forks | stargazers).

    Returns:
        json (str): A JSON string holding the records from every page read.
    """
    query_url = f"https://api.github.com/repos/{owner}/{repo}/{metric}"
    headers = {
        "Authorization": f"token {token}",
        "Accept": "application/vnd.github.v3.star+json",
    }

    metrics = []
    page = 1

    timeout = 600  # Stop paginating after 10 minutes.
    # A monotonic clock keeps the budget immune to system clock adjustments.
    deadline = time.monotonic() + timeout

    while time.monotonic() < deadline:
        page_items = make_github_request(query_url, headers, {"page": page}).json()

        if not page_items:
            # An empty page means every record has been collected.
            break
        metrics += page_items
        page += 1
    else:
        # The loop ended without reaching an empty page, so the result may be
        # truncated. Keep the partial data but make the truncation visible.
        logger.warning(
            f"Timed out after {timeout} seconds while paging {metric}; "
            f"saving {len(metrics)} records, which may be incomplete."
        )
    return json.dumps(metrics)
|
||
|
||
def make_github_request(query: str, headers: dict, params: "dict | None" = None):
    """Make a GET request to the GitHub API.

    Args:
        query (str): A GitHub API request url.
        headers (dict): Headers to include in the request.
        params (dict | None): Query parameters of the request, if any.

    Returns:
        response (requests.models.Response): the request response.

    Raises:
        HTTPError: If the response has an error status. The message includes
            the response body, and the exception carries the response and the
            original error as its cause.
    """
    # Kept outside the ``try`` so ``response`` is always bound in the handler;
    # connection errors and timeouts propagate unchanged.
    response = requests.get(query, headers=headers, params=params, timeout=100)

    try:
        response.raise_for_status()
    except HTTPError as http_err:
        raise HTTPError(
            f"HTTP error occurred: {http_err}\n\tResponse text: {response.text}",
            response=response,
        ) from http_err
    return response
|
||
|
||
def upload_to_bucket(data, metric):
    """Write one metric's data to a blob named for today in the archive bucket.

    Args:
        data: The content to upload; handed to ``upload_from_string``.
        metric: Object whose ``folder`` is used in the blob name and whose
            ``name`` is used in the log message.
    """
    archive_bucket = storage.Client().bucket(
        "pudl-usage-metrics-archives.catalyst.coop"
    )
    blob_name = f"github/{metric.folder}/{date.today().strftime('%Y-%m-%d')}.json"

    archive_bucket.blob(blob_name).upload_from_string(data)

    logger.info(f"Uploaded {metric.name} data to {blob_name}.")
|
||
|
||
def save_metrics():
    """Save github traffic metrics to google cloud bucket.

    Reads the API token from the ``API_TOKEN_GITHUB`` environment variable,
    then fetches and uploads each biweekly and persistent metric in turn.

    Raises:
        RuntimeError: If ``API_TOKEN_GITHUB`` is unset or empty.
    """
    token = os.getenv("API_TOKEN_GITHUB")
    if not token:
        # Fail early with a clear message instead of sending a placeholder
        # token and surfacing an opaque authorization error from the API.
        raise RuntimeError(
            "The API_TOKEN_GITHUB environment variable must be set to a "
            "GitHub API token."
        )
    owner = "catalyst-cooperative"
    repo = "pudl"

    biweekly_metrics = [
        Metric("clones", "clones"),
        Metric("popular/paths", "popular_paths"),
        Metric("popular/referrers", "popular_referrers"),
        Metric("views", "views"),
    ]
    persistent_metrics = [Metric("stargazers", "stargazers"), Metric("forks", "forks")]

    for metric in biweekly_metrics:
        metric_data = get_biweekly_metrics(owner, repo, token, metric.name)
        upload_to_bucket(metric_data, metric)

    for metric in persistent_metrics:
        metric_data = get_persistent_metrics(owner, repo, token, metric.name)
        upload_to_bucket(metric_data, metric)
|
||
|
||
if __name__ == "__main__":
    # Run the archive job and hand its return value to the interpreter as the
    # exit status.
    exit_status = save_metrics()
    sys.exit(exit_status)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,47 @@ | ||
"""This script pull Kaggle traffic metrics and saves them to a GC Bucket.""" | ||
|
||
import json | ||
import logging | ||
import sys | ||
from datetime import date | ||
|
||
from google.cloud import storage | ||
from kaggle.api.kaggle_api_extended import KaggleApi | ||
|
||
# Configure root logging at INFO level; the script logs through the root logger.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger()
|
||
|
||
def get_kaggle_logs() -> str:
    """Get PUDL project usage metadata from Kaggle site.

    Returns:
        The dataset metadata, with a ``metrics_date`` entry holding today's
        date added, serialized as JSON text.
    """
    api = KaggleApi()
    # A freshly constructed client carries no credentials; authenticate it
    # before calling the API, as the Kaggle client usage documents.
    api.authenticate()
    kaggle_owner = "catalystcooperative"
    kaggle_dataset = "pudl-project"

    metadata = api.metadata_get(kaggle_owner, kaggle_dataset)
    # Stamp the snapshot with the day it was taken.
    metadata.update({"metrics_date": date.today().strftime("%Y-%m-%d")})
    return json.dumps(metadata)
|
||
|
||
def upload_to_bucket(data):
    """Write the given data to a blob named for today in the archive bucket.

    Args:
        data: The content to upload; handed to ``upload_from_string``.
    """
    archive_bucket = storage.Client().bucket(
        "pudl-usage-metrics-archives.catalyst.coop"
    )
    blob_name = f"kaggle/{date.today().strftime('%Y-%m-%d')}.json"

    archive_bucket.blob(blob_name).upload_from_string(data)

    logger.info(f"Uploaded today's data to {blob_name}.")
|
||
|
||
def save_metrics():
    """Save Kaggle dataset metadata to google cloud bucket."""
    kaggle_metrics = get_kaggle_logs()
    upload_to_bucket(kaggle_metrics)
|
||
|
||
if __name__ == "__main__":
    # Run the archive job and hand its return value to the interpreter as the
    # exit status.
    exit_status = save_metrics()
    sys.exit(exit_status)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -19,6 +19,7 @@ passenv = | |
GOOGLE_* | ||
GCLOUD_* | ||
GCP_* | ||
KAGGLE_* | ||
HOME | ||
SQLALCHEMY_WARN_20 | ||
IPINFO_TOKEN | ||
|