diff --git a/.github/workflows/save_daily_metrics.yml b/.github/workflows/save_daily_metrics.yml new file mode 100644 index 0000000..1b3f044 --- /dev/null +++ b/.github/workflows/save_daily_metrics.yml @@ -0,0 +1,86 @@ +name: save-daily-metrics +on: + workflow_dispatch: + schedule: + # Run every day at 8:00 PM UTC + # https://crontab.guru/#0_20_*_*_* + - cron: "0 20 * * *" + +jobs: + build: + runs-on: ubuntu-latest + permissions: + contents: read + id-token: write + steps: + - uses: actions/checkout@v4 + with: + fetch-depth: 2 + + - name: Set up conda environment for testing + uses: mamba-org/setup-micromamba@v1 + with: + environment-file: environment.yml + cache-environment: true + condarc: | + channels: + - conda-forge + - defaults + channel_priority: strict + + - name: Log conda environment information + run: | + conda info + conda list + conda config --show-sources + conda config --show + printenv | sort + + - name: Authenticate gcloud + id: gcloud-auth + continue-on-error: true + uses: "google-github-actions/auth@v2" + with: + workload_identity_provider: "projects/345950277072/locations/global/workloadIdentityPools/gh-actions-pool/providers/gh-actions-provider" + service_account: "pudl-usage-metrics-etl@catalyst-cooperative-pudl.iam.gserviceaccount.com" + create_credentials_file: true + + - name: "Set up Cloud SDK" + uses: "google-github-actions/setup-gcloud@v2" + with: + version: ">= 363.0.0" + + - shell: bash -l {0} + name: Save Github Metrics + env: + API_TOKEN_GITHUB: ${{ secrets.API_TOKEN_GITHUB }} + run: | + python src/usage_metrics/scripts/save_github_metrics.py + + - shell: bash -l {0} + name: Save Kaggle Metrics + env: + KAGGLE_KEY: ${{ secrets.KAGGLE_KEY }} + KAGGLE_USERNAME: ${{ secrets.KAGGLE_USERNAME }} + run: | + python src/usage_metrics/scripts/save_kaggle_metrics.py + + - name: Inform the Codemonkeys + uses: 8398a7/action-slack@v3 + with: + status: custom + fields: workflow,job,commit,repo,ref,author,took + custom_payload: | + { + username:
'action-slack', + icon_emoji: ':octocat:', + attachments: [{ + color: '${{ job.status }}' === 'success' ? 'good' : '${{ job.status }}' === 'failure' ? 'danger' : 'warning', + text: `${process.env.AS_WORKFLOW}\n${process.env.AS_JOB} (${process.env.AS_COMMIT}) of ${process.env.AS_REPO}@${process.env.AS_REF} by ${process.env.AS_AUTHOR} ${{ job.status }} in ${process.env.AS_TOOK}`, + }] + } + env: + GITHUB_TOKEN: ${{ github.token }} # required + SLACK_WEBHOOK_URL: ${{ secrets.SLACK_WEBHOOK_URL }} # required + MATRIX_CONTEXT: ${{ toJson(matrix) }} # required + if: always() # Pick up events even if the job fails or is canceled. diff --git a/.github/workflows/tox-pytest.yml b/.github/workflows/tox-pytest.yml index 539c7ed..d6bcf79 --- a/.github/workflows/tox-pytest.yml +++ b/.github/workflows/tox-pytest.yml @@ -61,6 +61,8 @@ jobs: POSTGRES_PASSWORD: ${{ secrets.POSTGRES_PASSWORD }} POSTGRES_DB: ${{ secrets.POSTGRES_DB }} POSTGRES_PORT: ${{ secrets.POSTGRES_PORT }} + KAGGLE_KEY: ${{ secrets.KAGGLE_KEY }} + KAGGLE_USERNAME: ${{ secrets.KAGGLE_USERNAME }} run: | tox diff --git a/README.md b/README.md index 13058a2..004a99d 100644 --- a/README.md +++ b/README.md @@ -34,6 +34,8 @@ If you want to take advantage of caching raw logs, rather than redownloading the Dagster stores run logs and caches in a directory stored in the `DAGSTER_HOME` environment variable. The `usage_metrics/dagster_home/dagster.yaml` file contains configuration for the dagster instance. **Note:** The `usage_metrics/dagster_home/storage` directory could grow to become a couple GBs because all op outputs for every run are stored there. You can read more about the dagster_home directory in the [dagster docs](https://docs.dagster.io/deployment/dagster-instance#default-local-behavior). +To use the Kaggle API, [sign up for a Kaggle account](https://www.kaggle.com). Then go to the ['Account' tab](https://www.kaggle.com/account) of your user profile and select 'Create API Token'.
This will trigger the download of `kaggle.json`, a file containing your API credentials. Use this file to automatically set your Kaggle API credentials locally, or manually set `KAGGLE_USERNAME` and `KAGGLE_KEY` environment variables. + To set these environment variables, run these commands: ``` @@ -41,6 +43,8 @@ conda activate pudl-usage-metrics conda env config vars set IPINFO_TOKEN="{your_api_key_here}" conda env config vars set DAGSTER_HOME="$(pwd)/dagster_home/" conda env config vars set DATA_DIR="$(pwd)/data/" +conda env config vars set KAGGLE_USERNAME="{your_kaggle_username_here}" # If setting manually +conda env config vars set KAGGLE_KEY="{your_kaggle_api_key_here}" # If setting manually conda activate pudl-usage-metrics ``` diff --git a/pyproject.toml b/pyproject.toml index 10c1d3f..f2499e3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -19,6 +19,7 @@ dependencies = [ "pg8000>=1.31.1", "cloud-sql-python-connector[pg8000]>=1.11.0", "google-cloud-storage>=2.17", + "kaggle>=1.6.3" ] classifiers = [ diff --git a/src/usage_metrics/scripts/__init__.py b/src/usage_metrics/scripts/__init__.py new file mode 100644 index 0000000..da1b7bb --- /dev/null +++ b/src/usage_metrics/scripts/__init__.py @@ -0,0 +1,3 @@ +"""Module contains assets that extract raw data.""" + +from .
import save_github_metrics, save_kaggle_metrics diff --git a/src/usage_metrics/scripts/save_github_metrics.py b/src/usage_metrics/scripts/save_github_metrics.py new file mode 100644 index 0000000..2902641 --- /dev/null +++ b/src/usage_metrics/scripts/save_github_metrics.py @@ -0,0 +1,137 @@ +"""This script pulls github traffic metrics and saves them to a GC Bucket.""" + +import json +import logging +import os +import sys +import time +from dataclasses import dataclass +from datetime import date + +import requests +from google.cloud import storage +from requests.exceptions import HTTPError + +logger = logging.getLogger() +logging.basicConfig(level="INFO") + + +@dataclass +class Metric: + """Format metrics into folder names.""" + + name: str + folder: str + + +def get_biweekly_metrics(owner: str, repo: str, token: str, metric: str) -> str: + """Get json data for a biweekly github metric. + + Args: + metric (str): The github metric name. + + Returns: + json (str): The metric data as json text. + """ + query_url = f"https://api.github.com/repos/{owner}/{repo}/traffic/{metric}" + headers = { + "Authorization": f"token {token}", + "Accept": "application/vnd.github.v3+json", + } + + response = make_github_request(query_url, headers) + return json.dumps(response.json()) + + +def get_persistent_metrics(owner: str, repo: str, token: str, metric: str) -> str: + """Get github's persistent metrics: forks and stargazers. + + Args: + metric (str): the metric to retrieve (forks | stargazers) + + Returns: + json (str): A json string of metrics.
+ """ + query_url = f"https://api.github.com/repos/{owner}/{repo}/{metric}" + headers = { + "Authorization": f"token {token}", + "Accept": "application/vnd.github.v3.star+json", + } + + metrics = [] + page = 1 + + timeout = 600 # Set 10 minute timeout + timeout_start = time.time() + + while time.time() < timeout_start + timeout: + params = {"page": page} + metrics_json = make_github_request(query_url, headers, params).json() + + if len(metrics_json) <= 0: + break + metrics += metrics_json + page += 1 + return json.dumps(metrics) + + +def make_github_request(query: str, headers: str, params: str = None): + """Makes a request to the github api. + + Args: + query (str): A github api request url. + headers (str): Header to include in the request. + params (str): Params of request. + + Returns: + response (requests.models.Response): the request response. + """ + try: + response = requests.get(query, headers=headers, params=params, timeout=100) + + response.raise_for_status() + except HTTPError as http_err: + raise HTTPError( + f"HTTP error occurred: {http_err}\n\tResponse text: {response.text}" + ) + return response + + +def upload_to_bucket(data, metric): + """Upload a gcp object.""" + bucket_name = "pudl-usage-metrics-archives.catalyst.coop" + storage_client = storage.Client() + bucket = storage_client.bucket(bucket_name) + blob_name = f"github/{metric.folder}/{date.today().strftime('%Y-%m-%d')}.json" + + blob = bucket.blob(blob_name) + blob.upload_from_string(data) + + logger.info(f"Uploaded {metric.name} data to {blob_name}.") + + +def save_metrics(): + """Save github traffic metrics to google cloud bucket.""" + token = os.getenv("API_TOKEN_GITHUB", "...") + owner = "catalyst-cooperative" + repo = "pudl" + + biweekly_metrics = [ + Metric("clones", "clones"), + Metric("popular/paths", "popular_paths"), + Metric("popular/referrers", "popular_referrers"), + Metric("views", "views"), + ] + persistent_metrics = [Metric("stargazers", "stargazers"), Metric("forks",
"forks")] + + for metric in biweekly_metrics: + metric_data = get_biweekly_metrics(owner, repo, token, metric.name) + upload_to_bucket(metric_data, metric) + + for metric in persistent_metrics: + metric_data = get_persistent_metrics(owner, repo, token, metric.name) + upload_to_bucket(metric_data, metric) + + +if __name__ == "__main__": + sys.exit(save_metrics()) diff --git a/src/usage_metrics/scripts/save_kaggle_metrics.py b/src/usage_metrics/scripts/save_kaggle_metrics.py new file mode 100644 index 0000000..04bfc44 --- /dev/null +++ b/src/usage_metrics/scripts/save_kaggle_metrics.py @@ -0,0 +1,47 @@ +"""This script pulls Kaggle traffic metrics and saves them to a GC Bucket.""" + +import json +import logging +import sys +from datetime import date + +from google.cloud import storage +from kaggle.api.kaggle_api_extended import KaggleApi + +logger = logging.getLogger() +logging.basicConfig(level="INFO") + + +def get_kaggle_logs() -> str: + """Get PUDL project usage metadata from Kaggle site.""" + api = KaggleApi() + kaggle_owner = "catalystcooperative" + kaggle_dataset = "pudl-project" + + metadata = api.metadata_get(kaggle_owner, kaggle_dataset) + metadata.update({"metrics_date": date.today().strftime("%Y-%m-%d")}) + return json.dumps(metadata) + + +def upload_to_bucket(data): + """Upload a gcp object.""" + bucket_name = "pudl-usage-metrics-archives.catalyst.coop" + + storage_client = storage.Client() + bucket = storage_client.bucket(bucket_name) + blob_name = f"kaggle/{date.today().strftime('%Y-%m-%d')}.json" + + blob = bucket.blob(blob_name) + blob.upload_from_string(data) + + logger.info(f"Uploaded today's data to {blob_name}.") + + +def save_metrics(): + """Save Kaggle traffic metrics to google cloud bucket.""" + kaggle_metrics = get_kaggle_logs() + upload_to_bucket(kaggle_metrics) + + +if __name__ == "__main__": + sys.exit(save_metrics()) diff --git a/tox.ini b/tox.ini index 5f027de..f2276e0 100644 --- a/tox.ini +++ b/tox.ini @@ -19,6 +19,7 @@ passenv = GOOGLE_*
GCLOUD_* GCP_* + KAGGLE_* HOME SQLALCHEMY_WARN_20 IPINFO_TOKEN