diff --git a/.github/workflows/run.yml b/.github/workflows/run.yml index 967bcee..d5e69cf 100644 --- a/.github/workflows/run.yml +++ b/.github/workflows/run.yml @@ -11,6 +11,7 @@ jobs: runs-on: ubuntu-latest env: SLACK_WEBHOOK_URL: ${{ secrets.SLACK_WEBHOOK_URL }} + DBGAP_STUDY_ID: phs002371.v5.p1 steps: - name: Checkout code @@ -19,7 +20,7 @@ jobs: - name: Set up Python uses: actions/setup-python@v2 with: - python-version: 3.8 + python-version: 3.11 - name: Install dependencies run: | diff --git a/dbgapmonitor.py b/dbgapmonitor.py index e62f76c..6333dc5 100644 --- a/dbgapmonitor.py +++ b/dbgapmonitor.py @@ -1,50 +1,70 @@ -import pandas as pd -import polars as pl +#!/usr/bin/env python + +""" +File: dbgapmonitor.py +Author: Adam J. Taylor +Date: 2024-05-05 +Description: A Python script to monitor dbGaP Authorized Requestors and send a message to Slack. +""" + from datetime import datetime, timedelta import io -import requests import json import os -# Download the tab-separated text file -url = "https://www.ncbi.nlm.nih.gov/projects/gap/cgi-bin/GetAuthorizedRequestDownload.cgi?study_id=phs002371.v5.p1" -response = requests.get(url) +import pandas as pd +import polars as pl +import requests -# Read the CSV file without header and with arbitrary column names -df = ( - pl.read_csv( - # "dbGaPAuthorizedRequests.phs002371.v1.p1.tab-delimited.txt", - io.StringIO(response.text), - separator="\t", - truncate_ragged_lines=True, - try_parse_dates=True, - ) - .rename({"Cloud Service AdministratorData stewardRequestor": "Requestor"}) - .with_columns(pl.col("Date of approval").str.to_date("%b%d, %Y")) - .sort("Date of approval", descending=True) -) +def get_dbgap_requestors(phs_id): + """ + Retrieves the list of dbGaP Authorized Requestors for a given study ID. 
-# Strip extra whitespace from the columns -df = df.with_columns( - pl.col("Requestor").str.strip_chars(), - pl.col("Affiliation").str.strip_chars(), - pl.col("Project").str.strip_chars(), -) + Args: + phs_id (str): The study ID for which the requestors are to be retrieved. + Returns: + polars.DataFrame: A DataFrame containing the dbGaP Authorized Requestors. + """ + # Download the tab-separated text file + url = f"https://www.ncbi.nlm.nih.gov/projects/gap/cgi-bin/GetAuthorizedRequestDownload.cgi?study_id={phs_id}" + response = requests.get(url) -# Filter for those approved in the last month -# Get today's date -today = datetime.today() + # Read the tab-separated response, taking column names from its header row + # Polars is used here as it simplified loading a rather non-standard TSV file + df = ( + pl.read_csv( + io.StringIO(response.text), + separator="\t", + truncate_ragged_lines=True, + try_parse_dates=True, + ) + .rename({"Cloud Service AdministratorData stewardRequestor": "Requestor"}) + .with_columns(pl.col("Date of approval").str.to_date("%b%d, %Y")) + .sort("Date of approval", descending=True) + ) -# Calculate the date from 30 days ago -last_month = today - timedelta(days=7) -df_recent = df.filter(pl.col("Date of approval") > last_month) + # Strip extra whitespace from the columns + df = df.with_columns( + pl.col("Requestor").str.strip_chars(), + pl.col("Affiliation").str.strip_chars(), + pl.col("Project").str.strip_chars(), + ) -print(df_recent) + return df def dataframe_to_slack_block_with_md_links(df): + """ + Converts a pandas DataFrame to a Slack message block with markdown links. + + Args: + df (pandas.DataFrame): The DataFrame containing the data to be converted. + + Returns: + dict: A dictionary representing the Slack message block with markdown links. 
+ """ blocks = [ { "type": "section", @@ -62,6 +82,19 @@ def dataframe_to_slack_block_with_md_links(df): def send_message_to_slack_blocks(webhook_url, blocks): + """ + Sends a message to Slack using the provided webhook URL and blocks. + + Args: + webhook_url (str): The URL of the Slack webhook. + blocks (dict): The message payload containing the blocks to be sent. + + Raises: + ValueError: If the request to Slack returns an error. + + Returns: + None + """ headers = {"Content-Type": "application/json"} data = json.dumps(blocks) response = requests.post(webhook_url, headers=headers, data=data) @@ -71,24 +104,48 @@ def send_message_to_slack_blocks(webhook_url, blocks): ) -if df_recent.to_pandas().empty: - # If no modified entities are found, prepare a simple message for Slack - slack_message_blocks = { - "blocks": [ - { - "type": "section", - "text": { - "type": "mrkdwn", - "text": "No new dbGaP Authorized Requestors added in the last 7 days", - }, - } - ] - } -else: - # If there are modified entities, format the message as before - slack_message_blocks = dataframe_to_slack_block_with_md_links(df_recent.to_pandas()) - -# Usage -# Get the webhook URL from a env variable called SLACK_WEBHOOK_URL -webhook_url = os.getenv("SLACK_WEBHOOK_URL") -send_message_to_slack_blocks(webhook_url, slack_message_blocks) +def main(): + + # Get the webhook URL from an env variable called SLACK_WEBHOOK_URL + webhook_url = os.getenv("SLACK_WEBHOOK_URL") + + # Get the study ID from an environment variable + phs_id = os.getenv("DBGAP_STUDY_ID") + + # Declare the number of days to look back + lookback_days = 7 + + # Get the dbGaP Authorized Requestors for the study ID + df = get_dbgap_requestors(phs_id) + + # Filter for those approved in the last n days + today = datetime.today() + start_date = today - timedelta(days=lookback_days) + df_recent = df.filter(pl.col("Date of approval") > start_date) + + # Prepare the slack message blocks + if df_recent.to_pandas().empty: + # If no modified entities are 
found, prepare a simple message for Slack + slack_message_blocks = { + "blocks": [ + { + "type": "section", + "text": { + "type": "mrkdwn", + "text": f"No new dbGaP Authorized Requestors added in the last {lookback_days} days", + }, + } + ] + } + else: + # If there are modified entities, format the message as before + slack_message_blocks = dataframe_to_slack_block_with_md_links( + df_recent.to_pandas() + ) + + # Send the message to Slack + send_message_to_slack_blocks(webhook_url, slack_message_blocks) + + +if __name__ == "__main__": + main()