From 5828f9c765bfbc9508d56b517e3deae5abd3e8a5 Mon Sep 17 00:00:00 2001
From: Jon Massey
Date: Fri, 10 May 2024 17:03:37 +0100
Subject: [PATCH 1/4] Add GitHub API query, dataclass, and test

Define a Codespace dataclass containing the required fields (see
discussion in
https://github.com/opensafely-core/codespaces-initiative/issues/42).

Rather than use an instance of the existing Repo dataclass to store repo
data, we store just the repo name as a string: the name is all we need,
and the API only returns a minimal amount of repo data anyway. This is
hopefully less confusing than modifying the Repo class or populating the
extra fields that class requires with dummy data.

An additional PAT is required to query codespaces for the opensafely
GitHub organisation. Any future querying of codespaces for other
organisations will require similarly permissioned PATs.

The organisation codespaces endpoint is queried and the returned data is
passed unmodified to the Codespace dataclass's from_dict() method, which
does the required data conversion. This follows the pattern established
for the other domain dataclasses.
---
 INSTALL.md                          |  7 +++++++
 dotenv-sample                       |  1 +
 metrics/github/client.py            |  8 ++++++++
 metrics/github/github.py            | 26 ++++++++++++++++++++++++++
 metrics/github/query.py             |  5 +++++
 tests/metrics/github/test_github.py | 17 +++++++++++++++++
 6 files changed, 64 insertions(+)

diff --git a/INSTALL.md b/INSTALL.md
index a80e61dd..761a8798 100644
--- a/INSTALL.md
+++ b/INSTALL.md
@@ -9,6 +9,7 @@ dokku$ dokku git:set metrics deploy-branch main
 ```bash
 dokku config:set metrics GITHUB_EBMDATALAB_TOKEN='xxx'
 dokku config:set metrics GITHUB_OS_CORE_TOKEN='xxx'
+dokku config:set metrics GITHUB_OS_TOKEN='xxx'
 dokku config:set metrics SLACK_SIGNING_SECRET='xxx'
 dokku config:set metrics SLACK_TECH_SUPPORT_CHANNEL_ID='xxx'
 dokku config:set metrics SLACK_TOKEN='xxx'
@@ -22,6 +23,12 @@ Each token is assigned to a single organisation and should have the following *r
 * *all repositories* owned by the organisation with the following permissions:
 Code scanning alerts, Dependabot alerts, Metadata, Pull requests and Repository security advisories
 
+The `GITHUB_OS_TOKEN` is a fine-grained GitHub personal access token used for authenticating with the GitHub REST API.
+It is assigned to a single organisation and should have the following *read-only* permissions:
+* organisation permissions: Codespaces
+* *all repositories* owned by the organisation with the following permissions:
+Codespaces and Metadata
+
 ## Disable checks
 
 Dokku performs health checks on apps during deploy by sending requests to port 80. This tool isn't a web app so it can't accept requests on a port.
diff --git a/dotenv-sample b/dotenv-sample
index bba73f88..7032b7ba 100644
--- a/dotenv-sample
+++ b/dotenv-sample
@@ -4,6 +4,7 @@ TIMESCALEDB_URL=postgresql://user:pass@localhost:5433/metrics
 # API tokens for pulling data from Github
 GITHUB_EBMDATALAB_TOKEN=
 GITHUB_OS_CORE_TOKEN=
+GITHUB_OS_TOKEN=
 
 # Slack API access credentials.
 # The slack app used for this will need the following OAuth scopes:
diff --git a/metrics/github/client.py b/metrics/github/client.py
index ce3512c8..a0d18556 100644
--- a/metrics/github/client.py
+++ b/metrics/github/client.py
@@ -52,6 +52,14 @@ def rest_query(self, path, **variables):
         data = response.json()
         if isinstance(data, list):
             yield from data
+
+        # Unlike the team repositories and team members endpoints,
+        # which return arrays of the objects we're interested in,
+        # the codespaces endpoint returns an object.
+        # This object has a "codespaces" key,
+        # whose value is an array of the objects we're interested in.
+        elif "codespaces" in path and isinstance(data, dict):
+            yield from data["codespaces"]
         else:
             raise RuntimeError("Unexpected response format:", data)
diff --git a/metrics/github/github.py b/metrics/github/github.py
index 50d4f5e6..18615a1b 100644
--- a/metrics/github/github.py
+++ b/metrics/github/github.py
@@ -108,6 +108,32 @@ def from_dict(cls, data, repo):
         )
 
 
+@dataclass(frozen=True)
+class Codespace:
+    org: str
+    repo_name: str
+    user: str
+    created_at: datetime.datetime
+    last_used_at: datetime.datetime
+
+    @classmethod
+    def from_dict(cls, data, org):
+        return cls(
+            org=org,
+            repo_name=data["repository"]["name"],
+            user=data["owner"]["login"],
+            created_at=data["created_at"],
+            last_used_at=data["last_used_at"],
+        )
+
+
+def codespaces(org):
+    return [
+        Codespace.from_dict(data=codespace, org=org)
+        for codespace in query.codespaces(org)
+    ]
+
+
 def tech_prs():
     tech_team_members = _tech_team_members()
     return [
diff --git a/metrics/github/query.py b/metrics/github/query.py
index e3a69f51..be441d21 100644
--- a/metrics/github/query.py
+++ b/metrics/github/query.py
@@ -141,11 +141,16 @@ def issues(org, repo):
     )
 
 
+def codespaces(org):
+    yield from _client().rest_query("/orgs/{org}/codespaces", org=org)
+
+
 def _client():
     return GitHubClient(
         tokens={
             "ebmdatalab": os.environ["GITHUB_EBMDATALAB_TOKEN"],
             "opensafely-core": os.environ["GITHUB_OS_CORE_TOKEN"],
+            "opensafely": os.environ["GITHUB_OS_TOKEN"],
         }
     )
diff --git a/tests/metrics/github/test_github.py b/tests/metrics/github/test_github.py
index 6fe76fe7..44ef9225 100644
--- a/tests/metrics/github/test_github.py
+++ b/tests/metrics/github/test_github.py
@@ -35,6 +35,23 @@ def fake(*keys):
     return patch
 
 
+def test_codespaces(patch):
+    patch(
+        "codespaces",
+        {
+            "opensafely": [
+                {
+                    "owner": {"login": "testuser"},
+                    "repository": {"name": "testrepo"},
+                    "created_at": datetime.datetime.now().isoformat(),
+                    "last_used_at": datetime.datetime.now().isoformat(),
+                },
+            ]
+        },
+    )
+    assert len(github.codespaces("opensafely")) == 1
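
To make the new branch in rest_query() concrete, here is a minimal standalone sketch (not part of the patch) of the two response shapes it now handles. The payloads are invented; the field names follow GitHub's documented responses, where the organisation codespaces endpoint wraps its results in an object with "total_count" and "codespaces" keys rather than returning a bare array.

```python
# Sketch of the dispatch logic added to rest_query() above.
def items(path, data):
    if isinstance(data, list):
        yield from data
    elif "codespaces" in path and isinstance(data, dict):
        yield from data["codespaces"]
    else:
        raise RuntimeError("Unexpected response format:", data)


# e.g. the team repos endpoint returns a bare array...
team_repos = [{"name": "repo-one"}, {"name": "repo-two"}]
# ...while the org codespaces endpoint wraps its array in an object.
codespaces = {"total_count": 1, "codespaces": [{"owner": {"login": "some-user"}}]}

assert list(items("/orgs/acme/teams/tech/repos", team_repos)) == team_repos
assert list(items("/orgs/acme/codespaces", codespaces)) == codespaces["codespaces"]
```
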
From ee51522932100108d82420abb9022f7096d8c1b5 Mon Sep 17 00:00:00 2001
From: Jon Massey
Date: Wed, 15 May 2024 15:56:15 +0100
Subject: [PATCH 2/4] Define Codespaces database table

Use a broad composite key to ensure uniqueness and stability over time.
The candidate "id" fields in the Codespaces API are not used because
they are only minimally documented by GitHub and thus untrustworthy.
A large composite primary key does carry performance penalties, but for
the expected data volume these are not expected to be significant.
---
 metrics/timescaledb/tables.py | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/metrics/timescaledb/tables.py b/metrics/timescaledb/tables.py
index f1fbb448..f7c59363 100644
--- a/metrics/timescaledb/tables.py
+++ b/metrics/timescaledb/tables.py
@@ -4,6 +4,16 @@
 metadata = MetaData()
 
 
+GitHubCodespaces = Table(
+    "github_codespaces",
+    metadata,
+    Column("created_at", TIMESTAMP(timezone=True), primary_key=True),
+    Column("organisation", Text, primary_key=True),
+    Column("repo", Text, primary_key=True),
+    Column("user", Text, primary_key=True),
+    Column("last_used_at", TIMESTAMP(timezone=True)),
+)
+
 GitHubRepos = Table(
     "github_repos",
     metadata,
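
To make the composite primary key concrete, here is a standalone sketch (not part of the patch, and assuming SQLAlchemy 2.x with its PostgreSQL dialect available) that prints the DDL generated for the table above:

```python
from sqlalchemy import TIMESTAMP, Column, MetaData, Table, Text
from sqlalchemy.dialects import postgresql
from sqlalchemy.schema import CreateTable

metadata = MetaData()
github_codespaces = Table(
    "github_codespaces",
    metadata,
    Column("created_at", TIMESTAMP(timezone=True), primary_key=True),
    Column("organisation", Text, primary_key=True),
    Column("repo", Text, primary_key=True),
    Column("user", Text, primary_key=True),
    Column("last_used_at", TIMESTAMP(timezone=True)),
)

# Compile the CREATE TABLE statement for PostgreSQL and print it.
print(CreateTable(github_codespaces).compile(dialect=postgresql.dialect()))
# Roughly:
#   CREATE TABLE github_codespaces (
#       created_at TIMESTAMP WITH TIME ZONE NOT NULL,
#       organisation TEXT NOT NULL,
#       repo TEXT NOT NULL,
#       "user" TEXT NOT NULL,
#       last_used_at TIMESTAMP WITH TIME ZONE,
#       PRIMARY KEY (created_at, organisation, repo, "user")
#   )
```

Note that SQLAlchemy quotes the "user" column because user is a reserved word in PostgreSQL, and that every primary-key column is implicitly NOT NULL.
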
From b7051a4ce9e6248ff5fceb32a3d930e8bb9b20c4 Mon Sep 17 00:00:00 2001
From: Jon Massey
Date: Wed, 15 May 2024 17:03:04 +0100
Subject: [PATCH 3/4] Support upserts in database

This is required because a codespace's state changes over time and its
data disappears from the API once it is deleted, so we are unable to do
a full refresh of the data each time.

Uses the PostgreSQL "INSERT ... ON CONFLICT ... DO UPDATE" style, as the
newer "MERGE" statement is not yet supported by SQLAlchemy.
---
 metrics/timescaledb/db.py            | 44 ++++++++++++++++++++++++++--
 tests/metrics/timescaledb/test_db.py | 26 ++++++++++++++++
 2 files changed, 68 insertions(+), 2 deletions(-)

diff --git a/metrics/timescaledb/db.py b/metrics/timescaledb/db.py
index 489ccd82..33095508 100644
--- a/metrics/timescaledb/db.py
+++ b/metrics/timescaledb/db.py
@@ -19,8 +19,7 @@ def reset_table(table, batch_size=None):
 
 
 def write(table, rows):
-    max_params = 65535  # limit for postgresql
-    batch_size = max_params // len(table.columns)
+    batch_size = _batch_size(table)
 
     with _get_engine().begin() as connection:
         for values in batched(rows, batch_size):
@@ -28,6 +27,47 @@ def write(table, rows):
             log.info("Inserted %s rows", len(values), table=table.name)
 
 
+def upsert(table, rows):
+    _ensure_table(table)
+    batch_size = _batch_size(table)
+    non_pk_columns = set(table.columns) - set(table.primary_key.columns)
+
+    # Use the primary key constraint to match rows to be updated, falling
+    # back to the default constraint name if not otherwise specified.
+    constraint = table.primary_key.name or table.name + "_pkey"
+
+    with _get_engine().begin() as connection:
+        for values in batched(rows, batch_size):
+            # Construct a PostgreSQL "INSERT..ON CONFLICT" style upsert statement
+            # https://docs.sqlalchemy.org/en/20/dialects/postgresql.html#insert-on-conflict-upsert
+
+            # "Vanilla" statement to start; we need this to be able to derive
+            # the "excluded" columns in the values, which we use to update
+            # the target table in case of conflict on the constraint.
+            insert_stmt = insert(table).values(values)
+
+            # This dict dictates which columns in the target table are updated
+            # (the non-PK columns) and the values with which they are updated.
+            update_set_clause = {
+                c: insert_stmt.excluded[c.name] for c in non_pk_columns
+            }
+
+            # Extend the insert statement to include checking for row conflicts
+            # using the primary key constraint and telling the database to
+            # update the conflicting rows according to the SET clause.
+            insert_stmt = insert_stmt.on_conflict_do_update(
+                constraint=constraint,
+                set_=update_set_clause,
+            )
+            connection.execute(insert_stmt)
+            log.info("Inserted %s rows", len(values), table=table.name)
+
+
+def _batch_size(table):
+    max_params = 65535  # limit for postgresql
+    return max_params // len(table.columns)
+
+
 def _drop_table(table, batch_size):
     with _get_engine().begin() as connection:
         log.debug("Removing table: %s", table.name)
diff --git a/tests/metrics/timescaledb/test_db.py b/tests/metrics/timescaledb/test_db.py
index e3eec767..58f17e85 100644
--- a/tests/metrics/timescaledb/test_db.py
+++ b/tests/metrics/timescaledb/test_db.py
@@ -164,3 +164,29 @@ def test_write(engine, table):
     # check rows are in table
     rows = get_rows(engine, table)
     assert len(rows) == 3
+
+
+def test_upsert(engine, table):
+    # add a non-PK column to the table
+    table.append_column(Column("value2", Text))
+
+    # insert initial rows
+    rows = [{"value": i, "value2": "a"} for i in range(1, 4)]
+    db.upsert(table, rows)
+
+    # second batch of rows, some with a preexisting "value", some new,
+    # all with a different "value2"
+    rows = [{"value": i, "value2": "b"} for i in range(3, 6)]
+    db.upsert(table, rows)
+
+    # check all rows are in table
+    rows = get_rows(engine, table)
+    assert len(rows) == 5
+
+    # check upsert leaves unmatched rows 1-2 intact
+    original_rows = [r for r in rows if int(r[0]) < 3]
+    assert original_rows == [("1", "a"), ("2", "a")]
+
+    # check upsert modifies matched row 3 and adds new rows 4-5
+    modified_rows = [r for r in rows if int(r[0]) >= 3]
+    assert modified_rows == [("3", "b"), ("4", "b"), ("5", "b")]
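
For reference, here is a standalone sketch of the statement upsert() builds (not part of the patch; it assumes SQLAlchemy 2.x and uses an invented two-column table with a single row so the compiled SQL stays readable):

```python
from sqlalchemy import Column, Integer, MetaData, Table, Text
from sqlalchemy.dialects import postgresql
from sqlalchemy.dialects.postgresql import insert

metadata = MetaData()
example = Table(
    "example",
    metadata,
    Column("id", Integer, primary_key=True),
    Column("value", Text),
)

# Start with a vanilla INSERT so we can reference its "excluded" columns.
insert_stmt = insert(example).values({"id": 1, "value": "a"})
non_pk_columns = set(example.columns) - set(example.primary_key.columns)
insert_stmt = insert_stmt.on_conflict_do_update(
    constraint=example.primary_key.name or example.name + "_pkey",
    set_={c: insert_stmt.excluded[c.name] for c in non_pk_columns},
)

print(insert_stmt.compile(dialect=postgresql.dialect()))
# Roughly:
#   INSERT INTO example (id, value) VALUES (%(id)s, %(value)s)
#   ON CONFLICT ON CONSTRAINT example_pkey
#   DO UPDATE SET value = excluded.value
```
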
From 3b7b084419921803b7e0d59129dbeb2e457eedaf Mon Sep 17 00:00:00 2001
From: Jon Massey
Date: Fri, 17 May 2024 11:54:53 +0100
Subject: [PATCH 4/4] Add task to fetch and store codespaces metrics

Because the codespaces data will change over time and deleted
codespaces will disappear from the API, we do not drop and recreate the
table each time the task is run, as the other tasks do. Instead,
calling the upsert() method ensures the table exists and then merges
the new data with the existing data.

The conversion function added to metrics.py is not a metric in the
usual sense of the word; it simply renames some fields to match the
database schema. This is necessary because the domain dataclasses and
the database tables use different naming conventions.
---
 metrics/github/metrics.py   | 13 +++++++++++++
 metrics/tasks/codespaces.py | 24 ++++++++++++++++++++++++
 2 files changed, 37 insertions(+)
 create mode 100644 metrics/tasks/codespaces.py

diff --git a/metrics/github/metrics.py b/metrics/github/metrics.py
index 569ab2e1..c8f5fd61 100644
--- a/metrics/github/metrics.py
+++ b/metrics/github/metrics.py
@@ -75,3 +75,16 @@ def convert_issue_counts_to_metrics(counts):
         }
     )
     return metrics
+
+
+def convert_codespaces_to_dicts(codespaces):
+    return [
+        {
+            "organisation": c.org,
+            "repo": c.repo_name,
+            "user": c.user,
+            "created_at": c.created_at,
+            "last_used_at": c.last_used_at,
+        }
+        for c in codespaces
+    ]
diff --git a/metrics/tasks/codespaces.py b/metrics/tasks/codespaces.py
new file mode 100644
index 00000000..38159fb7
--- /dev/null
+++ b/metrics/tasks/codespaces.py
@@ -0,0 +1,24 @@
+import sys
+
+import structlog
+
+import metrics.github.github as github
+from metrics.github.metrics import convert_codespaces_to_dicts
+from metrics.timescaledb import db, tables
+
+
+log = structlog.get_logger()
+
+
+def main():
+    log.info("Getting codespaces")
+    codespaces = github.codespaces(org="opensafely")
+    log.info(f"Got {len(codespaces)} codespaces")
+
+    log.info("Writing data")
+    db.upsert(tables.GitHubCodespaces, convert_codespaces_to_dicts(codespaces))
+    log.info("Written data")
+
+
+if __name__ == "__main__":
+    sys.exit(main())
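
A possible smoke test for the new task, with the GitHub and database calls stubbed out (not part of the patch; it assumes the module layout above and uses invented codespace data, so no credentials or running TimescaleDB are needed):

```python
from unittest import mock

import metrics.tasks.codespaces as task
from metrics.github.github import Codespace

# Invented data; the field values are illustrative only.
fake = [
    Codespace(
        org="opensafely",
        repo_name="some-repo",
        user="some-user",
        created_at="2024-05-01T09:00:00Z",
        last_used_at="2024-05-10T16:30:00Z",
    )
]

# Stub out the API query and the database write, then run the task.
with mock.patch("metrics.github.github.codespaces", return_value=fake):
    with mock.patch("metrics.timescaledb.db.upsert") as upsert:
        task.main()

# The task should have upserted the converted rows into the right table.
table, rows = upsert.call_args.args
assert table.name == "github_codespaces"
assert rows[0]["repo"] == "some-repo"
```
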