Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add Codespace usage metrics #173

Merged
merged 4 commits into from
May 31, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions INSTALL.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ dokku$ dokku git:set metrics deploy-branch main
```bash
dokku config:set metrics GITHUB_EBMDATALAB_TOKEN='xxx'
dokku config:set metrics GITHUB_OS_CORE_TOKEN='xxx'
dokku config:set metrics GITHUB_OS_TOKEN='xxx'
dokku config:set metrics SLACK_SIGNING_SECRET='xxx'
dokku config:set metrics SLACK_TECH_SUPPORT_CHANNEL_ID='xxx'
dokku config:set metrics SLACK_TOKEN='xxx'
Expand All @@ -22,6 +23,12 @@ Each token is assigned to a single organisation and should have the following *r
* *all repositories* owned by the organisation with the following permissions:
Code scanning alerts, Dependabot alerts, Metadata, Pull requests and Repository security advisories

The `GITHUB_OS_TOKEN` is a fine-grained GitHub personal access token that is used for authenticating with the GitHub REST API.
It is assigned to a single organisation and should have the following *read-only* permissions:
* organisation permissions: Codespaces
* *all repositories* owned by the organisation with the following permissions:
Codespaces and Metadata

## Disable checks
Dokku performs health checks on apps during deploy by sending requests to port 80.
This tool isn't a web app so it can't accept requests on a port.
Expand Down
1 change: 1 addition & 0 deletions dotenv-sample
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ TIMESCALEDB_URL=postgresql://user:pass@localhost:5433/metrics
# API tokens for pulling data from Github
GITHUB_EBMDATALAB_TOKEN=
GITHUB_OS_CORE_TOKEN=
GITHUB_OS_TOKEN=

# Slack API access credentials.
# The slack app used for this will need the following OAuth scopes:
Expand Down
8 changes: 8 additions & 0 deletions metrics/github/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,14 @@ def rest_query(self, path, **variables):
data = response.json()
if isinstance(data, list):
yield from data

# Unlike the team repositories endpoint or the team members endpoint,
# which return arrays of the objects we're interested in,
# the codespaces endpoint returns an object.
# This object has a codespaces key,
# whose value is an array of the objects we're interested in.
elif "codespaces" in path and isinstance(data, dict):
yield from data["codespaces"]
else:
raise RuntimeError("Unexpected response format:", data)

Expand Down
26 changes: 26 additions & 0 deletions metrics/github/github.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,32 @@ def from_dict(cls, data, repo):
)


@dataclass(frozen=True)
class Codespace:
    """A single GitHub Codespace belonging to an organisation."""

    org: str
    repo_name: str
    user: str
    created_at: datetime.datetime
    last_used_at: datetime.datetime

    @staticmethod
    def _parse_timestamp(value):
        # GitHub's REST API emits ISO-8601 strings like "2024-05-31T12:00:00Z".
        # fromisoformat() on Pythons before 3.11 can't handle the "Z" suffix,
        # so normalise it to an explicit UTC offset first.
        # A codespace that has never been used may have a null last_used_at.
        if value is None:
            return None
        return datetime.datetime.fromisoformat(value.replace("Z", "+00:00"))

    @classmethod
    def from_dict(cls, data, org):
        """Build a Codespace from one element of the REST API's codespaces array.

        Timestamps are parsed into datetime objects so the fields actually
        match their datetime.datetime annotations (previously they were left
        as raw strings).
        """
        return cls(
            org=org,
            repo_name=data["repository"]["name"],
            user=data["owner"]["login"],
            created_at=cls._parse_timestamp(data["created_at"]),
            last_used_at=cls._parse_timestamp(data["last_used_at"]),
        )


def codespaces(org):
    """Return a list of Codespace objects for every codespace in the given org."""
    found = []
    for raw in query.codespaces(org):
        found.append(Codespace.from_dict(data=raw, org=org))
    return found


def tech_prs():
tech_team_members = _tech_team_members()
return [
Expand Down
13 changes: 13 additions & 0 deletions metrics/github/metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,3 +75,16 @@ def convert_issue_counts_to_metrics(counts):
}
)
return metrics


def convert_codespaces_to_dicts(codespaces):
    """Flatten Codespace objects into plain dicts keyed by database column name."""
    rows = []
    for codespace in codespaces:
        rows.append(
            {
                "organisation": codespace.org,
                "repo": codespace.repo_name,
                "user": codespace.user,
                "created_at": codespace.created_at,
                "last_used_at": codespace.last_used_at,
            }
        )
    return rows
5 changes: 5 additions & 0 deletions metrics/github/query.py
Original file line number Diff line number Diff line change
Expand Up @@ -141,11 +141,16 @@ def issues(org, repo):
)


def codespaces(org):
    """Yield raw codespace objects for the given org from the GitHub REST API."""
    client = _client()
    yield from client.rest_query("/orgs/{org}/codespaces", org=org)


def _client():
    """Build a GitHubClient holding one API token per organisation, read from the environment."""
    tokens = {
        "ebmdatalab": os.environ["GITHUB_EBMDATALAB_TOKEN"],
        "opensafely-core": os.environ["GITHUB_OS_CORE_TOKEN"],
        "opensafely": os.environ["GITHUB_OS_TOKEN"],
    }
    return GitHubClient(tokens=tokens)

Expand Down
24 changes: 24 additions & 0 deletions metrics/tasks/codespaces.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
import sys

import structlog

import metrics.github.github as github
from metrics.github.metrics import convert_codespaces_to_dicts
from metrics.timescaledb import db, tables


log = structlog.get_logger()


def main():
    """Fetch Codespace usage for the opensafely org and upsert it into the database."""
    log.info("Getting codespaces")
    codespaces = github.codespaces(org="opensafely")
    log.info(f"Got {len(codespaces)} codespaces")

    log.info("Writing data")
    rows = convert_codespaces_to_dicts(codespaces)
    db.upsert(tables.GitHubCodespaces, rows)
    log.info("Written data")


if __name__ == "__main__":
sys.exit(main())
44 changes: 42 additions & 2 deletions metrics/timescaledb/db.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,15 +19,55 @@ def reset_table(table, batch_size=None):


def write(table, rows):
    """Insert rows into table, batched so each statement stays under PostgreSQL's bind-parameter limit."""
    with _get_engine().begin() as connection:
        for batch in batched(rows, _batch_size(table)):
            connection.execute(insert(table).values(batch))
            log.info("Inserted %s rows", len(batch), table=table.name)


def upsert(table, rows):
    """Batch-insert rows into table, updating existing rows on primary-key conflict.

    Uses a PostgreSQL "INSERT ... ON CONFLICT DO UPDATE" (upsert) statement so
    rows whose primary key already exists are refreshed rather than duplicated.
    https://docs.sqlalchemy.org/en/20/dialects/postgresql.html#insert-on-conflict-upsert
    """
    _ensure_table(table)
    batch_size = _batch_size(table)
    non_pk_columns = set(table.columns) - set(table.primary_key.columns)

    # Use the primary key constraint to match rows to be updated, falling back
    # to PostgreSQL's default constraint name ("<table>_pkey") when the
    # constraint was not explicitly named in the table definition.
    constraint = table.primary_key.name or table.name + "_pkey"

    with _get_engine().begin() as connection:
        for values in batched(rows, batch_size):
            # "Vanilla" statement to start; we need it to be able to derive
            # the "excluded" columns in the values, which we use to update
            # the target table in case of conflict on the constraint.
            insert_stmt = insert(table).values(values)

            # This dict dictates which columns in the target table are updated
            # (the non-PK columns) and the corresponding values with which
            # they are updated.
            update_set_clause = {
                c: insert_stmt.excluded[c.name] for c in non_pk_columns
            }

            # Extend the insert statement to include checking for row
            # conflicts using the primary key constraint and telling the
            # database to update the conflicting rows per the SET clause.
            insert_stmt = insert_stmt.on_conflict_do_update(
                constraint=constraint,
                set_=update_set_clause,
            )
            connection.execute(insert_stmt)
            log.info("Inserted %s rows", len(values), table=table.name)


def _batch_size(table):
max_params = 65535 # limit for postgresql
return max_params // len(table.columns)


def _drop_table(table, batch_size):
with _get_engine().begin() as connection:
log.debug("Removing table: %s", table.name)
Expand Down
10 changes: 10 additions & 0 deletions metrics/timescaledb/tables.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,16 @@
metadata = MetaData()


# One row per Codespace. The compound primary key allows repeated collection
# runs to upsert (refresh) existing rows instead of duplicating them.
GitHubCodespaces = Table(
    "github_codespaces",
    metadata,
    Column("created_at", TIMESTAMP(timezone=True), primary_key=True),
    Column("organisation", Text, primary_key=True),
    Column("repo", Text, primary_key=True),
    Column("user", Text, primary_key=True),
    Column("last_used_at", TIMESTAMP(timezone=True)),
)

GitHubRepos = Table(
"github_repos",
metadata,
Expand Down
17 changes: 17 additions & 0 deletions tests/metrics/github/test_github.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,23 @@ def fake(*keys):
return patch


def test_codespaces(patch):
    """github.codespaces() should map each raw API dict to one Codespace."""
    # Stub the query layer so no real API calls are made.
    patch(
        "codespaces",
        {
            "opensafely": [
                {
                    "owner": {"login": "testuser"},
                    "repository": {"name": "testrepo"},
                    "created_at": datetime.datetime.now().isoformat(),
                    "last_used_at": datetime.datetime.now().isoformat(),
                },
            ]
        },
    )
    assert len(github.codespaces("opensafely")) == 1


def test_includes_tech_owned_repos(patch):
patch(
"team_repos",
Expand Down
26 changes: 26 additions & 0 deletions tests/metrics/timescaledb/test_db.py
Original file line number Diff line number Diff line change
Expand Up @@ -164,3 +164,29 @@ def test_write(engine, table):
# check rows are in table
rows = get_rows(engine, table)
assert len(rows) == 3


def test_upsert(engine, table):
    """Upsert should insert new rows and update existing ones in place."""
    # add a non-PK column to the table
    table.append_column(Column("value2", Text))

    # insert initial rows
    rows = [{"value": i, "value2": "a"} for i in range(1, 4)]
    db.upsert(table, rows)

    # second batch of rows, some with a preexisting "value", some new,
    # all with a different "value2"
    rows = [{"value": i, "value2": "b"} for i in range(3, 6)]
    db.upsert(table, rows)

    # check all rows are in table
    rows = get_rows(engine, table)
    assert len(rows) == 5

    # check upsert leaves unmatched rows 1-2 intact
    original_rows = [r for r in rows if int(r[0]) < 3]
    assert original_rows == [("1", "a"), ("2", "a")]

    # check upsert modifies matched row 3 and inserts new rows 4-5
    modified_rows = [r for r in rows if int(r[0]) >= 3]
    assert modified_rows == [("3", "b"), ("4", "b"), ("5", "b")]