diff --git a/INSTALL.md b/INSTALL.md index a80e61dd..761a8798 100644 --- a/INSTALL.md +++ b/INSTALL.md @@ -9,6 +9,7 @@ dokku$ dokku git:set metrics deploy-branch main ```bash dokku config:set metrics GITHUB_EBMDATALAB_TOKEN='xxx' dokku config:set metrics GITHUB_OS_CORE_TOKEN='xxx' +dokku config:set metrics GITHUB_OS_TOKEN='xxx' dokku config:set metrics SLACK_SIGNING_SECRET='xxx' dokku config:set metrics SLACK_TECH_SUPPORT_CHANNEL_ID='xxx' dokku config:set metrics SLACK_TOKEN='xxx' @@ -22,6 +23,12 @@ Each token is assigned to a single organisation and should have the following *r * *all repositories* owned by the organisation with the following permissions: Code scanning alerts, Dependabot alerts, Metadata, Pull requests and Repository security advisories +The `GITHUB_OS_TOKEN` is a fine-grained GitHub personal access token that is used for authenticating with the GitHub REST API. +It is assigned to a single organisation and should have the following *read-only* permissions: +* organisation permissions: codespaces +* *all repositories* owned by the organisation with the following permissions: +Codespaces and Metadata + ## Disable checks Dokku performs health checks on apps during deploy by sending requests to port 80. This tool isn't a web app so it can't accept requests on a port. diff --git a/dotenv-sample b/dotenv-sample index bba73f88..7032b7ba 100644 --- a/dotenv-sample +++ b/dotenv-sample @@ -4,6 +4,7 @@ TIMESCALEDB_URL=postgresql://user:pass@localhost:5433/metrics # API tokens for pulling data from Github GITHUB_EBMDATALAB_TOKEN= GITHUB_OS_CORE_TOKEN= +GITHUB_OS_TOKEN= # Slack API access credentials. 
# The slack app used for this will need the following OAuth scopes: diff --git a/metrics/github/client.py b/metrics/github/client.py index ce3512c8..a0d18556 100644 --- a/metrics/github/client.py +++ b/metrics/github/client.py @@ -52,6 +52,14 @@ def rest_query(self, path, **variables): data = response.json() if isinstance(data, list): yield from data + + # Unlike the team repositories endpoint or the team members endpoint, + # which return arrays of the objects we're interested in, + # the codespaces endpoint returns an object. + # This object has a codespaces key, + # whose value is an array of the objects we're interested in. + elif "codespaces" in path and isinstance(data, dict): + yield from data["codespaces"] else: raise RuntimeError("Unexpected response format:", data) diff --git a/metrics/github/github.py b/metrics/github/github.py index 50d4f5e6..18615a1b 100644 --- a/metrics/github/github.py +++ b/metrics/github/github.py @@ -108,6 +108,32 @@ def from_dict(cls, data, repo): ) +@dataclass(frozen=True) +class Codespace: + org: str + repo_name: str + user: str + created_at: datetime.datetime + last_used_at: datetime.datetime + + @classmethod + def from_dict(cls, data, org): + return cls( + org=org, + repo_name=data["repository"]["name"], + user=data["owner"]["login"], + created_at=data["created_at"], + last_used_at=data["last_used_at"], + ) + + +def codespaces(org): + return [ + Codespace.from_dict(data=codespace, org=org) + for codespace in query.codespaces(org) + ] + + def tech_prs(): tech_team_members = _tech_team_members() return [ diff --git a/metrics/github/metrics.py b/metrics/github/metrics.py index 569ab2e1..c8f5fd61 100644 --- a/metrics/github/metrics.py +++ b/metrics/github/metrics.py @@ -75,3 +75,16 @@ def convert_issue_counts_to_metrics(counts): } ) return metrics + + +def convert_codespaces_to_dicts(codespaces): + return [ + { + "organisation": c.org, + "repo": c.repo_name, + "user": c.user, + "created_at": c.created_at, + "last_used_at": 
c.last_used_at, + } + for c in codespaces + ] diff --git a/metrics/github/query.py b/metrics/github/query.py index e3a69f51..be441d21 100644 --- a/metrics/github/query.py +++ b/metrics/github/query.py @@ -141,11 +141,16 @@ def issues(org, repo): ) +def codespaces(org): + yield from _client().rest_query("/orgs/{org}/codespaces", org=org) + + def _client(): return GitHubClient( tokens={ "ebmdatalab": os.environ["GITHUB_EBMDATALAB_TOKEN"], "opensafely-core": os.environ["GITHUB_OS_CORE_TOKEN"], + "opensafely": os.environ["GITHUB_OS_TOKEN"], } ) diff --git a/metrics/tasks/codespaces.py b/metrics/tasks/codespaces.py new file mode 100644 index 00000000..38159fb7 --- /dev/null +++ b/metrics/tasks/codespaces.py @@ -0,0 +1,24 @@ +import sys + +import structlog + +import metrics.github.github as github +from metrics.github.metrics import convert_codespaces_to_dicts +from metrics.timescaledb import db, tables + + +log = structlog.get_logger() + + +def main(): + log.info("Getting codespaces") + codespaces = github.codespaces(org="opensafely") + log.info(f"Got {len(codespaces)} codespaces") + + log.info("Writing data") + db.upsert(tables.GitHubCodespaces, convert_codespaces_to_dicts(codespaces)) + log.info("Written data") + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/metrics/timescaledb/db.py b/metrics/timescaledb/db.py index 489ccd82..33095508 100644 --- a/metrics/timescaledb/db.py +++ b/metrics/timescaledb/db.py @@ -19,8 +19,7 @@ def reset_table(table, batch_size=None): def write(table, rows): - max_params = 65535 # limit for postgresql - batch_size = max_params // len(table.columns) + batch_size = _batch_size(table) with _get_engine().begin() as connection: for values in batched(rows, batch_size): @@ -28,6 +27,47 @@ def write(table, rows): log.info("Inserted %s rows", len(values), table=table.name) +def upsert(table, rows): + _ensure_table(table) + batch_size = _batch_size(table) + non_pk_columns = set(table.columns) - set(table.primary_key.columns) + + # 
use the primary key constraint to match rows to be updated, + # using default constraint name if not otherwise specified + constraint = table.primary_key.name or table.name + "_pkey" + + with _get_engine().begin() as connection: + for values in batched(rows, batch_size): + # Construct a PostgreSQL "INSERT..ON CONFLICT" style upsert statement + # https://docs.sqlalchemy.org/en/20/dialects/postgresql.html#insert-on-conflict-upsert + + # "Vanilla" statement to start, we need this to be able to derive + # the "excluded" columns in the values which we need to use to update + # the target table in case of conflict on the constraint + insert_stmt = insert(table).values(values) + + # This dict dictates which columns in the target table are updated (the + # non-PK columns) and the corresponding values with which they are updated + update_set_clause = { + c: insert_stmt.excluded[c.name] for c in non_pk_columns + } + + # Extend the insert statement to include checking for row conflicts using + # the primary key constraint and telling the database to update + # the conflicting rows according to the SET clause + insert_stmt = insert_stmt.on_conflict_do_update( + constraint=constraint, + set_=update_set_clause, + ) + connection.execute(insert_stmt) + log.info("Inserted %s rows", len(values), table=table.name) + + +def _batch_size(table): + max_params = 65535 # limit for postgresql + return max_params // len(table.columns) + + def _drop_table(table, batch_size): with _get_engine().begin() as connection: log.debug("Removing table: %s", table.name) diff --git a/metrics/timescaledb/tables.py b/metrics/timescaledb/tables.py index f1fbb448..f7c59363 100644 --- a/metrics/timescaledb/tables.py +++ b/metrics/timescaledb/tables.py @@ -4,6 +4,16 @@ metadata = MetaData() +GitHubCodespaces = Table( + "github_codespaces", + metadata, + Column("created_at", TIMESTAMP(timezone=True), primary_key=True), + Column("organisation", Text, primary_key=True), + Column("repo", Text, primary_key=True), + 
Column("user", Text, primary_key=True), + Column("last_used_at", TIMESTAMP(timezone=True)), +) + GitHubRepos = Table( "github_repos", metadata, diff --git a/tests/metrics/github/test_github.py b/tests/metrics/github/test_github.py index 6fe76fe7..44ef9225 100644 --- a/tests/metrics/github/test_github.py +++ b/tests/metrics/github/test_github.py @@ -35,6 +35,23 @@ def fake(*keys): return patch +def test_codespaces(patch): + patch( + "codespaces", + { + "opensafely": [ + { + "owner": {"login": "testuser"}, + "repository": {"name": "testrepo"}, + "created_at": datetime.datetime.now().isoformat(), + "last_used_at": datetime.datetime.now().isoformat(), + }, + ] + }, + ) + assert len(github.codespaces("opensafely")) == 1 + + def test_includes_tech_owned_repos(patch): patch( "team_repos", diff --git a/tests/metrics/timescaledb/test_db.py b/tests/metrics/timescaledb/test_db.py index e3eec767..58f17e85 100644 --- a/tests/metrics/timescaledb/test_db.py +++ b/tests/metrics/timescaledb/test_db.py @@ -164,3 +164,29 @@ def test_write(engine, table): # check rows are in table rows = get_rows(engine, table) assert len(rows) == 3 + + +def test_upsert(engine, table): + # add a non-PK column to the table + table.append_column(Column("value2", Text)) + + # insert initial rows + rows = [{"value": i, "value2": "a"} for i in range(1, 4)] + db.upsert(table, rows) + + # second batch of rows, some with preexisting value, some new + # all with different value2 + rows = [{"value": i, "value2": "b"} for i in range(3, 6)] + db.upsert(table, rows) + + # check all rows are in table + rows = get_rows(engine, table) + assert len(rows) == 5 + + # check upsert leaves unmatched rows 1-2 intact + original_rows = [r for r in rows if int(r[0]) < 3] + assert original_rows == [("1", "a"), ("2", "a")] + + # check upsert modifies matched row 3 and new rows 4-5 + modified_rows = [r for r in rows if int(r[0]) >= 3] + assert modified_rows == [("3", "b"), ("4", "b"), ("5", "b")]