Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add Codespace usage metrics #173

Merged
merged 4 commits into from
May 31, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions INSTALL.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ dokku$ dokku git:set metrics deploy-branch main
```bash
dokku config:set metrics GITHUB_EBMDATALAB_TOKEN='xxx'
dokku config:set metrics GITHUB_OS_CORE_TOKEN='xxx'
dokku config:set metrics GITHUB_OS_TOKEN='xxx'
dokku config:set metrics SLACK_SIGNING_SECRET='xxx'
dokku config:set metrics SLACK_TECH_SUPPORT_CHANNEL_ID='xxx'
dokku config:set metrics SLACK_TOKEN='xxx'
Expand All @@ -22,6 +23,12 @@ Each token is assigned to a single organisation and should have the following *r
* *all repositories* owned by the organisation with the following permissions:
Code scanning alerts, Dependabot alerts, Metadata, Pull requests and Repository security advisories

The `GITHUB_OS_TOKEN` is a fine-grained GitHub personal access token that is used for authenticating with the GitHub REST API.
It is assigned to a single organisation and should have the following *read-only* permissions:
* organisation permissions: Codespaces
* *all repositories* owned by the organisation with the following permissions:
Codespaces and Metadata

## Disable checks
Dokku performs health checks on apps during deploy by sending requests to port 80.
This tool isn't a web app so it can't accept requests on a port.
Expand Down
1 change: 1 addition & 0 deletions dotenv-sample
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ TIMESCALEDB_URL=postgresql://user:pass@localhost:5433/metrics
# API tokens for pulling data from Github
GITHUB_EBMDATALAB_TOKEN=
GITHUB_OS_CORE_TOKEN=
GITHUB_OS_TOKEN=

# Slack API access credentials.
# The slack app used for this will need the following OAuth scopes:
Expand Down
8 changes: 8 additions & 0 deletions metrics/github/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,14 @@ def rest_query(self, path, **variables):
data = response.json()
if isinstance(data, list):
yield from data

# Unlike the team repositories endpoint or the team members endpoint,
# which return arrays of the objects we're interested in,
# the codespaces endpoint returns an object.
# This object has a codespaces key,
# whose value is an array of the objects we're interested in.
elif "codespaces" in path and isinstance(data, dict):
yield from data["codespaces"]
else:
raise RuntimeError("Unexpected response format:", data)

Expand Down
26 changes: 26 additions & 0 deletions metrics/github/github.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,32 @@ def from_dict(cls, data, repo):
)


@dataclass(frozen=True)
class Codespace:
    """A single GitHub Codespace belonging to an organisation."""

    org: str
    repo_name: str
    user: str
    created_at: datetime.datetime
    last_used_at: datetime.datetime

    @staticmethod
    def _parse_timestamp(value):
        # GitHub's REST API emits ISO-8601 strings like "2024-05-31T12:00:00Z".
        # fromisoformat() on Pythons before 3.11 can't handle the "Z" suffix,
        # so normalise it to an explicit UTC offset first.
        # A codespace that has never been used may have a null last_used_at.
        if value is None:
            return None
        return datetime.datetime.fromisoformat(value.replace("Z", "+00:00"))

    @classmethod
    def from_dict(cls, data, org):
        """Build a Codespace from one element of the REST API's codespaces array.

        Timestamps are parsed into datetime objects so the fields actually
        match their datetime.datetime annotations (previously they were left
        as raw strings).
        """
        return cls(
            org=org,
            repo_name=data["repository"]["name"],
            user=data["owner"]["login"],
            created_at=cls._parse_timestamp(data["created_at"]),
            last_used_at=cls._parse_timestamp(data["last_used_at"]),
        )


def codespaces(org):
    """Return a list of Codespace objects for every codespace in the given org."""
    found = []
    for raw in query.codespaces(org):
        found.append(Codespace.from_dict(data=raw, org=org))
    return found


def tech_prs():
tech_team_members = _tech_team_members()
return [
Expand Down
13 changes: 13 additions & 0 deletions metrics/github/metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,3 +75,16 @@ def convert_issue_counts_to_metrics(counts):
}
)
return metrics


def convert_codespaces_to_dicts(codespaces):
    """Flatten Codespace objects into plain dicts keyed by database column name."""
    rows = []
    for codespace in codespaces:
        rows.append(
            {
                "organisation": codespace.org,
                "repo": codespace.repo_name,
                "user": codespace.user,
                "created_at": codespace.created_at,
                "last_used_at": codespace.last_used_at,
            }
        )
    return rows
5 changes: 5 additions & 0 deletions metrics/github/query.py
Original file line number Diff line number Diff line change
Expand Up @@ -141,11 +141,16 @@ def issues(org, repo):
)


def codespaces(org):
    """Yield raw codespace objects for the given org from the GitHub REST API."""
    client = _client()
    yield from client.rest_query("/orgs/{org}/codespaces", org=org)


def _client():
    """Build a GitHubClient holding one API token per organisation, read from the environment."""
    tokens = {
        "ebmdatalab": os.environ["GITHUB_EBMDATALAB_TOKEN"],
        "opensafely-core": os.environ["GITHUB_OS_CORE_TOKEN"],
        "opensafely": os.environ["GITHUB_OS_TOKEN"],
    }
    return GitHubClient(tokens=tokens)

Expand Down
24 changes: 24 additions & 0 deletions metrics/tasks/codespaces.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
import sys

import structlog

import metrics.github.github as github
from metrics.github.metrics import convert_codespaces_to_dicts
from metrics.timescaledb import db, tables


log = structlog.get_logger()


def main():
    """Fetch Codespace usage for the opensafely org and upsert it into the database."""
    log.info("Getting codespaces")
    codespaces = github.codespaces(org="opensafely")
    log.info(f"Got {len(codespaces)} codespaces")

    log.info("Writing data")
    rows = convert_codespaces_to_dicts(codespaces)
    db.upsert(tables.GitHubCodespaces, rows)
    log.info("Written data")


if __name__ == "__main__":
sys.exit(main())
44 changes: 42 additions & 2 deletions metrics/timescaledb/db.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,15 +19,55 @@ def reset_table(table, batch_size=None):


def write(table, rows):
    """Insert rows into table, batched so each statement stays under PostgreSQL's bind-parameter limit."""
    with _get_engine().begin() as connection:
        for batch in batched(rows, _batch_size(table)):
            connection.execute(insert(table).values(batch))
            log.info("Inserted %s rows", len(batch), table=table.name)


def upsert(table, rows):
    """Batch-insert rows into table, updating existing rows on primary-key conflict.

    Uses a PostgreSQL "INSERT ... ON CONFLICT DO UPDATE" (upsert) statement so
    rows whose primary key already exists are refreshed rather than duplicated.
    https://docs.sqlalchemy.org/en/20/dialects/postgresql.html#insert-on-conflict-upsert
    """
    _ensure_table(table)
    batch_size = _batch_size(table)
    non_pk_columns = set(table.columns) - set(table.primary_key.columns)

    # Use the primary key constraint to match rows to be updated, falling back
    # to PostgreSQL's default constraint name ("<table>_pkey") when the
    # constraint was not explicitly named in the table definition.
    constraint = table.primary_key.name or table.name + "_pkey"

    with _get_engine().begin() as connection:
        for values in batched(rows, batch_size):
            # "Vanilla" statement to start; we need it to be able to derive
            # the "excluded" columns in the values, which we use to update
            # the target table in case of conflict on the constraint.
            insert_stmt = insert(table).values(values)

            # This dict dictates which columns in the target table are updated
            # (the non-PK columns) and the corresponding values with which
            # they are updated.
            update_set_clause = {
                c: insert_stmt.excluded[c.name] for c in non_pk_columns
            }

            # Extend the insert statement to include checking for row
            # conflicts using the primary key constraint and telling the
            # database to update the conflicting rows per the SET clause.
            insert_stmt = insert_stmt.on_conflict_do_update(
                constraint=constraint,
                set_=update_set_clause,
            )
            connection.execute(insert_stmt)
            log.info("Inserted %s rows", len(values), table=table.name)


def _batch_size(table):
max_params = 65535 # limit for postgresql
return max_params // len(table.columns)


def _drop_table(table, batch_size):
with _get_engine().begin() as connection:
log.debug("Removing table: %s", table.name)
Expand Down
10 changes: 10 additions & 0 deletions metrics/timescaledb/tables.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,16 @@
metadata = MetaData()


# One row per Codespace. The compound primary key allows repeated collection
# runs to upsert (refresh) existing rows instead of duplicating them.
GitHubCodespaces = Table(
    "github_codespaces",
    metadata,
    Column("created_at", TIMESTAMP(timezone=True), primary_key=True),
    Column("organisation", Text, primary_key=True),
    Column("repo", Text, primary_key=True),
    Column("user", Text, primary_key=True),
    Column("last_used_at", TIMESTAMP(timezone=True)),
)

GitHubRepos = Table(
"github_repos",
metadata,
Expand Down
17 changes: 17 additions & 0 deletions tests/metrics/github/test_github.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,23 @@ def fake(*keys):
return patch


def test_codespaces(patch):
    """github.codespaces() should map each raw API dict to one Codespace."""
    # Stub the query layer so no real API calls are made.
    patch(
        "codespaces",
        {
            "opensafely": [
                {
                    "owner": {"login": "testuser"},
                    "repository": {"name": "testrepo"},
                    "created_at": datetime.datetime.now().isoformat(),
                    "last_used_at": datetime.datetime.now().isoformat(),
                },
            ]
        },
    )
    assert len(github.codespaces("opensafely")) == 1


def test_includes_tech_owned_repos(patch):
patch(
"team_repos",
Expand Down
26 changes: 26 additions & 0 deletions tests/metrics/timescaledb/test_db.py
Original file line number Diff line number Diff line change
Expand Up @@ -164,3 +164,29 @@ def test_write(engine, table):
# check rows are in table
rows = get_rows(engine, table)
assert len(rows) == 3


def test_upsert(engine, table):
    """Upsert should insert new rows and update existing ones in place."""
    # add a non-PK column to the table
    table.append_column(Column("value2", Text))

    # insert initial rows
    rows = [{"value": i, "value2": "a"} for i in range(1, 4)]
    db.upsert(table, rows)

    # second batch of rows, some with a preexisting "value", some new,
    # all with a different "value2"
    rows = [{"value": i, "value2": "b"} for i in range(3, 6)]
    db.upsert(table, rows)

    # check all rows are in table
    rows = get_rows(engine, table)
    assert len(rows) == 5

    # check upsert leaves unmatched rows 1-2 intact
    original_rows = [r for r in rows if int(r[0]) < 3]
    assert original_rows == [("1", "a"), ("2", "a")]

    # check upsert modifies matched row 3 and inserts new rows 4-5
    modified_rows = [r for r in rows if int(r[0]) >= 3]
    assert modified_rows == [("3", "b"), ("4", "b"), ("5", "b")]