diff --git a/metrics/github/github.py b/metrics/github/github.py index f4db5e04..50d4f5e6 100644 --- a/metrics/github/github.py +++ b/metrics/github/github.py @@ -8,6 +8,22 @@ # Slugs (not names!) of the GitHub entities we're interested in _TECH_TEAMS = ["team-rap", "team-rex", "tech-shared"] _ORGS = ["ebmdatalab", "opensafely-core"] +# Some repos (for example the websites) frequently have PRs created by people +# outside the tech teams. We don't want our delivery metrics to be skewed by these +# and we don't necessarily want to hold people in other teams to the same hygiene +# standards as we hold ourselves. So we treat them differently. (We don't want to +# exclude such PRs from other repos because it's definitely interesting if there are +# old PRs (or lots of PRs) created by non-tech-team members in those repos.) +_CONTENT_REPOS = { + "ebmdatalab": [ + "opensafely.org", + "team-manual", + "bennett.ox.ac.uk", + "openprescribing", + ] +} +_MANAGERS = {"sebbacon", "benbc"} +_EX_DEVELOPERS = {"ghickman", "milanwiedemann", "CarolineMorton"} @dataclass(frozen=True) @@ -23,6 +39,10 @@ class Repo: def is_tech_owned(self): return self.team in _TECH_TEAMS + @property + def is_content_repo(self): + return self.org in _CONTENT_REPOS and self.name in _CONTENT_REPOS[self.org] + @classmethod def from_dict(cls, data, org, team): return cls( @@ -42,6 +62,7 @@ class PR: created_on: datetime.date merged_on: datetime.date closed_on: datetime.date + is_content: bool def was_old_on(self, date): opened = self.created_on @@ -56,13 +77,17 @@ def was_merged_on(self, date): return self.merged_on and date == self.merged_on @classmethod - def from_dict(cls, data, repo): + def from_dict(cls, data, repo, tech_team_members): + author = data["author"]["login"] + is_content = repo.is_content_repo and author not in tech_team_members + return cls( repo, - data["author"]["login"], + author, date_from_iso(data["createdAt"]), date_from_iso(data["mergedAt"]), date_from_iso(data["closedAt"]), + is_content, ) @@ -84,8 +109,9 @@ def from_dict(cls, data, repo): def tech_prs(): + tech_team_members = _tech_team_members() return [ - PR.from_dict(pr, repo) + PR.from_dict(pr, repo, tech_team_members) for repo in tech_repos() for pr in query.prs(repo.org, repo.name) ] @@ -119,3 +145,16 @@ def _get_repos(): def _repo_owners(org): return {repo: team for team in _TECH_TEAMS for repo in query.team_repos(org, team)} + + +def _tech_team_members(): + return ( + _MANAGERS + | _EX_DEVELOPERS + | { + person + for org in _ORGS + for team in _TECH_TEAMS + for person in query.team_members(org, team) + } + ) diff --git a/metrics/github/metrics.py b/metrics/github/metrics.py index 73e3355f..569ab2e1 100644 --- a/metrics/github/metrics.py +++ b/metrics/github/metrics.py @@ -22,14 +22,14 @@ def calculate_pr_counts(prs, predicate): end = pr.closed_on if pr.closed_on else datetime.date.today() for day in iter_days(start, end): if predicate(pr, day): - counts[(pr.repo.org, pr.repo.name, pr.author, day)] += 1 + counts[(pr.repo.org, pr.repo.name, pr.author, pr.is_content, day)] += 1 return dict(counts) def convert_pr_counts_to_metrics(counts, name): metrics = [] for coord, count in counts.items(): - org, repo, author, date_ = coord + org, repo, author, is_content, date_ = coord timestamp = datetime.datetime.combine(date_, datetime.time()) metrics.append( { @@ -38,6 +38,7 @@ def convert_pr_counts_to_metrics(counts, name): "organisation": org, "repo": repo, "author": author, + "is_content": is_content, "value": count, } ) diff --git a/metrics/github/query.py b/metrics/github/query.py index 76df2f91..e3a69f51 100644 --- a/metrics/github/query.py +++ b/metrics/github/query.py @@ -32,11 +32,19 @@ def team_repos(org, team): """The API doesn't make it easy for us to get all the information we need about repos in one place, so we just return a list of repos here and join that to the richer repo objects in the caller.""" - results = _client().rest_query("/orgs/{org}/teams/{team}/repos", org=org, team=team) - for repo in results: + repos = _client().rest_query("/orgs/{org}/teams/{team}/repos", org=org, team=team) + for repo in repos: yield repo["name"] +def team_members(org, team): + members = _client().rest_query( + "/orgs/{org}/teams/{team}/members", org=org, team=team + ) + for member in members: + yield member["login"] + + def vulnerabilities(org, repo): query = """ query vulnerabilities($cursor: String, $org: String!, $repo: String!) { diff --git a/metrics/timescaledb/tables.py b/metrics/timescaledb/tables.py index 4908acfa..f1fbb448 100644 --- a/metrics/timescaledb/tables.py +++ b/metrics/timescaledb/tables.py @@ -20,6 +20,7 @@ Column("name", Text, primary_key=True), Column("value", Integer), Column("author", Text, primary_key=True), + Column("is_content", Boolean, primary_key=True), Column("organisation", Text, primary_key=True), Column("repo", Text, primary_key=True), ) diff --git a/queries/current-old-prs-content.sql b/queries/current-old-prs-content.sql new file mode 100644 index 00000000..36c14a21 --- /dev/null +++ b/queries/current-old-prs-content.sql @@ -0,0 +1,61 @@ +WITH + last_saturday AS ( + SELECT CURRENT_DATE - (1 + CAST(extract(dow FROM CURRENT_DATE) AS INT)) % 7 as the_date + ), + old_prs_only AS ( + SELECT time, organisation, repo, author, is_content, value + FROM github_pull_requests + WHERE name = 'queue_older_than_7_days' + ), + in_timeframe AS ( + SELECT time, organisation, repo, author, is_content, value + FROM old_prs_only + WHERE $__timeFilter(time) + ), + fields_munged AS ( + -- the time field is a timestamp, but we only ever write midnight; + -- we need to keep it as a timestamp type for bucketing below + SELECT time as day, organisation||'/'||repo AS repo, author, is_content, value as num_prs + FROM in_timeframe + ), + dependabot_removed AS ( + SELECT day, repo, author, is_content, num_prs + FROM fields_munged + WHERE author NOT LIKE 'dependabot%' + ), + content_ignored AS ( + SELECT day, repo, author, num_prs + FROM dependabot_removed + WHERE is_content + ), + authors_aggregated AS ( + SELECT day, repo, sum(num_prs) as num_prs + FROM content_ignored + GROUP BY day, repo + ), + partial_week_ignored AS ( + SELECT day, repo, num_prs + FROM authors_aggregated, last_saturday + WHERE day < last_saturday.the_date + ), + last_week_only AS ( + SELECT day, repo, num_prs + FROM partial_week_ignored, last_saturday + WHERE day >= last_saturday.the_date - 7 + ), + bucketed_in_weeks AS ( + SELECT + -- label buckets with the (exclusive) end date; + -- the 'origin' argument can be _any_ Saturday + time_bucket('1 week', day, last_saturday.the_date) + '7 days' as bucket, + repo, + -- aggregate by taking the last value because this is a gauge, not a count + last(num_prs, day) as num_prs + FROM + last_week_only, last_saturday + GROUP BY bucket, repo + ) +SELECT repo as "Repo", num_prs AS "Old PRs" +FROM bucketed_in_weeks +ORDER BY bucket DESC, num_prs DESC +LIMIT 5 diff --git a/queries/current-old-prs-humans.sql b/queries/current-old-prs-humans.sql index 4429c3e4..70efd255 100644 --- a/queries/current-old-prs-humans.sql +++ b/queries/current-old-prs-humans.sql @@ -3,34 +3,34 @@ WITH SELECT CURRENT_DATE - (1 + CAST(extract(dow FROM CURRENT_DATE) AS INT)) % 7 as the_date ), old_prs_only AS ( - SELECT time, organisation, repo, author, value + SELECT time, organisation, repo, author, is_content, value FROM github_pull_requests WHERE name = 'queue_older_than_7_days' ), in_timeframe AS ( - SELECT time, organisation, repo, author, value + SELECT time, organisation, repo, author, is_content, value FROM old_prs_only WHERE $__timeFilter(time) ), fields_munged AS ( -- the time field is a timestamp, but we only ever write midnight; -- we need to keep it as a timestamp type for bucketing below - SELECT time as day, organisation||'/'||repo AS repo, author, value as num_prs + SELECT time as day, organisation||'/'||repo AS repo, author, is_content, value as num_prs FROM in_timeframe ), dependabot_removed AS ( - SELECT day, repo, author, num_prs + SELECT day, repo, author, is_content, num_prs FROM fields_munged WHERE author NOT LIKE 'dependabot%' ), - non_dev_op_ignored AS ( + content_ignored AS ( SELECT day, repo, author, num_prs FROM dependabot_removed - WHERE NOT (repo = 'ebmdatalab/openprescribing' AND author IN ('richiecroker', 'chrisjwood16')) + WHERE NOT is_content ), authors_aggregated AS ( SELECT day, repo, sum(num_prs) as num_prs - FROM non_dev_op_ignored + FROM content_ignored GROUP BY day, repo ), partial_week_ignored AS ( diff --git a/queries/old-prs-content.sql b/queries/old-prs-content.sql new file mode 100644 index 00000000..cd2e3d9a --- /dev/null +++ b/queries/old-prs-content.sql @@ -0,0 +1,55 @@ +WITH + last_saturday AS ( + SELECT CURRENT_DATE - (1 + CAST(extract(dow FROM CURRENT_DATE) AS INT)) % 7 as the_date + ), + old_prs_only AS ( + SELECT time, organisation, repo, author, is_content, value + FROM github_pull_requests + WHERE name = 'queue_older_than_7_days' + ), + in_timeframe AS ( + SELECT time, organisation, repo, author, is_content, value + FROM old_prs_only + WHERE $__timeFilter(time) + ), + fields_munged AS ( + -- the time field is a timestamp, but we only ever write midnight; + -- we need to keep it as a timestamp type for bucketing below + SELECT time as day, organisation||'/'||repo AS repo, author, is_content, value as num_prs + FROM in_timeframe + ), + dependabot_removed AS ( + SELECT day, repo, author, is_content, num_prs + FROM fields_munged + WHERE author NOT LIKE 'dependabot%' + ), + just_content AS ( + SELECT day, repo, author, num_prs + FROM dependabot_removed + WHERE is_content + ), + authors_aggregated AS ( + SELECT day, repo, sum(num_prs) as num_prs + FROM just_content + GROUP BY day, repo + ), + partial_week_ignored AS ( + SELECT day, repo, num_prs + FROM authors_aggregated, last_saturday + WHERE day < last_saturday.the_date + ), + bucketed_in_weeks AS ( + SELECT + -- label buckets with the (exclusive) end date; + -- the 'origin' argument can be _any_ Saturday + time_bucket('1 week', day, last_saturday.the_date) + '7 days' as bucket, + repo, + -- aggregate by taking the last value because this is a gauge, not a count + last(num_prs, day) as num_prs + FROM + partial_week_ignored, last_saturday + GROUP BY bucket, repo + ) +SELECT bucket, repo, num_prs +FROM bucketed_in_weeks +ORDER BY bucket DESC, repo diff --git a/queries/old-prs-humans.sql b/queries/old-prs-humans.sql index 27a91228..820d686f 100644 --- a/queries/old-prs-humans.sql +++ b/queries/old-prs-humans.sql @@ -3,34 +3,34 @@ WITH SELECT CURRENT_DATE - (1 + CAST(extract(dow FROM CURRENT_DATE) AS INT)) % 7 as the_date ), old_prs_only AS ( - SELECT time, organisation, repo, author, value + SELECT time, organisation, repo, author, is_content, value FROM github_pull_requests WHERE name = 'queue_older_than_7_days' ), in_timeframe AS ( - SELECT time, organisation, repo, author, value + SELECT time, organisation, repo, author, is_content, value FROM old_prs_only WHERE $__timeFilter(time) ), fields_munged AS ( -- the time field is a timestamp, but we only ever write midnight; -- we need to keep it as a timestamp type for bucketing below - SELECT time as day, organisation||'/'||repo AS repo, author, value as num_prs + SELECT time as day, organisation||'/'||repo AS repo, author, is_content, value as num_prs FROM in_timeframe ), dependabot_removed AS ( - SELECT day, repo, author, num_prs + SELECT day, repo, author, is_content, num_prs FROM fields_munged WHERE author NOT LIKE 'dependabot%' ), - non_dev_op_ignored AS ( + content_ignored AS ( SELECT day, repo, author, num_prs FROM dependabot_removed - WHERE NOT (repo = 'ebmdatalab/openprescribing' AND author IN ('richiecroker', 'chrisjwood16')) + WHERE NOT is_content ), authors_aggregated AS ( SELECT day, repo, sum(num_prs) as num_prs - FROM non_dev_op_ignored + FROM content_ignored GROUP BY day, repo ), partial_week_ignored AS ( diff --git a/queries/throughput-humals.sql b/queries/throughput-humals.sql index e57fb721..475afe07 100644 --- a/queries/throughput-humals.sql +++ b/queries/throughput-humals.sql @@ -3,34 +3,34 @@ WITH SELECT CURRENT_DATE - (1 + CAST(extract(dow FROM CURRENT_DATE) AS INT)) % 7 as the_date ), throughputs_only AS ( - SELECT time, organisation, repo, author, value + SELECT time, organisation, repo, author, is_content, value FROM github_pull_requests WHERE name = 'prs_merged' ), in_timeframe AS ( - SELECT time, organisation, repo, author, value + SELECT time, organisation, repo, author, is_content, value FROM throughputs_only WHERE $__timeFilter(time) ), fields_munged AS ( -- the time field is a timestamp, but we only ever write midnight; -- we need to keep it as a timestamp type for bucketing below - SELECT time as day, organisation||'/'||repo AS repo, author, value as throughput + SELECT time as day, organisation||'/'||repo AS repo, author, is_content, value as throughput FROM in_timeframe ), dependabot_removed AS ( - SELECT day, repo, author, throughput + SELECT day, repo, author, is_content, throughput FROM fields_munged WHERE author NOT LIKE 'dependabot%' ), - non_dev_op_ignored AS ( + content_ignored AS ( SELECT day, repo, author, throughput FROM dependabot_removed - WHERE NOT (repo = 'ebmdatalab/openprescribing' AND author IN ('richiecroker', 'chrisjwood16')) + WHERE NOT is_content ), authors_aggregated AS ( SELECT day, repo, sum(throughput) as throughput - FROM non_dev_op_ignored + FROM content_ignored GROUP BY day, repo ), partial_week_ignored AS ( diff --git a/tests/metrics/github/test_github.py b/tests/metrics/github/test_github.py index 5f904fa5..6fe76fe7 100644 --- a/tests/metrics/github/test_github.py +++ b/tests/metrics/github/test_github.py @@ -115,6 +115,41 @@ def test_returns_none_for_unknown_ownership(patch): assert github.all_repos() == [repo("ebmdatalab", "the_repo", None)] +def test_correctly_labels_non_tech_prs_from_non_content_repos(patch): + patch("team_members", {"ebmdatalab": {"team-rex": ["tech-person"]}}) + patch("team_repos", {"ebmdatalab": {"team-rex": ["job-server"]}}) + patch("repos", {"ebmdatalab": [repo_data("job-server")]}) + patch("prs", {"ebmdatalab": {"job-server": [pr_data(author="non-tech-person")]}}) + + prs = github.tech_prs() + assert len(prs) == 1 + assert not prs[0].is_content + + +def test_correctly_labels_tech_prs_from_content_repos(patch): + patch("team_members", {"ebmdatalab": {"team-rex": ["tech-person"]}}) + patch("team_repos", {"ebmdatalab": {"team-rex": ["opensafely.org"]}}) + patch("repos", {"ebmdatalab": [repo_data("opensafely.org")]}) + patch("prs", {"ebmdatalab": {"opensafely.org": [pr_data(author="tech-person")]}}) + + prs = github.tech_prs() + assert len(prs) == 1 + assert not prs[0].is_content + + +def test_correctly_labels_non_tech_prs_from_content_repos(patch): + patch("team_members", {"ebmdatalab": {"team-rex": ["tech-person"]}}) + patch("team_repos", {"ebmdatalab": {"team-rex": ["opensafely.org"]}}) + patch("repos", {"ebmdatalab": [repo_data("opensafely.org")]}) + patch( + "prs", {"ebmdatalab": {"opensafely.org": [pr_data(author="non-tech-person")]}} + ) + + prs = github.tech_prs() + assert len(prs) == 1 + assert prs[0].is_content + + def test_is_old(): # A PR is old if it was created a week or more ago. assert pr(created_on=LONG_AGO).was_old_on(TODAY) @@ -153,5 +188,21 @@ def repo(org, name, team): return Repo(org, name, team, datetime.date.min) -def pr(created_on=TODAY, closed_on=None, merged_on=None): - return PR(repo("org", "repo", "team"), "author", created_on, merged_on, closed_on) +def pr_data(author="author"): + return dict( + author=dict(login=author), + createdAt=datetime.datetime.min.isoformat(), + mergedAt=None, + closedAt=None, + ) + + +def pr(created_on=TODAY, closed_on=None, merged_on=None, is_content=False): + return PR( + repo("org", "repo", "team"), + "author", + created_on, + merged_on, + closed_on, + is_content, + ) diff --git a/tests/metrics/github/test_metrics_prs.py b/tests/metrics/github/test_metrics_prs.py index 776221ef..a2ac5538 100644 --- a/tests/metrics/github/test_metrics_prs.py +++ b/tests/metrics/github/test_metrics_prs.py @@ -18,9 +18,9 @@ def test_makes_counts_for_every_day_between_pr_creation_and_now(): counts = calculate_pr_counts(prs, true) assert counts == { - ("an-org", "a-repo", "an-author", TWO_DAYS_AGO): 1, - ("an-org", "a-repo", "an-author", YESTERDAY): 1, - ("an-org", "a-repo", "an-author", TODAY): 1, + ("an-org", "a-repo", "an-author", False, TWO_DAYS_AGO): 1, + ("an-org", "a-repo", "an-author", False, YESTERDAY): 1, + ("an-org", "a-repo", "an-author", False, TODAY): 1, } @@ -33,7 +33,7 @@ def test_counts_prs(): ] counts = calculate_pr_counts(prs, true) - assert counts == {("an-org", "a-repo", "an-author", TODAY): 3} + assert counts == {("an-org", "a-repo", "an-author", False, TODAY): 3} def test_counts_only_prs_matching_predicate(): @@ -44,7 +44,7 @@ def test_counts_only_prs_matching_predicate(): ] counts = calculate_pr_counts(prs, lambda pr_, _date: pr_.merged_on) - assert counts == {("an-org", "a-repo", "an-author", TODAY): 1} + assert counts == {("an-org", "a-repo", "an-author", False, TODAY): 1} def test_returns_counts_by_org(): @@ -55,8 +55,8 @@ def test_returns_counts_by_org(): counts = calculate_pr_counts(prs, true) assert counts == { - ("an-org", "a-repo", "an-author", TODAY): 1, - ("another-org", "another-repo", "an-author", TODAY): 1, + ("an-org", "a-repo", "an-author", False, TODAY): 1, + ("another-org", "another-repo", "an-author", False, TODAY): 1, } @@ -68,8 +68,8 @@ def test_returns_counts_by_repo(): counts = calculate_pr_counts(prs, true) assert counts == { - ("an-org", "a-repo", "an-author", TODAY): 1, - ("an-org", "another-repo", "an-author", TODAY): 1, + ("an-org", "a-repo", "an-author", False, TODAY): 1, + ("an-org", "another-repo", "an-author", False, TODAY): 1, } @@ -79,8 +79,22 @@ def test_returns_counts_by_author(): counts = calculate_pr_counts(prs, true) assert counts == { - ("an-org", "a-repo", "an-author", TODAY): 1, - ("an-org", "a-repo", "another-author", TODAY): 1, + ("an-org", "a-repo", "an-author", False, TODAY): 1, + ("an-org", "a-repo", "another-author", False, TODAY): 1, + } + + +def test_distinguishes_content_from_non_content(): + r = repo("an-org", "a-repo") + prs = [ + pr(r, author="an-author", is_content=True), + pr(r, author="an-author", is_content=False), + ] + + counts = calculate_pr_counts(prs, true) + assert counts == { + ("an-org", "a-repo", "an-author", False, TODAY): 1, + ("an-org", "a-repo", "an-author", True, TODAY): 1, } @@ -95,8 +109,15 @@ def repo(org, name, is_archived=False): ) -def pr(repo_=None, created_on=TODAY, closed_on=None, merged_on=None, author=None): - return PR(repo_, author, created_on, merged_on, closed_on) +def pr( + repo_=None, + created_on=TODAY, + closed_on=None, + merged_on=None, + author=None, + is_content=False, +): + return PR(repo_, author, created_on, merged_on, closed_on, is_content) def true(*_args):