From a29ae7539bb9699620fa45834ae976c4530900cd Mon Sep 17 00:00:00 2001 From: Ben Butler-Cole Date: Fri, 8 Mar 2024 13:48:16 +0000 Subject: [PATCH 1/4] Add tech-shared team for repo ownership This dummy team allows us to record the fact that a small number of repos are jointly owned by the Tech teams. --- metrics/github/repos.py | 2 +- tests/metrics/github/test_repos.py | 30 ++++++++++++++++++++++++------ 2 files changed, 25 insertions(+), 7 deletions(-) diff --git a/metrics/github/repos.py b/metrics/github/repos.py index b54ec230..d348627c 100644 --- a/metrics/github/repos.py +++ b/metrics/github/repos.py @@ -39,7 +39,7 @@ def _is_tech_owned(repo): # GitHub slugs for the teams we're interested in -_TECH_TEAMS = ["team-rap", "team-rex"] +_TECH_TEAMS = ["team-rap", "team-rex", "tech-shared"] _NON_TECH_REPOS = { diff --git a/tests/metrics/github/test_repos.py b/tests/metrics/github/test_repos.py index b52cb94f..1a46c8ea 100644 --- a/tests/metrics/github/test_repos.py +++ b/tests/metrics/github/test_repos.py @@ -50,11 +50,24 @@ def test_dont_exclude_repos_from_unknown_orgs(patch): def test_looks_up_ownership(patch): - patch("repos", [repo("the_org", "repo1"), repo("the_org", "repo2")]) - patch("team_repos", {"the_org": {"team-rex": ["repo1"], "team-rap": ["repo2"]}}) + patch( + "repos", + [repo("the_org", "repo1"), repo("the_org", "repo2"), repo("the_org", "repo3")], + ) + patch( + "team_repos", + { + "the_org": { + "team-rex": ["repo1"], + "team-rap": ["repo2"], + "tech-shared": ["repo3"], + } + }, + ) assert repos.get_repo_ownership(None, ["the_org"]) == [ {"organisation": "the_org", "repo": "repo1", "owner": "team-rex"}, {"organisation": "the_org", "repo": "repo2", "owner": "team-rap"}, + {"organisation": "the_org", "repo": "repo3", "owner": "tech-shared"}, ] @@ -63,8 +76,8 @@ def test_looks_up_ownership_across_orgs(patch): patch( "team_repos", { - "org1": {"team-rex": ["repo1"], "team-rap": []}, - "org2": {"team-rex": [], "team-rap": ["repo2"]}, + "org1": {"team-rex": ["repo1"], "team-rap": [], "tech-shared": []}, + "org2": {"team-rex": [], "team-rap": ["repo2"], "tech-shared": []}, }, ) assert repos.get_repo_ownership(None, ["org1", "org2"]) == [ @@ -75,13 +88,18 @@ def test_looks_up_ownership_across_orgs(patch): def test_ignores_ownership_of_archived_repos(patch): patch("repos", [repo("the_org", "the_repo", archived_on=date.min)]) - patch("team_repos", {"the_org": {"team-rex": ["the_repo"], "team-rap": []}}) + patch( + "team_repos", + {"the_org": {"team-rex": ["the_repo"], "team-rap": [], "tech-shared": []}}, + ) assert repos.get_repo_ownership(None, ["the_org"]) == [] def test_returns_none_for_unknown_ownership(patch): patch("repos", [repo("the_org", "the_repo")]) - patch("team_repos", {"the_org": {"team-rex": [], "team-rap": []}}) + patch( + "team_repos", {"the_org": {"team-rex": [], "team-rap": [], "tech-shared": []}} + ) assert repos.get_repo_ownership(None, ["the_org"]) == [ {"organisation": "the_org", "repo": "the_repo", "owner": None} ] From 813ca2d9b97c86a637a57d442a5dd4ab1ac4b578 Mon Sep 17 00:00:00 2001 From: Ben Butler-Cole Date: Fri, 8 Mar 2024 13:50:27 +0000 Subject: [PATCH 2/4] Use GitHub teams for team ownership throughout instead of hard-coded exclusion list This has a side-effect of slightly changing the behaviour with respect to archived repos. We don't insist that archived repos have an owning team recorded. Now that we only display data for repos owned by _current_ teams, any archived repos that were owned by old teams or which were active before we had a team structure will not be included in the data. This doesn't affect our main use of this data, which is to understand how the teams are operating today. It potentially affects our long-term historic view of the data. In practice only 4% of PRs are excluded by this change and eye-balling the resulting charts over long timescales shows no significant differences, so we've decided that this is okay. --- metrics/github/repos.py | 118 ++--------------------------- tests/metrics/github/test_repos.py | 34 ++++++--- 2 files changed, 29 insertions(+), 123 deletions(-) diff --git a/metrics/github/repos.py b/metrics/github/repos.py index d348627c..bfcc17cf 100644 --- a/metrics/github/repos.py +++ b/metrics/github/repos.py @@ -2,7 +2,11 @@ def tech_repos(client, org): - return [r for r in query.repos(client, org) if _is_tech_owned(r)] + repo_names = [] + for team in _TECH_TEAMS: + repo_names.extend(query.team_repos(client, org, team)) + + return [r for r in query.repos(client, org) if r.name in repo_names] def get_repo_ownership(client, orgs): @@ -28,117 +32,5 @@ def get_repo_ownership(client, orgs): return repo_owners -def _is_tech_owned(repo): - # For now we are using a hard-coded list here because we don't have teams set up with repo - # lists for ebmdatalab. Later we can use the dynamically calculated repo ownership and get - # rid of this list. - # - # We use a deny-list rather than an allow-list so that newly created repos are treated as - # Tech-owned by default, in the hopes of minimizing surprise. - return not (repo.org in _NON_TECH_REPOS and repo.name in _NON_TECH_REPOS[repo.org]) - - # GitHub slugs for the teams we're interested in _TECH_TEAMS = ["team-rap", "team-rex", "tech-shared"] - - -_NON_TECH_REPOS = { - "ebmdatalab": [ - "bennett-presentations", - "bnf-code-to-dmd", - "change_detection", - "copiloting", - "clinicaltrials-act-converter", - "clinicaltrials-act-tracker", - "copiloting-publication", - "datalab-jupyter", - "dmd-hosp-only", - "euctr-tracker-code", - "funding-applications", - "funding-report", - "ghost_branded_generics_paper", - "global-trial-landscape", - "imagemagick-magick", - "improvement_radar_prototype", - "jupyter-notebooks", - "kurtosis-pericyazine", - "lidocaine-eng-ire", - "low-priority-CCG-visit-RCT", - "nsaid-covid-codelist-notebook", - "opencorona-sandpit-for-fizz", - "opensafely-output-review", - "open-nhs-hospital-use-data", - "opioids-change-detection-notebook", - "outliers", - "prescribing-queries", - "price-concessions-accuracy-notebook", - "priceshocks", - "propaganda", - "publications", - "publications-copiloted", - "retracted.net", - "retractobot", - "retractobot-archive", - "rx-cost-item-analysis", - "Rx-Quantity-for-Long-Term-Conditions", - "Rx-Quantity-for-LTCs-notebook", - "scmd-narcolepsy", - "seb-test-notebook", - "service-analytics-team", - "teaching_resource", - "trialstracker", - "vaccinations-covid-codelist-notebook", - "nimodipine-rct", - "covid_trials_tracker-covid", - "datalabsupport", - "fdaaa_trends", - "openpathology-web", - "antibiotics-rct-analysis", - "seb-docker-test", - "openprescribing-doacs-workbook", - "sps-injectable-medicines-notebook", - "html-template-demo", - "covid-prescribing-impact-notebook", - "antibiotics-covid-codelist-notebook", - "medicines-seasonality-notebook", - "one-drug-database-analysis", - "cvd-covid-codelist-notebook", - "doacs-prescribing-notebook", - "ranitidine-shortages-notebook", - "raas-covid-codelist-notebook", - "statins-covid-codelist-notebook", - "medicines-and-poisoning-notebook", - "diabetes-drugs-covid-codelist-notebook", - "bnf-less-suitable-for-prescribing", - "insulin-covid-codelist-notebook", - "respiratory-meds-covid-codelist-notebook", - "immunosuppressant-covid-codelist-notebook", - "top-10-notebook", - "cusum-for-opioids-notebook", - "steroids-covid-codelist-notebook", - "chloroquine-covid-codelist-notebook", - "antibiotics-non-oral-routes-notebook", - "nhsdigital-shieldedrules-covid-codelist-methodology", - "anticoagulant-covid-codelist-notebook", - "repeat-prescribing-pandemic-covid-notebook", - "devon-formulary-adherence-notebook", - "statins-dose-paper", - "covid19_results_reporting", - "euctr_data_quality", - "factors-associated-with-changing-notebook", - "lidocaine-change-detection-notebook", - "datalab-notebook-template", - "antidepressant-trends", - "mortality_tracking-covid-notebook", - "ppi-covid-codelist-notebook", - "openpath-dash", - "fdaaa_requirements", - "openpath-pipeline", - "qof-data", - "nhstrusts", - ], - "opensafely-core": [ - "matching", - "scribe", - ], -} diff --git a/tests/metrics/github/test_repos.py b/tests/metrics/github/test_repos.py index 1a46c8ea..3b7b3454 100644 --- a/tests/metrics/github/test_repos.py +++ b/tests/metrics/github/test_repos.py @@ -28,25 +28,39 @@ def fake(_client, *keys): def test_includes_tech_owned_repos(patch): patch( - "repos", [repo("ebmdatalab", "sysadmin"), repo("opensafely-core", "job-server")] + "team_repos", + { + "opensafely-core": { + "team-rap": ["ehrql", "cohort-extractor"], + "team-rex": ["job-server"], + "tech-shared": [".github"], + } + }, + ) + patch( + "repos", + [ + repo("opensafely-core", "ehrql"), + repo("opensafely-core", "cohort-extractor"), + repo("opensafely-core", "job-server"), + repo("opensafely-core", ".github"), + ], ) - assert len(repos.tech_repos(None, None)) == 2 + assert len(repos.tech_repos(None, "opensafely-core")) == 4 def test_excludes_non_tech_owned_repos(patch): + patch( + "team_repos", + {"opensafely-core": {"team-rap": [], "team-rex": [], "tech-shared": []}}, + ) patch( "repos", [ - repo("ebmdatalab", "clinicaltrials-act-tracker"), - repo("opensafely-core", "matching"), + repo("opensafely-core", "other-repo"), ], ) - assert len(repos.tech_repos(None, None)) == 0 - - -def test_dont_exclude_repos_from_unknown_orgs(patch): - patch("repos", [repo("other", "any")]) - assert len(repos.tech_repos(None, None)) == 1 + assert len(repos.tech_repos(None, "opensafely-core")) == 0 def test_looks_up_ownership(patch): From 686a7d1fe27b5c354a92eb0c99088390bd0342d2 Mon Sep 17 00:00:00 2001 From: Ben Butler-Cole Date: Fri, 8 Mar 2024 13:59:55 +0000 Subject: [PATCH 3/4] Explictly exclude archived repos for clarity Previously an unpredictable subset of archived repos were excluded based on team ownership records (see previous commit for details). For clarity and consistently we are now excluding them all explicitly. --- metrics/github/repos.py | 12 ++++++------ tests/metrics/github/test_repos.py | 20 ++++++++++++++++++++ 2 files changed, 26 insertions(+), 6 deletions(-) diff --git a/metrics/github/repos.py b/metrics/github/repos.py index bfcc17cf..7502b849 100644 --- a/metrics/github/repos.py +++ b/metrics/github/repos.py @@ -6,7 +6,7 @@ def tech_repos(client, org): for team in _TECH_TEAMS: repo_names.extend(query.team_repos(client, org, team)) - return [r for r in query.repos(client, org) if r.name in repo_names] + return [r for r in _active_repos(client, org) if r.name in repo_names] def get_repo_ownership(client, orgs): @@ -18,11 +18,7 @@ def get_repo_ownership(client, orgs): for repo in query.team_repos(client, org, team): ownership[repo] = team - active_repos = [ - repo for repo in query.repos(client, org) if not repo.is_archived() - ] - - for repo in active_repos: + for repo in _active_repos(client, org): if repo.name in ownership: team = ownership[repo.name] else: @@ -32,5 +28,9 @@ def get_repo_ownership(client, orgs): return repo_owners +def _active_repos(client, org): + return [repo for repo in query.repos(client, org) if not repo.is_archived()] + + # GitHub slugs for the teams we're interested in _TECH_TEAMS = ["team-rap", "team-rex", "tech-shared"] diff --git a/tests/metrics/github/test_repos.py b/tests/metrics/github/test_repos.py index 3b7b3454..2a6690a4 100644 --- a/tests/metrics/github/test_repos.py +++ b/tests/metrics/github/test_repos.py @@ -63,6 +63,26 @@ def test_excludes_non_tech_owned_repos(patch): assert len(repos.tech_repos(None, "opensafely-core")) == 0 +def test_excludes_archived_repos(patch): + patch( + "team_repos", + { + "opensafely-core": { + "team-rap": ["other-repo"], + "team-rex": [], + "tech-shared": [], + } + }, + ) + patch( + "repos", + [ + repo("opensafely-core", "other-repo", archived_on=date.min), + ], + ) + assert len(repos.tech_repos(None, "opensafely-core")) == 0 + + def test_looks_up_ownership(patch): patch( "repos", From 4c6c4d5d4c351a7da110f5e55861128567ca324e Mon Sep 17 00:00:00 2001 From: Ben Butler-Cole Date: Fri, 8 Mar 2024 14:31:50 +0000 Subject: [PATCH 4/4] Remove now-obsolete fine-grained calculation of metrics for archived repos See last commit for context. --- metrics/github/prs.py | 5 +---- metrics/github/query.py | 7 ++----- metrics/github/repos.py | 2 +- metrics/github/security.py | 3 +-- tests/metrics/github/test_prs.py | 23 ++-------------------- tests/metrics/github/test_repos.py | 8 ++++---- tests/metrics/github/test_security.py | 28 ++------------------------- 7 files changed, 13 insertions(+), 63 deletions(-) diff --git a/metrics/github/prs.py b/metrics/github/prs.py index f939c0df..2337f513 100644 --- a/metrics/github/prs.py +++ b/metrics/github/prs.py @@ -38,10 +38,7 @@ def calculate_counts(prs_by_repo, predicate): for repo, prs in prs_by_repo.items(): for pr in prs: start = pr["created_on"] - end = min( - repo.archived_on if repo.archived_on else date.today(), - pr["closed_on"] if pr["closed_on"] else date.today(), - ) + end = pr["closed_on"] if pr["closed_on"] else date.today() for day in iter_days(start, end): if predicate(pr, day): counts[(repo.org, repo.name, pr["author"], day)] += 1 diff --git a/metrics/github/query.py b/metrics/github/query.py index bd771b86..f5d4d1e6 100644 --- a/metrics/github/query.py +++ b/metrics/github/query.py @@ -32,7 +32,7 @@ def repos(client, org): org, raw_repo["name"], date_from_iso(raw_repo["createdAt"]), - date_from_iso(raw_repo["archivedAt"]), + raw_repo["archivedAt"] is not None, raw_repo["hasVulnerabilityAlertsEnabled"], ) @@ -42,12 +42,9 @@ class Repo: org: str name: str created_on: date - archived_on: date | None + is_archived: bool = False has_vulnerability_alerts_enabled: bool = False - def is_archived(self): - return bool(self.archived_on) - def team_repos(client, org, team): """The API doesn't make it easy for us to get all the information we need about repos in diff --git a/metrics/github/repos.py b/metrics/github/repos.py index 7502b849..2c648227 100644 --- a/metrics/github/repos.py +++ b/metrics/github/repos.py @@ -29,7 +29,7 @@ def get_repo_ownership(client, orgs): def _active_repos(client, org): - return [repo for repo in query.repos(client, org) if not repo.is_archived()] + return [repo for repo in query.repos(client, org) if not repo.is_archived] # GitHub slugs for the teams we're interested in diff --git a/metrics/github/security.py b/metrics/github/security.py index 8ae1283b..587b8c7e 100644 --- a/metrics/github/security.py +++ b/metrics/github/security.py @@ -34,8 +34,7 @@ def vulnerabilities(client, org, to_date): for repo in repos.tech_repos(client, org): vulns = list(map(Vulnerability.from_dict, query.vulnerabilities(client, repo))) - end = min(to_date, repo.archived_on) if repo.archived_on else to_date - for day in dates.iter_days(repo.created_on, end): + for day in dates.iter_days(repo.created_on, to_date): closed_vulns = sum(1 for v in vulns if v.is_closed_on(day)) open_vulns = sum(1 for v in vulns if v.is_open_on(day)) diff --git a/tests/metrics/github/test_prs.py b/tests/metrics/github/test_prs.py index 90c811ee..ac1abf2b 100644 --- a/tests/metrics/github/test_prs.py +++ b/tests/metrics/github/test_prs.py @@ -34,25 +34,6 @@ def test_makes_counts_for_every_day_between_pr_creation_and_now(): } -def test_drops_repos_after_archiving(): - repo_ = repo( - "an-org", - "a-repo", - archived_on=YESTERDAY, - ) - prs = { - repo_: [ - pr(author="an-author", created_on=TWO_DAYS_AGO), - ] - } - - counts = calculate_counts(prs, true) - assert counts == { - ("an-org", "a-repo", "an-author", TWO_DAYS_AGO): 1, - ("an-org", "a-repo", "an-author", YESTERDAY): 1, - } - - def test_counts_prs(): prs = { repo("an-org", "a-repo"): [ @@ -144,12 +125,12 @@ def test_was_merged_in_on(): assert not was_merged_on(pr(merged_on=TOMORROW), TODAY) -def repo(org, name, archived_on=None): +def repo(org, name, is_archived=False): return Repo( org, name, created_on=date.min, - archived_on=archived_on, + is_archived=is_archived, has_vulnerability_alerts_enabled=False, ) diff --git a/tests/metrics/github/test_repos.py b/tests/metrics/github/test_repos.py index 2a6690a4..013e41de 100644 --- a/tests/metrics/github/test_repos.py +++ b/tests/metrics/github/test_repos.py @@ -77,7 +77,7 @@ def test_excludes_archived_repos(patch): patch( "repos", [ - repo("opensafely-core", "other-repo", archived_on=date.min), + repo("opensafely-core", "other-repo", is_archived=True), ], ) assert len(repos.tech_repos(None, "opensafely-core")) == 0 @@ -121,7 +121,7 @@ def test_looks_up_ownership_across_orgs(patch): def test_ignores_ownership_of_archived_repos(patch): - patch("repos", [repo("the_org", "the_repo", archived_on=date.min)]) + patch("repos", [repo("the_org", "the_repo", is_archived=True)]) patch( "team_repos", {"the_org": {"team-rex": ["the_repo"], "team-rap": [], "tech-shared": []}}, @@ -139,11 +139,11 @@ def test_returns_none_for_unknown_ownership(patch): ] -def repo(org, name, archived_on=None): +def repo(org, name, is_archived=False): return Repo( org=org, name=name, created_on=date.min, - archived_on=archived_on, + is_archived=is_archived, has_vulnerability_alerts_enabled=False, ) diff --git a/tests/metrics/github/test_security.py b/tests/metrics/github/test_security.py index d242dce3..283ff252 100644 --- a/tests/metrics/github/test_security.py +++ b/tests/metrics/github/test_security.py @@ -46,35 +46,11 @@ def test_vulnerability_closed_on_is_closed(): assert v.is_closed_on(date(2023, 10, 29)) -def test_vulnerabilities_ignores_archived_repos_after_archive_date(monkeypatch): - archive_date = date(2022, 1, 3) - - def fake_repos(client, org): - return [ - Repo( - "anything", - "anything", - created_on=date(2022, 1, 1), - archived_on=archive_date, - ) - ] - - monkeypatch.setattr(security.repos, "tech_repos", fake_repos) - - def fake_vulnerabilities(client, repo): - return [] - - monkeypatch.setattr(security.query, "vulnerabilities", fake_vulnerabilities) - - result = security.vulnerabilities({}, "org", date(2022, 1, 10)) - assert result[-1]["time"] == archive_date - - def test_vulnerabilities(monkeypatch): def fake_repos(client, org): return [ - Repo(org, "test", date(2023, 10, 13), None, True), - Repo(org, "test2", date(2023, 10, 13), None, True), + Repo(org, "test", date(2023, 10, 13), False, True), + Repo(org, "test2", date(2023, 10, 13), False, True), ] monkeypatch.setattr(security.repos, "tech_repos", fake_repos)