Distinguish content PRs #159

Merged 2 commits on Apr 9, 2024
45 changes: 42 additions & 3 deletions metrics/github/github.py
@@ -8,6 +8,22 @@
# Slugs (not names!) of the GitHub entities we're interested in
_TECH_TEAMS = ["team-rap", "team-rex", "tech-shared"]
_ORGS = ["ebmdatalab", "opensafely-core"]
# Some repos (for example the websites) frequently have PRs created by people
# outside the tech teams. We don't want our delivery metrics to be skewed by these
# and we don't necessarily want to hold people in other teams to the same hygiene
# standards as we hold ourselves. So we treat them differently. (We don't want to
# exclude such PRs from other repos because it's definitely interesting if there are
# old PRs (or lots of PRs) created by non-tech-team members in those repos.)
_CONTENT_REPOS = {
"ebmdatalab": [
"opensafely.org",
"team-manual",
"bennett.ox.ac.uk",
"openprescribing",
]
}
_MANAGERS = {"sebbacon", "benbc"}
_EX_DEVELOPERS = {"ghickman", "milanwiedemann", "CarolineMorton"}


@dataclass(frozen=True)
@@ -23,6 +39,10 @@ class Repo:
def is_tech_owned(self):
return self.team in _TECH_TEAMS

@property
def is_content_repo(self):
return self.org in _CONTENT_REPOS and self.name in _CONTENT_REPOS[self.org]

@classmethod
def from_dict(cls, data, org, team):
return cls(
@@ -42,6 +62,7 @@ class PR:
created_on: datetime.date
merged_on: datetime.date
closed_on: datetime.date
is_content: bool

def was_old_on(self, date):
opened = self.created_on
@@ -56,13 +77,17 @@ def was_merged_on(self, date):
return self.merged_on and date == self.merged_on

@classmethod
def from_dict(cls, data, repo):
def from_dict(cls, data, repo, tech_team_members):
author = data["author"]["login"]
is_content = repo.is_content_repo and author not in tech_team_members

return cls(
repo,
data["author"]["login"],
author,
date_from_iso(data["createdAt"]),
date_from_iso(data["mergedAt"]),
date_from_iso(data["closedAt"]),
is_content,
)


@@ -84,8 +109,9 @@ def from_dict(cls, data, repo):


def tech_prs():
tech_team_members = _tech_team_members()
return [
PR.from_dict(pr, repo)
PR.from_dict(pr, repo, tech_team_members)
for repo in tech_repos()
for pr in query.prs(repo.org, repo.name)
]
@@ -119,3 +145,16 @@ def _get_repos():

def _repo_owners(org):
return {repo: team for team in _TECH_TEAMS for repo in query.team_repos(org, team)}


def _tech_team_members():
return (
_MANAGERS
| _EX_DEVELOPERS
| {
person
for org in _ORGS
for team in _TECH_TEAMS
for person in query.team_members(org, team)
}
)
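Taken together, the github.py changes mean a PR is flagged as content only when its repo is listed in `_CONTENT_REPOS` and its author is outside the combined tech-team set. A minimal sketch of that classification logic, with author and repo names in the example invented purely for illustration:

```python
# Illustrative sketch only; the real logic lives in Repo.is_content_repo and
# PR.from_dict in metrics/github/github.py.
_CONTENT_REPOS = {
    "ebmdatalab": ["opensafely.org", "team-manual", "bennett.ox.ac.uk", "openprescribing"],
}

def is_content_pr(org, repo_name, author, tech_team_members):
    # Content PRs are those in a known content repo whose author is not a
    # tech-team member (managers and ex-developers count as members).
    in_content_repo = org in _CONTENT_REPOS and repo_name in _CONTENT_REPOS[org]
    return in_content_repo and author not in tech_team_members

members = {"sebbacon", "benbc"}
assert is_content_pr("ebmdatalab", "opensafely.org", "an-editor", members)      # content
assert not is_content_pr("ebmdatalab", "opensafely.org", "benbc", members)      # tech author
assert not is_content_pr("opensafely-core", "some-tech-repo", "an-editor", members)  # not a content repo
```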
5 changes: 3 additions & 2 deletions metrics/github/metrics.py
@@ -22,14 +22,14 @@ def calculate_pr_counts(prs, predicate):
end = pr.closed_on if pr.closed_on else datetime.date.today()
for day in iter_days(start, end):
if predicate(pr, day):
counts[(pr.repo.org, pr.repo.name, pr.author, day)] += 1
counts[(pr.repo.org, pr.repo.name, pr.author, pr.is_content, day)] += 1
return dict(counts)


def convert_pr_counts_to_metrics(counts, name):
metrics = []
for coord, count in counts.items():
org, repo, author, date_ = coord
org, repo, author, is_content, date_ = coord
timestamp = datetime.datetime.combine(date_, datetime.time())
metrics.append(
{
@@ -38,6 +38,7 @@
"organisation": org,
"repo": repo,
"author": author,
"is_content": is_content,
"value": count,
}
)
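The extra key element flows through both functions: the counts dictionary is now keyed by a 5-tuple that includes `is_content`, and the emitted metric rows carry it as a field. A rough sketch of that keying, with the repo and author values made up for illustration:

```python
# Illustrative only: mirrors how calculate_pr_counts() keys its counts and how
# convert_pr_counts_to_metrics() unpacks them after this change.
from collections import defaultdict
import datetime

counts = defaultdict(int)
counts[("ebmdatalab", "opensafely.org", "an-editor", True, datetime.date(2024, 4, 1))] += 1
counts[("ebmdatalab", "openprescribing", "benbc", False, datetime.date(2024, 4, 1))] += 1

for (org, repo, author, is_content, day), count in counts.items():
    # Each tuple becomes one metric row, with is_content as a new dimension.
    print(org, repo, author, is_content, day, count)
```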
12 changes: 10 additions & 2 deletions metrics/github/query.py
@@ -32,11 +32,19 @@ def team_repos(org, team):
"""The API doesn't make it easy for us to get all the information we need about repos in
one place, so we just return a list of repos here and join that to the richer repo objects
in the caller."""
results = _client().rest_query("/orgs/{org}/teams/{team}/repos", org=org, team=team)
for repo in results:
repos = _client().rest_query("/orgs/{org}/teams/{team}/repos", org=org, team=team)
for repo in repos:
yield repo["name"]


def team_members(org, team):
members = _client().rest_query(
"/orgs/{org}/teams/{team}/members", org=org, team=team
)
for member in members:
yield member["login"]


def vulnerabilities(org, repo):
query = """
query vulnerabilities($cursor: String, $org: String!, $repo: String!) {
1 change: 1 addition & 0 deletions metrics/timescaledb/tables.py
@@ -20,6 +20,7 @@
Column("name", Text, primary_key=True),
Column("value", Integer),
Column("author", Text, primary_key=True),
Column("is_content", Boolean, primary_key=True),
Column("organisation", Text, primary_key=True),
Column("repo", Text, primary_key=True),
)
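The new column uses the SQLAlchemy `Boolean` type and joins the composite primary key. As a rough sketch of the surrounding table definition (the table name matches the queries below, but the `time` column, timestamp type, and import list are assumptions, since they fall outside the visible diff):

```python
# Sketch only; the real definition lives in metrics/timescaledb/tables.py.
from sqlalchemy import TIMESTAMP, Boolean, Column, Integer, MetaData, Table, Text

metadata = MetaData()

github_pull_requests = Table(
    "github_pull_requests",
    metadata,
    Column("time", TIMESTAMP(timezone=True), primary_key=True),  # assumed column
    Column("name", Text, primary_key=True),
    Column("value", Integer),
    Column("author", Text, primary_key=True),
    Column("is_content", Boolean, primary_key=True),  # new in this PR
    Column("organisation", Text, primary_key=True),
    Column("repo", Text, primary_key=True),
)
```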
61 changes: 61 additions & 0 deletions queries/current-old-prs-content.sql
@@ -0,0 +1,61 @@
WITH
last_saturday AS (
SELECT CURRENT_DATE - (1 + CAST(extract(dow FROM CURRENT_DATE) AS INT)) % 7 as the_date
),
old_prs_only AS (
SELECT time, organisation, repo, author, is_content, value
FROM github_pull_requests
WHERE name = 'queue_older_than_7_days'
),
in_timeframe AS (
SELECT time, organisation, repo, author, is_content, value
FROM old_prs_only
WHERE $__timeFilter(time)
),
fields_munged AS (
-- the time field is a timestamp, but we only ever write midnight;
-- we need to keep it as a timestamp type for bucketing below
SELECT time as day, organisation||'/'||repo AS repo, author, is_content, value as num_prs
FROM in_timeframe
),
dependabot_removed AS (
SELECT day, repo, author, is_content, num_prs
FROM fields_munged
WHERE author NOT LIKE 'dependabot%'
),
just_content AS (
SELECT day, repo, author, num_prs
FROM dependabot_removed
WHERE is_content
),
authors_aggregated AS (
SELECT day, repo, sum(num_prs) as num_prs
FROM just_content
GROUP BY day, repo
),
partial_week_ignored AS (
SELECT day, repo, num_prs
FROM authors_aggregated, last_saturday
WHERE day < last_saturday.the_date
),
last_week_only AS (
SELECT day, repo, num_prs
FROM partial_week_ignored, last_saturday
WHERE day >= last_saturday.the_date - 7
),
bucketed_in_weeks AS (
SELECT
-- label buckets with the (exclusive) end date;
-- the 'origin' argument can be _any_ Saturday
time_bucket('1 week', day, last_saturday.the_date) + '7 days' as bucket,
repo,
-- aggregate by taking the last value because this is a gauge, not a count
last(num_prs, day) as num_prs
FROM
last_week_only, last_saturday
GROUP BY bucket, repo
)
SELECT repo as "Repo", num_prs AS "Old PRs"
FROM bucketed_in_weeks
ORDER BY bucket DESC, num_prs DESC
LIMIT 5
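The `last_saturday` CTE leans on Postgres's `extract(dow ...)`, where Sunday is 0 and Saturday is 6, so `CURRENT_DATE - (1 + dow) % 7` lands on the most recent Saturday (today, if today is a Saturday). A small Python check of the same arithmetic, illustrative only:

```python
import datetime

def most_recent_saturday(today):
    # Postgres extract(dow) is Sunday=0..Saturday=6; Python's weekday() is
    # Monday=0..Sunday=6, so convert before applying the query's formula.
    dow = (today.weekday() + 1) % 7
    return today - datetime.timedelta(days=(1 + dow) % 7)

assert most_recent_saturday(datetime.date(2024, 4, 6)) == datetime.date(2024, 4, 6)  # a Saturday maps to itself
assert most_recent_saturday(datetime.date(2024, 4, 9)) == datetime.date(2024, 4, 6)  # Tue 9 Apr -> Sat 6 Apr
```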
14 changes: 7 additions & 7 deletions queries/current-old-prs-humans.sql
@@ -3,34 +3,34 @@ WITH
SELECT CURRENT_DATE - (1 + CAST(extract(dow FROM CURRENT_DATE) AS INT)) % 7 as the_date
),
old_prs_only AS (
SELECT time, organisation, repo, author, value
SELECT time, organisation, repo, author, is_content, value
FROM github_pull_requests
WHERE name = 'queue_older_than_7_days'
),
in_timeframe AS (
SELECT time, organisation, repo, author, value
SELECT time, organisation, repo, author, is_content, value
FROM old_prs_only
WHERE $__timeFilter(time)
),
fields_munged AS (
-- the time field is a timestamp, but we only ever write midnight;
-- we need to keep it as a timestamp type for bucketing below
SELECT time as day, organisation||'/'||repo AS repo, author, value as num_prs
SELECT time as day, organisation||'/'||repo AS repo, author, is_content, value as num_prs
FROM in_timeframe
),
dependabot_removed AS (
SELECT day, repo, author, num_prs
SELECT day, repo, author, is_content, num_prs
FROM fields_munged
WHERE author NOT LIKE 'dependabot%'
),
non_dev_op_ignored AS (
content_ignored AS (
SELECT day, repo, author, num_prs
FROM dependabot_removed
WHERE NOT (repo = 'ebmdatalab/openprescribing' AND author IN ('richiecroker', 'chrisjwood16'))
WHERE NOT is_content
),
authors_aggregated AS (
SELECT day, repo, sum(num_prs) as num_prs
FROM non_dev_op_ignored
FROM content_ignored
GROUP BY day, repo
),
partial_week_ignored AS (
55 changes: 55 additions & 0 deletions queries/old-prs-content.sql
@@ -0,0 +1,55 @@
WITH
last_saturday AS (
SELECT CURRENT_DATE - (1 + CAST(extract(dow FROM CURRENT_DATE) AS INT)) % 7 as the_date
),
old_prs_only AS (
SELECT time, organisation, repo, author, is_content, value
FROM github_pull_requests
WHERE name = 'queue_older_than_7_days'
),
in_timeframe AS (
SELECT time, organisation, repo, author, is_content, value
FROM old_prs_only
WHERE $__timeFilter(time)
),
fields_munged AS (
-- the time field is a timestamp, but we only ever write midnight;
-- we need to keep it as a timestamp type for bucketing below
SELECT time as day, organisation||'/'||repo AS repo, author, is_content, value as num_prs
FROM in_timeframe
),
dependabot_removed AS (
SELECT day, repo, author, is_content, num_prs
FROM fields_munged
WHERE author NOT LIKE 'dependabot%'
),
just_content AS (
SELECT day, repo, author, num_prs
FROM dependabot_removed
WHERE is_content
),
authors_aggregated AS (
SELECT day, repo, sum(num_prs) as num_prs
FROM just_content
GROUP BY day, repo
),
partial_week_ignored AS (
SELECT day, repo, num_prs
FROM authors_aggregated, last_saturday
WHERE day < last_saturday.the_date
),
bucketed_in_weeks AS (
SELECT
-- label buckets with the (exclusive) end date;
-- the 'origin' argument can be _any_ Saturday
time_bucket('1 week', day, last_saturday.the_date) + '7 days' as bucket,
repo,
-- aggregate by taking the last value because this is a gauge, not a count
last(num_prs, day) as num_prs
FROM
partial_week_ignored, last_saturday
GROUP BY bucket, repo
)
SELECT bucket, repo, num_prs
FROM bucketed_in_weeks
ORDER BY bucket DESC, repo
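The `time_bucket`/`last()` pair treats the daily value as a gauge: within each weekly bucket only the most recent day's reading survives, rather than a sum. A rough Python equivalent of that aggregation, with dates and repo names invented for illustration (note the query additionally labels each bucket with its exclusive end date via `+ '7 days'`):

```python
# Illustrative only: keep the latest daily reading per (bucket, repo), the way
# time_bucket('1 week', day, origin) + last(num_prs, day) does for a gauge.
import datetime

def bucket_start(day, origin):
    # Any Saturday works as the origin; buckets are 7-day windows aligned to it.
    return origin + datetime.timedelta(days=((day - origin).days // 7) * 7)

readings = [
    (datetime.date(2024, 3, 31), "ebmdatalab/opensafely.org", 4),
    (datetime.date(2024, 4, 3), "ebmdatalab/opensafely.org", 2),
]

latest = {}
for day, repo, num_prs in sorted(readings):
    key = (bucket_start(day, origin=datetime.date(2024, 3, 30)), repo)
    latest[key] = num_prs  # later days overwrite earlier ones, so the last value wins

print(latest)  # {(datetime.date(2024, 3, 30), 'ebmdatalab/opensafely.org'): 2}
```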
14 changes: 7 additions & 7 deletions queries/old-prs-humans.sql
@@ -3,34 +3,34 @@ WITH
SELECT CURRENT_DATE - (1 + CAST(extract(dow FROM CURRENT_DATE) AS INT)) % 7 as the_date
),
old_prs_only AS (
SELECT time, organisation, repo, author, value
SELECT time, organisation, repo, author, is_content, value
FROM github_pull_requests
WHERE name = 'queue_older_than_7_days'
),
in_timeframe AS (
SELECT time, organisation, repo, author, value
SELECT time, organisation, repo, author, is_content, value
FROM old_prs_only
WHERE $__timeFilter(time)
),
fields_munged AS (
-- the time field is a timestamp, but we only ever write midnight;
-- we need to keep it as a timestamp type for bucketing below
SELECT time as day, organisation||'/'||repo AS repo, author, value as num_prs
SELECT time as day, organisation||'/'||repo AS repo, author, is_content, value as num_prs
FROM in_timeframe
),
dependabot_removed AS (
SELECT day, repo, author, num_prs
SELECT day, repo, author, is_content, num_prs
FROM fields_munged
WHERE author NOT LIKE 'dependabot%'
),
non_dev_op_ignored AS (
content_ignored AS (
SELECT day, repo, author, num_prs
FROM dependabot_removed
WHERE NOT (repo = 'ebmdatalab/openprescribing' AND author IN ('richiecroker', 'chrisjwood16'))
WHERE NOT is_content
),
authors_aggregated AS (
SELECT day, repo, sum(num_prs) as num_prs
FROM non_dev_op_ignored
FROM content_ignored
GROUP BY day, repo
),
partial_week_ignored AS (
14 changes: 7 additions & 7 deletions queries/throughput-humals.sql
@@ -3,34 +3,34 @@ WITH
SELECT CURRENT_DATE - (1 + CAST(extract(dow FROM CURRENT_DATE) AS INT)) % 7 as the_date
),
throughputs_only AS (
SELECT time, organisation, repo, author, value
SELECT time, organisation, repo, author, is_content, value
FROM github_pull_requests
WHERE name = 'prs_merged'
),
in_timeframe AS (
SELECT time, organisation, repo, author, value
SELECT time, organisation, repo, author, is_content, value
FROM throughputs_only
WHERE $__timeFilter(time)
),
fields_munged AS (
-- the time field is a timestamp, but we only ever write midnight;
-- we need to keep it as a timestamp type for bucketing below
SELECT time as day, organisation||'/'||repo AS repo, author, value as throughput
SELECT time as day, organisation||'/'||repo AS repo, author, is_content, value as throughput
FROM in_timeframe
),
dependabot_removed AS (
SELECT day, repo, author, throughput
SELECT day, repo, author, is_content, throughput
FROM fields_munged
WHERE author NOT LIKE 'dependabot%'
),
non_dev_op_ignored AS (
content_ignored AS (
SELECT day, repo, author, throughput
FROM dependabot_removed
WHERE NOT (repo = 'ebmdatalab/openprescribing' AND author IN ('richiecroker', 'chrisjwood16'))
WHERE NOT is_content
),
authors_aggregated AS (
SELECT day, repo, sum(throughput) as throughput
FROM non_dev_op_ignored
FROM content_ignored
GROUP BY day, repo
),
partial_week_ignored AS (