Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Refactor to introduce a Repo value type #140

Merged
merged 7 commits into from
Mar 6, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion justfile
Original file line number Diff line number Diff line change
Expand Up @@ -133,7 +133,7 @@ check: black ruff
# fix formatting and import sort ordering
fix: devenv
$BIN/black .
$BIN/ruff --fix .
$BIN/ruff check --fix .


# Run the grafana stack
Expand Down
4 changes: 2 additions & 2 deletions metrics/github/prs.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,12 +39,12 @@ def calculate_counts(prs_by_repo, predicate):
for pr in prs:
start = pr["created_on"]
end = min(
repo["archived_on"] if repo["archived_on"] else date.today(),
repo.archived_on if repo.archived_on else date.today(),
pr["closed_on"] if pr["closed_on"] else date.today(),
)
for day in iter_days(start, end):
if predicate(pr, day):
counts[(repo["org"], repo["name"], pr["author"], day)] += 1
counts[(repo.org, repo.name, pr["author"], day)] += 1
return dict(counts)


Expand Down
63 changes: 33 additions & 30 deletions metrics/github/query.py
Original file line number Diff line number Diff line change
@@ -1,30 +1,20 @@
import itertools
import os
from dataclasses import dataclass
from datetime import date

from metrics.github.repos import tech_owned_repo
from metrics.github.repos import NON_TECH_REPOS
from metrics.tools.dates import date_from_iso


# We want to use some of these objects as keys in dicts. This is a pretty half-hearted
# implementation, but it does as much as we need.
class FrozenDict:
    """Hashable, read-only wrapper around a dict so it can be used as a dict key.

    Only item lookup, hashing and equality are supported. Every value in the
    wrapped dict must itself be hashable, otherwise ``hash()`` raises
    ``TypeError``.
    """

    def __init__(self, dict_):
        # Shallow-copy so that later mutation of the caller's dict cannot change
        # our hash while we are in use as a key.
        self._dict = dict(dict_)

    def __getitem__(self, key):
        return self._dict[key]

    def __eq__(self, other):
        # Defined together with __hash__ so that two wrappers holding the same
        # items are treated as the same key.
        if not isinstance(other, FrozenDict):
            return NotImplemented
        return self._dict == other._dict

    def __hash__(self):
        # A frozenset makes the hash independent of insertion order, which keeps
        # it consistent with dict equality used in __eq__.
        return hash(frozenset(self._dict.items()))


def repos(client, org):
query = """
query repos($cursor: String, $org: String!) {
organization(login: $org) {
repositories(first: 100, after: $cursor) {
nodes {
name
createdAt
archivedAt
hasVulnerabilityAlertsEnabled
}
Expand All @@ -39,20 +29,33 @@ def repos(client, org):
for raw_repo in maybe_truncate(
client.get_query(query, path=["organization", "repositories"], org=org)
):
repo = FrozenDict(
{
"org": org,
"name": raw_repo["name"],
"archived_on": date_from_iso(raw_repo["archivedAt"]),
"hasVulnerabilityAlertsEnabled": raw_repo[
"hasVulnerabilityAlertsEnabled"
],
}
repo = Repo(
org,
raw_repo["name"],
date_from_iso(raw_repo["createdAt"]),
date_from_iso(raw_repo["archivedAt"]),
raw_repo["hasVulnerabilityAlertsEnabled"],
)
if tech_owned_repo(repo):
if repo.is_tech_owned():
yield repo


@dataclass(frozen=True)
class Repo:
    """Immutable (and therefore hashable) record describing one repository."""

    org: str
    name: str
    created_on: date
    archived_on: date | None
    has_vulnerability_alerts_enabled: bool = False

    def is_tech_owned(self):
        """Return False only when this repo is listed in NON_TECH_REPOS for its org."""
        # Deny-list rather than allow-list: a newly created repo (or an org we
        # have no entry for) counts as Tech-owned by default, in the hopes of
        # minimizing surprise.
        if self.org not in NON_TECH_REPOS:
            return True
        return self.name not in NON_TECH_REPOS[self.org]


def vulnerabilities(client, repo):
query = """
query vulnerabilities($cursor: String, $org: String!, $repo: String!) {
Expand All @@ -78,8 +81,8 @@ def vulnerabilities(client, repo):
return client.get_query(
query,
path=["organization", "repository", "vulnerabilityAlerts"],
org=repo["org"],
repo=repo["name"],
org=repo.org,
repo=repo.name,
)


Expand Down Expand Up @@ -111,13 +114,13 @@ def prs(client, repo):
client.get_query(
query,
path=["organization", "repository", "pullRequests"],
org=repo["org"],
repo=repo["name"],
org=repo.org,
repo=repo.name,
)
):
yield {
"org": repo["org"],
"repo": repo["name"],
"org": repo.org,
"repo": repo.name,
"author": pr["author"]["login"],
"closed_on": date_from_iso(pr["closedAt"]),
"created_on": date_from_iso(pr["createdAt"]),
Expand Down
8 changes: 0 additions & 8 deletions metrics/github/repos.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,11 +98,3 @@
"scribe",
],
}


def tech_owned_repo(repo):
    """Return False only when the repo is listed in NON_TECH_REPOS for its org.

    ``repo`` is looked up by the keys ``"org"`` and ``"name"``.
    """
    # Deny-list rather than allow-list: newly created repos (and orgs with no
    # entry at all) are treated as Tech-owned by default, in the hopes of
    # minimizing surprise.
    org = repo["org"]
    if org not in NON_TECH_REPOS:
        return True
    return repo["name"] not in NON_TECH_REPOS[org]
46 changes: 9 additions & 37 deletions metrics/github/security.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,45 +28,16 @@ def from_dict(my_dict):
)


@dataclass
class Repo:
    """A repository together with its vulnerability records."""

    name: str
    org: str
    has_alerts_enabled: bool
    vulnerabilities: list[Vulnerability]

    def __post_init__(self):
        # Order the list in place, oldest first, so that earliest_date() can
        # simply read the first element.
        self.vulnerabilities.sort(key=lambda vuln: vuln.created_on)

    def earliest_date(self, default):
        """Return the oldest vulnerability's ``created_on``, or ``default`` if there are none."""
        return self.vulnerabilities[0].created_on if self.vulnerabilities else default

def vulnerabilities(client, org, to_date):
metrics = []

def get_repos(client, org):
for repo in query.repos(client, org):
if repo["archived_on"]:
continue
vulns = list(map(Vulnerability.from_dict, query.vulnerabilities(client, repo)))

vulnerabilities = []
for vuln in query.vulnerabilities(client, repo):
vulnerabilities.append(Vulnerability.from_dict(vuln))

yield Repo(
name=repo["name"],
org=repo["org"],
has_alerts_enabled=repo["hasVulnerabilityAlertsEnabled"],
vulnerabilities=vulnerabilities,
)


def vulnerabilities(client, org, to_date):
metrics = []
for repo in get_repos(client, org):
for day in dates.iter_days(repo.earliest_date(default=to_date), to_date):
closed_vulns = sum(1 for v in repo.vulnerabilities if v.is_closed_on(day))
open_vulns = sum(1 for v in repo.vulnerabilities if v.is_open_on(day))
end = min(to_date, repo.archived_on) if repo.archived_on else to_date
for day in dates.iter_days(repo.created_on, end):
closed_vulns = sum(1 for v in vulns if v.is_closed_on(day))
open_vulns = sum(1 for v in vulns if v.is_open_on(day))

metrics.append(
{
Expand All @@ -75,8 +46,9 @@ def vulnerabilities(client, org, to_date):
"open": open_vulns,
"organisation": repo.org,
"repo": repo.name,
"has_alerts_enabled": repo.has_alerts_enabled,
"has_alerts_enabled": repo.has_vulnerability_alerts_enabled,
"value": 0, # needed for the timescaledb
}
)

return metrics
4 changes: 2 additions & 2 deletions metrics/timescaledb/db.py
Original file line number Diff line number Diff line change
Expand Up @@ -152,9 +152,9 @@ def write(table, rows, engine=None):
constraint = inspect(engine).get_pk_constraint(table.name)["name"]

with engine.begin() as connection:
# batch our values (which are currently 5 item dicts) so we don't
# batch our values (which are currently up-to-7 item dicts) so we don't
# hit the 65535 params limit
for values in batched(rows, 10_000):
for values in batched(rows, 9_000):
benbc marked this conversation as resolved.
Show resolved Hide resolved
stmt = insert(table).values(values)

# use the constraint for this table to drive upserting where the
Expand Down
9 changes: 3 additions & 6 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -65,22 +65,19 @@ exclude = [
"htmlcov",
"venv",
]
extend-select = [
lint.extend-select = [
"A", # flake8-builtins
"I", # isort
"INP", # flake8-no-pep420
"ISC", # flake8-implicit-str-concat
"UP", # pyupgrade
"W", # pycodestyle warning
]
extend-ignore = [
lint.extend-ignore = [
"E501",
"E731",
]

[tool.ruff.isort]
lines-after-imports = 2

lint.isort.lines-after-imports = 2

[tool.setuptools.packages.find]
include = ["metrics*"]
4 changes: 1 addition & 3 deletions requirements.prod.txt
Original file line number Diff line number Diff line change
Expand Up @@ -168,9 +168,7 @@ idna==3.4 \
psycopg[binary]==3.1.12 \
--hash=sha256:8ec5230d6a7eb654b4fb3cf2d3eda8871d68f24807b934790504467f1deee9f8 \
--hash=sha256:cec7ad2bc6a8510e56c45746c631cf9394148bdc8a9a11fd8cf8554ce129ae78
# via
# psycopg
# sqlalchemy
# via sqlalchemy
psycopg-binary==3.1.12 \
--hash=sha256:000838cb5ab7851116b462e58893a96b0f1e35864135a6283f3242a730ec45d3 \
--hash=sha256:03a851123d0155e1d6ca5b6cccf624e2fc71c8f7eae76f5100196e0fca047d30 \
Expand Down
14 changes: 7 additions & 7 deletions tests/metrics/github/test_prs.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import pytest

from metrics.github.prs import calculate_counts, is_old, was_merged_on
from metrics.github.query import FrozenDict
from metrics.github.query import Repo


TODAY = date(year=2023, month=6, day=10)
Expand Down Expand Up @@ -145,12 +145,12 @@ def test_was_merged_in_on():


def repo(org, name, archived_on=None):
return FrozenDict(
{
"org": org,
"name": name,
"archived_on": archived_on if archived_on else None,
}
return Repo(
org,
name,
created_on=date.min,
archived_on=archived_on,
has_vulnerability_alerts_enabled=False,
)


Expand Down
22 changes: 17 additions & 5 deletions tests/metrics/github/test_repos.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,24 @@
from metrics.github.repos import tech_owned_repo
from datetime import date

from metrics.github.query import Repo


def test_dont_filter_out_repos_from_unknown_orgs():
assert tech_owned_repo({"name": "any", "org": "other"})
assert make_repo(org="other", name="any").is_tech_owned()
benbc marked this conversation as resolved.
Show resolved Hide resolved


def test_filtering_of_tech_owned_repos():
assert tech_owned_repo({"name": "metrics", "org": "ebmdatalab"})
assert not tech_owned_repo(
{"name": "clinicaltrials-act-tracker", "org": "ebmdatalab"}
assert make_repo(org="ebmdatalab", name="metrics").is_tech_owned()
assert not make_repo(
org="ebmdatalab", name="clinicaltrials-act-tracker"
).is_tech_owned()


def make_repo(org, name):
    """Build a ``Repo`` for the given org and name with fixed placeholder values elsewhere."""
    # Only org and name vary between tests; the remaining fields are pinned.
    placeholder_fields = {
        "created_on": date.min,
        "archived_on": None,
        "has_vulnerability_alerts_enabled": False,
    }
    return Repo(org=org, name=name, **placeholder_fields)
Loading