Skip to content

Commit

Permalink
github-to-sqlite pull-requests command (#48)
Browse files Browse the repository at this point in the history
Thanks, @adamjonas
  • Loading branch information
adamjonas authored Nov 29, 2020
1 parent 6721972 commit b37f555
Show file tree
Hide file tree
Showing 5 changed files with 620 additions and 0 deletions.
13 changes: 13 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ Save data from GitHub to a SQLite database.
- [How to install](#how-to-install)
- [Authentication](#authentication)
- [Fetching issues for a repository](#fetching-issues-for-a-repository)
- [Fetching pull requests for a repository](#fetching-pull-requests-for-a-repository)
- [Fetching issue comments for a repository](#fetching-issue-comments-for-a-repository)
- [Fetching commits for a repository](#fetching-commits-for-a-repository)
- [Fetching tags for a repository](#fetching-tags-for-a-repository)
Expand Down Expand Up @@ -64,6 +65,18 @@ You can use the `--issue` option to only load just one specific issue:

$ github-to-sqlite issues github.db simonw/datasette --issue=1

## Fetching pull-requests for a repository

While pull-requests are a type of issue, you will get more information on pull-requests by pulling them separately — for example, whether a pull-request has been merged and when.

Following the API of issues, the `pull-requests` command retrieves all of the pull-requests belonging to a specified repository.

$ github-to-sqlite pull-requests github.db simonw/datasette

You can use the `--pull-request` option to load just one specific pull-request:

$ github-to-sqlite pull-requests github.db simonw/datasette --pull-request=81

## Fetching issue comments for a repository

The `issue-comments` command retrieves all of the comments on all of the issues in a repository.
Expand Down
34 changes: 34 additions & 0 deletions github_to_sqlite/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,40 @@ def issues(db_path, repo, issue, auth, load):
utils.save_issues(db, issues, repo_full)
utils.ensure_db_shape(db)

@cli.command(name="pull-requests")
@click.argument(
    "db_path",
    type=click.Path(file_okay=True, dir_okay=False, allow_dash=False),
    required=True,
)
@click.argument("repo", required=False)
@click.option("--pull-request", help="Just pull this pull-request number")
@click.option(
    "-a",
    "--auth",
    type=click.Path(file_okay=True, dir_okay=False, allow_dash=True),
    default="auth.json",
    help="Path to auth.json token file",
)
@click.option(
    "--load",
    type=click.Path(file_okay=True, dir_okay=False, allow_dash=True, exists=True),
    help="Load pull-requests JSON from this file instead of the API",
)
def pull_requests(db_path, repo, pull_request, auth, load):
    "Save pull_requests for a specified repository, e.g. simonw/datasette"
    # NOTE: the docstring above is kept verbatim; click displays it as the
    # command's help text.
    db = sqlite_utils.Database(db_path)
    token = load_token(auth)
    # The repo record is fetched even when --load is used; it is passed to
    # utils.save_pull_requests together with the pull requests.
    repo_full = utils.fetch_repo(repo, token)
    if load:
        # Use a context manager so the file handle is closed promptly
        # instead of being left for the garbage collector.
        with open(load) as fp:
            fetched = json.load(fp)
    else:
        fetched = utils.fetch_pull_requests(repo, token, pull_request)

    # fetch_pull_requests is a generator; materialize it before saving.
    fetched = list(fetched)
    utils.save_pull_requests(db, fetched, repo_full)
    utils.ensure_db_shape(db)


@cli.command(name="issue-comments")
@click.argument(
Expand Down
75 changes: 75 additions & 0 deletions github_to_sqlite/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
"commits": ["message"],
"issue_comments": ["body"],
"issues": ["title", "body"],
"pull_requests": ["title", "body"],
"labels": ["name", "description"],
"licenses": ["name"],
"milestones": ["title", "description"],
Expand Down Expand Up @@ -149,6 +150,70 @@ def save_issues(db, issues, repo):
table.m2m("labels", label, pk="id")


def save_pull_requests(db, pull_requests, repo):
    """Save pull request records into the ``pull_requests`` table of ``db``.

    ``pull_requests`` is an iterable of dicts (presumably as returned by the
    GitHub pulls API - confirm against the caller) and ``repo`` is a dict
    whose ``"id"`` is stored as the ``repo`` foreign key on every row.

    Each record is flattened before insertion:

    * every key ending in ``"url"`` is dropped, then ``url`` is set from
      ``_links.html.href``;
    * ``user`` and ``assignee`` are replaced by the value ``save_user`` returns;
    * ``milestone`` is replaced by the value ``save_milestone`` returns;
    * ``head`` and ``base`` are reduced to their ``sha``;
    * ``assignees``, ``active_lock_reason``, ``requested_reviewers`` and
      ``requested_teams`` are discarded;
    * ``labels`` are stored through a many-to-many relation to ``labels``.

    Rows are inserted with ``replace=True``, so saving a pull request whose
    ``id`` already exists overwrites the existing row.
    """
    if "milestones" not in db.table_names():
        if "users" not in db.table_names():
            # So we can define the foreign key from milestones:
            db["users"].create({"id": int}, pk="id")
        db["milestones"].create(
            {"id": int, "title": str, "description": str, "creator": int, "repo": int},
            pk="id",
            foreign_keys=(("repo", "repos", "id"), ("creator", "users", "id")),
        )
    for original in pull_requests:
        # Ignore every field whose name ends in "url" (this also drops the
        # plain "url" key, which is re-added from _links below)
        pull_request = {
            key: value for key, value in original.items() if not key.endswith("url")
        }
        # Add repo key
        pull_request["repo"] = repo["id"]
        # Pull request _links can be flattened to just their URL
        pull_request["url"] = pull_request["_links"]["html"]["href"]
        pull_request.pop("_links")
        # Extract user
        pull_request["user"] = save_user(db, pull_request["user"])
        labels = pull_request.pop("labels")
        # Keep only the commit sha for head and base
        pull_request["head"] = pull_request["head"]["sha"]
        pull_request["base"] = pull_request["base"]["sha"]
        # Extract milestone
        if pull_request["milestone"]:
            pull_request["milestone"] = save_milestone(
                db, pull_request["milestone"], repo["id"]
            )
        # For the moment we ignore the assignees=[] array but we DO turn assignee
        # singular into a foreign key reference
        pull_request.pop("assignees", None)
        if original["assignee"]:
            pull_request["assignee"] = save_user(db, pull_request["assignee"])
        # Discarded like the other optional keys below; the default keeps a
        # record without this key from raising KeyError.
        pull_request.pop("active_lock_reason", None)
        # ignore requested_reviewers and requested_teams
        pull_request.pop("requested_reviewers", None)
        pull_request.pop("requested_teams", None)
        # Insert record
        table = db["pull_requests"].insert(
            pull_request,
            pk="id",
            foreign_keys=[
                ("user", "users", "id"),
                ("assignee", "users", "id"),
                ("milestone", "milestones", "id"),
                ("repo", "repos", "id"),
            ],
            alter=True,
            replace=True,
            columns={
                "user": int,
                "assignee": int,
                "milestone": int,
                "repo": int,
                "title": str,
                "body": str,
            },
        )
        # m2m for labels
        for label in labels:
            table.m2m("labels", label, pk="id")


def save_user(db, user):
# Remove all url fields except avatar_url and html_url
to_save = {
Expand Down Expand Up @@ -274,6 +339,16 @@ def fetch_issues(repo, token=None, issue=None):
for issues in paginate(url, headers):
yield from issues

def fetch_pull_requests(repo, token=None, pull_request=None):
    """Yield pull request dicts for ``repo`` from the GitHub API.

    ``repo`` is interpolated into the API path (e.g. ``simonw/datasette``).
    ``token`` is passed to ``make_headers`` to build the request headers.
    If ``pull_request`` is not None, only that pull request number is
    fetched; otherwise every pull request is yielded by following the
    paginated ``pulls?state=all`` listing.

    Raises ``requests.HTTPError`` if the single pull request lookup
    returns an error status.
    """
    headers = make_headers(token)
    if pull_request is not None:
        url = "https://api.github.com/repos/{}/pulls/{}".format(repo, pull_request)
        # Send the same headers as the paginated branch; without them the
        # token was ignored for single pull request lookups.
        response = requests.get(url, headers=headers)
        # Fail here rather than yield an error payload as a pull request.
        response.raise_for_status()
        yield response.json()
    else:
        url = "https://api.github.com/repos/{}/pulls?state=all&filter=all".format(repo)
        for pull_requests in paginate(url, headers):
            yield from pull_requests


def fetch_issue_comments(repo, token=None, issue=None):
assert "/" in repo
Expand Down
Loading

0 comments on commit b37f555

Please sign in to comment.