-
Notifications
You must be signed in to change notification settings - Fork 0
/
collector.py
115 lines (91 loc) · 3.69 KB
/
collector.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
import requests
from pymongo import MongoClient
import os
# GitHub token and MongoDB URI are read from environment variables so that
# credentials never have to live in the source tree.
GITHUB_TOKEN = os.getenv("GITHUB_TOKEN")
MONGODB_URI = os.getenv("MONGODB_URI")

# Default headers sent with every GitHub API request made by this module.
headers = {
    "Accept": "application/vnd.github.v3+json",
}
# Only attach credentials when a token is actually configured; otherwise the
# header value would be the literal string "token None".
if GITHUB_TOKEN:
    headers["Authorization"] = f"token {GITHUB_TOKEN}"
def fetch_issue_urls(
    repo_owner, repo_name, start_year=2016, end_year=2024, timeout=30
):
    """Collect the HTML URLs of matching issues via the GitHub search API.

    One search is issued per calendar year in ``start_year..end_year`` and each
    search is followed through its ``next`` pagination links.
    NOTE(review): the per-year split presumably keeps each query under the
    search API's result cap -- confirm.

    Args:
        repo_owner: Owner (user or organisation) of the repository.
        repo_name: Name of the repository.
        start_year: First year (inclusive) of the ``created:`` filter.
        end_year: Last year (inclusive) of the ``created:`` filter.
        timeout: Seconds to wait for each HTTP request.

    Returns:
        A list of ``html_url`` strings. If a 403 response is received the
        URLs collected so far are returned and no further requests are made.

    Raises:
        Exception: If the API answers with a status other than 200 or 403.
    """
    query = (
        "label:enhancement,proposal+-label:duplicate,documentation"
        f"+repo:{repo_owner}/{repo_name}+is:issue"
    )
    base_url = f"https://api.github.com/search/issues?per_page=100&q={query}"
    issue_urls = []
    for year in range(start_year, end_year + 1):
        url = f"{base_url}+created:{year}-01-01..{year}-12-31"
        while url:
            response = requests.get(url, headers=headers, timeout=timeout)
            if response.status_code == 403:
                print(f"Rate limit hit. Request URL: {url}")
                # Stop entirely: continuing with later years would only send
                # more requests and leave silent gaps in the result.
                return issue_urls
            if response.status_code != 200:
                raise Exception(
                    f"GitHub API request failed with status code {response.status_code}"
                )
            data = response.json()
            issue_urls.extend(issue["html_url"] for issue in data["items"])
            # Pagination handling: follow the "next" link until there is none.
            url = response.links.get("next", {}).get("url")
    return issue_urls
def fetch_all_pages(url, timeout=30):
    """Fetch *url* and every following page, concatenating the JSON results.

    Args:
        url: First page to request. A falsy value (``""`` or ``None``)
            results in no request and an empty list.
        timeout: Seconds to wait for each HTTP request, so a stalled
            connection cannot hang the collector.

    Returns:
        A list with the items of every page that was fetched. If a 403
        response is received, the items gathered so far are returned.

    Raises:
        Exception: If the API answers with a status other than 200 or 403.
    """
    all_data = []
    while url:
        response = requests.get(url, headers=headers, timeout=timeout)
        if response.status_code == 403:
            print(f"Rate limit hit. Request URL: {url}")
            return all_data
        if response.status_code != 200:
            raise Exception(
                f"GitHub API request failed with status code {response.status_code}"
            )
        all_data.extend(response.json())
        # Follow the "next" pagination link; None ends the loop.
        url = response.links.get("next", {}).get("url")
    return all_data
def fetch_issue_data_and_comments(issue_url, timeout=30):
    """Fetch one issue from the GitHub API together with all of its comments.

    Args:
        issue_url: API URL of the issue to request.
        timeout: Seconds to wait for the issue request, so a stalled
            connection cannot hang the collector.

    Returns:
        The issue JSON as a dict with an extra ``"comments"`` key holding the
        list returned by ``fetch_all_pages`` for the issue's ``comments_url``,
        or ``None`` if the issue request itself received a 403 response.

    Raises:
        Exception: If the issue request answers with a status other than
            200 or 403.
    """
    issue_response = requests.get(issue_url, headers=headers, timeout=timeout)
    if issue_response.status_code == 403:
        print(f"Rate limit hit. Request URL: {issue_url}")
        return None
    if issue_response.status_code != 200:
        raise Exception(
            f"GitHub API request failed with status code {issue_response.status_code}"
        )
    document = issue_response.json()
    # An absent comments_url becomes "", for which fetch_all_pages makes no
    # request and returns an empty list.
    document["comments"] = fetch_all_pages(document.get("comments_url", ""))
    return document
def insert_into_mongodb(document, db_name="terraform", collection_name="issues"):
    """Insert *document* into the given MongoDB collection.

    A new client is opened from ``MONGODB_URI`` for the call and is always
    closed afterwards, even if the insert fails, so repeated calls do not
    accumulate open connections.

    Args:
        document: The dict to store.
            NOTE(review): pymongo's ``insert_one`` is expected to add an
            ``_id`` key to this dict in place -- confirm if callers reuse it.
        db_name: Name of the target database.
        collection_name: Name of the target collection.
    """
    client = MongoClient(MONGODB_URI)
    try:
        client[db_name][collection_name].insert_one(document)
    finally:
        client.close()
def get_urls_from_mongodb(db_name="terraform", collection_name="issue_urls"):
    """Return the cached list of issue URLs stored in MongoDB, if any.

    Args:
        db_name: Name of the database to read from.
        collection_name: Name of the collection holding the URL document.

    Returns:
        The ``"urls"`` value of the first document returned by ``find_one``,
        or an empty list when there is no document or it has no ``"urls"`` key.
    """
    client = MongoClient(MONGODB_URI)
    try:
        document = client[db_name][collection_name].find_one()
    finally:
        # Close the client even if the query raises.
        client.close()
    if document and "urls" in document:
        return document["urls"]
    return []
def main(repo_owner, repo_name):
    """Collect every matching issue (with comments) and store it in MongoDB.

    The list of issue URLs is read from MongoDB when a cached copy exists;
    otherwise it is fetched from the GitHub search API and cached. Each issue
    is then downloaded through the REST API and inserted as one document.

    Args:
        repo_owner: Owner (user or organisation) of the repository.
        repo_name: Name of the repository.
    """
    issue_urls = get_urls_from_mongodb()
    if not issue_urls:
        issue_urls = fetch_issue_urls(repo_owner, repo_name)
        # Never cache an empty result: an empty "urls" document could keep
        # being returned by later lookups and block the cache for good.
        if issue_urls:
            insert_into_mongodb({"urls": issue_urls}, collection_name="issue_urls")
    for issue_url in issue_urls:
        # Turn the browser URL into the matching REST API URL.
        api_url = issue_url.replace(
            "https://github.com", "https://api.github.com/repos"
        )
        issue_data = fetch_issue_data_and_comments(api_url)
        if issue_data:
            insert_into_mongodb(issue_data)
            print(f"Issue {issue_data['id']} inserted into MongoDB.")
# Script entry point: run the collector for the hashicorp/terraform repository.
if __name__ == "__main__":
    repo_owner, repo_name = "hashicorp", "terraform"
    main(repo_owner, repo_name)