-
Notifications
You must be signed in to change notification settings - Fork 41
/
command.py
107 lines (91 loc) · 2.98 KB
/
command.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
import time
from typing import Optional
import click
import pickledb
from dotenv import load_dotenv
from tqdm import tqdm
from utils import *
# Pull settings from the project's .env file into the process environment.
load_dotenv()

# Gorse client for the local server, given the value of GORSE_API_KEY.
gorse_client = Gorse(
    "http://127.0.0.1:8088",
    os.environ.get("GORSE_API_KEY"),
)

# GitHub client, given the value of GITHUB_ACCESS_TOKEN.
github_client = Github(os.environ.get("GITHUB_ACCESS_TOKEN"))
# Root click group: the subcommands in this file register themselves on it
# through the @command.command() decorator.
@click.group()
def command():
    # Nothing to do here; the group exists only to host its subcommands.
    return None
@command.command()
@click.argument("item_id")
def upsert_repo(item_id):
    """Upsert a repository into GitRec."""
    # Look the repository up through the GitHub client, hand the result to
    # Gorse, then echo what was inserted.
    repo_item = get_repo_info(github_client, item_id)
    gorse_client.insert_item(repo_item)
    print(repo_item)
def search_and_upsert(
    db: pickledb.PickleDB, topic: Optional[str] = None, language: Optional[str] = None
) -> None:
    """Search GitHub for popular repositories and insert them into Gorse.

    Builds a repository search query restricted to ``stars:>100`` and, when
    given, to ``topic`` and/or ``language``. Each result that is not yet
    recorded in the ``"repo"`` dictionary of the checkpoint ``db`` is turned
    into an item, inserted through ``gorse_client`` and then recorded in the
    checkpoint so that it is skipped next time.

    Args:
        db: Checkpoint database; handled repositories are stored in its
            ``"repo"`` dictionary, keyed by the repository's full name.
        topic: Optional topic appended to the query as ``topic:<topic>``.
        language: Optional language appended as ``language:<language>``.
    """
    query = "stars:>100"
    if topic is not None:
        query += f" topic:{topic}"
    if language is not None:
        query += f" language:{language}"
    print(f"Upsert {query}")

    # The checkpoint dictionary only has to be created once, so do it before
    # the loop instead of re-checking on every repository.
    if not db.exists("repo"):
        db.dcreate("repo")

    repos = github_client.search_repositories(query)
    for repo in tqdm(repos):
        # Skip repositories that an earlier run already handled.
        if db.dexists("repo", repo.full_name):
            continue

        # Labels are the repository topics plus its lower-cased language.
        # Compare the lower-cased form, since that is what gets appended;
        # comparing the raw name let e.g. "Python" slip past an existing
        # "python" topic and produced a duplicate label.
        labels = list(repo.get_topics())
        if repo.language is not None:
            language_label = repo.language.lower()
            if language_label not in labels:
                labels.append(language_label)

        # Cap the description at MAX_COMMENT_LENGTH characters (slicing a
        # shorter string leaves it unchanged).
        comment = repo.description
        if comment is not None:
            comment = comment[:MAX_COMMENT_LENGTH]

        item = {
            "ItemId": repo.full_name.replace("/", ":").lower(),
            "Timestamp": str(repo.updated_at),
            "Labels": labels,
            # Categories are derived from the labels by generate_categories,
            # a helper defined outside this block.
            "Categories": generate_categories(labels),
            "Comment": comment,
        }
        gorse_client.insert_item(item)

        # Record the repository only after the insert succeeded.
        db.dadd("repo", (repo.full_name, None))
@command.command()
def upsert_repos():
    """Upsert popular repositories (stars > 100) into GitRec."""
    # Checkpoint of already-processed topics and repositories.
    # NOTE(review): the second argument is presumably pickledb's auto-dump
    # flag -- confirm against the installed pickledb version.
    db = pickledb.load("checkpoint.db", True)

    # Gather every label attached to an item already stored in Gorse; each
    # label is used below as a GitHub topic to search for.
    topics = set()
    cursor = ""
    while True:
        items, cursor = gorse_client.get_items(1000, cursor)
        for item in items:
            labels = item["Labels"]
            if labels is not None:
                topics.update(labels)
        # Stop paging once the returned cursor is empty.
        if cursor == "":
            break

    # Search and upsert, skipping topics finished by an earlier run.
    if not db.exists("topic"):
        db.dcreate("topic")
    for topic in topics:
        if db.dexists("topic", topic):
            continue
        # Retry the same topic until it completes. Repositories inserted
        # before a failure are in the checkpoint, so a retry skips them.
        while True:
            try:
                search_and_upsert(db, topic=topic)
                db.dadd("topic", (topic, None))
                break
            except RateLimitExceededException as e:
                # Rate limited: wait 30 minutes before trying again.
                print(e)
                time.sleep(1800)
            except Exception as e:
                # Best-effort retry for any other failure after one minute.
                print(e)
                time.sleep(60)
# Run the click command group only when this file is executed as a script.
if __name__ == "__main__":
    command()