-
Notifications
You must be signed in to change notification settings - Fork 0
/
repos.py
115 lines (100 loc) · 4.27 KB
/
repos.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
from utils import check_github_api_credentials
import sys
import pandas as pd
import requests
import tiktoken
import base64
# Function to count the number of tokens in a string using tiktoken
def num_tokens_from_string(string: str, encoding_name: str) -> int:
    """Count the tokens tiktoken produces for ``string``.

    Args:
        string: Text to tokenise.
        encoding_name: Value handed to ``tiktoken.encoding_for_model``; despite
            the parameter name it is therefore looked up as a *model* name
            (the caller in this file passes "gpt-3.5-turbo").

    Returns:
        The length of the token sequence returned by the encoder.
    """
    tokenizer = tiktoken.encoding_for_model(encoding_name)
    token_ids = tokenizer.encode(string)
    return len(token_ids)
# Function to fetch the default branch of a GitHub repository
def get_default_branch(repo_full_name, github_token):
    """Return the name of a repository's default branch.

    Queries ``GET /repos/{repo_full_name}`` on the GitHub REST API.

    Args:
        repo_full_name: Repository identifier interpolated into the API path
            (the caller in this file passes a search result's ``full_name``).
        github_token: Token sent in the ``Authorization`` header.

    Returns:
        The ``default_branch`` field of the response, or ``"main"`` when the
        field is absent or the API does not answer with status 200.

    Raises:
        requests.RequestException: If the request cannot be completed,
            including when the timeout expires.
    """
    api_url = f"https://api.github.com/repos/{repo_full_name}"
    headers = {"Authorization": f"token {github_token}"}
    # Without an explicit timeout, requests waits indefinitely on a stalled
    # connection and the whole search would hang.
    response = requests.get(api_url, headers=headers, timeout=30)
    if response.status_code != 200:
        print(f"Problem beim Abrufen des Standardbranches von {repo_full_name}")
        return "main"
    return response.json().get("default_branch", "main")
# Function to get the content of a specific file in a GitHub repository
def get_file_content(repo_full_name, filename, github_token):
    """Fetch one file from a GitHub repository and decode it as UTF-8 text.

    Uses ``GET /repos/{repo_full_name}/contents/{filename}`` and base64-decodes
    the ``content`` field of the JSON response.

    Args:
        repo_full_name: Repository identifier interpolated into the API path.
        filename: Path of the file inside the repository.
        github_token: Token sent in the ``Authorization`` header.

    Returns:
        The decoded file text; ``None`` when the API does not answer with
        status 200 or the response carries no ``content`` field; a string of
        10000 ``'x'`` characters when the bytes are not valid UTF-8.

    Raises:
        requests.RequestException: If the request cannot be completed,
            including when the timeout expires.
    """
    api_url = f"https://api.github.com/repos/{repo_full_name}/contents/{filename}"
    headers = {"Authorization": f"token {github_token}"}
    # Bound the wait so a stalled connection cannot hang the whole run.
    response = requests.get(api_url, headers=headers, timeout=30)

    # Check the status before touching the body: error responses have no
    # "content" key, so indexing first would raise instead of returning None.
    if response.status_code != 200:
        return None

    payload = response.json()
    # A path that resolves to a directory is answered with a JSON list rather
    # than an object, and some entries carry no content at all.
    if not isinstance(payload, dict):
        return None
    encoded = payload.get("content")
    if encoded is None:
        return None

    try:
        return base64.b64decode(encoded).decode("utf-8")
    except UnicodeDecodeError:
        # Undecodable files are replaced by a large filler string, exactly as
        # before, so they still contribute to the caller's token total.
        return 'x' * 10000
# Function to check whether the Python files of a GitHub repository fit within a token budget
def count_python_tokens(repo_full_name, github_token, max_tokens):
    """Check whether a repository's Python files fit within a token budget.

    Lists the recursive git tree of the repository's default branch, downloads
    every ``.py`` file and sums their token counts (tokenised for
    "gpt-3.5-turbo").

    Args:
        repo_full_name: Repository identifier interpolated into the API path.
        github_token: Token sent in the ``Authorization`` header.
        max_tokens: Token budget; converted with ``int()``, so a numeric
            string is accepted.

    Returns:
        ``True`` when the summed token count is at most ``max_tokens``;
        ``False`` when it is larger or when the tree cannot be fetched.
        Despite the function's name the result is a boolean, not a count.

    Raises:
        requests.RequestException: If a request cannot be completed,
            including when the timeout expires.
    """
    default_branch = get_default_branch(repo_full_name, github_token)
    api_url = f"https://api.github.com/repos/{repo_full_name}/git/trees/{default_branch}?recursive=1"
    headers = {"Authorization": f"token {github_token}"}
    # Bound the wait so a stalled connection cannot hang the whole run.
    response = requests.get(api_url, headers=headers, timeout=30)
    if response.status_code != 200:
        print(f"Problem beim Abrufen des Inhalts von {repo_full_name}")
        return False

    token_budget = int(max_tokens)
    total_tokens = 0
    # NOTE(review): the trees API may flag very large listings as truncated;
    # such a listing is still evaluated on the entries it contains.
    for item in response.json()["tree"]:
        # Only blobs are files; a directory whose name ends in ".py" must not
        # be downloaded as if it were source code.
        if item.get("type", "blob") != "blob":
            continue
        if not item["path"].endswith(".py"):
            continue
        file_content = get_file_content(repo_full_name, item["path"], github_token)
        if not file_content:
            continue
        total_tokens += num_tokens_from_string(file_content, "gpt-3.5-turbo")
        # Token counts never decrease, so once the budget is exceeded the
        # answer is settled and the remaining downloads would be wasted
        # (rate-limited) API calls.
        if total_tokens > token_budget:
            return False
    return total_tokens <= token_budget
# Main function to search for GitHub repositories based on various criteria
def search_repositories(language, num_repos, year, max_tokens, query_terms, github_token):
    """Search GitHub for repositories and save those within a token budget.

    Pages through the repository search results (sorted by stars, ascending),
    keeps each repository for which ``count_python_tokens`` returns a truthy
    value, and writes the URLs of the kept repositories to
    ``repositories.csv`` in the current working directory.

    Args:
        language: Value for the ``language:`` search qualifier.
        num_repos: Number of repositories to collect; converted with ``int()``.
        year: Value for the ``created:`` search qualifier.
        max_tokens: Token budget forwarded to ``count_python_tokens``.
        query_terms: Free-text part of the search query.
        github_token: Token sent in the ``Authorization`` header.

    Returns:
        A ``pandas.DataFrame`` with a single "Repository URL" column (possibly
        with fewer than ``num_repos`` rows when the search runs out of
        results), or ``None`` when a search request is not answered with
        status 200.

    Raises:
        requests.RequestException: If a request cannot be completed,
            including when the timeout expires.
    """
    num_repos = int(num_repos)
    api_url = "https://api.github.com/search/repositories"
    per_page = 100
    # The search API serves only the first 1000 results of a query. Requesting
    # a page beyond that is answered with an error, which would make this
    # function return None and throw away everything collected so far.
    last_page = 1000 // per_page
    params = {
        "q": f"language:{language} created:{year} {query_terms}",
        "sort": "stars",
        "order": "asc",
        "per_page": per_page,
    }
    headers = {"Authorization": f"token {github_token}"}

    filtered_repos = []
    page_num = 1
    while len(filtered_repos) < num_repos and page_num <= last_page:
        params["page"] = page_num
        # Bound the wait so a stalled connection cannot hang the whole run.
        response = requests.get(api_url, headers=headers, params=params, timeout=30)
        if response.status_code != 200:
            print(f"Fehlercode: {response.status_code}")
            print("Es gab ein Problem beim Abrufen der Repositories.")
            return None

        repos = response.json()["items"]
        if not repos:
            # An empty page means the search results are exhausted.
            break

        for repo in repos:
            if len(filtered_repos) >= num_repos:
                break
            within_budget = count_python_tokens(repo["full_name"], github_token, max_tokens)
            if within_budget:
                filtered_repos.append(repo)
        page_num += 1

    df = pd.DataFrame([repo["html_url"] for repo in filtered_repos], columns=["Repository URL"])
    df.to_csv("repositories.csv", index=False)
    print("Die Repositories wurden erfolgreich in der Datei 'repositories.csv' gespeichert.")
    return df
# Entry point of the script
if __name__ == "__main__":
    # Stop before doing any work when the credential check does not pass.
    credentials_ok = check_github_api_credentials("https://api.github.com", "")
    if not credentials_ok:
        sys.exit()

    # Search settings; the individual terms are combined with OR in the query.
    search_terms = ['test', 'example']
    df = search_repositories(
        language="Python",
        num_repos='10',
        year='2022',
        max_tokens='5000',
        query_terms=" OR ".join(search_terms),
        github_token="",
    )
    print(df)