aggregate.py
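"""Aggregate cached Dataverse metrics API responses into TSV and JSON files.

Reads config.json from the current directory. A minimal sketch of the
expected shape (the keys are taken from main() below; the values are
illustrative, not real configuration):

    {
        "api_response_cache_dir": "cache",
        "aggregate_output_dir": "aggregates",
        "endpoints": {
            "monthly": ["datasets/toMonth"],
            "single": ["dataverses/byCategory", "../version"],
            "monthly_itemized": ["datasets/bySubject/toMonth"]
        }
    }
"""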
import csv
import json
import os
import re


def main():
    with open('config.json') as config_file:
        config = json.load(config_file)
    api_response_cache_dir = config['api_response_cache_dir']
    aggregate_output_dir = config['aggregate_output_dir']
    monthly_endpoints = config['endpoints']['monthly']
    single_endpoints = config['endpoints']['single']
    monthly_itemized_endpoints = config['endpoints']['monthly_itemized']
    process_monthly_endpoints(monthly_endpoints, api_response_cache_dir, aggregate_output_dir)
    process_single_endpoints(single_endpoints, api_response_cache_dir, aggregate_output_dir)
    process_monthly_itemized_endpoints(monthly_itemized_endpoints, api_response_cache_dir, aggregate_output_dir)
    process_github_contributors(api_response_cache_dir, aggregate_output_dir)
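

# Expected cache layout for a monthly endpoint (inferred from the path
# handling below; the installation filename is illustrative):
#
#   <api_response_cache_dir>/<endpoint>/<YYYY-MM>/<installation>.json
#
# where each cached response is assumed to look like:
#
#   {"data": {"count": 42}}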
def process_monthly_endpoints(monthly_endpoints, api_response_cache_dir, aggregate_output_dir):
    for endpoint in monthly_endpoints:
        datedir_parent = api_response_cache_dir + '/' + endpoint
        totals = {}
        for item in os.listdir(datedir_parent):
            # Only descend into YYYY-MM month directories.
            if re.match(r"^[0-9]{4}-[0-9]{2}$", item) is not None:
                month = item
                datedir = datedir_parent + '/' + month
                for json_file in os.listdir(datedir):
                    path_and_json_file = datedir + '/' + json_file
                    with open(path_and_json_file) as f:
                        json_data = json.load(f)
                    # Sum the month's count across all installations.
                    count = json_data['data']['count']
                    last_count = totals.get(month, 0)
                    totals[month] = count + last_count
        metric_filename = endpoint.replace('/', '-') + '.tsv'
        path_and_metric_file = aggregate_output_dir + '/' + metric_filename
        with open(path_and_metric_file, 'w', newline='') as tsvfile:
            writer = csv.writer(tsvfile, delimiter='\t')
            writer.writerow(['month', 'count'])
            for month in sorted(totals):
                writer.writerow([month, totals[month]])
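

# process_single_endpoints handles two response shapes (the example values
# below are illustrative, not real responses):
#
#   list data, itemized by category or subject:
#     {"data": [{"subject": "Physics", "count": 10}, ...]}
#
#   scalar data, from the version endpoint:
#     {"data": {"version": "v4.20-build-123"}}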
def process_single_endpoints(single_endpoints, api_response_cache_dir, aggregate_output_dir):
    for endpoint in single_endpoints:
        # The version endpoint lives one level above the metrics endpoints,
        # but its cache directory is simply 'version'.
        if endpoint == '../version':
            endpoint = 'version'
        jsondir = api_response_cache_dir + '/' + endpoint
        totals = {}
        for item in os.listdir(jsondir):
            if re.match(r"^.*\.json$", item) is not None:
                json_file = item
                path_and_json_file = jsondir + '/' + json_file
                with open(path_and_json_file) as f:
                    json_data = json.load(f)
                if isinstance(json_data['data'], list):
                    # Itemized responses: sum counts per category or subject.
                    if endpoint == 'dataverses/byCategory':
                        metric_type = 'category'
                    else:
                        metric_type = 'subject'
                    for name_and_count in json_data['data']:
                        name = name_and_count[metric_type]
                        count = name_and_count['count']
                        last_count = totals.get(name, 0)
                        totals[name] = count + last_count
                elif endpoint == 'version':
                    # Normalize strings such as 'v4.20-build-123' to '4.20'
                    # and count installations per version.
                    name = json_data['data'][endpoint]
                    if name[0] == 'v':
                        name = name[1:]
                    name = name.split('-')[0]
                    totals[name] = totals.get(name, 0) + 1
        metric_filename = endpoint.replace('/', '-') + '.tsv'
        path_and_metric_file = aggregate_output_dir + '/' + metric_filename
        with open(path_and_metric_file, 'w', newline='') as tsvfile:
            writer = csv.writer(tsvfile, delimiter='\t')
            writer.writerow(['name', 'count'])
            for name in sorted(totals):
                writer.writerow([name, totals[name]])
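

# Itemized monthly endpoints produce one TSV row per (month, name) pair,
# e.g. (values illustrative):
#
#   month    name     count
#   2023-01  Physics  10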
def process_monthly_itemized_endpoints(monthly_itemized_endpoints, api_response_cache_dir, aggregate_output_dir):
    for endpoint in monthly_itemized_endpoints:
        datedir_parent = api_response_cache_dir + '/' + endpoint
        if endpoint == 'datasets/bySubject/toMonth':
            metric_type = 'subject'
        else:
            raise ValueError("Unexpected endpoint type: " + endpoint)
        metric_filename = endpoint.replace('/', '-') + '.tsv'
        path_and_metric_file = aggregate_output_dir + '/' + metric_filename
        with open(path_and_metric_file, 'w', newline='') as tsvfile:
            writer = csv.writer(tsvfile, delimiter='\t')
            writer.writerow(['month', 'name', 'count'])
            # Collect YYYY-MM month directories and process them in order.
            month_dirs = []
            for item in os.listdir(datedir_parent):
                if re.match(r"^[0-9]{4}-[0-9]{2}$", item) is not None:
                    month_dirs.append(item)
            for month in sorted(month_dirs):
                datedir = datedir_parent + '/' + month
                totals = {}
                for json_file in os.listdir(datedir):
                    path_and_json_file = datedir + '/' + json_file
                    with open(path_and_json_file) as f:
                        json_data = json.load(f)
                    for name_and_count in json_data['data']:
                        name = name_and_count[metric_type]
                        count = name_and_count['count']
                        last_count = totals.get(name, 0)
                        totals[name] = count + last_count
                for name in sorted(totals):
                    writer.writerow([month, name, totals[name]])
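

# Contributor data is read from a per-repo cache (layout inferred from the
# path construction below):
#
#   <api_response_cache_dir>/contributors/github.com/<owner>/<repo>/contributors.json
#
# Each file is assumed to hold a GitHub contributors-style response: a list
# of objects whose 'author' object carries id, login, html_url, and avatar_url.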
def process_github_contributors(api_response_cache_dir, aggregate_output_dir):
    github_dir = api_response_cache_dir + '/contributors/github.com'
    contributors_by_repo = []
    for owner in os.listdir(github_dir):
        owner_dir = github_dir + '/' + owner
        for repo in os.listdir(owner_dir):
            repo_url = 'https://github.com/' + owner + '/' + repo
            repo_dir = owner_dir + '/' + repo
            path_and_json_file = repo_dir + '/contributors.json'
            try:
                with open(path_and_json_file) as f:
                    json_data = json.load(f)
            except FileNotFoundError:
                # Skip repos that have no cached contributors.json.
                continue
            contributors = []
            for contributor in json_data:
                author = contributor['author']
                contributors.append({
                    'username': author['login'],
                    'url': author['html_url'],
                    'avatar': author['avatar_url'],
                    'id': author['id'],
                })
            contributors_by_repo.append({'url': repo_url, 'contributors': contributors})
    # Write the aggregated contributor list alongside the other outputs.
    path_and_contributors_file = aggregate_output_dir + '/contributors.json'
    with open(path_and_contributors_file, 'w') as f:
        json.dump(contributors_by_repo, f, indent=4, ensure_ascii=True)
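

# Run from a directory containing config.json:
#
#   python aggregate.py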
if __name__ == '__main__':
    main()