-
Notifications
You must be signed in to change notification settings - Fork 1
/
parse.py
139 lines (92 loc) · 3.29 KB
/
parse.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
import json
import os.path
import operator
from collections import defaultdict
# import mdp
import numpy as np
import pandas as p
# import matplotlib.pyplot as plt
# plt.style.use('ggplot')
import gng as g
from random import randrange
# Exclusive threshold used by parse_json: a business is kept only when its
# review_count is greater than this, and a user is kept only when they have
# reviewed more than this many of the kept businesses.
cutoff = 9
def parse_json(selection=('Pittsburgh',), data_dir='data', threshold=None):
    """Reduce the raw Yelp dumps to the selected cities' businesses and users.

    Reads two line-delimited JSON files from ``data_dir``
    (``yelp_academic_dataset_business.json`` and
    ``yelp_academic_dataset_review.json``) and writes three JSON files back
    into the same directory:

    * ``businesses.json``   -- ``{business_id: review_count}`` for businesses
      whose ``city`` is in ``selection`` and whose ``review_count`` is
      strictly greater than ``threshold``.
    * ``user_reviews.json`` -- ``{user_id: {business_id: stars}}`` for every
      review of a kept business.
    * ``users.json``        -- the subset of ``user_reviews`` whose users
      reviewed strictly more than ``threshold`` kept businesses.

    Args:
        selection: container of city names to keep (tested with ``in``).
        data_dir: directory holding the input files and receiving the
            output files.
        threshold: exclusive lower bound applied to both counts; ``None``
            means "use the module-level ``cutoff``".

    Returns:
        None. Results are written to disk and summary counts are printed.
    """
    if threshold is None:
        threshold = cutoff

    def data_path(name):
        return os.path.join(data_dir, name)

    # Filter while streaming so the full business dump is never held in memory.
    sel_businesses = dict()
    with open(data_path('yelp_academic_dataset_business.json')) as infile:
        for line in infile:
            bus = json.loads(line)
            if (bus['city'] in selection) and (bus['review_count'] > threshold):
                sel_businesses[bus['business_id']] = bus['review_count']
    print("Businesses: " + str(len(sel_businesses)))
    with open(data_path('businesses.json'), 'w') as outfile:
        json.dump(sel_businesses, outfile)

    # user_reviews[user_id][business_id] = stars; if a user reviewed the same
    # business more than once, the review read last wins.
    user_reviews = defaultdict(dict)
    with open(data_path('yelp_academic_dataset_review.json')) as infile:
        for line in infile:
            review = json.loads(line)
            if review['business_id'] in sel_businesses:
                user_reviews[review['user_id']][review['business_id']] = review['stars']
    print("Users: " + str(len(user_reviews)))
    with open(data_path('user_reviews.json'), 'w') as outfile:
        json.dump(user_reviews, outfile)

    # Keep only users with enough reviews among the selected businesses.
    users = {
        user: reviews
        for user, reviews in user_reviews.items()
        if len(reviews) > threshold
    }
    print("Users after cutoff: " + str(len(users)))
    with open(data_path('users.json'), 'w') as outfile:
        json.dump(users, outfile)
######################## END PARSE ########
def get_user_id(users_dict):
    """Prompt on stdin until a known user ID is entered.

    Args:
        users_dict: container of known user IDs (tested with ``in``).

    Returns:
        The entered string, as soon as it is found in ``users_dict``.

    Raises:
        SystemExit: when the user enters ``q``. The quit check runs before
            the membership test, so an ID that is literally ``q`` can never
            be selected.
    """
    # raw_input only exists on Python 2; on Python 3 the equivalent is input.
    try:
        read_line = raw_input
    except NameError:
        read_line = input

    while True:
        s = read_line("Input user ID or 'q' to exit: ")
        if s == 'q':
            # quit() is a helper injected by the site module for interactive
            # sessions; raising SystemExit directly works everywhere.
            raise SystemExit
        if s in users_dict:
            return s
        print("User ID was not found, please try again.")
def main():
    """Run the interactive recommender end to end.

    Steps:
      1. If ``data/users.json`` is missing, build the pre-parsed files by
         calling ``parse_json`` for Pittsburgh.
      2. Load ``data/businesses.json`` and ``data/users.json``.
      3. Build a user-by-business rating table and train a
         ``GrowingNeuralGas`` network on randomly drawn users.
      4. Ask for a user ID, then print the users returned by
         ``gng.computeDistances`` and up to ten restaurants taken from those
         users' ratings, highest rating first.
    """
    sel = ['Pittsburgh']
    print("Loading user and restaurant data...")
    if not os.path.isfile('data/users.json'):
        print("User data not pre-parsed -- will now parse\n")
        parse_json(sel)
        print("User data parsing finished!\n")

    # businesses[business_id] = review_count
    with open('data/businesses.json') as infile:
        businesses = json.load(infile)
    print("Restaurants: " + str(len(businesses)))

    # users[user_id][business_id] = rating
    with open('data/users.json') as infile:
        users = json.load(infile)
    print("Users: " + str(len(users)))

    # After the transpose: one row per user, one column per business.
    # Businesses a user has not rated are filled in with 0.
    df = p.DataFrame(data=users).T.fillna(0)

    def get_random_user():
        """Return (rating vector, user ID) for a uniformly chosen row of df."""
        u = df.iloc[randrange(len(df))]
        return (u.values, u.name)

    print("Building GNG network...")
    # NOTE(review): 1338 is presumably the input dimension, i.e. the number
    # of business columns in df -- confirm against gng.GrowingNeuralGas.
    gng = g.GrowingNeuralGas(get_random_user, 1338, verbose=0)
    for _ in range(100):  # 6000
        gng.step()
        if gng.stepCount % 100 == 0:
            print(gng)
    print("Recommendation system ready!")

    user_id = get_user_id(users)
    user_node = df.loc[user_id].values
    rec_users = [x.getUser() for x in gng.computeDistances(user_node)]

    # Merge the recommended users' ratings. When several of them rated the
    # same business, the rating of the user processed last overwrites the
    # earlier ones.
    res = dict()
    for user in rec_users:
        res.update(users[user])
        # Single-argument print() emits the same text on Python 2 and 3; the
        # doubled spaces reproduce what the old comma-separated print
        # statements produced.
        print("We recommend you follow the following users:  %s" % user)

    sorted_res = sorted(res.items(), key=operator.itemgetter(1), reverse=True)
    print("\nWe recommend you look at the following restaurants: ")
    for res_id, rating in sorted_res[:10]:
        print("%s  with a rating of  %s" % (res_id, rating))
# print type(node)
# print type(df.values[randrange(len(df))])
# -0itF0VWVBe3k2AdfUReGA
# Start the interactive recommender only when this file is executed as a
# script; importing the module does not call main().
if __name__ == '__main__':
    main()