-
Notifications
You must be signed in to change notification settings - Fork 6
/
preprocessing.py
126 lines (100 loc) · 4.26 KB
/
preprocessing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
import pandas as pd
import pickle
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from datetime import datetime, timedelta
import ast
# Unix epoch (1970-01-01) as a pandas Timestamp.
base = pd.Timestamp("1970-01-01")

# NOTE(review): not referenced in the visible part of this file; presumably
# the number of rows per chunk for chunked processing -- confirm.
CHUNK_SIZE = 1_000_000

# load_data() keeps only users whose review_count is strictly greater than this.
REVIEW_DROP = 0

# Locations of the input CSV files, relative to the working directory.
USERS_PATH = 'dataset/processed_users.csv'
RESTAURANTS_PATH = 'dataset/processed_rest.csv'
REVIEWS_PATH = 'dataset/reviews.csv'
# https://www.kaggle.com/zolboo/recommender-systems-knn-svd-nn-keras
# Function that extracts (pops) a key from the nested dictionary
def extract_keys(attr, key):
    """Pop ``key`` out of the attribute dictionary ``attr``.

    Args:
        attr: Dictionary of attributes, or ``None`` when there are none.
        key: Key to remove from ``attr``.

    Returns:
        The string ``"{}"`` when ``attr`` is ``None``; the value stored under
        ``key`` when it is present (the entry is removed from ``attr`` as a
        side effect); ``None`` when ``key`` is absent.
    """
    # Identity check: ``== None`` dispatches to ``__eq__`` and is not a
    # reliable test for the ``None`` singleton.
    if attr is None:
        return "{}"
    if key in attr:
        return attr.pop(key)
    # Key not present: make the "nothing found" result explicit.
    return None
# convert string to dictionary
def str_to_dict(attr):
    """Evaluate the Python-literal string ``attr``.

    Args:
        attr: String holding a Python literal (typically a dict), or ``None``.

    Returns:
        An empty dict when ``attr`` is ``None``; otherwise whatever
        ``ast.literal_eval(attr)`` produces.

    Raises:
        ValueError, SyntaxError: Propagated from ``ast.literal_eval`` when
            ``attr`` is not a valid literal.
    """
    if attr is None:
        # No need to parse the literal "{}" just to get an empty dict.
        return {}
    return ast.literal_eval(attr)
def sub_timestamp(element):
    """Return the length in seconds of an ``"H:M-H:M"`` time range.

    Args:
        element: Indexable whose first item is a string of two ``%H:%M``
            times separated by a single ``-``.

    Returns:
        ``end - start`` in seconds, as a float. The value is negative when
        the end time is earlier than the start time.
        NOTE(review): overnight ranges (e.g. "22:00-2:00") therefore come
        out negative -- confirm whether that is intended.
    """
    start_text, end_text = element[0].split('-')
    opening, closing = (
        datetime.strptime(text, "%H:%M") for text in (start_text, end_text)
    )
    return (closing - opening).total_seconds()
def get_device():
    """Return the first CUDA device when CUDA is available, else the CPU."""
    name = 'cuda:0' if torch.cuda.is_available() else 'cpu'
    return torch.device(name)
def df_to_tensor(df):
    """Convert ``df`` to an int64 tensor on the device chosen by ``get_device``.

    The frame's underlying NumPy array (``df.values``) is wrapped as a
    tensor, cast with ``.long()`` and then moved to the target device.
    """
    target = get_device()
    as_long = torch.from_numpy(df.values).long()
    return as_long.to(target)
def df_to_tensor_cpu(df):
    """Convert ``df`` to an int64 tensor without moving it to another device.

    The frame's underlying NumPy array (``df.values``) is wrapped as a
    tensor and cast with ``.long()``.
    """
    array = df.values
    tensor = torch.from_numpy(array)
    return tensor.long()
def process_data_chunk(reviews, users, restaurants):
    """Join a chunk of reviews with users and restaurants; return a tensor.

    Args:
        reviews: DataFrame with ``user_id`` and ``business_id`` columns.
        users: DataFrame with a ``user_id`` column.
        restaurants: DataFrame with a ``business_id`` column.

    Returns:
        The result of ``df_to_tensor`` on the joined frame, after the two
        join-key columns and the frame's first column have been dropped.
    """
    merged = reviews
    # Inner-join each lookup table in turn and discard its join key afterwards.
    for lookup, key in ((users, 'user_id'), (restaurants, 'business_id')):
        merged = pd.merge(merged, lookup, how='inner', on=key).drop(columns=key)

    print("REVIEWS.HEAD() -------------------------------------------------------------------")
    print(merged.head())

    # Drop whichever column ended up first after the joins.
    leading_column = merged.columns[0]
    trimmed = merged.drop(columns=leading_column)

    print("REVIEWS.DROP() -------------------------------------------------------------------")
    print(trimmed.head())
    return df_to_tensor(trimmed)
# Load data files
# reviews = get_reviews()
def load_data(train_percent, val_percent, test_percent, random_state=None):
    """Load the CSV datasets, join them and split the reviews three ways.

    Users are read from ``USERS_PATH`` and filtered to those whose
    ``review_count`` exceeds ``REVIEW_DROP``; restaurants are read from
    ``RESTAURANTS_PATH``. Both get integer ids taken from the category codes
    of their string ids. Reviews from ``REVIEWS_PATH`` are inner-joined with
    both tables, the string id columns and the first remaining column are
    dropped, and the rows are randomly split.

    Args:
        train_percent: Fraction of all joined reviews sampled for training.
        val_percent: Relative weight of the validation split.
        test_percent: Relative weight of the test split. The rows left after
            training are divided ``val_percent : test_percent``.
        random_state: Optional seed forwarded to both ``DataFrame.sample``
            calls so the split can be reproduced. ``None`` (the default)
            keeps the split unseeded.

    Returns:
        Tuple ``(training, validation, test, user_id_to_num,
        rest_id_to_num)``: three tensors built by ``df_to_tensor_cpu`` and
        two dicts mapping the original user / business ids to their integer
        codes.
    """
    print("Reading users")
    users = pd.read_csv(USERS_PATH)
    # .copy(): the columns assigned below must land on an independent frame,
    # not on a filtered slice (which triggers SettingWithCopyWarning).
    users = users[users['review_count'] > REVIEW_DROP].copy()
    users['user_id'] = users['user_id'].astype('category')
    users['user_id_num'] = users['user_id'].cat.codes
    users = users[['user_id', 'user_id_num', 'review_count']]
    user_id_to_num = dict(zip(users['user_id'], users['user_id_num']))

    print("Reading businesses")
    restaurants = pd.read_csv(RESTAURANTS_PATH)
    restaurants['business_id'] = restaurants['business_id'].astype('category')
    restaurants['business_id_num'] = restaurants['business_id'].cat.codes
    restaurants = restaurants[['business_id', 'business_id_num']]
    rest_id_to_num = dict(zip(restaurants['business_id'], restaurants['business_id_num']))

    print("Reading reviews")
    reviews = pd.read_csv(REVIEWS_PATH)
    reviews = pd.merge(reviews, users, how='inner', on='user_id')
    reviews = reviews.drop(columns='user_id')
    reviews = pd.merge(reviews, restaurants, how='inner', on='business_id')
    reviews = reviews.drop(columns='business_id')
    print("REVIEWS.HEAD() -------------------------------------------------------------------")
    print(reviews.head())
    # Drop whichever column is first after the joins. ``columns=`` already
    # selects the axis, so the former ``axis=1`` argument was redundant.
    reviews = reviews.drop(columns=reviews.columns[0])
    print("REVIEWS.DROP() -------------------------------------------------------------------")
    print(reviews.head())

    # Optional persistence of the id maps and the joined data:
    # pickle.dump(user_id_to_num, open('../dataset/user_id_to_num.pkl', 'wb'))
    # pickle.dump(rest_id_to_num, open('../dataset/rest_id_to_num.pkl', 'wb'))
    # np.save('../dataset/data.npy', reviews.values)

    training = reviews.sample(frac=train_percent, random_state=random_state)
    left = reviews.drop(training.index)

    # Share of the remaining rows that goes to validation. Guard the
    # denominator so that val_percent == test_percent == 0 yields an empty
    # validation split instead of a ZeroDivisionError.
    holdout_total = val_percent + test_percent
    val_frac = val_percent / holdout_total if holdout_total else 0.0
    validation = left.sample(frac=val_frac, random_state=random_state)
    test = left.drop(validation.index)

    print("loaded")
    return (
        df_to_tensor_cpu(training),
        df_to_tensor_cpu(validation),
        df_to_tensor_cpu(test),
        user_id_to_num,
        rest_id_to_num,
    )
if __name__ == "__main__":
    # Smoke run: 60% train, 30% validation, 10% test; print each split's shape.
    train, val, test, user, rest = load_data(0.6, 0.3, 0.1)
    for banner, split in (
        ("TRAIN ----------------------------------------------", train),
        ("VAL ----------------------------------------------", val),
        ("TEST ----------------------------------------------", test),
    ):
        print(banner)
        print(split.shape)