-
Notifications
You must be signed in to change notification settings - Fork 0
/
data_util.py
142 lines (102 loc) · 4.59 KB
/
data_util.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
import pandas as pd
import scipy.sparse as sp
import os
def create_URM_matrix(ratings_df):
    """Build a sparse CSR matrix from an interactions dataframe.

    Rows are indexed by the ``user_id`` column, columns by the ``item_id``
    column, and the stored values come from the ``data`` column. The matrix
    shape is not given explicitly; it is left to scipy to infer from the
    index arrays.

    :param ratings_df: dataframe with ``user_id``, ``item_id`` and ``data``
        columns.
    :return: the resulting ``scipy.sparse.csr_matrix``.
    """
    values = ratings_df["data"].values
    row_idx = ratings_df["user_id"].values
    col_idx = ratings_df["item_id"].values
    return sp.csr_matrix((values, (row_idx, col_idx)))
def create_ICM_matrix(dataframe):
    """Build a sparse CSR matrix from an item-feature dataframe.

    Rows are indexed by the ``item_id`` column, columns by the
    ``feature_id`` column, and the stored values come from the ``data``
    column. No explicit shape is passed to scipy.

    :param dataframe: dataframe with ``item_id``, ``feature_id`` and
        ``data`` columns.
    :return: the resulting ``scipy.sparse.csr_matrix``.
    """
    values = dataframe["data"].values
    item_rows = dataframe["item_id"].values
    feature_cols = dataframe["feature_id"].values
    icm = sp.csr_matrix((values, (item_rows, feature_cols)))
    return icm
def combine_matrices(URM: sp.csr_matrix, ICM: sp.csr_matrix):
    """Stack the transposed ICM underneath the URM.

    :param URM: sparse matrix placed on top of the stack.
    :param ICM: sparse matrix whose transpose is appended below ``URM``;
        its row count must therefore equal the column count of ``URM``.
    :return: tuple ``(stacked_URM, stacked_ICM)`` of CSR matrices, where
        ``stacked_ICM`` is the transpose of ``stacked_URM``.
    """
    vertical_stack = sp.vstack([URM, ICM.T])
    stacked_URM = sp.csr_matrix(vertical_stack)
    # The second output is simply the transpose, re-encoded as CSR.
    return stacked_URM, sp.csr_matrix(stacked_URM.T)
def load_URM():
    """Load the interactions CSV and turn it into a URM.

    Every row whose ``data`` value is 0 is rewritten to 1 (the original
    author's note: "make watched = 1"), then rows repeating an already seen
    ``(user_id, item_id)`` pair are dropped before the matrix is built.

    :return: the sparse matrix produced by ``create_URM_matrix``.
    """
    ratings = load_data_interactions()

    # Make watched = 1
    zero_rows = ratings["data"] == 0
    ratings.loc[zero_rows, "data"] = 1

    # Keep only the first occurrence of each (user, item) pair.
    ratings = ratings.drop_duplicates(subset=["user_id", "item_id"])

    return create_URM_matrix(ratings)
def load_data():
    """Load interactions and item features and build the URM and the ICMs.

    Steps:
      1. Read the interactions, length and type dataframes through the
         ``load_data_*`` helpers.
      2. Rewrite ``data == 0`` to 1 and keep one row per
         ``(user_id, item_id)`` pair, mirroring ``load_URM``.
      3. Drop feature rows of items that never appear in the interactions.
      4. Re-index the feature ids of both feature tables onto one shared,
         dense 0..N-1 range.
      5. Build the sparse matrices.

    The loader helpers return ``None`` when their CSV is missing; this
    function does not guard against that and will fail on the first access.

    :return: tuple ``(URM_all, ICM_type, ICM_length, ICM_all)`` where
        ``ICM_all`` is ``sp.hstack([ICM_type, ICM_length])``.
    """
    interactions_df = load_data_interactions()
    length_df = load_data_length()
    type_df = load_data_type()

    interactions_df.loc[interactions_df["data"] == 0, "data"] = 1
    # The previous code called drop_duplicates() and discarded the result,
    # so duplicated (user, item) rows were summed when building the matrix.
    interactions_df = interactions_df.drop_duplicates(subset=["user_id", "item_id"])

    # Remove cold items. .copy() makes the filtered frames independent so the
    # feature_id assignments below do not write into a slice of the originals.
    seen_items = interactions_df["item_id"]
    length_df = length_df[length_df["item_id"].isin(seen_items)].copy()
    type_df = type_df[type_df["item_id"].isin(seen_items)].copy()

    # FEATURES: map every original feature id to a shared dense index.
    all_features_indices = pd.concat(
        [length_df["feature_id"], type_df["feature_id"]], ignore_index=True
    )
    mapped_id, original_id = pd.factorize(all_features_indices.unique())
    print("Unique features: {}".format(len(original_id)))
    features_original_ID_to_index = pd.Series(mapped_id, index=original_id)

    length_df["feature_id"] = length_df["feature_id"].map(features_original_ID_to_index)
    type_df["feature_id"] = type_df["feature_id"].map(features_original_ID_to_index)

    URM_all = create_URM_matrix(interactions_df)
    ICM_length = create_ICM_matrix(length_df)
    ICM_type = create_ICM_matrix(type_df)

    # NOTE(review): both ICMs get their shape inferred from their own largest
    # item_id, so hstack presumably requires both tables to reach the same
    # maximum item id - confirm against the real data.
    ICM_all = sp.hstack([ICM_type, ICM_length])

    return URM_all, ICM_type, ICM_length, ICM_all
def load_data_interactions():
    """Read ``../data/interactions_and_impressions.csv`` into a dataframe.

    The path is relative to the current working directory. The file's own
    header row is replaced by the column names ``user_id``, ``item_id``,
    ``impressions`` and ``data``. A status message is printed either way.

    :return: the dataframe, or ``None`` when the file does not exist.
    """
    csv_path = "../data/interactions_and_impressions.csv"

    if not os.path.exists(csv_path):
        print("interactions_and_impressions not found.")
        return None

    print('interactions_and_impressions found!')
    column_types = {"user_id": int, "item_id": int, "impressions": str, "data": int}
    return pd.read_csv(
        csv_path,
        sep=",",
        names=["user_id", "item_id", "impressions", "data"],
        header=0,
        dtype=column_types,
    )
def load_data_length():
    """Read ``../data/data_ICM_length.csv`` into a dataframe.

    The path is relative to the current working directory. The file's own
    header row is replaced by the column names ``item_id``, ``feature_id``
    and ``data``, all parsed as integers. A status message is printed
    either way.

    :return: the dataframe, or ``None`` when the file does not exist.
    """
    csv_path = "../data/data_ICM_length.csv"

    if not os.path.exists(csv_path):
        print("data_ICM_length not found.")
        return None

    print('data_ICM_length found!')
    column_types = {"item_id": int, "feature_id": int, "data": int}
    return pd.read_csv(
        csv_path,
        sep=",",
        names=["item_id", "feature_id", "data"],
        header=0,
        dtype=column_types,
    )
def load_data_type():
    """Read ``../data/data_ICM_type.csv`` into a dataframe.

    The path is relative to the current working directory. The file's own
    header row is replaced by the column names ``item_id``, ``feature_id``
    and ``data``, all parsed as integers. A status message is printed
    either way.

    :return: the dataframe, or ``None`` when the file does not exist.
    """
    # Use one path for both the existence check and the read. The previous
    # code checked this relative path but then read a hard-coded absolute
    # path from the original author's machine.
    csv_path = "../data/data_ICM_type.csv"

    if not os.path.exists(csv_path):
        print("data_ICM_type not found.")
        return None

    print('data_ICM_type found!')
    return pd.read_csv(
        csv_path,
        sep=",",
        names=["item_id", "feature_id", "data"],
        header=0,
        dtype={"item_id": int, "feature_id": int, "data": int},
    )
def load_users_for_submission():
    """Read ``../data/data_target_users_test.csv`` into a dataframe.

    The path is relative to the current working directory. The file's own
    header row is replaced by the single column name ``user_id``, parsed as
    integer. A status message is printed either way.

    :return: the dataframe, or ``None`` when the file does not exist.
    """
    # Use one path for both the existence check and the read. The previous
    # code checked this relative path but then read a hard-coded absolute
    # path from the original author's machine.
    csv_path = "../data/data_target_users_test.csv"

    if not os.path.exists(csv_path):
        print("data_target_users_test not found.")
        return None

    print('data_target_users_test found!')
    return pd.read_csv(
        csv_path,
        names=['user_id'],
        header=0,
        dtype={"user_id": int},
    )
def create_submission(recommender):
    """Ask the recommender for 10 items for every target user.

    Target users are taken from the ``user_id`` column of the dataframe
    returned by ``load_users_for_submission``; the recommender is queried
    once per user.

    :param recommender: object exposing
        ``recommend(user_id_array=..., cutoff=...)``.
    :return: list of ``(user_id, recommendations)`` tuples, in the order of
        the target users.
    """
    target_users = load_users_for_submission()
    return [
        (uid, recommender.recommend(user_id_array=uid, cutoff=10))
        for uid in target_users["user_id"].values
    ]
def write_submission(submission, file_name):
    """Write a submission to ``../submissions/<file_name>.csv``.

    The file starts with the header ``user_id,item_list``; each following
    line holds a user id, a comma, and that user's items separated by
    single spaces. The target directory is relative to the current working
    directory and is not created here.

    :param submission: iterable of ``(user_id, items)`` pairs.
    :param file_name: output file name without the ``.csv`` extension.
    """
    target = "../submissions/" + file_name + ".csv"
    with open(target, "w") as out:
        out.write("user_id,item_list\n")
        out.writelines(
            f"{uid},{' '.join(map(str, recs))}\n" for uid, recs in submission
        )