import time
import csv
import pickle
import operator
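# Preprocess a session-based click log (columns: SessionId, ItemId, Time) into
# train/test pickles for session-based recommendation (NARM-style): sessions are
# split by date, items are re-indexed from 1, and every session prefix becomes
# one (input sequence, next-item label) example.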
# Load .csv dataset
with open("", "r", newline='') as f:  # path to the raw click log (left blank here)
    reader = csv.DictReader(f, delimiter=',')
    sess_clicks = {}  # session id -> ordered list of clicked item ids
    sess_date = {}    # session id -> timestamp of the session's date
    ctr = 0
    curid = -1
    curdate = None
    for data in reader:
        sessid = data['SessionId']
        if curdate and curid != sessid:
            # Session id changed: record the date of the session we just finished
            date = time.mktime(time.strptime(curdate, '%Y-%m-%d'))
            sess_date[curid] = date
        curid = sessid
        item = data['ItemId']
        curdate = data['Time']
        if sessid in sess_clicks:
            sess_clicks[sessid] += [item]
        else:
            sess_clicks[sessid] = [item]
        ctr += 1
        if ctr % 100000 == 0:
            print('Loaded', ctr)
    # Record the date of the last session in the file
    date = time.mktime(time.strptime(curdate, '%Y-%m-%d'))
    sess_date[curid] = date
# Filter out length-1 sessions
for s in list(sess_clicks.keys()):
    if len(sess_clicks[s]) == 1:
        del sess_clicks[s]
        del sess_date[s]
# Count number of times each item appears
iid_counts = {}
for s in sess_clicks:
    seq = sess_clicks[s]
    for iid in seq:
        if iid in iid_counts:
            iid_counts[iid] += 1
        else:
            iid_counts[iid] = 1
sorted_counts = sorted(iid_counts.items(), key=operator.itemgetter(1))
# Drop items clicked fewer than 5 times, then drop sessions left with fewer than 2 clicks
for s in list(sess_clicks.keys()):
    curseq = sess_clicks[s]
    filseq = [i for i in curseq if iid_counts[i] >= 5]
    if len(filseq) < 2:
        del sess_clicks[s]
        del sess_date[s]
    else:
        sess_clicks[s] = filseq
# Split out test set based on dates
dates = list(sess_date.items())
maxdate = dates[0][1]
for _, date in dates:
    if maxdate < date:
        maxdate = date
# Last 7 days for test
splitdate = maxdate - 86400 * 7
print('Split date', splitdate)
train_sess = [x for x in dates if x[1] < splitdate]
test_sess = [x for x in dates if x[1] > splitdate]
# Sort sessions by date
train_sess = sorted(train_sess, key=operator.itemgetter(1))
test_sess = sorted(test_sess, key=operator.itemgetter(1))
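# At this point sess_clicks maps session id -> list of item ids (raw string ids),
# and train_sess / test_sess are (session id, date) pairs in chronological order.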
# Choosing item count >= 5 gives approximately the same number of items as reported in the paper
item_dict = {}
item_ctr = 1
train_seqs = []
train_dates = []
# Convert training sessions to sequences and renumber items to start from 1
for s, date in train_sess:
    seq = sess_clicks[s]
    outseq = []
    for i in seq:
        if i in item_dict:
            outseq += [item_dict[i]]
        else:
            outseq += [item_ctr]
            item_dict[i] = item_ctr
            item_ctr += 1
    if len(outseq) < 2:  # Doesn't occur
        continue
    train_seqs += [outseq]
    train_dates += [date]
test_seqs = []
test_dates = []
# Convert test sessions to sequences, ignoring items that do not appear in the training set
for s, date in test_sess:
    seq = sess_clicks[s]
    outseq = []
    for i in seq:
        if i in item_dict:
            outseq += [item_dict[i]]
    if len(outseq) < 2:
        continue
    test_seqs += [outseq]
    test_dates += [date]
print(item_ctr)  # = number of distinct training items + 1 (item ids start at 1)
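# Data augmentation: every prefix of a session becomes one sample whose label is
# the click that immediately follows it, e.g. [1, 2, 3] -> ([1, 2], 3) and ([1], 2).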
def process_seqs(iseqs, idates):
    out_seqs = []
    out_dates = []
    labs = []
    for seq, date in zip(iseqs, idates):
        for i in range(1, len(seq)):
            tar = seq[-i]
            labs += [tar]
            out_seqs += [seq[:-i]]
            out_dates += [date]
    return out_seqs, out_dates, labs
tr_seqs, tr_dates, tr_labs = process_seqs(train_seqs, train_dates)
te_seqs, te_dates, te_labs = process_seqs(test_seqs, test_dates)
train = (tr_seqs, tr_labs)
test = (te_seqs, te_labs)
# Write the train / test pickles (output paths left blank in this example)
with open('', 'wb') as f1:
    pickle.dump(train, f1)
with open('', 'wb') as f2:
    pickle.dump(test, f2)
print('Done.')
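# The resulting pickles can be loaded back later with standard pickle, e.g.:
#   with open('', 'rb') as f:           # same (blank) path used above
#       seqs, labs = pickle.load(f)     # lists of input sequences and next-item labels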