-
Notifications
You must be signed in to change notification settings - Fork 0
/
fetch_data.py
62 lines (45 loc) · 1.82 KB
/
fetch_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
import os.path
from scipy.sparse import coo_matrix
def no_file():
    """Report on standard output that the dataset file could not be found.

    Returns:
        None. The function only prints a fixed message.
    """
    message = 'Dataset not found, please check'
    print(message)
def fetch_data(min_plays=200, file_path=None):
    """Load a tab-separated play-count file into a user x artist sparse matrix.

    Every non-blank line is split on tabs and read as
    ``user, artist_id, artist_name, plays``. Each distinct user and artist is
    given a consecutive integer id in order of first appearance, whether or
    not its play count passes the filter, so the ids always match the
    matrix dimensions.

    Args:
        min_plays: Only entries with strictly more plays than this value are
            stored in the matrix.
        file_path: Location of the TSV file. When ``None`` the original
            hard-coded dataset path is used.

    Returns:
        ``None`` (after ``no_file()`` has reported the problem) when the file
        does not exist. Otherwise a dict with:

        * ``'matrix'``: ``coo_matrix`` of shape ``(n_users, n_artists)``
          holding the play counts above ``min_plays``.
        * ``'artists'``: dict mapping artist id string to
          ``{'name': artist_name, 'id': column_index}``.
        * ``'users'``: number of distinct users seen.

    Raises:
        IndexError: A non-blank line has fewer than four tab-separated fields.
        ValueError: The play-count field is not an integer.
    """
    if file_path is None:
        file_path = 'E:/parto/Projects/recommender_system_py-master/recommender_system_py-master/data/100k_lines_lastfm.tsv'

    if not os.path.exists(file_path):
        return no_file()

    # Parallel lists used to build the COO matrix: value, row index, col index.
    data, row, col = [], [], []
    # artist id string -> {'name', 'id'}; user string -> row index.
    artists, users = {}, {}

    with open(file_path, encoding="utf8") as data_file:
        for line in data_file:
            # A stray empty line (e.g. at the end of the file) carries no
            # record; skip it instead of failing on the missing fields.
            if not line.strip():
                continue

            fields = line.split('\t')
            user = fields[0]
            artist_id = fields[1]
            artist_name = fields[2]
            plays = int(fields[3])  # int() tolerates the trailing newline

            # Ids are assigned on first sight, before filtering on plays.
            user_index = users.setdefault(user, len(users))
            if artist_id not in artists:
                artists[artist_id] = {
                    'name': artist_name,
                    'id': len(artists),
                }

            # Keep only entries played more than ``min_plays`` times.
            if plays > min_plays:
                data.append(plays)
                row.append(user_index)
                col.append(artists[artist_id]['id'])

    # Pass the shape explicitly: inferring it from the largest stored index
    # would drop trailing users/artists whose entries were all filtered out,
    # and fails outright when nothing passes the filter.
    coo = coo_matrix((data, (row, col)), shape=(len(users), len(artists)))

    return {
        'matrix': coo,
        'artists': artists,
        'users': len(users),
    }