forked from gilpasternak35/SongRecommender
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpreprocess_tools.py
68 lines (48 loc) · 2.04 KB
/
preprocess_tools.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
# Functional Utility file maintaining necessary data preprocessing scripts
import os
import json
import re
from tqdm import tqdm
from collections import defaultdict
def dataloader_pipeline(file_list: list, data_dir: str = "./data/") -> list:
    """
    Pipeline for loading in data
    @param file_list: A list of file names (relative to data_dir) to load in;
        names that do not start with "mpd" are skipped
    @param data_dir: Directory holding the data files (defaults to "./data/")
    @returns data: A list of playlists from these files, in file_list order
    """
    # Playlists accumulated across every matching file
    data = []

    # Regular expression for desired filenames (re.match anchors at the start,
    # so this accepts any name beginning with "mpd")
    desired_filename = re.compile(r"mpd.*")

    # Traversing through available datafiles
    print("Starting Dataloading...")
    for file in tqdm(file_list):
        # Skipping anything that doesn't follow the expected naming scheme
        if not desired_filename.match(file):
            continue

        # Explicit UTF-8 so decoding doesn't depend on the platform's
        # default locale encoding
        with open(os.path.join(data_dir, file), 'r', encoding="utf-8") as file_reader:
            data.extend(json.load(file_reader)["playlists"])

    print("Finished Dataloading...")
    return data
def build_relevant_ds(data: list) -> tuple:
    """
    Preprocesses data, simultaneously building relevant data structures
    @param data - a data list of playlist dictionaries to preprocess; each
        playlist needs a 'pid' and a 'tracks' list whose entries carry
        'track_name' and 'artist_name'
    @returns a tuple of three defaultdict(list) lookups:
        tracks per user (pid -> track names),
        users per track (track name -> pids),
        users per artist (artist name -> pids)
    """
    print("Preprocessing started...")
    tracks_per_user = defaultdict(list)
    users_per_track = defaultdict(list)
    users_per_artist = defaultdict(list)

    # Traversing through data and preprocessing
    for playlist in data:
        # Each playlist is treated as one "user", keyed by its playlist id
        user = playlist['pid']

        for track_info in playlist['tracks']:
            # Only the fields that are actually indexed are read
            track_name = track_info['track_name']
            artist_name = track_info['artist_name']

            # Appending data to data structures; repeats are kept, so a track
            # or artist occurring twice in a playlist is recorded twice
            tracks_per_user[user].append(track_name)
            users_per_track[track_name].append(user)
            users_per_artist[artist_name].append(user)

    return tracks_per_user, users_per_track, users_per_artist