-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathutils.py
76 lines (70 loc) · 2.31 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
import numpy as np
import pandas as pd
import torch
# fill genres columns
def find_genres_columns(current_genres, genres_title_list):
    """Encode genre membership as a 0/1 vector.

    Args:
        current_genres: Container that is probed with ``in`` once for every
            known genre name.
        genres_title_list: Ordered genre names; the result has one slot per
            entry, in the same order.

    Returns:
        A 1-D NumPy array holding 1 where the genre name is found in
        ``current_genres`` and 0 where it is not.
    """
    flags = [int(title in current_genres) for title in genres_title_list]
    return np.array(flags)
def _split_title_and_year(raw_title):
    """Split a raw title such as ``"Toy Story (1995)"`` into ``(name, year)``.

    The year is read from the four characters that precede the last
    character of the stripped title; ``-1`` is returned when they do not
    parse as an integer.  The name is the text in front of the first ``(``,
    with trailing whitespace and any stray parentheses removed.
    """
    text = str(raw_title).strip()

    try:
        year = int(text[-5:-1])
    except ValueError:
        year = -1

    first_paren = text.find('(')
    last_paren = text.rfind('(')
    if first_paren > 0:
        name = text[:first_paren]
    elif first_paren == 0 and last_paren > 0:
        # The title itself starts with a parenthesis: only cut off the
        # final parenthesised group instead of discarding everything.
        name = text[:last_paren]
    else:
        # No usable parenthesis: keep the title whole (slicing relative to
        # a failed ``find`` would silently clip its last characters).
        name = text
    return name.rstrip().replace('(', '').replace(')', ''), year


def create_movie_features(movies_df):
    """Build a float32 tensor of per-movie features from a movies table.

    The frame is read positionally: column 0 is the movie id, column 1 the
    raw title and column 2 a ``|``-separated genre string.  The input frame
    is not modified; all work happens on a deep copy.

    Steps:
      1. Collect every genre name in order of first appearance.
      2. Rewrite ``title`` to the cleaned name and add a ``year`` column
         (``-1`` when no year could be parsed).
      3. Add one 0/1 column per genre.
      4. Drop movies flagged ``(no genres listed)`` together with that
         column, when such a genre exists in the data.
      5. Drop the ``genres`` column and return every column from position 3
         onwards as the feature matrix.  For a three-column input these are
         exactly the genre indicator columns.

    Args:
        movies_df: pandas DataFrame laid out as described above.

    Returns:
        A 2-D ``torch.float32`` tensor with one row per remaining movie.

    Side effects:
        Prints the shape of the feature matrix.
    """
    no_genre_label = '(no genres listed)'
    rows = movies_df.values

    # Genre names, de-duplicated while keeping first-appearance order.
    split_genres = [row[2].split('|') for row in rows]
    genres_title_list = list(
        dict.fromkeys(g for movie_genres in split_genres for g in movie_genres)
    )

    parsed_titles = [_split_title_and_year(row[1]) for row in rows]
    name_list = [name for name, _ in parsed_titles]
    year_list = [year for _, year in parsed_titles]

    # One-hot genre matrix; a set per movie keeps each membership test O(1).
    # The reshape keeps the matrix 2-D even when there are no rows.
    genre_matrix = np.array(
        [
            [1 if genre in present else 0 for genre in genres_title_list]
            for present in map(set, split_genres)
        ],
        dtype=np.int64,
    ).reshape(len(split_genres), len(genres_title_list))

    temp = movies_df.copy(deep=True)
    temp['title'] = name_list
    temp['year'] = np.array(year_list, dtype=np.int64)
    for i, genre in enumerate(genres_title_list):
        temp[genre] = genre_matrix[:, i]

    # The placeholder genre only exists in some datasets; filter on it only
    # when present instead of failing with a KeyError.
    if no_genre_label in temp.columns:
        temp = temp[temp[no_genre_label] == 0].drop(columns=[no_genre_label])
    temp = temp.drop(columns=['genres'], errors='ignore')

    # Everything after the first three columns is a feature.  Copy into a
    # fresh C-ordered float32 array so torch gets a writable buffer.
    movies_features = np.array(
        temp.iloc[:, 3:].to_numpy(), dtype=np.float32, order='C'
    )
    print(movies_features.shape)
    return torch.from_numpy(movies_features)