-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmost_common_data_sampler.py
87 lines (67 loc) · 2.44 KB
/
most_common_data_sampler.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
# -*- coding: utf-8 -*-
"""Most common data sampler.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1HrGG5xzalgHLSdY96B_A26b8qXNwjbHO
"""
!pip install xlrd==2.0.1
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import io
from google.colab import files
import numpy as np
from datetime import datetime,timedelta
def read_dir_file(case_f):
    """Return the name of the first uploaded file or first zip member.

    Reads the module-level ``uploaded`` mapping (the result of
    ``files.upload()``), whose keys are the uploaded file names.

    Args:
        case_f: ``0`` when a single file was uploaded; any other value when
            a zipped directory was uploaded, in which case the archive is
            extracted into the current working directory.

    Returns:
        For ``case_f == 0``, the first uploaded file name. Otherwise the
        first member name of the first non-empty archive. ``None`` when
        nothing was uploaded or every archive was empty.
    """
    # The file never imports zipfile at module level, so the zip branch
    # used to fail with NameError; import it where it is needed.
    import zipfile

    for name in uploaded:
        if case_f == 0:  # case of uploading one file only
            print('\n file name: ', name)
            return name

        # ZipFile opens `name` as a path — assumes the uploaded archive is
        # also present on disk under that name (TODO confirm for Colab).
        with zipfile.ZipFile(name, 'r') as archive:
            archive.extractall()
            members = archive.namelist()
        if members:
            print('\n main directory name: ', members[0])
            return members[0]
    return None
def xlookup(lookup_value, lookup_array, return_array, if_not_found: str = ''):
    """Return the first ``return_array`` entry whose key equals ``lookup_value``.

    Args:
        lookup_value: Value to search for.
        lookup_array: Keys to search. A pandas Series or NumPy array is
            compared with ``==`` as before; a plain list or tuple is
            compared element by element.
        return_array: Object supporting ``.loc`` with a boolean mask (for
            example a pandas Series); the result is taken from here.
        if_not_found: Value returned when nothing matches. The default
            empty string means "return ``0``".

    Returns:
        The first matching entry of ``return_array`` as a Python scalar,
        or ``0`` / ``if_not_found`` when there is no match.
    """
    if isinstance(lookup_array, (list, tuple)):
        # `[...] == value` on a plain sequence is a single bool, which
        # cannot select rows; build an element-wise mask instead.
        mask = [item == lookup_value for item in lookup_array]
    else:
        mask = lookup_array == lookup_value

    matches = return_array.loc[mask]
    if matches.empty:
        return 0 if if_not_found == '' else if_not_found
    # Convert only the first hit rather than the whole match set.
    return matches.iloc[:1].tolist()[0]
# Prompt for an upload, pick the first uploaded file name, and parse that
# file's bytes as CSV into df1.
uploaded = files.upload()
fileName = read_dir_file(0)
csv_buffer = io.BytesIO(uploaded[fileName])
df1 = pd.read_csv(csv_buffer)
import pickle
import numpy as np
import pandas as pd
from collections import Counter
# Bare attribute lookup; has no effect when run as a script
# (presumably a leftover notebook inspection cell).
df1.columns

# Count how many rows each user id and each movie id appears in.
each_user_count = Counter(df1.userId)
each_item_count = Counter(df1.movieId)

# Ids of the 10000 most frequent users and the 2000 most frequent movies.
user_ids = [uid for uid, _ in each_user_count.most_common(10000)]
item_ids = [mid for mid, _ in each_item_count.most_common(2000)]

# Keep only rows whose user AND movie are both among the retained ids;
# .copy() makes the subset independent of df1.
keep_user = df1.userId.isin(user_ids)
keep_item = df1.movieId.isin(item_ids)
df_small = df1[keep_user & keep_item].copy()
df_small.shape
# Map every retained user id to a dense 0-based index that follows the
# order of user_ids; enumerate replaces the hand-maintained counter.
new_user_id_map = {old: new for new, old in enumerate(user_ids)}
print("i", len(user_ids))

# Same dense re-indexing for the retained movie ids.
new_item_id_map = {old: new for new, old in enumerate(item_ids)}
print("i", len(item_ids))
# Replace the original ids with their dense indices. Series.map performs
# one dict lookup per value, avoiding the per-row Series that a row-wise
# DataFrame.apply(axis=1) builds for every record.
# NOTE: map yields NaN for an id missing from the dict instead of raising
# KeyError — assumes df_small only holds ids present in the maps.
df_small.loc[:, 'userId'] = df_small['userId'].map(new_user_id_map)
df_small.loc[:, 'movieId'] = df_small['movieId'].map(new_item_id_map)
df_small.head()

# NOTE(review): to_csv also writes the row index as an unnamed first
# column; pass index=False if consumers do not need it — confirm.
df_small.to_csv('smallmovie_rating.csv')
files.download('smallmovie_rating.csv')