-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathclean_data.py
55 lines (41 loc) · 1.32 KB
/
clean_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
# -*- coding: utf-8 -*-
"""
Created on Wed Sep 18 08:48:17 2019
@author: Lukas
"""
from Dataload import make_df
from categories import find_cat_labels
from date_cleaning import date_cleaning
import numpy as np
import warnings
def clean_data(path, ignore_warn = True):
    """
    Build the cleaned data set for the given path.

    path        : forwarded unchanged to make_df (the usage note in this file
                  passes a glob such as 'Datasets/**videos.csv' -- TODO confirm
                  what make_df accepts)
    ignore_warn : when true, installs a process-wide "ignore" warnings filter
                  before loading; the filter stays active after this returns

    Returns whatever date_cleaning produces from the frame built by make_df.
    """
    if ignore_warn:
        warnings.filterwarnings("ignore")

    raw_frame = make_df(path)

    # The category labels are looked up but not used further here; the call is
    # kept so any side effects or errors of find_cat_labels remain unchanged.
    _category_labels = find_cat_labels('Datasets/CA_category_id.json')

    return date_cleaning(raw_frame)
#df = clean_data('Datasets/**videos.csv')
def transform_data(data, cols, standardize=True, log_trans=True):
    """
    Select columns and optionally log-transform and/or standardize them.

    data        : table supporting ``data[cols]`` and column-wise ``mean``/``std``
                  (presumably a pandas DataFrame -- confirm against callers)
    cols        : numeric columns from data to transform
    standardize : when true, subtract each column's mean and divide by its
                  standard deviation (as computed by ``data.std(axis=0)``)
    log_trans   : when true, replace values by their natural logarithm first;
                  infinite results (e.g. log(0) = -inf) are set to 0

    Returns the transformed selection. The two options are independent: the
    log step yields plain log values, and scaling happens only when
    ``standardize`` is set. Columns whose standard deviation is 0 or undefined
    are not special-cased, so the division result is passed through as-is.
    """
    data = data[cols]

    if log_trans:
        # Zeros are handled explicitly right below, so only the divide-by-zero
        # warning is silenced; invalid-value warnings (negative input) remain.
        with np.errstate(divide="ignore"):
            logged = np.log(data)
        logged[np.isinf(logged)] = 0
        # Keep only the log values here. Rescaling them with the mean/std of
        # the untransformed data (as was done before) mixes two scales.
        data = logged

    if standardize:
        data = (data - data.mean(axis=0)) / data.std(axis=0)

    return data
"""
cols = ["likes", "dislikes", "views", "comment_count"]
data = clean_data('Datasets/**videos.csv')
data = transform_data(data, cols)
"""
#print("Mean: \n {} \n Std: \n {} \n Head: \n {}".format(data.mean(axis = 0), data.std(axis = 0), data.head()))
#data = clean_data('Datasets/**videos.csv')
#print(len(data))