-
Notifications
You must be signed in to change notification settings - Fork 1
/
cleaner.py
27 lines (25 loc) · 1.09 KB
/
cleaner.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
import pandas as pd
import numpy as np
# Remove all duplicate entries. Duplicates appear when we re-query too fast,
# before NextBus has had a chance to update.
def dedupe(df):
    """Drop rows that repeat an earlier (timestamp, route, stop) combination.

    The first occurrence of each combination is kept and the result is
    re-indexed from zero. The number of removed rows and the number of
    remaining rows are printed.

    Args:
        df: DataFrame with "timestamp", "route" and "stop" columns.

    Returns:
        A new DataFrame without the duplicate rows.
    """
    key_columns = ["timestamp", "route", "stop"]
    rows_before = len(df)

    unique_rows = df.drop_duplicates(subset=key_columns, keep="first")
    unique_rows = unique_rows.reset_index(drop=True)
    rows_after = len(unique_rows)

    print("Removed " + str(rows_before - rows_after) + " duplicates.")
    print("There are " + str(rows_after) + " instances.")
    return unique_rows
# Categorical -> One Hot
def onehot(df):
    """Append one-hot indicator columns for every object-dtype column of ``df``.

    Each distinct value ``v`` of an object-dtype column yields a float column
    named ``"is_" + v`` that holds 1.0 where the row equals ``v`` and 0.0
    elsewhere. Indicator columns are ordered by sorted value. The source
    columns are kept in the result.

    Args:
        df: DataFrame to encode. It is not modified.

    Returns:
        A ``(encoded_df, synthesized_features)`` tuple. ``synthesized_features``
        holds one list of new column names per encoded column, in the order
        the columns were encoded.

    Raises:
        TypeError: if an object column holds values that cannot be sorted
            together or concatenated to ``"is_"`` (e.g. missing values mixed
            with strings).
        ValueError: if a generated name collides with an existing column
            (raised by ``DataFrame.join``).
    """
    synthesized_features = []
    # Snapshot the column list so columns added below are never revisited.
    for column in list(df.columns):
        # Compare against the builtin ``object``: the ``np.object`` alias no
        # longer exists in current NumPy releases.
        if df[column].dtype != object:
            continue

        values = np.asarray(df[column], dtype=object)
        # Sorted distinct values plus, for each row, the position of its value.
        # This is done with NumPy directly so the result does not depend on
        # scikit-learn's encoder keyword arguments, which changed between
        # releases.
        classes, codes = np.unique(values, return_inverse=True)
        new_features = ["is_" + x for x in classes]

        # Row i of the identity matrix is the indicator vector of class i.
        indicators = np.eye(len(classes))[codes]
        # Reuse df's index so the join lines rows up even when df is not
        # indexed 0..n-1.
        # NOTE: join matches on index labels; duplicate labels in ``df`` would
        # still multiply rows.
        new_df = pd.DataFrame(indicators, columns=new_features, index=df.index)

        synthesized_features.append(new_features)
        df = df.join(new_df)
    return df, synthesized_features