forked from inderpartap/trendcast
-
Notifications
You must be signed in to change notification settings - Fork 0
/
feature_engineering.py
112 lines (88 loc) · 3.61 KB
/
feature_engineering.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
from datetime import datetime
import numpy as np
import pandas as pd
def get_data_type(path):
    """Infer per-column dtypes of a CSV file from a small leading sample.

    Args:
        path: Anything ``pandas.read_csv`` accepts as its first argument
            (file path or file-like object).

    Returns:
        A dict mapping each column name to the dtype pandas inferred for it.

    NOTE(review): only the first 30 data rows are inspected, so a column
    whose later rows do not fit the sampled dtype (e.g. integers followed by
    missing values) will be reported with a dtype that is too narrow --
    confirm this is acceptable for the input files.
    """
    sample = pd.read_csv(path, nrows=30)
    return sample.dtypes.to_dict()
def _day_of_month(df):
    """Return the day-of-month for every row of *df*, or ``None`` if unknown.

    The ``date`` column is preferred; a ``day`` column is used as a fallback
    (assumed to already hold the numeric day of month -- TODO confirm).
    """
    if "date" in df.columns:
        return pd.to_datetime(df["date"]).dt.day
    if "day" in df.columns:
        return df["day"]
    return None


def feature_engineering(df):
    """Add calendar, quantity and location features to *df*.

    Reads the columns ``month``, ``dayOfWeek``, ``totalQuantity``,
    ``province`` and ``city`` (and ``date`` or ``day`` for the day-of-month
    encoding). The new columns are written onto *df* itself, which is also
    returned.

    Columns added:
        quarter: calendar quarter (1-4) derived from ``month``.
        day_of_week_sin/cos, month_sin/cos, quarter_sin/cos, day_sin/cos:
            cyclical sine/cosine encodings.
        totQt_log, totQty_sqrt, totalQty_inverse: transforms of
            ``totalQuantity``.
        province_freq_encoding, city_freq_encoding: share of rows having the
            same province / city.
        province_label_encoding: integer code for AB, BC and SK; any other
            province maps to NaN.
        city_name_len: number of characters in the city name.

    Args:
        df: pandas DataFrame containing the columns listed above.

    Returns:
        The same DataFrame object with the feature columns added.
    """
    import warnings

    two_pi = 2.0 * np.pi
    quarter_map = {month: (month - 1) // 3 + 1 for month in range(1, 13)}
    province_map = {"AB": 1, "BC": 2, "SK": 3}
    n_rows = len(df)

    df["quarter"] = df.month.map(quarter_map)

    # Cyclical calendar encodings.
    df["day_of_week_sin"] = np.sin(df.dayOfWeek * (two_pi / 7))
    df["day_of_week_cos"] = np.cos(df.dayOfWeek * (two_pi / 7))
    df["month_sin"] = np.sin((df.month - 1) * (two_pi / 12))
    df["month_cos"] = np.cos((df.month - 1) * (two_pi / 12))
    df["quarter_sin"] = np.sin((df.quarter - 1) * (two_pi / 4))
    df["quarter_cos"] = np.cos((df.quarter - 1) * (two_pi / 4))

    # The day encoding used to be computed from ``quarter`` (a copy-paste of
    # the lines above), which made it a rescaled duplicate of the quarter
    # feature. It is now derived from the actual day of month. The original
    # 30-step period is kept, so day 31 maps to the same angle as day 1.
    day_of_month = _day_of_month(df)
    if day_of_month is None:
        warnings.warn(
            "feature_engineering: no 'date' or 'day' column found; "
            "day_sin/day_cos fall back to the quarter-based value",
            stacklevel=2,
        )
        day_of_month = df["quarter"]
    df["day_sin"] = np.sin((day_of_month - 1) * (two_pi / 30))
    df["day_cos"] = np.cos((day_of_month - 1) * (two_pi / 30))

    # Quantity transforms. The small offset keeps log() away from log(0).
    df["totQt_log"] = np.log(abs(df.totalQuantity) + 0.0001)
    df["totQty_sqrt"] = np.sqrt(abs(df.totalQuantity))
    # NOTE(review): the origin of the 1.534 offset is not documented here.
    df["totalQty_inverse"] = 1 / (df.totalQuantity + 1.534)

    # Location encodings.
    df["province_freq_encoding"] = (
        df.province.map(df.province.value_counts()) / n_rows
    )
    df["province_label_encoding"] = df.province.map(province_map)
    df["city_freq_encoding"] = df.city.map(df.city.value_counts()) / n_rows
    # .str.len() is vectorised and yields NaN for missing city names instead
    # of raising like len(NaN) would.
    df["city_name_len"] = df.city.str.len()
    return df
def split_train_test_ts(df, startdate, endDate, no_months):
    """Split *df* chronologically into a training and a test frame.

    The test period starts on the first day of the month that lies
    ``no_months`` months before the month of ``endDate`` and runs through
    ``endDate``. Rows from ``startdate`` up to (but not including) that day
    form the training set, so the two frames never share a row.

    Args:
        df: DataFrame with a ``date`` column. The caller's frame is not
            modified.
        startdate: First date to include, as a ``"YYYY-MM-DD"`` string.
        endDate: Last date to include, as a ``"YYYY-MM-DD"`` string.
        no_months: Number of months to step back from ``endDate``'s month to
            find the start of the test period.

    Returns:
        Tuple ``(X_train, X_test)``. Both frames are indexed by date, sorted
        by that index, and carry the dates again as a trailing ``date``
        column.

    Raises:
        ValueError: If ``endDate`` does not match ``"%Y-%m-%d"``.
    """
    # Keep the dates as a regular column as well as the index. assign()
    # returns a new frame, so no helper column leaks into the caller's df.
    df = df.assign(Date=df["date"]).set_index("date").sort_index()

    # Step back whole months using a single month counter. The previous
    # "(month - no_months) % 12" produced month 0 (an invalid date) whenever
    # the result should have been December, and mis-adjusted the year for
    # spans longer than twelve months.
    end = datetime.strptime(endDate, "%Y-%m-%d")
    month_counter = end.year * 12 + (end.month - 1) - no_months
    test_year, test_month_index = divmod(month_counter, 12)
    test_data_startdate = datetime(
        test_year, test_month_index + 1, 1
    ).strftime("%Y-%m-%d")

    df = df.rename(columns={"Date": "date"})

    # Label slices are inclusive on both ends, so slicing train and test
    # separately would put the rows dated test_data_startdate into both
    # sets. Slice the overall window once and partition it with a mask.
    window = df.loc[startdate:endDate, :]
    in_test = window.index >= test_data_startdate
    X_train = window.loc[~in_test, :]
    X_test = window.loc[in_test, :]
    return (X_train, X_test)
def main():
    """Build the final train/test CSV files from the raw CSV inputs.

    Reads ``train_data.csv`` and ``test_data.csv`` (paths relative to the
    working directory), concatenates them, casts the positional column
    ranges 2-24 to float32 and 25-45 to int32, parses the ``date`` column,
    adds the engineered features, re-splits the rows chronologically with
    the last three months as test data, and writes the result to
    ``data/train_data_final.csv`` and ``data/test_data_final.csv``.
    """
    from pathlib import Path

    # Add the directory of the following data!
    training_data = "train_data.csv"
    testing_data = "test_data.csv"
    output_training_data = "data/train_data_final.csv"
    output_testing_data = "data/test_data_final.csv"

    # Read both files using the dtypes inferred from the training file.
    types = get_data_type(training_data)
    train = pd.read_csv(training_data, dtype=types)
    test = pd.read_csv(testing_data, dtype=types)
    df = pd.concat([train, test])

    # Down-cast by column label. Assigning the converted block back through
    # df.iloc[:, a:b] sets the values in place on pandas >= 2.0 and leaves
    # the column dtypes unchanged, so the cast would silently be lost.
    target_dtypes = {column: "float32" for column in df.columns[2:25]}
    target_dtypes.update({column: "int32" for column in df.columns[25:46]})
    df = df.astype(target_dtypes)

    # Adjusting the format of the data!
    df["date"] = pd.to_datetime(df["date"])

    # Feature engineering!
    df = feature_engineering(df)

    # Define the start and end date for the train/test split; the size of
    # the test set is given as a number of months.
    no_months_test_data = 3
    startdate = min(df.date).strftime("%Y-%m-%d")
    endDate = max(df.date).strftime("%Y-%m-%d")
    X_train, X_test = split_train_test_ts(
        df, startdate, endDate, no_months_test_data
    )

    # Make sure the output directory exists so the run does not fail at the
    # very last step, then save the train and test data.
    for output_path in (output_training_data, output_testing_data):
        Path(output_path).parent.mkdir(parents=True, exist_ok=True)
    X_train.to_csv(output_training_data, index=False)
    X_test.to_csv(output_testing_data, index=False)
# Run the pipeline only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()