-
Notifications
You must be signed in to change notification settings - Fork 9
/
Copy pathsimplemedian.py
106 lines (73 loc) · 4.42 KB
/
simplemedian.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
"""
Private Score: 0.14598, Public Score: 0.14001
"""
import datetime as dt
import pandas as pd
from pandas import Series, DataFrame
import numpy as np
# Turn off pandas' chained-assignment check (the SettingWithCopyWarning that
# chained-indexing assignments would otherwise trigger).
pd.set_option("mode.chained_assignment", None)
################################################################
# Import CSV Data into Pandas DataFrames #
################################################################
# Read the training rows first, then the rows to predict, each into its own
# DataFrame.
training_df, test_df = (
    pd.read_csv(csv_path) for csv_path in ("data/train.csv", "data/test.csv")
)
# The per-store metadata file is not used by this model:
# store_df = pd.read_csv("data/store.csv")
# Quick look at the loaded frames (debugging aid):
# print(training_df.head())
# print(store_df.head())
# print(test_df.head())
################################################################
# Process Data (Universal) #
################################################################
def is_nan(val):
    """Return whether ``val`` is a missing value.

    Works on scalars and element-wise on pandas/NumPy containers
    (Series, DataFrame, ndarray), in which case a boolean object of the
    same shape is returned and can be used directly as a mask.

    ``pd.isna`` is used instead of the ``val != val`` trick because the
    self-comparison only recognises float NaN reliably: it yields ``pd.NA``
    (not ``True``) for nullable dtypes and ``False`` for a scalar ``None``.
    """
    return pd.isna(val)
############################################
# training_df & test_df #
############################################
# Fill missing "Open" values in test_df: a store is assumed open (1) unless
# DayOfWeek == 7 (presumably Sunday), in which case it is assumed closed (0).
# fillna aligns the replacement Series on the index, so each missing row gets
# the value computed from its own DayOfWeek. Assigning the whole column back
# avoids chained indexing (test_df["Open"][mask] = ...), which may write to a
# temporary copy and has no effect at all under pandas Copy-on-Write.
test_df["Open"] = test_df["Open"].fillna((test_df["DayOfWeek"] != 7).astype(int))
# Create "Year" & "Month" columns
# training_df["Year"] = training_df["Date"].apply(lambda x: dt.datetime.strptime(x, "%Y-%m-%d").year)
# training_df["Month"] = training_df["Date"].apply(lambda x: dt.datetime.strptime(x, "%Y-%m-%d").month)
# test_df["Year"] = test_df["Date"].apply(lambda x: dt.datetime.strptime(x, "%Y-%m-%d").year)
# test_df["Month"] = test_df["Date"].apply(lambda x: dt.datetime.strptime(x, "%Y-%m-%d").month)
# Create "YearMonth" column
# training_df["YearMonth"] = training_df["Date"].apply(lambda x: str(dt.datetime.strptime(x, "%Y-%m-%d").year) + "-" + str(dt.datetime.strptime(x, "%Y-%m-%d").month))
# test_df["YearMonth"] = test_df["Date"].apply(lambda x: str(dt.datetime.strptime(x, "%Y-%m-%d").year) + "-" + str(dt.datetime.strptime(x, "%Y-%m-%d").month))
# "StateHoliday" has values "0" & 0
# training_df["StateHoliday"].loc[training_df["StateHoliday"] == 0] = "0"
# test_df["StateHoliday"].loc[test_df["StateHoliday"] == 0] = "0"
# Create "StateHolidayBinary" column
# training_df["StateHolidayBinary"] = training_df["StateHoliday"].map({0: 0, "0": 0, "a": 1, "b": 1, "c": 1})
# test_df["StateHolidayBinary"] = test_df["StateHoliday"].map({0: 0, "0": 0, "a": 1, "b": 1, "c": 1})
# One-hot encoding of "DayOfWeek" & "StateHoliday" columns
# training_df = pd.get_dummies(training_df, columns=["DayOfWeek", "StateHoliday"])
# test_df = pd.get_dummies(test_df, columns=["DayOfWeek", "StateHoliday"])
############################################
# store_df #
############################################
# Fill NaN values in store_df for "CompetitionDistance" = 0 (since no record exists where "CD" = NaN & "COS[Y/M]" = !NaN)
# store_df["CompetitionDistance"][is_nan(store_df["CompetitionDistance"])] = 0
# Fill NaN values in store_df for "CompetitionSince[X]" with 1900-01
# store_df["CompetitionOpenSinceYear"][(store_df["CompetitionDistance"] != 0) & (is_nan(store_df["CompetitionOpenSinceYear"]))] = 1900
# store_df["CompetitionOpenSinceMonth"][(store_df["CompetitionDistance"] != 0) & (is_nan(store_df["CompetitionOpenSinceMonth"]))] = 1
# One-hot encoding of "StoreType" & "Assortment" columns
# store_df = pd.get_dummies(store_df, columns=["StoreType", "Assortment"])
################################################################
# Process Data (Custom) #
################################################################
# Any custom data processing goes here.
################################################################
# Training the Model & Predicting Sales #
################################################################
"""
This model simply calculates the median value for every ["Store", "DayOfWeek", "Promo"] combination and assigns that value as the prediction for every ["Store", "DayOfWeek", "Promo"] combination in the test data.
Features: Store, DayOfWeek, Promo
Assumptions:
- The only factors that significantly affect the sales in a particular store are "DayOfWeek" & "Promo".
"""
# Features that together define one prediction group.
columns = ["Store", "DayOfWeek", "Promo"]

# Median training "Sales" per group, flattened back into ordinary columns so
# it can be merged onto the test rows.
medians = training_df.groupby(columns)["Sales"].median()
medians = medians.reset_index()

# A left merge keeps every test row; a group that never occurs in the training
# data comes back with Sales = NaN.
test_df_modified = pd.merge(test_df, medians, on=columns, how="left")

# Fallback for unseen groups so the submission never contains blank
# predictions: first the store's overall median, then the overall training
# median. Rows that already received a group median are left unchanged.
store_medians = training_df.groupby("Store")["Sales"].median()
test_df_modified["Sales"] = (
    test_df_modified["Sales"]
    .fillna(test_df_modified["Store"].map(store_medians))
    .fillna(training_df["Sales"].median())
)

# A closed store sells nothing, whatever its group median says.
test_df_modified.loc[test_df_modified.Open == 0, "Sales"] = 0

# Write the submission file with just the row id and the predicted sales.
test_df_modified[["Id", "Sales"]].to_csv("predictions/simplemedian.csv", index=False)