---
title: "Data Cleaning"
author: "Georgia Christodoulou"
date: "2024-10-17"
output: html_document
---
```{r}
knitr::opts_chunk$set(warning = FALSE, message = FALSE)
```
The purpose of this code is to process data necessary for modeling. This includes:

1. Cleaning (handling missing values, feature engineering, value standardization, etc.)
2. Class balancing

We'll first load a few packages and the data:
```{r}
library(tidyverse)
library(dplyr)
library(data.table)
library(caret)
library(smotefamily)
application_train <- as.data.frame(fread("data/application_train.csv"))
application_test <- as.data.frame(fread("data/application_test.csv"))
glimpse(application_train)
```
This function performs the cleaning steps described above.
```{r}
application_cleaning <- function(data, train = TRUE) {
  # drop column groups not used for modeling
  data <- data[, !grepl("DOCUMENT|AVG|MODE|MEDI", names(data))]
  # impute NA with 0 for the social-circle and credit-bureau columns
  NA_0 <- grep("SOCIAL|BUREAU", names(data), value = TRUE)
  for (col in NA_0) {
    data[is.na(data[[col]]), col] <- 0
  }
  # convert numeric variables with 2 unique values (0/1 flags) to boolean
  for (col in names(data)) {
    if (is.numeric(data[[col]]) && length(unique(data[[col]])) == 2) {
      data[[col]] <- as.logical(data[[col]])
    }
  }
  # convert character variables with 2 unique values to boolean
  data$NAME_CONTRACT_TYPE <- data$NAME_CONTRACT_TYPE == "Cash loans"
  data$CODE_GENDER <- data$CODE_GENDER == "M"
  data$FLAG_OWN_CAR <- data$FLAG_OWN_CAR == "Y"
  data$FLAG_OWN_REALTY <- data$FLAG_OWN_REALTY == "Y"
  # rename "TARGET" to "DEFAULT"
  colnames(data)[colnames(data) == "TARGET"] <- "DEFAULT"
  # rename "CODE_GENDER" to "GENDER_MALE"
  colnames(data)[colnames(data) == "CODE_GENDER"] <- "GENDER_MALE"
  # rename "NAME_CONTRACT_TYPE" to "CASH_LOAN"
  colnames(data)[colnames(data) == "NAME_CONTRACT_TYPE"] <- "CASH_LOAN"
  # replace blanks with NA
  data[data == ""] <- NA
  # impute NAME_TYPE_SUITE with "Unaccompanied"
  data$NAME_TYPE_SUITE[is.na(data$NAME_TYPE_SUITE)] <- "Unaccompanied"
  # impute OCCUPATION_TYPE with "XNA"
  data$OCCUPATION_TYPE[is.na(data$OCCUPATION_TYPE)] <- "XNA"
  # convert all character columns to factors
  data[sapply(data, is.character)] <- lapply(data[sapply(data, is.character)], factor)
  # handle car owners with a missing OWN_CAR_AGE
  # (FLAG_OWN_CAR is logical by this point, so test it directly rather than against "Y")
  if (train) {
    # ONLY DROP ROWS IF THIS IS TRAINING
    data <- data[!(data$FLAG_OWN_CAR & is.na(data$OWN_CAR_AGE)), ]
  } else {
    # PRESERVE ROWS & IMPUTE IF THIS IS TESTING
    if (sum(is.na(data$OWN_CAR_AGE)) > 0) {
      median_own_car_age <- median(as.integer(data$OWN_CAR_AGE), na.rm = TRUE)
      data$OWN_CAR_AGE[data$FLAG_OWN_CAR & is.na(data$OWN_CAR_AGE)] <- median_own_car_age
    }
  }
  # add 1 year to all non-NA values of OWN_CAR_AGE, so that 0 can
  # unambiguously mean "no car" after the next step
  data$OWN_CAR_AGE <- ifelse(!is.na(data$OWN_CAR_AGE), data$OWN_CAR_AGE + 1, data$OWN_CAR_AGE)
  # replace remaining NAs in OWN_CAR_AGE (non-owners) with 0
  data$OWN_CAR_AGE[is.na(data$OWN_CAR_AGE)] <- 0
  # replace NAs in the EXT_SOURCE columns:
  # mean for source 1, median for sources 2 and 3
  ext1_mean <- mean(data$EXT_SOURCE_1, na.rm = TRUE)
  ext2_med <- median(data$EXT_SOURCE_2, na.rm = TRUE)
  ext3_med <- median(data$EXT_SOURCE_3, na.rm = TRUE)
  # add indicator columns flagging which values were imputed
  data$IMPUTED_EXT1 <- is.na(data$EXT_SOURCE_1)
  data$IMPUTED_EXT2 <- is.na(data$EXT_SOURCE_2)
  data$IMPUTED_EXT3 <- is.na(data$EXT_SOURCE_3)
  # replace NAs
  data$EXT_SOURCE_1[is.na(data$EXT_SOURCE_1)] <- ext1_mean
  data$EXT_SOURCE_2[is.na(data$EXT_SOURCE_2)] <- ext2_med
  data$EXT_SOURCE_3[is.na(data$EXT_SOURCE_3)] <- ext3_med
  # handle any remaining NA values
  if (train) {
    # ONLY DROP ROWS IF THIS IS TRAINING
    data <- na.omit(data)
  } else {
    # PRESERVE ROWS & IMPUTE IF THIS IS TESTING
    cols_w_na <- names(which(colSums(is.na(data)) > 0))
    for (col in cols_w_na) {
      if (is.numeric(data[[col]])) {
        value <- median(data[[col]], na.rm = TRUE)
        imputed_value <- if (is.integer(data[[col]])) as.integer(value) else value
      } else {
        # sample a replacement value in proportion to the observed values
        uniq_vals <- table(data[[col]]) |> prop.table()
        imputed_value <- sample(names(uniq_vals), 1, prob = as.vector(uniq_vals))
        if (is.logical(data[[col]])) imputed_value <- as.logical(imputed_value)
      }
      # fill in only the missing entries, not the whole column
      data[is.na(data[[col]]), col] <- imputed_value
    }
  }
  return(data)
}
```
We'll now apply that to each data set, both train and test:
```{r}
application_train_clean <- application_cleaning(application_train)
application_test_clean <- application_cleaning(application_test, train = FALSE)
```
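As a quick sanity check (not part of the original pipeline), we can confirm that the cleaned training set has no remaining missing values and that the renamed columns are in place:
```{r}
# total remaining NAs should be 0 after cleaning
sum(is.na(application_train_clean))
# the renamed columns should all be present
c("DEFAULT", "GENDER_MALE", "CASH_LOAN") %in% names(application_train_clean)
```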
We'll write these files to the "data/" directory for easy reference during modeling.
```{r}
fwrite(
  application_train_clean,
  file = "data/application_train_clean.csv",
  row.names = FALSE
)
fwrite(
  application_test_clean,
  file = "data/application_test_clean.csv",
  row.names = FALSE
)
```
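One caveat: CSV files don't preserve R column types, so the factor columns created during cleaning come back as plain character when these files are re-read. A minimal sketch of reloading for modeling (the re-conversion step is our addition, not part of the original pipeline):
```{r, eval=FALSE}
# reload the cleaned training data; fread restores TRUE/FALSE as logical,
# but factor columns come back as plain character
reloaded <- as.data.frame(fread("data/application_train_clean.csv"))
# re-convert character columns to factors before modeling
reloaded[sapply(reloaded, is.character)] <-
  lapply(reloaded[sapply(reloaded, is.character)], factor)
```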
The next function will create a class-balanced data set using **SMOTE** (Synthetic Minority Oversampling Technique), which generates synthetic minority-class observations by interpolating between existing minority-class points and their nearest neighbors.
```{r}
application_smote <- function(data) {
  # CHECK FOR SINGLE-VALUE COLUMNS
  n_unique <- sapply(data, function(x) length(unique(x)))
  remove_cols <- names(n_unique[n_unique == 1])
  # FORMAT DATA
  data_clean <-
    data |>
    select(-all_of(remove_cols)) |>
    select(-SK_ID_CURR) |>
    mutate(
      DEFAULT = factor(DEFAULT),
      across(where(is.character) & -DEFAULT, ~ factor(make.names(.))),
      across(where(is.logical), ~ factor(ifelse(., "Y", "N")))
    )
  # CONFIRM IMBALANCE
  print("--- Old Balance ---")
  print(table(data_clean$DEFAULT) |> prop.table())
  # ONE-HOT-ENCODE VARIABLES WITH {CARET}
  dmy <- dummyVars("~ . -DEFAULT", data_clean)
  data_dmy <- data.frame(predict(dmy, data_clean))
  # APPLY SMOTE
  smote_results <- SMOTE(
    data_dmy,
    target = data_clean$DEFAULT
  )
  # EXTRACT SMOTE DATA
  data_smote <- smote_results$data |>
    mutate(DEFAULT = class) |>
    select(-class)
  # CONFIRM REBALANCE
  print("--- New Balance ---")
  print(table(data_smote$DEFAULT) |> prop.table())
  return(data_smote)
}
```
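To see what SMOTE itself does before applying it to the real data, here is a minimal, self-contained sketch on made-up data (the 95/5 split, feature names, and `K = 3` are invented for illustration):
```{r}
# toy imbalanced data: two numeric features, 95% class "0", 5% class "1"
set.seed(42)
toy <- data.frame(
  x1 = c(rnorm(95, mean = 0), rnorm(5, mean = 3)),
  x2 = c(rnorm(95, mean = 0), rnorm(5, mean = 3))
)
toy_target <- factor(c(rep(0, 95), rep(1, 5)))
print(table(toy_target) |> prop.table())         # heavily imbalanced
# SMOTE interpolates between minority points and their nearest neighbors
toy_smote <- SMOTE(toy, target = toy_target, K = 3)
print(table(toy_smote$data$class) |> prop.table())  # much closer to balanced
```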
Now, we'll apply this only to the training set; the test set is left untouched so that model evaluation happens on real rather than synthetic observations:
```{r}
application_train_smote <- application_smote(application_train_clean)
```
We'll then write it to the "data/" directory:
```{r}
fwrite(
  application_train_smote,
  file = "data/application_train_smote.csv",
  row.names = FALSE
)
```
We now have a cleaned test set along with both a cleaned and a SMOTE-balanced training set for use in all of our modeling.