-
Notifications
You must be signed in to change notification settings - Fork 1
/
Regression.R
193 lines (132 loc) · 7.67 KB
/
Regression.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
#
# Regression.R
# Obesity Risk
#
# Created by Diego Revilla on 26/04/24
# Copyright � 2024 . All Rights reserved
#
############################################################################################
# LIBRARIES
library(ggplot2) # For some basic plotting while exploring the data
library(faraway) # For data (assuming it contains 'obesity_data' dataset)
library(arules) # For association rules mining (not used in the provided code)
library(ResourceSelection) # For model selection (not used in the provided code)
library(caret) # For machine learning model evaluation
library(pROC) # For ROC curve analysis
library(dplyr) # For data manipulation
############################################################################################
# DATA LOADING
obesity_data_loading <- function(path) {
data <- read.csv(path)
factorized_columns <- c('Gender', 'CAEC', 'CALC', 'MTRANS', 'NObeyesdad')
booleanized_columns <- c('FAVC', 'family_history_with_overweight', 'SMOKE', 'SCC')
lapply(booleanized_columns, function(x) {data[[x]] <<- data[[x]] == 'yes'})
return(data)
}
obesity_data <- obesity_data_loading('Datasets/ObesityDataSet.csv')
############################################################################################
# LINEAR REGRESSION
data_linear <- obesity_data
# Assuming your dataset is called 'obesity_data' and you want to include only numeric variables
# Select only numeric variables
numeric_data <- obesity_data[, sapply(obesity_data, is.numeric)]
# Compute correlation matrix
correlation_matrix <- cor(numeric_data)
# Build dummy variables
data_linear$CALC <- ifelse(data_linear$CALC == 'Frequently' | data_linear$CALC == 'Always',1,0)
data_linear$CAEC <- ifelse(data_linear$CAEC == 'Frequently' | data_linear$CAEC == 'Always',1,0)
data_linear$MTRANS<- ifelse(data_linear$MTRANS == 'Automobile' | data_linear$CALC == 'Public_Transport',1,0)
# Calculate different linear regression models by backwards elimination
lm0 <- lm(Weight ~ Gender + Age + Height + family_history_with_overweight + FAVC + FCVC + NCP + CAEC + SMOKE + CH2O + SCC + FAF + TUE + CALC + MTRANS ,data = data_linear)
summary(lm0) # Exclude SMOKE variable from our model
lm1 <- lm(Weight ~ Gender + Age + Height + family_history_with_overweight + FAVC + FCVC + NCP + CAEC + CH2O + SCC + FAF + TUE + CALC + MTRANS ,data = data_linear)
summary(lm1) # Exclude CALC variable from our model
lm2 <- lm(Weight ~ Gender + Age + Height + family_history_with_overweight + FAVC + FCVC + NCP + CAEC + CH2O + SCC + FAF + TUE + MTRANS ,data = data_linear)
summary(lm2) # Exclude NCP variable from our model
lm3 <- lm(Weight ~ Gender + Age + Height + family_history_with_overweight + FAVC + FCVC + CAEC + CH2O + SCC + FAF + TUE + MTRANS ,data = data_linear)
summary(lm3) # We are also going to exclude Gender (Explanation on the report)
lm4 <- lm(Weight ~ Age + Height + family_history_with_overweight + FAVC + FCVC + CAEC + CH2O + SCC + FAF + TUE + MTRANS ,data = data_linear)
summary(lm4)
# Let's study our refined model in detail
linear_regression_model <- lm4
# Calculate the confidence interval
confint(linear_regression_model)
# Summarize the regression results
anova(linear_regression_model)
# Plot the predicted weight against the actual weight
plot(obesity_data$Weight, fitted(linear_regression_model), main = "Actual vs. Predicted", xlab = "Actual Weight", ylab = "Predicted Weight")
abline(0, 1, col = "red") # Add a diagonal line for perfect prediction
# Let's check the normality of Residuals
plot(linear_regression_model, 2)
# Let's check the influential points with Cook's distance
plot(linear_regression_model, 4)
############################################################################################
# LOGISTIC REGRESSION
data_logistic <- obesity_data
# Convert the response variable to binary: Obesity_Type_III and Obesity_Type_II are marked as 1, others as 0
data_logistic$NObeyesdad <- ifelse(data_logistic$NObeyesdad == 'Obesity_Type_III' | data_logistic$NObeyesdad == 'Obesity_Type_II', 1, 0)
# Remove rows with missing values
data_logistic <- data_logistic[complete.cases(data_logistic),]
# Fit logistic regression model
model <- glm(NObeyesdad ~ NCP + CH2O + FAF + TUE, family = binomial(), data = data_logistic)
# Fit additional logistic regression models with different combinations of predictor variables
model1 <- glm(NObeyesdad ~ NCP + CH2O + FAF + TUE + Age + Height + CAEC + MTRANS + CALC, family = binomial(), data = data_logistic)
model2 <- glm(NObeyesdad ~ NCP + CH2O + FAF + FCVC + TUE, family = binomial(), data = data_logistic)
model3 <- glm(NObeyesdad ~ NCP + CH2O + FAF + Height + Age + TUE, family = binomial(), data = data_logistic)
model4 <- glm(NObeyesdad ~ NCP + CH2O + FAF + Age + TUE, family = binomial(), data = data_logistic)
model5 <- glm(NObeyesdad ~ NCP + CH2O + FAF + Height + TUE, family = binomial(), data = data_logistic)
model6 <- glm(NObeyesdad ~ NCP + CH2O + FAF + TUE + Weight, family = binomial(), data = data_logistic)
# Calculate likelihood ratio statistic and p-value for model comparison
lratiostat <- model1$null.deviance - model$deviance
dofstat <- model1$df.null - model$df.residual
pchisq(lratiostat, df = dofstat, lower.tail = FALSE)
# Calculate likelihood ratio statistic and p-value for model comparison
lratiostat <- model2$null.deviance - model$deviance
dofstat <- model2$df.null - model$df.residual
pchisq(lratiostat, df = dofstat, lower.tail = FALSE)
# Calculate likelihood ratio statistic and p-value for model comparison
lratiostat <- model3$null.deviance - model$deviance
dofstat <- model3$df.null - model$df.residual
pchisq(lratiostat, df = dofstat, lower.tail = FALSE)
# Calculate likelihood ratio statistic and p-value for model comparison
lratiostat <- model4$null.deviance - model$deviance
dofstat <- model4$df.null - model$df.residual
pchisq(lratiostat, df = dofstat, lower.tail = FALSE)
# Calculate likelihood ratio statistic and p-value for model comparison
lratiostat <- model5$null.deviance - model$deviance
dofstat <- model5$df.null - model$df.residual
pchisq(lratiostat, df = dofstat, lower.tail = FALSE)
# Calculate likelihood ratio statistic and p-value for model comparison
lratiostat <- model6$null.deviance - model$deviance
dofstat <- model6$df.null - model$df.residual
pchisq(lratiostat, df = dofstat, lower.tail = FALSE)
# Conduct Hosmer-Lemeshow test for model diagnosis
hoslem.test(data_logistic$NObeyesdad, model$fitted.values)
hoslem.test(data_logistic$NObeyesdad, model1$fitted.values)
hoslem.test(data_logistic$NObeyesdad, model2$fitted.values)
hoslem.test(data_logistic$NObeyesdad, model3$fitted.values)
hoslem.test(data_logistic$NObeyesdad, model4$fitted.values)
hoslem.test(data_logistic$NObeyesdad, model5$fitted.values)
hoslem.test(data_logistic$NObeyesdad, model6$fitted.values)
# Summarize model coefficients and interpret the exponentiated coefficients
summary(model1)
exp(model1$coefficients)
cp <- 0.5
# Extract response and predicted values
response <- data_logistic$NObeyesdad
prediction <- ifelse(model6$fitted.values < cp, 0, 1)
# Plot ROC curve
par(pty = "s")
Roc <- roc(response, model1$fitted.values, plot = TRUE, legacy.axes = TRUE, print.auc = TRUE)
plot(Roc$thresholds, Roc$sensitivities, type = "l", xlab = "Cut points", ylab = "Sensitivity/Specificity", bty = "n")
lines(Roc$thresholds, Roc$specificities)
text(x = 0.7, y = 0.05, labels = "Sensitivity")
text(x = 0.7, y = 0.95, labels = "Specificity")
abline(v = 0.084, col = "gray", lty = 2)
# Change threshold for classification
cp <- 0.35
# Generate predictions using new threshold
prediction <- ifelse(model1$fitted.values < cp, 0, 1)
# Evaluate model performance using confusion matrix with new threshold
tab <- confusionMatrix(as.factor(prediction), as.factor(response), positive = "1")
tab