# Appendix
## Read in the Data
```{r}
set.seed(1)
# Read the data
df <- read.csv("BitcoinHeistData.csv", stringsAsFactors=FALSE, header=TRUE)
summary(df)
```
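Before any cleaning, a quick structural check helps confirm that the columns were parsed as expected; a minimal sketch:
```{r}
# inspect dimensions and column types of the raw data
dim(df)
str(df)
```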
## Data Cleaning
```{r}
# subsample the full dataset (14,370 rows) to keep runtimes manageable
data <- df[sample(nrow(df), 14370, replace = FALSE), ]
# remove address column
data$address <- NULL
# convert predictor columns to numeric
data$year <- as.numeric(data$year)
data$day <- as.numeric(data$day)
data$length <- as.numeric(data$length)
data$weight <- as.numeric(data$weight)
data$count <- as.numeric(data$count)
data$looped <- as.numeric(data$looped)
data$neighbors <- as.numeric(data$neighbors)
data$income <- as.numeric(data$income)
# encode the response as binary: white = 0, ransomware = 1
data$label <- ifelse(data$label == "white", 0, 1)
# preview data
tail(data)
```
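As a sanity check on the subsample, it is worth confirming that both classes survived the sampling and that no missing values remain; a minimal sketch using the `data` frame above:
```{r}
# class balance and missingness in the subsample
table(data$label)
colSums(is.na(data))
```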
## Split into training and testing sets
```{r}
# hold out 4,790 rows (one third) for testing
flag <- sort(sample(nrow(data), 4790, replace = FALSE))
btctrain <- data[-flag, ]
btctest <- data[flag, ]
## true response values for the training and testing data
y1 <- btctrain$label
y2 <- btctest$label
```
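A quick check that the one-third holdout preserves the class proportions of the full sample; a sketch using the objects just defined:
```{r}
# class proportions in the training and testing sets
prop.table(table(y1))
prop.table(table(y2))
```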
## EDA
```{r}
# Visualize correlations between variables (label is still numeric here)
corr <- cor(data)
library(corrplot)
corrplot(corr, method = "circle")
# Set label as a factor for plotting and classification
data$label <- as.factor(data$label)
library(ggplot2)
# Response variable distribution
ggplot(data, aes(x = label)) +
  geom_bar(fill = "#702963") + xlab("Label") + ylab("Frequency") +
  ggtitle("Distribution of 'label' Response Variable")
```
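Since several predictors span many orders of magnitude, a log-scale view of one of them by class can be informative; a minimal sketch (the choice of `income` here is illustrative, not part of the original analysis):
```{r}
# income distribution by class on a log10 scale
ggplot(data, aes(x = income, fill = label)) +
  geom_histogram(bins = 50, position = "identity", alpha = 0.5) +
  scale_x_log10() + xlab("Income (log10 scale)") +
  ggtitle("Income by Label")
```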
## Boosting Method
```{r}
library(gbm)
library(magrittr)
library(dplyr)
# create grid search over learning rates
hyper_grid <- expand.grid(
  learning_rate = c(0.3, 0.1, 0.05, 0.01, 0.005),
  RMSE = NA,
  trees = NA,
  Time = NA
)
# execute grid search
for (i in seq_len(nrow(hyper_grid))) {
  # fit gbm
  set.seed(123) # for reproducibility
  train_time <- system.time({
    m <- gbm(
      formula = label ~ .,
      data = btctrain,
      distribution = "bernoulli",
      n.trees = 5000,
      shrinkage = hyper_grid$learning_rate[i],
      cv.folds = 10
    )
  })
  # record RMSE, optimal number of trees, and training time
  hyper_grid$RMSE[i] <- sqrt(min(m$cv.error))
  hyper_grid$trees[i] <- which.min(m$cv.error)
  hyper_grid$Time[i] <- train_time[["elapsed"]]
}
# results, sorted by cross-validated RMSE
arrange(hyper_grid, RMSE)
```
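Plotting the grid results can make the trade-off across learning rates easier to see than the printed table; a sketch in base graphics:
```{r}
# CV RMSE versus learning rate from the grid search
plot(hyper_grid$learning_rate, hyper_grid$RMSE, type = "b", log = "x",
     xlab = "Learning rate (log scale)", ylab = "CV RMSE")
```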
```{r}
# GBM with the selected learning rate
gbm.btc1 <- gbm(label ~ ., data = btctrain,
                distribution = "bernoulli",
                n.trees = 5000,
                shrinkage = 0.05,
                cv.folds = 10)
## Find the estimated optimal number of iterations
perf_gbm1 <- gbm.perf(gbm.btc1, method = "cv")
message("The optimal number of trees is: ", perf_gbm1)
```
```{r}
# search grid over tree depth
hyper_grid <- expand.grid(
  n.trees = perf_gbm1,
  shrinkage = 0.05,
  interaction.depth = c(3, 5, 7)
)
# create model fit function
model_fit <- function(n.trees, shrinkage, interaction.depth) {
  set.seed(123)
  m <- gbm(
    formula = label ~ .,
    data = btctrain,
    distribution = "bernoulli",
    n.trees = n.trees,
    shrinkage = shrinkage,
    interaction.depth = interaction.depth,
    cv.folds = 10
  )
  # compute cross-validated RMSE
  sqrt(min(m$cv.error))
}
# perform the grid search with functional programming
hyper_grid$rmse <- purrr::pmap_dbl(
  hyper_grid,
  ~ model_fit(
    n.trees = ..1,
    shrinkage = ..2,
    interaction.depth = ..3
  )
)
# results
arrange(hyper_grid, rmse)
```
```{r}
# update the model with the tuned interaction depth
gbm.btc2 <- gbm(label ~ ., data = btctrain,
                distribution = "bernoulli",
                n.trees = perf_gbm1,
                shrinkage = 0.05,
                interaction.depth = 5,
                cv.folds = 10)
## Which variables are important
summary(gbm.btc2)
```
```{r}
## Training error
message("Predicted classification probabilities of the first ten rows:")
pred1gbm <- predict(gbm.btc2, newdata = btctrain, n.trees = perf_gbm1, type = "response")
pred1gbm[1:10]
message("Predicted label values of the first ten rows:")
y1hat <- ifelse(pred1gbm < 0.5, 0, 1)
y1hat[1:10]
message("The training error is: ", sum(y1hat != y1) / length(y1))
## Testing error
y2hat <- ifelse(predict(gbm.btc2, newdata = btctest[, -9], n.trees = perf_gbm1,
                        type = "response") < 0.5, 0, 1)
message("The testing error is: ", mean(y2hat != y2))
boost <- mean(y2hat != y2)
```
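Beyond the raw error rate, a confusion matrix shows how the boosted model trades false positives against false negatives on the minority (ransomware) class; a minimal sketch using the predictions above:
```{r}
# confusion matrix for the boosted model on the test set
table(predicted = y2hat, actual = y2)
```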
## Random Forest
```{r}
library(randomForest)
library(caret)
# starting values for the tuning parameters
mtry_tune <- round(sqrt(8), 0)   # square root of the number of predictors
nodesize_tune <- 1
ntree_tune <- 500
control <- trainControl(method = "repeatedcv", number = 5)
storeMaxtrees <- list()
tuneGrid <- expand.grid(.mtry = mtry_tune)
# tune ntree by repeated cross-validation
for (ntree in c(500, 1000, 2000, 5000)) {
  set.seed(1)
  rf.maxtrees <- train(as.factor(label) ~ .,
                       data = btctrain,
                       method = "rf",
                       metric = "Accuracy",
                       tuneGrid = tuneGrid,
                       trControl = control,
                       importance = TRUE,
                       nodesize = nodesize_tune,
                       ntree = ntree)
  key <- toString(ntree)
  storeMaxtrees[[key]] <- rf.maxtrees
}
resultsTree <- resamples(storeMaxtrees)
res <- summary(resultsTree)
print(res)
# pick the ntree with the highest mean cross-validated accuracy
ntree_tune <- res$models[which.max(res$statistics$Accuracy[, "Mean"])]
message("The tuned mtry value is: ", mtry_tune)
message("The tuned nodesize value is: ", nodesize_tune)
message("The tuned ntree value is: ", ntree_tune)
```
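caret can also visualize the `resamples` object directly, which makes the comparison across candidate `ntree` values easier to read than the printed summary; a minimal sketch:
```{r}
# compare cross-validated accuracy across ntree candidates
dotplot(resultsTree, metric = "Accuracy")
```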
```{r}
# F: Random Forest with the tuned parameters
modF <- randomForest(as.factor(label) ~ ., data = btctrain,
                     mtry = 3,
                     nodesize = 1,
                     ntree = 500,
                     importance = TRUE)
# check variable importance
importance(modF, type = 1)
importance(modF, type = 2)
varImpPlot(modF)
```
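To see how a single influential predictor drives the forest's predictions, `randomForest::partialPlot` can be used; a sketch assuming `income` ranks highly in the importance output above:
```{r}
# partial dependence of the predicted ransomware class on income
partialPlot(modF, pred.data = btctrain, x.var = "income", which.class = "1")
```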
```{r}
# prediction error on the test set
y2hatF <- predict(modF, btctest, type = "class")
message("The predicted testing error is: ", mean(y2hatF != y2))
rftest <- mean(y2hatF != y2)
```
## Baseline Methods
```{r}
# A: Logistic regression with stepwise variable selection
modA <- step(glm(label ~ ., data = btctrain, family = "binomial"), trace = 0)
summary(modA)
y2hatA <- ifelse(predict(modA, btctest[, -9], type = "response") < 0.5, 0, 1)
message("The testing error is: ", sum(y2hatA != y2) / length(y2))
steplog <- mean(y2hatA != y2)
```
```{r}
# B: Linear Discriminant Analysis
library(MASS)
modB <- lda(btctrain[, 1:8], btctrain[, 9])
print(modB)
y2hatB <- predict(modB, btctest[, -9])$class
message("The testing error is: ", mean(y2hatB != y2))
ldiscrim <- mean(y2hatB != y2)
```
```{r}
# C: Naive Bayes (with the full set of predictors)
library(e1071)
modC <- naiveBayes(as.factor(label) ~ ., data = btctrain)
summary(modC)
y2hatC <- predict(modC, newdata = btctest, type = "class")
message("The testing error is: ", mean(y2hatC != y2))
nbayes <- mean(y2hatC != y2)
```
```{r}
# E: a single classification tree
library(rpart)
modE0 <- rpart(label ~ ., data = btctrain, method = "class",
               parms = list(split = "gini"))
# prune at the cp value that minimizes cross-validated error
opt <- which.min(modE0$cptable[, "xerror"])
cp1 <- modE0$cptable[opt, "CP"]
modE <- prune(modE0, cp = cp1)
summary(modE)
y2hatE <- predict(modE, btctest[, -9], type = "class")
message("The testing error is: ", mean(y2hatE != y2))
singletree <- mean(y2hatE != y2)
```
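Plotting the pruned tree makes the fitted splits easier to read than the `summary` output; a minimal sketch with base graphics, assuming the pruned tree retains at least one split:
```{r}
# visualize the pruned classification tree
plot(modE, uniform = TRUE, margin = 0.1)
text(modE, use.n = TRUE, cex = 0.8)
```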
## Results
```{r}
# print the testing error of each model
message("Testing Error of Each Model:")
testing_errors <- c(boost, rftest, steplog, ldiscrim, nbayes, singletree)
models <- c("Boosting", "Random Forest", "Stepwise Logistic",
            "Linear Discriminant Analysis", "Naive Bayes", "Single Tree")
results_table <- data.frame(model = models, testing_error = testing_errors)
print(results_table)
```
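A bar chart of the same table makes the comparison immediate; a sketch reusing `ggplot2`:
```{r}
# visualize testing error by model
ggplot(results_table, aes(x = model, y = testing_error)) +
  geom_col(fill = "#702963") + coord_flip() +
  xlab("Model") + ylab("Testing Error") +
  ggtitle("Testing Error by Model")
```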