---
title: "Template"
author: "Anna Grefhorst"
date: "`r Sys.Date()`"
output: html_document
---
```{r setup}
library(readr)
```
```{r load-data}
# Load the data. train_data should contain only train.csv.
train_data <- read.csv("/Users/whgrefhorst/Documents/Econometrie/Master/Machine Learning/train.csv")
target     <- train_data$target
predictors <- train_data[, 2:ncol(train_data)]
```
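The chunk above assumes that train.csv has a column named `target` and that the predictors sit in columns 2 onwards. The quick check below is a minimal sketch (not part of the original template) to confirm that assumption before splitting.

```{r check-data, eval=FALSE}
# Sanity check (sketch): confirm the dimensions and that the assumed "target" column exists
dim(train_data)
stopifnot("target" %in% names(train_data))
summary(target)
```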
```{r k-fold-split, echo=FALSE}
# Split the training data into k folds for cross-validation
set.seed(123)  # for reproducibility
k <- 10   # number of folds (you can change this!)
fraction_test     <- 0.2
fraction_training <- 0.8

# Hold out the last 20% of rows (in file order) as a test set
rows_to_keep <- floor(fraction_training * nrow(train_data))
test_data    <- train_data[(rows_to_keep + 1):nrow(train_data), ]
train_data   <- train_data[1:rows_to_keep, ]
n <- rows_to_keep  # total amount of training data

# Split the shuffled training data into k roughly equal parts
shuffled_indices <- sample(1:n)
split_size <- floor(n / k)
split_data <- split(train_data[shuffled_indices, ],
                    ceiling(seq_along(shuffled_indices) / split_size))
length(split_data)

# If n is not divisible by k there is one extra, smaller split: merge it into fold k
if (length(split_data) > k) {
  split_data[[k]] <- rbind(split_data[[k]], split_data[[k + 1]])
  split_data <- split_data[-(k + 1)]
}

for (i in 1:k) {
  print(paste0("fold: ", i, "/", k))
  # Create the validation and training data for this fold
  validation_data <- split_data[[i]]
  # Combine the other k - 1 folds into one data frame; the validation fold already is one
  training_data <- do.call(rbind, split_data[-i])
  print(nrow(validation_data))
  print(nrow(training_data))
  # Do whatever you want here (an example sketch follows below)
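  # --- Example sketch, not part of the template: assumes train.csv has a numeric
  # --- "target" column; a linear model is used purely for illustration ---
  # fit <- lm(target ~ ., data = training_data)
  # predictions <- predict(fit, newdata = validation_data)
  # fold_mse <- mean((validation_data$target - predictions)^2)
  # print(paste0("validation MSE: ", fold_mse))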
}
# The held-out test set (test_data) still contains the remaining 0.2 fraction of the rows
print("finished")
```
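
For reference, an equivalent way to assign folds is to draw a random fold label per row and split on it, which avoids the remainder handling above. A minimal sketch (not part of the original template), assuming `train_data` and `k` as defined earlier:

```{r alternative-split, eval=FALSE}
# Draw a random fold label (1..k) for every row; leftover rows are spread over the folds
fold_id <- sample(rep(1:k, length.out = nrow(train_data)))
split_data_alt <- split(train_data, fold_id)
sapply(split_data_alt, nrow)  # fold sizes differ by at most one row
```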