-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsplit_SEER_dataset.R
32 lines (27 loc) · 1.15 KB
/
split_SEER_dataset.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
#!/usr/bin/env Rscript
#' ----------------------------------------------------------------------------
#' title: split_SEER_dataset.R
#' description:
#' Script to split the CSV file with the SEER dataset (SeerMetHeader.csv)
#' into two separate files (Dataset1.csv and Dataset2.csv) that can be used
#' to test the distributed learning algorithm and infrastructure.
#' author:
#' Melle Sieswerda <[email protected]>
#' Anna van der Zalm <[email protected]>
#' Gijs Geleijnse <[email protected]>
#' date: 09-may-2018
#' license: MIT License
#' ----------------------------------------------------------------------------
# Input/output locations and the (1-based) row of dataset 1 that is handed
# over to dataset 2.
input_file <- "SeerMetHeader.csv"
output_file1 <- "Dataset1.csv"
output_file2 <- "Dataset2.csv"
row_to_move <- 12L

# Load the original (full) CSV file. The data is kept as a data frame instead
# of being converted with as.matrix(): for a data frame holding any
# non-numeric column, as.matrix() returns a character matrix in which the
# numeric columns have been passed through format(), so they would be written
# back as quoted, width-padded strings.
full_data <- read.csv(input_file)

# Odd rows go into dataset 1, even rows into dataset 2. A logical mask is
# used so that very short inputs do not trip seq(); drop = FALSE keeps a
# single-column input as a data frame instead of collapsing it to a vector.
is_odd_row <- seq_len(nrow(full_data)) %% 2 == 1
data1 <- full_data[is_odd_row, , drop = FALSE]
data2 <- full_data[!is_odd_row, , drop = FALSE]

# Fail with a clear message when dataset 1 is too short to give up the
# requested row.
if (nrow(data1) < row_to_move) {
  stop(
    "Dataset 1 has only ", nrow(data1), " row(s); cannot move row ",
    row_to_move, " to dataset 2.",
    call. = FALSE
  )
}

# Move one row from dataset 1 to the end of dataset 2 so that the two
# datasets end up with an unequal number of event times.
moved_row <- data1[row_to_move, , drop = FALSE]
data1 <- data1[-row_to_move, , drop = FALSE]
data2 <- rbind(data2, moved_row)

# Write the end result to disk (row names are bookkeeping only, so they are
# left out of the files).
write.csv(data1, file = output_file1, row.names = FALSE)
write.csv(data2, file = output_file2, row.names = FALSE)