-
Notifications
You must be signed in to change notification settings - Fork 1
/
canary_core.R
305 lines (264 loc) · 16 KB
/
canary_core.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
#########################################################################
##
## Offline mode of CANARY: A Water Quality Event Detection Tool
## https://software.sandia.gov/trac/canary
## implemented in R. At the moment its not fully implemented. Pattern
## library and baseline change are missing.
##
## Version 1.0
## Copyright (c) 2018 Christian Hoeffer
##
## This program is free software: you can redistribute it and/or modify
## it under the terms of the GNU General Public License as published by
## the Free Software Foundation, either version 3 of the License, or
## (at your option) any later version.
##
## This program is distributed in the hope that it will be useful,
## but WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
## GNU General Public License for more details.
##
## You should have received a copy of the GNU General Public License
## along with this program. If not, see <http://www.gnu.org/licenses/>.
##
#########################################################################
#define function offline_mode with given default values which can be called from another R script
offline_mode <- function(loaded_data=NULL,vals=NULL,window_size=2000,threshold=1.5,used_algo=lm,bed_window=5,prob_bed=0.01,bed_thresh=0.975,debug_flag=FALSE,debug_number_iteration=2040,...){
#how to pass variables to a function
#https://cran.r-project.org/doc/manuals/r-devel/R-intro.html#Named-arguments-and-defaults
#https://cran.r-project.org/doc/manuals/r-devel/R-intro.html#The-three-dots-argument
### check loaded_data
#check if data object is empty or not passed to the function
if(is.null(loaded_data)){
stop("Function needs data object loaded_data to run.")
}
#check if passed data object is a data.frame
if(!is.data.frame(loaded_data)){
stop("Function needs data object loaded_data in the form of a data.frame to run properly.")
}
#check if one or more NA values are in the data.frame; anyNA instead of any(is.na) is used, see
#https://stackoverflow.com/questions/6551825/fastest-way-to-detect-if-vector-has-at-least-1-na/35713234#35713234
if(anyNA(loaded_data)){
warning("Given loaded_data data.frame contains one ore more NA values! This influences the behaviour of the used_algo and other parts of the code.")
}
#check if first column of the data object is of data type factor which indicates the timestamp column
if(is.factor(loaded_data[0,1])){
#remove timestamps from the data object (mostly first column), it will be added later again
subset_loaded_data <- loaded_data[,-1]
}
else{
stop("Unexpected structure in the given loaded_data data.frame. Function offline_mode of canary_core.R expects data type factor in the first column of loaded_data indicating that the first column contains the timestamps. Otherwise it will not
run properly.")
}
### check vals
#check if vals is empty or not passed to the function
if(is.null(vals)){
stop("Function needs data object vals to run.")
}
#check if passed data object is a vector
if(!is.vector(vals)){
stop("Function needs data object vals in the form of a vector to run properly")
}
#check if one or more elements in the vector vals are NOT column names of the loaded_data
if(any(!(is.element(vals,colnames(loaded_data))))){
stop("One or more elements in the vector vals are NOT column names of the loaded_data data.frame. In this case function offline_mode will not run properly.")
}
#check if one or more elements of the vector vals are duplicated. This will break the the creation of the data frames and also later causes trouble if assigning results to one specific column name.
if(any(duplicated(vals))){
stop("One or more elements of the vector vals are duplicated. In this case function offline_mode will not run properly.")
}
#check if debugging/testing mode is selected or not
if(debug_flag==TRUE){
#limit number_iteration to 2030 (default, set by debug_number_iteration) and just loop over the first 2030 rows of the given loaded_data data.frame
#for fast testing different used_algo or algoAddParam on a loaded_data or other testing of this function
number_iteration<-debug_number_iteration
}
if(debug_flag==FALSE){
#loop over the whole length of the given loaded_data data.frame which means analyze all rows
number_iteration<-nrow(loaded_data)
}
#check if window size + bed_windows is smaller number_iteration; if not the function offline_mode will not work properly
if((window_size+bed_window)>number_iteration){
stop("window size + bed_window=",window_size+bed_window," is bigger than number_iteration=",number_iteration,". In this case function offline_mode will not run properly.")
}
### BED (Binomial Event Discriminator)
#define a the function for BED see: https://www.osti.gov/servlets/purl/1267022
binevdis <- function(r,n,p){
part1 <- factorial(n)/(factorial(r)*factorial(n-r))
part2 <- (p^r)*(1-p)^(n-r)
return(part1*part2)
}
############### create data.frames which are needed in the function ###############
### create empty data.frame as a blueprint for adding rows of the loaded_data while filling the window
subset_used <- subset_loaded_data[0,]
### create empty data.frame as a blueprint for adding the timestamps of the loaded_data while filling the window
subset_used_timestamps <- loaded_data[0,1, drop = FALSE]
#create empty data frame for residuals and standard deviation of each column, defined in vals, in the window
temp_res_sd<-setNames(as.data.frame(matrix(ncol = length(vals), nrow = 2)),vals)
row.names(temp_res_sd)<-c("res","sd")
### create data.frame for residuals of each selected column (defined in vals)
#create empty variable
used_names_res <- NULL
#fill variable with the needed names for the residuals data.frame
#https://stackoverflow.com/questions/6984796/how-to-paste-a-string-on-each-element-of-a-vector-of-strings-using-apply-in-r
used_names_res <- paste0(vals, "_residual")
#create data.frame for residuals for each selected column and assign the needed column names to the residual data.frame
#https://stackoverflow.com/a/32712555
calc_residuals <- setNames(as.data.frame(matrix(ncol = length(used_names_res), nrow = 0)),used_names_res)
#fill columns XXX_residual with NA until first calculation (depending on the window size)
calc_residuals[1:window_size,] <- as.numeric(NA)
### create data.frame for if event or not of each selected column (defined in vals)
#create empty variable
used_names_events <- NULL
#fill variable with the needed names for the column specific event data.frame
used_names_events <- paste0(vals, "_event")
#create data.frame for outlier detection saving for each selected column and assign the needed column names to the column specific event data.frame
events_col <- setNames(as.data.frame(matrix(ncol = length(used_names_events), nrow = 0)),used_names_events)
#fill columns XXX_event with NA until first calculation (depending on the window size)
events_col[1:window_size,] <- NA
### create data.frame for each row if event or not (like canary does it)
#create data.frame for outlier detection saving
events <- read.csv(text="Event")
#fill column Event with NA until first calculation (depending on the window size)
events[1:window_size,] <- NA
### create data.frame for BED for each selected column (defined in vals)
#create empty variable
used_names_bed <- NULL
#fill variable with the needed names for the column specific probability of an event data.frame and assign the needed column
#names to the column specific probability of an event data.frame
used_names_bed <- paste0(vals, "_prob_event")
#create data.frame for BED results saving for each column used
events_col_bed <- setNames(as.data.frame(matrix(ncol = length(used_names_bed), nrow = 0)),used_names_bed)
#fill columns XXX_prob_event with NA until first calculation (depending on the window_size and bed_window)
events_col_bed[1:(window_size+bed_window),] <- as.numeric(NA)
### create data.frame for each row for BED probability (like canary does it)
#create data.frame for saving the probabilities of an event
events_bed <- read.csv(text="Prob_event")
#fill column Event with NA until first calculation (depending on the window_size and bed_window)
events_bed[1:(window_size+bed_window),] <- as.numeric(NA)
### create data.frame for each row for final decision if event or not
#create data.frame for saving the probabilities of an event
events_final <- read.csv(text="Final_event")
#fill column Event with NA until first calculation (depending on the window size)
events_final[1:(window_size+bed_window),] <- NA
############### loop over the number of rows (defined by number_iteration) in the loaded_data data.frame ###############
#fill subset_used until window is filled with enough observations
subset_used <- subset_loaded_data[1:(window_size-1),]
#subset_used_return will be used later for returning the original values from the loaded_data
#because the data in subset_used will be modified during the for loop
subset_used_return <- subset_loaded_data[1:number_iteration,]
#fill subset_used_timestamps
subset_used_timestamps <- loaded_data[1:number_iteration,1,drop = FALSE]
#start loop of the canary offline mode with adding each time a new row/element to the window and perform the calculations
for(row in window_size:number_iteration){
#add new row to subset_used and subset_used_print
subset_used <- rbind(subset_used,subset_loaded_data[row,])
#reset all values of temp_res_sd for next loop
temp_res_sd[]<-NA
#take the last observations of subsed_used which fit the window_size, so the observations which are in the window
defined_window <- tail(subset_used,window_size)
#scale() returns NaN in some cases if window size is too small, see https://stackoverflow.com/questions/15363610/why-does-scale-return-nan-for-zero-variance-columns
#and therefore windowssize has to be big enough to avoid this!
#normalize data; scale returns a matrix object, therefore the object is transformed back to a data.frame
scaled_defined_window <- as.data.frame(scale(defined_window))
#check if one or more NaN values are in the scaled_defined_window (as.matrix is needed because this function does not work with a data.frame)
if(any(is.nan(as.matrix(scaled_defined_window)))){
stop("Normalized data in the window contains NaN values. This might crash the used_algo, therefore function offline_mode stops")
}
#remove last added row
scaled_defined_window_last_row_rm <- head(scaled_defined_window, -1)
#exclude the last added row
scaled_defined_window_last_row <- tail(scaled_defined_window, 1)
#calculations for the residuals and events for all selected columns (defined in vals)
for(i in 1:length(vals)){
#assign name
used_name <- vals[i]
#train selected algorithm (defined by used_algo);formula has to be inserted as ONE text, therefore paste is used and in addation
#as.formula() is used to transform the text to a formula so that it is recognized by algorithms as type formula
#unlist() unpacks the additional parameters which can be passed to the selected algorithm
used_model <- used_algo(formula = as.formula(paste0(used_name, " ~ . ")), data = scaled_defined_window_last_row_rm,...)
pred_value <- predict(used_model, scaled_defined_window_last_row)
#calculate residual between predicted and real value. unname() is used because return value is a named num and abs() is used to get absolut value
temp_res_sd["res",used_name] <- abs(unname(pred_value-scaled_defined_window_last_row[,used_name]))
#assign residual to the apppropriate column in the calc_residuals data.frame
calc_residuals[row,paste0(used_name, "_residual")] <- temp_res_sd["res",used_name]
#calculate sd for this column in the window
temp_res_sd["sd",used_name]<-sd(defined_window[,used_name],na.rm = TRUE)
#print(sd(defined_window[,"B_TOC_VAL"],na.rm=TRUE))
#check if outlier or not for each column separately (residual classification) and store it in the events_col data.frame
#additional functionality which is added and is not in CANARY
#check for NA values because they brake the if statement
if(!is.na(temp_res_sd["res",used_name])){
#outlier
if(temp_res_sd["res",used_name]>threshold*temp_res_sd["sd",used_name]){
#add event = TRUE to events_col data.frame
events_col[row,paste0(used_name,"_event")]=TRUE
}
#backround
if(temp_res_sd["res",used_name]<=threshold*temp_res_sd["sd",used_name]){
#add event = FALSE to events_col data.frame
events_col[row,paste0(used_name,"_event")]=FALSE
}
}
else{
#add event = NA to events_col data.frame
events_col[row,paste0(used_name,"_event")]=NA
}
### BED analysis for each column
#check if all of the last rows in events_col, defined by the bed_window, contain no NA values -> so are valid for BED analysis
#additional functionality which is added and is not in CANARY itself
if(!any(is.na(tail(events_col[,paste0(used_name,"_event")],bed_window)))){
#how often was an event detected in the bed_window? event = TRUE https://stackoverflow.com/questions/2190756/how-to-count-true-values-in-a-logical-vector
occur_col_true<-sum(tail(events_col[,paste0(used_name,"_event")],bed_window),na.rm=TRUE)
#Calculation of the probability of an event, therefore 1-binevdis
events_col_bed[row,paste0(used_name, "_prob_event")]<-1-binevdis(occur_col_true,bed_window,prob_bed)
}
}
#find biggest residual in temp_res_sd and compare it with threshold
index_biggest_res<-which.max(temp_res_sd["res",])
max_rs<-temp_res_sd["res",index_biggest_res]
#check if event or not (Result outlier Classification)
#check for existence of NA values because they break the if statement
if(length(max_rs) > 0){
#outlier
if(max_rs>threshold*temp_res_sd["sd",index_biggest_res]){
#add event = TRUE to events data.frame
events[row,"Event"] <- TRUE
#remove value from the column of subset_used which is in this row the source for the outlier classification so that it is excluded from
#the values used to predict water quality at next time step needs to be checked if assigning there NA is the proper way to solve it or not
subset_used[row,colnames(temp_res_sd[index_biggest_res])]<-NA
}
#backround
if(max_rs<=threshold*temp_res_sd["sd",index_biggest_res]){
#add event = FALSE to events data.frame
events[row,"Event"] <- FALSE
}
}
else{
#add event = NA to events data.frame
events[row,"Event"] <- NA
}
### BED analysis for each row (also final decision if event or not)
#check if all of the last rows in events, defined by the bed_window, contain no NA values -> so are valid for BED analysis
if(!any(is.na(tail(events[,"Event"],bed_window)))){
#how often was an event detected in the bed_window? event = TRUE https://stackoverflow.com/questions/2190756/how-to-count-true-values-in-a-logical-vector
occur_true<-sum(tail(events[,"Event"],bed_window),na.rm=TRUE)
#Calculation of the probability of an event, therefore 1-binevdis
temp_prob_event <- 1-binevdis(occur_true,bed_window,prob_bed)
events_bed[row,"Prob_event"] <- temp_prob_event
### check if probability of an event is bigger a chosen threshold (defined by bed_thresh)
#if yes, mark it as TRUE -> event
if(temp_prob_event>=bed_thresh){
events_final[row,"Final_event"]<-TRUE
}
#if not, mark it as FALSE -> NO event
if(temp_prob_event<=bed_thresh){
events_final[row,"Final_event"]<-FALSE
}
}
}
#combine all data.frames to one big data.frame for the results return value
results <- cbind(subset_used_timestamps,subset_used_return,calc_residuals,events_col,events,events_col_bed,events_bed,events_final)
#return the results data frame
return(results)
}