-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathsigFeature.R
234 lines (211 loc) · 13.5 KB
/
sigFeature.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
DESCRIPTION <- "identify the features which make a group withen an enumeration of the data unique."
#' Takes a data set of features of interest and a feature to group on and returns a matrix of group vs factors with
#' a value representing how unique the factor is. The higher the number, the more likely it better describes the group,
#' differentiating it from the groups.
#' WARNING: df must only contain logicals which roll up to factors. Thats the majority of the VERISR data, but not all of it.
#' WARNING: The distribution comparison algorithm assumes a sum of 100% across a vector. Since we have some groups that don't have
#' the feature at all, and some that have more than 100% (say when 2 values for action.hacking.variety are chosen)
#' the numbers are not completely accurate and should only be used as a guide for exploring the data.
#' WARNING: Due to the vectors not being exactly as expected as described above, some "Inf" and some "NA"/"NaN" will appear in the results.
#' I'm not yet sure of the best way to interpret them.
#' Inf appears to be when the group is completely in the non-group. e.g. group: c(1,0,1,0) non-group: c(1,5,1,0)
#' "NA"/"NaN" may equate to "does not exist in the group". Either may indicate relevance.
#' NOTE: I'm handling factors but not hierarchicals like the DBIR actually has
#' maybe the bayesian stuff in the entropy package can help with this.
#' NOTE: For continuous variables, I could notionally use em2d(a,b) from emdist (earth movers dist package)
#' problem 1: that seems to require 'binning' the vector
#' problem 2: the values max at the distances so c(1,0,0) to c(0,0,1) would be 3. That needs to be normalized
#' NOTE: This uses a nested 'for' loop. I'm sorry.
#' NOTE: This takes a second to run. See note about nested 'for' loops and me being sorry.
#'
#' @param df A verisr object subsetted to feature columns of interest plus the \code{group_feature} column(s)
#' @param group_feature The feature you want to understand how the patterns are different in. Maybe be a single column like 'pattern' or a column part of a set of columns like 'action.malware.variety.Other'
#' @return a matrix of the levels of group_feature vs the other features with values representing the significance of the feature to the level.
#' @examples
#' sf <- vz %>% select(contains("variety"), contains("vector"), pattern) %>% sigFeatures("pattern")` will let us know what
#' features relevant per pattern. `sf["Cyber-Espionage", ] %>% sort(decreasing=TRUE) %>% View()` will produce the following:
#' 1 action.hacking.variety Inf
#' 2 actor.external.variety 5.492158e+02
#' 3 attribute.confidentiality.data.variety 7.418538e+01
#' 4 attribute.availability.variety 6.560892e+01
#' 5 asset.assets.variety 5.287068e+01
#' ...
#' To understand "Cyber-Espionage" we should therefore subset to it and enumerate "action.hacking.variety" or "action.external.variety":
#' vz %>% filter(pattern.Cyber-Espionage) %>% getenum("action.external.variety")
#' vz %>% filter(!pattern.Cyber-Espionage) %>% getenum("action.external.variety")
#' incidents <- vz %>% select(timeline.incident.year,
#' plus.dbir_year,
#' contains("variety"),
#' contains("vector"),
#' starts_with("attribute.confidentiality.data_disclosure"),
#' starts_with("data_discovery"),
#' matches("^victim.(industry2.|employee_count|orgsize)"),
#' matches("timeline.*.unit.*"))
#' breaches <- incidents %>% filter(attribute.confidentiality.data_disclosure.Yes)
#' sf <- breaches %>% select(-timeline.incident.year, -plus.dbir_year) %>% sigFeatures("action.malware.vector.Email link")
#' sf["Email link", ] %>% melt() %>% add_rownames() %>% arrange(desc(value))
#' action.hacking.variety Inf
#' actor.external.variety Inf
#' asset.assets.variety Inf
#' action.malware.vector 224.46550114
#' timeline.compromise.unit 35.83258006
#' victim.industry2 4.51195209
#' action.malware.variety 3.77334349
#' ...
#' vz %>% filter(`action.malware.vector.Email link`) %>% getenum("timeline.compromise.unit")
#' enum x n freq
#' 1: Days 11 49 0.22448980
#' 2: Hours 2 49 0.04081633
#' 3: Minutes 1 49 0.02040816
#' 4: Months 0 49 0.00000000
#' 5: NA 15 49 0.30612245
#' 6: Never 0 49 0.00000000
#' 7: Seconds 11 49 0.22448980
#' 8: Unknown 7 49 0.14285714
#' 9: Weeks 2 49 0.04081633
#' 10: Years 0 49 0.00000000
#' vz %>% filter(`action.malware.vector.Email link`) %>% select(starts_with("timeline.compromise.unit"), timeline.incident.year) %>% getTSenum(depth=4, table="x", transpose=TRUE)
#' Days Hours Minutes Months NA Never Seconds Unknown Weeks Years
#' 2010 4 0 0 0 0 0 0 0 0 0
#' 2011 0 0 0 0 0 0 1 0 1 0
#' 2012 0 1 0 0 0 0 5 3 0 0
#' 2013 3 0 1 0 4 0 5 3 0 0
#' 2014 4 1 0 0 11 0 0 1 1 0
sigFeatures <- function(df, group_feature) {
require(qdapTools)
require(tidyr)
require(reshape2)
require(entropy)
require(dplyr)
# Raise errors basedon df size
if (nrow(df[df[[group_feature]], ]) == 0) {
stop(paste0(group_feature, " not present in df."))
}
if (nrow(df[!df[[group_feature]], ]) == 0) {
stop(paste0("No rows exist where ", group_feature, " is not present in df so no rows to compare to."))
} else {
# if the group_feature is logical, it needs to be part of the chunk_gather
if (class(df[[group_feature]]) == "logical") {
chunk_group_feature <- strsplit(group_feature, "[.](?!.*[.])", perl=T)[[1]][1]
# Get the count of records by group_feature to divide by
group_feature_sums <- df %>% select(starts_with(chunk_group_feature)) %>% colSums() %>% as.data.frame() %>% add_rownames()
# split the group feature column into key and value (list of vector of 2 as a column)
group_feature_sums[[1]] <- strsplit(group_feature_sums[[1]], "[.](?!.*[.])", perl=T)
# keep only the 2nd item and get back to a vector
group_feature_sums[[1]] <- sapply(group_feature_sums[[1]], function(x) {x[2]})
# filter where the group feature doesn't exist in the data
# This is because those rows could implicitly have any value for this enumeration.
# NOTE: This should leave any row with the major enumeration. (e.g. if feature is action.hacking.variety.DoS, all action.*.variety.* should be included)
# Doing so will cause issues later on in the row duplication block immediately below.
group_feature_rows <- df %>% select(starts_with(chunk_group_feature)) %>% apply(MARGIN=1, any)
#df <- df[group_feature_rows,] # This line was causing errors and, anyway `df <- df[rep.int(seq(nrow(df)), times=l) ,]` below accomplishes the same thing
# get a column to represent the combination of the group_feature function
# create a feature with a list of the group_feature that are trust
df[[chunk_group_feature]] <- df %>% select(starts_with(chunk_group_feature)) %>% apply(MARGIN=1, function(x) {names(x[x==TRUE])})
# unwind that list to a strait vector of all true features
group <-unlist(df[[chunk_group_feature]])
# separate group to only the last section
group <- sapply(group, function(x) {ret <- strsplit(x, "[.](?!.*[.])", perl=T); ret[[1]][2]})
# get the number of true features per row
l <- sapply(df[[chunk_group_feature]], length)
# l <= 0 won't work
if (length(l) <= 1) {
stop(paste("The feature to analyze,", chunk_group_feature, "has", l, "values. It must have at least 2 values."))
}
# duplicate each row by the number of true features
df <- df[rep.int(seq(nrow(df)), times=l) ,]
# replace the list of features per row with a single feature per row
df[[chunk_group_feature]] <- group
} else {
if (length(unique(df[[group_feature]])) <= 1) {
stop(paste("The feature to analyze,", group_feature, "has", l, "values. It must have at least 2 values."))
}
chunk_group_feature <- group_feature
# Get the count of records by group_feature to divide by
group_feature_sums <- df %>% group_by_(chunk_group_feature) %>% summarize(count=n()) %>% as.data.frame()
}
# identify the logical and illogical columns to be gathered
logical_columns <- names(df)[sapply(df, is.logical)]
illogical_columns <- names(df)[sapply(df, function(x) {!is.logical(x)})]
### Gather the Logicals ###
if (length(logical_columns) > 0) {
# gather to group_feature, key, value sets
#chunk_gather <- df[ , colnames(df) %in% c(logical_columns, chunk_group_feature), with=F] %>%
#chunk_gather <- df[ , colnames(df) %in% c(logical_columns, chunk_group_feature)] %>%
chunk_gather <- df %>% dplyr::select(one_of(c(logical_columns, chunk_group_feature))) %>%
gather_("enum", "value", setdiff(logical_columns, chunk_group_feature)) %>%
filter(!is.na(value)) %>%
filter(value)
# get to factors rather than logicals
chunk_gather <- chunk_gather %>% select(-value) %>% separate(enum, c("enum", "value"), sep = "[.](?!.*[.])", extra="merge")
# Add the counts back in
chunk_gather$sum <- lookup(chunk_gather[[chunk_group_feature]], as.data.frame(group_feature_sums))
}
### End Gather the Logicals ###
### Gather the Factors ###
if (length(illogical_columns) > 1) {
# illogical..., as in character, factor, numeric columns. get it?! GET IT?!!
#illogical_chunk_gather <- df[ , !colnames(df) %in% setdiff(logical_columns, chunk_group_feature), with=F] %>%
#illogical_chunk_gather <- df[ , !colnames(df) %in% setdiff(logical_columns, chunk_group_feature)] %>%
illogical_chunk_gather <- df %>% dplyr::select(one_of(setdiff(names(.), setdiff(logical_columns, chunk_group_feature)))) %>%
gather_("enum", "value", setdiff(colnames(df), c(logical_columns, chunk_group_feature)))
# Add the counts back in
illogical_chunk_gather$sum <- lookup(illogical_chunk_gather[[chunk_group_feature]], as.data.frame(group_feature_sums))
# Make sure value is a character vector
illogical_chunk_gather$value <- as.character(illogical_chunk_gather$value)
### End Gather the Factors ###
# Bind the two
chunk_gather <- rbind(chunk_gather, illogical_chunk_gather)
}
# build a results matrix
results <- matrix(data=0,
nrow=length(unique(chunk_gather[[chunk_group_feature]])),
ncol=length(unique(chunk_gather$enum)),
dimnames=list(unique(chunk_gather[[chunk_group_feature]]), unique(chunk_gather$enum)))
# I feel dirty. hope to replace these for loops with something more R'y
for (feature in unique(chunk_gather$enum)) {
# Build the table for a single feature
chunk_cast <- chunk_gather %>%
# select the feature
filter(enum==feature) %>% #ERROR: Replace "action.error.variety"
# get rid of the enumeration column as we subsetted to it
group_by_(chunk_group_feature, "value") %>%
# convert counts to percentages
summarize(sum=n()) %>%
# create a matrix dataframe that can then have rows summed and compared to all other rows.
reshape2::dcast(list(chunk_group_feature, "value"), value.var="sum", fill=0)
# get cluster as row names
if (any(is.na(chunk_cast[[chunk_group_feature]]))) {
stop("The Group Feature cannot have any values of 'NA'")
} else {
rownames(chunk_cast) <- chunk_cast[[chunk_group_feature]]
}
feature_chunk_cols <- unique(chunk_gather[chunk_gather$enum == feature, ]$value)
feature_chunk_rows <- unique(chunk_gather[[chunk_group_feature]])
feature_chunk <- matrix(data=0,
nrow=length(feature_chunk_rows),
ncol=length(feature_chunk_cols),
dimnames=list(feature_chunk_rows, feature_chunk_cols)) %>%
as.data.frame()
feature_chunk[rownames(chunk_cast), colnames(feature_chunk)] <- chunk_cast[,setdiff(colnames(chunk_cast), chunk_group_feature)]
feature_chunk <- as.matrix(feature_chunk)
if (ncol(feature_chunk) > 2) {
for (group in unique(chunk_gather[[chunk_group_feature]])) {
g <- feature_chunk[as.character(group), ]
if (nrow(feature_chunk) > 2) {
not_g <- colSums(feature_chunk[setdiff(rownames(feature_chunk), as.character(group)), ]) %>% as.vector()
} else {
not_g <- feature_chunk[setdiff(rownames(feature_chunk), as.character(group)), ] %>% as.vector()
}
# checks if not_g is all 0's. if it is, chi2 will fail
if ((length(not_g) != 0) && (unique(not_g) != 0)) {
results[as.character(group), feature] <- 0.5 * chi2.plugin(g, not_g)
} else {
results[as.character(group), feature] <- NaN
}
}
}
}
results
}
}