-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathyear2year.R
128 lines (108 loc) · 5.58 KB
/
year2year.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
# One-line, human-readable summary of what this script provides.
# NOTE(review): nothing in this file reads DESCRIPTION; presumably it is
# consumed by external tooling that lists the available analyses -- confirm.
DESCRIPTION <- "find the enumeration that changed the most significantly between years"
#' Identify which enumerations have changed most year-to-year in a verisr object
#'
#' For every column of \code{df} other than the year columns, computes the
#' share of records in which the enumeration is set for the "current" and the
#' "last" year, and reports the difference and the ratio of those two shares.
#'
#' Year selection:
#' \itemize{
#'   \item Current-year records are those with
#'     \code{plus.dbir_year == current_year} when \code{plus.dbir_year} exists
#'     in \code{df} and \code{current_year >= 2009}; otherwise those with
#'     \code{timeline.incident.year == current_year - 1}.
#'   \item Last-year records are always those with
#'     \code{timeline.incident.year == last_year - 1}.
#' }
#'
#' NOTE: df *must contain* timeline.incident.year. If plus.dbir_year exists,
#' it will be used (for the current year only).
#'
#' @param df A verisr object of logical columns to analyze. Apart from
#'   \code{timeline.incident.year} and \code{plus.dbir_year}, every column is
#'   summed with \code{colSums()}, so it must be logical or numeric.
#' @param current_year The year of interest. Defaults to the calendar year of
#'   \code{Sys.Date()}.
#' @param last_year The year to compare to. Defaults to \code{current_year - 1}.
#' @param filter Bool. If TRUE, only outlying enumerations are returned: those
#'   whose ratio (folded so that values below 1 are replaced by their
#'   reciprocal) exceeds the 97th percentile of all folded ratios, or whose
#'   absolute difference exceeds the 95th percentile of all absolute
#'   differences. No record-count threshold is applied.
#' @return A data frame with columns \code{Enumeration}, \code{Difference}
#'   (current share minus last share), \code{Ratio} (current share divided by
#'   last share; 0/0 is reported as 1) and \code{Count} (number of current-year
#'   records with the enumeration set), ordered by decreasing absolute
#'   \code{Difference}.
#' @examples
#' incidents <- vz %>% select(timeline.incident.year,
#'                            contains("variety"),
#'                            contains("vector"),
#'                            starts_with("attribute.confidentiality.data_disclosure"),
#'                            starts_with("data_discovery"),
#'                            matches("^victim.(industry2.|employee_count|orgsize)"),
#'                            matches("timeline.*.unit.*"))
#' breaches <- incidents %>% filter(attribute.confidentiality.data_disclosure.Yes)
#' y2y <- year2year(breaches)
year2year <- function(df, current_year=NULL, last_year=NULL, filter=TRUE) {
  # Fail early with a clear message rather than an opaque error raised from
  # inside the dplyr pipeline further down.
  if (!"timeline.incident.year" %in% colnames(df)) {
    stop("df must contain a timeline.incident.year column")
  }

  # Fold ratios below 1 onto their reciprocal so that a halving (0.5) and a
  # doubling (2) are treated as equally large changes.
  abs_ratio <- function(x) {
    x[x < 1] <- 1 / x[x < 1]
    x
  }

  # Set the years to compare
  if (is.null(current_year)) {
    current_year <- format(Sys.Date(), "%Y")
  }
  if (is.null(last_year)) {
    last_year <- as.character(as.numeric(current_year) - 1)
  }

  has_dbir_year <- "plus.dbir_year" %in% colnames(df)

  # Year based on dbir year if possible. `&&` (scalar, short-circuiting) is
  # the correct operator inside `if`.
  # Note: the `filter` argument is a logical, so `filter(...)` calls below
  # still resolve to the dplyr function.
  if (has_dbir_year && as.numeric(current_year) >= 2009) {
    current_df <- df %>% filter(plus.dbir_year == as.character(current_year))
  } else {
    current_df <- df %>%
      filter(timeline.incident.year == as.character(as.numeric(current_year) - 1))
  }
  if (nrow(current_df) == 0) {
    stop("No data for current year!")
  }

  # Since no-one coded plus.dbir_year in the 2014 data, the comparison year is
  # always selected by incident year.
  last_df <- df %>%
    filter(timeline.incident.year == as.character(as.numeric(last_year) - 1))
  if (nrow(last_df) == 0) {
    stop("No data for last year!")
  }

  # Drop the year columns so only enumeration columns remain
  if (has_dbir_year) {
    current_df <- current_df %>% select(-timeline.incident.year, -plus.dbir_year)
    last_df <- last_df %>% select(-timeline.incident.year, -plus.dbir_year)
  } else {
    current_df <- current_df %>% select(-timeline.incident.year)
    last_df <- last_df %>% select(-timeline.incident.year)
  }

  # Per-enumeration record counts for the current year (reported as Count)
  current_counts <- colSums(current_df)
  # get this year's breach percents
  current_df_pcts <- current_counts / nrow(current_df)
  # get last year's breach percents
  last_df_pcts <- colSums(last_df) / nrow(last_df)

  df_ratio <- current_df_pcts / last_df_pcts
  # 0/0 (absent in both years) means "no change"
  df_ratio[is.nan(df_ratio)] <- 1
  # NOTE(review): x/0 stays Inf, and abs_ratio() maps a ratio of 0 to Inf as
  # well. If enough enumerations are infinite the ratio cutoff below becomes
  # Inf and no row can exceed it. Capping, e.g.
  #   df_ratio[is.infinite(df_ratio)] <- max(finite ratios) * 1.20
  # might be useful, but would change results, so it is not applied here.

  # going from ratio to absolute difference
  df_diff <- current_df_pcts - last_df_pcts

  # Assemble the result. All three vectors come from the same columns in the
  # same order, so they line up element-wise; the enumeration names become the
  # row names and are then moved into the Enumeration column.
  # NOTE(review): add_rownames() is deprecated in dplyr >= 1.0.0 in favour of
  # tibble::rownames_to_column(); kept because tibble is not visibly imported.
  ret <- data.frame(
    Difference = df_diff,
    Ratio = df_ratio,
    Count = current_counts
  ) %>%
    add_rownames(var="Enumeration")

  if (filter) {
    # Need to choose outliers
    # The data is long tailed so no mean/SD
    ratio_percentiles <- quantile(abs_ratio(df_ratio), c(0.97))
    diff_percentiles <- quantile(abs(df_diff), c(0.95))
    # Keep a row if it is an outlier on either measure
    ret <- ret[abs_ratio(ret[["Ratio"]]) > ratio_percentiles[1] | abs(ret[["Difference"]]) > diff_percentiles[1], ]
    # No record-count filter is applied. Alternatives considered previously:
    # Count above its median, Count > 45 (as in the standard report), or
    # Count > nrow(df) / 1000.
  }

  # Order it, largest absolute change first
  ret <- ret %>% arrange(desc(abs(Difference)))

  # Return it
  ret
}