-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathdiscovery_to_containment.R
74 lines (59 loc) · 4.07 KB
/
discovery_to_containment.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
DESCRIPTION <- "Analyze discovery to containment time of breaches. First check the data to make sure it's useful for analysis. Then do all the stuff to calculate the difference. Then chart it in a histogram."

# Sanity checks ----
# Every check below looks at the same subset of `vz`: confirmed data
# disclosures with a positive discovery value and a positive containment value.

# How many incidents are available? Expect roughly 145 out of 200k.
vz %>%
  filter(
    attribute.confidentiality.data_disclosure.Yes,
    timeline.discovery.value > 0,
    timeline.containment.value > 0
  ) %>%
  dim()

# Enumerate actions in the subset to check they are not too biased.
vz %>%
  filter(
    attribute.confidentiality.data_disclosure.Yes,
    timeline.discovery.value > 0,
    timeline.containment.value > 0
  ) %>%
  getenum("action")
# (They aren't)

# Enumerate attributes in the subset to check they are not too biased.
vz %>%
  filter(
    attribute.confidentiality.data_disclosure.Yes,
    timeline.discovery.value > 0,
    timeline.containment.value > 0
  ) %>%
  getenum("attribute")
# (They aren't)
# Containment side ----
# Reshape the timeline.containment.unit.* flag columns to long form and keep,
# for each incident, the unit whose flag is TRUE (ignoring "Unknown" units),
# alongside the raw containment value.
chunk <- vz %>%
  filter(
    attribute.confidentiality.data_disclosure.Yes,
    timeline.discovery.value > 0 & timeline.containment.value > 0
  ) %>%
  select(
    starts_with("timeline.containment.unit."),
    timeline.containment.value,
    plus.master_id
  ) %>%
  gather(
    key = `Containment Unit`,
    value = `Containment Value`,
    -plus.master_id,
    -timeline.containment.value
  ) %>%
  # keep only rows whose unit flag is present and set
  filter(!is.na(`Containment Value`), `Containment Value`) %>%
  # the flag has served its purpose; free the name for the numeric value
  select(-`Containment Value`) %>%
  filter(!grepl("Unknown", `Containment Unit`)) %>%
  rename(`Containment Value` = timeline.containment.value)

# Discovery side ----
# Same reshaping as above, applied to the timeline.discovery.* columns.
temp <- vz %>%
  filter(
    attribute.confidentiality.data_disclosure.Yes,
    timeline.discovery.value > 0 & timeline.containment.value > 0
  ) %>%
  select(
    starts_with("timeline.discovery.unit."),
    timeline.discovery.value,
    plus.master_id
  ) %>%
  gather(
    key = `Discovery Unit`,
    value = `Discovery Value`,
    -plus.master_id,
    -timeline.discovery.value
  ) %>%
  # keep only rows whose unit flag is present and set
  filter(!is.na(`Discovery Value`), `Discovery Value`) %>%
  # the flag has served its purpose; free the name for the numeric value
  select(-`Discovery Value`) %>%
  filter(!grepl("Unknown", `Discovery Unit`)) %>%
  rename(`Discovery Value` = timeline.discovery.value)
# Join discovery and containment times ----
# An inner join keeps only incidents that have a known unit on BOTH sides.
# This replaces the earlier full join followed by `chunk[1:144, ]`, which
# removed the unmatched (NA) row by hard-coded position and would silently
# keep or drop the wrong rows as soon as the data set changed.
chunk <- inner_join(chunk, temp, by = "plus.master_id")

# Convert unit names into a number of days ----
# Days represented by one of each timeline unit (months and years are
# approximated as 30 and 365 days). The same table is used for containment and
# discovery, so "Seconds" is now handled on both sides.
days_per_unit <- c(
  Seconds = 1 / (24 * 60 * 60),
  Minutes = 1 / (24 * 60),
  Hours = 1 / 24,
  Days = 1,
  Weeks = 7,
  Months = 30,
  Years = 365
)

# Map gathered column names such as "timeline.discovery.unit.Hours" to the
# number of days per unit. Accepts character or factor input (the type of a
# gathered key column depends on the tidyr version / `factor_key`), and returns
# NA for any unit that is not in `days_per_unit`.
unit_to_days <- function(unit_column) {
  unit_name <- sub(
    "^timeline\\.[a-z]+\\.unit\\.", "",
    as.character(unit_column)
  )
  unname(days_per_unit[unit_name])
}

chunk$`Containment Unit` <- unit_to_days(chunk$`Containment Unit`)
chunk$`Discovery Unit` <- unit_to_days(chunk$`Discovery Unit`)

# Calculate the differential ----
# `Discovery Days` is the discovery time expressed in days (previously stored
# under the misleading name `Differential Days`, while the median below read
# the non-existent `Discovery Days`).
chunk <- chunk %>%
  mutate(
    differential = `Containment Value` * `Containment Unit` -
      `Discovery Value` * `Discovery Unit`,
    `Discovery Days` = `Discovery Value` * `Discovery Unit`
  )

# Since sometimes containment is from discovery & sometimes it's from
# compromise, we'll fix that: keep the incidents whose containment time is
# shorter than their discovery time and use the containment time itself as the
# discovery-to-containment differential.
# NOTE(review): incidents with differential >= 0 are dropped entirely here
# rather than kept with their computed differential - confirm this is intended.
chunk <- chunk %>%
  filter(differential < 0) %>%
  mutate(differential = `Containment Value` * `Containment Unit`)

# Median time to discovery
median(chunk$`Discovery Days`)
# Median time from discovery to containment
median(chunk$differential)
# (notice data is long-tailed)
# Histogram of discovery-to-containment time (in days) with the median marked.
gg <- ggplot(chunk, aes(x = differential)) +
  geom_histogram() +
  # `color` is set as a constant OUTSIDE aes(): inside aes() the string "red"
  # is treated as data, mapped through the default colour scale (so the line is
  # not actually red) and adds a meaningless legend.
  geom_vline(aes(xintercept = median(differential)), color = "red") +
  # Label for the median line; position is hand-placed just below the x axis.
  annotate("text", x = 9, y = -1, label = "Median", color = "red") +
  labs(
    x = "Days",
    y = "Incidents",
    title = "Discovery to Containment Time (Incidents)"
  ) +
  theme_hc()
gg