Skip to content

Commit

Permalink
v1.8.3 New & improved Data Connectors and Baseline Runner (#238)
Browse files Browse the repository at this point in the history
* [DC] Fixes Azure Subscription Inventory Ingestion

* Removes ingestion/aad_auditlogs.py workaround

* [DC] Adds Tenable Settings user connector

* [Baselines] Runner uses subprocess (#235)

Changes were tested by building and running locally in the development environment.

* [DC] Corrects AWS connector subtitles

* Fixes Slack handler templates containing special characters

* Sp 1363 (#240)

* Fixes bug in Baseline Runner where R could return NaN as a null value, breaking the SQL statement.

* Corrects bug to display date as an actual date, and includes full model in prediction baseline (#242)

* Corrects bug that doesn't display the full date for prediction violations baseline
* Adds full model information to prediction violations baseline
* Removes debug print statements

* Renames tenable.png to tenable_settings.png

* Fixes to Tenable settings connector

* [DC] Bump version, cleanup typos + inconsistency

* [Installer] Adds support for Azure URL Parsing

* parse_snowflake_url handles regions better

* [Dispatcher] Adds assignee and area params to Jira Handler (#239)

* [Installer, Runners] Removes default error alerts and creates error views

Our analysts would like separate workflows for handling the Eng/Ops role of fixing alerts and data sources and the analysis role of handling existing alerts, so we're moving to views of alert errors instead of creating Jira tickets from them by default.

* [Runners] Removes stderr output when Jira handler is used but undefined

* [Baselines] Converts Temporal Baseline to work with counts and raw events
  • Loading branch information
sfc-gh-afedorov authored Aug 5, 2019
1 parent 141e993 commit 1d77d06
Show file tree
Hide file tree
Showing 26 changed files with 362 additions and 380 deletions.
40 changes: 31 additions & 9 deletions src/baseline_modules/temporal_by_day/temporal_by_day.R
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,11 @@ require('dplyr')
require(tidyr)
require(purrr)
#input_table
#EVENT_TIME
#EVENT_TIME OR DAY
#NUM_EVENTS -> OPTIONAL
#PIVOT
#ID

get_percentiles <- function(dataframe, column_name, exit_name, column_names_summarize_by ){
p <- c(.1, .25,.75,.95,.5)
p_names <- map_chr(p, ~paste0(.x*100, exit_name))
Expand All @@ -19,23 +21,43 @@ get_percentiles <- function(dataframe, column_name, exit_name, column_names_summ
return(dataframe %>% group_by_(column_name) %>% summarize_at(vars(column_names_summarize_by), full_funs))
}

results <- input_table
results$EVENT_TIME <- as.POSIXct(results$EVENT_TIME)
results$DAY <- as.Date(results$EVENT_TIME, na.rm=TRUE)

earliest_time <- min(results$DAY, na.rm=TRUE)
latest_time <- max(results$DAY, na.rm=TRUE)
num_days <- latest_time - earliest_time
results <- input_table
print(colnames(results))

# Normalize the time column: the input may provide EVENT_TIME (timestamps) or
# a pre-aggregated DAY column; either way we end with a Date-typed DAY column.
if ('EVENT_TIME' %in% colnames(results)) {
  print('Event time case triggered')
  results$EVENT_TIME <- as.POSIXct(results$EVENT_TIME)
  results$DAY <- as.Date(results$EVENT_TIME, na.rm=TRUE)
} else {
  print('No event time going to day')
  results$EVENT_TIME <- NA
  # BUG FIX: was `INPUT_TABLE$DAY` — R is case-sensitive and INPUT_TABLE is
  # not defined anywhere in this script; the day column lives on `results`
  # (the copy of input_table made above).
  results$DAY <- as.Date(results$DAY, "%Y-%m-%d", tz="GMT")
}

# ID is optional in the input; fill with NA so downstream summaries still run.
if (!('ID' %in% colnames(results))) {
  print('No id in input')
  results$ID <- NA
}

# Aggregate to one row per (PIVOT, DAY). When the input is pre-aggregated it
# carries a NUM_EVENTS column whose per-row counts we sum; otherwise each row
# is a raw event, so we count rows and distinct IDs.
if ('NUM_EVENTS' %in% colnames(results)) {
  print('Num events found')
  by_day_when_present <- results %>%
    group_by(PIVOT, DAY) %>%
    summarise(num_events=sum(NUM_EVENTS), num_ids=sum(ID))
} else {
  print('num events not found')
  by_day_when_present <- results %>%
    group_by(PIVOT, DAY) %>%
    summarise(num_events=n(), num_ids=length(unique(ID)))
}
print(by_day_when_present)

earliest_time <- min(results$DAY, na.rm=TRUE)
latest_time <- max(results$DAY, na.rm=TRUE)
num_days <- latest_time - earliest_time

# Fill missing days with zero events so gaps count against the baseline.
# When the input had no real ID column, a per-day ID count is meaningless,
# so fill num_ids with NA instead of 0.
# BUG FIX: the fill list previously embedded
# `if('ID' %in% colnames(results)) num_ids=0 else num_ids=NA`
# directly inside list(), which is a parse error in R (`=` assignment is not
# allowed as an `if` branch expression); compute the fill value first.
# (The expand_days previously computed inside the else-branch above was dead
# code — it was unconditionally overwritten here — and has been removed.)
id_fill <- if ('ID' %in% colnames(results)) 0 else NA
expand_days <- by_day_when_present %>%
  tidyr::complete(DAY=seq.Date(min(DAY, na.rm=TRUE), max(DAY, na.rm=TRUE), by="day"),
                  PIVOT,
                  fill = list(num_events = 0, num_ids = id_fill))

when_present_numeric<- get_percentiles(by_day_when_present, 'PIVOT', 'when_present', c('num_ids', 'num_events'))
expand_days_numeric <- get_percentiles(expand_days, 'PIVOT', 'overall', c('num_ids', 'num_events'))
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,18 +28,17 @@ require(MASS)
require(tidyr)
require(purrr)


# Load the runner-provided input table and drop the global reference.
a <- input_table
rm(input_table)

# Normalize column types: CURRENT_DAY to Date, flag columns to logical.
# FIX: removed the stray `print('a')` debug statement and the redundant
# doubled assignment (`a$CURRENT_DAY <- a$CURRENT_DAY <- ...`).
a$CURRENT_DAY <- as.Date(as.POSIXct(a$CURRENT_DAY), format='%Y-%m-%d')
a$FINAL <- as.logical(a$FINAL)
a$NEW <- as.logical(a$NEW)
a$PROD <- as.logical(a$PROD)
# Guard against duplicate column names coming out of ingestion.
colnames(a) <- make.unique(names(a))

#Group for counts
b <- a %>% group_by(QUERY_ID, CURRENT_DAY) %>% dplyr::summarize(counts=n_distinct(UNIQUE_KEYS))
namessss <- a %>% group_by(QUERY_ID) %>% dplyr::summarize(TITLE=first(TITLE))
Expand All @@ -51,13 +50,6 @@ c <- b %>%
# age: days since the observation (+2 keeps the most recent day's weight
# finite and non-zero when used as 1/age^2 in the model fit below).
c$age = as.integer(Sys.Date() - c$CURRENT_DAY+2)
rm(b)

print(c)
#Group for name
# NOTE(review): this merge with namessss is repeated below; merging twice
# suffixes the duplicated column as TITLE.x/TITLE.y, and later code reads
# TITLE.y — confirm which merge is intended before removing either one.
c <- base::merge(c, namessss, by = "QUERY_ID", all.x=TRUE)
print(unique(c$QUERY_ID))
print(unique(c$CURRENT_DAY))
print(unique(c$age))
print(unique(c$counts))

#Group for name
c <- base::merge(c, namessss, by = "QUERY_ID", all.x=TRUE)
Expand All @@ -66,22 +58,25 @@ c <- base::merge(c, namessss, by = "QUERY_ID", all.x=TRUE)
# Fit one robust linear model (counts ~ day) per query, weighting recent
# observations more heavily (weights = 1/age^2); maxit=100 raises the IRLS
# iteration cap so rlm converges on noisy series.
model <- c %>% tidyr::nest(-QUERY_ID) %>%
mutate(
fit=map(data, ~ rlm(counts ~ CURRENT_DAY, weights=1/age^2, data = ., na.action = 'na.omit', maxit=100)) )
print('model_complete')

# Extend the day grid 100 days past the last observation so the fitted trend
# can be evaluated into the future.
e <- c %>%
tidyr::complete(CURRENT_DAY=seq.Date(min(c$CURRENT_DAY), max(c$CURRENT_DAY)+100, by="day"),QUERY_ID)
e$age = as.integer(max(e$CURRENT_DAY) - e$CURRENT_DAY+1)
nested <- e %>% tidyr::nest(-QUERY_ID)


# Evaluate each per-query model over its extended day grid via broom::augment
# (adds .fitted/.se.fit columns), then flatten back to one row per day.
# NOTE(review): `.id=.x` looks like a stray argument forwarded into augment —
# confirm it is intentional.
prediction <-
model %>%
inner_join(nested, by = "QUERY_ID") %>%
mutate(results=map2(.x = model$fit, .y = nested$data, .f = ~augment(.x, newdata = .y), .id=.x), model2=model$fit) %>%
unnest(c(results))


# Attach the human-readable title; the summary below reads TITLE.y, which
# assumes an earlier merge already introduced a TITLE column.
prediction <- base::merge(prediction, namessss, by = "QUERY_ID", all.x=TRUE)

prediction <- base::merge(prediction, dplyr::select(model, QUERY_ID, fit), by = "QUERY_ID", all.x=TRUE)
# Distance of each fitted value from zero; the minimum locates the predicted
# x-intercept (the day the fitted trend crosses zero).
prediction$near_zero <- abs(prediction$.fitted)


prediction<- prediction %>% replace(., is.na(.), "")
return_value <- prediction %>% group_by(QUERY_ID) %>% summarise(last_day=max(CURRENT_DAY), x_intercept=as.character(CURRENT_DAY[which.min(near_zero)]) , unknown=as.character(x_intercept==last_day), value=min(near_zero), TITLE=first(TITLE.y)) %>% dplyr::select(QUERY_ID, TITLE, unknown, x_intercept)

Original file line number Diff line number Diff line change
Expand Up @@ -23,43 +23,41 @@
# '

require(dplyr)
require(tidyverse)
require(broom)
require(MASS)
require(broom)
require(MASS)
require(tidyr)
require(purrr)

# Load the runner-provided input table and drop the global reference.
a <- input_table
rm(input_table)
#Cleaning
# Normalize column types: CURRENT_DAY to Date, flag columns to logical.
# FIX: removed the redundant doubled assignment
# (`a$CURRENT_DAY <- a$CURRENT_DAY <- ...`).
a$CURRENT_DAY <- as.Date(as.POSIXct(a$CURRENT_DAY), format='%Y-%m-%d')
a$FINAL <- as.logical(a$FINAL)
a$NEW <- as.logical(a$NEW)
a$PROD <- as.logical(a$PROD)
# Guard against duplicate column names coming out of ingestion.
colnames(a) <- make.unique(names(a))


#Group for counts
b <- a %>% group_by(QUERY_ID, CURRENT_DAY) %>%
dplyr::summarize(counts=n_distinct(UNIQUE_KEYS))
namessss <- a %>% group_by(QUERY_ID) %>% dplyr::summarise(TITLE=first(TITLE))
b <- a %>% group_by(QUERY_ID, CURRENT_DAY) %>% dplyr::summarize(counts=n_distinct(UNIQUE_KEYS))
namessss <- a %>% group_by(QUERY_ID) %>% dplyr::summarize(TITLE=first(TITLE))

rm(a)
#Complete the missing values with zero -> no violations
c <- b %>%
tidyr::complete(CURRENT_DAY=seq.Date(min(b$CURRENT_DAY), max(b$CURRENT_DAY), by="day"),QUERY_ID, fill=list(counts = 0))
c$age = as.integer(Sys.Date() - c$CURRENT_DAY+2)
rm(b)

#Group for name
c <- base::merge(c, namessss, by = "QUERY_ID", all.x=TRUE)

#Do the prediction analysis
model <- c %>% nest(-QUERY_ID) %>%
model <- c %>% tidyr::nest(-QUERY_ID) %>%
mutate(
fit=map(data, ~ rlm(counts ~ CURRENT_DAY, weights=1/age^2, data = ., na.action = 'na.omit', maxit=100)) )
fit=map(data, ~ rlm(counts ~ CURRENT_DAY, weights=1/age^2, data = ., na.action = 'na.omit', maxit=100)) )

e <- c %>%
tidyr::complete(CURRENT_DAY=seq.Date(min(c$CURRENT_DAY), max(c$CURRENT_DAY)+100, by="day"),QUERY_ID)
e$age = as.integer(max(e$CURRENT_DAY) - e$CURRENT_DAY+1)
nested <- e %>% nest(-QUERY_ID)

nested <- e %>% tidyr::nest(-QUERY_ID)

prediction <-
model %>%
Expand All @@ -69,8 +67,15 @@ prediction <-

prediction <- base::merge(prediction, namessss, by = "QUERY_ID", all.x=TRUE)
prediction <- base::merge(prediction, dplyr::select(model, QUERY_ID, fit), by = "QUERY_ID", all.x=TRUE)

# BUG FIX: `prediction$fit <- toString(prediction$fit)` collapsed the entire
# column into a single comma-joined string that was then recycled across
# every row; convert each model object to its own character form instead.
prediction$fit <- as.character(prediction$fit)


# Shape the output: one row per (query, day) with the observed count, the
# fitted value and its standard error, and the serialized model.
return_value <- dplyr::select(prediction, QUERY_ID, TITLE.y, CURRENT_DAY, counts, .fitted, .se.fit, fit)
# Render dates as text so the SQL layer receives plain 'YYYY-MM-DD' strings.
return_value$CURRENT_DAY <- as.character(return_value$CURRENT_DAY)
# Replace NA with empty strings so NaN/NA never reaches the SQL statement.
return_value <- return_value %>% replace(., is.na(.), "")
colnames(return_value) <- c('QUERY_ID', 'TITLE', 'CURRENT_DAY', 'COUNTS', 'FITTED', 'SEFIT', 'FIT')

return_value
#END
3 changes: 3 additions & 0 deletions src/connectors/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from . import azure_vm
from . import aws_config
from . import aws_inventory
from . import tenable_settings

__all__ = [
'aws_inventory',
Expand All @@ -14,6 +15,7 @@
'azure_subscription',
'azure_vm',
'okta',
'tenable_settings',
]

connectors = {
Expand All @@ -24,6 +26,7 @@
'azure_subscription': azure_subscription,
'azure_vm': azure_vm,
'okta': okta,
'tenable_settings': tenable_settings,
}

CONNECTION_OPTIONS = [
Expand Down
2 changes: 1 addition & 1 deletion src/connectors/aws_cloudtrail.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
"""AWS CloudTrail
Collects AWS logs from an S3 bucket using AssumeRole
Collect AWS CloudTrail logs from S3 using a privileged Role
"""

from json import dumps
Expand Down
2 changes: 1 addition & 1 deletion src/connectors/aws_config.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
"""AWS Config
Collects Config logs from S3 using AssumeRole
Collect Config logs from S3 using a privileged Role
"""
from json import dumps
from time import sleep
Expand Down
2 changes: 1 addition & 1 deletion src/connectors/aws_inventory.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
"""AWS Asset Inventory
Collects AWS EC2, SG, ELB details using an Access Key
Collect AWS EC2, SG, ELB details using an Access Key
"""
from datetime import datetime
import json
Expand Down
3 changes: 1 addition & 2 deletions src/connectors/azure_subscription.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
"""Azure Subscription Inventory
Collects Azure Subscriptions using a Service Principal (SP)
Collect Azure Subscriptions using a Service Principal (SP)
"""

from dateutil.parser import parse
Expand Down Expand Up @@ -115,7 +115,6 @@ def ingest(table_name, options):
'PARSE_JSON(column3)',
'column4',
'column5',
'column5',
'column6',
'column7',
'PARSE_JSON(column8)',
Expand Down
3 changes: 1 addition & 2 deletions src/connectors/azure_vm.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,6 @@
'required': True,
'default': 'default'
}

]

LANDING_TABLE_COLUMNS = [
Expand Down Expand Up @@ -121,7 +120,7 @@ def connect(connection_name, options):

return {
'newStage': 'finalized',
'newMessage': 'Landing and subscription data tables created for collectors to populate.'
'newMessage': 'Landing and metadata tables created for collectors to populate.'
}


Expand Down
Loading

0 comments on commit 1d77d06

Please sign in to comment.