Commit
add outlier display and filtering
syncpark authored and syncpark committed Sep 28, 2022
1 parent d81b7cc commit 0063d21
Showing 6 changed files with 214 additions and 40 deletions.
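The heart of the change is a synthetic cluster (CLUSTER_ID_FOR_OUTLIERS) that collects every outlier, so outliers can be displayed and filtered like regular clusters. A rough, self-contained sketch of that idea follows; the record layout (message id as the second delimiter-separated field) mirrors the diff below, but the helper name outlier_message_ids and the sample record are illustrative, not code from the repository.

    // Hypothetical sketch of the outlier handling added in this commit.
    // `outlier_message_ids` and the sample record are illustrative only.
    const CLUSTER_ID_FOR_OUTLIERS: usize = 1_000_000;

    /// Extract message ids from raw outlier records, assuming the id is the
    /// second field when each record is split on `delimiter`.
    fn outlier_message_ids(raw_outliers: &[String], delimiter: char) -> Vec<String> {
        let message_id_index = 1;
        raw_outliers
            .iter()
            .filter_map(|raw| {
                let fields: Vec<_> = raw.split(delimiter).collect();
                fields.get(message_id_index).map(|id| (*id).to_string())
            })
            .collect()
    }

    fn main() {
        let raw = vec!["2022-09-28 01:02:03\t42\tGET /index.html".to_string()];
        let ids = outlier_message_ids(&raw, '\t');
        assert_eq!(ids, vec!["42".to_string()]);
        println!("outliers become cluster {} with events {:?}", CLUSTER_ID_FOR_OUTLIERS, ids);
    }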
2 changes: 1 addition & 1 deletion Cargo.lock

Some generated files are not rendered by default.

2 changes: 1 addition & 1 deletion Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "labeler"
version = "0.1.4"
version = "0.1.5"
authors = ["syncpark <[email protected]>"]
edition = "2018"

131 changes: 107 additions & 24 deletions src/cluster.rs
@@ -6,12 +6,12 @@ use anyhow::Result;
use log::info;
use regex::Regex;
use serde::Deserialize;
use std::collections::HashMap;
use std::collections::{HashMap, HashSet};
use std::fmt;
use std::str::FromStr;

const SIGNATURE_DISPLAY_LENGTH: usize = 200;

const CLUSTER_ID_FOR_OUTLIERS: ClusterId = 1_000_000;
#[derive(Deserialize)]
struct SavedClusters {
detector_id: i32,
@@ -26,6 +26,8 @@ struct SavedClusters {
struct ClusterMember {
cluster_id: usize,
cluster_size: usize,
signature: Option<String>,
score: Option<f32>,
events: Vec<String>,
}

@@ -46,6 +48,9 @@ impl SavedClusters {
self.outlier_count,
)
}
fn outliers(&self) -> &Vec<String> {
&self.outliers
}
}

#[derive(Debug, Default, Clone)]
@@ -57,7 +62,8 @@ pub struct Members {
new_qualifier: Qualifier,
signature: Option<String>,
event_ids: Vec<MessageId>,
// tokens: HashMap<String, Vec<MessageId>>, // TODO: calculate token occurrences to correct label-score
filtered_events: Vec<Vec<MessageId>>, // tokens: HashMap<String, Vec<MessageId>>, // TODO: calculate token occurrences to correct label-score
filter: Vec<String>,
}

impl fmt::Display for Members {
@@ -68,7 +74,8 @@ impl fmt::Display for Members {
} else {
write!(f, ", {}<-{}", self.new_qualifier, self.qualifier)?;
}
write!(f, ", {} events", self.size)
write!(f, ", {} events", self.size)?;
write!(f, ", score = {}", self.score)
}
}
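The extended Display impl now also reports the cluster score. A stripped-down, hypothetical stand-in for Members shows roughly what a printed cluster line looks like; the "cluster #" prefix and the field values are made up for illustration, and only the fields used by Display are kept.

    use std::fmt;

    // Simplified stand-in for `Members`; not the repository's struct.
    struct Members {
        id: usize,
        size: usize,
        score: f32,
        qualifier: String,
        new_qualifier: String,
    }

    impl fmt::Display for Members {
        fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
            write!(f, "cluster #{}", self.id)?; // assumed prefix, not from the diff
            if self.qualifier == self.new_qualifier {
                write!(f, ", {}", self.qualifier)?;
            } else {
                write!(f, ", {}<-{}", self.new_qualifier, self.qualifier)?;
            }
            write!(f, ", {} events", self.size)?;
            write!(f, ", score = {}", self.score)
        }
    }

    fn main() {
        let c = Members {
            id: 7,
            size: 120,
            score: 0.83,
            qualifier: "unknown".into(),
            new_qualifier: "benign".into(),
        };
        // Prints: cluster #7, benign<-unknown, 120 events, score = 0.83
        println!("{}", c);
    }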

@@ -104,15 +111,14 @@ pub struct Clusters {
clusters: Vec<ClusterId>,
_outliers: Vec<String>,
clusters_map: HashMap<ClusterId, Members>,
// _message_cluster_map: HashMap<MessageId, ClusterId>,
tokens_clusters_map: HashMap<String, Vec<ClusterId>>,
}

impl Clusters {
/// # Errors
///
/// Will return `Err` if the query to get cluster records for the specified datasource failed.
pub fn new(path: &str, labels: &Labels) -> Result<Self> {
pub fn new(path: &str, labels: &Labels, delimiter: char) -> Result<Self> {
let save_clusters = SavedClusters::from_path(path)?;
{
let (detector_id, events_count, clusters_count, outliers_count) =
@@ -122,8 +128,8 @@
path, detector_id, events_count, clusters_count, outliers_count
);
}
let clusters = save_clusters.cluster_ids();
let clusters_map: HashMap<ClusterId, Members> = save_clusters
let mut clusters = save_clusters.cluster_ids();
let mut clusters_map: HashMap<ClusterId, Members> = save_clusters
.clusters
.iter()
.map(|m| {
@@ -137,31 +143,49 @@
Members {
id: m.cluster_id,
size: m.cluster_size,
score: 0.0,
score: m.score.unwrap_or_default(),
qualifier,
new_qualifier: qualifier,
signature: None,
signature: m.signature.as_ref().cloned(),
event_ids: m.events.clone(),
filtered_events: Vec::new(),
filter: Vec::new(),
},
)
})
.collect();

// let _message_cluster_map: HashMap<MessageId, ClusterId> = clusters_map
// .values()
// .flat_map(|c| {
// c.event_ids
// .iter()
// .map(|e| (e.to_string(), c.id))
// .collect::<Vec<_>>()
// })
// .collect();
if !save_clusters.outliers().is_empty() {
let message_id_index = 1;
let event_ids: Vec<_> = save_clusters
.outliers()
.iter()
.filter_map(|raw| {
let s: Vec<_> = raw.split(delimiter).collect();
s.get(message_id_index).map(|msg_id| (*msg_id).to_string())
})
.collect();
clusters_map.insert(
CLUSTER_ID_FOR_OUTLIERS,
Members {
id: CLUSTER_ID_FOR_OUTLIERS,
size: save_clusters.outliers().len(),
score: 0.0,
qualifier: Qualifier::default(),
new_qualifier: Qualifier::default(),
signature: None,
event_ids,
filtered_events: Vec::new(),
filter: Vec::new(),
},
);
clusters.push(CLUSTER_ID_FOR_OUTLIERS);
}

Ok(Self {
clusters,
_outliers: save_clusters.outliers,
clusters_map,
// _message_cluster_map,
tokens_clusters_map: HashMap::new(),
})
}
@@ -213,6 +237,13 @@ impl Clusters {
self.clusters.is_empty()
}

pub fn clear_filter(&mut self, cluster_id: ClusterId) {
if let Some(c) = self.clusters_map.get_mut(&cluster_id) {
c.filtered_events.clear();
c.filter.clear();
}
}

pub fn print(&self, cid: ClusterId, events: &Events, cfg: &CliConf) {
if let Some(c) = self.clusters_map.get(&cid) {
println!("{}", c);
@@ -221,11 +252,20 @@
println!("signature = {}", sig);
}
}
if !c.filter.is_empty() {
println!("Event Filter: {:#?}", c.filter);
}
if cfg.is_show_samples_on() {
let display_count = cfg.samples_count();
let event_ids = if let Some(last) = c.filtered_events.last() {
last
} else {
&c.event_ids
};
println!();
for (idx, message_id) in c.event_ids.iter().enumerate() {
for (idx, message_id) in event_ids.iter().enumerate() {
if idx > display_count {
println!("... {} more events", event_ids.len() - display_count);
break;
}
if let Some(msg) = events.get_message(message_id) {
@@ -308,10 +348,11 @@ impl Clusters {
.iter()
.filter_map(|cid| {
if let Some(c) = self.clusters_map.get(cid) {
if events.regex_match(&re, &c.event_ids) {
Some(*cid)
} else {
let matched = events.regex_match(&re, &c.event_ids);
if matched.is_empty() {
None
} else {
Some(*cid)
}
} else {
None
@@ -320,6 +361,48 @@
.collect())
}

pub fn regex_match_in_this_cluster(
&self,
cluster_id: ClusterId,
pattern: &str,
events: &Events,
) -> Result<Option<Vec<MessageId>>> {
let mut negate: bool = false;
let pattern = if pattern.starts_with('!') {
if pattern.len() == 1 {
return Ok(None);
}
negate = true;
pattern.get(1..).unwrap_or(pattern)
} else {
pattern
};

let re = Regex::new(pattern)?;
Ok(self.clusters_map.get(&cluster_id).map(|c| {
let cluster_event_ids = if let Some(last) = c.filtered_events.last() {
last
} else {
&c.event_ids
};
let matched = events.regex_match(&re, cluster_event_ids);
if negate {
let set_matched: HashSet<_> = matched.into_iter().collect();
let set_cluster: HashSet<_> = cluster_event_ids.iter().cloned().collect();
(&set_cluster - &set_matched).into_iter().collect()
} else {
matched
}
}))
}

pub fn set_filtered(&mut self, cluster_id: ClusterId, matched: Vec<MessageId>, pattern: &str) {
if let Some(c) = self.clusters_map.get_mut(&cluster_id) {
c.filter.push(pattern.to_string());
c.filtered_events.push(matched);
}
}

pub fn set_qualifier(&mut self, cid: ClusterId, qualifier: Qualifier) -> bool {
if let Some(c) = self.clusters_map.get_mut(&cid) {
return c.set_qualifier(qualifier);
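Taken together, regex_match_in_this_cluster, set_filtered, and clear_filter give each cluster a stack of successively narrower event filters, with a leading `!` negating a pattern. Below is a minimal, dependency-free sketch of that stacking behaviour; plain substring matching stands in for the regex crate, and `Cluster`/`apply_filter` are simplified illustrative names, not the repository's API.

    // Illustrative sketch of the per-cluster filter stack added in this commit.
    // Substring matching replaces regex; the names here are simplified.
    use std::collections::{HashMap, HashSet};

    struct Cluster {
        event_ids: Vec<String>,            // all events in the cluster
        filtered_events: Vec<Vec<String>>, // one entry per applied filter (a stack)
        filter: Vec<String>,               // the patterns behind each entry
    }

    impl Cluster {
        /// Filter the latest result (or all events) with `pattern`;
        /// a leading '!' keeps the events that do NOT match.
        fn apply_filter(&mut self, pattern: &str, events: &HashMap<String, String>) {
            let (negate, needle) = match pattern.strip_prefix('!') {
                Some(rest) if !rest.is_empty() => (true, rest),
                _ => (false, pattern),
            };
            let base = self.filtered_events.last().unwrap_or(&self.event_ids);
            let matched: Vec<String> = base
                .iter()
                .filter(|id| {
                    events
                        .get(*id)
                        .map_or(false, |content| content.contains(needle))
                })
                .cloned()
                .collect();
            let result = if negate {
                let matched_set: HashSet<_> = matched.into_iter().collect();
                base.iter()
                    .filter(|id| !matched_set.contains(*id))
                    .cloned()
                    .collect()
            } else {
                matched
            };
            self.filter.push(pattern.to_string());
            self.filtered_events.push(result);
        }

        /// Drop every filter, going back to the full event list.
        fn clear_filter(&mut self) {
            self.filtered_events.clear();
            self.filter.clear();
        }
    }

    fn main() {
        let events: HashMap<String, String> = [
            ("1".to_string(), "GET /index.html".to_string()),
            ("2".to_string(), "POST /login".to_string()),
            ("3".to_string(), "GET /login".to_string()),
        ]
        .into_iter()
        .collect();
        let mut c = Cluster {
            event_ids: vec!["1".into(), "2".into(), "3".into()],
            filtered_events: Vec::new(),
            filter: Vec::new(),
        };
        c.apply_filter("GET", &events);    // keeps events 1 and 3
        c.apply_filter("!login", &events); // of those, keeps event 1
        assert_eq!(c.filtered_events.last().unwrap(), &vec!["1".to_string()]);
        c.clear_filter(); // back to all three events
    }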
31 changes: 22 additions & 9 deletions src/events.rs
@@ -107,15 +107,28 @@ impl Events {
}

#[must_use]
pub fn regex_match(&self, re: &Regex, event_ids: &[MessageId]) -> bool {
for msg_id in event_ids {
if let Some(evt) = self.events.get(msg_id) {
if re.is_match(&evt.content) {
return true;
}
}
}
false
pub fn regex_match(&self, re: &Regex, event_ids: &[MessageId]) -> Vec<String> {
event_ids
.iter()
.filter_map(|msg_id| {
self.events.get(msg_id).map(|event| {
if re.is_match(&event.content) {
Some(msg_id.to_string())
} else {
None
}
})
})
.flatten()
.collect()
// for msg_id in event_ids {
// if let Some(evt) = self.events.get(msg_id) {
// if re.is_match(&evt.content) {
// return true;
// }
// }
// }
// false
}

#[must_use]
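Events::regex_match now returns the ids of the matching events instead of a bool, so regex_match_any tests is_empty() and the per-cluster filter can reuse the matched ids directly. A small sketch of that contract, assuming an in-memory map of events; it uses the regex crate as the repository does, but regex_match is written here as a free function rather than a method.

    // Sketch of the new `regex_match` contract: return the ids of the events
    // whose content matches, rather than a bare bool. Requires the `regex` crate.
    use regex::Regex;
    use std::collections::HashMap;

    fn regex_match(events: &HashMap<String, String>, re: &Regex, event_ids: &[String]) -> Vec<String> {
        event_ids
            .iter()
            .filter(|id| events.get(*id).map_or(false, |content| re.is_match(content)))
            .cloned()
            .collect()
    }

    fn main() -> Result<(), regex::Error> {
        let events: HashMap<String, String> = [
            ("a".to_string(), "GET /admin".to_string()),
            ("b".to_string(), "GET /index".to_string()),
        ]
        .into_iter()
        .collect();
        let re = Regex::new(r"/admin")?;
        let matched = regex_match(&events, &re, &["a".to_string(), "b".to_string()]);
        // "Does anything match?" is now simply `!matched.is_empty()`.
        assert_eq!(matched, vec!["a".to_string()]);
        Ok(())
    }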
