From 1b78aa3e30d86d0ad14609ea51391de5d65bc1fb Mon Sep 17 00:00:00 2001 From: Adil Zouitine Date: Sat, 14 Oct 2023 20:00:32 +0200 Subject: [PATCH 1/7] turn halfspacetree to float generic --- src/anomaly/half_space_tree.rs | 43 ++++++++++++++++++---------------- 1 file changed, 23 insertions(+), 20 deletions(-) diff --git a/src/anomaly/half_space_tree.rs b/src/anomaly/half_space_tree.rs index 21b60ac..2701fed 100644 --- a/src/anomaly/half_space_tree.rs +++ b/src/anomaly/half_space_tree.rs @@ -2,8 +2,10 @@ use rand::prelude::*; +use num::{Float, FromPrimitive}; use std::convert::TryFrom; use std::mem; +use std::ops::{AddAssign, DivAssign, MulAssign, SubAssign}; use crate::common::Observation; @@ -20,14 +22,14 @@ fn right_child(node: u32) -> u32 { } #[derive(Clone)] -struct Trees { +struct Trees { feature: Vec, - threshold: Vec, - l_mass: Vec, - r_mass: Vec, + threshold: Vec, + l_mass: Vec, + r_mass: Vec, } -impl Trees { +impl Trees { fn new(n_trees: u32, height: u32, features: &Vec, rng: &mut ThreadRng) -> Self { // #nodes = 2 ^ height - 1 let n_nodes: usize = usize::try_from(n_trees * (u32::pow(2, height) - 1)).unwrap(); @@ -47,21 +49,22 @@ impl Trees { // Allocate memory for the HST let mut hst = Trees { feature: init_vec(n_branches, String::from("")), - threshold: init_vec(n_branches, 0.0), - l_mass: init_vec(n_nodes, 0.0), - r_mass: init_vec(n_nodes, 0.0), + threshold: init_vec(n_branches, F::zero()), + l_mass: init_vec(n_nodes, F::zero()), + r_mass: init_vec(n_nodes, F::zero()), }; // Randomly assign features and thresholds to each branch for branch in 0..n_branches { let feature = features.choose(rng).unwrap(); hst.feature[branch] = feature.clone(); - hst.threshold[branch] = rng.gen(); // [0, 1] + let random_threshold: f64 = rng.gen(); + hst.threshold[branch] = F::from_f64(random_threshold).unwrap(); // [0, 1] } hst } } -pub struct HalfSpaceTree { +pub struct HalfSpaceTree { window_size: u32, counter: u32, n_trees: u32, @@ -70,10 +73,10 @@ pub struct HalfSpaceTree { rng: ThreadRng, n_branches: u32, n_nodes: u32, - trees: Option, + trees: Option>, first_learn: bool, } -impl HalfSpaceTree { +impl HalfSpaceTree { pub fn new( window_size: u32, n_trees: u32, @@ -108,10 +111,10 @@ impl HalfSpaceTree { pub fn update( &mut self, - observation: &Observation, + observation: &Observation, do_score: bool, do_update: bool, - ) -> Option { + ) -> Option { // build trees during the first pass if (!self.first_learn) && self.features.is_none() { self.features = Some(observation.clone().into_keys().collect()); @@ -124,7 +127,7 @@ impl HalfSpaceTree { self.first_learn = true; } - let mut score: f32 = 0.0; + let mut score: F = F::zero(); for tree in 0..self.n_trees { let mut node: u32 = 0; @@ -135,12 +138,12 @@ impl HalfSpaceTree { // Flag for scoring if do_score { score += hst.r_mass[(tree * self.n_nodes + node) as usize] - * u32::pow(2, depth) as f32; + * F::from_u32(u32::pow(2, depth)).unwrap(); } if do_update { // Update the l_mass - hst.l_mass[(tree * self.n_nodes + node) as usize] += 1.0; + hst.l_mass[(tree * self.n_nodes + node) as usize] += F::one(); } // Stop if the node is a leaf or stop early if the mass of the node is too small @@ -184,7 +187,7 @@ impl HalfSpaceTree { self.counter += 1; if self.counter == self.window_size { mem::swap(&mut hst.r_mass, &mut hst.l_mass); - hst.l_mass.fill(0.0); + hst.l_mass.fill(F::zero()); self.counter = 0; } } @@ -193,10 +196,10 @@ impl HalfSpaceTree { } return None; } - pub fn learn_one(&mut self, observation: &Observation) { + pub fn learn_one(&mut self, observation: &Observation) { self.update(observation, false, true); } - pub fn score_one(&mut self, observation: &Observation) -> Option { + pub fn score_one(&mut self, observation: &Observation) -> Option { self.update(observation, true, false) } } From 7ea398527448c3ef819607b0e194936c0443383b Mon Sep 17 00:00:00 2001 From: Adil Zouitine Date: Sat, 14 Oct 2023 20:22:01 +0200 Subject: [PATCH 2/7] add generic for half space trees --- examples/anomaly_detection/credit_card.rs | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/examples/anomaly_detection/credit_card.rs b/examples/anomaly_detection/credit_card.rs index 6cb38b0..f14436c 100644 --- a/examples/anomaly_detection/credit_card.rs +++ b/examples/anomaly_detection/credit_card.rs @@ -1,5 +1,7 @@ use light_river::anomaly::half_space_tree::HalfSpaceTree; use light_river::datasets::credit_card::CreditCard; +use light_river::stream::iter_csv::IterCsv; +use std::fs::File; use std::time::Instant; fn main() { @@ -11,10 +13,10 @@ fn main() { let height: u32 = 6; // INITIALIZATION - let mut hst = HalfSpaceTree::new(window_size, n_trees, height, None); + let mut hst: HalfSpaceTree = HalfSpaceTree::new(window_size, n_trees, height, None); // LOOP - let transactions = CreditCard::load_credit_card_transactions().unwrap(); + let transactions: IterCsv = CreditCard::load_credit_card_transactions().unwrap(); for transaction in transactions { let observation = transaction.unwrap().get_observation(); let _ = hst.update(&observation, true, true); From ee78b8e298f4cc42d7f0a6191aeb5e0dba9dee03 Mon Sep 17 00:00:00 2001 From: Adil Zouitine Date: Sat, 21 Oct 2023 18:09:58 +0200 Subject: [PATCH 3/7] Add max score normalisation --- src/anomaly/half_space_tree.rs | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/anomaly/half_space_tree.rs b/src/anomaly/half_space_tree.rs index 2701fed..bbdae3b 100644 --- a/src/anomaly/half_space_tree.rs +++ b/src/anomaly/half_space_tree.rs @@ -192,6 +192,7 @@ impl H } } if do_score { + score = F::one() - (score / self.max_score()); return Some(score); } return None; @@ -202,4 +203,9 @@ impl H pub fn score_one(&mut self, observation: &Observation) -> Option { self.update(observation, true, false) } + fn max_score(&self) -> F { + F::from(self.n_trees).unwrap() + * F::from(self.window_size).unwrap() + * (F::from(2.).unwrap().powi(self.height as i32 + 1) - F::one()) + } } From 1683bab908693682f39cee9e4964f084fa186d8f Mon Sep 17 00:00:00 2001 From: Adil Zouitine Date: Fri, 3 Nov 2023 21:20:23 +0100 Subject: [PATCH 4/7] add get_probabilities method --- src/common.rs | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/src/common.rs b/src/common.rs index 9796844..5c90812 100644 --- a/src/common.rs +++ b/src/common.rs @@ -177,7 +177,6 @@ where /// use std::collections::HashMap; /// use light_river::common::{ClassifierTarget, ClassifierTargetProbabilities}; /// use num::Float; -// /// /// let mut probs: ClassifierTargetProbabilities = HashMap::new(); /// probs.insert(ClassifierTarget::Bool(true), 0.7); @@ -222,6 +221,17 @@ impl C } } } + pub fn get_probabilities(&self) -> ClassifierTargetProbabilities { + // If we had only the prediction we set the probability to 1.0 + match self { + ClassifierOutput::Prediction(y) => { + let mut probs = ClassifierTargetProbabilities::new(); + probs.insert(y.clone(), F::from(1.0).unwrap()); + probs + } + ClassifierOutput::Probabilities(y) => y.clone(), + } + } } /// Represents a regression target using a Float value. From 3c4563d8049e994ea77851470515887eec2c40b1 Mon Sep 17 00:00:00 2001 From: Adil Zouitine Date: Fri, 3 Nov 2023 21:20:41 +0100 Subject: [PATCH 5/7] Confusion matrix works now --- src/metrics/confusion.rs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/metrics/confusion.rs b/src/metrics/confusion.rs index 0d2ce38..3dded96 100644 --- a/src/metrics/confusion.rs +++ b/src/metrics/confusion.rs @@ -63,7 +63,7 @@ pub struct ConfusionMatrix>, sum_row: HashMap, sum_col: HashMap, - total_weight: F, + pub total_weight: F, } impl ConfusionMatrix { @@ -133,10 +133,10 @@ impl C &mut self, y_pred: &ClassifierOutput, y_true: &ClassifierTarget, - sample_weight: F, + sample_weight: Option, ) { - self.n_samples -= sample_weight; - self._update(y_pred, y_true, -sample_weight); + self.n_samples -= sample_weight.unwrap_or(F::one()); + self._update(y_pred, y_true, -sample_weight.unwrap_or(F::one())); } pub fn get(&self, label: &ClassifierTarget) -> HashMap { // return rows of the label in the confusion matrix From 7db270e0568999d833b83a88ae5e0bf963c1b781 Mon Sep 17 00:00:00 2001 From: Adil Zouitine Date: Fri, 3 Nov 2023 21:20:59 +0100 Subject: [PATCH 6/7] add rocauc --- src/metrics/mod.rs | 1 + src/metrics/rocauc.rs | 227 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 228 insertions(+) diff --git a/src/metrics/mod.rs b/src/metrics/mod.rs index 7d30848..ee5646b 100644 --- a/src/metrics/mod.rs +++ b/src/metrics/mod.rs @@ -1,3 +1,4 @@ pub mod accuracy; pub mod confusion; +pub mod rocauc; pub mod traits; diff --git a/src/metrics/rocauc.rs b/src/metrics/rocauc.rs index e69de29..f991855 100644 --- a/src/metrics/rocauc.rs +++ b/src/metrics/rocauc.rs @@ -0,0 +1,227 @@ +use std::ops::{AddAssign, DivAssign, MulAssign, SubAssign}; + +use crate::common::{ClassifierOutput, ClassifierTarget}; +use crate::metrics::confusion::ConfusionMatrix; +use crate::metrics::traits::ClassificationMetric; +use num::{Float, FromPrimitive}; + +/// Receiver Operating Characteristic Area Under the Curve (ROC AUC). +/// +/// This metric provides an approximation of the true ROC AUC. Computing the true ROC AUC would +/// require storing all the predictions and ground truths, which may not be efficient. The approximation +/// error is typically insignificant as long as the predicted probabilities are well calibrated. Regardless, +/// this metric can be used to reliably compare models with each other. +/// +/// # Parameters +/// +/// - `n_threshold`: The number of thresholds used for discretizing the ROC curve. A higher value will lead to +/// more accurate results, but will also require more computation time and memory. +/// - `pos_val`: Value to treat as "positive". +/// +/// # Examples +/// +/// ``` +/// use light_river::metrics::rocauc::ROCAUC; +/// use light_river::metrics::traits::ClassificationMetric; +/// use light_river::common::{ClassifierTarget, ClassifierOutput}; +/// use std::collections::HashMap; +/// +/// let y_pred = vec![ +/// ClassifierOutput::Probabilities(HashMap::from([ +/// (ClassifierTarget::from(true), 0.1), +/// (ClassifierTarget::from(false), 0.9), +/// ])), +/// ClassifierOutput::Probabilities(HashMap::from([(ClassifierTarget::from(true), 0.4)])), +/// ClassifierOutput::Probabilities(HashMap::from([ +/// (ClassifierTarget::from(true), 0.35), +/// (ClassifierTarget::from(false), 0.65), +/// ])), +/// ClassifierOutput::Probabilities(HashMap::from([ +/// (ClassifierTarget::from(true), 0.8), +/// (ClassifierTarget::from(false), 0.2), +/// ])), +/// ]; +/// let y_true: Vec = vec![false, false, true, true]; +/// +/// let mut metric = ROCAUC::new(Some(10), ClassifierTarget::from(true)); +/// +/// for (yt, yp) in y_true.iter().zip(y_pred.iter()) { +/// metric.update(yp, &ClassifierTarget::from(*yt), Some(1.0)); +/// } +/// +/// println!("ROCAUC: {:.2}%", metric.get() * (100.0 as f64)); +/// // ROCAUC: 87.50% +/// ``` +/// +/// # Notes +/// +/// The true ROC AUC might differ from the approximation. The accuracy can be improved by increasing the number +/// of thresholds, but this comes at the cost of more computation time and memory usage. +/// +pub struct ROCAUC { + n_threshold: Option, + pos_val: ClassifierTarget, + thresholds: Vec, + cms: Vec>, +} +impl ROCAUC { + pub fn new(n_threshold: Option, pos_val: ClassifierTarget) -> Self { + let n_threshold = n_threshold.unwrap_or(10); + + let mut thresholds = Vec::with_capacity(n_threshold); + + for i in 0..n_threshold { + thresholds.push( + F::from(i).unwrap() / (F::from(n_threshold).unwrap() - F::from(1.0).unwrap()), + ); + } + thresholds[0] -= F::from(1e-7).unwrap(); + thresholds[n_threshold - 1] += F::from(1e-7).unwrap(); + + let mut cms = Vec::with_capacity(n_threshold); + for _ in 0..n_threshold { + cms.push(ConfusionMatrix::new()); + } + + Self { + n_threshold: Some(n_threshold), + pos_val: pos_val, + thresholds: thresholds, + cms: cms, + } + } +} + +impl + ClassificationMetric for ROCAUC +{ + fn update( + &mut self, + y_pred: &ClassifierOutput, + y_true: &ClassifierTarget, + sample_weight: Option, + ) { + // Get the probability of the positive class + let p_pred = y_pred.get_probabilities(); + let default_proba = F::zero(); + let p_pred_pos = p_pred.get(&self.pos_val).unwrap_or(&default_proba); + + // Convert the target to a binary target + let y_true = ClassifierTarget::from(y_true.eq(&self.pos_val)); + + for (threshold, cm) in self.thresholds.iter().zip(self.cms.iter_mut()) { + let y_pred = + ClassifierOutput::Prediction(ClassifierTarget::from(p_pred_pos.ge(threshold))); + cm.update(&y_pred, &y_true, sample_weight); + } + } + + fn revert( + &mut self, + y_pred: &ClassifierOutput, + y_true: &ClassifierTarget, + sample_weight: Option, + ) { + let p_pred = y_pred.get_probabilities(); + + let default_proba = F::zero(); + let p_pred_pos = p_pred.get(&self.pos_val).unwrap_or(&default_proba); + let y_true = ClassifierTarget::from(y_true.eq(&self.pos_val)); + + for (threshold, cm) in self.thresholds.iter().zip(self.cms.iter_mut()) { + let y_pred = + ClassifierOutput::Prediction(ClassifierTarget::from(p_pred_pos.ge(threshold))); + cm.revert(&y_pred, &y_true, sample_weight); + } + } + fn get(&self) -> F { + let mut tprs: Vec = (0..self.n_threshold.unwrap()).map(|_| F::zero()).collect(); + let mut fprs: Vec = (0..self.n_threshold.unwrap()).map(|_| F::zero()).collect(); + + for (i, cm) in self.cms.iter().enumerate() { + let true_positives: F = cm.true_positives(&self.pos_val); + let true_negatives: F = cm.true_negatives(&self.pos_val); + let false_positives: F = cm.false_positives(&self.pos_val); + let false_negatives: F = cm.false_negatives(&self.pos_val); + + // Handle the case of zero division + let mut tpr: Option = None; + if true_positives + false_negatives != F::zero() { + tpr = Some(true_positives.div(true_positives + false_negatives)); + } + + tprs[i] = tpr.unwrap_or(F::zero()); + + // Handle the case of zero division + let mut fpr: Option = None; + if false_positives + true_negatives != F::zero() { + fpr = Some(false_positives.div(false_positives + true_negatives)); + } + + fprs[i] = fpr.unwrap_or(F::zero()); + } + // Trapezoidal integration + let mut auc = F::zero(); + for i in 0..self.n_threshold.unwrap() - 1 { + auc += (fprs[i + 1] - fprs[i]) * (tprs[i + 1] + tprs[i]) / F::from(2.0).unwrap(); + } // TODO: Turn it functional + + -auc + } + + fn is_multiclass(&self) -> bool { + false + } +} + +#[cfg(test)] +mod tests { + use super::*; + #[test] + fn test_rocauc() { + // same example as in the doctest + let y_pred = vec![ + ClassifierOutput::Prediction(ClassifierTarget::from("cat")), + ClassifierOutput::Prediction(ClassifierTarget::from("dog")), + ClassifierOutput::Prediction(ClassifierTarget::from("bird")), + ClassifierOutput::Prediction(ClassifierTarget::from("cat")), + ]; + let y_true: Vec<&str> = vec!["cat", "cat", "dog", "cat"]; + + let mut metric = ROCAUC::new(Some(10), ClassifierTarget::from("cat")); + + for (yt, yp) in y_true.iter().zip(y_pred.iter()) { + metric.update(yp, &ClassifierTarget::from(*yt), Some(1.0)); + } + println!("{}", metric.get()); + } + #[test] + fn another() { + use std::collections::HashMap; + + let y_pred = vec![ + ClassifierOutput::Probabilities(HashMap::from([ + (ClassifierTarget::from(true), 0.1), + (ClassifierTarget::from(false), 0.9), + ])), + ClassifierOutput::Probabilities(HashMap::from([(ClassifierTarget::from(true), 0.4)])), + ClassifierOutput::Probabilities(HashMap::from([ + (ClassifierTarget::from(true), 0.35), + (ClassifierTarget::from(false), 0.65), + ])), + ClassifierOutput::Probabilities(HashMap::from([ + (ClassifierTarget::from(true), 0.8), + (ClassifierTarget::from(false), 0.2), + ])), + ]; + let y_true: Vec = vec![false, false, true, true]; + + let mut metric: ROCAUC = ROCAUC::new(Some(10), ClassifierTarget::from(true)); + + for (yt, yp) in y_true.iter().zip(y_pred.iter()) { + metric.update(yp, &ClassifierTarget::from(*yt), Some(1.0)); + } + + println!("ROCAUC: {:.2}%", metric.get() * (100.0 as f64)); + } +} From c10772c2bcc02c0dc85930d88c9b9f0e093de265 Mon Sep 17 00:00:00 2001 From: Adil Zouitine Date: Fri, 3 Nov 2023 21:21:19 +0100 Subject: [PATCH 7/7] Add classifier target as output for compatibility with metrics --- src/anomaly/half_space_tree.rs | 39 ++++++++++++++++++++++++++++++---- 1 file changed, 35 insertions(+), 4 deletions(-) diff --git a/src/anomaly/half_space_tree.rs b/src/anomaly/half_space_tree.rs index bbdae3b..4821d6c 100644 --- a/src/anomaly/half_space_tree.rs +++ b/src/anomaly/half_space_tree.rs @@ -3,11 +3,12 @@ use rand::prelude::*; use num::{Float, FromPrimitive}; +use std::collections::HashMap; use std::convert::TryFrom; use std::mem; use std::ops::{AddAssign, DivAssign, MulAssign, SubAssign}; -use crate::common::Observation; +use crate::common::{ClassifierOutput, ClassifierTarget, Observation}; // Return the index of a node's left child node. #[inline] @@ -64,6 +65,25 @@ impl T hst } } +/// Half-space trees are an online variant of isolation forests. +/// They work well when anomalies are spread out. +/// However, they do not work well if anomalies are packed together in windows. +/// By default, this implementation assumes that each feature has values that are comprised +/// between 0 and 1. +/// # Parameters +/// +/// - `window_size`: The number of observations to consider when computing the score. +/// - `n_trees`: The number of trees to use. +/// - `height`: The height of each tree. +/// - `features`: The list of features to use. If `None`, the features will be inferred from the first observation. +/// +/// # Example +/// +/// ``` +/// +/// +/// +/// ``` pub struct HalfSpaceTree { window_size: u32, counter: u32, @@ -114,7 +134,7 @@ impl H observation: &Observation, do_score: bool, do_update: bool, - ) -> Option { + ) -> Option> { // build trees during the first pass if (!self.first_learn) && self.features.is_none() { self.features = Some(observation.clone().into_keys().collect()); @@ -193,14 +213,18 @@ impl H } if do_score { score = F::one() - (score / self.max_score()); - return Some(score); + return Some(ClassifierOutput::Probabilities(HashMap::from([( + ClassifierTarget::from(true), + score, + )]))); + // return Some(score); } return None; } pub fn learn_one(&mut self, observation: &Observation) { self.update(observation, false, true); } - pub fn score_one(&mut self, observation: &Observation) -> Option { + pub fn score_one(&mut self, observation: &Observation) -> Option> { self.update(observation, true, false) } fn max_score(&self) -> F { @@ -209,3 +233,10 @@ impl H * (F::from(2.).unwrap().powi(self.height as i32 + 1) - F::one()) } } + +#[cfg(test)] +mod tests { + use super::*; + #[test] + fn test_hst() {} +}