diff --git a/examples/classification/keystroke.rs b/examples/classification/keystroke.rs index d2024ce..916f283 100644 --- a/examples/classification/keystroke.rs +++ b/examples/classification/keystroke.rs @@ -69,12 +69,14 @@ fn main() { ClassifierTarget::String(y) => y, _ => unimplemented!(), }; + let y = labels.clone().iter().position(|l| l == &y).unwrap(); + let x_ord = Array1::::from_vec(features.iter().map(|k| x[k]).collect()); // DEBUG: remove it // let x_ord = x_ord.slice(s![0..2]).to_owned(); println!("=M=1 partial_fit"); - mf.partial_fit(&x_ord, &y); + mf.partial_fit(&x_ord, y); println!("=M=2 predict_proba"); let score = mf.predict_proba(&x_ord); diff --git a/examples/classification/synthetic.rs b/examples/classification/synthetic.rs index 4256574..3bc04c7 100644 --- a/examples/classification/synthetic.rs +++ b/examples/classification/synthetic.rs @@ -65,13 +65,15 @@ fn main() { ClassifierTarget::String(y) => y, _ => unimplemented!(), }; + let y = labels.clone().iter().position(|l| l == &y).unwrap(); + let x_ord = Array1::::from_vec(features.iter().map(|k| x[k]).collect()); // Skip first sample since tree has still no node if idx != 0 { // let probs = mf.predict_proba(&x_ord); // println!("=M=2 probs: {:?}", probs.to_vec()); - let score = mf.score(&x_ord, &y); + let score = mf.score(&x_ord, y); // println!("=M=3 score: {:?}", score); score_total += score; @@ -84,7 +86,7 @@ fn main() { // panic!("stop"); // } println!("=M=1 partial_fit {x_ord}"); - mf.partial_fit(&x_ord, &y); + mf.partial_fit(&x_ord, y); } let elapsed_time = now.elapsed(); diff --git a/src/classification/mondrian_forest.rs b/src/classification/mondrian_forest.rs index f387a8e..d0df82e 100644 --- a/src/classification/mondrian_forest.rs +++ b/src/classification/mondrian_forest.rs @@ -42,7 +42,7 @@ impl MondrianForest { /// working only on one. /// /// Function in River/LightRiver: "learn_one()" - pub fn partial_fit(&mut self, x: &Array1, y: &String) { + pub fn partial_fit(&mut self, x: &Array1, y: usize) { for tree in &mut self.trees { tree.partial_fit(x, y); } @@ -78,16 +78,15 @@ impl MondrianForest { total_probs } - pub fn score(&mut self, x: &Array1, y: &String) -> F { + pub fn score(&mut self, x: &Array1, y: usize) -> F { let probs = self.predict_proba(x); - let y_idx = self.labels.iter().position(|l| l == y).unwrap(); let pred_idx = probs .iter() .enumerate() .max_by(|(_, a), (_, b)| a.partial_cmp(b).unwrap()) .map(|(idx, _)| idx) .unwrap(); - if pred_idx == y_idx { + if pred_idx == y { F::one() } else { F::zero() diff --git a/src/classification/mondrian_node.rs b/src/classification/mondrian_node.rs index e47a23c..08f9c10 100644 --- a/src/classification/mondrian_node.rs +++ b/src/classification/mondrian_node.rs @@ -26,24 +26,24 @@ use std::{clone, cmp, mem, usize}; pub struct Node { // Change 'Rc' to 'Weak' pub parent: Option, // Option>>>, - pub tau: F, // Time parameter: updated during 'node creation' or 'node update' + pub time: F, // Time: how much I increased the size of the box pub is_leaf: bool, pub min_list: Array1, // Lists representing the minimum and maximum values of the data points contained in the current node pub max_list: Array1, - pub delta: usize, // Dimension in which a split occurs - pub xi: F, // Split point along the dimension specified by delta - pub left: Option, // Option>>>, - pub right: Option, // Option>>>, + pub feature: usize, // Feature in which a split occurs + pub threshold: F, // Threshold in which the split occures + pub left: Option, + pub right: Option, pub stats: Stats, } impl Node { - pub fn update_leaf(&mut self, x: &Array1, label_idx: usize) { - self.stats.add(x, label_idx); + pub fn update_leaf(&mut self, x: &Array1, y: usize) { + self.stats.add(x, y); } pub fn update_internal(&self, left_s: &Stats, right_s: &Stats) -> Stats { left_s.merge(right_s) } - pub fn get_parent_tau(&self) -> F { + pub fn get_parent_time(&self) -> F { panic!("Implemented in 'mondrian_tree' instead of 'mondrian_node'") } /// Check if all the labels are the same in the node. @@ -100,19 +100,17 @@ impl Stats { let probs = self.predict_proba(x); probs * w } - pub fn add(&mut self, x: &Array1, label_idx: usize) { + pub fn add(&mut self, x: &Array1, y: usize) { // Same as: self.sums[label] += x; - self.sums - .row_mut(label_idx) - .zip_mut_with(&x, |a, &b| *a += b); + self.sums.row_mut(y).zip_mut_with(&x, |a, &b| *a += b); - // Same as: self.sq_sums[label_idx] += x*x; + // Same as: self.sq_sums[y] += x*x; // e.g. x: [1.059 0.580] -> x*x: [1.122 0.337] self.sq_sums - .row_mut(label_idx) + .row_mut(y) .zip_mut_with(&x, |a, &b| *a += b * b); - self.counts[label_idx] += 1; + self.counts[y] += 1; } fn merge(&self, s: &Stats) -> Stats { // NOTE: nel215 returns a new Stats object, we are only changing the node values here @@ -172,8 +170,7 @@ impl Stats { .enumerate() { // println!("predict_proba() - mid - index: {:?}, sum: {:?}, sq_sum: {:?}, count: {:?}", index, sum.to_vec(), sq_sum.to_vec(), count); - // let epsilon = F::epsilon(); // Don't use this variable, write 'F::epsilon' where needed. - let epsilon = F::from_f32(1e-9).unwrap(); + let epsilon = F::epsilon(); // F::from_f32(1e-9).unwrap(); let count_f = F::from_usize(count).unwrap(); let avg = &sum / count_f; let var = (&sq_sum / count_f) - (&avg * &avg) + epsilon; @@ -182,9 +179,9 @@ impl Stats { let pi = F::from_f32(std::f32::consts::PI).unwrap() * F::from_f32(2.0).unwrap(); let z = pi.powi(x.len() as i32) * sigma.mapv(|s| s * s).sum().sqrt(); // Same as dot product - let dot_delta = (&(x - &avg) * &(x - &avg)).sum(); + let dot_feature = (&(x - &avg) * &(x - &avg)).sum(); let dot_sigma = (&sigma * &sigma).sum(); - let exponent = -F::from_f32(0.5).unwrap() * dot_delta / dot_sigma; + let exponent = -F::from_f32(0.5).unwrap() * dot_feature / dot_sigma; // epsilon added since exponent.exp() could be zero if exponent is very small let mut prob = (exponent.exp() + epsilon) / z; if count <= 0 { diff --git a/src/classification/mondrian_tree.rs b/src/classification/mondrian_tree.rs index 56020c1..e038af4 100644 --- a/src/classification/mondrian_tree.rs +++ b/src/classification/mondrian_tree.rs @@ -52,17 +52,17 @@ impl MondrianTree { ) -> fmt::Result { if let Some(idx) = node_idx { let node = &self.nodes[idx]; - // writeln!(f, "{}Node {}: left={:?}, right={:?}, parent={:?}, tau={:.3}, is_leaf={}, min={:?}, max={:?}", - // prefix, idx, node.left, node.right, node.parent, node.tau, node.is_leaf, node.min_list.to_vec(), node.max_list.to_vec())?; + // writeln!(f, "{}Node {}: left={:?}, right={:?}, parent={:?}, time={:.3}, is_leaf={}, min={:?}, max={:?}", + // prefix, idx, node.left, node.right, node.parent, node.time, node.is_leaf, node.min_list.to_vec(), node.max_list.to_vec())?; writeln!( f, - "{}├─Node {}: left={:?}, right={:?}, parent={:?}, tau={:.3}, min={:?}, max={:?}", + "{}├─Node {}: left={:?}, right={:?}, parent={:?}, time={:.3}, min={:?}, max={:?}", prefix, idx, node.left, node.right, node.parent, - node.tau, + node.time, node.min_list.to_vec(), node.max_list.to_vec() )?; @@ -87,30 +87,24 @@ impl MondrianTree { } } - fn create_leaf( - &mut self, - x: &Array1, - label_idx: usize, - parent: Option, - tau: F, - ) -> usize { + fn create_leaf(&mut self, x: &Array1, y: usize, parent: Option, time: F) -> usize { let num_labels = self.labels.len(); let feature_dim = self.features.len(); let mut node = Node:: { parent, - tau, // F::from(1e9).unwrap(), // Very large value + time, // F::from(1e9).unwrap(), // Very large value is_leaf: true, min_list: x.clone(), max_list: x.clone(), - delta: 0, - xi: F::zero(), + feature: 0, + threshold: F::zero(), left: None, right: None, stats: Stats::new(num_labels, feature_dim), }; - node.update_leaf(x, label_idx); + node.update_leaf(x, y); self.nodes.push(node); let node_idx = self.nodes.len() - 1; node_idx @@ -154,113 +148,121 @@ impl MondrianTree { fn compute_split_time( &self, - tau: F, + time: F, exp_sample: F, node_idx: usize, - label_idx: usize, + y: usize, extensions_sum: F, ) -> F { - if self.nodes[node_idx].is_dirac(label_idx) { - println!("extend_mondrian_block()/_go_downwards() - node: {node_idx} - extensions_sum: {:?} - all same class", extensions_sum); + if self.nodes[node_idx].is_dirac(y) { + println!( + "go_downwards() - node: {node_idx} - extensions_sum: {:?} - all same class", + extensions_sum + ); return F::zero(); } if extensions_sum > F::zero() { - let split_time = tau + exp_sample; + let split_time = time + exp_sample; // From River: If the node is a leaf we must split it if self.nodes[node_idx].is_leaf { - println!("extend_mondrian_block()/_go_downwards() - node: {node_idx} - extensions_sum: {:?} - split is_leaf", extensions_sum); + println!( + "go_downwards() - node: {node_idx} - extensions_sum: {:?} - split is_leaf", + extensions_sum + ); return split_time; } // From River: Otherwise we apply Mondrian process dark magic :) // 1. We get the creation time of the childs (left and right is the same) let child_idx = self.nodes[node_idx].left.unwrap(); - let child_time = self.nodes[child_idx].tau; + let child_time = self.nodes[child_idx].time; // 2. We check if splitting time occurs before child creation time if split_time < child_time { - println!("extend_mondrian_block()/_go_downwards() - node: {node_idx} - extensions_sum: {:?} - split mid tree", extensions_sum); + println!( + "go_downwards() - node: {node_idx} - extensions_sum: {:?} - split mid tree", + extensions_sum + ); // Go to next child???? return split_time; } - println!("extend_mondrian_block()/_go_downwards() - node: {node_idx} - extensions_sum: {:?} - not increased enough to split (mid node)", extensions_sum); + println!("go_downwards() - node: {node_idx} - extensions_sum: {:?} - not increased enough to split (mid node)", extensions_sum); } else { - println!("extend_mondrian_block()/_go_downwards() - node: {node_idx} - extensions_sum: {:?} - not outside box", extensions_sum); + println!( + "go_downwards() - node: {node_idx} - extensions_sum: {:?} - not outside box", + extensions_sum + ); } F::zero() } - fn extend_mondrian_block(&mut self, node_idx: usize, x: &Array1, label_idx: usize) -> usize { - // tau is 'node.time' in - let tau = self.nodes[node_idx].tau; - // TODO: 'node_min_list' and 'node_max_list' be accessible without cloning - let node_min_list = self.nodes[node_idx].min_list.clone(); - let node_max_list = self.nodes[node_idx].max_list.clone(); - - let e_min = (&node_min_list - x).mapv(|v| F::max(v, F::zero())); - let e_max = (x - &node_max_list).mapv(|v| F::max(v, F::zero())); - // Extensions sum: size of the box - let e_sum = &e_min + &e_max; - - // TODO: epsilon is used in nel215 code, but not River. Check if it's useful. - // let lambda = e_sum.sum() + F::epsilon(); - // In nel215 lambda is 'rate' - let lambda = e_sum.sum(); - let exp_dist = Exp::new(lambda.to_f32().unwrap()).unwrap(); - // 'exp_sample' is 'E' in nel215 code, 'T' in River - let exp_sample = F::from_f32(exp_dist.sample(&mut self.rng)).unwrap(); - // DEBUG: shadowing with Exp expected value - let exp_sample = F::one() / lambda; - - let split_time = self.compute_split_time(tau, exp_sample, node_idx, label_idx, e_sum.sum()); - // println!("extend_mondrian_block - post compute_split_time() - split_time: {:?}", split_time); + fn go_downwards(&mut self, node_idx: usize, x: &Array1, y: usize) -> usize { + let time = self.nodes[node_idx].time; + let node_min_list = &self.nodes[node_idx].min_list; + let node_max_list = &self.nodes[node_idx].max_list; + let extensions = { + let e_min = (node_min_list - x).mapv(|v| F::max(v, F::zero())); + let e_max = (x - node_max_list).mapv(|v| F::max(v, F::zero())); + &e_min + &e_max + }; + // 'T' in River + let exp_sample = { + let lambda = extensions.sum(); + let exp_dist = Exp::new(lambda.to_f32().unwrap()).unwrap(); + let exp_sample = F::from_f32(exp_dist.sample(&mut self.rng)).unwrap(); + // DEBUG: shadowing with Exp expected value + let exp_sample = F::one() / lambda; + exp_sample + }; + let split_time = self.compute_split_time(time, exp_sample, node_idx, y, extensions.sum()); if split_time > F::zero() { - // We split the current node: if leaf we add children, otherwise we add a new node along the path - let cumsum = e_sum - .iter() - .scan(F::zero(), |acc, &x| { - *acc = *acc + x; - Some(*acc) - }) - .collect::>(); - println!("e_sum: {:?}, cumsum: {:?}", e_sum.to_vec(), cumsum.to_vec()); - - let e_sample = F::from_f32(self.rng.gen::()).unwrap() * e_sum.sum(); - // DEBUG: shadowing with expected value - let e_sample = F::from_f32(0.5).unwrap() * e_sum.sum(); - let delta = cumsum.iter().position(|&val| val > e_sample).unwrap_or(0); - - let (lower_bound, upper_bound) = if x[delta] > node_min_list[delta] { + // Here split the current node: if leaf we add children, otherwise + // we add a new node along the path + let feature = { + let cumsum = extensions + .iter() + .scan(F::zero(), |acc, &x| { + *acc = *acc + x; + Some(*acc) + }) + .collect::>(); + let e_sample = F::from_f32(self.rng.gen::()).unwrap() * extensions.sum(); + // DEBUG: shadowing with expected value + let e_sample = F::from_f32(0.5).unwrap() * extensions.sum(); + cumsum.iter().position(|&val| val > e_sample).unwrap() + }; + + let (lower_bound, upper_bound) = if x[feature] > node_min_list[feature] { ( - node_min_list[delta].to_f32().unwrap(), - x[delta].to_f32().unwrap(), + node_min_list[feature].to_f32().unwrap(), + x[feature].to_f32().unwrap(), ) } else { ( - x[delta].to_f32().unwrap(), - node_max_list[delta].to_f32().unwrap(), + x[feature].to_f32().unwrap(), + node_max_list[feature].to_f32().unwrap(), ) }; - let xi = F::from_f32(self.rng.gen_range(lower_bound..upper_bound)).unwrap(); - // DEBUG: setting expected value - let xi = F::from_f32((lower_bound + upper_bound) / 2.0).unwrap(); + let threshold = F::from_f32(self.rng.gen_range(lower_bound..upper_bound)).unwrap(); + // DEBUG: split in the middle + let threshold = F::from_f32((lower_bound + upper_bound) / 2.0).unwrap(); - let mut min_list = node_min_list; - let mut max_list = node_max_list; + let mut min_list = node_min_list.clone(); + let mut max_list = node_max_list.clone(); min_list.zip_mut_with(x, |a, &b| *a = F::min(*a, b)); max_list.zip_mut_with(x, |a, &b| *a = F::max(*a, b)); // Create and push new parent node let parent_node = Node { parent: self.nodes[node_idx].parent, - tau: self.nodes[node_idx].tau, + time: self.nodes[node_idx].time, is_leaf: false, min_list, max_list, - delta, - xi, + feature, + threshold, left: None, right: None, stats: Stats::new(self.labels.len(), self.features.len()), @@ -268,10 +270,10 @@ impl MondrianTree { self.nodes.push(parent_node); let parent_idx = self.nodes.len() - 1; - let sibling_idx = self.create_leaf(x, label_idx, Some(parent_idx), split_time); + let sibling_idx = self.create_leaf(x, y, Some(parent_idx), split_time); // Set the children appropriately - if x[delta] <= xi { + if x[feature] <= threshold { // Grandpa: self.nodes[node_idx].parent // (new) Parent: parent_idx // Child: node_idx @@ -284,9 +286,9 @@ impl MondrianTree { } self.nodes[node_idx].parent = Some(parent_idx); - self.nodes[node_idx].tau = split_time; + self.nodes[node_idx].time = split_time; - self.update_internal(parent_idx); + self.update_downwards(parent_idx); return parent_idx; } else { @@ -300,29 +302,31 @@ impl MondrianTree { if node.is_leaf { // println!("else - updating leaf"); - node.update_leaf(x, label_idx); + node.update_leaf(x, y); } else { // println!("else - updating non-leaf"); - if x[node.delta] <= node.xi { + if x[node.feature] <= node.threshold { let node_left = node.left.unwrap(); - let node_left_new = Some(self.extend_mondrian_block(node_left, x, label_idx)); + let node_left_new = Some(self.go_downwards(node_left, x, y)); let node = &mut self.nodes[node_idx]; node.left = node_left_new; } else { let node_right = node.right.unwrap(); - let node_right_new = Some(self.extend_mondrian_block(node_right, x, label_idx)); + let node_right_new = Some(self.go_downwards(node_right, x, y)); let node = &mut self.nodes[node_idx]; node.right = node_right_new; }; - self.update_internal(node_idx); + self.update_downwards(node_idx); } return node_idx; } } /// Update 'node stats' by merging 'right child stats + left child stats'. - fn update_internal(&mut self, node_idx: usize) { - // In nel215 code update_internal is not called for the children, check if it's needed + fn update_downwards(&mut self, node_idx: usize) { + // From River: + // Ranges (min_list, max_list) are already up to date + let node = &self.nodes[node_idx]; let left_s = &self.nodes[node.left.unwrap()].stats; let right_s = &self.nodes[node.right.unwrap()].stats; @@ -336,11 +340,10 @@ impl MondrianTree { /// working only on one. /// /// Function in River/LightRiver: "learn_one()" - pub fn partial_fit(&mut self, x: &Array1, y: &String) { - let label_idx = self.labels.clone().iter().position(|l| l == y).unwrap(); + pub fn partial_fit(&mut self, x: &Array1, y: usize) { self.root = match self.root { - None => Some(self.create_leaf(x, label_idx, None, F::zero())), - Some(root_idx) => Some(self.extend_mondrian_block(root_idx, x, label_idx)), + None => Some(self.create_leaf(x, y, None, F::zero())), + Some(root_idx) => Some(self.go_downwards(root_idx, x, y)), }; println!("partial_fit() tree post {}", self); } @@ -349,14 +352,14 @@ impl MondrianTree { unimplemented!("Make the program first work with 'partial_fit', then implement this") } - /// Function in River: "_go_downwards()" + /// Function in River: "go_downwards()" /// /// Recursive function to predict probabilities. fn predict(&self, x: &Array1, node_idx: usize, p_not_separated_yet: F) -> Array1 { let node = &self.nodes[node_idx]; - // Step 1: Calculate the time delta from the parent node. - let d = node.tau - self.get_parent_tau(node_idx); + // Step 1: Calculate the time feature from the parent node. + let d = node.time - self.get_parent_time(node_idx); // Step 2: If 'x' is outside the box, calculate distance of 'x' from the box let dist_max = (x - &node.max_list).mapv(|v| F::max(v, F::zero())); @@ -386,7 +389,7 @@ impl MondrianTree { // println!("predict() - ischild - res: {:?}, res2: {:?}", res.to_vec(), res2.to_vec()); return res + res2; } else { - let child_idx = if x[node.delta] <= node.xi { + let child_idx = if x[node.feature] <= node.threshold { node.left } else { node.right @@ -402,10 +405,10 @@ impl MondrianTree { unimplemented!() } - pub fn get_parent_tau(&self, node_idx: usize) -> F { - // If node is root, tau is 0 + pub fn get_parent_time(&self, node_idx: usize) -> F { + // If node is root, time is 0 match self.nodes[node_idx].parent { - Some(parent_idx) => self.nodes[parent_idx].tau, + Some(parent_idx) => self.nodes[parent_idx].time, None => F::from_f32(0.0).unwrap(), } }