diff --git a/examples/classification/synthetic.rs b/examples/classification/synthetic.rs index 723eb23..27af746 100644 --- a/examples/classification/synthetic.rs +++ b/examples/classification/synthetic.rs @@ -1,4 +1,4 @@ -use light_river::classification::mondrian_forest::MondrianForest; +use light_river::classification::mondrian_forest::MondrianForestClassifier; use light_river::common::ClassifierTarget; use light_river::datasets::synthetic::Synthetic; @@ -35,7 +35,6 @@ fn get_labels(transactions: IterCsv) -> Vec { fn main() { let now = Instant::now(); - let window_size: usize = 1000; let n_trees: usize = 1; let transactions_f = Synthetic::load_data(); @@ -44,7 +43,8 @@ fn main() { let transactions_c = Synthetic::load_data(); let labels = get_labels(transactions_c); println!("labels: {labels:?}, features: {features:?}"); - let mut mf: MondrianForest = MondrianForest::new(window_size, n_trees, &features, &labels); + let mut mf: MondrianForestClassifier = + MondrianForestClassifier::new(n_trees, features.len(), labels.len()); let mut score_total = 0.0; let transactions = Synthetic::load_data(); @@ -76,9 +76,7 @@ fn main() { score_total / idx.to_f32().unwrap() ); } - // if idx == 10 { - // panic!("stop"); - // } + println!("=M=1 partial_fit {x_ord}"); mf.partial_fit(&x_ord, y); } diff --git a/python_baseline_synthetic.py b/python_baseline_synthetic.py index b1af1ac..1e1d742 100644 --- a/python_baseline_synthetic.py +++ b/python_baseline_synthetic.py @@ -9,7 +9,8 @@ use_aggregation=False, ) -df = pd.read_csv("/home/robotics/light-river/syntetic_dataset_int.csv") +df = pd.read_csv("/home/robotics/light-river/syntetic_dataset.csv") +# df = pd.read_csv("/home/robotics/light-river/syntetic_dataset_paper.csv") X = df[["feature_1", "feature_2"]] y = df["label"].values @@ -19,23 +20,29 @@ import numpy as np + def count_nodes(node): if isinstance(node, MondrianLeafClassifier): return 1 elif isinstance(node, MondrianBranchClassifier): return 1 + np.sum([count_nodes(c) for c in node.children]) -for i, ((_, x), true_label) in enumerate( - zip(X.iterrows(), y), 1 -): # start counting from 1 + +# for i, ((_, x), true_label) in enumerate(zip(X.iterrows(), y), 1): +for i, ((_, x), true_label) in enumerate(zip(X.iterrows(), y)): pred_proba = mf.predict_proba_one(x.to_dict()) if pred_proba: predicted_label = max(pred_proba, key=pred_proba.get) if predicted_label == true_label: score_total += 1 + + print( + f"{score_total} / {i} = {score_total/i}, nodes:", + count_nodes(mf.data[0]._root), + ) + + print("=M=1 x:", list(x.to_dict().values())) mf.learn_one(x.to_dict(), true_label) - if i > 0: - print(f"{score_total} / {i} = {score_total/i}, nodes:", count_nodes(mf.data[0]._root)) # if counter > 10: # raise () diff --git a/src/classification/mondrian_forest.rs b/src/classification/mondrian_forest.rs index 50a256c..03d42a9 100644 --- a/src/classification/mondrian_forest.rs +++ b/src/classification/mondrian_forest.rs @@ -1,5 +1,5 @@ use crate::classification::alias::FType; -use crate::classification::mondrian_tree::MondrianTree; +use crate::classification::mondrian_tree::MondrianTreeClassifier; use ndarray::Array1; @@ -10,21 +10,15 @@ use std::collections::HashMap; use std::usize; -pub struct MondrianForest { - trees: Vec>, - labels: Vec, +pub struct MondrianForestClassifier { + trees: Vec>, + n_labels: usize, } -impl MondrianForest { - pub fn new( - window_size: usize, - n_trees: usize, - features: &Vec, - labels: &Vec, - ) -> Self { - let tree_default = MondrianTree::new(window_size, features, labels); +impl MondrianForestClassifier { + pub fn new(n_trees: usize, n_features: usize, n_labels: usize) -> Self { + let tree_default = MondrianTreeClassifier::new(n_features, n_labels); let trees = vec![tree_default; n_trees]; - let labels = labels.clone(); - MondrianForest:: { trees, labels } + MondrianForestClassifier:: { trees, n_labels } } /// Note: In Nel215 codebase should work on multiple records, here it's @@ -37,20 +31,8 @@ impl MondrianForest { } } - pub fn fit(x: &HashMap, y: &String) { - unimplemented!() - } - pub fn predict_proba(&self, x: &Array1) -> Array1 { - // scores shape in nel215: (n_trees, n_samples, n_labels) - // scores shape here: (n_trees, n_labels). We are doing one shot learning. - let n_trees = self.trees.len(); - let n_labels = self.labels.len(); - - // Initialize an accumulator array for summing probabilities from each tree - let mut total_probs = Array1::::zeros(n_labels); - - // Sum probabilities from each tree + let mut tot_probs = Array1::::zeros(self.n_labels); for tree in &self.trees { let probs = tree.predict_proba(x); assert!( @@ -58,13 +40,10 @@ impl MondrianForest { "Probability should not be NaN. Found: {:?}.", probs.to_vec() ); - total_probs += &probs; + tot_probs += &probs; } - - // Average the probabilities by the number of trees - total_probs /= F::from_usize(n_trees).unwrap(); - - total_probs + tot_probs /= F::from_usize(self.trees.len()).unwrap(); + tot_probs } pub fn score(&mut self, x: &Array1, y: usize) -> F { diff --git a/src/classification/mondrian_node.rs b/src/classification/mondrian_node.rs index 2b1f40c..c3490f1 100644 --- a/src/classification/mondrian_node.rs +++ b/src/classification/mondrian_node.rs @@ -13,9 +13,8 @@ use std::usize; /// Node struct #[derive(Clone)] pub struct Node { - // Change 'Rc' to 'Weak' - pub parent: Option, // Option>>>, - pub time: F, // Time: how much I increased the size of the box + pub parent: Option, + pub time: F, // Time: how much I increased the size of the box pub is_leaf: bool, pub min_list: Array1, // Lists representing the minimum and maximum values of the data points contained in the current node pub max_list: Array1, @@ -32,9 +31,6 @@ impl Node { pub fn update_internal(&self, left_s: &Stats, right_s: &Stats) -> Stats { left_s.merge(right_s) } - pub fn get_parent_time(&self) -> F { - panic!("Implemented in 'mondrian_tree' instead of 'mondrian_node'") - } /// Check if all the labels are the same in the node. /// e.g. y=2, stats.counts=[0, 1, 10] -> False /// e.g. y=2, stats.counts=[0, 0, 10] -> True @@ -52,7 +48,7 @@ pub struct Stats { pub sums: Array2, pub sq_sums: Array2, pub counts: Array1, - num_labels: usize, + n_labels: usize, } impl fmt::Display for Stats { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { @@ -75,12 +71,12 @@ impl fmt::Display for Stats { } } impl Stats { - pub fn new(num_labels: usize, feature_dim: usize) -> Self { + pub fn new(n_labels: usize, n_features: usize) -> Self { Stats { - sums: Array2::zeros((num_labels, feature_dim)), - sq_sums: Array2::zeros((num_labels, feature_dim)), - counts: Array1::zeros(num_labels), - num_labels, + sums: Array2::zeros((n_labels, n_features)), + sq_sums: Array2::zeros((n_labels, n_features)), + counts: Array1::zeros(n_labels), + n_labels, } } pub fn create_result(&self, x: &Array1, w: F) -> Array1 { @@ -88,7 +84,7 @@ impl Stats { probs * w } pub fn add(&mut self, x: &Array1, y: usize) { - // Same as: self.sums[label] += x; + // Same as: self.sums[y] += x; self.sums.row_mut(y).zip_mut_with(&x, |a, &b| *a += b); // Same as: self.sq_sums[y] += x*x; @@ -105,7 +101,7 @@ impl Stats { sums: self.sums.clone() + &s.sums, sq_sums: self.sq_sums.clone() + &s.sq_sums, counts: self.counts.clone() + &s.counts, - num_labels: self.num_labels, + n_labels: self.n_labels, } } /// Return probabilities of sample 'x' belonging to each class. @@ -134,7 +130,7 @@ impl Stats { /// assert!((probs.clone().sum() - 1.0f32).abs() < 1e-4, "Sum of probabilities should be 1"); /// ``` pub fn predict_proba(&self, x: &Array1) -> Array1 { - let mut probs = Array1::zeros(self.num_labels); + let mut probs = Array1::zeros(self.n_labels); let mut sum_prob = F::zero(); // println!("predict_proba() - start {}", self); diff --git a/src/classification/mondrian_tree.rs b/src/classification/mondrian_tree.rs index f853dca..e02b62c 100644 --- a/src/classification/mondrian_tree.rs +++ b/src/classification/mondrian_tree.rs @@ -14,24 +14,21 @@ use std::fmt; use std::usize; #[derive(Clone)] -pub struct MondrianTree { - window_size: usize, - features: Vec, - labels: Vec, +pub struct MondrianTreeClassifier { + n_features: usize, + n_labels: usize, rng: ThreadRng, - first_learn: bool, nodes: Vec>, root: Option, } -impl fmt::Display for MondrianTree { +impl fmt::Display for MondrianTreeClassifier { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - writeln!(f, "\nā”Œ MondrianTree")?; + writeln!(f, "\nā”Œ MondrianTreeClassifier")?; self.recursive_repr(self.root, f, "ā”‚ ") } } - -impl MondrianTree { +impl MondrianTreeClassifier { /// Helper method to recursively format node details. fn recursive_repr( &self, @@ -63,23 +60,18 @@ impl MondrianTree { } } -impl MondrianTree { - pub fn new(window_size: usize, features: &Vec, labels: &Vec) -> Self { - MondrianTree:: { - window_size, - features: features.clone(), - labels: labels.clone(), +impl MondrianTreeClassifier { + pub fn new(n_features: usize, n_labels: usize) -> Self { + MondrianTreeClassifier:: { + n_features, + n_labels, rng: rand::thread_rng(), - first_learn: false, nodes: vec![], root: None, } } fn create_leaf(&mut self, x: &Array1, y: usize, parent: Option, time: F) -> usize { - let num_labels = self.labels.len(); - let feature_dim = self.features.len(); - let mut node = Node:: { parent, time, // F::from(1e9).unwrap(), // Very large value @@ -90,7 +82,7 @@ impl MondrianTree { threshold: F::zero(), left: None, right: None, - stats: Stats::new(num_labels, feature_dim), + stats: Stats::new(self.n_labels, self.n_features), }; node.update_leaf(x, y); @@ -99,14 +91,6 @@ impl MondrianTree { node_idx } - /// Note: In Nel215 codebase should work on multiple records, here it's - /// working only on one, so it's the same as "predict()". - pub fn predict_proba(&self, x: &Array1) -> Array1 { - // println!("predict_proba() - tree size: {}", self.nodes.len()); - // self.test_tree(); - self.predict(x, self.root.unwrap(), F::one()) - } - fn test_tree(&self) { // TODO: move to test for node_idx in 0..self.nodes.len() { @@ -253,7 +237,7 @@ impl MondrianTree { threshold, left: None, right: None, - stats: Stats::new(self.labels.len(), self.features.len()), + stats: Stats::new(self.n_labels, self.n_features), }; self.nodes.push(parent_node); @@ -280,7 +264,7 @@ impl MondrianTree { return parent_idx; } else { - // No split, we just update the node and go to the next one + // No split, just update the node. If leaf add to count, else call recursively next child node. let node = &mut self.nodes[node_idx]; // println!("pre - node: {:?}, node range: ({:?}-{:?}), x: {:?}", node_idx, node.min_list.to_vec(), node.max_list.to_vec(), x.to_vec()); @@ -304,7 +288,6 @@ impl MondrianTree { let node = &mut self.nodes[node_idx]; node.right = node_right_new; }; - self.update_downwards(node_idx); } return node_idx; } @@ -340,6 +323,14 @@ impl MondrianTree { unimplemented!("Make the program first work with 'partial_fit', then implement this") } + /// Note: In Nel215 codebase should work on multiple records, here it's + /// working only on one, so it's the same as "predict()". + pub fn predict_proba(&self, x: &Array1) -> Array1 { + // println!("predict_proba() - tree size: {}", self.nodes.len()); + // self.test_tree(); + self.predict(x, self.root.unwrap(), F::one()) + } + fn predict(&self, x: &Array1, node_idx: usize, p_not_separated_yet: F) -> Array1 { let node = &self.nodes[node_idx]; @@ -348,7 +339,6 @@ impl MondrianTree { // d (time delta with parent): more dist with parent, more prob of splitting let p = { let d = node.time - self.get_parent_time(node_idx); - // If 'x' is outside the box, calculate distance of 'x' from the box let dist_max = (x - &node.max_list).mapv(|v| F::max(v, F::zero())); let dist_min = (&node.min_list - x).mapv(|v| F::max(v, F::zero())); let eta = dist_min.sum() + dist_max.sum(); @@ -373,11 +363,7 @@ impl MondrianTree { } } - fn get_params(&self) { - unimplemented!() - } - - pub fn get_parent_time(&self, node_idx: usize) -> F { + fn get_parent_time(&self, node_idx: usize) -> F { // If node is root, time is 0 match self.nodes[node_idx].parent { Some(parent_idx) => self.nodes[parent_idx].time,