Skip to content

Commit

Permalink
Rename MondrianForest to MondrianForestClassifier
Browse files Browse the repository at this point in the history
  • Loading branch information
MarcoDiFrancesco committed May 6, 2024
1 parent da23d14 commit 85030ad
Show file tree
Hide file tree
Showing 5 changed files with 63 additions and 97 deletions.
10 changes: 4 additions & 6 deletions examples/classification/synthetic.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
use light_river::classification::mondrian_forest::MondrianForest;
use light_river::classification::mondrian_forest::MondrianForestClassifier;

use light_river::common::ClassifierTarget;
use light_river::datasets::synthetic::Synthetic;
Expand Down Expand Up @@ -35,7 +35,6 @@ fn get_labels(transactions: IterCsv<f32, File>) -> Vec<String> {

fn main() {
let now = Instant::now();
let window_size: usize = 1000;
let n_trees: usize = 1;

let transactions_f = Synthetic::load_data();
Expand All @@ -44,7 +43,8 @@ fn main() {
let transactions_c = Synthetic::load_data();
let labels = get_labels(transactions_c);
println!("labels: {labels:?}, features: {features:?}");
let mut mf: MondrianForest<f32> = MondrianForest::new(window_size, n_trees, &features, &labels);
let mut mf: MondrianForestClassifier<f32> =
MondrianForestClassifier::new(n_trees, features.len(), labels.len());
let mut score_total = 0.0;

let transactions = Synthetic::load_data();
Expand Down Expand Up @@ -76,9 +76,7 @@ fn main() {
score_total / idx.to_f32().unwrap()
);
}
// if idx == 10 {
// panic!("stop");
// }

println!("=M=1 partial_fit {x_ord}");
mf.partial_fit(&x_ord, y);
}
Expand Down
19 changes: 13 additions & 6 deletions python_baseline_synthetic.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,8 @@
use_aggregation=False,
)

df = pd.read_csv("/home/robotics/light-river/syntetic_dataset_int.csv")
df = pd.read_csv("/home/robotics/light-river/syntetic_dataset.csv")
# df = pd.read_csv("/home/robotics/light-river/syntetic_dataset_paper.csv")
X = df[["feature_1", "feature_2"]]
y = df["label"].values

Expand All @@ -19,23 +20,29 @@

import numpy as np


def count_nodes(node):
    """Return the number of nodes in the subtree rooted at ``node``.

    A ``MondrianLeafClassifier`` counts as one node. A
    ``MondrianBranchClassifier`` counts as one node plus the sizes of all
    subtrees found in ``node.children``. Any other object falls through and
    yields ``None``.
    """
    if isinstance(node, MondrianLeafClassifier):
        return 1
    if isinstance(node, MondrianBranchClassifier):
        # Recurse into every child and add one for the branch itself.
        subtree_sizes = [count_nodes(child) for child in node.children]
        return 1 + np.sum(subtree_sizes)
    return None

for i, ((_, x), true_label) in enumerate(
zip(X.iterrows(), y), 1
): # start counting from 1

# for i, ((_, x), true_label) in enumerate(zip(X.iterrows(), y), 1):
for i, ((_, x), true_label) in enumerate(zip(X.iterrows(), y)):
pred_proba = mf.predict_proba_one(x.to_dict())
if pred_proba:
predicted_label = max(pred_proba, key=pred_proba.get)
if predicted_label == true_label:
score_total += 1

print(
f"{score_total} / {i} = {score_total/i}, nodes:",
count_nodes(mf.data[0]._root),
)

print("=M=1 x:", list(x.to_dict().values()))
mf.learn_one(x.to_dict(), true_label)
if i > 0:
print(f"{score_total} / {i} = {score_total/i}, nodes:", count_nodes(mf.data[0]._root))

# if counter > 10:
# raise ()
Expand Down
45 changes: 12 additions & 33 deletions src/classification/mondrian_forest.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
use crate::classification::alias::FType;
use crate::classification::mondrian_tree::MondrianTree;
use crate::classification::mondrian_tree::MondrianTreeClassifier;

use ndarray::Array1;

Expand All @@ -10,21 +10,15 @@ use std::collections::HashMap;

use std::usize;

pub struct MondrianForest<F: FType> {
trees: Vec<MondrianTree<F>>,
labels: Vec<String>,
pub struct MondrianForestClassifier<F: FType> {
trees: Vec<MondrianTreeClassifier<F>>,
n_labels: usize,
}
impl<F: FType> MondrianForest<F> {
pub fn new(
window_size: usize,
n_trees: usize,
features: &Vec<String>,
labels: &Vec<String>,
) -> Self {
let tree_default = MondrianTree::new(window_size, features, labels);
impl<F: FType> MondrianForestClassifier<F> {
pub fn new(n_trees: usize, n_features: usize, n_labels: usize) -> Self {
let tree_default = MondrianTreeClassifier::new(n_features, n_labels);
let trees = vec![tree_default; n_trees];
let labels = labels.clone();
MondrianForest::<F> { trees, labels }
MondrianForestClassifier::<F> { trees, n_labels }
}

/// Note: In Nel215 codebase should work on multiple records, here it's
Expand All @@ -37,34 +31,19 @@ impl<F: FType> MondrianForest<F> {
}
}

pub fn fit(x: &HashMap<String, f32>, y: &String) {
unimplemented!()
}

pub fn predict_proba(&self, x: &Array1<F>) -> Array1<F> {
// scores shape in nel215: (n_trees, n_samples, n_labels)
// scores shape here: (n_trees, n_labels), because a single sample is processed at a time.
let n_trees = self.trees.len();
let n_labels = self.labels.len();

// Initialize an accumulator array for summing probabilities from each tree
let mut total_probs = Array1::<F>::zeros(n_labels);

// Sum probabilities from each tree
let mut tot_probs = Array1::<F>::zeros(self.n_labels);
for tree in &self.trees {
let probs = tree.predict_proba(x);
assert!(
!probs.iter().any(|&x| x.is_nan()),
"Probability should not be NaN. Found: {:?}.",
probs.to_vec()
);
total_probs += &probs;
tot_probs += &probs;
}

// Average the probabilities by the number of trees
total_probs /= F::from_usize(n_trees).unwrap();

total_probs
tot_probs /= F::from_usize(self.trees.len()).unwrap();
tot_probs
}

pub fn score(&mut self, x: &Array1<F>, y: usize) -> F {
Expand Down
26 changes: 11 additions & 15 deletions src/classification/mondrian_node.rs
Original file line number Diff line number Diff line change
Expand Up @@ -13,9 +13,8 @@ use std::usize;
/// Node struct
#[derive(Clone)]
pub struct Node<F> {
// Change 'Rc' to 'Weak'
pub parent: Option<usize>, // Option<Rc<RefCell<Node<F>>>>,
pub time: F, // Time: how much I increased the size of the box
pub parent: Option<usize>,
pub time: F, // Time: how much I increased the size of the box
pub is_leaf: bool,
pub min_list: Array1<F>, // Lists representing the minimum and maximum values of the data points contained in the current node
pub max_list: Array1<F>,
Expand All @@ -32,9 +31,6 @@ impl<F: FType> Node<F> {
pub fn update_internal(&self, left_s: &Stats<F>, right_s: &Stats<F>) -> Stats<F> {
left_s.merge(right_s)
}
pub fn get_parent_time(&self) -> F {
panic!("Implemented in 'mondrian_tree' instead of 'mondrian_node'")
}
/// Check if all the labels are the same in the node.
/// e.g. y=2, stats.counts=[0, 1, 10] -> False
/// e.g. y=2, stats.counts=[0, 0, 10] -> True
Expand All @@ -52,7 +48,7 @@ pub struct Stats<F> {
pub sums: Array2<F>,
pub sq_sums: Array2<F>,
pub counts: Array1<usize>,
num_labels: usize,
n_labels: usize,
}
impl<F: FType + fmt::Display> fmt::Display for Stats<F> {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
Expand All @@ -75,20 +71,20 @@ impl<F: FType + fmt::Display> fmt::Display for Stats<F> {
}
}
impl<F: FType> Stats<F> {
pub fn new(num_labels: usize, feature_dim: usize) -> Self {
pub fn new(n_labels: usize, n_features: usize) -> Self {
Stats {
sums: Array2::zeros((num_labels, feature_dim)),
sq_sums: Array2::zeros((num_labels, feature_dim)),
counts: Array1::zeros(num_labels),
num_labels,
sums: Array2::zeros((n_labels, n_features)),
sq_sums: Array2::zeros((n_labels, n_features)),
counts: Array1::zeros(n_labels),
n_labels,
}
}
/// Return the class probabilities predicted for `x`, scaled by the weight `w`.
///
/// Calls `self.predict_proba(x)` and multiplies the resulting array by `w`.
pub fn create_result(&self, x: &Array1<F>, w: F) -> Array1<F> {
    self.predict_proba(x) * w
}
pub fn add(&mut self, x: &Array1<F>, y: usize) {
// Same as: self.sums[label] += x;
// Same as: self.sums[y] += x;
self.sums.row_mut(y).zip_mut_with(&x, |a, &b| *a += b);

// Same as: self.sq_sums[y] += x*x;
Expand All @@ -105,7 +101,7 @@ impl<F: FType> Stats<F> {
sums: self.sums.clone() + &s.sums,
sq_sums: self.sq_sums.clone() + &s.sq_sums,
counts: self.counts.clone() + &s.counts,
num_labels: self.num_labels,
n_labels: self.n_labels,
}
}
/// Return probabilities of sample 'x' belonging to each class.
Expand Down Expand Up @@ -134,7 +130,7 @@ impl<F: FType> Stats<F> {
/// assert!((probs.clone().sum() - 1.0f32).abs() < 1e-4, "Sum of probabilities should be 1");
/// ```
pub fn predict_proba(&self, x: &Array1<F>) -> Array1<F> {
let mut probs = Array1::zeros(self.num_labels);
let mut probs = Array1::zeros(self.n_labels);
let mut sum_prob = F::zero();

// println!("predict_proba() - start {}", self);
Expand Down
60 changes: 23 additions & 37 deletions src/classification/mondrian_tree.rs
Original file line number Diff line number Diff line change
Expand Up @@ -14,24 +14,21 @@ use std::fmt;
use std::usize;

#[derive(Clone)]
pub struct MondrianTree<F: FType> {
window_size: usize,
features: Vec<String>,
labels: Vec<String>,
pub struct MondrianTreeClassifier<F: FType> {
n_features: usize,
n_labels: usize,
rng: ThreadRng,
first_learn: bool,
nodes: Vec<Node<F>>,
root: Option<usize>,
}

impl<F: FType + fmt::Display> fmt::Display for MondrianTree<F> {
impl<F: FType + fmt::Display> fmt::Display for MondrianTreeClassifier<F> {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
writeln!(f, "\nMondrianTree")?;
writeln!(f, "\nMondrianTreeClassifier")?;
self.recursive_repr(self.root, f, "│ ")
}
}

impl<F: FType + fmt::Display> MondrianTree<F> {
impl<F: FType + fmt::Display> MondrianTreeClassifier<F> {
/// Helper method to recursively format node details.
fn recursive_repr(
&self,
Expand Down Expand Up @@ -63,23 +60,18 @@ impl<F: FType + fmt::Display> MondrianTree<F> {
}
}

impl<F: FType> MondrianTree<F> {
pub fn new(window_size: usize, features: &Vec<String>, labels: &Vec<String>) -> Self {
MondrianTree::<F> {
window_size,
features: features.clone(),
labels: labels.clone(),
impl<F: FType> MondrianTreeClassifier<F> {
pub fn new(n_features: usize, n_labels: usize) -> Self {
MondrianTreeClassifier::<F> {
n_features,
n_labels,
rng: rand::thread_rng(),
first_learn: false,
nodes: vec![],
root: None,
}
}

fn create_leaf(&mut self, x: &Array1<F>, y: usize, parent: Option<usize>, time: F) -> usize {
let num_labels = self.labels.len();
let feature_dim = self.features.len();

let mut node = Node::<F> {
parent,
time, // F::from(1e9).unwrap(), // Very large value
Expand All @@ -90,7 +82,7 @@ impl<F: FType> MondrianTree<F> {
threshold: F::zero(),
left: None,
right: None,
stats: Stats::new(num_labels, feature_dim),
stats: Stats::new(self.n_labels, self.n_features),
};

node.update_leaf(x, y);
Expand All @@ -99,14 +91,6 @@ impl<F: FType> MondrianTree<F> {
node_idx
}

/// Note: in the Nel215 codebase this method works on multiple records; here it
/// works on a single record, so it is the same as "predict()".
pub fn predict_proba(&self, x: &Array1<F>) -> Array1<F> {
// println!("predict_proba() - tree size: {}", self.nodes.len());
// self.test_tree();
self.predict(x, self.root.unwrap(), F::one())
}

fn test_tree(&self) {
// TODO: move to test
for node_idx in 0..self.nodes.len() {
Expand Down Expand Up @@ -253,7 +237,7 @@ impl<F: FType> MondrianTree<F> {
threshold,
left: None,
right: None,
stats: Stats::new(self.labels.len(), self.features.len()),
stats: Stats::new(self.n_labels, self.n_features),
};

self.nodes.push(parent_node);
Expand All @@ -280,7 +264,7 @@ impl<F: FType> MondrianTree<F> {

return parent_idx;
} else {
// No split, we just update the node and go to the next one
// No split: just update the node. If it is a leaf, add to its counts; otherwise recurse into the next child node.

let node = &mut self.nodes[node_idx];
// println!("pre - node: {:?}, node range: ({:?}-{:?}), x: {:?}", node_idx, node.min_list.to_vec(), node.max_list.to_vec(), x.to_vec());
Expand All @@ -304,7 +288,6 @@ impl<F: FType> MondrianTree<F> {
let node = &mut self.nodes[node_idx];
node.right = node_right_new;
};
self.update_downwards(node_idx);
}
return node_idx;
}
Expand Down Expand Up @@ -340,6 +323,14 @@ impl<F: FType> MondrianTree<F> {
unimplemented!("Make the program first work with 'partial_fit', then implement this")
}

/// Note: in the Nel215 codebase this method works on multiple records; here it
/// works on a single record, so it is the same as "predict()".
pub fn predict_proba(&self, x: &Array1<F>) -> Array1<F> {
// println!("predict_proba() - tree size: {}", self.nodes.len());
// self.test_tree();
self.predict(x, self.root.unwrap(), F::one())
}

fn predict(&self, x: &Array1<F>, node_idx: usize, p_not_separated_yet: F) -> Array1<F> {
let node = &self.nodes[node_idx];

Expand All @@ -348,7 +339,6 @@ impl<F: FType> MondrianTree<F> {
// d (time delta with parent): the larger the time distance from the parent, the higher the probability of splitting
let p = {
let d = node.time - self.get_parent_time(node_idx);
// If 'x' is outside the box, calculate distance of 'x' from the box
let dist_max = (x - &node.max_list).mapv(|v| F::max(v, F::zero()));
let dist_min = (&node.min_list - x).mapv(|v| F::max(v, F::zero()));
let eta = dist_min.sum() + dist_max.sum();
Expand All @@ -373,11 +363,7 @@ impl<F: FType> MondrianTree<F> {
}
}

fn get_params(&self) {
unimplemented!()
}

pub fn get_parent_time(&self, node_idx: usize) -> F {
fn get_parent_time(&self, node_idx: usize) -> F {
// If node is root, time is 0
match self.nodes[node_idx].parent {
Some(parent_idx) => self.nodes[parent_idx].time,
Expand Down

0 comments on commit 85030ad

Please sign in to comment.