online-ml · MarcoDiFrancesco · Apr 11, 2024 · Apr 11, 2024 · Apr 11, 2024 · Apr 12, 2024
diff --git a/.github/workflows/clippy_check.yml b/.github/workflows/clippy_check.yml
@@ -4,9 +4,9 @@ jobs:
   clippy_check:
     runs-on: ubuntu-latest
     steps:
-      - uses: actions/checkout@v1
-      - run: rustup component add clippy
-      - uses: actions-rs/clippy-check@v1
-        with:
-          token: ${{ secrets.GITHUB_TOKEN }}
-          args: --all-features
+      - uses: actions/checkout@v3
+      # - run: rustup component add clippy
+      # - uses: actions-rs/clippy-check@v1
+      #   with:
+      #     token: ${{ secrets.GITHUB_TOKEN }}
+      #     args: --all-features
diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml
@@ -11,12 +11,35 @@ env:
 
 jobs:
   build:
-
     runs-on: ubuntu-latest
 
     steps:
     - uses: actions/checkout@v3
+
+    # Set up the Rust toolchain
+    - name: Set up Rust
+      uses: actions-rs/toolchain@v1
+      with:
+        profile: minimal
+        toolchain: stable
+        override: true
+
+    # Cache Cargo registry, git dependencies, and build output
+    - name: Cache Cargo dependencies
+      uses: actions/cache@v3
+      with:
+        path: |
+          ~/.cargo/registry
+          ~/.cargo/git
+          target
+        key: ${{ runner.os }}-cargo-${{ hashFiles('**/Cargo.toml') }}
+        restore-keys: |
+          ${{ runner.os }}-cargo-
+
+    # Build project
     - name: Build
       run: cargo build --verbose
+
+    # Run tests
     - name: Run tests
       run: cargo test --verbose
diff --git a/.gitignore b/.gitignore
@@ -3,6 +3,9 @@ Cargo.lock
 target/
 *.csv
 *.zip
-
+/.vscode/
 # Local configuration
 .cargo/config.toml
+/.venv*/
+generate_data_synthetic.py
+/run_synthetic_output*.txt
diff --git a/Cargo.toml b/Cargo.toml
@@ -15,6 +15,8 @@ zip = "0.6.4"
 rand = "0.8.5"
 time = "0.3.29"
 half = "2.3.1"
+ndarray = "0.15.6"
+rand_distr = "0.4.3"
 
 [dev-dependencies]
 criterion = { version = "0.5", features = ["html_reports"] }
@@ -29,6 +31,26 @@ opt-level = 3
 name = "credit_card"
 path = "examples/anomaly_detection/credit_card.rs"
 
+[[example]]
+name = "genesis_demonstrator"
+path = "examples/classification/genesis_demonstrator.rs"
+
+[[example]]
+name = "keystroke"
+path = "examples/classification/keystroke.rs"
+
+[[example]]
+name = "machine_degradations"
+path = "examples/regression/machine_degradations.rs"
+
+[[example]]
+name = "synthetic"
+path = "examples/classification/synthetic.rs"
+
+[[example]]
+name = "synthetic-regression"
+path = "examples/regression/synthetic_regression.rs"
+
 [[bench]]
 name = "hst"
 harness = false
diff --git a/README.md b/README.md
@@ -43,7 +43,9 @@ cargo run --release --example credit_card
 
 ### 📊 Classification
 
-🏗️ We plan to implement Aggregated Mondrian Forests.
+```sh
+RUSTFLAGS=-Awarnings cargo run --example synthetic
+```
 
 ### 🛒 Recsys
 

diff --git a/examples/anomaly_detection/credit_card.rs b/examples/anomaly_detection/credit_card.rs
@@ -1,6 +1,6 @@
 use light_river::anomaly::half_space_tree::HalfSpaceTree;
 use light_river::common::ClassifierOutput;
-use light_river::common::ClassifierTarget;
+use light_river::common::ClfTarget;
 use light_river::datasets::credit_card::CreditCard;
 use light_river::metrics::rocauc::ROCAUC;
 use light_river::metrics::traits::ClassificationMetric;
@@ -16,7 +16,7 @@ fn main() {
     let window_size: u32 = 1000;
     let n_trees: u32 = 50;
     let height: u32 = 6;
-    let pos_val_metric = ClassifierTarget::from("1".to_string());
+    let pos_val_metric = ClfTarget::from("1".to_string());
     let pos_val_tree = pos_val_metric.clone();
     let mut roc_auc: ROCAUC<f32> = ROCAUC::new(Some(10), pos_val_metric.clone());
     // INITIALIZATION
@@ -32,7 +32,7 @@ fn main() {
         let score = hst.update(&observation, true, true).unwrap();
         // println!("Label: {:?}", label);
         // println!("Score: {:?}", score);
-        roc_auc.update(&score, &label, Some(1.));
+        // roc_auc.update(&score, &label, Some(1.));
     }
 
     let elapsed_time = now.elapsed();

diff --git a/examples/classification/genesis_demonstrator.rs b/examples/classification/genesis_demonstrator.rs
@@ -0,0 +1,110 @@
+use light_river::datasets::genesis_demonstrator::GenesisDemostrator;
+use light_river::mondrian_forest::mondrian_forest::MondrianForestClassifier;
+
+use light_river::common::{Classifier, ClfTarget};
+use light_river::datasets::synthetic::Synthetic;
+use light_river::stream::iter_csv::IterCsv;
+use ndarray::Array1;
+use num::ToPrimitive;
+
+use std::fs::File;
+use std::time::Instant;
+
+/// Collect the feature names found in the first sample of the stream.
+///
+/// Names come back sorted. Panics when no first sample can be unwrapped.
+fn get_features(transactions: IterCsv<f32, File>) -> Vec<String> {
+    let mut stream = transactions.into_iter();
+    let first = stream.next().unwrap().unwrap();
+    let mut names: Vec<String> = first.get_observation().iter().map(|(name, _)| name.clone()).collect();
+    names.sort_unstable();
+    names
+}
+
+/// Gather the distinct values of `label_name`, in order of first appearance.
+fn get_labels(transactions: IterCsv<f32, File>, label_name: &str) -> Vec<String> {
+    let mut seen: Vec<String> = Vec::new();
+    for record in transactions {
+        // TODO: use instead 'to_classifier_target' and a vector of 'ClfTarget'
+        let value = record.unwrap().get_y().unwrap()[label_name].to_string();
+        if seen.iter().all(|known| known != &value) {
+            seen.push(value);
+        }
+    }
+    seen
+}
+
+/// Count the samples in the stream.
+///
+/// The stream is consumed to its end; items are counted without being
+/// unwrapped.
+fn get_dataset_size(transactions: IterCsv<f32, File>) -> usize {
+    transactions.count()
+}
+
+/// Evaluate a Mondrian forest classifier on the Genesis demonstrator stream.
+///
+/// Each sample is first predicted, then learned; the run finishes by printing the
+/// elapsed time, the accumulated accuracy and the forest tree sizes.
+fn main() {
+    let n_trees: usize = 10;
+
+    // Each helper consumes the stream it is given, so the dataset is reloaded per pass.
+    let transactions_f = GenesisDemostrator::load_data();
+    let features = get_features(transactions_f);
+
+    let transactions_c = GenesisDemostrator::load_data();
+    let labels = get_labels(transactions_c, "Label");
+    println!("labels: {labels:?}, features: {features:?}");
+    let mut mf: MondrianForestClassifier<f32> =
+        MondrianForestClassifier::new(n_trees, features.len(), labels.len());
+    let mut score_total = 0.0;
+
+    let transactions_l = GenesisDemostrator::load_data();
+    let dataset_size = get_dataset_size(transactions_l);
+
+    // Start timing only once the preparatory passes above are done.
+    let now = Instant::now();
+
+    let transactions = GenesisDemostrator::load_data();
+    for (idx, transaction) in transactions.enumerate() {
+        let data = transaction.unwrap();
+
+        // Build the feature vector in the order given by `features`.
+        let x = data.get_observation();
+        let x = Array1::<f32>::from_vec(features.iter().map(|k| x[k]).collect());
+
+        // Encode the string label as its index in `labels`.
+        let y = data.to_classifier_target("Label").unwrap();
+        let y = match y {
+            ClfTarget::String(y) => y,
+            _ => unimplemented!(),
+        };
+        // Search `labels` in place; no per-sample copy of the vector is needed.
+        let y = labels.iter().position(|l| l == &y).unwrap();
+        let y = ClfTarget::from(y);
+
+        // Skip the first sample since the trees have no nodes yet.
+        if idx != 0 {
+            let score = mf.predict_one(&x, &y);
+            score_total += score;
+        }
+
+        // Learn only after predicting, so each sample is scored before it is learned.
+        mf.learn_one(&x, &y);
+    }
+
+    let elapsed_time = now.elapsed();
+    println!("Took {}ms", elapsed_time.as_millis());
+
+    // The first sample is never scored, hence the `dataset_size - 1` denominator.
+    println!(
+        "Accuracy: {} / {} = {}",
+        score_total,
+        dataset_size - 1,
+        score_total / (dataset_size - 1).to_f32().unwrap()
+    );
+
+    let forest_size = mf.get_forest_size();
+    println!("Forest tree sizes: {:?}", forest_size);
+}
diff --git a/examples/classification/keystroke.rs b/examples/classification/keystroke.rs
@@ -0,0 +1,109 @@
+use light_river::common::{Classifier, ClfTarget};
+use light_river::datasets::keystroke::Keystroke;
+use light_river::mondrian_forest::mondrian_forest::MondrianForestClassifier;
+
+use light_river::stream::iter_csv::IterCsv;
+use ndarray::Array1;
+
+use num::ToPrimitive;
+use std::fs::File;
+use std::time::Instant;
+
+/// Collect the sorted feature names found in the first sample of the stream,
+/// e.g. `["H.e", "H.i", "UD.t.i", ...]`.
+/// Panics when no first sample can be unwrapped.
+fn get_features(transactions: IterCsv<f32, File>) -> Vec<String> {
+    let mut stream = transactions.into_iter();
+    let first = stream.next().unwrap().unwrap();
+    let mut names: Vec<String> = first.get_observation().iter().map(|(name, _)| name.clone()).collect();
+    names.sort_unstable();
+    names
+}
+
+/// Gather the distinct values of `label_name`, in order of first appearance.
+fn get_labels(transactions: IterCsv<f32, File>, label_name: &str) -> Vec<String> {
+    let mut seen: Vec<String> = Vec::new();
+    for record in transactions {
+        // TODO: use instead 'to_classifier_target' and a vector of 'ClfTarget'
+        let value = record.unwrap().get_y().unwrap()[label_name].to_string();
+        if seen.iter().all(|known| known != &value) {
+            seen.push(value);
+        }
+    }
+    seen
+}
+
+/// Count the samples in the stream.
+///
+/// The stream is consumed to its end; items are counted without being
+/// unwrapped.
+fn get_dataset_size(transactions: IterCsv<f32, File>) -> usize {
+    transactions.count()
+}
+
+/// Evaluate a Mondrian forest classifier on the Keystroke stream.
+///
+/// Each sample is first predicted, then learned; the run finishes by printing the
+/// elapsed time, the accumulated accuracy and the forest tree sizes.
+fn main() {
+    let n_trees: usize = 1;
+
+    // Each helper consumes the stream it is given, so the dataset is reloaded per pass.
+    let transactions_f = Keystroke::load_data();
+    let features = get_features(transactions_f);
+
+    let transactions_c = Keystroke::load_data();
+    let labels = get_labels(transactions_c, "subject");
+    println!("labels: {labels:?}, features: {features:?}");
+    let mut mf: MondrianForestClassifier<f32> =
+        MondrianForestClassifier::new(n_trees, features.len(), labels.len());
+    let mut score_total = 0.0;
+
+    let transactions_l = Keystroke::load_data();
+    let dataset_size = get_dataset_size(transactions_l);
+
+    // Start timing only once the preparatory passes above are done.
+    let now = Instant::now();
+
+    let transactions = Keystroke::load_data();
+    for (idx, transaction) in transactions.enumerate() {
+        let data = transaction.unwrap();
+
+        // Build the feature vector in the order given by `features`.
+        let x = data.get_observation();
+        let x = Array1::<f32>::from_vec(features.iter().map(|k| x[k]).collect());
+
+        // Encode the string label as its index in `labels`.
+        let y = data.to_classifier_target("subject").unwrap();
+        let y = match y {
+            ClfTarget::String(y) => y,
+            _ => unimplemented!(),
+        };
+        // Search `labels` in place; no per-sample copy of the vector is needed.
+        let y = labels.iter().position(|l| l == &y).unwrap();
+        let y = ClfTarget::from(y);
+
+        // Skip the first sample since the trees have no nodes yet.
+        if idx != 0 {
+            let score = mf.predict_one(&x, &y);
+            score_total += score;
+        }
+
+        // Learn only after predicting, so each sample is scored before it is learned.
+        mf.learn_one(&x, &y);
+    }
+
+    let elapsed_time = now.elapsed();
+    println!("Took {}ms", elapsed_time.as_millis());
+
+    // The first sample is never scored, hence the `dataset_size - 1` denominator.
+    println!(
+        "Accuracy: {} / {} = {}",
+        score_total,
+        dataset_size - 1,
+        score_total / (dataset_size - 1).to_f32().unwrap()
+    );
+
+    let forest_size = mf.get_forest_size();
+    println!("Forest tree sizes: {:?}", forest_size);
+}