From e5f933ac0b1bb0510266b1210f3b24573e17a979 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?mutlu=20=C5=9Fim=C5=9Fek?= Date: Fri, 18 Oct 2024 16:48:14 +0300 Subject: [PATCH] timeout added --- .github/workflows/CI.yml | 4 +- Cargo.toml | 4 +- README.md | 2 +- benches/perpetual_benchmarks.rs | 4 +- examples/cal_housing.rs | 2 +- examples/cover_types.rs | 2 +- examples/titanic.rs | 2 +- python-package/Cargo.toml | 6 +- python-package/examples/openml_cnae.ipynb | 124 --------------------- python-package/examples/openml_mnist.ipynb | 86 ++++++++++++++ python-package/pyproject.toml | 5 +- python-package/python/perpetual/booster.py | 3 + python-package/src/booster.rs | 15 ++- python-package/src/multi_output.rs | 2 + src/booster.rs | 74 +++++++----- src/multi_output.rs | 9 +- src/tree.rs | 13 ++- 17 files changed, 177 insertions(+), 180 deletions(-) delete mode 100644 python-package/examples/openml_cnae.ipynb create mode 100644 python-package/examples/openml_mnist.ipynb diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml index 5f6f82a..3b0ca80 100644 --- a/.github/workflows/CI.yml +++ b/.github/workflows/CI.yml @@ -5,7 +5,7 @@ jobs: windows-build-test: strategy: matrix: - pyversion: ["3.8", "3.9", "3.10", "3.11", "3.12"] + pyversion: ["3.9", "3.10", "3.11", "3.12"] runs-on: "windows-latest" steps: - uses: actions/checkout@v4 @@ -101,7 +101,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - pyversion: ["3.8", "3.9", "3.10", "3.11", "3.12"] + pyversion: ["3.9", "3.10", "3.11", "3.12"] steps: - uses: actions/checkout@v4 - name: Install latests stable Rust diff --git a/Cargo.toml b/Cargo.toml index 026b2df..4ce1720 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "perpetual" -version = "0.5.1" +version = "0.5.2" edition = "2021" authors = ["Mutlu Simsek "] homepage = "https://perpetual-ml.com" @@ -22,7 +22,7 @@ codegen-units = 1 [dependencies] rayon = "1.8" thiserror = "1.0.64" -serde_json = { version = "1.0.127", features = ["float_roundtrip"] } +serde_json = { version = "1.0.129", features = ["float_roundtrip"] } serde = { version = "1.0.209", features = ["derive"] } approx = "0.5" log = "0.4" diff --git a/README.md b/README.md index 616a552..50eb084 100644 --- a/README.md +++ b/README.md @@ -61,7 +61,7 @@ pip install perpetual To use in a Rust project, add the following to your Cargo.toml file to get the package from [crates.io](https://crates.io/crates/perpetual). ```toml -perpetual = "0.5.1" +perpetual = "0.5.2" ``` ## Paper diff --git a/benches/perpetual_benchmarks.rs b/benches/perpetual_benchmarks.rs index cb44043..7dab405 100644 --- a/benches/perpetual_benchmarks.rs +++ b/benches/perpetual_benchmarks.rs @@ -157,6 +157,7 @@ pub fn tree_benchmarks(c: &mut Criterion) { black_box(0.3), black_box(None), black_box(None), + black_box(None), ) .unwrap(); }) @@ -173,12 +174,13 @@ pub fn tree_benchmarks(c: &mut Criterion) { black_box(0.3), black_box(None), black_box(None), + black_box(None), ) .unwrap(); }) }); let mut booster = PerpetualBooster::default(); - booster.fit(&data, &y, None, None, 0.3, None, None).unwrap(); + booster.fit(&data, &y, None, None, 0.1, None, None, None).unwrap(); booster_train.bench_function("Predict Booster", |b| { b.iter(|| booster.predict(black_box(&data), false)) }); diff --git a/examples/cal_housing.rs b/examples/cal_housing.rs index fcdb955..11341a2 100644 --- a/examples/cal_housing.rs +++ b/examples/cal_housing.rs @@ -123,7 +123,7 @@ fn main() -> Result<(), Box> { .set_num_threads(Some(*num_threads)); let now = SystemTime::now(); - model.fit(&matrix_train, &y_train, None, None, *budget, None, None)?; + model.fit(&matrix_train, &y_train, None, None, *budget, None, None, None)?; println!("now.elapsed: {:?}", now.elapsed().unwrap().as_secs_f32()); let trees = model.get_prediction_trees(); diff --git a/examples/cover_types.rs b/examples/cover_types.rs index 1e5ef5f..2aafb22 100644 --- a/examples/cover_types.rs +++ b/examples/cover_types.rs @@ -148,7 +148,7 @@ fn main() -> Result<(), Box> { .map(|y| if (*y as i32) == i { 1.0 } else { 0.0 }) .collect(); - model.fit(&matrix_train, &y_tr, None, None, *budget, None, None)?; + model.fit(&matrix_train, &y_tr, None, None, *budget, None, None, None)?; println!("Completed fitting model number: {}", i); let trees = model.get_prediction_trees(); diff --git a/examples/titanic.rs b/examples/titanic.rs index 57ec8cd..8ffcf6c 100644 --- a/examples/titanic.rs +++ b/examples/titanic.rs @@ -51,7 +51,7 @@ fn main() -> Result<(), Box> { // the relevant `set_` methods for any parameters you would like to // adjust. let mut model = PerpetualBooster::default().set_objective(Objective::LogLoss); - model.fit(&matrix, &y, None, None, *budget, None, None)?; + model.fit(&matrix, &y, None, None, *budget, None, None, None)?; println!("Model prediction: {:?} ...", &model.predict(&matrix, true)[0..10]); diff --git a/python-package/Cargo.toml b/python-package/Cargo.toml index 2762915..722bac4 100644 --- a/python-package/Cargo.toml +++ b/python-package/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "py-perpetual" -version = "0.5.1" +version = "0.5.2" edition = "2021" authors = ["Mutlu Simsek "] homepage = "https://perpetual-ml.com" @@ -18,8 +18,8 @@ name = "perpetual" crate-type = ["cdylib", "rlib"] [dependencies] -pyo3 = { version = "0.22.4", features = ["extension-module"] } -perpetual_rs = {package="perpetual", version = "0.5.1", path = "../" } +pyo3 = { version = "0.22.5", features = ["extension-module"] } +perpetual_rs = {package="perpetual", version = "0.5.2", path = "../" } numpy = "0.22.0" ndarray = "0.16.1" serde_plain = { version = "1.0" } diff --git a/python-package/examples/openml_cnae.ipynb b/python-package/examples/openml_cnae.ipynb deleted file mode 100644 index da555bb..0000000 --- a/python-package/examples/openml_cnae.ipynb +++ /dev/null @@ -1,124 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "import pandas as pd\n", - "from perpetual import PerpetualBooster\n", - "from sklearn.metrics import accuracy_score" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "X_train = pd.read_csv(\"../../resources/cnae_train_flat.csv\", index_col=0).to_numpy().reshape(972, -1)\n", - "X_train.shape" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "X_test = pd.read_csv(\"../../resources/cnae_test_flat.csv\", index_col=0).to_numpy().reshape(108, -1)\n", - "X_test.shape" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "y_train = pd.read_csv(\"../../resources/cnae_train_y.csv\", index_col=0).to_numpy().flatten()\n", - "y_train.shape" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "y_test = pd.read_csv(\"../../resources/cnae_test_y.csv\", index_col=0).to_numpy().flatten()\n", - "y_test.shape" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "model = PerpetualBooster()\n", - "model.fit(X_train, y_train, budget=1.0)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "y_pred = model.predict(X_test)\n", - "acc = accuracy_score(y_test, y_pred)\n", - "print(acc)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "y_test" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "y_pred" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "model.number_of_trees" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "py311", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.9" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/python-package/examples/openml_mnist.ipynb b/python-package/examples/openml_mnist.ipynb new file mode 100644 index 0000000..e454b3c --- /dev/null +++ b/python-package/examples/openml_mnist.ipynb @@ -0,0 +1,86 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "from perpetual import PerpetualBooster" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "import logging\n", + "\n", + "logging.basicConfig()\n", + "logging.getLogger().setLevel(logging.DEBUG)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "X_train = pd.read_csv(\"../../resources/fashion_train_flat.csv\", index_col=False, header=None).to_numpy().reshape(63000, -1)\n", + "X_train.shape" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "y_train = pd.read_csv(\"../../resources/fashion_train_y.csv\", index_col=False, header=None).to_numpy().flatten()\n", + "y_train.shape" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "model = PerpetualBooster(log_iterations=1)\n", + "model.fit(X_train, y_train, budget=1.0, timeout=360)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "model.number_of_trees" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "py311", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.9" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/python-package/pyproject.toml b/python-package/pyproject.toml index c478902..dc5b32c 100644 --- a/python-package/pyproject.toml +++ b/python-package/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "maturin" [project] name = "perpetual" -version = "0.5.1" +version = "0.5.2" description = "A self-generalizing gradient boosting machine which doesn't need hyperparameter optimization" license = { file = "LICENSE" } keywords = [ @@ -19,11 +19,10 @@ keywords = [ authors = [{ name = "Mutlu Simsek" }] readme = "README.md" dependencies = ["numpy", "typing-extensions"] -requires-python = ">=3.8" +requires-python = ">=3.9" classifiers = [ "Programming Language :: Rust", "Programming Language :: Python :: 3", - "Programming Language :: Python :: 3.8", "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", diff --git a/python-package/python/perpetual/booster.py b/python-package/python/perpetual/booster.py index 0974047..a43cbb8 100644 --- a/python-package/python/perpetual/booster.py +++ b/python-package/python/perpetual/booster.py @@ -156,6 +156,7 @@ def fit( alpha: Union[float, None] = None, reset: Union[bool, None] = None, categorical_features: Union[Iterable[int], Iterable[str], str, None] = "auto", + timeout: Union[float, None] = None, ) -> Self: """Fit the gradient booster on a provided dataset. @@ -172,6 +173,7 @@ def fit( reset: whether to reset the model or continue training. categorical_features: The names or indices for categorical features. `auto` for Polars or Pandas categorical data type. + timeout: optional fit timeout in seconds """ features_, flat_data, rows, cols, categorical_features_, cat_mapping = ( @@ -244,6 +246,7 @@ def fit( alpha=alpha, reset=reset, categorical_features=categorical_features_, # type: ignore + timeout=timeout, ) return self diff --git a/python-package/src/booster.rs b/python-package/src/booster.rs index 9bb0f49..ad016f2 100644 --- a/python-package/src/booster.rs +++ b/python-package/src/booster.rs @@ -140,6 +140,7 @@ impl PerpetualBooster { alpha: Option, reset: Option, categorical_features: Option>, + timeout: Option, ) -> PyResult<()> { let flat_data = flat_data.as_slice()?; let data = Matrix::new(flat_data, rows, cols); @@ -152,10 +153,16 @@ impl PerpetualBooster { None => None, }; - match self - .booster - .fit(&data, y, sample_weight_, alpha, budget, reset, categorical_features) - { + match self.booster.fit( + &data, + y, + sample_weight_, + alpha, + budget, + reset, + categorical_features, + timeout, + ) { Ok(m) => Ok(m), Err(e) => Err(PyValueError::new_err(e.to_string())), }?; diff --git a/python-package/src/multi_output.rs b/python-package/src/multi_output.rs index 462b68f..8dd4457 100644 --- a/python-package/src/multi_output.rs +++ b/python-package/src/multi_output.rs @@ -159,6 +159,7 @@ impl MultiOutputBooster { alpha: Option, reset: Option, categorical_features: Option>, + timeout: Option, ) -> PyResult<()> { let flat_data = flat_data.as_slice()?; let data = Matrix::new(flat_data, rows, cols); @@ -182,6 +183,7 @@ impl MultiOutputBooster { budget, reset, categorical_features, + timeout, ) { Ok(m) => Ok(m), Err(e) => Err(PyValueError::new_err(e.to_string())), diff --git a/src/booster.rs b/src/booster.rs index ff29ca9..b4b6417 100644 --- a/src/booster.rs +++ b/src/booster.rs @@ -21,6 +21,7 @@ use rand::SeedableRng; use rayon::prelude::*; use serde::{Deserialize, Deserializer, Serialize}; use std::collections::{HashMap, HashSet}; +use std::time::Instant; use std::{fs, mem}; use sysinfo::System; @@ -251,6 +252,7 @@ impl PerpetualBooster { /// * `budget` - budget to fit the model. /// * `reset` - Reset the model or continue training. /// * `categorical_features` - categorical features. + /// * `timeout` - fit timeout limit in seconds. pub fn fit( &mut self, data: &Matrix, @@ -260,6 +262,7 @@ impl PerpetualBooster { budget: f32, reset: Option, categorical_features: Option>, + timeout: Option, ) -> Result<(), PerpetualError> { let constraints_map = self .monotone_constraints @@ -287,6 +290,7 @@ impl PerpetualBooster { budget, reset, categorical_features, + timeout, )?; } else { let splitter = MissingImputerSplitter::new(self.eta, self.allow_missing_splits, constraints_map); @@ -299,6 +303,7 @@ impl PerpetualBooster { budget, reset, categorical_features, + timeout, )?; }; @@ -315,7 +320,10 @@ impl PerpetualBooster { budget: f32, reset: Option, categorical_features: Option>, + timeout: Option, ) -> Result<(), PerpetualError> { + let start = Instant::now(); + let n_threads_available = std::thread::available_parallelism().unwrap().get(); let num_threads = match self.num_threads { Some(num_threads) => num_threads, @@ -494,34 +502,26 @@ impl PerpetualBooster { .map(|n| n.generalization.unwrap_or(0.0)) .max_by(|a, b| a.total_cmp(b)) .unwrap_or(0.0); - if generalization < GENERALIZATION_THRESHOLD && tree.stopper != TreeStopper::LossDecr { + if generalization < GENERALIZATION_THRESHOLD && tree.stopper != TreeStopper::LossDecrement { stopping += 1; - if verbose { - println!( - "round {0}, tree.nodes: {1}, tree.depth: {2}, stopping: {3}", - i, - tree.nodes.len(), - tree.depth, - stopping, - ); - } // If root node cannot be split due to no positive split gain, stop boosting. if tree.nodes.len() == 1 { break; } } - } else { - if verbose { - println!( - "round {0}, tree.nodes: {1}, tree.depth: {2}", - i, - tree.nodes.len(), - tree.depth, - ); - } } - if tree.stopper != TreeStopper::LossDecr { + if verbose { + info!( + "round {:0?}, tree.nodes: {:1?}, tree.depth: {:2?}, tree.stopper: {:3?}", + i, + tree.nodes.len(), + tree.depth, + tree.stopper, + ); + } + + if tree.stopper != TreeStopper::LossDecrement { n_low_loss_rounds += 1; } else { n_low_loss_rounds = 0; @@ -529,12 +529,20 @@ impl PerpetualBooster { self.trees.push(tree); + (grad, hess) = calc_grad_hess(y, &yhat, sample_weight, alpha); + loss = calc_loss(y, &yhat, sample_weight, alpha); + if stopping >= STOPPING_ROUNDS { + info!("Auto stopping since stopping round limit reached."); break; } - (grad, hess) = calc_grad_hess(y, &yhat, sample_weight, alpha); - loss = calc_loss(y, &yhat, sample_weight, alpha); + if let Some(t) = timeout { + if start.elapsed().as_secs_f32() > t { + warn!("Reached timeout limit before auto stopping. Try to decrease the budget or increase the timeout for the best performance."); + break; + } + } if i == ITERATION_LIMIT - 1 { warn!("Reached iteration limit before auto stopping. Try to decrease the budget for the best performance."); @@ -542,7 +550,11 @@ impl PerpetualBooster { } if self.log_iterations > 0 { - info!("Finished training booster with {} trees.", self.trees.len()); + info!( + "Finished training a booster with {0} trees in {1}.", + self.trees.len(), + start.elapsed().as_secs() + ); } Ok(()) @@ -991,7 +1003,7 @@ mod tests { let data = Matrix::new(&data_vec, 891, 5); let mut booster = PerpetualBooster::default().set_max_bin(300).set_base_score(0.5); - booster.fit(&data, &y, None, None, 0.3, None, None).unwrap(); + booster.fit(&data, &y, None, None, 0.3, None, None, None).unwrap(); let preds = booster.predict(&data, false); let contribs = booster.predict_contributions(&data, ContributionsMethod::Average, false); assert_eq!(contribs.len(), (data.cols + 1) * data.rows); @@ -1013,7 +1025,7 @@ mod tests { let mut booster = PerpetualBooster::default(); - booster.fit(&data, &y, None, None, 0.3, None, None).unwrap(); + booster.fit(&data, &y, None, None, 0.3, None, None, None).unwrap(); let preds = booster.predict(&data, false); let contribs = booster.predict_contributions(&data, ContributionsMethod::Average, false); assert_eq!(contribs.len(), (data.cols + 1) * data.rows); @@ -1037,7 +1049,7 @@ mod tests { .set_objective(Objective::SquaredLoss) .set_max_bin(300); - booster.fit(&data, &y, None, None, 0.3, None, None).unwrap(); + booster.fit(&data, &y, None, None, 0.3, None, None, None).unwrap(); let preds = booster.predict(&data, false); let contribs = booster.predict_contributions(&data, ContributionsMethod::Average, false); assert_eq!(contribs.len(), (data.cols + 1) * data.rows); @@ -1060,7 +1072,7 @@ mod tests { //let data = Matrix::new(data.get_col(1), 891, 1); let mut booster = PerpetualBooster::default().set_max_bin(300).set_base_score(0.5); - booster.fit(&data, &y, None, None, 0.3, None, None).unwrap(); + booster.fit(&data, &y, None, None, 0.3, None, None, None).unwrap(); let preds = booster.predict(&data, true); booster.save_booster("resources/model64.json").unwrap(); @@ -1090,7 +1102,9 @@ mod tests { let mut booster = PerpetualBooster::default(); - booster.fit(&data, &y, None, None, 0.1, None, Some(cat_index)).unwrap(); + booster + .fit(&data, &y, None, None, 0.1, None, Some(cat_index), None) + .unwrap(); let file = fs::read_to_string("resources/titanic_train_y.csv").expect("Something went wrong reading the file"); let y: Vec = file.lines().map(|x| x.parse::().unwrap()).collect(); @@ -1213,8 +1227,8 @@ mod tests { .set_max_bin(10) .set_num_threads(Some(2)); - model1.fit(&matrix_test, &y_test, None, None, 0.1, None, None)?; - model2.fit(&matrix_test, &y_test, None, None, 0.1, None, None)?; + model1.fit(&matrix_test, &y_test, None, None, 0.1, None, None, None)?; + model2.fit(&matrix_test, &y_test, None, None, 0.1, None, None, None)?; let trees1 = model1.get_prediction_trees(); let trees2 = model2.get_prediction_trees(); diff --git a/src/multi_output.rs b/src/multi_output.rs index a773684..ae75f33 100644 --- a/src/multi_output.rs +++ b/src/multi_output.rs @@ -194,7 +194,13 @@ impl MultiOutputBooster { budget: f32, reset: Option, categorical_features: Option>, + timeout: Option, ) -> Result<(), PerpetualError> { + let timeout_booster = match timeout { + Some(t) => Some(t / self.n_boosters as f32), + None => None, + }; + for i in 0..self.n_boosters { let _ = self.boosters[i].fit( data, @@ -204,6 +210,7 @@ impl MultiOutputBooster { budget, reset, categorical_features.clone(), + timeout_booster, ); } Ok(()) @@ -547,7 +554,7 @@ mod tests { println!("The number of boosters: {:?}", booster.get_boosters().len()); assert!(booster.get_boosters().len() == n_classes); - booster.fit(&data, &y, None, None, 0.1, None, None).unwrap(); + booster.fit(&data, &y, None, None, 0.1, None, None, Some(59.0)).unwrap(); let probas = booster.predict_proba(&data, true); diff --git a/src/tree.rs b/src/tree.rs index 8df9acb..e6b3c40 100644 --- a/src/tree.rs +++ b/src/tree.rs @@ -11,11 +11,11 @@ use std::cmp::max; use std::collections::{BinaryHeap, HashMap, HashSet}; use std::fmt::{self, Display}; -#[derive(Deserialize, Serialize, Clone, PartialEq)] +#[derive(Deserialize, Serialize, Clone, PartialEq, Debug)] pub enum TreeStopper { - MaxDepth, - LossDecr, - Overfitting, + Generalization, + LossDecrement, + MaxNodes, } #[derive(Deserialize, Serialize, Clone)] @@ -36,7 +36,7 @@ impl Tree { pub fn new() -> Self { Tree { nodes: HashMap::new(), - stopper: TreeStopper::Overfitting, + stopper: TreeStopper::Generalization, depth: 0, n_leaves: 0, } @@ -95,12 +95,13 @@ impl Tree { while !growable.is_empty() { // If this will push us over the number of allocated nodes, break. if self.nodes.len() > (n_nodes_alloc - 3) { + self.stopper = TreeStopper::MaxNodes; break; } if let Some(tld) = target_loss_decrement { if loss_decr_avg > tld { - self.stopper = TreeStopper::LossDecr; + self.stopper = TreeStopper::LossDecrement; break; } }