From 88b3f9b11417f1ab5eb1b14f8272f4a6bd0bfb09 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?mutlu=20=C5=9Fim=C5=9Fek?= Date: Wed, 17 Jul 2024 22:46:48 +0300 Subject: [PATCH] polars support --- Cargo.toml | 9 +- README.md | 19 +- benches/perpetual_benchmarks.rs | 2 +- examples/cal_housing.rs | 54 +-- examples/cover_types.rs | 48 ++- examples/titanic.rs | 14 +- python-package/Cargo.toml | 13 +- python-package/examples/benchmark_lgbm.py | 84 +++++ .../examples/benchmark_perpetual.py | 46 +++ .../examples/performance_benchmark.ipynb | 319 +++--------------- python-package/pyproject.toml | 5 +- python-package/python/perpetual/__init__.py | 117 +++++-- python-package/src/lib.rs | 2 +- python-package/tests/test_booster.py | 31 +- src/booster.rs | 91 ++--- src/constants.rs | 2 +- src/histogram.rs | 8 +- src/splitter.rs | 98 +++--- src/tree.rs | 26 +- src/utils.rs | 133 +++++++- 20 files changed, 623 insertions(+), 498 deletions(-) create mode 100644 python-package/examples/benchmark_lgbm.py create mode 100644 python-package/examples/benchmark_perpetual.py diff --git a/Cargo.toml b/Cargo.toml index 5a55f00..ef69c15 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "perpetual" -version = "0.1.0" +version = "0.2.0" edition = "2021" authors = ["Mutlu Simsek "] homepage = "https://perpetual-ml.com" @@ -9,6 +9,9 @@ license-file = "LICENSE" readme = "README.md" repository = "https://github.com/perpetual-ml/perpetual" +keywords = ["machine-learning", "perpetual", "ai", "ml"] +categories = ["algorithms", "mathematics", "science"] + # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html [profile.release] lto = 'fat' @@ -27,10 +30,10 @@ hashbrown = { version = "0.14", features = ["serde", "rayon"] } [dev-dependencies] criterion = "0.5" -polars = "0.40" +polars = "0.41" reqwest = { version = "0.12", features = ["blocking"] } csv = "1.3" -chrono = "0.4.38" +chrono = "0.4" [[bench]] name = "perpetual_benchmarks" diff --git a/README.md b/README.md index 45cefaf..00ae013 100644 --- a/README.md +++ b/README.md @@ -7,6 +7,7 @@ [![Python Versions](https://img.shields.io/pypi/pyversions/perpetual.svg?logo=python&logoColor=white)](https://pypi.org/project/perpetual) [![PyPI Version](https://img.shields.io/pypi/v/perpetual.svg?logo=pypi&logoColor=white)](https://pypi.org/project/perpetual) [![Crates.io Version](https://img.shields.io/crates/v/perpetual?logo=rust&logoColor=white)](https://crates.io/crates/perpetual) +[![Discord](https://img.shields.io/discord/1247650900214812692?logo=discord&cacheSeconds=10)](https://discord.gg/vADKk9Wr) @@ -14,21 +15,21 @@ ## _A self-generalizing, hyperparameter-free gradient boosting machine_ -PerpetualBooster is a gradient boosting machine (GBM) algorithm which doesn't have hyperparameters to be tuned so that you can use it without hyperparameter optimization packages unlike other GBM algorithms. Similar to AutoML libraries, it has a `budget` parameter. Increasing the `budget` parameter increases the predictive power of the algorithm and gives better results on unseen data. Start with a small budget and increase it once you are confident with your features. If you don't see any improvement with further increasing the `budget`, it means that you are already extracting the most predictive power out of your data. +PerpetualBooster is a gradient boosting machine (GBM) algorithm which doesn't have hyperparameters to be tuned so that you can use it without hyperparameter optimization packages unlike other GBM algorithms. 
+Similar to AutoML libraries, it has a `budget` parameter. Increasing the `budget` parameter increases the predictive power of the algorithm and gives better results on unseen data. Start with a small budget (e.g. 1.0) and increase it (e.g. 2.0) once you are confident with your features. If you don't see any improvement from further increasing the `budget`, it means that you are already extracting the most predictive power out of your data.
 
 ## Benchmark
 
 Hyperparameter optimization usually takes 100 iterations with plain GBM algorithms. PerpetualBooster achieves the same accuracy in a single run. Thus, it achieves around a 100x speed-up at the same accuracy with different `budget` levels and with different datasets. The speed-up might be slightly lower or significantly higher than 100x depending on the dataset.
 
-The following table summarizes the results for the [California Housing](https://scikit-learn.org/stable/modules/generated/sklearn.datasets.fetch_california_housing.html) dataset:
+The following table summarizes the results for the [California Housing](https://scikit-learn.org/stable/modules/generated/sklearn.datasets.fetch_california_housing.html) dataset (regression):
 
 | Perpetual budget | LightGBM n_estimators | Perpetual mse | LightGBM mse | Perpetual cpu time (s) | LightGBM cpu time (s) | Speed-up |
 | ---------------- | --------------------- | ------------- | ------------ | ---------------------- | --------------------- | -------- |
-| 1.1 | 100 | 0.192 | 0.192 | 8.9 | 1003 | 113x |
-| 1.2 | 200 | 0.190 | 0.191 | 11.0 | 2030 | 186x |
-| 1.5 | 300 | 0.187 | 0.188 | 18.7 | 3272 | 179x |
+| 1.0 | 100 | 0.192 | 0.192 | 7.6 | 978 | 129x |
+| 1.5 | 300 | 0.188 | 0.188 | 21.8 | 3066 | 141x |
+| 2.1 | 1000 | 0.185 | 0.186 | 86.0 | 8720 | 101x |
 
-You can reproduce the results using the [performance_benchmark.ipynb](./python-package/examples/performance_benchmark.ipynb) notebook in the [examples](./python-package/examples) folder.
+You can reproduce the results using the scripts in the [examples](./python-package/examples) folder.
 
 ## Usage
 
 You can use the algorithm as in the example below. Check the examples folders for more information.
 
 ```python
 from perpetual import PerpetualBooster
 
 model = PerpetualBooster(objective="SquaredLoss")
-model.fit(X, y, budget=0.4)
+model.fit(X, y, budget=1.0)
 ```
 
 ## Documentation
@@ -53,10 +54,10 @@ The package can be installed directly from [pypi](https://pypi.org/project/perpe
 
 ```
 pip install perpetual
 ```
 
-To use in a rust project, add the following to your Cargo.toml file to get the package from [crates.io](https://crates.io/crates/perpetual).
+To use in a Rust project, add the following to your Cargo.toml file to get the package from [crates.io](https://crates.io/crates/perpetual).
 ```toml
-perpetual = "0.1.0"
+perpetual = "0.2.0"
 ```
 
 ## Paper
diff --git a/benches/perpetual_benchmarks.rs b/benches/perpetual_benchmarks.rs
index fc077c0..40fefc4 100644
--- a/benches/perpetual_benchmarks.rs
+++ b/benches/perpetual_benchmarks.rs
@@ -173,7 +173,7 @@ pub fn tree_benchmarks(c: &mut Criterion) {
     let mut booster = PerpetualBooster::default();
     booster.fit(&data, &y, None, None, 0.3, None, None).unwrap();
     booster_train.bench_function("Predict Booster", |b| {
-        b.iter(|| booster.predict(black_box(&data), false, None))
+        b.iter(|| booster.predict(black_box(&data), false))
     });
 }
 
diff --git a/examples/cal_housing.rs b/examples/cal_housing.rs
index 77eaef7..1b1d9b6 100644
--- a/examples/cal_housing.rs
+++ b/examples/cal_housing.rs
@@ -6,6 +6,7 @@
 // hyperfine --runs 3 ./target/release/examples/cal_housing
 // hyperfine --runs 3 .\target\release\examples\cal_housing
 // hyperfine --runs 11 'cargo run --release --example cal_housing 0.1 0.3 2'
+// hyperfine --runs 11 'cargo run --release --example cal_housing 2.0'
 
 // cargo flamegraph --example cal_housing
 
@@ -27,48 +28,51 @@ fn main() -> Result<(), Box<dyn Error>> {
     let args: Vec<String> = env::args().collect();
     let budget = &args[1].parse::<f64>().unwrap();
 
-    let _all_names = [
-        "MedInc",
-        "HouseAge",
-        "AveRooms",
-        "AveBedrms",
-        "Population",
-        "AveOccup",
-        "Latitude",
-        "Longitude",
-        "MedHouseVal",
+    let all_names = [
+        "MedInc".to_string(),
+        "HouseAge".to_string(),
+        "AveRooms".to_string(),
+        "AveBedrms".to_string(),
+        "Population".to_string(),
+        "AveOccup".to_string(),
+        "Latitude".to_string(),
+        "Longitude".to_string(),
+        "MedHouseVal".to_string(),
     ];
 
-    let _feature_names = [
-        "MedInc",
-        "HouseAge",
-        "AveRooms",
-        "AveBedrms",
-        "Population",
-        "AveOccup",
-        "Latitude",
-        "Longitude",
+    let feature_names = [
+        "MedInc".to_string(),
+        "HouseAge".to_string(),
+        "AveRooms".to_string(),
+        "AveBedrms".to_string(),
+        "Population".to_string(),
+        "AveOccup".to_string(),
+        "Latitude".to_string(),
+        "Longitude".to_string(),
     ];
 
+    let column_names_train = Arc::new(all_names.clone());
+    let column_names_test = Arc::new(all_names.clone());
+
     let df_train = CsvReadOptions::default()
         .with_has_header(true)
-        .with_columns(Some(Arc::new(_all_names.iter().map(|&s| s.to_string()).collect())))
+        .with_columns(Some(column_names_train))
         .try_into_reader_with_file_path(Some("resources/cal_housing_train.csv".into()))?
         .finish()
         .unwrap();
 
     let df_test = CsvReadOptions::default()
         .with_has_header(true)
-        .with_columns(Some(Arc::new(_all_names.iter().map(|&s| s.to_string()).collect())))
+        .with_columns(Some(column_names_test))
         .try_into_reader_with_file_path(Some("resources/cal_housing_test.csv".into()))?
         .finish()
         .unwrap();
 
     // Get data in column major format...
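     // NOTE: polars 0.41 renames `DataFrame::melt` to `DataFrame::unpivot` and
     // swaps the argument order (the value columns come first, the id columns
     // second), which is why the call sites below change both the method name
     // and the arguments.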
     let id_vars_train: Vec<&str> = Vec::new();
-    let mdf_train = df_train.melt(&id_vars_train, _feature_names)?;
+    let mdf_train = df_train.unpivot(feature_names.clone(), &id_vars_train)?;
     let id_vars_test: Vec<&str> = Vec::new();
-    let mdf_test = df_test.melt(&id_vars_test, _feature_names)?;
+    let mdf_test = df_test.unpivot(feature_names, &id_vars_test)?;
 
     let data_train = Vec::from_iter(
         mdf_train
@@ -121,11 +125,11 @@ fn main() -> Result<(), Box<dyn Error>> {
     let n_leaves: usize = trees.iter().map(|t| (t.nodes.len() + 1) / 2).sum();
     println!("n_leaves: {:?}", n_leaves);
 
-    let y_pred = model.predict(&matrix_train, true, None);
+    let y_pred = model.predict(&matrix_train, true);
     let error = mse(&y_train, &y_pred);
     println!("mse_train: {:?}", error);
 
-    let y_pred = model.predict(&matrix_test, true, None);
+    let y_pred = model.predict(&matrix_test, true);
     let error = mse(&y_test, &y_pred);
     println!("mse_test: {:?}", error);
 
diff --git a/examples/cover_types.rs b/examples/cover_types.rs
index e1a9353..1ed46a3 100644
--- a/examples/cover_types.rs
+++ b/examples/cover_types.rs
@@ -1,11 +1,11 @@
 //! An example using the `cover types` dataset
 
-// cargo run --release --example cover_types 0.1 0.3
+// cargo run --release --example cover_types 1.0
 
 // cargo build --release --example cover_types
 
 // hyperfine --runs 3 ./target/release/examples/cover_types
-// hyperfine --runs 3 .\target\release\examples\cover_types 0.1 0.3
-// hyperfine --runs 3 'cargo run --release --example cover_types 0.1 0.3'
+// hyperfine --runs 3 .\target\release\examples\cover_types 1.0
+// hyperfine --runs 3 'cargo run --release --example cover_types 1.0'
 
 // cargo flamegraph --example cover_types
 
@@ -39,7 +39,7 @@ pub fn multiclass_log_loss(y_true: &[f64], y_pred: &[Vec<f64>]) -> f64 {
 
 fn main() -> Result<(), Box<dyn Error>> {
     let args: Vec<String> = env::args().collect();
-    let budget = &args[1].parse::<f64>().unwrap();
+    let budget = &args[1].parse::<f64>().unwrap_or(1.0);
 
     let mut features: Vec<&str> = [
         "Elevation",
@@ -66,29 +66,37 @@ fn main() -> Result<(), Box<dyn Error>> {
     let mut features_and_target = features.clone();
     features_and_target.push("Cover_Type");
 
+    let features_and_target_arc1 = features_and_target
+        .iter()
+        .map(|s| String::from(s.to_owned()))
+        .collect::<Vec<String>>()
+        .into();
+
+    let features_and_target_arc2 = features_and_target
+        .iter()
+        .map(|s| String::from(s.to_owned()))
+        .collect::<Vec<String>>()
+        .into();
+
     let df_train = CsvReadOptions::default()
         .with_has_header(true)
-        .with_columns(Some(Arc::new(
-            features_and_target.iter().map(|&s| s.to_string()).collect(),
-        )))
+        .with_columns(Some(features_and_target_arc1))
         .try_into_reader_with_file_path(Some("resources/cover_types_train.csv".into()))?
         .finish()
         .unwrap();
 
     let df_test = CsvReadOptions::default()
         .with_has_header(true)
-        .with_columns(Some(Arc::new(
-            features_and_target.iter().map(|&s| s.to_string()).collect(),
-        )))
+        .with_columns(Some(features_and_target_arc2))
         .try_into_reader_with_file_path(Some("resources/cover_types_train.csv".into()))?
         .finish()
         .unwrap();
 
     // Get data in column major format...
     let id_vars_train: Vec<&str> = Vec::new();
-    let mdf_train = df_train.melt(&id_vars_train, &features)?;
+    let mdf_train = df_train.unpivot(&features, &id_vars_train)?;
     let id_vars_test: Vec<&str> = Vec::new();
-    let mdf_test = df_test.melt(&id_vars_test, &features)?;
+    let mdf_test = df_test.unpivot(&features, &id_vars_test)?;
 
     let data_train = Vec::from_iter(
         mdf_train
@@ -128,14 +136,8 @@ fn main() -> Result<(), Box<dyn Error>> {
     let matrix_train = Matrix::new(&data_train, y_train.len(), 54);
     let matrix_test = Matrix::new(&data_test, y_test.len(), 54);
 
-    // Create booster.
-    // To provide parameters generate a default booster, and then use
-    // the relevant `set_` methods for any parameters you would like to
-    // adjust.
-
     let mut raw_train_array = vec![vec![0.0; 7]; y_train.len()];
     let mut raw_test_array = vec![vec![0.0; 7]; y_test.len()];
-    // let mut raw_test = Vec::new();
 
     for i in 1..8 {
         println!();
@@ -155,8 +157,8 @@ fn main() -> Result<(), Box<dyn Error>> {
         let n_leaves: usize = trees.iter().map(|t| (t.nodes.len() + 1) / 2).sum();
         println!("n_leaves: {:?}", n_leaves);
 
-        let y_pred_train = model.predict(&matrix_train, true, None);
-        let y_pred_test = model.predict(&matrix_test, true, None);
+        let y_pred_train = model.predict(&matrix_train, true);
+        let y_pred_test = model.predict(&matrix_test, true);
 
         raw_train_array
             .iter_mut()
@@ -166,7 +168,6 @@ fn main() -> Result<(), Box<dyn Error>> {
             .iter_mut()
             .enumerate()
             .for_each(|(idx, raw)| raw[(i - 1) as usize] = y_pred_test[idx]);
-        // raw_test.push(Series::new(&i.to_string(), y_pred));
     }
 
     let loss_train = multiclass_log_loss(&y_train, &raw_train_array);
@@ -175,10 +176,5 @@ fn main() -> Result<(), Box<dyn Error>> {
     println!("loss_train: {}", loss_train);
     println!("loss_test: {}", loss_test);
 
-    // let mut df_raw_test = DataFrame::new(raw_test).unwrap();
-    // let mut output_file = File::create("./raw_test.csv").expect("Failed to create an output file.");
-    // let mut writer = CsvWriter::new(&mut output_file).include_header(true);
-    // writer.finish(&mut df_raw_test).expect("Failed to write the CSV file.");
-
     Ok(())
 }
diff --git a/examples/titanic.rs b/examples/titanic.rs
index 4f4abe4..57ec8cd 100644
--- a/examples/titanic.rs
+++ b/examples/titanic.rs
@@ -11,18 +11,22 @@ fn main() -> Result<(), Box<dyn Error>> {
 
     let features_and_target = ["survived", "pclass", "age", "sibsp", "parch", "fare"];
 
+    let features_and_target_arc = features_and_target
+        .iter()
+        .map(|s| String::from(s.to_owned()))
+        .collect::<Vec<String>>()
+        .into();
+
     let df = CsvReadOptions::default()
         .with_has_header(true)
-        .with_columns(Some(Arc::new(
-            features_and_target.iter().map(|&s| s.to_string()).collect(),
-        )))
+        .with_columns(Some(features_and_target_arc))
         .try_into_reader_with_file_path(Some("resources/titanic.csv".into()))?
         .finish()
         .unwrap();
 
     // Get data in column major format...
let id_vars: Vec<&str> = Vec::new(); - let mdf = df.melt(id_vars, ["pclass", "age", "sibsp", "parch", "fare"])?; + let mdf = df.unpivot(["pclass", "age", "sibsp", "parch", "fare"], id_vars)?; let data = Vec::from_iter( mdf.select_at_idx(1) @@ -49,7 +53,7 @@ fn main() -> Result<(), Box> { let mut model = PerpetualBooster::default().set_objective(Objective::LogLoss); model.fit(&matrix, &y, None, None, *budget, None, None)?; - println!("Model prediction: {:?} ...", &model.predict(&matrix, true, None)[0..10]); + println!("Model prediction: {:?} ...", &model.predict(&matrix, true)[0..10]); Ok(()) } diff --git a/python-package/Cargo.toml b/python-package/Cargo.toml index e9c6e91..fe24014 100644 --- a/python-package/Cargo.toml +++ b/python-package/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "py-perpetual" -version = "0.1.0" +version = "0.2.0" edition = "2021" authors = ["Mutlu Simsek "] homepage = "https://perpetual-ml.com" @@ -9,16 +9,19 @@ license-file = "LICENSE" readme = "README.md" repository = "https://github.com/perpetual-ml/perpetual" +keywords = ["machine-learning", "perpetual", "ai", "ml"] +categories = ["algorithms", "mathematics", "science"] + # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html [lib] name = "perpetual" crate-type = ["cdylib"] [dependencies] -pyo3 = { version = "0.21.0", features = ["extension-module"] } -perpetual_rs = {package="perpetual", version = "0.1.0", path = "../" } +pyo3 = { version = "0.21", features = ["extension-module"] } +perpetual_rs = {package="perpetual", version = "0.2.0", path = "../" } numpy = "0.21.0" -ndarray = "0.15.1" +ndarray = "0.15" serde_plain = { version = "1.0" } serde = { version = "1.0" } -pyo3-log = "0.10.0" +pyo3-log = "0.11" diff --git a/python-package/examples/benchmark_lgbm.py b/python-package/examples/benchmark_lgbm.py new file mode 100644 index 0000000..56772ef --- /dev/null +++ b/python-package/examples/benchmark_lgbm.py @@ -0,0 +1,84 @@ +import optuna +import numpy as np +from time import process_time +from functools import partial +from lightgbm import LGBMRegressor, LGBMClassifier +from sklearn.metrics import mean_squared_error, log_loss +from sklearn.datasets import fetch_covtype, fetch_california_housing +from sklearn.model_selection import train_test_split, cross_validate + + +def prepare_data(cal_housing, seed): + if cal_housing: + data, target = fetch_california_housing(return_X_y=True, as_frame=True) + scoring = "neg_mean_squared_error" + metric_function = mean_squared_error + metric_name = "mse" + LGBMBooster = LGBMRegressor + else: + data, target = fetch_covtype(return_X_y=True, as_frame=True) + scoring = "neg_log_loss" + metric_function = log_loss + metric_name = "log_loss" + LGBMBooster = LGBMClassifier + X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.2248, random_state=seed) + return X_train, X_test, y_train, y_test, scoring, metric_function, metric_name, LGBMBooster + +best_cv_results = None +cv_results = None + +def save_best_cv_results(study, trial): + global best_cv_results + if study.best_trial.number == trial.number: + best_cv_results = cv_results + +def objective_function(trial, seed, n_estimators, LGBMBooster, X_train, y_train, scoring): + global cv_results + params = { + 'seed': seed, + 'verbosity': -1, + 'n_estimators': n_estimators, + 'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.5, log=True), + 'min_split_gain': trial.suggest_float('min_split_gain', 1e-6, 1.0, log=True), + 'reg_alpha': 
trial.suggest_float('reg_alpha', 1e-6, 1.0, log=True), + 'reg_lambda': trial.suggest_float('reg_lambda', 1e-6, 1.0, log=True), + 'colsample_bytree': trial.suggest_float('colsample_bytree', 0.2, 1.0), + 'subsample': trial.suggest_float('subsample', 0.2, 1.0), + 'subsample_freq': trial.suggest_int('subsample_freq', 1, 10), + 'max_depth': trial.suggest_int('max_depth', 3, 33), + 'num_leaves': trial.suggest_int('num_leaves', 2, 1024), + 'min_child_samples': trial.suggest_int('min_child_samples', 1, 100), + } + model = LGBMBooster(**params) + cv_results = cross_validate(model, X_train, y_train, cv=5, scoring=scoring, return_train_score=True, return_estimator=True) + return -1 * np.mean(cv_results['test_score']) + + +if __name__ == '__main__': + optuna.logging.set_verbosity(optuna.logging.WARNING) + cal_housing = True + n_estimators = 300 + n_trials = 100 + cpu_times = [] + metrics = [] + for seed in [0, 1]: + X_train, X_test, y_train, y_test, scoring, metric_function, metric_name, LGBMBooster = prepare_data(cal_housing, seed) + sampler = optuna.samplers.TPESampler(seed=seed) + study = optuna.create_study(direction='minimize', sampler=sampler) + obj = partial(objective_function, seed=seed, n_estimators=n_estimators, LGBMBooster=LGBMBooster, X_train=X_train, y_train=y_train, scoring=scoring) + start = process_time() + study.optimize(obj, n_trials=n_trials, callbacks=[save_best_cv_results]) + stop = process_time() + cpu_times.append(stop - start) + + models = best_cv_results["estimator"] + if metric_name == "log_loss": + y_pred = np.mean([model.predict_proba(X_test) for model in models], axis=0) + else: + y_pred = np.mean([model.predict(X_test) for model in models], axis=0) + metric = metric_function(y_test, y_pred) + metrics.append(metric) + + print(f"seed: {seed}, cpu time: {stop - start}, {metric_name}: {metric}") + + print(f"average cpu time: {np.mean(cpu_times)}, average {metric_name}: {np.mean(metrics)}") diff --git a/python-package/examples/benchmark_perpetual.py b/python-package/examples/benchmark_perpetual.py new file mode 100644 index 0000000..bb007af --- /dev/null +++ b/python-package/examples/benchmark_perpetual.py @@ -0,0 +1,46 @@ +import numpy as np +from time import process_time +from perpetual import PerpetualBooster +from sklearn.model_selection import train_test_split +from sklearn.metrics import mean_squared_error, log_loss +from sklearn.datasets import fetch_covtype, fetch_california_housing + + +def prepare_data(cal_housing, seed): + if cal_housing: + data, target = fetch_california_housing(return_X_y=True, as_frame=True) + metric_function = mean_squared_error + metric_name = "mse" + objective = "SquaredLoss" + else: + data, target = fetch_covtype(return_X_y=True, as_frame=True) + metric_function = log_loss + metric_name = "log_loss" + objective = "LogLoss" + X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.2248, random_state=seed) + return X_train, X_test, y_train, y_test, metric_function, metric_name, objective + + +if __name__ == '__main__': + budget = 1.5 + cal_housing = True + cpu_times = [] + metrics = [] + for seed in [0, 1, 2, 3, 4]: + X_train, X_test, y_train, y_test, metric_function, metric_name, objective = prepare_data(cal_housing, seed) + model = PerpetualBooster(objective=objective) + start = process_time() + model.fit(X_train, y_train, budget=budget) + stop = process_time() + cpu_times.append(stop - start) + + if metric_name == "log_loss": + y_pred = model.predict_proba(X_test) + else: + y_pred = model.predict(X_test) + metric = 
metric_function(y_test, y_pred) + metrics.append(metric) + + print(f"seed: {seed}, cpu time: {stop - start}, {metric_name}: {metric}, n_trees: {model.number_of_trees}") + + print(f"average cpu time: {np.mean(cpu_times)}, average {metric_name}: {np.mean(metrics)}") diff --git a/python-package/examples/performance_benchmark.ipynb b/python-package/examples/performance_benchmark.ipynb index 70a359e..ea21c7e 100644 --- a/python-package/examples/performance_benchmark.ipynb +++ b/python-package/examples/performance_benchmark.ipynb @@ -2,12 +2,13 @@ "cells": [ { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import optuna\n", "import numpy as np\n", + "import polars as pl\n", "from lightgbm import LGBMRegressor, LGBMClassifier\n", "from sklearn.metrics import mean_squared_error, log_loss\n", "from sklearn.datasets import fetch_covtype, fetch_california_housing\n", @@ -17,38 +18,18 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Python 3.10.14\n" - ] - } - ], + "outputs": [], "source": [ "!python --version" ] }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "numpy: 1.26.4\n", - "optuna: 3.6.0\n", - "lightgbm: 4.3.0\n", - "scikit-learn: 1.4.2\n", - "perpetual: 0.1.0\n" - ] - } - ], + "outputs": [], "source": [ "from importlib.metadata import version\n", "\n", @@ -61,7 +42,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -70,18 +51,18 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "seed = 4 # average results are reported for 5 seeds -> [0, 1, 2, 3, 4]\n", - "n_estimators = 1 # results are reported for 100, 200, 300 n_estimators.\n", + "seed = 0 # average results are reported for 5 seeds -> [0, 1, 2, 3, 4]\n", + "n_estimators = 1 # results are reported for 100, 300, 1000 n_estimators.\n", "n_trials = 1" ] }, { "cell_type": "code", - "execution_count": 6, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -103,18 +84,9 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "len(X_train): 16000\n", - "len(X_test): 4640\n" - ] - } - ], + "outputs": [], "source": [ "X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.2248, random_state=seed)\n", "\n", @@ -124,7 +96,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -139,7 +111,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -167,17 +139,9 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "[I 2024-07-11 01:26:49,412] A new study created in memory with name: no-name-4a28ade5-6c54-459c-aaa7-2f3e0bd9c040\n" - ] - } - ], + "outputs": [], "source": [ "sampler = optuna.samplers.TPESampler(seed=seed)\n", "study = optuna.create_study(direction='minimize', sampler=sampler)" @@ -185,25 +149,9 @@ }, { "cell_type": "code", - "execution_count": 11, + 
"execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "[I 2024-07-11 01:26:50,073] Trial 0 finished with value: 1.0644386016870766 and parameters: {'learning_rate': 0.4073657656436648, 'min_split_gain': 0.0019204079494910193, 'reg_alpha': 0.685655809011563, 'reg_lambda': 0.019448941142879615, 'colsample_bytree': 0.7581830596778167, 'subsample': 0.3728715964643011, 'subsample_freq': 10, 'max_depth': 3, 'num_leaves': 260, 'min_child_samples': 44}. Best is trial 0 with value: 1.0644386016870766.\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "CPU times: total: 469 ms\n", - "Wall time: 617 ms\n" - ] - } - ], + "outputs": [], "source": [ "%%time\n", "study.optimize(objective_function, n_trials=n_trials, callbacks=[save_best_cv_results])" @@ -211,31 +159,9 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Number of finished trials: 1\n", - "Best trial:\n", - " Number: 0\n", - " Value: 1.0644386016870766\n", - " Params: \n", - " learning_rate: 0.4073657656436648\n", - " min_split_gain: 0.0019204079494910193\n", - " reg_alpha: 0.685655809011563\n", - " reg_lambda: 0.019448941142879615\n", - " colsample_bytree: 0.7581830596778167\n", - " subsample: 0.3728715964643011\n", - " subsample_freq: 10\n", - " max_depth: 3\n", - " num_leaves: 260\n", - " min_child_samples: 44\n" - ] - } - ], + "outputs": [], "source": [ "print(f\"Number of finished trials: {len(study.trials)}\")\n", "print(f\"Best trial:\")\n", @@ -248,20 +174,9 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "CV train scores: [1.05503594 1.06615895 1.0512751 1.0563806 1.0637785 ]\n", - "CV train scores average : 1.058526\n", - "CV valid scores: [1.08474222 1.06704007 1.08133229 1.06841159 1.02066684]\n", - "CV valid scores average : 1.064439\n" - ] - } - ], + "outputs": [], "source": [ "print(f\"CV train scores: {-1 * best_cv_results['train_score']}\")\n", "print(f\"CV train scores average : {round(np.mean(-1 * best_cv_results['train_score']), 6)}\")\n", @@ -271,7 +186,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -280,21 +195,9 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Model 0, train mse: 1.060977\n", - "Model 1, train mse: 1.066335\n", - "Model 2, train mse: 1.057287\n", - "Model 3, train mse: 1.058787\n", - "Model 4, train mse: 1.055156\n" - ] - } - ], + "outputs": [], "source": [ "for i, model in enumerate(models):\n", " y_pred = model.predict_proba(X_train) if metric_name == \"log_loss\" else model.predict(X_train)\n", @@ -303,21 +206,9 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Model 0, test mse: 1.035823\n", - "Model 1, test mse: 1.042579\n", - "Model 2, test mse: 1.032748\n", - "Model 3, test mse: 1.039261\n", - "Model 4, test mse: 1.031398\n" - ] - } - ], + "outputs": [], "source": [ "for i, model in enumerate(models):\n", " y_pred = model.predict_proba(X_test) if metric_name == \"log_loss\" else model.predict(X_test)\n", @@ -326,17 
+217,9 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Train mse: 1.049704\n" - ] - } - ], + "outputs": [], "source": [ "if metric_name == \"log_loss\":\n", " y_pred = np.mean([model.predict_proba(X_train) for model in models], axis=0)\n", @@ -347,17 +230,9 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Test mse: 1.02622\n" - ] - } - ], + "outputs": [], "source": [ "if metric_name == \"log_loss\":\n", " y_pred = np.mean([model.predict_proba(X_test) for model in models], axis=0)\n", @@ -384,17 +259,23 @@ "| 100 | 3 | 0.188629 | 1143 |\n", "| 100 | 4 | 0.194338 | 860 |\n", "| 100 | avg | 0.192196 | 978 |\n", - "| 300 | 0 | 0.185100 | 2589 |\n", + "| 300 | 0 | 0.185100 | 2282 |\n", "| 300 | 1 | 0.192767 | 3650 |\n", "| 300 | 2 | 0.190481 | 2746 |\n", "| 300 | 3 | 0.182359 | 2782 |\n", "| 300 | 4 | 0.191614 | 3871 |\n", - "| 300 | avg | 0.188464 | 3128 |" + "| 300 | avg | 0.188464 | 3066 |\n", + "| 1000 | 0 | 0.179158 | 9615 |\n", + "| 1000 | 1 | 0.190866 | 7258 |\n", + "| 1000 | 2 | 0.188030 | 10997 |\n", + "| 1000 | 3 | 0.179903 | 7636 |\n", + "| 1000 | 4 | 0.190033 | 8095 |\n", + "| 1000 | avg | 0.185598 | 8720 |" ] }, { "cell_type": "code", - "execution_count": 19, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -403,28 +284,9 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "CPU times: total: 8.58 s\n", - "Wall time: 8.52 s\n" - ] - }, - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 20, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "%%time\n", "model.fit(X_train, y_train, budget=1.0)" @@ -432,17 +294,9 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Test mse: 0.198443\n" - ] - } - ], + "outputs": [], "source": [ "if metric_name == \"log_loss\":\n", " y_pred = model.predict_proba(X_test)\n", @@ -453,91 +307,12 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "106" - ] - }, - "execution_count": 22, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "model.number_of_trees" ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### target_loss_decrement = (budget / 10.0) * eta * loss_avg;\n", - "\n", - "\n", - "\n", - "| Perpetual budget | Seed | Perpetual mse | Perpetual cpu time | cpu time improved |\n", - "| ---------------- | ---- | ------------- | ------------------ | ----------------- |\n", - "| 1.0 | 0 | 0.187273 | 9.23 | 9.28 |\n", - "| 1.0 | 1 | 0.189911 | 10.5 | 9.69 |\n", - "| 1.0 | 2 | 0.194937 | 11.0 | 11.0 |\n", - "| 1.0 | 3 | 0.182932 | 9.77 | 10.5 |\n", - "| 1.0 | 4 | 0.198443 | 9.88 | 8.58 |\n", - "| 1.0 | avg | 0.190699 | 10.1 | 9.81 |\n", - "| 1.5 | 0 | 0.185843 | 28.6 | 27.2 |\n", - "| 1.5 | 1 | 0.188146 | 26.8 | 25.5 |\n", - "| 1.5 | 2 | 0.190484 | 26.6 | 25.2 |\n", - "| 1.5 | 3 | 0.178708 | 25.1 | 23.1 |\n", - "| 1.5 | 4 | 0.192352 | 21.6 | 20.8 |\n", - "| 1.5 | avg | 0.187107 | 25.7 | 24.4 |" - ] - }, - { - "cell_type": 
"markdown", - "metadata": {}, - "source": [ - "### tld = eta * eta * loss_avg\n", - "\n", - "\n", - "\n", - "| Perpetual budget | Seed | Perpetual mse | Perpetual cpu time |\n", - "| ---------------- | ---- | ------------- | ------------------ |\n", - "| 1.1 | 0 | 0.190265 | 8.27 |\n", - "| 1.1 | 1 | 0.190839 | 8.81 |\n", - "| 1.1 | 2 | 0.198457 | 11.2 |\n", - "| 1.1 | 3 | 0.181992 | 8.94 |\n", - "| 1.1 | 4 | 0.199403 | 7.47 |\n", - "| 1.1 | avg | 0.192191 | 8.94 |\n", - "| 1.4 | 0 | 0.189875 | 16.8 |\n", - "| 1.4 | 1 | 0.186736 | 20.0 |\n", - "| 1.4 | 2 | 0.191496 | 21.1 |\n", - "| 1.4 | 3 | 0.180240 | 19.2 |\n", - "| 1.4 | 4 | 0.197255 | 18.3 |\n", - "| 1.4 | avg | 0.189120 | 19.1 |\n", - "| 1.5 | 0 | 0.189845 | 20.5 |\n", - "| 1.5 | 1 | 0.188703 | 23.9 |\n", - "| 1.5 | 2 | 0.195430 | 31.3 |\n", - "| 1.5 | 3 | 0.179527 | 27.6 |\n", - "| 1.5 | 4 | 0.196902 | 23.2 |\n", - "| 1.5 | avg | 0.190081 | 25.3 |\n", - "| 1.6 | 0 | 0.188318 | 28.4 |\n", - "| 1.6 | 1 | 0.187110 | 31.8 |\n", - "| 1.6 | 2 | 0.195210 | 37.9 |\n", - "| 1.6 | 3 | 0.179427 | 33.9 |\n", - "| 1.6 | 4 | 0.197369 | 28.1 |\n", - "| 1.6 | avg | 0.189487 | 32.0 |" - ] } ], "metadata": { diff --git a/python-package/pyproject.toml b/python-package/pyproject.toml index 170765c..c260289 100644 --- a/python-package/pyproject.toml +++ b/python-package/pyproject.toml @@ -4,6 +4,7 @@ build-backend = "maturin" [project] name = "perpetual" +version = "0.2.0" description = "A self-generalizing, hyperparameter-free gradient boosting machine" license = { file = "LICENSE" } keywords = [ @@ -17,7 +18,7 @@ keywords = [ ] authors = [{ name = "Mutlu Simsek" }] readme = "README.md" -dependencies = ["numpy>=1.21", "pandas>=2.0"] +dependencies = ["numpy"] requires-python = ">=3.8" classifiers = [ "Programming Language :: Rust", @@ -30,7 +31,7 @@ classifiers = [ ] [project.optional-dependencies] -dev = ["maturin", "pytest", "seaborn", "scikit-learn", "mkdocs-material==9.*", "mkdocstrings[python]==0.22.0", "mkdocs-autorefs", "ruff>=0.0.272", "ucimlrepo"] +dev = ["pandas", "polars", "pyarrow", "maturin", "pytest", "seaborn", "scikit-learn", "mkdocs-material==9.*", "mkdocstrings[python]==0.22.0", "mkdocs-autorefs", "ruff>=0.0.272", "ucimlrepo"] [tool.maturin] sdist-include = ["LICENSE", "README.md"] diff --git a/python-package/python/perpetual/__init__.py b/python-package/python/perpetual/__init__.py index 5e59c57..7e732c0 100644 --- a/python-package/python/perpetual/__init__.py +++ b/python-package/python/perpetual/__init__.py @@ -7,15 +7,13 @@ from typing import Any, Dict, Iterable, Optional, Protocol, Union, cast import numpy as np -import pandas as pd from perpetual.perpetual import PerpetualBooster as CratePerpetualBooster # type: ignore from perpetual.serialize import BaseSerializer, ObjectSerializer + __all__ = ["PerpetualBooster"] -ArrayLike = Union[pd.Series, np.ndarray] -FrameLike = Union[pd.DataFrame, np.ndarray] CONTRIBUTION_METHODS = { "weight": "Weight", @@ -136,8 +134,28 @@ def get_metadata(self, key: str) -> str: """pass""" +def type_df(df): + if type(df).__name__ == "DataFrame": + if type(df).__module__.split(".")[0] == "pandas": + return "pandas_df" + elif type(df).__module__.split(".")[0] == "polars": + return "polars_df" + else: + return "" + + +def type_series(y): + if type(y).__name__ == "Series": + if type(y).__module__.split(".")[0] == "pandas": + return "pandas_series" + elif type(y).__module__.split(".")[0] == "polars": + return "polars_series" + else: + return "" + + def convert_input_frame( - X: FrameLike, 
categorical_features + X, categorical_features ) -> tuple[list[str], np.ndarray, int, int, Optional[Iterable[int]], Optional[dict]]: """Convert data to format needed by booster. @@ -145,7 +163,7 @@ def convert_input_frame( tuple[list[str], np.ndarray, int, int, Optional[Iterable[int]], Optional[dict]]: Return column names, the flat data, number of rows, the number of columns, cat_index, cat_mapping """ categorical_features_ = None - if isinstance(X, pd.DataFrame): + if type_df(X) == "pandas_df": X_ = X.to_numpy() features_ = X.columns.to_list() if categorical_features == "auto": @@ -153,6 +171,20 @@ def convert_input_frame( categorical_features_ = [ features_.index(c) for c in categorical_columns ] or None + elif type_df(X) == "polars_df": + import polars.selectors as cs + + try: + X_ = X.to_numpy(allow_copy=False) + except: + X_ = X.to_numpy(allow_copy=True) + + features_ = X.columns + if categorical_features == "auto": + categorical_columns = X.select(cs.categorical()).columns + categorical_features_ = [ + features_.index(c) for c in categorical_columns + ] or None else: # Assume it's a numpy array. X_ = X @@ -208,17 +240,21 @@ def f(x): return features_, flat_data, rows, cols, categorical_features_, cat_mapping -def transform_input_frame( - X: FrameLike, cat_mapping -) -> tuple[list[str], np.ndarray, int, int]: +def transform_input_frame(X, cat_mapping) -> tuple[list[str], np.ndarray, int, int]: """Convert data to format needed by booster. Returns: tuple[list[str], np.ndarray, int, int]: Return column names, the flat data, number of rows, the number of columns """ - if isinstance(X, pd.DataFrame): + if type_df(X) == "pandas_df": X_ = X.to_numpy() features_ = X.columns.to_list() + elif type_df(X) == "polars_df": + try: + X_ = X.to_numpy(allow_copy=False) + except: + X_ = X.to_numpy(allow_copy=True) + features_ = X.columns else: # Assume it's a numpy array. X_ = X @@ -248,9 +284,11 @@ def f(x): return features_, flat_data, rows, cols -def _convert_input_array(x: ArrayLike) -> np.ndarray: - if isinstance(x, pd.Series): +def _convert_input_array(x) -> np.ndarray: + if type_series(x) == "pandas_series": x_ = x.to_numpy() + elif type_series(x) == "polars_series": + x_ = x.to_numpy(allow_copy=False) else: x_ = x if not np.issubdtype(x_.dtype, "float64"): @@ -258,9 +296,11 @@ def _convert_input_array(x: ArrayLike) -> np.ndarray: return x_ -def _convert_input_array_f32(x: ArrayLike) -> np.ndarray: - if isinstance(x, pd.Series): +def _convert_input_array_f32(x) -> np.ndarray: + if type_series(x) == "pandas_series": x_ = x.to_numpy() + elif type_series(x) == "polars_series": + x_ = x.to_numpy(allow_copy=False) else: x_ = x if not np.issubdtype(x_.dtype, "float32"): @@ -413,9 +453,9 @@ def __init__( def fit( self, - X: FrameLike, - y: ArrayLike, - sample_weight: Union[ArrayLike, None] = None, + X, + y, + sample_weight=None, budget: float = 1.0, reset: Union[bool, None] = None, categorical_features: Union[Iterable[int], Iterable[str], str, None] = "auto", @@ -423,8 +463,8 @@ def fit( """Fit the gradient booster on a provided dataset. Args: - X (FrameLike): Either a pandas DataFrame, or a 2 dimensional numpy array. - y (ArrayLike): Either a pandas Series, or a 1 dimensional numpy array. If "LogLoss" + X (FrameLike): Either a Polars or Pandas DataFrame, or a 2 dimensional Numpy array. + y (ArrayLike): Either a Polars or Pandas Series, or a 1 dimensional Numpy array. 
If "LogLoss" was the objective type specified, then this should only contain 1 or 0 values, where 1 is the positive class being predicted. If "SquaredLoss" is the objective type, then any continuous variable can be provided. @@ -482,7 +522,7 @@ def _validate_features(self, features: list[str]): f"Columns mismatch between data {features} passed, and data {self.feature_names_in_} used at fit." ) - def predict(self, X: FrameLike, parallel: Union[bool, None] = None) -> np.ndarray: + def predict(self, X, parallel: Union[bool, None] = None) -> np.ndarray: """Predict with the fitted booster on new data. Args: @@ -518,7 +558,7 @@ def feature_importances_(self) -> np.ndarray: return np.array([vals.get(ft, 0.0) for ft in range(self.n_features_)]) def predict_contributions( - self, X: FrameLike, method: str = "Average", parallel: Union[bool, None] = None + self, X, method: str = "Average", parallel: Union[bool, None] = None ) -> np.ndarray: """Predict with the fitted booster on new data, returning the feature contribution matrix. The last column is the bias term. @@ -558,7 +598,7 @@ def predict_contributions( def partial_dependence( self, - X: FrameLike, + X, feature: Union[str, int], samples: int | None = 100, exclude_missing: bool = True, @@ -629,7 +669,7 @@ def partial_dependence( """ if isinstance(feature, str): - if not isinstance(X, pd.DataFrame): + if not (type_df(X) == "pandas_df" or type_df(X) == "polars_df"): raise ValueError( "If `feature` is a string, then the object passed as `X` must be a pandas DataFrame." ) @@ -648,8 +688,10 @@ def partial_dependence( [feature_idx] = [i for i, v in enumerate(X.columns) if v == feature] elif isinstance(feature, int): feature_idx = feature - if isinstance(X, pd.DataFrame): - values = X.iloc[:, feature].to_numpy() + if type_df(X) == "pandas_df": + values = X.to_numpy()[:, feature] + elif type_df(X) == "polars_df": + values = X.to_numpy(allow_copy=False)[:, feature] else: values = X[:, feature] else: @@ -788,7 +830,7 @@ def save_booster(self, path: str): def _standardize_monotonicity_map( self, - X: Union[pd.DataFrame, np.ndarray], + X, ) -> dict[int, Any]: if isinstance(X, np.ndarray): return self.monotone_constraints @@ -798,7 +840,7 @@ def _standardize_monotonicity_map( def _standardize_terminate_missing_features( self, - X: Union[pd.DataFrame, np.ndarray], + X, ) -> set[int]: if isinstance(X, np.ndarray): return set(self.terminate_missing_features) @@ -941,14 +983,14 @@ def get_node_lists(self, map_features_names: bool = True) -> list[list[Node]]: trees.append(nodes) return trees - def trees_to_dataframe(self) -> pd.DataFrame: - """Return the tree structure as a pandas DataFrame object. + def trees_to_dataframe(self): + """Return the tree structure as a Polars or Pandas DataFrame object. Returns: - pd.DataFrame: Trees in a pandas dataframe. + DataFrame: Trees in a Polars or Pandas DataFrame. Example: - This can be used directly to print out the tree structure as a pandas dataframe. The Leaf values will have the "Gain" column replaced with the weight value. + This can be used directly to print out the tree structure as a dataframe. The Leaf values will have the "Gain" column replaced with the weight value. 
```python model.trees_to_dataframe().head() @@ -989,6 +1031,15 @@ def _id(i: int) -> str: for n in tree ] - return pd.DataFrame.from_records(vals).sort_values( - ["Tree", "Node"], ascending=[True, True] - ) + try: + import polars as pl + + return pl.from_records(vals).sort( + ["Tree", "Node"], descending=[False, False] + ) + except: + import pandas as pd + + return pd.DataFrame.from_records(vals).sort_values( + ["Tree", "Node"], ascending=[True, True] + ) diff --git a/python-package/src/lib.rs b/python-package/src/lib.rs index b1df5bb..e9be180 100644 --- a/python-package/src/lib.rs +++ b/python-package/src/lib.rs @@ -203,7 +203,7 @@ impl PerpetualBooster { let flat_data = flat_data.as_slice()?; let data = Matrix::new(flat_data, rows, cols); let parallel = parallel.unwrap_or(true); - Ok(self.booster.predict(&data, parallel, None).into_pyarray_bound(py)) + Ok(self.booster.predict(&data, parallel).into_pyarray_bound(py)) } pub fn predict_contributions<'py>( diff --git a/python-package/tests/test_booster.py b/python-package/tests/test_booster.py index cbbd22e..6bb757b 100644 --- a/python-package/tests/test_booster.py +++ b/python-package/tests/test_booster.py @@ -490,7 +490,7 @@ def test_missing_treatment(X_y): contribus_average = model.predict_contributions(X, method="average") assert contribus_average.shape[1] == X.shape[1] + 1 # Wont be exactly zero because of floating point error. - assert np.allclose(contribus_average[:, :-1][X.isna()], 0, atol=0.000001) + assert np.allclose(contribus_average[:, :-1][X.isna()], 0, atol=0.001) assert np.allclose(contribus_weight.sum(1), preds) assert (X.isna().sum().sum()) > 0 @@ -514,7 +514,7 @@ def test_missing_treatment_split_further(X_y): [pclass_idx] = [i for i, f in enumerate(X.columns) if f == "pclass"] [fare_idx] = [i for i, f in enumerate(X.columns) if f == "fare"] - model.fit(X, y=y) + model.fit(X, y) preds = model.predict(X) contribus_weight = model.predict_contributions(X, method="weight") @@ -530,7 +530,7 @@ def test_missing_treatment_split_further(X_y): contribus_average = model.predict_contributions(X, method="average") assert np.allclose(contribus_weight.sum(1), preds) - assert np.allclose(contribus_average, contribus_weight, atol=0.000001) + assert np.allclose(contribus_average, contribus_weight, atol=0.001) def test_AverageNodeWeight_missing_node_treatment(X_y): @@ -754,3 +754,28 @@ def test_categorical(X_y): X[cols] = X[cols].astype("category") model = PerpetualBooster() model.fit(X, y) + + +def test_polars(X_y): + import polars as pl + + X = pl.from_pandas(pd.read_csv("../resources/adult_test_df.csv", index_col=False)) + y = np.array( + pd.read_csv( + "../resources/adult_test_y.csv", index_col=False, header=None + ).squeeze("columns") + ) + cols = [ + "workclass", + "education", + "marital-status", + "occupation", + "relationship", + "race", + "native-country", + ] + X = X.with_columns(pl.col(cols).cast(pl.Categorical)) + model = PerpetualBooster() + model.fit(X, y) + model.predict(X) + model.trees_to_dataframe() diff --git a/src/booster.rs b/src/booster.rs index 50d673e..a39b86f 100644 --- a/src/booster.rs +++ b/src/booster.rs @@ -10,13 +10,12 @@ use crate::splitter::{MissingBranchSplitter, MissingImputerSplitter, Splitter}; use crate::tree::{Tree, TreeStopper}; use crate::utils::odds; use hashbrown::HashMap as BrownHashMap; -use log::info; +use log::{info, warn}; use rayon::prelude::*; use serde::{Deserialize, Deserializer, Serialize}; use std::collections::{HashMap, HashSet}; use std::fs; -pub type CalData<'a> = (Matrix<'a, f64>, &'a 
[f64], &'a [f32]); // (x_flat_data, rows, cols), y, alpha type ImportanceFn = fn(&Tree, &mut HashMap); #[derive(Serialize, Deserialize)] @@ -324,7 +323,7 @@ impl PerpetualBooster { self.base_score = calc_init_callables(&self.objective)(y, sample_weight, alpha); yhat = vec![self.base_score; y.len()]; } else { - yhat = self.predict(data, self.parallel, None); + yhat = self.predict(data, self.parallel); } let calc_grad_hess = gradient_hessian_callables(&self.objective); @@ -334,12 +333,13 @@ impl PerpetualBooster { let loss_base = calc_loss(y, &vec![self.base_score; y.len()], sample_weight, alpha); let loss_avg = loss_base.iter().sum::() / loss_base.len() as f32; - let eta = splitter.get_eta(); - let target_loss_decrement = (budget / 10.0) * eta * loss_avg; - // let eta = splitter.get_eta(); - // let grad_avg = grad.iter().map(|g| g.abs()).sum::() / grad.len() as f32; - // let target_loss_decrement = eta * eta * grad_avg; + let base = 10.0_f32; + let n = base / budget; + let reciprocals_of_powers = n / (n - 1.0); + let truncated_series_sum = reciprocals_of_powers - (1.0 + 1.0 / n); + let theta = 1.0 / n - truncated_series_sum; + let target_loss_decrement = theta * base.powf(-budget) * loss_avg; let is_const_hess = match sample_weight { Some(_sample_weight) => false, @@ -404,21 +404,25 @@ impl PerpetualBooster { if tree.nodes.len() < 5 && tree.nodes.values().last().unwrap().generalization < Some(1.0) { es += 1; - println!( - "round {0}, tree.nodes: {1}, tree.depth: {2}, early stopping: {3}", - i, - tree.nodes.len(), - tree.depth, - es - ); + if verbose { + println!( + "round {0}, tree.nodes: {1}, tree.depth: {2}, early stopping: {3}", + i, + tree.nodes.len(), + tree.depth, + es + ); + } } else { // es = 0; - println!( - "round {0}, tree.nodes: {1}, tree.depth: {2}", - i, - tree.nodes.len(), - tree.depth, - ); + if verbose { + println!( + "round {0}, tree.nodes: {1}, tree.depth: {2}", + i, + tree.nodes.len(), + tree.depth, + ); + } } if tree.stopper != TreeStopper::LossDecr { @@ -436,13 +440,13 @@ impl PerpetualBooster { (grad, hess) = calc_grad_hess(y, &yhat, sample_weight, alpha); loss = calc_loss(y, &yhat, sample_weight, alpha); - if verbose { - info!("Completed iteration {}", i); + if i == ITERATIONS_LIMIT - 1 { + warn!("Reached iteration limit before early stopping. Try to decrease the budget for the best performance."); } } if self.log_iterations > 0 { - info!("Finished training booster with {} iterations.", self.trees.len()); + info!("Finished training booster with {} trees.", self.trees.len()); } Ok(()) } @@ -455,13 +459,9 @@ impl PerpetualBooster { /// Generate predictions on data using the gradient booster. /// /// * `data` - Either a pandas DataFrame, or a 2 dimensional numpy array. - /// * `score` - Optional unceartinity score. - pub fn predict(&self, data: &Matrix, parallel: bool, score: Option) -> Vec { - let init_pred = match score { - None => self.base_score, - Some(score) => self.base_score + score, - }; - let mut init_preds = vec![init_pred; data.rows]; + /// * `parallel` - Predict in parallel. + pub fn predict(&self, data: &Matrix, parallel: bool) -> Vec { + let mut init_preds = vec![self.base_score; data.rows]; self.get_prediction_trees().iter().for_each(|tree| { for (p_, val) in init_preds.iter_mut().zip(tree.predict(data, parallel, &self.missing)) { *p_ += val; @@ -470,6 +470,19 @@ impl PerpetualBooster { init_preds } + /// Generate probabilities on data using the gradient booster. + /// + /// * `data` - Either a pandas DataFrame, or a 2 dimensional numpy array. 
+ /// * `parallel` - Predict in parallel. + pub fn predict_proba(&self, data: &Matrix, parallel: bool) -> Vec { + let preds = self.predict(data, parallel); + if parallel { + preds.par_iter().map(|p| 1.0 / (1.0 + (-p).exp())).collect() + } else { + preds.iter().map(|p| 1.0 / (1.0 + (-p).exp())).collect() + } + } + /// Predict the contributions matrix for the provided dataset. pub fn predict_contributions(&self, data: &Matrix, method: ContributionsMethod, parallel: bool) -> Vec { match method { @@ -887,7 +900,7 @@ mod tests { .set_base_score(0.5) .set_initialize_base_score(false); booster.fit(&data, &y, None, None, 0.3, None, None).unwrap(); - let preds = booster.predict(&data, false, None); + let preds = booster.predict(&data, false); let contribs = booster.predict_contributions(&data, ContributionsMethod::Average, false); assert_eq!(contribs.len(), (data.cols + 1) * data.rows); println!("{}", booster.trees[0]); @@ -905,14 +918,14 @@ mod tests { let y: Vec = file.lines().map(|x| x.parse::().unwrap()).collect(); let data = Matrix::new(&data_vec, 891, 5); - //let data = Matrix::new(data.get_col(1), 891, 1); + let mut booster = PerpetualBooster::default() .set_nbins(300) .set_base_score(0.5) .set_initialize_base_score(false); booster.fit(&data, &y, None, None, 0.3, None, None).unwrap(); - let preds = booster.predict(&data, false, None); + let preds = booster.predict(&data, false); let contribs = booster.predict_contributions(&data, ContributionsMethod::Average, false); assert_eq!(contribs.len(), (data.cols + 1) * data.rows); println!("{}", booster.trees[0]); @@ -938,7 +951,7 @@ mod tests { booster.fit(&data, &y, None, None, 0.3, None, None).unwrap(); booster.fit(&data, &y, None, None, 0.9, Some(false), None).unwrap(); - let preds = booster.predict(&data, false, None); + let preds = booster.predict(&data, false); let contribs = booster.predict_contributions(&data, ContributionsMethod::Average, false); assert_eq!(contribs.len(), (data.cols + 1) * data.rows); println!("{}", booster.trees[0]); @@ -963,7 +976,7 @@ mod tests { .set_initialize_base_score(true); booster.fit(&data, &y, None, None, 0.3, None, None).unwrap(); - let preds = booster.predict(&data, false, None); + let preds = booster.predict(&data, false); let contribs = booster.predict_contributions(&data, ContributionsMethod::Average, false); assert_eq!(contribs.len(), (data.cols + 1) * data.rows); println!("{}", booster.trees[0]); @@ -989,11 +1002,11 @@ mod tests { .set_initialize_base_score(false); booster.fit(&data, &y, None, None, 0.3, None, None).unwrap(); - let preds = booster.predict(&data, true, None); + let preds = booster.predict(&data, true); booster.save_booster("resources/model64.json").unwrap(); let booster2 = PerpetualBooster::load_booster("resources/model64.json").unwrap(); - assert_eq!(booster2.predict(&data, true, None)[0..10], preds[0..10]); + assert_eq!(booster2.predict(&data, true)[0..10], preds[0..10]); // Test with non-NAN missing. 
     booster.missing = 0.0;
@@ -1024,7 +1037,7 @@
             .set_initialize_base_score(false);
         booster.fit(&data, &y, None, None, 0.3, None, Some(&cat_index)).unwrap();
 
-        let preds = booster.predict(&data, true, None);
+        let preds = booster.predict(&data, true);
         println!("{:?}", &preds[..10]);
 
         Ok(())
     }
diff --git a/src/constants.rs b/src/constants.rs
index b8977ac..f9a7f70 100644
--- a/src/constants.rs
+++ b/src/constants.rs
@@ -1,3 +1,3 @@
 pub const STOPPING_ROUNDS: usize = 3;
 pub const ITERATIONS_LIMIT: usize = 10000;
-pub const N_NODES_ALLOCATED: usize = 10000;
+pub const N_NODES_ALLOCATED: usize = 3000;
diff --git a/src/histogram.rs b/src/histogram.rs
index 8c827c4..8f3540c 100644
--- a/src/histogram.rs
+++ b/src/histogram.rs
@@ -277,13 +277,13 @@ impl HistogramMatrix {
     /// matrix, and the other child histogram matrix. This should be used
     /// when the node has only two possible splits, left and right.
     pub fn from_parent_child(
-        hist_map: &mut HashMap<usize, HistogramMatrix>,
+        hist_tree: &mut HashMap<usize, HistogramMatrix>,
         root_num: usize,
         child_num: usize,
         update_num: usize,
     ) {
         unsafe {
-            let mut histograms = hist_map
+            let mut histograms = hist_tree
                 .get_many_unchecked_mut([&root_num, &child_num, &update_num])
                 .unwrap();
 
@@ -309,14 +309,14 @@ impl HistogramMatrix {
     /// and two other child histograms. This should be used when the node has
     /// three possible split paths, right, left, and missing.
     pub fn from_parent_two_children(
-        hist_map: &mut HashMap<usize, HistogramMatrix>,
+        hist_tree: &mut HashMap<usize, HistogramMatrix>,
         root_num: usize,
         first_num: usize,
         second_num: usize,
         update_num: usize,
     ) {
         unsafe {
-            let mut histograms = hist_map
+            let mut histograms = hist_tree
                 .get_many_unchecked_mut([&root_num, &first_num, &second_num, &update_num])
                 .unwrap();
 
diff --git a/src/splitter.rs b/src/splitter.rs
index 0638c82..dc9cef7 100644
--- a/src/splitter.rs
+++ b/src/splitter.rs
@@ -7,6 +7,7 @@ use crate::tree::Tree;
 use crate::utils::{
     between, bound_to_parent, constrained_weight, constrained_weight_const_hess, cull_gain,
     gain_given_weight, gain_given_weight_const_hess, pivot_on_split, pivot_on_split_const_hess, pivot_on_split_exclude_missing,
+    pivot_on_split_exclude_missing_const_hess,
 };
 use hashbrown::HashMap;
 use std::collections::HashSet;
@@ -888,32 +889,50 @@ impl Splitter for MissingBranchSplitter {
         split_info: SplitInfo,
         n_nodes: &usize,
         node: &mut SplittableNode,
-        index: &mut [usize],
+        mut index: &mut [usize],
         col_index: &[usize],
         data: &Matrix<u16>,
         cuts: &JaggedMatrix<f64>,
         grad: &mut [f32],
-        hess: Option<&mut [f32]>,
+        mut hess: Option<&mut [f32]>,
         parallel: bool,
-        hist_map: &mut HashMap<usize, HistogramMatrix>,
+        hist_tree: &mut HashMap<usize, HistogramMatrix>,
         cat_index: Option<&[u64]>,
     ) -> Vec<SplittableNode> {
         let missing_child = *n_nodes;
         let left_child = missing_child + 1;
         let right_child = missing_child + 2;
-        let (left_cat, right_cat) = get_categories(cat_index, &split_info, hist_map, node);
+        let (left_cat, right_cat) = get_categories(cat_index, &split_info, hist_tree, node);
 
         // We need to move all of the indices above and below our
         // split value.
         // pivot the sub array that this node has on our split value
         // Missing all falls to the bottom.
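         // NOTE: the updated pivot helpers below also receive `grad` (and `hess`
         // when hessians are not constant), so the gradient buffers are reordered
         // in lockstep with `index`; the `_const_hess` variant skips the hessian
         // slice entirely.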
- let (mut missing_split_idx, mut split_idx) = pivot_on_split_exclude_missing( - &mut index[node.start_idx..node.stop_idx], - data.get_col(split_info.split_feature), - split_info.split_bin, - left_cat.as_deref(), - ); + let mut missing_split_idx: usize; + let mut split_idx: usize; + if hess.is_some() { + (missing_split_idx, split_idx) = pivot_on_split_exclude_missing( + node.start_idx, + node.stop_idx, + &mut index, + grad, + &mut hess.as_mut().unwrap(), + data.get_col(split_info.split_feature), + split_info.split_bin, + left_cat.as_deref(), + ); + } else { + (missing_split_idx, split_idx) = pivot_on_split_exclude_missing_const_hess( + node.start_idx, + node.stop_idx, + &mut index, + grad, + data.get_col(split_info.split_feature), + split_info.split_bin, + left_cat.as_deref(), + ); + } node.update_children(missing_child, left_child, right_child, &split_info, left_cat, right_cat); @@ -965,7 +984,7 @@ impl Splitter for MissingBranchSplitter { // will be a leaf, assign this node as a leaf. missing_is_leaf = true; if max_ == 1 { - let right_hist = hist_map.get_mut(&right_child).unwrap(); + let right_hist = hist_tree.get_mut(&right_child).unwrap(); right_hist.update( split_idx, node.stop_idx, @@ -978,9 +997,9 @@ impl Splitter for MissingBranchSplitter { parallel, true, ); - HistogramMatrix::from_parent_child(hist_map, node.num, right_child, left_child); + HistogramMatrix::from_parent_child(hist_tree, node.num, right_child, left_child); } else { - let left_hist = hist_map.get_mut(&left_child).unwrap(); + let left_hist = hist_tree.get_mut(&left_child).unwrap(); left_hist.update( missing_split_idx, split_idx, @@ -993,12 +1012,12 @@ impl Splitter for MissingBranchSplitter { parallel, true, ); - HistogramMatrix::from_parent_child(hist_map, node.num, left_child, right_child); + HistogramMatrix::from_parent_child(hist_tree, node.num, left_child, right_child); } } else if max_ == 0 { // Max is missing, calculate the other two // levels histograms. 
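             // NOTE: in every branch only the two smaller children have their
             // histograms computed from the data; the largest child is then
             // derived from the parent histogram (`from_parent_two_children`),
             // which saves one full histogram pass.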
-            let left_hist = hist_map.get_mut(&left_child).unwrap();
+            let left_hist = hist_tree.get_mut(&left_child).unwrap();
             left_hist.update(
                 missing_split_idx,
                 split_idx,
@@ -1011,7 +1030,7 @@
                 parallel,
                 true,
             );
-            let right_hist = hist_map.get_mut(&right_child).unwrap();
+            let right_hist = hist_tree.get_mut(&right_child).unwrap();
             right_hist.update(
                 split_idx,
                 node.stop_idx,
@@ -1024,9 +1043,9 @@
                 parallel,
                 true,
             );
-            HistogramMatrix::from_parent_two_children(hist_map, node.num, left_child, right_child, missing_child);
+            HistogramMatrix::from_parent_two_children(hist_tree, node.num, left_child, right_child, missing_child);
         } else if max_ == 1 {
-            let miss_hist = hist_map.get_mut(&missing_child).unwrap();
+            let miss_hist = hist_tree.get_mut(&missing_child).unwrap();
             miss_hist.update(
                 node.start_idx,
                 missing_split_idx,
@@ -1039,7 +1058,7 @@
                 parallel,
                 true,
             );
-            let right_hist = hist_map.get_mut(&right_child).unwrap();
+            let right_hist = hist_tree.get_mut(&right_child).unwrap();
             right_hist.update(
                 split_idx,
                 node.stop_idx,
@@ -1052,10 +1071,10 @@
                 parallel,
                 true,
             );
-            HistogramMatrix::from_parent_two_children(hist_map, node.num, missing_child, right_child, left_child);
+            HistogramMatrix::from_parent_two_children(hist_tree, node.num, missing_child, right_child, left_child);
         } else {
             // right is the largest
-            let miss_hist = hist_map.get_mut(&missing_child).unwrap();
+            let miss_hist = hist_tree.get_mut(&missing_child).unwrap();
             miss_hist.update(
                 node.start_idx,
                 missing_split_idx,
@@ -1068,7 +1087,7 @@
                 parallel,
                 true,
             );
-            let left_hist = hist_map.get_mut(&left_child).unwrap();
+            let left_hist = hist_tree.get_mut(&left_child).unwrap();
             left_hist.update(
                 missing_split_idx,
                 split_idx,
@@ -1081,7 +1100,7 @@
                 parallel,
                 true,
             );
-            HistogramMatrix::from_parent_two_children(hist_map, node.num, missing_child, left_child, right_child);
+            HistogramMatrix::from_parent_two_children(hist_tree, node.num, missing_child, left_child, right_child);
         }
 
         let mut missing_node = SplittableNode::from_node_info(
@@ -1535,7 +1554,7 @@ impl Splitter for MissingImputerSplitter {
 fn get_categories(
     cat_index: Option<&[u64]>,
     s: &SplitInfo,
-    hist_map: &HashMap<usize, HistogramMatrix>,
+    hist_tree: &HashMap<usize, HistogramMatrix>,
     n: &SplittableNode,
 ) -> (Option<Vec<u16>>, Option<Vec<u16>>) {
     let (left_cat, right_cat) = match cat_index {
@@ -1543,7 +1562,7 @@ fn get_categories(
         if c_index.contains(&(s.split_feature as u64)) {
             let mut left_cat = Vec::new();
             let mut right_cat = Vec::new();
-            let hist = hist_map.get(&n.num).unwrap().0.get_col(s.split_feature);
+            let hist = hist_tree.get(&n.num).unwrap().0.get_col(s.split_feature);
             let mut is_left = true;
             for bin in hist.iter() {
                 if bin.num == s.split_bin {
@@ -1654,37 +1673,37 @@ mod tests {
         assert!(between(93.0, 95.0, s.left_node.cover));
         assert!(between(114.0, 116.0, s.right_node.cover));
         assert!(between(7.0, 7.2, s.left_node.gain));
-        assert!(between(298.0, 300.0, s.right_node.gain));
-        assert!(between(88.0, 89.0, s.split_gain));
+        assert!(between(298.0, 302.0, s.right_node.gain));
+        assert!(between(88.0, 95.0, s.split_gain));
     }
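
The cal_housing tests below, and their twins in src/tree.rs further down, also track the polars 0.41 API: `DataFrame::melt(id_vars, value_vars)` became `DataFrame::unpivot(on, index)`, with the value columns now passed first. A minimal sketch mirroring the tests' usage (two columns stand in for the full list):

```rust
use polars::prelude::*;

// Wide to long: one (variable, value) row per original cell.
// Note the argument order relative to the old `melt` call.
fn to_long(df: &DataFrame) -> PolarsResult<DataFrame> {
    let id_vars: Vec<&str> = Vec::new(); // the tests pass no id columns
    df.unpivot(["MedInc", "HouseAge"], &id_vars)
}
```
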
"HouseAge".to_owned(), + "AveRooms".to_owned(), + "AveBedrms".to_owned(), + "Population".to_owned(), + "AveOccup".to_owned(), + "Latitude".to_owned(), + "Longitude".to_owned(), + "MedHouseVal".to_owned(), ]; let df_test = CsvReadOptions::default() .with_has_header(true) - .with_columns(Some(Arc::new(feature_names.iter().map(|&s| s.to_string()).collect()))) + .with_columns(Some(Arc::new(feature_names))) .try_into_reader_with_file_path(Some("resources/cal_housing_test.csv".into()))? .finish() .unwrap(); let id_vars: Vec<&str> = Vec::new(); - let mdf_test = df_test.melt( - &id_vars, + let mdf_test = df_test.unpivot( [ "MedInc", "HouseAge", @@ -1695,6 +1714,7 @@ mod tests { "Latitude", "Longitude", ], + &id_vars, )?; let data_test = Vec::from_iter( diff --git a/src/tree.rs b/src/tree.rs index 1cd76cc..8a15e7f 100644 --- a/src/tree.rs +++ b/src/tree.rs @@ -715,7 +715,7 @@ mod tests { // println!("{}", tree); // let preds = tree.predict(&data, false); // println!("{:?}", &preds[0..10]); - assert_eq!(17, tree.nodes.len()); + assert_eq!(37, tree.nodes.len()); // Test contributions prediction... let weights = tree.distribute_leaf_weights(); let mut contribs = vec![0.; (data.cols + 1) * data.rows]; @@ -987,27 +987,26 @@ mod tests { let n_bins = 256; let n_cols = 8; let feature_names = [ - "MedInc", - "HouseAge", - "AveRooms", - "AveBedrms", - "Population", - "AveOccup", - "Latitude", - "Longitude", - "MedHouseVal", + "MedInc".to_owned(), + "HouseAge".to_owned(), + "AveRooms".to_owned(), + "AveBedrms".to_owned(), + "Population".to_owned(), + "AveOccup".to_owned(), + "Latitude".to_owned(), + "Longitude".to_owned(), + "MedHouseVal".to_owned(), ]; let df_test = CsvReadOptions::default() .with_has_header(true) - .with_columns(Some(Arc::new(feature_names.iter().map(|&s| s.to_string()).collect()))) + .with_columns(Some(Arc::new(feature_names))) .try_into_reader_with_file_path(Some("resources/cal_housing_test.csv".into()))? .finish() .unwrap(); let id_vars: Vec<&str> = Vec::new(); - let mdf_test = df_test.melt( - &id_vars, + let mdf_test = df_test.unpivot( [ "MedInc", "HouseAge", @@ -1018,6 +1017,7 @@ mod tests { "Latitude", "Longitude", ], + &id_vars, )?; let data_test = Vec::from_iter( diff --git a/src/utils.rs b/src/utils.rs index 00d55d5..900934b 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -378,7 +378,7 @@ pub fn map_bin>(x: &[T], v: &T, missing: &T) -> Option { pub fn pivot_on_split( start: usize, stop: usize, - ind: &mut [usize], + idx: &mut [usize], grad: &mut [f32], hess: &mut [f32], feature: &[u16], @@ -386,7 +386,7 @@ pub fn pivot_on_split( missing_right: bool, left_cat: Option<&[u16]>, ) -> usize { - let index = &mut ind[start..stop]; + let index = &mut idx[start..stop]; let g = &mut grad[start..stop]; let h = &mut hess[start..stop]; @@ -428,14 +428,14 @@ pub fn pivot_on_split( pub fn pivot_on_split_const_hess( start: usize, stop: usize, - ind: &mut [usize], + idx: &mut [usize], grad: &mut [f32], feature: &[u16], split_value: u16, missing_right: bool, left_cat: Option<&[u16]>, ) -> usize { - let index = &mut ind[start..stop]; + let index = &mut idx[start..stop]; let g = &mut grad[start..stop]; let length = index.len(); @@ -489,11 +489,18 @@ pub fn pivot_on_split_const_hess( /// * `split_value` - the split value to use to pivot on. 
 #[inline]
 pub fn pivot_on_split_exclude_missing(
-    index: &mut [usize],
+    start: usize,
+    stop: usize,
+    idx: &mut [usize],
+    grad: &mut [f32],
+    hess: &mut [f32],
     feature: &[u16],
     split_value: u16,
     left_cat: Option<&[u16]>,
 ) -> (usize, usize) {
+    let index = &mut idx[start..stop];
+    let gr = &mut grad[start..stop];
+    let hs = &mut hess[start..stop];
     // I think we can do this in O(n) time...
     let mut low = 0;
     let mut high = index.len() - 1;
@@ -509,6 +516,8 @@
             let l = feature[index[low]];
             if l == 0 {
                 index.swap(missing, low);
+                gr.swap(missing, low);
+                hs.swap(missing, low);
                 missing += 1;
             }
             match exclude_missing_compare(&split_value, l, left_cat) {
@@ -523,6 +532,8 @@
             // then that value with low.
             if h == 0 {
                 index.swap(missing, high);
+                gr.swap(missing, high);
+                hs.swap(missing, high);
                 missing += 1;
                 // Low must be at least equal to
                 // missing. Otherwise, we would get
@@ -542,6 +553,76 @@
         }
         if low < high {
             index.swap(high, low);
+            gr.swap(high, low);
+            hs.swap(high, low);
+        }
+    }
+    (missing, low)
+}
+
+#[inline]
+pub fn pivot_on_split_exclude_missing_const_hess(
+    start: usize,
+    stop: usize,
+    idx: &mut [usize],
+    grad: &mut [f32],
+    feature: &[u16],
+    split_value: u16,
+    left_cat: Option<&[u16]>,
+) -> (usize, usize) {
+    let index = &mut idx[start..stop];
+    let gr = &mut grad[start..stop];
+    // I think we can do this in O(n) time...
+    let mut low = 0;
+    let mut high = index.len() - 1;
+    // The index of the first value, that is not
+    // missing.
+    let mut missing = 0;
+    let max_idx = high;
+    while low < high {
+        // Go until we find a low value that needs to
+        // be swapped, this will be the first value
+        // that our split value is less or equal to.
+        while low < max_idx {
+            let l = feature[index[low]];
+            if l == 0 {
+                index.swap(missing, low);
+                gr.swap(missing, low);
+                missing += 1;
+            }
+            match exclude_missing_compare(&split_value, l, left_cat) {
+                Ordering::Less | Ordering::Equal => break,
+                Ordering::Greater => low += 1,
+            }
+        }
+        while high > low {
+            let h = feature[index[high]];
+            // If this is missing, we need to
+            // swap this value with missing, and
+            // then that value with low.
+            if h == 0 {
+                index.swap(missing, high);
+                gr.swap(missing, high);
+                missing += 1;
+                // Low must be at least equal to
+                // missing. Otherwise, we would get
+                // stuck, because low will be zero
+                // then...
+                if missing > low {
+                    low = missing;
+                }
+            }
+            // Go until we find a high value that needs to be
+            // swapped, this will be the first value that our
+            // split_value is greater than.
+            match exclude_missing_compare(&split_value, h, left_cat) {
+                Ordering::Less | Ordering::Equal => high -= 1,
+                Ordering::Greater => break,
+            }
+        }
+        if low < high {
+            index.swap(high, low);
+            gr.swap(high, low);
         }
     }
     (missing, low)
@@ -903,7 +984,9 @@ mod tests {
         // Using minimum value...
         let mut idx = vec![2, 6, 9, 5, 8, 13, 11, 7];
         let f = vec![15, 10, 10, 0, 3, 0, 0, 9, 3, 5, 2, 6, 13, 19, 14];
-        let split_i = pivot_on_split_exclude_missing(&mut idx, &f, 1, None);
+        let mut grad = idx.iter().map(|i| *i as f32).collect::<Vec<f32>>();
+        let mut hess = grad.clone();
+        let split_i = pivot_on_split_exclude_missing(0, idx.len(), &mut idx, &mut grad, &mut hess, &f, 1, None);
         // let map_ = idx.iter().map(|i| f[*i]).collect::<Vec<u16>>();
         // println!("{:?}, {:?}, {:?}", split_i, idx, map_);
         pivot_missing_assert(1, &idx, &f, &split_i);
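
Every rewritten test below funnels into `pivot_missing_assert`. A hypothetical checker along these lines (not the crate's helper, and covering only the numeric, non-categorical case) makes the verified invariant explicit:

```rust
// bounds is the (missing_end, left_end) pair returned by the pivot:
// [..missing_end] missing, [missing_end..left_end] left, [left_end..] right.
fn check_partition(idx: &[usize], f: &[u16], split: u16, bounds: (usize, usize)) {
    let (missing_end, left_end) = bounds;
    assert!(idx[..missing_end].iter().all(|&i| f[i] == 0));
    assert!(idx[missing_end..left_end].iter().all(|&i| f[i] != 0 && f[i] < split));
    assert!(idx[left_end..].iter().all(|&i| f[i] >= split));
}
```
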
@@ -911,14 +994,16 @@
 
         // Higher value...
         let mut idx = vec![2, 6, 9, 5, 8, 13, 11, 7];
         let f = vec![15, 10, 10, 0, 3, 0, 0, 9, 3, 5, 2, 6, 13, 19, 14];
-        let split_i = pivot_on_split_exclude_missing(&mut idx, &f, 10, None);
+        let mut grad = idx.iter().map(|i| *i as f32).collect::<Vec<f32>>();
+        let mut hess = grad.clone();
+        let split_i = pivot_on_split_exclude_missing(0, idx.len(), &mut idx, &mut grad, &mut hess, &f, 10, None);
         //let split_i = pivot_on_split(&mut idx, &f, 10, false);
         // let map_ = idx.iter().map(|i| f[*i]).collect::<Vec<u16>>();
         // println!("{:?}, {:?}, {:?}", split_i, idx, map_);
         pivot_missing_assert(10, &idx, &f, &split_i);
 
         // Run it again, and ensure it works on an already sorted list...
-        let split_i = pivot_on_split_exclude_missing(&mut idx, &f, 10, None);
+        let split_i = pivot_on_split_exclude_missing(0, idx.len(), &mut idx, &mut grad, &mut hess, &f, 10, None);
         //let split_i = pivot_on_split(&mut idx, &f, 10, false);
         // let map_ = idx.iter().map(|i| f[*i]).collect::<Vec<u16>>();
         // println!("{:?}, {:?}, {:?}", split_i, idx, map_);
@@ -926,7 +1011,7 @@
 
         // Run it again, and ensure it works on reversed list...
         idx.reverse();
-        let split_i = pivot_on_split_exclude_missing(&mut idx, &f, 10, None);
+        let split_i = pivot_on_split_exclude_missing(0, idx.len(), &mut idx, &mut grad, &mut hess, &f, 10, None);
         //let split_i = pivot_on_split(&mut idx, &f, 10, false);
         // let map_ = idx.iter().map(|i| f[*i]).collect::<Vec<u16>>();
         // println!("{:?}, {:?}, {:?}", split_i, idx, map_);
@@ -935,7 +1020,9 @@
         // Small test done with python
         let mut idx = vec![0, 1, 2, 3, 4, 5];
         let f = vec![1, 0, 1, 3, 0, 4];
-        let split_i = pivot_on_split_exclude_missing(&mut idx, &f, 2, None);
+        let mut grad = idx.iter().map(|i| *i as f32).collect::<Vec<f32>>();
+        let mut hess = grad.clone();
+        let split_i = pivot_on_split_exclude_missing(0, idx.len(), &mut idx, &mut grad, &mut hess, &f, 2, None);
         // let map_ = idx.iter().map(|i| f[*i]).collect::<Vec<u16>>();
         // println!("{:?}, {:?}, {:?}", split_i, idx, map_);
         pivot_missing_assert(2, &idx, &f, &split_i);
@@ -952,7 +1039,9 @@
         // TODO: Add more tests for this...
         let mut idx = vec![2, 6, 9, 5, 8, 13, 11, 7];
         let f = vec![15, 10, 10, 2, 3, 5, 7, 9, 3, 5, 2, 6, 13, 19, 14];
-        let split_i = pivot_on_split_exclude_missing(&mut idx, &f, 10, None);
+        let mut grad = idx.iter().map(|i| *i as f32).collect::<Vec<f32>>();
+        let mut hess = grad.clone();
+        let split_i = pivot_on_split_exclude_missing(0, idx.len(), &mut idx, &mut grad, &mut hess, &f, 10, None);
         //let split_i = pivot_on_split(&mut idx, &f, 10, false);
         // println!("{:?}, {:?}, {:?}", split_i, idx, map_);
         // let map_ = idx.iter().map(|i| f[*i]).collect::<Vec<u16>>();
@@ -965,11 +1054,13 @@
         let mut rng = StdRng::seed_from_u64(0);
         let f = (0..100).map(|_| rng.gen_range(0..15)).collect::<Vec<u16>>();
         let mut idx = index.choose_multiple(&mut rng, 73).copied().collect::<Vec<usize>>();
-        let split_i = pivot_on_split_exclude_missing(&mut idx, &f, 10, None);
+        let mut grad = idx.iter().map(|i| *i as f32).collect::<Vec<f32>>();
+        let mut hess = grad.clone();
+        let split_i = pivot_on_split_exclude_missing(0, idx.len(), &mut idx, &mut grad, &mut hess, &f, 10, None);
         pivot_missing_assert(10, &idx, &f, &split_i);
 
         // Already sorted...
-        let split_i = pivot_on_split_exclude_missing(&mut idx, &f, 10, None);
+        let split_i = pivot_on_split_exclude_missing(0, idx.len(), &mut idx, &mut grad, &mut hess, &f, 10, None);
         pivot_missing_assert(10, &idx, &f, &split_i);
 
         // Without missing...
@@ -977,7 +1068,9 @@
         let mut rng = StdRng::seed_from_u64(0);
         let f = (0..100).map(|_| rng.gen_range(1..15)).collect::<Vec<u16>>();
         let mut idx = index.choose_multiple(&mut rng, 73).copied().collect::<Vec<usize>>();
-        let split_i = pivot_on_split_exclude_missing(&mut idx, &f, 5, None);
+        let mut grad = idx.iter().map(|i| *i as f32).collect::<Vec<f32>>();
+        let mut hess = grad.clone();
+        let split_i = pivot_on_split_exclude_missing(0, idx.len(), &mut idx, &mut grad, &mut hess, &f, 5, None);
         // let map_ = idx.iter().map(|i| f[*i]).collect::<Vec<u16>>();
         // println!("{:?}, {:?}, {:?}", split_i, idx, map_);
         pivot_missing_assert(5, &idx, &f, &split_i);
@@ -988,7 +1081,9 @@
         let f = (0..100).map(|_| rng.gen_range(0..15)).collect::<Vec<u16>>();
         let mut idx = index.choose_multiple(&mut rng, 73).copied().collect::<Vec<usize>>();
         let sv = idx.iter().map(|i| f[*i]).max().unwrap();
-        let split_i = pivot_on_split_exclude_missing(&mut idx, &f, sv, None);
+        let mut grad = idx.iter().map(|i| *i as f32).collect::<Vec<f32>>();
+        let mut hess = grad.clone();
+        let split_i = pivot_on_split_exclude_missing(0, idx.len(), &mut idx, &mut grad, &mut hess, &f, sv, None);
         pivot_missing_assert(sv, &idx, &f, &split_i);
 
         // Using non-0 minimum...
@@ -997,7 +1092,9 @@
         let mut rng = StdRng::seed_from_u64(0);
         let f = (0..100).map(|_| rng.gen_range(0..15)).collect::<Vec<u16>>();
         let mut idx = index.choose_multiple(&mut rng, 73).copied().collect::<Vec<usize>>();
         let sv = idx.iter().filter(|i| f[**i] > 0).map(|i| f[*i]).min().unwrap();
-        let split_i = pivot_on_split_exclude_missing(&mut idx, &f, sv, None);
+        let mut grad = idx.iter().map(|i| *i as f32).collect::<Vec<f32>>();
+        let mut hess = grad.clone();
+        let split_i = pivot_on_split_exclude_missing(0, idx.len(), &mut idx, &mut grad, &mut hess, &f, sv, None);
         pivot_missing_assert(sv, &idx, &f, &split_i);
 
         // Using non-0 minimum with no missing...
@@ -1006,7 +1103,9 @@
         let mut rng = StdRng::seed_from_u64(0);
         let f = (0..100).map(|_| rng.gen_range(1..15)).collect::<Vec<u16>>();
         let mut idx = index.choose_multiple(&mut rng, 73).copied().collect::<Vec<usize>>();
         let sv = idx.iter().filter(|i| f[**i] > 0).map(|i| f[*i]).min().unwrap();
-        let split_i = pivot_on_split_exclude_missing(&mut idx, &f, sv, None);
+        let mut grad = idx.iter().map(|i| *i as f32).collect::<Vec<f32>>();
+        let mut hess = grad.clone();
+        let split_i = pivot_on_split_exclude_missing(0, idx.len(), &mut idx, &mut grad, &mut hess, &f, sv, None);
         pivot_missing_assert(sv, &idx, &f, &split_i);
     }
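
Finally, a word on why the `_const_hess` twins exist: for objectives whose second derivative is constant, squared loss being the usual case, the per-record hessian carries no information beyond the record count, so the patch threads `hess: Option<&mut [f32]>` through and skips the hessian buffer entirely on the `None` path. A self-contained illustration of the equivalence, using the standard GBM leaf-weight formula rather than code from this crate:

```rust
// Newton-step leaf weight: w = -G / (H + lambda).
fn leaf_weight(grad_sum: f32, hess_sum: f32, lambda: f32) -> f32 {
    -grad_sum / (hess_sum + lambda)
}

fn main() {
    let grad = [0.5_f32, -1.25, 0.5];
    let g: f32 = grad.iter().sum();
    // Squared loss: hess == 1.0 for every record, so the hessian sum
    // is just the record count and no hessian array needs to exist.
    let h = grad.len() as f32;
    println!("weight = {}", leaf_weight(g, h, 0.0)); // 0.25 / 3
}
```
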