From 6e20f09691ae76fe1e343856c1705dfd1a0752b0 Mon Sep 17 00:00:00 2001
From: Khalil HADJI
Date: Mon, 2 Dec 2019 21:44:47 +0100
Subject: [PATCH 1/3] First attempt to implement linear regression

---
 linfa-supervised/Cargo.toml                    |  16 +++
 linfa-supervised/src/lib.rs                    |   5 +
 .../src/linear_regression/algorithm.rs         | 102 ++++++++++++++++++
 linfa-supervised/src/linear_regression/mod.rs  |   3 +
 4 files changed, 126 insertions(+)
 create mode 100644 linfa-supervised/Cargo.toml
 create mode 100644 linfa-supervised/src/lib.rs
 create mode 100644 linfa-supervised/src/linear_regression/algorithm.rs
 create mode 100644 linfa-supervised/src/linear_regression/mod.rs

diff --git a/linfa-supervised/Cargo.toml b/linfa-supervised/Cargo.toml
new file mode 100644
index 000000000..d034b4fc5
--- /dev/null
+++ b/linfa-supervised/Cargo.toml
@@ -0,0 +1,16 @@
+[package]
+name = "linfa-supervised"
+version = "0.1.0"
+authors = ["Khalil HADJI "]
+edition = "2018"
+description = "A collection of supervised learning algorithms"
+license = "MIT/Apache-2.0"
+
+repository = "https://github.com/LukeMathWalker/linfa"
+
+keywords = ["supervised", "machine-learning", "linfa", "regression", "linear"]
+categories = ["algorithms", "mathematics", "science"]
+
+[dependencies]
+ndarray = { version = "0.13" , features = ["rayon", "approx"] }
+ndarray-linalg = "0.12"
\ No newline at end of file
diff --git a/linfa-supervised/src/lib.rs b/linfa-supervised/src/lib.rs
new file mode 100644
index 000000000..15a1801d2
--- /dev/null
+++ b/linfa-supervised/src/lib.rs
@@ -0,0 +1,5 @@
+mod linear_regression;
+
+pub mod linfa_supervised {
+    pub use crate::linear_regression::*;
+}
diff --git a/linfa-supervised/src/linear_regression/algorithm.rs b/linfa-supervised/src/linear_regression/algorithm.rs
new file mode 100644
index 000000000..6d7df2c5a
--- /dev/null
+++ b/linfa-supervised/src/linear_regression/algorithm.rs
@@ -0,0 +1,102 @@
+#![allow(non_snake_case)]
+use ndarray::{stack, Array, Array1, ArrayBase, Axis, Data, Ix1, Ix2};
+use ndarray_linalg::Solve;
+/* I will probably change the implementation to an enum for more type safety.
+I have to make sure it is a good idea when it comes to Python interoperability.
+enum Intercept {
+    NoIntercept,
+    Intercept(Array1<f64>)
+}
+pub struct LinearRegressor {
+    beta : Option<Array1<f64>>,
+    intercept : Intercept,
+}
+*/
+
+/*
+If fit_intercept is false, we suppose that the regression passes through the origin
+*/
+/*
+The simple linear regression model is
+    y = bX + e  where e ~ N(0, sigma^2 * I)
+In probabilistic terms this corresponds to
+    y - bX ~ N(0, sigma^2 * I)
+    y | X, b ~ N(bX, sigma^2 * I)
+The loss for the model is simply the squared error between the model
+predictions and the true values:
+    Loss = ||y - bX||^2
+The maximum likelihood estimation for the model parameters `beta` can be computed
+in closed form via the normal equation:
+    b = (X^T X)^{-1} X^T y
+where (X^T X)^{-1} X^T is known as the pseudoinverse or Moore-Penrose inverse.
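+
+A tiny worked example of the normal equation, with a single feature, no intercept,
+X = [[1], [2], [3]] and y = [2, 4, 6]:
+    X^T X = 14,    X^T y = 28,    b = 28 / 14 = 2
+Note that this closed form assumes X^T X is invertible (no perfectly collinear
+columns and at least as many samples as features).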
+*/
+pub struct LinearRegressor {
+    beta: Option<Array1<f64>>,
+    fit_intercept: bool,
+}
+
+impl LinearRegressor {
+    pub fn new(fit_intercept: bool) -> LinearRegressor {
+        LinearRegressor {
+            beta: None,
+            fit_intercept,
+        }
+    }
+    /* Instead of assert_eq we should probably return a Result; we first have to have a generic error type for all algorithms. */
+    pub fn fit<A, B>(&mut self, X: &ArrayBase<A, Ix2>, Y: &ArrayBase<B, Ix1>)
+    where
+        A: Data<Elem = f64>,
+        B: Data<Elem = f64>,
+    {
+        let (n_samples, _) = X.dim();
+
+        // We have to make sure that the dimensions match
+        assert_eq!(Y.dim(), n_samples);
+
+        self.beta = if self.fit_intercept {
+            let dummy_column: Array<f64, Ix2> = Array::ones((n_samples, 1));
+            /*
+            if x has 2 features and 3 samples
+            x = [[1,2]
+                ,[3,4]
+                ,[5,6]]
+            dummy_column = []
+            */
+            let X_with_ones = stack(Axis(1), &[dummy_column.view(), X.view()]).unwrap();
+            Some(LinearRegressor::fit_beta(&X_with_ones, Y))
+        } else {
+            Some(LinearRegressor::fit_beta(X, Y))
+        }
+    }
+    fn fit_beta<A, B>(X: &ArrayBase<A, Ix2>, y: &ArrayBase<B, Ix1>) -> Array1<f64>
+    where
+        A: Data<Elem = f64>,
+        B: Data<Elem = f64>,
+    {
+        let rhs = X.t().dot(y);
+        let linear_operator = X.t().dot(X);
+        linear_operator.solve_into(rhs).unwrap()
+    }
+
+    pub fn predict<A>(&self, X: &ArrayBase<A, Ix2>) -> Array1<f64>
+    where
+        A: Data<Elem = f64>,
+    {
+        let (n_samples, _) = X.dim();
+
+        // If we are fitting the intercept, we need an additional column
+        if self.fit_intercept {
+            let dummy_column: Array<f64, Ix2> = Array::ones((n_samples, 1));
+            let X = stack(Axis(1), &[dummy_column.view(), X.view()]).unwrap();
+            match &self.beta {
+                None => panic!("The linear regression estimator has to be fitted first!"),
+                Some(beta) => X.dot(beta),
+            }
+        } else {
+            match &self.beta {
+                None => panic!("The linear regression estimator has to be fitted first!"),
+                Some(beta) => X.dot(beta),
+            }
+        }
+    }
+}
diff --git a/linfa-supervised/src/linear_regression/mod.rs b/linfa-supervised/src/linear_regression/mod.rs
new file mode 100644
index 000000000..4f6742108
--- /dev/null
+++ b/linfa-supervised/src/linear_regression/mod.rs
@@ -0,0 +1,3 @@
+mod algorithm;
+
+pub use algorithm::*;

From 8860880e93a29ee4ae2f0ee2eb99b64858ac53f4 Mon Sep 17 00:00:00 2001
From: Khalil HADJI
Date: Thu, 12 Dec 2019 19:22:20 +0100
Subject: [PATCH 2/3] Added ridge regression and some examples

---
 linfa-supervised/Cargo.toml                    |  6 +--
 linfa-supervised/examples/main.rs              | 40 +++++++++++++++++
 linfa-supervised/src/lib.rs                    |  6 +--
 .../src/linear_regression/algorithm.rs         | 32 +++++++++++---
 .../src/ridge_regression/algorithm.rs          | 44 +++++++++++++++++++
 linfa-supervised/src/ridge_regression/mod.rs   |  3 ++
 linfa-supervised/src/utils.rs                  |  6 +++
 7 files changed, 124 insertions(+), 13 deletions(-)
 create mode 100644 linfa-supervised/examples/main.rs
 create mode 100644 linfa-supervised/src/ridge_regression/algorithm.rs
 create mode 100644 linfa-supervised/src/ridge_regression/mod.rs
 create mode 100644 linfa-supervised/src/utils.rs

diff --git a/linfa-supervised/Cargo.toml b/linfa-supervised/Cargo.toml
index d034b4fc5..63614e10b 100644
--- a/linfa-supervised/Cargo.toml
+++ b/linfa-supervised/Cargo.toml
@@ -8,9 +8,9 @@ license = "MIT/Apache-2.0"
 
 repository = "https://github.com/LukeMathWalker/linfa"
 
-keywords = ["supervised", "machine-learning", "linfa", "regression", "linear"]
+keywords = ["supervised", "machine-learning", "linfa", "regression", "linear", "ridge", "lasso"]
 categories = ["algorithms", "mathematics", "science"]
 
 [dependencies]
-ndarray = { version = "0.13" , features = ["rayon", "approx"] }
-ndarray-linalg = "0.12"
\ No newline at end of file
+ndarray = { version = "0.13" , features = ["rayon"] }
+ndarray-linalg = { version = "0.12", features = ["openblas"] }
\ No newline at end of file
diff --git a/linfa-supervised/examples/main.rs b/linfa-supervised/examples/main.rs
new file mode 100644
index 000000000..3020418a9
--- /dev/null
+++ b/linfa-supervised/examples/main.rs
@@ -0,0 +1,40 @@
+use linfa_supervised::LinearRegression;
+use linfa_supervised::RidgeRegression;
+use ndarray::array;
+
+fn linear_regression() {
+    let mut linear_regression = LinearRegression::new(false);
+    let x = array![[1.0], [2.0], [3.0], [4.0]];
+    let y = array![1.0, 2.0, 3.0, 4.0];
+    linear_regression.fit(&x, &y);
+    let x_hat = array![[6.0], [7.0]];
+    println!("{:#?}", linear_regression.predict(&x_hat));
+
+    let mut linear_regression2 = LinearRegression::new(true);
+    let x2 = array![[1.0], [2.0], [3.0], [4.0]];
+    let y2 = array![2.0, 3.0, 4.0, 5.0];
+    linear_regression2.fit(&x2, &y2);
+    let x2_hat = array![[6.0], [7.0]];
+    println!("{:#?}", linear_regression2.predict(&x2_hat));
+}
+
+fn ridge_regression() {
+    let mut ridge_regression = RidgeRegression::new(0.0);
+    let x = array![[1.0], [2.0], [3.0], [4.0]];
+    let y = array![1.0, 2.0, 3.0, 4.0];
+    ridge_regression.fit(&x, &y);
+    let x_hat = array![[6.0], [7.0]];
+    println!("{:#?}", ridge_regression.predict(&x_hat));
+
+    let mut ridge_regression2 = RidgeRegression::new(1.0);
+    let x2 = array![[1.0], [2.0], [3.0], [4.0]];
+    let y2 = array![2.0, 3.0, 4.0, 5.0];
+    ridge_regression2.fit(&x2, &y2);
+    let x2_hat = array![[6.0], [7.0]];
+    println!("{:#?}", ridge_regression2.predict(&x2_hat));
+}
+
+fn main() {
+    linear_regression();
+    ridge_regression();
+}
diff --git a/linfa-supervised/src/lib.rs b/linfa-supervised/src/lib.rs
index 15a1801d2..d6091f110 100644
--- a/linfa-supervised/src/lib.rs
+++ b/linfa-supervised/src/lib.rs
@@ -1,5 +1,5 @@
 mod linear_regression;
+mod ridge_regression;
 
-pub mod linfa_supervised {
-    pub use crate::linear_regression::*;
-}
+pub use linear_regression::*;
+pub use ridge_regression::*;
diff --git a/linfa-supervised/src/linear_regression/algorithm.rs b/linfa-supervised/src/linear_regression/algorithm.rs
index 6d7df2c5a..11c722840 100644
--- a/linfa-supervised/src/linear_regression/algorithm.rs
+++ b/linfa-supervised/src/linear_regression/algorithm.rs
@@ -30,14 +30,14 @@ in closed form via the normal equation:
     b = (X^T X)^{-1} X^T y
 where (X^T X)^{-1} X^T is known as the pseudoinverse or Moore-Penrose inverse.
 */
-pub struct LinearRegressor {
+pub struct LinearRegression {
     beta: Option<Array1<f64>>,
     fit_intercept: bool,
 }
 
-impl LinearRegressor {
-    pub fn new(fit_intercept: bool) -> LinearRegressor {
-        LinearRegressor {
+impl LinearRegression {
+    pub fn new(fit_intercept: bool) -> LinearRegression {
+        LinearRegression {
             beta: None,
             fit_intercept,
         }
     }
@@ -60,12 +60,14 @@ impl LinearRegressor {
             x = [[1,2]
                 ,[3,4]
                 ,[5,6]]
-            dummy_column = []
+            dummy_column = [[1]
+                           ,[1]
+                           ,[1]]
             */
             let X_with_ones = stack(Axis(1), &[dummy_column.view(), X.view()]).unwrap();
-            Some(LinearRegressor::fit_beta(&X_with_ones, Y))
+            Some(LinearRegression::fit_beta(&X_with_ones, Y))
         } else {
-            Some(LinearRegressor::fit_beta(X, Y))
+            Some(LinearRegression::fit_beta(X, Y))
         }
     }
     fn fit_beta<A, B>(X: &ArrayBase<A, Ix2>, y: &ArrayBase<B, Ix1>) -> Array1<f64>
@@ -100,3 +102,19 @@ impl LinearRegressor {
         }
     }
 }
+
+#[cfg(test)]
+mod test {
+    use super::*;
+    use ndarray::array;
+
+    #[test]
+    fn linear_regression_test() {
+        let mut linear_regression = LinearRegression::new(false);
+        let x = array![[1.0], [2.0], [3.0], [4.0]];
+        let y = array![1.0, 2.0, 3.0, 4.0];
+        linear_regression.fit(&x, &y);
+        let x_hat = array![[6.0]];
+        assert_eq!(linear_regression.predict(&x_hat), array![6.0])
+    }
+}
diff --git a/linfa-supervised/src/ridge_regression/algorithm.rs b/linfa-supervised/src/ridge_regression/algorithm.rs
new file mode 100644
index 000000000..55ee32f9a
--- /dev/null
+++ b/linfa-supervised/src/ridge_regression/algorithm.rs
@@ -0,0 +1,44 @@
+#![allow(non_snake_case)]
+use ndarray::{Array, Array1, ArrayBase, Data, Ix1, Ix2};
+use ndarray_linalg::Solve;
+/* The difference between a linear regression and a ridge regression is
+   that ridge regression has an L2 penalisation term to avoid having some features
+   "take all the credit" for the output. It is also a way to deal with over-fitting by adding bias.
+   Some details ...
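+   Ridge minimises the penalised squared error
+       Loss = ||y - Xb||^2 + a * ||b||^2
+   and setting the gradient with respect to b to zero gives the closed-form solution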
+   b = (X^T X + aI)^{-1} X^T y  with a being the regularisation/penalisation term
+*/
+
+pub struct RidgeRegression {
+    beta: Option<Array1<f64>>,
+    alpha: f64,
+}
+
+impl RidgeRegression {
+    pub fn new(alpha: f64) -> RidgeRegression {
+        RidgeRegression {
+            beta: None,
+            alpha,
+        }
+    }
+
+    pub fn fit<A, B>(&mut self, X: &ArrayBase<A, Ix2>, Y: &ArrayBase<B, Ix1>)
+    where
+        A: Data<Elem = f64>,
+        B: Data<Elem = f64>,
+    {
+        let second_term = X.t().dot(Y);
+        let (_, identity_size) = X.dim();
+        let linear_operator = X.t().dot(X) + self.alpha * Array::eye(identity_size);
+        self.beta = Some(linear_operator.solve_into(second_term).unwrap());
+    }
+
+    pub fn predict<A>(&self, X: &ArrayBase<A, Ix2>) -> Array1<f64>
+    where
+        A: Data<Elem = f64>,
+    {
+        match &self.beta {
+            None => panic!("The ridge regression estimator has to be fitted first!"),
+            Some(beta) => X.dot(beta),
+        }
+    }
+}
diff --git a/linfa-supervised/src/ridge_regression/mod.rs b/linfa-supervised/src/ridge_regression/mod.rs
new file mode 100644
index 000000000..4f6742108
--- /dev/null
+++ b/linfa-supervised/src/ridge_regression/mod.rs
@@ -0,0 +1,3 @@
+mod algorithm;
+
+pub use algorithm::*;
diff --git a/linfa-supervised/src/utils.rs b/linfa-supervised/src/utils.rs
new file mode 100644
index 000000000..2d727f391
--- /dev/null
+++ b/linfa-supervised/src/utils.rs
@@ -0,0 +1,6 @@
+trait GradientDescent {
+    fn gradient();
+
+    fn optimise() {}
+
+}

From 24c9117ec24a87e9d70cb91bef38be17d2a9a0f7 Mon Sep 17 00:00:00 2001
From: Khalil HADJI
Date: Thu, 12 Dec 2019 20:05:03 +0100
Subject: [PATCH 3/3] Miscellaneous modifications

---
 Cargo.toml                                           | 4 +++-
 linfa-supervised/src/linear_regression/algorithm.rs  | 1 -
 src/lib.rs                                           | 4 ++++
 3 files changed, 7 insertions(+), 2 deletions(-)

diff --git a/Cargo.toml b/Cargo.toml
index f78c820ce..332878e3e 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -14,6 +14,7 @@ categories = ["algorithms", "mathematics", "science"]
 
 [dependencies]
 linfa-clustering = "0.1"
+linfa-supervised = {path = "linfa-supervised"}
 
 [dev-dependencies]
 ndarray = { version = "0.13" , features = ["rayon", "serde", "approx"]}
 rand_isaac = "0.2.0"
 ndarray-npy = { version = "0.5", default-features = false }
 
 [workspace]
-members = ["linfa-clustering"]
+members = ["linfa-clustering", "linfa-supervised"]
 
 [profile.release]
 opt-level = 3
+lto = true
\ No newline at end of file
diff --git a/linfa-supervised/src/linear_regression/algorithm.rs b/linfa-supervised/src/linear_regression/algorithm.rs
index 11c722840..32fe16bfd 100644
--- a/linfa-supervised/src/linear_regression/algorithm.rs
+++ b/linfa-supervised/src/linear_regression/algorithm.rs
@@ -107,7 +107,6 @@ impl LinearRegression {
 mod test {
     use super::*;
     use ndarray::array;
-
     #[test]
     fn linear_regression_test() {
         let mut linear_regression = LinearRegression::new(false);
         let x = array![[1.0], [2.0], [3.0], [4.0]];
         let y = array![1.0, 2.0, 3.0, 4.0];
         linear_regression.fit(&x, &y);
         let x_hat = array![[6.0]];
         assert_eq!(linear_regression.predict(&x_hat), array![6.0])
     }
 }
diff --git a/src/lib.rs b/src/lib.rs
index 40c332e6e..b94b1a324 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -32,3 +32,7 @@
 pub mod clustering {
     pub use linfa_clustering::*;
 }
+
+pub mod supervised {
+    pub use linfa_supervised::*;
+}