From 4cf94fb113752a699c660ebe7b916e9326bc1600 Mon Sep 17 00:00:00 2001 From: oooo26 Date: Thu, 3 Aug 2023 15:55:16 +0800 Subject: [PATCH 1/2] Implement clipping on beta --- python/abess/bess_base.py | 13 +++++++++++-- python/pytest/test_flow.py | 40 ++++++++++++++++++++++++++++++++++++++ python/src/pywrap.cpp | 4 ++-- src/Algorithm.h | 13 +++++++++++++ src/AlgorithmGLM.h | 8 ++++++++ src/api.cpp | 27 +++++++++++++------------ src/api.h | 3 ++- src/utilities.h | 11 +++++++++-- src/workflow.h | 5 ++++- 9 files changed, 104 insertions(+), 20 deletions(-) diff --git a/python/abess/bess_base.py b/python/abess/bess_base.py index b1229028..ee702f9b 100644 --- a/python/abess/bess_base.py +++ b/python/abess/bess_base.py @@ -1,3 +1,4 @@ +import sys import numbers import warnings import numpy as np @@ -207,7 +208,9 @@ def fit(self, is_normal=True, sample_weight=None, cv_fold_id=None, - sparse_matrix=False): + sparse_matrix=False, + beta_low=None, + beta_high=None): r""" The fit function is used to transfer the information of data and return the fit result. @@ -583,6 +586,12 @@ def fit(self, always_select_list = np.zeros(0, dtype="int32") else: always_select_list = np.array(self.always_select, dtype="int32") + + # beta range + if beta_low is None: + beta_low = -sys.float_info.max + if beta_high is None: + beta_high = sys.float_info.max # unused n_lambda = 100 @@ -607,7 +616,7 @@ def fit(self, self.primary_model_fit_epsilon, early_stop, self.approximate_Newton, self.thread, self.covariance_update, sparse_matrix, self.splicing_type, self.important_search, - A_init_list, self.fit_intercept) + A_init_list, self.fit_intercept, beta_low, beta_high) self.coef_ = result[0].squeeze() self.intercept_ = result[1].squeeze() diff --git a/python/pytest/test_flow.py b/python/pytest/test_flow.py index 3b9c6331..312cf483 100644 --- a/python/pytest/test_flow.py +++ b/python/pytest/test_flow.py @@ -142,6 +142,46 @@ def test_normalize(): assert_value(model1.coef_, model2.coef_) assert_value(model1.intercept_, model2.intercept_) + @staticmethod + def test_clipping(): + np.random.seed(0) + n = 200 + p = 20 + k = 5 + + # range: (0, 10) + coef1 = np.zeros(p) + coef1[np.random.choice(p, k, replace=False) + ] = np.random.uniform(0, 10, size=k) + data1 = abess.make_glm_data( + n=n, p=p, k=k, family='gaussian', coef_=coef1) + + model1 = abess.LinearRegression() + model1.fit(data1.x, data1.y, beta_low=0, beta_high=10) + assert_value(model1.coef_, data1.coef_, 0.1, 0.1) + + # range: (-100, -90) + coef2 = np.zeros(p) + coef2[np.random.choice(p, k, replace=False) + ] = np.random.uniform(-100, -90, size=k) + data2 = abess.make_glm_data( + n=n, p=p, k=k, family='gaussian', coef_=coef2) + + model2 = abess.LinearRegression() + model2.fit(data2.x, data2.y, beta_low=-100, beta_high=-90) + assert_value(model2.coef_, data2.coef_, 0.1, 0.1) + + # force range + coef3 = np.zeros(p) + coef3[np.random.choice(p, k, replace=False) + ] = np.random.uniform(-10, 0, size=k) + data3 = abess.make_glm_data( + n=n, p=p, k=k, family='gaussian', coef_=coef3) + + model3 = abess.LinearRegression() + model3.fit(data3.x, data3.y, beta_low=0, beta_high=10) + assert (model3.coef_ >= 0).all() + @staticmethod def test_possible_input(): np.random.seed(2) diff --git a/python/src/pywrap.cpp b/python/src/pywrap.cpp index b7ab0d7c..b82dea0c 100644 --- a/python/src/pywrap.cpp +++ b/python/src/pywrap.cpp @@ -14,13 +14,13 @@ std::tuple pywrap_GLM( double lambda_max, int n_lambda, int screening_size, Eigen::VectorXi always_select_Vec, int primary_model_fit_max_iter, double primary_model_fit_epsilon, bool early_stop, bool approximate_Newton, int thread, bool covariance_update, bool sparse_matrix, int splicing_type, int sub_search, - Eigen::VectorXi A_init_Vec, bool fit_intercept) { + Eigen::VectorXi A_init_Vec, bool fit_intercept, double beta_low, double beta_high) { List mylist = abessGLM_API(x_Mat, y_Mat, n, p, normalize_type, weight_Vec, algorithm_type, model_type, max_iter, exchange_num, path_type, is_warm_start, eval_type, ic_coef, Kfold, sequence_Vec, lambda_sequence_Vec, s_min, s_max, lambda_min, lambda_max, n_lambda, screening_size, gindex_Vec, always_select_Vec, primary_model_fit_max_iter, primary_model_fit_epsilon, early_stop, approximate_Newton, thread, covariance_update, sparse_matrix, splicing_type, - sub_search, cv_fold_id_Vec, A_init_Vec, fit_intercept); + sub_search, cv_fold_id_Vec, A_init_Vec, fit_intercept, beta_low, beta_high); std::tuple output; int y_col = y_Mat.cols(); diff --git a/src/Algorithm.h b/src/Algorithm.h index 56b8d2d3..66057cf4 100644 --- a/src/Algorithm.h +++ b/src/Algorithm.h @@ -114,6 +114,8 @@ class Algorithm { int sub_search; // size of sub_searching in splicing int U_size; + double beta_range[2] = {-DBL_MAX, DBL_MAX}; + Algorithm() = default; virtual ~Algorithm(){}; @@ -161,6 +163,17 @@ class Algorithm { void update_exchange_num(int exchange_num) { this->exchange_num = exchange_num; } + void update_beta_range(double beta_low, double beta_high) { + if (beta_low > beta_high) { + this->beta_range[0] = -DBL_MAX; + this->beta_range[1] = DBL_MAX; + } else { + this->beta_range[0] = beta_low; + this->beta_range[1] = beta_high; + } + // std::cout << "beta range: " << beta_low << "," << beta_high << std::endl; + } + virtual void update_tau(int train_n, int N) { if (train_n == 1) { this->tau = 0.0; diff --git a/src/AlgorithmGLM.h b/src/AlgorithmGLM.h index 3692e928..e78f9ce5 100644 --- a/src/AlgorithmGLM.h +++ b/src/AlgorithmGLM.h @@ -79,6 +79,7 @@ class _abessGLM : public Algorithm { // Fitting method 2: Iteratively Reweighted Least Squares return this->_IRLS_fit(X, y, weights, beta, coef0, loss0, A, g_index, g_size); } + trunc(beta, this->beta_range); return true; }; virtual double loss_function(T4 &X, T1 &y, Eigen::VectorXd &weights, T2 &beta, T3 &coef0, Eigen::VectorXi &A, @@ -421,6 +422,8 @@ class abessLm : public _abessGLM { Eigen::VectorXd beta_full = XTX.ldlt().solve(XTy); extract_beta_coef0(beta_full, beta, coef0, this->fit_intercept); + + trunc(beta, this->beta_range); return true; }; @@ -777,6 +780,7 @@ class abessCox : public _abessGLM } beta = beta0; + trunc(beta, this->beta_range); return true; }; @@ -990,6 +994,7 @@ class abessMLm : public _abessGLMfit_intercept); + trunc(beta, this->beta_range); return true; // if (X.cols() == 0) // { @@ -1351,6 +1356,7 @@ class abessMultinomial : public _abessGLMfit_intercept); + trunc(beta, this->beta_range); return true; }; @@ -1566,6 +1572,7 @@ class abessGamma : public _abessGLM_IRLS_fit(X, y, weights, beta, coef0, loss0, A, g_index, g_size); } + trunc(beta, this->beta_range); return true; }; }; @@ -1799,6 +1806,7 @@ class abessOrdinal : public _abessGLMbeta_range); return true; } diff --git a/src/api.cpp b/src/api.cpp index c051874b..a2940baa 100644 --- a/src/api.cpp +++ b/src/api.cpp @@ -36,7 +36,8 @@ List abessGLM_API(Eigen::MatrixXd x, Eigen::MatrixXd y, int n, int p, int normal Eigen::VectorXi g_index, Eigen::VectorXi always_select, int primary_model_fit_max_iter, double primary_model_fit_epsilon, bool early_stop, bool approximate_Newton, int thread, bool covariance_update, bool sparse_matrix, int splicing_type, int sub_search, - Eigen::VectorXi cv_fold_id, Eigen::VectorXi A_init, bool fit_intercept) { + Eigen::VectorXi cv_fold_id, Eigen::VectorXi A_init, bool fit_intercept, double beta_low, + double beta_high) { #ifdef _OPENMP // Eigen::initParallel(); int max_thread = omp_get_max_threads(); @@ -190,12 +191,12 @@ List abessGLM_API(Eigen::MatrixXd x, Eigen::MatrixXd y, int n, int p, int normal out_result = abessWorkflow( x, y_vec, n, p, normalize_type, weight, algorithm_type, path_type, is_warm_start, ic_type, ic_coef, Kfold, parameters, screening_size, g_index, early_stop, thread, sparse_matrix, cv_fold_id, A_init, - algorithm_list_uni_dense); + beta_low, beta_high, algorithm_list_uni_dense); } else { out_result = abessWorkflow( x, y, n, p, normalize_type, weight, algorithm_type, path_type, is_warm_start, ic_type, ic_coef, Kfold, - parameters, screening_size, g_index, early_stop, thread, sparse_matrix, cv_fold_id, A_init, - algorithm_list_mul_dense); + parameters, screening_size, g_index, early_stop, thread, sparse_matrix, cv_fold_id, A_init, beta_low, + beta_high, algorithm_list_mul_dense); } } else { Eigen::SparseMatrix sparse_x(n, p); @@ -220,12 +221,12 @@ List abessGLM_API(Eigen::MatrixXd x, Eigen::MatrixXd y, int n, int p, int normal out_result = abessWorkflow>( sparse_x, y_vec, n, p, normalize_type, weight, algorithm_type, path_type, is_warm_start, ic_type, ic_coef, Kfold, parameters, screening_size, g_index, early_stop, thread, sparse_matrix, cv_fold_id, - A_init, algorithm_list_uni_sparse); + A_init, beta_low, beta_high, algorithm_list_uni_sparse); } else { out_result = abessWorkflow>( sparse_x, y, n, p, normalize_type, weight, algorithm_type, path_type, is_warm_start, ic_type, ic_coef, Kfold, parameters, screening_size, g_index, early_stop, thread, sparse_matrix, cv_fold_id, A_init, - algorithm_list_mul_sparse); + beta_low, beta_high, algorithm_list_mul_sparse); } } @@ -305,6 +306,7 @@ List abessPCA_API(Eigen::MatrixXd x, int n, int p, int normalize_type, Eigen::Ve #endif List out_result_next; int num = 0; + double beta_low = -DBL_MAX, beta_high = DBL_MAX; if (!sparse_matrix) { while (num++ < pca_num) { @@ -324,7 +326,7 @@ List abessPCA_API(Eigen::MatrixXd x, int n, int p, int normalize_type, Eigen::Ve out_result_next = abessWorkflow( x, y_vec, n, p, normalize_type, weight, algorithm_type, path_type, is_warm_start, ic_type, ic_coef, Kfold, parameters, screening_size, g_index, early_stop, thread, sparse_matrix, cv_fold_id, A_init, - algorithm_list_uni_dense); + beta_low, beta_high, algorithm_list_uni_dense); Eigen::VectorXd beta_next; #ifdef R_BUILD beta_next = out_result_next["beta"]; @@ -406,7 +408,7 @@ List abessPCA_API(Eigen::MatrixXd x, int n, int p, int normalize_type, Eigen::Ve out_result_next = abessWorkflow>( sparse_x, y_vec, n, p, normalize_type, weight, algorithm_type, path_type, is_warm_start, ic_type, ic_coef, Kfold, parameters, screening_size, g_index, early_stop, thread, sparse_matrix, cv_fold_id, - A_init, algorithm_list_uni_sparse); + A_init, beta_low, beta_high, algorithm_list_uni_sparse); Eigen::VectorXd beta_next; #ifdef R_BUILD beta_next = out_result_next["beta"]; @@ -486,6 +488,7 @@ List abessRPCA_API(Eigen::MatrixXd x, int n, int p, int max_iter, int exchange_n int model_type = 10, algorithm_type = 6; int Kfold = 1; int normalize_type = 0; + double beta_low = -DBL_MAX, beta_high = DBL_MAX; Eigen::VectorXi cv_fold_id = Eigen::VectorXi::Zero(0); Eigen::VectorXd weight = Eigen::VectorXd::Ones(n); Eigen::VectorXd y_vec = Eigen::VectorXd::Zero(n); @@ -519,8 +522,8 @@ List abessRPCA_API(Eigen::MatrixXd x, int n, int p, int max_iter, int exchange_n if (!sparse_matrix) { out_result = abessWorkflow( x, y_vec, n, p, normalize_type, weight, algorithm_type, path_type, is_warm_start, ic_type, ic_coef, Kfold, - parameters, screening_size, g_index, early_stop, thread, sparse_matrix, cv_fold_id, A_init, - algorithm_list_uni_dense); + parameters, screening_size, g_index, early_stop, thread, sparse_matrix, cv_fold_id, A_init, beta_low, + beta_high, algorithm_list_uni_dense); } else { Eigen::SparseMatrix sparse_x(n, p); @@ -541,8 +544,8 @@ List abessRPCA_API(Eigen::MatrixXd x, int n, int p, int max_iter, int exchange_n out_result = abessWorkflow>( sparse_x, y_vec, n, p, normalize_type, weight, algorithm_type, path_type, is_warm_start, ic_type, ic_coef, - Kfold, parameters, screening_size, g_index, early_stop, thread, sparse_matrix, cv_fold_id, A_init, - algorithm_list_uni_sparse); + Kfold, parameters, screening_size, g_index, early_stop, thread, sparse_matrix, cv_fold_id, A_init, beta_low, + beta_high, algorithm_list_uni_sparse); } for (int i = 0; i < algorithm_list_size; i++) { diff --git a/src/api.h b/src/api.h index 1e00c7b4..d7e0254c 100644 --- a/src/api.h +++ b/src/api.h @@ -112,7 +112,8 @@ List abessGLM_API(Eigen::MatrixXd x, Eigen::MatrixXd y, int n, int p, int normal Eigen::VectorXi g_index, Eigen::VectorXi always_select, int primary_model_fit_max_iter, double primary_model_fit_epsilon, bool early_stop, bool approximate_Newton, int thread, bool covariance_update, bool sparse_matrix, int splicing_type, int sub_search, - Eigen::VectorXi cv_fold_id, Eigen::VectorXi A_init, bool fit_intercept); + Eigen::VectorXi cv_fold_id, Eigen::VectorXi A_init, bool fit_intercept, double beta_low, + double beta_high); List abessPCA_API(Eigen::MatrixXd x, int n, int p, int normalize_type, Eigen::VectorXd weight, Eigen::MatrixXd sigma, int max_iter, int exchange_num, int path_type, bool is_warm_start, int ic_type, double ic_coef, diff --git a/src/utilities.h b/src/utilities.h index 4743bf9b..4dec280d 100644 --- a/src/utilities.h +++ b/src/utilities.h @@ -131,7 +131,7 @@ class Parameters { Rcout << "==> Parameter List:" << endl; #else std::cout << "==> Parameter List:" << endl; -#endif +#endif for (int i = 0; i < (this->sequence).size(); i++) { int support_size = (this->sequence(i)).support_size; double lambda = (this->sequence(i)).lambda; @@ -139,7 +139,7 @@ class Parameters { Rcout << " support_size = " << support_size << ", lambda = " << lambda << endl; #else std::cout << " support_size = " << support_size << ", lambda = " << lambda << endl; -#endif +#endif } } }; @@ -508,6 +508,13 @@ void trunc(Eigen::VectorXd &vec, double *trunc_range) { } } +void trunc(Eigen::MatrixXd &mat, double *trunc_range) { + for (int i = 0; i < mat.rows(); i++) + for (int j = 0; j < mat.cols(); j++) { + trunc(mat(i, j), trunc_range); + } +} + Eigen::MatrixXd rowwise_add(Eigen::MatrixXd &m, Eigen::VectorXd &v) { return m.rowwise() + v.transpose(); } Eigen::MatrixXd rowwise_add(Eigen::MatrixXd &m, double &v) { diff --git a/src/workflow.h b/src/workflow.h index 10a27c73..527d104b 100644 --- a/src/workflow.h +++ b/src/workflow.h @@ -76,13 +76,16 @@ template List abessWorkflow(T4 &x, T1 &y, int n, int p, int normalize_type, Eigen::VectorXd weight, int algorithm_type, int path_type, bool is_warm_start, int eval_type, double ic_coef, int Kfold, Parameters parameters, int screening_size, Eigen::VectorXi g_index, bool early_stop, int thread, bool sparse_matrix, - Eigen::VectorXi &cv_fold_id, Eigen::VectorXi &A_init, + Eigen::VectorXi &cv_fold_id, Eigen::VectorXi &A_init, double beta_low, double beta_high, vector *> algorithm_list) { #ifndef R_BUILD std::srand(123); #endif int algorithm_list_size = algorithm_list.size(); + for (int i = 0; i < algorithm_list_size; i++) { + algorithm_list[i]->update_beta_range(beta_low, beta_high); + } // Size of the candidate set: // usually it is equal to `p`, the number of variable, From 9b409369e299fb884c108d9ab0f46059b04bfbff Mon Sep 17 00:00:00 2001 From: oooo26 Date: Fri, 4 Aug 2023 15:57:57 +0800 Subject: [PATCH 2/2] Update more test for clipping --- python/abess/bess_base.py | 5 ++++- python/pytest/test_check.py | 9 +++++++++ python/pytest/test_flow.py | 16 +++++++++++++--- 3 files changed, 26 insertions(+), 4 deletions(-) diff --git a/python/abess/bess_base.py b/python/abess/bess_base.py index ee702f9b..cad68a4d 100644 --- a/python/abess/bess_base.py +++ b/python/abess/bess_base.py @@ -586,12 +586,15 @@ def fit(self, always_select_list = np.zeros(0, dtype="int32") else: always_select_list = np.array(self.always_select, dtype="int32") - + # beta range if beta_low is None: beta_low = -sys.float_info.max if beta_high is None: beta_high = sys.float_info.max + if beta_low > beta_high: + raise ValueError( + "Please make sure beta_low <= beta_high.") # unused n_lambda = 100 diff --git a/python/pytest/test_check.py b/python/pytest/test_check.py index 746e817b..48bcb6e3 100644 --- a/python/pytest/test_check.py +++ b/python/pytest/test_check.py @@ -209,6 +209,15 @@ def test_base(): else: assert False + # clipping + try: + model = abess.LinearRegression() + model.fit([[1]], [1], beta_low=1, beta_high=0) + except ValueError as e: + print(e) + else: + assert False + # incompatible shape try: model.fit([1, 1, 1], [1]) diff --git a/python/pytest/test_flow.py b/python/pytest/test_flow.py index 312cf483..6e06bd9f 100644 --- a/python/pytest/test_flow.py +++ b/python/pytest/test_flow.py @@ -171,16 +171,26 @@ def test_clipping(): model2.fit(data2.x, data2.y, beta_low=-100, beta_high=-90) assert_value(model2.coef_, data2.coef_, 0.1, 0.1) + # one-side + model1.fit(data1.x, data1.y, beta_low=0) + assert_value(model1.coef_, data1.coef_, 0.1, 0.1) + model1.fit(data1.x, data1.y, beta_high=10) + assert_value(model1.coef_, data1.coef_, 0.1, 0.1) + # force range coef3 = np.zeros(p) coef3[np.random.choice(p, k, replace=False) - ] = np.random.uniform(-10, 0, size=k) + ] = np.random.choice([-11, 11], size=k) data3 = abess.make_glm_data( n=n, p=p, k=k, family='gaussian', coef_=coef3) model3 = abess.LinearRegression() - model3.fit(data3.x, data3.y, beta_low=0, beta_high=10) - assert (model3.coef_ >= 0).all() + model3.fit(data3.x, data3.y, beta_low=-10, beta_high=10, is_normal=False) + assert_fit(model3.coef_, data3.coef_) + assert (model3.coef_ >= -10).all() + assert (model3.coef_ <= 10).all() + assert (model3.coef_[data3.coef_ == -11] == -10).all() + assert (model3.coef_[data3.coef_ == 11] == 10).all() @staticmethod def test_possible_input():