From 2ac5786431276588de7ef2f6182d25a47e535434 Mon Sep 17 00:00:00 2001 From: Guangda Liu Date: Fri, 5 Nov 2021 08:34:55 +0000 Subject: [PATCH 01/84] add parameter data_sample_strategy --- include/LightGBM/config.h | 2 ++ src/io/config.cpp | 15 +++++++++++++++ src/io/config_auto.cpp | 1 + 3 files changed, 18 insertions(+) diff --git a/include/LightGBM/config.h b/include/LightGBM/config.h index 45fffa432819..7ba9b47f7298 100644 --- a/include/LightGBM/config.h +++ b/include/LightGBM/config.h @@ -149,6 +149,8 @@ struct Config { // descl2 = **Note**: internally, LightGBM uses ``gbdt`` mode for the first ``1 / learning_rate`` iterations std::string boosting = "gbdt"; + std::string data_sample_strategy = "bagging"; + // alias = train, train_data, train_data_file, data_filename // desc = path of training data, LightGBM will train from this data // desc = **Note**: can be used only in CLI version diff --git a/src/io/config.cpp b/src/io/config.cpp index a42b392dac3e..d5dab13f8413 100644 --- a/src/io/config.cpp +++ b/src/io/config.cpp @@ -64,6 +64,20 @@ void GetBoostingType(const std::unordered_map& params, } } +void GetDataSampleStrategy(const std::unordered_map& params, std::string* strategy) { + std::string value; + if (Config::GetString(params, "data_sample_strategy", &value)) { + std::transform(value.begin(), value.end(), value.begin(), Common::tolower); + if (value == std::string("goss")) { + *strategy = "goss"; + } else if (value == std::string("bagging")) { + *strategy = "bagging"; + } else { + Log::Fatal("Unknown sample strategy %s", value.c_str()); + } + } +} + void ParseMetrics(const std::string& value, std::vector* out_metric) { std::unordered_set metric_sets; out_metric->clear(); @@ -205,6 +219,7 @@ void Config::Set(const std::unordered_map& params) { GetTaskType(params, &task); GetBoostingType(params, &boosting); + GetDataSampleStrategy(params, &data_sample_strategy); GetObjectiveType(params, &objective); GetMetricType(params, objective, &metric); GetDeviceType(params, &device_type); diff --git a/src/io/config_auto.cpp b/src/io/config_auto.cpp index 4e3f000a88f5..18225c55a2fc 100644 --- a/src/io/config_auto.cpp +++ b/src/io/config_auto.cpp @@ -312,6 +312,7 @@ const std::unordered_set& Config::parameter_set() { "gpu_device_id", "gpu_use_dp", "num_gpu", + "data_sample_strategy" }); return params; } From 590aec61b31c2bfd97d705fae20c0f7b8a3be068 Mon Sep 17 00:00:00 2001 From: Guangda Liu Date: Tue, 9 Nov 2021 14:46:14 +0000 Subject: [PATCH 02/84] abstract GOSS as a sample strategy(GOSS1), togetherwith origial GOSS (Normal Bagging has not been abstracted, so do NOT use it now) --- include/LightGBM/sample_strategy.h | 42 +++++++++ src/boosting/gbdt.cpp | 29 ++++++- src/boosting/gbdt.h | 2 + src/boosting/goss1.hpp | 131 +++++++++++++++++++++++++++++ src/boosting/sample_strategy.cpp | 16 ++++ 5 files changed, 218 insertions(+), 2 deletions(-) create mode 100644 include/LightGBM/sample_strategy.h create mode 100644 src/boosting/goss1.hpp create mode 100644 src/boosting/sample_strategy.cpp diff --git a/include/LightGBM/sample_strategy.h b/include/LightGBM/sample_strategy.h new file mode 100644 index 000000000000..cb181fc4892d --- /dev/null +++ b/include/LightGBM/sample_strategy.h @@ -0,0 +1,42 @@ +#ifndef LIGHTGBM_SAMPLE_STRATEGY_H_ +#define LIGHTGBM_SAMPLE_STRATEGY_H_ + +#include +#include +#include +#include +#include +#include + +namespace LightGBM { + +class SampleStrategy { + public: + SampleStrategy() : balanced_bagging_(false), bagging_runner_(0, bagging_rand_block_) {}; + virtual ~SampleStrategy() {}; + static SampleStrategy* CreateSampleStrategy(const Config* config, const Dataset* train_data, int num_tree_per_iteration); + virtual void Bagging(int iter, score_t* gradients, score_t* hessians, TreeLearner* tree_learner) = 0; + virtual void Reset() = 0; + bool is_use_subset() {return is_use_subset_;} + data_size_t bag_data_cnt() {return bag_data_cnt_;} + std::vector> bag_data_indices() {return bag_data_indices_;} + + protected: + virtual data_size_t Helper(data_size_t start, data_size_t cnt, data_size_t* buffer, score_t* gradients, score_t* hessians) = 0; + + const Config* config_; + const Dataset* train_data_; + std::vector> bag_data_indices_; + data_size_t bag_data_cnt_; + data_size_t num_data_; + int num_tree_per_iteration_; + std::unique_ptr tmp_subset_; + bool is_use_subset_; + bool balanced_bagging_; + const int bagging_rand_block_ = 1024; + std::vector bagging_rands_; + ParallelPartitionRunner bagging_runner_; +}; + +} // namespace LightGBM +#endif // LIGHTGBM_SAMPLE_STRATEGY_H_ \ No newline at end of file diff --git a/src/boosting/gbdt.cpp b/src/boosting/gbdt.cpp index d393d46d5133..aca734e28eec 100644 --- a/src/boosting/gbdt.cpp +++ b/src/boosting/gbdt.cpp @@ -10,6 +10,7 @@ #include #include #include +#include #include #include @@ -87,6 +88,10 @@ void GBDT::Init(const Config* config, const Dataset* train_data, const Objective } } + CHECK(!(config_->bagging_freq > 0)); // can not use normal bagging in this version + data_sample_strategy_.reset(SampleStrategy::CreateSampleStrategy(config_.get(), train_data_, num_tree_per_iteration_)); + data_sample_strategy_->Reset(); + is_constant_hessian_ = GetIsConstHessian(objective_function); tree_learner_ = std::unique_ptr(TreeLearner::CreateTreeLearner(config_->tree_learner, config_->device_type, @@ -107,10 +112,14 @@ void GBDT::Init(const Config* config, const Dataset* train_data, const Objective num_data_ = train_data_->num_data(); // create buffer for gradients and Hessians + size_t total_size = static_cast(num_data_) * num_tree_per_iteration_; if (objective_function_ != nullptr) { - size_t total_size = static_cast(num_data_) * num_tree_per_iteration_; gradients_.resize(total_size); hessians_.resize(total_size); + } else { + // use customized objective function, only for GOSS + gradients_.resize(total_size, 0.0f); + hessians_.resize(total_size, 0.0f); } // get max feature index max_feature_idx_ = train_data_->num_total_features() - 1; @@ -377,9 +386,23 @@ bool GBDT::TrainOneIter(const score_t* gradients, const score_t* hessians) { Boosting(); gradients = gradients_.data(); hessians = hessians_.data(); + } else if (gradients != nullptr) { + // use customized objective function + CHECK(hessians != nullptr && objective_function_ == nullptr); + // and will be only used for GOSS + CHECK(config_->boosting==std::string("goss") || config_->data_sample_strategy==std::string("goss")); + int64_t total_size = static_cast(num_data_) * num_tree_per_iteration_; + #pragma omp parallel for schedule(static) + for (int64_t i = 0; i < total_size; ++i) { + gradients_[i] = gradients[i]; + hessians_[i] = hessians[i]; + } } // bagging logic - Bagging(iter_); + data_sample_strategy_->Bagging(iter_, gradients_.data(), hessians_.data(), tree_learner_.get()); + bag_data_indices_ = data_sample_strategy_->bag_data_indices(); + bag_data_cnt_ = data_sample_strategy_->bag_data_cnt(); + is_use_subset_ = data_sample_strategy_->is_use_subset(); bool should_continue = false; for (int cur_tree_id = 0; cur_tree_id < num_tree_per_iteration_; ++cur_tree_id) { @@ -733,6 +756,7 @@ void GBDT::ResetTrainingData(const Dataset* train_data, const ObjectiveFunction* tree_learner_->ResetTrainingData(train_data, is_constant_hessian_); ResetBaggingConfig(config_.get(), true); + data_sample_strategy_->Reset(); } else { tree_learner_->ResetIsConstantHessian(is_constant_hessian_); } @@ -757,6 +781,7 @@ void GBDT::ResetConfig(const Config* config) { if (train_data_ != nullptr) { ResetBaggingConfig(new_config.get(), false); } + data_sample_strategy_->Reset(); if (config_.get() != nullptr && config_->forcedsplits_filename != new_config->forcedsplits_filename) { // load forced_splits file if (!new_config->forcedsplits_filename.empty()) { diff --git a/src/boosting/gbdt.h b/src/boosting/gbdt.h index 472ea1707104..f41e14582f4d 100644 --- a/src/boosting/gbdt.h +++ b/src/boosting/gbdt.h @@ -11,6 +11,7 @@ #include #include #include +#include #include #include @@ -534,6 +535,7 @@ class GBDT : public GBDTBase { ParallelPartitionRunner bagging_runner_; Json forced_splits_json_; bool linear_tree_; + std::unique_ptr data_sample_strategy_; }; } // namespace LightGBM diff --git a/src/boosting/goss1.hpp b/src/boosting/goss1.hpp new file mode 100644 index 000000000000..b0e49231a5a4 --- /dev/null +++ b/src/boosting/goss1.hpp @@ -0,0 +1,131 @@ +#ifndef LIGHTGBM_SAMPLE_STRATEGY_GOSS_HPP_ +#define LIGHTGBM_SAMPLE_STRATEGY_GOSS_HPP_ + +#include +#include + + +namespace LightGBM { + +class GOSS1 : public SampleStrategy { + public: + GOSS1(const Config* config, const Dataset* train_data, int num_tree_per_iteration) { + config_ = config; + train_data_ = train_data; + num_tree_per_iteration_ = num_tree_per_iteration; + num_data_ = train_data->num_data(); + } + + ~GOSS1() { + } + + void Bagging(int iter, score_t* gradients, score_t* hessians, TreeLearner* tree_learner) override { + bag_data_cnt_ = num_data_; + // not subsample for first iterations + if (iter < static_cast(1.0f / config_->learning_rate)) { return; } + auto left_cnt = bagging_runner_.Run( + num_data_, + [=](int, data_size_t cur_start, data_size_t cur_cnt, data_size_t* left, + data_size_t*) { + data_size_t cur_left_count = 0; + cur_left_count = Helper(cur_start, cur_cnt, left, gradients, hessians); + return cur_left_count; + }, + bag_data_indices_.data()); + bag_data_cnt_ = left_cnt; + // set bagging data to tree learner + if (!is_use_subset_) { + tree_learner->SetBaggingData(nullptr, bag_data_indices_.data(), bag_data_cnt_); + } else { + // get subset + tmp_subset_->ReSize(bag_data_cnt_); + tmp_subset_->CopySubrow(train_data_, bag_data_indices_.data(), + bag_data_cnt_, false); + tree_learner->SetBaggingData(tmp_subset_.get(), bag_data_indices_.data(), + bag_data_cnt_); + } + } + + void Reset() override { + CHECK_LE(config_->top_rate + config_->other_rate, 1.0f); + CHECK(config_->top_rate > 0.0f && config_->other_rate > 0.0f); + if (config_->bagging_freq > 0 && config_->bagging_fraction != 1.0f) { + Log::Fatal("Cannot use bagging in GOSS"); + } + Log::Info("Using GOSS"); + balanced_bagging_ = false; + bag_data_indices_.resize(num_data_); + bagging_runner_.ReSize(num_data_); + bagging_rands_.clear(); + for (int i = 0; + i < (num_data_ + bagging_rand_block_ - 1) / bagging_rand_block_; ++i) { + bagging_rands_.emplace_back(config_->bagging_seed + i); + } + is_use_subset_ = false; + if (config_->top_rate + config_->other_rate <= 0.5) { + auto bag_data_cnt = static_cast((config_->top_rate + config_->other_rate) * num_data_); + bag_data_cnt = std::max(1, bag_data_cnt); + tmp_subset_.reset(new Dataset(bag_data_cnt)); + tmp_subset_->CopyFeatureMapperFrom(train_data_); + is_use_subset_ = true; + } + // flag to not bagging first + bag_data_cnt_ = num_data_; + } + + protected: + data_size_t Helper(data_size_t start, data_size_t cnt, data_size_t* buffer, score_t* gradients, score_t* hessians) override { + if (cnt <= 0) { + return 0; + } + std::vector tmp_gradients(cnt, 0.0f); + for (data_size_t i = 0; i < cnt; ++i) { + for (int cur_tree_id = 0; cur_tree_id < num_tree_per_iteration_; ++cur_tree_id) { + size_t idx = static_cast(cur_tree_id) * num_data_ + start + i; + tmp_gradients[i] += std::fabs(gradients[idx] * hessians[idx]); + } + } + data_size_t top_k = static_cast(cnt * config_->top_rate); + data_size_t other_k = static_cast(cnt * config_->other_rate); + top_k = std::max(1, top_k); + ArrayArgs::ArgMaxAtK(&tmp_gradients, 0, static_cast(tmp_gradients.size()), top_k - 1); + score_t threshold = tmp_gradients[top_k - 1]; + + score_t multiply = static_cast(cnt - top_k) / other_k; + data_size_t cur_left_cnt = 0; + data_size_t cur_right_pos = cnt; + data_size_t big_weight_cnt = 0; + for (data_size_t i = 0; i < cnt; ++i) { + auto cur_idx = start + i; + score_t grad = 0.0f; + for (int cur_tree_id = 0; cur_tree_id < num_tree_per_iteration_; ++cur_tree_id) { + size_t idx = static_cast(cur_tree_id) * num_data_ + cur_idx; + grad += std::fabs(gradients[idx] * hessians[idx]); + } + if (grad >= threshold) { + buffer[cur_left_cnt++] = cur_idx; + ++big_weight_cnt; + } else { + data_size_t sampled = cur_left_cnt - big_weight_cnt; + data_size_t rest_need = other_k - sampled; + data_size_t rest_all = (cnt - i) - (top_k - big_weight_cnt); + double prob = (rest_need) / static_cast(rest_all); + if (bagging_rands_[cur_idx / bagging_rand_block_].NextFloat() < prob) { + buffer[cur_left_cnt++] = cur_idx; + for (int cur_tree_id = 0; cur_tree_id < num_tree_per_iteration_; ++cur_tree_id) { + size_t idx = static_cast(cur_tree_id) * num_data_ + cur_idx; + gradients[idx] *= multiply; + hessians[idx] *= multiply; + } + } else { + buffer[--cur_right_pos] = cur_idx; + } + } + } + return cur_left_cnt; + } + +}; + +} // namespace LightGBM +#endif // LIGHTGBM_SAMPLE_STRATEGY_GOSS_HPP_ \ No newline at end of file diff --git a/src/boosting/sample_strategy.cpp b/src/boosting/sample_strategy.cpp new file mode 100644 index 000000000000..cac1badb6c13 --- /dev/null +++ b/src/boosting/sample_strategy.cpp @@ -0,0 +1,16 @@ +#include +#include "goss1.hpp" + +namespace LightGBM { + +SampleStrategy* SampleStrategy::CreateSampleStrategy(const Config* config, const Dataset* train_data, int num_tree_per_iteration) { + bool use_goss_as_boosting = config->boosting == std::string("goss"); + bool use_goss_as_strategy = config->data_sample_strategy == std::string("goss"); + if (use_goss_as_boosting || use_goss_as_strategy) { + return new GOSS1(config, train_data, num_tree_per_iteration); + } else if (config->data_sample_strategy == std::string("bagging")) { + return nullptr; + } +} + +} // namespace LightGBM \ No newline at end of file From c8dce4d83eb0c03b5e6a7e16913f7fcb97618da2 Mon Sep 17 00:00:00 2001 From: Guangda Liu Date: Fri, 12 Nov 2021 05:41:23 +0000 Subject: [PATCH 03/84] abstract Bagging as a subclass (BAGGING), but original Bagging members in GBDT are still kept --- include/LightGBM/sample_strategy.h | 11 +- src/boosting/bagging.hpp | 169 +++++++++++++++++++++++++++++ src/boosting/gbdt.cpp | 21 ++-- src/boosting/goss1.hpp | 8 +- src/boosting/sample_strategy.cpp | 9 +- 5 files changed, 198 insertions(+), 20 deletions(-) create mode 100644 src/boosting/bagging.hpp diff --git a/include/LightGBM/sample_strategy.h b/include/LightGBM/sample_strategy.h index cb181fc4892d..b1770d28ccb4 100644 --- a/include/LightGBM/sample_strategy.h +++ b/include/LightGBM/sample_strategy.h @@ -7,6 +7,7 @@ #include #include #include +#include namespace LightGBM { @@ -14,18 +15,20 @@ class SampleStrategy { public: SampleStrategy() : balanced_bagging_(false), bagging_runner_(0, bagging_rand_block_) {}; virtual ~SampleStrategy() {}; - static SampleStrategy* CreateSampleStrategy(const Config* config, const Dataset* train_data, int num_tree_per_iteration); - virtual void Bagging(int iter, score_t* gradients, score_t* hessians, TreeLearner* tree_learner) = 0; + static SampleStrategy* CreateSampleStrategy(const Config* config, const Dataset* train_data, const ObjectiveFunction* objective_function, int num_tree_per_iteration); + virtual void Bagging(int iter, TreeLearner* tree_learner, score_t* gradients, score_t* hessians) = 0; virtual void Reset() = 0; + virtual void ResetConfig(const Config* config, bool is_change_dataset, + std::vector>& gradients, + std::vector>& hessians) = 0; bool is_use_subset() {return is_use_subset_;} data_size_t bag_data_cnt() {return bag_data_cnt_;} std::vector> bag_data_indices() {return bag_data_indices_;} protected: - virtual data_size_t Helper(data_size_t start, data_size_t cnt, data_size_t* buffer, score_t* gradients, score_t* hessians) = 0; - const Config* config_; const Dataset* train_data_; + const ObjectiveFunction* objective_function_; std::vector> bag_data_indices_; data_size_t bag_data_cnt_; data_size_t num_data_; diff --git a/src/boosting/bagging.hpp b/src/boosting/bagging.hpp new file mode 100644 index 000000000000..fb7e230cae16 --- /dev/null +++ b/src/boosting/bagging.hpp @@ -0,0 +1,169 @@ +#ifndef LIGHTGBM_SAMPLE_STRATEGY_BAGGING_HPP_ +#define LIGHTGBM_SAMPLE_STRATEGY_BAGGING_HPP_ + +namespace LightGBM { + +class BAGGING : public SampleStrategy { + public: + BAGGING(const Config* config, const Dataset* train_data, const ObjectiveFunction* objective_function, int num_tree_per_iteration) + : need_re_bagging_(false) { + config_ = config; + train_data_ = train_data; + num_data_ = train_data->num_data(); + objective_function_ = objective_function; + num_tree_per_iteration_ = num_tree_per_iteration; + } + ~BAGGING() {} + + void Bagging(int iter, TreeLearner* tree_learner, score_t* gradients, score_t* hessians) override { + Common::FunctionTimer fun_timer("GBDT::Bagging", global_timer); + // if need bagging + if ((bag_data_cnt_ < num_data_ && iter % config_->bagging_freq == 0) || + need_re_bagging_) { + need_re_bagging_ = false; + auto left_cnt = bagging_runner_.Run( + num_data_, + [=](int, data_size_t cur_start, data_size_t cur_cnt, data_size_t* left, + data_size_t*) { + data_size_t cur_left_count = 0; + if (balanced_bagging_) { + cur_left_count = + BalancedBaggingHelper(cur_start, cur_cnt, left); + } else { + cur_left_count = BaggingHelper(cur_start, cur_cnt, left); + } + return cur_left_count; + }, + bag_data_indices_.data()); + bag_data_cnt_ = left_cnt; + Log::Debug("Re-bagging, using %d data to train", bag_data_cnt_); + // set bagging data to tree learner + if (!is_use_subset_) { + tree_learner->SetBaggingData(nullptr, bag_data_indices_.data(), bag_data_cnt_); + } else { + // get subset + tmp_subset_->ReSize(bag_data_cnt_); + tmp_subset_->CopySubrow(train_data_, bag_data_indices_.data(), + bag_data_cnt_, false); + tree_learner->SetBaggingData(tmp_subset_.get(), bag_data_indices_.data(), + bag_data_cnt_); + } + } + } + + void Reset() override {} + + void ResetConfig(const Config* config, bool is_change_dataset, + std::vector>& gradients, + std::vector>& hessians) override { + // if need bagging, create buffer + data_size_t num_pos_data = 0; + if (objective_function_ != nullptr) { + num_pos_data = objective_function_->NumPositiveData(); + } + bool balance_bagging_cond = (config->pos_bagging_fraction < 1.0 || config->neg_bagging_fraction < 1.0) && (num_pos_data > 0); + if ((config->bagging_fraction < 1.0 || balance_bagging_cond) && config->bagging_freq > 0) { + need_re_bagging_ = false; + if (!is_change_dataset && + config_ != nullptr && config_->bagging_fraction == config->bagging_fraction && config_->bagging_freq == config->bagging_freq + && config_->pos_bagging_fraction == config->pos_bagging_fraction && config_->neg_bagging_fraction == config->neg_bagging_fraction) { + return; + } + if (balance_bagging_cond) { + balanced_bagging_ = true; + bag_data_cnt_ = static_cast(num_pos_data * config->pos_bagging_fraction) + + static_cast((num_data_ - num_pos_data) * config->neg_bagging_fraction); + } else { + bag_data_cnt_ = static_cast(config->bagging_fraction * num_data_); + } + bag_data_indices_.resize(num_data_); + bagging_runner_.ReSize(num_data_); + bagging_rands_.clear(); + for (int i = 0; + i < (num_data_ + bagging_rand_block_ - 1) / bagging_rand_block_; ++i) { + bagging_rands_.emplace_back(config_->bagging_seed + i); + } + + double average_bag_rate = + (static_cast(bag_data_cnt_) / num_data_) / config->bagging_freq; + is_use_subset_ = false; + const int group_threshold_usesubset = 100; + if (average_bag_rate <= 0.5 + && (train_data_->num_feature_groups() < group_threshold_usesubset)) { + if (tmp_subset_ == nullptr || is_change_dataset) { + tmp_subset_.reset(new Dataset(bag_data_cnt_)); + tmp_subset_->CopyFeatureMapperFrom(train_data_); + } + is_use_subset_ = true; + Log::Debug("Use subset for bagging"); + } + + need_re_bagging_ = true; + + if (is_use_subset_ && bag_data_cnt_ < num_data_) { + if (objective_function_ == nullptr) { + size_t total_size = static_cast(num_data_) * num_tree_per_iteration_; + gradients.resize(total_size); + hessians.resize(total_size); + } + } + } else { + bag_data_cnt_ = num_data_; + bag_data_indices_.clear(); + bagging_runner_.ReSize(0); + is_use_subset_ = false; + } + } + + data_size_t BaggingHelper(data_size_t start, data_size_t cnt, data_size_t* buffer) { + if (cnt <= 0) { + return 0; + } + data_size_t cur_left_cnt = 0; + data_size_t cur_right_pos = cnt; + // random bagging, minimal unit is one record + for (data_size_t i = 0; i < cnt; ++i) { + auto cur_idx = start + i; + if (bagging_rands_[cur_idx / bagging_rand_block_].NextFloat() < config_->bagging_fraction) { + buffer[cur_left_cnt++] = cur_idx; + } else { + buffer[--cur_right_pos] = cur_idx; + } + } + return cur_left_cnt; + } + + data_size_t BalancedBaggingHelper(data_size_t start, data_size_t cnt, data_size_t* buffer) { + if (cnt <= 0) { + return 0; + } + auto label_ptr = train_data_->metadata().label(); + data_size_t cur_left_cnt = 0; + data_size_t cur_right_pos = cnt; + // random bagging, minimal unit is one record + for (data_size_t i = 0; i < cnt; ++i) { + auto cur_idx = start + i; + bool is_pos = label_ptr[start + i] > 0; + bool is_in_bag = false; + if (is_pos) { + is_in_bag = bagging_rands_[cur_idx / bagging_rand_block_].NextFloat() < + config_->pos_bagging_fraction; + } else { + is_in_bag = bagging_rands_[cur_idx / bagging_rand_block_].NextFloat() < + config_->neg_bagging_fraction; + } + if (is_in_bag) { + buffer[cur_left_cnt++] = cur_idx; + } else { + buffer[--cur_right_pos] = cur_idx; + } + } + return cur_left_cnt; + } + + bool need_re_bagging_; +}; + +} // namespace LightGBM + +#endif // LIGHTGBM_SAMPLE_STRATEGY_BAGGING_HPP_ \ No newline at end of file diff --git a/src/boosting/gbdt.cpp b/src/boosting/gbdt.cpp index aca734e28eec..84065f87e572 100644 --- a/src/boosting/gbdt.cpp +++ b/src/boosting/gbdt.cpp @@ -88,10 +88,6 @@ void GBDT::Init(const Config* config, const Dataset* train_data, const Objective } } - CHECK(!(config_->bagging_freq > 0)); // can not use normal bagging in this version - data_sample_strategy_.reset(SampleStrategy::CreateSampleStrategy(config_.get(), train_data_, num_tree_per_iteration_)); - data_sample_strategy_->Reset(); - is_constant_hessian_ = GetIsConstHessian(objective_function); tree_learner_ = std::unique_ptr(TreeLearner::CreateTreeLearner(config_->tree_learner, config_->device_type, @@ -131,7 +127,10 @@ void GBDT::Init(const Config* config, const Dataset* train_data, const Objective monotone_constraints_ = config->monotone_constraints; // if need bagging, create buffer - ResetBaggingConfig(config_.get(), true); + // ResetBaggingConfig(config_.get(), true); + data_sample_strategy_.reset(SampleStrategy::CreateSampleStrategy(config_.get(), train_data_, objective_function_, num_tree_per_iteration_)); + data_sample_strategy_->ResetConfig(config_.get(), true, gradients_, hessians_); + data_sample_strategy_->Reset(); class_need_train_ = std::vector(num_tree_per_iteration_, true); if (objective_function_ != nullptr && objective_function_->SkipEmptyClass()) { @@ -399,7 +398,7 @@ bool GBDT::TrainOneIter(const score_t* gradients, const score_t* hessians) { } } // bagging logic - data_sample_strategy_->Bagging(iter_, gradients_.data(), hessians_.data(), tree_learner_.get()); + data_sample_strategy_->Bagging(iter_, tree_learner_.get(), gradients_.data(), hessians_.data()); bag_data_indices_ = data_sample_strategy_->bag_data_indices(); bag_data_cnt_ = data_sample_strategy_->bag_data_cnt(); is_use_subset_ = data_sample_strategy_->is_use_subset(); @@ -755,11 +754,12 @@ void GBDT::ResetTrainingData(const Dataset* train_data, const ObjectiveFunction* feature_infos_ = train_data_->feature_infos(); tree_learner_->ResetTrainingData(train_data, is_constant_hessian_); - ResetBaggingConfig(config_.get(), true); - data_sample_strategy_->Reset(); + // ResetBaggingConfig(config_.get(), true); + data_sample_strategy_->ResetConfig(config_.get(), true, gradients_, hessians_); } else { tree_learner_->ResetIsConstantHessian(is_constant_hessian_); } + data_sample_strategy_->Reset(); } void GBDT::ResetConfig(const Config* config) { @@ -779,9 +779,9 @@ void GBDT::ResetConfig(const Config* config) { tree_learner_->ResetConfig(new_config.get()); } if (train_data_ != nullptr) { - ResetBaggingConfig(new_config.get(), false); + // ResetBaggingConfig(new_config.get(), false); + data_sample_strategy_->ResetConfig(new_config.get(), false, gradients_, hessians_); } - data_sample_strategy_->Reset(); if (config_.get() != nullptr && config_->forcedsplits_filename != new_config->forcedsplits_filename) { // load forced_splits file if (!new_config->forcedsplits_filename.empty()) { @@ -798,6 +798,7 @@ void GBDT::ResetConfig(const Config* config) { } } config_.reset(new_config.release()); + data_sample_strategy_->Reset(); } void GBDT::ResetBaggingConfig(const Config* config, bool is_change_dataset) { diff --git a/src/boosting/goss1.hpp b/src/boosting/goss1.hpp index b0e49231a5a4..a51e4a1dc79c 100644 --- a/src/boosting/goss1.hpp +++ b/src/boosting/goss1.hpp @@ -19,7 +19,7 @@ class GOSS1 : public SampleStrategy { ~GOSS1() { } - void Bagging(int iter, score_t* gradients, score_t* hessians, TreeLearner* tree_learner) override { + void Bagging(int iter, TreeLearner* tree_learner, score_t* gradients, score_t* hessians) override { bag_data_cnt_ = num_data_; // not subsample for first iterations if (iter < static_cast(1.0f / config_->learning_rate)) { return; } @@ -73,8 +73,12 @@ class GOSS1 : public SampleStrategy { bag_data_cnt_ = num_data_; } + void ResetConfig(const Config* config, bool is_change_dataset, + std::vector>& gradients, + std::vector>& hessians) override {} + protected: - data_size_t Helper(data_size_t start, data_size_t cnt, data_size_t* buffer, score_t* gradients, score_t* hessians) override { + data_size_t Helper(data_size_t start, data_size_t cnt, data_size_t* buffer, score_t* gradients, score_t* hessians) { if (cnt <= 0) { return 0; } diff --git a/src/boosting/sample_strategy.cpp b/src/boosting/sample_strategy.cpp index cac1badb6c13..35b56ee24020 100644 --- a/src/boosting/sample_strategy.cpp +++ b/src/boosting/sample_strategy.cpp @@ -1,15 +1,16 @@ #include #include "goss1.hpp" +#include "bagging.hpp" namespace LightGBM { -SampleStrategy* SampleStrategy::CreateSampleStrategy(const Config* config, const Dataset* train_data, int num_tree_per_iteration) { +SampleStrategy* SampleStrategy::CreateSampleStrategy(const Config* config, const Dataset* train_data, const ObjectiveFunction* objective_function, int num_tree_per_iteration) { bool use_goss_as_boosting = config->boosting == std::string("goss"); bool use_goss_as_strategy = config->data_sample_strategy == std::string("goss"); if (use_goss_as_boosting || use_goss_as_strategy) { - return new GOSS1(config, train_data, num_tree_per_iteration); - } else if (config->data_sample_strategy == std::string("bagging")) { - return nullptr; + return new GOSS1(config, train_data, num_tree_per_iteration); + } else { + return new BAGGING(config, train_data, objective_function, num_tree_per_iteration); } } From dd40531b395dcac22243ae1c729510cb3a00b13d Mon Sep 17 00:00:00 2001 From: Guangda Liu Date: Fri, 12 Nov 2021 14:16:53 +0000 Subject: [PATCH 04/84] fix some variables --- include/LightGBM/sample_strategy.h | 2 +- src/boosting/gbdt.cpp | 21 +++++++++++---------- src/boosting/rf.hpp | 20 ++++++++++++-------- 3 files changed, 24 insertions(+), 19 deletions(-) diff --git a/include/LightGBM/sample_strategy.h b/include/LightGBM/sample_strategy.h index b1770d28ccb4..e86f5e6aa6f0 100644 --- a/include/LightGBM/sample_strategy.h +++ b/include/LightGBM/sample_strategy.h @@ -23,7 +23,7 @@ class SampleStrategy { std::vector>& hessians) = 0; bool is_use_subset() {return is_use_subset_;} data_size_t bag_data_cnt() {return bag_data_cnt_;} - std::vector> bag_data_indices() {return bag_data_indices_;} + std::vector>& bag_data_indices() {return bag_data_indices_;} protected: const Config* config_; diff --git a/src/boosting/gbdt.cpp b/src/boosting/gbdt.cpp index 84065f87e572..fd68263bbbe3 100644 --- a/src/boosting/gbdt.cpp +++ b/src/boosting/gbdt.cpp @@ -399,9 +399,9 @@ bool GBDT::TrainOneIter(const score_t* gradients, const score_t* hessians) { } // bagging logic data_sample_strategy_->Bagging(iter_, tree_learner_.get(), gradients_.data(), hessians_.data()); - bag_data_indices_ = data_sample_strategy_->bag_data_indices(); - bag_data_cnt_ = data_sample_strategy_->bag_data_cnt(); - is_use_subset_ = data_sample_strategy_->is_use_subset(); + const bool is_use_subset = data_sample_strategy_->is_use_subset(); + const data_size_t bag_data_cnt = data_sample_strategy_->bag_data_cnt(); + const std::vector>& bag_data_indices = data_sample_strategy_->bag_data_indices(); bool should_continue = false; for (int cur_tree_id = 0; cur_tree_id < num_tree_per_iteration_; ++cur_tree_id) { @@ -411,10 +411,10 @@ bool GBDT::TrainOneIter(const score_t* gradients, const score_t* hessians) { auto grad = gradients + offset; auto hess = hessians + offset; // need to copy gradients for bagging subset. - if (is_use_subset_ && bag_data_cnt_ < num_data_) { - for (int i = 0; i < bag_data_cnt_; ++i) { - gradients_[offset + i] = grad[bag_data_indices_[i]]; - hessians_[offset + i] = hess[bag_data_indices_[i]]; + if (is_use_subset && bag_data_cnt < num_data_) { + for (int i = 0; i < bag_data_cnt; ++i) { + gradients_[offset + i] = grad[bag_data_indices[i]]; + hessians_[offset + i] = hess[bag_data_indices[i]]; } grad = gradients_.data() + offset; hess = hessians_.data() + offset; @@ -513,12 +513,13 @@ bool GBDT::EvalAndCheckEarlyStopping() { void GBDT::UpdateScore(const Tree* tree, const int cur_tree_id) { Common::FunctionTimer fun_timer("GBDT::UpdateScore", global_timer); // update training score - if (!is_use_subset_) { + if (!data_sample_strategy_->is_use_subset()) { train_score_updater_->AddScore(tree_learner_.get(), tree, cur_tree_id); + const data_size_t bag_data_cnt = data_sample_strategy_->bag_data_cnt(); // we need to predict out-of-bag scores of data for boosting - if (num_data_ - bag_data_cnt_ > 0) { - train_score_updater_->AddScore(tree, bag_data_indices_.data() + bag_data_cnt_, num_data_ - bag_data_cnt_, cur_tree_id); + if (num_data_ - bag_data_cnt > 0) { + train_score_updater_->AddScore(tree, data_sample_strategy_->bag_data_indices().data() + bag_data_cnt, num_data_ - bag_data_cnt, cur_tree_id); } } else { diff --git a/src/boosting/rf.hpp b/src/boosting/rf.hpp index 5a9eb226fef5..20b80c025c18 100644 --- a/src/boosting/rf.hpp +++ b/src/boosting/rf.hpp @@ -48,7 +48,7 @@ class RF : public GBDT { shrinkage_rate_ = 1.0f; // only boosting one time Boosting(); - if (is_use_subset_ && bag_data_cnt_ < num_data_) { + if (data_sample_strategy_->is_use_subset() && data_sample_strategy_->bag_data_cnt() < num_data_) { tmp_grad_.resize(num_data_); tmp_hess_.resize(num_data_); } @@ -73,7 +73,7 @@ class RF : public GBDT { CHECK_EQ(num_tree_per_iteration_, num_class_); // only boosting one time Boosting(); - if (is_use_subset_ && bag_data_cnt_ < num_data_) { + if (data_sample_strategy_->is_use_subset() && data_sample_strategy_->bag_data_cnt() < num_data_) { tmp_grad_.resize(num_data_); tmp_hess_.resize(num_data_); } @@ -102,7 +102,11 @@ class RF : public GBDT { bool TrainOneIter(const score_t* gradients, const score_t* hessians) override { // bagging logic - Bagging(iter_); + data_sample_strategy_ ->Bagging(iter_, tree_learner_.get(), gradients_.data(), hessians_.data()); + const bool is_use_subset = data_sample_strategy_->is_use_subset(); + const data_size_t bag_data_cnt = data_sample_strategy_->bag_data_cnt(); + const std::vector>& bag_data_indices = data_sample_strategy_->bag_data_indices(); + CHECK_EQ(gradients, nullptr); CHECK_EQ(hessians, nullptr); @@ -116,10 +120,10 @@ class RF : public GBDT { auto hess = hessians + offset; // need to copy gradients for bagging subset. - if (is_use_subset_ && bag_data_cnt_ < num_data_) { - for (int i = 0; i < bag_data_cnt_; ++i) { - tmp_grad_[i] = grad[bag_data_indices_[i]]; - tmp_hess_[i] = hess[bag_data_indices_[i]]; + if (is_use_subset && bag_data_cnt < num_data_) { + for (int i = 0; i < bag_data_cnt; ++i) { + tmp_grad_[i] = grad[bag_data_indices[i]]; + tmp_hess_[i] = hess[bag_data_indices[i]]; } grad = tmp_grad_.data(); hess = tmp_hess_.data(); @@ -132,7 +136,7 @@ class RF : public GBDT { double pred = init_scores_[cur_tree_id]; auto residual_getter = [pred](const label_t* label, int i) {return static_cast(label[i]) - pred; }; tree_learner_->RenewTreeOutput(new_tree.get(), objective_function_, residual_getter, - num_data_, bag_data_indices_.data(), bag_data_cnt_); + num_data_, bag_data_indices.data(), bag_data_cnt); if (std::fabs(init_scores_[cur_tree_id]) > kEpsilon) { new_tree->AddBias(init_scores_[cur_tree_id]); } From 4b6095db63acffd70e9ad57bac2f33177abe76ae Mon Sep 17 00:00:00 2001 From: Guangda Liu Date: Fri, 12 Nov 2021 14:35:23 +0000 Subject: [PATCH 05/84] remove GOSS(as boost) and Bagging logic in GBDT --- src/boosting/boosting.cpp | 5 +- src/boosting/gbdt.cpp | 153 +------------------------------ src/boosting/gbdt.h | 21 ----- src/boosting/goss.hpp | 188 -------------------------------------- 4 files changed, 4 insertions(+), 363 deletions(-) delete mode 100644 src/boosting/goss.hpp diff --git a/src/boosting/boosting.cpp b/src/boosting/boosting.cpp index 91fa318a0f18..98f2554b1388 100644 --- a/src/boosting/boosting.cpp +++ b/src/boosting/boosting.cpp @@ -6,7 +6,6 @@ #include "dart.hpp" #include "gbdt.h" -#include "goss.hpp" #include "rf.hpp" namespace LightGBM { @@ -39,7 +38,7 @@ Boosting* Boosting::CreateBoosting(const std::string& type, const char* filename } else if (type == std::string("dart")) { return new DART(); } else if (type == std::string("goss")) { - return new GOSS(); + return new GBDT(); } else if (type == std::string("rf")) { return new RF(); } else { @@ -53,7 +52,7 @@ Boosting* Boosting::CreateBoosting(const std::string& type, const char* filename } else if (type == std::string("dart")) { ret.reset(new DART()); } else if (type == std::string("goss")) { - ret.reset(new GOSS()); + ret.reset(new GBDT()); } else if (type == std::string("rf")) { return new RF(); } else { diff --git a/src/boosting/gbdt.cpp b/src/boosting/gbdt.cpp index fd68263bbbe3..1f9f7d834a47 100644 --- a/src/boosting/gbdt.cpp +++ b/src/boosting/gbdt.cpp @@ -35,10 +35,7 @@ GBDT::GBDT() num_class_(1), num_iteration_for_pred_(0), shrinkage_rate_(0.1f), - num_init_iteration_(0), - need_re_bagging_(false), - balanced_bagging_(false), - bagging_runner_(0, bagging_rand_block_) { + num_init_iteration_(0) { average_output_ = false; tree_learner_ = nullptr; linear_tree_ = false; @@ -127,7 +124,6 @@ void GBDT::Init(const Config* config, const Dataset* train_data, const Objective monotone_constraints_ = config->monotone_constraints; // if need bagging, create buffer - // ResetBaggingConfig(config_.get(), true); data_sample_strategy_.reset(SampleStrategy::CreateSampleStrategy(config_.get(), train_data_, objective_function_, num_tree_per_iteration_)); data_sample_strategy_->ResetConfig(config_.get(), true, gradients_, hessians_); data_sample_strategy_->Reset(); @@ -186,89 +182,6 @@ void GBDT::Boosting() { GetGradients(GetTrainingScore(&num_score), gradients_.data(), hessians_.data()); } -data_size_t GBDT::BaggingHelper(data_size_t start, data_size_t cnt, data_size_t* buffer) { - if (cnt <= 0) { - return 0; - } - data_size_t cur_left_cnt = 0; - data_size_t cur_right_pos = cnt; - // random bagging, minimal unit is one record - for (data_size_t i = 0; i < cnt; ++i) { - auto cur_idx = start + i; - if (bagging_rands_[cur_idx / bagging_rand_block_].NextFloat() < config_->bagging_fraction) { - buffer[cur_left_cnt++] = cur_idx; - } else { - buffer[--cur_right_pos] = cur_idx; - } - } - return cur_left_cnt; -} - -data_size_t GBDT::BalancedBaggingHelper(data_size_t start, data_size_t cnt, - data_size_t* buffer) { - if (cnt <= 0) { - return 0; - } - auto label_ptr = train_data_->metadata().label(); - data_size_t cur_left_cnt = 0; - data_size_t cur_right_pos = cnt; - // random bagging, minimal unit is one record - for (data_size_t i = 0; i < cnt; ++i) { - auto cur_idx = start + i; - bool is_pos = label_ptr[start + i] > 0; - bool is_in_bag = false; - if (is_pos) { - is_in_bag = bagging_rands_[cur_idx / bagging_rand_block_].NextFloat() < - config_->pos_bagging_fraction; - } else { - is_in_bag = bagging_rands_[cur_idx / bagging_rand_block_].NextFloat() < - config_->neg_bagging_fraction; - } - if (is_in_bag) { - buffer[cur_left_cnt++] = cur_idx; - } else { - buffer[--cur_right_pos] = cur_idx; - } - } - return cur_left_cnt; -} - -void GBDT::Bagging(int iter) { - Common::FunctionTimer fun_timer("GBDT::Bagging", global_timer); - // if need bagging - if ((bag_data_cnt_ < num_data_ && iter % config_->bagging_freq == 0) || - need_re_bagging_) { - need_re_bagging_ = false; - auto left_cnt = bagging_runner_.Run( - num_data_, - [=](int, data_size_t cur_start, data_size_t cur_cnt, data_size_t* left, - data_size_t*) { - data_size_t cur_left_count = 0; - if (balanced_bagging_) { - cur_left_count = - BalancedBaggingHelper(cur_start, cur_cnt, left); - } else { - cur_left_count = BaggingHelper(cur_start, cur_cnt, left); - } - return cur_left_count; - }, - bag_data_indices_.data()); - bag_data_cnt_ = left_cnt; - Log::Debug("Re-bagging, using %d data to train", bag_data_cnt_); - // set bagging data to tree learner - if (!is_use_subset_) { - tree_learner_->SetBaggingData(nullptr, bag_data_indices_.data(), bag_data_cnt_); - } else { - // get subset - tmp_subset_->ReSize(bag_data_cnt_); - tmp_subset_->CopySubrow(train_data_, bag_data_indices_.data(), - bag_data_cnt_, false); - tree_learner_->SetBaggingData(tmp_subset_.get(), bag_data_indices_.data(), - bag_data_cnt_); - } - } -} - void GBDT::Train(int snapshot_freq, const std::string& model_output_path) { Common::FunctionTimer fun_timer("GBDT::Train", global_timer); bool is_finished = false; @@ -428,7 +341,7 @@ bool GBDT::TrainOneIter(const score_t* gradients, const score_t* hessians) { auto score_ptr = train_score_updater_->score() + offset; auto residual_getter = [score_ptr](const label_t* label, int i) {return static_cast(label[i]) - score_ptr[i]; }; tree_learner_->RenewTreeOutput(new_tree.get(), objective_function_, residual_getter, - num_data_, bag_data_indices_.data(), bag_data_cnt_); + num_data_, bag_data_indices.data(), bag_data_cnt); // shrinkage by learning rate new_tree->Shrinkage(shrinkage_rate_); // update score @@ -755,7 +668,6 @@ void GBDT::ResetTrainingData(const Dataset* train_data, const ObjectiveFunction* feature_infos_ = train_data_->feature_infos(); tree_learner_->ResetTrainingData(train_data, is_constant_hessian_); - // ResetBaggingConfig(config_.get(), true); data_sample_strategy_->ResetConfig(config_.get(), true, gradients_, hessians_); } else { tree_learner_->ResetIsConstantHessian(is_constant_hessian_); @@ -780,7 +692,6 @@ void GBDT::ResetConfig(const Config* config) { tree_learner_->ResetConfig(new_config.get()); } if (train_data_ != nullptr) { - // ResetBaggingConfig(new_config.get(), false); data_sample_strategy_->ResetConfig(new_config.get(), false, gradients_, hessians_); } if (config_.get() != nullptr && config_->forcedsplits_filename != new_config->forcedsplits_filename) { @@ -802,64 +713,4 @@ void GBDT::ResetConfig(const Config* config) { data_sample_strategy_->Reset(); } -void GBDT::ResetBaggingConfig(const Config* config, bool is_change_dataset) { - // if need bagging, create buffer - data_size_t num_pos_data = 0; - if (objective_function_ != nullptr) { - num_pos_data = objective_function_->NumPositiveData(); - } - bool balance_bagging_cond = (config->pos_bagging_fraction < 1.0 || config->neg_bagging_fraction < 1.0) && (num_pos_data > 0); - if ((config->bagging_fraction < 1.0 || balance_bagging_cond) && config->bagging_freq > 0) { - need_re_bagging_ = false; - if (!is_change_dataset && - config_.get() != nullptr && config_->bagging_fraction == config->bagging_fraction && config_->bagging_freq == config->bagging_freq - && config_->pos_bagging_fraction == config->pos_bagging_fraction && config_->neg_bagging_fraction == config->neg_bagging_fraction) { - return; - } - if (balance_bagging_cond) { - balanced_bagging_ = true; - bag_data_cnt_ = static_cast(num_pos_data * config->pos_bagging_fraction) - + static_cast((num_data_ - num_pos_data) * config->neg_bagging_fraction); - } else { - bag_data_cnt_ = static_cast(config->bagging_fraction * num_data_); - } - bag_data_indices_.resize(num_data_); - bagging_runner_.ReSize(num_data_); - bagging_rands_.clear(); - for (int i = 0; - i < (num_data_ + bagging_rand_block_ - 1) / bagging_rand_block_; ++i) { - bagging_rands_.emplace_back(config_->bagging_seed + i); - } - - double average_bag_rate = - (static_cast(bag_data_cnt_) / num_data_) / config->bagging_freq; - is_use_subset_ = false; - const int group_threshold_usesubset = 100; - if (average_bag_rate <= 0.5 - && (train_data_->num_feature_groups() < group_threshold_usesubset)) { - if (tmp_subset_ == nullptr || is_change_dataset) { - tmp_subset_.reset(new Dataset(bag_data_cnt_)); - tmp_subset_->CopyFeatureMapperFrom(train_data_); - } - is_use_subset_ = true; - Log::Debug("Use subset for bagging"); - } - - need_re_bagging_ = true; - - if (is_use_subset_ && bag_data_cnt_ < num_data_) { - if (objective_function_ == nullptr) { - size_t total_size = static_cast(num_data_) * num_tree_per_iteration_; - gradients_.resize(total_size); - hessians_.resize(total_size); - } - } - } else { - bag_data_cnt_ = num_data_; - bag_data_indices_.clear(); - bagging_runner_.ReSize(0); - is_use_subset_ = false; - } -} - } // namespace LightGBM diff --git a/src/boosting/gbdt.h b/src/boosting/gbdt.h index f41e14582f4d..13c427bd266c 100644 --- a/src/boosting/gbdt.h +++ b/src/boosting/gbdt.h @@ -413,18 +413,6 @@ class GBDT : public GBDTBase { */ void ResetBaggingConfig(const Config* config, bool is_change_dataset); - /*! - * \brief Implement bagging logic - * \param iter Current interation - */ - virtual void Bagging(int iter); - - virtual data_size_t BaggingHelper(data_size_t start, data_size_t cnt, - data_size_t* buffer); - - data_size_t BalancedBaggingHelper(data_size_t start, data_size_t cnt, - data_size_t* buffer); - /*! * \brief calculate the object function */ @@ -497,10 +485,6 @@ class GBDT : public GBDTBase { std::vector> hessians_; #endif - /*! \brief Store the indices of in-bag data */ - std::vector> bag_data_indices_; - /*! \brief Number of in-bag data */ - data_size_t bag_data_cnt_; /*! \brief Number of training data */ data_size_t num_data_; /*! \brief Number of trees per iterations */ @@ -520,8 +504,6 @@ class GBDT : public GBDTBase { /*! \brief Feature names */ std::vector feature_names_; std::vector feature_infos_; - std::unique_ptr tmp_subset_; - bool is_use_subset_; std::vector class_need_train_; bool is_constant_hessian_; std::unique_ptr loaded_objective_; @@ -530,9 +512,6 @@ class GBDT : public GBDTBase { bool balanced_bagging_; std::string loaded_parameter_; std::vector monotone_constraints_; - const int bagging_rand_block_ = 1024; - std::vector bagging_rands_; - ParallelPartitionRunner bagging_runner_; Json forced_splits_json_; bool linear_tree_; std::unique_ptr data_sample_strategy_; diff --git a/src/boosting/goss.hpp b/src/boosting/goss.hpp deleted file mode 100644 index 09b5b90763ce..000000000000 --- a/src/boosting/goss.hpp +++ /dev/null @@ -1,188 +0,0 @@ -/*! - * Copyright (c) 2017 Microsoft Corporation. All rights reserved. - * Licensed under the MIT License. See LICENSE file in the project root for license information. - */ -#ifndef LIGHTGBM_BOOSTING_GOSS_H_ -#define LIGHTGBM_BOOSTING_GOSS_H_ - -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include - -#include "gbdt.h" -#include "score_updater.hpp" - -namespace LightGBM { - -class GOSS: public GBDT { - public: - /*! - * \brief Constructor - */ - GOSS() : GBDT() { - } - - ~GOSS() { - } - - void Init(const Config* config, const Dataset* train_data, const ObjectiveFunction* objective_function, - const std::vector& training_metrics) override { - GBDT::Init(config, train_data, objective_function, training_metrics); - ResetGoss(); - if (objective_function_ == nullptr) { - // use customized objective function - size_t total_size = static_cast(num_data_) * num_tree_per_iteration_; - gradients_.resize(total_size, 0.0f); - hessians_.resize(total_size, 0.0f); - } - } - - void ResetTrainingData(const Dataset* train_data, const ObjectiveFunction* objective_function, - const std::vector& training_metrics) override { - GBDT::ResetTrainingData(train_data, objective_function, training_metrics); - ResetGoss(); - } - - void ResetConfig(const Config* config) override { - GBDT::ResetConfig(config); - ResetGoss(); - } - - bool TrainOneIter(const score_t* gradients, const score_t* hessians) override { - if (gradients != nullptr) { - // use customized objective function - CHECK(hessians != nullptr && objective_function_ == nullptr); - int64_t total_size = static_cast(num_data_) * num_tree_per_iteration_; - #pragma omp parallel for schedule(static) - for (int64_t i = 0; i < total_size; ++i) { - gradients_[i] = gradients[i]; - hessians_[i] = hessians[i]; - } - return GBDT::TrainOneIter(gradients_.data(), hessians_.data()); - } else { - CHECK(hessians == nullptr); - return GBDT::TrainOneIter(nullptr, nullptr); - } - } - - void ResetGoss() { - CHECK_LE(config_->top_rate + config_->other_rate, 1.0f); - CHECK(config_->top_rate > 0.0f && config_->other_rate > 0.0f); - if (config_->bagging_freq > 0 && config_->bagging_fraction != 1.0f) { - Log::Fatal("Cannot use bagging in GOSS"); - } - Log::Info("Using GOSS"); - balanced_bagging_ = false; - bag_data_indices_.resize(num_data_); - bagging_runner_.ReSize(num_data_); - bagging_rands_.clear(); - for (int i = 0; - i < (num_data_ + bagging_rand_block_ - 1) / bagging_rand_block_; ++i) { - bagging_rands_.emplace_back(config_->bagging_seed + i); - } - is_use_subset_ = false; - if (config_->top_rate + config_->other_rate <= 0.5) { - auto bag_data_cnt = static_cast((config_->top_rate + config_->other_rate) * num_data_); - bag_data_cnt = std::max(1, bag_data_cnt); - tmp_subset_.reset(new Dataset(bag_data_cnt)); - tmp_subset_->CopyFeatureMapperFrom(train_data_); - is_use_subset_ = true; - } - // flag to not bagging first - bag_data_cnt_ = num_data_; - } - - data_size_t BaggingHelper(data_size_t start, data_size_t cnt, data_size_t* buffer) override { - if (cnt <= 0) { - return 0; - } - std::vector tmp_gradients(cnt, 0.0f); - for (data_size_t i = 0; i < cnt; ++i) { - for (int cur_tree_id = 0; cur_tree_id < num_tree_per_iteration_; ++cur_tree_id) { - size_t idx = static_cast(cur_tree_id) * num_data_ + start + i; - tmp_gradients[i] += std::fabs(gradients_[idx] * hessians_[idx]); - } - } - data_size_t top_k = static_cast(cnt * config_->top_rate); - data_size_t other_k = static_cast(cnt * config_->other_rate); - top_k = std::max(1, top_k); - ArrayArgs::ArgMaxAtK(&tmp_gradients, 0, static_cast(tmp_gradients.size()), top_k - 1); - score_t threshold = tmp_gradients[top_k - 1]; - - score_t multiply = static_cast(cnt - top_k) / other_k; - data_size_t cur_left_cnt = 0; - data_size_t cur_right_pos = cnt; - data_size_t big_weight_cnt = 0; - for (data_size_t i = 0; i < cnt; ++i) { - auto cur_idx = start + i; - score_t grad = 0.0f; - for (int cur_tree_id = 0; cur_tree_id < num_tree_per_iteration_; ++cur_tree_id) { - size_t idx = static_cast(cur_tree_id) * num_data_ + cur_idx; - grad += std::fabs(gradients_[idx] * hessians_[idx]); - } - if (grad >= threshold) { - buffer[cur_left_cnt++] = cur_idx; - ++big_weight_cnt; - } else { - data_size_t sampled = cur_left_cnt - big_weight_cnt; - data_size_t rest_need = other_k - sampled; - data_size_t rest_all = (cnt - i) - (top_k - big_weight_cnt); - double prob = (rest_need) / static_cast(rest_all); - if (bagging_rands_[cur_idx / bagging_rand_block_].NextFloat() < prob) { - buffer[cur_left_cnt++] = cur_idx; - for (int cur_tree_id = 0; cur_tree_id < num_tree_per_iteration_; ++cur_tree_id) { - size_t idx = static_cast(cur_tree_id) * num_data_ + cur_idx; - gradients_[idx] *= multiply; - hessians_[idx] *= multiply; - } - } else { - buffer[--cur_right_pos] = cur_idx; - } - } - } - return cur_left_cnt; - } - - void Bagging(int iter) override { - bag_data_cnt_ = num_data_; - // not subsample for first iterations - if (iter < static_cast(1.0f / config_->learning_rate)) { return; } - auto left_cnt = bagging_runner_.Run( - num_data_, - [=](int, data_size_t cur_start, data_size_t cur_cnt, data_size_t* left, - data_size_t*) { - data_size_t cur_left_count = 0; - cur_left_count = BaggingHelper(cur_start, cur_cnt, left); - return cur_left_count; - }, - bag_data_indices_.data()); - bag_data_cnt_ = left_cnt; - // set bagging data to tree learner - if (!is_use_subset_) { - tree_learner_->SetBaggingData(nullptr, bag_data_indices_.data(), bag_data_cnt_); - } else { - // get subset - tmp_subset_->ReSize(bag_data_cnt_); - tmp_subset_->CopySubrow(train_data_, bag_data_indices_.data(), - bag_data_cnt_, false); - tree_learner_->SetBaggingData(tmp_subset_.get(), bag_data_indices_.data(), - bag_data_cnt_); - } - } - - protected: - bool GetIsConstHessian(const ObjectiveFunction*) override { - return false; - } -}; - -} // namespace LightGBM -#endif // LIGHTGBM_BOOSTING_GOSS_H_ From 2acb230593c83b78ec652805ea3704b33d689def Mon Sep 17 00:00:00 2001 From: Guangda Liu Date: Fri, 12 Nov 2021 14:37:28 +0000 Subject: [PATCH 06/84] rename GOSS1 to GOSS(as sample strategy) --- src/boosting/{goss1.hpp => goss.hpp} | 6 +++--- src/boosting/sample_strategy.cpp | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) rename src/boosting/{goss1.hpp => goss.hpp} (97%) diff --git a/src/boosting/goss1.hpp b/src/boosting/goss.hpp similarity index 97% rename from src/boosting/goss1.hpp rename to src/boosting/goss.hpp index a51e4a1dc79c..605b660bd8c9 100644 --- a/src/boosting/goss1.hpp +++ b/src/boosting/goss.hpp @@ -7,16 +7,16 @@ namespace LightGBM { -class GOSS1 : public SampleStrategy { +class GOSS : public SampleStrategy { public: - GOSS1(const Config* config, const Dataset* train_data, int num_tree_per_iteration) { + GOSS(const Config* config, const Dataset* train_data, int num_tree_per_iteration) { config_ = config; train_data_ = train_data; num_tree_per_iteration_ = num_tree_per_iteration; num_data_ = train_data->num_data(); } - ~GOSS1() { + ~GOSS() { } void Bagging(int iter, TreeLearner* tree_learner, score_t* gradients, score_t* hessians) override { diff --git a/src/boosting/sample_strategy.cpp b/src/boosting/sample_strategy.cpp index 35b56ee24020..53e00fcee14f 100644 --- a/src/boosting/sample_strategy.cpp +++ b/src/boosting/sample_strategy.cpp @@ -1,5 +1,5 @@ #include -#include "goss1.hpp" +#include "goss.hpp" #include "bagging.hpp" namespace LightGBM { @@ -8,7 +8,7 @@ SampleStrategy* SampleStrategy::CreateSampleStrategy(const Config* config, const bool use_goss_as_boosting = config->boosting == std::string("goss"); bool use_goss_as_strategy = config->data_sample_strategy == std::string("goss"); if (use_goss_as_boosting || use_goss_as_strategy) { - return new GOSS1(config, train_data, num_tree_per_iteration); + return new GOSS(config, train_data, num_tree_per_iteration); } else { return new BAGGING(config, train_data, objective_function, num_tree_per_iteration); } From 8b25d657519d4562f89fcaf78b794b5102900854 Mon Sep 17 00:00:00 2001 From: Guangda Liu Date: Fri, 12 Nov 2021 14:47:15 +0000 Subject: [PATCH 07/84] add warning about use GOSS as boosting_type --- src/boosting/sample_strategy.cpp | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/boosting/sample_strategy.cpp b/src/boosting/sample_strategy.cpp index 53e00fcee14f..656504eef7c7 100644 --- a/src/boosting/sample_strategy.cpp +++ b/src/boosting/sample_strategy.cpp @@ -7,6 +7,12 @@ namespace LightGBM { SampleStrategy* SampleStrategy::CreateSampleStrategy(const Config* config, const Dataset* train_data, const ObjectiveFunction* objective_function, int num_tree_per_iteration) { bool use_goss_as_boosting = config->boosting == std::string("goss"); bool use_goss_as_strategy = config->data_sample_strategy == std::string("goss"); + if (use_goss_as_boosting) { + Log::Warning("Setting goss as `boosting_type` is NOT suggested. Please set `data_sample_strategy = goss` in your config file."); + if (use_goss_as_strategy) { + Log::Warning("Both `boosting_type` and `data_sample_strategy` are set as GOSS. Only one time of sampling will be conducted. Please check and modify your config file.") + } + } if (use_goss_as_boosting || use_goss_as_strategy) { return new GOSS(config, train_data, num_tree_per_iteration); } else { From 05a8d15af99a078cd5c36e0d16454eeca95ec937 Mon Sep 17 00:00:00 2001 From: Guangda Liu Date: Fri, 12 Nov 2021 14:52:04 +0000 Subject: [PATCH 08/84] a little ; bug --- src/boosting/sample_strategy.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/boosting/sample_strategy.cpp b/src/boosting/sample_strategy.cpp index 656504eef7c7..c3d6ea8378c9 100644 --- a/src/boosting/sample_strategy.cpp +++ b/src/boosting/sample_strategy.cpp @@ -10,7 +10,7 @@ SampleStrategy* SampleStrategy::CreateSampleStrategy(const Config* config, const if (use_goss_as_boosting) { Log::Warning("Setting goss as `boosting_type` is NOT suggested. Please set `data_sample_strategy = goss` in your config file."); if (use_goss_as_strategy) { - Log::Warning("Both `boosting_type` and `data_sample_strategy` are set as GOSS. Only one time of sampling will be conducted. Please check and modify your config file.") + Log::Warning("Both `boosting_type` and `data_sample_strategy` are set as GOSS. Only one time of sampling will be conducted. Please check and modify your config file."); } } if (use_goss_as_boosting || use_goss_as_strategy) { From 6f9c8cceb26f283a127ed50815e7f08407824bfa Mon Sep 17 00:00:00 2001 From: Guangda Liu Date: Mon, 15 Nov 2021 08:42:08 +0000 Subject: [PATCH 09/84] remove CHECK when "gradients != nullptr" --- src/boosting/gbdt.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/boosting/gbdt.cpp b/src/boosting/gbdt.cpp index 1f9f7d834a47..f7392dd29dc2 100644 --- a/src/boosting/gbdt.cpp +++ b/src/boosting/gbdt.cpp @@ -301,8 +301,8 @@ bool GBDT::TrainOneIter(const score_t* gradients, const score_t* hessians) { } else if (gradients != nullptr) { // use customized objective function CHECK(hessians != nullptr && objective_function_ == nullptr); - // and will be only used for GOSS - CHECK(config_->boosting==std::string("goss") || config_->data_sample_strategy==std::string("goss")); + // and will be only used for GOSS (No? But copying gradients will not have effect when bagging) + // CHECK(config_->boosting==std::string("goss") || config_->data_sample_strategy==std::string("goss")); int64_t total_size = static_cast(num_data_) * num_tree_per_iteration_; #pragma omp parallel for schedule(static) for (int64_t i = 0; i < total_size; ++i) { From 80c4f70b4b7855e8dec91b8a30ef1a573c775d06 Mon Sep 17 00:00:00 2001 From: Guangda Liu Date: Sun, 5 Dec 2021 09:13:45 +0000 Subject: [PATCH 10/84] rename DataSampleStrategy to avoid confusion --- include/LightGBM/sample_strategy.h | 4 ++-- src/boosting/bagging.hpp | 4 ++-- src/boosting/gbdt.cpp | 12 ++++++------ src/boosting/goss.hpp | 4 ++-- 4 files changed, 12 insertions(+), 12 deletions(-) diff --git a/include/LightGBM/sample_strategy.h b/include/LightGBM/sample_strategy.h index e86f5e6aa6f0..96768bbae99f 100644 --- a/include/LightGBM/sample_strategy.h +++ b/include/LightGBM/sample_strategy.h @@ -17,8 +17,8 @@ class SampleStrategy { virtual ~SampleStrategy() {}; static SampleStrategy* CreateSampleStrategy(const Config* config, const Dataset* train_data, const ObjectiveFunction* objective_function, int num_tree_per_iteration); virtual void Bagging(int iter, TreeLearner* tree_learner, score_t* gradients, score_t* hessians) = 0; - virtual void Reset() = 0; - virtual void ResetConfig(const Config* config, bool is_change_dataset, + virtual void ResetGOSS() = 0; + virtual void ResetBaggingConfig(const Config* config, bool is_change_dataset, std::vector>& gradients, std::vector>& hessians) = 0; bool is_use_subset() {return is_use_subset_;} diff --git a/src/boosting/bagging.hpp b/src/boosting/bagging.hpp index fb7e230cae16..577ffa7e14a1 100644 --- a/src/boosting/bagging.hpp +++ b/src/boosting/bagging.hpp @@ -51,9 +51,9 @@ class BAGGING : public SampleStrategy { } } - void Reset() override {} + void ResetGOSS() override {} - void ResetConfig(const Config* config, bool is_change_dataset, + void ResetBaggingConfig(const Config* config, bool is_change_dataset, std::vector>& gradients, std::vector>& hessians) override { // if need bagging, create buffer diff --git a/src/boosting/gbdt.cpp b/src/boosting/gbdt.cpp index f7392dd29dc2..559a2d7b1002 100644 --- a/src/boosting/gbdt.cpp +++ b/src/boosting/gbdt.cpp @@ -125,8 +125,8 @@ void GBDT::Init(const Config* config, const Dataset* train_data, const Objective // if need bagging, create buffer data_sample_strategy_.reset(SampleStrategy::CreateSampleStrategy(config_.get(), train_data_, objective_function_, num_tree_per_iteration_)); - data_sample_strategy_->ResetConfig(config_.get(), true, gradients_, hessians_); - data_sample_strategy_->Reset(); + data_sample_strategy_->ResetBaggingConfig(config_.get(), true, gradients_, hessians_); + data_sample_strategy_->ResetGOSS(); class_need_train_ = std::vector(num_tree_per_iteration_, true); if (objective_function_ != nullptr && objective_function_->SkipEmptyClass()) { @@ -668,11 +668,11 @@ void GBDT::ResetTrainingData(const Dataset* train_data, const ObjectiveFunction* feature_infos_ = train_data_->feature_infos(); tree_learner_->ResetTrainingData(train_data, is_constant_hessian_); - data_sample_strategy_->ResetConfig(config_.get(), true, gradients_, hessians_); + data_sample_strategy_->ResetBaggingConfig(config_.get(), true, gradients_, hessians_); } else { tree_learner_->ResetIsConstantHessian(is_constant_hessian_); } - data_sample_strategy_->Reset(); + data_sample_strategy_->ResetGOSS(); } void GBDT::ResetConfig(const Config* config) { @@ -692,7 +692,7 @@ void GBDT::ResetConfig(const Config* config) { tree_learner_->ResetConfig(new_config.get()); } if (train_data_ != nullptr) { - data_sample_strategy_->ResetConfig(new_config.get(), false, gradients_, hessians_); + data_sample_strategy_->ResetBaggingConfig(new_config.get(), false, gradients_, hessians_); } if (config_.get() != nullptr && config_->forcedsplits_filename != new_config->forcedsplits_filename) { // load forced_splits file @@ -710,7 +710,7 @@ void GBDT::ResetConfig(const Config* config) { } } config_.reset(new_config.release()); - data_sample_strategy_->Reset(); + data_sample_strategy_->ResetGOSS(); } } // namespace LightGBM diff --git a/src/boosting/goss.hpp b/src/boosting/goss.hpp index 605b660bd8c9..79c15fd278c1 100644 --- a/src/boosting/goss.hpp +++ b/src/boosting/goss.hpp @@ -46,7 +46,7 @@ class GOSS : public SampleStrategy { } } - void Reset() override { + void ResetGOSS() override { CHECK_LE(config_->top_rate + config_->other_rate, 1.0f); CHECK(config_->top_rate > 0.0f && config_->other_rate > 0.0f); if (config_->bagging_freq > 0 && config_->bagging_fraction != 1.0f) { @@ -73,7 +73,7 @@ class GOSS : public SampleStrategy { bag_data_cnt_ = num_data_; } - void ResetConfig(const Config* config, bool is_change_dataset, + void ResetBaggingConfig(const Config* config, bool is_change_dataset, std::vector>& gradients, std::vector>& hessians) override {} From 8103d81574b6c6ca905c925d8abfd41b0770028b Mon Sep 17 00:00:00 2001 From: Guangda Liu Date: Sun, 5 Dec 2021 09:33:04 +0000 Subject: [PATCH 11/84] remove and add some ccomments, followingconvention --- include/LightGBM/config.h | 5 +++++ include/LightGBM/sample_strategy.h | 15 ++++++++++++--- src/boosting/gbdt.cpp | 2 -- 3 files changed, 17 insertions(+), 5 deletions(-) diff --git a/include/LightGBM/config.h b/include/LightGBM/config.h index 7ba9b47f7298..238f14126dca 100644 --- a/include/LightGBM/config.h +++ b/include/LightGBM/config.h @@ -149,6 +149,11 @@ struct Config { // descl2 = **Note**: internally, LightGBM uses ``gbdt`` mode for the first ``1 / learning_rate`` iterations std::string boosting = "gbdt"; + // [doc-only] + // type = enum + // options = bagging, goss + // desc = ``bagging``, Randomly Bagging Sampling + // desc = ``goss``, Gradient-based One-Side Sampling std::string data_sample_strategy = "bagging"; // alias = train, train_data, train_data_file, data_filename diff --git a/include/LightGBM/sample_strategy.h b/include/LightGBM/sample_strategy.h index 96768bbae99f..bcb499e42cf2 100644 --- a/include/LightGBM/sample_strategy.h +++ b/include/LightGBM/sample_strategy.h @@ -13,16 +13,25 @@ namespace LightGBM { class SampleStrategy { public: + SampleStrategy() : balanced_bagging_(false), bagging_runner_(0, bagging_rand_block_) {}; + virtual ~SampleStrategy() {}; + static SampleStrategy* CreateSampleStrategy(const Config* config, const Dataset* train_data, const ObjectiveFunction* objective_function, int num_tree_per_iteration); + virtual void Bagging(int iter, TreeLearner* tree_learner, score_t* gradients, score_t* hessians) = 0; + virtual void ResetGOSS() = 0; + virtual void ResetBaggingConfig(const Config* config, bool is_change_dataset, std::vector>& gradients, std::vector>& hessians) = 0; - bool is_use_subset() {return is_use_subset_;} - data_size_t bag_data_cnt() {return bag_data_cnt_;} + + bool is_use_subset() const { return is_use_subset_; } + + data_size_t bag_data_cnt() const { return bag_data_cnt_; } + std::vector>& bag_data_indices() {return bag_data_indices_;} protected: @@ -42,4 +51,4 @@ class SampleStrategy { }; } // namespace LightGBM -#endif // LIGHTGBM_SAMPLE_STRATEGY_H_ \ No newline at end of file +#endif // LIGHTGBM_SAMPLE_STRATEGY_H_ diff --git a/src/boosting/gbdt.cpp b/src/boosting/gbdt.cpp index 559a2d7b1002..3aa141c0b320 100644 --- a/src/boosting/gbdt.cpp +++ b/src/boosting/gbdt.cpp @@ -301,8 +301,6 @@ bool GBDT::TrainOneIter(const score_t* gradients, const score_t* hessians) { } else if (gradients != nullptr) { // use customized objective function CHECK(hessians != nullptr && objective_function_ == nullptr); - // and will be only used for GOSS (No? But copying gradients will not have effect when bagging) - // CHECK(config_->boosting==std::string("goss") || config_->data_sample_strategy==std::string("goss")); int64_t total_size = static_cast(num_data_) * num_tree_per_iteration_; #pragma omp parallel for schedule(static) for (int64_t i = 0; i < total_size; ++i) { From 94a17eec9ac7b847afe6b972ffa93fa2786da28a Mon Sep 17 00:00:00 2001 From: Guangda Liu Date: Sun, 5 Dec 2021 09:48:51 +0000 Subject: [PATCH 12/84] =?UTF-8?q?fix=20bug=20about=20GBDT::ResetConfig=20(?= =?UTF-8?q?ObjectiveFunction=20inconsistencty=20bet=E2=80=A6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- include/LightGBM/sample_strategy.h | 8 ++++++++ src/boosting/gbdt.cpp | 2 ++ src/boosting/goss.hpp | 8 +++++++- 3 files changed, 17 insertions(+), 1 deletion(-) diff --git a/include/LightGBM/sample_strategy.h b/include/LightGBM/sample_strategy.h index bcb499e42cf2..1ea52b999206 100644 --- a/include/LightGBM/sample_strategy.h +++ b/include/LightGBM/sample_strategy.h @@ -34,6 +34,14 @@ class SampleStrategy { std::vector>& bag_data_indices() {return bag_data_indices_;} + void UpdateObjectiveFunction(const ObjectiveFunction* objective_function) { + objective_function_ = objective_function; + } + + void UpdateTrainingData(const Dataset* train_data) { + train_data_ = train_data; + } + protected: const Config* config_; const Dataset* train_data_; diff --git a/src/boosting/gbdt.cpp b/src/boosting/gbdt.cpp index 3aa141c0b320..7e3f509754ab 100644 --- a/src/boosting/gbdt.cpp +++ b/src/boosting/gbdt.cpp @@ -622,6 +622,7 @@ void GBDT::ResetTrainingData(const Dataset* train_data, const ObjectiveFunction* } objective_function_ = objective_function; + data_sample_strategy_->UpdateObjectiveFunction(objective_function); if (objective_function_ != nullptr) { CHECK_EQ(num_tree_per_iteration_, objective_function_->NumModelPerIteration()); if (objective_function_->IsRenewTreeOutput() && !config_->monotone_constraints.empty()) { @@ -639,6 +640,7 @@ void GBDT::ResetTrainingData(const Dataset* train_data, const ObjectiveFunction* if (train_data != train_data_) { train_data_ = train_data; + data_sample_strategy_->UpdateTrainingData(train_data); // not same training data, need reset score and others // create score tracker train_score_updater_.reset(new ScoreUpdater(train_data_, num_tree_per_iteration_)); diff --git a/src/boosting/goss.hpp b/src/boosting/goss.hpp index 79c15fd278c1..28bc06267aa0 100644 --- a/src/boosting/goss.hpp +++ b/src/boosting/goss.hpp @@ -75,7 +75,13 @@ class GOSS : public SampleStrategy { void ResetBaggingConfig(const Config* config, bool is_change_dataset, std::vector>& gradients, - std::vector>& hessians) override {} + std::vector>& hessians) override { + // Cannot use bagging in GOSS + bag_data_cnt_ = num_data_; + bag_data_indices_.clear(); + bagging_runner_.ReSize(0); + is_use_subset_ = false; + } protected: data_size_t Helper(data_size_t start, data_size_t cnt, data_size_t* buffer, score_t* gradients, score_t* hessians) { From f000f0a500e84e7d493952360858192ffcdfa840 Mon Sep 17 00:00:00 2001 From: Guangda Liu Date: Tue, 7 Dec 2021 07:04:39 +0000 Subject: [PATCH 13/84] add std::ignore to avoid compiler warnings (anpotential fails) --- src/boosting/bagging.hpp | 3 +++ src/boosting/goss.hpp | 7 ++++++- 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/src/boosting/bagging.hpp b/src/boosting/bagging.hpp index 577ffa7e14a1..ef32486ab4c6 100644 --- a/src/boosting/bagging.hpp +++ b/src/boosting/bagging.hpp @@ -49,6 +49,9 @@ class BAGGING : public SampleStrategy { bag_data_cnt_); } } + // avoid warnings + std::ignore = gradients; + std::ignore = hessians; } void ResetGOSS() override {} diff --git a/src/boosting/goss.hpp b/src/boosting/goss.hpp index 28bc06267aa0..990b4fec8fc6 100644 --- a/src/boosting/goss.hpp +++ b/src/boosting/goss.hpp @@ -80,7 +80,12 @@ class GOSS : public SampleStrategy { bag_data_cnt_ = num_data_; bag_data_indices_.clear(); bagging_runner_.ReSize(0); - is_use_subset_ = false; + is_use_subset_ = false; + // avoid warnings + std::ignore = config; + std::ignore = is_change_dataset; + std::ignore = gradients; + std::ignore = hessians; } protected: From 0ca5cb1400112eece6d8f99983415f99491637b9 Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Wed, 8 Dec 2021 04:14:07 +0000 Subject: [PATCH 14/84] update Makevars and vcxproj --- R-package/src/Makevars.in | 1 + R-package/src/Makevars.win.in | 1 + windows/LightGBM.vcxproj | 2 ++ windows/LightGBM.vcxproj.filters | 6 ++++++ 4 files changed, 10 insertions(+) diff --git a/R-package/src/Makevars.in b/R-package/src/Makevars.in index 2490ba0757df..eca4ccc73e0a 100644 --- a/R-package/src/Makevars.in +++ b/R-package/src/Makevars.in @@ -26,6 +26,7 @@ OBJECTS = \ boosting/gbdt_model_text.o \ boosting/gbdt_prediction.o \ boosting/prediction_early_stop.o \ + boosting/sample_strategy.o \ io/bin.o \ io/config.o \ io/config_auto.o \ diff --git a/R-package/src/Makevars.win.in b/R-package/src/Makevars.win.in index 0fb2de926905..bbefe3c4fe15 100644 --- a/R-package/src/Makevars.win.in +++ b/R-package/src/Makevars.win.in @@ -27,6 +27,7 @@ OBJECTS = \ boosting/gbdt_model_text.o \ boosting/gbdt_prediction.o \ boosting/prediction_early_stop.o \ + boosting/sample_strategy.o \ io/bin.o \ io/config.o \ io/config_auto.o \ diff --git a/windows/LightGBM.vcxproj b/windows/LightGBM.vcxproj index 59b589a40d51..f309d4fab820 100644 --- a/windows/LightGBM.vcxproj +++ b/windows/LightGBM.vcxproj @@ -253,6 +253,7 @@ + @@ -311,6 +312,7 @@ + diff --git a/windows/LightGBM.vcxproj.filters b/windows/LightGBM.vcxproj.filters index 0f48c7564580..56b4e29287d5 100644 --- a/windows/LightGBM.vcxproj.filters +++ b/windows/LightGBM.vcxproj.filters @@ -129,6 +129,9 @@ include\LightGBM + + include\LightGBM + include\LightGBM @@ -311,6 +314,9 @@ src\boosting + + src\boosting + src\io From 2a58353354a06c80c14a3522bfdba8fcda810db8 Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Wed, 8 Dec 2021 06:39:41 +0000 Subject: [PATCH 15/84] handle constant hessian move resize of gradient vectors out of sample strategy --- include/LightGBM/sample_strategy.h | 15 +++++---- src/boosting/bagging.hpp | 19 +++++++----- src/boosting/gbdt.cpp | 49 ++++++++++++++++++++---------- src/boosting/goss.hpp | 24 +++++++++------ src/boosting/sample_strategy.cpp | 2 +- 5 files changed, 70 insertions(+), 39 deletions(-) diff --git a/include/LightGBM/sample_strategy.h b/include/LightGBM/sample_strategy.h index 1ea52b999206..7dba6c7b69e9 100644 --- a/include/LightGBM/sample_strategy.h +++ b/include/LightGBM/sample_strategy.h @@ -13,8 +13,7 @@ namespace LightGBM { class SampleStrategy { public: - - SampleStrategy() : balanced_bagging_(false), bagging_runner_(0, bagging_rand_block_) {}; + SampleStrategy() : balanced_bagging_(false), bagging_runner_(0, bagging_rand_block_), need_resize_gradients_(false) {}; virtual ~SampleStrategy() {}; @@ -23,10 +22,8 @@ class SampleStrategy { virtual void Bagging(int iter, TreeLearner* tree_learner, score_t* gradients, score_t* hessians) = 0; virtual void ResetGOSS() = 0; - - virtual void ResetBaggingConfig(const Config* config, bool is_change_dataset, - std::vector>& gradients, - std::vector>& hessians) = 0; + + virtual void ResetBaggingConfig(const Config* config, bool is_change_dataset) = 0; bool is_use_subset() const { return is_use_subset_; } @@ -42,6 +39,10 @@ class SampleStrategy { train_data_ = train_data; } + virtual bool IsHessianChange() const = 0; + + bool NeedResizeGradients() const { return need_resize_gradients_; } + protected: const Config* config_; const Dataset* train_data_; @@ -56,6 +57,8 @@ class SampleStrategy { const int bagging_rand_block_ = 1024; std::vector bagging_rands_; ParallelPartitionRunner bagging_runner_; + /*! \brief whether need to resize the gradient vectors */ + bool need_resize_gradients_; }; } // namespace LightGBM diff --git a/src/boosting/bagging.hpp b/src/boosting/bagging.hpp index ef32486ab4c6..a9ce2f19d11b 100644 --- a/src/boosting/bagging.hpp +++ b/src/boosting/bagging.hpp @@ -13,6 +13,7 @@ class BAGGING : public SampleStrategy { objective_function_ = objective_function; num_tree_per_iteration_ = num_tree_per_iteration; } + ~BAGGING() {} void Bagging(int iter, TreeLearner* tree_learner, score_t* gradients, score_t* hessians) override { @@ -54,11 +55,10 @@ class BAGGING : public SampleStrategy { std::ignore = hessians; } - void ResetGOSS() override {} + void ResetGOSS() override {} - void ResetBaggingConfig(const Config* config, bool is_change_dataset, - std::vector>& gradients, - std::vector>& hessians) override { + void ResetBaggingConfig(const Config* config, bool is_change_dataset) override { + need_resize_gradients_ = false; // if need bagging, create buffer data_size_t num_pos_data = 0; if (objective_function_ != nullptr) { @@ -105,9 +105,8 @@ class BAGGING : public SampleStrategy { if (is_use_subset_ && bag_data_cnt_ < num_data_) { if (objective_function_ == nullptr) { - size_t total_size = static_cast(num_data_) * num_tree_per_iteration_; - gradients.resize(total_size); - hessians.resize(total_size); + // resize gradient vectors to copy the customized gradients for using subset data + need_resize_gradients_ = true; } } } else { @@ -118,6 +117,11 @@ class BAGGING : public SampleStrategy { } } + bool IsHessianChange() const { + return false; + } + + private: data_size_t BaggingHelper(data_size_t start, data_size_t cnt, data_size_t* buffer) { if (cnt <= 0) { return 0; @@ -164,6 +168,7 @@ class BAGGING : public SampleStrategy { return cur_left_cnt; } + /*! \brief whether need restart bagging in continued training */ bool need_re_bagging_; }; diff --git a/src/boosting/gbdt.cpp b/src/boosting/gbdt.cpp index 7e3f509754ab..8a0bca95a8dd 100644 --- a/src/boosting/gbdt.cpp +++ b/src/boosting/gbdt.cpp @@ -39,6 +39,7 @@ GBDT::GBDT() average_output_ = false; tree_learner_ = nullptr; linear_tree_ = false; + data_sample_strategy_.reset(nullptr); } GBDT::~GBDT() { @@ -85,7 +86,8 @@ void GBDT::Init(const Config* config, const Dataset* train_data, const Objective } } - is_constant_hessian_ = GetIsConstHessian(objective_function); + data_sample_strategy_.reset(SampleStrategy::CreateSampleStrategy(config_.get(), train_data_, objective_function_, num_tree_per_iteration_)); + is_constant_hessian_ = GetIsConstHessian(objective_function) && !data_sample_strategy_->IsHessianChange(); tree_learner_ = std::unique_ptr(TreeLearner::CreateTreeLearner(config_->tree_learner, config_->device_type, config_.get())); @@ -105,14 +107,10 @@ void GBDT::Init(const Config* config, const Dataset* train_data, const Objective num_data_ = train_data_->num_data(); // create buffer for gradients and Hessians - size_t total_size = static_cast(num_data_) * num_tree_per_iteration_; if (objective_function_ != nullptr) { + const size_t total_size = static_cast(num_data_) * num_tree_per_iteration_; gradients_.resize(total_size); hessians_.resize(total_size); - } else { - // use customized objective function, only for GOSS - gradients_.resize(total_size, 0.0f); - hessians_.resize(total_size, 0.0f); } // get max feature index max_feature_idx_ = train_data_->num_total_features() - 1; @@ -124,9 +122,14 @@ void GBDT::Init(const Config* config, const Dataset* train_data, const Objective monotone_constraints_ = config->monotone_constraints; // if need bagging, create buffer - data_sample_strategy_.reset(SampleStrategy::CreateSampleStrategy(config_.get(), train_data_, objective_function_, num_tree_per_iteration_)); - data_sample_strategy_->ResetBaggingConfig(config_.get(), true, gradients_, hessians_); + data_sample_strategy_->ResetBaggingConfig(config_.get(), true); data_sample_strategy_->ResetGOSS(); + if (data_sample_strategy_->NeedResizeGradients()) { + // resize gradient vectors to copy the customized gradients for goss or bagging with subset + const size_t total_size = static_cast(num_data_) * num_tree_per_iteration_; + gradients_.resize(total_size, 0.0f); + hessians_.resize(total_size, 0.0f); + } class_need_train_ = std::vector(num_tree_per_iteration_, true); if (objective_function_ != nullptr && objective_function_->SkipEmptyClass()) { @@ -301,11 +304,13 @@ bool GBDT::TrainOneIter(const score_t* gradients, const score_t* hessians) { } else if (gradients != nullptr) { // use customized objective function CHECK(hessians != nullptr && objective_function_ == nullptr); - int64_t total_size = static_cast(num_data_) * num_tree_per_iteration_; - #pragma omp parallel for schedule(static) - for (int64_t i = 0; i < total_size; ++i) { - gradients_[i] = gradients[i]; - hessians_[i] = hessians[i]; + if (config_->boosting == std::string("goss") || config_->data_sample_strategy == std::string("goss")) { + int64_t total_size = static_cast(num_data_) * num_tree_per_iteration_; + #pragma omp parallel for schedule(static) + for (int64_t i = 0; i < total_size; ++i) { + gradients_[i] = gradients[i]; + hessians_[i] = hessians[i]; + } } } // bagging logic @@ -629,7 +634,7 @@ void GBDT::ResetTrainingData(const Dataset* train_data, const ObjectiveFunction* Log::Fatal("Cannot use ``monotone_constraints`` in %s objective, please disable it.", objective_function_->GetName()); } } - is_constant_hessian_ = GetIsConstHessian(objective_function); + is_constant_hessian_ = GetIsConstHessian(objective_function) && !data_sample_strategy_->IsHessianChange(); // push training metrics training_metrics_.clear(); @@ -668,7 +673,13 @@ void GBDT::ResetTrainingData(const Dataset* train_data, const ObjectiveFunction* feature_infos_ = train_data_->feature_infos(); tree_learner_->ResetTrainingData(train_data, is_constant_hessian_); - data_sample_strategy_->ResetBaggingConfig(config_.get(), true, gradients_, hessians_); + data_sample_strategy_->ResetBaggingConfig(config_.get(), true); + if (data_sample_strategy_->NeedResizeGradients()) { + // resize gradient vectors to copy the customized gradients for goss or bagging with subset + const size_t total_size = static_cast(num_data_) * num_tree_per_iteration_; + gradients_.resize(total_size, 0.0f); + hessians_.resize(total_size, 0.0f); + } } else { tree_learner_->ResetIsConstantHessian(is_constant_hessian_); } @@ -692,7 +703,13 @@ void GBDT::ResetConfig(const Config* config) { tree_learner_->ResetConfig(new_config.get()); } if (train_data_ != nullptr) { - data_sample_strategy_->ResetBaggingConfig(new_config.get(), false, gradients_, hessians_); + data_sample_strategy_->ResetBaggingConfig(new_config.get(), false); + if (data_sample_strategy_->NeedResizeGradients()) { + // resize gradient vectors to copy the customized gradients for goss or bagging with subset + const size_t total_size = static_cast(num_data_) * num_tree_per_iteration_; + gradients_.resize(total_size, 0.0f); + hessians_.resize(total_size, 0.0f); + } } if (config_.get() != nullptr && config_->forcedsplits_filename != new_config->forcedsplits_filename) { // load forced_splits file diff --git a/src/boosting/goss.hpp b/src/boosting/goss.hpp index 990b4fec8fc6..d75acbb45c7e 100644 --- a/src/boosting/goss.hpp +++ b/src/boosting/goss.hpp @@ -15,7 +15,7 @@ class GOSS : public SampleStrategy { num_tree_per_iteration_ = num_tree_per_iteration; num_data_ = train_data->num_data(); } - + ~GOSS() { } @@ -73,22 +73,27 @@ class GOSS : public SampleStrategy { bag_data_cnt_ = num_data_; } - void ResetBaggingConfig(const Config* config, bool is_change_dataset, - std::vector>& gradients, - std::vector>& hessians) override { + void ResetBaggingConfig(const Config* config, bool is_change_dataset) override { // Cannot use bagging in GOSS bag_data_cnt_ = num_data_; bag_data_indices_.clear(); bagging_runner_.ReSize(0); is_use_subset_ = false; + need_resize_gradients_ = false; + if (objective_function_ == nullptr) { + // resize gradient vectors to copy the customized gradients for goss + need_resize_gradients_ = true; + } // avoid warnings std::ignore = config; std::ignore = is_change_dataset; - std::ignore = gradients; - std::ignore = hessians; - } + } - protected: + bool IsHessianChange() const { + return true; + } + + private: data_size_t Helper(data_size_t start, data_size_t cnt, data_size_t* buffer, score_t* gradients, score_t* hessians) { if (cnt <= 0) { return 0; @@ -143,4 +148,5 @@ class GOSS : public SampleStrategy { }; } // namespace LightGBM -#endif // LIGHTGBM_SAMPLE_STRATEGY_GOSS_HPP_ \ No newline at end of file + +#endif // LIGHTGBM_SAMPLE_STRATEGY_GOSS_HPP_ diff --git a/src/boosting/sample_strategy.cpp b/src/boosting/sample_strategy.cpp index c3d6ea8378c9..64aa50540ce1 100644 --- a/src/boosting/sample_strategy.cpp +++ b/src/boosting/sample_strategy.cpp @@ -20,4 +20,4 @@ SampleStrategy* SampleStrategy::CreateSampleStrategy(const Config* config, const } } -} // namespace LightGBM \ No newline at end of file +} // namespace LightGBM From 8775c055e5ed31d56d1f43f27e9d182f65b8a4c7 Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Wed, 8 Dec 2021 06:57:31 +0000 Subject: [PATCH 16/84] mark override for IsHessianChange --- src/boosting/bagging.hpp | 2 +- src/boosting/goss.hpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/boosting/bagging.hpp b/src/boosting/bagging.hpp index a9ce2f19d11b..1cdeb31c55db 100644 --- a/src/boosting/bagging.hpp +++ b/src/boosting/bagging.hpp @@ -117,7 +117,7 @@ class BAGGING : public SampleStrategy { } } - bool IsHessianChange() const { + bool IsHessianChange() const override { return false; } diff --git a/src/boosting/goss.hpp b/src/boosting/goss.hpp index d75acbb45c7e..77bcbfb2591d 100644 --- a/src/boosting/goss.hpp +++ b/src/boosting/goss.hpp @@ -89,7 +89,7 @@ class GOSS : public SampleStrategy { std::ignore = is_change_dataset; } - bool IsHessianChange() const { + bool IsHessianChange() const override { return true; } From 1e888efac5e5f88c49ccec8edc3dbfddda4fb8bc Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Wed, 8 Dec 2021 07:10:58 +0000 Subject: [PATCH 17/84] fix lint errors --- include/LightGBM/sample_strategy.h | 31 +++++++++++++++++++----------- src/boosting/bagging.hpp | 11 ++++++++--- src/boosting/goss.hpp | 17 +++++++++++----- src/boosting/rf.hpp | 2 +- src/boosting/sample_strategy.cpp | 7 ++++++- 5 files changed, 47 insertions(+), 21 deletions(-) diff --git a/include/LightGBM/sample_strategy.h b/include/LightGBM/sample_strategy.h index 7dba6c7b69e9..3bfb37b2c78f 100644 --- a/include/LightGBM/sample_strategy.h +++ b/include/LightGBM/sample_strategy.h @@ -1,3 +1,8 @@ +/*! + * Copyright (c) 2021 Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. See LICENSE file in the project root for license information. + */ + #ifndef LIGHTGBM_SAMPLE_STRATEGY_H_ #define LIGHTGBM_SAMPLE_STRATEGY_H_ @@ -9,26 +14,29 @@ #include #include +#include +#include + namespace LightGBM { class SampleStrategy { public: - SampleStrategy() : balanced_bagging_(false), bagging_runner_(0, bagging_rand_block_), need_resize_gradients_(false) {}; - - virtual ~SampleStrategy() {}; - + SampleStrategy() : balanced_bagging_(false), bagging_runner_(0, bagging_rand_block_), need_resize_gradients_(false) {} + + virtual ~SampleStrategy() {} + static SampleStrategy* CreateSampleStrategy(const Config* config, const Dataset* train_data, const ObjectiveFunction* objective_function, int num_tree_per_iteration); - + virtual void Bagging(int iter, TreeLearner* tree_learner, score_t* gradients, score_t* hessians) = 0; - + virtual void ResetGOSS() = 0; virtual void ResetBaggingConfig(const Config* config, bool is_change_dataset) = 0; - + bool is_use_subset() const { return is_use_subset_; } - + data_size_t bag_data_cnt() const { return bag_data_cnt_; } - + std::vector>& bag_data_indices() {return bag_data_indices_;} void UpdateObjectiveFunction(const ObjectiveFunction* objective_function) { @@ -61,5 +69,6 @@ class SampleStrategy { bool need_resize_gradients_; }; -} // namespace LightGBM -#endif // LIGHTGBM_SAMPLE_STRATEGY_H_ +} // namespace LightGBM + +#endif // LIGHTGBM_SAMPLE_STRATEGY_H_ diff --git a/src/boosting/bagging.hpp b/src/boosting/bagging.hpp index 1cdeb31c55db..f897c1af3e3a 100644 --- a/src/boosting/bagging.hpp +++ b/src/boosting/bagging.hpp @@ -1,3 +1,8 @@ +/*! + * Copyright (c) 2021 Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. See LICENSE file in the project root for license information. + */ + #ifndef LIGHTGBM_SAMPLE_STRATEGY_BAGGING_HPP_ #define LIGHTGBM_SAMPLE_STRATEGY_BAGGING_HPP_ @@ -5,7 +10,7 @@ namespace LightGBM { class BAGGING : public SampleStrategy { public: - BAGGING(const Config* config, const Dataset* train_data, const ObjectiveFunction* objective_function, int num_tree_per_iteration) + BAGGING(const Config* config, const Dataset* train_data, const ObjectiveFunction* objective_function, int num_tree_per_iteration) : need_re_bagging_(false) { config_ = config; train_data_ = train_data; @@ -172,6 +177,6 @@ class BAGGING : public SampleStrategy { bool need_re_bagging_; }; -} // namespace LightGBM +} // namespace LightGBM -#endif // LIGHTGBM_SAMPLE_STRATEGY_BAGGING_HPP_ \ No newline at end of file +#endif // LIGHTGBM_SAMPLE_STRATEGY_BAGGING_HPP_ diff --git a/src/boosting/goss.hpp b/src/boosting/goss.hpp index 77bcbfb2591d..53decd7c84a5 100644 --- a/src/boosting/goss.hpp +++ b/src/boosting/goss.hpp @@ -1,9 +1,16 @@ +/*! + * Copyright (c) 2021 Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. See LICENSE file in the project root for license information. + */ + #ifndef LIGHTGBM_SAMPLE_STRATEGY_GOSS_HPP_ #define LIGHTGBM_SAMPLE_STRATEGY_GOSS_HPP_ #include #include +#include +#include namespace LightGBM { @@ -25,7 +32,7 @@ class GOSS : public SampleStrategy { if (iter < static_cast(1.0f / config_->learning_rate)) { return; } auto left_cnt = bagging_runner_.Run( num_data_, - [=](int, data_size_t cur_start, data_size_t cur_cnt, data_size_t* left, + [=](int, data_size_t cur_start, data_size_t cur_cnt, data_size_t* left, data_size_t*) { data_size_t cur_left_count = 0; cur_left_count = Helper(cur_start, cur_cnt, left, gradients, hessians); @@ -70,7 +77,7 @@ class GOSS : public SampleStrategy { is_use_subset_ = true; } // flag to not bagging first - bag_data_cnt_ = num_data_; + bag_data_cnt_ = num_data_; } void ResetBaggingConfig(const Config* config, bool is_change_dataset) override { @@ -144,9 +151,9 @@ class GOSS : public SampleStrategy { } return cur_left_cnt; } - + }; -} // namespace LightGBM +} // namespace LightGBM -#endif // LIGHTGBM_SAMPLE_STRATEGY_GOSS_HPP_ +#endif // LIGHTGBM_SAMPLE_STRATEGY_GOSS_HPP_ diff --git a/src/boosting/rf.hpp b/src/boosting/rf.hpp index 20b80c025c18..96638697f1ca 100644 --- a/src/boosting/rf.hpp +++ b/src/boosting/rf.hpp @@ -106,7 +106,7 @@ class RF : public GBDT { const bool is_use_subset = data_sample_strategy_->is_use_subset(); const data_size_t bag_data_cnt = data_sample_strategy_->bag_data_cnt(); const std::vector>& bag_data_indices = data_sample_strategy_->bag_data_indices(); - + CHECK_EQ(gradients, nullptr); CHECK_EQ(hessians, nullptr); diff --git a/src/boosting/sample_strategy.cpp b/src/boosting/sample_strategy.cpp index 64aa50540ce1..8e005dbc9c22 100644 --- a/src/boosting/sample_strategy.cpp +++ b/src/boosting/sample_strategy.cpp @@ -1,3 +1,8 @@ +/*! + * Copyright (c) 2021 Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. See LICENSE file in the project root for license information. + */ + #include #include "goss.hpp" #include "bagging.hpp" @@ -20,4 +25,4 @@ SampleStrategy* SampleStrategy::CreateSampleStrategy(const Config* config, const } } -} // namespace LightGBM +} // namespace LightGBM From 22ad1c873b165584b1717319928677a7c4352305 Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Wed, 8 Dec 2021 07:15:38 +0000 Subject: [PATCH 18/84] rerun parameter_generator.py --- docs/Parameters.rst | 6 ++++++ src/io/config_auto.cpp | 2 +- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/docs/Parameters.rst b/docs/Parameters.rst index 5faa9af9fd31..da796bb0c0a0 100644 --- a/docs/Parameters.rst +++ b/docs/Parameters.rst @@ -139,6 +139,12 @@ Core Parameters - **Note**: internally, LightGBM uses ``gbdt`` mode for the first ``1 / learning_rate`` iterations +- ``data_sample_strategy`` :raw-html:`🔗︎`, default = ``bagging``, type = enum, options: ``bagging``, ``goss`` + + - ``bagging``, Randomly Bagging Sampling + + - ``goss``, Gradient-based One-Side Sampling + - ``data`` :raw-html:`🔗︎`, default = ``""``, type = string, aliases: ``train``, ``train_data``, ``train_data_file``, ``data_filename`` - path of training data, LightGBM will train from this data diff --git a/src/io/config_auto.cpp b/src/io/config_auto.cpp index 18225c55a2fc..ca8a00f2249a 100644 --- a/src/io/config_auto.cpp +++ b/src/io/config_auto.cpp @@ -185,6 +185,7 @@ const std::unordered_set& Config::parameter_set() { "task", "objective", "boosting", + "data_sample_strategy", "data", "valid", "num_iterations", @@ -312,7 +313,6 @@ const std::unordered_set& Config::parameter_set() { "gpu_device_id", "gpu_use_dp", "num_gpu", - "data_sample_strategy" }); return params; } From e64ad6fe2b8ac8347fc18306ce0bdc873aed0b6f Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Wed, 8 Dec 2021 07:21:44 +0000 Subject: [PATCH 19/84] update config_auto.cpp --- src/io/config_auto.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/io/config_auto.cpp b/src/io/config_auto.cpp index 4c772d3a10e6..46097a4741e6 100644 --- a/src/io/config_auto.cpp +++ b/src/io/config_auto.cpp @@ -764,6 +764,7 @@ const std::string Config::DumpAliases() { str_buf << "\"task\": [\"task_type\"], "; str_buf << "\"objective\": [\"objective_type\", \"app\", \"application\", \"loss\"], "; str_buf << "\"boosting\": [\"boosting_type\", \"boost\"], "; + str_buf << "\"data_sample_strategy\": [], "; str_buf << "\"data\": [\"train\", \"train_data\", \"train_data_file\", \"data_filename\"], "; str_buf << "\"valid\": [\"test\", \"valid_data\", \"valid_data_file\", \"test_data\", \"test_data_file\", \"valid_filenames\"], "; str_buf << "\"num_iterations\": [\"num_iteration\", \"n_iter\", \"num_tree\", \"num_trees\", \"num_round\", \"num_rounds\", \"nrounds\", \"num_boost_round\", \"n_estimators\", \"max_iter\"], "; From 8dec6306ab688aab04741b0f8b77ab3ddf102910 Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Wed, 8 Dec 2021 07:22:45 +0000 Subject: [PATCH 20/84] delete redundant blank line --- src/boosting/goss.hpp | 1 - 1 file changed, 1 deletion(-) diff --git a/src/boosting/goss.hpp b/src/boosting/goss.hpp index 53decd7c84a5..401a25ec2c6d 100644 --- a/src/boosting/goss.hpp +++ b/src/boosting/goss.hpp @@ -151,7 +151,6 @@ class GOSS : public SampleStrategy { } return cur_left_cnt; } - }; } // namespace LightGBM From aa63de8df8c22dbb608a7555c07bbbf48e8a01aa Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Wed, 8 Dec 2021 09:33:03 +0000 Subject: [PATCH 21/84] update num_data_ when train_data_ is updated set gradients and hessians when GOSS --- include/LightGBM/sample_strategy.h | 1 + src/boosting/gbdt.cpp | 3 +++ 2 files changed, 4 insertions(+) diff --git a/include/LightGBM/sample_strategy.h b/include/LightGBM/sample_strategy.h index 3bfb37b2c78f..816826464b00 100644 --- a/include/LightGBM/sample_strategy.h +++ b/include/LightGBM/sample_strategy.h @@ -45,6 +45,7 @@ class SampleStrategy { void UpdateTrainingData(const Dataset* train_data) { train_data_ = train_data; + num_data_ = train_data->num_data(); } virtual bool IsHessianChange() const = 0; diff --git a/src/boosting/gbdt.cpp b/src/boosting/gbdt.cpp index 7f428720e734..4b2bb5b35679 100644 --- a/src/boosting/gbdt.cpp +++ b/src/boosting/gbdt.cpp @@ -307,12 +307,15 @@ bool GBDT::TrainOneIter(const score_t* gradients, const score_t* hessians) { // use customized objective function CHECK(hessians != nullptr && objective_function_ == nullptr); if (config_->boosting == std::string("goss") || config_->data_sample_strategy == std::string("goss")) { + // need to copy customized gradients when using GOSS int64_t total_size = static_cast(num_data_) * num_tree_per_iteration_; #pragma omp parallel for schedule(static) for (int64_t i = 0; i < total_size; ++i) { gradients_[i] = gradients[i]; hessians_[i] = hessians[i]; } + gradients = gradients_.data(); + hessians = hessians_.data(); } } // bagging logic From 6405361e6b5631c794ee00203a4b068f899c71f1 Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Wed, 8 Dec 2021 10:44:17 +0000 Subject: [PATCH 22/84] check bagging_freq is not zero --- include/LightGBM/sample_strategy.h | 2 +- src/boosting/bagging.hpp | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/include/LightGBM/sample_strategy.h b/include/LightGBM/sample_strategy.h index 816826464b00..e64c2e193725 100644 --- a/include/LightGBM/sample_strategy.h +++ b/include/LightGBM/sample_strategy.h @@ -37,7 +37,7 @@ class SampleStrategy { data_size_t bag_data_cnt() const { return bag_data_cnt_; } - std::vector>& bag_data_indices() {return bag_data_indices_;} + std::vector>& bag_data_indices() { return bag_data_indices_; } void UpdateObjectiveFunction(const ObjectiveFunction* objective_function) { objective_function_ = objective_function; diff --git a/src/boosting/bagging.hpp b/src/boosting/bagging.hpp index f897c1af3e3a..04661349f49d 100644 --- a/src/boosting/bagging.hpp +++ b/src/boosting/bagging.hpp @@ -24,6 +24,9 @@ class BAGGING : public SampleStrategy { void Bagging(int iter, TreeLearner* tree_learner, score_t* gradients, score_t* hessians) override { Common::FunctionTimer fun_timer("GBDT::Bagging", global_timer); // if need bagging + if (bag_data_cnt_ < num_data_ && config_->bagging_freq == 0) { + Log::Fatal("error !!! bag_data_cnt_ = %d, num_data_ = %d, config_->bagging_freq = %d", bag_data_cnt_, num_data_, config_->bagging_freq); + } if ((bag_data_cnt_ < num_data_ && iter % config_->bagging_freq == 0) || need_re_bagging_) { need_re_bagging_ = false; From 4d6362a0b6fe3720a0ecfb5a7b69ab2e794f3f6e Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Wed, 8 Dec 2021 12:34:24 +0000 Subject: [PATCH 23/84] reset config_ value merge ResetBaggingConfig and ResetGOSS --- include/LightGBM/sample_strategy.h | 4 +--- src/boosting/bagging.hpp | 5 ++--- src/boosting/gbdt.cpp | 9 +++------ src/boosting/goss.hpp | 26 +++++++++----------------- 4 files changed, 15 insertions(+), 29 deletions(-) diff --git a/include/LightGBM/sample_strategy.h b/include/LightGBM/sample_strategy.h index e64c2e193725..d3dbff9be3f7 100644 --- a/include/LightGBM/sample_strategy.h +++ b/include/LightGBM/sample_strategy.h @@ -29,9 +29,7 @@ class SampleStrategy { virtual void Bagging(int iter, TreeLearner* tree_learner, score_t* gradients, score_t* hessians) = 0; - virtual void ResetGOSS() = 0; - - virtual void ResetBaggingConfig(const Config* config, bool is_change_dataset) = 0; + virtual void ResetSampleConfig(const Config* config, bool is_change_dataset) = 0; bool is_use_subset() const { return is_use_subset_; } diff --git a/src/boosting/bagging.hpp b/src/boosting/bagging.hpp index 04661349f49d..bb13442661e5 100644 --- a/src/boosting/bagging.hpp +++ b/src/boosting/bagging.hpp @@ -63,9 +63,7 @@ class BAGGING : public SampleStrategy { std::ignore = hessians; } - void ResetGOSS() override {} - - void ResetBaggingConfig(const Config* config, bool is_change_dataset) override { + void ResetSampleConfig(const Config* config, bool is_change_dataset) override { need_resize_gradients_ = false; // if need bagging, create buffer data_size_t num_pos_data = 0; @@ -80,6 +78,7 @@ class BAGGING : public SampleStrategy { && config_->pos_bagging_fraction == config->pos_bagging_fraction && config_->neg_bagging_fraction == config->neg_bagging_fraction) { return; } + config_ = config; if (balance_bagging_cond) { balanced_bagging_ = true; bag_data_cnt_ = static_cast(num_pos_data * config->pos_bagging_fraction) diff --git a/src/boosting/gbdt.cpp b/src/boosting/gbdt.cpp index 4b2bb5b35679..44fcbd338106 100644 --- a/src/boosting/gbdt.cpp +++ b/src/boosting/gbdt.cpp @@ -124,8 +124,7 @@ void GBDT::Init(const Config* config, const Dataset* train_data, const Objective parser_config_str_ = train_data_->parser_config_str(); // if need bagging, create buffer - data_sample_strategy_->ResetBaggingConfig(config_.get(), true); - data_sample_strategy_->ResetGOSS(); + data_sample_strategy_->ResetSampleConfig(config_.get(), true); if (data_sample_strategy_->NeedResizeGradients()) { // resize gradient vectors to copy the customized gradients for goss or bagging with subset const size_t total_size = static_cast(num_data_) * num_tree_per_iteration_; @@ -679,7 +678,7 @@ void GBDT::ResetTrainingData(const Dataset* train_data, const ObjectiveFunction* parser_config_str_ = train_data_->parser_config_str(); tree_learner_->ResetTrainingData(train_data, is_constant_hessian_); - data_sample_strategy_->ResetBaggingConfig(config_.get(), true); + data_sample_strategy_->ResetSampleConfig(config_.get(), true); if (data_sample_strategy_->NeedResizeGradients()) { // resize gradient vectors to copy the customized gradients for goss or bagging with subset const size_t total_size = static_cast(num_data_) * num_tree_per_iteration_; @@ -689,7 +688,6 @@ void GBDT::ResetTrainingData(const Dataset* train_data, const ObjectiveFunction* } else { tree_learner_->ResetIsConstantHessian(is_constant_hessian_); } - data_sample_strategy_->ResetGOSS(); } void GBDT::ResetConfig(const Config* config) { @@ -709,7 +707,7 @@ void GBDT::ResetConfig(const Config* config) { tree_learner_->ResetConfig(new_config.get()); } if (train_data_ != nullptr) { - data_sample_strategy_->ResetBaggingConfig(new_config.get(), false); + data_sample_strategy_->ResetSampleConfig(new_config.get(), false); if (data_sample_strategy_->NeedResizeGradients()) { // resize gradient vectors to copy the customized gradients for goss or bagging with subset const size_t total_size = static_cast(num_data_) * num_tree_per_iteration_; @@ -733,7 +731,6 @@ void GBDT::ResetConfig(const Config* config) { } } config_.reset(new_config.release()); - data_sample_strategy_->ResetGOSS(); } } // namespace LightGBM diff --git a/src/boosting/goss.hpp b/src/boosting/goss.hpp index 401a25ec2c6d..e048226ffb0e 100644 --- a/src/boosting/goss.hpp +++ b/src/boosting/goss.hpp @@ -53,7 +53,15 @@ class GOSS : public SampleStrategy { } } - void ResetGOSS() override { + void ResetSampleConfig(const Config* config, bool /*is_change_dataset*/) override { + // Cannot use bagging in GOSS + config_ = config; + need_resize_gradients_ = false; + if (objective_function_ == nullptr) { + // resize gradient vectors to copy the customized gradients for goss + need_resize_gradients_ = true; + } + CHECK_LE(config_->top_rate + config_->other_rate, 1.0f); CHECK(config_->top_rate > 0.0f && config_->other_rate > 0.0f); if (config_->bagging_freq > 0 && config_->bagging_fraction != 1.0f) { @@ -80,22 +88,6 @@ class GOSS : public SampleStrategy { bag_data_cnt_ = num_data_; } - void ResetBaggingConfig(const Config* config, bool is_change_dataset) override { - // Cannot use bagging in GOSS - bag_data_cnt_ = num_data_; - bag_data_indices_.clear(); - bagging_runner_.ReSize(0); - is_use_subset_ = false; - need_resize_gradients_ = false; - if (objective_function_ == nullptr) { - // resize gradient vectors to copy the customized gradients for goss - need_resize_gradients_ = true; - } - // avoid warnings - std::ignore = config; - std::ignore = is_change_dataset; - } - bool IsHessianChange() const override { return true; } From 21ee487a64b83a67a71a543bdf44e4db88fd0074 Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Wed, 8 Dec 2021 12:50:53 +0000 Subject: [PATCH 24/84] remove useless check --- src/boosting/bagging.hpp | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/boosting/bagging.hpp b/src/boosting/bagging.hpp index bb13442661e5..7eb2ab881864 100644 --- a/src/boosting/bagging.hpp +++ b/src/boosting/bagging.hpp @@ -24,9 +24,6 @@ class BAGGING : public SampleStrategy { void Bagging(int iter, TreeLearner* tree_learner, score_t* gradients, score_t* hessians) override { Common::FunctionTimer fun_timer("GBDT::Bagging", global_timer); // if need bagging - if (bag_data_cnt_ < num_data_ && config_->bagging_freq == 0) { - Log::Fatal("error !!! bag_data_cnt_ = %d, num_data_ = %d, config_->bagging_freq = %d", bag_data_cnt_, num_data_, config_->bagging_freq); - } if ((bag_data_cnt_ < num_data_ && iter % config_->bagging_freq == 0) || need_re_bagging_) { need_re_bagging_ = false; From 634fab48109b570cb40c08a515f04691ea889255 Mon Sep 17 00:00:00 2001 From: Guangda Liu Date: Fri, 10 Dec 2021 15:14:55 +0000 Subject: [PATCH 25/84] add ttests in test_engine.py --- tests/python_package_test/test_engine.py | 133 +++++++++++++++++++++++ 1 file changed, 133 insertions(+) diff --git a/tests/python_package_test/test_engine.py b/tests/python_package_test/test_engine.py index b75c8945669c..8b849568fc22 100644 --- a/tests/python_package_test/test_engine.py +++ b/tests/python_package_test/test_engine.py @@ -3000,3 +3000,136 @@ def test_force_split_with_feature_fraction(tmp_path): for tree in tree_info: tree_structure = tree["tree_structure"] assert tree_structure['split_feature'] == 0 + + +def test_goss_boosting_and_strategy_equivalent(): + X, y = load_boston(return_X_y=True) + X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42) + lgb_train = lgb.Dataset(X_train, y_train) + lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train) + params1 = { + 'boosting': 'goss', + 'metric': 'l2', + 'verbose': -1 + } + evals_result1 = {} + gbm = lgb.train(params1, lgb_train, + num_boost_round=50, + valid_sets=lgb_eval, + verbose_eval=False, + evals_result=evals_result1) + params2 = { + 'data_sample_strategy': 'goss', + 'metric': 'l2', + 'verbose': -1 + } + evals_result2 = {} + gbm = lgb.train(params2, lgb_train, + num_boost_round=50, + valid_sets=lgb_eval, + verbose_eval=False, + evals_result=evals_result2) + np.testing.assert_allclose(evals_result1['valid_0']['l2'], evals_result2['valid_0']['l2']) + + +def test_sample_strategy_with_boosting(): + X, y = load_boston(return_X_y=True) + X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42) + lgb_train = lgb.Dataset(X_train, y_train) + lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train) + + params = { + 'boosting': 'dart', + 'data_sample_strategy': 'goss', + 'metric': 'l2', + 'verbose': -1 + } + evals_result = {} + gbm = lgb.train(params, lgb_train, + num_boost_round=50, + valid_sets=lgb_eval, + verbose_eval=False, + evals_result=evals_result) + ret = mean_squared_error(y_test, gbm.predict(X_test)) + assert ret < 14 + assert evals_result['valid_0']['l2'][-1] == pytest.approx(ret) + + params = { + 'boosting': 'gbdt', + 'data_sample_strategy': 'goss', + 'metric': 'l2', + 'verbose': -1 + } + evals_result = {} + gbm = lgb.train(params, lgb_train, + num_boost_round=50, + valid_sets=lgb_eval, + verbose_eval=False, + evals_result=evals_result) + ret = mean_squared_error(y_test, gbm.predict(X_test)) + assert ret < 12 + assert evals_result['valid_0']['l2'][-1] == pytest.approx(ret) + + params = { + 'boosting': 'goss', + 'data_sample_strategy': 'goss', + 'metric': 'l2', + 'verbose': -1 + } + evals_result = {} + gbm = lgb.train(params, lgb_train, + num_boost_round=50, + valid_sets=lgb_eval, + verbose_eval=False, + evals_result=evals_result) + ret = mean_squared_error(y_test, gbm.predict(X_test)) + assert ret < 12 + assert evals_result['valid_0']['l2'][-1] == pytest.approx(ret) + + params = { + 'boosting': 'dart', + 'data_sample_strategy': 'bagging', + 'metric': 'l2', + 'verbose': -1 + } + evals_result = {} + gbm = lgb.train(params, lgb_train, + num_boost_round=50, + valid_sets=lgb_eval, + verbose_eval=False, + evals_result=evals_result) + ret = mean_squared_error(y_test, gbm.predict(X_test)) + assert ret < 12 + assert evals_result['valid_0']['l2'][-1] == pytest.approx(ret) + + params = { + 'boosting': 'gbdt', + 'data_sample_strategy': 'bagging', + 'metric': 'l2', + 'verbose': -1 + } + evals_result = {} + gbm = lgb.train(params, lgb_train, + num_boost_round=50, + valid_sets=lgb_eval, + verbose_eval=False, + evals_result=evals_result) + ret = mean_squared_error(y_test, gbm.predict(X_test)) + assert ret < 7 + assert evals_result['valid_0']['l2'][-1] == pytest.approx(ret) + + params = { + 'boosting': 'goss', + 'data_sample_strategy': 'bagging', + 'metric': 'l2', + 'verbose': -1 + } + evals_result = {} + gbm = lgb.train(params, lgb_train, + num_boost_round=50, + valid_sets=lgb_eval, + verbose_eval=False, + evals_result=evals_result) + ret = mean_squared_error(y_test, gbm.predict(X_test)) + assert ret < 12 + assert evals_result['valid_0']['l2'][-1] == pytest.approx(ret) From a68fc252c35eccf7d5bd0924ff16cb7631c6caac Mon Sep 17 00:00:00 2001 From: Guangda Liu Date: Sat, 11 Dec 2021 09:44:22 +0000 Subject: [PATCH 26/84] remove whitespace in blank line --- tests/python_package_test/test_engine.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/python_package_test/test_engine.py b/tests/python_package_test/test_engine.py index 8b849568fc22..22aab0507aa5 100644 --- a/tests/python_package_test/test_engine.py +++ b/tests/python_package_test/test_engine.py @@ -3037,7 +3037,7 @@ def test_sample_strategy_with_boosting(): X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42) lgb_train = lgb.Dataset(X_train, y_train) lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train) - + params = { 'boosting': 'dart', 'data_sample_strategy': 'goss', @@ -3053,7 +3053,7 @@ def test_sample_strategy_with_boosting(): ret = mean_squared_error(y_test, gbm.predict(X_test)) assert ret < 14 assert evals_result['valid_0']['l2'][-1] == pytest.approx(ret) - + params = { 'boosting': 'gbdt', 'data_sample_strategy': 'goss', @@ -3101,7 +3101,7 @@ def test_sample_strategy_with_boosting(): ret = mean_squared_error(y_test, gbm.predict(X_test)) assert ret < 12 assert evals_result['valid_0']['l2'][-1] == pytest.approx(ret) - + params = { 'boosting': 'gbdt', 'data_sample_strategy': 'bagging', From ac387b3a66a0b5c2cfced620cd39f8e738e211b3 Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Fri, 7 Jan 2022 07:50:58 +0000 Subject: [PATCH 27/84] remove arguments verbose_eval and evals_result --- tests/python_package_test/test_engine.py | 24 ++++++++---------------- 1 file changed, 8 insertions(+), 16 deletions(-) diff --git a/tests/python_package_test/test_engine.py b/tests/python_package_test/test_engine.py index e1777ad1d2bd..08c8cb985995 100644 --- a/tests/python_package_test/test_engine.py +++ b/tests/python_package_test/test_engine.py @@ -3240,8 +3240,7 @@ def test_goss_boosting_and_strategy_equivalent(): gbm = lgb.train(params1, lgb_train, num_boost_round=50, valid_sets=lgb_eval, - verbose_eval=False, - evals_result=evals_result1) + callbacks=[lgb.record_evaluation(evals_result1)]) params2 = { 'data_sample_strategy': 'goss', 'metric': 'l2', @@ -3251,8 +3250,7 @@ def test_goss_boosting_and_strategy_equivalent(): gbm = lgb.train(params2, lgb_train, num_boost_round=50, valid_sets=lgb_eval, - verbose_eval=False, - evals_result=evals_result2) + callbacks=[lgb.record_evaluation(evals_result2)]) np.testing.assert_allclose(evals_result1['valid_0']['l2'], evals_result2['valid_0']['l2']) @@ -3272,8 +3270,7 @@ def test_sample_strategy_with_boosting(): gbm = lgb.train(params, lgb_train, num_boost_round=50, valid_sets=lgb_eval, - verbose_eval=False, - evals_result=evals_result) + callbacks=[lgb.record_evaluation(evals_result)]) ret = mean_squared_error(y_test, gbm.predict(X_test)) assert ret < 14 assert evals_result['valid_0']['l2'][-1] == pytest.approx(ret) @@ -3288,8 +3285,7 @@ def test_sample_strategy_with_boosting(): gbm = lgb.train(params, lgb_train, num_boost_round=50, valid_sets=lgb_eval, - verbose_eval=False, - evals_result=evals_result) + callbacks=[lgb.record_evaluation(evals_result)]) ret = mean_squared_error(y_test, gbm.predict(X_test)) assert ret < 12 assert evals_result['valid_0']['l2'][-1] == pytest.approx(ret) @@ -3304,8 +3300,7 @@ def test_sample_strategy_with_boosting(): gbm = lgb.train(params, lgb_train, num_boost_round=50, valid_sets=lgb_eval, - verbose_eval=False, - evals_result=evals_result) + callbacks=[lgb.record_evaluation(evals_result)]) ret = mean_squared_error(y_test, gbm.predict(X_test)) assert ret < 12 assert evals_result['valid_0']['l2'][-1] == pytest.approx(ret) @@ -3320,8 +3315,7 @@ def test_sample_strategy_with_boosting(): gbm = lgb.train(params, lgb_train, num_boost_round=50, valid_sets=lgb_eval, - verbose_eval=False, - evals_result=evals_result) + callbacks=[lgb.record_evaluation(evals_result)]) ret = mean_squared_error(y_test, gbm.predict(X_test)) assert ret < 12 assert evals_result['valid_0']['l2'][-1] == pytest.approx(ret) @@ -3336,8 +3330,7 @@ def test_sample_strategy_with_boosting(): gbm = lgb.train(params, lgb_train, num_boost_round=50, valid_sets=lgb_eval, - verbose_eval=False, - evals_result=evals_result) + callbacks=[lgb.record_evaluation(evals_result)]) ret = mean_squared_error(y_test, gbm.predict(X_test)) assert ret < 7 assert evals_result['valid_0']['l2'][-1] == pytest.approx(ret) @@ -3352,8 +3345,7 @@ def test_sample_strategy_with_boosting(): gbm = lgb.train(params, lgb_train, num_boost_round=50, valid_sets=lgb_eval, - verbose_eval=False, - evals_result=evals_result) + callbacks=[lgb.record_evaluation(evals_result)]) ret = mean_squared_error(y_test, gbm.predict(X_test)) assert ret < 12 assert evals_result['valid_0']['l2'][-1] == pytest.approx(ret) From 6e94059690804502f6b4c6320c8da0e9f8698730 Mon Sep 17 00:00:00 2001 From: GuangdaLiu <90019144+GuangdaLiu@users.noreply.github.com> Date: Tue, 11 Jan 2022 19:21:50 +0800 Subject: [PATCH 28/84] Update tests/python_package_test/test_engine.py reduce num_boost_round Co-authored-by: James Lamb --- tests/python_package_test/test_engine.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/python_package_test/test_engine.py b/tests/python_package_test/test_engine.py index 08c8cb985995..240ce5433928 100644 --- a/tests/python_package_test/test_engine.py +++ b/tests/python_package_test/test_engine.py @@ -3248,7 +3248,7 @@ def test_goss_boosting_and_strategy_equivalent(): } evals_result2 = {} gbm = lgb.train(params2, lgb_train, - num_boost_round=50, + num_boost_round=10, valid_sets=lgb_eval, callbacks=[lgb.record_evaluation(evals_result2)]) np.testing.assert_allclose(evals_result1['valid_0']['l2'], evals_result2['valid_0']['l2']) From 0fe6dc8dd0d80df0701c8def2c497e89438b8f76 Mon Sep 17 00:00:00 2001 From: GuangdaLiu <90019144+GuangdaLiu@users.noreply.github.com> Date: Tue, 11 Jan 2022 19:22:26 +0800 Subject: [PATCH 29/84] Update tests/python_package_test/test_engine.py reduce num_boost_round Co-authored-by: James Lamb --- tests/python_package_test/test_engine.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/python_package_test/test_engine.py b/tests/python_package_test/test_engine.py index 240ce5433928..0916135d9eb8 100644 --- a/tests/python_package_test/test_engine.py +++ b/tests/python_package_test/test_engine.py @@ -3268,7 +3268,7 @@ def test_sample_strategy_with_boosting(): } evals_result = {} gbm = lgb.train(params, lgb_train, - num_boost_round=50, + num_boost_round=10, valid_sets=lgb_eval, callbacks=[lgb.record_evaluation(evals_result)]) ret = mean_squared_error(y_test, gbm.predict(X_test)) From ab39d21aac6252d0534ce643dce062203c9c469d Mon Sep 17 00:00:00 2001 From: GuangdaLiu <90019144+GuangdaLiu@users.noreply.github.com> Date: Tue, 11 Jan 2022 19:22:38 +0800 Subject: [PATCH 30/84] Update tests/python_package_test/test_engine.py reduce num_boost_round Co-authored-by: James Lamb --- tests/python_package_test/test_engine.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/python_package_test/test_engine.py b/tests/python_package_test/test_engine.py index 0916135d9eb8..c55b94f79fbb 100644 --- a/tests/python_package_test/test_engine.py +++ b/tests/python_package_test/test_engine.py @@ -3328,7 +3328,7 @@ def test_sample_strategy_with_boosting(): } evals_result = {} gbm = lgb.train(params, lgb_train, - num_boost_round=50, + num_boost_round=10, valid_sets=lgb_eval, callbacks=[lgb.record_evaluation(evals_result)]) ret = mean_squared_error(y_test, gbm.predict(X_test)) From 9978c3c3feae653831b951a1fb5c4f2c73c5ce32 Mon Sep 17 00:00:00 2001 From: GuangdaLiu <90019144+GuangdaLiu@users.noreply.github.com> Date: Tue, 11 Jan 2022 19:22:46 +0800 Subject: [PATCH 31/84] Update tests/python_package_test/test_engine.py reduce num_boost_round Co-authored-by: James Lamb --- tests/python_package_test/test_engine.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/python_package_test/test_engine.py b/tests/python_package_test/test_engine.py index c55b94f79fbb..468b149eeeb0 100644 --- a/tests/python_package_test/test_engine.py +++ b/tests/python_package_test/test_engine.py @@ -3343,7 +3343,7 @@ def test_sample_strategy_with_boosting(): } evals_result = {} gbm = lgb.train(params, lgb_train, - num_boost_round=50, + num_boost_round=10, valid_sets=lgb_eval, callbacks=[lgb.record_evaluation(evals_result)]) ret = mean_squared_error(y_test, gbm.predict(X_test)) From 7ba17504edceb9f52c4d4d7dfa3175bf7d8e4439 Mon Sep 17 00:00:00 2001 From: GuangdaLiu <90019144+GuangdaLiu@users.noreply.github.com> Date: Tue, 11 Jan 2022 19:22:54 +0800 Subject: [PATCH 32/84] Update tests/python_package_test/test_engine.py reduce num_boost_round Co-authored-by: James Lamb --- tests/python_package_test/test_engine.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/python_package_test/test_engine.py b/tests/python_package_test/test_engine.py index 468b149eeeb0..00a0a29e65e2 100644 --- a/tests/python_package_test/test_engine.py +++ b/tests/python_package_test/test_engine.py @@ -3313,7 +3313,7 @@ def test_sample_strategy_with_boosting(): } evals_result = {} gbm = lgb.train(params, lgb_train, - num_boost_round=50, + num_boost_round=10, valid_sets=lgb_eval, callbacks=[lgb.record_evaluation(evals_result)]) ret = mean_squared_error(y_test, gbm.predict(X_test)) From ecaaabe8fd9d6650e02f53746a609f02b2f014a8 Mon Sep 17 00:00:00 2001 From: GuangdaLiu <90019144+GuangdaLiu@users.noreply.github.com> Date: Tue, 11 Jan 2022 19:23:03 +0800 Subject: [PATCH 33/84] Update tests/python_package_test/test_engine.py reduce num_boost_round Co-authored-by: James Lamb --- tests/python_package_test/test_engine.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/python_package_test/test_engine.py b/tests/python_package_test/test_engine.py index 00a0a29e65e2..1f9d031e7559 100644 --- a/tests/python_package_test/test_engine.py +++ b/tests/python_package_test/test_engine.py @@ -3298,7 +3298,7 @@ def test_sample_strategy_with_boosting(): } evals_result = {} gbm = lgb.train(params, lgb_train, - num_boost_round=50, + num_boost_round=10, valid_sets=lgb_eval, callbacks=[lgb.record_evaluation(evals_result)]) ret = mean_squared_error(y_test, gbm.predict(X_test)) From c1f1b91a713cfc7ba736e5228af50b4a6839914b Mon Sep 17 00:00:00 2001 From: GuangdaLiu <90019144+GuangdaLiu@users.noreply.github.com> Date: Tue, 11 Jan 2022 19:24:39 +0800 Subject: [PATCH 34/84] Update src/boosting/sample_strategy.cpp modify warning about setting goss as `boosting_type` Co-authored-by: James Lamb --- src/boosting/sample_strategy.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/boosting/sample_strategy.cpp b/src/boosting/sample_strategy.cpp index 8e005dbc9c22..0d8d581efb01 100644 --- a/src/boosting/sample_strategy.cpp +++ b/src/boosting/sample_strategy.cpp @@ -13,7 +13,7 @@ SampleStrategy* SampleStrategy::CreateSampleStrategy(const Config* config, const bool use_goss_as_boosting = config->boosting == std::string("goss"); bool use_goss_as_strategy = config->data_sample_strategy == std::string("goss"); if (use_goss_as_boosting) { - Log::Warning("Setting goss as `boosting_type` is NOT suggested. Please set `data_sample_strategy = goss` in your config file."); + Log::Warning("Setting goss as `boosting_type` is not recommended. Please set `data_sample_strategy = goss` instead."); if (use_goss_as_strategy) { Log::Warning("Both `boosting_type` and `data_sample_strategy` are set as GOSS. Only one time of sampling will be conducted. Please check and modify your config file."); } From 006de872210d046f4097b365e5507c365c9eb24b Mon Sep 17 00:00:00 2001 From: Guangda Liu Date: Tue, 11 Jan 2022 12:15:39 +0000 Subject: [PATCH 35/84] Update tests/python_package_test/test_engine.py replace load_boston() with make_regression() remove value checks of mean_squared_error in test_sample_strategy_with_boosting() --- tests/python_package_test/test_engine.py | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) diff --git a/tests/python_package_test/test_engine.py b/tests/python_package_test/test_engine.py index 1f9d031e7559..7a5356747194 100644 --- a/tests/python_package_test/test_engine.py +++ b/tests/python_package_test/test_engine.py @@ -12,7 +12,7 @@ import psutil import pytest from scipy.sparse import csr_matrix, isspmatrix_csc, isspmatrix_csr -from sklearn.datasets import load_svmlight_file, make_multilabel_classification +from sklearn.datasets import load_svmlight_file, make_multilabel_classification, make_regression from sklearn.metrics import average_precision_score, log_loss, mean_absolute_error, mean_squared_error, roc_auc_score from sklearn.model_selection import GroupKFold, TimeSeriesSplit, train_test_split @@ -3227,7 +3227,7 @@ def test_force_split_with_feature_fraction(tmp_path): def test_goss_boosting_and_strategy_equivalent(): - X, y = load_boston(return_X_y=True) + X, y = make_regression(n_samples=10_000, n_features=10, n_informative=5) X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42) lgb_train = lgb.Dataset(X_train, y_train) lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train) @@ -3238,7 +3238,7 @@ def test_goss_boosting_and_strategy_equivalent(): } evals_result1 = {} gbm = lgb.train(params1, lgb_train, - num_boost_round=50, + num_boost_round=10, valid_sets=lgb_eval, callbacks=[lgb.record_evaluation(evals_result1)]) params2 = { @@ -3255,7 +3255,7 @@ def test_goss_boosting_and_strategy_equivalent(): def test_sample_strategy_with_boosting(): - X, y = load_boston(return_X_y=True) + X, y = make_regression(n_samples=10_000, n_features=10, n_informative=5) X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42) lgb_train = lgb.Dataset(X_train, y_train) lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train) @@ -3272,7 +3272,6 @@ def test_sample_strategy_with_boosting(): valid_sets=lgb_eval, callbacks=[lgb.record_evaluation(evals_result)]) ret = mean_squared_error(y_test, gbm.predict(X_test)) - assert ret < 14 assert evals_result['valid_0']['l2'][-1] == pytest.approx(ret) params = { @@ -3283,11 +3282,10 @@ def test_sample_strategy_with_boosting(): } evals_result = {} gbm = lgb.train(params, lgb_train, - num_boost_round=50, + num_boost_round=10, valid_sets=lgb_eval, callbacks=[lgb.record_evaluation(evals_result)]) ret = mean_squared_error(y_test, gbm.predict(X_test)) - assert ret < 12 assert evals_result['valid_0']['l2'][-1] == pytest.approx(ret) params = { @@ -3302,7 +3300,6 @@ def test_sample_strategy_with_boosting(): valid_sets=lgb_eval, callbacks=[lgb.record_evaluation(evals_result)]) ret = mean_squared_error(y_test, gbm.predict(X_test)) - assert ret < 12 assert evals_result['valid_0']['l2'][-1] == pytest.approx(ret) params = { @@ -3317,7 +3314,6 @@ def test_sample_strategy_with_boosting(): valid_sets=lgb_eval, callbacks=[lgb.record_evaluation(evals_result)]) ret = mean_squared_error(y_test, gbm.predict(X_test)) - assert ret < 12 assert evals_result['valid_0']['l2'][-1] == pytest.approx(ret) params = { @@ -3332,7 +3328,6 @@ def test_sample_strategy_with_boosting(): valid_sets=lgb_eval, callbacks=[lgb.record_evaluation(evals_result)]) ret = mean_squared_error(y_test, gbm.predict(X_test)) - assert ret < 7 assert evals_result['valid_0']['l2'][-1] == pytest.approx(ret) params = { @@ -3347,5 +3342,4 @@ def test_sample_strategy_with_boosting(): valid_sets=lgb_eval, callbacks=[lgb.record_evaluation(evals_result)]) ret = mean_squared_error(y_test, gbm.predict(X_test)) - assert ret < 12 assert evals_result['valid_0']['l2'][-1] == pytest.approx(ret) From 20ddcb4c84a2129d7551739f090b4b59cc567693 Mon Sep 17 00:00:00 2001 From: Guangda Liu Date: Sat, 15 Jan 2022 08:44:18 +0000 Subject: [PATCH 36/84] Update tests/python_package_test/test_engine.py add value checks of mean_squared_error in test_sample_strategy_with_boosting() --- tests/python_package_test/test_engine.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tests/python_package_test/test_engine.py b/tests/python_package_test/test_engine.py index 7a5356747194..030b0a51c809 100644 --- a/tests/python_package_test/test_engine.py +++ b/tests/python_package_test/test_engine.py @@ -3272,6 +3272,7 @@ def test_sample_strategy_with_boosting(): valid_sets=lgb_eval, callbacks=[lgb.record_evaluation(evals_result)]) ret = mean_squared_error(y_test, gbm.predict(X_test)) + assert ret < 10000 assert evals_result['valid_0']['l2'][-1] == pytest.approx(ret) params = { @@ -3286,6 +3287,7 @@ def test_sample_strategy_with_boosting(): valid_sets=lgb_eval, callbacks=[lgb.record_evaluation(evals_result)]) ret = mean_squared_error(y_test, gbm.predict(X_test)) + assert ret < 10000 assert evals_result['valid_0']['l2'][-1] == pytest.approx(ret) params = { @@ -3300,6 +3302,7 @@ def test_sample_strategy_with_boosting(): valid_sets=lgb_eval, callbacks=[lgb.record_evaluation(evals_result)]) ret = mean_squared_error(y_test, gbm.predict(X_test)) + assert ret < 10000 assert evals_result['valid_0']['l2'][-1] == pytest.approx(ret) params = { @@ -3314,6 +3317,7 @@ def test_sample_strategy_with_boosting(): valid_sets=lgb_eval, callbacks=[lgb.record_evaluation(evals_result)]) ret = mean_squared_error(y_test, gbm.predict(X_test)) + assert ret < 10000 assert evals_result['valid_0']['l2'][-1] == pytest.approx(ret) params = { @@ -3328,6 +3332,7 @@ def test_sample_strategy_with_boosting(): valid_sets=lgb_eval, callbacks=[lgb.record_evaluation(evals_result)]) ret = mean_squared_error(y_test, gbm.predict(X_test)) + assert ret < 10000 assert evals_result['valid_0']['l2'][-1] == pytest.approx(ret) params = { @@ -3342,4 +3347,5 @@ def test_sample_strategy_with_boosting(): valid_sets=lgb_eval, callbacks=[lgb.record_evaluation(evals_result)]) ret = mean_squared_error(y_test, gbm.predict(X_test)) + assert ret < 10000 assert evals_result['valid_0']['l2'][-1] == pytest.approx(ret) From 73d7db76785a5a4b13bacf4c1026dbeb6ba62b64 Mon Sep 17 00:00:00 2001 From: Guangda Liu Date: Sat, 15 Jan 2022 08:49:48 +0000 Subject: [PATCH 37/84] Modify warnning about using goss as boosting type --- src/boosting/sample_strategy.cpp | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/src/boosting/sample_strategy.cpp b/src/boosting/sample_strategy.cpp index 0d8d581efb01..059eba9ecc42 100644 --- a/src/boosting/sample_strategy.cpp +++ b/src/boosting/sample_strategy.cpp @@ -13,10 +13,7 @@ SampleStrategy* SampleStrategy::CreateSampleStrategy(const Config* config, const bool use_goss_as_boosting = config->boosting == std::string("goss"); bool use_goss_as_strategy = config->data_sample_strategy == std::string("goss"); if (use_goss_as_boosting) { - Log::Warning("Setting goss as `boosting_type` is not recommended. Please set `data_sample_strategy = goss` instead."); - if (use_goss_as_strategy) { - Log::Warning("Both `boosting_type` and `data_sample_strategy` are set as GOSS. Only one time of sampling will be conducted. Please check and modify your config file."); - } + Log::Warning("Found boosting_type=goss. For backwards compatibility reasons, LightGBM interprets this as boosting_type=gbdt, data_sample_strategy=goss. To suppress this warning, set data_sample_strategy=goss instead."); } if (use_goss_as_boosting || use_goss_as_strategy) { return new GOSS(config, train_data, num_tree_per_iteration); From beaaf19f711d88e9d31ac1ada5dc63d062638926 Mon Sep 17 00:00:00 2001 From: Guangda Liu Date: Tue, 18 Jan 2022 06:27:42 +0000 Subject: [PATCH 38/84] Update tests/python_package_test/test_engine.py add random_state=42 for make_regression() reduce the threshold of mean_square_error --- tests/python_package_test/test_engine.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/tests/python_package_test/test_engine.py b/tests/python_package_test/test_engine.py index 030b0a51c809..0b654be4ce08 100644 --- a/tests/python_package_test/test_engine.py +++ b/tests/python_package_test/test_engine.py @@ -3227,7 +3227,7 @@ def test_force_split_with_feature_fraction(tmp_path): def test_goss_boosting_and_strategy_equivalent(): - X, y = make_regression(n_samples=10_000, n_features=10, n_informative=5) + X, y = make_regression(n_samples=10_000, n_features=10, n_informative=5, random_state=42) X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42) lgb_train = lgb.Dataset(X_train, y_train) lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train) @@ -3255,7 +3255,7 @@ def test_goss_boosting_and_strategy_equivalent(): def test_sample_strategy_with_boosting(): - X, y = make_regression(n_samples=10_000, n_features=10, n_informative=5) + X, y = make_regression(n_samples=10_000, n_features=10, n_informative=5, random_state=42) X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42) lgb_train = lgb.Dataset(X_train, y_train) lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train) @@ -3272,7 +3272,7 @@ def test_sample_strategy_with_boosting(): valid_sets=lgb_eval, callbacks=[lgb.record_evaluation(evals_result)]) ret = mean_squared_error(y_test, gbm.predict(X_test)) - assert ret < 10000 + assert ret < 4000 assert evals_result['valid_0']['l2'][-1] == pytest.approx(ret) params = { @@ -3287,7 +3287,7 @@ def test_sample_strategy_with_boosting(): valid_sets=lgb_eval, callbacks=[lgb.record_evaluation(evals_result)]) ret = mean_squared_error(y_test, gbm.predict(X_test)) - assert ret < 10000 + assert ret < 4000 assert evals_result['valid_0']['l2'][-1] == pytest.approx(ret) params = { @@ -3302,7 +3302,7 @@ def test_sample_strategy_with_boosting(): valid_sets=lgb_eval, callbacks=[lgb.record_evaluation(evals_result)]) ret = mean_squared_error(y_test, gbm.predict(X_test)) - assert ret < 10000 + assert ret < 4000 assert evals_result['valid_0']['l2'][-1] == pytest.approx(ret) params = { @@ -3317,7 +3317,7 @@ def test_sample_strategy_with_boosting(): valid_sets=lgb_eval, callbacks=[lgb.record_evaluation(evals_result)]) ret = mean_squared_error(y_test, gbm.predict(X_test)) - assert ret < 10000 + assert ret < 4000 assert evals_result['valid_0']['l2'][-1] == pytest.approx(ret) params = { @@ -3332,7 +3332,7 @@ def test_sample_strategy_with_boosting(): valid_sets=lgb_eval, callbacks=[lgb.record_evaluation(evals_result)]) ret = mean_squared_error(y_test, gbm.predict(X_test)) - assert ret < 10000 + assert ret < 4000 assert evals_result['valid_0']['l2'][-1] == pytest.approx(ret) params = { @@ -3347,5 +3347,5 @@ def test_sample_strategy_with_boosting(): valid_sets=lgb_eval, callbacks=[lgb.record_evaluation(evals_result)]) ret = mean_squared_error(y_test, gbm.predict(X_test)) - assert ret < 10000 + assert ret < 4000 assert evals_result['valid_0']['l2'][-1] == pytest.approx(ret) From 1dbbee4cbd3ff78750ca7bd5f66fcddf8fbf0502 Mon Sep 17 00:00:00 2001 From: shiyu1994 Date: Tue, 15 Mar 2022 16:32:00 +0800 Subject: [PATCH 39/84] Update src/boosting/sample_strategy.cpp Co-authored-by: James Lamb --- src/boosting/sample_strategy.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/boosting/sample_strategy.cpp b/src/boosting/sample_strategy.cpp index 059eba9ecc42..a49a08a0a36f 100644 --- a/src/boosting/sample_strategy.cpp +++ b/src/boosting/sample_strategy.cpp @@ -13,7 +13,7 @@ SampleStrategy* SampleStrategy::CreateSampleStrategy(const Config* config, const bool use_goss_as_boosting = config->boosting == std::string("goss"); bool use_goss_as_strategy = config->data_sample_strategy == std::string("goss"); if (use_goss_as_boosting) { - Log::Warning("Found boosting_type=goss. For backwards compatibility reasons, LightGBM interprets this as boosting_type=gbdt, data_sample_strategy=goss. To suppress this warning, set data_sample_strategy=goss instead."); + Log::Warning("Found boosting=goss. For backwards compatibility reasons, LightGBM interprets this as boosting=gbdt, data_sample_strategy=goss. To suppress this warning, set data_sample_strategy=goss instead."); } if (use_goss_as_boosting || use_goss_as_strategy) { return new GOSS(config, train_data, num_tree_per_iteration); From cddfcd69674bf19d6b897852cdf3e6eb9d3d8ce6 Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Tue, 15 Mar 2022 09:08:08 +0000 Subject: [PATCH 40/84] remove goss from boosting types in documentation --- docs/Parameters.rst | 4 +--- include/LightGBM/config.h | 3 +-- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/docs/Parameters.rst b/docs/Parameters.rst index d51c0e6001ee..61355c08b880 100644 --- a/docs/Parameters.rst +++ b/docs/Parameters.rst @@ -127,7 +127,7 @@ Core Parameters - label should be ``int`` type, and larger number represents the higher relevance (e.g. 0:bad, 1:fair, 2:good, 3:perfect) -- ``boosting`` :raw-html:`🔗︎`, default = ``gbdt``, type = enum, options: ``gbdt``, ``rf``, ``dart``, ``goss``, aliases: ``boosting_type``, ``boost`` +- ``boosting`` :raw-html:`🔗︎`, default = ``gbdt``, type = enum, options: ``gbdt``, ``rf``, ``dart``, aliases: ``boosting_type``, ``boost`` - ``gbdt``, traditional Gradient Boosting Decision Tree, aliases: ``gbrt`` @@ -135,8 +135,6 @@ Core Parameters - ``dart``, `Dropouts meet Multiple Additive Regression Trees `__ - - ``goss``, Gradient-based One-Side Sampling - - **Note**: internally, LightGBM uses ``gbdt`` mode for the first ``1 / learning_rate`` iterations - ``data_sample_strategy`` :raw-html:`🔗︎`, default = ``bagging``, type = enum, options: ``bagging``, ``goss`` diff --git a/include/LightGBM/config.h b/include/LightGBM/config.h index 086c0f453966..8ebcc62b51bc 100644 --- a/include/LightGBM/config.h +++ b/include/LightGBM/config.h @@ -141,11 +141,10 @@ struct Config { // [doc-only] // type = enum // alias = boosting_type, boost - // options = gbdt, rf, dart, goss + // options = gbdt, rf, dart // desc = ``gbdt``, traditional Gradient Boosting Decision Tree, aliases: ``gbrt`` // desc = ``rf``, Random Forest, aliases: ``random_forest`` // desc = ``dart``, `Dropouts meet Multiple Additive Regression Trees `__ - // desc = ``goss``, Gradient-based One-Side Sampling // descl2 = **Note**: internally, LightGBM uses ``gbdt`` mode for the first ``1 / learning_rate`` iterations std::string boosting = "gbdt"; From df523f31b237845bfec2fb85be7dc93155d40cec Mon Sep 17 00:00:00 2001 From: shiyu1994 Date: Tue, 15 Mar 2022 18:38:08 +0800 Subject: [PATCH 41/84] Update src/boosting/bagging.hpp Co-authored-by: Nikita Titov --- src/boosting/bagging.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/boosting/bagging.hpp b/src/boosting/bagging.hpp index 7eb2ab881864..de4728e1e6de 100644 --- a/src/boosting/bagging.hpp +++ b/src/boosting/bagging.hpp @@ -3,8 +3,8 @@ * Licensed under the MIT License. See LICENSE file in the project root for license information. */ -#ifndef LIGHTGBM_SAMPLE_STRATEGY_BAGGING_HPP_ -#define LIGHTGBM_SAMPLE_STRATEGY_BAGGING_HPP_ +#ifndef LIGHTGBM_BOOSTING_BAGGING_HPP_ +#define LIGHTGBM_BOOSTING_BAGGING_HPP_ namespace LightGBM { From 85e7fd161de495100d734accde6d9590f5249a39 Mon Sep 17 00:00:00 2001 From: shiyu1994 Date: Tue, 15 Mar 2022 18:38:42 +0800 Subject: [PATCH 42/84] Update src/boosting/bagging.hpp Co-authored-by: Nikita Titov --- src/boosting/bagging.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/boosting/bagging.hpp b/src/boosting/bagging.hpp index de4728e1e6de..2c02c6285372 100644 --- a/src/boosting/bagging.hpp +++ b/src/boosting/bagging.hpp @@ -178,4 +178,4 @@ class BAGGING : public SampleStrategy { } // namespace LightGBM -#endif // LIGHTGBM_SAMPLE_STRATEGY_BAGGING_HPP_ +#endif // LIGHTGBM_BOOSTING_BAGGING_HPP_ From efb5e286861063bb78f992b768a93e38811ee9f9 Mon Sep 17 00:00:00 2001 From: shiyu1994 Date: Tue, 15 Mar 2022 18:38:56 +0800 Subject: [PATCH 43/84] Update src/boosting/goss.hpp Co-authored-by: Nikita Titov --- src/boosting/goss.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/boosting/goss.hpp b/src/boosting/goss.hpp index e048226ffb0e..30d463564e45 100644 --- a/src/boosting/goss.hpp +++ b/src/boosting/goss.hpp @@ -3,8 +3,8 @@ * Licensed under the MIT License. See LICENSE file in the project root for license information. */ -#ifndef LIGHTGBM_SAMPLE_STRATEGY_GOSS_HPP_ -#define LIGHTGBM_SAMPLE_STRATEGY_GOSS_HPP_ +#ifndef LIGHTGBM_BOOSTING_GOSS_HPP_ +#define LIGHTGBM_BOOSTING_GOSS_HPP_ #include #include From beb9f8c71d6b38a9a70741a603f1b3cda61d27af Mon Sep 17 00:00:00 2001 From: shiyu1994 Date: Tue, 15 Mar 2022 18:39:06 +0800 Subject: [PATCH 44/84] Update src/boosting/goss.hpp Co-authored-by: Nikita Titov --- src/boosting/goss.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/boosting/goss.hpp b/src/boosting/goss.hpp index 30d463564e45..0265a1b3656d 100644 --- a/src/boosting/goss.hpp +++ b/src/boosting/goss.hpp @@ -147,4 +147,4 @@ class GOSS : public SampleStrategy { } // namespace LightGBM -#endif // LIGHTGBM_SAMPLE_STRATEGY_GOSS_HPP_ +#endif // LIGHTGBM_BOOSTING_GOSS_HPP_ From 4bdcdd5ba53b2b8599858dd25f7880b3dbdb0aa1 Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Sat, 19 Mar 2022 03:05:19 +0000 Subject: [PATCH 45/84] rename GOSS with GOSSStrategy --- include/LightGBM/config.h | 3 ++- python-package/lightgbm/sklearn.py | 1 - src/boosting/bagging.hpp | 11 ++++------- src/boosting/goss.hpp | 6 +++--- src/boosting/sample_strategy.cpp | 4 ++-- 5 files changed, 11 insertions(+), 14 deletions(-) diff --git a/include/LightGBM/config.h b/include/LightGBM/config.h index 8ebcc62b51bc..d7277ea25a6f 100644 --- a/include/LightGBM/config.h +++ b/include/LightGBM/config.h @@ -153,6 +153,7 @@ struct Config { // options = bagging, goss // desc = ``bagging``, Randomly Bagging Sampling // desc = ``goss``, Gradient-based One-Side Sampling + // desc = ``bagging`` is only effective when bagging_freq > 0 and bagging_fraction < 1.0 std::string data_sample_strategy = "bagging"; // alias = train, train_data, train_data_file, data_filename @@ -253,7 +254,7 @@ struct Config { // desc = enabling this is recommended when: // descl2 = the number of data points is large, and the total number of bins is relatively small // descl2 = ``num_threads`` is relatively small, e.g. ``<= 16`` - // descl2 = you want to use small ``bagging_fraction`` or ``goss`` boosting to speed up + // descl2 = you want to use small ``bagging_fraction`` or ``goss`` sample strategy to speed up // desc = **Note**: setting this to ``true`` will double the memory cost for Dataset object. If you have not enough memory, you can try setting ``force_col_wise=true`` // desc = **Note**: when both ``force_col_wise`` and ``force_row_wise`` are ``false``, LightGBM will firstly try them both, and then use the faster one. To remove the overhead of testing set the faster one to ``true`` manually // desc = **Note**: this parameter cannot be used at the same time with ``force_col_wise``, choose only one of them diff --git a/python-package/lightgbm/sklearn.py b/python-package/lightgbm/sklearn.py index 7ebba0bc962c..f0f141fc33d6 100644 --- a/python-package/lightgbm/sklearn.py +++ b/python-package/lightgbm/sklearn.py @@ -373,7 +373,6 @@ def __init__( boosting_type : str, optional (default='gbdt') 'gbdt', traditional Gradient Boosting Decision Tree. 'dart', Dropouts meet Multiple Additive Regression Trees. - 'goss', Gradient-based One-Side Sampling. 'rf', Random Forest. num_leaves : int, optional (default=31) Maximum tree leaves for base learners. diff --git a/src/boosting/bagging.hpp b/src/boosting/bagging.hpp index 7eb2ab881864..e0f5c5806f4a 100644 --- a/src/boosting/bagging.hpp +++ b/src/boosting/bagging.hpp @@ -8,9 +8,9 @@ namespace LightGBM { -class BAGGING : public SampleStrategy { +class BaggingSampleStrategy : public SampleStrategy { public: - BAGGING(const Config* config, const Dataset* train_data, const ObjectiveFunction* objective_function, int num_tree_per_iteration) + BaggingSampleStrategy(const Config* config, const Dataset* train_data, const ObjectiveFunction* objective_function, int num_tree_per_iteration) : need_re_bagging_(false) { config_ = config; train_data_ = train_data; @@ -19,9 +19,9 @@ class BAGGING : public SampleStrategy { num_tree_per_iteration_ = num_tree_per_iteration; } - ~BAGGING() {} + ~BaggingSampleStrategy() {} - void Bagging(int iter, TreeLearner* tree_learner, score_t* gradients, score_t* hessians) override { + void Bagging(int iter, TreeLearner* tree_learner, score_t* /*gradients*/, score_t* /*hessians*/) override { Common::FunctionTimer fun_timer("GBDT::Bagging", global_timer); // if need bagging if ((bag_data_cnt_ < num_data_ && iter % config_->bagging_freq == 0) || @@ -55,9 +55,6 @@ class BAGGING : public SampleStrategy { bag_data_cnt_); } } - // avoid warnings - std::ignore = gradients; - std::ignore = hessians; } void ResetSampleConfig(const Config* config, bool is_change_dataset) override { diff --git a/src/boosting/goss.hpp b/src/boosting/goss.hpp index e048226ffb0e..a0766cc16673 100644 --- a/src/boosting/goss.hpp +++ b/src/boosting/goss.hpp @@ -14,16 +14,16 @@ namespace LightGBM { -class GOSS : public SampleStrategy { +class GOSSStrategy : public SampleStrategy { public: - GOSS(const Config* config, const Dataset* train_data, int num_tree_per_iteration) { + GOSSStrategy(const Config* config, const Dataset* train_data, int num_tree_per_iteration) { config_ = config; train_data_ = train_data; num_tree_per_iteration_ = num_tree_per_iteration; num_data_ = train_data->num_data(); } - ~GOSS() { + ~GOSSStrategy() { } void Bagging(int iter, TreeLearner* tree_learner, score_t* gradients, score_t* hessians) override { diff --git a/src/boosting/sample_strategy.cpp b/src/boosting/sample_strategy.cpp index a49a08a0a36f..77c992e166d3 100644 --- a/src/boosting/sample_strategy.cpp +++ b/src/boosting/sample_strategy.cpp @@ -16,9 +16,9 @@ SampleStrategy* SampleStrategy::CreateSampleStrategy(const Config* config, const Log::Warning("Found boosting=goss. For backwards compatibility reasons, LightGBM interprets this as boosting=gbdt, data_sample_strategy=goss. To suppress this warning, set data_sample_strategy=goss instead."); } if (use_goss_as_boosting || use_goss_as_strategy) { - return new GOSS(config, train_data, num_tree_per_iteration); + return new GOSSStrategy(config, train_data, num_tree_per_iteration); } else { - return new BAGGING(config, train_data, objective_function, num_tree_per_iteration); + return new BaggingSampleStrategy(config, train_data, objective_function, num_tree_per_iteration); } } From 3291d7e8da1e29f4be5f8a77549b4d11e34916de Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Sat, 19 Mar 2022 03:05:52 +0000 Subject: [PATCH 46/84] update doc --- docs/Parameters.rst | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/docs/Parameters.rst b/docs/Parameters.rst index 61355c08b880..52c84634c0a2 100644 --- a/docs/Parameters.rst +++ b/docs/Parameters.rst @@ -143,6 +143,8 @@ Core Parameters - ``goss``, Gradient-based One-Side Sampling + - ``bagging`` is only effective when bagging_freq > 0 and bagging_fraction < 1.0 + - ``data`` :raw-html:`🔗︎`, default = ``""``, type = string, aliases: ``train``, ``train_data``, ``train_data_file``, ``data_filename`` - path of training data, LightGBM will train from this data @@ -268,7 +270,7 @@ Learning Control Parameters - ``num_threads`` is relatively small, e.g. ``<= 16`` - - you want to use small ``bagging_fraction`` or ``goss`` boosting to speed up + - you want to use small ``bagging_fraction`` or ``goss`` sample strategy to speed up - **Note**: setting this to ``true`` will double the memory cost for Dataset object. If you have not enough memory, you can try setting ``force_col_wise=true`` From 93a87629c65677bee68f71b0db259897e1611ae5 Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Sat, 19 Mar 2022 03:43:54 +0000 Subject: [PATCH 47/84] address comments --- docs/Development-Guide.rst | 2 +- src/boosting/bagging.hpp | 3 ++- src/boosting/sample_strategy.cpp | 6 +++++- 3 files changed, 8 insertions(+), 3 deletions(-) diff --git a/docs/Development-Guide.rst b/docs/Development-Guide.rst index c8b30173da79..cea967dd8dde 100644 --- a/docs/Development-Guide.rst +++ b/docs/Development-Guide.rst @@ -19,7 +19,7 @@ Important Classes +-------------------------+----------------------------------------------------------------------------------------+ | ``Bin`` | Data structure used for storing feature discrete values (converted from float values) | +-------------------------+----------------------------------------------------------------------------------------+ -| ``Boosting`` | Boosting interface (GBDT, DART, GOSS, etc.) | +| ``Boosting`` | Boosting interface (GBDT, DART, etc.) | +-------------------------+----------------------------------------------------------------------------------------+ | ``Config`` | Stores parameters and configurations | +-------------------------+----------------------------------------------------------------------------------------+ diff --git a/src/boosting/bagging.hpp b/src/boosting/bagging.hpp index e617121d5cd1..aec37bb8d5ba 100644 --- a/src/boosting/bagging.hpp +++ b/src/boosting/bagging.hpp @@ -92,7 +92,8 @@ class BaggingSampleStrategy : public SampleStrategy { (static_cast(bag_data_cnt_) / num_data_) / config->bagging_freq; is_use_subset_ = false; const int group_threshold_usesubset = 100; - if (average_bag_rate <= 0.5 + const double average_bag_rate_threshold = 0.5; + if (average_bag_rate <= average_bag_rate_threshold && (train_data_->num_feature_groups() < group_threshold_usesubset)) { if (tmp_subset_ == nullptr || is_change_dataset) { tmp_subset_.reset(new Dataset(bag_data_cnt_)); diff --git a/src/boosting/sample_strategy.cpp b/src/boosting/sample_strategy.cpp index 77c992e166d3..e991efd01045 100644 --- a/src/boosting/sample_strategy.cpp +++ b/src/boosting/sample_strategy.cpp @@ -9,7 +9,11 @@ namespace LightGBM { -SampleStrategy* SampleStrategy::CreateSampleStrategy(const Config* config, const Dataset* train_data, const ObjectiveFunction* objective_function, int num_tree_per_iteration) { +SampleStrategy* SampleStrategy::CreateSampleStrategy( + const Config* config, + const Dataset* train_data, + const ObjectiveFunction* objective_function, + int num_tree_per_iteration) { bool use_goss_as_boosting = config->boosting == std::string("goss"); bool use_goss_as_strategy = config->data_sample_strategy == std::string("goss"); if (use_goss_as_boosting) { From 7e1167aca598a69838f9e17be31b4107968c00d1 Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Sat, 19 Mar 2022 03:48:22 +0000 Subject: [PATCH 48/84] fix table in doc --- docs/Development-Guide.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/Development-Guide.rst b/docs/Development-Guide.rst index cea967dd8dde..6c4819e45209 100644 --- a/docs/Development-Guide.rst +++ b/docs/Development-Guide.rst @@ -19,7 +19,7 @@ Important Classes +-------------------------+----------------------------------------------------------------------------------------+ | ``Bin`` | Data structure used for storing feature discrete values (converted from float values) | +-------------------------+----------------------------------------------------------------------------------------+ -| ``Boosting`` | Boosting interface (GBDT, DART, etc.) | +| ``Boosting`` | Boosting interface (GBDT, DART, etc.) | +-------------------------+----------------------------------------------------------------------------------------+ | ``Config`` | Stores parameters and configurations | +-------------------------+----------------------------------------------------------------------------------------+ From a1b6bd18b6cc27458fa93afdadcaf691781a44d6 Mon Sep 17 00:00:00 2001 From: shiyu1994 Date: Mon, 21 Mar 2022 11:29:05 +0800 Subject: [PATCH 49/84] Update include/LightGBM/config.h Co-authored-by: Nikita Titov --- include/LightGBM/config.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/LightGBM/config.h b/include/LightGBM/config.h index d7277ea25a6f..ec6e04cbf81b 100644 --- a/include/LightGBM/config.h +++ b/include/LightGBM/config.h @@ -152,8 +152,8 @@ struct Config { // type = enum // options = bagging, goss // desc = ``bagging``, Randomly Bagging Sampling + // descl2 = **Note**: ``bagging`` is only effective when ``bagging_freq > 0`` and ``bagging_fraction < 1.0`` // desc = ``goss``, Gradient-based One-Side Sampling - // desc = ``bagging`` is only effective when bagging_freq > 0 and bagging_fraction < 1.0 std::string data_sample_strategy = "bagging"; // alias = train, train_data, train_data_file, data_filename From 4499113b2d9e4b5b5b403728c28adfbd3437ed6b Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Mon, 21 Mar 2022 03:38:26 +0000 Subject: [PATCH 50/84] update documentation --- docs/Parameters.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/Parameters.rst b/docs/Parameters.rst index 52c84634c0a2..3c9284e92737 100644 --- a/docs/Parameters.rst +++ b/docs/Parameters.rst @@ -141,9 +141,9 @@ Core Parameters - ``bagging``, Randomly Bagging Sampling - - ``goss``, Gradient-based One-Side Sampling + - **Note**: ``bagging`` is only effective when ``bagging_freq > 0`` and ``bagging_fraction < 1.0`` - - ``bagging`` is only effective when bagging_freq > 0 and bagging_fraction < 1.0 + - ``goss``, Gradient-based One-Side Sampling - ``data`` :raw-html:`🔗︎`, default = ``""``, type = string, aliases: ``train``, ``train_data``, ``train_data_file``, ``data_filename`` From 3a2235e5a3f75a5ed3da467f6d76ab046feec4a7 Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Mon, 21 Mar 2022 06:22:18 +0000 Subject: [PATCH 51/84] update test case --- tests/python_package_test/test_engine.py | 34 +++++++++++++++--------- 1 file changed, 22 insertions(+), 12 deletions(-) diff --git a/tests/python_package_test/test_engine.py b/tests/python_package_test/test_engine.py index c865060ef507..1728db1fe751 100644 --- a/tests/python_package_test/test_engine.py +++ b/tests/python_package_test/test_engine.py @@ -18,7 +18,7 @@ import lightgbm as lgb -from .utils import (load_boston, load_breast_cancer, load_digits, load_iris, make_synthetic_regression, +from utils import (load_boston, load_breast_cancer, load_digits, load_iris, make_synthetic_regression, sklearn_multiclass_custom_objective, softmax) decreasing_generator = itertools.count(0, -1) @@ -3311,24 +3311,32 @@ def test_goss_boosting_and_strategy_equivalent(): params1 = { 'boosting': 'goss', 'metric': 'l2', - 'verbose': -1 + 'verbose': -1, + 'bagging_seed': 0, + 'learning_rate': 0.05, + 'num_threads': 1, + 'force_row_wise': True, } evals_result1 = {} - gbm = lgb.train(params1, lgb_train, - num_boost_round=10, - valid_sets=lgb_eval, - callbacks=[lgb.record_evaluation(evals_result1)]) + lgb.train(params1, lgb_train, + num_boost_round=10, + valid_sets=lgb_eval, + callbacks=[lgb.record_evaluation(evals_result1)]) params2 = { 'data_sample_strategy': 'goss', 'metric': 'l2', - 'verbose': -1 + 'verbose': -1, + 'bagging_seed': 0, + 'learning_rate': 0.05, + 'num_threads': 1, + 'force_row_wise': True, } evals_result2 = {} - gbm = lgb.train(params2, lgb_train, - num_boost_round=10, - valid_sets=lgb_eval, - callbacks=[lgb.record_evaluation(evals_result2)]) - np.testing.assert_allclose(evals_result1['valid_0']['l2'], evals_result2['valid_0']['l2']) + lgb.train(params2, lgb_train, + num_boost_round=10, + valid_sets=lgb_eval, + callbacks=[lgb.record_evaluation(evals_result2)]) + np.testing.assert_equal(evals_result1['valid_0']['l2'], evals_result2['valid_0']['l2']) def test_sample_strategy_with_boosting(): @@ -3577,3 +3585,5 @@ def test_boost_from_average_with_single_leaf_trees(): preds = model.predict(X) mean_preds = np.mean(preds) assert y.min() <= mean_preds <= y.max() + +test_goss_boosting_and_strategy_equivalent() From 1e4c11a8f3f5d5f969627387a76c44d924f28464 Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Mon, 21 Mar 2022 06:33:58 +0000 Subject: [PATCH 52/84] revert useless change in test_engine.py --- tests/python_package_test/test_engine.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/tests/python_package_test/test_engine.py b/tests/python_package_test/test_engine.py index 1728db1fe751..89395cca221f 100644 --- a/tests/python_package_test/test_engine.py +++ b/tests/python_package_test/test_engine.py @@ -18,7 +18,7 @@ import lightgbm as lgb -from utils import (load_boston, load_breast_cancer, load_digits, load_iris, make_synthetic_regression, +from .utils import (load_boston, load_breast_cancer, load_digits, load_iris, make_synthetic_regression, sklearn_multiclass_custom_objective, softmax) decreasing_generator = itertools.count(0, -1) @@ -3585,5 +3585,3 @@ def test_boost_from_average_with_single_leaf_trees(): preds = model.predict(X) mean_preds = np.mean(preds) assert y.min() <= mean_preds <= y.max() - -test_goss_boosting_and_strategy_equivalent() From e72fb01f1de343b22579288d1291f0467356a30f Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Tue, 7 Jun 2022 09:02:41 +0000 Subject: [PATCH 53/84] add tests for evaluation results in test_sample_strategy_with_boosting --- tests/python_package_test/test_engine.py | 53 ++++++++++-------------- 1 file changed, 23 insertions(+), 30 deletions(-) diff --git a/tests/python_package_test/test_engine.py b/tests/python_package_test/test_engine.py index 871d6a976267..19c8e7289ae8 100644 --- a/tests/python_package_test/test_engine.py +++ b/tests/python_package_test/test_engine.py @@ -3482,9 +3482,9 @@ def test_sample_strategy_with_boosting(): num_boost_round=10, valid_sets=lgb_eval, callbacks=[lgb.record_evaluation(evals_result)]) - ret = mean_squared_error(y_test, gbm.predict(X_test)) - assert ret < 4000 - assert evals_result['valid_0']['l2'][-1] == pytest.approx(ret) + ret1 = mean_squared_error(y_test, gbm.predict(X_test)) + assert ret1 == pytest.approx(3149.393862) + assert evals_result['valid_0']['l2'][-1] == pytest.approx(ret1) params = { 'boosting': 'gbdt', @@ -3497,9 +3497,9 @@ def test_sample_strategy_with_boosting(): num_boost_round=10, valid_sets=lgb_eval, callbacks=[lgb.record_evaluation(evals_result)]) - ret = mean_squared_error(y_test, gbm.predict(X_test)) - assert ret < 4000 - assert evals_result['valid_0']['l2'][-1] == pytest.approx(ret) + ret2 = mean_squared_error(y_test, gbm.predict(X_test)) + assert ret2 == pytest.approx(2547.715968) + assert evals_result['valid_0']['l2'][-1] == pytest.approx(ret2) params = { 'boosting': 'goss', @@ -3512,13 +3512,18 @@ def test_sample_strategy_with_boosting(): num_boost_round=10, valid_sets=lgb_eval, callbacks=[lgb.record_evaluation(evals_result)]) - ret = mean_squared_error(y_test, gbm.predict(X_test)) - assert ret < 4000 - assert evals_result['valid_0']['l2'][-1] == pytest.approx(ret) + ret3 = mean_squared_error(y_test, gbm.predict(X_test)) + assert ret3 == pytest.approx(2547.715968) + assert evals_result['valid_0']['l2'][-1] == pytest.approx(ret3) + + assert ret1 != ret2 + assert ret2 == ret3 params = { 'boosting': 'dart', 'data_sample_strategy': 'bagging', + 'bagging_freq': 1, + 'bagging_fraction': 0.5, 'metric': 'l2', 'verbose': -1 } @@ -3527,13 +3532,15 @@ def test_sample_strategy_with_boosting(): num_boost_round=10, valid_sets=lgb_eval, callbacks=[lgb.record_evaluation(evals_result)]) - ret = mean_squared_error(y_test, gbm.predict(X_test)) - assert ret < 4000 - assert evals_result['valid_0']['l2'][-1] == pytest.approx(ret) + ret4 = mean_squared_error(y_test, gbm.predict(X_test)) + assert ret4 == pytest.approx(3134.866931) + assert evals_result['valid_0']['l2'][-1] == pytest.approx(ret4) params = { 'boosting': 'gbdt', 'data_sample_strategy': 'bagging', + 'bagging_freq': 1, + 'bagging_fraction': 0.5, 'metric': 'l2', 'verbose': -1 } @@ -3542,24 +3549,10 @@ def test_sample_strategy_with_boosting(): num_boost_round=10, valid_sets=lgb_eval, callbacks=[lgb.record_evaluation(evals_result)]) - ret = mean_squared_error(y_test, gbm.predict(X_test)) - assert ret < 4000 - assert evals_result['valid_0']['l2'][-1] == pytest.approx(ret) - - params = { - 'boosting': 'goss', - 'data_sample_strategy': 'bagging', - 'metric': 'l2', - 'verbose': -1 - } - evals_result = {} - gbm = lgb.train(params, lgb_train, - num_boost_round=10, - valid_sets=lgb_eval, - callbacks=[lgb.record_evaluation(evals_result)]) - ret = mean_squared_error(y_test, gbm.predict(X_test)) - assert ret < 4000 - assert evals_result['valid_0']['l2'][-1] == pytest.approx(ret) + ret5 = mean_squared_error(y_test, gbm.predict(X_test)) + assert ret5 == pytest.approx(2539.792378) + assert evals_result['valid_0']['l2'][-1] == pytest.approx(ret5) + assert ret4 != ret5 def test_record_evaluation_with_train(): From 05292ffbc45b722630eb0afc9757be9e6f6db345 Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Thu, 9 Jun 2022 06:58:18 +0000 Subject: [PATCH 54/84] include --- src/boosting/bagging.hpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/boosting/bagging.hpp b/src/boosting/bagging.hpp index a87b7a9c1814..bf26381dd1b7 100644 --- a/src/boosting/bagging.hpp +++ b/src/boosting/bagging.hpp @@ -6,6 +6,8 @@ #ifndef LIGHTGBM_BOOSTING_BAGGING_HPP_ #define LIGHTGBM_BOOSTING_BAGGING_HPP_ +#include + namespace LightGBM { class BaggingSampleStrategy : public SampleStrategy { From 6ec78125b6cf122d1bdbf6f3aa7b8c7ad9d34e30 Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Thu, 9 Jun 2022 06:59:31 +0000 Subject: [PATCH 55/84] change to assert_allclose in test_goss_boosting_and_strategy_equivalent --- tests/python_package_test/test_engine.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/python_package_test/test_engine.py b/tests/python_package_test/test_engine.py index 19c8e7289ae8..849b407afcbd 100644 --- a/tests/python_package_test/test_engine.py +++ b/tests/python_package_test/test_engine.py @@ -3463,6 +3463,7 @@ def test_goss_boosting_and_strategy_equivalent(): valid_sets=lgb_eval, callbacks=[lgb.record_evaluation(evals_result2)]) np.testing.assert_equal(evals_result1['valid_0']['l2'], evals_result2['valid_0']['l2']) + np.testing.assert_allclose(evals_result1['valid_0']['l2'], evals_result2['valid_0']['l2'], atol=1e-5) def test_sample_strategy_with_boosting(): From 9f749fa10a14b15665e4ecad60c6d8b14ddb2301 Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Thu, 9 Jun 2022 08:09:37 +0000 Subject: [PATCH 56/84] more tolerance in result checking, due to minor difference in results of gpu versions --- tests/python_package_test/test_engine.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tests/python_package_test/test_engine.py b/tests/python_package_test/test_engine.py index 849b407afcbd..3becadf6eb90 100644 --- a/tests/python_package_test/test_engine.py +++ b/tests/python_package_test/test_engine.py @@ -3484,7 +3484,7 @@ def test_sample_strategy_with_boosting(): valid_sets=lgb_eval, callbacks=[lgb.record_evaluation(evals_result)]) ret1 = mean_squared_error(y_test, gbm.predict(X_test)) - assert ret1 == pytest.approx(3149.393862) + assert ret1 == pytest.approx(3149.393862, abs=1.0) assert evals_result['valid_0']['l2'][-1] == pytest.approx(ret1) params = { @@ -3499,7 +3499,7 @@ def test_sample_strategy_with_boosting(): valid_sets=lgb_eval, callbacks=[lgb.record_evaluation(evals_result)]) ret2 = mean_squared_error(y_test, gbm.predict(X_test)) - assert ret2 == pytest.approx(2547.715968) + assert ret2 == pytest.approx(2547.715968, abs=1.0) assert evals_result['valid_0']['l2'][-1] == pytest.approx(ret2) params = { @@ -3514,7 +3514,7 @@ def test_sample_strategy_with_boosting(): valid_sets=lgb_eval, callbacks=[lgb.record_evaluation(evals_result)]) ret3 = mean_squared_error(y_test, gbm.predict(X_test)) - assert ret3 == pytest.approx(2547.715968) + assert ret3 == pytest.approx(2547.715968, abs=1.0) assert evals_result['valid_0']['l2'][-1] == pytest.approx(ret3) assert ret1 != ret2 @@ -3534,7 +3534,7 @@ def test_sample_strategy_with_boosting(): valid_sets=lgb_eval, callbacks=[lgb.record_evaluation(evals_result)]) ret4 = mean_squared_error(y_test, gbm.predict(X_test)) - assert ret4 == pytest.approx(3134.866931) + assert ret4 == pytest.approx(3134.866931, abs=1.0) assert evals_result['valid_0']['l2'][-1] == pytest.approx(ret4) params = { @@ -3551,7 +3551,7 @@ def test_sample_strategy_with_boosting(): valid_sets=lgb_eval, callbacks=[lgb.record_evaluation(evals_result)]) ret5 = mean_squared_error(y_test, gbm.predict(X_test)) - assert ret5 == pytest.approx(2539.792378) + assert ret5 == pytest.approx(2539.792378, abs=1.0) assert evals_result['valid_0']['l2'][-1] == pytest.approx(ret5) assert ret4 != ret5 From 808ccc609929a4d4427e5e5b0f5c8bb1d1da7b56 Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Thu, 9 Jun 2022 12:16:00 +0000 Subject: [PATCH 57/84] change == to np.testing.assert_allclose --- tests/python_package_test/test_engine.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/python_package_test/test_engine.py b/tests/python_package_test/test_engine.py index 3becadf6eb90..4b3650115875 100644 --- a/tests/python_package_test/test_engine.py +++ b/tests/python_package_test/test_engine.py @@ -3518,7 +3518,7 @@ def test_sample_strategy_with_boosting(): assert evals_result['valid_0']['l2'][-1] == pytest.approx(ret3) assert ret1 != ret2 - assert ret2 == ret3 + assert np.testing.assert_allclose(ret1, ret2) params = { 'boosting': 'dart', From 35f4eb50bfff279db557c7e410df6911c20db739 Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Mon, 13 Jun 2022 10:41:39 +0000 Subject: [PATCH 58/84] fix test case --- tests/python_package_test/test_engine.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/python_package_test/test_engine.py b/tests/python_package_test/test_engine.py index 4b3650115875..0a42de8f9f7f 100644 --- a/tests/python_package_test/test_engine.py +++ b/tests/python_package_test/test_engine.py @@ -3518,7 +3518,7 @@ def test_sample_strategy_with_boosting(): assert evals_result['valid_0']['l2'][-1] == pytest.approx(ret3) assert ret1 != ret2 - assert np.testing.assert_allclose(ret1, ret2) + np.testing.assert_allclose(ret2, ret3) params = { 'boosting': 'dart', From 7fe6a944fdc04f1d857f743adaf3b4a87e2804a6 Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Wed, 27 Jul 2022 15:06:41 +0000 Subject: [PATCH 59/84] set gpu_use_dp to true --- tests/python_package_test/test_engine.py | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/tests/python_package_test/test_engine.py b/tests/python_package_test/test_engine.py index 0a42de8f9f7f..92f9faa9b304 100644 --- a/tests/python_package_test/test_engine.py +++ b/tests/python_package_test/test_engine.py @@ -3476,7 +3476,8 @@ def test_sample_strategy_with_boosting(): 'boosting': 'dart', 'data_sample_strategy': 'goss', 'metric': 'l2', - 'verbose': -1 + 'verbose': -1, + 'gpu_use_dp': True } evals_result = {} gbm = lgb.train(params, lgb_train, @@ -3491,7 +3492,8 @@ def test_sample_strategy_with_boosting(): 'boosting': 'gbdt', 'data_sample_strategy': 'goss', 'metric': 'l2', - 'verbose': -1 + 'verbose': -1, + 'gpu_use_dp': True } evals_result = {} gbm = lgb.train(params, lgb_train, @@ -3506,7 +3508,8 @@ def test_sample_strategy_with_boosting(): 'boosting': 'goss', 'data_sample_strategy': 'goss', 'metric': 'l2', - 'verbose': -1 + 'verbose': -1, + 'gpu_use_dp': True } evals_result = {} gbm = lgb.train(params, lgb_train, @@ -3518,7 +3521,7 @@ def test_sample_strategy_with_boosting(): assert evals_result['valid_0']['l2'][-1] == pytest.approx(ret3) assert ret1 != ret2 - np.testing.assert_allclose(ret2, ret3) + assert ret2 == ret3 params = { 'boosting': 'dart', @@ -3526,7 +3529,8 @@ def test_sample_strategy_with_boosting(): 'bagging_freq': 1, 'bagging_fraction': 0.5, 'metric': 'l2', - 'verbose': -1 + 'verbose': -1, + 'gpu_use_dp': True } evals_result = {} gbm = lgb.train(params, lgb_train, @@ -3543,7 +3547,8 @@ def test_sample_strategy_with_boosting(): 'bagging_freq': 1, 'bagging_fraction': 0.5, 'metric': 'l2', - 'verbose': -1 + 'verbose': -1, + 'gpu_use_dp': True } evals_result = {} gbm = lgb.train(params, lgb_train, From 7f108181577cb86b036c3c924352feaa631f8e38 Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Wed, 27 Jul 2022 15:26:13 +0000 Subject: [PATCH 60/84] change --report to --report-level for rstcheck --- .ci/test.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.ci/test.sh b/.ci/test.sh index 0ba52ecbb998..d7e83cb48925 100755 --- a/.ci/test.sh +++ b/.ci/test.sh @@ -47,9 +47,9 @@ if [[ $TASK == "check-docs" ]] || [[ $TASK == "check-links" ]]; then rstcheck || exit -1 # check reStructuredText formatting cd $BUILD_DIRECTORY/python-package - rstcheck --report warning $(find . -type f -name "*.rst") || exit -1 + rstcheck --report-level warning $(find . -type f -name "*.rst") || exit -1 cd $BUILD_DIRECTORY/docs - rstcheck --report warning --ignore-directives=autoclass,autofunction,doxygenfile $(find . -type f -name "*.rst") || exit -1 + rstcheck --report-level warning --ignore-directives=autoclass,autofunction,doxygenfile $(find . -type f -name "*.rst") || exit -1 # build docs make html || exit -1 if [[ $TASK == "check-links" ]]; then From 755cb3a38c51cc5361e5ac7966d4af53e5fced3f Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Fri, 29 Jul 2022 03:47:46 +0000 Subject: [PATCH 61/84] use gpu_use_dp=true in test_goss_boosting_and_strategy_equivalent --- tests/python_package_test/test_engine.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/tests/python_package_test/test_engine.py b/tests/python_package_test/test_engine.py index 92f9faa9b304..99b646ae0919 100644 --- a/tests/python_package_test/test_engine.py +++ b/tests/python_package_test/test_engine.py @@ -1089,7 +1089,7 @@ def test_feature_name_with_non_ascii(): X_train = np.random.normal(size=(100, 4)) y_train = np.random.random(100) # This has non-ascii strings. - feature_names = [u'F_零', u'F_一', u'F_二', u'F_三'] + feature_names = [u'F_é›?', u'F_一', u'F_äº?', u'F_ä¸?'] params = {'verbose': -1} lgb_train = lgb.Dataset(X_train, y_train) @@ -3442,6 +3442,7 @@ def test_goss_boosting_and_strategy_equivalent(): 'learning_rate': 0.05, 'num_threads': 1, 'force_row_wise': True, + 'gpu_use_dp': True, } evals_result1 = {} lgb.train(params1, lgb_train, @@ -3456,6 +3457,7 @@ def test_goss_boosting_and_strategy_equivalent(): 'learning_rate': 0.05, 'num_threads': 1, 'force_row_wise': True, + 'gpu_use_dp': True, } evals_result2 = {} lgb.train(params2, lgb_train, @@ -3463,7 +3465,6 @@ def test_goss_boosting_and_strategy_equivalent(): valid_sets=lgb_eval, callbacks=[lgb.record_evaluation(evals_result2)]) np.testing.assert_equal(evals_result1['valid_0']['l2'], evals_result2['valid_0']['l2']) - np.testing.assert_allclose(evals_result1['valid_0']['l2'], evals_result2['valid_0']['l2'], atol=1e-5) def test_sample_strategy_with_boosting(): @@ -3477,7 +3478,7 @@ def test_sample_strategy_with_boosting(): 'data_sample_strategy': 'goss', 'metric': 'l2', 'verbose': -1, - 'gpu_use_dp': True + 'gpu_use_dp': True, } evals_result = {} gbm = lgb.train(params, lgb_train, @@ -3493,7 +3494,7 @@ def test_sample_strategy_with_boosting(): 'data_sample_strategy': 'goss', 'metric': 'l2', 'verbose': -1, - 'gpu_use_dp': True + 'gpu_use_dp': True, } evals_result = {} gbm = lgb.train(params, lgb_train, @@ -3509,7 +3510,7 @@ def test_sample_strategy_with_boosting(): 'data_sample_strategy': 'goss', 'metric': 'l2', 'verbose': -1, - 'gpu_use_dp': True + 'gpu_use_dp': True, } evals_result = {} gbm = lgb.train(params, lgb_train, @@ -3530,7 +3531,7 @@ def test_sample_strategy_with_boosting(): 'bagging_fraction': 0.5, 'metric': 'l2', 'verbose': -1, - 'gpu_use_dp': True + 'gpu_use_dp': True, } evals_result = {} gbm = lgb.train(params, lgb_train, @@ -3548,7 +3549,7 @@ def test_sample_strategy_with_boosting(): 'bagging_fraction': 0.5, 'metric': 'l2', 'verbose': -1, - 'gpu_use_dp': True + 'gpu_use_dp': True, } evals_result = {} gbm = lgb.train(params, lgb_train, From b431c2c9e751228d9293edea4b6134894ba41bc8 Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Fri, 29 Jul 2022 04:11:25 +0000 Subject: [PATCH 62/84] revert unexpected changes of non-ascii characters --- tests/python_package_test/test_engine.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/python_package_test/test_engine.py b/tests/python_package_test/test_engine.py index 05e2d1ad0e3e..57c83ee3f93c 100644 --- a/tests/python_package_test/test_engine.py +++ b/tests/python_package_test/test_engine.py @@ -1090,7 +1090,7 @@ def test_feature_name_with_non_ascii(): X_train = np.random.normal(size=(100, 4)) y_train = np.random.random(100) # This has non-ascii strings. - feature_names = [u'F_é›?', u'F_一', u'F_äº?', u'F_ä¸?'] + feature_names = [u'F_Áã', u'F_Ò»', u'F_¶þ', u'F_Èý'] params = {'verbose': -1} lgb_train = lgb.Dataset(X_train, y_train) From 43480d14fca741b2650339e01f0023cb2d2fdfc3 Mon Sep 17 00:00:00 2001 From: shiyu1994 Date: Fri, 29 Jul 2022 12:13:21 +0800 Subject: [PATCH 63/84] revert unexpected changes of non-ascii characters --- tests/python_package_test/test_engine.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/python_package_test/test_engine.py b/tests/python_package_test/test_engine.py index 57c83ee3f93c..cd1c776645b4 100644 --- a/tests/python_package_test/test_engine.py +++ b/tests/python_package_test/test_engine.py @@ -1090,7 +1090,7 @@ def test_feature_name_with_non_ascii(): X_train = np.random.normal(size=(100, 4)) y_train = np.random.random(100) # This has non-ascii strings. - feature_names = [u'F_Áã', u'F_Ò»', u'F_¶þ', u'F_Èý'] + feature_names = [u'F_零', u'F_一', u'F_二', u'F_三'] params = {'verbose': -1} lgb_train = lgb.Dataset(X_train, y_train) From 92971572b0db43d8c619a08925c82ca95b04c7d8 Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Tue, 16 Aug 2022 08:03:48 +0000 Subject: [PATCH 64/84] remove useless changes --- src/boosting/gbdt.cpp | 105 ------------------------------------------ src/boosting/goss.hpp | 1 + src/main.cpp | 10 +--- 3 files changed, 3 insertions(+), 113 deletions(-) diff --git a/src/boosting/gbdt.cpp b/src/boosting/gbdt.cpp index 89ed39271fc9..686ffca215d4 100644 --- a/src/boosting/gbdt.cpp +++ b/src/boosting/gbdt.cpp @@ -233,111 +233,6 @@ void GBDT::Boosting() { GetGradients(GetTrainingScore(&num_score), gradients_pointer_, hessians_pointer_); } -// <<<<<<< HEAD -// ======= -// data_size_t GBDT::BaggingHelper(data_size_t start, data_size_t cnt, data_size_t* buffer) { -// if (cnt <= 0) { -// return 0; -// } -// data_size_t cur_left_cnt = 0; -// data_size_t cur_right_pos = cnt; -// // random bagging, minimal unit is one record -// for (data_size_t i = 0; i < cnt; ++i) { -// auto cur_idx = start + i; -// if (bagging_rands_[cur_idx / bagging_rand_block_].NextFloat() < config_->bagging_fraction) { -// buffer[cur_left_cnt++] = cur_idx; -// } else { -// buffer[--cur_right_pos] = cur_idx; -// } -// } -// return cur_left_cnt; -// } - -// data_size_t GBDT::BalancedBaggingHelper(data_size_t start, data_size_t cnt, -// data_size_t* buffer) { -// if (cnt <= 0) { -// return 0; -// } -// auto label_ptr = train_data_->metadata().label(); -// data_size_t cur_left_cnt = 0; -// data_size_t cur_right_pos = cnt; -// // random bagging, minimal unit is one record -// for (data_size_t i = 0; i < cnt; ++i) { -// auto cur_idx = start + i; -// bool is_pos = label_ptr[start + i] > 0; -// bool is_in_bag = false; -// if (is_pos) { -// is_in_bag = bagging_rands_[cur_idx / bagging_rand_block_].NextFloat() < -// config_->pos_bagging_fraction; -// } else { -// is_in_bag = bagging_rands_[cur_idx / bagging_rand_block_].NextFloat() < -// config_->neg_bagging_fraction; -// } -// if (is_in_bag) { -// buffer[cur_left_cnt++] = cur_idx; -// } else { -// buffer[--cur_right_pos] = cur_idx; -// } -// } -// return cur_left_cnt; -// } - -// void GBDT::Bagging(int iter) { -// Common::FunctionTimer fun_timer("GBDT::Bagging", global_timer); -// // if need bagging -// if ((bag_data_cnt_ < num_data_ && iter % config_->bagging_freq == 0) || -// need_re_bagging_) { -// need_re_bagging_ = false; -// auto left_cnt = bagging_runner_.Run( -// num_data_, -// [=](int, data_size_t cur_start, data_size_t cur_cnt, data_size_t* left, -// data_size_t*) { -// data_size_t cur_left_count = 0; -// if (balanced_bagging_) { -// cur_left_count = -// BalancedBaggingHelper(cur_start, cur_cnt, left); -// } else { -// cur_left_count = BaggingHelper(cur_start, cur_cnt, left); -// } -// return cur_left_count; -// }, -// bag_data_indices_.data()); -// bag_data_cnt_ = left_cnt; -// Log::Debug("Re-bagging, using %d data to train", bag_data_cnt_); -// // set bagging data to tree learner -// if (!is_use_subset_) { -// #ifdef USE_CUDA_EXP -// if (config_->device_type == std::string("cuda_exp")) { -// CopyFromHostToCUDADevice(cuda_bag_data_indices_.RawData(), bag_data_indices_.data(), static_cast(num_data_), __FILE__, __LINE__); -// tree_learner_->SetBaggingData(nullptr, cuda_bag_data_indices_.RawData(), bag_data_cnt_); -// } else { -// #endif // USE_CUDA_EXP -// tree_learner_->SetBaggingData(nullptr, bag_data_indices_.data(), bag_data_cnt_); -// #ifdef USE_CUDA_EXP -// } -// #endif // USE_CUDA_EXP -// } else { -// // get subset -// tmp_subset_->ReSize(bag_data_cnt_); -// tmp_subset_->CopySubrow(train_data_, bag_data_indices_.data(), -// bag_data_cnt_, false); -// #ifdef USE_CUDA_EXP -// if (config_->device_type == std::string("cuda_exp")) { -// CopyFromHostToCUDADevice(cuda_bag_data_indices_.RawData(), bag_data_indices_.data(), static_cast(num_data_), __FILE__, __LINE__); -// tree_learner_->SetBaggingData(tmp_subset_.get(), cuda_bag_data_indices_.RawData(), -// bag_data_cnt_); -// } else { -// #endif // USE_CUDA_EXP -// tree_learner_->SetBaggingData(tmp_subset_.get(), bag_data_indices_.data(), -// bag_data_cnt_); -// #ifdef USE_CUDA_EXP -// } -// #endif // USE_CUDA_EXP -// } -// } -// } - -// >>>>>>> LightGBM/master void GBDT::Train(int snapshot_freq, const std::string& model_output_path) { Common::FunctionTimer fun_timer("GBDT::Train", global_timer); bool is_finished = false; diff --git a/src/boosting/goss.hpp b/src/boosting/goss.hpp index e0ad4697c35d..34b099e051bb 100644 --- a/src/boosting/goss.hpp +++ b/src/boosting/goss.hpp @@ -10,6 +10,7 @@ #include #include +#include #include namespace LightGBM { diff --git a/src/main.cpp b/src/main.cpp index 4d69c53a1aec..8034da826811 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -8,16 +8,10 @@ #include "network/linkers.h" -int main(int /*argc*/, char** /*argv*/) { +int main(int argc, char** argv) { bool success = false; try { - const std::string config_str = std::string("config=train.conf"); - char* argv = new char[config_str.size() + 1]; - for (size_t i = 0; i < config_str.size(); ++i) { - argv[i] = config_str[i]; - } - argv[config_str.size()] = '\0'; - LightGBM::Application app(2, &argv - 1); + LightGBM::Application app(argc, argv); app.Run(); #ifdef USE_MPI From 7a5fede86372ec0abee7ade24f71a405e2f15d51 Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Wed, 24 Aug 2022 06:37:00 +0000 Subject: [PATCH 65/84] allocate gradients_pointer_ and hessians_pointer when necessary --- include/LightGBM/tree_learner.h | 6 ++ src/boosting/bagging.hpp | 6 +- src/boosting/gbdt.cpp | 77 ++++++++++--------- src/boosting/gbdt.h | 4 +- .../cuda/cuda_single_gpu_tree_learner.cpp | 10 +++ .../cuda/cuda_single_gpu_tree_learner.hpp | 4 +- 6 files changed, 66 insertions(+), 41 deletions(-) diff --git a/include/LightGBM/tree_learner.h b/include/LightGBM/tree_learner.h index 197a80f18cd7..772e1422ff69 100644 --- a/include/LightGBM/tree_learner.h +++ b/include/LightGBM/tree_learner.h @@ -50,6 +50,12 @@ class TreeLearner { */ virtual void ResetConfig(const Config* config) = 0; + /*! + * \brief Reset boosting_on_gpu_ + * \param boosting_on_gpu flag for boosting on GPU + */ + virtual void ResetBoostingOnGPU(const bool /*boosting_on_gpu*/) {} + virtual void SetForcedSplit(const Json* forced_split_json) = 0; /*! diff --git a/src/boosting/bagging.hpp b/src/boosting/bagging.hpp index 27b1589f81c8..03b62f6c65e9 100644 --- a/src/boosting/bagging.hpp +++ b/src/boosting/bagging.hpp @@ -50,7 +50,7 @@ class BaggingSampleStrategy : public SampleStrategy { #ifdef USE_CUDA_EXP if (config_->device_type == std::string("cuda_exp")) { CopyFromHostToCUDADevice(cuda_bag_data_indices_.RawData(), bag_data_indices_.data(), static_cast(num_data_), __FILE__, __LINE__); - tree_learner_->SetBaggingData(nullptr, cuda_bag_data_indices_.RawData(), bag_data_cnt_); + tree_learner->SetBaggingData(nullptr, cuda_bag_data_indices_.RawData(), bag_data_cnt_); } else { #endif // USE_CUDA_EXP tree_learner->SetBaggingData(nullptr, bag_data_indices_.data(), bag_data_cnt_); @@ -65,8 +65,8 @@ class BaggingSampleStrategy : public SampleStrategy { #ifdef USE_CUDA_EXP if (config_->device_type == std::string("cuda_exp")) { CopyFromHostToCUDADevice(cuda_bag_data_indices_.RawData(), bag_data_indices_.data(), static_cast(num_data_), __FILE__, __LINE__); - tree_learner_->SetBaggingData(tmp_subset_.get(), cuda_bag_data_indices_.RawData(), - bag_data_cnt_); + tree_learner->SetBaggingData(tmp_subset_.get(), cuda_bag_data_indices_.RawData(), + bag_data_cnt_); } else { #endif // USE_CUDA_EXP tree_learner->SetBaggingData(tmp_subset_.get(), bag_data_indices_.data(), diff --git a/src/boosting/gbdt.cpp b/src/boosting/gbdt.cpp index 686ffca215d4..d2c61b85aafb 100644 --- a/src/boosting/gbdt.cpp +++ b/src/boosting/gbdt.cpp @@ -42,6 +42,7 @@ GBDT::GBDT() data_sample_strategy_.reset(nullptr); gradients_pointer_ = nullptr; hessians_pointer_ = nullptr; + boosting_on_gpu_ = false; } GBDT::~GBDT() { @@ -95,11 +96,12 @@ void GBDT::Init(const Config* config, const Dataset* train_data, const Objective } data_sample_strategy_.reset(SampleStrategy::CreateSampleStrategy(config_.get(), train_data_, objective_function_, num_tree_per_iteration_)); - is_constant_hessian_ = GetIsConstHessian(objective_function) && !data_sample_strategy_->IsHessianChange(); + is_constant_hessian_ = GetIsConstHessian(objective_function); - const bool boosting_on_gpu = objective_function_ != nullptr && objective_function_->IsCUDAObjective(); + boosting_on_gpu_ = objective_function_ != nullptr && objective_function_->IsCUDAObjective() && + !data_sample_strategy_->IsHessianChange(); // for sample strategy with Hessian change, fall back to boosting on CPU tree_learner_ = std::unique_ptr(TreeLearner::CreateTreeLearner(config_->tree_learner, config_->device_type, - config_.get(), boosting_on_gpu)); + config_.get(), boosting_on_gpu_)); // init tree learner tree_learner_->Init(train_data_, is_constant_hessian_); @@ -114,7 +116,7 @@ void GBDT::Init(const Config* config, const Dataset* train_data, const Objective #ifdef USE_CUDA_EXP if (config_->device_type == std::string("cuda_exp")) { - train_score_updater_.reset(new CUDAScoreUpdater(train_data_, num_tree_per_iteration_, boosting_on_gpu)); + train_score_updater_.reset(new CUDAScoreUpdater(train_data_, num_tree_per_iteration_, boosting_on_gpu_)); } else { #endif // USE_CUDA_EXP train_score_updater_.reset(new ScoreUpdater(train_data_, num_tree_per_iteration_)); @@ -127,7 +129,7 @@ void GBDT::Init(const Config* config, const Dataset* train_data, const Objective if (objective_function_ != nullptr) { const size_t total_size = static_cast(num_data_) * num_tree_per_iteration_; #ifdef USE_CUDA_EXP - if (config_->device_type == std::string("cuda_exp") && boosting_on_gpu) { + if (config_->device_type == std::string("cuda_exp") && boosting_on_gpu_) { if (gradients_pointer_ != nullptr) { CHECK_NOTNULL(hessians_pointer_); DeallocateCUDAMemory(&gradients_pointer_, __FILE__, __LINE__); @@ -144,21 +146,14 @@ void GBDT::Init(const Config* config, const Dataset* train_data, const Objective #ifdef USE_CUDA_EXP } #endif // USE_CUDA_EXP - #ifndef USE_CUDA_EXP - } - #else // USE_CUDA_EXP - } else { - if (config_->device_type == std::string("cuda_exp")) { - if (gradients_pointer_ != nullptr) { - CHECK_NOTNULL(hessians_pointer_); - DeallocateCUDAMemory(&gradients_pointer_, __FILE__, __LINE__); - DeallocateCUDAMemory(&hessians_pointer_, __FILE__, __LINE__); - } - AllocateCUDAMemory(&gradients_pointer_, total_size, __FILE__, __LINE__); - AllocateCUDAMemory(&hessians_pointer_, total_size, __FILE__, __LINE__); - } + } else if (data_sample_strategy_->IsHessianChange()) { + const size_t total_size = static_cast(num_data_) * num_tree_per_iteration_; + gradients_.resize(total_size); + hessians_.resize(total_size); + gradients_pointer_ = gradients_.data(); + hessians_pointer_ = hessians_.data(); } - #endif // USE_CUDA_EXP + // get max feature index max_feature_idx_ = train_data_->num_total_features() - 1; // get label index @@ -352,7 +347,7 @@ bool GBDT::TrainOneIter(const score_t* gradients, const score_t* hessians) { } else { // use customized objective function CHECK(objective_function_ == nullptr); - if (config_->boosting == std::string("goss") || config_->data_sample_strategy == std::string("goss")) { + if (data_sample_strategy_->IsHessianChange()) { // need to copy customized gradients when using GOSS int64_t total_size = static_cast(num_data_) * num_tree_per_iteration_; #pragma omp parallel for schedule(static) @@ -360,24 +355,25 @@ bool GBDT::TrainOneIter(const score_t* gradients, const score_t* hessians) { gradients_[i] = gradients[i]; hessians_[i] = hessians[i]; } - gradients = gradients_.data(); - hessians = hessians_.data(); - } - #ifdef USE_CUDA_EXP - if (config_->device_type == std::string("cuda_exp")) { - const size_t total_size = static_cast(num_data_ * num_class_); - CopyFromHostToCUDADevice(gradients_pointer_, gradients, total_size, __FILE__, __LINE__); - CopyFromHostToCUDADevice(hessians_pointer_, hessians, total_size, __FILE__, __LINE__); + CHECK_EQ(gradients_pointer_, gradients_.data()); + CHECK_EQ(hessians_pointer_, hessians_.data()); gradients = gradients_pointer_; hessians = hessians_pointer_; } - #endif // USE_CUDA_EXP } // bagging logic data_sample_strategy_->Bagging(iter_, tree_learner_.get(), gradients_.data(), hessians_.data()); const bool is_use_subset = data_sample_strategy_->is_use_subset(); const data_size_t bag_data_cnt = data_sample_strategy_->bag_data_cnt(); + if (gradients != nullptr && is_use_subset && bag_data_cnt < num_data_ && !boosting_on_gpu_ && !data_sample_strategy_->IsHessianChange()) { + // allocate gradients_ and hessians_ for copy gradients for using data subset + int64_t total_size = static_cast(num_data_) * num_tree_per_iteration_; + gradients_.resize(total_size); + hessians_.resize(total_size); + gradients_pointer_ = gradients_.data(); + hessians_pointer_ = hessians_.data(); + } const std::vector>& bag_data_indices = data_sample_strategy_->bag_data_indices(); bool should_continue = false; @@ -388,7 +384,7 @@ bool GBDT::TrainOneIter(const score_t* gradients, const score_t* hessians) { auto grad = gradients + offset; auto hess = hessians + offset; // need to copy gradients for bagging subset. - if (is_use_subset && bag_data_cnt < num_data_ && config_->device_type != std::string("cuda_exp")) { + if (is_use_subset && bag_data_cnt < num_data_ && !boosting_on_gpu_) { for (int i = 0; i < bag_data_cnt; ++i) { gradients_pointer_[offset + i] = grad[bag_data_indices[i]]; hessians_pointer_[offset + i] = hess[bag_data_indices[i]]; @@ -493,7 +489,7 @@ void GBDT::UpdateScore(const Tree* tree, const int cur_tree_id) { if (num_data_ - bag_data_cnt > 0) { #ifdef USE_CUDA_EXP if (config_->device_type == std::string("cuda_exp")) { - train_score_updater_->AddScore(tree, data_sample_strategy_->cuda_bag_data_indices().data().RawData() + bag_data_cnt, num_data_ - bag_data_cnt, cur_tree_id); + train_score_updater_->AddScore(tree, data_sample_strategy_->cuda_bag_data_indices().RawData() + bag_data_cnt, num_data_ - bag_data_cnt, cur_tree_id); } else { #endif // USE_CUDA_EXP train_score_updater_->AddScore(tree, data_sample_strategy_->bag_data_indices().data() + bag_data_cnt, num_data_ - bag_data_cnt, cur_tree_id); @@ -720,7 +716,7 @@ void GBDT::ResetTrainingData(const Dataset* train_data, const ObjectiveFunction* Log::Fatal("Cannot use ``monotone_constraints`` in %s objective, please disable it.", objective_function_->GetName()); } } - is_constant_hessian_ = GetIsConstHessian(objective_function) && !data_sample_strategy_->IsHessianChange(); + is_constant_hessian_ = GetIsConstHessian(objective_function); // push training metrics training_metrics_.clear(); @@ -730,7 +726,9 @@ void GBDT::ResetTrainingData(const Dataset* train_data, const ObjectiveFunction* training_metrics_.shrink_to_fit(); #ifdef USE_CUDA_EXP - const bool boosting_on_gpu = objective_function_ != nullptr && objective_function_->IsCUDAObjective(); + boosting_on_gpu_ = objective_function_ != nullptr && objective_function_->IsCUDAObjective() && + !data_sample_strategy_->IsHessianChange(); // for sample strategy with Hessian change, fall back to boosting on CPU + tree_learner_->ResetBoostingOnGPU(boosting_on_gpu_); #endif // USE_CUDA_EXP if (train_data != train_data_) { @@ -740,7 +738,7 @@ void GBDT::ResetTrainingData(const Dataset* train_data, const ObjectiveFunction* // create score tracker #ifdef USE_CUDA_EXP if (config_->device_type == std::string("cuda_exp")) { - train_score_updater_.reset(new CUDAScoreUpdater(train_data_, num_tree_per_iteration_, boosting_on_gpu)); + train_score_updater_.reset(new CUDAScoreUpdater(train_data_, num_tree_per_iteration_, boosting_on_gpu_)); } else { #endif // USE_CUDA_EXP train_score_updater_.reset(new ScoreUpdater(train_data_, num_tree_per_iteration_)); @@ -762,7 +760,7 @@ void GBDT::ResetTrainingData(const Dataset* train_data, const ObjectiveFunction* const size_t total_size = static_cast(num_data_) * num_tree_per_iteration_; if (objective_function_ != nullptr) { #ifdef USE_CUDA_EXP - if (config_->device_type == std::string("cuda_exp") && boosting_on_gpu) { + if (config_->device_type == std::string("cuda_exp") && boosting_on_gpu_) { if (gradients_pointer_ != nullptr) { CHECK_NOTNULL(hessians_pointer_); DeallocateCUDAMemory(&gradients_pointer_, __FILE__, __LINE__); @@ -825,6 +823,13 @@ void GBDT::ResetConfig(const Config* config) { if (tree_learner_ != nullptr) { tree_learner_->ResetConfig(new_config.get()); } + + #ifdef USE_CUDA_EXP + boosting_on_gpu_ = objective_function_ != nullptr && objective_function_->IsCUDAObjective() && + !data_sample_strategy_->IsHessianChange(); // for sample strategy with Hessian change, fall back to boosting on CPU + tree_learner_->ResetBoostingOnGPU(boosting_on_gpu_); + #endif // USE_CUDA_EXP + if (train_data_ != nullptr) { data_sample_strategy_->ResetSampleConfig(new_config.get(), false); if (data_sample_strategy_->NeedResizeGradients()) { @@ -832,7 +837,7 @@ void GBDT::ResetConfig(const Config* config) { const size_t total_size = static_cast(num_data_) * num_tree_per_iteration_; if (objective_function_ != nullptr) { #ifdef USE_CUDA_EXP - if (config_->device_type == std::string("cuda_exp") && boosting_on_gpu) { + if (config_->device_type == std::string("cuda_exp") && boosting_on_gpu_) { if (gradients_pointer_ != nullptr) { CHECK_NOTNULL(hessians_pointer_); DeallocateCUDAMemory(&gradients_pointer_, __FILE__, __LINE__); diff --git a/src/boosting/gbdt.h b/src/boosting/gbdt.h index 5625ee552495..15fb14ae6a70 100644 --- a/src/boosting/gbdt.h +++ b/src/boosting/gbdt.h @@ -400,7 +400,7 @@ class GBDT : public GBDTBase { protected: virtual bool GetIsConstHessian(const ObjectiveFunction* objective_function) { - if (objective_function != nullptr && config_->boosting != std::string("goss") && config_->data_sample_strategy != std::string("goss")) { + if (objective_function != nullptr && !data_sample_strategy_->IsHessianChange()) { return objective_function->IsConstantHessian(); } else { return false; @@ -493,6 +493,8 @@ class GBDT : public GBDTBase { score_t* gradients_pointer_; /*! \brief Pointer to hessian vector, can be on CPU or GPU */ score_t* hessians_pointer_; + /*! \brief Whether boosting is done on GPU, used for cuda_exp */ + bool boosting_on_gpu_; #ifdef USE_CUDA_EXP /*! \brief Buffer for scores when boosting is on GPU but evaluation is not, used only with cuda_exp */ mutable std::vector host_score_; diff --git a/src/treelearner/cuda/cuda_single_gpu_tree_learner.cpp b/src/treelearner/cuda/cuda_single_gpu_tree_learner.cpp index 55595765fcc6..fd48201e2c3a 100644 --- a/src/treelearner/cuda/cuda_single_gpu_tree_learner.cpp +++ b/src/treelearner/cuda/cuda_single_gpu_tree_learner.cpp @@ -447,6 +447,16 @@ void CUDASingleGPUTreeLearner::AllocateBitset() { cuda_bitset_inner_len_ = 0; } +void CUDASingleGPUTreeLearner::ResetBoostingOnGPU(const bool boosting_on_cuda) { + boosting_on_cuda_ = boosting_on_cuda; + DeallocateCUDAMemory(&cuda_gradients_, __FILE__, __LINE__); + DeallocateCUDAMemory(&cuda_hessians_, __FILE__, __LINE__); + if (!boosting_on_cuda_) { + AllocateCUDAMemory(&cuda_gradients_, static_cast(num_data_), __FILE__, __LINE__); + AllocateCUDAMemory(&cuda_hessians_, static_cast(num_data_), __FILE__, __LINE__); + } +} + #ifdef DEBUG void CUDASingleGPUTreeLearner::CheckSplitValid( const int left_leaf, diff --git a/src/treelearner/cuda/cuda_single_gpu_tree_learner.hpp b/src/treelearner/cuda/cuda_single_gpu_tree_learner.hpp index 942a6d1cb17d..1c17aa009c84 100644 --- a/src/treelearner/cuda/cuda_single_gpu_tree_learner.hpp +++ b/src/treelearner/cuda/cuda_single_gpu_tree_learner.hpp @@ -49,6 +49,8 @@ class CUDASingleGPUTreeLearner: public SerialTreeLearner { Tree* FitByExistingTree(const Tree* old_tree, const std::vector& leaf_pred, const score_t* gradients, const score_t* hessians) const override; + virtual void ResetBoostingOnGPU(const bool boosting_on_gpu) override; + protected: void BeforeTrain() override; @@ -119,7 +121,7 @@ class CUDASingleGPUTreeLearner: public SerialTreeLearner { /*! \brief hessians on CUDA */ score_t* cuda_hessians_; /*! \brief whether boosting is done on CUDA */ - const bool boosting_on_cuda_; + bool boosting_on_cuda_; #ifdef DEBUG /*! \brief gradients on CPU */ From b4a014f2874decf3649a0985eaecc9dbcbb17cfc Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Wed, 24 Aug 2022 06:41:54 +0000 Subject: [PATCH 66/84] add spaces --- src/boosting/gbdt.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/boosting/gbdt.cpp b/src/boosting/gbdt.cpp index d2c61b85aafb..029f39b2fb80 100644 --- a/src/boosting/gbdt.cpp +++ b/src/boosting/gbdt.cpp @@ -99,7 +99,7 @@ void GBDT::Init(const Config* config, const Dataset* train_data, const Objective is_constant_hessian_ = GetIsConstHessian(objective_function); boosting_on_gpu_ = objective_function_ != nullptr && objective_function_->IsCUDAObjective() && - !data_sample_strategy_->IsHessianChange(); // for sample strategy with Hessian change, fall back to boosting on CPU + !data_sample_strategy_->IsHessianChange(); // for sample strategy with Hessian change, fall back to boosting on CPU tree_learner_ = std::unique_ptr(TreeLearner::CreateTreeLearner(config_->tree_learner, config_->device_type, config_.get(), boosting_on_gpu_)); @@ -727,7 +727,7 @@ void GBDT::ResetTrainingData(const Dataset* train_data, const ObjectiveFunction* #ifdef USE_CUDA_EXP boosting_on_gpu_ = objective_function_ != nullptr && objective_function_->IsCUDAObjective() && - !data_sample_strategy_->IsHessianChange(); // for sample strategy with Hessian change, fall back to boosting on CPU + !data_sample_strategy_->IsHessianChange(); // for sample strategy with Hessian change, fall back to boosting on CPU tree_learner_->ResetBoostingOnGPU(boosting_on_gpu_); #endif // USE_CUDA_EXP @@ -826,7 +826,7 @@ void GBDT::ResetConfig(const Config* config) { #ifdef USE_CUDA_EXP boosting_on_gpu_ = objective_function_ != nullptr && objective_function_->IsCUDAObjective() && - !data_sample_strategy_->IsHessianChange(); // for sample strategy with Hessian change, fall back to boosting on CPU + !data_sample_strategy_->IsHessianChange(); // for sample strategy with Hessian change, fall back to boosting on CPU tree_learner_->ResetBoostingOnGPU(boosting_on_gpu_); #endif // USE_CUDA_EXP From f783a611b24f8401f91b0e047bc04a4e9c935ced Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Wed, 24 Aug 2022 06:43:08 +0000 Subject: [PATCH 67/84] remove redundant virtual --- src/treelearner/cuda/cuda_single_gpu_tree_learner.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/treelearner/cuda/cuda_single_gpu_tree_learner.hpp b/src/treelearner/cuda/cuda_single_gpu_tree_learner.hpp index 1c17aa009c84..a55f9df8fc15 100644 --- a/src/treelearner/cuda/cuda_single_gpu_tree_learner.hpp +++ b/src/treelearner/cuda/cuda_single_gpu_tree_learner.hpp @@ -49,7 +49,7 @@ class CUDASingleGPUTreeLearner: public SerialTreeLearner { Tree* FitByExistingTree(const Tree* old_tree, const std::vector& leaf_pred, const score_t* gradients, const score_t* hessians) const override; - virtual void ResetBoostingOnGPU(const bool boosting_on_gpu) override; + void ResetBoostingOnGPU(const bool boosting_on_gpu) override; protected: void BeforeTrain() override; From 204517b00cb46bd1efcaaca1499dc9737bb56864 Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Wed, 24 Aug 2022 08:27:36 +0000 Subject: [PATCH 68/84] include for USE_CUDA --- include/LightGBM/cuda/cuda_utils.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/LightGBM/cuda/cuda_utils.h b/include/LightGBM/cuda/cuda_utils.h index ee88c52a0404..f1c28213d9f3 100644 --- a/include/LightGBM/cuda/cuda_utils.h +++ b/include/LightGBM/cuda/cuda_utils.h @@ -10,10 +10,10 @@ #include #include #include +#include #endif // USE_CUDA || USE_CUDA_EXP #ifdef USE_CUDA_EXP -#include #include #endif // USE_CUDA_EXP From e5d4605f8222cee61dce9f44d3ce5bb95deaa658 Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Mon, 29 Aug 2022 12:36:35 +0000 Subject: [PATCH 69/84] check for in test_goss_boosting_and_strategy_equivalent --- tests/python_package_test/test_engine.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/python_package_test/test_engine.py b/tests/python_package_test/test_engine.py index 0fd135f5f30e..23e7a970d823 100644 --- a/tests/python_package_test/test_engine.py +++ b/tests/python_package_test/test_engine.py @@ -3601,7 +3601,7 @@ def test_goss_boosting_and_strategy_equivalent(): num_boost_round=10, valid_sets=lgb_eval, callbacks=[lgb.record_evaluation(evals_result2)]) - np.testing.assert_equal(evals_result1['valid_0']['l2'], evals_result2['valid_0']['l2']) + assert evals_result1['valid_0']['l2'] == evals_result2['valid_0']['l2'] def test_sample_strategy_with_boosting(): From 469f6bbe6c0d6a425f637cd11f52acae73d180e6 Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Mon, 29 Aug 2022 12:55:46 +0000 Subject: [PATCH 70/84] check for identity in test_sample_strategy_with_boosting --- tests/python_package_test/test_engine.py | 58 ++++++++++++++++-------- 1 file changed, 40 insertions(+), 18 deletions(-) diff --git a/tests/python_package_test/test_engine.py b/tests/python_package_test/test_engine.py index 23e7a970d823..c79e87a8c528 100644 --- a/tests/python_package_test/test_engine.py +++ b/tests/python_package_test/test_engine.py @@ -3615,51 +3615,65 @@ def test_sample_strategy_with_boosting(): 'data_sample_strategy': 'goss', 'metric': 'l2', 'verbose': -1, + 'num_threads': 1, + 'force_row_wise': True, 'gpu_use_dp': True, + 'device': 'cuda' } evals_result = {} gbm = lgb.train(params, lgb_train, num_boost_round=10, valid_sets=lgb_eval, callbacks=[lgb.record_evaluation(evals_result)]) - ret1 = mean_squared_error(y_test, gbm.predict(X_test)) - assert ret1 == pytest.approx(3149.393862, abs=1.0) - assert evals_result['valid_0']['l2'][-1] == pytest.approx(ret1) + eval_res1 = evals_result['valid_0']['l2'][-1] + test_res1 = mean_squared_error(y_test, gbm.predict(X_test)) + assert test_res1 == pytest.approx(3149.393862, abs=1.0) + assert eval_res1 == pytest.approx(test_res1) params = { 'boosting': 'gbdt', 'data_sample_strategy': 'goss', 'metric': 'l2', 'verbose': -1, + 'num_threads': 1, + 'force_row_wise': True, 'gpu_use_dp': True, + 'device': 'cuda' } evals_result = {} gbm = lgb.train(params, lgb_train, num_boost_round=10, valid_sets=lgb_eval, callbacks=[lgb.record_evaluation(evals_result)]) - ret2 = mean_squared_error(y_test, gbm.predict(X_test)) - assert ret2 == pytest.approx(2547.715968, abs=1.0) - assert evals_result['valid_0']['l2'][-1] == pytest.approx(ret2) + eval_res2 = evals_result['valid_0']['l2'][-1] + test_res2 = mean_squared_error(y_test, gbm.predict(X_test)) + assert test_res2 == pytest.approx(2547.715968, abs=1.0) + assert eval_res2 == pytest.approx(test_res2) params = { 'boosting': 'goss', 'data_sample_strategy': 'goss', 'metric': 'l2', 'verbose': -1, + 'num_threads': 1, + 'force_row_wise': True, 'gpu_use_dp': True, + 'device': 'cuda' } evals_result = {} gbm = lgb.train(params, lgb_train, num_boost_round=10, valid_sets=lgb_eval, callbacks=[lgb.record_evaluation(evals_result)]) - ret3 = mean_squared_error(y_test, gbm.predict(X_test)) - assert ret3 == pytest.approx(2547.715968, abs=1.0) - assert evals_result['valid_0']['l2'][-1] == pytest.approx(ret3) + eval_res3 = evals_result['valid_0']['l2'][-1] + test_res3 = mean_squared_error(y_test, gbm.predict(X_test)) + assert test_res3 == pytest.approx(2547.715968, abs=1.0) + assert eval_res3 == pytest.approx(test_res3) - assert ret1 != ret2 - assert ret2 == ret3 + assert test_res1 != test_res2 + assert eval_res1 != eval_res2 + assert test_res2 == test_res3 + assert eval_res2 == eval_res3 params = { 'boosting': 'dart', @@ -3668,16 +3682,20 @@ def test_sample_strategy_with_boosting(): 'bagging_fraction': 0.5, 'metric': 'l2', 'verbose': -1, + 'num_threads': 1, + 'force_row_wise': True, 'gpu_use_dp': True, + 'device': 'cuda' } evals_result = {} gbm = lgb.train(params, lgb_train, num_boost_round=10, valid_sets=lgb_eval, callbacks=[lgb.record_evaluation(evals_result)]) - ret4 = mean_squared_error(y_test, gbm.predict(X_test)) - assert ret4 == pytest.approx(3134.866931, abs=1.0) - assert evals_result['valid_0']['l2'][-1] == pytest.approx(ret4) + eval_res4 = evals_result['valid_0']['l2'][-1] + test_res4 = mean_squared_error(y_test, gbm.predict(X_test)) + assert test_res4 == pytest.approx(3134.866931, abs=1.0) + assert eval_res4 == pytest.approx(test_res4) params = { 'boosting': 'gbdt', @@ -3686,17 +3704,21 @@ def test_sample_strategy_with_boosting(): 'bagging_fraction': 0.5, 'metric': 'l2', 'verbose': -1, + 'num_threads': 1, + 'force_row_wise': True, 'gpu_use_dp': True, + 'device': 'cuda' } evals_result = {} gbm = lgb.train(params, lgb_train, num_boost_round=10, valid_sets=lgb_eval, callbacks=[lgb.record_evaluation(evals_result)]) - ret5 = mean_squared_error(y_test, gbm.predict(X_test)) - assert ret5 == pytest.approx(2539.792378, abs=1.0) - assert evals_result['valid_0']['l2'][-1] == pytest.approx(ret5) - assert ret4 != ret5 + eval_res5 = evals_result['valid_0']['l2'][-1] + test_res5 = mean_squared_error(y_test, gbm.predict(X_test)) + assert test_res5 == pytest.approx(2539.792378, abs=1.0) + assert eval_res5 == pytest.approx(test_res5) + assert eval_res5 != test_res5 def test_record_evaluation_with_train(): From 512718889183ce2dba8e506bd96828421c3cb8b5 Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Mon, 29 Aug 2022 14:32:46 +0000 Subject: [PATCH 71/84] remove cuda option in test_sample_strategy_with_boosting --- tests/python_package_test/test_engine.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/tests/python_package_test/test_engine.py b/tests/python_package_test/test_engine.py index c79e87a8c528..66cfe34748df 100644 --- a/tests/python_package_test/test_engine.py +++ b/tests/python_package_test/test_engine.py @@ -3618,7 +3618,6 @@ def test_sample_strategy_with_boosting(): 'num_threads': 1, 'force_row_wise': True, 'gpu_use_dp': True, - 'device': 'cuda' } evals_result = {} gbm = lgb.train(params, lgb_train, @@ -3638,7 +3637,6 @@ def test_sample_strategy_with_boosting(): 'num_threads': 1, 'force_row_wise': True, 'gpu_use_dp': True, - 'device': 'cuda' } evals_result = {} gbm = lgb.train(params, lgb_train, @@ -3658,7 +3656,6 @@ def test_sample_strategy_with_boosting(): 'num_threads': 1, 'force_row_wise': True, 'gpu_use_dp': True, - 'device': 'cuda' } evals_result = {} gbm = lgb.train(params, lgb_train, @@ -3685,7 +3682,6 @@ def test_sample_strategy_with_boosting(): 'num_threads': 1, 'force_row_wise': True, 'gpu_use_dp': True, - 'device': 'cuda' } evals_result = {} gbm = lgb.train(params, lgb_train, @@ -3707,7 +3703,6 @@ def test_sample_strategy_with_boosting(): 'num_threads': 1, 'force_row_wise': True, 'gpu_use_dp': True, - 'device': 'cuda' } evals_result = {} gbm = lgb.train(params, lgb_train, From cc28c8a8a73486ed1280f01e5eb15fa996c67deb Mon Sep 17 00:00:00 2001 From: shiyu1994 Date: Mon, 5 Sep 2022 10:14:47 +0800 Subject: [PATCH 72/84] Update tests/python_package_test/test_engine.py Co-authored-by: Nikita Titov --- tests/python_package_test/test_engine.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/python_package_test/test_engine.py b/tests/python_package_test/test_engine.py index 66cfe34748df..f554d43296ce 100644 --- a/tests/python_package_test/test_engine.py +++ b/tests/python_package_test/test_engine.py @@ -3713,7 +3713,8 @@ def test_sample_strategy_with_boosting(): test_res5 = mean_squared_error(y_test, gbm.predict(X_test)) assert test_res5 == pytest.approx(2539.792378, abs=1.0) assert eval_res5 == pytest.approx(test_res5) - assert eval_res5 != test_res5 + assert test_res4 != test_res5 + assert eval_res4 != eval_res5 def test_record_evaluation_with_train(): From 42f3de9db9bb8a6a7af22072e24af2cdfded08a1 Mon Sep 17 00:00:00 2001 From: shiyu1994 Date: Mon, 5 Sep 2022 10:14:58 +0800 Subject: [PATCH 73/84] Update tests/python_package_test/test_engine.py Co-authored-by: James Lamb --- tests/python_package_test/test_engine.py | 15 +++------------ 1 file changed, 3 insertions(+), 12 deletions(-) diff --git a/tests/python_package_test/test_engine.py b/tests/python_package_test/test_engine.py index f554d43296ce..ab63d1f9f0ab 100644 --- a/tests/python_package_test/test_engine.py +++ b/tests/python_package_test/test_engine.py @@ -3571,8 +3571,7 @@ def test_goss_boosting_and_strategy_equivalent(): X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42) lgb_train = lgb.Dataset(X_train, y_train) lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train) - params1 = { - 'boosting': 'goss', + base_params = { 'metric': 'l2', 'verbose': -1, 'bagging_seed': 0, @@ -3581,21 +3580,13 @@ def test_goss_boosting_and_strategy_equivalent(): 'force_row_wise': True, 'gpu_use_dp': True, } + params1 = {**base_params, 'boosting': 'goss'} evals_result1 = {} lgb.train(params1, lgb_train, num_boost_round=10, valid_sets=lgb_eval, callbacks=[lgb.record_evaluation(evals_result1)]) - params2 = { - 'data_sample_strategy': 'goss', - 'metric': 'l2', - 'verbose': -1, - 'bagging_seed': 0, - 'learning_rate': 0.05, - 'num_threads': 1, - 'force_row_wise': True, - 'gpu_use_dp': True, - } + params2 = {**base_params, 'data_sample_strategy': 'goss'} evals_result2 = {} lgb.train(params2, lgb_train, num_boost_round=10, From ea95e86482eacda91b6b739299407792ca7d5d93 Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Wed, 7 Sep 2022 07:21:11 +0000 Subject: [PATCH 74/84] ResetGradientBuffers after ResetSampleConfig --- src/boosting/gbdt.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/boosting/gbdt.cpp b/src/boosting/gbdt.cpp index bc86b5ae1465..fe2759681aba 100644 --- a/src/boosting/gbdt.cpp +++ b/src/boosting/gbdt.cpp @@ -126,7 +126,6 @@ void GBDT::Init(const Config* config, const Dataset* train_data, const Objective #endif // USE_CUDA_EXP num_data_ = train_data_->num_data(); - ResetGradientBuffers(); // get max feature index max_feature_idx_ = train_data_->num_total_features() - 1; @@ -141,6 +140,7 @@ void GBDT::Init(const Config* config, const Dataset* train_data, const Objective // if need bagging, create buffer data_sample_strategy_->ResetSampleConfig(config_.get(), true); + ResetGradientBuffers(); class_need_train_ = std::vector(num_tree_per_iteration_, true); if (objective_function_ != nullptr && objective_function_->SkipEmptyClass()) { From 18a54ef40bcd1ed096d8b2283c81b8e642d35e25 Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Fri, 9 Sep 2022 05:06:12 +0000 Subject: [PATCH 75/84] ResetGradientBuffers after ResetSampleConfig --- src/boosting/gbdt.cpp | 3 +-- src/boosting/gbdt.h | 3 +++ 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/src/boosting/gbdt.cpp b/src/boosting/gbdt.cpp index fe2759681aba..be4fabe9e078 100644 --- a/src/boosting/gbdt.cpp +++ b/src/boosting/gbdt.cpp @@ -733,8 +733,6 @@ void GBDT::ResetTrainingData(const Dataset* train_data, const ObjectiveFunction* num_data_ = train_data_->num_data(); - ResetGradientBuffers(); - max_feature_idx_ = train_data_->num_total_features() - 1; label_idx_ = train_data_->label_idx(); feature_names_ = train_data_->feature_names(); @@ -743,6 +741,7 @@ void GBDT::ResetTrainingData(const Dataset* train_data, const ObjectiveFunction* tree_learner_->ResetTrainingData(train_data, is_constant_hessian_); data_sample_strategy_->ResetSampleConfig(config_.get(), true); + ResetGradientBuffers(); } else { tree_learner_->ResetIsConstantHessian(is_constant_hessian_); } diff --git a/src/boosting/gbdt.h b/src/boosting/gbdt.h index 2f18a575a1d6..1f8a778d619b 100644 --- a/src/boosting/gbdt.h +++ b/src/boosting/gbdt.h @@ -443,6 +443,9 @@ class GBDT : public GBDTBase { double BoostFromAverage(int class_id, bool update_scorer); + /*! + * \brief Reset gradient buffers, must be called after sample strategy is reset + */ void ResetGradientBuffers(); /*! \brief current iteration */ From beb12b99c76b124de74213efe5c271f599d71784 Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Fri, 9 Sep 2022 06:52:30 +0000 Subject: [PATCH 76/84] ResetGradientBuffers after bagging --- include/LightGBM/cuda/cuda_utils.h | 1 + src/boosting/gbdt.cpp | 27 +++++++++++++++++---------- src/boosting/gbdt.h | 4 ++++ 3 files changed, 22 insertions(+), 10 deletions(-) diff --git a/include/LightGBM/cuda/cuda_utils.h b/include/LightGBM/cuda/cuda_utils.h index 2d419c260207..da73e8bc7b73 100644 --- a/include/LightGBM/cuda/cuda_utils.h +++ b/include/LightGBM/cuda/cuda_utils.h @@ -121,6 +121,7 @@ class CUDAVector { void Resize(size_t size) { if (size == 0) { Clear(); + return; } T* new_data = nullptr; AllocateCUDAMemory(&new_data, size, __FILE__, __LINE__); diff --git a/src/boosting/gbdt.cpp b/src/boosting/gbdt.cpp index be4fabe9e078..32d680f39608 100644 --- a/src/boosting/gbdt.cpp +++ b/src/boosting/gbdt.cpp @@ -342,6 +342,10 @@ bool GBDT::TrainOneIter(const score_t* gradients, const score_t* hessians) { const data_size_t bag_data_cnt = data_sample_strategy_->bag_data_cnt(); const std::vector>& bag_data_indices = data_sample_strategy_->bag_data_indices(); + if (gradients != nullptr && is_use_subset && bag_data_cnt < num_data_ && !boosting_on_gpu_ && !data_sample_strategy_->IsHessianChange()) { + ResetGradientBuffers(); + } + bool should_continue = false; for (int cur_tree_id = 0; cur_tree_id < num_tree_per_iteration_; ++cur_tree_id) { const size_t offset = static_cast(cur_tree_id) * num_data_; @@ -800,17 +804,18 @@ void GBDT::ResetGradientBuffers() { if (objective_function_ != nullptr) { #ifdef USE_CUDA_EXP if (config_->device_type == std::string("cuda_exp") && boosting_on_gpu_) { - if (gradients_pointer_ != nullptr) { - CHECK_NOTNULL(hessians_pointer_); - DeallocateCUDAMemory(&gradients_pointer_, __FILE__, __LINE__); - DeallocateCUDAMemory(&hessians_pointer_, __FILE__, __LINE__); + if (cuda_gradients_.Size() < total_size) { + cuda_gradients_.Resize(total_size); + cuda_hessians_.Resize(total_size); } - AllocateCUDAMemory(&gradients_pointer_, total_size, __FILE__, __LINE__); - AllocateCUDAMemory(&hessians_pointer_, total_size, __FILE__, __LINE__); + gradients_pointer_ = cuda_gradients_.RawData(); + hessians_pointer_ = cuda_hessians_.RawData(); } else { #endif // USE_CUDA_EXP - gradients_.resize(total_size); - hessians_.resize(total_size); + if (gradients_.size() < total_size) { + gradients_.resize(total_size); + hessians_.resize(total_size); + } gradients_pointer_ = gradients_.data(); hessians_pointer_ = hessians_.data(); #ifdef USE_CUDA_EXP @@ -818,8 +823,10 @@ void GBDT::ResetGradientBuffers() { #endif // USE_CUDA_EXP } else if (data_sample_strategy_->IsHessianChange() || (is_use_subset && bag_data_cnt < num_data_ && !boosting_on_gpu_)) { const size_t total_size = static_cast(num_data_) * num_tree_per_iteration_; - gradients_.resize(total_size); - hessians_.resize(total_size); + if (gradients_.size() < total_size) { + gradients_.resize(total_size); + hessians_.resize(total_size); + } gradients_pointer_ = gradients_.data(); hessians_pointer_ = hessians_.data(); } diff --git a/src/boosting/gbdt.h b/src/boosting/gbdt.h index 1f8a778d619b..c3934f5a0e73 100644 --- a/src/boosting/gbdt.h +++ b/src/boosting/gbdt.h @@ -501,6 +501,10 @@ class GBDT : public GBDTBase { /*! \brief Whether boosting is done on GPU, used for cuda_exp */ bool boosting_on_gpu_; #ifdef USE_CUDA_EXP + /*! \brief Gradient vector on GPU */ + CUDAVector cuda_gradients_; + /*! \brief Hessian vector on GPU */ + CUDAVector cuda_hessians_; /*! \brief Buffer for scores when boosting is on GPU but evaluation is not, used only with cuda_exp */ mutable std::vector host_score_; /*! \brief Buffer for scores when boosting is not on GPU but evaluation is, used only with cuda_exp */ From c6175188d6e78d0de7317c2c3e7f19a15a7e1d38 Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Fri, 9 Sep 2022 06:53:46 +0000 Subject: [PATCH 77/84] remove useless code --- src/boosting/gbdt.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/src/boosting/gbdt.cpp b/src/boosting/gbdt.cpp index 32d680f39608..8681740bb298 100644 --- a/src/boosting/gbdt.cpp +++ b/src/boosting/gbdt.cpp @@ -822,7 +822,6 @@ void GBDT::ResetGradientBuffers() { } #endif // USE_CUDA_EXP } else if (data_sample_strategy_->IsHessianChange() || (is_use_subset && bag_data_cnt < num_data_ && !boosting_on_gpu_)) { - const size_t total_size = static_cast(num_data_) * num_tree_per_iteration_; if (gradients_.size() < total_size) { gradients_.resize(total_size); hessians_.resize(total_size); From 87b3e0e6e522997bd8fe008f0e146d245628ca2a Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Fri, 9 Sep 2022 09:24:18 +0000 Subject: [PATCH 78/84] check objective_function_ instead of gradients --- src/boosting/gbdt.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/boosting/gbdt.cpp b/src/boosting/gbdt.cpp index 8681740bb298..65d730bc4740 100644 --- a/src/boosting/gbdt.cpp +++ b/src/boosting/gbdt.cpp @@ -342,7 +342,7 @@ bool GBDT::TrainOneIter(const score_t* gradients, const score_t* hessians) { const data_size_t bag_data_cnt = data_sample_strategy_->bag_data_cnt(); const std::vector>& bag_data_indices = data_sample_strategy_->bag_data_indices(); - if (gradients != nullptr && is_use_subset && bag_data_cnt < num_data_ && !boosting_on_gpu_ && !data_sample_strategy_->IsHessianChange()) { + if (objective_function_ == nullptr && is_use_subset && bag_data_cnt < num_data_ && !boosting_on_gpu_ && !data_sample_strategy_->IsHessianChange()) { ResetGradientBuffers(); } From 58356e4a066e0b1f7195f9a7cf0bcaf2b1ea53c5 Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Wed, 14 Sep 2022 03:46:32 +0000 Subject: [PATCH 79/84] enable rf with goss simplify params in test cases --- src/boosting/rf.hpp | 8 +- tests/python_package_test/test_engine.py | 110 +++++++++++------------ 2 files changed, 58 insertions(+), 60 deletions(-) diff --git a/src/boosting/rf.hpp b/src/boosting/rf.hpp index 40a54ad8626d..30efb8fdd5c1 100644 --- a/src/boosting/rf.hpp +++ b/src/boosting/rf.hpp @@ -32,8 +32,12 @@ class RF : public GBDT { void Init(const Config* config, const Dataset* train_data, const ObjectiveFunction* objective_function, const std::vector& training_metrics) override { - CHECK(config->bagging_freq > 0 && config->bagging_fraction < 1.0f && config->bagging_fraction > 0.0f); - CHECK(config->feature_fraction <= 1.0f && config->feature_fraction > 0.0f); + if (config->data_sample_strategy == std::string("bagging")) { + CHECK((config->bagging_freq > 0 && config->bagging_fraction < 1.0f && config->bagging_fraction > 0.0f) || + (config->feature_fraction <= 1.0f && config->feature_fraction > 0.0f)); + } else { + CHECK_EQ(config->data_sample_strategy, std::string("goss")); + } GBDT::Init(config, train_data, objective_function, training_metrics); if (num_init_iteration_ > 0) { diff --git a/tests/python_package_test/test_engine.py b/tests/python_package_test/test_engine.py index 0b51a108dbb1..f9e37634ccde 100644 --- a/tests/python_package_test/test_engine.py +++ b/tests/python_package_test/test_engine.py @@ -1198,7 +1198,7 @@ def test_feature_name_with_non_ascii(): X_train = np.random.normal(size=(100, 4)) y_train = np.random.random(100) # This has non-ascii strings. - feature_names = [u'F_零', u'F_一', u'F_二', u'F_三'] + feature_names = [u'F1', u'F2', u'F3', u'F4'] params = {'verbose': -1} lgb_train = lgb.Dataset(X_train, y_train) @@ -3607,17 +3607,17 @@ def test_sample_strategy_with_boosting(): lgb_train = lgb.Dataset(X_train, y_train) lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train) - params = { - 'boosting': 'dart', - 'data_sample_strategy': 'goss', + base_params = { 'metric': 'l2', 'verbose': -1, 'num_threads': 1, 'force_row_wise': True, 'gpu_use_dp': True, } + + params1 = {**base_params, 'boosting': 'dart', 'data_sample_strategy': 'goss'} evals_result = {} - gbm = lgb.train(params, lgb_train, + gbm = lgb.train(params1, lgb_train, num_boost_round=10, valid_sets=lgb_eval, callbacks=[lgb.record_evaluation(evals_result)]) @@ -3626,17 +3626,9 @@ def test_sample_strategy_with_boosting(): assert test_res1 == pytest.approx(3149.393862, abs=1.0) assert eval_res1 == pytest.approx(test_res1) - params = { - 'boosting': 'gbdt', - 'data_sample_strategy': 'goss', - 'metric': 'l2', - 'verbose': -1, - 'num_threads': 1, - 'force_row_wise': True, - 'gpu_use_dp': True, - } + params2 = {**base_params, 'boosting': 'gbdt', 'data_sample_strategy': 'goss'} evals_result = {} - gbm = lgb.train(params, lgb_train, + gbm = lgb.train(params2, lgb_train, num_boost_round=10, valid_sets=lgb_eval, callbacks=[lgb.record_evaluation(evals_result)]) @@ -3645,17 +3637,9 @@ def test_sample_strategy_with_boosting(): assert test_res2 == pytest.approx(2547.715968, abs=1.0) assert eval_res2 == pytest.approx(test_res2) - params = { - 'boosting': 'goss', - 'data_sample_strategy': 'goss', - 'metric': 'l2', - 'verbose': -1, - 'num_threads': 1, - 'force_row_wise': True, - 'gpu_use_dp': True, - } + params3 = {**base_params, 'boosting': 'goss', 'data_sample_strategy': 'goss'} evals_result = {} - gbm = lgb.train(params, lgb_train, + gbm = lgb.train(params3, lgb_train, num_boost_round=10, valid_sets=lgb_eval, callbacks=[lgb.record_evaluation(evals_result)]) @@ -3664,54 +3648,64 @@ def test_sample_strategy_with_boosting(): assert test_res3 == pytest.approx(2547.715968, abs=1.0) assert eval_res3 == pytest.approx(test_res3) - assert test_res1 != test_res2 - assert eval_res1 != eval_res2 - assert test_res2 == test_res3 - assert eval_res2 == eval_res3 - - params = { - 'boosting': 'dart', - 'data_sample_strategy': 'bagging', - 'bagging_freq': 1, - 'bagging_fraction': 0.5, - 'metric': 'l2', - 'verbose': -1, - 'num_threads': 1, - 'force_row_wise': True, - 'gpu_use_dp': True, - } + params4 = {**base_params, 'boosting': 'rf', 'data_sample_strategy': 'goss'} evals_result = {} - gbm = lgb.train(params, lgb_train, + gbm = lgb.train(params4, lgb_train, num_boost_round=10, valid_sets=lgb_eval, callbacks=[lgb.record_evaluation(evals_result)]) eval_res4 = evals_result['valid_0']['l2'][-1] test_res4 = mean_squared_error(y_test, gbm.predict(X_test)) - assert test_res4 == pytest.approx(3134.866931, abs=1.0) + assert test_res4 == pytest.approx(2095.538735, abs=1.0) assert eval_res4 == pytest.approx(test_res4) - params = { - 'boosting': 'gbdt', - 'data_sample_strategy': 'bagging', - 'bagging_freq': 1, - 'bagging_fraction': 0.5, - 'metric': 'l2', - 'verbose': -1, - 'num_threads': 1, - 'force_row_wise': True, - 'gpu_use_dp': True, - } + assert test_res1 != test_res2 + assert eval_res1 != eval_res2 + assert test_res2 == test_res3 + assert eval_res2 == eval_res3 + assert eval_res1 != eval_res4 + assert test_res1 != test_res4 + assert eval_res2 != eval_res4 + assert test_res2 != test_res4 + + params5 = {**base_params, 'boosting': 'dart', 'data_sample_strategy': 'bagging', 'bagging_freq': 1, 'bagging_fraction': 0.5} evals_result = {} - gbm = lgb.train(params, lgb_train, + gbm = lgb.train(params5, lgb_train, num_boost_round=10, valid_sets=lgb_eval, callbacks=[lgb.record_evaluation(evals_result)]) eval_res5 = evals_result['valid_0']['l2'][-1] test_res5 = mean_squared_error(y_test, gbm.predict(X_test)) - assert test_res5 == pytest.approx(2539.792378, abs=1.0) + assert test_res5 == pytest.approx(3134.866931, abs=1.0) assert eval_res5 == pytest.approx(test_res5) - assert test_res4 != test_res5 - assert eval_res4 != eval_res5 + + params6 = {**base_params, 'boosting': 'gbdt', 'data_sample_strategy': 'bagging', 'bagging_freq': 1, 'bagging_fraction': 0.5} + evals_result = {} + gbm = lgb.train(params6, lgb_train, + num_boost_round=10, + valid_sets=lgb_eval, + callbacks=[lgb.record_evaluation(evals_result)]) + eval_res6 = evals_result['valid_0']['l2'][-1] + test_res6 = mean_squared_error(y_test, gbm.predict(X_test)) + assert test_res6 == pytest.approx(2539.792378, abs=1.0) + assert eval_res6 == pytest.approx(test_res6) + assert test_res5 != test_res6 + assert eval_res5 != eval_res6 + + params7 = {**base_params, 'boosting': 'rf', 'data_sample_strategy': 'bagging', 'bagging_freq': 1, 'bagging_fraction': 0.5} + evals_result = {} + gbm = lgb.train(params7, lgb_train, + num_boost_round=10, + valid_sets=lgb_eval, + callbacks=[lgb.record_evaluation(evals_result)]) + eval_res7 = evals_result['valid_0']['l2'][-1] + test_res7 = mean_squared_error(y_test, gbm.predict(X_test)) + assert test_res7 == pytest.approx(1518.704481, abs=1.0) + assert eval_res7 == pytest.approx(test_res7) + assert test_res5 != test_res7 + assert eval_res5 != eval_res7 + assert test_res6 != test_res7 + assert eval_res6 != eval_res7 def test_record_evaluation_with_train(): From 47957b1bc2f0507e8e6862d1220c0bb5993fd8ca Mon Sep 17 00:00:00 2001 From: shiyu1994 Date: Wed, 14 Sep 2022 20:16:56 +0800 Subject: [PATCH 80/84] remove useless changes --- tests/python_package_test/test_engine.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/python_package_test/test_engine.py b/tests/python_package_test/test_engine.py index f9e37634ccde..e6e1faf78d97 100644 --- a/tests/python_package_test/test_engine.py +++ b/tests/python_package_test/test_engine.py @@ -1198,7 +1198,7 @@ def test_feature_name_with_non_ascii(): X_train = np.random.normal(size=(100, 4)) y_train = np.random.random(100) # This has non-ascii strings. - feature_names = [u'F1', u'F2', u'F3', u'F4'] + feature_names = [u'F_零', u'F_一', u'F_二', u'F_三'] params = {'verbose': -1} lgb_train = lgb.Dataset(X_train, y_train) From 1eb96d63cd10583cc322a2f11d49f3e998117650 Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Wed, 14 Sep 2022 14:21:32 +0000 Subject: [PATCH 81/84] allow rf with feature subsampling alone --- src/boosting/bagging.hpp | 11 ++++++----- src/boosting/rf.hpp | 10 +++++++--- 2 files changed, 13 insertions(+), 8 deletions(-) diff --git a/src/boosting/bagging.hpp b/src/boosting/bagging.hpp index 98570252af18..65a937435105 100644 --- a/src/boosting/bagging.hpp +++ b/src/boosting/bagging.hpp @@ -91,19 +91,20 @@ class BaggingSampleStrategy : public SampleStrategy { if (!is_change_dataset && config_ != nullptr && config_->bagging_fraction == config->bagging_fraction && config_->bagging_freq == config->bagging_freq && config_->pos_bagging_fraction == config->pos_bagging_fraction && config_->neg_bagging_fraction == config->neg_bagging_fraction) { + config_ = config; return; } config_ = config; if (balance_bagging_cond) { balanced_bagging_ = true; - bag_data_cnt_ = static_cast(num_pos_data * config->pos_bagging_fraction) - + static_cast((num_data_ - num_pos_data) * config->neg_bagging_fraction); + bag_data_cnt_ = static_cast(num_pos_data * config_->pos_bagging_fraction) + + static_cast((num_data_ - num_pos_data) * config_->neg_bagging_fraction); } else { - bag_data_cnt_ = static_cast(config->bagging_fraction * num_data_); + bag_data_cnt_ = static_cast(config_->bagging_fraction * num_data_); } bag_data_indices_.resize(num_data_); #ifdef USE_CUDA_EXP - if (config->device_type == std::string("cuda_exp")) { + if (config_->device_type == std::string("cuda_exp")) { cuda_bag_data_indices_.Resize(num_data_); } #endif // USE_CUDA_EXP @@ -115,7 +116,7 @@ class BaggingSampleStrategy : public SampleStrategy { } double average_bag_rate = - (static_cast(bag_data_cnt_) / num_data_) / config->bagging_freq; + (static_cast(bag_data_cnt_) / num_data_) / config_->bagging_freq; is_use_subset_ = false; if (config_->device_type != std::string("cuda_exp")) { const int group_threshold_usesubset = 100; diff --git a/src/boosting/rf.hpp b/src/boosting/rf.hpp index 30efb8fdd5c1..9a87e982483e 100644 --- a/src/boosting/rf.hpp +++ b/src/boosting/rf.hpp @@ -34,7 +34,7 @@ class RF : public GBDT { const std::vector& training_metrics) override { if (config->data_sample_strategy == std::string("bagging")) { CHECK((config->bagging_freq > 0 && config->bagging_fraction < 1.0f && config->bagging_fraction > 0.0f) || - (config->feature_fraction <= 1.0f && config->feature_fraction > 0.0f)); + (config->feature_fraction < 1.0f && config->feature_fraction > 0.0f)); } else { CHECK_EQ(config->data_sample_strategy, std::string("goss")); } @@ -59,8 +59,12 @@ class RF : public GBDT { } void ResetConfig(const Config* config) override { - CHECK(config->bagging_freq > 0 && config->bagging_fraction < 1.0f && config->bagging_fraction > 0.0f); - CHECK(config->feature_fraction <= 1.0f && config->feature_fraction > 0.0f); + if (config->data_sample_strategy == std::string("bagging")) { + CHECK((config->bagging_freq > 0 && config->bagging_fraction < 1.0f && config->bagging_fraction > 0.0f) || + (config->feature_fraction < 1.0f && config->feature_fraction > 0.0f)); + } else { + CHECK_EQ(config->data_sample_strategy, std::string("goss")); + } GBDT::ResetConfig(config); // not shrinkage rate for the RF shrinkage_rate_ = 1.0f; From 90a2b8fabba289cc8472bbf2b568cd3ff1b1b2ed Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Thu, 15 Sep 2022 10:53:57 +0000 Subject: [PATCH 82/84] change position of ResetGradientBuffers --- src/boosting/gbdt.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/boosting/gbdt.cpp b/src/boosting/gbdt.cpp index 65d730bc4740..f7ac445131dd 100644 --- a/src/boosting/gbdt.cpp +++ b/src/boosting/gbdt.cpp @@ -737,6 +737,8 @@ void GBDT::ResetTrainingData(const Dataset* train_data, const ObjectiveFunction* num_data_ = train_data_->num_data(); + ResetGradientBuffers(); + max_feature_idx_ = train_data_->num_total_features() - 1; label_idx_ = train_data_->label_idx(); feature_names_ = train_data_->feature_names(); @@ -745,7 +747,6 @@ void GBDT::ResetTrainingData(const Dataset* train_data, const ObjectiveFunction* tree_learner_->ResetTrainingData(train_data, is_constant_hessian_); data_sample_strategy_->ResetSampleConfig(config_.get(), true); - ResetGradientBuffers(); } else { tree_learner_->ResetIsConstantHessian(is_constant_hessian_); } From c3d49338fcec764d2b2c35f70be578ea3576c85a Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Thu, 1 Dec 2022 02:54:04 +0000 Subject: [PATCH 83/84] check for dask --- python-package/lightgbm/dask.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/python-package/lightgbm/dask.py b/python-package/lightgbm/dask.py index 2152ac8e35cb..85021e65d7c0 100644 --- a/python-package/lightgbm/dask.py +++ b/python-package/lightgbm/dask.py @@ -1041,6 +1041,8 @@ def _lgb_dask_fit( eval_at: Optional[Iterable[int]] = None, **kwargs: Any ) -> "_DaskLGBMModel": + if not DASK_INSTALLED: + raise LightGBMError('dask is required for lightgbm.dask') if not all((DASK_INSTALLED, PANDAS_INSTALLED, SKLEARN_INSTALLED)): raise LightGBMError('dask, pandas and scikit-learn are required for lightgbm.dask') From ced7b06cb87c9e940e31d1baff62c9604a23dc7f Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Thu, 22 Dec 2022 02:10:05 +0000 Subject: [PATCH 84/84] add parameter types for data_sample_strategy --- src/io/config_auto.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/io/config_auto.cpp b/src/io/config_auto.cpp index 5220b394edfe..b1dbcc378a27 100644 --- a/src/io/config_auto.cpp +++ b/src/io/config_auto.cpp @@ -901,6 +901,7 @@ const std::unordered_map& Config::ParameterTypes() { {"config", "string"}, {"objective", "string"}, {"boosting", "string"}, + {"data_sample_strategy", "string"}, {"data", "string"}, {"valid", "vector"}, {"num_iterations", "int"},