diff --git a/include/LightGBM/bin.h b/include/LightGBM/bin.h index ec330bd94f8c..a1f1ce8127a1 100644 --- a/include/LightGBM/bin.h +++ b/include/LightGBM/bin.h @@ -174,7 +174,7 @@ class BinMapper { * \param zero_as_missing True to use zero as missing value * \param forced_upper_bounds Vector of split points that must be used (if this has size less than max_bin, remaining splits are found by the algorithm) */ - void FindBin(double* values, int num_values, size_t total_sample_cnt, int max_bin, int min_data_in_bin, int min_split_data, bool pre_filter, BinType bin_type, + void FindBin(double* values, int64_t num_values, size_t total_sample_cnt, int max_bin, int min_data_in_bin, int min_split_data, bool pre_filter, BinType bin_type, bool use_missing, bool zero_as_missing, const std::vector& forced_upper_bounds); /*! diff --git a/include/LightGBM/c_api.h b/include/LightGBM/c_api.h index 4f89cc784da1..adc7b170deb5 100644 --- a/include/LightGBM/c_api.h +++ b/include/LightGBM/c_api.h @@ -77,7 +77,7 @@ LIGHTGBM_C_EXPORT int LGBM_RegisterLogCallback(void (*callback)(const char*)); * \param[out] out Number of samples. This value is used to pre-allocate memory to hold sample indices when calling ``LGBM_SampleIndices`` * \return 0 when succeed, -1 when failure happens */ -LIGHTGBM_C_EXPORT int LGBM_GetSampleCount(int32_t num_total_row, +LIGHTGBM_C_EXPORT int LGBM_GetSampleCount(int64_t num_total_row, const char* parameters, int* out); @@ -91,10 +91,10 @@ LIGHTGBM_C_EXPORT int LGBM_GetSampleCount(int32_t num_total_row, * \param[out] out_len Number of indices * \return 0 when succeed, -1 when failure happens */ -LIGHTGBM_C_EXPORT int LGBM_SampleIndices(int32_t num_total_row, +LIGHTGBM_C_EXPORT int LGBM_SampleIndices(int64_t num_total_row, const char* parameters, void* out, - int32_t* out_len); + int64_t* out_len); /* --- start Dataset interface */ @@ -127,9 +127,9 @@ LIGHTGBM_C_EXPORT int LGBM_DatasetCreateFromFile(const char* filename, LIGHTGBM_C_EXPORT int LGBM_DatasetCreateFromSampledColumn(double** sample_data, int** sample_indices, int32_t ncol, - const int* num_per_col, - int32_t num_sample_row, - int32_t num_local_row, + const int64_t* num_per_col, + int64_t num_sample_row, + int64_t num_local_row, int64_t num_dist_row, const char* parameters, DatasetHandle* out); @@ -175,9 +175,9 @@ LIGHTGBM_C_EXPORT int LGBM_DatasetInitStreaming(DatasetHandle dataset, LIGHTGBM_C_EXPORT int LGBM_DatasetPushRows(DatasetHandle dataset, const void* data, int data_type, - int32_t nrow, + int64_t nrow, int32_t ncol, - int32_t start_row); + int64_t start_row); /*! * \brief Push data to existing dataset. @@ -208,7 +208,7 @@ LIGHTGBM_C_EXPORT int LGBM_DatasetPushRowsWithMetadata(DatasetHandle dataset, const float* label, const float* weight, const double* init_score, - const int32_t* query, + const int64_t* query, int32_t tid); /*! @@ -228,7 +228,7 @@ LIGHTGBM_C_EXPORT int LGBM_DatasetPushRowsWithMetadata(DatasetHandle dataset, LIGHTGBM_C_EXPORT int LGBM_DatasetPushRowsByCSR(DatasetHandle dataset, const void* indptr, int indptr_type, - const int32_t* indices, + const void* indices, const void* data, int data_type, int64_t nindptr, @@ -266,7 +266,7 @@ LIGHTGBM_C_EXPORT int LGBM_DatasetPushRowsByCSRWithMetadata(DatasetHandle datase const float* label, const float* weight, const double* init_score, - const int32_t* query, + const int64_t* query, int32_t tid); /*! 
@@ -302,7 +302,7 @@ LIGHTGBM_C_EXPORT int LGBM_DatasetMarkFinished(DatasetHandle dataset); */ LIGHTGBM_C_EXPORT int LGBM_DatasetCreateFromCSR(const void* indptr, int indptr_type, - const int32_t* indices, + const void* indices, const void* data, int data_type, int64_t nindptr, @@ -347,7 +347,7 @@ LIGHTGBM_C_EXPORT int LGBM_DatasetCreateFromCSRFunc(void* get_row_funptr, */ LIGHTGBM_C_EXPORT int LGBM_DatasetCreateFromCSC(const void* col_ptr, int col_ptr_type, - const int32_t* indices, + const void* indices, const void* data, int data_type, int64_t ncol_ptr, @@ -371,7 +371,7 @@ LIGHTGBM_C_EXPORT int LGBM_DatasetCreateFromCSC(const void* col_ptr, */ LIGHTGBM_C_EXPORT int LGBM_DatasetCreateFromMat(const void* data, int data_type, - int32_t nrow, + int64_t nrow, int32_t ncol, int is_row_major, const char* parameters, @@ -394,7 +394,7 @@ LIGHTGBM_C_EXPORT int LGBM_DatasetCreateFromMat(const void* data, LIGHTGBM_C_EXPORT int LGBM_DatasetCreateFromMats(int32_t nmat, const void** data, int data_type, - int32_t* nrow, + int64_t* nrow, int32_t ncol, int is_row_major, const char* parameters, @@ -411,8 +411,8 @@ LIGHTGBM_C_EXPORT int LGBM_DatasetCreateFromMats(int32_t nmat, * \return 0 when succeed, -1 when failure happens */ LIGHTGBM_C_EXPORT int LGBM_DatasetGetSubset(const DatasetHandle handle, - const int32_t* used_row_indices, - int32_t num_used_row_indices, + const int64_t* used_row_indices, + int64_t num_used_row_indices, const char* parameters, DatasetHandle* out); @@ -487,7 +487,7 @@ LIGHTGBM_C_EXPORT int LGBM_DatasetDumpText(DatasetHandle handle, LIGHTGBM_C_EXPORT int LGBM_DatasetSetField(DatasetHandle handle, const char* field_name, const void* field_data, - int num_element, + int64_t num_element, int type); /*! @@ -501,7 +501,7 @@ LIGHTGBM_C_EXPORT int LGBM_DatasetSetField(DatasetHandle handle, */ LIGHTGBM_C_EXPORT int LGBM_DatasetGetField(DatasetHandle handle, const char* field_name, - int* out_len, + int64_t* out_len, const void** out_ptr, int* out_type); @@ -521,7 +521,7 @@ LIGHTGBM_C_EXPORT int LGBM_DatasetUpdateParamChecking(const char* old_parameters * \return 0 when succeed, -1 when failure happens */ LIGHTGBM_C_EXPORT int LGBM_DatasetGetNumData(DatasetHandle handle, - int* out); + int64_t* out); /*! * \brief Get number of features. @@ -677,7 +677,7 @@ LIGHTGBM_C_EXPORT int LGBM_BoosterUpdateOneIter(BoosterHandle handle, */ LIGHTGBM_C_EXPORT int LGBM_BoosterRefit(BoosterHandle handle, const int32_t* leaf_preds, - int32_t nrow, + int64_t nrow, int32_t ncol); /*! 
@@ -926,7 +926,7 @@ LIGHTGBM_C_EXPORT int LGBM_FastConfigFree(FastConfigHandle fastConfig); LIGHTGBM_C_EXPORT int LGBM_BoosterPredictForCSR(BoosterHandle handle, const void* indptr, int indptr_type, - const int32_t* indices, + const void* indices, const void* data, int data_type, int64_t nindptr, @@ -970,7 +970,7 @@ LIGHTGBM_C_EXPORT int LGBM_BoosterPredictForCSR(BoosterHandle handle, LIGHTGBM_C_EXPORT int LGBM_BoosterPredictSparseOutput(BoosterHandle handle, const void* indptr, int indptr_type, - const int32_t* indices, + const void* indices, const void* data, int data_type, int64_t nindptr, @@ -995,7 +995,7 @@ LIGHTGBM_C_EXPORT int LGBM_BoosterPredictSparseOutput(BoosterHandle handle, * \param data_type Type of ``data`` pointer, can be ``C_API_DTYPE_FLOAT32`` or ``C_API_DTYPE_FLOAT64`` * \return 0 when succeed, -1 when failure happens */ -LIGHTGBM_C_EXPORT int LGBM_BoosterFreePredictSparse(void* indptr, int32_t* indices, void* data, int indptr_type, int data_type); +LIGHTGBM_C_EXPORT int LGBM_BoosterFreePredictSparse(void* indptr, void* indices, void* data, int indptr_type, int data_type); /*! * \brief Make prediction for a new dataset in CSR format. This method re-uses the internal predictor structure @@ -1029,7 +1029,7 @@ LIGHTGBM_C_EXPORT int LGBM_BoosterFreePredictSparse(void* indptr, int32_t* indic LIGHTGBM_C_EXPORT int LGBM_BoosterPredictForCSRSingleRow(BoosterHandle handle, const void* indptr, int indptr_type, - const int32_t* indices, + const void* indices, const void* data, int data_type, int64_t nindptr, @@ -1104,7 +1104,7 @@ LIGHTGBM_C_EXPORT int LGBM_BoosterPredictForCSRSingleRowFastInit(BoosterHandle h LIGHTGBM_C_EXPORT int LGBM_BoosterPredictForCSRSingleRowFast(FastConfigHandle fastConfig_handle, const void* indptr, const int indptr_type, - const int32_t* indices, + const void* indices, const void* data, const int64_t nindptr, const int64_t nelem, @@ -1142,7 +1142,7 @@ LIGHTGBM_C_EXPORT int LGBM_BoosterPredictForCSRSingleRowFast(FastConfigHandle fa LIGHTGBM_C_EXPORT int LGBM_BoosterPredictForCSC(BoosterHandle handle, const void* col_ptr, int col_ptr_type, - const int32_t* indices, + const void* indices, const void* data, int data_type, int64_t ncol_ptr, @@ -1183,7 +1183,7 @@ LIGHTGBM_C_EXPORT int LGBM_BoosterPredictForCSC(BoosterHandle handle, LIGHTGBM_C_EXPORT int LGBM_BoosterPredictForMat(BoosterHandle handle, const void* data, int data_type, - int32_t nrow, + int64_t nrow, int32_t ncol, int is_row_major, int predict_type, @@ -1221,7 +1221,7 @@ LIGHTGBM_C_EXPORT int LGBM_BoosterPredictForMat(BoosterHandle handle, LIGHTGBM_C_EXPORT int LGBM_BoosterPredictForMatSingleRow(BoosterHandle handle, const void* data, int data_type, - int ncol, + int32_t ncol, int is_row_major, int predict_type, int start_iteration, @@ -1310,7 +1310,7 @@ LIGHTGBM_C_EXPORT int LGBM_BoosterPredictForMatSingleRowFast(FastConfigHandle fa LIGHTGBM_C_EXPORT int LGBM_BoosterPredictForMats(BoosterHandle handle, const void** data, int data_type, - int32_t nrow, + int64_t nrow, int32_t ncol, int predict_type, int start_iteration, diff --git a/include/LightGBM/config.h b/include/LightGBM/config.h index f4791394c43c..b1b0c7a2671c 100644 --- a/include/LightGBM/config.h +++ b/include/LightGBM/config.h @@ -984,7 +984,7 @@ struct Config { // alias = ndcg_eval_at, ndcg_at, map_eval_at, map_at // desc = used only with ``ndcg`` and ``map`` metrics // desc = `NDCG `__ and `MAP `__ evaluation positions, separated by ``,`` - std::vector eval_at; + std::vector eval_at; // check = >0 // desc = used only with 
``multi_error`` metric diff --git a/include/LightGBM/dataset.h b/include/LightGBM/dataset.h index c60aaf037c71..0e7690b47649 100644 --- a/include/LightGBM/dataset.h +++ b/include/LightGBM/dataset.h @@ -194,7 +194,7 @@ class Metadata { const float* labels, const float* weights, const double* init_scores, - const int32_t* queries); + const int64_t* queries); /*! * \brief Perform any extra operations after all data has been loaded @@ -436,7 +436,7 @@ class Dataset { const std::vector>& forced_bins, int** sample_non_zero_indices, double** sample_values, - const int* num_per_col, + const int64_t* num_per_col, int num_sample_col, size_t total_sample_cnt, const Config& io_config); @@ -600,13 +600,13 @@ class Dataset { LIGHTGBM_EXPORT bool SetDoubleField(const char* field_name, const double* field_data, data_size_t num_element); - LIGHTGBM_EXPORT bool SetIntField(const char* field_name, const int* field_data, data_size_t num_element); + LIGHTGBM_EXPORT bool SetIntField(const char* field_name, const data_size_t* field_data, data_size_t num_element); LIGHTGBM_EXPORT bool GetFloatField(const char* field_name, data_size_t* out_len, const float** out_ptr); LIGHTGBM_EXPORT bool GetDoubleField(const char* field_name, data_size_t* out_len, const double** out_ptr); - LIGHTGBM_EXPORT bool GetIntField(const char* field_name, data_size_t* out_len, const int** out_ptr); + LIGHTGBM_EXPORT bool GetIntField(const char* field_name, data_size_t* out_len, const data_size_t** out_ptr); /*! * \brief Save current dataset into binary file, will save to "filename.bin" diff --git a/include/LightGBM/dataset_loader.h b/include/LightGBM/dataset_loader.h index 8b04e8327ff0..f9d7149270de 100644 --- a/include/LightGBM/dataset_loader.h +++ b/include/LightGBM/dataset_loader.h @@ -31,7 +31,7 @@ class DatasetLoader { LIGHTGBM_EXPORT Dataset* ConstructFromSampleData(double** sample_values, int** sample_indices, int num_col, - const int* num_per_col, + const int64_t* num_per_col, size_t total_sample_size, data_size_t num_local_data, int64_t num_dist_data); @@ -45,17 +45,17 @@ class DatasetLoader { const std::unordered_set& categorical_features); private: - Dataset* LoadFromBinFile(const char* data_filename, const char* bin_filename, int rank, int num_machines, int* num_global_data, std::vector* used_data_indices); + Dataset* LoadFromBinFile(const char* data_filename, const char* bin_filename, int rank, int num_machines, data_size_t* num_global_data, std::vector* used_data_indices); void SetHeader(const char* filename); void CheckDataset(const Dataset* dataset, bool is_load_from_binary); - std::vector LoadTextDataToMemory(const char* filename, const Metadata& metadata, int rank, int num_machines, int* num_global_data, std::vector* used_data_indices); + std::vector LoadTextDataToMemory(const char* filename, const Metadata& metadata, int rank, int num_machines, data_size_t* num_global_data, std::vector* used_data_indices); std::vector SampleTextDataFromMemory(const std::vector& data); - std::vector SampleTextDataFromFile(const char* filename, const Metadata& metadata, int rank, int num_machines, int* num_global_data, std::vector* used_data_indices); + std::vector SampleTextDataFromFile(const char* filename, const Metadata& metadata, int rank, int num_machines, data_size_t* num_global_data, std::vector* used_data_indices); void ConstructBinMappersFromTextData(int rank, int num_machines, const std::vector& sample_data, const Parser* parser, Dataset* dataset); diff --git a/include/LightGBM/feature_group.h b/include/LightGBM/feature_group.h 
index 72d9fcac08dc..d31363ff947e 100644 --- a/include/LightGBM/feature_group.h +++ b/include/LightGBM/feature_group.h @@ -226,7 +226,7 @@ class FeatureGroup { } } - void ReSize(int num_data) { + void ReSize(data_size_t num_data) { if (!is_multi_val_) { bin_data_->ReSize(num_data); } else { @@ -537,7 +537,7 @@ class FeatureGroup { } private: - void CreateBinData(int num_data, bool is_multi_val, bool force_dense, bool force_sparse) { + void CreateBinData(data_size_t num_data, bool is_multi_val, bool force_dense, bool force_sparse) { if (is_multi_val) { multi_bin_data_.clear(); for (int i = 0; i < num_feature_; ++i) { diff --git a/include/LightGBM/meta.h b/include/LightGBM/meta.h index ee97090cbe0a..fb758f86581a 100644 --- a/include/LightGBM/meta.h +++ b/include/LightGBM/meta.h @@ -25,7 +25,7 @@ namespace LightGBM { /*! \brief Type of data size, it is better to use signed type*/ -typedef int32_t data_size_t; +typedef int64_t data_size_t; // Enable following macro to use double for score_t // #define SCORE_T_USE_DOUBLE diff --git a/include/LightGBM/metric.h b/include/LightGBM/metric.h index d92e5702bb18..a88e3223236f 100644 --- a/include/LightGBM/metric.h +++ b/include/LightGBM/metric.h @@ -67,7 +67,7 @@ class Metric { */ class DCGCalculator { public: - static void DefaultEvalAt(std::vector* eval_at); + static void DefaultEvalAt(std::vector* eval_at); static void DefaultLabelGain(std::vector* label_gain); /*! * \brief Initial logic diff --git a/include/LightGBM/train_share_states.h b/include/LightGBM/train_share_states.h index 5c14c7d51a47..01159d6f01f5 100644 --- a/include/LightGBM/train_share_states.h +++ b/include/LightGBM/train_share_states.h @@ -159,9 +159,9 @@ class MultiValBinWrapper { int num_bin_; int num_bin_aligned_; int n_data_block_; - int data_block_size_; - int min_block_size_; - int num_data_; + data_size_t data_block_size_; + data_size_t min_block_size_; + data_size_t num_data_; hist_t* origin_hist_data_; diff --git a/include/LightGBM/tree.h b/include/LightGBM/tree.h index 6ff0370e2ea6..e5523ea8b17e 100644 --- a/include/LightGBM/tree.h +++ b/include/LightGBM/tree.h @@ -61,7 +61,7 @@ class Tree { */ int Split(int leaf, int feature, int real_feature, uint32_t threshold_bin, double threshold_double, double left_value, double right_value, - int left_cnt, int right_cnt, double left_weight, double right_weight, + data_size_t left_cnt, data_size_t right_cnt, double left_weight, double right_weight, float gain, MissingType missing_type, bool default_left); /*! @@ -84,7 +84,7 @@ class Tree { */ int SplitCategorical(int leaf, int feature, int real_feature, const uint32_t* threshold_bin, int num_threshold_bin, const uint32_t* threshold, int num_threshold, double left_value, double right_value, - int left_cnt, int right_cnt, double left_weight, double right_weight, float gain, MissingType missing_type); + data_size_t left_cnt, data_size_t right_cnt, double left_weight, double right_weight, float gain, MissingType missing_type); /*! \brief Get the output of one leaf */ inline double LeafOutput(int leaf) const { return leaf_value_[leaf]; } @@ -412,7 +412,7 @@ class Tree { } } - inline void Split(int leaf, int feature, int real_feature, double left_value, double right_value, int left_cnt, int right_cnt, + inline void Split(int leaf, int feature, int real_feature, double left_value, double right_value, data_size_t left_cnt, data_size_t right_cnt, double left_weight, double right_weight, float gain); /*! 
* \brief Find leaf index of which record belongs by features @@ -507,13 +507,13 @@ class Tree { /*! \brief weight of leaves */ std::vector leaf_weight_; /*! \brief DataCount of leaves */ - std::vector leaf_count_; + std::vector leaf_count_; /*! \brief Output of non-leaf nodes */ std::vector internal_value_; /*! \brief weight of non-leaf nodes */ std::vector internal_weight_; /*! \brief DataCount of non-leaf nodes */ - std::vector internal_count_; + std::vector internal_count_; /*! \brief Depth for leaves */ std::vector leaf_depth_; /*! \brief whether to keep track of ancestor nodes for each leaf (only needed when feature interactions are restricted) */ @@ -539,7 +539,7 @@ class Tree { }; inline void Tree::Split(int leaf, int feature, int real_feature, - double left_value, double right_value, int left_cnt, int right_cnt, + double left_value, double right_value, data_size_t left_cnt, data_size_t right_cnt, double left_weight, double right_weight, float gain) { int new_node_idx = num_leaves_ - 1; // update parent info diff --git a/include/LightGBM/utils/common.h b/include/LightGBM/utils/common.h index 3125f6b9e9ca..85e3220af6dc 100644 --- a/include/LightGBM/utils/common.h +++ b/include/LightGBM/utils/common.h @@ -642,10 +642,10 @@ inline static std::vector Vector2Ptr(std::vector>* data) { } template -inline static std::vector VectorSize(const std::vector>& data) { - std::vector ret(data.size()); +inline static std::vector VectorSize(const std::vector>& data) { + std::vector ret(data.size()); for (size_t i = 0; i < data.size(); ++i) { - ret[i] = static_cast(data[i].size()); + ret[i] = static_cast(data[i].size()); } return ret; } @@ -728,13 +728,13 @@ static void ParallelSort(_RanIt _First, _RanIt _Last, _Pr _Pred) { // Check that all y[] are in interval [ymin, ymax] (end points included); throws error if not template -inline static void CheckElementsIntervalClosed(const T *y, T ymin, T ymax, int ny, const char *callername) { +inline static void CheckElementsIntervalClosed(const T *y, T ymin, T ymax, int64_t ny, const char *callername) { auto fatal_msg = [&y, &ymin, &ymax, &callername](int i) { std::ostringstream os; os << "[%s]: does not tolerate element [#%i = " << y[i] << "] outside [" << ymin << ", " << ymax << "]"; Log::Fatal(os.str().c_str(), callername, i); }; - for (int i = 1; i < ny; i += 2) { + for (int64_t i = 1; i < ny; i += 2) { if (y[i - 1] < y[i]) { if (y[i - 1] < ymin) { fatal_msg(i - 1); diff --git a/include/LightGBM/utils/random.h b/include/LightGBM/utils/random.h index 6f89f935b310..cbbace859066 100644 --- a/include/LightGBM/utils/random.h +++ b/include/LightGBM/utils/random.h @@ -66,26 +66,27 @@ class Random { * \param K * \return K Ordered sampled data from {0,1,...,N-1} */ - inline std::vector Sample(int N, int K) { - std::vector ret; + template + inline std::vector Sample(T N, int K) { + std::vector ret; ret.reserve(K); if (K > N || K <= 0) { return ret; } else if (K == N) { - for (int i = 0; i < N; ++i) { + for (T i = 0; i < N; ++i) { ret.push_back(i); } } else if (K > 1 && K > (N / std::log2(K))) { - for (int i = 0; i < N; ++i) { + for (T i = 0; i < N; ++i) { double prob = (K - ret.size()) / static_cast(N - i); if (NextFloat() < prob) { ret.push_back(i); } } } else { - std::set sample_set; - for (int r = N - K; r < N; ++r) { - int v = NextInt(0, r + 1); + std::set sample_set; + for (T r = N - K; r < N; ++r) { + T v = NextInt(0, r + 1); if (!sample_set.insert(v).second) { sample_set.insert(r); } diff --git a/include/LightGBM/utils/threading.h 
b/include/LightGBM/utils/threading.h index a093f87c1c8b..6b302c11d361 100644 --- a/include/LightGBM/utils/threading.h +++ b/include/LightGBM/utils/threading.h @@ -68,7 +68,7 @@ class Threading { template static inline int For( INDEX_T start, INDEX_T end, INDEX_T min_block_size, - const std::function& inner_fun) { int n_block = 1; INDEX_T num_inner = end - start; BlockInfo(num_inner, min_block_size, &n_block, &num_inner); @@ -161,11 +161,14 @@ class ParallelPartitionRunner { left_write_pos_[0] = 0; right_write_pos_[0] = 0; - for (int i = 1; i < nblock; ++i) { + for (INDEX_T i = 1; i < nblock; ++i) { left_write_pos_[i] = left_write_pos_[i - 1] + left_cnts_[i - 1]; right_write_pos_[i] = right_write_pos_[i - 1] + right_cnts_[i - 1]; } - data_size_t left_cnt = left_write_pos_[nblock - 1] + left_cnts_[nblock - 1]; + data_size_t left_cnt = 0; + if (nblock > 0) { + left_cnt = left_write_pos_[nblock - 1] + left_cnts_[nblock - 1]; + } auto right_start = out + left_cnt; #pragma omp parallel for schedule(static, 1) num_threads(num_threads_) diff --git a/python-package/lightgbm/basic.py b/python-package/lightgbm/basic.py index 40f3850978a6..031047cd4c49 100644 --- a/python-package/lightgbm/basic.py +++ b/python-package/lightgbm/basic.py @@ -40,7 +40,7 @@ def _is_zero(x: float) -> bool: def _get_sample_count(total_nrow: int, params: str) -> int: sample_cnt = ctypes.c_int(0) _safe_call(_LIB.LGBM_GetSampleCount( - ctypes.c_int32(total_nrow), + ctypes.c_int64(total_nrow), c_str(params), ctypes.byref(sample_cnt), )) @@ -466,6 +466,7 @@ def _choose_param_value(main_param_name: str, params: Dict[str, Any], default_va MAX_INT32 = (1 << 31) - 1 +MAX_INT64 = (1 << 63) - 1 """Macro definition of data type in C API of LightGBM""" C_API_DTYPE_FLOAT32 = 0 @@ -494,7 +495,7 @@ def _choose_param_value(main_param_name: str, params: Dict[str, Any], default_va FIELD_TYPE_MAPPER = {"label": C_API_DTYPE_FLOAT32, "weight": C_API_DTYPE_FLOAT32, "init_score": C_API_DTYPE_FLOAT64, - "group": C_API_DTYPE_INT32} + "group": C_API_DTYPE_INT64} """String name to int feature importance type mapper""" FEATURE_IMPORTANCE_TYPE_MAPPER = {"split": C_API_FEATURE_IMPORTANCE_SPLIT, @@ -897,9 +898,9 @@ def predict( def __get_num_preds(self, start_iteration, num_iteration, nrow, predict_type): """Get size of prediction result.""" - if nrow > MAX_INT32: + if nrow > MAX_INT64: raise LightGBMError('LightGBM cannot perform prediction for data ' - f'with number of rows greater than MAX_INT32 ({MAX_INT32}).\n' + f'with number of rows greater than MAX_INT64 ({MAX_INT64}).\n' 'You can split your data into chunks ' 'and then concatenate predictions for them') n_preds = ctypes.c_int64(0) @@ -1021,14 +1022,19 @@ def inner_predict(csr, start_iteration, num_iteration, predict_type, preds=None) ptr_indptr, type_ptr_indptr, __ = c_int_array(csr.indptr) ptr_data, type_ptr_data, _ = c_float_array(csr.data) - assert csr.shape[1] <= MAX_INT32 - csr_indices = csr.indices.astype(np.int32, copy=False) + # assert csr.shape[1] <= MAX_INT32 + if csr.indices.dtype == np.int32: + ptr_csr_indices = csr.indices.astype(np.int32, copy=False).ctypes.data_as(ctypes.POINTER(ctypes.c_int32)) + elif csr.indices.dtype == np.int64: + ptr_csr_indices = csr.indices.astype(np.int64, copy=False).ctypes.data_as(ctypes.POINTER(ctypes.c_int64)) + else: + raise TypeError(f"Expected np.int32 or np.int64, met type({csr.indices.dtype})") _safe_call(_LIB.LGBM_BoosterPredictForCSR( self.handle, ptr_indptr, ctypes.c_int(type_ptr_indptr), -
csr_indices.ctypes.data_as(ctypes.POINTER(ctypes.c_int32)), + ptr_csr_indices, ptr_data, ctypes.c_int(type_ptr_data), ctypes.c_int64(len(csr.indptr)), @@ -1063,7 +1069,7 @@ def inner_predict_sparse(csr, start_iteration, num_iteration, predict_type): self.handle, ptr_indptr, ctypes.c_int(type_ptr_indptr), - csr_indices.ctypes.data_as(ctypes.POINTER(ctypes.c_int32)), + csr_indices.ctypes.data_as(ctypes.POINTER(ctypes.c_int64)), ptr_data, ctypes.c_int(type_ptr_data), ctypes.c_int64(len(csr.indptr)), @@ -1121,7 +1127,7 @@ def inner_predict_sparse(csc, start_iteration, num_iteration, predict_type): self.handle, ptr_indptr, ctypes.c_int(type_ptr_indptr), - csc_indices.ctypes.data_as(ctypes.POINTER(ctypes.c_int32)), + csc_indices.ctypes.data_as(ctypes.POINTER(ctypes.c_int64)), ptr_data, ctypes.c_int(type_ptr_data), ctypes.c_int64(len(csc.indptr)), @@ -1153,14 +1159,19 @@ def inner_predict_sparse(csc, start_iteration, num_iteration, predict_type): ptr_indptr, type_ptr_indptr, __ = c_int_array(csc.indptr) ptr_data, type_ptr_data, _ = c_float_array(csc.data) - assert csc.shape[0] <= MAX_INT32 - csc_indices = csc.indices.astype(np.int32, copy=False) + # assert csc.shape[0] <= MAX_INT32 + if csc.indices.dtype == np.int32: + ptr_csc_indices = csc.indices.astype(np.int32, copy=False).ctypes.data_as(ctypes.POINTER(ctypes.c_int32)) + elif csc.indices.dtype == np.int64: + ptr_csc_indices = csc.indices.astype(np.int64, copy=False).ctypes.data_as(ctypes.POINTER(ctypes.c_int64)) + else: + raise TypeError(f"Expected np.int32 or np.int64, met type({csc.indices.dtype})") _safe_call(_LIB.LGBM_BoosterPredictForCSC( self.handle, ptr_indptr, ctypes.c_int(type_ptr_indptr), - csc_indices.ctypes.data_as(ctypes.POINTER(ctypes.c_int32)), + ptr_csc_indices, ptr_data, ctypes.c_int(type_ptr_data), ctypes.c_int64(len(csc.indptr)), @@ -1292,12 +1303,12 @@ def _create_sample_indices(self, total_nrow: int) -> np.ndarray: """ param_str = param_dict_to_str(self.get_params()) sample_cnt = _get_sample_count(total_nrow, param_str) - indices = np.empty(sample_cnt, dtype=np.int32) + indices = np.empty(sample_cnt, dtype=np.int64) ptr_data, _, _ = c_int_array(indices) - actual_sample_cnt = ctypes.c_int32(0) + actual_sample_cnt = ctypes.c_int64(0) _safe_call(_LIB.LGBM_SampleIndices( - ctypes.c_int32(total_nrow), + ctypes.c_int64(total_nrow), c_str(param_str), ptr_data, ctypes.byref(actual_sample_cnt), @@ -1376,7 +1387,7 @@ def _init_from_sample( sample_col_ptr[i] = c_float_array(sample_data[i])[0] indices_col_ptr[i] = c_int_array(sample_indices[i])[0] - num_per_col = np.array([len(d) for d in sample_indices], dtype=np.int32) + num_per_col = np.array([len(d) for d in sample_indices], dtype=np.int64) num_per_col_ptr, _, _ = c_int_array(num_per_col) self.handle = ctypes.c_void_p() @@ -1386,8 +1397,8 @@ def _init_from_sample( ctypes.cast(indices_col_ptr, ctypes.POINTER(ctypes.POINTER(ctypes.c_int32))), ctypes.c_int32(ncol), num_per_col_ptr, - ctypes.c_int32(sample_cnt), - ctypes.c_int32(total_nrow), + ctypes.c_int64(sample_cnt), + ctypes.c_int64(total_nrow), ctypes.c_int64(total_nrow), c_str(params_str), ctypes.byref(self.handle), @@ -1415,9 +1426,9 @@ def _push_rows(self, data: np.ndarray) -> 'Dataset': self.handle, data_ptr, data_type, - ctypes.c_int32(nrow), + ctypes.c_int64(nrow), ctypes.c_int32(ncol), - ctypes.c_int32(self._start_row), + ctypes.c_int64(self._start_row), )) self._start_row += nrow return self @@ -1708,7 +1719,7 @@ def __init_from_np2d( _safe_call(_LIB.LGBM_DatasetCreateFromMat( ptr_data, ctypes.c_int(type_ptr_data), - 
ctypes.c_int32(mat.shape[0]), + ctypes.c_int64(mat.shape[0]), ctypes.c_int32(mat.shape[1]), ctypes.c_int(C_API_IS_ROW_MAJOR), c_str(params_str), @@ -1724,7 +1735,7 @@ def __init_from_list_np2d( ) -> "Dataset": """Initialize data from a list of 2-D numpy matrices.""" ncol = mats[0].shape[1] - nrow = np.empty((len(mats),), np.int32) + nrow = np.empty((len(mats),), np.int64) if mats[0].dtype == np.float64: ptr_data = (ctypes.POINTER(ctypes.c_double) * len(mats))() else: @@ -1753,13 +1764,16 @@ def __init_from_list_np2d( ptr_data[i] = chunk_ptr_data type_ptr_data = chunk_type_ptr_data holders.append(holder) - + if mats[0].dtype == np.float64: + ptr_ptr_data = ctypes.cast(ptr_data, ctypes.POINTER(ctypes.POINTER(ctypes.c_double))) + else: + ptr_ptr_data = ctypes.cast(ptr_data, ctypes.POINTER(ctypes.POINTER(ctypes.c_float))) self.handle = ctypes.c_void_p() _safe_call(_LIB.LGBM_DatasetCreateFromMats( ctypes.c_int32(len(mats)), - ctypes.cast(ptr_data, ctypes.POINTER(ctypes.POINTER(ctypes.c_double))), + ptr_ptr_data, ctypes.c_int(type_ptr_data), - nrow.ctypes.data_as(ctypes.POINTER(ctypes.c_int32)), + nrow.ctypes.data_as(ctypes.POINTER(ctypes.c_int64)), ctypes.c_int32(ncol), ctypes.c_int(C_API_IS_ROW_MAJOR), c_str(params_str), @@ -1781,13 +1795,18 @@ def __init_from_csr( ptr_indptr, type_ptr_indptr, __ = c_int_array(csr.indptr) ptr_data, type_ptr_data, _ = c_float_array(csr.data) - assert csr.shape[1] <= MAX_INT32 - csr_indices = csr.indices.astype(np.int32, copy=False) + # assert csr.shape[1] <= MAX_INT32 + if csr.indices.dtype == np.int32: + ptr_csr_indices = csr.indices.astype(np.int32, copy=False).ctypes.data_as(ctypes.POINTER(ctypes.c_int32)) + elif csr.indices.dtype == np.int64: + ptr_csr_indices = csr.indices.astype(np.int64, copy=False).ctypes.data_as(ctypes.POINTER(ctypes.c_int64)) + else: + raise TypeError(f"Expected np.int32 or np.int64, met type({csr.indices.dtype})") _safe_call(_LIB.LGBM_DatasetCreateFromCSR( ptr_indptr, ctypes.c_int(type_ptr_indptr), - csr_indices.ctypes.data_as(ctypes.POINTER(ctypes.c_int32)), + ptr_csr_indices, ptr_data, ctypes.c_int(type_ptr_data), ctypes.c_int64(len(csr.indptr)), @@ -1812,13 +1831,18 @@ def __init_from_csc( ptr_indptr, type_ptr_indptr, __ = c_int_array(csc.indptr) ptr_data, type_ptr_data, _ = c_float_array(csc.data) - assert csc.shape[0] <= MAX_INT32 - csc_indices = csc.indices.astype(np.int32, copy=False) + # assert csc.shape[0] <= MAX_INT32 + if csc.indices.dtype == np.int32: + ptr_csc_indices = csc.indices.astype(np.int32, copy=False).ctypes.data_as(ctypes.POINTER(ctypes.c_int32)) + elif csc.indices.dtype == np.int64: + ptr_csc_indices = csc.indices.astype(np.int64, copy=False).ctypes.data_as(ctypes.POINTER(ctypes.c_int64)) + else: + raise TypeError(f"Expected np.int32 or np.int64, met type({csc.indices.dtype})") _safe_call(_LIB.LGBM_DatasetCreateFromCSC( ptr_indptr, ctypes.c_int(type_ptr_indptr), - csc_indices.ctypes.data_as(ctypes.POINTER(ctypes.c_int32)), + ptr_csc_indices, ptr_data, ctypes.c_int(type_ptr_data), ctypes.c_int64(len(csc.indptr)), @@ -1895,7 +1919,7 @@ def construct(self) -> "Dataset": feature_name=self.feature_name, params=self.params) else: # construct subset - used_indices = list_to_1d_numpy(self.used_indices, np.int32, name='used_indices') + used_indices = list_to_1d_numpy(self.used_indices, np.int64, name='used_indices') assert used_indices.flags.c_contiguous if self.reference.group is not None: - group_info = np.array(self.reference.group).astype(np.int32, copy=False) + group_info = np.array(self.reference.group).astype(np.int64, copy=False) @@ -1905,8 +1929,8 @@ def construct(self) ->
"Dataset": params_str = param_dict_to_str(self.params) _safe_call(_LIB.LGBM_DatasetGetSubset( self.reference.construct().handle, - used_indices.ctypes.data_as(ctypes.POINTER(ctypes.c_int32)), - ctypes.c_int32(used_indices.shape[0]), + used_indices.ctypes.data_as(ctypes.POINTER(ctypes.c_int64)), + ctypes.c_int64(used_indices.shape[0]), c_str(params_str), ctypes.byref(self.handle))) if not self.free_raw_data: @@ -2084,7 +2108,7 @@ def set_field( self.handle, c_str(field_name), None, - ctypes.c_int(0), + ctypes.c_int64(0), ctypes.c_int(FIELD_TYPE_MAPPER[field_name]))) return self if field_name == 'init_score': @@ -2100,13 +2124,16 @@ def set_field( 'In multiclass classification init_score can also be a list of lists, numpy 2-D array or pandas DataFrame.' ) else: - dtype = np.int32 if field_name == 'group' else np.float32 + dtype = np.int64 if field_name == 'group' else np.float32 data = list_to_1d_numpy(data, dtype, name=field_name) if data.dtype == np.float32 or data.dtype == np.float64: ptr_data, type_data, _ = c_float_array(data) elif data.dtype == np.int32: ptr_data, type_data, _ = c_int_array(data) + elif data.dtype == np.int64 and field_name == 'group': + ptr_data = data.ctypes.data_as(ctypes.POINTER(ctypes.c_int64)) + type_data= C_API_DTYPE_INT64 else: raise TypeError(f"Expected np.float32/64 or np.int32, met type({data.dtype})") if type_data != FIELD_TYPE_MAPPER[field_name]: @@ -2115,7 +2142,7 @@ def set_field( self.handle, c_str(field_name), ptr_data, - ctypes.c_int(len(data)), + ctypes.c_int64(len(data)), ctypes.c_int(type_data))) self.version += 1 return self @@ -2135,7 +2162,7 @@ def get_field(self, field_name: str) -> Optional[np.ndarray]: """ if self.handle is None: raise Exception(f"Cannot get {field_name} before construct Dataset") - tmp_out_len = ctypes.c_int(0) + tmp_out_len = ctypes.c_int64(0) out_type = ctypes.c_int(0) ret = ctypes.POINTER(ctypes.c_void_p)() _safe_call(_LIB.LGBM_DatasetGetField( @@ -2148,8 +2175,8 @@ def get_field(self, field_name: str) -> Optional[np.ndarray]: raise TypeError("Return type error for get_field") if tmp_out_len.value == 0: return None - if out_type.value == C_API_DTYPE_INT32: - arr = cint32_array_to_numpy(ctypes.cast(ret, ctypes.POINTER(ctypes.c_int32)), tmp_out_len.value) + if out_type.value == C_API_DTYPE_INT32 or (out_type.value == C_API_DTYPE_INT64 and field_name == 'group'): + arr = cint64_array_to_numpy(ctypes.cast(ret, ctypes.POINTER(ctypes.c_int64)), tmp_out_len.value) elif out_type.value == C_API_DTYPE_FLOAT32: arr = cfloat32_array_to_numpy(ctypes.cast(ret, ctypes.POINTER(ctypes.c_float)), tmp_out_len.value) elif out_type.value == C_API_DTYPE_FLOAT64: @@ -2497,7 +2524,7 @@ def num_data(self) -> int: The number of rows in the Dataset. 
""" if self.handle is not None: - ret = ctypes.c_int(0) + ret = ctypes.c_int64(0) _safe_call(_LIB.LGBM_DatasetGetNumData(self.handle, ctypes.byref(ret))) return ret.value @@ -3863,7 +3890,7 @@ def refit( _safe_call(_LIB.LGBM_BoosterRefit( new_booster.handle, ptr_data, - ctypes.c_int32(nrow), + ctypes.c_int64(nrow), ctypes.c_int32(ncol))) new_booster.network = self.network return new_booster diff --git a/src/boosting/gbdt.cpp b/src/boosting/gbdt.cpp index c2a38417091a..500adc91bed4 100644 --- a/src/boosting/gbdt.cpp +++ b/src/boosting/gbdt.cpp @@ -374,7 +374,7 @@ void GBDT::RefitTree(const std::vector>& tree_leaf_prediction) for (int tree_id = 0; tree_id < num_tree_per_iteration_; ++tree_id) { int model_index = iter * num_tree_per_iteration_ + tree_id; #pragma omp parallel for schedule(static) - for (int i = 0; i < num_data_; ++i) { + for (int64_t i = 0; i < num_data_; ++i) { leaf_pred[i] = tree_leaf_prediction[i][model_index]; CHECK_LT(leaf_pred[i], models_[model_index]->num_leaves()); } @@ -484,7 +484,7 @@ bool GBDT::TrainOneIter(const score_t* gradients, const score_t* hessians) { auto hess = hessians + offset; // need to copy gradients for bagging subset. if (is_use_subset_ && bag_data_cnt_ < num_data_ && !boosting_on_gpu_) { - for (int i = 0; i < bag_data_cnt_; ++i) { + for (int64_t i = 0; i < bag_data_cnt_; ++i) { gradients_pointer_[offset + i] = grad[bag_data_indices_[i]]; hessians_pointer_[offset + i] = hess[bag_data_indices_[i]]; } diff --git a/src/boosting/goss.hpp b/src/boosting/goss.hpp index 09c63d9728f3..1a4fff636eab 100644 --- a/src/boosting/goss.hpp +++ b/src/boosting/goss.hpp @@ -91,7 +91,7 @@ class GOSS: public GBDT { is_use_subset_ = false; if (config_->top_rate + config_->other_rate <= 0.5) { auto bag_data_cnt = static_cast((config_->top_rate + config_->other_rate) * num_data_); - bag_data_cnt = std::max(1, bag_data_cnt); + bag_data_cnt = std::max(1, bag_data_cnt); tmp_subset_.reset(new Dataset(bag_data_cnt)); tmp_subset_->CopyFeatureMapperFrom(train_data_); is_use_subset_ = true; @@ -113,7 +113,7 @@ class GOSS: public GBDT { } data_size_t top_k = static_cast(cnt * config_->top_rate); data_size_t other_k = static_cast(cnt * config_->other_rate); - top_k = std::max(1, top_k); + top_k = std::max(1, top_k); ArrayArgs::ArgMaxAtK(&tmp_gradients, 0, static_cast(tmp_gradients.size()), top_k - 1); score_t threshold = tmp_gradients[top_k - 1]; diff --git a/src/boosting/score_updater.hpp b/src/boosting/score_updater.hpp index 6e475455df7a..bd3daa90b73d 100644 --- a/src/boosting/score_updater.hpp +++ b/src/boosting/score_updater.hpp @@ -55,7 +55,7 @@ class ScoreUpdater { Common::FunctionTimer fun_timer("ScoreUpdater::AddScore", global_timer); const size_t offset = static_cast(num_data_) * cur_tree_id; #pragma omp parallel for schedule(static, 512) if (num_data_ >= 1024) - for (int i = 0; i < num_data_; ++i) { + for (int64_t i = 0; i < num_data_; ++i) { score_[offset + i] += val; } } @@ -63,7 +63,7 @@ class ScoreUpdater { virtual inline void MultiplyScore(double val, int cur_tree_id) { const size_t offset = static_cast(num_data_) * cur_tree_id; #pragma omp parallel for schedule(static, 512) if (num_data_ >= 1024) - for (int i = 0; i < num_data_; ++i) { + for (int64_t i = 0; i < num_data_; ++i) { score_[offset + i] *= val; } } diff --git a/src/c_api.cpp b/src/c_api.cpp index 5a5dc81fd5e9..24cb06150e16 100644 --- a/src/c_api.cpp +++ b/src/c_api.cpp @@ -351,10 +351,10 @@ class Booster { return boosting_->TrainOneIter(nullptr, nullptr); } - void Refit(const int32_t* leaf_preds, int32_t 
nrow, int32_t ncol) { + void Refit(const int32_t* leaf_preds, int64_t nrow, int32_t ncol) { UNIQUE_LOCK(mutex_) std::vector> v_leaf_preds(nrow, std::vector(ncol, 0)); - for (int i = 0; i < nrow; ++i) { + for (int64_t i = 0; i < nrow; ++i) { for (int j = 0; j < ncol; ++j) { v_leaf_preds[i][j] = leaf_preds[static_cast(i) * static_cast(ncol) + static_cast(j)]; } @@ -382,7 +382,7 @@ class Booster { } void PredictSingleRow(int predict_type, int ncol, - std::function>(int row_idx)> get_row_fun, + std::function>(int64_t row_idx)> get_row_fun, const Config& config, double* out_result, int64_t* out_len) const { if (!config.predict_disable_shape_check && ncol != boosting_->MaxFeatureIdx() + 1) { @@ -420,8 +420,8 @@ class Booster { config.pred_early_stop, config.pred_early_stop_freq, config.pred_early_stop_margin); } - void Predict(int start_iteration, int num_iteration, int predict_type, int nrow, int ncol, - std::function>(int row_idx)> get_row_fun, + void Predict(int start_iteration, int num_iteration, int predict_type, int64_t nrow, int ncol, + std::function>(int64_t row_idx)> get_row_fun, const Config& config, double* out_result, int64_t* out_len) const { SHARED_LOCK(mutex_); @@ -437,7 +437,7 @@ class Booster { auto pred_fun = predictor.GetPredictFunction(); OMP_INIT_EX(); #pragma omp parallel for schedule(static) - for (int i = 0; i < nrow; ++i) { + for (int64_t i = 0; i < nrow; ++i) { OMP_LOOP_EX_BEGIN(); auto one_row = get_row_fun(i); auto pred_wrt_ptr = out_result + static_cast(num_pred_in_one_row) * i; @@ -849,37 +849,37 @@ using LightGBM::ReduceScatterFunction; // some help functions used to convert data -std::function(int row_idx)> -RowFunctionFromDenseMatric(const void* data, int num_row, int num_col, int data_type, int is_row_major); +std::function(int64_t row_idx)> +RowFunctionFromDenseMatric(const void* data, int64_t num_row, int num_col, int data_type, int is_row_major); -std::function>(int row_idx)> -RowPairFunctionFromDenseMatric(const void* data, int num_row, int num_col, int data_type, int is_row_major); +std::function>(int64_t row_idx)> +RowPairFunctionFromDenseMatric(const void* data, int64_t num_row, int num_col, int data_type, int is_row_major); -std::function>(int row_idx)> +std::function>(int64_t row_idx)> RowPairFunctionFromDenseRows(const void** data, int num_col, int data_type); template std::function>(T idx)> -RowFunctionFromCSR(const void* indptr, int indptr_type, const int32_t* indices, +RowFunctionFromCSR(const void* indptr, int indptr_type, const void* indices, const void* data, int data_type, int64_t nindptr, int64_t nelem); // Row iterator of on column for CSC matrix class CSC_RowIterator { public: - CSC_RowIterator(const void* col_ptr, int col_ptr_type, const int32_t* indices, + CSC_RowIterator(const void* col_ptr, int col_ptr_type, const void* indices, const void* data, int data_type, int64_t ncol_ptr, int64_t nelem, int col_idx); ~CSC_RowIterator() {} // return value at idx, only can access by ascent order - double Get(int idx); + double Get(int64_t idx); // return next non-zero pair, if index < 0, means no more data - std::pair NextNonZero(); + std::pair NextNonZero(); private: - int nonzero_idx_ = 0; - int cur_idx_ = -1; + int64_t nonzero_idx_ = 0; + int64_t cur_idx_ = -1; double cur_val_ = 0.0f; bool is_end_ = false; - std::function(int idx)> iter_fun_; + std::function(int64_t idx)> iter_fun_; }; // start of c_api functions @@ -906,17 +906,17 @@ int LGBM_RegisterLogCallback(void (*callback)(const char*)) { API_END(); } -static inline int SampleCount(int32_t 
total_nrow, const Config& config) { +static inline int SampleCount(int64_t total_nrow, const Config& config) { return static_cast(total_nrow < config.bin_construct_sample_cnt ? total_nrow : config.bin_construct_sample_cnt); } -static inline std::vector CreateSampleIndices(int32_t total_nrow, const Config& config) { +static inline std::vector CreateSampleIndices(int64_t total_nrow, const Config& config) { Random rand(config.data_random_seed); int sample_cnt = SampleCount(total_nrow, config); - return rand.Sample(total_nrow, sample_cnt); + return rand.Sample(total_nrow, sample_cnt); } -int LGBM_GetSampleCount(int32_t num_total_row, +int LGBM_GetSampleCount(int64_t num_total_row, const char* parameters, int* out) { API_BEGIN(); @@ -931,10 +931,10 @@ int LGBM_GetSampleCount(int32_t num_total_row, API_END(); } -int LGBM_SampleIndices(int32_t num_total_row, +int LGBM_SampleIndices(int64_t num_total_row, const char* parameters, void* out, - int32_t* out_len) { + int64_t* out_len) { // This API is to keep python binding's behavior the same with C++ implementation. // Sample count, random seed etc. should be provided in parameters. API_BEGIN(); @@ -946,7 +946,7 @@ int LGBM_SampleIndices(int32_t num_total_row, config.Set(param); auto sample_indices = CreateSampleIndices(num_total_row, config); - memcpy(out, sample_indices.data(), sizeof(int32_t) * sample_indices.size()); + memcpy(out, sample_indices.data(), sizeof(int64_t) * sample_indices.size()); *out_len = static_cast(sample_indices.size()); API_END(); } @@ -977,9 +977,9 @@ int LGBM_DatasetCreateFromFile(const char* filename, int LGBM_DatasetCreateFromSampledColumn(double** sample_data, int** sample_indices, int32_t ncol, - const int* num_per_col, - int32_t num_sample_row, - int32_t num_local_row, + const int64_t* num_per_col, + int64_t num_sample_row, + int64_t num_local_row, int64_t num_dist_row, const char* parameters, DatasetHandle* out) { @@ -1030,9 +1030,9 @@ int LGBM_DatasetInitStreaming(DatasetHandle dataset, int LGBM_DatasetPushRows(DatasetHandle dataset, const void* data, int data_type, - int32_t nrow, + int64_t nrow, int32_t ncol, - int32_t start_row) { + int64_t start_row) { API_BEGIN(); auto p_dataset = reinterpret_cast(dataset); auto get_row_fun = RowFunctionFromDenseMatric(data, nrow, ncol, data_type, 1); @@ -1041,7 +1041,7 @@ int LGBM_DatasetPushRows(DatasetHandle dataset, } OMP_INIT_EX(); #pragma omp parallel for schedule(static) - for (int i = 0; i < nrow; ++i) { + for (int64_t i = 0; i < nrow; ++i) { OMP_LOOP_EX_BEGIN(); const int tid = omp_get_thread_num(); auto one_row = get_row_fun(i); @@ -1064,7 +1064,7 @@ int LGBM_DatasetPushRowsWithMetadata(DatasetHandle dataset, const float* labels, const float* weights, const double* init_scores, - const int32_t* queries, + const int64_t* queries, int32_t tid) { API_BEGIN(); #ifdef LABEL_T_USE_DOUBLE @@ -1103,7 +1103,7 @@ int LGBM_DatasetPushRowsWithMetadata(DatasetHandle dataset, int LGBM_DatasetPushRowsByCSR(DatasetHandle dataset, const void* indptr, int indptr_type, - const int32_t* indices, + const void* indices, const void* data, int data_type, int64_t nindptr, @@ -1113,13 +1113,13 @@ int LGBM_DatasetPushRowsByCSR(DatasetHandle dataset, API_BEGIN(); auto p_dataset = reinterpret_cast(dataset); auto get_row_fun = RowFunctionFromCSR(indptr, indptr_type, indices, data, data_type, nindptr, nelem); - int32_t nrow = static_cast(nindptr - 1); + int64_t nrow = static_cast(nindptr - 1); if (p_dataset->has_raw()) { p_dataset->ResizeRaw(p_dataset->num_numeric_features() + nrow); } 
OMP_INIT_EX(); #pragma omp parallel for schedule(static) - for (int i = 0; i < nrow; ++i) { + for (int64_t i = 0; i < nrow; ++i) { OMP_LOOP_EX_BEGIN(); const int tid = omp_get_thread_num(); auto one_row = get_row_fun(i); @@ -1145,7 +1145,7 @@ int LGBM_DatasetPushRowsByCSRWithMetadata(DatasetHandle dataset, const float* labels, const float* weights, const double* init_scores, - const int32_t* queries, + const int64_t* queries, int32_t tid) { API_BEGIN(); #ifdef LABEL_T_USE_DOUBLE @@ -1197,7 +1197,7 @@ int LGBM_DatasetMarkFinished(DatasetHandle dataset) { int LGBM_DatasetCreateFromMat(const void* data, int data_type, - int32_t nrow, + int64_t nrow, int32_t ncol, int is_row_major, const char* parameters, @@ -1217,7 +1217,7 @@ int LGBM_DatasetCreateFromMat(const void* data, int LGBM_DatasetCreateFromMats(int32_t nmat, const void** data, int data_type, - int32_t* nrow, + int64_t* nrow, int32_t ncol, int is_row_major, const char* parameters, @@ -1229,12 +1229,12 @@ int LGBM_DatasetCreateFromMats(int32_t nmat, config.Set(param); OMP_SET_NUM_THREADS(config.num_threads); std::unique_ptr ret; - int32_t total_nrow = 0; + int64_t total_nrow = 0; for (int j = 0; j < nmat; ++j) { total_nrow += nrow[j]; } - std::vector(int row_idx)>> get_row_fun; + std::vector(int64_t row_idx)>> get_row_fun; for (int j = 0; j < nmat; ++j) { get_row_fun.push_back(RowFunctionFromDenseMatric(data[j], nrow[j], ncol, data_type, is_row_major)); } @@ -1279,11 +1279,11 @@ int LGBM_DatasetCreateFromMats(int32_t nmat, ret->ResizeRaw(total_nrow); } } - int32_t start_row = 0; + int64_t start_row = 0; for (int j = 0; j < nmat; ++j) { OMP_INIT_EX(); #pragma omp parallel for schedule(static) - for (int i = 0; i < nrow[j]; ++i) { + for (int64_t i = 0; i < nrow[j]; ++i) { OMP_LOOP_EX_BEGIN(); const int tid = omp_get_thread_num(); auto one_row = get_row_fun[j](i); @@ -1301,7 +1301,7 @@ int LGBM_DatasetCreateFromMats(int32_t nmat, int LGBM_DatasetCreateFromCSR(const void* indptr, int indptr_type, - const int32_t* indices, + const void* indices, const void* data, int data_type, int64_t nindptr, @@ -1322,7 +1322,7 @@ int LGBM_DatasetCreateFromCSR(const void* indptr, OMP_SET_NUM_THREADS(config.num_threads); std::unique_ptr ret; auto get_row_fun = RowFunctionFromCSR(indptr, indptr_type, indices, data, data_type, nindptr, nelem); - int32_t nrow = static_cast(nindptr - 1); + int64_t nrow = static_cast(nindptr - 1); if (reference == nullptr) { // sample data first auto sample_indices = CreateSampleIndices(nrow, config); @@ -1389,7 +1389,7 @@ int LGBM_DatasetCreateFromCSRFunc(void* get_row_funptr, config.Set(param); OMP_SET_NUM_THREADS(config.num_threads); std::unique_ptr ret; - int32_t nrow = num_rows; + int64_t nrow = num_rows; if (reference == nullptr) { // sample data first auto sample_indices = CreateSampleIndices(nrow, config); @@ -1446,7 +1446,7 @@ int LGBM_DatasetCreateFromCSRFunc(void* get_row_funptr, int LGBM_DatasetCreateFromCSC(const void* col_ptr, int col_ptr_type, - const int32_t* indices, + const void* indices, const void* data, int data_type, int64_t ncol_ptr, @@ -1461,7 +1461,7 @@ int LGBM_DatasetCreateFromCSC(const void* col_ptr, config.Set(param); OMP_SET_NUM_THREADS(config.num_threads); std::unique_ptr ret; - int32_t nrow = static_cast(num_row); + int64_t nrow = static_cast(num_row); if (reference == nullptr) { // sample data first auto sample_indices = CreateSampleIndices(nrow, config); @@ -1517,7 +1517,7 @@ int LGBM_DatasetCreateFromCSC(const void* col_ptr, ret->PushOneData(tid, row_idx, group, feature_idx, sub_feature, 
pair.second); } } else { - for (int row_idx = 0; row_idx < nrow; ++row_idx) { + for (int64_t row_idx = 0; row_idx < nrow; ++row_idx) { auto val = col_it.Get(row_idx); ret->PushOneData(tid, row_idx, group, feature_idx, sub_feature, val); } @@ -1532,8 +1532,8 @@ int LGBM_DatasetCreateFromCSC(const void* col_ptr, int LGBM_DatasetGetSubset( const DatasetHandle handle, - const int32_t* used_row_indices, - int32_t num_used_row_indices, + const int64_t* used_row_indices, + int64_t num_used_row_indices, const char* parameters, DatasetHandle* out) { API_BEGIN(); @@ -1543,9 +1543,9 @@ int LGBM_DatasetGetSubset( OMP_SET_NUM_THREADS(config.num_threads); auto full_dataset = reinterpret_cast(handle); CHECK_GT(num_used_row_indices, 0); - const int32_t lower = 0; - const int32_t upper = full_dataset->num_data() - 1; - CheckElementsIntervalClosed(used_row_indices, lower, upper, num_used_row_indices, "Used indices of subset"); + const int64_t lower = 0; + const int64_t upper = full_dataset->num_data() - 1; + CheckElementsIntervalClosed(used_row_indices, lower, upper, num_used_row_indices, "Used indices of subset"); if (!std::is_sorted(used_row_indices, used_row_indices + num_used_row_indices)) { Log::Fatal("used_row_indices should be sorted in Subset"); } @@ -1620,17 +1620,17 @@ int LGBM_DatasetDumpText(DatasetHandle handle, int LGBM_DatasetSetField(DatasetHandle handle, const char* field_name, const void* field_data, - int num_element, + data_size_t num_element, int type) { API_BEGIN(); auto dataset = reinterpret_cast(handle); bool is_success = false; if (type == C_API_DTYPE_FLOAT32) { - is_success = dataset->SetFloatField(field_name, reinterpret_cast(field_data), static_cast(num_element)); - } else if (type == C_API_DTYPE_INT32) { - is_success = dataset->SetIntField(field_name, reinterpret_cast(field_data), static_cast(num_element)); + is_success = dataset->SetFloatField(field_name, reinterpret_cast(field_data), static_cast(num_element)); + } else if (type == C_API_DTYPE_INT32 || type == C_API_DTYPE_INT64) { + is_success = dataset->SetIntField(field_name, reinterpret_cast(field_data), static_cast(num_element)); } else if (type == C_API_DTYPE_FLOAT64) { - is_success = dataset->SetDoubleField(field_name, reinterpret_cast(field_data), static_cast(num_element)); + is_success = dataset->SetDoubleField(field_name, reinterpret_cast(field_data), static_cast(num_element)); } if (!is_success) { Log::Fatal("Input data type error or field not found"); } API_END(); @@ -1638,7 +1638,7 @@ int LGBM_DatasetSetField(DatasetHandle handle, int LGBM_DatasetGetField(DatasetHandle handle, const char* field_name, - int* out_len, + int64_t* out_len, const void** out_ptr, int* out_type) { API_BEGIN(); @@ -1647,8 +1647,8 @@ int LGBM_DatasetGetField(DatasetHandle handle, if (dataset->GetFloatField(field_name, out_len, reinterpret_cast(out_ptr))) { *out_type = C_API_DTYPE_FLOAT32; is_success = true; - } else if (dataset->GetIntField(field_name, out_len, reinterpret_cast(out_ptr))) { - *out_type = C_API_DTYPE_INT32; + } else if (dataset->GetIntField(field_name, out_len, reinterpret_cast(out_ptr))) { + *out_type = C_API_DTYPE_INT64; is_success = true; } else if (dataset->GetDoubleField(field_name, out_len, reinterpret_cast(out_ptr))) { *out_type = C_API_DTYPE_FLOAT64; @@ -1670,7 +1670,7 @@ int LGBM_DatasetUpdateParamChecking(const char* old_parameters, const char* new_ } int LGBM_DatasetGetNumData(DatasetHandle handle, - int* out) { + int64_t* out) { API_BEGIN(); auto dataset = reinterpret_cast(handle); *out = dataset->num_data(); @@ 
-1816,7 +1816,7 @@ int LGBM_BoosterGetLinear(BoosterHandle handle, int* out) { API_END(); } -int LGBM_BoosterRefit(BoosterHandle handle, const int32_t* leaf_preds, int32_t nrow, int32_t ncol) { +int LGBM_BoosterRefit(BoosterHandle handle, const int32_t* leaf_preds, int64_t nrow, int32_t ncol) { API_BEGIN(); Booster* ref_booster = reinterpret_cast(handle); ref_booster->Refit(leaf_preds, nrow, ncol); @@ -2023,7 +2023,7 @@ int LGBM_FastConfigFree(FastConfigHandle fastConfig) { int LGBM_BoosterPredictForCSR(BoosterHandle handle, const void* indptr, int indptr_type, - const int32_t* indices, + const void* indices, const void* data, int data_type, int64_t nindptr, @@ -2056,7 +2056,7 @@ int LGBM_BoosterPredictForCSR(BoosterHandle handle, int LGBM_BoosterPredictSparseOutput(BoosterHandle handle, const void* indptr, int indptr_type, - const int32_t* indices, + const void* indices, const void* data, int data_type, int64_t nindptr, @@ -2102,7 +2102,7 @@ int LGBM_BoosterPredictSparseOutput(BoosterHandle handle, one_row.reserve(ncol); const int tid = omp_get_thread_num(); for (int j = 0; j < ncol; ++j) { - auto val = iterators[tid][j].Get(static_cast(i)); + auto val = iterators[tid][j].Get(static_cast(i)); if (std::fabs(val) > kZeroThreshold || std::isnan(val)) { one_row.emplace_back(j, val); } @@ -2117,7 +2117,7 @@ int LGBM_BoosterPredictSparseOutput(BoosterHandle handle, API_END(); } -int LGBM_BoosterFreePredictSparse(void* indptr, int32_t* indices, void* data, int indptr_type, int data_type) { +int LGBM_BoosterFreePredictSparse(void* indptr, void* indices, void* data, int indptr_type, int data_type) { API_BEGIN(); if (indptr_type == C_API_DTYPE_INT32) { delete[] reinterpret_cast(indptr); @@ -2126,7 +2126,15 @@ int LGBM_BoosterFreePredictSparse(void* indptr, int32_t* indices, void* data, in } else { Log::Fatal("Unknown indptr type in LGBM_BoosterFreePredictSparse"); } - delete[] indices; + + // indices type is the same as indptr type + if (indptr_type == C_API_DTYPE_INT32) { + delete[] reinterpret_cast(indices); + } else if (indptr_type == C_API_DTYPE_INT64) { + delete[] reinterpret_cast(indices); + } else { + Log::Fatal("Unknown indices type in LGBM_BoosterFreePredictSparse"); + } if (data_type == C_API_DTYPE_FLOAT32) { delete[] reinterpret_cast(data); } else if (data_type == C_API_DTYPE_FLOAT64) { @@ -2140,7 +2148,7 @@ int LGBM_BoosterFreePredictSparse(void* indptr, int32_t* indices, void* data, in int LGBM_BoosterPredictForCSRSingleRow(BoosterHandle handle, const void* indptr, int indptr_type, - const int32_t* indices, + const void* indices, const void* data, int data_type, int64_t nindptr, @@ -2202,7 +2210,7 @@ int LGBM_BoosterPredictForCSRSingleRowFastInit(BoosterHandle handle, int LGBM_BoosterPredictForCSRSingleRowFast(FastConfigHandle fastConfig_handle, const void* indptr, const int indptr_type, - const int32_t* indices, + const void* indices, const void* data, const int64_t nindptr, const int64_t nelem, @@ -2220,7 +2228,7 @@ int LGBM_BoosterPredictForCSRSingleRowFast(FastConfigHandle fastConfig_handle, int LGBM_BoosterPredictForCSC(BoosterHandle handle, const void* col_ptr, int col_ptr_type, - const int32_t* indices, + const void* indices, const void* data, int data_type, int64_t ncol_ptr, @@ -2246,8 +2254,9 @@ int LGBM_BoosterPredictForCSC(BoosterHandle handle, iterators[i].emplace_back(col_ptr, col_ptr_type, indices, data, data_type, ncol_ptr, nelem, j); } } - std::function>(int row_idx)> get_row_fun = - [&iterators, ncol](int i) { + std::function>(int64_t row_idx)> get_row_fun = + 
[&iterators, ncol](int64_t i) {
+      std::vector<std::pair<int64_t, double>> one_row;
       one_row.reserve(ncol);
       const int tid = omp_get_thread_num();
@@ -2288,7 +2297,7 @@ int LGBM_BoosterValidateFeatureNames(BoosterHandle handle,
 int LGBM_BoosterPredictForMat(BoosterHandle handle,
                               const void* data,
                               int data_type,
-                              int32_t nrow,
+                              int64_t nrow,
                               int32_t ncol,
                               int is_row_major,
                               int predict_type,
@@ -2374,7 +2383,7 @@ int LGBM_BoosterPredictForMatSingleRowFast(FastConfigHandle fastConfig_handle,
 int LGBM_BoosterPredictForMats(BoosterHandle handle,
                                const void** data,
                                int data_type,
-                               int32_t nrow,
+                               int64_t nrow,
                                int32_t ncol,
                                int predict_type,
                                int start_iteration,
@@ -2528,11 +2537,11 @@ int LGBM_NetworkInitWithFunctions(int num_machines, int rank,
 template<typename T>
-std::function<std::vector<double>(int row_idx)>
-RowFunctionFromDenseMatric_helper(const void* data, int num_row, int num_col, int is_row_major) {
+std::function<std::vector<double>(int64_t row_idx)>
+RowFunctionFromDenseMatric_helper(const void* data, int64_t num_row, int num_col, int is_row_major) {
   const T* data_ptr = reinterpret_cast<const T*>(data);
   if (is_row_major) {
-    return [=] (int row_idx) {
+    return [=] (int64_t row_idx) {
       std::vector<double> ret(num_col);
       auto tmp_ptr = data_ptr + static_cast<size_t>(num_col) * row_idx;
       for (int i = 0; i < num_col; ++i) {
@@ -2541,7 +2550,7 @@ RowFunctionFromDenseMatric_helper(const void* data, int num_row, int num_col, in
       return ret;
     };
   } else {
-    return [=] (int row_idx) {
+    return [=] (int64_t row_idx) {
       std::vector<double> ret(num_col);
       for (int i = 0; i < num_col; ++i) {
         ret[i] = static_cast<double>(*(data_ptr + static_cast<size_t>(num_row) * i + row_idx));
@@ -2550,9 +2559,8 @@ RowFunctionFromDenseMatric_helper(const void* data, int num_row, int num_col, in
       };
   }
 }
-
-std::function<std::vector<double>(int row_idx)>
-RowFunctionFromDenseMatric(const void* data, int num_row, int num_col, int data_type, int is_row_major) {
+std::function<std::vector<double>(int64_t row_idx)>
+RowFunctionFromDenseMatric(const void* data, int64_t num_row, int num_col, int data_type, int is_row_major) {
   if (data_type == C_API_DTYPE_FLOAT32) {
     return RowFunctionFromDenseMatric_helper<float>(data, num_row, num_col, is_row_major);
   } else if (data_type == C_API_DTYPE_FLOAT64) {
@@ -2562,11 +2570,11 @@ RowFunctionFromDenseMatric(const void* data, int num_row, int num_col, int data_
   return nullptr;
 }
-std::function<std::vector<std::pair<int, double>>(int row_idx)>
-RowPairFunctionFromDenseMatric(const void* data, int num_row, int num_col, int data_type, int is_row_major) {
+std::function<std::vector<std::pair<int, double>>(int64_t row_idx)>
+RowPairFunctionFromDenseMatric(const void* data, int64_t num_row, int num_col, int data_type, int is_row_major) {
   auto inner_function = RowFunctionFromDenseMatric(data, num_row, num_col, data_type, is_row_major);
   if (inner_function != nullptr) {
-    return [inner_function] (int row_idx) {
+    return [inner_function] (int64_t row_idx) {
       auto raw_values = inner_function(row_idx);
       std::vector<std::pair<int, double>> ret;
       ret.reserve(raw_values.size());
@@ -2582,9 +2590,9 @@ RowPairFunctionFromDenseMatric(const void* data, int num_row, int num_col, int d
 }
 // data is array of pointers to individual rows
-std::function<std::vector<std::pair<int, double>>(int row_idx)>
+std::function<std::vector<std::pair<int, double>>(int64_t row_idx)>
 RowPairFunctionFromDenseRows(const void** data, int num_col, int data_type) {
-  return [=](int row_idx) {
+  return [=](int64_t row_idx) {
     auto inner_function = RowFunctionFromDenseMatric(data[row_idx], 1, num_col, data_type, /* is_row_major */ true);
     auto raw_values = inner_function(0);
     std::vector<std::pair<int, double>> ret;
@@ -2600,9 +2608,11 @@ RowPairFunctionFromDenseRows(const void** data, int num_col, int data_type) {
 template<typename T, typename T1, typename T2>
 std::function<std::vector<std::pair<int, double>>(T idx)>
-RowFunctionFromCSR_helper(const void* indptr, const int32_t* indices, const void* data) {
+RowFunctionFromCSR_helper(const void* indptr, const void* tmp_indices, const void* data) {
+
   const T1* data_ptr = reinterpret_cast<const T1*>(data);
   const T2* ptr_indptr = reinterpret_cast<const T2*>(indptr);
+  const T2* indices = reinterpret_cast<const T2*>(tmp_indices);
   return [=] (T idx) {
     std::vector<std::pair<int, double>> ret;
     int64_t start = ptr_indptr[idx];
@@ -2619,7 +2629,8 @@ RowFunctionFromCSR_helper(const void* indptr, const int32_t* indices, const void
 template<typename T>
 std::function<std::vector<std::pair<int, double>>(T idx)>
-RowFunctionFromCSR(const void* indptr, int indptr_type, const int32_t* indices, const void* data, int data_type, int64_t , int64_t ) {
+RowFunctionFromCSR(const void* indptr, int indptr_type, const void* indices, const void* data, int data_type, int64_t , int64_t ) {
+
   if (data_type == C_API_DTYPE_FLOAT32) {
     if (indptr_type == C_API_DTYPE_INT32) {
       return RowFunctionFromCSR_helper<T, float, int32_t>(indptr, indices, data);
@@ -2640,48 +2651,50 @@ RowFunctionFromCSR(const void* indptr, int indptr_type, const int32_t* indices,
 template<typename T1, typename T2>
-std::function<std::pair<int, double>(int idx)> IterateFunctionFromCSC_helper(const void* col_ptr, const int32_t* indices, const void* data, int col_idx) {
+std::function<std::pair<T2, double>(T2 idx)> IterateFunctionFromCSC_helper(const void* col_ptr, const void* tmp_indices, const void* data, int col_idx) {
   const T1* data_ptr = reinterpret_cast<const T1*>(data);
   const T2* ptr_col_ptr = reinterpret_cast<const T2*>(col_ptr);
+  const T2* indices = reinterpret_cast<const T2*>(tmp_indices);
   int64_t start = ptr_col_ptr[col_idx];
   int64_t end = ptr_col_ptr[col_idx + 1];
   return [=] (int offset) {
     int64_t i = static_cast<int64_t>(start + offset);
     if (i >= end) {
-      return std::make_pair(-1, 0.0);
+      return std::make_pair(static_cast<T2>(-1), 0.0);
     }
-    int idx = static_cast<int>(indices[i]);
+    T2 idx = static_cast<T2>(indices[i]);
+
     double val = static_cast<double>(data_ptr[i]);
     return std::make_pair(idx, val);
   };
 }
-std::function<std::pair<int, double>(int idx)>
-IterateFunctionFromCSC(const void* col_ptr, int col_ptr_type, const int32_t* indices, const void* data, int data_type, int64_t ncol_ptr, int64_t , int col_idx) {
+template<typename T2>
+std::function<std::pair<T2, double>(int64_t idx)>
+IterateFunctionFromCSC(const void* col_ptr, int, const void* indices, const void* data, int data_type, int64_t ncol_ptr, int64_t , int col_idx) {
+
   CHECK(col_idx < ncol_ptr && col_idx >= 0);
   if (data_type == C_API_DTYPE_FLOAT32) {
-    if (col_ptr_type == C_API_DTYPE_INT32) {
-      return IterateFunctionFromCSC_helper<float, int32_t>(col_ptr, indices, data, col_idx);
-    } else if (col_ptr_type == C_API_DTYPE_INT64) {
-      return IterateFunctionFromCSC_helper<float, int64_t>(col_ptr, indices, data, col_idx);
-    }
+    return IterateFunctionFromCSC_helper<float, T2>(col_ptr, indices, data, col_idx);
   } else if (data_type == C_API_DTYPE_FLOAT64) {
-    if (col_ptr_type == C_API_DTYPE_INT32) {
-      return IterateFunctionFromCSC_helper<double, int32_t>(col_ptr, indices, data, col_idx);
-    } else if (col_ptr_type == C_API_DTYPE_INT64) {
-      return IterateFunctionFromCSC_helper<double, int64_t>(col_ptr, indices, data, col_idx);
-    }
+    return IterateFunctionFromCSC_helper<double, T2>(col_ptr, indices, data, col_idx);
   }
   Log::Fatal("Unknown data type in CSC matrix");
   return nullptr;
 }
-CSC_RowIterator::CSC_RowIterator(const void* col_ptr, int col_ptr_type, const int32_t* indices,
+
+CSC_RowIterator::CSC_RowIterator(const void* col_ptr, int col_ptr_type, const void* indices,
                                  const void* data, int data_type, int64_t ncol_ptr, int64_t nelem, int col_idx) {
-  iter_fun_ = IterateFunctionFromCSC(col_ptr, col_ptr_type, indices, data, data_type, ncol_ptr, nelem, col_idx);
+  if (col_ptr_type == C_API_DTYPE_INT32) {
+    iter_fun_ = IterateFunctionFromCSC<int32_t>(col_ptr, col_ptr_type, indices, data, data_type, ncol_ptr, nelem, col_idx);
+  } else if (col_ptr_type == C_API_DTYPE_INT64) {
+    iter_fun_ = IterateFunctionFromCSC<int64_t>(col_ptr, col_ptr_type, indices, data, data_type, ncol_ptr, nelem, col_idx);
+  }
+}
-double CSC_RowIterator::Get(int idx) {
+double CSC_RowIterator::Get(int64_t idx) {
   while (idx > cur_idx_ && !is_end_) {
     auto ret = iter_fun_(nonzero_idx_);
     if (ret.first < 0) {
@@ -2699,7 +2712,7 @@ double CSC_RowIterator::Get(int64_t idx) {
   }
 }
-std::pair<int, double> CSC_RowIterator::NextNonZero() {
+std::pair<int64_t, double> CSC_RowIterator::NextNonZero() {
   if (!is_end_) {
     auto ret = iter_fun_(nonzero_idx_);
     ++nonzero_idx_;
@@ -2708,6 +2721,6 @@ std::pair<int64_t, double> CSC_RowIterator::NextNonZero() {
     return ret;
   } else {
-    return std::make_pair(-1, 0.0);
+    return std::make_pair(static_cast<int64_t>(-1), 0.0);
   }
 }
diff --git a/src/io/bin.cpp b/src/io/bin.cpp
index 2acdd083b9df..ca4d6ef4f6f6 100644
--- a/src/io/bin.cpp
+++ b/src/io/bin.cpp
@@ -51,7 +51,7 @@ namespace LightGBM {
   BinMapper::~BinMapper() {
   }
-  bool NeedFilter(const std::vector<int>& cnt_in_bin, int total_cnt, int filter_cnt, BinType bin_type) {
+  bool NeedFilter(const std::vector<int>& cnt_in_bin, data_size_t total_cnt, int filter_cnt, BinType bin_type) {
     if (bin_type == BinType::NumericalBin) {
       int sum_left = 0;
       for (size_t i = 0; i < cnt_in_bin.size() - 1; ++i) {
@@ -322,7 +322,7 @@ namespace LightGBM {
     }
   }
-  void BinMapper::FindBin(double* values, int num_sample_values, size_t total_sample_cnt,
+  void BinMapper::FindBin(double* values, int64_t num_sample_values, size_t total_sample_cnt,
                           int max_bin, int min_data_in_bin, int min_split_data, bool pre_filter, BinType bin_type,
                           bool use_missing, bool zero_as_missing, const std::vector<double>& forced_upper_bounds) {
@@ -367,7 +367,7 @@ namespace LightGBM {
       counts.push_back(1);
     }
-    for (int i = 1; i < num_sample_values; ++i) {
+    for (int64_t i = 1; i < num_sample_values; ++i) {
       if (!Common::CheckDoubleEqualOrdered(values[i - 1], values[i])) {
         if (values[i - 1] < 0.0f && values[i] > 0.0f) {
           distinct_values.push_back(0.0f);
@@ -497,7 +497,7 @@ namespace LightGBM {
       is_trivial_ = false;
     }
     // check useless bin
-    if (!is_trivial_ && pre_filter && NeedFilter(cnt_in_bin, static_cast<int>(total_sample_cnt), min_split_data, bin_type_)) {
+    if (!is_trivial_ && pre_filter && NeedFilter(cnt_in_bin, static_cast<data_size_t>(total_sample_cnt), min_split_data, bin_type_)) {
       is_trivial_ = true;
     }
diff --git a/src/io/config_auto.cpp b/src/io/config_auto.cpp
index 6c2e3cabad00..abe521d14a15 100644
--- a/src/io/config_auto.cpp
+++ b/src/io/config_auto.cpp
@@ -612,7 +612,7 @@ void Config::GetMembersFromString(const std::unordered_map<std::string, std::string>& params) {
-    eval_at = Common::StringToArray<int>(tmp_str, ',');
+    eval_at = Common::StringToArray<data_size_t>(tmp_str, ',');
   }
   GetInt(params, "multi_error_top_k", &multi_error_top_k);
diff --git a/src/io/dataset.cpp b/src/io/dataset.cpp
index 2842551cf2ee..f652f9efc49a 100644
--- a/src/io/dataset.cpp
+++ b/src/io/dataset.cpp
@@ -52,10 +52,10 @@ std::vector<std::vector<int>> OneFeaturePerGroup(const std::vector<int>& used_fe
   return features_in_group;
 }
-int GetConflictCount(const std::vector<bool>& mark, const int* indices,
-                     int num_indices, data_size_t max_cnt) {
-  int ret = 0;
-  for (int i = 0; i < num_indices; ++i) {
+int64_t GetConflictCount(const std::vector<bool>& mark, const int* indices,
+                         int64_t num_indices, data_size_t max_cnt) {
+  int64_t ret = 0;
+  for (int64_t i = 0; i < num_indices; ++i) {
     if (mark[indices[i]]) {
       ++ret;
     }
@@ -69,13 +69,13 @@ int GetConflictCount(const std::vector<bool>& mark, const int* indices,
 void MarkUsed(std::vector<bool>* mark, const int* indices,
               data_size_t num_indices) {
   auto& ref_mark = *mark;
-  for (int i = 0; i < num_indices; ++i) 
{ + for (int64_t i = 0; i < num_indices; ++i) { ref_mark[indices[i]] = true; } } std::vector FixSampleIndices(const BinMapper* bin_mapper, - int num_total_samples, int num_indices, + int num_total_samples, int64_t num_indices, const int* sample_indices, const double* sample_values) { std::vector ret; @@ -102,7 +102,7 @@ std::vector FixSampleIndices(const BinMapper* bin_mapper, std::vector> FindGroups( const std::vector>& bin_mappers, const std::vector& find_order, int** sample_indices, - const int* num_per_col, int num_sample_col, data_size_t total_sample_cnt, + const int64_t* num_per_col, int num_sample_col, data_size_t total_sample_cnt, data_size_t num_data, bool is_use_gpu, bool is_sparse, std::vector* multi_val_group) { const int max_search_group = 100; @@ -137,7 +137,7 @@ std::vector> FindGroups( std::vector search_groups; if (!available_groups.empty()) { int last = static_cast(available_groups.size()) - 1; - auto indices = rand.Sample(last, std::min(last, max_search_group - 1)); + auto indices = rand.Sample(last, std::min(last, max_search_group - 1)); // always push the last group search_groups.push_back(available_groups.back()); for (auto idx : indices) { @@ -240,7 +240,7 @@ std::vector> FindGroups( std::vector> FastFeatureBundling( const std::vector>& bin_mappers, - int** sample_indices, double** sample_values, const int* num_per_col, + int** sample_indices, double** sample_values, const int64_t* num_per_col, int num_sample_col, data_size_t total_sample_cnt, const std::vector& used_features, data_size_t num_data, bool is_use_gpu, bool is_sparse, std::vector* multi_val_group) { @@ -274,7 +274,7 @@ std::vector> FastFeatureBundling( } std::vector> tmp_indices; - std::vector tmp_num_per_col(num_sample_col, 0); + std::vector tmp_num_per_col(num_sample_col, 0); for (auto fidx : used_features) { if (fidx >= num_sample_col) { continue; @@ -322,7 +322,7 @@ void Dataset::Construct(std::vector>* bin_mappers, const std::vector>& forced_bins, int** sample_non_zero_indices, double** sample_values, - const int* num_per_col, + const int64_t* num_per_col, int num_sample_col, size_t total_sample_cnt, const Config& io_config) { @@ -907,7 +907,7 @@ bool Dataset::SetDoubleField(const char* field_name, const double* field_data, return true; } -bool Dataset::SetIntField(const char* field_name, const int* field_data, +bool Dataset::SetIntField(const char* field_name, const data_size_t* field_data, data_size_t num_element) { std::string name(field_name); name = Common::Trim(name); @@ -957,7 +957,7 @@ bool Dataset::GetDoubleField(const char* field_name, data_size_t* out_len, } bool Dataset::GetIntField(const char* field_name, data_size_t* out_len, - const int** out_ptr) { + const data_size_t** out_ptr) { std::string name(field_name); name = Common::Trim(name); if (name == std::string("query") || name == std::string("group")) { @@ -1109,7 +1109,7 @@ void Dataset::DumpTextFile(const char* text_filename) { fprintf(file, "num_features: %d\n", num_features_); fprintf(file, "num_total_features: %d\n", num_total_features_); fprintf(file, "num_groups: %d\n", num_groups_); - fprintf(file, "num_data: %d\n", num_data_); + fprintf(file, "num_data: %ld\n", num_data_); fprintf(file, "feature_names: "); for (auto n : feature_names_) { fprintf(file, "%s, ", n.c_str()); diff --git a/src/io/dataset_loader.cpp b/src/io/dataset_loader.cpp index 246424600b03..461d2f6b4d7c 100644 --- a/src/io/dataset_loader.cpp +++ b/src/io/dataset_loader.cpp @@ -354,7 +354,7 @@ Dataset* DatasetLoader::LoadFromFileAlignWithOtherDataset(const 
char* filename,
 }
 Dataset* DatasetLoader::LoadFromBinFile(const char* data_filename, const char* bin_filename,
-                                        int rank, int num_machines, int* num_global_data,
+                                        int rank, int num_machines, data_size_t* num_global_data,
                                         std::vector<data_size_t>* used_data_indices) {
   auto dataset = std::unique_ptr<Dataset>(new Dataset());
   auto reader = VirtualFileReader::Make(bin_filename);
@@ -660,7 +660,7 @@ Dataset* DatasetLoader::LoadFromBinFile(const char* data_filename, const char* b
 Dataset* DatasetLoader::ConstructFromSampleData(double** sample_values,
                                                 int** sample_indices,
                                                 int num_col,
-                                                const int* num_per_col,
+                                                const int64_t* num_per_col,
                                                 size_t total_sample_size,
                                                 data_size_t num_local_data,
                                                 int64_t num_dist_data) {
@@ -900,7 +900,7 @@ void DatasetLoader::CheckDataset(const Dataset* dataset, bool is_load_from_binar
 }
 std::vector<std::string> DatasetLoader::LoadTextDataToMemory(const char* filename, const Metadata& metadata,
-                                                             int rank, int num_machines, int* num_global_data,
+                                                             int rank, int num_machines, data_size_t* num_global_data,
                                                              std::vector<data_size_t>* used_data_indices) {
   TextReader<data_size_t> text_reader(filename, config_.header, config_.file_load_progress_interval_bytes);
   used_data_indices->clear();
@@ -952,7 +952,7 @@ std::vector<std::string> DatasetLoader::SampleTextDataFromMemory(const std::vect
   if (static_cast<size_t>(sample_cnt) > data.size()) {
     sample_cnt = static_cast<data_size_t>(data.size());
   }
-  auto sample_indices = random_.Sample(static_cast<int>(data.size()), sample_cnt);
+  auto sample_indices = random_.Sample(static_cast<data_size_t>(data.size()), sample_cnt);
   std::vector<std::string> out(sample_indices.size());
   for (size_t i = 0; i < sample_indices.size(); ++i) {
     const size_t idx = sample_indices[i];
@@ -962,7 +962,7 @@ std::vector<std::string> DatasetLoader::SampleTextDataFromFile(const char* filename, const Metadata& metadata,
-                                                               int rank, int num_machines, int* num_global_data,
+                                                               int rank, int num_machines, data_size_t* num_global_data,
                                                                std::vector<data_size_t>* used_data_indices) {
   const data_size_t sample_cnt = static_cast<data_size_t>(config_.bin_construct_sample_cnt);
   TextReader<data_size_t> text_reader(filename, config_.header, config_.file_load_progress_interval_bytes);
diff --git a/src/io/metadata.cpp b/src/io/metadata.cpp
index 053d1b43c104..6db27ee6d68a 100644
--- a/src/io/metadata.cpp
+++ b/src/io/metadata.cpp
@@ -615,7 +615,7 @@ void Metadata::InsertAt(data_size_t start_index,
                         const float* labels,
                         const float* weights,
                         const double* init_scores,
-                        const int32_t* queries) {
+                        const int64_t* queries) {
   if (num_data_ < count + start_index) {
     Log::Fatal("Length of metadata is too long to append #data");
   }
diff --git a/src/io/multi_val_dense_bin.hpp b/src/io/multi_val_dense_bin.hpp
index 8de9cf305952..625b524447bc 100644
--- a/src/io/multi_val_dense_bin.hpp
+++ b/src/io/multi_val_dense_bin.hpp
@@ -155,10 +155,10 @@ class MultiValDenseBin : public MultiValBin {
     Threading::BlockInfo<data_size_t>(num_data_, 1024, &n_block, &block_size);
 #pragma omp parallel for schedule(static, 1)
-    for (int tid = 0; tid < n_block; ++tid) {
+    for (data_size_t tid = 0; tid < n_block; ++tid) {
       data_size_t start = tid * block_size;
       data_size_t end = std::min(num_data_, start + block_size);
       for (data_size_t i = start; i < end; ++i) {
         const auto j_start = RowPtr(i);
         const auto other_j_start = SUBROW ? 
other_bin->RowPtr(used_indices[i]) : other_bin->RowPtr(i); diff --git a/src/io/multi_val_sparse_bin.hpp b/src/io/multi_val_sparse_bin.hpp index 80acbb681ab6..ab665a2cdee7 100644 --- a/src/io/multi_val_sparse_bin.hpp +++ b/src/io/multi_val_sparse_bin.hpp @@ -222,7 +222,7 @@ class MultiValSparseBin : public MultiValBin { } int n_block = 1; data_size_t block_size = num_data_; - Threading::BlockInfo(static_cast(t_data_.size() + 1), + Threading::BlockInfo(static_cast(t_data_.size() + 1), num_data_, 1024, &n_block, &block_size); std::vector sizes(t_data_.size() + 1, 0); const int pre_alloc_size = 50; diff --git a/src/io/train_share_states.cpp b/src/io/train_share_states.cpp index 199424733f80..cd481e2449ec 100644 --- a/src/io/train_share_states.cpp +++ b/src/io/train_share_states.cpp @@ -62,7 +62,7 @@ void MultiValBinWrapper::HistMove(const std::vector>* hist_buf) { int n_bin_block = 1; - int bin_block_size = num_bin_; + data_size_t bin_block_size = num_bin_; Threading::BlockInfo(num_threads_, num_bin_, 512, &n_bin_block, &bin_block_size); hist_t* dst = origin_hist_data_; @@ -71,9 +71,9 @@ void MultiValBinWrapper::HistMerge(std::vector(start + bin_block_size, num_bin_); + for (data_size_t tid = 1; tid < n_data_block_; ++tid) { auto src_ptr = hist_buf->data() + static_cast(num_bin_aligned_) * 2 * (tid - 1); for (int i = start * 2; i < end * 2; ++i) { dst[i] += src_ptr[i]; diff --git a/src/io/tree.cpp b/src/io/tree.cpp index 39b5c23d4d1c..a75568d4283a 100644 --- a/src/io/tree.cpp +++ b/src/io/tree.cpp @@ -60,7 +60,7 @@ Tree::Tree(int max_leaves, bool track_branch_features, bool is_linear) int Tree::Split(int leaf, int feature, int real_feature, uint32_t threshold_bin, double threshold_double, double left_value, double right_value, - int left_cnt, int right_cnt, double left_weight, double right_weight, float gain, + data_size_t left_cnt, data_size_t right_cnt, double left_weight, double right_weight, float gain, MissingType missing_type, bool default_left) { Split(leaf, feature, real_feature, left_value, right_value, left_cnt, right_cnt, left_weight, right_weight, gain); int new_node_idx = num_leaves_ - 1; @@ -770,7 +770,7 @@ Tree::Tree(const char* str, size_t* used_len) { } if (key_vals.count("internal_count")) { - internal_count_ = CommonC::StringToArrayFast(key_vals["internal_count"], num_leaves_ - 1); + internal_count_ = CommonC::StringToArrayFast(key_vals["internal_count"], num_leaves_ - 1); } else { internal_count_.resize(num_leaves_ - 1); } @@ -794,7 +794,7 @@ Tree::Tree(const char* str, size_t* used_len) { } if (key_vals.count("leaf_count")) { - leaf_count_ = CommonC::StringToArrayFast(key_vals["leaf_count"], num_leaves_); + leaf_count_ = CommonC::StringToArrayFast(key_vals["leaf_count"], num_leaves_); } else { leaf_count_.resize(num_leaves_); } diff --git a/src/metric/dcg_calculator.cpp b/src/metric/dcg_calculator.cpp index 316fdf0a6ddf..101e482567af 100644 --- a/src/metric/dcg_calculator.cpp +++ b/src/metric/dcg_calculator.cpp @@ -17,7 +17,7 @@ std::vector DCGCalculator::discount_; const data_size_t DCGCalculator::kMaxPosition = 10000; -void DCGCalculator::DefaultEvalAt(std::vector* eval_at) { +void DCGCalculator::DefaultEvalAt(std::vector* eval_at) { auto& ref_eval_at = *eval_at; if (ref_eval_at.empty()) { for (int i = 1; i <= 5; ++i) { diff --git a/src/metric/map_metric.hpp b/src/metric/map_metric.hpp index 18539ee44ee0..ce0f11441219 100644 --- a/src/metric/map_metric.hpp +++ b/src/metric/map_metric.hpp @@ -71,7 +71,7 @@ class MapMetric:public Metric { return 1.0f; } - void 
CalMapAtK(std::vector ks, data_size_t npos, const label_t* label, + void CalMapAtK(std::vector ks, data_size_t npos, const label_t* label, const double* score, data_size_t num_data, std::vector* out) const { // get sorted indices by score std::vector sorted_idx; diff --git a/src/treelearner/col_sampler.hpp b/src/treelearner/col_sampler.hpp index 6debe9db60ca..6e09d062bfc3 100644 --- a/src/treelearner/col_sampler.hpp +++ b/src/treelearner/col_sampler.hpp @@ -75,7 +75,7 @@ class ColSampler { if (need_reset_bytree_) { std::memset(is_feature_used_.data(), 0, sizeof(int8_t) * is_feature_used_.size()); - used_feature_indices_ = random_.Sample( + used_feature_indices_ = random_.Sample( static_cast(valid_feature_indices_.size()), used_cnt_bytree_); int omp_loop_size = static_cast(used_feature_indices_.size()); @@ -139,7 +139,7 @@ class ColSampler { used_feature_cnt = std::min(used_feature_cnt, static_cast(filtered_feature_indices.size())); allowed_used_feature_indices = &filtered_feature_indices; } - auto sampled_indices = random_.Sample( + auto sampled_indices = random_.Sample( static_cast((*allowed_used_feature_indices).size()), used_feature_cnt); int omp_loop_size = static_cast(sampled_indices.size()); #pragma omp parallel for schedule(static, 512) if (omp_loop_size >= 1024) @@ -165,7 +165,7 @@ class ColSampler { allowed_valid_feature_indices = &filtered_feature_indices; used_feature_cnt = std::min(used_feature_cnt, static_cast(filtered_feature_indices.size())); } - auto sampled_indices = random_.Sample( + auto sampled_indices = random_.Sample( static_cast((*allowed_valid_feature_indices).size()), used_feature_cnt); int omp_loop_size = static_cast(sampled_indices.size()); #pragma omp parallel for schedule(static, 512) if (omp_loop_size >= 1024) diff --git a/src/treelearner/cost_effective_gradient_boosting.hpp b/src/treelearner/cost_effective_gradient_boosting.hpp index 4c29deb82de4..e8f8722860f4 100644 --- a/src/treelearner/cost_effective_gradient_boosting.hpp +++ b/src/treelearner/cost_effective_gradient_boosting.hpp @@ -77,7 +77,7 @@ class CostEfficientGradientBoosting { } double DeltaGain(int feature_index, int real_fidx, int leaf_index, - int num_data_in_leaf, SplitInfo split_info) { + data_size_t num_data_in_leaf, SplitInfo split_info) { auto config = tree_learner_->config_; double delta = config->cegb_tradeoff * config->cegb_penalty_split * num_data_in_leaf; diff --git a/src/treelearner/data_partition.hpp b/src/treelearner/data_partition.hpp index 7a6ac031e62d..16918f7c7fd8 100644 --- a/src/treelearner/data_partition.hpp +++ b/src/treelearner/data_partition.hpp @@ -34,7 +34,7 @@ class DataPartition { leaf_count_.resize(num_leaves_); } - void ResetNumData(int num_data) { + void ResetNumData(data_size_t num_data) { num_data_ = num_data; indices_.resize(num_data_); runner_.ReSize(num_data_); @@ -100,7 +100,7 @@ class DataPartition { */ void Split(int leaf, const Dataset* dataset, int feature, const uint32_t* threshold, int num_threshold, bool default_left, - int right_leaf) { + data_size_t right_leaf) { Common::FunctionTimer fun_timer("DataPartition::Split", global_timer); // get leaf boundary const data_size_t begin = leaf_begin_[leaf]; diff --git a/src/treelearner/serial_tree_learner.cpp b/src/treelearner/serial_tree_learner.cpp index 7a21ed0691c3..b8c3906ab10f 100644 --- a/src/treelearner/serial_tree_learner.cpp +++ b/src/treelearner/serial_tree_learner.cpp @@ -65,7 +65,7 @@ void SerialTreeLearner::Init(const Dataset* train_data, bool is_constant_hessian 
share_state_->num_hist_total_bin(), share_state_->feature_hist_offsets(), config_, max_cache_size, config_->num_leaves); - Log::Info("Number of data points in the train set: %d, number of used features: %d", num_data_, num_features_); + Log::Info("Number of data points in the train set: %lld, number of used features: %d", num_data_, num_features_); if (CostEfficientGradientBoosting::IsEnable(config_)) { cegb_.reset(new CostEfficientGradientBoosting(this)); cegb_->Init(); @@ -760,7 +760,7 @@ void SerialTreeLearner::RenewTreeOutput(Tree* tree, const ObjectiveFunction* obj void SerialTreeLearner::ComputeBestSplitForFeature( FeatureHistogram* histogram_array_, int feature_index, int real_fidx, - int8_t is_feature_used, int num_data, const LeafSplits* leaf_splits, + int8_t is_feature_used, data_size_t num_data, const LeafSplits* leaf_splits, SplitInfo* best_split, double parent_output) { bool is_feature_numerical = train_data_->FeatureBinMapper(feature_index) ->bin_type() == BinType::NumericalBin; @@ -816,7 +816,7 @@ void SerialTreeLearner::RecomputeBestSplitForLeaf(Tree* tree, int leaf, SplitInf } double sum_gradients = split->left_sum_gradient + split->right_sum_gradient; double sum_hessians = split->left_sum_hessian + split->right_sum_hessian; - int num_data = split->left_count + split->right_count; + data_size_t num_data = split->left_count + split->right_count; std::vector bests(share_state_->num_threads); LeafSplits leaf_splits(num_data, config_); diff --git a/src/treelearner/serial_tree_learner.h b/src/treelearner/serial_tree_learner.h index 0409821850b1..22abb0dcf9fc 100644 --- a/src/treelearner/serial_tree_learner.h +++ b/src/treelearner/serial_tree_learner.h @@ -122,7 +122,7 @@ class SerialTreeLearner: public TreeLearner { protected: void ComputeBestSplitForFeature(FeatureHistogram* histogram_array_, int feature_index, int real_fidx, - int8_t is_feature_used, int num_data, + int8_t is_feature_used, data_size_t num_data, const LeafSplits* leaf_splits, SplitInfo* best_split, double parent_output); diff --git a/swig/lightgbmlib.i b/swig/lightgbmlib.i index 67937c43ba69..31c5cd9ea3d5 100644 --- a/swig/lightgbmlib.i +++ b/swig/lightgbmlib.i @@ -80,7 +80,7 @@ jdoubleArray data, BoosterHandle handle, int data_type, - int ncol, + int32_t ncol, int is_row_major, int predict_type, int start_iteration, @@ -143,7 +143,7 @@ // https://stackoverflow.com/questions/23258357/whats-the-trade-off-between-using-getprimitivearraycritical-and-getprimitivety jboolean isCopy; - int* indices0 = (int*)jenv->GetPrimitiveArrayCritical(indices, &isCopy); + int64_t* indices0 = (int64_t*)jenv->GetPrimitiveArrayCritical(indices, &isCopy); double* values0 = (double*)jenv->GetPrimitiveArrayCritical(values, &isCopy); int32_t ind[2] = { 0, numNonZeros }; @@ -182,7 +182,7 @@ // https://stackoverflow.com/questions/23258357/whats-the-trade-off-between-using-getprimitivearraycritical-and-getprimitivety jboolean isCopy; - int* indices0 = (int*)jenv->GetPrimitiveArrayCritical(indices, &isCopy); + int64_t* indices0 = (int64_t*)jenv->GetPrimitiveArrayCritical(indices, &isCopy); double* values0 = (double*)jenv->GetPrimitiveArrayCritical(values, &isCopy); int32_t ind[2] = { 0, numNonZeros }; diff --git a/tests/c_api_test/test_.py b/tests/c_api_test/test_.py index 55e4af906dd9..3d33e75b6ca4 100644 --- a/tests/c_api_test/test_.py +++ b/tests/c_api_test/test_.py @@ -67,7 +67,7 @@ def load_from_file(filename, reference): ref, ctypes.byref(handle)) print(LIB.LGBM_GetLastError()) - num_data = ctypes.c_int(0) + num_data = 
ctypes.c_int64(0)
     LIB.LGBM_DatasetGetNumData(handle, ctypes.byref(num_data))
     num_feature = ctypes.c_int(0)
     LIB.LGBM_DatasetGetNumFeature(handle, ctypes.byref(num_feature))
@@ -88,10 +88,21 @@ def load_from_csr(filename, reference):
     if reference is not None:
         ref = reference
 
+    if csr.indptr.dtype == np.int32:
+        indices_t = csr.indices.ctypes.data_as(ctypes.POINTER(ctypes.c_int32))
+        indptr_t = csr.indptr.ctypes.data_as(ctypes.POINTER(ctypes.c_int32))
+        indptr_type = ctypes.c_int(dtype_int32)
+    elif csr.indptr.dtype == np.int64:
+        indices_t = csr.indices.ctypes.data_as(ctypes.POINTER(ctypes.c_int64))
+        indptr_t = csr.indptr.ctypes.data_as(ctypes.POINTER(ctypes.c_int64))
+        indptr_type = ctypes.c_int(dtype_int64)
+    else:
+        raise ValueError(f'unexpected indptr dtype: {csr.indptr.dtype}')
+
     LIB.LGBM_DatasetCreateFromCSR(
-        csr.indptr.ctypes.data_as(ctypes.POINTER(ctypes.c_int32)),
-        ctypes.c_int(dtype_int32),
-        csr.indices.ctypes.data_as(ctypes.POINTER(ctypes.c_int32)),
+        indptr_t,
+        indptr_type,
+        indices_t,
         csr.data.ctypes.data_as(ctypes.POINTER(ctypes.c_double)),
         ctypes.c_int(dtype_float64),
         ctypes.c_int64(len(csr.indptr)),
@@ -100,7 +111,7 @@ def load_from_csr(filename, reference):
         c_str('max_bin=15'),
         ref,
         ctypes.byref(handle))
-    num_data = ctypes.c_int(0)
+    num_data = ctypes.c_int64(0)
     LIB.LGBM_DatasetGetNumData(handle, ctypes.byref(num_data))
     num_feature = ctypes.c_int(0)
     LIB.LGBM_DatasetGetNumFeature(handle, ctypes.byref(num_feature))
@@ -108,7 +119,7 @@
         handle,
         c_str('label'),
         label.ctypes.data_as(ctypes.POINTER(ctypes.c_float)),
-        ctypes.c_int(len(label)),
+        ctypes.c_int64(len(label)),
         ctypes.c_int(dtype_float32))
     print(f'#data: {num_data.value} #feature: {num_feature.value}')
     return handle
@@ -123,10 +134,21 @@ def load_from_csc(filename, reference):
     if reference is not None:
         ref = reference
 
+    if csc.indptr.dtype == np.int32:
+        indices_t = csc.indices.ctypes.data_as(ctypes.POINTER(ctypes.c_int32))
+        indptr_t = csc.indptr.ctypes.data_as(ctypes.POINTER(ctypes.c_int32))
+        indptr_type = ctypes.c_int(dtype_int32)
+    elif csc.indptr.dtype == np.int64:
+        indices_t = csc.indices.ctypes.data_as(ctypes.POINTER(ctypes.c_int64))
+        indptr_t = csc.indptr.ctypes.data_as(ctypes.POINTER(ctypes.c_int64))
+        indptr_type = ctypes.c_int(dtype_int64)
+    else:
+        raise ValueError(f'unexpected indptr dtype: {csc.indptr.dtype}')
+
     LIB.LGBM_DatasetCreateFromCSC(
-        csc.indptr.ctypes.data_as(ctypes.POINTER(ctypes.c_int32)),
-        ctypes.c_int(dtype_int32),
-        csc.indices.ctypes.data_as(ctypes.POINTER(ctypes.c_int32)),
+        indptr_t,
+        indptr_type,
+        indices_t,
         csc.data.ctypes.data_as(ctypes.POINTER(ctypes.c_double)),
         ctypes.c_int(dtype_float64),
         ctypes.c_int64(len(csc.indptr)),
@@ -135,7 +157,7 @@
         c_str('max_bin=15'),
         ref,
         ctypes.byref(handle))
-    num_data = ctypes.c_int(0)
+    num_data = ctypes.c_int64(0)
     LIB.LGBM_DatasetGetNumData(handle, ctypes.byref(num_data))
     num_feature = ctypes.c_int(0)
     LIB.LGBM_DatasetGetNumFeature(handle, ctypes.byref(num_feature))
@@ -143,7 +165,7 @@
         handle,
         c_str('label'),
         label.ctypes.data_as(ctypes.POINTER(ctypes.c_float)),
-        ctypes.c_int(len(label)),
+        ctypes.c_int64(len(label)),
         ctypes.c_int(dtype_float32))
     print(f'#data: {num_data.value} #feature: {num_feature.value}')
     return handle
@@ -162,13 +184,13 @@ def load_from_mat(filename, reference):
     LIB.LGBM_DatasetCreateFromMat(
         data.ctypes.data_as(ctypes.POINTER(ctypes.c_double)),
         ctypes.c_int(dtype_float64),
-        ctypes.c_int32(mat.shape[0]),
+        ctypes.c_int64(mat.shape[0]),
         ctypes.c_int32(mat.shape[1]),
         ctypes.c_int(1),
         c_str('max_bin=15'),
         ref,
         ctypes.byref(handle))
-    num_data = ctypes.c_int(0)
+ num_data = ctypes.c_int64(0) LIB.LGBM_DatasetGetNumData(handle, ctypes.byref(num_data)) num_feature = ctypes.c_int(0) LIB.LGBM_DatasetGetNumFeature(handle, ctypes.byref(num_feature)) @@ -247,7 +265,7 @@ def test_booster(): booster2, data.ctypes.data_as(ctypes.POINTER(ctypes.c_double)), ctypes.c_int(dtype_float64), - ctypes.c_int32(mat.shape[0]), + ctypes.c_int64(mat.shape[0]), ctypes.c_int32(mat.shape[1]), ctypes.c_int(1), ctypes.c_int(1), diff --git a/tests/cpp_tests/test_array_args.cpp b/tests/cpp_tests/test_array_args.cpp index 41af9755e001..60fd0dd0434d 100644 --- a/tests/cpp_tests/test_array_args.cpp +++ b/tests/cpp_tests/test_array_args.cpp @@ -16,7 +16,7 @@ using LightGBM::ArrayArgs; TEST(Partition, JustWorks) { std::vector gradients({0.5f, 5.0f, 1.0f, 2.0f, 2.0f}); - data_size_t middle_begin, middle_end; + int32_t middle_begin, middle_end; ArrayArgs::Partition(&gradients, 0, static_cast(gradients.size()), &middle_begin, &middle_end); @@ -27,14 +27,14 @@ TEST(Partition, JustWorks) { TEST(Partition, PartitionOneElement) { std::vector gradients({0.5f}); - data_size_t middle_begin, middle_end; + int32_t middle_begin, middle_end; ArrayArgs::Partition(&gradients, 0, static_cast(gradients.size()), &middle_begin, &middle_end); EXPECT_EQ(gradients[middle_begin + 1], gradients[middle_end - 1]); } TEST(Partition, Empty) { std::vector gradients; - data_size_t middle_begin, middle_end; + int32_t middle_begin, middle_end; ArrayArgs::Partition(&gradients, 0, static_cast(gradients.size()), &middle_begin, &middle_end); EXPECT_EQ(middle_begin, -1); @@ -43,7 +43,7 @@ TEST(Partition, Empty) { TEST(Partition, AllEqual) { std::vector gradients({0.5f, 0.5f, 0.5f}); - data_size_t middle_begin, middle_end; + int32_t middle_begin, middle_end; ArrayArgs::Partition(&gradients, 0, static_cast(gradients.size()), &middle_begin, &middle_end); EXPECT_EQ(gradients[middle_begin + 1], gradients[middle_end - 1]); diff --git a/tests/cpp_tests/test_stream.cpp b/tests/cpp_tests/test_stream.cpp index 75feb2e69b54..4ec7c6df9650 100644 --- a/tests/cpp_tests/test_stream.cpp +++ b/tests/cpp_tests/test_stream.cpp @@ -26,7 +26,7 @@ void test_stream_dense( const std::vector* labels, const std::vector* weights, const std::vector* init_scores, - const std::vector* groups) { + const std::vector* groups) { Log::Info("Streaming %d rows dense data with a batch size of %d", nrows, batch_count); DatasetHandle dataset_handle = nullptr; Dataset* dataset = nullptr; @@ -55,7 +55,7 @@ void test_stream_dense( } } - std::vector sample_sizes; + std::vector sample_sizes; std::vector sample_values_ptrs; std::vector sample_idx_ptrs; for (int32_t i = 0; i < ncols; ++i) { @@ -133,7 +133,7 @@ void test_stream_sparse( const std::vector* labels, const std::vector* weights, const std::vector* init_scores, - const std::vector* groups) { + const std::vector* groups) { Log::Info("Streaming %d rows sparse data with a batch size of %d", nrows, batch_count); DatasetHandle dataset_handle = nullptr; Dataset* dataset = nullptr; @@ -163,7 +163,7 @@ void test_stream_sparse( } } - std::vector sample_sizes; + std::vector sample_sizes; std::vector sample_values_ptrs; std::vector sample_idx_ptrs; for (int32_t i = 0; i < ncols; ++i) { @@ -246,7 +246,7 @@ TEST(Stream, PushDenseRowsWithMetadata) { int nclasses = 2; // choose > 1 just to test multi-class handling std::vector unused_init_scores; unused_init_scores.resize(noriginalrows * nclasses); - std::vector unused_groups; + std::vector unused_groups; unused_groups.assign(noriginalrows, 1); result = 
LGBM_DatasetSetField(ref_datset_handle, "init_score", unused_init_scores.data(), noriginalrows * nclasses, 1); EXPECT_EQ(0, result) << "LGBM_DatasetSetField init_score result code: " << result; @@ -260,7 +260,7 @@ TEST(Stream, PushDenseRowsWithMetadata) { std::vector labels; std::vector weights; std::vector init_scores; - std::vector groups; + std::vector groups; Log::Info("Creating random data"); TestUtils::CreateRandomDenseData(nrows, ncols, nclasses, &features, &labels, &weights, &init_scores, &groups); @@ -297,7 +297,7 @@ TEST(Stream, PushSparseRowsWithMetadata) { int32_t nclasses = 2; std::vector unused_init_scores; unused_init_scores.resize(noriginalrows * nclasses); - std::vector unused_groups; + std::vector unused_groups; unused_groups.assign(noriginalrows, 1); result = LGBM_DatasetSetField(ref_datset_handle, "init_score", unused_init_scores.data(), noriginalrows * nclasses, 1); EXPECT_EQ(0, result) << "LGBM_DatasetSetField init_score result code: " << result; @@ -313,7 +313,7 @@ TEST(Stream, PushSparseRowsWithMetadata) { std::vector labels; std::vector weights; std::vector init_scores; - std::vector groups; + std::vector groups; Log::Info("Creating random data"); float sparse_percent = .1f; diff --git a/tests/cpp_tests/testutils.cpp b/tests/cpp_tests/testutils.cpp index f0b3e1c1f206..283ff56f6b63 100644 --- a/tests/cpp_tests/testutils.cpp +++ b/tests/cpp_tests/testutils.cpp @@ -42,7 +42,7 @@ namespace LightGBM { std::vector* labels, std::vector* weights, std::vector* init_scores, - std::vector* groups) { + std::vector* groups) { Random rand(42); features->reserve(nrows * ncols); @@ -69,7 +69,7 @@ namespace LightGBM { std::vector* labels, std::vector* weights, std::vector* init_scores, - std::vector* groups) { + std::vector* groups) { Random rand(42); indptr->reserve(static_cast(nrows + 1)); indices->reserve(static_cast(sparse_percent * nrows * ncols)); @@ -98,7 +98,7 @@ namespace LightGBM { std::vector* labels, std::vector* weights, std::vector* init_scores, - std::vector* groups) { + std::vector* groups) { Random rand(42); labels->reserve(nrows); if (weights) { @@ -111,7 +111,7 @@ namespace LightGBM { groups->reserve(nrows); } - int32_t group = 0; + int64_t group = 0; for (int32_t row = 0; row < nrows; row++) { labels->push_back(rand.NextFloat()); @@ -141,7 +141,7 @@ namespace LightGBM { const std::vector* labels, const std::vector* weights, const std::vector* init_scores, - const std::vector* groups) { + const std::vector* groups) { int result = LGBM_DatasetSetWaitForManualFinish(dataset_handle, 1); EXPECT_EQ(0, result) << "LGBM_DatasetSetWaitForManualFinish result code: " << result; @@ -165,7 +165,7 @@ namespace LightGBM { init_scores_ptr = init_score_batch.data(); } - const int32_t* groups_ptr = nullptr; + const int64_t* groups_ptr = nullptr; if (groups) { groups_ptr = groups->data(); } @@ -217,7 +217,7 @@ namespace LightGBM { const std::vector* labels, const std::vector* weights, const std::vector* init_scores, - const std::vector* groups) { + const std::vector* groups) { int result = LGBM_DatasetSetWaitForManualFinish(dataset_handle, 1); EXPECT_EQ(0, result) << "LGBM_DatasetSetWaitForManualFinish result code: " << result; @@ -235,7 +235,7 @@ namespace LightGBM { weights_ptr = weights->data(); } - const int32_t* groups_ptr = nullptr; + const int64_t* groups_ptr = nullptr; if (groups) { groups_ptr = groups->data(); } @@ -352,7 +352,7 @@ namespace LightGBM { const std::vector* ref_labels, const std::vector* ref_weights, const std::vector* ref_init_scores, - const std::vector* 
ref_groups) { + const std::vector* ref_groups) { const float* labels = metadata->label(); auto nTotal = static_cast(ref_labels->size()); for (auto i = 0; i < nTotal; i++) { @@ -392,7 +392,7 @@ namespace LightGBM { FAIL() << "Expected non-null init_scores"; } - const int32_t* query_boundaries = metadata->query_boundaries(); + const int64_t* query_boundaries = metadata->query_boundaries(); if (query_boundaries) { if (!ref_groups) { FAIL() << "Expected null query_boundaries"; diff --git a/tests/cpp_tests/testutils.h b/tests/cpp_tests/testutils.h index 158b38e0e24b..7dbd32d59001 100644 --- a/tests/cpp_tests/testutils.h +++ b/tests/cpp_tests/testutils.h @@ -32,7 +32,7 @@ class TestUtils { std::vector* labels, std::vector* weights, std::vector* init_scores, - std::vector* groups); + std::vector* groups); /*! * Creates a CSR sparse Dataset of random values. @@ -47,7 +47,7 @@ class TestUtils { std::vector* labels, std::vector* weights, std::vector* init_scores, - std::vector* groups); + std::vector* groups); /*! * Creates a batch of Metadata of random values. @@ -57,7 +57,7 @@ class TestUtils { std::vector* labels, std::vector* weights, std::vector* init_scores, - std::vector* groups); + std::vector* groups); /*! * Pushes nrows of data to a Dataset in batches of batch_count. @@ -71,7 +71,7 @@ class TestUtils { const std::vector* labels, const std::vector* weights, const std::vector* init_scores, - const std::vector* groups); + const std::vector* groups); /*! * Pushes nrows of data to a Dataset in batches of batch_count. @@ -86,7 +86,7 @@ class TestUtils { const std::vector* labels, const std::vector* weights, const std::vector* init_scores, - const std::vector* groups); + const std::vector* groups); /*! * Validates metadata against reference vectors. @@ -95,7 +95,7 @@ class TestUtils { const std::vector* labels, const std::vector* weights, const std::vector* init_scores, - const std::vector* groups); + const std::vector* groups); static const double* CreateInitScoreBatch(std::vector* init_score_batch, int32_t index, @@ -116,7 +116,7 @@ class TestUtils { const float* labels_ptr, const float* weights_ptr, const std::vector* init_scores, - const int32_t* groups_ptr, + const int64_t* groups_ptr, int32_t thread_count, int32_t thread_id); };
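
For reference, a minimal caller-side sketch of the widened dense-matrix path: after this change, nrow is int64_t and LGBM_DatasetGetNumData writes through an int64_t*. The 4x2 matrix and the max_bin=15 parameter string are arbitrary placeholders, not part of the diff.

// Sketch only: exercises LGBM_DatasetCreateFromMat / LGBM_DatasetGetNumData
// with the widened types from this change.
#include <cstdint>
#include <cstdio>
#include <vector>
#include <LightGBM/c_api.h>

int main() {
  const int64_t nrow = 4;   // was int32_t before this change
  const int32_t ncol = 2;
  std::vector<double> mat = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0};
  DatasetHandle dataset = nullptr;
  if (LGBM_DatasetCreateFromMat(mat.data(), C_API_DTYPE_FLOAT64, nrow, ncol,
                                /* is_row_major */ 1, "max_bin=15",
                                nullptr, &dataset) != 0) {
    std::fprintf(stderr, "%s\n", LGBM_GetLastError());
    return 1;
  }
  int64_t num_data = 0;     // out-parameter widened to int64_t*
  LGBM_DatasetGetNumData(dataset, &num_data);
  std::printf("#data: %lld\n", static_cast<long long>(num_data));
  LGBM_DatasetFree(dataset);
  return 0;
}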
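The "%ld" and "%lld" format strings introduced in the dataset.cpp and serial_tree_learner.cpp hunks are each width-correct only on some platforms once num_data_ is 64-bit. A portable alternative, assuming data_size_t is typedef'd to int64_t by this change (the meta.h hunk is not shown here), is the PRId64 macro:

// Sketch: portable printf formatting for a 64-bit data_size_t.
#include <cinttypes>
#include <cstdio>

typedef int64_t data_size_t;  // assumed definition under this change

void PrintNumData(data_size_t num_data) {
  std::printf("num_data: %" PRId64 "\n", num_data);
}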
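With indices now passed as const void*, RowFunctionFromCSR_helper and IterateFunctionFromCSC_helper reinterpret the index array through the same T2 as indptr/col_ptr, so callers must keep the two arrays at the same integer width. A sketch of that contract from the caller's side (CreateFromCSR64 is a hypothetical wrapper, not part of the diff):

// Sketch: 64-bit CSR creation. indptr and indices must share a width now
// that the C API takes indices as const void*.
#include <cstdint>
#include <vector>
#include <LightGBM/c_api.h>

int CreateFromCSR64(const std::vector<int64_t>& indptr,
                    const std::vector<int64_t>& indices,  // same width as indptr
                    const std::vector<double>& values,
                    int64_t num_col, DatasetHandle* out) {
  return LGBM_DatasetCreateFromCSR(
      indptr.data(), C_API_DTYPE_INT64, indices.data(),
      values.data(), C_API_DTYPE_FLOAT64,
      static_cast<int64_t>(indptr.size()),
      static_cast<int64_t>(values.size()),
      num_col, "max_bin=15", nullptr, out);
}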
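Similarly, LGBM_DatasetSetField now takes an int64_t element count, matching the ctypes.c_int64(len(label)) calls in the Python tests above; a minimal sketch:

// Sketch: setting labels through the widened LGBM_DatasetSetField.
#include <cstdint>
#include <vector>
#include <LightGBM/c_api.h>

int SetLabels(DatasetHandle dataset, const std::vector<float>& labels) {
  return LGBM_DatasetSetField(dataset, "label", labels.data(),
                              static_cast<int64_t>(labels.size()),
                              C_API_DTYPE_FLOAT32);
}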