Skip to content

Commit

Permalink
[microsoft#5454]support dataset rows more than max(int32_t)
Browse files Browse the repository at this point in the history
  • Loading branch information
junpeng.li authored and junpeng0715 committed Oct 11, 2022
1 parent 4642712 commit 087793d
Show file tree
Hide file tree
Showing 40 changed files with 379 additions and 317 deletions.
2 changes: 1 addition & 1 deletion include/LightGBM/bin.h
Original file line number Diff line number Diff line change
Expand Up @@ -174,7 +174,7 @@ class BinMapper {
* \param zero_as_missing True to use zero as missing value
* \param forced_upper_bounds Vector of split points that must be used (if this has size less than max_bin, remaining splits are found by the algorithm)
*/
void FindBin(double* values, int num_values, size_t total_sample_cnt, int max_bin, int min_data_in_bin, int min_split_data, bool pre_filter, BinType bin_type,
void FindBin(double* values, int64_t num_values, size_t total_sample_cnt, int max_bin, int min_data_in_bin, int min_split_data, bool pre_filter, BinType bin_type,
bool use_missing, bool zero_as_missing, const std::vector<double>& forced_upper_bounds);

/*!
Expand Down
60 changes: 30 additions & 30 deletions include/LightGBM/c_api.h
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@ LIGHTGBM_C_EXPORT int LGBM_RegisterLogCallback(void (*callback)(const char*));
* \param[out] out Number of samples. This value is used to pre-allocate memory to hold sample indices when calling ``LGBM_SampleIndices``
* \return 0 when succeed, -1 when failure happens
*/
LIGHTGBM_C_EXPORT int LGBM_GetSampleCount(int32_t num_total_row,
LIGHTGBM_C_EXPORT int LGBM_GetSampleCount(int64_t num_total_row,
const char* parameters,
int* out);

Expand All @@ -91,10 +91,10 @@ LIGHTGBM_C_EXPORT int LGBM_GetSampleCount(int32_t num_total_row,
* \param[out] out_len Number of indices
* \return 0 when succeed, -1 when failure happens
*/
LIGHTGBM_C_EXPORT int LGBM_SampleIndices(int32_t num_total_row,
LIGHTGBM_C_EXPORT int LGBM_SampleIndices(int64_t num_total_row,
const char* parameters,
void* out,
int32_t* out_len);
int64_t* out_len);

/* --- start Dataset interface */

Expand Down Expand Up @@ -127,9 +127,9 @@ LIGHTGBM_C_EXPORT int LGBM_DatasetCreateFromFile(const char* filename,
LIGHTGBM_C_EXPORT int LGBM_DatasetCreateFromSampledColumn(double** sample_data,
int** sample_indices,
int32_t ncol,
const int* num_per_col,
int32_t num_sample_row,
int32_t num_local_row,
const int64_t* num_per_col,
int64_t num_sample_row,
int64_t num_local_row,
int64_t num_dist_row,
const char* parameters,
DatasetHandle* out);
Expand Down Expand Up @@ -175,9 +175,9 @@ LIGHTGBM_C_EXPORT int LGBM_DatasetInitStreaming(DatasetHandle dataset,
LIGHTGBM_C_EXPORT int LGBM_DatasetPushRows(DatasetHandle dataset,
const void* data,
int data_type,
int32_t nrow,
int64_t nrow,
int32_t ncol,
int32_t start_row);
int64_t start_row);

/*!
* \brief Push data to existing dataset.
Expand Down Expand Up @@ -208,7 +208,7 @@ LIGHTGBM_C_EXPORT int LGBM_DatasetPushRowsWithMetadata(DatasetHandle dataset,
const float* label,
const float* weight,
const double* init_score,
const int32_t* query,
const int64_t* query,
int32_t tid);

/*!
Expand All @@ -228,7 +228,7 @@ LIGHTGBM_C_EXPORT int LGBM_DatasetPushRowsWithMetadata(DatasetHandle dataset,
LIGHTGBM_C_EXPORT int LGBM_DatasetPushRowsByCSR(DatasetHandle dataset,
const void* indptr,
int indptr_type,
const int32_t* indices,
const void* indices,
const void* data,
int data_type,
int64_t nindptr,
Expand Down Expand Up @@ -266,7 +266,7 @@ LIGHTGBM_C_EXPORT int LGBM_DatasetPushRowsByCSRWithMetadata(DatasetHandle datase
const float* label,
const float* weight,
const double* init_score,
const int32_t* query,
const int64_t* query,
int32_t tid);

/*!
Expand Down Expand Up @@ -302,7 +302,7 @@ LIGHTGBM_C_EXPORT int LGBM_DatasetMarkFinished(DatasetHandle dataset);
*/
LIGHTGBM_C_EXPORT int LGBM_DatasetCreateFromCSR(const void* indptr,
int indptr_type,
const int32_t* indices,
const void* indices,
const void* data,
int data_type,
int64_t nindptr,
Expand Down Expand Up @@ -347,7 +347,7 @@ LIGHTGBM_C_EXPORT int LGBM_DatasetCreateFromCSRFunc(void* get_row_funptr,
*/
LIGHTGBM_C_EXPORT int LGBM_DatasetCreateFromCSC(const void* col_ptr,
int col_ptr_type,
const int32_t* indices,
const void* indices,
const void* data,
int data_type,
int64_t ncol_ptr,
Expand All @@ -371,7 +371,7 @@ LIGHTGBM_C_EXPORT int LGBM_DatasetCreateFromCSC(const void* col_ptr,
*/
LIGHTGBM_C_EXPORT int LGBM_DatasetCreateFromMat(const void* data,
int data_type,
int32_t nrow,
int64_t nrow,
int32_t ncol,
int is_row_major,
const char* parameters,
Expand All @@ -394,7 +394,7 @@ LIGHTGBM_C_EXPORT int LGBM_DatasetCreateFromMat(const void* data,
LIGHTGBM_C_EXPORT int LGBM_DatasetCreateFromMats(int32_t nmat,
const void** data,
int data_type,
int32_t* nrow,
int64_t* nrow,
int32_t ncol,
int is_row_major,
const char* parameters,
Expand All @@ -411,8 +411,8 @@ LIGHTGBM_C_EXPORT int LGBM_DatasetCreateFromMats(int32_t nmat,
* \return 0 when succeed, -1 when failure happens
*/
LIGHTGBM_C_EXPORT int LGBM_DatasetGetSubset(const DatasetHandle handle,
const int32_t* used_row_indices,
int32_t num_used_row_indices,
const int64_t* used_row_indices,
int64_t num_used_row_indices,
const char* parameters,
DatasetHandle* out);

Expand Down Expand Up @@ -487,7 +487,7 @@ LIGHTGBM_C_EXPORT int LGBM_DatasetDumpText(DatasetHandle handle,
LIGHTGBM_C_EXPORT int LGBM_DatasetSetField(DatasetHandle handle,
const char* field_name,
const void* field_data,
int num_element,
int64_t num_element,
int type);

/*!
Expand All @@ -501,7 +501,7 @@ LIGHTGBM_C_EXPORT int LGBM_DatasetSetField(DatasetHandle handle,
*/
LIGHTGBM_C_EXPORT int LGBM_DatasetGetField(DatasetHandle handle,
const char* field_name,
int* out_len,
int64_t* out_len,
const void** out_ptr,
int* out_type);

Expand All @@ -521,7 +521,7 @@ LIGHTGBM_C_EXPORT int LGBM_DatasetUpdateParamChecking(const char* old_parameters
* \return 0 when succeed, -1 when failure happens
*/
LIGHTGBM_C_EXPORT int LGBM_DatasetGetNumData(DatasetHandle handle,
int* out);
int64_t* out);

/*!
* \brief Get number of features.
Expand Down Expand Up @@ -677,7 +677,7 @@ LIGHTGBM_C_EXPORT int LGBM_BoosterUpdateOneIter(BoosterHandle handle,
*/
LIGHTGBM_C_EXPORT int LGBM_BoosterRefit(BoosterHandle handle,
const int32_t* leaf_preds,
int32_t nrow,
int64_t nrow,
int32_t ncol);

/*!
Expand Down Expand Up @@ -926,7 +926,7 @@ LIGHTGBM_C_EXPORT int LGBM_FastConfigFree(FastConfigHandle fastConfig);
LIGHTGBM_C_EXPORT int LGBM_BoosterPredictForCSR(BoosterHandle handle,
const void* indptr,
int indptr_type,
const int32_t* indices,
const void* indices,
const void* data,
int data_type,
int64_t nindptr,
Expand Down Expand Up @@ -970,7 +970,7 @@ LIGHTGBM_C_EXPORT int LGBM_BoosterPredictForCSR(BoosterHandle handle,
LIGHTGBM_C_EXPORT int LGBM_BoosterPredictSparseOutput(BoosterHandle handle,
const void* indptr,
int indptr_type,
const int32_t* indices,
const void* indices,
const void* data,
int data_type,
int64_t nindptr,
Expand All @@ -995,7 +995,7 @@ LIGHTGBM_C_EXPORT int LGBM_BoosterPredictSparseOutput(BoosterHandle handle,
* \param data_type Type of ``data`` pointer, can be ``C_API_DTYPE_FLOAT32`` or ``C_API_DTYPE_FLOAT64``
* \return 0 when succeed, -1 when failure happens
*/
LIGHTGBM_C_EXPORT int LGBM_BoosterFreePredictSparse(void* indptr, int32_t* indices, void* data, int indptr_type, int data_type);
LIGHTGBM_C_EXPORT int LGBM_BoosterFreePredictSparse(void* indptr, void* indices, void* data, int indptr_type, int data_type);

/*!
* \brief Make prediction for a new dataset in CSR format. This method re-uses the internal predictor structure
Expand Down Expand Up @@ -1029,7 +1029,7 @@ LIGHTGBM_C_EXPORT int LGBM_BoosterFreePredictSparse(void* indptr, int32_t* indic
LIGHTGBM_C_EXPORT int LGBM_BoosterPredictForCSRSingleRow(BoosterHandle handle,
const void* indptr,
int indptr_type,
const int32_t* indices,
const void* indices,
const void* data,
int data_type,
int64_t nindptr,
Expand Down Expand Up @@ -1104,7 +1104,7 @@ LIGHTGBM_C_EXPORT int LGBM_BoosterPredictForCSRSingleRowFastInit(BoosterHandle h
LIGHTGBM_C_EXPORT int LGBM_BoosterPredictForCSRSingleRowFast(FastConfigHandle fastConfig_handle,
const void* indptr,
const int indptr_type,
const int32_t* indices,
const void* indices,
const void* data,
const int64_t nindptr,
const int64_t nelem,
Expand Down Expand Up @@ -1142,7 +1142,7 @@ LIGHTGBM_C_EXPORT int LGBM_BoosterPredictForCSRSingleRowFast(FastConfigHandle fa
LIGHTGBM_C_EXPORT int LGBM_BoosterPredictForCSC(BoosterHandle handle,
const void* col_ptr,
int col_ptr_type,
const int32_t* indices,
const void* indices,
const void* data,
int data_type,
int64_t ncol_ptr,
Expand Down Expand Up @@ -1183,7 +1183,7 @@ LIGHTGBM_C_EXPORT int LGBM_BoosterPredictForCSC(BoosterHandle handle,
LIGHTGBM_C_EXPORT int LGBM_BoosterPredictForMat(BoosterHandle handle,
const void* data,
int data_type,
int32_t nrow,
int64_t nrow,
int32_t ncol,
int is_row_major,
int predict_type,
Expand Down Expand Up @@ -1221,7 +1221,7 @@ LIGHTGBM_C_EXPORT int LGBM_BoosterPredictForMat(BoosterHandle handle,
LIGHTGBM_C_EXPORT int LGBM_BoosterPredictForMatSingleRow(BoosterHandle handle,
const void* data,
int data_type,
int ncol,
int32_t ncol,
int is_row_major,
int predict_type,
int start_iteration,
Expand Down Expand Up @@ -1310,7 +1310,7 @@ LIGHTGBM_C_EXPORT int LGBM_BoosterPredictForMatSingleRowFast(FastConfigHandle fa
LIGHTGBM_C_EXPORT int LGBM_BoosterPredictForMats(BoosterHandle handle,
const void** data,
int data_type,
int32_t nrow,
int64_t nrow,
int32_t ncol,
int predict_type,
int start_iteration,
Expand Down
2 changes: 1 addition & 1 deletion include/LightGBM/config.h
Original file line number Diff line number Diff line change
Expand Up @@ -984,7 +984,7 @@ struct Config {
// alias = ndcg_eval_at, ndcg_at, map_eval_at, map_at
// desc = used only with ``ndcg`` and ``map`` metrics
// desc = `NDCG <https://en.wikipedia.org/wiki/Discounted_cumulative_gain#Normalized_DCG>`__ and `MAP <https://makarandtapaswi.wordpress.com/2012/07/02/intuition-behind-average-precision-and-map/>`__ evaluation positions, separated by ``,``
std::vector<int> eval_at;
std::vector<data_size_t> eval_at;

// check = >0
// desc = used only with ``multi_error`` metric
Expand Down
8 changes: 4 additions & 4 deletions include/LightGBM/dataset.h
Original file line number Diff line number Diff line change
Expand Up @@ -194,7 +194,7 @@ class Metadata {
const float* labels,
const float* weights,
const double* init_scores,
const int32_t* queries);
const int64_t* queries);

/*!
* \brief Perform any extra operations after all data has been loaded
Expand Down Expand Up @@ -436,7 +436,7 @@ class Dataset {
const std::vector<std::vector<double>>& forced_bins,
int** sample_non_zero_indices,
double** sample_values,
const int* num_per_col,
const int64_t* num_per_col,
int num_sample_col,
size_t total_sample_cnt,
const Config& io_config);
Expand Down Expand Up @@ -600,13 +600,13 @@ class Dataset {

LIGHTGBM_EXPORT bool SetDoubleField(const char* field_name, const double* field_data, data_size_t num_element);

LIGHTGBM_EXPORT bool SetIntField(const char* field_name, const int* field_data, data_size_t num_element);
LIGHTGBM_EXPORT bool SetIntField(const char* field_name, const data_size_t* field_data, data_size_t num_element);

LIGHTGBM_EXPORT bool GetFloatField(const char* field_name, data_size_t* out_len, const float** out_ptr);

LIGHTGBM_EXPORT bool GetDoubleField(const char* field_name, data_size_t* out_len, const double** out_ptr);

LIGHTGBM_EXPORT bool GetIntField(const char* field_name, data_size_t* out_len, const int** out_ptr);
LIGHTGBM_EXPORT bool GetIntField(const char* field_name, data_size_t* out_len, const data_size_t** out_ptr);

/*!
* \brief Save current dataset into binary file, will save to "filename.bin"
Expand Down
8 changes: 4 additions & 4 deletions include/LightGBM/dataset_loader.h
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ class DatasetLoader {
LIGHTGBM_EXPORT Dataset* ConstructFromSampleData(double** sample_values,
int** sample_indices,
int num_col,
const int* num_per_col,
const int64_t* num_per_col,
size_t total_sample_size,
data_size_t num_local_data,
int64_t num_dist_data);
Expand All @@ -45,17 +45,17 @@ class DatasetLoader {
const std::unordered_set<int>& categorical_features);

private:
Dataset* LoadFromBinFile(const char* data_filename, const char* bin_filename, int rank, int num_machines, int* num_global_data, std::vector<data_size_t>* used_data_indices);
Dataset* LoadFromBinFile(const char* data_filename, const char* bin_filename, int rank, int num_machines, data_size_t* num_global_data, std::vector<data_size_t>* used_data_indices);

void SetHeader(const char* filename);

void CheckDataset(const Dataset* dataset, bool is_load_from_binary);

std::vector<std::string> LoadTextDataToMemory(const char* filename, const Metadata& metadata, int rank, int num_machines, int* num_global_data, std::vector<data_size_t>* used_data_indices);
std::vector<std::string> LoadTextDataToMemory(const char* filename, const Metadata& metadata, int rank, int num_machines, data_size_t* num_global_data, std::vector<data_size_t>* used_data_indices);

std::vector<std::string> SampleTextDataFromMemory(const std::vector<std::string>& data);

std::vector<std::string> SampleTextDataFromFile(const char* filename, const Metadata& metadata, int rank, int num_machines, int* num_global_data, std::vector<data_size_t>* used_data_indices);
std::vector<std::string> SampleTextDataFromFile(const char* filename, const Metadata& metadata, int rank, int num_machines, data_size_t* num_global_data, std::vector<data_size_t>* used_data_indices);

void ConstructBinMappersFromTextData(int rank, int num_machines, const std::vector<std::string>& sample_data, const Parser* parser, Dataset* dataset);

Expand Down
Loading

0 comments on commit 087793d

Please sign in to comment.