Skip to content

Commit

Permalink
Modify GPU data types to fit int64
Browse files Browse the repository at this point in the history
  • Loading branch information
junpeng.li authored and junpeng0715 committed Oct 11, 2022
1 parent b32427a commit 6eac042
Show file tree
Hide file tree
Showing 5 changed files with 18 additions and 13 deletions.
5 changes: 5 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -488,6 +488,11 @@ if(USE_SWIG)
COMMAND "${Java_JAR_EXECUTABLE}" -cf lightgbmlib.jar com
)
else()
# Post-process the SWIG-generated Java JNI wrapper right after the SWIG
# compilation step: rewrite `long long const` parameters to the fixed-width
# `int64_t const` so the wrapper matches the library's 64-bit data-size type.
# NOTE(review): this relies on the COMMAND being passed through a shell (for
# the single quotes to be stripped) and on GNU sed — `sed -i` with no suffix
# argument is not portable to BSD/macOS sed; confirm supported build hosts.
add_custom_command(
TARGET _lightgbm_swig_swig_compilation
POST_BUILD
COMMAND sed -i 's/long long const/int64_t const/g' java/lightgbmlibJAVA_wrap.cxx
)
add_custom_command(
TARGET _lightgbm_swig
POST_BUILD
Expand Down
2 changes: 1 addition & 1 deletion src/io/train_share_states.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ void MultiValBinWrapper::HistMerge(std::vector<hist_t,
const data_size_t end = std::min<data_size_t>(start + bin_block_size, num_bin_);
for (data_size_t tid = 1; tid < n_data_block_; ++tid) {
auto src_ptr = hist_buf->data() + static_cast<size_t>(num_bin_aligned_) * 2 * (tid - 1);
for (int i = start * 2; i < end * 2; ++i) {
for (data_size_t i = start * 2; i < end * 2; ++i) {
dst[i] += src_ptr[i];
}
}
Expand Down
2 changes: 1 addition & 1 deletion src/treelearner/cuda_tree_learner.h
Original file line number Diff line number Diff line change
Expand Up @@ -235,7 +235,7 @@ class CUDATreeLearner: public SerialTreeLearner {
std::vector<float> kernel_time_; // measure histogram kernel time
std::vector<std::chrono::duration<double, std::milli>> kernel_input_wait_time_;
int num_gpu_;
int allocated_num_data_; // allocated data instances
data_size_t allocated_num_data_; // allocated data instances
pthread_t **cpu_threads_; // pthread, 1 cpu thread / gpu
};

Expand Down
18 changes: 9 additions & 9 deletions src/treelearner/gpu_tree_learner.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -233,7 +233,7 @@ void GPUTreeLearner::AllocateGPUMemory() {
num_dense_feature4_ = (num_dense_feature_groups_ + (dword_features_ - 1)) / dword_features_;
// leave some safe margin for prefetching
// 256 work-items per workgroup. Each work-item prefetches one tuple for that feature
int allocated_num_data_ = num_data_ + 256 * (1 << kMaxLogWorkgroupsPerFeature);
data_size_t allocated_num_data_ = num_data_ + 256 * (1 << kMaxLogWorkgroupsPerFeature);
// clear sparse/dense maps
dense_feature_group_map_.clear();
device_bin_mults_.clear();
Expand Down Expand Up @@ -391,7 +391,7 @@ void GPUTreeLearner::AllocateGPUMemory() {
*static_cast<DenseBinIterator<uint8_t, true>*>(bin_iters[5]),
*static_cast<DenseBinIterator<uint8_t, true>*>(bin_iters[6]),
*static_cast<DenseBinIterator<uint8_t, true>*>(bin_iters[7])};
for (int j = 0; j < num_data_; ++j) {
for (data_size_t j = 0; j < num_data_; ++j) {
host4[j].s[0] = (uint8_t)((iters[0].RawGet(j) * dev_bin_mult[0] + ((j+0) & (dev_bin_mult[0] - 1)))
|((iters[1].RawGet(j) * dev_bin_mult[1] + ((j+1) & (dev_bin_mult[1] - 1))) << 4));
host4[j].s[1] = (uint8_t)((iters[2].RawGet(j) * dev_bin_mult[2] + ((j+2) & (dev_bin_mult[2] - 1)))
Expand All @@ -409,13 +409,13 @@ void GPUTreeLearner::AllocateGPUMemory() {
if (dynamic_cast<DenseBinIterator<uint8_t, false>*>(bin_iter) != 0) {
// Dense bin
DenseBinIterator<uint8_t, false> iter = *static_cast<DenseBinIterator<uint8_t, false>*>(bin_iter);
for (int j = 0; j < num_data_; ++j) {
for (data_size_t j = 0; j < num_data_; ++j) {
host4[j].s[s_idx] = (uint8_t)(iter.RawGet(j) * dev_bin_mult[s_idx] + ((j+s_idx) & (dev_bin_mult[s_idx] - 1)));
}
} else if (dynamic_cast<DenseBinIterator<uint8_t, true>*>(bin_iter) != 0) {
// Dense 4-bit bin
DenseBinIterator<uint8_t, true> iter = *static_cast<DenseBinIterator<uint8_t, true>*>(bin_iter);
for (int j = 0; j < num_data_; ++j) {
for (data_size_t j = 0; j < num_data_; ++j) {
host4[j].s[s_idx] = (uint8_t)(iter.RawGet(j) * dev_bin_mult[s_idx] + ((j+s_idx) & (dev_bin_mult[s_idx] - 1)));
}
} else {
Expand Down Expand Up @@ -452,7 +452,7 @@ void GPUTreeLearner::AllocateGPUMemory() {
if (dynamic_cast<DenseBinIterator<uint8_t, true>*>(bin_iter) != 0) {
DenseBinIterator<uint8_t, true> iter = *static_cast<DenseBinIterator<uint8_t, true>*>(bin_iter);
#pragma omp parallel for schedule(static)
for (int j = 0; j < num_data_; ++j) {
for (data_size_t j = 0; j < num_data_; ++j) {
host4[j].s[i >> 1] |= (uint8_t)((iter.RawGet(j) * device_bin_mults_[copied_feature4 * dword_features_ + i]
+ ((j+i) & (device_bin_mults_[copied_feature4 * dword_features_ + i] - 1)))
<< ((i & 1) << 2));
Expand All @@ -465,14 +465,14 @@ void GPUTreeLearner::AllocateGPUMemory() {
if (dynamic_cast<DenseBinIterator<uint8_t, false>*>(bin_iter) != 0) {
DenseBinIterator<uint8_t, false> iter = *static_cast<DenseBinIterator<uint8_t, false>*>(bin_iter);
#pragma omp parallel for schedule(static)
for (int j = 0; j < num_data_; ++j) {
for (data_size_t j = 0; j < num_data_; ++j) {
host4[j].s[i] = (uint8_t)(iter.RawGet(j) * device_bin_mults_[copied_feature4 * dword_features_ + i]
+ ((j+i) & (device_bin_mults_[copied_feature4 * dword_features_ + i] - 1)));
}
} else if (dynamic_cast<DenseBinIterator<uint8_t, true>*>(bin_iter) != 0) {
DenseBinIterator<uint8_t, true> iter = *static_cast<DenseBinIterator<uint8_t, true>*>(bin_iter);
#pragma omp parallel for schedule(static)
for (int j = 0; j < num_data_; ++j) {
for (data_size_t j = 0; j < num_data_; ++j) {
host4[j].s[i] = (uint8_t)(iter.RawGet(j) * device_bin_mults_[copied_feature4 * dword_features_ + i]
+ ((j+i) & (device_bin_mults_[copied_feature4 * dword_features_ + i] - 1)));
}
Expand All @@ -486,15 +486,15 @@ void GPUTreeLearner::AllocateGPUMemory() {
// fill the leftover features
if (dword_features_ == 8) {
#pragma omp parallel for schedule(static)
for (int j = 0; j < num_data_; ++j) {
for (data_size_t j = 0; j < num_data_; ++j) {
for (int i = k; i < dword_features_; ++i) {
// fill this empty feature with some "random" value
host4[j].s[i >> 1] |= (uint8_t)((j & 0xf) << ((i & 1) << 2));
}
}
} else if (dword_features_ == 4) {
#pragma omp parallel for schedule(static)
for (int j = 0; j < num_data_; ++j) {
for (data_size_t j = 0; j < num_data_; ++j) {
for (int i = k; i < dword_features_; ++i) {
// fill this empty feature with some "random" value
host4[j].s[i] = (uint8_t)j;
Expand Down
4 changes: 2 additions & 2 deletions src/treelearner/ocl/histogram256.cl
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,7 @@ typedef uint acc_int_type;
#define AMD_USE_DS_ADD_F32 0
#endif

typedef uint data_size_t;
typedef signed long int data_size_t;
typedef float score_t;


Expand Down Expand Up @@ -439,7 +439,7 @@ __kernel void histogram256(__global const uchar4* feature_data_base,
R""()
*/
// there are 2^POWER_FEATURE_WORKGROUPS workgroups processing each feature4
for (uint i = subglobal_tid; i < num_data; i += subglobal_size) {
for (data_size_t i = subglobal_tid; i < num_data; i += subglobal_size) {
// prefetch the next iteration variables
// we don't need boundary check because we have made the buffer larger
stat1_next = ordered_gradients[i + subglobal_size];
Expand Down

0 comments on commit 6eac042

Please sign in to comment.