-
Notifications
You must be signed in to change notification settings - Fork 3.8k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
ranking algorithm: position unbiased option #4531
Changes from 34 commits
e5b6f31
ca85e05
7718343
7063000
4f4387a
482c143
4bae3ad
1449576
c49005e
3cbcb62
a0680e6
8827540
6798c2a
e1c8158
26b316b
4b8c2e9
c50e92c
592ade6
c29a6f8
9796a72
a00ac5c
672ec5b
d3347d2
88e3542
52f6243
99f4f04
87219c2
baa4a0d
20fe972
9ac06d5
da17901
a821813
32aa904
b450bfa
fc5b92d
ebb8f40
6ae3cfe
9572d7d
0cbe02d
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -462,3 +462,6 @@ dask-worker-space/ | |
*.pub | ||
*.rdp | ||
*_rsa | ||
|
||
# swig jni | ||
*_swig.jnilib | ||
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -8,6 +8,7 @@ | |
|
||
#include <LightGBM/metric.h> | ||
#include <LightGBM/objective_function.h> | ||
#include <LightGBM/utils/log.h> | ||
|
||
#include <algorithm> | ||
#include <cmath> | ||
|
@@ -101,7 +102,9 @@ class LambdarankNDCG : public RankingObjective { | |
: RankingObjective(config), | ||
sigmoid_(config.sigmoid), | ||
norm_(config.lambdarank_norm), | ||
truncation_level_(config.lambdarank_truncation_level) { | ||
truncation_level_(config.lambdarank_truncation_level), | ||
unbiased_(config.lambdarank_unbiased), | ||
bias_p_norm_(config.lambdarank_bias_p_norm) { | ||
label_gain_ = config.label_gain; | ||
// initialize DCG calculator | ||
DCGCalculator::DefaultLabelGain(&label_gain_); | ||
|
@@ -111,6 +114,14 @@ class LambdarankNDCG : public RankingObjective { | |
if (sigmoid_ <= 0.0) { | ||
Log::Fatal("Sigmoid param %f should be greater than zero", sigmoid_); | ||
} | ||
|
||
#pragma omp parallel | ||
#pragma omp master | ||
{ | ||
num_threads_ = omp_get_num_threads(); | ||
} | ||
|
||
position_bias_regularizer = 1.0f / (1.0f + bias_p_norm_); | ||
} | ||
|
||
explicit LambdarankNDCG(const std::vector<std::string>& strs) | ||
|
@@ -135,19 +146,34 @@ class LambdarankNDCG : public RankingObjective { | |
} | ||
// construct Sigmoid table to speed up Sigmoid transform | ||
ConstructSigmoidTable(); | ||
|
||
// initialize position bias vectors | ||
InitPositionBiasesAndGradients(); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. should this initialization be conditional on the value of `unbiased_`? (apologies if I've misunderstood how this works) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. they get initialized to 1 and only updated if `unbiased_` is set |
||
} | ||
|
||
void GetGradients(const double* score, score_t* gradients, | ||
score_t* hessians) const override { | ||
RankingObjective::GetGradients(score, gradients, hessians); | ||
|
||
if (unbiased_) { UpdatePositionBiasesAndGradients(); } | ||
} | ||
|
||
inline void GetGradientsForOneQuery(data_size_t query_id, data_size_t cnt, | ||
const label_t* label, const double* score, | ||
score_t* lambdas, | ||
score_t* hessians) const override { | ||
|
||
const int tid = omp_get_thread_num(); // get thread id | ||
|
||
// get max DCG on current query | ||
const double inverse_max_dcg = inverse_max_dcgs_[query_id]; | ||
|
||
// initialize with zero | ||
for (data_size_t i = 0; i < cnt; ++i) { | ||
lambdas[i] = 0.0f; | ||
hessians[i] = 0.0f; | ||
} | ||
|
||
// get sorted indices for scores | ||
std::vector<data_size_t> sorted_idx(cnt); | ||
for (data_size_t i = 0; i < cnt; ++i) { | ||
|
@@ -156,6 +182,7 @@ class LambdarankNDCG : public RankingObjective { | |
std::stable_sort( | ||
sorted_idx.begin(), sorted_idx.end(), | ||
[score](data_size_t a, data_size_t b) { return score[a] > score[b]; }); | ||
|
||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can you please remove these and other whitespace-only changes? If you feel that they improve the readability of the code, we'd welcome a separate pull request proposing them. That separate pull request could also include some of the other changes you've made below where you added or updated comments on existing code (where those comment changes don't relate to the feature of "add position-unbiased option for lambdarank"). like these: Those types of PRs that only change comments and whitespace take minimal effort for reviewers to review, and shrink the size of changes like this one that are introducing more involved enhancements. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. i think i undid all the whitespace changes |
||
// get best and worst score | ||
const double best_score = score[sorted_idx[0]]; | ||
data_size_t worst_idx = cnt - 1; | ||
|
@@ -164,13 +191,16 @@ class LambdarankNDCG : public RankingObjective { | |
} | ||
const double worst_score = score[sorted_idx[worst_idx]]; | ||
double sum_lambdas = 0.0; | ||
// start accmulate lambdas by pairs that contain at least one document above truncation level | ||
|
||
// accumulate lambdas by pairs that contain at least one document above truncation level | ||
for (data_size_t i = 0; i < cnt - 1 && i < truncation_level_; ++i) { | ||
if (score[sorted_idx[i]] == kMinScore) { continue; } | ||
for (data_size_t j = i + 1; j < cnt; ++j) { | ||
if (score[sorted_idx[j]] == kMinScore) { continue; } | ||
|
||
// skip pairs with the same labels | ||
if (label[sorted_idx[i]] == label[sorted_idx[j]]) { continue; } | ||
|
||
data_size_t high_rank, low_rank; | ||
if (label[sorted_idx[i]] > label[sorted_idx[j]]) { | ||
high_rank = i; | ||
|
@@ -179,11 +209,15 @@ class LambdarankNDCG : public RankingObjective { | |
high_rank = j; | ||
low_rank = i; | ||
} | ||
|
||
// info of more relevant doc | ||
const data_size_t high = sorted_idx[high_rank]; | ||
const int high_label = static_cast<int>(label[high]); | ||
const double high_score = score[high]; | ||
const double high_label_gain = label_gain_[high_label]; | ||
const double high_discount = DCGCalculator::GetDiscount(high_rank); | ||
|
||
// info of less relevant doc | ||
const data_size_t low = sorted_idx[low_rank]; | ||
const int low_label = static_cast<int>(label[low]); | ||
const double low_score = score[low]; | ||
|
@@ -192,30 +226,49 @@ class LambdarankNDCG : public RankingObjective { | |
|
||
const double delta_score = high_score - low_score; | ||
|
||
// get dcg gap | ||
const double dcg_gap = high_label_gain - low_label_gain; | ||
|
||
// get discount of this pair | ||
const double paired_discount = fabs(high_discount - low_discount); | ||
|
||
// get delta NDCG | ||
double delta_pair_NDCG = dcg_gap * paired_discount * inverse_max_dcg; | ||
// regular the delta_pair_NDCG by score distance | ||
if (norm_ && best_score != worst_score) { | ||
|
||
// regularize the delta_pair_NDCG by score distance | ||
if ((norm_ || unbiased_) && best_score != worst_score) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It is unclear to me why with unbiased Lambdarank we must normalize the delta score here. |
||
delta_pair_NDCG /= (0.01f + fabs(delta_score)); | ||
} | ||
|
||
// calculate lambda for this pair | ||
double p_lambda = GetSigmoid(delta_score); | ||
double p_hessian = p_lambda * (1.0f - p_lambda); | ||
// update | ||
p_lambda *= -sigmoid_ * delta_pair_NDCG; | ||
p_hessian *= sigmoid_ * sigmoid_ * delta_pair_NDCG; | ||
|
||
int debias_high_rank = static_cast<int>(std::min(high, truncation_level_ - 1)); | ||
int debias_low_rank = static_cast<int>(std::min(low, truncation_level_ - 1)); | ||
Comment on lines
+232
to
+233
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Hi I am trying to investigate what the issue is with the biases for the truncation cutoff spot that I'm observing and I'm wondering if the thresholding here is the culprit. Within the context of this implementation, the case where Correct me if I'm wrong but the only case the above happens is when I was wondering if it's better to check the truncation level for both high and low ranks and only update the position biases when both are lower than As I'm looking for the update to introduce the position column as a metadata value, what I'm thinking is along the lines of: const data_size_t high = sorted_idx[high_rank];
const data_size_t low = sorted_idx[low_rank];
data_size_t high_position, low_position;
if (unbiased_) {
// record_positions_ is a vector of length data_size, with the position of each
// record in the data. start is the offset for the current query
high_position = record_positions_[start + sorted_idx[high_rank]];
low_position = record_positions_[start + sorted_idx[low_rank]];
} else {
high_position = high;
low_position = low;
}
if (unbiased_) {
double p_cost = log(1.0f / (1.0f - p_lambda)) * delta_pair_NDCG;
// more relevant (clicked) gets debiased by less relevant (unclicked), only if within truncation levels
if (high_position <= truncation_level_ && low_position <= truncation_level_) {
i_costs_buffer_[tid][high_position] += p_cost / j_biases_pow_[low_position];
j_costs_buffer_[tid][low_position] += p_cost / i_biases_pow_[high_position]; // and vice versa
}
}
// By default we set values of 1.0 as no-ops
double i_bias_pow = 1.0;
double j_bias_pow = 1.0;
// We only use actual bias values if they are both within the truncation limits
if (unbiased_ && high_position <= truncation_level_ && low_position <= truncation_level_) {
i_bias_pow = i_biases_pow_[high_position];
j_bias_pow = j_biases_pow_[low_position];
}
// update, either with 1.0 values if at least one of data points ended up outside the truncation threshold or the actual biases
p_lambda *= -sigmoid_ * delta_pair_NDCG / i_bias_pow / j_bias_pow;
p_hessian *= sigmoid_ * sigmoid_ * delta_pair_NDCG / i_bias_pow / j_bias_pow; Does this make sense? The main suggestion is to not "clamp" the bias positions between It's a bit unclear to me still the difference between What I want to ensure is that: since my There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @thvasilo Thanks for reviewing this. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I believe here |
||
|
||
if (unbiased_) { | ||
double p_cost = log(1.0f / (1.0f - p_lambda)) * delta_pair_NDCG; | ||
|
||
// more relevant (clicked) gets debiased by less relevant (unclicked) | ||
i_costs_buffer_[tid][debias_high_rank] += p_cost / j_biases_pow_[debias_low_rank]; | ||
j_costs_buffer_[tid][debias_low_rank] += p_cost / i_biases_pow_[debias_high_rank]; // and vice versa | ||
} | ||
|
||
p_lambda *= -sigmoid_ * delta_pair_NDCG / i_biases_pow_[debias_high_rank] / j_biases_pow_[debias_low_rank]; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Suppose we are now before starting iteration |
||
|
||
// remainder of d/dx {(34) and (36) for debiasing} | ||
p_hessian *= sigmoid_ * sigmoid_ * delta_pair_NDCG / i_biases_pow_[debias_high_rank] / j_biases_pow_[debias_low_rank]; | ||
|
||
lambdas[low] -= static_cast<score_t>(p_lambda); | ||
hessians[low] += static_cast<score_t>(p_hessian); | ||
lambdas[high] += static_cast<score_t>(p_lambda); | ||
hessians[high] += static_cast<score_t>(p_hessian); | ||
|
||
// lambda is negative, so use minus to accumulate | ||
sum_lambdas -= 2 * p_lambda; | ||
} | ||
} | ||
|
||
if (norm_ && sum_lambdas > 0) { | ||
double norm_factor = std::log2(1 + sum_lambdas) / sum_lambdas; | ||
for (data_size_t i = 0; i < cnt; ++i) { | ||
|
@@ -253,9 +306,86 @@ class LambdarankNDCG : public RankingObjective { | |
} | ||
} | ||
|
||
void InitPositionBiasesAndGradients() { | ||
i_biases_pow_.resize(truncation_level_); | ||
j_biases_pow_.resize(truncation_level_); | ||
i_costs_.resize(truncation_level_); | ||
j_costs_.resize(truncation_level_); | ||
|
||
for (int i = 0; i < truncation_level_; ++i) { | ||
// init position biases | ||
i_biases_pow_[i] = 1.0f; | ||
j_biases_pow_[i] = 1.0f; | ||
|
||
// init position gradients | ||
i_costs_[i] = 0.0f; | ||
j_costs_[i] = 0.0f; | ||
} | ||
|
||
// init gradient buffers for gathering results across threads | ||
for (int i = 0; i < num_threads_; i++) { | ||
i_costs_buffer_.emplace_back(truncation_level_, 0.0f); | ||
j_costs_buffer_.emplace_back(truncation_level_, 0.0f); | ||
} | ||
} | ||
|
||
void UpdatePositionBiasesAndGradients() const { | ||
// accumulate the parallel results | ||
for (int i = 0; i < num_threads_; i++) { | ||
for (int j = 0; j < truncation_level_; j++) { | ||
i_costs_[j] += i_costs_buffer_[i][j]; | ||
j_costs_[j] += j_costs_buffer_[i][j]; | ||
} | ||
} | ||
|
||
for (int i = 0; i < num_threads_; i++) { | ||
for (int j = 0; j < truncation_level_; j++) { | ||
// clear buffer for next run | ||
i_costs_buffer_[i][j] = 0.0f; | ||
j_costs_buffer_[i][j] = 0.0f; | ||
} | ||
} | ||
|
||
for (int i = 0; i < truncation_level_; i++) { | ||
// Update bias | ||
i_biases_pow_[i] = pow(i_costs_[i] / i_costs_[0], position_bias_regularizer); | ||
j_biases_pow_[i] = pow(j_costs_[i] / j_costs_[0], position_bias_regularizer); | ||
} | ||
|
||
LogDebugPositionBiases(); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. while i found being able to track evolution and end point of position bias adjustment values useful in debug mode, happy to remove this since it doesn't have any new feature benefit during normal usage. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes. Please remove this or wrap the debug code with |
||
|
||
for (int i = 0; i < truncation_level_; i++) { | ||
// Clear position info | ||
i_costs_[i] = 0.0f; | ||
j_costs_[i] = 0.0f; | ||
} | ||
} | ||
|
||
const char* GetName() const override { return "lambdarank"; } | ||
|
||
private: | ||
void LogDebugPositionBiases() const { | ||
std::stringstream message_stream; | ||
message_stream << std::setw(10) << "position" | ||
<< std::setw(15) << "bias_i" | ||
<< std::setw(15) << "bias_j" | ||
<< std::setw(15) << "i_cost" | ||
<< std::setw(15) << "j_cost" | ||
<< std::endl; | ||
Log::Debug(message_stream.str().c_str()); | ||
message_stream.str(""); | ||
|
||
for (int i = 0; i < truncation_level_; ++i) { | ||
message_stream << std::setw(10) << i | ||
<< std::setw(15) << i_biases_pow_[i] | ||
<< std::setw(15) << j_biases_pow_[i] | ||
<< std::setw(15) << i_costs_[i] | ||
<< std::setw(15) << j_costs_[i]; | ||
Log::Debug(message_stream.str().c_str()); | ||
message_stream.str(""); | ||
} | ||
} | ||
|
||
/*! \brief Sigmoid param */ | ||
double sigmoid_; | ||
/*! \brief Normalize the lambdas or not */ | ||
|
@@ -276,6 +406,35 @@ class LambdarankNDCG : public RankingObjective { | |
double max_sigmoid_input_ = 50; | ||
/*! \brief Factor that covert score to bin in Sigmoid table */ | ||
double sigmoid_table_idx_factor_; | ||
|
||
// bias correction variables | ||
/*! \brief power of (click) position biases */ | ||
mutable std::vector<label_t> i_biases_pow_; | ||
|
||
/*! \brief power of (unclick) position biases */ | ||
mutable std::vector<label_t> j_biases_pow_; | ||
|
||
// mutable double position cost; | ||
mutable std::vector<label_t> i_costs_; | ||
mutable std::vector<std::vector<label_t>> i_costs_buffer_; | ||
|
||
mutable std::vector<label_t> j_costs_; | ||
mutable std::vector<std::vector<label_t>> j_costs_buffer_; | ||
|
||
/*! | ||
* \brief Should use lambdarank with position bias correction | ||
* [arxiv.org/pdf/1809.05818.pdf] | ||
*/ | ||
bool unbiased_; | ||
|
||
/*! \brief Position bias regularizer norm */ | ||
double bias_p_norm_; | ||
|
||
/*! \brief Position bias regularizer exponent, 1 / (1 + bias_p_norm_) */ | ||
double position_bias_regularizer; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We'd better add an underscore after the member's name to be consistent with the whole code base, i.e. `position_bias_regularizer_`. |
||
|
||
/*! \brief Number of threads */ | ||
int num_threads_; | ||
}; | ||
|
||
/*! | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -112,21 +112,33 @@ def test_multiclass(): | |
assert gbm.evals_result_['valid_0']['multi_logloss'][gbm.best_iteration_ - 1] == pytest.approx(ret) | ||
|
||
|
||
def test_lambdarank(): | ||
def lambdarank_test_runner(lambdarank_unbiased=False, **kwargs): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. thanks very much for adding tests! I think we could have higher confidence that these changes are backwards-compatible if the existing tests were not touched at all. That sort of acts as a proxy for users' existing code. I appreciate the effort at reducing duplicated code here, but could you please instead just add new tests for this feature and leave the others untouched? It's ok if that means copying and pasting the contents of `test_lambdarank()`. |
||
rank_example_dir = Path(__file__).absolute().parents[2] / 'examples' / 'lambdarank' | ||
X_train, y_train = load_svmlight_file(str(rank_example_dir / 'rank.train')) | ||
X_test, y_test = load_svmlight_file(str(rank_example_dir / 'rank.test')) | ||
q_train = np.loadtxt(str(rank_example_dir / 'rank.train.query')) | ||
q_test = np.loadtxt(str(rank_example_dir / 'rank.test.query')) | ||
gbm = lgb.LGBMRanker(n_estimators=50) | ||
gbm = lgb.LGBMRanker(n_estimators=50, lambdarank_unbiased=lambdarank_unbiased, **kwargs) | ||
gbm.fit(X_train, y_train, group=q_train, eval_set=[(X_test, y_test)], | ||
eval_group=[q_test], eval_at=[1, 3], early_stopping_rounds=10, verbose=False, | ||
callbacks=[lgb.reset_parameter(learning_rate=lambda x: max(0.01, 0.1 - 0.01 * x))]) | ||
return gbm | ||
|
||
|
||
def test_lambdarank(): | ||
gbm = lambdarank_test_runner() | ||
assert gbm.best_iteration_ <= 24 | ||
assert gbm.best_score_['valid_0']['ndcg@1'] > 0.5674 | ||
assert gbm.best_score_['valid_0']['ndcg@3'] > 0.578 | ||
|
||
|
||
def test_lambdarank_unbiased(): | ||
gbm = lambdarank_test_runner(lambdarank_unbiased=True, sigmoid=2) | ||
assert gbm.best_iteration_ <= 24 | ||
assert gbm.best_score_['valid_0']['ndcg@1'] > 0.569 | ||
assert gbm.best_score_['valid_0']['ndcg@3'] > 0.62 | ||
|
||
|
||
def test_xendcg(): | ||
xendcg_example_dir = Path(__file__).absolute().parents[2] / 'examples' / 'xendcg' | ||
X_train, y_train = load_svmlight_file(str(xendcg_example_dir / 'rank.train')) | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Interesting, were you using SWIG builds to test your changes?
Anyway, can you please open a separate PR proposing this
.gitignore
change, and explaining how files like this get created and why they shouldn't be checked into the repo? We have a strong preference in this project for small, focused pull requests, and this change doesn't seem tightly related to the core purpose of this PR.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
removed it from the change set.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
i think this ended up here bc of the way i built it locally. 70/30 chance it was due to me not having looked/thought about c++ for 10 years. ignore the .gitignore