From 9d03b0665a0ba0e71505bb30c6c0fcaa38a6f352 Mon Sep 17 00:00:00 2001 From: Hussein Hazimeh Date: Mon, 1 Jun 2015 01:43:04 -0500 Subject: [PATCH] Renamed query_term_count to query_term_weight --- include/index/score_data.h | 4 +- include/index/score_data.h~ | 98 +++++++++++++++++++++++++++++ src/index/ranker/lm_ranker.cpp | 2 +- src/index/ranker/okapi_bm25.cpp | 4 +- src/index/ranker/pivoted_length.cpp | 2 +- src/index/ranker/ranker.cpp | 2 +- 6 files changed, 105 insertions(+), 7 deletions(-) create mode 100644 include/index/score_data.h~ diff --git a/include/index/score_data.h b/include/index/score_data.h index c071c6244..c7f9ed975 100644 --- a/include/index/score_data.h +++ b/include/index/score_data.h @@ -55,8 +55,8 @@ struct score_data /// doc term id term_id t_id; - /// query term count - uint64_t query_term_count; + /// query term count (or weight in case of feedback) + double query_term_weight; /// number of docs that t_id appears in uint64_t doc_count; /// number of times t_id appears in corpus diff --git a/include/index/score_data.h~ b/include/index/score_data.h~ new file mode 100644 index 000000000..c071c6244 --- /dev/null +++ b/include/index/score_data.h~ @@ -0,0 +1,98 @@ +/** + * @file score_data.h + * @author Sean Massung + * + * All files in META are dual-licensed under the MIT and NCSA licenses. For more + * details, consult the file LICENSE.mit and LICENSE.ncsa in the root of the + * project. + */ + +#ifndef META_SCORE_DATA_H_ +#define META_SCORE_DATA_H_ + +#include "meta.h" + +namespace meta +{ + +namespace corpus +{ +class document; +} + +namespace index +{ +class inverted_index; +} +} + +namespace meta +{ +namespace index +{ + +/** + * A score_data object contains information needed to evaluate a ranking + * function. Data is set by the base ranker class as needed, so the derived + * ranking classes don't make many unnecessary calls to the inverted index. 
+ */ +struct score_data +{ + // general info + + /// index queries are running on + inverted_index& idx; + /// average document length + double avg_dl; + /// total number of documents + uint64_t num_docs; + /// total number of terms in the index + uint64_t total_terms; + /// the current query + const corpus::document& query; + + // term-based info + + /// doc term id + term_id t_id; + /// query term count + uint64_t query_term_count; + /// number of docs that t_id appears in + uint64_t doc_count; + /// number of times t_id appears in corpus + uint64_t corpus_term_count; + + // document-based info + + /// document id + doc_id d_id; + /// number of times the term appears in the current doc + uint64_t doc_term_count; + /// total number of terms in the doc + uint64_t doc_size; + /// number of unique terms in the doc + uint64_t doc_unique_terms; + + /** + * Constructor to initialize most elements. + * @param p_idx The index that is being used + * @param p_avg_dl The average doc length in the index + * @param p_num_docs The number of docs in the index + * @param p_total_terms The total number of terms in the index + * @param p_query The current query + */ + score_data(inverted_index& p_idx, double p_avg_dl, uint64_t p_num_docs, + uint64_t p_total_terms, const corpus::document& p_query) + : idx(p_idx), // gcc no non-const ref init from brace init list + avg_dl{p_avg_dl}, + num_docs{p_num_docs}, + total_terms{p_total_terms}, + query(p_query) // gcc no non-const ref init from brace init list + { + /* nothing */ + } +}; +} +} + +#endif diff --git a/src/index/ranker/lm_ranker.cpp b/src/index/ranker/lm_ranker.cpp index ce8c1cd8e..9451705fe 100644 --- a/src/index/ranker/lm_ranker.cpp +++ b/src/index/ranker/lm_ranker.cpp @@ -20,7 +20,7 @@ double language_model_ranker::score_one(const score_data& sd) double ps = smoothed_prob(sd); double pc = static_cast<double>(sd.corpus_term_count) / sd.total_terms; - return sd.query_term_count * std::log(ps / (doc_constant(sd) * pc)); + return 
sd.query_term_weight * std::log(ps / (doc_constant(sd) * pc)); } double language_model_ranker::initial_score(const score_data& sd) const diff --git a/src/index/ranker/okapi_bm25.cpp b/src/index/ranker/okapi_bm25.cpp index c31f731af..2cb8810cf 100644 --- a/src/index/ranker/okapi_bm25.cpp +++ b/src/index/ranker/okapi_bm25.cpp @@ -32,8 +32,8 @@ double okapi_bm25::score_one(const score_data& sd) / ((k1_ * ((1.0 - b_) + b_ * doc_len / sd.avg_dl)) + sd.doc_term_count); - double QTF = ((k3_ + 1.0) * sd.query_term_count) - / (k3_ + sd.query_term_count); + double QTF = ((k3_ + 1.0) * sd.query_term_weight) + / (k3_ + sd.query_term_weight); return TF * IDF * QTF; } diff --git a/src/index/ranker/pivoted_length.cpp b/src/index/ranker/pivoted_length.cpp index 2e03e9930..255c452ac 100644 --- a/src/index/ranker/pivoted_length.cpp +++ b/src/index/ranker/pivoted_length.cpp @@ -26,7 +26,7 @@ double pivoted_length::score_one(const score_data& sd) double norm = (1 - s_) + s_ * (doc_len / sd.avg_dl); double IDF = log((sd.num_docs + 1) / (0.5 + sd.doc_count)); - return TF / norm * sd.query_term_count * IDF; + return TF / norm * sd.query_term_weight * IDF; } template <> diff --git a/src/index/ranker/ranker.cpp b/src/index/ranker/ranker.cpp index 425117b4f..71aea6378 100644 --- a/src/index/ranker/ranker.cpp +++ b/src/index/ranker/ranker.cpp @@ -36,7 +36,7 @@ ranker::score(inverted_index& idx, corpus::document& query, auto pdata = idx.search_primary(t_id); sd.doc_count = pdata->counts().size(); sd.t_id = t_id; - sd.query_term_count = tpair.second; + sd.query_term_weight = tpair.second; sd.corpus_term_count = idx.total_num_occurences(sd.t_id); for (auto& dpair : pdata->counts()) {