Renamed query_term_count to query_term_weight

henfiber · Jun 1, 2015 · 9d03b06 · 9d03b06
1 parent bef7d46
commit 9d03b06
Show file tree

Hide file tree

Showing 6 changed files with 105 additions and 7 deletions.
diff --git a/include/index/score_data.h b/include/index/score_data.h
@@ -55,8 +55,8 @@ struct score_data
 
     /// doc term id
     term_id t_id;
-    /// query term count
-    uint64_t query_term_count;
+    /// query term count (or weight in case of feedback)
+    double query_term_weight;
     /// number of docs that t_id appears in
     uint64_t doc_count;
     /// number of times t_id appears in corpus

diff --git a/include/index/score_data.h~ b/include/index/score_data.h~
@@ -0,0 +1,98 @@
+/**
+ * @file score_data.h
+ * @author Sean Massung
+ *
+ * All files in META are dual-licensed under the MIT and NCSA licenses. For more
+ * details, consult the file LICENSE.mit and LICENSE.ncsa in the root of the
+ * project.
+ */
+
+#ifndef META_SCORE_DATA_H_
+#define META_SCORE_DATA_H_
+
+#include "meta.h"
+
+namespace meta
+{
+
+namespace corpus
+{
+class document;
+}
+
+namespace index
+{
+class inverted_index;
+}
+}
+
+namespace meta
+{
+namespace index
+{
+
+/**
+ * A score_data object contains information needed to evaluate a ranking
+ * function. Data is set by the base ranker class as needed, so the derived
+ * ranking classes don't make many unnecessary calls to the inverted index.
+ */
+struct score_data
+{
+    // general info (set once by the constructor)
+
+    /// the index that queries are running on
+    inverted_index& idx;
+    /// average document length
+    double avg_dl;
+    /// total number of documents
+    uint64_t num_docs;
+    /// total number of terms in the index
+    uint64_t total_terms;
+    /// the current query
+    const corpus::document& query;
+
+    // term-based info (not set by the constructor)
+
+    /// doc term id
+    term_id t_id;
+    /// query term count
+    uint64_t query_term_count;
+    /// number of docs that t_id appears in
+    uint64_t doc_count;
+    /// number of times t_id appears in corpus
+    uint64_t corpus_term_count;
+
+    // document-based info (not set by the constructor)
+
+    /// document id
+    doc_id d_id;
+    /// number of times the term appears in the current doc
+    uint64_t doc_term_count;
+    /// total number of terms in the doc
+    uint64_t doc_size;
+    /// number of unique terms in the doc
+    uint64_t doc_unique_terms;
+
+    /**
+     * Initializes only the general info members; the rest are not set here.
+     * @param p_idx The index that is being used
+     * @param p_avg_dl The average doc length in the index
+     * @param p_num_docs The number of docs in the index
+     * @param p_total_terms The total number of terms in the index
+     * @param p_query The current query
+     */
+    score_data(inverted_index& p_idx, double p_avg_dl, uint64_t p_num_docs,
+               uint64_t p_total_terms, const corpus::document& p_query)
+        : idx(p_idx), // parens, not braces: gcc rejects brace-init of a reference
+          avg_dl{p_avg_dl},
+          num_docs{p_num_docs},
+          total_terms{p_total_terms},
+          query(p_query) // parens, not braces: gcc rejects brace-init of a reference
+    {
+        /* nothing */
+    }
+};
+}
+}
+
+#endif
diff --git a/src/index/ranker/lm_ranker.cpp b/src/index/ranker/lm_ranker.cpp
@@ -20,7 +20,7 @@ double language_model_ranker::score_one(const score_data& sd)
     double ps = smoothed_prob(sd);
     double pc = static_cast<double>(sd.corpus_term_count) / sd.total_terms;
 
-    return sd.query_term_count * std::log(ps / (doc_constant(sd) * pc));
+    return sd.query_term_weight * std::log(ps / (doc_constant(sd) * pc));
 }
 
 double language_model_ranker::initial_score(const score_data& sd) const

diff --git a/src/index/ranker/okapi_bm25.cpp b/src/index/ranker/okapi_bm25.cpp
@@ -32,8 +32,8 @@ double okapi_bm25::score_one(const score_data& sd)
                 / ((k1_ * ((1.0 - b_) + b_ * doc_len / sd.avg_dl))
                    + sd.doc_term_count);
 
-    double QTF = ((k3_ + 1.0) * sd.query_term_count)
-                 / (k3_ + sd.query_term_count);
+    double QTF = ((k3_ + 1.0) * sd.query_term_weight)
+                 / (k3_ + sd.query_term_weight);
 
     return TF * IDF * QTF;
 }

diff --git a/src/index/ranker/pivoted_length.cpp b/src/index/ranker/pivoted_length.cpp
@@ -26,7 +26,7 @@ double pivoted_length::score_one(const score_data& sd)
     double norm = (1 - s_) + s_ * (doc_len / sd.avg_dl);
     double IDF = log((sd.num_docs + 1) / (0.5 + sd.doc_count));
 
-    return TF / norm * sd.query_term_count * IDF;
+    return TF / norm * sd.query_term_weight * IDF;
 }
 
 template <>

diff --git a/src/index/ranker/ranker.cpp b/src/index/ranker/ranker.cpp
@@ -36,7 +36,7 @@ ranker::score(inverted_index& idx, corpus::document& query,
         auto pdata = idx.search_primary(t_id);
         sd.doc_count = pdata->counts().size();
         sd.t_id = t_id;
-        sd.query_term_count = tpair.second;
+        sd.query_term_weight = tpair.second;
         sd.corpus_term_count = idx.total_num_occurences(sd.t_id);
         for (auto& dpair : pdata->counts())
         {