add __main__ checks to each file
p-c-e-s committed Aug 3, 2016
1 parent 1876fa6 commit 3dd5569
Showing 5 changed files with 96 additions and 92 deletions.
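The guard added in each file is the standard Python idiom: code placed under if __name__ == "__main__": runs only when the file is executed as a script, not when it is imported as a module. A minimal standalone sketch of the idiom (illustrative file and function names, not from this repository):

# demo.py -- hypothetical module illustrating the __main__ guard
def build():
    # driver work that should only run when the script is executed directly
    print("building...")

if __name__ == "__main__":
    build()   # executed for "python demo.py", skipped for "import demo"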
31 changes: 16 additions & 15 deletions build_index.py
@@ -7,20 +7,21 @@
filename = path.join("datasets", "pokemon_pages_current.json.gz")
index = defaultdict(dict)

-doc_titles = [doc["title"] for doc in load_wikidata(filename)]
+if __name__ == "__main__":
+    doc_titles = [doc["title"] for doc in load_wikidata(filename)]

-for doc_no, doc in enumerate(load_wikidata(filename)):
-    tokens = tokenize_and_stem(clean_wikidata(doc["text"]))
-    if len(tokens) == 0:
-        continue
-    token_counts = count_tokens(tokens)
-    max_token_count = max_count_token(token_counts)
-    token_set = token_counts.keys()
-    for token in token_set:
-        tf = token_counts[token] / max_token_count
-        index[token][doc_no] = tf
+    for doc_no, doc in enumerate(load_wikidata(filename)):
+        tokens = tokenize_and_stem(clean_wikidata(doc["text"]))
+        if len(tokens) == 0:
+            continue
+        token_counts = count_tokens(tokens)
+        max_token_count = max_count_token(token_counts)
+        token_set = token_counts.keys()
+        for token in token_set:
+            tf = token_counts[token] / max_token_count
+            index[token][doc_no] = tf

-with open("index.pickle", "wb") as f:
-    pickle.dump(index, f)
-with open("doc_titles.pickle", "wb") as f:
-    pickle.dump(doc_titles, f)
+    with open("index.pickle", "wb") as f:
+        pickle.dump(index, f)
+    with open("doc_titles.pickle", "wb") as f:
+        pickle.dump(doc_titles, f)
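For context, build_index.py stores a normalized term frequency in the inverted index: each token count is divided by the count of the document's most frequent token, so tf values lie in (0, 1]. A toy standalone illustration using collections.Counter in place of the repository's count_tokens / max_count_token helpers (whose exact return types are not shown here):

from collections import Counter

tokens = ["pikachu", "thunder", "pikachu", "electric"]
token_counts = Counter(tokens)                 # Counter({'pikachu': 2, 'thunder': 1, 'electric': 1})
max_token_count = max(token_counts.values())   # 2
tf = {token: count / max_token_count for token, count in token_counts.items()}
# {'pikachu': 1.0, 'thunder': 0.5, 'electric': 0.5}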
1 change: 0 additions & 1 deletion load_dataset.py
@@ -13,7 +13,6 @@
"Category:|Board:|Quiz:|QuizArticle:|Category talk:|Forum:|" \
"MediaWiki:|MediaWiki talk:|Kategorie:|Datei:|Wikipedia:)")


def load_wikidata(filename):
    if filename.endswith(".gz"):
        return load_wikidata_gzip(filename)
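load_wikidata dispatches on the file extension so the gzipped dump used above can be read without unpacking it first. A plausible standalone reader for such a dump, assuming one JSON document per line with "title" and "text" fields (the repository's actual load_wikidata_gzip is not shown in this diff):

import gzip
import json

def read_json_lines_gz(filename):
    # yield one parsed document per line of the gzipped file
    with gzip.open(filename, "rt", encoding="utf-8") as f:
        for line in f:
            if line.strip():
                yield json.loads(line)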
34 changes: 18 additions & 16 deletions pagerank.py
@@ -15,6 +15,7 @@
num_links = []
pagerank_values_old = []
pagerank_values = []
+doc_titles = None

def convert_title(link):
    # Some links have a section for the page category, which is cut
@@ -25,10 +26,6 @@ def convert_title(link):
    else:
        return link.lower()

-with open("doc_titles.pickle", "rb") as f:
-    doc_titles = pickle.load(f)
-doc_titles = set(map(convert_title, doc_titles))
-
def gather_sites():
    # list all documents that are actually linked to by some other
    # document
@@ -99,17 +96,22 @@ def update_pagerank_value(index):
    pagerank_values[index] += sum_pr * d_global


-gather_sites()
-build_docid_mapping()
-build_adj_list()
-calc_pagerank_head()
-pagerank_final_values = [(i, pagerank_values[i]) for i in range(num_docs)]
+if __name__ == "__main__":
+    with open("doc_titles.pickle", "rb") as f:
+        doc_titles = pickle.load(f)
+    doc_titles = set(map(convert_title, doc_titles))
+
+    gather_sites()
+    build_docid_mapping()
+    build_adj_list()
+    calc_pagerank_head()
+    pagerank_final_values = [(i, pagerank_values[i]) for i in range(num_docs)]

-# map document titles to their respective pagerank values
-pagerank_mapping = {}
-for site in pagerank_final_values:
-    pagerank_mapping[doctitle_of_docid[site[0]]] = site[1]
+    # map document titles to their respective pagerank values
+    pagerank_mapping = {}
+    for site in pagerank_final_values:
+        pagerank_mapping[doctitle_of_docid[site[0]]] = site[1]

-with open("pagerank.pickle", "wb") as f:
-    pickle.dump(pagerank_mapping, f)
-print("done")
+    with open("pagerank.pickle", "wb") as f:
+        pickle.dump(pagerank_mapping, f)
+    print("done")
71 changes: 36 additions & 35 deletions pagerank_query.py
@@ -14,39 +14,40 @@

N = len(doc_titles)

-while True:
-    search_tokens = tokenize_and_stem(input("Query: "))
-
-    # Perform OR query
-    all_docs = []
-    for i, token in enumerate(search_tokens):
-        all_docs += index[token]
-    all_docs = set(all_docs)
-
-    # Calculate tfidf scores
-    doc_scores = {}
-    idf_values = {}
-    for token in search_tokens:
-        if len(index[token]) == 0: continue
-        idf_values[token] = math.log(N / len(index[token]))
-    for doc_no in all_docs:
-        score = 0
+if __name__ == "__main__":
+    while True:
+        search_tokens = tokenize_and_stem(input("Query: "))
+
+        # Perform OR query
+        all_docs = []
+        for i, token in enumerate(search_tokens):
+            all_docs += index[token]
+        all_docs = set(all_docs)
+
+        # Calculate tfidf scores
+        doc_scores = {}
+        idf_values = {}
        for token in search_tokens:
-            if doc_no in index[token]:
-                tf = index[token][doc_no]
-                idf = idf_values[token]
-                score += tf * idf
-        doc_scores[doc_no] = score
-
-    # apply pagerank values
-    ranked_docs = []
-    for item in doc_scores.items():
-        ranked_docs.append((doc_titles[item[0]],
-            pagerank_mapping[convert_title(doc_titles[item[0]])] * item[1]))
-
-    # Sort by (tfidf * pagerank)
-    ranked_docs = sorted(ranked_docs, key=lambda x:x[1], reverse=True)
-
-    print("## Found {} documents. Most relevant titles: ##".format(len(ranked_docs)))
-    for doc in ranked_docs[0:30]:
-        print(doc)
+            if len(index[token]) == 0: continue
+            idf_values[token] = math.log(N / len(index[token]))
+        for doc_no in all_docs:
+            score = 0
+            for token in search_tokens:
+                if doc_no in index[token]:
+                    tf = index[token][doc_no]
+                    idf = idf_values[token]
+                    score += tf * idf
+            doc_scores[doc_no] = score
+
+        # apply pagerank values
+        ranked_docs = []
+        for item in doc_scores.items():
+            ranked_docs.append((doc_titles[item[0]],
+                pagerank_mapping[convert_title(doc_titles[item[0]])] * item[1]))
+
+        # Sort by (tfidf * pagerank)
+        ranked_docs = sorted(ranked_docs, key=lambda x:x[1], reverse=True)
+
+        print("## Found {} documents. Most relevant titles: ##".format(len(ranked_docs)))
+        for doc in ranked_docs[0:30]:
+            print(doc[0])
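Compared with query.py below, the ranking key here multiplies each document's tf-idf score by its precomputed PageRank value, i.e. score(d, q) = tfidf(d, q) * PR(d), so well-linked pages rise above obscure ones with the same textual match. A tiny numeric illustration (hypothetical titles and values):

tfidf = 1.2
ranked = sorted([("Pikachu", tfidf * 0.004), ("List of minor characters", tfidf * 0.0003)],
                key=lambda x: x[1], reverse=True)
# ('Pikachu', ~0.0048) now ranks above ('List of minor characters', ~0.00036)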
51 changes: 26 additions & 25 deletions query.py
@@ -9,33 +9,34 @@

N = len(doc_titles)

-while True:
-    search_tokens = tokenize_and_stem(input("Query: "))
+if __name__ == "__main__":
+    while True:
+        search_tokens = tokenize_and_stem(input("Query: "))

-    # Perform OR query
-    all_docs = []
-    for i, token in enumerate(search_tokens):
-        all_docs += index[token]
-    all_docs = set(all_docs)
+        # Perform OR query
+        all_docs = []
+        for i, token in enumerate(search_tokens):
+            all_docs += index[token]
+        all_docs = set(all_docs)

-    # Calculate tfidf scores
-    doc_scores = {}
-    idf_values = {}
-    for token in search_tokens:
-        if len(index[token]) == 0: continue
-        idf_values[token] = math.log(N / len(index[token]))
-    for doc_no in all_docs:
-        score = 0
+        # Calculate tfidf scores
+        doc_scores = {}
+        idf_values = {}
        for token in search_tokens:
-            if doc_no in index[token]:
-                tf = index[token][doc_no]
-                idf = idf_values[token]
-                score += tf * idf
-        doc_scores[doc_no] = score
+            if len(index[token]) == 0: continue
+            idf_values[token] = math.log(N / len(index[token]))
+        for doc_no in all_docs:
+            score = 0
+            for token in search_tokens:
+                if doc_no in index[token]:
+                    tf = index[token][doc_no]
+                    idf = idf_values[token]
+                    score += tf * idf
+            doc_scores[doc_no] = score

-    # Sort by tfidf scores
-    ranked_docs = sorted(doc_scores, key=doc_scores.get, reverse=True)
+        # Sort by tfidf scores
+        ranked_docs = sorted(doc_scores, key=doc_scores.get, reverse=True)

-    print("## Found {} documents. Most relevant titles: ##".format(len(ranked_docs)))
-    for doc_no in ranked_docs[0:30]:
-        print(doc_titles[doc_no])
+        print("## Found {} documents. Most relevant titles: ##".format(len(ranked_docs)))
+        for doc_no in ranked_docs[0:30]:
+            print(doc_titles[doc_no])
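query.py ranks purely by tf-idf: for each query token, idf = log(N / document frequency), and a document's score is the sum of tf * idf over the query tokens it contains. A worked toy example with hypothetical numbers (standalone, not the repository's data):

import math

N = 1000                                   # documents in the collection
index = {"pikachu": {7: 0.5, 42: 1.0}}     # token -> {doc_no: normalized tf}
idf = math.log(N / len(index["pikachu"]))  # log(1000 / 2), about 6.21
score_doc_7 = index["pikachu"][7] * idf    # about 3.11
score_doc_42 = index["pikachu"][42] * idf  # about 6.21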
