add __main__ checks to each file
p-c-e-s committed Aug 3, 2016
1 parent 1876fa6 commit 3dd5569
Showing 5 changed files with 96 additions and 92 deletions.
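The guard added in each file is the standard Python idiom: code placed under if __name__ == "__main__": runs only when the file is executed as a script, not when it is imported as a module. A minimal standalone sketch of the idiom (illustrative file and function names, not from this repository):

# demo.py -- hypothetical module illustrating the __main__ guard
def build():
    # driver work that should only run when the script is executed directly
    print("building...")

if __name__ == "__main__":
    build()   # executed for "python demo.py", skipped for "import demo"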
31 changes: 16 additions & 15 deletions build_index.py
@@ -7,20 +7,21 @@
filename = path.join("datasets", "pokemon_pages_current.json.gz")
index = defaultdict(dict)

-doc_titles = [doc["title"] for doc in load_wikidata(filename)]
+if __name__ == "__main__":
+    doc_titles = [doc["title"] for doc in load_wikidata(filename)]

-for doc_no, doc in enumerate(load_wikidata(filename)):
-    tokens = tokenize_and_stem(clean_wikidata(doc["text"]))
-    if len(tokens) == 0:
-        continue
-    token_counts = count_tokens(tokens)
-    max_token_count = max_count_token(token_counts)
-    token_set = token_counts.keys()
-    for token in token_set:
-        tf = token_counts[token] / max_token_count
-        index[token][doc_no] = tf
+    for doc_no, doc in enumerate(load_wikidata(filename)):
+        tokens = tokenize_and_stem(clean_wikidata(doc["text"]))
+        if len(tokens) == 0:
+            continue
+        token_counts = count_tokens(tokens)
+        max_token_count = max_count_token(token_counts)
+        token_set = token_counts.keys()
+        for token in token_set:
+            tf = token_counts[token] / max_token_count
+            index[token][doc_no] = tf

-with open("index.pickle", "wb") as f:
-    pickle.dump(index, f)
-with open("doc_titles.pickle", "wb") as f:
-    pickle.dump(doc_titles, f)
+    with open("index.pickle", "wb") as f:
+        pickle.dump(index, f)
+    with open("doc_titles.pickle", "wb") as f:
+        pickle.dump(doc_titles, f)
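For context, build_index.py stores a normalized term frequency in the inverted index: each token count is divided by the count of the document's most frequent token, so tf values lie in (0, 1]. A toy standalone illustration using collections.Counter in place of the repository's count_tokens / max_count_token helpers (whose exact return types are not shown here):

from collections import Counter

tokens = ["pikachu", "thunder", "pikachu", "electric"]
token_counts = Counter(tokens)                 # Counter({'pikachu': 2, 'thunder': 1, 'electric': 1})
max_token_count = max(token_counts.values())   # 2
tf = {token: count / max_token_count for token, count in token_counts.items()}
# {'pikachu': 1.0, 'thunder': 0.5, 'electric': 0.5}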
1 change: 0 additions & 1 deletion load_dataset.py
@@ -13,7 +13,6 @@
"Category:|Board:|Quiz:|QuizArticle:|Category talk:|Forum:|" \
"MediaWiki:|MediaWiki talk:|Kategorie:|Datei:|Wikipedia:)")


def load_wikidata(filename):
    if filename.endswith(".gz"):
        return load_wikidata_gzip(filename)
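load_wikidata dispatches on the file extension so the gzipped dump used above can be read without unpacking it first. A plausible standalone reader for such a dump, assuming one JSON document per line with "title" and "text" fields (the repository's actual load_wikidata_gzip is not shown in this diff):

import gzip
import json

def read_json_lines_gz(filename):
    # yield one parsed document per line of the gzipped file
    with gzip.open(filename, "rt", encoding="utf-8") as f:
        for line in f:
            if line.strip():
                yield json.loads(line)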
34 changes: 18 additions & 16 deletions pagerank.py
@@ -15,6 +15,7 @@
num_links = []
pagerank_values_old = []
pagerank_values = []
+doc_titles = None

def convert_title(link):
    # Some links have a section for the page category, which is cut
@@ -25,10 +26,6 @@ def convert_title(link):
    else:
        return link.lower()

-with open("doc_titles.pickle", "rb") as f:
-    doc_titles = pickle.load(f)
-doc_titles = set(map(convert_title, doc_titles))
-
def gather_sites():
    # list all documents that are actually linked to by some other
    # document
@@ -99,17 +96,22 @@ def update_pagerank_value(index):
    pagerank_values[index] += sum_pr * d_global


-gather_sites()
-build_docid_mapping()
-build_adj_list()
-calc_pagerank_head()
-pagerank_final_values = [(i, pagerank_values[i]) for i in range(num_docs)]
+if __name__ == "__main__":
+    with open("doc_titles.pickle", "rb") as f:
+        doc_titles = pickle.load(f)
+    doc_titles = set(map(convert_title, doc_titles))
+
+    gather_sites()
+    build_docid_mapping()
+    build_adj_list()
+    calc_pagerank_head()
+    pagerank_final_values = [(i, pagerank_values[i]) for i in range(num_docs)]

-# map document titles to their respective pagerank values
-pagerank_mapping = {}
-for site in pagerank_final_values:
-    pagerank_mapping[doctitle_of_docid[site[0]]] = site[1]
+    # map document titles to their respective pagerank values
+    pagerank_mapping = {}
+    for site in pagerank_final_values:
+        pagerank_mapping[doctitle_of_docid[site[0]]] = site[1]

-with open("pagerank.pickle", "wb") as f:
-    pickle.dump(pagerank_mapping, f)
-print("done")
+    with open("pagerank.pickle", "wb") as f:
+        pickle.dump(pagerank_mapping, f)
+    print("done")
71 changes: 36 additions & 35 deletions pagerank_query.py
@@ -14,39 +14,40 @@

N = len(doc_titles)

-while True:
-    search_tokens = tokenize_and_stem(input("Query: "))
-
-    # Perform OR query
-    all_docs = []
-    for i, token in enumerate(search_tokens):
-        all_docs += index[token]
-    all_docs = set(all_docs)
-
-    # Calculate tfidf scores
-    doc_scores = {}
-    idf_values = {}
-    for token in search_tokens:
-        if len(index[token]) == 0: continue
-        idf_values[token] = math.log(N / len(index[token]))
-    for doc_no in all_docs:
-        score = 0
+if __name__ == "__main__":
+    while True:
+        search_tokens = tokenize_and_stem(input("Query: "))
+
+        # Perform OR query
+        all_docs = []
+        for i, token in enumerate(search_tokens):
+            all_docs += index[token]
+        all_docs = set(all_docs)
+
+        # Calculate tfidf scores
+        doc_scores = {}
+        idf_values = {}
        for token in search_tokens:
-            if doc_no in index[token]:
-                tf = index[token][doc_no]
-                idf = idf_values[token]
-                score += tf * idf
-        doc_scores[doc_no] = score
-
-    # apply pagerank values
-    ranked_docs = []
-    for item in doc_scores.items():
-        ranked_docs.append((doc_titles[item[0]],
-            pagerank_mapping[convert_title(doc_titles[item[0]])] * item[1]))
-
-    # Sort by (tfidf * pagerank)
-    ranked_docs = sorted(ranked_docs, key=lambda x:x[1], reverse=True)
-
-    print("## Found {} documents. Most relevant titles: ##".format(len(ranked_docs)))
-    for doc in ranked_docs[0:30]:
-        print(doc)
+            if len(index[token]) == 0: continue
+            idf_values[token] = math.log(N / len(index[token]))
+        for doc_no in all_docs:
+            score = 0
+            for token in search_tokens:
+                if doc_no in index[token]:
+                    tf = index[token][doc_no]
+                    idf = idf_values[token]
+                    score += tf * idf
+            doc_scores[doc_no] = score
+
+        # apply pagerank values
+        ranked_docs = []
+        for item in doc_scores.items():
+            ranked_docs.append((doc_titles[item[0]],
+                pagerank_mapping[convert_title(doc_titles[item[0]])] * item[1]))
+
+        # Sort by (tfidf * pagerank)
+        ranked_docs = sorted(ranked_docs, key=lambda x:x[1], reverse=True)
+
+        print("## Found {} documents. Most relevant titles: ##".format(len(ranked_docs)))
+        for doc in ranked_docs[0:30]:
+            print(doc[0])
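Compared with query.py below, the ranking key here multiplies each document's tf-idf score by its precomputed PageRank value, i.e. score(d, q) = tfidf(d, q) * PR(d), so well-linked pages rise above obscure ones with the same textual match. A tiny numeric illustration (hypothetical titles and values):

tfidf = 1.2
ranked = sorted([("Pikachu", tfidf * 0.004), ("List of minor characters", tfidf * 0.0003)],
                key=lambda x: x[1], reverse=True)
# ('Pikachu', ~0.0048) now ranks above ('List of minor characters', ~0.00036)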
51 changes: 26 additions & 25 deletions query.py
@@ -9,33 +9,34 @@

N = len(doc_titles)

-while True:
-    search_tokens = tokenize_and_stem(input("Query: "))
+if __name__ == "__main__":
+    while True:
+        search_tokens = tokenize_and_stem(input("Query: "))

-    # Perform OR query
-    all_docs = []
-    for i, token in enumerate(search_tokens):
-        all_docs += index[token]
-    all_docs = set(all_docs)
+        # Perform OR query
+        all_docs = []
+        for i, token in enumerate(search_tokens):
+            all_docs += index[token]
+        all_docs = set(all_docs)

-    # Calculate tfidf scores
-    doc_scores = {}
-    idf_values = {}
-    for token in search_tokens:
-        if len(index[token]) == 0: continue
-        idf_values[token] = math.log(N / len(index[token]))
-    for doc_no in all_docs:
-        score = 0
+        # Calculate tfidf scores
+        doc_scores = {}
+        idf_values = {}
        for token in search_tokens:
-            if doc_no in index[token]:
-                tf = index[token][doc_no]
-                idf = idf_values[token]
-                score += tf * idf
-        doc_scores[doc_no] = score
+            if len(index[token]) == 0: continue
+            idf_values[token] = math.log(N / len(index[token]))
+        for doc_no in all_docs:
+            score = 0
+            for token in search_tokens:
+                if doc_no in index[token]:
+                    tf = index[token][doc_no]
+                    idf = idf_values[token]
+                    score += tf * idf
+            doc_scores[doc_no] = score

-    # Sort by tfidf scores
-    ranked_docs = sorted(doc_scores, key=doc_scores.get, reverse=True)
+        # Sort by tfidf scores
+        ranked_docs = sorted(doc_scores, key=doc_scores.get, reverse=True)

-    print("## Found {} documents. Most relevant titles: ##".format(len(ranked_docs)))
-    for doc_no in ranked_docs[0:30]:
-        print(doc_titles[doc_no])
+        print("## Found {} documents. Most relevant titles: ##".format(len(ranked_docs)))
+        for doc_no in ranked_docs[0:30]:
+            print(doc_titles[doc_no])
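query.py ranks purely by tf-idf: for each query token, idf = log(N / document frequency), and a document's score is the sum of tf * idf over the query tokens it contains. A worked toy example with hypothetical numbers (standalone, not the repository's data):

import math

N = 1000                                   # documents in the collection
index = {"pikachu": {7: 0.5, 42: 1.0}}     # token -> {doc_no: normalized tf}
idf = math.log(N / len(index["pikachu"]))  # log(1000 / 2), about 6.21
score_doc_7 = index["pikachu"][7] * idf    # about 3.11
score_doc_42 = index["pikachu"][42] * idf  # about 6.21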
