Kscore #2

Open · wants to merge 3 commits into master
18 changes: 18 additions & 0 deletions data/WikiQA/find_bad.py

@@ -0,0 +1,18 @@
+import pickle
+import gzip
+
+wrong_indices = pickle.load(open("wrong_indices.p", "rb"))
+
+with gzip.open('WikiQA-dev.tsv.gz', 'rb') as f:
+    next(f)
+    count = 1
+    test = 0
+    for line in f:
+        if count in wrong_indices:
+            line = line.decode("UTF-8")
+            line = line.split("\t")
+            print(line[1])
+            print(line[5])
+            print()
+            test += 1
+        count += 1
17 changes: 17 additions & 0 deletions data/WikiQA/find_bad.py~

@@ -0,0 +1,17 @@
+import pickle
+import gzip
+
+wrong_indices = pickle.load(open("wrong_indices.p", "rb"))
+
+with gzip.open('WikiQA-dev.tsv.gz', 'rb') as f:
+    next(f)
+    count = 1
+    test = 0
+    for line in f:
+        if count in wrong_indices:
+            line = line.decode("UTF-8")
+            line = line.split("\t")
+            print(line[1])
+            print()
+            test += 1
+        count += 1
18 changes: 18 additions & 0 deletions data/WikiQA/find_good.py

@@ -0,0 +1,18 @@
+import pickle
+import gzip
+
+wrong_indices = pickle.load(open("wrong_indices.p", "rb"))
+
+with gzip.open('WikiQA-dev.tsv.gz', 'rb') as f:
+    next(f)
+    count = 1
+    test = 0
+    for line in f:
+        if count not in wrong_indices:
+            line = line.decode("UTF-8")
+            line = line.split("\t")
+            print(line[1])
+            print(line[5])
+            print()
+            test += 1
+        count += 1
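
Note for reviewers: find_bad.py and find_good.py are identical except for the membership test (in vs. not in), and find_bad.py~ is an editor backup of the same script. A single parameterized script could replace all three; a minimal sketch, where the inspect_indices name and the --invert flag are invented here for illustration:

    import argparse
    import gzip
    import pickle

    def inspect_indices(tsv_path, indices_path, invert=False):
        # Print the question (column 1) and answer sentence (column 5) of every
        # row whose 1-based index is in the pickled set; with invert=True,
        # print the rows that are NOT in the set instead.
        with open(indices_path, "rb") as p:
            wrong_indices = pickle.load(p)
        with gzip.open(tsv_path, "rb") as f:
            next(f)  # skip the TSV header line
            for count, line in enumerate(f, start=1):
                if (count in wrong_indices) != invert:
                    fields = line.decode("UTF-8").split("\t")
                    print(fields[1])
                    print(fields[5])
                    print()

    if __name__ == "__main__":
        parser = argparse.ArgumentParser()
        parser.add_argument("--invert", action="store_true",
                            help="show rows outside the index set (find_good behaviour)")
        args = parser.parse_args()
        inspect_indices("WikiQA-dev.tsv.gz", "wrong_indices.p", invert=args.invert)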
Empty file added data/WikiQA/find_good.py~
Empty file.
Binary file added data/WikiQA/wrong_indices.p
Binary file not shown.
Binary file added ranking/__pycache__/preprocess.cpython-35.pyc
Binary file not shown.
Binary file not shown.
Binary file added ranking/__pycache__/test_model.cpython-35.pyc
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
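
The committed *.py~ backups and __pycache__ bytecode files are usually kept out of version control; if committing them was unintentional, a .gitignore along these lines (a suggestion, not part of this diff) would keep them out of future commits:

    # Editor backup files
    *~
    # Python bytecode caches
    __pycache__/
    *.pyc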
30 changes: 16 additions & 14 deletions ranking/preprocess.py

@@ -1,6 +1,6 @@
 from tfidf_preprocess import TfidfPreprocess
 from qword_preprocess import QwordPreprocess
-from word_embeddings import WordEmbeddings
+#from word_embeddings import WordEmbeddings
 from union_intersect_preprocess import UnionIntersect
 import numpy as np
 import sklearn.metrics.pairwise
@@ -23,17 +23,18 @@ def process_dataset(self, dataset, pickle_name):
         qwords = QwordPreprocess()
         question_vector = qwords.get_question_word_data(dataset)
         print(question_vector)
-        we = WordEmbeddings()
-        det_val_vector = we.get_det_val_dataset(dataset)
-        print(det_val_vector)
-        sum_val_vector = we.get_sum_vals_dataset(dataset)
-        print(sum_val_vector)
-        spacy_sim_vector = we.get_spacy_sim_dataset(dataset)
-        print(spacy_sim_vector)
+        #we = WordEmbeddings()
+        #det_val_vector = we.get_det_val_dataset(dataset)
+        #print(det_val_vector)
+        #sum_val_vector = we.get_sum_vals_dataset(dataset)
+        #print(sum_val_vector)
+        #spacy_sim_vector = we.get_spacy_sim_dataset(dataset)
+        #print(spacy_sim_vector)
         ui = UnionIntersect()
         ui_vector = ui.get_percentage_dataset(dataset)
         print(ui_vector)
-        matrix = np.vstack((sim_vector, question_vector, sum_val_vector, spacy_sim_vector, ui_vector))
+        matrix = np.vstack((sim_vector, question_vector, ui_vector))
+        #matrix = np.vstack((sim_vector, question_vector, sum_val_vector, spacy_sim_vector, ui_vector))
         matrix = matrix.transpose()
         print(matrix)
         processed_data = dict(x=matrix, y=labels)
@@ -47,12 +48,13 @@ def process_run(self, query_candidates):
         sim_vector = sim_vector.flatten()
         qwords = QwordPreprocess()
         question_vector = qwords.get_question_word_run(query_candidates)
-        we = WordEmbeddings()
-        det_val_vector = we.get_det_vals_run(query_candidates)
-        sum_val_vector = we.get_sum_vals_run(query_candidates)
-        spacy_sim_vector = we.get_spacy_sim_run(query_candidates)
+        #we = WordEmbeddings()
+        #det_val_vector = we.get_det_vals_run(query_candidates)
+        #sum_val_vector = we.get_sum_vals_run(query_candidates)
+        #spacy_sim_vector = we.get_spacy_sim_run(query_candidates)
         ui = UnionIntersect()
         ui_vector = ui.get_percentage_run(query_candidates)
-        feature_matrix = np.vstack((sim_vector, question_vector, sum_val_vector, spacy_sim_vector, ui_vector))
+        feature_matrix = np.vstack((sim_vector, question_vector, ui_vector))
+        #feature_matrix = np.vstack((sim_vector, question_vector, sum_val_vector, spacy_sim_vector, ui_vector))
         feature_matrix = feature_matrix.transpose()
         return(feature_matrix)
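
An alternative to commenting the WordEmbeddings features in and out at every call site would be a constructor flag that keeps both code paths live. A rough sketch reusing this PR's class names; the FeatureBuilder class, use_embeddings parameter, and build_matrix helper are hypothetical:

    import numpy as np

    class FeatureBuilder:
        def __init__(self, use_embeddings=False):
            # When False, skip the expensive embedding features, matching the
            # behaviour of this diff; when True, restore the old feature matrix.
            self.use_embeddings = use_embeddings

        def build_matrix(self, sim_vector, question_vector, ui_vector, dataset):
            rows = [sim_vector, question_vector]
            if self.use_embeddings:
                from word_embeddings import WordEmbeddings  # repo module
                we = WordEmbeddings()
                rows.append(we.get_sum_vals_dataset(dataset))
                rows.append(we.get_spacy_sim_dataset(dataset))
            rows.append(ui_vector)
            return np.vstack(rows).transpose()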
Binary file modified ranking/processed_dev.p
Binary file not shown.
Binary file modified ranking/processed_test.p
Binary file not shown.
Binary file modified ranking/processed_train.p
Binary file not shown.
3 changes: 3 additions & 0 deletions ranking/test_model.py

@@ -7,12 +7,15 @@
 import pickle
 import numpy as np
 from random import shuffle
+import logging
 
 def pre_ranking(x_features, model, query_cands, uuidDict):
     retDict = {}
     probs = model.predict_proba(x_features)
     for i in range(len(probs)):
         prob_yes = probs[i][1]
+        logging.info(prob_yes)
+        logging.info("\n\n\n\n\n\n\n")
         sentence = query_cands[i][1]
         uuid = uuidDict[sentence]
         retDict[uuid] = prob_yes
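
One caveat on the added calls: logging.info is silent under the root logger's default WARNING level, so the probabilities will only show up if logging is configured somewhere first, e.g.:

    import logging

    # Without this (or an equivalent handler setup), the logging.info calls
    # added above produce no output: the root logger defaults to WARNING.
    logging.basicConfig(level=logging.INFO)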
56 changes: 30 additions & 26 deletions ranking/train_model.py

@@ -106,35 +106,35 @@ def train_MLP(train_data, dev_data, num_resamples=5):
     #x_dev = x_dev.reshape(-1, 1)
     y_dev = dev_data['y']
 
-    scaler = MinMaxScaler()
-    det = x_train[:, 2].reshape(-1, 1)
-    infinites = np.argwhere(np.isinf(det))
-    for n in infinites:
-        det[n] = 0
-    scaler.fit(det)
-    s = x_train[:, 3].reshape(-1, 1)
-    scaler.fit(s)
-    vec1 = scaler.transform(det).flatten()
-    vec2 = scaler.transform(s).flatten()
-    x_train[:, 2] = vec1
-    x_train[:, 3] = vec2
+    #scaler = MinMaxScaler()
+    #det = x_train[:, 2].reshape(-1, 1)
+    #infinites = np.argwhere(np.isinf(det))
+    #for n in infinites:
+    #    det[n] = 0
+    #scaler.fit(det)
+    #s = x_train[:, 3].reshape(-1, 1)
+    #scaler.fit(s)
+    #vec1 = scaler.transform(det).flatten()
+    #vec2 = scaler.transform(s).flatten()
+    #x_train[:, 2] = vec1
+    #x_train[:, 3] = vec2
     y_train = train_data['y']
     (x_train, y_train) = subtractive_balance(x_train, y_train)
     x_dev = dev_data['x']
     #x_dev = x_dev.reshape(-1, 1)
     y_dev = dev_data['y']
 
-    det = x_dev[:, 2].reshape(-1, 1)
-    infinites = np.argwhere(np.isinf(det))
-    for n in infinites:
-        det[n] = 0
-    scaler.fit(det)
-    s = x_dev[:, 3].reshape(-1, 1)
-    scaler.fit(s)
-    vec1 = scaler.transform(det).flatten()
-    vec2 = scaler.transform(s).flatten()
-    x_dev[:, 2] = vec1
-    x_dev[:, 3] = vec2
+    #det = x_dev[:, 2].reshape(-1, 1)
+    #infinites = np.argwhere(np.isinf(det))
+    #for n in infinites:
+    #    det[n] = 0
+    #scaler.fit(det)
+    #s = x_dev[:, 3].reshape(-1, 1)
+    #scaler.fit(s)
+    #vec1 = scaler.transform(det).flatten()
+    #vec2 = scaler.transform(s).flatten()
+    #x_dev[:, 2] = vec1
+    #x_dev[:, 3] = vec2
 
     C = 1e-7
     curr_best_C = 0
@@ -145,14 +145,16 @@ def train_MLP(train_data, dev_data, num_resamples=5):
     f1_scores = []
     best_model = MLPClassifier()
     while (C <= 1e7):
-        model = MLPClassifier(solver='lbfgs', alpha=C, random_state=1, activation='tanh')
+        # model = MLPClassifier(solver='lbfgs', alpha=C, random_state=1, activation='tanh')
+        model = LogisticRegression(C=C)
         for sample_num in range(num_resamples):
             (x_train_boot, y_train_boot) = resample(x_train, y_train)
             model.fit(x_train_boot, y_train_boot)
             probs = model.predict_proba(x_dev)
             y_dev_pred = model.predict(x_dev)
             y_dev_pred_prob = model.predict_proba(x_dev)
             np.set_printoptions(threshold=np.nan)
+            # print(y_dev_pred_prob)
             f1 = f1_score(y_dev, y_dev_pred)
             if f1 > best_f1:
                 best_prob = y_dev_pred_prob
@@ -170,13 +172,15 @@ def train_MLP(train_data, dev_data, num_resamples=5):
     return(best_model)
 
 
+
+
 def main():
     train_data = pickle.load(open("./processed_train.p", "rb"))
     dev_data = pickle.load(open("./processed_dev.p", "rb"))
     #train_model_SVM(train_data, dev_data)
     model = train_MLP(train_data, dev_data)
-    with open("trained_model.p", "wb") as p:
-        pickle.dump(model, p)
+    # with open("trained_model.p", "wb") as p:
+    #     pickle.dump(model, p)
 
 
 
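
Two cautions here. First, MLPClassifier's alpha is a regularization strength while LogisticRegression's C is its inverse, so sweeping the same 1e-7 to 1e7 range after this swap walks the regularization in the opposite direction. Second, the retained np.set_printoptions(threshold=np.nan) raises a ValueError on NumPy 1.14 and later; if the intent is to print arrays untruncated, the usual replacement is an integer threshold:

    import sys
    import numpy as np

    # threshold=np.nan is rejected ("threshold must be numeric and non-NAN")
    # on modern NumPy; sys.maxsize effectively disables truncation instead.
    np.set_printoptions(threshold=sys.maxsize)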
Binary file modified ranking/trained_model.p
Binary file not shown.
Binary file added ranking/trained_model_lr.p
Binary file not shown.
Binary file added ranking/wrong_indices.p
Binary file not shown.