fuzz-couple.py
import pandas as pd
import simplejson
from tqdm.auto import tqdm
import os
import glob
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import csr_matrix
import sparse_dot_topn.sparse_dot_topn as ct
import numpy as np
# Load dataset
newoag = pd.read_json('oag_qa_20230512.json', lines=True)
# newoag_title = newoag[['title', 'answers']]
print(f'newoag dataset size: {newoag.shape}')
folder_path = r".\oagqa-topic-v2"
# Load the per-topic training JSON files
files = glob.glob(os.path.join(folder_path, "*train*"))
merged_data = []
for file in files:
    with open(file, 'r') as f:
        json_data = simplejson.load(f)
    df = pd.DataFrame(json_data)
    merged_data.append(df)
joined_data = pd.concat(merged_data)
# Load the question/answer TSV files
tsvfiles = glob.glob(os.path.join(folder_path, "*questions.tsv*"))
merged_data = []
for file in tsvfiles:
    df = pd.read_csv(file, sep='\t', names=['question', 'answer'])
    merged_data.append(df)
joined_data = pd.concat(merged_data)
print(f'joined_data dataset size: {joined_data.shape}')
old_question = joined_data
# Transform text to vectors with TF-IDF
newoag_title = newoag['title']
vectorizer = TfidfVectorizer(min_df=1)
tf_idf_matrix_newoag = vectorizer.fit_transform(newoag_title)
print(tf_idf_matrix_newoag.shape)
old_question_title = old_question['question']
# Reuse the vocabulary fitted on the new titles so both matrices share the same feature space
tf_idf_matrix_old_question = vectorizer.transform(old_question_title)
print(tf_idf_matrix_old_question.shape)
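# Sanity check (added sketch, not part of the original script): both matrices must share
# the same feature space for the sparse dot product below; vocabulary_ is the fitted
# vocabulary of the shared vectorizer.
assert tf_idf_matrix_newoag.shape[1] == tf_idf_matrix_old_question.shape[1]
print(f'shared vocabulary size: {len(vectorizer.vocabulary_)}')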
# Compute cosine similarity, keeping only the top-n candidates per row
def cossim_top(A, B, ntop, lower_bound=0):
    # Force A and B to CSR format.
    # If they are already CSR, there is no overhead.
    A = A.tocsr()
    B = B.tocsr()
    M, _ = A.shape
    _, N = B.shape
    idx_dtype = np.int32
    # At most ntop stored values per row of the result
    nnz_max = M * ntop
    indptr = np.zeros(M + 1, dtype=idx_dtype)
    indices = np.zeros(nnz_max, dtype=idx_dtype)
    data = np.zeros(nnz_max, dtype=A.dtype)
    ct.sparse_dot_topn(
        M, N,
        np.asarray(A.indptr, dtype=idx_dtype),
        np.asarray(A.indices, dtype=idx_dtype),
        A.data,
        np.asarray(B.indptr, dtype=idx_dtype),
        np.asarray(B.indices, dtype=idx_dtype),
        B.data,
        ntop,
        lower_bound,
        indptr, indices, data)
    return csr_matrix((data, indices, indptr), shape=(M, N))
matches = cossim_top(tf_idf_matrix_newoag, tf_idf_matrix_old_question.transpose(), 10, 0.8)
print(matches.shape)
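# Quick summary (added sketch): getnnz(axis=1) counts stored entries per row, i.e. how many
# old questions scored above the 0.8 lower bound for each new title.
matched_per_title = matches.getnnz(axis=1)
print(f'titles with at least one match: {(matched_per_title > 0).sum()} / {matches.shape[0]}')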
# Create a match table
# def get_matches_df(sparse_matrix, name_vector, top=100):
#     non_zeros = sparse_matrix.nonzero()
#     sparserows = non_zeros[0]
#     sparsecols = non_zeros[1]
#     if top:
#         nr_matches = top
#     else:
#         nr_matches = sparsecols.size
#     left_side = np.empty([nr_matches], dtype=object)
#     right_side = np.empty([nr_matches], dtype=object)
#     similarity = np.zeros(nr_matches)
#     for index in range(0, nr_matches):
#         left_side[index] = name_vector[sparserows[index]]
#         right_side[index] = name_vector[sparsecols[index]]
#         similarity[index] = sparse_matrix.data[index]
#     return pd.DataFrame({'title': left_side,
#                          'similar_title': right_side,
#                          'similarity_score': similarity})
# matches_df = pd.DataFrame()
# matches_df = get_matches_df(matches, newoag['title'], top=10000)
# # Remove all exact matches
# matches_df = matches_df[matches_df['similarity_score'] < 0.99999]
# print(matches_df.sample(10))
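# A working cross-corpus variant (hedged sketch, not from the original script): rows of
# `matches` index newoag titles and columns index old questions, so unlike the commented
# get_matches_df above it takes two name lists. get_cross_matches_df is a name introduced here.
def get_cross_matches_df(sparse_matrix, left_names, right_names):
    records = []
    indptr = sparse_matrix.indptr
    for i in range(sparse_matrix.shape[0]):
        # Stored entries for row i live in indices/data positions [indptr[i], indptr[i+1])
        for k in range(indptr[i], indptr[i + 1]):
            records.append((left_names[i],
                            right_names[sparse_matrix.indices[k]],
                            sparse_matrix.data[k]))
    return pd.DataFrame(records, columns=['title', 'similar_question', 'similarity_score'])

matches_df = get_cross_matches_df(matches,
                                  newoag['title'].tolist(),
                                  old_question['question'].tolist())
# Drop (near-)exact matches, mirroring the commented block above, and show a few rows
matches_df = matches_df[matches_df['similarity_score'] < 0.99999]
print(matches_df.head(10))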