-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfast-fuzz-matching.py
123 lines (97 loc) · 3.96 KB
/
fast-fuzz-matching.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
import pandas as pd
import simplejson
from tqdm.auto import tqdm
import os
import glob
import re
from ftfy import fix_text
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
import time
# Load dataset
t0 = time.time()  # wall-clock start for the whole script; reported on the last line

# New OAG-QA dump, read as JSON Lines (one record per line).
newoag = pd.read_json('oag_qa_20230512.json', lines=True)
print(f'newoag dataset size: {newoag.shape}')

folder_path = r".\oagqa-topic-v2"

# Parse every file in the folder whose name contains "train" as JSON.
# The path and the open handle get distinct names so the loop variable is
# not rebound to a file object by the `with` statement.
files = glob.glob(os.path.join(folder_path, "*train*"))
train_frames = []
for train_path in files:
    with open(train_path, 'r') as handle:
        json_data = simplejson.load(handle)
    train_frames.append(pd.DataFrame(json_data))
# NOTE(review): nothing later in this script reads the train data; it used to
# be stored in `joined_data` and was immediately overwritten by the TSV load
# below. It is kept under its own name so it is no longer silently clobbered;
# confirm whether this load is still needed at all.
train_data = pd.concat(train_frames)

# The question/answer pairs actually used for matching come from the
# "*questions.tsv*" files: tab-separated, no header row.
tsvfiles = glob.glob(os.path.join(folder_path, "*questions.tsv*"))
merged_data = [
    pd.read_csv(tsv_path, sep='\t', names=['question', 'answer'])
    for tsv_path in tsvfiles
]
joined_data = pd.concat(merged_data)
print(f'joined_data dataset size: {joined_data.shape}')

# De-duplicate so each distinct question is vectorised only once.
old_question = joined_data['question'].unique()
print(f'unique old_question dataset size: {old_question.shape}')
print('--------------------------------------------------')
print('Vecorizing the data - this could take a few minutes for large datasets...')
# clean strings
def ngrams(string, n=4):
    """Clean *string* and split it into overlapping character n-grams.

    The value is coerced to ``str``, passed through ``fix_text``, reduced to
    ASCII, stripped of punctuation, title-cased, whitespace-collapsed and
    padded with one space on each side before being windowed.

    Args:
        string: Value to tokenise; anything accepted by ``str()``.
        n: Window width in characters (default 4).

    Returns:
        A list of every ``n``-character substring of the cleaned text, in
        order. The list is empty when the cleaned text is shorter than ``n``.
    """
    text = fix_text(str(string))
    # Characters that cannot be encoded as ASCII are dropped, not replaced.
    text = text.encode("ascii", errors="ignore").decode().lower()

    # Remove brackets, braces, dots, pipes and apostrophes outright.
    strip_pattern = '[' + re.escape(")(.|[]{}'") + ']'
    text = re.sub(strip_pattern, '', text)

    # Spell out ampersands; commas and hyphens turn into word separators.
    text = text.replace('&', 'and').replace(',', ' ').replace('-', ' ')

    # Capitalise the first letter of every word, then squeeze runs of spaces.
    text = text.title()
    text = re.sub(' +', ' ', text).strip()

    # One space of padding on each side so word boundaries show up in the
    # first and last n-grams.
    text = f' {text} '
    text = re.sub(r'[,-./]|\sBD', r'', text)

    # Zipping n progressively shifted copies of the text yields one tuple of
    # n consecutive characters per window position.
    shifted = (text[offset:] for offset in range(n))
    return list(map(''.join, zip(*shifted)))
# Build a TF-IDF vectorizer whose analyzer is ngrams(), so all string cleaning
# and tokenising happens inside that function.
vectorizer = TfidfVectorizer(
    analyzer=ngrams,
    min_df=1,
    lowercase=False,
)
tfidf = vectorizer.fit_transform(old_question)
print('Vecorizing completed!')

# Nearest-neighbour index over the old-question vectors; one neighbour per
# query, using all available cores (n_jobs=-1).
nbrs = NearestNeighbors(n_neighbors=1, n_jobs=-1)
nbrs.fit(tfidf)

title_column = 'title'
# Only the first 1000 rows are matched here; point sub_newoag at the whole
# of newoag to process every record.
sub_newoag = newoag.iloc[:1000]
newoag_title = sub_newoag[title_column].values
print(f'newoag_title dataset size: {newoag_title.shape}')
# matching query
def getNerestN(query):
    """Find the nearest indexed neighbour(s) for each string in *query*.

    The strings are vectorised with the module-level ``vectorizer`` and
    looked up in the module-level ``nbrs`` index.

    Args:
        query: Iterable of strings to match.

    Returns:
        A ``(distance, indices)`` pair as produced by ``nbrs.kneighbors``.
    """
    hits = nbrs.kneighbors(vectorizer.transform(query))
    nearest_distance, nearest_index = hits
    return nearest_distance, nearest_index
print('--------------------------------------------------')
print('geting nearest n...')
start_time = time.time()
distances, indices = getNerestN(newoag_title)
end_time = time.time()
print(f"finished in {end_time-start_time} seconds")

# find matches
print('--------------------------------------------------')
print('find matches...')
# distances / indices are 2-D with one column per neighbour; column 0 is the
# closest old question for each title, so the whole lookup can be done with
# array indexing instead of a per-row Python loop.
nearest = indices[:, 0]
print('--------------------------------------------------')
print("Building datafram...")
# Match confidence : smaller is better (it is the neighbour distance,
# rounded to two decimals).
# The frame is built on sub_newoag's own index because pd.concat(axis=1)
# aligns rows by index label: a fresh 0..n-1 index would mis-align (and fill
# with NaN) whenever sub_newoag is not indexed 0..n-1, e.g. after .sample().
matches = pd.DataFrame(
    {
        'Match confidence': distances[:, 0].round(2),
        'old question': old_question[nearest],
        'newoag_title': newoag_title,
    },
    index=sub_newoag.index,
)
print("Done")

# Merge Dataframe: attach the match columns next to the original records.
newoag_with_matches = pd.concat([sub_newoag, matches], axis=1)
print(newoag_with_matches[['Match confidence','old question', 'title']])

# Filter by match confidence: keep rows whose (rounded) distance is at most
# the threshold.
threshold = 0.5
filtered_newoag = newoag_with_matches[newoag_with_matches['Match confidence'] <= threshold]
print(filtered_newoag[['Match confidence','old question', 'title']])

t1 = time.time()
print(f"finished fast fuzz matching in {t1-t0} seconds")