fuzzy_match_factors.py
import json
import os
import re
from collections import Counter

from nltk.tokenize import word_tokenize
from nltk.tokenize.treebank import TreebankWordDetokenizer
from tqdm import tqdm

# Note: word_tokenize requires the NLTK "punkt" tokenizer data (nltk.download('punkt')).

ENG_DATASET = "data/dataset_with_articles.jsonl"
MATCHED_DATASET = "data/matched_articles.jsonl"

# Translation table and character class for normalizing typographic quotes and dashes.
transl_table = dict([(ord(x), ord(y)) for x, y in zip(u"`‘’´“”–-", u"''''\"\"--")])
unstable_chars = r"[\u2018\u2019\u201C\u201D\u2013\u2014\u2026'`]"
# Helper functions for fuzzy matching algorithm ------------------
def string_overlap(s, t):
    """Return the fraction of tokens in t that appear in s at least as many times as in t."""
    t_counts = Counter(t)
    s_counts = Counter(s)
    return sum(s_counts[t_elem] >= t_counts[t_elem] for t_elem in t) / len(t)
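# Example (hypothetical token lists, for illustration only): string_overlap works
# on token lists and counts duplicates, e.g.
#     string_overlap(["the", "cat", "sat", "down"], ["cat", "sat", "here"])
# gives 2/3, since "cat" and "sat" occur in s at least as often as in t,
# while "here" does not.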
def minimum_window_substring(s, t):
    """Return the smallest window of tokens in s covering at least 2/3 of the tokens in t."""
    window_size = len(t)
    l, r = 0, window_size
    if r > len(s):
        return ""
    while window_size <= len(s):
        matches = []
        # Slide a fixed-size window across s and collect every window that
        # covers at least 2/3 of the tokens in t.
        while r <= len(s):
            if string_overlap(s[l:r], t) >= 2/3:
                matches.append(s[l:r])
            l, r = l + 1, r + 1
        if matches:
            # Among windows of this size, return the one with the highest overlap.
            return max(matches, key=lambda match: string_overlap(match, t))
        # No window of this size is good enough; try a larger window.
        window_size += 1
        l, r = 0, window_size
    return ""
# ----------------------------------------------------------------
def fuzzy_match(factor, article):
    """Return the span of the article that best matches factor, or None if no paragraph is close enough."""
    paragraphs = article.split('[PARAGRAPH_SEP]')
    # Special characters show up a lot in fact checks, but if there's a perfect match
    # without them, it should be accepted
    standardized_factor = re.sub(r'[^a-zA-Z0-9\s]', '', factor)
    standardized_paragraphs = [re.sub(r'[^a-zA-Z0-9\s]', '', pg) for pg in paragraphs]
    if standardized_factor in " ".join(standardized_paragraphs):
        return factor
    # Otherwise fall back to token-level matching: pick the paragraph with the
    # highest fraction of the factor's unique tokens, then extract the best window.
    paragraphs = [word_tokenize(pg) for pg in paragraphs]
    factor, article = word_tokenize(factor), word_tokenize(article)
    overlap = [len(set(factor) & set(pg)) / len(set(factor)) for pg in paragraphs]
    best_score = max(overlap)
    best_match = paragraphs[overlap.index(best_score)]
    if best_score < 2/3:
        return None
    return TreebankWordDetokenizer().detokenize(minimum_window_substring(best_match, factor))
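# Example (hypothetical inputs, for illustration only):
#     fuzzy_match("the earth is flat",
#                 "Scientists say the earth is flat.[PARAGRAPH_SEP]Unrelated text.")
# returns "the earth is flat" via the exact match on the standardized strings;
# a factor with no sufficiently overlapping paragraph (overlap < 2/3) returns None.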
if __name__ == "__main__":
    # Start from a clean output file, since matches are appended line by line.
    if os.path.exists(MATCHED_DATASET):
        os.remove(MATCHED_DATASET)
    # First pass only counts lines so tqdm can show overall progress.
    with open(ENG_DATASET, 'r') as f_in:
        file_len = len(f_in.readlines())
    with open(ENG_DATASET, 'r') as f_in:
        for line in tqdm(f_in, total=file_len):
            data = json.loads(line)
            article = data['article']
            # If the entry doesn't have all of these fields, it's not useful
            try:
                claim = data["claimReviewed"]
                claimant = data["itemReviewed"]["author"]["name"]
                verdict = data["reviewRating"]["alternateName"]
            except KeyError:
                continue
            claim = fuzzy_match(claim, article)
            claimant = fuzzy_match(claimant, article)
            verdict = fuzzy_match(verdict, article)
            # Only overwrite a field when a non-empty match was found in the article.
            if claim is not None and claim != "":
                data["claimReviewed"] = claim
            if claimant is not None and claimant != "":
                data["itemReviewed"]["author"]["name"] = claimant
            if verdict is not None and verdict != "":
                data["reviewRating"]["alternateName"] = verdict
            with open(MATCHED_DATASET, 'a') as f_out:
                f_out.write(json.dumps(data) + '\n')