This repository has been archived by the owner on Oct 17, 2023. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
/
text_differences.py
121 lines (104 loc) · 4.4 KB
/
text_differences.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
import jiwer
import nltk
from nltk.stem.snowball import SnowballStemmer
import re
# Module-level Snowball stemmers, built once at import time so every call
# to get_stemmer() reuses the same instances instead of re-constructing them.
sbEng = SnowballStemmer('english')
sbEsp = SnowballStemmer('spanish')
def get_stemmer(full_language):
    """Return the module-level Snowball stemmer for a locale code.

    Args:
        full_language: Locale string such as 'es-CL' or 'en-US'; only the
            part before the first '-' is used.

    Returns:
        The Spanish or English stemmer, or None for unsupported languages.
    """
    lang_code = full_language.split('-')[0]
    stemmer_by_code = {'es': sbEsp, 'en': sbEng}
    return stemmer_by_code.get(lang_code)
def remove_stopwords(text, full_language):
    """Return *text* (a list of word tokens) with stopwords removed.

    Args:
        text: Iterable of word tokens (already cleaned/lowercased by caller).
        full_language: Locale string such as 'es-CL' or 'en-US'; only the
            part before the first '-' is used.

    Returns:
        A new list without stopwords, or None if the language is unsupported.
    """
    language = full_language.split('-')[0]
    corpus_by_code = {'es': 'spanish', 'en': 'english'}
    if language not in corpus_by_code:
        return None
    # Use a set: membership tests are O(1) instead of O(len(stopwords))
    # per token with the plain list nltk returns.
    stopwords = set(nltk.corpus.stopwords.words(corpus_by_code[language]))
    return [w for w in text if w not in stopwords]
def compute_perc_script_missing(original_script, transcript, language):
    '''
    Check how much of original_script is missing in transcript.

    Both texts are normalized with a jiwer pipeline (accent folding, Spanish
    number words to digits, punctuation removal, lowercasing, tokenization),
    stopwords are removed from the script, and a word from the script counts
    as "present" when its stem appears among the transcript's stems.

    Args:
        original_script: The scripted text the speaker was supposed to say.
            Template variables of the form ${name} are stripped first.
        transcript: The ASR transcript to compare against.
        language: Locale string such as 'es-CL' or 'en-US'.

    Returns:
        Tuple (fraction_missing, words_missing) where fraction_missing is in
        [0, 1] and words_missing lists the script words not found.

    Raises:
        ValueError: If the language is not supported ('es' or 'en').
    '''
    # Fail fast on unsupported languages instead of crashing later with
    # AttributeError (None stemmer) or TypeError (None from remove_stopwords).
    stemmer = get_stemmer(language)
    if stemmer is None:
        raise ValueError(f"Unsupported language: {language!r}")
    cleaning = jiwer.Compose([
        jiwer.SubstituteRegexes({"¡": "", "¿":"", "á": "a", "é": "e", "í": "i", "ó": "o","ú": "u"}),
        jiwer.SubstituteWords({ "tardes": "dias",
                                "noches": "dias",
                                " uno ": " 1 ",
                                " dos ": " 2 ",
                                " tres ": " 3 ",
                                " cuatro ": " 4 ",
                                " cinco ": " 5 ",
                                " seis ": " 6 ",
                                " siete ": " 7 ",
                                " ocho ": " 8 ",
                                " nueve ": " 9 "}),
        jiwer.RemovePunctuation(),
        jiwer.ToLowerCase(),
        jiwer.SentencesToListOfWords(word_delimiter=" "),
        jiwer.RemoveEmptyStrings()
    ])
    # Remove anything matching ${variable} from original_script.
    original_script_transformed = re.sub(r'\${.*?\}', '', original_script)
    # Clean both texts into token lists.
    original_script_transformed = cleaning(original_script_transformed)
    transcript_transformed = cleaning(transcript)
    # Remove stopwords from the script; sometimes that removes every word,
    # in which case we fall back to the full cleaned script.
    original_script_transformed_no_stopwords = remove_stopwords(original_script_transformed, language)
    if original_script_transformed_no_stopwords:
        original_script_transformed = original_script_transformed_no_stopwords
    # Guard against an empty script (e.g. input was only punctuation or
    # template variables) — the original code raised ZeroDivisionError here.
    if not original_script_transformed:
        return 0.0, []
    # Stem the transcript once; a set makes the membership tests below O(1).
    transcript_stems = {stemmer.stem(word) for word in transcript_transformed}
    # Script words whose stem never appears in the transcript are missing.
    words_missing = [word for word in original_script_transformed
                     if stemmer.stem(word) not in transcript_stems]
    return len(words_missing) / len(original_script_transformed), words_missing
# def compute_standard_difference_measures(ground_truth, transcript, preprocessing=True):
#
# #Define text preprocessing before comparison
# transformation = jiwer.Compose([
# jiwer.RemovePunctuation(),
# jiwer.ToLowerCase(),
# # jiwer.Strip(),
# # jiwer.RemoveMultipleSpaces(),
# # jiwer.RemoveWhiteSpace(replace_by_space=False),
# jiwer.SentencesToListOfWords(word_delimiter=" "),
# # jiwer.Strip(),
# jiwer.RemoveEmptyStrings(),
#
# # jiwer.SubstituteWords(dictionary: Mapping[str, str])
# ])
# # https://pypi.org/project/jiwer/
# # default_transformation = jiwer.Compose([
# # jiwer.RemoveMultipleSpaces(),
# # jiwer.Strip(),
# # jiwer.SentencesToListOfWords(),
# # jiwer.RemoveEmptyStrings()
#
# if(preprocessing):
# measures = jiwer.compute_measures(
# ground_truth,
# transcript,
# truth_transform=transformation,
# hypothesis_transform=transformation)
# else:
# measures = jiwer.compute_measures(
# ground_truth,
# transcript)
#
# return measures#['wer']#, measures['mer'], measures['wil']
if __name__ == '__main__':
    # Demo: how much of the scripted line is missing from the ASR transcript.
    # NOTE: the original code called compute_difference_measures(), which is
    # not defined in this module (it exists only in the commented-out block
    # above) and also passed a nonexistent 'preprocessing' kwarg, so the
    # script crashed with NameError. Use the actual public function instead.
    ground_truth = 'mi nombre es felipe alamos illanes'
    hypothesis = 'es felipe alamos illanes'
    perc_missing, words_missing = compute_perc_script_missing(ground_truth, hypothesis, 'es-CL')
    print(f'Fraction of script missing: {perc_missing:.2f}')
    print(f'Missing words: {words_missing}')