This repository has been archived by the owner on Oct 26, 2020. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
/
process_sample.py
143 lines (122 loc) · 4.7 KB
/
process_sample.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
#! /usr/bin/env python3
import sys
if len(sys.argv) < 3:
print("You must give at least 2 files to analyze their outputs with the switch.\n"
"Switches:\n"
"translation : reads input from true_gender file, and checks the gender against trailing CONLLU files for their gender\n"
"translation_check : compares two files, to give the output based on how correct are the values in generated tsv to truth values.\n"
"usage: {} switch input_files".format(sys.argv[0]))
exit(1)
switch = sys.argv[1]
# repetitively used in filter_data.py
# gets gender value of a CONLL-U format line
def get_gender(phrase):
feats = phrase.split("\t")[5].split("|")
for items in feats:
if "Gender" in items.split("=")[0]:
return items.split("=")[1]
if switch == "translation":
if len(sys.argv) < 4:
print("Need at least 3 arguments. Usage:\n"
"this_file switch true_gender_source conllu_files")
source = []
# store the gender true nouns in a list.
with open(sys.argv[2], "r") as sourcefile:
contents = sourcefile.readlines()
for lines in contents:
source.append(lines.strip("\n"))
# analyse conllu files to update the `scores` defaultdict of [M, F, Q] counts.
# M= Male, F= Female, Q= Questionable
from collections import defaultdict
scores = defaultdict(list)
for i in range(3, len(sys.argv)):
with open(sys.argv[i], "r") as conllu:
lines = conllu.readlines()
for line in lines:
if line != "\n":
if line[:2] != "ID":
token = ""
if line.split("\t")[1].lower() in source:
token = line.split("\t")[1].lower()
elif line.split("\t")[2].lower() in source:
token = line.split("\t")[2].lower()
if token not in scores:
scores[token] = [0, 0, 0]
if get_gender(line) == "Masc":
scores[token][0] += 1
elif get_gender(line) == "Fem":
scores[token][1] += 1
else:
scores[token][2] += 1
# with the values generated, write the decisions to stdout.
for i in scores:
if i in source:
male, female, quest = scores[i]
decision = ""
if male == 0 and female != 0 and quest == 0:
decision = "F"
elif male == 0 and female != 0 and quest != 0:
decision = "F~"
elif male != 0 and female == 0 and quest == 0:
decision = "M"
elif male != 0 and female == 0 and quest != 0:
decision = "M~"
elif male != 0 and female != 0 and quest == 0:
decision = "~"
elif male != 0 and female != 0 and quest != 0:
decision = "~~"
print(i, male, female, quest, decision, sep="\t")
if switch == "translation_check":
if len(sys.argv) != 4:
print("Usage:\n"
"this_file translation_check file_with_predictions file_with_truth values")
exit(0)
# this will contain the true_gender, as per the gold-standard
truth_val = dict()
# this will contain the english translation of the hindi token, used for final print to stdout.
trans = dict()
# output file
outfile = open("sample/truth_data_analysis", "w")
# process the reference value and store in dicts.
with open(sys.argv[3], "r") as truth:
contents = truth.readlines()
for lines in contents:
eng, hin, gender = lines.strip("\n").split("\t")
if hin != "-":
truth_val[hin] = gender
if hin not in trans:
trans[hin] = eng
std_decisions = ["M", "F"]
# open predictions file, and generate a TSV resembling Table 1 in the paper.
with open(sys.argv[2], "r") as prediction:
contents = prediction.readlines()
for lines in contents:
token, male, female, quest, decision = lines.strip("\n").split("\t")
symbol = ""
# if predicted tag == right tag
if decision == truth_val[token]:
symbol = "+"
else:
# wrong decision entirely
if decision in std_decisions:
symbol = "-"
# if there was no decision
elif decision == "~":
symbol = "?"
# if value is correctly tagged as [M, F], but ends with ~ sign
elif decision[0] == truth_val[token] and decision[1] == "~":
symbol = "+-"
# if value is incorrectly tagged, and ends with ~ sign
elif decision[0] != truth_val[token] and decision[1] == "~":
symbol = "-~"
total = int(male) + int(female) + int(quest)
# Key:
# + Multiple translations, with all gender correct
# - Multiple translations, with all gender incorrect
# ? Multiple translations, some correct, some incorrect
# +~ Multiple translations, most correct, some unresolved
# -~ Multiple translations, most incorrect, some unresolved
# Output format:
# Token_eng Token_hin Frequency Male Female Questionable Decision
outfile.write(trans[token] + "\t" + token + "\t" + str(total) + "\t" + str(male) + "\t" + str(female) + "\t" + str(quest) + "\t" + symbol + "\n")
outfile.close()