forked from ouyangsizhuo/2021Spring_CRF_AGACtask1
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtac2brat.py
265 lines (220 loc) · 11.6 KB
/
tac2brat.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
#!/usr/bin/python
# Python 2-3 compatibility: see http://python-future.org/quickstart.html
#python tac2brat.py -d '/home/zhoukaiyin/Desktop/CRF_PRO/src/train_xml' -o '/home/zhoukaiyin/Desktop/CRF_PRO/work/corpus/test' -F TokenDict:diso:dis-DISO-2.dic -t conll -s OBBEI
from __future__ import (absolute_import, division, print_function, unicode_literals)
from builtins import *
import sys
import os
import argparse
import logging
import re
import pdb
from lxml import etree
# import nltk
# # choose nltk_data location for my laptop: update as needed for your machine
# nltk.data.path = ( ['nltk_data'] if os.path.exists('nltk_data') else ['/vol/datailes/tools/nlp/nltk_data/2016'] )
from Mention import Mention, Relation
from Text import Text
from NlpSupport import NlpSupportEnglish
import Features
# Program version
version = '0.3'
#================
class Section(object):
def __init__(self, id, name, string, mentions=[], nlp_processor=None, file=""):
self.id = id
self.name = name
self.string = string
self.nlp_processor = (nlp_processor if nlp_processor is not None else NlpSupportEnglish())
self.file = file
self.text = Text(self.string, nlp_processor=self.nlp_processor)
self.mentions = []
for m in mentions: # add mentions provided at init time
self.add_mention(m)
# self.text.add_mentions(self.mentions)
logging.info("Created Section '{}' with name '{}'".format(self.id, self.name))
def add_mention(self, m):
logging.debug("add_mention: {}: {}".format(self, m))
self.mentions.append(m)
self.text.register_mention_in_tokens(m)
return
def print_mentions(self, offset=0, output=sys.stdout):
for m in self.mentions:
print(m.sprint(offset=offset), file=output)
return
def print_tokens_and_mentions(self, offset=0, output=sys.stdout, scheme="OBBII", features=[]):
# for t in self.text.tokens:
# print(t, file=output)
self.text.print_conll(with_offset=True, output=output, filename=":".join((self.file, self.id)), with_labels=True, offset=offset, scheme=scheme, features=features)
return
def sprint(self, convert_eol=False):
return self.text.sprint(convert_eol=convert_eol)
def __len__(self):
return self.text.string_len()
def __repr__(self):
return "<Section '{}' of size {}>".format(self.id, len(self))
#================
class Converter(object):
def __init__(self):
pass
def output_file_names(self, file, format='brat'):
f = os.path.basename(file)
f = re.sub("\.xml$", "", f)
if format == 'brat':
return os.path.join(self.output_dir, f + ".txt"), os.path.join(self.output_dir, f + ".ann")
elif format == 'conll':
return os.path.join(self.output_dir, f + ".tab")
def convert_file_to_brat(self, file):
output_t_file, output_a_file = self.output_file_names(file)
logging.info("Reading from {} into {} + {}".format(file, output_t_file, output_a_file))
with open(output_t_file, "w", encoding="utf-8") as tf:
with open(output_a_file, "w", encoding="utf-8") as ta:
self.load_file(file)
self.print_brat_files(output_t=tf, output_a=ta)
return
def convert_file_to_conll(self, file, features=[]):
output_t_file = self.output_file_names(file, format='conll')
logging.info("Reading from {} into {}".format(file, output_t_file))
with open(output_t_file, "w", encoding="utf-8") as tf:
self.load_file(file)
self.print_conll_file(output_t=tf, features=features)
return
def load_file(self, file):
"""Parse contents of file and create list of sections, mentions, and relations.
Obtains and adds as slots of self:
sections: list of Section objects (each contains a Text object and a list of Mention objects)
mentions: list of Mention objects
relations: list of Relation objects
"""
tree = etree.parse(file)
# parse sections and their texts
self.sections = [ Section(s.get("id"), s.get("name"), s.text, file=os.path.basename(file)) for s in tree.xpath("/Label/Text/Section") ]
sec_dict = { s.id: s for s in self.sections }
self.mentions = []
# parse mentions and add to their sections
for x in tree.xpath("/Label/Mentions/Mention"):
sec_id = x.get("section")
if sec_id not in sec_dict:
raise ValueError ( "Unknown section id: '{}', skipping mention".format(sec_id) )
else:
section = sec_dict[sec_id]
m = Mention.from_lxml(x)
self.mentions.append(m)
section.add_mention(m)
logging.debug("% New mention {}:'{}' added to Section '{}'".format(sec_id, m.id, section.name))
# parse relations
self.relations = []
for x in tree.xpath("/Label/Relations/Relation"):
r = Relation.from_lxml(x)
self.relations.append(r)
logging.debug("% New relation {}".format(r.id))
return
def print_brat_files(self, output_t=sys.stdout, output_a=sys.stdout):
# print sections and their mentions
o = 0
for s in self.sections:
logging.info("================ Section {} ================".format(s.id))
# output_text.write(s.string)
print(s.sprint(convert_eol=False), file=output_t, flush=True, end="") # no newline to avoid shift in offsets
logging.info("==== Mentions for section {} ====".format(s.id))
s.print_mentions(offset=o, output=output_a)
o += len(s) # to obtain one consistent file, increment global offset
# print relations
for r in self.relations:
print(r.sprint(), file=output_a)
return
def print_conll_file(self, output_t=sys.stdout, features=[]):
# print sections (as tokens) and their mentions (as gold labels)
o = 0
for s in self.sections:
logging.info("================ Section {} and its mentions ================".format(s.id))
s.print_tokens_and_mentions(offset=o, output=output_t, scheme=self.scheme, features=features)
o += len(s) # to obtain one consistent file, increment global offset
return
#================================================================
def parse_execute_command_line(self):
parser = argparse.ArgumentParser(prog=os.path.basename(sys.argv[0]),
formatter_class=argparse.RawDescriptionHelpFormatter,
description=__doc__)
groupI = parser.add_argument_group('Input')
groupIE = groupI.add_mutually_exclusive_group(required=True)
groupIE.add_argument('-f', '--file',
help="input file: an XML file")
groupIE.add_argument("-d", "--directory",
help="input directory: all *.xml files in this directory will be processed")
groupO = parser.add_argument_group('Output')
groupO.add_argument("-o", "--output_directory", default=".",
help="output directory (default = current directory)")
groupP = parser.add_argument_group('Processing')
groupP.add_argument("-t", "--target-format", choices=["brat", "conll"], default="brat",
help="specify target format: brat (default: for display, creates .txt and .ann) or conll (for CRF, creates .tab)")
groupP.add_argument("-s", "--scheme", default="OBBII", # = BIO
help="""specify scheme for labels: default = 'OBBII' which specifies the classical B-I-O scheme.
Use 'OBBEI' for B-I-E-O scheme.
Use 'OEIEI' for I-E-O scheme.
Use 'OIIII' for I-O scheme.
Must be a string of length 5, where each character position applies to the following case:
0. for outside any entity, else
1. for mono-token entity, else
2. for first token, else
3. for last token, else
4. for inside token""")
groupP.add_argument("-F", "--features", default="",
help="""for CONLL format only: specify features to add.
Must be a comma-separated list of feature specifications. Each specification must contain:
1. a feature name. 2. optionally, a colon-separated list of arguments.
Example: TokenDict:msh:msh-diso-1.dic
The feature name must be a class name (see Features.py), the arguments are passed to the class creator.""")
groupS = parser.add_argument_group('Special')
groupS.add_argument("-q", "--quiet", action="store_true",
help="suppress reporting progress info")
groupS.add_argument("--debug", action="store_true",
help="print debug info")
groupS.add_argument("-v", "--version", action="version",
version='%(prog)s ' + version,
help="print program version")
args = parser.parse_args()
# process arguments and parameters
FORMAT = '%(levelname)s: %(message)s'
logging.basicConfig(format=FORMAT)
logger = logging.getLogger()
if not args.quiet:
logger.setLevel(logging.INFO)
if args.debug:
logger.setLevel(logging.DEBUG)
# actual processing
self.scheme = args.scheme
if len(self.scheme) != 5:
raise ValueError ("The length of the label scheme is {} instead of 5".format(len(self.scheme)))
f_objs = ( [] if args.features == "" else [ Features.FeatureParser(spec).f for spec in args.features.split(sep=",") ] )
logging.info("{} features: '{}'.".format(len(f_objs), f_objs))
self.output_dir = args.output_directory
if not os.path.exists(self.output_dir):
logging.info("Creating output directory '{}'.".format(self.output_dir))
os.mkdir(self.output_dir)
if not os.path.isdir(self.output_dir):
raise NameError ("Output directory '{}' is a file instead of a directory".format(self.output_dir))
if args.file:
if args.target_format == 'brat':
self.convert_file_to_brat(args.file)
elif args.target_format == 'conll':
self.convert_file_to_conll(args.file, features=f_objs)
logging.info("Processed {} XML file(s)".format(1))
else:
input_dir = args.directory
logging.info("Reading from {}/*.xml into {}".format(input_dir, self.output_dir))
n = 0
for file in sorted(os.listdir(input_dir)):
if file.endswith(".xml"):
file = os.path.join(input_dir, file) # do not forget to add the directory name to the file name!
if args.target_format == 'brat':
self.convert_file_to_brat(file)
elif args.target_format == 'conll':
self.convert_file_to_conll(file, features=f_objs)
n += 1
logging.info("Processed {} XML file(s)".format(n))
return
if __name__ == '__main__':
# to help test this program
c = Converter()
c.parse_execute_command_line()