forked from proycon/python-frog
-
Notifications
You must be signed in to change notification settings - Fork 0
/
frog_wrapper.pyx
265 lines (224 loc) · 12.2 KB
/
frog_wrapper.pyx
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
#embedsignature=True
#*****************************
# Python-frog
# by Maarten van Gompel
# Centre for Language Studies
# Radboud University Nijmegen
#
# Licensed under GPLv3
#****************************/
from libcpp.string cimport string
from libcpp cimport bool
from libcpp.vector cimport vector
from cython.operator cimport dereference as deref, preincrement as inc
from cython import address
from libc.stdint cimport *
from libcpp.utility cimport pair
import os
import sys
cimport libfolia_classes
cimport frog_classes
# Default data releases fetched by installdata().
FROGDATAVERSION = "0.20"
UCTODATAVERSION = "0.8"

# Optional FoLiA support: prefer the standalone `folia` package and fall back
# to the older pynlpl implementation. HASFOLIAPY records whether either one
# could be imported.
try:
    from folia.main import Document as FoliaPyDocument
    HASFOLIAPY = True
except Exception:
    # Any failure importing the preferred package triggers the fallback, but
    # KeyboardInterrupt/SystemExit are no longer swallowed as they were by the
    # previous bare `except:`.
    try:
        from pynlpl.formats.folia import Document as FoliaPyDocument
        HASFOLIAPY = True
    except ImportError:
        HASFOLIAPY = False

        class FoliaPyDocument:
            """Placeholder used when no FoLiA library is installed."""

        # The placeholder used to be defined under this (mis-cased) name only;
        # keep it available as an alias.
        FoLiAPyDocument = FoliaPyDocument
cdef class Document:
    # Extension type whose only member is a C++ frog_classes.Document held by
    # value; it defines no Python-visible methods in this file.
    cdef frog_classes.Document capi
cdef class FrogOptions:
    """Options for Frog, passed as keyword arguments to the constructor. Also accessible like dictionary keys.

    You can pass as keyword arguments to the constructor any options similar to the command-line parameters accepted by Frog.
    Please see frog --help for a complete list.

    * ``skip`` ``[values]`` - Skip Tokenizer (t), Lemmatizer (l), Morphological Analyzer (a), Chunker (c), Multi-Word Units (m), Named Entity Recognition (n), or Parser (p)
    * ``id`` ``[string]`` - Document ID for FoLiA output

    Boolean parameters are passed like ``n=True``

    Old style-options: These are deprecated but supported for backward compatibility reasons:

    * ``tok`` - True/False - Do tokenisation? (default: True)
    * ``lemma`` - True/False - Do lemmatisation? (default: True)
    * ``morph`` - True/False - Do morphological analysis? (default: True)
    * ``deepmorph`` - True/False - Do morphological analysis in new experimental style? (default: False)
    * ``mwu`` - True/False - Do Multi Word Unit detection? (default: True)
    * ``chunking`` - True/False - Do Chunking/Shallow parsing? (default: True)
    * ``ner`` - True/False - Do Named Entity Recognition? (default: True)
    * ``parser`` - True/False - Do Dependency Parsing? (default: False)
    * ``xmlin`` - True/False - Input is FoLiA XML (default: False)
    * ``xmlout`` - True/False - Output is FoLiA XML (default: False)
    * ``docid`` - str - Document ID (for FoLiA)
    * ``numThreads`` - int - Number of threads to use (default: unset, unlimited)

    Keys are normalised before lookup and storage: underscores become hyphens
    and the key is lower-cased. Every value set is mirrored in an internal
    dictionary; the aliases of ``xmlin``, ``xmlout`` and ``deepmorph`` are
    additionally mirrored under their canonical names.
    """

    cdef frog_classes.CL_Options capi
    cdef dict shadow
    cdef list skip

    def __init__(self, **kwargs):
        self.shadow = {}  # shadow all settings in a dictionary
        self.skip = ["p"]  # skip parser by default
        for key, value in kwargs.items():
            self[key] = value

    def __getitem__(self, key):
        """Return the stored value for ``key``; raise KeyError if it was never set."""
        key = key.replace("_", "-").lower()
        if key in self.shadow:
            return self.shadow[key]
        raise KeyError("No such key: " + str(key))

    def get(self, key, default=False):
        """Return the stored value for ``key``, or ``default`` if it was never set."""
        key = key.replace("_", "-").lower()
        return self.shadow.get(key, default)

    def _to_bytes(self, value):
        """Return ``value`` as UTF-8 bytes (bytes pass through, anything else goes via ``str()``)."""
        if isinstance(value, bytes):
            return value
        return str(value).encode('utf-8')

    def _toggle_skip(self, flag, enabled):
        """Add or remove a module letter from the skip list without creating duplicates."""
        if enabled:
            if flag in self.skip:
                self.skip.remove(flag)
        elif flag not in self.skip:
            self.skip.append(flag)

    def __setitem__(self, key, value):
        """Set an option, translating old-style names to Frog's command-line options."""
        key = key.replace("_", "-")
        lkey = key.lower()
        self.shadow[lkey] = value
        if lkey in ('dotok', 'tok'):
            self._toggle_skip("t", value)
        elif lkey in ('dolemma', 'lemma'):
            self._toggle_skip("l", value)
        elif lkey in ('domorph', 'morph'):
            self._toggle_skip("a", value)
        elif lkey in ('dodaringmorph', 'daringmorph', 'deepmorph', 'dodeepmorph'):
            # Mirror under the canonical names so lookups do not depend on
            # which alias the caller happened to use.
            self.shadow['deepmorph'] = value
            self.shadow['dodeepmorph'] = value
            # Only switch the feature on for a true value. An option that was
            # already inserted is not removed again by a later false value.
            if value:
                self.capi.insert(<string>b"deepmorph", <string>b"")
        elif lkey in ('domwu', 'mwu'):
            self._toggle_skip("m", value)
        elif lkey in ('doiob', 'iob', 'dochunking', 'chunking', 'shallowparsing'):
            self._toggle_skip("c", value)
        elif lkey in ('doner', 'ner'):
            self._toggle_skip("n", value)
        elif lkey in ('doparse', 'doparser', 'parse', 'parser'):
            self._toggle_skip("p", value)
        elif lkey in ('doxmlin', 'xmlin', 'foliain'):
            self.shadow['xmlin'] = value
            if value:
                self.capi.insert(<char>b"x", <string>b"", False)
        elif lkey in ('doxmlout', 'xmlout', 'foliaout'):
            self.shadow['xmlout'] = value
            if value:
                self.capi.insert(<char>b"X", <string>b"", False)
        elif lkey in ('debug', 'debugflag'):
            if value:
                self.capi.insert(<char>b"d", <string>b"1", False)
        elif lkey in ('docid', 'id'):
            self.capi.insert(<string>b"id", <string>self._to_bytes(value))
        elif lkey in ('numthreads', 'threads'):
            self.capi.insert(<string>b"threads", <string>self._to_bytes(value))
        else:
            # Case matters for the short options: 'x' is XML input and 'X' is
            # XML output, so compare against the non-lowercased key here.
            if key == 'x':
                self.shadow['xmlin'] = value is not False
            elif key == 'X':
                self.shadow['xmlout'] = value is not False
            if isinstance(value, bool):
                # Boolean flags carry no value; a false flag is simply not passed.
                if value:
                    self.capi.insert(<string>self._to_bytes(key), <string>b"")
            else:
                self.capi.insert(<string>self._to_bytes(key), <string>self._to_bytes(value))

    def finish(self):
        """Pass the collected skip letters on as the ``skip`` option."""
        v = "".join(self.skip)
        self.capi.insert(<string>b"skip", <string>v.encode('utf-8'))
cdef class Frog:
    """Python interface to a Frog instance (wraps ``frog_classes.FrogAPI``)."""

    cdef frog_classes.FrogAPI * capi
    cdef FrogOptions options
    cdef frog_classes.LogStream logstream
    cdef frog_classes.LogStream debuglogstream

    def __init__(self, FrogOptions options, configurationfile = "", overrides = None):
        """Initialises Frog, pass a FrogOptions instance and a configuration file, and optionally a dictionary containing overrides for the configuration.

        Raises TypeError if ``overrides`` is given but is not a dictionary, and
        FileNotFoundError if an absolute ``configurationfile`` or the default
        configuration file does not exist.
        """
        options.finish()

        if overrides:
            if not isinstance(overrides, dict):
                raise TypeError("overrides must be a dictionary")
            for key, value in overrides.items():
                v = f"{key}={value}"
                options.capi.insert(<string>b"override", <string>v.encode('utf-8'))

        if configurationfile:
            if os.path.isabs(configurationfile) and not os.path.exists(configurationfile):
                # Better let Python handle this rather than the binding, so the interpreter can recover.
                # When the configuration file is not absolute though, we let frog handle it
                # (it will search in the default config dirs but may come back with a hard failure!)
                raise FileNotFoundError(f"Specified configuration file {configurationfile} does not exist!")
            options.capi.insert(<string>b"config", <string>configurationfile.encode("utf-8"))
        else:
            # NOTE(review): this call happens before self.capi is allocated, so it
            # presumably relies on defaultConfigFile being a static member - confirm
            # against frog_classes.pxd.
            configurationfile = self.capi.defaultConfigFile(b"nld")
            if isinstance(configurationfile, bytes):
                # Decode so the error message shows a plain path.
                configurationfile = configurationfile.decode('utf-8')
            if not os.path.exists(configurationfile):
                # Better let Python handle this rather than the binding, so the interpreter can recover:
                raise FileNotFoundError(f"Default configuration file {configurationfile} does not exist! Try running frog.installdata() first")
            # (we don't pass this on because Frog will repeat the same logic by default)

        self.options = options
        self.capi = new frog_classes.FrogAPI(options.capi, &self.logstream, &self.debuglogstream)

    def __dealloc__(self):
        # Release the FrogAPI allocated with `new` in __init__; the pointer is
        # still NULL when __init__ raised before allocating it.
        if self.capi != NULL:
            del self.capi
            self.capi = NULL

    def _option_enabled(self, *names):
        """Return True if any of the given option names holds a true value."""
        for name in names:
            if self.options.get(name):
                return True
        return False

    def process_raw(self, text):
        """Invokes Frog on the specified text, the text is considered one document. The raw results from Frog are returned as a string"""
        cdef string result = self.capi.Frogtostring(self._encode_text(text))
        return result.decode('utf-8')

    def parsecolumns(self, response):
        """Parse the raw Frog response

        Returns a list with one dictionary per non-blank line, keyed by column
        name. Empty fields are omitted, ``posprob`` is converted to float, and
        the last token before a blank line gets ``eos`` set to True. Fields
        beyond the known columns are ignored.
        """
        # The option may have been set under any of its aliases.
        if self._option_enabled('dodeepmorph', 'deepmorph', 'dodaringmorph', 'daringmorph'):
            columns = ('index','text','lemma','morph','compound','pos','posprob','ner','chunker','depindex','dep')
        else:
            columns = ('index','text','lemma','morph','pos','posprob','ner','chunker','depindex','dep')
        data = []
        for line in response.split('\n'):
            if not line.strip():
                # A blank line marks the end of a sentence.
                if data:
                    data[-1]['eos'] = True
                continue
            item = {}
            for column, field in zip(columns, line.split('\t')):
                if not field:
                    continue
                if column == 'posprob':
                    item[column] = float(field)
                else:
                    item[column] = field
            data.append(item)
        return data

    def process(self, text):
        """Invokes Frog on the specified text. The text may be a string, or a folia.Document instance if Frog was instantiated with xmlin=True. If xmlout=False (default), the results from Frog are parsed into a list of dictionaries, one per token; if True, a FoLiA Document instance is returned

        Raises ValueError if ``text`` is of an unsupported type or if Frog
        returns no data or non-XML data while XML output was requested, and
        Exception if XML output was requested but no FoLiA library is installed.
        """
        if self._option_enabled('xmlin', 'doxmlin', 'foliain') and HASFOLIAPY and isinstance(text, FoliaPyDocument):
            text = str(text)
        elif not isinstance(text, str):
            raise ValueError("Text should be a string or FoLiA Document instance")

        if self._option_enabled('xmlout', 'doxmlout', 'foliaout'):
            if not HASFOLIAPY:
                # Fail before invoking Frog rather than feeding XML to the column parser.
                raise Exception("Unable to return a FoLiA Document. FoLiAPy was not installed. Use process_raw() instead if you just want the XML output as string")
            data = self.process_raw(text)
            if not data:
                raise ValueError("No data returned")
            elif data[0] != '<':
                raise ValueError("Returned data is not XML, got: " + str(data[:25]))
            return FoliaPyDocument(string=data)
        return self.parsecolumns(self.process_raw(text))

    def _encode_text(self, text):
        """Return ``text`` encoded as UTF-8 bytes; non-str input is passed through unchanged."""
        if isinstance(text, str):
            return text.encode('utf-8')
        return text  # already was bytes
def localpath(name="frog"):
    """Return the directory used for the local data of ``name``.

    The environment variable FROGDATAPATH wins if set. Otherwise the result is
    ``name`` inside XDG_CONFIG_HOME, which itself falls back to ``.config``
    under HOME (or a relative ``.config`` when HOME is unset).
    """
    # NOTE(review): FROGDATAPATH is returned as-is whatever ``name`` is, so
    # localpath("ucto") and localpath("frog") coincide when it is set.
    env = os.environ
    fallback_root = os.path.join(env.get("HOME", ""), ".config")
    config_root = env.get("XDG_CONFIG_HOME", fallback_root)
    return env.get("FROGDATAPATH", os.path.join(config_root, name))
def _install_dataset(package, version, destdir):
    """Download a LanguageMachines data release and move its config/ files into ``destdir``."""
    import shlex  # local import: only the installer needs it

    # The shell command changes directory first, so a relative destination
    # would otherwise be resolved against the temp directory.
    destdir = shlex.quote(os.path.abspath(destdir))
    tmpdir = shlex.quote(os.environ.get("TMPDIR", "/tmp"))
    archive = shlex.quote(f"{package}.tar.gz")
    unpacked = shlex.quote(f"{package}-{version}")
    url = shlex.quote(f"https://github.com/LanguageMachines/{package}/releases/download/v{version}/{package}-{version}.tar.gz")
    steps = [
        f"cd {tmpdir}",
        f"wget -O {archive} {url}",
        f"tar -xzf {archive}",
        # Create the destination only after download and extraction succeeded,
        # so a failed download does not leave an empty directory that blocks a retry.
        f"mkdir -p {destdir}",
        f"mv {unpacked}/config/* {destdir}/",
        f"rm -Rf {unpacked}",
        f"rm -Rf {archive}",
    ]
    if os.system(" && ".join(steps)) != 0:
        raise Exception(f"Installation of {package} failed")


def installdata(targetdir=None, frogdataversion=FROGDATAVERSION, uctodataversion=UCTODATAVERSION):
    """Download and install the ucto and frog data files.

    The data goes into ``<targetdir>/ucto`` and ``<targetdir>/frog``, or into
    the directories returned by ``localpath()`` when ``targetdir`` is None. A
    directory that already exists is left untouched. Requires ``wget`` and
    ``tar`` on the PATH.

    Raises Exception when a download or installation step fails.
    """
    import shlex  # local import: only the installer needs it

    if targetdir is None:
        uctodir = localpath("ucto")
        frogdir = localpath()
    else:
        uctodir = os.path.join(targetdir, "ucto")
        frogdir = os.path.join(targetdir, "frog")

    if os.path.exists(uctodir):
        print(f"Uctodata configuration directory already exists: {uctodir}, refusing to overwrite, please remove it first if you want to install the data anew.", file=sys.stderr)
    else:
        _install_dataset("uctodata", uctodataversion, uctodir)
        print(f"Installation of uctodata {uctodataversion} complete", file=sys.stderr)

    if os.path.exists(frogdir):
        print(f"Frogdata configuration directory already exists: {frogdir}, refusing to overwrite, please remove it first if you want to install the data anew.", file=sys.stderr)
    else:
        _install_dataset("frogdata", frogdataversion, frogdir)
        print(f"Installation of frogdata {frogdataversion} complete", file=sys.stderr)

    # The language-detection config is only fetched when libexttextcat is installed.
    if os.path.isdir("/usr/share/libexttextcat"):
        if os.system(f"cd {shlex.quote(uctodir)} && wget -O textcat.cfg https://raw.githubusercontent.com/LanguageMachines/ucto/master/config/textcat.cfg") != 0:
            raise Exception("Installation of textcat.cfg failed.")
    else:
        print("Language detection will not be available unless you install libexttextcat and rerun installdata()", file=sys.stderr)