Added option to split preprocessor into tokenizer, tagger and chunker.
marcverhagen committed Dec 16, 2016
1 parent ed4c4fb commit 937cba9
Showing 7 changed files with 314 additions and 52 deletions.
code/components/__init__.py — 9 changes: 8 additions & 1 deletion
@@ -1,8 +1,12 @@

from library.tarsqi_constants import PREPROCESSOR, GUTIME, EVITA, SLINKET
from library.tarsqi_constants import PREPROCESSOR, TOKENIZER, TAGGER, CHUNKER
from library.tarsqi_constants import GUTIME, EVITA, SLINKET
from library.tarsqi_constants import S2T, CLASSIFIER, BLINKER, LINK_MERGER

from preprocessing.wrapper import PreprocessorWrapper
from preprocessing.wrapper import TokenizerWrapper
from preprocessing.wrapper import TaggerWrapper
from preprocessing.wrapper import ChunkerWrapper
from gutime.wrapper import GUTimeWrapper
from evita.wrapper import EvitaWrapper
from slinket.wrapper import SlinketWrapper
@@ -13,6 +17,9 @@

COMPONENTS = {
PREPROCESSOR: PreprocessorWrapper,
TOKENIZER: TokenizerWrapper,
TAGGER: TaggerWrapper,
CHUNKER: ChunkerWrapper,
GUTIME: GUTimeWrapper,
EVITA: EvitaWrapper,
SLINKET: SlinketWrapper,
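
With the three new entries in COMPONENTS, the split stages can be run like any
other Tarsqi component. A minimal sketch, assuming a TarsqiDocument instance
named doc whose options include the treetagger directory (the loop and the
variable names are illustrative, not part of the commit):

from components import COMPONENTS
from library.tarsqi_constants import TOKENIZER, TAGGER, CHUNKER

# run the three stages in order; each wrapper stores the document and
# adds its tags to the document's tag repository in process()
for name in (TOKENIZER, TAGGER, CHUNKER):
    wrapper = COMPONENTS[name](doc)
    wrapper.process()
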
code/components/preprocessing/wrapper.py — 304 changes: 260 additions & 44 deletions
@@ -12,9 +12,9 @@

from utilities import logger
from docmodel.document import Tag
from library.tarsqi_constants import PREPROCESSOR
from library.tarsqi_constants import PREPROCESSOR, TOKENIZER, TAGGER, CHUNKER

from components.preprocessing.tokenizer import Tokenizer
from components.preprocessing.tokenizer import Tokenizer, TokenizedLex
from components.preprocessing.chunker import chunk_sentences

# TreeTagger executables and parameter file
@@ -54,7 +54,7 @@ def initialize_treetagger(treetagger_dir):
return treetagger


def normalizePOS(pos):
def normalize_POS(pos):
"""Some simple modifications of the TreeTagger POS tags."""
if pos == 'SENT':
pos = '.'
@@ -109,18 +109,18 @@ def process(self):
TagId.reset()
for element in self.document.elements():
text = self.document.sourcedoc.text[element.begin:element.end]
tokens = self.tokenize_text(text)
tokens = self._tokenize_text(text)
adjust_lex_offsets(tokens, element.begin)
text = self.tag_text(tokens)
text = self._tag_text(tokens)
# TODO: add some code to get lemmas when the TreeTagger just gets
# <unknown>, see https://github.com/tarsqi/ttk/issues/5
text = self.chunk_text(text)
export(text, self.document)
text = self._chunk_text(text)
self._export(text)
logger.info("tokenizer processing time: %.3f seconds" % self.tokenize_time)
logger.info("tagger processing time: %.3f seconds" % self.tag_time)
logger.info("chunker processing time: %.3f seconds" % self.chunk_time)

def tokenize_text(self, string):
def _tokenize_text(self, string):
"""Takes a unicode string and returns a list of objects, where each
object is either the pair ('<s>', None) or a pair of a tokenized string
and a TokenizedLex instance."""
@@ -131,7 +131,7 @@ def tokenize_text(self, string):
self.tokenize_time += time() - t1
return pairs

def tag_text(self, tokens):
def _tag_text(self, tokens):
"""Takes a string and returns a list of sentences. Each sentence is a
list of tuples of token, part-of-speech and lemma."""
t1 = time()
@@ -142,11 +142,11 @@ def tag_text(self, tokens):
# treetagger does not accept a unicode string, so encode in utf-8
# TODO: this may have changed with the latest version
taggedItems = self.treetagger.tag_text(vertical_string.encode('utf-8'))
text = self.create_text_from_tokens_and_tags(tokens, taggedItems)
text = self._create_text_from_tokens_and_tags(tokens, taggedItems)
self.tag_time += time() - t1
return text

def chunk_text(self, text):
def _chunk_text(self, text):
"""Takes a list of sentences and return the same sentences with chunk
tags inserted. May need to do something with things like &, <, >, and
others."""
@@ -155,7 +155,7 @@ def chunk_text(self, text):
self.chunk_time += time() - t1
return chunked_text

def create_text_from_tokens_and_tags(self, tokens, taggedItems):
def _create_text_from_tokens_and_tags(self, tokens, taggedItems):
text = []
current_sentence = []
for (token, item) in zip(tokens, taggedItems):
@@ -171,42 +171,41 @@ def create_text_from_tokens_and_tags(self, tokens, taggedItems):
current_sentence.append(token_tuple)
else:
(tok, pos, stem) = item.split("\t")
pos = normalizePOS(pos)
pos = normalize_POS(pos)
current_sentence.append((tok, pos, stem, lex.begin, lex.end))
return text


def export(text, tarsqidoc):
"""Export preprocessing information to the tag repository. Updates the
TagRepository using the preprocessing result."""
ctag = None
for sentence in text:
stag = Tag(TagId.next('s'), 's', None, None, {'origin': PREPROCESSOR})
for token in sentence:
if _is_tag(token):
if not token.startswith('</'):
ctag = Tag(TagId.next('c'), token[1:-1], None, None,
{'origin': PREPROCESSOR})
def _export(self, text):
"""Export preprocessing information to the tag repository. Updates the
TagRepository using the preprocessing result."""
ctag = None
for sentence in text:
stag = Tag(TagId.next('s'), 's', None, None, {'origin': PREPROCESSOR})
for token in sentence:
if _is_tag(token):
if not token.startswith('</'):
ctag = Tag(TagId.next('c'), token[1:-1], None, None,
{'origin': PREPROCESSOR})
else:
ctag.end = last_ltag.end
self.document.tags.append(ctag)
ctag = None
elif type(token) == TupleType:
ltag = _make_ltag(token)
self.document.tags.append(ltag)
if stag.begin is None:
stag.begin = token[3]
if ctag is not None and ctag.begin is None:
ctag.begin = ltag.begin
last_end_offset = token[4]
last_ltag = ltag
else:
ctag.end = last_ltag.end
tarsqidoc.tags.append(ctag)
ctag = None
elif type(token) == TupleType:
ltag = _make_ltag(token)
tarsqidoc.tags.append(ltag)
if stag.begin is None:
stag.begin = token[3]
if ctag is not None and ctag.begin is None:
ctag.begin = ltag.begin
last_end_offset = token[4]
last_ltag = ltag
else:
logger.warn('Unexpected token type')
stag.end = last_ltag.end
tarsqidoc.tags.append(stag)
# this indexing is needed because we bypassed the add_tag method on
# TagRepository and instead directly appended to the tags list
tarsqidoc.tags.index()
logger.warn('Unexpected token type')
stag.end = last_ltag.end
self.document.tags.append(stag)
# this indexing is needed because we bypassed the add_tag method on
# TagRepository and instead directly appended to the tags list
self.document.tags.index()


def _is_tag(token):
@@ -222,6 +221,223 @@ def _make_ltag(token):
'origin': PREPROCESSOR })


class TokenizerWrapper:

"""Wrapper for the tokenizer."""

def __init__(self, tarsqidocument):
"""Set component_name and add the TarsqiDocument."""
self.component_name = TOKENIZER
self.document = tarsqidocument
self.tokenize_time = 0

def process(self):
"""Retrieve the element tags from the TarsqiDocument and hand the text for
the elements as strings to the tokenizer. The result is a list of pairs,
where the pair is either (<s>, None) or (SomeString, TokenizedLex). In
the first case an s tag is inserted in the TarsqiDocument's tags
TagRepository and in the second a lex tag."""
TagId.reset()
for element in self.document.elements():
text = self.document.sourcedoc.text[element.begin:element.end]
tokens = self._tokenize_text(text)
adjust_lex_offsets(tokens, element.begin)
self._export_tokens(tokens)
logger.info("tokenizer processing time: %.3f seconds" % self.tokenize_time)

def _tokenize_text(self, string):
"""Takes a unicode string and returns a list of objects, where each
object is either the pair ('<s>', None) or a pair of a tokenized string
and a TokenizedLex instance."""
t1 = time()
tokenizer = Tokenizer(string)
tokenized_text = tokenizer.tokenize_text()
pairs = tokenized_text.as_pairs()
self.tokenize_time += time() - t1
return pairs

def _export_tokens(self, tokens):
"""Add s tags and lex tags to the TagRepository of the TarsqiDocument."""
tokens = filter_tokens(tokens)
s_begin, s_end = None, None
for t in tokens:
if t == '<s>':
self._export_sentence(s_begin, s_end)
s_begin, s_end = None, None
else:
begin, end = t.begin, t.end
attrs = { 'text': t.text, 'origin': TOKENIZER }
ltag = Tag(TagId.next('l'), 'lex', begin, end, attrs)
self.document.tags.append(ltag)
if s_begin is None:
s_begin = begin
s_end = end
self._export_sentence(s_begin, s_end)
self.document.tags.index()

def _export_sentence(self, s_begin, s_end):
"""Add an s tag to the TagRepository of the TarsqiDocument."""
if s_begin is not None:
stag = Tag(TagId.next('s'), 's', s_begin, s_end, {'origin': TOKENIZER})
self.document.tags.append(stag)


def filter_tokens(tokens):
"""The tokens list is a list of pairs, where the first element is '<s>' or the
text of the token and the second element either None if the first element is
'<s>' or a TokenizedLex instance, just keep the '<s>' or the TokenizedLex."""
filtered_tokens = []
for t in tokens:
core = t[0] if t[0] == '<s>' else t[1]
filtered_tokens.append(core)
return filtered_tokens
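
To illustrate the pair format described above, a tokenized fragment and the flat
list that filter_tokens keeps might look as follows (token texts and offsets are
invented):

pairs = [('<s>', None),
         ('John', TokenizedLex(0, 4, 'John')),
         ('slept', TokenizedLex(5, 10, 'slept'))]
filter_tokens(pairs)
# ==> ['<s>', TokenizedLex(0, 4, 'John'), TokenizedLex(5, 10, 'slept')]
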


class TaggerWrapper:

"""Wrapper for the tagger."""

def __init__(self, tarsqidocument):
"""Set component_name, add the TarsqiDocument and initialize the
TreeTagger."""
self.component_name = TAGGER
self.document = tarsqidocument
self.treetagger_dir = self.document.options.getopt('treetagger')
self.treetagger = initialize_treetagger(self.treetagger_dir)
self.tag_time = 0

def process(self):
"""Generate input for the tagger from the lex and s tags in the document, run
the tagger, and insert the new information (pos and lemma) into the
TagRepository on the TarsqiDocument."""
for element in self.document.elements():
sentences = self.document.tags.find_tags('s', element.begin, element.end)
lexes = self.document.tags.find_tags('lex', element.begin, element.end)
tokens = []
for s in sentences:
tokens.append(('<s>', None))
lexes_in_s = [l for l in lexes if l.begin >= s.begin and l.end <= s.end]
for l in sorted(lexes_in_s):
text = l.attrs['text']
tokens.append((text, TokenizedLex(l.begin, l.end, text)))
tagged_tokens = self._tag_text(tokens)
# TODO: add some code to get lemmas when the TreeTagger just gets
# <unknown>, see https://github.com/tarsqi/ttk/issues/5
self._export_tags(tagged_tokens)
logger.info("tagger processing time: %.3f seconds" % self.tag_time)

def _tag_text(self, tokens):
"""Takes a string and returns a list of sentences. Each sentence is a
list of tuples of token, part-of-speech and lemma."""
t1 = time()
vertical_string = "\n".join([t[0] for t in tokens])
# this avoids handler warning if input is empty
if not vertical_string.strip():
vertical_string = '<s>'
# treetagger does not accept a unicode string, so encode in utf-8
# TODO: this may have changed with the latest version
taggedItems = self.treetagger.tag_text(vertical_string.encode('utf-8'))
text = self._merge(tokens, taggedItems)
self.tag_time += time() - t1
return text

def _merge(self, tokens, taggedItems):
"""Merge the tags and lemmas into the tokens. Result is a list of tokens
where each token is a 5-tuple of text, tag, lemma, begin offset and end
offset. Sentence information is not kept in this list."""
text = []
for (token, item) in zip(tokens, taggedItems):
if item == '<s>':
continue
lex = token[1]
if item[0] == '<' and item[-1] == '>':
# not quite sure what these are for, probably tags that the
# TreeTagger leaves alone
token_tuple = (item[0], 'SYM', item[0], lex.begin, lex.end)
text.append(token_tuple)
else:
(tok, pos, stem) = item.split("\t")
pos = normalize_POS(pos)
text.append((tok, pos, stem, lex.begin, lex.end))
return text
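
As a sketch of the data flow through _tag_text and _merge (token texts, tags and
offsets are invented; the TreeTagger returns tab-separated token, POS and lemma):

tokens = [('<s>', None),
          ('John', TokenizedLex(0, 4, 'John')),
          ('slept', TokenizedLex(5, 10, 'slept'))]
vertical_string = "\n".join([t[0] for t in tokens])   # "<s>\nJohn\nslept"
# an example of what the TreeTagger could hand back for that input
taggedItems = ['<s>', 'John\tNP\tJohn', 'slept\tVVD\tsleep']
# _merge skips the '<s>' marker and attaches the token offsets:
#   [('John', 'NP', 'John', 0, 4), ('slept', 'VVD', 'sleep', 5, 10)]
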

def _export_tags(self, tagged_tokens):
"""Take the token tuples and add their pos and lemma information to the
TagRepository in the TarsqiDocument."""
for tagged_token in tagged_tokens:
pos, lemma, p1, p2 = tagged_token[1:5]
tags = self.document.tags.find_tags_at(p1)
tags = [t for t in tags if t.end == p2 and t.name == 'lex']
if len(tags) == 1:
tags[0].attrs['pos'] = pos
tags[0].attrs['lemma'] = lemma
tags[0].attrs['origin'] += ",%s" % TAGGER
else:
logger.warn("More than one lex tag at position %d-%d" % (p1, p2))


class ChunkerWrapper:

"""Wrapper for the chunker."""

def __init__(self, tarsqidocument):
"""Set component_name and add the TarsqiDocument."""
self.component_name = CHUNKER
self.document = tarsqidocument
self.chunk_time = 0

def process(self):
"""Generate input for the chunker from the lex and s tags in the document, run
the chunker, and insert the new ng and vg chunks into the TagRepository
on the TarsqiDocument."""
TagId.reset()
for element in self.document.elements():
sentences = self.document.tags.find_tags('s', element.begin, element.end)
lexes = self.document.tags.find_tags('lex', element.begin, element.end)
text = []
for s in sentences:
sentence = []
lexes_in_s = [l for l in lexes if l.begin >= s.begin and l.end <= s.end]
for l in sorted(lexes_in_s):
token = (l.attrs['text'], l.attrs['pos'], l.attrs['lemma'], l.begin, l.end)
sentence.append(token)
text.append(sentence)
text = self._chunk_text(text)
self._export_chunks(text)
logger.info("chunker processing time: %.3f seconds" % self.chunk_time)

def _chunk_text(self, text):
"""Takes a list of sentences and return the same sentences with chunk
tags inserted. May need to do something with things like &, <, >, and
others."""
t1 = time()
chunked_text = chunk_sentences(text)
self.chunk_time += time() - t1
return chunked_text

def _export_chunks(self, text):
"""Export ng and vg tags to the TagRepository on the TarsqiDocument."""
for sentence in text:
in_chunk = False
chunk_begin = None
chunk_end = None
for token in sentence:
if token in ('<ng>', '<vg>'):
in_chunk = True
chunk_begin = None
chunk_end = None
elif token in ('</ng>', '</vg>'):
in_chunk = False
chunk_tag = token[2:-1]
ctag = Tag(TagId.next('c'), chunk_tag, chunk_begin, chunk_end, {'origin': CHUNKER})
self.document.tags.append(ctag)
elif in_chunk:
if chunk_begin is None:
chunk_begin = token[3]
chunk_end = token[4]
self.document.tags.index()
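
For illustration, a chunked sentence handed to _export_chunks mixes chunk-tag
strings with the token tuples (values invented):

sentence = ['<ng>', ('John', 'NP', 'John', 0, 4), '</ng>',
            '<vg>', ('slept', 'VVD', 'sleep', 5, 10), '</vg>']
# _export_chunks turns this into an ng Tag spanning 0-4 and a vg Tag spanning
# 5-10, both with origin CHUNKER, and then re-indexes the tag repository.
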


class TreeTagger(object):

"""Class that wraps the TreeTagger."""
