Added option to split preprocessor into tokenizer, tagger and chunker.
marcverhagen committed Dec 16, 2016
1 parent ed4c4fb commit 937cba9
Showing 7 changed files with 314 additions and 52 deletions.
code/components/__init__.py — 9 changes: 8 additions & 1 deletion
@@ -1,8 +1,12 @@

from library.tarsqi_constants import PREPROCESSOR, GUTIME, EVITA, SLINKET
from library.tarsqi_constants import PREPROCESSOR, TOKENIZER, TAGGER, CHUNKER
from library.tarsqi_constants import GUTIME, EVITA, SLINKET
from library.tarsqi_constants import S2T, CLASSIFIER, BLINKER, LINK_MERGER

from preprocessing.wrapper import PreprocessorWrapper
from preprocessing.wrapper import TokenizerWrapper
from preprocessing.wrapper import TaggerWrapper
from preprocessing.wrapper import ChunkerWrapper
from gutime.wrapper import GUTimeWrapper
from evita.wrapper import EvitaWrapper
from slinket.wrapper import SlinketWrapper
@@ -13,6 +17,9 @@

COMPONENTS = {
PREPROCESSOR: PreprocessorWrapper,
TOKENIZER: TokenizerWrapper,
TAGGER: TaggerWrapper,
CHUNKER: ChunkerWrapper,
GUTIME: GUTimeWrapper,
EVITA: EvitaWrapper,
SLINKET: SlinketWrapper,
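
With the three new entries in COMPONENTS, the split stages can be run like any
other Tarsqi component. A minimal sketch, assuming a TarsqiDocument instance
named doc whose options include the treetagger directory (the loop and the
variable names are illustrative, not part of the commit):

from components import COMPONENTS
from library.tarsqi_constants import TOKENIZER, TAGGER, CHUNKER

# run the three stages in order; each wrapper stores the document and
# adds its tags to the document's tag repository in process()
for name in (TOKENIZER, TAGGER, CHUNKER):
    wrapper = COMPONENTS[name](doc)
    wrapper.process()
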
code/components/preprocessing/wrapper.py — 304 changes: 260 additions & 44 deletions
@@ -12,9 +12,9 @@

from utilities import logger
from docmodel.document import Tag
from library.tarsqi_constants import PREPROCESSOR
from library.tarsqi_constants import PREPROCESSOR, TOKENIZER, TAGGER, CHUNKER

from components.preprocessing.tokenizer import Tokenizer
from components.preprocessing.tokenizer import Tokenizer, TokenizedLex
from components.preprocessing.chunker import chunk_sentences

# TreeTagger executables and parameter file
@@ -54,7 +54,7 @@ def initialize_treetagger(treetagger_dir):
return treetagger


def normalizePOS(pos):
def normalize_POS(pos):
"""Some simple modifications of the TreeTagger POS tags."""
if pos == 'SENT':
pos = '.'
@@ -109,18 +109,18 @@ def process(self):
TagId.reset()
for element in self.document.elements():
text = self.document.sourcedoc.text[element.begin:element.end]
tokens = self.tokenize_text(text)
tokens = self._tokenize_text(text)
adjust_lex_offsets(tokens, element.begin)
text = self.tag_text(tokens)
text = self._tag_text(tokens)
# TODO: add some code to get lemmas when the TreeTagger just gets
# <unknown>, see https://github.com/tarsqi/ttk/issues/5
text = self.chunk_text(text)
export(text, self.document)
text = self._chunk_text(text)
self._export(text)
logger.info("tokenizer processing time: %.3f seconds" % self.tokenize_time)
logger.info("tagger processing time: %.3f seconds" % self.tag_time)
logger.info("chunker processing time: %.3f seconds" % self.chunk_time)

def tokenize_text(self, string):
def _tokenize_text(self, string):
"""Takes a unicode string and returns a list of objects, where each
object is either the pair ('<s>', None) or a pair of a tokenized string
and a TokenizedLex instance."""
@@ -131,7 +131,7 @@ def tokenize_text(self, string):
self.tokenize_time += time() - t1
return pairs

def tag_text(self, tokens):
def _tag_text(self, tokens):
"""Takes a string and returns a list of sentences. Each sentence is a
list of tuples of token, part-of-speech and lemma."""
t1 = time()
@@ -142,11 +142,11 @@ def tag_text(self, tokens):
# treetagger does not accept a unicode string, so encode in utf-8
# TODO: this may have changed with the latest version
taggedItems = self.treetagger.tag_text(vertical_string.encode('utf-8'))
text = self.create_text_from_tokens_and_tags(tokens, taggedItems)
text = self._create_text_from_tokens_and_tags(tokens, taggedItems)
self.tag_time += time() - t1
return text

def chunk_text(self, text):
def _chunk_text(self, text):
"""Takes a list of sentences and return the same sentences with chunk
tags inserted. May need to do something with things like &, <, >, and
others."""
@@ -155,7 +155,7 @@ def chunk_text(self, text):
self.chunk_time += time() - t1
return chunked_text

def create_text_from_tokens_and_tags(self, tokens, taggedItems):
def _create_text_from_tokens_and_tags(self, tokens, taggedItems):
text = []
current_sentence = []
for (token, item) in zip(tokens, taggedItems):
@@ -171,42 +171,41 @@ def create_text_from_tokens_and_tags(self, tokens, taggedItems):
current_sentence.append(token_tuple)
else:
(tok, pos, stem) = item.split("\t")
pos = normalizePOS(pos)
pos = normalize_POS(pos)
current_sentence.append((tok, pos, stem, lex.begin, lex.end))
return text


def export(text, tarsqidoc):
"""Export preprocessing information to the tag repository. Updates the
TagRepository using the preprocessing result."""
ctag = None
for sentence in text:
stag = Tag(TagId.next('s'), 's', None, None, {'origin': PREPROCESSOR})
for token in sentence:
if _is_tag(token):
if not token.startswith('</'):
ctag = Tag(TagId.next('c'), token[1:-1], None, None,
{'origin': PREPROCESSOR})
def _export(self, text):
"""Export preprocessing information to the tag repository. Updates the
TagRepository using the preprocessing result."""
ctag = None
for sentence in text:
stag = Tag(TagId.next('s'), 's', None, None, {'origin': PREPROCESSOR})
for token in sentence:
if _is_tag(token):
if not token.startswith('</'):
ctag = Tag(TagId.next('c'), token[1:-1], None, None,
{'origin': PREPROCESSOR})
else:
ctag.end = last_ltag.end
self.document.tags.append(ctag)
ctag = None
elif type(token) == TupleType:
ltag = _make_ltag(token)
self.document.tags.append(ltag)
if stag.begin is None:
stag.begin = token[3]
if ctag is not None and ctag.begin is None:
ctag.begin = ltag.begin
last_end_offset = token[4]
last_ltag = ltag
else:
ctag.end = last_ltag.end
tarsqidoc.tags.append(ctag)
ctag = None
elif type(token) == TupleType:
ltag = _make_ltag(token)
tarsqidoc.tags.append(ltag)
if stag.begin is None:
stag.begin = token[3]
if ctag is not None and ctag.begin is None:
ctag.begin = ltag.begin
last_end_offset = token[4]
last_ltag = ltag
else:
logger.warn('Unexpected token type')
stag.end = last_ltag.end
tarsqidoc.tags.append(stag)
# this indexing is needed because we bypassed the add_tag method on
# TagRepository and instead directly appended to the tags list
tarsqidoc.tags.index()
logger.warn('Unexpected token type')
stag.end = last_ltag.end
self.document.tags.append(stag)
# this indexing is needed because we bypassed the add_tag method on
# TagRepository and instead directly appended to the tags list
self.document.tags.index()


def _is_tag(token):
@@ -222,6 +221,223 @@ def _make_ltag(token):
'origin': PREPROCESSOR })


class TokenizerWrapper:

"""Wrapper for the tokenizer."""

def __init__(self, tarsqidocument):
"""Set component_name and add the TarsqiDocument."""
self.component_name = TOKENIZER
self.document = tarsqidocument
self.tokenize_time = 0

def process(self):
"""Retrieve the element tags from the TarsqiDocument and hand the text for
the elements as strings to the tokenizer. The result is a list of pairs,
where the pair is either (<s>, None) or (SomeString, TokenizedLex). In
the first case an s tag is inserted in the TarsqiDocument's tags
TagRepository and in the second a lex tag."""
TagId.reset()
for element in self.document.elements():
text = self.document.sourcedoc.text[element.begin:element.end]
tokens = self._tokenize_text(text)
adjust_lex_offsets(tokens, element.begin)
self._export_tokens(tokens)
logger.info("tokenizer processing time: %.3f seconds" % self.tokenize_time)

def _tokenize_text(self, string):
"""Takes a unicode string and returns a list of objects, where each
object is either the pair ('<s>', None) or a pair of a tokenized string
and a TokenizedLex instance."""
t1 = time()
tokenizer = Tokenizer(string)
tokenized_text = tokenizer.tokenize_text()
pairs = tokenized_text.as_pairs()
self.tokenize_time += time() - t1
return pairs

def _export_tokens(self, tokens):
"""Add s tags and lex tags to the TagRepository of the TarsqiDocument."""
tokens = filter_tokens(tokens)
s_begin, s_end = None, None
for t in tokens:
if t == '<s>':
self._export_sentence(s_begin, s_end)
s_begin, s_end = None, None
else:
begin, end = t.begin, t.end
attrs = { 'text': t.text, 'origin': TOKENIZER }
ltag = Tag(TagId.next('l'), 'lex', begin, end, attrs)
self.document.tags.append(ltag)
if s_begin is None:
s_begin = begin
s_end = end
self._export_sentence(s_begin, s_end)
self.document.tags.index()

def _export_sentence(self, s_begin, s_end):
"""Add an s tag to the TagRepository of the TarsqiDocument."""
if s_begin is not None:
stag = Tag(TagId.next('s'), 's', s_begin, s_end, {'origin': TOKENIZER})
self.document.tags.append(stag)


def filter_tokens(tokens):
"""The tokens list is a list of pairs, where the first element is '<s>' or the
text of the token and the second element either None if the first element is
'<s>' or a TokenizedLex instance, just keep the '<s>' or the TokenizedLex."""
filtered_tokens = []
for t in tokens:
core = t[0] if t[0] == '<s>' else t[1]
filtered_tokens.append(core)
return filtered_tokens
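
To illustrate the pair format described above, a tokenized fragment and the flat
list that filter_tokens keeps might look as follows (token texts and offsets are
invented):

pairs = [('<s>', None),
         ('John', TokenizedLex(0, 4, 'John')),
         ('slept', TokenizedLex(5, 10, 'slept'))]
filter_tokens(pairs)
# ==> ['<s>', TokenizedLex(0, 4, 'John'), TokenizedLex(5, 10, 'slept')]
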


class TaggerWrapper:

"""Wrapper for the tagger."""

def __init__(self, tarsqidocument):
"""Set component_name, add the TarsqiDocument and initialize the
TreeTagger."""
self.component_name = TAGGER
self.document = tarsqidocument
self.treetagger_dir = self.document.options.getopt('treetagger')
self.treetagger = initialize_treetagger(self.treetagger_dir)
self.tag_time = 0

def process(self):
"""Generate input for the tagger from the lex and s tags in the document, run
the tagger, and insert the new information (pos and lemma) into the
TagRepository on the TarsqiDocument."""
for element in self.document.elements():
sentences = self.document.tags.find_tags('s', element.begin, element.end)
lexes = self.document.tags.find_tags('lex', element.begin, element.end)
tokens = []
for s in sentences:
tokens.append(('<s>', None))
lexes_in_s = [l for l in lexes if l.begin >= s.begin and l.end <= s.end]
for l in sorted(lexes_in_s):
text = l.attrs['text']
tokens.append((text, TokenizedLex(l.begin, l.end, text)))
tagged_tokens = self._tag_text(tokens)
# TODO: add some code to get lemmas when the TreeTagger just gets
# <unknown>, see https://github.com/tarsqi/ttk/issues/5
self._export_tags(tagged_tokens)
logger.info("tagger processing time: %.3f seconds" % self.tag_time)

def _tag_text(self, tokens):
"""Takes a string and returns a list of sentences. Each sentence is a
list of tuples of token, part-of-speech and lemma."""
t1 = time()
vertical_string = "\n".join([t[0] for t in tokens])
# this avoids handler warning if input is empty
if not vertical_string.strip():
vertical_string = '<s>'
# treetagger does not accept a unicode string, so encode in utf-8
# TODO: this may have changed with the latest version
taggedItems = self.treetagger.tag_text(vertical_string.encode('utf-8'))
text = self._merge(tokens, taggedItems)
self.tag_time += time() - t1
return text

def _merge(self, tokens, taggedItems):
"""Merge the tags and lemmas into the tokens. Result is a list of tokens
where each token is a 5-tuple of text, tag, lemma, begin offset and end
offset. Sentence information is not kept in this list."""
text = []
for (token, item) in zip(tokens, taggedItems):
if item == '<s>':
continue
lex = token[1]
if item[0] == '<' and item[-1] == '>':
# not quite sure what these are for, probably tags that the
# TreeTagger leaves alone
token_tuple = (item[0], 'SYM', item[0], lex.begin, lex.end)
text.append(token_tuple)
else:
(tok, pos, stem) = item.split("\t")
pos = normalize_POS(pos)
text.append((tok, pos, stem, lex.begin, lex.end))
return text
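
As a sketch of the data flow through _tag_text and _merge (token texts, tags and
offsets are invented; the TreeTagger returns tab-separated token, POS and lemma):

tokens = [('<s>', None),
          ('John', TokenizedLex(0, 4, 'John')),
          ('slept', TokenizedLex(5, 10, 'slept'))]
vertical_string = "\n".join([t[0] for t in tokens])   # "<s>\nJohn\nslept"
# an example of what the TreeTagger could hand back for that input
taggedItems = ['<s>', 'John\tNP\tJohn', 'slept\tVVD\tsleep']
# _merge skips the '<s>' marker and attaches the token offsets:
#   [('John', 'NP', 'John', 0, 4), ('slept', 'VVD', 'sleep', 5, 10)]
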

def _export_tags(self, tagged_tokens):
"""Take the token tuples and add their pos and lemma information to the
TagRepository in the TarsqiDocument."""
for tagged_token in tagged_tokens:
pos, lemma, p1, p2 = tagged_token[1:5]
tags = self.document.tags.find_tags_at(p1)
tags = [t for t in tags if t.end == p2 and t.name == 'lex']
if len(tags) == 1:
tags[0].attrs['pos'] = pos
tags[0].attrs['lemma'] = lemma
tags[0].attrs['origin'] += ",%s" % TAGGER
else:
logger.warn("More than one lex tag at position %d-%d" % (p1, p2))


class ChunkerWrapper:

"""Wrapper for the chunker."""

def __init__(self, tarsqidocument):
"""Set component_name and add the TarsqiDocument."""
self.component_name = CHUNKER
self.document = tarsqidocument
self.chunk_time = 0

def process(self):
"""Generate input for the chunker from the lex and s tags in the document, run
the chunker, and insert the new ng and vg chunks into the TagRepository
on the TarsqiDocument."""
TagId.reset()
for element in self.document.elements():
sentences = self.document.tags.find_tags('s', element.begin, element.end)
lexes = self.document.tags.find_tags('lex', element.begin, element.end)
text = []
for s in sentences:
sentence = []
lexes_in_s = [l for l in lexes if l.begin >= s.begin and l.end <= s.end]
for l in sorted(lexes_in_s):
token = (l.attrs['text'], l.attrs['pos'], l.attrs['lemma'], l.begin, l.end)
sentence.append(token)
text.append(sentence)
text = self._chunk_text(text)
self._export_chunks(text)
logger.info("chunker processing time: %.3f seconds" % self.chunk_time)

def _chunk_text(self, text):
"""Takes a list of sentences and return the same sentences with chunk
tags inserted. May need to do something with things like &, <, >, and
others."""
t1 = time()
chunked_text = chunk_sentences(text)
self.chunk_time += time() - t1
return chunked_text

def _export_chunks(self, text):
"""Export ng and vg tags to the TagRepository on the TarsqiDocument."""
for sentence in text:
in_chunk = False
chunk_begin = None
chunk_end = None
for token in sentence:
if token in ('<ng>', '<vg>'):
in_chunk = True
chunk_begin = None
chunk_end = None
elif token in ('</ng>', '</vg>'):
in_chunk = False
chunk_tag = token[2:-1]
ctag = Tag(TagId.next('c'), chunk_tag, chunk_begin, chunk_end, {'origin': CHUNKER})
self.document.tags.append(ctag)
elif in_chunk:
if chunk_begin is None:
chunk_begin = token[3]
chunk_end = token[4]
self.document.tags.index()
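
For illustration, a chunked sentence handed to _export_chunks mixes chunk-tag
strings with the token tuples (values invented):

sentence = ['<ng>', ('John', 'NP', 'John', 0, 4), '</ng>',
            '<vg>', ('slept', 'VVD', 'sleep', 5, 10), '</vg>']
# _export_chunks turns this into an ng Tag spanning 0-4 and a vg Tag spanning
# 5-10, both with origin CHUNKER, and then re-indexes the tag repository.
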


class TreeTagger(object):

"""Class that wraps the TreeTagger."""
