From 937cba99a99ac884ece303a392464abd61fd8275 Mon Sep 17 00:00:00 2001 From: Marc Verhagen Date: Fri, 16 Dec 2016 00:11:35 -0500 Subject: [PATCH] Added option to split preprocessor into tokenizer, tagger and chunker. --- code/components/__init__.py | 9 +- code/components/preprocessing/wrapper.py | 304 +++++++++++++++++++---- code/docmodel/document.py | 4 + code/docmodel/main.py | 1 + code/tarsqi.py | 5 +- code/testing/run_tests.py | 10 +- docs/design/2-preprocessor.html | 33 +++ 7 files changed, 314 insertions(+), 52 deletions(-) diff --git a/code/components/__init__.py b/code/components/__init__.py index 46cfd6c..d6c4c7e 100644 --- a/code/components/__init__.py +++ b/code/components/__init__.py @@ -1,8 +1,12 @@ -from library.tarsqi_constants import PREPROCESSOR, GUTIME, EVITA, SLINKET +from library.tarsqi_constants import PREPROCESSOR, TOKENIZER, TAGGER, CHUNKER +from library.tarsqi_constants import GUTIME, EVITA, SLINKET from library.tarsqi_constants import S2T, CLASSIFIER, BLINKER, LINK_MERGER from preprocessing.wrapper import PreprocessorWrapper +from preprocessing.wrapper import TokenizerWrapper +from preprocessing.wrapper import TaggerWrapper +from preprocessing.wrapper import ChunkerWrapper from gutime.wrapper import GUTimeWrapper from evita.wrapper import EvitaWrapper from slinket.wrapper import SlinketWrapper @@ -13,6 +17,9 @@ COMPONENTS = { PREPROCESSOR: PreprocessorWrapper, + TOKENIZER: TokenizerWrapper, + TAGGER: TaggerWrapper, + CHUNKER: ChunkerWrapper, GUTIME: GUTimeWrapper, EVITA: EvitaWrapper, SLINKET: SlinketWrapper, diff --git a/code/components/preprocessing/wrapper.py b/code/components/preprocessing/wrapper.py index 92b9117..994e7fc 100644 --- a/code/components/preprocessing/wrapper.py +++ b/code/components/preprocessing/wrapper.py @@ -12,9 +12,9 @@ from utilities import logger from docmodel.document import Tag -from library.tarsqi_constants import PREPROCESSOR +from library.tarsqi_constants import PREPROCESSOR, TOKENIZER, TAGGER, CHUNKER -from components.preprocessing.tokenizer import Tokenizer +from components.preprocessing.tokenizer import Tokenizer, TokenizedLex from components.preprocessing.chunker import chunk_sentences # TreeTagger executables and parameter file @@ -54,7 +54,7 @@ def initialize_treetagger(treetagger_dir): return treetagger -def normalizePOS(pos): +def normalize_POS(pos): """Some simple modifications of the TreeTagger POS tags.""" if pos == 'SENT': pos = '.' 
@@ -109,18 +109,18 @@ def process(self): TagId.reset() for element in self.document.elements(): text = self.document.sourcedoc.text[element.begin:element.end] - tokens = self.tokenize_text(text) + tokens = self._tokenize_text(text) adjust_lex_offsets(tokens, element.begin) - text = self.tag_text(tokens) + text = self._tag_text(tokens) # TODO: add some code to get lemmas when the TreeTagger just gets # , see https://github.com/tarsqi/ttk/issues/5 - text = self.chunk_text(text) - export(text, self.document) + text = self._chunk_text(text) + self._export(text) logger.info("tokenizer processing time: %.3f seconds" % self.tokenize_time) logger.info("tagger processing time: %.3f seconds" % self.tag_time) logger.info("chunker processing time: %.3f seconds" % self.chunk_time) - def tokenize_text(self, string): + def _tokenize_text(self, string): """Takes a unicode string and returns a list of objects, where each object is either the pair ('', None) or a pair of a tokenized string and a TokenizedLex instance.""" @@ -131,7 +131,7 @@ def tokenize_text(self, string): self.tokenize_time += time() - t1 return pairs - def tag_text(self, tokens): + def _tag_text(self, tokens): """Takes a string and returns a list of sentences. Each sentence is a list of tuples of token, part-of-speech and lemma.""" t1 = time() @@ -142,11 +142,11 @@ def tag_text(self, tokens): # treetagger does not accept a unicode string, so encode in utf-8 # TODO: this may have changed with the latest version taggedItems = self.treetagger.tag_text(vertical_string.encode('utf-8')) - text = self.create_text_from_tokens_and_tags(tokens, taggedItems) + text = self._create_text_from_tokens_and_tags(tokens, taggedItems) self.tag_time += time() - t1 return text - def chunk_text(self, text): + def _chunk_text(self, text): """Takes a list of sentences and return the same sentences with chunk tags inserted. May need to do something with things like &, <, >, and others.""" @@ -155,7 +155,7 @@ def chunk_text(self, text): self.chunk_time += time() - t1 return chunked_text - def create_text_from_tokens_and_tags(self, tokens, taggedItems): + def _create_text_from_tokens_and_tags(self, tokens, taggedItems): text = [] current_sentence = [] for (token, item) in zip(tokens, taggedItems): @@ -171,42 +171,41 @@ def create_text_from_tokens_and_tags(self, tokens, taggedItems): current_sentence.append(token_tuple) else: (tok, pos, stem) = item.split("\t") - pos = normalizePOS(pos) + pos = normalize_POS(pos) current_sentence.append((tok, pos, stem, lex.begin, lex.end)) return text - -def export(text, tarsqidoc): - """Export preprocessing information to the tag repository. Updates the - TagRepository using the preprocessing result.""" - ctag = None - for sentence in text: - stag = Tag(TagId.next('s'), 's', None, None, {'origin': PREPROCESSOR}) - for token in sentence: - if _is_tag(token): - if not token.startswith(', None) or (SomeString, TokenizedLex). 
In + the first case an s tag is inserted in the TarsqiDocument's tags + TagRepository and in the second a lex tag.""" + TagId.reset() + for element in self.document.elements(): + text = self.document.sourcedoc.text[element.begin:element.end] + tokens = self._tokenize_text(text) + adjust_lex_offsets(tokens, element.begin) + self._export_tokens(tokens) + logger.info("tokenizer processing time: %.3f seconds" % self.tokenize_time) + + def _tokenize_text(self, string): + """Takes a unicode string and returns a list of objects, where each + object is either the pair ('', None) or a pair of a tokenized string + and a TokenizedLex instance.""" + t1 = time() + tokenizer = Tokenizer(string) + tokenized_text = tokenizer.tokenize_text() + pairs = tokenized_text.as_pairs() + self.tokenize_time += time() - t1 + return pairs + + def _export_tokens(self, tokens): + """Add s tags and lex tags to the TagRepository of the TarsqiDocument.""" + tokens = filter_tokens(tokens) + s_begin, s_end = None, None + for t in tokens: + if t == '': + self._export_sentence(s_begin, s_end) + s_begin, s_end = None, None + else: + begin, end = t.begin, t.end + attrs = { 'text': t.text, 'origin': TOKENIZER } + ltag = Tag(TagId.next('l'), 'lex', begin, end, attrs) + self.document.tags.append(ltag) + if s_begin is None: + s_begin = begin + s_end = end + self._export_sentence(s_begin, s_end) + self.document.tags.index() + + def _export_sentence(self, s_begin, s_end): + """Add an s tag to the TagRepository of the TarsqiDocument.""" + if s_begin is not None: + stag = Tag(TagId.next('s'), 's', s_begin, s_end, {'origin': TOKENIZER}) + self.document.tags.append(stag) + + +def filter_tokens(tokens): + """The tokens list is a list of pairs, where the first element is '' or the + text of the token and the second element either None if the first element is + '' or a TokenizedLex instance, just keep the '' or the TokenizedLex.""" + filtered_tokens = [] + for t in tokens: + core = t[0] if t[0] == '' else t[1] + filtered_tokens.append(core) + return filtered_tokens + + +class TaggerWrapper: + + """Wrapper for the tagger.""" + + def __init__(self, tarsqidocument): + """Set component_name, add the TarsqiDocument and initialize the + TreeTagger.""" + self.component_name = TAGGER + self.document = tarsqidocument + self.treetagger_dir = self.document.options.getopt('treetagger') + self.treetagger = initialize_treetagger(self.treetagger_dir) + self.tag_time = 0 + + def process(self): + """Generate input for the tagger from the lex and s tags in the document, run + the tagger, and insert the new information (pos and lemma) into the + TagRepository on the TarsqiDocument.""" + for element in self.document.elements(): + sentences = self.document.tags.find_tags('s', element.begin, element.end) + lexes = self.document.tags.find_tags('lex', element.begin, element.end) + tokens = [] + for s in sentences: + tokens.append(('', None)) + lexes_in_s = [l for l in lexes if l.begin >= s.begin and l.end <= s.end] + for l in sorted(lexes_in_s): + text = l.attrs['text'] + tokens.append((text, TokenizedLex(l.begin, l.end, text))) + tagged_tokens = self._tag_text(tokens) + # TODO: add some code to get lemmas when the TreeTagger just gets + # , see https://github.com/tarsqi/ttk/issues/5 + self._export_tags(tagged_tokens) + logger.info("tagger processing time: %.3f seconds" % self.tag_time) + + def _tag_text(self, tokens): + """Takes a string and returns a list of sentences. 
Each sentence is a + list of tuples of token, part-of-speech and lemma.""" + t1 = time() + vertical_string = "\n".join([t[0] for t in tokens]) + # this avoids handler warning if input is empty + if not vertical_string.strip(): + vertical_string = '' + # treetagger does not accept a unicode string, so encode in utf-8 + # TODO: this may have changed with the latest version + taggedItems = self.treetagger.tag_text(vertical_string.encode('utf-8')) + text = self._merge(tokens, taggedItems) + self.tag_time += time() - t1 + return text + + def _merge(self, tokens, taggedItems): + """Merge the tags and lemmas into the tokens. Result is a list of tokens + where each token is a 5-tuple of text, tag, lemma, begin offset and end + offset. Sentence information is not kept in this list.""" + text = [] + for (token, item) in zip(tokens, taggedItems): + if item == '': + continue + lex = token[1] + if item[0] == '<' and item[-1] == '>': + # not quite sure what these are for, probably tags that the + # TreeTagger leaves alone + token_tuple = (item[0], 'SYM', item[0], lex.begin, lex.end) + text.append(token_tuple) + else: + (tok, pos, stem) = item.split("\t") + pos = normalize_POS(pos) + text.append((tok, pos, stem, lex.begin, lex.end)) + return text + + def _export_tags(self, tagged_tokens): + """Take the token tuples and add their pos and lemma information to the + TagRepository in the TarsqiDocument.""" + for tagged_token in tagged_tokens: + pos, lemma, p1, p2 = tagged_token[1:5] + tags = self.document.tags.find_tags_at(p1) + tags = [t for t in tags if t.end == p2 and t.name == 'lex'] + if len(tags) == 1: + tags[0].attrs['pos'] = pos + tags[0].attrs['lemma'] = lemma + tags[0].attrs['origin'] += ",%s" % TAGGER + else: + logger.warn("More than one lex tag at position %d-%d" % (p1, p2)) + + +class ChunkerWrapper: + + """Wrapper for the chunker.""" + + def __init__(self, tarsqidocument): + """Set component_name and add the TarsqiDocument.""" + self.component_name = CHUNKER + self.document = tarsqidocument + self.chunk_time = 0 + + def process(self): + """Generate input for the chunker from the lex and s tags in the document, run + the chunker, and insert the new ng and vg chunks into the TagRepository + on the TarsqiDocument.""" + TagId.reset() + for element in self.document.elements(): + sentences = self.document.tags.find_tags('s', element.begin, element.end) + lexes = self.document.tags.find_tags('lex', element.begin, element.end) + text = [] + for s in sentences: + sentence = [] + lexes_in_s = [l for l in lexes if l.begin >= s.begin and l.end <= s.end] + for l in sorted(lexes_in_s): + token = (l.attrs['text'], l.attrs['pos'], l.attrs['lemma'], l.begin, l.end) + sentence.append(token) + text.append(sentence) + text = self._chunk_text(text) + self._export_chunks(text) + logger.info("chunker processing time: %.3f seconds" % self.chunk_time) + + def _chunk_text(self, text): + """Takes a list of sentences and return the same sentences with chunk + tags inserted. 
May need to do something with things like &, <, >, and + others.""" + t1 = time() + chunked_text = chunk_sentences(text) + self.chunk_time += time() - t1 + return chunked_text + + def _export_chunks(self, text): + """Export ng and vg tags to the TagRepository on the TarsqiDocument.""" + for sentence in text: + in_chunk = False + chunk_begin = None + chunk_end = None + for token in sentence: + if token in ('', ''): + in_chunk = True + chunk_begin = None + chunk_end = None + elif token in ('', ''): + in_chunk = False + chunk_tag = token[2:-1] + ctag = Tag(TagId.next('c'), chunk_tag, chunk_begin, chunk_end, {'origin': CHUNKER}) + self.document.tags.append(ctag) + elif in_chunk: + if chunk_begin is None: + chunk_begin = token[3] + chunk_end = token[4] + self.document.tags.index() + + class TreeTagger(object): """Class that wraps the TreeTagger.""" diff --git a/code/docmodel/document.py b/code/docmodel/document.py index 1f5ca90..769d801 100644 --- a/code/docmodel/document.py +++ b/code/docmodel/document.py @@ -449,6 +449,10 @@ def find_tag(self, name): return t return None + def find_tags_at(self, begin_offset): + """Return the list of tags which start at begin_offset.""" + return self.opening_tags.get(begin_offset, []) + def import_tags(self, tag_repository, tagname): """Import all tags with name=tagname from tag_repository into self. This is moslty used when we want to take tags from the SourceDoc and add them diff --git a/code/docmodel/main.py b/code/docmodel/main.py index b617aef..284d90e 100644 --- a/code/docmodel/main.py +++ b/code/docmodel/main.py @@ -19,6 +19,7 @@ from docmodel.metadata_parser import MetadataParserTimebank, MetadataParserDB from docmodel.metadata_parser import MetadataParserATEE, MetadataParserRTE3 from docmodel.docstructure_parser import DocumentStructureParser +from library.tarsqi_constants import TOKENIZER, TAGGER, CHUNKER from library.tarsqi_constants import PREPROCESSOR, GUTIME, EVITA, SLINKET, S2T from library.tarsqi_constants import CLASSIFIER, BLINKER, LINK_MERGER diff --git a/code/tarsqi.py b/code/tarsqi.py index 379353f..cf28271 100644 --- a/code/tarsqi.py +++ b/code/tarsqi.py @@ -94,8 +94,8 @@ USE_PROFILER = False PROFILER_OUTPUT = 'profile.txt' -logger.initialize_logger(os.path.join(TTK_ROOT, 'data', 'logs', 'ttk_log'), - level=3) +logfile = os.path.join(TTK_ROOT, 'data', 'logs', 'ttk_log') +logger.initialize_logger(logfile, level=3) class Tarsqi: @@ -273,6 +273,7 @@ def getopt(self, option_name, default=None): class TarsqiError(Exception): """Tarsqi Exception class, so far only used in this file.""" + # TODO: should probably be defined elsewhere pass diff --git a/code/testing/run_tests.py b/code/testing/run_tests.py index b931fc2..ce169da 100644 --- a/code/testing/run_tests.py +++ b/code/testing/run_tests.py @@ -70,11 +70,11 @@ # this can be overruled with the --show-errors option SHOW_ERRORS = False -GUTIME_PIPELINE = 'PREPROCESSOR,GUTIME' -EVITA_PIPELINE = 'PREPROCESSOR,EVITA' -SLINKET_PIPELINE = 'PREPROCESSOR,GUTIME,EVITA,SLINKET' -S2T_PIPELINE = 'PREPROCESSOR,EVITA,SLINKET,S2T' -BLINKER_PIPELINE = 'PREPROCESSOR,GUTIME,EVITA,BLINKER' +GUTIME_PIPELINE = 'TOKENIZER,TAGGER,CHUNKER,GUTIME' +EVITA_PIPELINE = 'TOKENIZER,TAGGER,CHUNKER,EVITA' +SLINKET_PIPELINE = 'TOKENIZER,TAGGER,CHUNKER,GUTIME,EVITA,SLINKET' +S2T_PIPELINE = 'TOKENIZER,TAGGER,CHUNKER,EVITA,SLINKET,S2T' +BLINKER_PIPELINE = 'TOKENIZER,TAGGER,CHUNKER,GUTIME,EVITA,BLINKER' # this is used when we run all tests so we can print a summary at the end SUMMARY = [] diff --git 
a/docs/design/2-preprocessor.html b/docs/design/2-preprocessor.html index ec5c496..5e384cc 100644 --- a/docs/design/2-preprocessor.html +++ b/docs/design/2-preprocessor.html @@ -19,6 +19,26 @@

TARSQI Toolkit - The Preprocessor

+

The standard Tarsqi preprocessor can be run in two ways:

+ +
+$ python tarsqi.py --pipeline=PREPROCESSOR <INFILE> <OUTFILE>
+$ python tarsqi.py --pipeline=TOKENIZER,TAGGER,CHUNKER <INFILE> <OUTFILE>
+
+ +

In the first case, the PreprocessorWrapper is used, which wraps the tokenizer,
tagger and chunker all together. In the second case, the tokenizer, tagger and
chunker are each wrapped individually. The second invocation allows more
flexibility. For example, we can now run TTK on input that was already
tokenized but not yet tagged and chunked, which is useful when we want to adopt
external tokenization. In addition, it is now easier to swap in a different
tagger if needed: we would just add another tagger component and reference
that one in the pipeline (some extra coding would be needed to write its
wrapper, of course). The default is to use the PREPROCESSOR.
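To make the name-to-wrapper resolution concrete, here is a minimal sketch of how
a pipeline string could be turned into wrapper instances via the COMPONENTS
mapping that this patch extends in components/__init__.py. This is not the
actual tarsqi.py driver code: the run_pipeline() helper is hypothetical, and it
assumes that the component constants (TOKENIZER, TAGGER, CHUNKER) have the same
string values that are used on the command line.

# Hypothetical driver sketch, not the code in tarsqi.py: resolve a pipeline
# string to wrapper classes through the COMPONENTS mapping and run each one.
from components import COMPONENTS

def run_pipeline(pipeline_string, tarsqidoc):
    """Apply each named component to a TarsqiDocument, in order."""
    for name in pipeline_string.split(','):
        wrapper_class = COMPONENTS[name]    # e.g. 'TOKENIZER' -> TokenizerWrapper
        wrapper = wrapper_class(tarsqidoc)  # the new wrappers take the TarsqiDocument
        wrapper.process()

# run_pipeline('TOKENIZER,TAGGER,CHUNKER', tarsqidoc)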

+ + +

PreProcessorWrapper

+

The PreprocessorWrapper loops through the document elements by using the
TarsqiDocument.elements() method, which returns a list of Tags of type
"docelement" where each tag has pointers to the begin and end offset in the

@@ -125,6 +145,19 @@

TARSQI Toolkit - The Preprocessor

13: { 0: {'docelement': True}}} +

These dictionaries can be used for quick access based on character offsets.
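One place where this offset-based lookup is used is the new find_tags_at()
method on the TagRepository, which TaggerWrapper._export_tags() calls to find
the lex tag that a tagged token belongs to. The sketch below is adapted from
that method; the add_pos_and_lemma() helper name is illustrative only.

# Adapted from TaggerWrapper._export_tags() in this patch: attach pos and
# lemma to the lex tag that spans offsets p1..p2.
def add_pos_and_lemma(tarsqidoc, p1, p2, pos, lemma):
    tags = tarsqidoc.tags.find_tags_at(p1)                       # tags opening at p1
    tags = [t for t in tags if t.end == p2 and t.name == 'lex']  # the matching lex tag
    if len(tags) == 1:
        tags[0].attrs['pos'] = pos
        tags[0].attrs['lemma'] = lemma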

+ + +

TokenizerWrapper, TaggerWrapper and ChunkerWrapper

These wrappers use the same tokenizer, tagger and chunker as the
PreprocessorWrapper, and the tokenizer, tagger and chunker take the same input
and create the same output no matter which wrapper they are called from. The
difference is that each wrapper retrieves the data it needs from the
TarsqiDocument and always exports its results back to the TarsqiDocument. In
contrast, with the PreprocessorWrapper we could almost directly pipe the output
of the tagger into the chunker, without doing an export and import in between.
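Since each wrapper reads from and writes to the TarsqiDocument, running the
three of them back to back amounts to the same preprocessing as the
PreprocessorWrapper, just with the intermediate results stored in the document's
TagRepository. A minimal sketch, assuming a TarsqiDocument named tarsqidoc has
already been loaded and assuming TokenizerWrapper takes the document in its
constructor like the other two wrappers do:

# Minimal sketch of the split pipeline applied to a loaded TarsqiDocument.
from components.preprocessing.wrapper import TokenizerWrapper, TaggerWrapper, ChunkerWrapper

TokenizerWrapper(tarsqidoc).process()  # adds s and lex tags
TaggerWrapper(tarsqidoc).process()     # adds pos and lemma attributes to the lex tags
ChunkerWrapper(tarsqidoc).process()    # adds ng and vg chunk tags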