From 937cba99a99ac884ece303a392464abd61fd8275 Mon Sep 17 00:00:00 2001 From: Marc Verhagen Date: Fri, 16 Dec 2016 00:11:35 -0500 Subject: [PATCH] Added option to split preprocessor into tokenizer, tagger and chunker. --- code/components/__init__.py | 9 +- code/components/preprocessing/wrapper.py | 304 +++++++++++++++++++---- code/docmodel/document.py | 4 + code/docmodel/main.py | 1 + code/tarsqi.py | 5 +- code/testing/run_tests.py | 10 +- docs/design/2-preprocessor.html | 33 +++ 7 files changed, 314 insertions(+), 52 deletions(-) diff --git a/code/components/__init__.py b/code/components/__init__.py index 46cfd6c..d6c4c7e 100644 --- a/code/components/__init__.py +++ b/code/components/__init__.py @@ -1,8 +1,12 @@ -from library.tarsqi_constants import PREPROCESSOR, GUTIME, EVITA, SLINKET +from library.tarsqi_constants import PREPROCESSOR, TOKENIZER, TAGGER, CHUNKER +from library.tarsqi_constants import GUTIME, EVITA, SLINKET from library.tarsqi_constants import S2T, CLASSIFIER, BLINKER, LINK_MERGER from preprocessing.wrapper import PreprocessorWrapper +from preprocessing.wrapper import TokenizerWrapper +from preprocessing.wrapper import TaggerWrapper +from preprocessing.wrapper import ChunkerWrapper from gutime.wrapper import GUTimeWrapper from evita.wrapper import EvitaWrapper from slinket.wrapper import SlinketWrapper @@ -13,6 +17,9 @@ COMPONENTS = { PREPROCESSOR: PreprocessorWrapper, + TOKENIZER: TokenizerWrapper, + TAGGER: TaggerWrapper, + CHUNKER: ChunkerWrapper, GUTIME: GUTimeWrapper, EVITA: EvitaWrapper, SLINKET: SlinketWrapper, diff --git a/code/components/preprocessing/wrapper.py b/code/components/preprocessing/wrapper.py index 92b9117..994e7fc 100644 --- a/code/components/preprocessing/wrapper.py +++ b/code/components/preprocessing/wrapper.py @@ -12,9 +12,9 @@ from utilities import logger from docmodel.document import Tag -from library.tarsqi_constants import PREPROCESSOR +from library.tarsqi_constants import PREPROCESSOR, TOKENIZER, TAGGER, CHUNKER -from components.preprocessing.tokenizer import Tokenizer +from components.preprocessing.tokenizer import Tokenizer, TokenizedLex from components.preprocessing.chunker import chunk_sentences # TreeTagger executables and parameter file @@ -54,7 +54,7 @@ def initialize_treetagger(treetagger_dir): return treetagger -def normalizePOS(pos): +def normalize_POS(pos): """Some simple modifications of the TreeTagger POS tags.""" if pos == 'SENT': pos = '.' 
@@ -109,18 +109,18 @@ def process(self): TagId.reset() for element in self.document.elements(): text = self.document.sourcedoc.text[element.begin:element.end] - tokens = self.tokenize_text(text) + tokens = self._tokenize_text(text) adjust_lex_offsets(tokens, element.begin) - text = self.tag_text(tokens) + text = self._tag_text(tokens) # TODO: add some code to get lemmas when the TreeTagger just gets # , see https://github.com/tarsqi/ttk/issues/5 - text = self.chunk_text(text) - export(text, self.document) + text = self._chunk_text(text) + self._export(text) logger.info("tokenizer processing time: %.3f seconds" % self.tokenize_time) logger.info("tagger processing time: %.3f seconds" % self.tag_time) logger.info("chunker processing time: %.3f seconds" % self.chunk_time) - def tokenize_text(self, string): + def _tokenize_text(self, string): """Takes a unicode string and returns a list of objects, where each object is either the pair ('', None) or a pair of a tokenized string and a TokenizedLex instance.""" @@ -131,7 +131,7 @@ def tokenize_text(self, string): self.tokenize_time += time() - t1 return pairs - def tag_text(self, tokens): + def _tag_text(self, tokens): """Takes a string and returns a list of sentences. Each sentence is a list of tuples of token, part-of-speech and lemma.""" t1 = time() @@ -142,11 +142,11 @@ def tag_text(self, tokens): # treetagger does not accept a unicode string, so encode in utf-8 # TODO: this may have changed with the latest version taggedItems = self.treetagger.tag_text(vertical_string.encode('utf-8')) - text = self.create_text_from_tokens_and_tags(tokens, taggedItems) + text = self._create_text_from_tokens_and_tags(tokens, taggedItems) self.tag_time += time() - t1 return text - def chunk_text(self, text): + def _chunk_text(self, text): """Takes a list of sentences and return the same sentences with chunk tags inserted. May need to do something with things like &, <, >, and others.""" @@ -155,7 +155,7 @@ def chunk_text(self, text): self.chunk_time += time() - t1 return chunked_text - def create_text_from_tokens_and_tags(self, tokens, taggedItems): + def _create_text_from_tokens_and_tags(self, tokens, taggedItems): text = [] current_sentence = [] for (token, item) in zip(tokens, taggedItems): @@ -171,42 +171,41 @@ def create_text_from_tokens_and_tags(self, tokens, taggedItems): current_sentence.append(token_tuple) else: (tok, pos, stem) = item.split("\t") - pos = normalizePOS(pos) + pos = normalize_POS(pos) current_sentence.append((tok, pos, stem, lex.begin, lex.end)) return text - -def export(text, tarsqidoc): - """Export preprocessing information to the tag repository. Updates the - TagRepository using the preprocessing result.""" - ctag = None - for sentence in text: - stag = Tag(TagId.next('s'), 's', None, None, {'origin': PREPROCESSOR}) - for token in sentence: - if _is_tag(token): - if not token.startswith(', None) or (SomeString, TokenizedLex). 
In + the first case an s tag is inserted in the TarsqiDocument's tags + TagRepository and in the second a lex tag.""" + TagId.reset() + for element in self.document.elements(): + text = self.document.sourcedoc.text[element.begin:element.end] + tokens = self._tokenize_text(text) + adjust_lex_offsets(tokens, element.begin) + self._export_tokens(tokens) + logger.info("tokenizer processing time: %.3f seconds" % self.tokenize_time) + + def _tokenize_text(self, string): + """Takes a unicode string and returns a list of objects, where each + object is either the pair ('', None) or a pair of a tokenized string + and a TokenizedLex instance.""" + t1 = time() + tokenizer = Tokenizer(string) + tokenized_text = tokenizer.tokenize_text() + pairs = tokenized_text.as_pairs() + self.tokenize_time += time() - t1 + return pairs + + def _export_tokens(self, tokens): + """Add s tags and lex tags to the TagRepository of the TarsqiDocument.""" + tokens = filter_tokens(tokens) + s_begin, s_end = None, None + for t in tokens: + if t == '': + self._export_sentence(s_begin, s_end) + s_begin, s_end = None, None + else: + begin, end = t.begin, t.end + attrs = { 'text': t.text, 'origin': TOKENIZER } + ltag = Tag(TagId.next('l'), 'lex', begin, end, attrs) + self.document.tags.append(ltag) + if s_begin is None: + s_begin = begin + s_end = end + self._export_sentence(s_begin, s_end) + self.document.tags.index() + + def _export_sentence(self, s_begin, s_end): + """Add an s tag to the TagRepository of the TarsqiDocument.""" + if s_begin is not None: + stag = Tag(TagId.next('s'), 's', s_begin, s_end, {'origin': TOKENIZER}) + self.document.tags.append(stag) + + +def filter_tokens(tokens): + """The tokens list is a list of pairs, where the first element is '' or the + text of the token and the second element either None if the first element is + '' or a TokenizedLex instance, just keep the '' or the TokenizedLex.""" + filtered_tokens = [] + for t in tokens: + core = t[0] if t[0] == '' else t[1] + filtered_tokens.append(core) + return filtered_tokens + + +class TaggerWrapper: + + """Wrapper for the tagger.""" + + def __init__(self, tarsqidocument): + """Set component_name, add the TarsqiDocument and initialize the + TreeTagger.""" + self.component_name = TAGGER + self.document = tarsqidocument + self.treetagger_dir = self.document.options.getopt('treetagger') + self.treetagger = initialize_treetagger(self.treetagger_dir) + self.tag_time = 0 + + def process(self): + """Generate input for the tagger from the lex and s tags in the document, run + the tagger, and insert the new information (pos and lemma) into the + TagRepository on the TarsqiDocument.""" + for element in self.document.elements(): + sentences = self.document.tags.find_tags('s', element.begin, element.end) + lexes = self.document.tags.find_tags('lex', element.begin, element.end) + tokens = [] + for s in sentences: + tokens.append(('', None)) + lexes_in_s = [l for l in lexes if l.begin >= s.begin and l.end <= s.end] + for l in sorted(lexes_in_s): + text = l.attrs['text'] + tokens.append((text, TokenizedLex(l.begin, l.end, text))) + tagged_tokens = self._tag_text(tokens) + # TODO: add some code to get lemmas when the TreeTagger just gets + # , see https://github.com/tarsqi/ttk/issues/5 + self._export_tags(tagged_tokens) + logger.info("tagger processing time: %.3f seconds" % self.tag_time) + + def _tag_text(self, tokens): + """Takes a string and returns a list of sentences. 
Each sentence is a + list of tuples of token, part-of-speech and lemma.""" + t1 = time() + vertical_string = "\n".join([t[0] for t in tokens]) + # this avoids handler warning if input is empty + if not vertical_string.strip(): + vertical_string = '' + # treetagger does not accept a unicode string, so encode in utf-8 + # TODO: this may have changed with the latest version + taggedItems = self.treetagger.tag_text(vertical_string.encode('utf-8')) + text = self._merge(tokens, taggedItems) + self.tag_time += time() - t1 + return text + + def _merge(self, tokens, taggedItems): + """Merge the tags and lemmas into the tokens. Result is a list of tokens + where each token is a 5-tuple of text, tag, lemma, begin offset and end + offset. Sentence information is not kept in this list.""" + text = [] + for (token, item) in zip(tokens, taggedItems): + if item == '': + continue + lex = token[1] + if item[0] == '<' and item[-1] == '>': + # not quite sure what these are for, probably tags that the + # TreeTagger leaves alone + token_tuple = (item[0], 'SYM', item[0], lex.begin, lex.end) + text.append(token_tuple) + else: + (tok, pos, stem) = item.split("\t") + pos = normalize_POS(pos) + text.append((tok, pos, stem, lex.begin, lex.end)) + return text + + def _export_tags(self, tagged_tokens): + """Take the token tuples and add their pos and lemma information to the + TagRepository in the TarsqiDocument.""" + for tagged_token in tagged_tokens: + pos, lemma, p1, p2 = tagged_token[1:5] + tags = self.document.tags.find_tags_at(p1) + tags = [t for t in tags if t.end == p2 and t.name == 'lex'] + if len(tags) == 1: + tags[0].attrs['pos'] = pos + tags[0].attrs['lemma'] = lemma + tags[0].attrs['origin'] += ",%s" % TAGGER + else: + logger.warn("More than one lex tag at position %d-%d" % (p1, p2)) + + +class ChunkerWrapper: + + """Wrapper for the chunker.""" + + def __init__(self, tarsqidocument): + """Set component_name and add the TarsqiDocument.""" + self.component_name = CHUNKER + self.document = tarsqidocument + self.chunk_time = 0 + + def process(self): + """Generate input for the chunker from the lex and s tags in the document, run + the chunker, and insert the new ng and vg chunks into the TagRepository + on the TarsqiDocument.""" + TagId.reset() + for element in self.document.elements(): + sentences = self.document.tags.find_tags('s', element.begin, element.end) + lexes = self.document.tags.find_tags('lex', element.begin, element.end) + text = [] + for s in sentences: + sentence = [] + lexes_in_s = [l for l in lexes if l.begin >= s.begin and l.end <= s.end] + for l in sorted(lexes_in_s): + token = (l.attrs['text'], l.attrs['pos'], l.attrs['lemma'], l.begin, l.end) + sentence.append(token) + text.append(sentence) + text = self._chunk_text(text) + self._export_chunks(text) + logger.info("chunker processing time: %.3f seconds" % self.chunk_time) + + def _chunk_text(self, text): + """Takes a list of sentences and return the same sentences with chunk + tags inserted. 
May need to do something with things like &, <, >, and + others.""" + t1 = time() + chunked_text = chunk_sentences(text) + self.chunk_time += time() - t1 + return chunked_text + + def _export_chunks(self, text): + """Export ng and vg tags to the TagRepository on the TarsqiDocument.""" + for sentence in text: + in_chunk = False + chunk_begin = None + chunk_end = None + for token in sentence: + if token in ('', ''): + in_chunk = True + chunk_begin = None + chunk_end = None + elif token in ('', ''): + in_chunk = False + chunk_tag = token[2:-1] + ctag = Tag(TagId.next('c'), chunk_tag, chunk_begin, chunk_end, {'origin': CHUNKER}) + self.document.tags.append(ctag) + elif in_chunk: + if chunk_begin is None: + chunk_begin = token[3] + chunk_end = token[4] + self.document.tags.index() + + class TreeTagger(object): """Class that wraps the TreeTagger.""" diff --git a/code/docmodel/document.py b/code/docmodel/document.py index 1f5ca90..769d801 100644 --- a/code/docmodel/document.py +++ b/code/docmodel/document.py @@ -449,6 +449,10 @@ def find_tag(self, name): return t return None + def find_tags_at(self, begin_offset): + """Return the list of tags which start at begin_offset.""" + return self.opening_tags.get(begin_offset, []) + def import_tags(self, tag_repository, tagname): """Import all tags with name=tagname from tag_repository into self. This is moslty used when we want to take tags from the SourceDoc and add them diff --git a/code/docmodel/main.py b/code/docmodel/main.py index b617aef..284d90e 100644 --- a/code/docmodel/main.py +++ b/code/docmodel/main.py @@ -19,6 +19,7 @@ from docmodel.metadata_parser import MetadataParserTimebank, MetadataParserDB from docmodel.metadata_parser import MetadataParserATEE, MetadataParserRTE3 from docmodel.docstructure_parser import DocumentStructureParser +from library.tarsqi_constants import TOKENIZER, TAGGER, CHUNKER from library.tarsqi_constants import PREPROCESSOR, GUTIME, EVITA, SLINKET, S2T from library.tarsqi_constants import CLASSIFIER, BLINKER, LINK_MERGER diff --git a/code/tarsqi.py b/code/tarsqi.py index 379353f..cf28271 100644 --- a/code/tarsqi.py +++ b/code/tarsqi.py @@ -94,8 +94,8 @@ USE_PROFILER = False PROFILER_OUTPUT = 'profile.txt' -logger.initialize_logger(os.path.join(TTK_ROOT, 'data', 'logs', 'ttk_log'), - level=3) +logfile = os.path.join(TTK_ROOT, 'data', 'logs', 'ttk_log') +logger.initialize_logger(logfile, level=3) class Tarsqi: @@ -273,6 +273,7 @@ def getopt(self, option_name, default=None): class TarsqiError(Exception): """Tarsqi Exception class, so far only used in this file.""" + # TODO: should probably be defined elsewhere pass diff --git a/code/testing/run_tests.py b/code/testing/run_tests.py index b931fc2..ce169da 100644 --- a/code/testing/run_tests.py +++ b/code/testing/run_tests.py @@ -70,11 +70,11 @@ # this can be overruled with the --show-errors option SHOW_ERRORS = False -GUTIME_PIPELINE = 'PREPROCESSOR,GUTIME' -EVITA_PIPELINE = 'PREPROCESSOR,EVITA' -SLINKET_PIPELINE = 'PREPROCESSOR,GUTIME,EVITA,SLINKET' -S2T_PIPELINE = 'PREPROCESSOR,EVITA,SLINKET,S2T' -BLINKER_PIPELINE = 'PREPROCESSOR,GUTIME,EVITA,BLINKER' +GUTIME_PIPELINE = 'TOKENIZER,TAGGER,CHUNKER,GUTIME' +EVITA_PIPELINE = 'TOKENIZER,TAGGER,CHUNKER,EVITA' +SLINKET_PIPELINE = 'TOKENIZER,TAGGER,CHUNKER,GUTIME,EVITA,SLINKET' +S2T_PIPELINE = 'TOKENIZER,TAGGER,CHUNKER,EVITA,SLINKET,S2T' +BLINKER_PIPELINE = 'TOKENIZER,TAGGER,CHUNKER,GUTIME,EVITA,BLINKER' # this is used when we run all tests so we can print a summary at the end SUMMARY = [] diff --git 
a/docs/design/2-preprocessor.html b/docs/design/2-preprocessor.html index ec5c496..5e384cc 100644 --- a/docs/design/2-preprocessor.html +++ b/docs/design/2-preprocessor.html @@ -19,6 +19,26 @@

TARSQI Toolkit - The Preprocessor

+

The standard Tarsqi preprocessor can be run in two ways:

+ +
+$ python tarsqi.py --pipeline=PREPROCESSOR <INFILE> <OUTFILE>
+$ python tarsqi.py --pipeline=TOKENIZER,TAGGER,CHUNKER <INFILE> <OUTFILE>
+
+ +

In the first case, the PreprocessorWrapper is used, which wraps the tokenizer,
tagger and chunker all together. In the second case, the tokenizer, tagger and
chunker are each wrapped individually. The second invocation allows more
flexibility. For example, we can now run TTK on input that was already
tokenized but not yet tagged and chunked, which is useful when we want to adopt
external tokenization. In addition, it is now easier to swap in a different
tagger if needed: we would just add another tagger component and reference
that one in the pipeline (some extra coding would be needed to write its
wrapper, of course). The default is to use the PREPROCESSOR.
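To make the name-to-wrapper resolution concrete, here is a minimal sketch of how
a pipeline string could be turned into wrapper instances via the COMPONENTS
mapping that this patch extends in components/__init__.py. This is not the
actual tarsqi.py driver code: the run_pipeline() helper is hypothetical, and it
assumes that the component constants (TOKENIZER, TAGGER, CHUNKER) have the same
string values that are used on the command line.

# Hypothetical driver sketch, not the code in tarsqi.py: resolve a pipeline
# string to wrapper classes through the COMPONENTS mapping and run each one.
from components import COMPONENTS

def run_pipeline(pipeline_string, tarsqidoc):
    """Apply each named component to a TarsqiDocument, in order."""
    for name in pipeline_string.split(','):
        wrapper_class = COMPONENTS[name]    # e.g. 'TOKENIZER' -> TokenizerWrapper
        wrapper = wrapper_class(tarsqidoc)  # the new wrappers take the TarsqiDocument
        wrapper.process()

# run_pipeline('TOKENIZER,TAGGER,CHUNKER', tarsqidoc)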

+ + +

PreProcessorWrapper

+

The PreprocessorWrapper loops through the document elements by using the
TarsqiDocument.elements() method, which returns a list of Tags of type
"docelement" where each tag has pointers to the begin and end offset in the

@@ -125,6 +145,19 @@

TARSQI Toolkit - The Preprocessor

13: { 0: {'docelement': True}}} +

These dictionaries can be used for quick access based on character offsets.
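One place where this offset-based lookup is used is the new find_tags_at()
method on the TagRepository, which TaggerWrapper._export_tags() calls to find
the lex tag that a tagged token belongs to. The sketch below is adapted from
that method; the add_pos_and_lemma() helper name is illustrative only.

# Adapted from TaggerWrapper._export_tags() in this patch: attach pos and
# lemma to the lex tag that spans offsets p1..p2.
def add_pos_and_lemma(tarsqidoc, p1, p2, pos, lemma):
    tags = tarsqidoc.tags.find_tags_at(p1)                       # tags opening at p1
    tags = [t for t in tags if t.end == p2 and t.name == 'lex']  # the matching lex tag
    if len(tags) == 1:
        tags[0].attrs['pos'] = pos
        tags[0].attrs['lemma'] = lemma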

+ + +

TokenizerWrapper, TaggerWrapper and ChunkerWrapper

These wrappers use the same tokenizer, tagger and chunker as the
PreprocessorWrapper, and the tokenizer, tagger and chunker take the same input
and create the same output no matter which wrapper they are called from. The
difference is that each wrapper retrieves the data it needs from the
TarsqiDocument and always exports its results back to the TarsqiDocument. In
contrast, with the PreprocessorWrapper we could almost directly pipe the output
of the tagger into the chunker, without doing an export and import in between.
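Since each wrapper reads from and writes to the TarsqiDocument, running the
three of them back to back amounts to the same preprocessing as the
PreprocessorWrapper, just with the intermediate results stored in the document's
TagRepository. A minimal sketch, assuming a TarsqiDocument named tarsqidoc has
already been loaded and assuming TokenizerWrapper takes the document in its
constructor like the other two wrappers do:

# Minimal sketch of the split pipeline applied to a loaded TarsqiDocument.
from components.preprocessing.wrapper import TokenizerWrapper, TaggerWrapper, ChunkerWrapper

TokenizerWrapper(tarsqidoc).process()  # adds s and lex tags
TaggerWrapper(tarsqidoc).process()     # adds pos and lemma attributes to the lex tags
ChunkerWrapper(tarsqidoc).process()    # adds ng and vg chunk tags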