
Commit

Various cleanups in preparation for version 2.0.0.rc1.
- Removed some test input files
- Updated Library access for TarsqiDocument and its TagRepository
- Moved pipeline definition out of docmodel (to settings)
- Removed some garbage from the library
- Cleaned up sample settings file
- Cleaned up options in top-level code
- Added some validation to mallet script
- Started updating the manual

See issue #40.
marcverhagen committed Jan 13, 2017
1 parent 2076066 commit 392cfd8
Showing 13 changed files with 364 additions and 453 deletions.
19 changes: 0 additions & 19 deletions code/data/in/simple-xml/test2.xml

This file was deleted.

9 changes: 0 additions & 9 deletions code/data/in/simple-xml/test3.xml

This file was deleted.

84 changes: 46 additions & 38 deletions code/docmodel/document.py
@@ -6,12 +6,28 @@
"""

import sys, codecs, StringIO
from copy import copy
from xml.sax.saxutils import escape, quoteattr

from library.main import LIBRARY


TIMEX = LIBRARY.timeml.TIMEX
EVENT = LIBRARY.timeml.EVENT
ALINK = LIBRARY.timeml.ALINK
SLINK = LIBRARY.timeml.SLINK
TLINK = LIBRARY.timeml.TLINK

TID = LIBRARY.timeml.TID
EID = LIBRARY.timeml.EID
EIID = LIBRARY.timeml.EIID

TIME_ID = LIBRARY.timeml.TIME_ID
EVENT_INSTANCE_ID = LIBRARY.timeml.EVENT_INSTANCE_ID
RELATED_TO_TIME = LIBRARY.timeml.RELATED_TO_TIME
SUBORDINATED_EVENT_INSTANCE = LIBRARY.timeml.SUBORDINATED_EVENT_INSTANCE
RELATED_TO_EVENT_INSTANCE = LIBRARY.timeml.RELATED_TO_EVENT_INSTANCE


class TarsqiDocument:

"""An instance of TarsqiDocument should contain all information that may be
@@ -33,11 +49,7 @@ def __init__(self):
self.metadata = {}
self.options = {}
self.tags = TagRepository()
self.counters = {LIBRARY.timeml.TIMEX: 0,
LIBRARY.timeml.EVENT: 0,
LIBRARY.timeml.ALINK: 0,
LIBRARY.timeml.SLINK: 0,
LIBRARY.timeml.TLINK: 0}
self.counters = {TIMEX: 0, EVENT: 0, ALINK: 0, SLINK: 0, TLINK: 0}

def __str__(self):
fname = self.sourcedoc.filename if self.sourcedoc is not None else None
@@ -59,16 +71,16 @@ def elements(self):

def events(self):
"""Convenience method for easy access to events."""
return self.tags.find_tags(LIBRARY.timeml.EVENT)
return self.tags.find_tags(EVENT)

def timexes(self):
"""Convenience method for easy access to timexes."""
return self.tags.find_tags(LIBRARY.timeml.TIMEX)
return self.tags.find_tags(TIMEX)

def has_event(self, begin, end):
"""Return True if there is already an event at the given begin and
end."""
for tag in self.tags.find_tags(LIBRARY.timeml.EVENT):
for tag in self.tags.find_tags(EVENT):
if tag.begin == begin and tag.end == end:
return True
return False
@@ -99,12 +111,12 @@ def pp(self, source_tags=True, tarsqi_tags=True, tags=True):
print

def next_event_id(self):
self.counters[LIBRARY.timeml.EVENT] += 1
return "e%d" % self.counters[LIBRARY.timeml.EVENT]
self.counters[EVENT] += 1
return "e%d" % self.counters[EVENT]

def next_timex_id(self):
self.counters[LIBRARY.timeml.TIMEX3] += 1
return "t%d" % self.counters[LIBRARY.timeml.TIMEX3]
self.counters[TIMEX] += 1
return "t%d" % self.counters[TIMEX]

def next_link_id(self, link_type):
"""Return a unique lid. The link_type argument is one of {ALINK, SLINK,
@@ -113,13 +125,13 @@ def next_link_id(self, link_type):
using the link counters in the document. Breaks down if there are
already links added without using those counters."""
self.counters[link_type] += 1
return "l%d" % (self.counters[LIBRARY.timeml.ALINK] +
self.counters[LIBRARY.timeml.SLINK] +
self.counters[LIBRARY.timeml.TLINK])
return "l%d" % (self.counters[ALINK] +
self.counters[SLINK] +
self.counters[TLINK])

def remove_tlinks(self):
"""Remove all TLINK tags from the tags repository."""
self.tags.remove_tags(LIBRARY.timeml.TLINK)
self.tags.remove_tags(TLINK)

def print_source(self, fname):
"""Print the original source of the document, without the tags to file
@@ -392,16 +404,16 @@ def index(self):
def index_events(self):
self.eid2event = {}
for tag in self.tags:
if tag.name == LIBRARY.timeml.EVENT:
self.eid2event[tag.attrs[LIBRARY.timeml.EIID]] = tag
if tag.name == EVENT:
self.eid2event[tag.attrs[EIID]] = tag

def index_timexes(self):
# TODO: merge with ei2events and create id2tag, assumes all tags have
# ids and they are unique
self.tid2timex = {}
for tag in self.tags:
if tag.name == LIBRARY.timeml.TIMEX:
self.tid2timex[tag.attrs[LIBRARY.timeml.TID]] = tag
if tag.name == TIMEX:
self.tid2timex[tag.attrs[TID]] = tag

def find_tags(self, name, begin=None, end=None):
"""Return all tags of this name. If the optional begin and end are given
@@ -414,26 +426,21 @@ def find_linktags(self, name, o1, o2):
def find_linktags(self, name, o1, o2):
"""Return all the link tages with type name. Only include the ones that
fall between offsets o1 and o2."""
EID = LIBRARY.timeml.EVENT_INSTANCE_ID
SEI = LIBRARY.timeml.SUBORDINATED_EVENT_INSTANCE
REI = LIBRARY.timeml.RELATED_TO_EVENT_INSTANCE
TID = LIBRARY.timeml.TIME_ID
RTT = LIBRARY.timeml.RELATED_TO_TIME
tags = []
for tag in sorted([t for t in self.tags if t.name == name]):
if name == LIBRARY.timeml.SLINK:
t1 = self.eid2event.get(tag.attrs.get(EID))
t2 = self.eid2event.get(tag.attrs.get(SEI))
if name == LIBRARY.timeml.ALINK:
t1 = self.eid2event.get(tag.attrs.get(EID))
t2 = self.eid2event.get(tag.attrs.get(REI))
if name == LIBRARY.timeml.TLINK:
t1 = self.eid2event.get(tag.attrs.get(EID))
t2 = self.eid2event.get(tag.attrs.get(REI))
if name == SLINK:
t1 = self.eid2event.get(tag.attrs.get(EVENT_INSTANCE_ID))
t2 = self.eid2event.get(tag.attrs.get(SUBORDINATED_EVENT_INSTANCE))
if name == ALINK:
t1 = self.eid2event.get(tag.attrs.get(EVENT_INSTANCE_ID))
t2 = self.eid2event.get(tag.attrs.get(RELATED_TO_EVENT_INSTANCE))
if name == TLINK:
t1 = self.eid2event.get(tag.attrs.get(EVENT_INSTANCE_ID))
t2 = self.eid2event.get(tag.attrs.get(RELATED_TO_EVENT_INSTANCE))
if t1 is None:
t1 = self.tid2timex.get(tag.attrs.get(TID))
t1 = self.tid2timex.get(tag.attrs.get(TIME_ID))
if t2 is None:
t2 = self.tid2timex.get(tag.attrs.get(RTT))
t2 = self.tid2timex.get(tag.attrs.get(RELATED_TO_TIME))
offsets = [t1.begin, t1.end, t2.begin, t2.end]
to1 = min(offsets)
to2 = max(offsets)
@@ -500,7 +507,7 @@ def __init__(self, identifier, name, o1, o2, attrs):
self.attrs = attrs
# TODO: should investigate tag initialization since there is an
# impression that it is not consistent
if self.id is None and attrs.has_key('id'):
if self.id is None and 'id' in attrs:
self.id = attrs.get('id')
del(self.attrs['id'])

@@ -543,6 +550,7 @@ def as_ttk_tag(self):
(self.name, identifier, begin, end, self.attributes_as_string())

def as_lex_xml_string(self, text):
"""Return an opening and closing tag wrapped around text."""
return "<lex id=\"%s\" begin=\"%d\" end=\"%d\" pos=\"%s\">%s</lex>" % \
(self.id, self.begin, self.end, str(self.attrs['pos']), escape(text))

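The diff above replaces repeated LIBRARY.timeml lookups with module-level constants and keeps one counter per tag type for generating identifiers. Below is a standalone sketch of that numbering scheme; it mirrors next_event_id and next_link_id from the diff but is only an illustration, and the tag-name strings are placeholders rather than the real LIBRARY values.

# Illustration of the id-numbering scheme used by TarsqiDocument
# (hypothetical mirror of next_event_id and next_link_id; the string
# values below are placeholders, not the actual LIBRARY constants).
TIMEX, EVENT = 'TIMEX3', 'EVENT'
ALINK, SLINK, TLINK = 'ALINK', 'SLINK', 'TLINK'

counters = {TIMEX: 0, EVENT: 0, ALINK: 0, SLINK: 0, TLINK: 0}

def next_event_id():
    counters[EVENT] += 1
    return "e%d" % counters[EVENT]

def next_link_id(link_type):
    # lids are unique across link types because the returned number is
    # the sum of all three link counters
    counters[link_type] += 1
    return "l%d" % (counters[ALINK] + counters[SLINK] + counters[TLINK])

print(next_event_id())      # e1
print(next_link_id(TLINK))  # l1
print(next_link_id(SLINK))  # l2
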
12 changes: 0 additions & 12 deletions code/docmodel/main.py
@@ -19,9 +19,6 @@
from docmodel.metadata_parser import MetadataParserTimebank, MetadataParserDB
from docmodel.metadata_parser import MetadataParserATEE, MetadataParserRTE3
from docmodel.docstructure_parser import DocumentStructureParser
from library.tarsqi_constants import TOKENIZER, TAGGER, CHUNKER
from library.tarsqi_constants import PREPROCESSOR, GUTIME, EVITA, SLINKET, S2T
from library.tarsqi_constants import CLASSIFIER, BLINKER, LINK_MERGER


PARSERS = {
@@ -35,9 +32,6 @@
}


DEFAULT_PIPELINE = [PREPROCESSOR, GUTIME, EVITA, SLINKET, S2T,
BLINKER, CLASSIFIER, LINK_MERGER]

DEFAULT_SOURCE_PARSER = SourceParserXML
DEFAULT_METADATA_PARSER = MetadataParser
DEFAULT_PARSERS = (DEFAULT_SOURCE_PARSER, DEFAULT_METADATA_PARSER)
@@ -60,9 +54,3 @@ def create_docstructure_parser():
# where the parser creates tags similar to other components and where the
# elements variable is gone.
return DocumentStructureParser()


def get_default_pipeline(options):
"""Now always returns the same but can be used for genre-specific
pipelines."""
return DEFAULT_PIPELINE
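
With this change the pipeline is no longer defined in docmodel; the sample settings file later in this commit defines it as a comma-separated string (pipeline = PREPROCESSOR,GUTIME,...). A minimal sketch of turning that string back into the component list that DEFAULT_PIPELINE used to provide is given below; the project's actual option handling lives in the top-level code, which is not part of the hunks shown here, so parse_pipeline is a hypothetical helper.

# Hypothetical helper: rebuild a component list from the comma-separated
# "pipeline" value found in settings.txt.
def parse_pipeline(value):
    return [name.strip() for name in value.split(',') if name.strip()]

print(parse_pipeline("PREPROCESSOR,GUTIME,EVITA,SLINKET,S2T,BLINKER,CLASSIFIER,LINK_MERGER"))
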
3 changes: 3 additions & 0 deletions code/library/evita/compile_patterns.py
@@ -30,6 +30,9 @@
# variable is not used so we hand in a dummy value
os.environ['TTK_ROOT'] = 'DUMMY'

# TODO: why not just import root???


from utilities.FSA import compileOP
from library.evita.multi_chunk_patterns import patternsGroups

2 changes: 1 addition & 1 deletion code/library/main.py
@@ -5,7 +5,7 @@
syntax for all libraries (simple settings, rules, etcetera) and then read in the
libraries (which allows you to read only those libraries that are required).
The LIBRARY variable allows other modules a single import from which all
The LIBRARY variable provides other modules a single import point from which all
settings can be accessed.
"""
7 changes: 1 addition & 6 deletions code/library/tarsqi_constants.py
@@ -24,10 +24,5 @@
CLASSIFIER = 'CLASSIFIER'

LINK_MERGER = 'LINK_MERGER'
MERGER = 'LINK_MERGER'


# PROCESSING PARAMETERS

TRAP_ERRORS = 'trap_errors'
EXTENSION = 'extension'
PIPELINE = 'pipeline'
6 changes: 3 additions & 3 deletions code/root.py
@@ -2,9 +2,9 @@
All this module does is to set the TTK_ROOT environment variable.
Previsouly this was done inline in for example tarsqi.py, but this resulted in
an ugly situation where a piece of code was inserted inbetween a set of import
statements.
Previously, this was done inline for all the modules that needed it, but this
resulted in an ugly situation where a piece of code was inserted inbetween a set
of import statements.
"""

56 changes: 39 additions & 17 deletions code/settings.sample.txt
@@ -1,43 +1,65 @@
# This is an example file with configuration options. You should rename this
# file into settings.txt (or copy it into settings.txt) and make changes as
# desired.
# This is an example file with configuration options. You should make a copy of
# this file and name it settings.txt and make changes to the copy as needed.

# Option can be changed here or in some cases on the command line when
# calling the tarsqy.py script. Command line options, when possible, will
# overwrite options specified here.
# Option can be changed here or in some cases on the command line when calling
# the tarsqy.py script. Command line options will overwrite options specified
# here.


# The platform, possible values are 'linux2' and 'darwin'. This is the value of
# the python sys.platform variable
# The default pipeline, can be overridden with the --pipeline command line
# option

platform = linux2
platform = darwin
pipeline = PREPROCESSOR,GUTIME,EVITA,SLINKET,S2T,BLINKER,CLASSIFIER,LINK_MERGER


# Location of the IMS TreeTagger
# Location of perl. Change this into an absolute path if perl cannot be accessed
# by the system by simply using 'perl'

perl = perl


# Location of the IMS TreeTagger, can be overridden with the --treetagger
# command line option.

treetagger = /Applications/ADDED/nlp/treetagger


# Location of Mallet, this should be the directory that contains the bin
# directory. The version 2.0.8 release candidate is somewhat faster.
# directory. This option can be overridden by the --mallet command line
# option.

mallet = /Applications/ADDED/nlp/mallet/mallet-2.0.7
mallet = /Applications/ADDED/nlp/mallet/mallet-2.0.8RC3
mallet = /Applications/ADDED/nlp/mallet/mallet-2.0.8


# Other options used for the classifier, ee-model and et-model should refer to a
# model in components/classifier/models
# Other options used for the classifier, where ee-model and et-model should
# refer to a model in components/classifier/models. These options can be
# overridden by the --classifier, --ee-model and --et-model command line
# options.
# TODO: allow absolute paths

classifier = MaxEnt
ee-model = tb-vectors.ee.model
et-model = tb-vectors.et.model


# Several other options can be set here. Refer to tarsqi.py for descriptions.
# The source type of the document which allows components to be sensitive to
# idiosyncratic properties of the text. It is xml by default, other source
# types are text and ttk. This would typically be set with the --source command
# line option.

source = xml


# Set log level to an integer from 0 to 4, the higher the level the more
# messages will be written to the log. The default prints info, errors and
# warnings, but no debugging statements. See utilities.logger for more
# details. Can be overridden with --loglevel.

loglevel = 3


# Error trapping, errors are trapped by default. Override with --trap-errors.

trap-errors = True
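
The file is plain key = value lines with # comments, and the comments above state that command-line options overwrite what is set here. A minimal sketch of that behavior under those assumptions follows; read_settings and effective_options are hypothetical helpers, not the project's actual settings reader.

# Hypothetical reader for the key = value format shown above, with
# command-line options taking precedence over the file.
def read_settings(path):
    settings = {}
    for line in open(path):
        line = line.strip()
        if not line or line.startswith('#'):
            continue
        key, _, value = line.partition('=')
        settings[key.strip()] = value.strip()
    return settings

def effective_options(file_settings, command_line_options):
    options = dict(file_settings)
    options.update(command_line_options)  # command line wins
    return options
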

