
Commit

Various cleanups in preparation for version 2.0.0.rc1.
- Removed some test input files
- Updated Library access for TarsqiDocument and its TagRepository
- Moved pipeline definition out of docmodel (to settings)
- Removed some garbage from the library
- Cleaned up sample settings file
- Cleaned up options in top-level code
- Added some validation to mallet script
- Started updating the manual

See issue #40.
marcverhagen committed Jan 13, 2017
1 parent 2076066 commit 392cfd8
Showing 13 changed files with 364 additions and 453 deletions.
19 changes: 0 additions & 19 deletions code/data/in/simple-xml/test2.xml

This file was deleted.

9 changes: 0 additions & 9 deletions code/data/in/simple-xml/test3.xml

This file was deleted.

84 changes: 46 additions & 38 deletions code/docmodel/document.py
@@ -6,12 +6,28 @@
"""

import sys, codecs, StringIO
from copy import copy
from xml.sax.saxutils import escape, quoteattr

from library.main import LIBRARY


TIMEX = LIBRARY.timeml.TIMEX
EVENT = LIBRARY.timeml.EVENT
ALINK = LIBRARY.timeml.ALINK
SLINK = LIBRARY.timeml.SLINK
TLINK = LIBRARY.timeml.TLINK

TID = LIBRARY.timeml.TID
EID = LIBRARY.timeml.EID
EIID = LIBRARY.timeml.EIID

TIME_ID = LIBRARY.timeml.TIME_ID
EVENT_INSTANCE_ID = LIBRARY.timeml.EVENT_INSTANCE_ID
RELATED_TO_TIME = LIBRARY.timeml.RELATED_TO_TIME
SUBORDINATED_EVENT_INSTANCE = LIBRARY.timeml.SUBORDINATED_EVENT_INSTANCE
RELATED_TO_EVENT_INSTANCE = LIBRARY.timeml.RELATED_TO_EVENT_INSTANCE


class TarsqiDocument:

"""An instance of TarsqiDocument should contain all information that may be
@@ -33,11 +49,7 @@ def __init__(self):
self.metadata = {}
self.options = {}
self.tags = TagRepository()
self.counters = {LIBRARY.timeml.TIMEX: 0,
LIBRARY.timeml.EVENT: 0,
LIBRARY.timeml.ALINK: 0,
LIBRARY.timeml.SLINK: 0,
LIBRARY.timeml.TLINK: 0}
self.counters = {TIMEX: 0, EVENT: 0, ALINK: 0, SLINK: 0, TLINK: 0}

def __str__(self):
fname = self.sourcedoc.filename if self.sourcedoc is not None else None
@@ -59,16 +71,16 @@ def elements(self):

def events(self):
"""Convenience method for easy access to events."""
return self.tags.find_tags(LIBRARY.timeml.EVENT)
return self.tags.find_tags(EVENT)

def timexes(self):
"""Convenience method for easy access to timexes."""
return self.tags.find_tags(LIBRARY.timeml.TIMEX)
return self.tags.find_tags(TIMEX)

def has_event(self, begin, end):
"""Return True if there is already an event at the given begin and
end."""
for tag in self.tags.find_tags(LIBRARY.timeml.EVENT):
for tag in self.tags.find_tags(EVENT):
if tag.begin == begin and tag.end == end:
return True
return False
@@ -99,12 +111,12 @@ def pp(self, source_tags=True, tarsqi_tags=True, tags=True):
print

def next_event_id(self):
self.counters[LIBRARY.timeml.EVENT] += 1
return "e%d" % self.counters[LIBRARY.timeml.EVENT]
self.counters[EVENT] += 1
return "e%d" % self.counters[EVENT]

def next_timex_id(self):
self.counters[LIBRARY.timeml.TIMEX3] += 1
return "t%d" % self.counters[LIBRARY.timeml.TIMEX3]
self.counters[TIMEX] += 1
return "t%d" % self.counters[TIMEX]

def next_link_id(self, link_type):
"""Return a unique lid. The link_type argument is one of {ALINK, SLINK,
@@ -113,13 +125,13 @@ def next_link_id(self, link_type):
using the link counters in the document. Breaks down if there are
already links added without using those counters."""
self.counters[link_type] += 1
return "l%d" % (self.counters[LIBRARY.timeml.ALINK] +
self.counters[LIBRARY.timeml.SLINK] +
self.counters[LIBRARY.timeml.TLINK])
return "l%d" % (self.counters[ALINK] +
self.counters[SLINK] +
self.counters[TLINK])

def remove_tlinks(self):
"""Remove all TLINK tags from the tags repository."""
self.tags.remove_tags(LIBRARY.timeml.TLINK)
self.tags.remove_tags(TLINK)

def print_source(self, fname):
"""Print the original source of the document, without the tags to file
@@ -392,16 +404,16 @@ def index(self):
def index_events(self):
self.eid2event = {}
for tag in self.tags:
if tag.name == LIBRARY.timeml.EVENT:
self.eid2event[tag.attrs[LIBRARY.timeml.EIID]] = tag
if tag.name == EVENT:
self.eid2event[tag.attrs[EIID]] = tag

def index_timexes(self):
# TODO: merge with ei2events and create id2tag, assumes all tags have
# ids and they are unique
self.tid2timex = {}
for tag in self.tags:
if tag.name == LIBRARY.timeml.TIMEX:
self.tid2timex[tag.attrs[LIBRARY.timeml.TID]] = tag
if tag.name == TIMEX:
self.tid2timex[tag.attrs[TID]] = tag

def find_tags(self, name, begin=None, end=None):
"""Return all tags of this name. If the optional begin and end are given
@@ -414,26 +426,21 @@ def find_linktags(self, name, o1, o2):
def find_linktags(self, name, o1, o2):
"""Return all the link tages with type name. Only include the ones that
fall between offsets o1 and o2."""
EID = LIBRARY.timeml.EVENT_INSTANCE_ID
SEI = LIBRARY.timeml.SUBORDINATED_EVENT_INSTANCE
REI = LIBRARY.timeml.RELATED_TO_EVENT_INSTANCE
TID = LIBRARY.timeml.TIME_ID
RTT = LIBRARY.timeml.RELATED_TO_TIME
tags = []
for tag in sorted([t for t in self.tags if t.name == name]):
if name == LIBRARY.timeml.SLINK:
t1 = self.eid2event.get(tag.attrs.get(EID))
t2 = self.eid2event.get(tag.attrs.get(SEI))
if name == LIBRARY.timeml.ALINK:
t1 = self.eid2event.get(tag.attrs.get(EID))
t2 = self.eid2event.get(tag.attrs.get(REI))
if name == LIBRARY.timeml.TLINK:
t1 = self.eid2event.get(tag.attrs.get(EID))
t2 = self.eid2event.get(tag.attrs.get(REI))
if name == SLINK:
t1 = self.eid2event.get(tag.attrs.get(EVENT_INSTANCE_ID))
t2 = self.eid2event.get(tag.attrs.get(SUBORDINATED_EVENT_INSTANCE))
if name == ALINK:
t1 = self.eid2event.get(tag.attrs.get(EVENT_INSTANCE_ID))
t2 = self.eid2event.get(tag.attrs.get(RELATED_TO_EVENT_INSTANCE))
if name == TLINK:
t1 = self.eid2event.get(tag.attrs.get(EVENT_INSTANCE_ID))
t2 = self.eid2event.get(tag.attrs.get(RELATED_TO_EVENT_INSTANCE))
if t1 is None:
t1 = self.tid2timex.get(tag.attrs.get(TID))
t1 = self.tid2timex.get(tag.attrs.get(TIME_ID))
if t2 is None:
t2 = self.tid2timex.get(tag.attrs.get(RTT))
t2 = self.tid2timex.get(tag.attrs.get(RELATED_TO_TIME))
offsets = [t1.begin, t1.end, t2.begin, t2.end]
to1 = min(offsets)
to2 = max(offsets)
@@ -500,7 +507,7 @@ def __init__(self, identifier, name, o1, o2, attrs):
self.attrs = attrs
# TODO: should investigate tag initialization since there is an
# impression that it is not consistent
if self.id is None and attrs.has_key('id'):
if self.id is None and 'id' in attrs:
self.id = attrs.get('id')
del(self.attrs['id'])

@@ -543,6 +550,7 @@ def as_ttk_tag(self):
(self.name, identifier, begin, end, self.attributes_as_string())

def as_lex_xml_string(self, text):
"""Return an opening and closing tag wrapped around text."""
return "<lex id=\"%s\" begin=\"%d\" end=\"%d\" pos=\"%s\">%s</lex>" % \
(self.id, self.begin, self.end, str(self.attrs['pos']), escape(text))

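The diff above replaces repeated LIBRARY.timeml lookups with module-level constants and keeps one counter per tag type for generating identifiers. Below is a standalone sketch of that numbering scheme; it mirrors next_event_id and next_link_id from the diff but is only an illustration, and the tag-name strings are placeholders rather than the real LIBRARY values.

# Illustration of the id-numbering scheme used by TarsqiDocument
# (hypothetical mirror of next_event_id and next_link_id; the string
# values below are placeholders, not the actual LIBRARY constants).
TIMEX, EVENT = 'TIMEX3', 'EVENT'
ALINK, SLINK, TLINK = 'ALINK', 'SLINK', 'TLINK'

counters = {TIMEX: 0, EVENT: 0, ALINK: 0, SLINK: 0, TLINK: 0}

def next_event_id():
    counters[EVENT] += 1
    return "e%d" % counters[EVENT]

def next_link_id(link_type):
    # lids are unique across link types because the returned number is
    # the sum of all three link counters
    counters[link_type] += 1
    return "l%d" % (counters[ALINK] + counters[SLINK] + counters[TLINK])

print(next_event_id())      # e1
print(next_link_id(TLINK))  # l1
print(next_link_id(SLINK))  # l2
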
12 changes: 0 additions & 12 deletions code/docmodel/main.py
@@ -19,9 +19,6 @@
from docmodel.metadata_parser import MetadataParserTimebank, MetadataParserDB
from docmodel.metadata_parser import MetadataParserATEE, MetadataParserRTE3
from docmodel.docstructure_parser import DocumentStructureParser
from library.tarsqi_constants import TOKENIZER, TAGGER, CHUNKER
from library.tarsqi_constants import PREPROCESSOR, GUTIME, EVITA, SLINKET, S2T
from library.tarsqi_constants import CLASSIFIER, BLINKER, LINK_MERGER


PARSERS = {
@@ -35,9 +32,6 @@
}


DEFAULT_PIPELINE = [PREPROCESSOR, GUTIME, EVITA, SLINKET, S2T,
BLINKER, CLASSIFIER, LINK_MERGER]

DEFAULT_SOURCE_PARSER = SourceParserXML
DEFAULT_METADATA_PARSER = MetadataParser
DEFAULT_PARSERS = (DEFAULT_SOURCE_PARSER, DEFAULT_METADATA_PARSER)
@@ -60,9 +54,3 @@ def create_docstructure_parser():
# where the parser creates tags similar to other components and where the
# elements variable is gone.
return DocumentStructureParser()


def get_default_pipeline(options):
"""Now always returns the same but can be used for genre-specific
pipelines."""
return DEFAULT_PIPELINE
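
With this change the pipeline is no longer defined in docmodel; the sample settings file later in this commit defines it as a comma-separated string (pipeline = PREPROCESSOR,GUTIME,...). A minimal sketch of turning that string back into the component list that DEFAULT_PIPELINE used to provide is given below; the project's actual option handling lives in the top-level code, which is not part of the hunks shown here, so parse_pipeline is a hypothetical helper.

# Hypothetical helper: rebuild a component list from the comma-separated
# "pipeline" value found in settings.txt.
def parse_pipeline(value):
    return [name.strip() for name in value.split(',') if name.strip()]

print(parse_pipeline("PREPROCESSOR,GUTIME,EVITA,SLINKET,S2T,BLINKER,CLASSIFIER,LINK_MERGER"))
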
3 changes: 3 additions & 0 deletions code/library/evita/compile_patterns.py
@@ -30,6 +30,9 @@
# variable is not used so we hand in a dummy value
os.environ['TTK_ROOT'] = 'DUMMY'

# TODO: why not just import root???


from utilities.FSA import compileOP
from library.evita.multi_chunk_patterns import patternsGroups

2 changes: 1 addition & 1 deletion code/library/main.py
@@ -5,7 +5,7 @@
syntax for all libraries (simple settings, rules, etcetera) and then read in the
libraries (which allows you to read only those libraries that are required).
The LIBRARY variable allows other modules a single import from which all
The LIBRARY variable provides other modules a single import point from which all
settings can be accessed.
"""
7 changes: 1 addition & 6 deletions code/library/tarsqi_constants.py
@@ -24,10 +24,5 @@
CLASSIFIER = 'CLASSIFIER'

LINK_MERGER = 'LINK_MERGER'
MERGER = 'LINK_MERGER'


# PROCESSING PARAMETERS

TRAP_ERRORS = 'trap_errors'
EXTENSION = 'extension'
PIPELINE = 'pipeline'
6 changes: 3 additions & 3 deletions code/root.py
@@ -2,9 +2,9 @@
All this module does is to set the TTK_ROOT environment variable.
Previsouly this was done inline in for example tarsqi.py, but this resulted in
an ugly situation where a piece of code was inserted inbetween a set of import
statements.
Previously, this was done inline for all the modules that needed it, but this
resulted in an ugly situation where a piece of code was inserted inbetween a set
of import statements.
"""

56 changes: 39 additions & 17 deletions code/settings.sample.txt
@@ -1,43 +1,65 @@
# This is an example file with configuration options. You should rename this
# file into settings.txt (or copy it into settings.txt) and make changes as
# desired.
# This is an example file with configuration options. You should make a copy of
# this file and name it settings.txt and make changes to the copy as needed.

# Option can be changed here or in some cases on the command line when
# calling the tarsqy.py script. Command line options, when possible, will
# overwrite options specified here.
# Option can be changed here or in some cases on the command line when calling
# the tarsqy.py script. Command line options will overwrite options specified
# here.


# The platform, possible values are 'linux2' and 'darwin'. This is the value of
# the python sys.platform variable
# The default pipeline, can be overridden with the --pipeline command line
# option

platform = linux2
platform = darwin
pipeline = PREPROCESSOR,GUTIME,EVITA,SLINKET,S2T,BLINKER,CLASSIFIER,LINK_MERGER


# Location of the IMS TreeTagger
# Location of perl. Change this into an absolute path if perl cannot be accessed
# by the system by simply using 'perl'

perl = perl


# Location of the IMS TreeTagger, can be overridden with the --treetagger
# command line option.

treetagger = /Applications/ADDED/nlp/treetagger


# Location of Mallet, this should be the directory that contains the bin
# directory. The version 2.0.8 release candidate is somewhat faster.
# directory. This option can be overridden by the --mallet command line
# option.

mallet = /Applications/ADDED/nlp/mallet/mallet-2.0.7
mallet = /Applications/ADDED/nlp/mallet/mallet-2.0.8RC3
mallet = /Applications/ADDED/nlp/mallet/mallet-2.0.8


# Other options used for the classifier, ee-model and et-model should refer to a
# model in components/classifier/models
# Other options used for the classifier, where ee-model and et-model should
# refer to a model in components/classifier/models. These options can be
# overridden by the --classifier, --ee-model and --et-model command line
# options.
# TODO: allow absolute paths

classifier = MaxEnt
ee-model = tb-vectors.ee.model
et-model = tb-vectors.et.model


# Several other options can be set here. Refer to tarsqi.py for descriptions.
# The source type of the document which allows components to be sensitive to
# idiosyncratic properties of the text. It is xml by default, other source
# types are text and ttk. This would typically be set with the --source command
# line option.

source = xml


# Set log level to an integer from 0 to 4, the higher the level the more
# messages will be written to the log. The default prints info, errors and
# warnings, but no debugging statements. See utilities.logger for more
# details. Can be overridden with --loglevel.

loglevel = 3


# Error trapping, errors are trapped by default. Override with --trap-errors.

trap-errors = True
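
The file is plain key = value lines with # comments, and the comments above state that command-line options overwrite what is set here. A minimal sketch of that behavior under those assumptions follows; read_settings and effective_options are hypothetical helpers, not the project's actual settings reader.

# Hypothetical reader for the key = value format shown above, with
# command-line options taking precedence over the file.
def read_settings(path):
    settings = {}
    for line in open(path):
        line = line.strip()
        if not line or line.startswith('#'):
            continue
        key, _, value = line.partition('=')
        settings[key.strip()] = value.strip()
    return settings

def effective_options(file_settings, command_line_options):
    options = dict(file_settings)
    options.update(command_line_options)  # command line wins
    return options
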

