From 3a0199d77656c5b68a887c857315556684528456 Mon Sep 17 00:00:00 2001 From: Jens Wille Date: Tue, 6 Mar 2012 14:48:40 +0100 Subject: [PATCH] v1.8.2 --- ChangeLog | 18 ++++++++++++++++++ README | 2 +- Rakefile | 6 ++---- TODO | 2 ++ lib/lingo/version.rb | 2 +- lingo.gemspec | 10 +++++----- 6 files changed, 29 insertions(+), 11 deletions(-) diff --git a/ChangeLog b/ChangeLog index 00f2708a..31b57a4b 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,5 +1,23 @@ = Revision history for Lingo +== 1.8.2 [2012-04-19] + +* Performance improvements regarding Attendee::VectorFilter's (as well as + Attendee::NonewordFilter's) memory usage; set sort: false in the config. +* Added Attendee::Stemmer (implementing Porter's algorithm for suffix stripping). +* Added progress reporting to Attendee::TextReader; set progress: true + in the config. +* Added directory and glob processing to Attendee::TextReader (new options + +glob+ and +recursive+). +* Renamed Attendee::TextReader's option +lir-record-pattern+ to +records+. +* Fixed Attendee::Debugger to forward all objects so it can be inserted + between any two attendees in the config. +* Fixed regression introduced in 1.8.0 where Lingo would not use existing + compiled dictionary when source file is not present. +* Fixed "invalid byte sequence in UTF-8" on Windows for SDBM store. +* Enabled pluggable (compiled) dictionaries and storage backends. +* Extensive internal refactoring and cleanup. (Finished for now.) + == 1.8.1 [2012-02-19] * Introduced alternative storage backends, mainly to circumvent SDBM's record diff --git a/README b/README index dd989a30..f0369c37 100644 --- a/README +++ b/README @@ -25,7 +25,7 @@ == VERSION -This documentation refers to Lingo version 1.8.1 +This documentation refers to Lingo version 1.8.2 == DESCRIPTION diff --git a/Rakefile b/Rakefile index f0b5537d..c779dd99 100644 --- a/Rakefile +++ b/Rakefile @@ -39,10 +39,8 @@ The main functions of Lingo are: of word classes EOT extra_files: FileList[ - 'lingo.rb', 'lingo{,-call}.cfg', 'lingo.opt', 'doc/**/*', - '{de,en}.lang', '{de,en}/{lingo-*,user-dic}.txt', 'txt/artikel{,-en}.txt', - 'info/gpl-hdr.txt', 'info/*.png', 'lir.cfg', 'txt/lir.txt', 'porter/*', - '{de,en}/test_*.txt' + 'lingo.rb', 'lingo{,-call}.cfg', 'lir.cfg', '{de,en}.lang', + '{de,en}/{lingo-*,user-dic,test_*}.txt', 'txt/{artikel{,-en},lir}.txt' ].to_a, required_ruby_version: '>= 1.9', dependencies: [['ruby-nuggets', '>= 0.8.5'], 'unicode', 'highline'], diff --git a/TODO b/TODO index 78e8005e..412f3474 100644 --- a/TODO +++ b/TODO @@ -1,5 +1,7 @@ = ToDo list for Lingo +* Configuration parameter validation. +* Replace regex-based tokenizer with a (Racc/Ragel/ANTLR-based?) lexer. * Update and translate old documentation. * Allow for handling of documents in various encodings, not just the one the dictionaries are encoded in. diff --git a/lib/lingo/version.rb b/lib/lingo/version.rb index ee169f39..9ccec582 100644 --- a/lib/lingo/version.rb +++ b/lib/lingo/version.rb @@ -4,7 +4,7 @@ module Version MAJOR = 1 MINOR = 8 - TINY = 1 + TINY = 2 class << self diff --git a/lingo.gemspec b/lingo.gemspec index dbd8737f..e24bed76 100644 --- a/lingo.gemspec +++ b/lingo.gemspec @@ -2,18 +2,18 @@ Gem::Specification.new do |s| s.name = "lingo" - s.version = "1.8.1" + s.version = "1.8.2" s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version= s.authors = ["John Vorhauer", "Jens Wille"] - s.date = "2012-02-19" + s.date = "2012-04-19" s.description = "Lingo is an open source indexing system for research and teachings.\nThe main functions of Lingo are:\n\n* identification of (i.e. reduction to) basic word form by means of\n dictionaries and suffix lists\n* algorithmic decomposition\n* dictionary-based synonymisation and identification of phrases\n* generic identification of phrases/word sequences based on patterns\n of word classes\n" s.email = ["lingo@vorhauer.de", "jens.wille@uni-koeln.de"] - s.executables = ["lingo", "lingoctl"] + s.executables = ["lingoctl", "lingo"] s.extra_rdoc_files = ["README", "COPYING", "ChangeLog"] - s.files = ["lib/lingo/ctl.rb", "lib/lingo/database.rb", "lib/lingo/error.rb", "lib/lingo/version.rb", "lib/lingo/database/source.rb", "lib/lingo/database/libcdb_store.rb", "lib/lingo/database/sdbm_store.rb", "lib/lingo/database/show_progress.rb", "lib/lingo/database/crypter.rb", "lib/lingo/database/source/multi_key.rb", "lib/lingo/database/source/key_value.rb", "lib/lingo/database/source/single_word.rb", "lib/lingo/database/source/word_class.rb", "lib/lingo/database/source/multi_value.rb", "lib/lingo/database/gdbm_store.rb", "lib/lingo/database/hash_store.rb", "lib/lingo/cli.rb", "lib/lingo/cachable.rb", "lib/lingo/attendee/variator.rb", "lib/lingo/attendee/debugger.rb", "lib/lingo/attendee/object_filter.rb", "lib/lingo/attendee/synonymer.rb", "lib/lingo/attendee/text_writer.rb", "lib/lingo/attendee/multi_worder.rb", "lib/lingo/attendee/text_reader.rb", "lib/lingo/attendee/dehyphenizer.rb", "lib/lingo/attendee/tokenizer.rb", "lib/lingo/attendee/abbreviator.rb", "lib/lingo/attendee/formatter.rb", "lib/lingo/attendee/noneword_filter.rb", "lib/lingo/attendee/sequencer.rb", "lib/lingo/attendee/decomposer.rb", "lib/lingo/attendee/word_searcher.rb", "lib/lingo/attendee/vector_filter.rb", "lib/lingo/config.rb", "lib/lingo/core_ext.rb", "lib/lingo/agenda_item.rb", "lib/lingo/buffered_attendee.rb", "lib/lingo/reportable.rb", "lib/lingo/language.rb", "lib/lingo/language/dictionary.rb", "lib/lingo/language/word.rb", "lib/lingo/language/lexical.rb", "lib/lingo/language/word_form.rb", "lib/lingo/language/token.rb", "lib/lingo/language/grammar.rb", "lib/lingo/language/lexical_hash.rb", "lib/lingo/attendee.rb", "lib/lingo/call.rb", "lib/lingo.rb", "bin/lingo", "bin/lingoctl", "lingo.rb", "lingo.cfg", "lingo-all.cfg", "lingo-call.cfg", "de.lang", "en.lang", "de/lingo-syn.txt", "de/lingo-abk.txt", "de/lingo-dic.txt", "de/lingo-mul.txt", "de/user-dic.txt", "en/lingo-dic.txt", "en/lingo-mul.txt", "en/user-dic.txt", "txt/artikel.txt", "txt/artikel-en.txt", "info/gpl-hdr.txt", "info/kerze.png", "info/meeting.png", "info/lingo.png", "info/types.png", "info/logo.png", "info/language.png", "info/Typen.png", "info/Objekte.png", "info/download.png", "info/database.png", "info/db_small.png", "lir.cfg", "txt/lir.txt", "porter/stem.rb", "porter/stem.cfg", "test.cfg", "de/test_mul.txt", "de/test_singleword.txt", "de/test_mul2.txt", "de/test_syn.txt", "de/test_dic.txt", "de/test_syn2.txt", "TODO", "README", "ChangeLog", "COPYING", "Rakefile", "spec/spec_helper.rb", ".rspec", "test/lir.csv", "test/attendee/ts_abbreviator.rb", "test/attendee/ts_noneword_filter.rb", "test/attendee/ts_word_searcher.rb", "test/attendee/ts_object_filter.rb", "test/attendee/ts_vector_filter.rb", "test/attendee/ts_text_writer.rb", "test/attendee/ts_decomposer.rb", "test/attendee/ts_sequencer.rb", "test/attendee/ts_synonymer.rb", "test/attendee/ts_tokenizer.rb", "test/attendee/ts_variator.rb", "test/attendee/ts_text_reader.rb", "test/attendee/ts_multi_worder.rb", "test/mul.txt", "test/test_helper.rb", "test/ref/artikel.ven", "test/ref/lir.csv", "test/ref/artikel.vec", "test/ref/lir.mul", "test/ref/artikel.syn", "test/ref/lir.syn", "test/ref/artikel.mul", "test/ref/artikel.seq", "test/ref/lir.seq", "test/ref/artikel.non", "test/ref/artikel.ver", "test/ref/lir.non", "test/lir2.txt", "test/ts_database.rb", "test/lir.txt", "test/ts_language.rb"] + s.files = ["lib/lingo.rb", "lib/lingo/show_progress.rb", "lib/lingo/config.rb", "lib/lingo/database.rb", "lib/lingo/language/dictionary.rb", "lib/lingo/language/word_form.rb", "lib/lingo/language/lexical.rb", "lib/lingo/language/grammar.rb", "lib/lingo/language/lexical_hash.rb", "lib/lingo/language/token.rb", "lib/lingo/language/word.rb", "lib/lingo/attendee/stemmer/porter.rb", "lib/lingo/attendee/vector_filter.rb", "lib/lingo/attendee/noneword_filter.rb", "lib/lingo/attendee/object_filter.rb", "lib/lingo/attendee/variator.rb", "lib/lingo/attendee/multi_worder.rb", "lib/lingo/attendee/text_reader.rb", "lib/lingo/attendee/synonymer.rb", "lib/lingo/attendee/word_searcher.rb", "lib/lingo/attendee/dehyphenizer.rb", "lib/lingo/attendee/sequencer.rb", "lib/lingo/attendee/debugger.rb", "lib/lingo/attendee/text_writer.rb", "lib/lingo/attendee/stemmer.rb", "lib/lingo/attendee/tokenizer.rb", "lib/lingo/attendee/abbreviator.rb", "lib/lingo/attendee/decomposer.rb", "lib/lingo/attendee/formatter.rb", "lib/lingo/database/hash_store.rb", "lib/lingo/database/show_progress.rb", "lib/lingo/database/sdbm_store.rb", "lib/lingo/database/source.rb", "lib/lingo/database/crypter.rb", "lib/lingo/database/source/multi_value.rb", "lib/lingo/database/source/word_class.rb", "lib/lingo/database/source/key_value.rb", "lib/lingo/database/source/multi_key.rb", "lib/lingo/database/source/single_word.rb", "lib/lingo/database/gdbm_store.rb", "lib/lingo/database/libcdb_store.rb", "lib/lingo/call.rb", "lib/lingo/attendee.rb", "lib/lingo/version.rb", "lib/lingo/ctl.rb", "lib/lingo/cli.rb", "lib/lingo/core_ext.rb", "lib/lingo/buffered_attendee.rb", "lib/lingo/agenda_item.rb", "lib/lingo/cachable.rb", "lib/lingo/language.rb", "lib/lingo/error.rb", "lib/lingo/reportable.rb", "bin/lingoctl", "bin/lingo", "lingo.rb", "lingo.cfg", "lingo-call.cfg", "lir.cfg", "de.lang", "en.lang", "de/lingo-dic.txt", "de/lingo-abk.txt", "de/lingo-syn.txt", "de/lingo-mul.txt", "de/user-dic.txt", "de/test_syn.txt", "de/test_dic.txt", "de/test_syn2.txt", "de/test_singleword.txt", "de/test_mul.txt", "de/test_mul2.txt", "en/lingo-dic.txt", "en/lingo-syn.txt", "en/lingo-mul.txt", "en/user-dic.txt", "txt/artikel.txt", "txt/artikel-en.txt", "txt/lir.txt", "ChangeLog", "COPYING", "README", "Rakefile", "TODO", "spec/spec_helper.rb", ".rspec", "test/ref/artikel.ven", "test/ref/lir.mul", "test/ref/lir.vec", "test/ref/artikel.vec", "test/ref/lir.syn", "test/ref/artikel.mul", "test/ref/artikel.syn", "test/ref/artikel.seq", "test/ref/artikel.non", "test/ref/lir.non", "test/ref/lir.seq", "test/ref/artikel.ver", "test/ts_language.rb", "test/lir2.txt", "test/attendee/ts_noneword_filter.rb", "test/attendee/ts_text_writer.rb", "test/attendee/ts_sequencer.rb", "test/attendee/ts_object_filter.rb", "test/attendee/ts_text_reader.rb", "test/attendee/ts_multi_worder.rb", "test/attendee/ts_variator.rb", "test/attendee/ts_decomposer.rb", "test/attendee/ts_abbreviator.rb", "test/attendee/ts_stemmer.rb", "test/attendee/ts_tokenizer.rb", "test/attendee/ts_vector_filter.rb", "test/attendee/ts_word_searcher.rb", "test/attendee/ts_synonymer.rb", "test/lir.vec", "test/test_helper.rb", "test/lir.txt", "test/mul.txt", "test/ts_database.rb"] s.homepage = "http://lex-lingo.de" - s.rdoc_options = ["--charset", "UTF-8", "--line-numbers", "--all", "--title", "lingo Application documentation (v1.8.1)", "--main", "README"] + s.rdoc_options = ["--charset", "UTF-8", "--line-numbers", "--all", "--title", "lingo Application documentation (v1.8.2)", "--main", "README"] s.require_paths = ["lib"] s.required_ruby_version = Gem::Requirement.new(">= 1.9") s.rubygems_version = "1.8.17"