From f66dc147faf24d9fc6d790e91f38e4698e392b09 Mon Sep 17 00:00:00 2001 From: Akron Date: Wed, 6 Sep 2023 20:00:47 +0200 Subject: [PATCH] Fix end of text behaviour in case of sentence positions Change-Id: Ic433dd3579d9a79df5734a405e682596c3ccddad --- Changes | 4 ++++ datok.go | 4 ++++ matrix.go | 4 ++++ token_writer_test.go | 12 ++++++++++++ 4 files changed, 24 insertions(+) diff --git a/Changes b/Changes index dd48d28..9af4aaa 100644 --- a/Changes +++ b/Changes @@ -1,3 +1,7 @@ +0.2.2 2023-09-06 + - Fix behaviour for end of text character positions + when no end of sentence occurred before. + 0.2.1 2023-09-05 - Add english tokenizer. - Fix buffer bug. diff --git a/datok.go b/datok.go index fba655e..1dd5cbe 100644 --- a/datok.go +++ b/datok.go @@ -1018,6 +1018,10 @@ PARSECHAR: if eot { eot = false + if !sentenceEnd { + sentenceEnd = true + w.SentenceEnd(buffc) + } textEnd = true w.TextEnd(0) if DEBUG { diff --git a/matrix.go b/matrix.go index e2d9858..7eda112 100644 --- a/matrix.go +++ b/matrix.go @@ -592,6 +592,10 @@ PARSECHARM: if eot { eot = false + if !sentenceEnd { + sentenceEnd = true + w.SentenceEnd(buffc) + } textEnd = true w.TextEnd(buffc) rewindBuffer = true diff --git a/token_writer_test.go b/token_writer_test.go index 868e69d..63b9c2b 100644 --- a/token_writer_test.go +++ b/token_writer_test.go @@ -85,6 +85,18 @@ func TestTokenWriterFromOptions(t *testing.T) { matStr = w.String() assert.Equal("1 5 5 6\n1 6\n0 3 3 4\n0 4\n", matStr) + w.Reset() + mat.TransduceTokenWriter(strings.NewReader("Tree\n\x04\n"), tws) + + matStr = w.String() + assert.Equal("0 4\n0 4\n", matStr) + + w.Reset() + mat.TransduceTokenWriter(strings.NewReader("Tree.\n\x04\n"), tws) + + matStr = w.String() + assert.Equal("0 4 4 5\n0 5\n", matStr) + // // Write sentence offsets without token offsets tws = NewTokenWriter(w, SENTENCE_POS|NEWLINE_AFTER_EOT)