From 57a8c35489060dbcd8320360c79fb6e8979e053b Mon Sep 17 00:00:00 2001 From: Richard Eckart de Castilho Date: Sun, 19 Jan 2025 21:34:27 +0100 Subject: [PATCH] #5235 - Assistant index chunks should not overlap - Fix overlap - Added unit test --- inception/inception-assistant/pom.xml | 17 ++++- .../assistant/documents/CasChunker.java | 38 +++++----- .../documents/UpdateDocumentIndexTask.java | 7 +- .../assistant/documents/CasChunkerTest.java | 74 +++++++++++++++++++ 4 files changed, 110 insertions(+), 26 deletions(-) create mode 100644 inception/inception-assistant/src/test/java/de/tudarmstadt/ukp/inception/assistant/documents/CasChunkerTest.java diff --git a/inception/inception-assistant/pom.xml b/inception/inception-assistant/pom.xml index b6eeceedf9..9b59b306a5 100644 --- a/inception/inception-assistant/pom.xml +++ b/inception/inception-assistant/pom.xml @@ -15,7 +15,9 @@ See the License for the specific language governing permissions and limitations under the License. --> - + 4.0.0 de.tudarmstadt.ukp.inception.app @@ -138,7 +140,7 @@ de.agilecoders.wicket wicket-bootstrap-extensions - + org.apache.uima uimaj-core @@ -286,6 +288,11 @@ inception-annotation-storage test + + org.apache.uima + uimafit-core + test + @@ -301,7 +308,8 @@ ${ts-link-dependency-phase} - link @inception-project/inception-js-api @inception-project/inception-diam + link @inception-project/inception-js-api + @inception-project/inception-diam @@ -340,7 +348,8 @@ jakarta.persistence:jakarta.persistence-api org.springframework.boot:spring-boot - org.springframework.boot:spring-boot-test-autoconfigure + + org.springframework.boot:spring-boot-test-autoconfigure org.springframework:spring-core org.springframework:spring-websocket diff --git a/inception/inception-assistant/src/main/java/de/tudarmstadt/ukp/inception/assistant/documents/CasChunker.java b/inception/inception-assistant/src/main/java/de/tudarmstadt/ukp/inception/assistant/documents/CasChunker.java index 57ebfd7680..2a8f83ba3f 100644 --- a/inception/inception-assistant/src/main/java/de/tudarmstadt/ukp/inception/assistant/documents/CasChunker.java +++ b/inception/inception-assistant/src/main/java/de/tudarmstadt/ukp/inception/assistant/documents/CasChunker.java @@ -17,58 +17,53 @@ */ package de.tudarmstadt.ukp.inception.assistant.documents; -import static java.lang.Math.floorDiv; - import java.util.ArrayList; import java.util.List; import org.apache.uima.cas.CAS; +import org.apache.uima.jcas.tcas.Annotation; -import com.knuddels.jtokkit.api.EncodingRegistry; +import com.knuddels.jtokkit.api.Encoding; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; -import de.tudarmstadt.ukp.inception.assistant.config.AssistantProperties; import de.tudarmstadt.ukp.inception.support.text.TrimUtils; public class CasChunker implements Chunker { - private final EncodingRegistry encodingRegistry; - private final AssistantProperties properties; + private Class unitType = Sentence.class; + private final Encoding encoding; + private final int chunkSize; - public CasChunker(EncodingRegistry aEncodingRegistry, AssistantProperties aProperties) + public CasChunker(Encoding aEncoding, int aChunkSize) { - encodingRegistry = aEncodingRegistry; - properties = aProperties; + encoding = aEncoding; + chunkSize = aChunkSize; } @Override public List process(CAS aCas) { var docText = aCas.getDocumentText(); - - var encoding = encodingRegistry.getEncoding(properties.getChat().getEncoding()) - .orElseThrow(() -> new IllegalStateException( - "Unknown encoding: " + properties.getChat().getEncoding())); - var limit = floorDiv(properties.getDocumentIndex().getChunkSize() * 90, 100); - var unitIterator = aCas.select(Sentence.class) // + var unitIterator = aCas.select(unitType) // .toList().iterator(); var chunks = new ArrayList(); - Sentence unit = null; + Annotation unit = null; var chunk = new StringBuilder(); var chunkBegin = 0; + var chunkEnd = 0; var chunkTokens = 0; while (unitIterator.hasNext()) { unit = unitIterator.next(); var unitTokens = encoding.countTokensOrdinary(unit.getCoveredText()); // Start a new chunk if necessary - if (chunkTokens + unitTokens > limit) { + if (chunkTokens + unitTokens > chunkSize) { if (!chunk.isEmpty()) { - chunks.add(buildChunk(docText, chunkBegin, unit)); + chunks.add(buildChunk(docText, chunkBegin, chunkEnd)); } chunk.setLength(0); chunkBegin = unit.getBegin(); @@ -78,19 +73,20 @@ public List process(CAS aCas) chunk.append(unit.getCoveredText()); chunk.append("\n"); chunkTokens += unitTokens; + chunkEnd = unit.getEnd(); } // Add the final chunk (unless empty) if (chunk.length() > 0 && unit != null) { - chunks.add(buildChunk(docText, chunkBegin, unit)); + chunks.add(buildChunk(docText, chunkBegin, unit.getEnd())); } return chunks; } - private Chunk buildChunk(String docText, int currentChunkBegin, Sentence unit) + private Chunk buildChunk(String docText, int aBegin, int aEnd) { - var range = new int[] {currentChunkBegin, unit.getEnd()}; + var range = new int[] { aBegin, aEnd }; TrimUtils.trim(docText, range); return Chunk.builder() // .withText(docText.substring(range[0], range[1])) // diff --git a/inception/inception-assistant/src/main/java/de/tudarmstadt/ukp/inception/assistant/documents/UpdateDocumentIndexTask.java b/inception/inception-assistant/src/main/java/de/tudarmstadt/ukp/inception/assistant/documents/UpdateDocumentIndexTask.java index f5d23bb984..511513c181 100644 --- a/inception/inception-assistant/src/main/java/de/tudarmstadt/ukp/inception/assistant/documents/UpdateDocumentIndexTask.java +++ b/inception/inception-assistant/src/main/java/de/tudarmstadt/ukp/inception/assistant/documents/UpdateDocumentIndexTask.java @@ -103,7 +103,12 @@ public void execute() throws Exception return; } - var chunker = new CasChunker(encodingRegistry, properties); + var encoding = encodingRegistry.getEncoding(properties.getChat().getEncoding()) + .orElseThrow(() -> new IllegalStateException( + "Unknown encoding: " + properties.getChat().getEncoding())); + var limit = floorDiv(properties.getDocumentIndex().getChunkSize() * 90, 100); + + var chunker = new CasChunker(encoding, limit); var monitor = getMonitor(); try (var index = documentQueryService.borrowIndex(getProject())) { diff --git a/inception/inception-assistant/src/test/java/de/tudarmstadt/ukp/inception/assistant/documents/CasChunkerTest.java b/inception/inception-assistant/src/test/java/de/tudarmstadt/ukp/inception/assistant/documents/CasChunkerTest.java new file mode 100644 index 0000000000..20de988422 --- /dev/null +++ b/inception/inception-assistant/src/test/java/de/tudarmstadt/ukp/inception/assistant/documents/CasChunkerTest.java @@ -0,0 +1,74 @@ +/* + * Licensed to the Technische Universität Darmstadt under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The Technische Universität Darmstadt + * licenses this file to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package de.tudarmstadt.ukp.inception.assistant.documents; + +import static org.assertj.core.api.Assertions.assertThat; + +import org.apache.uima.fit.factory.JCasFactory; +import org.apache.uima.fit.testing.factory.TokenBuilder; +import org.junit.jupiter.api.Test; + +import com.knuddels.jtokkit.Encodings; +import com.knuddels.jtokkit.api.EncodingType; + +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; + +class CasChunkerTest +{ + @Test + void test() throws Exception + { + var encodingRegistry = Encodings.newLazyEncodingRegistry(); + + var sut = new CasChunker(encodingRegistry.getEncoding(EncodingType.CL100K_BASE), 5); + + var cas = JCasFactory.createJCas(); + + var text = """ + This is sentence 1. + This is sentence 2. + This is sentence 3. + This is sentence 4. + This is sentence 5. + This is sentence 6. + This is sentence 7. + This is sentence 8. + This is sentence 9. + This is sentence 10. + """; + var builder = TokenBuilder.create(Token.class, Sentence.class); + builder.buildTokens(cas, text); + + var chunks = sut.process(cas.getCas()); + + assertThat(chunks) // + .extracting(Chunk::text) // + .containsExactly( // + "This is sentence 1.", // + "This is sentence 2.", // + "This is sentence 3.", // + "This is sentence 4.", // + "This is sentence 5.", // + "This is sentence 6.", // + "This is sentence 7.", // + "This is sentence 8.", // + "This is sentence 9.", // + "This is sentence 10."); + } +}