Skip to content

Commit

Permalink
Merge branch 'release/35.x'
Browse files Browse the repository at this point in the history
* release/35.x:
  #5235 - Assistant index chunks should not overlap
reckart committed Jan 20, 2025

Verified

This commit was created on GitHub.com and signed with GitHub’s verified signature. The key has expired.
2 parents cb282c8 + 722d585 commit 2c24d87
Showing 4 changed files with 102 additions and 22 deletions.
5 changes: 5 additions & 0 deletions inception/inception-assistant/pom.xml
Original file line number Diff line number Diff line change
@@ -297,6 +297,11 @@
<artifactId>inception-annotation-storage</artifactId>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.apache.uima</groupId>
<artifactId>uimafit-core</artifactId>
<scope>test</scope>
</dependency>
</dependencies>

<build>
Original file line number Diff line number Diff line change
@@ -17,58 +17,53 @@
*/
package de.tudarmstadt.ukp.inception.assistant.documents;

import static java.lang.Math.floorDiv;

import java.util.ArrayList;
import java.util.List;

import org.apache.uima.cas.CAS;
import org.apache.uima.jcas.tcas.Annotation;

import com.knuddels.jtokkit.api.EncodingRegistry;
import com.knuddels.jtokkit.api.Encoding;

import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence;
import de.tudarmstadt.ukp.inception.assistant.config.AssistantProperties;
import de.tudarmstadt.ukp.inception.support.text.TrimUtils;

public class CasChunker
implements Chunker<CAS>
{
private final EncodingRegistry encodingRegistry;
private final AssistantProperties properties;
private Class<? extends Annotation> unitType = Sentence.class;
private final Encoding encoding;
private final int chunkSize;

public CasChunker(EncodingRegistry aEncodingRegistry, AssistantProperties aProperties)
public CasChunker(Encoding aEncoding, int aChunkSize)
{
encodingRegistry = aEncodingRegistry;
properties = aProperties;
encoding = aEncoding;
chunkSize = aChunkSize;
}

@Override
public List<Chunk> process(CAS aCas)
{
var docText = aCas.getDocumentText();

var encoding = encodingRegistry.getEncoding(properties.getChat().getEncoding())
.orElseThrow(() -> new IllegalStateException(
"Unknown encoding: " + properties.getChat().getEncoding()));
var limit = floorDiv(properties.getDocumentIndex().getChunkSize() * 90, 100);

var unitIterator = aCas.select(Sentence.class) //
var unitIterator = aCas.select(unitType) //
.toList().iterator();

var chunks = new ArrayList<Chunk>();

Sentence unit = null;
Annotation unit = null;
var chunk = new StringBuilder();
var chunkBegin = 0;
var chunkEnd = 0;
var chunkTokens = 0;
while (unitIterator.hasNext()) {
unit = unitIterator.next();
var unitTokens = encoding.countTokensOrdinary(unit.getCoveredText());

// Start a new chunk if necessary
if (chunkTokens + unitTokens > limit) {
if (chunkTokens + unitTokens > chunkSize) {
if (!chunk.isEmpty()) {
chunks.add(buildChunk(docText, chunkBegin, unit));
chunks.add(buildChunk(docText, chunkBegin, chunkEnd));
}
chunk.setLength(0);
chunkBegin = unit.getBegin();
@@ -78,19 +73,20 @@ public List<Chunk> process(CAS aCas)
chunk.append(unit.getCoveredText());
chunk.append("\n");
chunkTokens += unitTokens;
chunkEnd = unit.getEnd();
}

// Add the final chunk (unless empty)
if (chunk.length() > 0 && unit != null) {
chunks.add(buildChunk(docText, chunkBegin, unit));
chunks.add(buildChunk(docText, chunkBegin, unit.getEnd()));
}

return chunks;
}

private Chunk buildChunk(String docText, int currentChunkBegin, Sentence unit)
private Chunk buildChunk(String docText, int aBegin, int aEnd)
{
var range = new int[] {currentChunkBegin, unit.getEnd()};
var range = new int[] { aBegin, aEnd };
TrimUtils.trim(docText, range);
return Chunk.builder() //
.withText(docText.substring(range[0], range[1])) //
Original file line number Diff line number Diff line change
@@ -103,7 +103,12 @@ public void execute() throws Exception
return;
}

var chunker = new CasChunker(encodingRegistry, properties);
var encoding = encodingRegistry.getEncoding(properties.getChat().getEncoding())
.orElseThrow(() -> new IllegalStateException(
"Unknown encoding: " + properties.getChat().getEncoding()));
var limit = floorDiv(properties.getDocumentIndex().getChunkSize() * 90, 100);

var chunker = new CasChunker(encoding, limit);

var monitor = getMonitor();
try (var index = documentQueryService.borrowIndex(getProject())) {
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
/*
* Licensed to the Technische Universität Darmstadt under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The Technische Universität Darmstadt
* licenses this file to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License.
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package de.tudarmstadt.ukp.inception.assistant.documents;

import static org.assertj.core.api.Assertions.assertThat;

import org.apache.uima.fit.factory.JCasFactory;
import org.apache.uima.fit.testing.factory.TokenBuilder;
import org.junit.jupiter.api.Test;

import com.knuddels.jtokkit.Encodings;
import com.knuddels.jtokkit.api.EncodingType;

import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token;

/**
 * Unit test for {@link CasChunker}.
 */
class CasChunkerTest
{
    /**
     * Runs the chunker with a chunk size of 5 over a document of ten short lines and expects
     * every line to be returned as a separate chunk, in document order and without any text
     * being shared between neighbouring chunks.
     *
     * @throws Exception
     *             if the CAS cannot be created or populated.
     */
    @Test
    void test() throws Exception
    {
        // One short sentence per line. The expected result below implies that two of these
        // sentences together do not fit into a chunk of size 5.
        var documentText = """
                This is sentence 1.
                This is sentence 2.
                This is sentence 3.
                This is sentence 4.
                This is sentence 5.
                This is sentence 6.
                This is sentence 7.
                This is sentence 8.
                This is sentence 9.
                This is sentence 10.
                """;

        // Populate a fresh CAS with Token and Sentence annotations for the text.
        // NOTE(review): presumably TokenBuilder treats each line as one sentence - confirm
        // against the uimaFIT documentation.
        var jcas = JCasFactory.createJCas();
        TokenBuilder.create(Token.class, Sentence.class).buildTokens(jcas, documentText);

        var encoding = Encodings.newLazyEncodingRegistry()
                .getEncoding(EncodingType.CL100K_BASE);
        var sut = new CasChunker(encoding, 5);

        var result = sut.process(jcas.getCas());

        assertThat(result) //
                .extracting(Chunk::text) //
                .containsExactly( //
                        "This is sentence 1.", //
                        "This is sentence 2.", //
                        "This is sentence 3.", //
                        "This is sentence 4.", //
                        "This is sentence 5.", //
                        "This is sentence 6.", //
                        "This is sentence 7.", //
                        "This is sentence 8.", //
                        "This is sentence 9.", //
                        "This is sentence 10.");
    }
}

0 comments on commit 2c24d87

Please sign in to comment.