-
Notifications
You must be signed in to change notification settings - Fork 11
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #101 from m-stoeckel/dev-tools
Added AnnotationDropper
- Loading branch information
Showing
2 changed files
with
351 additions
and
0 deletions.
There are no files selected for viewing
162 changes: 162 additions & 0 deletions
162
src/main/java/org/texttechnologylab/DockerUnifiedUIMAInterface/tools/AnnotationDropper.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,162 @@ | ||
package org.texttechnologylab.DockerUnifiedUIMAInterface.tools; | ||
|
||
import java.util.HashSet; | ||
import java.util.List; | ||
import java.util.Set; | ||
import java.util.function.Predicate; | ||
import java.util.stream.Collectors; | ||
|
||
import org.apache.uima.UimaContext; | ||
import org.apache.uima.analysis_engine.AnalysisEngineProcessException; | ||
import org.apache.uima.cas.Type; | ||
import org.apache.uima.fit.component.JCasAnnotator_ImplBase; | ||
import org.apache.uima.fit.descriptor.ConfigurationParameter; | ||
import org.apache.uima.jcas.JCas; | ||
import org.apache.uima.resource.ResourceInitializationException; | ||
|
||
/** | ||
* A {@link JCasAnnotator_ImplBase JCasAnnotator} that drops or retains specific | ||
* types from processed CASes. | ||
* | ||
* @author Manuel Stoeckel | ||
* @version 0.2.0 | ||
*/ | ||
public class AnnotationDropper extends JCasAnnotator_ImplBase { | ||
/** | ||
* The types to drop from the CAS. | ||
* Must be the fully qualified class name of the type. | ||
* | ||
* @apiNote You can use the | ||
* {@link org.apache.uima.jcas.cas.TOP#_TypeName _TypeName} field of | ||
* any {@link org.apache.uima.jcas.tcas.Annotation annotation} to | ||
* access the fully qualified class name for convenience. | ||
* @apiNote Only one of {@link #PARAM_TYPES_TO_DROP} or | ||
* {@link #PARAM_TYPES_TO_RETAIN} can be set. | ||
*/ | ||
public static final String PARAM_TYPES_TO_DROP = "typesToDrop"; | ||
@ConfigurationParameter(name = PARAM_TYPES_TO_DROP, mandatory = false, defaultValue = {}) | ||
private String[] paramTypesToDrop; | ||
|
||
/** | ||
* The types to drop from the CAS. | ||
* Must be the fully qualified class name of the type. | ||
* | ||
* @apiNote WARNING: Make sure to include integral base types like | ||
* {@link org.apache.uima.jcas.cas.Sofa Sofa}! | ||
* @apiNote You can use the | ||
* {@link org.apache.uima.jcas.cas.TOP#_TypeName _TypeName} field of | ||
* any {@link org.apache.uima.jcas.tcas.Annotation annotation} to | ||
* access the fully qualified class name for convenience. | ||
* @apiNote Only one of {@link #PARAM_TYPES_TO_DROP} or | ||
* {@link #PARAM_TYPES_TO_RETAIN} can be set. | ||
*/ | ||
public static final String PARAM_TYPES_TO_RETAIN = "typesToRetain"; | ||
@ConfigurationParameter(name = PARAM_TYPES_TO_RETAIN, mandatory = false, defaultValue = {}) | ||
private String[] paramTypesToRetain; | ||
|
||
enum Mode { | ||
_UNSET, | ||
RETAIN, | ||
DROP | ||
} | ||
|
||
private Mode mode = Mode._UNSET; | ||
private HashSet<String> typeSet = new HashSet<>(); | ||
|
||
/** | ||
* @return The mode of operation. | ||
* Will always be either {@link Mode#RETAIN} or {@link Mode#DROP}. | ||
* @throws IllegalStateException If the mode is unset (i.e. prior to | ||
* {@link #initialize initialization}). | ||
*/ | ||
public Mode getMode() { | ||
switch (this.mode) { | ||
case RETAIN: | ||
return Mode.RETAIN; | ||
case DROP: | ||
return Mode.DROP; | ||
case _UNSET: | ||
default: | ||
throw new IllegalStateException("Mode is unset"); | ||
} | ||
} | ||
|
||
/** | ||
* @return An immutable copy of the {@link #typeSet}. | ||
* @apiNote The returned set can only be empty prior to | ||
* {@link #initialize initialization}. | ||
*/ | ||
public Set<String> getTypeSet() { | ||
return Set.copyOf(this.typeSet); | ||
} | ||
|
||
/** | ||
* Initializes the annotator. | ||
* | ||
* You can either drop or retain specific types from the CAS. | ||
* The mode of operations is determined automatically based on the | ||
* configuration. | ||
* | ||
* @throws IllegalArgumentException If both parameters | ||
* {@link #PARAM_TYPES_TO_DROP} and | ||
* {@link #PARAM_TYPES_TO_RETAIN} are set. | ||
* @throws IllegalArgumentException If both parameters are empty. | ||
*/ | ||
@Override | ||
public void initialize(UimaContext context) throws ResourceInitializationException { | ||
super.initialize(context); | ||
|
||
if (this.paramTypesToDrop.length == 0 && this.paramTypesToRetain.length == 0) { | ||
throw new ResourceInitializationException( | ||
new IllegalArgumentException("At least one of PARAM_TYPES_TO_DROP or PARAM_TYPES_TO_RETAIN must be set")); | ||
} else if (this.paramTypesToDrop.length > 0 && this.paramTypesToRetain.length > 0) { | ||
throw new ResourceInitializationException( | ||
new IllegalArgumentException("Only one of PARAM_TYPES_TO_DROP or PARAM_TYPES_TO_RETAIN can be set")); | ||
} | ||
|
||
if (this.paramTypesToDrop.length > 0) { | ||
this.mode = Mode.DROP; | ||
this.typeSet = new HashSet<>(List.of(this.paramTypesToDrop)); | ||
} else { | ||
this.mode = Mode.RETAIN; | ||
this.typeSet = new HashSet<>(List.of(this.paramTypesToRetain)); | ||
} | ||
} | ||
|
||
@Override | ||
public void process(JCas aJCas) throws AnalysisEngineProcessException { | ||
switch (this.mode) { | ||
case RETAIN: | ||
retainTypes(aJCas, this.typeSet); | ||
break; | ||
case DROP: | ||
dropTypes(aJCas, this.typeSet); | ||
break; | ||
case _UNSET: | ||
default: | ||
throw new IllegalStateException("Mode is unset"); | ||
} | ||
} | ||
|
||
static void retainTypes(JCas aJCas, Set<String> typesToRetain) { | ||
Set<String> typesToDrop = aJCas.getAnnotationIndex().iterator() | ||
.stream() | ||
.map(a -> a.getType().getName()) | ||
.distinct() | ||
.filter(Predicate.not(typesToRetain::contains)) | ||
.collect(Collectors.toSet()); | ||
|
||
dropTypes(aJCas, typesToDrop); | ||
} | ||
|
||
static void dropTypes(JCas aJCas, Iterable<String> typesToDrop) { | ||
for (String typeName : typesToDrop) { | ||
dropType(aJCas, typeName); | ||
} | ||
} | ||
|
||
static void dropType(JCas aJCas, String typeName) { | ||
Type type = aJCas.getTypeSystem().getType(typeName); | ||
aJCas.select(type).forEach(a -> a.removeFromIndexes(aJCas)); | ||
} | ||
} |
189 changes: 189 additions & 0 deletions
189
...st/java/org/texttechnologylab/DockerUnifiedUIMAInterface/tools/TestAnnotationDropper.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,189 @@ | ||
package org.texttechnologylab.DockerUnifiedUIMAInterface.tools; | ||
|
||
import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngine; | ||
import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; | ||
|
||
import java.io.IOException; | ||
import java.net.URISyntaxException; | ||
import java.net.UnknownHostException; | ||
import java.util.Arrays; | ||
import java.util.List; | ||
import java.util.stream.Collectors; | ||
|
||
import org.apache.uima.analysis_engine.AnalysisEngine; | ||
import org.apache.uima.analysis_engine.AnalysisEngineProcessException; | ||
import org.apache.uima.cas.CASException; | ||
import org.apache.uima.fit.factory.JCasFactory; | ||
import org.apache.uima.fit.util.JCasUtil; | ||
import org.apache.uima.jcas.JCas; | ||
import org.apache.uima.jcas.cas.Sofa; | ||
import org.apache.uima.resource.ResourceInitializationException; | ||
import org.junit.jupiter.api.AfterAll; | ||
import org.junit.jupiter.api.AfterEach; | ||
import org.junit.jupiter.api.Assertions; | ||
import org.junit.jupiter.api.BeforeAll; | ||
import org.junit.jupiter.api.Test; | ||
import org.texttechnologylab.DockerUnifiedUIMAInterface.DUUIComposer; | ||
import org.texttechnologylab.DockerUnifiedUIMAInterface.driver.DUUIUIMADriver; | ||
import org.xml.sax.SAXException; | ||
|
||
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; | ||
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; | ||
|
||
public class TestAnnotationDropper { | ||
static JCas jCas; | ||
static DUUIComposer composer; | ||
|
||
static final List<String[]> sentences = Arrays.asList( | ||
new String[] { "This", "is", "a", "sentence", "." }, | ||
new String[] { "This", "is", "another", "sentence", "." }, | ||
new String[] { "This", "is", "a", "third", "sentence", "." }); | ||
|
||
@BeforeAll | ||
static void setUp() throws ResourceInitializationException { | ||
try { | ||
jCas = JCasFactory.createJCas(); | ||
} catch (ResourceInitializationException | CASException e) { | ||
throw new ResourceInitializationException(e); | ||
} | ||
resetCas(); | ||
|
||
Assertions.assertEquals(3, JCasUtil.select(jCas, Sentence.class).size()); | ||
Assertions.assertEquals(16, JCasUtil.select(jCas, Token.class).size()); | ||
|
||
try { | ||
composer = new DUUIComposer() | ||
.withSkipVerification(true) | ||
.withWorkers(1); | ||
} catch (URISyntaxException e) { | ||
throw new ResourceInitializationException(e); | ||
} | ||
|
||
DUUIUIMADriver uimaDriver = new DUUIUIMADriver().withDebug(false); | ||
composer.addDriver(uimaDriver); | ||
} | ||
|
||
@AfterEach | ||
public void afterEach() throws IOException, SAXException { | ||
composer.resetPipeline(); | ||
resetCas(); | ||
} | ||
|
||
static void resetCas() { | ||
jCas.reset(); | ||
jCas.setDocumentText(sentences.stream().flatMap(Arrays::stream).collect(Collectors.joining(" "))); | ||
int tokenOffset = 0; | ||
int sentenceOffset = 0; | ||
for (String[] sentence : sentences) { | ||
String text = String.join(" ", sentence); | ||
jCas.addFsToIndexes(new Sentence(jCas, sentenceOffset, sentenceOffset + text.length())); | ||
sentenceOffset += text.length() + 1; | ||
for (String token : sentence) { | ||
jCas.addFsToIndexes(new Token(jCas, tokenOffset, tokenOffset + token.length())); | ||
tokenOffset += token.length() + 1; | ||
} | ||
} | ||
} | ||
|
||
@AfterAll | ||
static void afterAll() throws UnknownHostException { | ||
composer.shutdown(); | ||
} | ||
|
||
@Test | ||
public void testTypesToRetain() throws ResourceInitializationException, CASException { | ||
try { | ||
AnalysisEngine dropper = createEngine( | ||
AnnotationDropper.class, | ||
AnnotationDropper.PARAM_TYPES_TO_RETAIN, | ||
new String[] { | ||
Sofa._TypeName, | ||
org.apache.uima.jcas.tcas.DocumentAnnotation._TypeName, | ||
org.texttechnologylab.annotation.DocumentAnnotation._TypeName, | ||
Sentence._TypeName, | ||
}); | ||
|
||
try { | ||
dropper.process(jCas); | ||
} catch (AnalysisEngineProcessException e) { | ||
throw new RuntimeException(e); | ||
} | ||
|
||
Assertions.assertEquals(3, JCasUtil.select(jCas, Sentence.class).size()); | ||
Assertions.assertEquals(0, JCasUtil.select(jCas, Token.class).size()); | ||
} catch (Exception e) { | ||
throw new RuntimeException(e); | ||
} | ||
} | ||
|
||
@Test | ||
public void testTypesToDrop() throws ResourceInitializationException, CASException { | ||
try { | ||
AnalysisEngine dropper = createEngine( | ||
AnnotationDropper.class, | ||
AnnotationDropper.PARAM_TYPES_TO_DROP, | ||
new String[] { | ||
Token._TypeName, | ||
}); | ||
|
||
try { | ||
dropper.process(jCas); | ||
} catch (AnalysisEngineProcessException e) { | ||
throw new RuntimeException(e); | ||
} | ||
|
||
Assertions.assertEquals(3, JCasUtil.select(jCas, Sentence.class).size()); | ||
Assertions.assertEquals(0, JCasUtil.select(jCas, Token.class).size()); | ||
} catch (Exception e) { | ||
throw new RuntimeException(e); | ||
} | ||
} | ||
|
||
@Test | ||
public void testTypesToRetainDUUI() { | ||
try { | ||
composer.add(new DUUIUIMADriver.Component(createEngineDescription( | ||
AnnotationDropper.class, | ||
AnnotationDropper.PARAM_TYPES_TO_RETAIN, | ||
new String[] { | ||
Sofa._TypeName, | ||
Sentence._TypeName, | ||
}))); | ||
|
||
try { | ||
composer.run(jCas); | ||
} catch (Exception e) { | ||
Assertions.fail("DUUIComposer failed", e); | ||
} | ||
|
||
Assertions.assertEquals(3, JCasUtil.select(jCas, Sentence.class).size()); | ||
Assertions.assertEquals(0, JCasUtil.select(jCas, Token.class).size()); | ||
} catch (Exception e) { | ||
throw new RuntimeException(e); | ||
} | ||
} | ||
|
||
@Test | ||
public void testTypesToDropDUUI() { | ||
try { | ||
composer.add(new DUUIUIMADriver.Component(createEngineDescription( | ||
AnnotationDropper.class, | ||
AnnotationDropper.PARAM_TYPES_TO_DROP, | ||
new String[] { | ||
Token._TypeName, | ||
}))); | ||
|
||
try { | ||
composer.run(jCas); | ||
} catch (Exception e) { | ||
Assertions.fail("DUUIComposer failed", e); | ||
} | ||
|
||
Assertions.assertEquals(3, JCasUtil.select(jCas, Sentence.class).size()); | ||
Assertions.assertEquals(0, JCasUtil.select(jCas, Token.class).size()); | ||
} catch (Exception e) { | ||
throw new RuntimeException(e); | ||
} | ||
} | ||
|
||
} |