Skip to content

Commit

Permalink
Merge with c9133d886f06222fc2674a10219fc3c7b76ec605
Browse files Browse the repository at this point in the history
  • Loading branch information
jzell committed Jun 24, 2013
2 parents eb72f3b + a158f1a commit 07c89a5
Showing 1 changed file with 195 additions and 76 deletions.
271 changes: 195 additions & 76 deletions src/de/unihd/dbs/uima/annotator/jvntextprowrapper/JVnTextProWrapper.java
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,22 @@
*/
package de.unihd.dbs.uima.annotator.jvntextprowrapper;

import java.util.Arrays;
import java.util.HashSet;
import java.io.File;
import java.util.LinkedList;
import java.util.List;

import jmaxent.Classification;
import jvnpostag.POSContextGenerator;
import jvnpostag.POSDataReader;
import jvnsegmenter.CRFSegmenter;
import jvnsensegmenter.JVnSenSegmenter;
import jvntextpro.JVnTextPro;
import jvntextpro.conversion.CompositeUnicode2Unicode;
import jvntextpro.data.DataReader;
import jvntextpro.data.TWord;
import jvntextpro.data.TaggingData;
import jvntextpro.util.StringUtils;
import jvntokenizer.PennTokenizer;

import org.apache.uima.UimaContext;
import org.apache.uima.analysis_component.JCasAnnotator_ImplBase;
Expand Down Expand Up @@ -41,13 +53,12 @@ public class JVnTextProWrapper extends JCasAnnotator_ImplBase {
private String wordModelPath = null;
private String posModelPath = null;

// list of punctuation so we can split them off as tokens (correcting JVnTextPro's output)
private final HashSet<Character> vietPunctuation = new HashSet<Character>(Arrays.asList(new Character[] {
'!', '.', ',', '(', ')', '-', '{', '}', '[', ']', '"', '\''
}));

// private jvntextpro object
private JVnTextPro jvtp = null;
// private jvntextpro objects
JVnSenSegmenter vnSenSegmenter = new JVnSenSegmenter();
CRFSegmenter vnSegmenter = new CRFSegmenter();
DataReader reader = new POSDataReader();
TaggingData dataTagger = new TaggingData();
Classification classifier = null;

/**
* initialization method where we fill configuration values and check some prerequisites
Expand All @@ -61,23 +72,26 @@ public void initialize(UimaContext aContext) {
wordModelPath = (String) aContext.getConfigParameterValue(PARAM_WORDSEGMODEL_PATH);
posModelPath = (String) aContext.getConfigParameterValue(PARAM_POSMODEL_PATH);

jvtp = new JVnTextPro();

if(sentModelPath != null)
if(!jvtp.initSenSegmenter(sentModelPath)) {
Logger.printError(component, "Error initializing the sentence segmenter model: "+sentModelPath);
if(!vnSenSegmenter.init(sentModelPath)) {
Logger.printError(component, "Error initializing the sentence segmenter model: " + sentModelPath);
System.exit(-1);
}

if(wordModelPath != null)
if(!jvtp.initSegmenter(wordModelPath)) {
Logger.printError(component, "Error initializing the word segmenter model: "+wordModelPath);
try {
vnSegmenter.init(wordModelPath);
} catch(Exception e) {
Logger.printError(component, "Error initializing the word segmenter model: " + wordModelPath);
System.exit(-1);
}

if(posModelPath != null)
if(!jvtp.initPosTagger(posModelPath)) {
Logger.printError(component, "Error initializing the POS tagging model: "+posModelPath);
try {
dataTagger.addContextGenerator(new POSContextGenerator(posModelPath + File.separator + "featuretemplate.xml"));
classifier = new Classification(posModelPath);
} catch(Exception e) {
Logger.printError(component, "Error initializing the POS tagging model: " + posModelPath);
System.exit(-1);
}
}
Expand All @@ -86,86 +100,164 @@ public void initialize(UimaContext aContext) {
* Method that gets called to process the documents' cas objects
*/
public void process(JCas jcas) throws AnalysisEngineProcessException {
CompositeUnicode2Unicode convertor = new CompositeUnicode2Unicode();
String origText = jcas.getDocumentText();

final String convertedText = convertor.convert(origText);
final String senSegmentedText = vnSenSegmenter.senSegment(convertedText).trim();

Integer offset = 0;
final String tokenizedText = PennTokenizer.tokenize(senSegmentedText).trim();
final String segmentedText = vnSegmenter.segmenting(tokenizedText);
final String postProcessedString = (new JVnTextPro()).postProcessing(segmentedText).trim();

String[] sentStrings = jvtp.process(origText).split("\n");
List<jvntextpro.data.Sentence> posSentences = jvnTagging(postProcessedString);
LinkedList<TWord> posWords = new LinkedList<TWord>();
for(jvntextpro.data.Sentence sent : posSentences)
for(Integer i = 0; i < sent.size(); ++i)
posWords.add(sent.getTWordAt(i));

// iterate over sentence strings
for(String sentString : sentStrings) {
Sentence sentence = new Sentence(jcas);
Boolean hasSentBegin = false;

String[] tokenStrings = sentString.split(" ");
// iterate over word strings
for(String tokenString : tokenStrings) {
Token token = new Token(jcas);
/*
* annotate sentences
*/
if(annotate_sentences) {
Integer offset = 0;
String[] sentences = senSegmentedText.split("\n");
for(String sentence : sentences) {
Sentence s = new Sentence(jcas);
sentence = sentence.trim();
Integer sentOffset = origText.indexOf(sentence, offset);

String word = new String();
String tag = new String();
if(sentOffset >= 0) {
s.setBegin(sentOffset);
offset = sentOffset + sentence.length();
s.setEnd(offset);
s.addToIndexes();
} else {
sentence = sentence.substring(0, sentence.length() - 1).trim();
sentOffset = origText.indexOf(sentence, offset);
if(sentOffset >= 0) {
s.setBegin(sentOffset);
offset = sentOffset + sentence.length();
s.setEnd(offset);
s.addToIndexes();
} else {
System.err.println("Sentence \"" + sentence + "\" was not found in the original text.");
}
}
}
}

/*
* annotate tokens
*/
if(annotate_tokens) {
Integer offset = 0;
String[] tokens = postProcessedString.split("\\s+");
for(Integer i = 0; i < tokens.length; ++i) {
final String token = tokens[i].trim();
String thisPosTag = null;
if(posWords.size() >= i + 1) {
if(!token.equals(posWords.get(i).getWord())) {
System.err.println("Couldn't match token: " + token
+ " to expected word/tag combination " + posWords.get(i).getWord());
} else {
thisPosTag = posWords.get(i).getTag();
}
}
Integer tokenOffset = origText.indexOf(token, offset);

Token t = new Token(jcas);

// special case if the token is "/", delimited by "/", tagged as "/" => "///" in text
if(tokenString.equals("///")) {
Integer beginning = origText.indexOf("/", offset);
token.setBegin(beginning);
token.setEnd(beginning+1);
offset = beginning+1;
} else if(tokenString.matches(".+/.+")) { // assume that the last found "/" is the postag-delimiter
Integer delimPos = tokenString.lastIndexOf("/");
word = tokenString.substring(0, delimPos);
tag = tokenString.substring(delimPos+1);
if(tokenOffset >= 0 ) {
/*
* first, try to find the string in the form the tokenizer returned it
*/
t.setBegin(tokenOffset);
offset = tokenOffset + token.length();
t.setEnd(offset);

Boolean hasBegin = false;
sanitizeToken(t, jcas);

String[] inTokenWords = word.split("_");
// iterate over sub-words, i.e. word = "armadillo_animal/N" => "armadillo", "animal"
for(String subWord : inTokenWords) {
offset = origText.indexOf(subWord, offset); // set offset to occurrence in original text

if(hasSentBegin == false) { // beginning of the pos-tagged sentence
sentence.setBegin(offset);
hasSentBegin = true;
if(annotate_tokens) t.setPos(thisPosTag);
t.addToIndexes();
} else {
/*
* straight up token not found.
* assume that it is a compound word (e.g. some_thing)
* and try to find it in the original text again; first using
* a "_" -> " " replacement, then try just removing the underscore.
*/
String underscoreToSpaceToken = token.replaceAll("_", " ");
Integer spaceOffset = origText.indexOf(underscoreToSpaceToken, offset);
String underscoreRemovedToken = token.replaceAll("_", "");
Integer removedOffset = origText.indexOf(underscoreRemovedToken, offset);

/*
* offsets are the same. can't think of a good example where this could
* possibly happen, but maybe there is one.
*/
if(removedOffset >= 0 && spaceOffset >= 0) {
if(removedOffset >= spaceOffset) {
t.setBegin(spaceOffset);
offset = spaceOffset + underscoreToSpaceToken.length();
t.setEnd(offset);

sanitizeToken(t, jcas);

if(annotate_tokens) t.setPos(thisPosTag);
t.addToIndexes();
} else {
t.setBegin(removedOffset);
offset = removedOffset + underscoreRemovedToken.length();
t.setEnd(offset);

sanitizeToken(t, jcas);

t.addToIndexes();
}
}
/*
* underscore removed was found, underscore replaced to space was not
*/
else if(removedOffset >= 0 && spaceOffset == -1) {
t.setBegin(removedOffset);
offset = removedOffset + underscoreRemovedToken.length();
t.setEnd(offset);

if(hasBegin == false) { // beginning of the pos-tagged token
token.setBegin(offset);
hasBegin = true;
}
sanitizeToken(t, jcas);

offset = origText.indexOf(subWord, offset) + subWord.length(); // offset is now behind the word
if(annotate_tokens) t.setPos(thisPosTag);
t.addToIndexes();
}
/*
* underscore removed was not found, underscore replaced was found
*/
else if(removedOffset == -1 && spaceOffset >= 0) {
t.setBegin(spaceOffset);
offset = spaceOffset + underscoreToSpaceToken.length();
t.setEnd(offset);

token.setEnd(offset); // word-token gets final value from the last sub-word
sentence.setEnd(offset); // sentence gets final value from the last sub-word
}
sanitizeToken(t, jcas);

if(annotate_tokens) t.setPos(thisPosTag);
t.addToIndexes();
}
/*
* call our sanitation routine that splits off punctuation marks from the end and
* the beginning of the token and creates new tokens for each of them
* there is no hope of finding this token
*/
sanitizeTokens(token, jcas);

if(annotate_partofspeech) // if flag is true, then add pos info to indexes
token.setPos(tag);

if(annotate_tokens) // if flag is true, then add this token to indexes
token.addToIndexes();

} else { // otherwise, the tagger gave us something we don't understand (yet?)
continue; // jump to next token
else {
System.err.println("Token \"" + token + "\" was not found in the original text.");
}
}
}

if(annotate_sentences) // if flag is true, then add sentence token to indexes
sentence.addToIndexes();
}
}

private Boolean sanitizeTokens(Token t, JCas jcas) {
private Boolean sanitizeToken(Token t, JCas jcas) {
Boolean workDone = false;

// check the beginning of the token for punctuation and split off into a new token
if(vietPunctuation.contains(t.getCoveredText().charAt(0)) && t.getCoveredText().length() > 1) {
if(t.getCoveredText().matches("^\\p{Punct}.*") && t.getCoveredText().length() > 1) {
Character thisChar = t.getCoveredText().charAt(0);
t.setBegin(t.getBegin() + 1); // set corrected token boundary for the word
Token puncToken = new Token(jcas); // create a new token for the punctuation character
Expand All @@ -181,7 +273,7 @@ private Boolean sanitizeTokens(Token t, JCas jcas) {
}

// check the end of the token for punctuation and split off into a new token
if(vietPunctuation.contains(t.getCoveredText().charAt(t.getEnd() - t.getBegin() - 1)) && t.getCoveredText().length() > 1) {
if(t.getCoveredText().matches(".*\\p{Punct}$") && t.getCoveredText().length() > 1) {
Character thisChar = t.getCoveredText().charAt(t.getEnd() - t.getBegin() - 1);
t.setEnd(t.getEnd() - 1); // set corrected token boundary for the word
Token puncToken = new Token(jcas); // create a new token for the punctuation character
Expand All @@ -198,9 +290,36 @@ private Boolean sanitizeTokens(Token t, JCas jcas) {

// get into a recursion to sanitize tokens as long as there are stray ones
if(workDone) {
workDone = sanitizeTokens(t, jcas);
workDone = sanitizeToken(t, jcas);
}

return workDone;
}

/**
* Taken from the JVnTextPro package and adapted to not output a string
* @param instr input string to be tagged
* @return tagged text
*/
/**
 * Taken from the JVnTextPro package and adapted to not output a string
 * @param instr input string to be tagged
 * @return tagged text
 */
public List<jvntextpro.data.Sentence> jvnTagging(String instr) {
    List<jvntextpro.data.Sentence> data = reader.readString(instr);
    // tag every word of every sentence in place
    for (jvntextpro.data.Sentence sent : data) {
        for (int pos = 0; pos < sent.size(); ++pos) {
            String[] context = dataTagger.getContext(sent, pos);
            String label = classifier.classify(context);

            // "Mrk" is the classifier's generic marker label: replace it with
            // the literal punctuation character, or "X" when the word is not
            // actually punctuation
            if (label.equalsIgnoreCase("Mrk")) {
                String word = sent.getWordAt(pos);
                label = StringUtils.isPunc(word) ? word : "X";
            }

            sent.getTWordAt(pos).setTag(label);
        }
    }

    return data;
}
}

0 comments on commit 07c89a5

Please sign in to comment.