Skip to content

Commit

Permalink
add solr tokenfactory
Browse files Browse the repository at this point in the history
  • Loading branch information
孙健 committed Sep 7, 2016
1 parent 5ff0319 commit 6cf0dd0
Show file tree
Hide file tree
Showing 5 changed files with 77 additions and 4 deletions.
2 changes: 1 addition & 1 deletion plugin/ansj_lucene5_plugin/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
</parent>

<artifactId>ansj_lucene5_plug</artifactId>
<version>5.0.2.1</version>
<version>5.0.2.2</version>
<packaging>jar</packaging>
<name>ansj_lucene5_plug</name>

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
package org.ansj.lucene.util;

import java.io.BufferedReader;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;

import org.ansj.lucene5.AnsjAnalyzer;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.util.TokenizerFactory;
import org.apache.lucene.util.AttributeFactory;
import org.nlpcn.commons.lang.util.IOUtil;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Tokenizer factory that builds Ansj tokenizers from a string-keyed argument map.
 *
 * <p>Recognised arguments, both read in the constructor:
 * <ul>
 *   <li>{@code words} - path of a stop-word file, one word per line, read as UTF-8. Optional.</li>
 *   <li>{@code type} - name of an {@code AnsjAnalyzer.TYPE} constant, resolved with
 *       {@code valueOf} each time {@link #create(AttributeFactory)} is called.</li>
 * </ul>
 */
public class AnsjTokenizerFactory extends TokenizerFactory {

    public final Logger logger = LoggerFactory.getLogger(getClass());

    /** Value of the {@code words} argument; {@code null} when it was not supplied. */
    private final String stopwordsDir;

    /**
     * Stop words handed to every tokenizer this factory creates. Stays {@code null} when no
     * {@code words} argument was given; is an empty set when the file could not be read.
     */
    public Set<String> filter;

    /** Value of the {@code type} argument. */
    private final String type;

    /**
     * Reads the {@code words} and {@code type} arguments and loads the stop-word file, if any.
     *
     * @param args factory arguments
     */
    public AnsjTokenizerFactory(Map<String, String> args) {
        super(args);
        stopwordsDir = get(args, "words");
        type = get(args, "type");
        addStopwords(stopwordsDir);
    }

    /**
     * Loads stop words from the given file into {@link #filter}, one entry per line.
     *
     * <p>Failures are logged and otherwise ignored, so a missing or unreadable file leaves
     * {@link #filter} empty rather than aborting factory construction.
     *
     * @param dir path of the stop-word file; when {@code null} nothing is loaded and
     *            {@link #filter} is left untouched
     */
    private void addStopwords(String dir) {
        if (dir == null) {
            logger.info("no stopwords dir");
            return;
        }
        logger.info("stopwords: {}", dir);
        filter = new HashSet<String>();
        BufferedReader br = null;
        try {
            // The charset name used to be misspelled as "uf-8", which made opening the reader
            // fail every time and left the stop-word set permanently empty.
            br = IOUtil.getReader(dir, "utf-8");
            for (String word = br.readLine(); word != null; word = br.readLine()) {
                filter.add(word);
            }
        } catch (FileNotFoundException e) {
            logger.warn("No stopword file found: " + dir, e);
        } catch (IOException e) {
            logger.warn("stopword file io exception: " + dir, e);
        } finally {
            if (br != null) {
                try {
                    br.close();
                } catch (IOException e) {
                    // Closing is best effort; the words read so far are kept.
                    logger.warn("failed to close stopword file: " + dir, e);
                }
            }
        }
    }

    /**
     * Creates a tokenizer of the configured type that uses the loaded stop words.
     *
     * <p>The {@code factory} argument is not used by this implementation, and {@code null} is
     * passed as the first argument of {@code AnsjAnalyzer.getTokenizer}.
     *
     * @param factory attribute factory supplied by the caller (ignored)
     * @return the tokenizer returned by {@code AnsjAnalyzer.getTokenizer}
     */
    @Override
    public Tokenizer create(AttributeFactory factory) {
        // NOTE(review): valueOf rejects a missing or unknown "type" argument at this point,
        // not at construction time - confirm that is the intended place to fail.
        return AnsjAnalyzer.getTokenizer(null, AnsjAnalyzer.TYPE.valueOf(type), filter);
    }

}
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ public AnsjAnalyzer(TYPE type, String stopwordsDir) {
public AnsjAnalyzer(TYPE type) {
this.type = type;
}

public AnsjAnalyzer(String typeStr) {
this.type = TYPE.valueOf(typeStr);
}
Expand Down
2 changes: 1 addition & 1 deletion src/main/java/org/ansj/app/crf/model/CRFppTxtModel.java
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,7 @@ private TreeMap<Integer, Pair<String, String>> loadFeatureName(Map<String, Integ
TreeMap<Integer, Pair<String, String>> featureNames = new TreeMap<Integer, Pair<String, String>>();

String temp = null;
while (StringUtil.isNotBlank(temp = br.readLine().trim())) {
while (StringUtil.isNotBlank(temp = br.readLine())) {

int indexOf = temp.indexOf(" ");

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@

public class CRFppTxtModelTest {

private String modelPath = "/Users/sunjian/Documents/src/CRF++-0.58/test/model.txt";
private String modelPath = "src/test/resources/crf_txt.model";

private String testPath = "src/test/resources/corpus.txt";

Expand Down

0 comments on commit 6cf0dd0

Please sign in to comment.