Skip to content

Commit

Permalink
词典重构一半 懒得切分支了
Browse files Browse the repository at this point in the history
  • Loading branch information
ansj committed Nov 30, 2016
1 parent 3976026 commit 572aeb3
Show file tree
Hide file tree
Showing 33 changed files with 1,080 additions and 1,016 deletions.
2 changes: 1 addition & 1 deletion plugin/ansj_lucene5_plugin/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
</parent>

<artifactId>ansj_lucene5_plug</artifactId>
<version>5.0.3.0</version>
<version>5.0.4.0</version>
<packaging>jar</packaging>
<name>ansj_lucene5_plug</name>

Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
package org.ansj.lucene.util;

import java.io.IOException;
import java.util.Set;

import org.ansj.domain.Term;
import org.ansj.splitWord.Analysis;
Expand All @@ -22,16 +21,7 @@ public final class AnsjTokenizer extends Tokenizer {
// 分词词性
private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);

private int skippedPositions;

protected Analysis ta = null;
/** 自定义停用词 */
private Set<String> filter;

public AnsjTokenizer(Analysis ta, Set<String> filter) {
this.ta = ta;
this.filter = filter;
}

public AnsjTokenizer(Analysis ta) {
this.ta = ta;
Expand All @@ -41,8 +31,6 @@ public AnsjTokenizer(Analysis ta) {
public final boolean incrementToken() throws IOException {
clearAttributes();

skippedPositions = 0;

int position = 0;
Term term = null;
String name = null;
Expand All @@ -53,16 +41,10 @@ public final boolean incrementToken() throws IOException {
if (term == null) {
break;
}

name = term.getName();
length = name.length();

if (filter != null && filter.contains(name)) {
continue;
} else {
position++;
flag = false;
}
position++;
flag = false;
} while (flag);
if (term != null) {
positionAttr.setPositionIncrement(position);
Expand All @@ -82,7 +64,6 @@ public final boolean incrementToken() throws IOException {
public void reset() throws IOException {
super.reset();
ta.resetContent(new AnsjReader(this.input));
skippedPositions = 0;
}

}
Original file line number Diff line number Diff line change
@@ -1,73 +1,28 @@
package org.ansj.lucene.util;

import java.io.BufferedReader;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;

import org.ansj.lucene5.AnsjAnalyzer;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.util.TokenizerFactory;
import org.apache.lucene.util.AttributeFactory;
import org.nlpcn.commons.lang.util.IOUtil;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.nlpcn.commons.lang.util.logging.Log;
import org.nlpcn.commons.lang.util.logging.LogFactory;

public class AnsjTokenizerFactory extends TokenizerFactory {

public final Logger logger = LoggerFactory.getLogger(getClass());
public final Log logger = LogFactory.getLog();

private String stopwordsDir;
public Set<String> filter;
private String type;
private Map<String, String> args;

public AnsjTokenizerFactory(Map<String, String> args) {
super(args);
stopwordsDir = get(args, "words");
type = get(args, "type");
addStopwords(stopwordsDir);
}

/**
* 添加停用词
*
* @param dir
*/
private void addStopwords(String dir) {
if (dir == null) {
logger.info("no stopwords dir");
return;
}
logger.info("stopwords: {}", dir);
filter = new HashSet<String>();
BufferedReader br = null;
try {
br = IOUtil.getReader(dir, "uf-8");
String word = br.readLine();
while (word != null) {
filter.add(word);
word = br.readLine();
}
} catch (FileNotFoundException e) {
logger.info("No stopword file found");
} catch (IOException e) {
logger.info("stopword file io exception");
} finally {
if (br != null) {
try {
br.close();
} catch (IOException e) {
e.printStackTrace();
}
}
}
this.args = args ;
}

@Override
public Tokenizer create(AttributeFactory factory) {
return AnsjAnalyzer.getTokenizer(null, AnsjAnalyzer.TYPE.valueOf(type), filter);
return AnsjAnalyzer.getTokenizer(null, args);
}

}
Original file line number Diff line number Diff line change
@@ -1,27 +1,22 @@
package org.ansj.lucene5;

import java.io.BufferedReader;
import java.io.FileNotFoundException;
import java.io.StringReader;
import java.io.UnsupportedEncodingException;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.Map;

import org.ansj.lucene.util.AnsjTokenizer;
import org.ansj.splitWord.Analysis;
import org.ansj.splitWord.analysis.BaseAnalysis;
import org.ansj.splitWord.analysis.DicAnalysis;
import org.ansj.splitWord.analysis.IndexAnalysis;
import org.ansj.splitWord.analysis.ToAnalysis;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Tokenizer;
import org.nlpcn.commons.lang.util.IOUtil;
import org.nlpcn.commons.lang.util.StringUtil;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.nlpcn.commons.lang.util.logging.Log;
import org.nlpcn.commons.lang.util.logging.LogFactory;

public class AnsjAnalyzer extends Analyzer {
public final Logger logger = LoggerFactory.getLogger(getClass());
public final Log logger = LogFactory.getLog();

/**
* dic equals user , query equals to
Expand All @@ -33,53 +28,23 @@ public static enum TYPE {
base, index, query, to, dic, user, search
}

/** 自定义停用词 */
private Set<String> filter;
/** 是否查询分词 */
private TYPE type;
/**
* 分词类型
*/
private Map<String, String> args;

/**
* @param filter 停用词
*/
public AnsjAnalyzer(TYPE type, Set<String> filter) {
this.type = type;
this.filter = filter;
}

public AnsjAnalyzer(TYPE type, String stopwordsDir) {
this.type = type;
this.filter = filter(stopwordsDir);
}

public AnsjAnalyzer(TYPE type) {
this.type = type;
}

public AnsjAnalyzer(String typeStr) {
this.type = TYPE.valueOf(typeStr);
}

private Set<String> filter(String stopwordsDir) {
if (StringUtil.isBlank(stopwordsDir)) {
return null;
}
try {
List<String> readFile2List = IOUtil.readFile2List(stopwordsDir, IOUtil.UTF8);
return new HashSet<String>(readFile2List);
} catch (FileNotFoundException e) {
logger.warn("文件没有找到", e);
} catch (UnsupportedEncodingException e) {
logger.warn("编码不支持", e);
}
return null;
public AnsjAnalyzer(Map<String, String> args) {
this.args = args;
}

@Override
protected TokenStreamComponents createComponents(String text) {
BufferedReader reader = new BufferedReader(new StringReader(text));
Tokenizer tokenizer = null;

tokenizer = getTokenizer(reader, this.type, this.filter);
tokenizer = getTokenizer(reader, this.args);
return new TokenStreamComponents(tokenizer);
}

Expand All @@ -91,50 +56,36 @@ protected TokenStreamComponents createComponents(String text) {
* @param filter
* @return
*/
public static Tokenizer getTokenizer(BufferedReader reader, TYPE type, Set<String> filter) {
Tokenizer tokenizer;
public static Tokenizer getTokenizer(BufferedReader reader, Map<String, String> args) {

switch (type) {
Analysis analysis = null;

switch (AnsjAnalyzer.TYPE.valueOf(args.get("type"))) {
case base:
if (reader == null) {
tokenizer = new AnsjTokenizer(new BaseAnalysis(), filter);
} else {
tokenizer = new AnsjTokenizer(new BaseAnalysis(reader), filter);
}
analysis = new BaseAnalysis();
break;
case index:
if (reader == null) {
tokenizer = new AnsjTokenizer(new IndexAnalysis(), filter);
} else {
tokenizer = new AnsjTokenizer(new IndexAnalysis(reader), filter);
}
analysis = new IndexAnalysis();
break;
case dic:
case user:
if (reader == null) {
tokenizer = new AnsjTokenizer(new DicAnalysis(), filter);
} else {
tokenizer = new AnsjTokenizer(new DicAnalysis(reader), filter);
}
analysis = new DicAnalysis();
break;

case to:
case query:
case search:
if (reader == null) {
tokenizer = new AnsjTokenizer(new ToAnalysis(), filter);
} else {
tokenizer = new AnsjTokenizer(new ToAnalysis(reader), filter);
}
analysis = new ToAnalysis();
break;
default:
if (reader == null) {
tokenizer = new AnsjTokenizer(new ToAnalysis(), filter);
} else {
tokenizer = new AnsjTokenizer(new ToAnalysis(reader), filter);
}
analysis = new BaseAnalysis();
}

return tokenizer;
if (reader != null) {
analysis.resetContent(reader);
}

return new AnsjTokenizer(analysis);

}

}
8 changes: 8 additions & 0 deletions pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -43,13 +43,21 @@
<scope>compile</scope>
</dependency>

<dependency>
<groupId>org.nutz</groupId>
<artifactId>nutz</artifactId>
<version>1.r.58</version>
<scope>provided</scope>
</dependency>

<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<version>4.8.1</version>
<scope>test</scope>
</dependency>


</dependencies>

<build>
Expand Down
Loading

0 comments on commit 572aeb3

Please sign in to comment.