diff --git a/plugin/ansj_lucene5_plugin/pom.xml b/plugin/ansj_lucene5_plugin/pom.xml
index fea7a684..3641f70c 100644
--- a/plugin/ansj_lucene5_plugin/pom.xml
+++ b/plugin/ansj_lucene5_plugin/pom.xml
@@ -10,7 +10,7 @@
<artifactId>ansj_lucene5_plug</artifactId>
- <version>5.0.3.0</version>
+ <version>5.0.4.0</version>
<packaging>jar</packaging>
<name>ansj_lucene5_plug</name>
diff --git a/plugin/ansj_lucene5_plugin/src/main/java/org/ansj/lucene/util/AnsjTokenizer.java b/plugin/ansj_lucene5_plugin/src/main/java/org/ansj/lucene/util/AnsjTokenizer.java
index 4ae5c016..73057114 100644
--- a/plugin/ansj_lucene5_plugin/src/main/java/org/ansj/lucene/util/AnsjTokenizer.java
+++ b/plugin/ansj_lucene5_plugin/src/main/java/org/ansj/lucene/util/AnsjTokenizer.java
@@ -1,7 +1,6 @@
package org.ansj.lucene.util;
import java.io.IOException;
-import java.util.Set;
import org.ansj.domain.Term;
import org.ansj.splitWord.Analysis;
@@ -22,16 +21,7 @@ public final class AnsjTokenizer extends Tokenizer {
// 分词词性
private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
- private int skippedPositions;
-
protected Analysis ta = null;
- /** 自定义停用词 */
- private Set<String> filter;
-
- public AnsjTokenizer(Analysis ta, Set<String> filter) {
- this.ta = ta;
- this.filter = filter;
- }
public AnsjTokenizer(Analysis ta) {
this.ta = ta;
@@ -41,8 +31,6 @@ public AnsjTokenizer(Analysis ta) {
public final boolean incrementToken() throws IOException {
clearAttributes();
- skippedPositions = 0;
-
int position = 0;
Term term = null;
String name = null;
@@ -53,16 +41,10 @@ public final boolean incrementToken() throws IOException {
if (term == null) {
break;
}
-
name = term.getName();
length = name.length();
-
- if (filter != null && filter.contains(name)) {
- continue;
- } else {
- position++;
- flag = false;
- }
+ position++;
+ flag = false;
} while (flag);
if (term != null) {
positionAttr.setPositionIncrement(position);
@@ -82,7 +64,6 @@ public final boolean incrementToken() throws IOException {
public void reset() throws IOException {
super.reset();
ta.resetContent(new AnsjReader(this.input));
- skippedPositions = 0;
}
}
\ No newline at end of file
diff --git a/plugin/ansj_lucene5_plugin/src/main/java/org/ansj/lucene/util/AnsjTokenizerFactory.java b/plugin/ansj_lucene5_plugin/src/main/java/org/ansj/lucene/util/AnsjTokenizerFactory.java
index f076eb59..a55134d2 100644
--- a/plugin/ansj_lucene5_plugin/src/main/java/org/ansj/lucene/util/AnsjTokenizerFactory.java
+++ b/plugin/ansj_lucene5_plugin/src/main/java/org/ansj/lucene/util/AnsjTokenizerFactory.java
@@ -1,73 +1,28 @@
package org.ansj.lucene.util;
-import java.io.BufferedReader;
-import java.io.FileNotFoundException;
-import java.io.IOException;
-import java.util.HashSet;
import java.util.Map;
-import java.util.Set;
import org.ansj.lucene5.AnsjAnalyzer;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.util.TokenizerFactory;
import org.apache.lucene.util.AttributeFactory;
-import org.nlpcn.commons.lang.util.IOUtil;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
+import org.nlpcn.commons.lang.util.logging.Log;
+import org.nlpcn.commons.lang.util.logging.LogFactory;
public class AnsjTokenizerFactory extends TokenizerFactory {
- public final Logger logger = LoggerFactory.getLogger(getClass());
+ public final Log logger = LogFactory.getLog();
- private String stopwordsDir;
- public Set<String> filter;
- private String type;
+ private Map<String, String> args;
public AnsjTokenizerFactory(Map<String, String> args) {
super(args);
- stopwordsDir = get(args, "words");
- type = get(args, "type");
- addStopwords(stopwordsDir);
- }
-
- /**
- * 添加停用词
- *
- * @param dir
- */
- private void addStopwords(String dir) {
- if (dir == null) {
- logger.info("no stopwords dir");
- return;
- }
- logger.info("stopwords: {}", dir);
- filter = new HashSet();
- BufferedReader br = null;
- try {
- br = IOUtil.getReader(dir, "uf-8");
- String word = br.readLine();
- while (word != null) {
- filter.add(word);
- word = br.readLine();
- }
- } catch (FileNotFoundException e) {
- logger.info("No stopword file found");
- } catch (IOException e) {
- logger.info("stopword file io exception");
- } finally {
- if (br != null) {
- try {
- br.close();
- } catch (IOException e) {
- e.printStackTrace();
- }
- }
- }
+ this.args = args;
}
@Override
public Tokenizer create(AttributeFactory factory) {
- return AnsjAnalyzer.getTokenizer(null, AnsjAnalyzer.TYPE.valueOf(type), filter);
+ return AnsjAnalyzer.getTokenizer(null, args);
}
-
+
}
diff --git a/plugin/ansj_lucene5_plugin/src/main/java/org/ansj/lucene5/AnsjAnalyzer.java b/plugin/ansj_lucene5_plugin/src/main/java/org/ansj/lucene5/AnsjAnalyzer.java
index 8cadb539..e99a2c84 100644
--- a/plugin/ansj_lucene5_plugin/src/main/java/org/ansj/lucene5/AnsjAnalyzer.java
+++ b/plugin/ansj_lucene5_plugin/src/main/java/org/ansj/lucene5/AnsjAnalyzer.java
@@ -1,27 +1,22 @@
package org.ansj.lucene5;
import java.io.BufferedReader;
-import java.io.FileNotFoundException;
import java.io.StringReader;
-import java.io.UnsupportedEncodingException;
-import java.util.HashSet;
-import java.util.List;
-import java.util.Set;
+import java.util.Map;
import org.ansj.lucene.util.AnsjTokenizer;
+import org.ansj.splitWord.Analysis;
import org.ansj.splitWord.analysis.BaseAnalysis;
import org.ansj.splitWord.analysis.DicAnalysis;
import org.ansj.splitWord.analysis.IndexAnalysis;
import org.ansj.splitWord.analysis.ToAnalysis;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Tokenizer;
-import org.nlpcn.commons.lang.util.IOUtil;
-import org.nlpcn.commons.lang.util.StringUtil;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
+import org.nlpcn.commons.lang.util.logging.Log;
+import org.nlpcn.commons.lang.util.logging.LogFactory;
public class AnsjAnalyzer extends Analyzer {
- public final Logger logger = LoggerFactory.getLogger(getClass());
+ public final Log logger = LogFactory.getLog();
/**
* dic equals user , query equals to
@@ -33,53 +28,23 @@ public static enum TYPE {
base, index, query, to, dic, user, search
}
- /** 自定义停用词 */
- private Set<String> filter;
- /** 是否查询分词 */
- private TYPE type;
+ /**
+ * 分词类型
+ */
+ private Map<String, String> args;
/**
* @param filter 停用词
*/
- public AnsjAnalyzer(TYPE type, Set<String> filter) {
- this.type = type;
- this.filter = filter;
- }
-
- public AnsjAnalyzer(TYPE type, String stopwordsDir) {
- this.type = type;
- this.filter = filter(stopwordsDir);
- }
-
- public AnsjAnalyzer(TYPE type) {
- this.type = type;
- }
-
- public AnsjAnalyzer(String typeStr) {
- this.type = TYPE.valueOf(typeStr);
- }
-
- private Set<String> filter(String stopwordsDir) {
- if (StringUtil.isBlank(stopwordsDir)) {
- return null;
- }
- try {
- List<String> readFile2List = IOUtil.readFile2List(stopwordsDir, IOUtil.UTF8);
- return new HashSet<>(readFile2List);
- } catch (FileNotFoundException e) {
- logger.warn("文件没有找到", e);
- } catch (UnsupportedEncodingException e) {
- logger.warn("编码不支持", e);
- }
- return null;
+ public AnsjAnalyzer(Map<String, String> args) {
+ this.args = args;
}
@Override
protected TokenStreamComponents createComponents(String text) {
BufferedReader reader = new BufferedReader(new StringReader(text));
Tokenizer tokenizer = null;
-
- tokenizer = getTokenizer(reader, this.type, this.filter);
+ tokenizer = getTokenizer(reader, this.args);
return new TokenStreamComponents(tokenizer);
}
@@ -91,50 +56,36 @@ protected TokenStreamComponents createComponents(String text) {
* @param filter
* @return
*/
- public static Tokenizer getTokenizer(BufferedReader reader, TYPE type, Set<String> filter) {
- Tokenizer tokenizer;
+ public static Tokenizer getTokenizer(BufferedReader reader, Map<String, String> args) {
- switch (type) {
+ Analysis analysis = null;
+
+ switch (AnsjAnalyzer.TYPE.valueOf(args.get("type"))) {
case base:
- if (reader == null) {
- tokenizer = new AnsjTokenizer(new BaseAnalysis(), filter);
- } else {
- tokenizer = new AnsjTokenizer(new BaseAnalysis(reader), filter);
- }
+ analysis = new BaseAnalysis();
break;
case index:
- if (reader == null) {
- tokenizer = new AnsjTokenizer(new IndexAnalysis(), filter);
- } else {
- tokenizer = new AnsjTokenizer(new IndexAnalysis(reader), filter);
- }
+ analysis = new IndexAnalysis();
break;
case dic:
case user:
- if (reader == null) {
- tokenizer = new AnsjTokenizer(new DicAnalysis(), filter);
- } else {
- tokenizer = new AnsjTokenizer(new DicAnalysis(reader), filter);
- }
+ analysis = new DicAnalysis();
break;
-
case to:
case query:
case search:
- if (reader == null) {
- tokenizer = new AnsjTokenizer(new ToAnalysis(), filter);
- } else {
- tokenizer = new AnsjTokenizer(new ToAnalysis(reader), filter);
- }
+ analysis = new ToAnalysis();
break;
default:
- if (reader == null) {
- tokenizer = new AnsjTokenizer(new ToAnalysis(), filter);
- } else {
- tokenizer = new AnsjTokenizer(new ToAnalysis(reader), filter);
- }
+ analysis = new BaseAnalysis();
}
- return tokenizer;
+ if (reader != null) {
+ analysis.resetContent(reader);
+ }
+
+ return new AnsjTokenizer(analysis);
+
}
+
}
\ No newline at end of file
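Usage sketch for the Map-driven AnsjAnalyzer above (not part of the patch): the "type" value must be one of the TYPE enum names; the field name, sample text and chosen type are illustrative assumptions, and Lucene 5.x plus this plugin are assumed on the classpath.

    import java.util.HashMap;
    import java.util.Map;

    import org.ansj.lucene5.AnsjAnalyzer;
    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

    public class AnsjAnalyzerDemo {
        public static void main(String[] args) throws Exception {
            // "type" selects the segmenter in AnsjAnalyzer.getTokenizer: base, index, query, to, dic, user or search
            Map<String, String> conf = new HashMap<>();
            conf.put("type", "index");

            try (Analyzer analyzer = new AnsjAnalyzer(conf);
                 TokenStream ts = analyzer.tokenStream("text", "这是一个基于ansj的分词测试")) {
                CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
                ts.reset();
                while (ts.incrementToken()) {
                    System.out.println(term.toString()); // one token per line
                }
                ts.end();
            }
        }
    }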
diff --git a/pom.xml b/pom.xml
index 28c9c1f0..2549fcff 100644
--- a/pom.xml
+++ b/pom.xml
@@ -43,6 +43,13 @@
<scope>compile</scope>
</dependency>
+ <dependency>
+ <groupId>org.nutz</groupId>
+ <artifactId>nutz</artifactId>
+ <version>1.r.58</version>
+ <scope>provided</scope>
+ </dependency>
+
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
@@ -50,6 +57,7 @@
<scope>test</scope>
+
diff --git a/pom_Maven.xml b/pom_Maven.xml
deleted file mode 100644
index 48beabb1..00000000
--- a/pom_Maven.xml
+++ /dev/null
@@ -1,163 +0,0 @@
-
- 4.0.0
- org.ansj
- ansj_seg
- jar
- ansj_seg
- 5.0.3
- best java chinese word seg !
- https://github.com/NLPchina/ansj_seg
-
-
- The Apache Software License, Version 2.0
- http://www.apache.org/licenses/LICENSE-2.0.txt
- repo
-
-
-
-
- scm:git:git@github.com:ansjsun/ansj_seg.git
- scm:git:git@github.com:ansjsun/ansj_seg.git
- git@github.com:ansjsun/ansj_seg.git
-
-
-
-
-
- ansj
- ansj
- ansj-sun@163.com
-
-
-
-
- UTF-8
-
-
-
-
- org.nlpcn
- nlp-lang
- 1.7
- compile
-
-
-
- org.slf4j
- slf4j-api
- 1.7.21
-
-
-
- org.slf4j
- slf4j-log4j12
- 1.7.21
- provided
-
-
-
- log4j
- log4j
- 1.2.16
- provided
-
-
-
- junit
- junit
- 4.8.1
- test
-
-
-
-
-
-
-
- net.orfjackal.retrolambda
- retrolambda-maven-plugin
- 2.0.6
-
-
- default
-
- process-main
-
-
-
-
- 1.6
- false
- false
-
-
-
-
- maven-compiler-plugin
- 2.3.2
-
-
- 1.7
- UTF-8
-
-
-
-
- org.apache.maven.plugins
- maven-source-plugin
-
-
- attach-sources
-
- jar-no-fork
-
-
-
-
-
- true
-
- **/*.java
-
-
-
-
- org.apache.maven.plugins
- maven-javadoc-plugin
- 2.10.3
-
- -Xdoclint:none
-
-
-
- org.apache.maven.plugins
- maven-gpg-plugin
- 1.4
-
-
- sign-artifacts
- verify
-
- sign
-
-
-
-
-
-
-
-
-
- sonatype-nexus-snapshots
- Sonatype Nexus snapshot repository
- https://oss.sonatype.org/content/repositories/snapshots
-
-
-
- sonatype-nexus-staging
- Sonatype Nexus release repository
- https://oss.sonatype.org/service/local/staging/deploy/maven2
-
-
-
diff --git a/src/main/java/org/ansj/app/crf/MakeTrainFile.java b/src/main/java/org/ansj/app/crf/MakeTrainFile.java
index aa4c3add..33a78f12 100644
--- a/src/main/java/org/ansj/app/crf/MakeTrainFile.java
+++ b/src/main/java/org/ansj/app/crf/MakeTrainFile.java
@@ -11,6 +11,7 @@
import org.nlpcn.commons.lang.util.IOUtil;
import org.nlpcn.commons.lang.util.StringUtil;
import org.nlpcn.commons.lang.util.logging.Log;
+import org.nlpcn.commons.lang.util.logging.LogFactory;
/**
* 生成crf 或者是 wapiti的训练语聊工具.
@@ -22,7 +23,7 @@
*/
public class MakeTrainFile {
- private static final Log logger = MyStaticValue.getLog();
+ private static final Log logger = LogFactory.getLog();
public static void main(String[] args) {
@@ -39,8 +40,7 @@ public static void main(String[] args) {
logger.info("org.ansj.app.crf.MakeTrainFile [inputPath] [outputPath]");
return;
}
- try (BufferedReader reader = IOUtil.getReader(inputPath, "utf-8");
- FileOutputStream fos = new FileOutputStream(outputPath)) {
+ try (BufferedReader reader = IOUtil.getReader(inputPath, "utf-8"); FileOutputStream fos = new FileOutputStream(outputPath)) {
String temp = null;
int i = 0;
while ((temp = reader.readLine()) != null) {
diff --git a/src/main/java/org/ansj/app/crf/Model.java b/src/main/java/org/ansj/app/crf/Model.java
index 0c6b0fde..be9b4c84 100755
--- a/src/main/java/org/ansj/app/crf/Model.java
+++ b/src/main/java/org/ansj/app/crf/Model.java
@@ -3,6 +3,7 @@
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
+import java.io.InputStream;
import java.io.ObjectOutputStream;
import java.util.Map;
import java.util.Map.Entry;
@@ -15,12 +16,11 @@
import org.nlpcn.commons.lang.tire.domain.SmartForest;
import org.nlpcn.commons.lang.util.MapCount;
import org.nlpcn.commons.lang.util.logging.Log;
+import org.nlpcn.commons.lang.util.logging.LogFactory;
public abstract class Model {
- public static final Log logger = MyStaticValue.getLog() ;
-
- protected String name;
+ public static final Log logger = LogFactory.getLog(Model.class);
protected Config config;
@@ -30,10 +30,6 @@ public abstract class Model {
public int allFeatureCount = 0;
- public Model(String name) {
- this.name = name;
- };
-
/**
* 判断当前数据流是否是本实例
*
@@ -50,32 +46,44 @@ public Model(String name) {
* @return
* @throws Exception
*/
- public static Model load(String name, String modelPath) throws Exception {
- Model model = new CRFModel(name);
+ public static Model load(String modelPath) throws Exception {
+ Model model = new CRFModel();
if (model.checkModel(modelPath)) {
- model.loadModel(modelPath);
- return model;
+ return model.loadModel(modelPath);
}
- model = new CRFppTxtModel(name);
+ model = new CRFppTxtModel();
if (model.checkModel(modelPath)) {
- model.loadModel(modelPath);
- return model;
+ return model.loadModel(modelPath);
}
- model = new WapitiCRFModel(name);
+ model = new WapitiCRFModel();
if (model.checkModel(modelPath)) {
- model.loadModel(modelPath);
- return model;
+ return model.loadModel(modelPath);
}
throw new Exception("I did not know what type of model by file " + modelPath);
}
+ /**
+ * 模型读取
+ *
+ * @param path
+ * @return
+ * @return
+ * @throws Exception
+ */
+ public static Model load(Class<? extends Model> c, InputStream is) throws Exception {
+ Model model = c.newInstance();
+ return model.loadModel(is);
+ }
+
/**
* 不同的模型实现自己的加载模型类
*
* @throws Exception
*/
- public abstract void loadModel(String modelPath) throws Exception;
+ public abstract Model loadModel(String modelPath) throws Exception;
+
+ public abstract Model loadModel(InputStream is) throws Exception;
/**
* 获得特征所在权重数组
@@ -95,10 +103,6 @@ public float[] getFeature(char... chars) {
return sf.getParam();
}
- public String getName() {
- return this.name;
- };
-
public Config getConfig() {
return this.config;
}
@@ -125,12 +129,10 @@ protected static void printFeatureTree(String cs, float[] tempW) {
if (tempW.length == 4) {
name = "U";
}
- name += "*" + ((int) cs.charAt(cs.length() - 1) - Config.FEATURE_BEGIN + 1) + ":"
- + cs.substring(0, cs.length() - 1);
+ name += "*" + ((int) cs.charAt(cs.length() - 1) - Config.FEATURE_BEGIN + 1) + ":" + cs.substring(0, cs.length() - 1);
for (int i = 0; i < tempW.length; i++) {
if (tempW[i] != 0) {
- System.out.println(
- name + "\t" + Config.getTagName(i / 4 - 1) + "\t" + Config.getTagName(i % 4) + "\t" + tempW[i]);
+ System.out.println(name + "\t" + Config.getTagName(i / 4 - 1) + "\t" + Config.getTagName(i % 4) + "\t" + tempW[i]);
}
}
@@ -172,9 +174,9 @@ public void writeModel(String path) {
oos.writeInt(0);
oos.flush();
} catch (FileNotFoundException e) {
- logger.warn("文件没有找到",e);
+ logger.warn("文件没有找到", e);
} catch (IOException e) {
- logger.warn("IO异常",e);
+ logger.warn("IO异常", e);
}
}
}
\ No newline at end of file
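Usage sketch for the new stream-based Model.load(Class, InputStream) above (not part of the patch): the classpath resource "/crf.model" is an illustrative assumption; wrapping the model in SplitWord mirrors what CrfLibrary.initCRFModel does later in this patch.

    import java.io.InputStream;

    import org.ansj.app.crf.Model;
    import org.ansj.app.crf.SplitWord;
    import org.ansj.app.crf.model.CRFModel;

    public class LoadModelDemo {
        public static void main(String[] args) throws Exception {
            try (InputStream is = LoadModelDemo.class.getResourceAsStream("/crf.model")) {
                // name the concrete Model subclass explicitly; loadModel(is) reads the serialized weights
                SplitWord splitWord = new SplitWord(Model.load(CRFModel.class, is));
                System.out.println("crf model loaded: " + splitWord);
            }
        }
    }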
diff --git a/src/main/java/org/ansj/app/crf/model/CRFModel.java b/src/main/java/org/ansj/app/crf/model/CRFModel.java
index 22575cee..086c0824 100644
--- a/src/main/java/org/ansj/app/crf/model/CRFModel.java
+++ b/src/main/java/org/ansj/app/crf/model/CRFModel.java
@@ -23,16 +23,16 @@ public class CRFModel extends Model {
public static final String version = "ansj1";
- public CRFModel(String name) {
- super(name);
- }
-
@Override
- public void loadModel(String modelPath) throws Exception {
- loadModel(IOUtil.getInputStream(modelPath));
+ public CRFModel loadModel(String modelPath) throws Exception {
+ try (InputStream is = IOUtil.getInputStream(modelPath)) {
+ loadModel(is);
+ return this;
+ }
}
- public void loadModel(InputStream is) throws Exception {
+ @Override
+ public CRFModel loadModel(InputStream is) throws Exception {
long start = System.currentTimeMillis();
try (ObjectInputStream ois = new ObjectInputStream(new GZIPInputStream(is))) {
ois.readUTF();
@@ -58,6 +58,7 @@ public void loadModel(InputStream is) throws Exception {
} while (win == 0 || size == 0);
logger.info("load crf model ok ! use time :" + (System.currentTimeMillis() - start));
}
+ return this;
}
@Override
diff --git a/src/main/java/org/ansj/app/crf/model/CRFppModel.java b/src/main/java/org/ansj/app/crf/model/CRFppModel.java
deleted file mode 100644
index 833034b8..00000000
--- a/src/main/java/org/ansj/app/crf/model/CRFppModel.java
+++ /dev/null
@@ -1,77 +0,0 @@
-//package org.ansj.app.crf.model;
-//
-//import java.io.DataInputStream;
-//import java.io.FileInputStream;
-//
-//import org.ansj.app.crf.Model;
-//
-///**
-// * 加载CRF+生成的crf二进制模型,测试使用的CRF++版本为:CRF++-0.58
-// *
-// * 下载地址:https://taku910.github.io/crfpp/#download 在这里感谢作者所做的工作.
-// *
-// * @author Ansj
-// *
-// */
-//public class CRFppModel extends Model {
-//
-// public CRFppModel(String name) {
-// super(name);
-// }
-//
-// /**
-// * 解析crf++生成的可可视文件
-// */
-// public void loadModel(String modelPath) throws Exception {
-//
-// FileInputStream fileInputStream = new FileInputStream(modelPath);
-//
-// DataInputStream dis = new DataInputStream(fileInputStream);
-//
-// System.out.println();
-//
-// }
-//
-// public static void main(String[] args) throws Exception {
-// new CRFppModel("test").loadModel("/Users/sunjian/Documents/src/CRF++-0.58/test/model");
-//
-// // System.out.println("---------------------------");
-// //
-// // int u = 1;
-// //
-// // byte[] b = new byte[4];
-// //
-// // b[0] = (byte) (u);
-// // b[1] = (byte) (u >> 8);
-// // b[2] = (byte) (u >> 16);
-// // b[3] = (byte) (u >> 24);
-// //
-// // System.out.println(Arrays.toString(b));
-// //
-// // System.out.println("---------------------------");
-// //
-// // b = new byte[4];
-// //
-// // b[0] = -72;
-// // b[1] = 36;
-// // b[2] = 86;
-// // b[3] = 0;
-// //
-// // System.out.println((int) (b[0] | b[1] << 8 | b[2] << 16 | b[3] <<
-// // 24));
-// //
-// // System.out.println((char) 66);
-// // System.out.println((char) 69);
-// // System.out.println((char) 77);
-// // System.out.println((char) 83);
-// // System.out.println((char) 104);
-// //
-// // System.out.println((char) 85);
-// // System.out.println((char) 48);
-// // System.out.println((char) 49);
-// // System.out.println((char) 58);
-// // System.out.println((char) 37);
-// // System.out.println((char) 120);
-// }
-//
-//}
diff --git a/src/main/java/org/ansj/app/crf/model/CRFppTxtModel.java b/src/main/java/org/ansj/app/crf/model/CRFppTxtModel.java
index 4288231c..8ef2d3bf 100644
--- a/src/main/java/org/ansj/app/crf/model/CRFppTxtModel.java
+++ b/src/main/java/org/ansj/app/crf/model/CRFppTxtModel.java
@@ -1,6 +1,7 @@
package org.ansj.app.crf.model;
import java.io.BufferedReader;
+import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
@@ -28,18 +29,23 @@
*/
public class CRFppTxtModel extends Model {
- public CRFppTxtModel(String name) {
- super(name);
- }
-
/**
* 解析crf++生成的可可视txt文件
+ *
+ * @return
*/
- public void loadModel(String modelPath) throws Exception {
+ public CRFppTxtModel loadModel(String modelPath) throws Exception {
+ try (InputStream is = new FileInputStream(modelPath)) {
+ loadModel(is);
+ return this;
+ }
+ }
+ @Override
+ public Model loadModel(InputStream is) throws Exception {
long start = System.currentTimeMillis();
- BufferedReader reader = IOUtil.getReader(modelPath, IOUtil.UTF8);
+ BufferedReader reader = IOUtil.getReader(is, IOUtil.UTF8);
reader.readLine();// version
reader.readLine();// cost-factor
@@ -54,11 +60,12 @@ public void loadModel(String modelPath) throws Exception {
for (int[] t1 : config.getTemplate()) {
sb.append(Arrays.toString(t1) + " ");
}
- logger.info("load template ok template : "+ sb);
+ logger.info("load template ok template : " + sb);
TreeMap> featureNames = loadFeatureName(featureIndex, reader);
- logger.info("load feature ok feature size : "+ featureNames.size());
+ logger.info("load feature ok feature size : " + featureNames.size());
loadFeatureWeight(reader, statusCoven, featureNames);
- logger.info("load crfpp model ok ! use time : "+ (System.currentTimeMillis() - start));
+ logger.info("load crfpp model ok ! use time : " + (System.currentTimeMillis() - start));
+ return this;
}
/**
@@ -309,4 +316,5 @@ public boolean checkModel(String modelPath) {
}
return false;
}
+
}
diff --git a/src/main/java/org/ansj/app/crf/model/WapitiCRFModel.java b/src/main/java/org/ansj/app/crf/model/WapitiCRFModel.java
index 0269180a..6a6e0da9 100644
--- a/src/main/java/org/ansj/app/crf/model/WapitiCRFModel.java
+++ b/src/main/java/org/ansj/app/crf/model/WapitiCRFModel.java
@@ -27,13 +27,14 @@
*/
public class WapitiCRFModel extends Model {
- public WapitiCRFModel(String name) {
- super(name);
+ public WapitiCRFModel loadModel(String modelPath) throws Exception {
+ try (InputStream is = IOUtil.getInputStream(modelPath)) {
+ return loadModel(is);
+ }
}
- public void loadModel(String modelPath) throws Exception {
-
- BufferedReader br = IOUtil.getReader(modelPath, IOUtil.UTF8);
+ public WapitiCRFModel loadModel(InputStream is) throws Exception {
+ BufferedReader br = IOUtil.getReader(is, IOUtil.UTF8);
long start = System.currentTimeMillis();
@@ -50,21 +51,21 @@ public void loadModel(String modelPath) throws Exception {
sb.append(Arrays.toString(t1) + " ");
}
- logger.info("featureIndex is "+ featureIndex);
- logger.info("load template ok template : "+ sb);
+ logger.info("featureIndex is " + featureIndex);
+ logger.info("load template ok template : " + sb);
int[] statusCoven = loadTagCoven(br);
List<Pair<String, String>> loadFeatureName = loadFeatureName(featureIndex, br);
- logger.info("load feature ok feature size : "+ loadFeatureName.size());
+ logger.info("load feature ok feature size : " + loadFeatureName.size());
featureTree = new SmartForest();
loadFeatureWeight(br, statusCoven, loadFeatureName);
- logger.info("load wapiti model ok ! use time :"+ (System.currentTimeMillis() - start));
-
+ logger.info("load wapiti model ok ! use time :" + (System.currentTimeMillis() - start));
+ return this;
}
/**
@@ -75,8 +76,7 @@ public void loadModel(String modelPath) throws Exception {
* @param statusCoven
* @throws Exception
*/
- private void loadFeatureWeight(BufferedReader br, int[] statusCoven, List<Pair<String, String>> featureNames)
- throws Exception {
+ private void loadFeatureWeight(BufferedReader br, int[] statusCoven, List<Pair<String, String>> featureNames) throws Exception {
int key = 0;
@@ -97,14 +97,13 @@ private void loadFeatureWeight(BufferedReader br, int[] statusCoven, List pair : featureNames) {
if (temp == null) {
- logger.warn(pair.getValue0()+"\t"+pair.getValue1()+" not have any weight ,so skip it !");
+ logger.warn(pair.getValue0() + "\t" + pair.getValue1() + " not have any weight ,so skip it !");
continue;
}
char fc = Character.toUpperCase(pair.getValue0().charAt(0));
- len = fc == 'B' ? Config.TAG_NUM * Config.TAG_NUM
- : fc == 'U' ? Config.TAG_NUM : fc == '*' ? (Config.TAG_NUM + Config.TAG_NUM * Config.TAG_NUM) : 0;
+ len = fc == 'B' ? Config.TAG_NUM * Config.TAG_NUM : fc == 'U' ? Config.TAG_NUM : fc == '*' ? (Config.TAG_NUM + Config.TAG_NUM * Config.TAG_NUM) : 0;
if (len == 0) {
throw new Exception("unknow feature type " + pair.getValue0());
@@ -162,8 +161,7 @@ private void loadFeatureWeight(BufferedReader br, int[] statusCoven, List> loadFeatureName(Map featureIndex, BufferedReader br)
- throws Exception {
+ private List> loadFeatureName(Map featureIndex, BufferedReader br) throws Exception {
String temp = br.readLine();// #qrk#num
int featureNum = ObjConver.getIntValue(StringUtil.matcherFirst("\\d+", temp)); // 找到特征个数
@@ -328,9 +326,9 @@ private Map loadConfig(BufferedReader br) throws IOException {
}
@Override
- public boolean checkModel(String modelPath){
+ public boolean checkModel(String modelPath) {
- try (InputStream is = IOUtil.getInputStream(modelPath)){
+ try (InputStream is = IOUtil.getInputStream(modelPath)) {
byte[] bytes = new byte[100];
is.read(bytes);
diff --git a/src/main/java/org/ansj/dic/DicReader.java b/src/main/java/org/ansj/dic/DicReader.java
index 75add540..cc7abfca 100644
--- a/src/main/java/org/ansj/dic/DicReader.java
+++ b/src/main/java/org/ansj/dic/DicReader.java
@@ -5,8 +5,8 @@
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
-import org.ansj.util.MyStaticValue;
import org.nlpcn.commons.lang.util.logging.Log;
+import org.nlpcn.commons.lang.util.logging.LogFactory;
/**
* 加载词典用的类
@@ -15,7 +15,7 @@
*/
public class DicReader {
- private static final Log logger = MyStaticValue.getLog() ;
+ private static final Log logger = LogFactory.getLog();
public static BufferedReader getReader(String name) {
// maven工程修改词典加载方式
diff --git a/src/main/java/org/ansj/dic/PathToStream.java b/src/main/java/org/ansj/dic/PathToStream.java
new file mode 100644
index 00000000..c77414f2
--- /dev/null
+++ b/src/main/java/org/ansj/dic/PathToStream.java
@@ -0,0 +1,43 @@
+package org.ansj.dic;
+
+import java.io.InputStream;
+
+import org.ansj.dic.impl.File2Stream;
+import org.ansj.dic.impl.Jar2Stream;
+import org.ansj.dic.impl.Jdbc2Stream;
+import org.ansj.dic.impl.Url2Stream;
+import org.ansj.exception.LibraryException;
+
+/**
+ * 将路径转换为流,如果你需要实现自己的加载器请实现这个类,使用这个类可能需要自己依赖第三方包,比如jdbc连接和nutz
+ *
+ * @author ansj
+ *
+ */
+public abstract class PathToStream {
+
+ public static InputStream stream(String path) {
+ try {
+ if (path.startsWith("file://")) {
+ return new File2Stream().toStream(path);
+ } else if (path.startsWith("jdbc://")) {
+ return new Jdbc2Stream().toStream(path);
+ } else if (path.startsWith("jar://")) {
+ return new Jar2Stream().toStream(path);
+ } else if (path.startsWith("class://")) {
+ return ((PathToStream) Class.forName(path.substring(8).split("\\|")[0]).newInstance()).toStream(path);
+ } else if (path.startsWith("url://")) {
+ return new Url2Stream().toStream(path);
+ } else {
+ return new File2Stream().toStream(path);
+ }
+ } catch (Exception e) {
+ e.printStackTrace();
+ throw new LibraryException(e);
+ }
+ }
+
+ public abstract InputStream toStream(String path);
+
+}
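Usage sketch for PathToStream above (not part of the patch): the scheme prefix picks the loader (file://, jar://, jdbc://, class://, url://; anything else falls back to File2Stream), and the dictionary path below is an illustrative assumption.

    import java.io.BufferedReader;
    import java.io.InputStream;
    import java.io.InputStreamReader;

    import org.ansj.dic.PathToStream;

    public class PathToStreamDemo {
        public static void main(String[] args) throws Exception {
            try (InputStream is = PathToStream.stream("file:///tmp/userLibrary.dic");
                 BufferedReader br = new BufferedReader(new InputStreamReader(is, "UTF-8"))) {
                String line;
                while ((line = br.readLine()) != null) {
                    System.out.println(line); // one dictionary entry per line
                }
            }
        }
    }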
diff --git a/src/main/java/org/ansj/dic/impl/File2Stream.java b/src/main/java/org/ansj/dic/impl/File2Stream.java
new file mode 100644
index 00000000..d868de95
--- /dev/null
+++ b/src/main/java/org/ansj/dic/impl/File2Stream.java
@@ -0,0 +1,95 @@
+package org.ansj.dic.impl;
+
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileNotFoundException;
+import java.io.FilenameFilter;
+import java.io.InputStream;
+import java.io.SequenceInputStream;
+import java.util.Vector;
+
+import org.ansj.dic.PathToStream;
+import org.ansj.exception.LibraryException;
+import org.nlpcn.commons.lang.util.logging.Log;
+import org.nlpcn.commons.lang.util.logging.LogFactory;
+
+/**
+ * 将文件转换为流 file://c:/dic.txt
+ *
+ * @author ansj
+ *
+ */
+public class File2Stream extends PathToStream {
+
+ private static final Log LOG = LogFactory.getLog(File2Stream.class);
+
+ @Override
+ public InputStream toStream(String path) {
+ LOG.info("path to stream " + path);
+
+ if (path.startsWith("file://")) {
+ path = path.substring(7);
+ }
+
+ File file = new File(path);
+
+ if (file.exists() && file.canRead()) {
+
+ try {
+ if (file.isDirectory()) {
+ return multiple(path);
+ } else {
+ return new FileInputStream(file);
+ }
+ } catch (Exception e) {
+ throw new LibraryException(e);
+ }
+ }
+ throw new LibraryException("file " + path + " not found or can not to read");
+
+ }
+
+ private InputStream multiple(String path) throws FileNotFoundException {
+ File[] libs = new File[0];
+
+ File file = new File(path);
+
+ if (file.exists() && file.canRead()) {
+ if (file.isFile()) {
+ libs = new File[1];
+ libs[0] = file;
+ } else if (file.isDirectory()) {
+ File[] files = file.listFiles(new FilenameFilter() {
+ @Override
+ public boolean accept(File dir, String name) {
+ File f = new File(dir, name); // filter on the directory entry itself, not on the parent dir
+ if (f.canRead() && !f.isHidden() && !f.isDirectory()) {
+ return true;
+ } else {
+ return false;
+ }
+ }
+ });
+ if (files != null && files.length > 0) {
+ libs = files;
+ }
+ }
+ }
+
+ if (libs.length == 0) {
+ throw new LibraryException("not find any file in path : " + path);
+ }
+
+ if (libs.length == 1) {
+ return new FileInputStream(libs[0]);
+ }
+
+ Vector<InputStream> vector = new Vector<>(libs.length);
+
+ for (int i = 0; i < libs.length; i++) {
+ vector.add(new FileInputStream(libs[i]));
+ }
+
+ return new SequenceInputStream(vector.elements());
+ }
+
+}
diff --git a/src/main/java/org/ansj/dic/impl/Jar2Stream.java b/src/main/java/org/ansj/dic/impl/Jar2Stream.java
new file mode 100644
index 00000000..4a48687f
--- /dev/null
+++ b/src/main/java/org/ansj/dic/impl/Jar2Stream.java
@@ -0,0 +1,21 @@
+package org.ansj.dic.impl;
+
+import java.io.InputStream;
+
+import org.ansj.dic.DicReader;
+import org.ansj.dic.PathToStream;
+
+/**
+ * 从系统jar包中读取文件,你们不能用,只有我能用 jar://
+ *
+ * @author ansj
+ *
+ */
+public class Jar2Stream extends PathToStream {
+
+ @Override
+ public InputStream toStream(String path) {
+ return DicReader.getInputStream(path.substring(6));
+ }
+
+}
diff --git a/src/main/java/org/ansj/dic/impl/Jdbc2Stream.java b/src/main/java/org/ansj/dic/impl/Jdbc2Stream.java
new file mode 100644
index 00000000..1ac5f65d
--- /dev/null
+++ b/src/main/java/org/ansj/dic/impl/Jdbc2Stream.java
@@ -0,0 +1,91 @@
+package org.ansj.dic.impl;
+
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.sql.Connection;
+import java.sql.ResultSet;
+import java.sql.SQLException;
+
+import org.ansj.dic.PathToStream;
+import org.ansj.exception.LibraryException;
+import org.nutz.dao.Dao;
+import org.nutz.dao.Sqls;
+import org.nutz.dao.impl.NutDao;
+import org.nutz.dao.impl.SimpleDataSource;
+import org.nutz.dao.sql.Sql;
+import org.nutz.dao.sql.SqlCallback;
+
+/**
+ * jdbc:mysql://192.168.10.103:3306/infcn_mss?useUnicode=true&characterEncoding=utf-8&zeroDateTimeBehavior=convertToNull|username|password|select name as name,nature,freq from dic where type=1
+ *
+ * @author ansj
+ *
+ */
+public class Jdbc2Stream extends PathToStream {
+
+ private static final byte[] TAB = "\t".getBytes();
+
+ private static final byte[] LINE = "\n".getBytes();
+
+ @Override
+ public InputStream toStream(String path) {
+ path = path.substring(7);
+
+ String[] split = path.split("\\|");
+
+ String jdbc = split[0];
+
+ String username = split[1];
+
+ String password = split[2];
+
+ String sqlStr = split[3];
+
+ SimpleDataSource ds = null;
+
+ try {
+ ds = new SimpleDataSource();
+
+ ds.setJdbcUrl(jdbc);
+ ds.setUsername(username);
+ ds.setPassword(password);
+
+ Dao dao = new NutDao(ds);
+
+ Sql sql = Sqls.create(sqlStr);
+
+ Sql execute = dao.execute(sql.setCallback(new SqlCallback() {
+ @Override
+ public byte[] invoke(Connection conn, ResultSet rs, Sql sql) throws SQLException {
+ ByteArrayOutputStream baos = new ByteArrayOutputStream(100 * 1024);
+ while (rs.next()) {
+ try {
+ baos.write(rs.getString(1).getBytes());
+ baos.write(TAB);
+ baos.write(rs.getString(2).getBytes());
+ baos.write(TAB);
+ baos.write(rs.getString(3).getBytes());
+ baos.write(LINE);
+ } catch (IOException e) {
+ // TODO Auto-generated catch block
+ e.printStackTrace();
+ }
+ }
+ return baos.toByteArray();
+ }
+ }));
+
+ return new ByteArrayInputStream((byte[]) execute.getResult());
+ } catch (Exception e) {
+ throw new LibraryException(e);
+ } finally {
+ if (ds != null) {
+ ds.close();
+ }
+ }
+
+ }
+
+}
diff --git a/src/main/java/org/ansj/dic/impl/Url2Stream.java b/src/main/java/org/ansj/dic/impl/Url2Stream.java
new file mode 100644
index 00000000..958c9de0
--- /dev/null
+++ b/src/main/java/org/ansj/dic/impl/Url2Stream.java
@@ -0,0 +1,24 @@
+package org.ansj.dic.impl;
+
+import java.io.InputStream;
+
+import org.ansj.dic.PathToStream;
+import org.nutz.http.Http;
+import org.nutz.http.Response;
+
+/**
+ * url://http://maven.nlpcn.org/down/library/default.dic
+ *
+ * @author ansj
+ *
+ */
+public class Url2Stream extends PathToStream {
+
+ @Override
+ public InputStream toStream(String path) {
+ path = path.substring(6);
+ Response response = Http.get(path);
+ return response.getStream();
+ }
+
+}
diff --git a/src/main/java/org/ansj/domain/KV.java b/src/main/java/org/ansj/domain/KV.java
new file mode 100644
index 00000000..e1eda4d6
--- /dev/null
+++ b/src/main/java/org/ansj/domain/KV.java
@@ -0,0 +1,17 @@
+package org.ansj.domain;
+
+public class KV<K, V> {
+
+ private K k;
+
+ private V v;
+
+ private KV(K k, V v) {
+ this.k = k;
+ this.v = v;
+ }
+
+ public static <K, V> KV<K, V> with(K k, V v) {
+ return new KV<K, V>(k, v);
+ }
+}
diff --git a/src/main/java/org/ansj/exception/LibraryException.java b/src/main/java/org/ansj/exception/LibraryException.java
new file mode 100644
index 00000000..a182ed89
--- /dev/null
+++ b/src/main/java/org/ansj/exception/LibraryException.java
@@ -0,0 +1,15 @@
+package org.ansj.exception;
+
+public class LibraryException extends RuntimeException {
+
+ private static final long serialVersionUID = 1L;
+
+ public LibraryException(Exception e) {
+ super(e);
+ }
+
+ public LibraryException(String message) {
+ super(message);
+ }
+
+}
diff --git a/src/main/java/org/ansj/library/CrfLibrary.java b/src/main/java/org/ansj/library/CrfLibrary.java
new file mode 100644
index 00000000..c0d91a1c
--- /dev/null
+++ b/src/main/java/org/ansj/library/CrfLibrary.java
@@ -0,0 +1,118 @@
+package org.ansj.library;
+
+import java.io.InputStream;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.Set;
+
+import org.ansj.app.crf.Model;
+import org.ansj.app.crf.SplitWord;
+import org.ansj.app.crf.model.CRFModel;
+import org.ansj.dic.PathToStream;
+import org.nlpcn.commons.lang.tire.domain.Forest;
+import org.nlpcn.commons.lang.util.logging.Log;
+import org.nlpcn.commons.lang.util.logging.LogFactory;
+import org.nlpcn.commons.lang.util.tuples.KeyValue;
+
+public class CrfLibrary {
+
+ private static final Log LOG = LogFactory.getLog();
+
+ // CRF模型
+ private static final Map<String, KeyValue<String, SplitWord>> CRF = new HashMap<>();
+
+ public static final String DEFAULT = "crf_";
+
+ /**
+ * 根据key获取crf分词器
+ *
+ * @param key
+ * @return crf分词器
+ */
+ public static SplitWord crf(String key) {
+ KeyValue<String, SplitWord> kv = CRF.get(fix(key));
+
+ if (kv == null) {
+ LOG.warn("crf " + key + " not found in config ");
+ return null;
+ }
+
+ SplitWord sw = (SplitWord) kv.getValue();
+ if (sw == null) {
+ sw = initCRFModel(kv);
+ }
+ return sw;
+ }
+
+ /**
+ * 加载CRF模型
+ *
+ * @param modelPath
+ * @return
+ */
+ private static synchronized SplitWord initCRFModel(KeyValue<String, SplitWord> kv) {
+ try {
+ if (kv.getValue() != null) {
+ return kv.getValue();
+ }
+
+ long start = System.currentTimeMillis();
+ LOG.info("begin init crf model!");
+ try (InputStream is = PathToStream.stream(kv.getKey())) {
+ SplitWord crfSplitWord = new SplitWord(Model.load(CRFModel.class, is));
+ kv.setValue(crfSplitWord);
+ LOG.info("load crf use time:" + (System.currentTimeMillis() - start) + " path is : " + kv.getKey());
+ return crfSplitWord;
+ }
+ } catch (Exception e) {
+ LOG.error(kv + " load err " + e.getMessage());
+ return null;
+ }
+ }
+
+ /**
+ * 动态添加
+ *
+ * @param dicDefault
+ * @param dicDefault2
+ * @param dic2
+ */
+ public static void put(String key, String path) {
+ put(key, path, null);
+ }
+
+ public static void put(String key, String path, SplitWord sw) {
+ CRF.put(key, KeyValue.with(path, sw));
+ }
+
+ /**
+ * 删除一个key
+ *
+ * @param key
+ * @return
+ */
+ public static KeyValue<String, SplitWord> remove(String key) {
+ return CRF.remove(key);
+ }
+
+ /**
+ * 刷新一个,将值设置为null
+ * @param key
+ */
+ public static void flush(String key) {
+ CRF.get(key).setValue(null);
+ }
+
+ public static Set<String> keys() {
+ return CRF.keySet();
+ }
+
+ private static String fix(String key) {
+ if (key.startsWith(DEFAULT)) {
+ return key;
+ } else {
+ return DEFAULT + key;
+ }
+ }
+}
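Usage sketch for CrfLibrary above (not part of the patch): the model path is an illustrative assumption; the value is loaded lazily through PathToStream and Model.load on first access.

    import org.ansj.app.crf.SplitWord;
    import org.ansj.library.CrfLibrary;

    public class CrfLibraryDemo {
        public static void main(String[] args) {
            // register a model location under a named key ("crf_" prefix is what fix() expects)
            CrfLibrary.put("crf_mymodel", "file:///tmp/crf.model");
            // first access triggers initCRFModel; returns null and logs an error if loading fails
            SplitWord splitWord = CrfLibrary.crf("mymodel");
            System.out.println(splitWord != null ? "crf model ready" : "crf model missing");
        }
    }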
diff --git a/src/main/java/org/ansj/library/DATDictionary.java b/src/main/java/org/ansj/library/DATDictionary.java
index 94c0c69d..11b51b30 100644
--- a/src/main/java/org/ansj/library/DATDictionary.java
+++ b/src/main/java/org/ansj/library/DATDictionary.java
@@ -11,14 +11,14 @@
import org.ansj.domain.TermNature;
import org.ansj.domain.TermNatures;
import org.ansj.library.name.PersonAttrLibrary;
-import org.ansj.util.MyStaticValue;
import org.nlpcn.commons.lang.dat.DoubleArrayTire;
import org.nlpcn.commons.lang.dat.Item;
import org.nlpcn.commons.lang.util.logging.Log;
+import org.nlpcn.commons.lang.util.logging.LogFactory;
public class DATDictionary {
- private static final Log logger = MyStaticValue.getLog();
+ private static final Log LOG = LogFactory.getLog(DATDictionary.class);
/**
* 所有在词典中出现的词,并且承担简繁体转换的任务.
@@ -64,16 +64,16 @@ private static DoubleArrayTire loadDAT() {
}
// 特殊字符标准化
IN_SYSTEM['%'] = '%';
- logger.info("init core library ok use time : " + (System.currentTimeMillis() - start));
+ LOG.info("init core library ok use time : " + (System.currentTimeMillis() - start));
return dat;
} catch (InstantiationException e) {
- MyStaticValue.LIBRARYLOG.warn("无法实例化", e);
+ LOG.warn("无法实例化", e);
} catch (IllegalAccessException e) {
- MyStaticValue.LIBRARYLOG.warn("非法访问", e);
+ LOG.warn("非法访问", e);
} catch (NumberFormatException e) {
- MyStaticValue.LIBRARYLOG.warn("数字格式异常", e);
+ LOG.warn("数字格式异常", e);
} catch (IOException e) {
- MyStaticValue.LIBRARYLOG.warn("IO异常", e);
+ LOG.warn("IO异常", e);
}
return null;
@@ -146,7 +146,7 @@ public static AnsjItem getItem(int index) {
public static AnsjItem getItem(String str) {
AnsjItem item = DAT.getItem(str);
- if (item == null || item.getStatus()<2) {
+ if (item == null || item.getStatus() < 2) {
return AnsjItem.NULL;
}
diff --git a/src/main/java/org/ansj/library/DicLibrary.java b/src/main/java/org/ansj/library/DicLibrary.java
new file mode 100644
index 00000000..1b85e0be
--- /dev/null
+++ b/src/main/java/org/ansj/library/DicLibrary.java
@@ -0,0 +1,224 @@
+package org.ansj.library;
+
+import java.io.BufferedReader;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.Set;
+
+import org.ansj.dic.PathToStream;
+import org.ansj.util.MyStaticValue;
+import org.nlpcn.commons.lang.tire.domain.Forest;
+import org.nlpcn.commons.lang.tire.domain.Value;
+import org.nlpcn.commons.lang.tire.library.Library;
+import org.nlpcn.commons.lang.util.IOUtil;
+import org.nlpcn.commons.lang.util.StringUtil;
+import org.nlpcn.commons.lang.util.logging.Log;
+import org.nlpcn.commons.lang.util.logging.LogFactory;
+import org.nlpcn.commons.lang.util.tuples.KeyValue;
+
+public class DicLibrary {
+
+ private static final Log LOG = LogFactory.getLog();
+
+ public static final String DEFAULT = "dic_";
+
+ public static final String DEFAULT_NATURE = "userDefine";
+
+ public static final Integer DEFAULT_FREQ = 1000;
+
+ public static final String DEFAULT_FREQ_STR = "1000";
+
+ // 用户自定义词典
+ private static final Map<String, KeyValue<String, Forest>> DIC = new HashMap<>();
+
+ /**
+ * 关键词增加
+ *
+ * @param keyword 所要增加的关键词
+ * @param nature 关键词的词性
+ * @param freq 关键词的词频
+ */
+ public static void insert(String key, String keyword, String nature, int freq) {
+ Forest dic = get(key);
+
+ if (dic == null) {
+ dic = putIfAbsent(DEFAULT, DEFAULT, new Forest());
+ }
+
+ String[] paramers = new String[2];
+ paramers[0] = nature;
+ paramers[1] = String.valueOf(freq);
+ Value value = new Value(keyword, paramers);
+ Library.insertWord(dic, value);
+ }
+
+ /**
+ * 增加关键词
+ *
+ * @param keyword
+ */
+ public static void insert(String key, String keyword) {
+ insert(key, keyword, DEFAULT_NATURE, DEFAULT_FREQ);
+ }
+
+ /**
+ * 删除关键词
+ */
+ public static void remove(String key, String word) {
+ Forest dic = get(key);
+ if (dic != null) {
+ Library.removeWord(dic, word);
+ }
+ }
+
+ /**
+ * 将用户自定义词典清空
+ */
+ public static void clear(String key) {
+ get(key).clear();
+ }
+
+ public static Forest get() {
+ return get(DEFAULT);
+ }
+
+ /**
+ * 根据模型名称获取crf模型
+ *
+ * @param modelName
+ * @return
+ */
+ public static Forest get(String key) {
+ KeyValue<String, Forest> kv = DIC.get(fix(key));
+
+ if (kv == null) {
+ LOG.warn("dic " + key + " not found in config ");
+ return null;
+ }
+ Forest forest = kv.getValue();
+ if (forest == null) {
+ forest = init(kv);
+ }
+ return forest;
+
+ }
+
+ /**
+ * 用户自定义词典加载
+ *
+ * @param key
+ * @param path
+ * @return
+ */
+
+ private synchronized static Forest init(KeyValue<String, Forest> kv) {
+ Forest forest = kv.getValue();
+ if (forest != null) {
+ return forest;
+ }
+ try {
+ forest = new Forest();
+ LOG.info("begin init dic !");
+ long start = System.currentTimeMillis();
+ String temp = null;
+ String[] strs = null;
+ Value value = null;
+ try (BufferedReader br = IOUtil.getReader(PathToStream.stream(kv.getKey()), "UTF-8")) {
+ while ((temp = br.readLine()) != null) {
+ if (StringUtil.isNotBlank(temp)) {
+ temp = StringUtil.trim(temp);
+ strs = temp.split("\t");
+ strs[0] = strs[0].toLowerCase();
+ // 如何核心辞典存在那么就放弃
+ if (MyStaticValue.isSkipUserDefine && DATDictionary.getId(strs[0]) > 0) {
+ continue;
+ }
+ if (strs.length != 3) {
+ value = new Value(strs[0], DEFAULT_NATURE, DEFAULT_FREQ_STR);
+ } else {
+ value = new Value(strs[0], strs[1], strs[2]);
+ }
+ Library.insertWord(forest, value);
+ }
+ }
+ }
+ LOG.info("load dic use time:" + (System.currentTimeMillis() - start) + " path is : " + kv.getKey());
+ kv.setValue(forest);
+ return forest;
+ } catch (Exception e) {
+ LOG.error("Init ambiguity library error :" + e.getMessage() + ", path: " + kv.getKey());
+ return null;
+ }
+ }
+
+ /**
+ * 动态添加词典
+ *
+ * @param dicDefault
+ * @param dicDefault2
+ * @param dic2
+ */
+ public static void put(String key, String path, Forest forest) {
+ DIC.put(key, KeyValue.with(path, forest));
+ }
+
+ /**
+ * 动态添加词典
+ *
+ * @param dicDefault
+ * @param dicDefault2
+ * @param dic2
+ */
+ public static void putIfAbsent(String key, String path) {
+ if (!DIC.containsKey(key)) {
+ DIC.put(key, KeyValue.with(path, (Forest) null));
+ }
+ }
+
+ /**
+ * 动态添加词典
+ *
+ * @param dicDefault
+ * @param dicDefault2
+ * @param dic2
+ */
+ public static void put(String key, String path) {
+ put(key, path, null);
+ }
+
+ /**
+ * 动态添加词典
+ *
+ * @param
+ * @param
+ *
+ * @param dicDefault
+ * @param dicDefault2
+ * @param dic2
+ */
+ public static synchronized Forest putIfAbsent(String key, String path, Forest forest) {
+ KeyValue<String, Forest> kv = DIC.get(key);
+ if (kv != null && kv.getValue() != null) {
+ return kv.getValue();
+ }
+ put(key, path, forest);
+ return forest;
+ }
+
+ public static KeyValue<String, Forest> remove(String key) {
+ return DIC.remove(key);
+ }
+
+ public static Set<String> keys() {
+ return DIC.keySet();
+ }
+
+ private static String fix(String key) {
+ if (key.startsWith(DEFAULT)) {
+ return key;
+ } else {
+ return DEFAULT + key;
+ }
+ }
+
+}
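Usage sketch for DicLibrary above (not part of the patch): the word, nature and frequency are illustrative; inserting under DEFAULT creates the in-memory default forest without any dictionary file.

    import org.ansj.library.DicLibrary;
    import org.nlpcn.commons.lang.tire.domain.Forest;

    public class DicLibraryDemo {
        public static void main(String[] args) {
            DicLibrary.insert(DicLibrary.DEFAULT, "ansj中文分词", "nz", DicLibrary.DEFAULT_FREQ);
            Forest forest = DicLibrary.get();
            System.out.println(forest != null ? "default dictionary ready" : "no dictionary");
        }
    }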
diff --git a/src/main/java/org/ansj/library/NatureLibrary.java b/src/main/java/org/ansj/library/NatureLibrary.java
index 79acb92c..fafcab0d 100644
--- a/src/main/java/org/ansj/library/NatureLibrary.java
+++ b/src/main/java/org/ansj/library/NatureLibrary.java
@@ -9,6 +9,7 @@
import org.ansj.util.MyStaticValue;
import org.nlpcn.commons.lang.util.StringUtil;
import org.nlpcn.commons.lang.util.logging.Log;
+import org.nlpcn.commons.lang.util.logging.LogFactory;
/**
* 这里封装了词性和词性之间的关系.以及词性的索引.这是个好东西. 里面数组是从ict里面找来的. 不是很新.没有语料无法训练
@@ -18,7 +19,7 @@
*/
public class NatureLibrary {
- private static final Log logger = MyStaticValue.getLog();
+ private static final Log logger = LogFactory.getLog(NatureLibrary.class);
private static final int YI = 1;
private static final int FYI = -1;
diff --git a/src/main/java/org/ansj/library/NgramLibrary.java b/src/main/java/org/ansj/library/NgramLibrary.java
index 4cb99478..06080645 100644
--- a/src/main/java/org/ansj/library/NgramLibrary.java
+++ b/src/main/java/org/ansj/library/NgramLibrary.java
@@ -2,6 +2,7 @@
import org.ansj.domain.Term;
import org.ansj.util.MyStaticValue;
+import org.nlpcn.commons.lang.util.logging.LogFactory;
/**
* 两个词之间的关联
@@ -13,7 +14,7 @@ public class NgramLibrary {
static {
long start = System.currentTimeMillis();
MyStaticValue.initBigramTables();
- MyStaticValue.LIBRARYLOG.info("init ngram ok use time :" + (System.currentTimeMillis() - start));
+ LogFactory.getLog(NgramLibrary.class).info("init ngram ok use time :" + (System.currentTimeMillis() - start));
}
/**
diff --git a/src/main/java/org/ansj/library/UserDefineLibrary.java b/src/main/java/org/ansj/library/UserDefineLibrary.java
index 22857fa0..b47f5cf3 100644
--- a/src/main/java/org/ansj/library/UserDefineLibrary.java
+++ b/src/main/java/org/ansj/library/UserDefineLibrary.java
@@ -1,18 +1,18 @@
package org.ansj.library;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
+
import org.ansj.util.MyStaticValue;
import org.nlpcn.commons.lang.tire.domain.Forest;
import org.nlpcn.commons.lang.tire.domain.SmartForest;
import org.nlpcn.commons.lang.tire.domain.Value;
import org.nlpcn.commons.lang.tire.library.Library;
-import org.nlpcn.commons.lang.util.IOUtil;
import org.nlpcn.commons.lang.util.StringUtil;
-import java.io.*;
-import java.net.URL;
-
-import static org.ansj.util.MyStaticValue.LIBRARYLOG;
-
/**
* 用户自定义词典操作类
*
@@ -20,222 +20,111 @@
*/
public class UserDefineLibrary {
- public static final String DEFAULT_NATURE = "userDefine";
-
- public static final Integer DEFAULT_FREQ = 1000;
-
- public static final String DEFAULT_FREQ_STR = "1000";
-
- public static Forest FOREST = null;
-
- public static Forest ambiguityForest = null;
-
- static {
- initUserLibrary();
- initAmbiguityLibrary();
- }
-
- /**
- * 关键词增加
- *
- * @param keyword 所要增加的关键词
- * @param nature 关键词的词性
- * @param freq 关键词的词频
- */
- public static void insertWord(String keyword, String nature, int freq) {
- if (FOREST == null) {
- FOREST = new Forest();
- }
- String[] paramers = new String[2];
- paramers[0] = nature;
- paramers[1] = String.valueOf(freq);
- Value value = new Value(keyword, paramers);
- Library.insertWord(FOREST, value);
- }
-
- /**
- * 增加关键词
- *
- * @param keyword
- */
- public static void insertWord(String keyword) {
- insertWord(keyword, DEFAULT_NATURE, DEFAULT_FREQ);
- }
-
-
- /**
- * 加载纠正词典
- */
- private static void initAmbiguityLibrary() {
-
- File[] lib = findLibrary(MyStaticValue.ambiguityLibrary);
-
- if (lib.length > 0) {
- ambiguityForest = new Forest();
- for (File file : lib) {
- try (BufferedReader br = IOUtil.getReader(file, "utf-8")) {
- String temp;
- while ((temp = br.readLine()) != null) {
- if (StringUtil.isNotBlank(temp)) {
- temp = StringUtil.trim(temp);
- String[] split = temp.split("\t");
- StringBuilder sb = new StringBuilder();
- if (split.length % 2 != 0) {
- LIBRARYLOG.error("init ambiguity error in line :" + temp + " format err !");
- }
- for (int i = 0; i < split.length; i += 2) {
- sb.append(split[i]);
- }
- ambiguityForest.addBranch(sb.toString(), split);
- }
- }
-
- } catch (UnsupportedEncodingException e) {
- LIBRARYLOG.warn("不支持的编码", e);
- } catch (IOException e) {
- LIBRARYLOG.warn("Init ambiguity library error :"+ e.getMessage()+", path: "+ file.getPath());
- }
- }
-
- LIBRARYLOG.info("Init ambiguity library ok!");
-
- } else {
- LIBRARYLOG.warn("Init ambiguity library warning :"+MyStaticValue.ambiguityLibrary+" because : file not found or failed to read !");
- }
-
- }
-
- /**
- * 加载用户自定义词典和补充词典
- */
- private static void initUserLibrary() {
- FOREST = MyStaticValue.getDicForest();
- }
-
-
- /**
- * 加载词典,传入一本词典的路径.或者目录.词典后缀必须为.dic 按文件名称顺序加载
- */
- public static void loadLibrary(Forest forest, String path) {
-
- File[] lib = findLibrary(path);
-
- if (lib.length > 0) {
- for (File file : lib) {
- String temp;
- String[] strs;
- Value value;
- try (BufferedReader br = IOUtil.getReader(new FileInputStream(file), "UTF-8")) {
- while ((temp = br.readLine()) != null) {
- if (StringUtil.isNotBlank(temp)) {
- temp = StringUtil.trim(temp);
- strs = temp.split("\t");
- strs[0] = strs[0].toLowerCase();
- // 如何核心辞典存在那么就放弃
- if (MyStaticValue.isSkipUserDefine && DATDictionary.getId(strs[0]) > 0) {
- continue;
- }
- if (strs.length != 3) {
- value = new Value(strs[0], DEFAULT_NATURE, DEFAULT_FREQ_STR);
- } else {
- value = new Value(strs[0], strs[1], strs[2]);
- }
- Library.insertWord(forest, value);
- }
- }
- } catch (UnsupportedEncodingException e) {
- LIBRARYLOG.warn("不支持的编码", e);
- } catch (IOException e) {
- LIBRARYLOG.warn("Init user library error :"+e.getMessage()+", path: "+file.getPath());
- }
- }
-
- LIBRARYLOG.info("Init user library ok!");
-
-
- } else {
- LIBRARYLOG.warn("Init user library error :"+path+" because : not find that file !");
- }
-
- }
-
- /**
- * 删除关键词
- */
- public static void removeWord(String word) {
- Library.removeWord(FOREST, word);
- }
-
- public static String[] getParams(String word) {
- return getParams(FOREST, word);
- }
-
- public static String[] getParams(Forest forest, String word) {
- SmartForest temp = forest;
- for (int i = 0; i < word.length(); i++) {
- temp = temp.get(word.charAt(i));
- if (temp == null) {
- return null;
- }
- }
- if (temp.getStatus() > 1) {
- return temp.getParam();
- } else {
- return null;
- }
- }
-
- public static boolean contains(String word) {
- return getParams(word) != null;
- }
-
- /**
- * 将用户自定义词典清空
- */
- public static void clear() {
- FOREST.clear();
- }
-
-
- /**
- * Load files
- *
- * @param path file path
- * @return File Array
- */
- private static File[] findLibrary(String path) {
- File[] libs = new File[0];
- File file = new File(path);
- if (!file.exists()) {
- // Try load from classpath
- URL url = UserDefineLibrary.class.getClassLoader().getResource(path);
- if (url != null) {
- file = new File(url.getPath());
- }
- }
-
- if (file.canRead()) {
-
- if (file.isFile()) {
- libs = new File[1];
- libs[0] = file;
- } else if (file.isDirectory()) {
- File[] files = file.listFiles(new FilenameFilter() {
- @Override
- public boolean accept(File dir, String name) {
- if (name.endsWith(".dic") && dir.canRead()) {
- return true;
- } else {
- return false;
- }
- }
- });
- if (files != null && files.length > 0) {
- libs = files;
- }
- }
- }
- return libs;
- }
-
+
+ /**
+ * 覆盖更新同义词 [中国, 中华, 我国] -> replace([中国,华夏]) -> [中国,华夏]
+ *
+ * @param words
+ */
+ public void insert(String key, String[] words) {
+
+ List<String> list = new ArrayList<>();
+
+ for (String word : words) {
+ if (StringUtil.isBlank(word)) {
+ continue;
+ }
+ list.add(word);
+ }
+
+ if (list.size() <= 1) {
+ LOG.warn(Arrays.toString(words) + " not have any change because it less than 2 word");
+ return;
+ }
+
+ Set<String> set = findAllWords(words);
+
+ for (String word : list) {
+ set.remove(word);
+ synonyms.add(word, list);
+ }
+
+ for (String word : set) { //删除所有
+ synonyms.remove(word);
+ synonyms.getBranch(word).setParam(null);
+ }
+
+ }
+
+ private Set<String> findAllWords(String[] words) {
+ Set<String> set = new HashSet<>();
+
+ for (String word : words) {
+ SmartForest<List<String>> branch = synonyms.getBranch(word);
+ if (branch != null) {
+ List<String> params = branch.getParam();
+ if (params != null) {
+ set.addAll(params);
+ }
+ }
+ }
+ return set;
+ }
+
+ /**
+ * 合并更新同义词 覆盖更新同义词 [中国, 中华, 我国] -> append([中国,华夏]) -> [中国, 中华, 我国 , 华夏]
+ *
+ * @param words
+ */
+ public void append(String[] words) {
+
+ Set<String> set = new HashSet<>();
+
+ for (String word : words) {
+ if (StringUtil.isBlank(word)) {
+ continue;
+ }
+ set.add(word);
+ }
+
+ if (set.size() <= 1) {
+ LOG.warn(Arrays.toString(words) + " not have any change because it less than 2 word");
+ return;
+ }
+
+ set.addAll(findAllWords(words));
+
+ List<String> list = new ArrayList<>(set);
+
+ for (String word : list) {
+ synonyms.addBranch(word, list);
+ }
+ }
+
+ /**
+ * 从同义词组中删除掉一个词 [中国, 中华, 我国] -> remove(我国) -> [中国, 中华]
+ *
+ * @param words
+ */
+ public void remove(String word) {
+
+ SmartForest<List<String>> branch = synonyms.getBranch(word);
+
+ if (branch == null || branch.getStatus() < 2) {
+ return;
+ }
+
+ List<String> params = branch.getParam();
+
+ synonyms.remove(word);
+ branch.setParam(null);
+ params.remove(word);
+
+ if (params.size() == 1) { //如果是1 个也删除
+ synonyms.remove(params.get(0));
+ params.remove(0);
+ } else {
+ params.remove(word);
+ }
+ }
}
diff --git a/src/main/java/org/ansj/library/company/CompanyAttrLibrary.java b/src/main/java/org/ansj/library/company/CompanyAttrLibrary.java
index 5aa827f4..d5069a02 100644
--- a/src/main/java/org/ansj/library/company/CompanyAttrLibrary.java
+++ b/src/main/java/org/ansj/library/company/CompanyAttrLibrary.java
@@ -6,6 +6,7 @@
import org.ansj.util.MyStaticValue;
import org.nlpcn.commons.lang.util.logging.Log;
+import org.nlpcn.commons.lang.util.logging.LogFactory;
/**
* 机构名识别词典加载类
@@ -15,7 +16,7 @@
*/
public class CompanyAttrLibrary {
- private static final Log logger = MyStaticValue.getLog();
+ private static final Log logger = LogFactory.getLog();
private static HashMap cnMap = null;
@@ -31,7 +32,7 @@ public static HashMap getCompanyMap() {
}
// company_freq
-
+
private static void init() {
try (BufferedReader br = MyStaticValue.getCompanReader()) {
cnMap = new HashMap();
diff --git a/src/main/java/org/ansj/library/name/PersonAttrLibrary.java b/src/main/java/org/ansj/library/name/PersonAttrLibrary.java
index b7d67154..4a81982f 100644
--- a/src/main/java/org/ansj/library/name/PersonAttrLibrary.java
+++ b/src/main/java/org/ansj/library/name/PersonAttrLibrary.java
@@ -10,6 +10,7 @@
import org.ansj.domain.PersonNatureAttr;
import org.ansj.util.MyStaticValue;
import org.nlpcn.commons.lang.util.logging.Log;
+import org.nlpcn.commons.lang.util.logging.LogFactory;
/**
* 人名标注所用的词典就是简单的hashmap简单方便谁用谁知道,只在加载词典的时候用
@@ -19,7 +20,7 @@
public class PersonAttrLibrary {
- private static final Log logger = MyStaticValue.getLog();
+ private static final Log logger = LogFactory.getLog();
private HashMap pnMap = null;
diff --git a/src/main/java/org/ansj/recognition/arrimpl/UserDefineRecognition.java b/src/main/java/org/ansj/recognition/arrimpl/UserDefineRecognition.java
index c2ecabb7..69c100e1 100644
--- a/src/main/java/org/ansj/recognition/arrimpl/UserDefineRecognition.java
+++ b/src/main/java/org/ansj/recognition/arrimpl/UserDefineRecognition.java
@@ -3,7 +3,6 @@
import org.ansj.domain.Term;
import org.ansj.domain.TermNature;
import org.ansj.domain.TermNatures;
-import org.ansj.library.UserDefineLibrary;
import org.ansj.recognition.TermArrRecognition;
import org.ansj.util.MyStaticValue;
import org.ansj.util.TermUtil;
@@ -11,6 +10,7 @@
import org.nlpcn.commons.lang.tire.domain.Forest;
import org.nlpcn.commons.lang.tire.domain.SmartForest;
import org.nlpcn.commons.lang.util.logging.Log;
+import org.nlpcn.commons.lang.util.logging.LogFactory;
/**
* 用户自定义词典.又称补充词典
@@ -20,11 +20,11 @@
*/
public class UserDefineRecognition implements TermArrRecognition {
- public static final Log logger = MyStaticValue.getLog();
+ public static final Log logger = LogFactory.getLog(UserDefineRecognition.class);
private Term[] terms = null;
- private Forest[] forests = { UserDefineLibrary.FOREST };
+ private Forest[] forests = { MyStaticValue.dic() };
private int offe = -1;
private int endOffe = -1;
diff --git a/src/main/java/org/ansj/recognition/impl/NatureRecognition.java b/src/main/java/org/ansj/recognition/impl/NatureRecognition.java
index 65e5adce..4df53e9b 100644
--- a/src/main/java/org/ansj/recognition/impl/NatureRecognition.java
+++ b/src/main/java/org/ansj/recognition/impl/NatureRecognition.java
@@ -17,11 +17,11 @@
import org.ansj.recognition.arrimpl.ForeignPersonRecognition;
import org.ansj.splitWord.analysis.ToAnalysis;
import org.ansj.util.MathUtil;
-import org.ansj.util.MyStaticValue;
import org.nlpcn.commons.lang.tire.domain.Forest;
import org.nlpcn.commons.lang.tire.domain.SmartForest;
import org.nlpcn.commons.lang.util.WordAlert;
import org.nlpcn.commons.lang.util.logging.Log;
+import org.nlpcn.commons.lang.util.logging.LogFactory;
/**
* 词性标注工具类
@@ -32,7 +32,7 @@
public class NatureRecognition implements Recognition {
private static final long serialVersionUID = 1L;
- private static final Log logger = MyStaticValue.getLog() ;
+ private static final Log logger = LogFactory.getLog();
private static final Forest SUFFIX_FOREST = new Forest();
diff --git a/src/main/java/org/ansj/recognition/impl/SynonymsRecgnition.java b/src/main/java/org/ansj/recognition/impl/SynonymsRecgnition.java
index a9a66b9e..1d163c47 100644
--- a/src/main/java/org/ansj/recognition/impl/SynonymsRecgnition.java
+++ b/src/main/java/org/ansj/recognition/impl/SynonymsRecgnition.java
@@ -1,21 +1,12 @@
package org.ansj.recognition.impl;
-import java.io.BufferedReader;
-import java.io.File;
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.HashSet;
import java.util.List;
-import java.util.Set;
import org.ansj.domain.Result;
import org.ansj.domain.Term;
import org.ansj.recognition.Recognition;
import org.ansj.util.MyStaticValue;
import org.nlpcn.commons.lang.tire.domain.SmartForest;
-import org.nlpcn.commons.lang.util.IOUtil;
-import org.nlpcn.commons.lang.util.StringUtil;
-import org.nlpcn.commons.lang.util.logging.Log;
/**
* 同义词功能
@@ -25,177 +16,18 @@
*/
public class SynonymsRecgnition implements Recognition {
- /**
- *
- */
private static final long serialVersionUID = 5961499108093950130L;
- private static final Log LOG = MyStaticValue.getLog();
-
- private static SmartForest<List<String>> SYS_SYNONYMS = null;
-
- private SmartForest<List<String>> synonyms = new SmartForest<>();
-
- public SmartForest<List<String>> initLibrary(String synonymsLibrary) {
-
- if (synonymsLibrary == null || !new File(synonymsLibrary).exists()) {
- MyStaticValue.LIBRARYLOG.warn(synonymsLibrary + " not exists so set syn to empty!");
- } else {
- try (BufferedReader reader = IOUtil.getReader(synonymsLibrary, IOUtil.UTF8)) {
- String temp = null;
- while ((temp = reader.readLine()) != null) {
- if (StringUtil.isBlank(temp)) {
- continue;
- }
- String[] split = temp.split("\t");
-
- List<String> list = new ArrayList<>();
- for (String word : split) {
- if (StringUtil.isBlank(word)) {
- continue;
- }
- list.add(word);
- }
-
- if (split.length <= 1) {
- MyStaticValue.LIBRARYLOG.warn(temp + " in synonymsLibrary not in to library !");
- continue;
- }
-
- for (int i = 0; i < split.length; i++) {
- synonyms.add(split[i], list);
- }
- }
- } catch (Exception e) {
- e.printStackTrace();
- }
- }
-
- LOG.info("init library synonymsLibrary ok from " + new File(synonymsLibrary).getAbsolutePath());
- return synonyms;
- }
+ private SmartForest<List<String>> synonyms = null;
public SynonymsRecgnition() {
- if (SYS_SYNONYMS == null) {
- synchronized (SynonymsRecgnition.class) {
- if (SYS_SYNONYMS == null) {
- SYS_SYNONYMS = initLibrary(MyStaticValue.synonymsLibrary);
- }
- }
- }
- synonyms = SYS_SYNONYMS;
+ this.synonyms = MyStaticValue.synonyms();
}
- public SynonymsRecgnition(String synonymsLibrary) {
- initLibrary(synonymsLibrary);
+ public SynonymsRecgnition(String key) {
+ this.synonyms = MyStaticValue.synonyms(key);
}
- /**
- * 覆盖更新同义词 [中国, 中华, 我国] -> replace([中国,华夏]) -> [中国,华夏]
- *
- * @param words
- */
- public void insert(String[] words) {
-
- List<String> list = new ArrayList<>();
-
- for (String word : words) {
- if (StringUtil.isBlank(word)) {
- continue;
- }
- list.add(word);
- }
-
- if (list.size() <= 1) {
- LOG.warn(Arrays.toString(words) + " not have any change because it less than 2 word");
- return;
- }
-
- Set<String> set = findAllWords(words);
-
- for (String word : list) {
- set.remove(word);
- synonyms.add(word, list);
- }
-
- for (String word : set) { //删除所有
- synonyms.remove(word);
- synonyms.getBranch(word).setParam(null);
- }
-
- }
-
- private Set<String> findAllWords(String[] words) {
- Set<String> set = new HashSet<>();
-
- for (String word : words) {
- SmartForest<List<String>> branch = synonyms.getBranch(word);
- if (branch != null) {
- List<String> params = branch.getParam();
- if (params != null) {
- set.addAll(params);
- }
- }
- }
- return set;
- }
-
- /**
- * 合并更新同义词 覆盖更新同义词 [中国, 中华, 我国] -> append([中国,华夏]) -> [中国, 中华, 我国 , 华夏]
- *
- * @param words
- */
- public void append(String[] words) {
-
- Set<String> set = new HashSet<>();
-
- for (String word : words) {
- if (StringUtil.isBlank(word)) {
- continue;
- }
- set.add(word);
- }
-
- if (set.size() <= 1) {
- LOG.warn(Arrays.toString(words) + " not have any change because it less than 2 word");
- return;
- }
-
- set.addAll(findAllWords(words));
-
- List<String> list = new ArrayList<>(set);
-
- for (String word : list) {
- synonyms.addBranch(word, list);
- }
- }
-
- /**
- * 从同义词组中删除掉一个词 [中国, 中华, 我国] -> remove(我国) -> [中国, 中华]
- *
- * @param words
- */
- public void remove(String word) {
-
- SmartForest<List<String>> branch = synonyms.getBranch(word);
-
- if (branch == null || branch.getStatus() < 2) {
- return;
- }
-
- List<String> params = branch.getParam();
-
- synonyms.remove(word);
- branch.setParam(null);
- params.remove(word);
-
- if (params.size() == 1) { //如果是1 个也删除
- synonyms.remove(params.get(0));
- params.remove(0);
- } else {
- params.remove(word);
- }
- }
@Override
public void recognition(Result result) {
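
The change above delegates synonym-dictionary loading to MyStaticValue, so SynonymsRecgnition itself only holds a reference to an already-loaded forest. A minimal usage sketch under two assumptions that are not part of this patch: ToAnalysis.parse(...) returns an org.ansj.domain.Result in this 5.x line, and a dictionary has been configured under a hypothetical synonyms_my key in library.properties.

    import org.ansj.domain.Result;
    import org.ansj.recognition.impl.SynonymsRecgnition;
    import org.ansj.splitWord.analysis.ToAnalysis;

    public class SynonymsUsageSketch {
        public static void main(String[] args) {
            // Default dictionary: the no-arg constructor now reads MyStaticValue.synonyms().
            Result result = ToAnalysis.parse("中国是一个伟大的国家");
            new SynonymsRecgnition().recognition(result); // Recognition.recognition(Result) as shown in the diff
            System.out.println(result);

            // Named dictionary: "my" is a hypothetical key, e.g. synonyms_my=... in library.properties.
            Result other = ToAnalysis.parse("中国是一个伟大的国家");
            new SynonymsRecgnition("my").recognition(other);
            System.out.println(other);
        }
    }
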
diff --git a/src/main/java/org/ansj/recognition/impl/UserDicNatureRecognition.java b/src/main/java/org/ansj/recognition/impl/UserDicNatureRecognition.java
index 8c843f63..e7a18107 100644
--- a/src/main/java/org/ansj/recognition/impl/UserDicNatureRecognition.java
+++ b/src/main/java/org/ansj/recognition/impl/UserDicNatureRecognition.java
@@ -3,8 +3,8 @@
import org.ansj.domain.Nature;
import org.ansj.domain.Result;
import org.ansj.domain.Term;
-import org.ansj.library.UserDefineLibrary;
import org.ansj.recognition.Recognition;
+import org.ansj.util.MyStaticValue;
import org.nlpcn.commons.lang.tire.domain.Forest;
import org.nlpcn.commons.lang.tire.domain.SmartForest;
@@ -20,7 +20,7 @@ public class UserDicNatureRecognition implements Recognition {
*
*/
private static final long serialVersionUID = 1L;
- private Forest[] forests = new Forest[] { UserDefineLibrary.FOREST };
+ private Forest[] forests = new Forest[] { MyStaticValue.dic() };
public UserDicNatureRecognition() {
}
diff --git a/src/main/java/org/ansj/util/MyStaticValue.java b/src/main/java/org/ansj/util/MyStaticValue.java
index 150f6da2..f30e609e 100644
--- a/src/main/java/org/ansj/util/MyStaticValue.java
+++ b/src/main/java/org/ansj/util/MyStaticValue.java
@@ -2,25 +2,33 @@
import java.io.BufferedReader;
import java.io.File;
+import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.ObjectInputStream;
import java.io.UnsupportedEncodingException;
import java.lang.reflect.Field;
+import java.util.ArrayList;
import java.util.HashMap;
+import java.util.List;
import java.util.Map;
import java.util.PropertyResourceBundle;
import java.util.ResourceBundle;
-import java.util.logging.Logger;
import org.ansj.app.crf.Model;
import org.ansj.app.crf.SplitWord;
import org.ansj.app.crf.model.CRFModel;
import org.ansj.dic.DicReader;
+import org.ansj.dic.PathToStream;
import org.ansj.domain.AnsjItem;
+import org.ansj.exception.LibraryException;
import org.ansj.library.DATDictionary;
+import org.ansj.library.DicLibrary;
import org.ansj.library.UserDefineLibrary;
import org.nlpcn.commons.lang.tire.domain.Forest;
+import org.nlpcn.commons.lang.tire.domain.SmartForest;
+import org.nlpcn.commons.lang.tire.domain.Value;
+import org.nlpcn.commons.lang.tire.library.Library;
import org.nlpcn.commons.lang.util.FileFinder;
import org.nlpcn.commons.lang.util.IOUtil;
import org.nlpcn.commons.lang.util.ObjConver;
@@ -38,11 +46,12 @@ public class MyStaticValue {
public static final Forest EMPTY_FOREST = new Forest();
- public static final Log LIBRARYLOG = getLog();
+ private static final Log LOG = LogFactory.getLog(MyStaticValue.class);
- public static final String DIC_DEFAULT = "dic";
- public static final String CRF_DEFAULT = "crf";
+ public static final String AMBIGUITY_DEFAULT = "ambiguity_";
+
+ public static final String SYNONYMS_DEFAULT = "synonyms_";
// 是否开启人名识别
public static Boolean isNameRecognition = true;
@@ -56,21 +65,21 @@ public class MyStaticValue {
// 是否显示真实词语
public static Boolean isRealName = false;
- // 用户自定义词典
- public static final Map<String, Object> DIC = new HashMap<>();
- // CRF模型
- public static final Map<String, Object> CRF = new HashMap<>();
+ // 歧义词典
+ public static final Map<String, String> AMBIGUITY = new HashMap<>();
- /**
- * 用户自定义词典的加载,如果是路径就扫描路径下的dic文件
- */
- public static String ambiguityLibrary = "library/ambiguity.dic";
+ // 同义词典
+ public static final Map<String, String> SYNONYMS = new HashMap<>();
- /**
- * 增加同义词词典路径变量
- */
- public static String synonymsLibrary = "library/synonyms.dic";
+ //存放所有的词典
+ private static final Map<String, Object> ALL = new HashMap<>();
+
+ //默认的词性
+ public static final String DEFAULT_NATURE = "userDefine";
+
+ //默认的词频
+ public static final String DEFAULT_FREQ_STR = "1000";
/**
* 是否用户辞典不加载相同的词
@@ -89,10 +98,10 @@ public class MyStaticValue {
File find = FileFinder.find("ansj_library.properties", 1);
if (find != null && find.isFile()) {
rb = new PropertyResourceBundle(IOUtil.getReader(find.getAbsolutePath(), System.getProperty("file.encoding")));
- LIBRARYLOG.info("load ansj_library not find in classPath ! i find it in " + find.getAbsolutePath() + " make sure it is your config!");
+ LOG.info("load ansj_library not find in classPath ! i find it in " + find.getAbsolutePath() + " make sure it is your config!");
}
} catch (Exception e1) {
- LIBRARYLOG.warn("not find ansj_library.properties. and err {} i think it is a bug!", e1);
+ LOG.warn("not find ansj_library.properties. and err {} i think it is a bug!", e1);
}
}
@@ -104,54 +113,75 @@ public class MyStaticValue {
File find = FileFinder.find("library.properties", 2);
if (find != null && find.isFile()) {
rb = new PropertyResourceBundle(IOUtil.getReader(find.getAbsolutePath(), System.getProperty("file.encoding")));
- LIBRARYLOG.info("load library not find in classPath ! i find it in " + find.getAbsolutePath() + " make sure it is your config!");
+ LOG.info("load library not find in classPath ! i find it in " + find.getAbsolutePath() + " make sure it is your config!");
}
} catch (Exception e1) {
- LIBRARYLOG.warn("not find library.properties. and err {} i think it is a bug!", e1);
+ LOG.warn("not find library.properties. and err {} i think it is a bug!", e1);
}
}
}
- DIC.put(DIC_DEFAULT, "library/default.dic");
-
if (rb == null) {
- LIBRARYLOG.warn("not find library.properties in classpath use it by default !");
+ LOG.warn("not find library.properties in classpath use it by default !");
} else {
for (String key : rb.keySet()) {
if (key.equals("dic")) {
- DIC.put(key, rb.getString(key));
+ DicLibrary.put(DicLibrary.DEFAULT, rb.getString(key));
} else if (key.equals("crf")) {
- CRF.put(key, rb.getString(key));
+ CRF.put(CRF_DEFAULT, rb.getString(key));
+ } else if (key.equals("ambiguity")) {
+ AMBIGUITY.put(AMBIGUITY_DEFAULT, rb.getString(key));
+ } else if (key.equals("synonyms")) {
+ SYNONYMS.put(SYNONYMS_DEFAULT, rb.getString(key));
} else if (key.startsWith("dic_")) {
- if (DIC.containsKey(key)) {
- LIBRARYLOG.warn(key + "{} dic config repeat definition now overwrite it !");
+ if (DicLibrary.DIC.containsKey(key)) {
+ LOG.warn(key + " dic config repeat definition now overwrite it !");
}
- DIC.put(key, rb.getString(key));
+ DicLibrary.put(key, rb.getString(key));
} else if (key.startsWith("crf_")) {
if (CRF.containsKey(key)) {
- LIBRARYLOG.warn(key + " crf config repeat definition now overwrite it !");
+ LOG.warn(key + " crf config repeat definition now overwrite it !");
}
CRF.put(key, rb.getString(key));
+ } else if (key.startsWith("synonyms_")) {
+ if (CRF.containsKey(key)) {
+ LOG.warn(key + " crf config repeat definition now overwrite it !");
+ }
+ SYNONYMS.put(key, rb.getString(key));
+ } else if (key.startsWith("ambiguity_")) {
+ if (CRF.containsKey(key)) {
+ LOG.warn(key + " crf config repeat definition now overwrite it !");
+ }
+ AMBIGUITY.put(key, rb.getString(key));
} else {
try {
Field field = MyStaticValue.class.getField(key);
field.set(null, ObjConver.conversion(rb.getString(key), field.getType()));
} catch (NoSuchFieldException e) {
- LIBRARYLOG.error("not find field by " + key);
+ LOG.error("not find field by " + key);
} catch (SecurityException e) {
- LIBRARYLOG.error("安全异常", e);
+ LOG.error("安全异常", e);
} catch (IllegalArgumentException e) {
- LIBRARYLOG.error("非法参数", e);
+ LOG.error("非法参数", e);
} catch (IllegalAccessException e) {
- LIBRARYLOG.error("非法访问", e);
+ LOG.error("非法访问", e);
}
}
}
}
+
+ //如果没有设置则设置默认路径
+ DicLibrary.putIfAbsent(DicLibrary.DEFAULT, "library/default.dic");
+
+ CRF.putIfAbsent(CRF_DEFAULT, "jar://crf.model");
+
+ AMBIGUITY.putIfAbsent(AMBIGUITY_DEFAULT, "library/ambiguity.dic");
+
+ SYNONYMS.putIfAbsent(SYNONYMS_DEFAULT, "library/synonyms.dic");
}
/**
@@ -247,9 +277,9 @@ public static Map<String, int[]> getPersonFreqMap() {
ObjectInputStream objectInputStream = new ObjectInputStream(inputStream);
map = (Map<String, int[]>) objectInputStream.readObject();
} catch (IOException e) {
- LIBRARYLOG.warn("IO异常", e);
+ LOG.warn("IO异常", e);
} catch (ClassNotFoundException e) {
- LIBRARYLOG.warn("找不到类", e);
+ LOG.warn("找不到类", e);
}
return map;
}
@@ -295,152 +325,149 @@ public static void initBigramTables() {
}
} catch (NumberFormatException e) {
- LIBRARYLOG.warn("数字格式异常", e);
+ LOG.warn("数字格式异常", e);
} catch (UnsupportedEncodingException e) {
- LIBRARYLOG.warn("不支持的编码", e);
+ LOG.warn("不支持的编码", e);
} catch (IOException e) {
- LIBRARYLOG.warn("IO异常", e);
+ LOG.warn("IO异常", e);
}
}
- /**
- * 得到默认的模型
- *
- * @return
- */
- public static SplitWord getCRFSplitWord() {
- return getCRFSplitWord(CRF_DEFAULT);
- }
-
- /**
- * 根据模型名称获取crf模型
- *
- * @param key
- * @return
- */
- public static SplitWord getCRFSplitWord(String key) {
- Object temp = CRF.get(key);
-
- if (temp == null) {
- if (CRF_DEFAULT.equals(key)) { // 加载内置模型
- return initDefaultModel();
- } else {
- LIBRARYLOG.warn("crf " + key + " not found in config ");
- return null;
- }
- } else if (temp instanceof String) {
- return initCRFModel(key, (String) temp);
- } else {
- return (SplitWord) temp;
- }
- }
+
/**
- * 加载默认的crf模型
+ * 加载歧义词典
*
+ * @param modelName
* @return
*/
- private static synchronized SplitWord initDefaultModel() {
+ public static Forest ambiguity(String key) {
+ String path = AMBIGUITY.get(fix("ambiguity_", key));
- Object obj = CRF.get(CRF_DEFAULT);
- if (obj != null && obj instanceof SplitWord) {
- return (SplitWord) obj;
+ if (path == null) {
+ LOG.warn("ambiguity " + key + " not found in config ");
+ return null;
}
- try {
- LIBRARYLOG.info("init deafult crf model begin !");
- CRFModel model = new CRFModel(CRF_DEFAULT);
- model.loadModel(DicReader.getInputStream("crf.model"));
- SplitWord splitWord = new SplitWord(model);
- CRF.put(CRF_DEFAULT, splitWord);
- return splitWord;
- } catch (Exception e) {
- LIBRARYLOG.error("init err!", e);
+ Forest forest = (Forest) ALL.get(path);
+ if (forest == null) {
+ forest = initAmbiguity(key, path);
}
- return null;
+ return forest;
+
}
/**
- * 加载CRF模型
+ * 加载歧义词典
*
- * @param modelPath
+ * @param key
+ * @param path
* @return
*/
- private static synchronized SplitWord initCRFModel(String key, String modelPath) {
- try {
- Object obj = CRF.get(key);
- if (obj != null && obj instanceof SplitWord) {
- return (SplitWord) obj;
- }
- if (new File(modelPath).isFile() && new File(modelPath).exists()) {
- long start = System.currentTimeMillis();
- LIBRARYLOG.info("begin init crf model!");
- SplitWord crfSplitWord = new SplitWord(Model.load(key, modelPath));
- CRF.put(key, crfSplitWord);
- LIBRARYLOG.info("load crf use time:" + (System.currentTimeMillis() - start) + " path is : " + modelPath);
- return crfSplitWord;
- } else {
- LIBRARYLOG.info(key + " file not found ,please make sure it is exists : " + modelPath);
+ private synchronized static Forest initAmbiguity(String key, String path) {
+ Forest forest = (Forest) ALL.get(path);
+ if (forest != null) {
+ return forest;
+ }
+ forest = new Forest();
+ try (BufferedReader br = IOUtil.getReader(PathToStream.stream(path), "utf-8")) {
+ String temp;
+ LOG.info("begin init dic !");
+ long start = System.currentTimeMillis();
+ while ((temp = br.readLine()) != null) {
+ if (StringUtil.isNotBlank(temp)) {
+ temp = StringUtil.trim(temp);
+ String[] split = temp.split("\t");
+ StringBuilder sb = new StringBuilder();
+ if (split.length % 2 != 0) {
+ LOG.error("init ambiguity error in line :" + temp + " format err !");
+ continue;
+ }
+ for (int i = 0; i < split.length; i += 2) {
+ sb.append(split[i]);
+ }
+ forest.addBranch(sb.toString(), split);
+ }
}
+ LOG.info("load dic use time:" + (System.currentTimeMillis() - start) + " path is : " + path);
+ ALL.put(path, forest);
+ return forest;
} catch (Exception e) {
- LIBRARYLOG.info(key + " file : " + modelPath + " load err " + e.getMessage());
+ LOG.error("Init ambiguity library error :" + e.getMessage() + ", path: " + path);
+ return null;
}
- return null;
- }
-
- /**
- * 得到默认的模型
- *
- * @return
- */
- public static Forest getDicForest() {
- return getDicForest(DIC_DEFAULT);
}
/**
- * 根据模型名称获取crf模型
+ * 加载同义词典
*
* @param modelName
* @return
*/
- public static Forest getDicForest(String key) {
- Object temp = DIC.get(key);
-
- if (temp == null) {
- LIBRARYLOG.warn("dic " + key + " not found in config ");
+ public static SmartForest<List<String>> synonyms(String key) {
+ String path = SYNONYMS.get(fix("synonyms_", key));
+ if (path == null) {
+ LOG.warn("synonyms " + key + " not found in config ");
return null;
- } else if (temp instanceof String) {
- return initForest(key, (String) temp);
- } else {
- return (Forest) temp;
}
+ @SuppressWarnings("unchecked")
+ SmartForest<List<String>> forest = (SmartForest<List<String>>) ALL.get(path);
+ if (forest == null) {
+ forest = initSynonyms(key, path);
+ }
+ return forest;
+
}
/**
- * 用户自定义词典加载
+ * 加载同义词典
*
* @param key
- * @param dicPath
+ * @param path
* @return
*/
- private synchronized static Forest initForest(String key, String dicPath) {
- Object obj = CRF.get(key);
+ private synchronized static SmartForest<List<String>> initSynonyms(String key, String path) {
+ @SuppressWarnings("unchecked")
+ SmartForest<List<String>> forest = (SmartForest<List<String>>) ALL.get(path);
+ if (forest != null) {
+ return forest;
+ }
+ forest = new SmartForest<>();
+
+ LOG.info("begin init synonyms " + key);
+ long start = System.currentTimeMillis();
+
+ try (BufferedReader reader = IOUtil.getReader(PathToStream.stream(path), IOUtil.UTF8)) {
+ String temp = null;
+ while ((temp = reader.readLine()) != null) {
+ if (StringUtil.isBlank(temp)) {
+ continue;
+ }
+ String[] split = temp.split("\t");
+
+ List<String> list = new ArrayList<>();
+ for (String word : split) {
+ if (StringUtil.isBlank(word)) {
+ continue;
+ }
+ list.add(word);
+ }
- if (obj != null && obj instanceof Forest) {
- return (Forest) obj;
+ if (split.length <= 1) {
+ LOG.warn(temp + " in synonymsLibrary not in to library !");
+ continue;
+ }
+
+ for (int i = 0; i < split.length; i++) {
+ forest.add(split[i], list);
+ }
+ }
+ LOG.info("load synonyms use time:" + (System.currentTimeMillis() - start) + " path is : " + path);
+ return forest;
+ } catch (Exception e) {
+ LOG.error("Init synonyms library error :" + e.getMessage() + ", path: " + path);
+ return null;
}
- Forest forest = new Forest();
- UserDefineLibrary.loadLibrary(forest, dicPath);
- DIC.put(key, forest);
- return forest;
- }
- /**
- * 获取log默认当前类,不支持android
- *
- * @return
- */
- public static Log getLog() {
- StackTraceElement[] sts = Thread.currentThread().getStackTrace();
- return LogFactory.getLog(sts[2].getClassName());
}
+
}
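
For orientation, a sketch of how the new MyStaticValue dictionary accessors and the on-disk formats they parse fit together. The key normalization done by fix(...) is referenced but not shown in this patch, so the keys below are assumptions, as are the sample dictionary lines in the comments.

    import java.util.List;

    import org.ansj.util.MyStaticValue;
    import org.nlpcn.commons.lang.tire.domain.Forest;
    import org.nlpcn.commons.lang.tire.domain.SmartForest;

    public class DictionaryAccessSketch {
        public static void main(String[] args) {
            // Ambiguity dictionary (default path library/ambiguity.dic): each line holds
            // tab-separated word/nature pairs, e.g. "三个\tm\t和尚\tn"; initAmbiguity()
            // concatenates the words ("三个和尚") to form the branch key.
            Forest ambiguity = MyStaticValue.ambiguity("ambiguity_"); // key is an assumption

            // Synonym dictionary (default path library/synonyms.dic): each line is a
            // tab-separated synonym group, e.g. "中国\t中华\t我国"; every word in the
            // group is indexed against the full list.
            SmartForest<List<String>> synonyms = MyStaticValue.synonyms("synonyms_"); // key is an assumption

            System.out.println(ambiguity == null ? "no ambiguity dic loaded" : "ambiguity dic ok");
            System.out.println(synonyms == null ? "no synonyms dic loaded" : "synonyms dic ok");
        }
    }
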