diff --git a/plugin/ansj_lucene5_plugin/pom.xml b/plugin/ansj_lucene5_plugin/pom.xml index fea7a684..3641f70c 100644 --- a/plugin/ansj_lucene5_plugin/pom.xml +++ b/plugin/ansj_lucene5_plugin/pom.xml @@ -10,7 +10,7 @@ ansj_lucene5_plug - 5.0.3.0 + 5.0.4.0 jar ansj_lucene5_plug diff --git a/plugin/ansj_lucene5_plugin/src/main/java/org/ansj/lucene/util/AnsjTokenizer.java b/plugin/ansj_lucene5_plugin/src/main/java/org/ansj/lucene/util/AnsjTokenizer.java index 4ae5c016..73057114 100644 --- a/plugin/ansj_lucene5_plugin/src/main/java/org/ansj/lucene/util/AnsjTokenizer.java +++ b/plugin/ansj_lucene5_plugin/src/main/java/org/ansj/lucene/util/AnsjTokenizer.java @@ -1,7 +1,6 @@ package org.ansj.lucene.util; import java.io.IOException; -import java.util.Set; import org.ansj.domain.Term; import org.ansj.splitWord.Analysis; @@ -22,16 +21,7 @@ public final class AnsjTokenizer extends Tokenizer { // 分词词性 private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class); - private int skippedPositions; - protected Analysis ta = null; - /** 自定义停用词 */ - private Set filter; - - public AnsjTokenizer(Analysis ta, Set filter) { - this.ta = ta; - this.filter = filter; - } public AnsjTokenizer(Analysis ta) { this.ta = ta; @@ -41,8 +31,6 @@ public AnsjTokenizer(Analysis ta) { public final boolean incrementToken() throws IOException { clearAttributes(); - skippedPositions = 0; - int position = 0; Term term = null; String name = null; @@ -53,16 +41,10 @@ public final boolean incrementToken() throws IOException { if (term == null) { break; } - name = term.getName(); length = name.length(); - - if (filter != null && filter.contains(name)) { - continue; - } else { - position++; - flag = false; - } + position++; + flag = false; } while (flag); if (term != null) { positionAttr.setPositionIncrement(position); @@ -82,7 +64,6 @@ public final boolean incrementToken() throws IOException { public void reset() throws IOException { super.reset(); ta.resetContent(new AnsjReader(this.input)); - skippedPositions = 0; } } \ No newline at end of file diff --git a/plugin/ansj_lucene5_plugin/src/main/java/org/ansj/lucene/util/AnsjTokenizerFactory.java b/plugin/ansj_lucene5_plugin/src/main/java/org/ansj/lucene/util/AnsjTokenizerFactory.java index f076eb59..a55134d2 100644 --- a/plugin/ansj_lucene5_plugin/src/main/java/org/ansj/lucene/util/AnsjTokenizerFactory.java +++ b/plugin/ansj_lucene5_plugin/src/main/java/org/ansj/lucene/util/AnsjTokenizerFactory.java @@ -1,73 +1,28 @@ package org.ansj.lucene.util; -import java.io.BufferedReader; -import java.io.FileNotFoundException; -import java.io.IOException; -import java.util.HashSet; import java.util.Map; -import java.util.Set; import org.ansj.lucene5.AnsjAnalyzer; import org.apache.lucene.analysis.Tokenizer; import org.apache.lucene.analysis.util.TokenizerFactory; import org.apache.lucene.util.AttributeFactory; -import org.nlpcn.commons.lang.util.IOUtil; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; +import org.nlpcn.commons.lang.util.logging.Log; +import org.nlpcn.commons.lang.util.logging.LogFactory; public class AnsjTokenizerFactory extends TokenizerFactory { - public final Logger logger = LoggerFactory.getLogger(getClass()); + public final Log logger = LogFactory.getLog(); - private String stopwordsDir; - public Set filter; - private String type; + private Map args; public AnsjTokenizerFactory(Map args) { super(args); - stopwordsDir = get(args, "words"); - type = get(args, "type"); - addStopwords(stopwordsDir); - } - - /** - * 添加停用词 - * - * @param dir - */ - private 
void addStopwords(String dir) { - if (dir == null) { - logger.info("no stopwords dir"); - return; - } - logger.info("stopwords: {}", dir); - filter = new HashSet(); - BufferedReader br = null; - try { - br = IOUtil.getReader(dir, "uf-8"); - String word = br.readLine(); - while (word != null) { - filter.add(word); - word = br.readLine(); - } - } catch (FileNotFoundException e) { - logger.info("No stopword file found"); - } catch (IOException e) { - logger.info("stopword file io exception"); - } finally { - if (br != null) { - try { - br.close(); - } catch (IOException e) { - e.printStackTrace(); - } - } - } + this.args = args ; } @Override public Tokenizer create(AttributeFactory factory) { - return AnsjAnalyzer.getTokenizer(null, AnsjAnalyzer.TYPE.valueOf(type), filter); + return AnsjAnalyzer.getTokenizer(null, args); } - + } diff --git a/plugin/ansj_lucene5_plugin/src/main/java/org/ansj/lucene5/AnsjAnalyzer.java b/plugin/ansj_lucene5_plugin/src/main/java/org/ansj/lucene5/AnsjAnalyzer.java index 8cadb539..e99a2c84 100644 --- a/plugin/ansj_lucene5_plugin/src/main/java/org/ansj/lucene5/AnsjAnalyzer.java +++ b/plugin/ansj_lucene5_plugin/src/main/java/org/ansj/lucene5/AnsjAnalyzer.java @@ -1,27 +1,22 @@ package org.ansj.lucene5; import java.io.BufferedReader; -import java.io.FileNotFoundException; import java.io.StringReader; -import java.io.UnsupportedEncodingException; -import java.util.HashSet; -import java.util.List; -import java.util.Set; +import java.util.Map; import org.ansj.lucene.util.AnsjTokenizer; +import org.ansj.splitWord.Analysis; import org.ansj.splitWord.analysis.BaseAnalysis; import org.ansj.splitWord.analysis.DicAnalysis; import org.ansj.splitWord.analysis.IndexAnalysis; import org.ansj.splitWord.analysis.ToAnalysis; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.Tokenizer; -import org.nlpcn.commons.lang.util.IOUtil; -import org.nlpcn.commons.lang.util.StringUtil; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; +import org.nlpcn.commons.lang.util.logging.Log; +import org.nlpcn.commons.lang.util.logging.LogFactory; public class AnsjAnalyzer extends Analyzer { - public final Logger logger = LoggerFactory.getLogger(getClass()); + public final Log logger = LogFactory.getLog(); /** * dic equals user , query equals to @@ -33,53 +28,23 @@ public static enum TYPE { base, index, query, to, dic, user, search } - /** 自定义停用词 */ - private Set filter; - /** 是否查询分词 */ - private TYPE type; + /** + * 分词类型 + */ + private Map args; /** * @param filter 停用词 */ - public AnsjAnalyzer(TYPE type, Set filter) { - this.type = type; - this.filter = filter; - } - - public AnsjAnalyzer(TYPE type, String stopwordsDir) { - this.type = type; - this.filter = filter(stopwordsDir); - } - - public AnsjAnalyzer(TYPE type) { - this.type = type; - } - - public AnsjAnalyzer(String typeStr) { - this.type = TYPE.valueOf(typeStr); - } - - private Set filter(String stopwordsDir) { - if (StringUtil.isBlank(stopwordsDir)) { - return null; - } - try { - List readFile2List = IOUtil.readFile2List(stopwordsDir, IOUtil.UTF8); - return new HashSet(readFile2List); - } catch (FileNotFoundException e) { - logger.warn("文件没有找到", e); - } catch (UnsupportedEncodingException e) { - logger.warn("编码不支持", e); - } - return null; + public AnsjAnalyzer(Map args) { + this.args = args; } @Override protected TokenStreamComponents createComponents(String text) { BufferedReader reader = new BufferedReader(new StringReader(text)); Tokenizer tokenizer = null; - - tokenizer = getTokenizer(reader, this.type, 
this.filter); + tokenizer = getTokenizer(reader, this.args); return new TokenStreamComponents(tokenizer); } @@ -91,50 +56,36 @@ protected TokenStreamComponents createComponents(String text) { * @param filter * @return */ - public static Tokenizer getTokenizer(BufferedReader reader, TYPE type, Set filter) { - Tokenizer tokenizer; + public static Tokenizer getTokenizer(BufferedReader reader, Map args) { - switch (type) { + Analysis analysis = null; + + switch (AnsjAnalyzer.TYPE.valueOf(args.get("type"))) { case base: - if (reader == null) { - tokenizer = new AnsjTokenizer(new BaseAnalysis(), filter); - } else { - tokenizer = new AnsjTokenizer(new BaseAnalysis(reader), filter); - } + analysis = new BaseAnalysis(); break; case index: - if (reader == null) { - tokenizer = new AnsjTokenizer(new IndexAnalysis(), filter); - } else { - tokenizer = new AnsjTokenizer(new IndexAnalysis(reader), filter); - } + analysis = new IndexAnalysis(); break; case dic: case user: - if (reader == null) { - tokenizer = new AnsjTokenizer(new DicAnalysis(), filter); - } else { - tokenizer = new AnsjTokenizer(new DicAnalysis(reader), filter); - } + analysis = new DicAnalysis(); break; - case to: case query: case search: - if (reader == null) { - tokenizer = new AnsjTokenizer(new ToAnalysis(), filter); - } else { - tokenizer = new AnsjTokenizer(new ToAnalysis(reader), filter); - } + analysis = new ToAnalysis(); break; default: - if (reader == null) { - tokenizer = new AnsjTokenizer(new ToAnalysis(), filter); - } else { - tokenizer = new AnsjTokenizer(new ToAnalysis(reader), filter); - } + analysis = new BaseAnalysis(); } - return tokenizer; + if (reader != null) { + analysis.resetContent(reader); + } + + return new AnsjTokenizer(analysis); + } + } \ No newline at end of file diff --git a/pom.xml b/pom.xml index 28c9c1f0..2549fcff 100644 --- a/pom.xml +++ b/pom.xml @@ -43,6 +43,13 @@ compile + + org.nutz + nutz + 1.r.58 + provided + + junit junit @@ -50,6 +57,7 @@ test + diff --git a/pom_Maven.xml b/pom_Maven.xml deleted file mode 100644 index 48beabb1..00000000 --- a/pom_Maven.xml +++ /dev/null @@ -1,163 +0,0 @@ - - 4.0.0 - org.ansj - ansj_seg - jar - ansj_seg - 5.0.3 - best java chinese word seg ! 
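
Note on the AnsjAnalyzer / AnsjTokenizerFactory change above: configuration now flows through a single args map whose "type" entry names the TYPE constant (base, index, query, to, dic, user, search), and stop-word filtering is no longer done in the tokenizer. A minimal usage sketch, assuming the stripped generics are Map<String, String>; the field name and sample text are placeholders:

import java.util.HashMap;
import java.util.Map;

import org.ansj.lucene5.AnsjAnalyzer;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class AnsjAnalyzerUsageSketch {
    public static void main(String[] argv) throws Exception {
        Map<String, String> args = new HashMap<>();
        args.put("type", "index"); // selects AnsjAnalyzer.TYPE.index

        try (Analyzer analyzer = new AnsjAnalyzer(args);
             TokenStream ts = analyzer.tokenStream("content", "这是一个中文分词的例子")) {
            CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
            ts.reset();
            while (ts.incrementToken()) {
                System.out.println(term.toString());
            }
            ts.end();
        }
    }
}
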
- https://github.com/NLPchina/ansj_seg - - - The Apache Software License, Version 2.0 - http://www.apache.org/licenses/LICENSE-2.0.txt - repo - - - - - scm:git:git@github.com:ansjsun/ansj_seg.git - scm:git:git@github.com:ansjsun/ansj_seg.git - git@github.com:ansjsun/ansj_seg.git - - - - - - ansj - ansj - ansj-sun@163.com - - - - - UTF-8 - - - - - org.nlpcn - nlp-lang - 1.7 - compile - - - - org.slf4j - slf4j-api - 1.7.21 - - - - org.slf4j - slf4j-log4j12 - 1.7.21 - provided - - - - log4j - log4j - 1.2.16 - provided - - - - junit - junit - 4.8.1 - test - - - - - - - - net.orfjackal.retrolambda - retrolambda-maven-plugin - 2.0.6 - - - default - - process-main - - - - - 1.6 - false - false - - - - - maven-compiler-plugin - 2.3.2 - - 1.7 - 1.7 - UTF-8 - - - - - org.apache.maven.plugins - maven-source-plugin - - - attach-sources - - jar-no-fork - - - - - - true - - **/*.java - - - - - org.apache.maven.plugins - maven-javadoc-plugin - 2.10.3 - - -Xdoclint:none - - - - org.apache.maven.plugins - maven-gpg-plugin - 1.4 - - - sign-artifacts - verify - - sign - - - - - - - - - - sonatype-nexus-snapshots - Sonatype Nexus snapshot repository - https://oss.sonatype.org/content/repositories/snapshots - - - - sonatype-nexus-staging - Sonatype Nexus release repository - https://oss.sonatype.org/service/local/staging/deploy/maven2 - - - diff --git a/src/main/java/org/ansj/app/crf/MakeTrainFile.java b/src/main/java/org/ansj/app/crf/MakeTrainFile.java index aa4c3add..33a78f12 100644 --- a/src/main/java/org/ansj/app/crf/MakeTrainFile.java +++ b/src/main/java/org/ansj/app/crf/MakeTrainFile.java @@ -11,6 +11,7 @@ import org.nlpcn.commons.lang.util.IOUtil; import org.nlpcn.commons.lang.util.StringUtil; import org.nlpcn.commons.lang.util.logging.Log; +import org.nlpcn.commons.lang.util.logging.LogFactory; /** * 生成crf 或者是 wapiti的训练语聊工具. 
@@ -22,7 +23,7 @@ */ public class MakeTrainFile { - private static final Log logger = MyStaticValue.getLog(); + private static final Log logger = LogFactory.getLog(); public static void main(String[] args) { @@ -39,8 +40,7 @@ public static void main(String[] args) { logger.info("org.ansj.app.crf.MakeTrainFile [inputPath] [outputPath]"); return; } - try (BufferedReader reader = IOUtil.getReader(inputPath, "utf-8"); - FileOutputStream fos = new FileOutputStream(outputPath)) { + try (BufferedReader reader = IOUtil.getReader(inputPath, "utf-8"); FileOutputStream fos = new FileOutputStream(outputPath)) { String temp = null; int i = 0; while ((temp = reader.readLine()) != null) { diff --git a/src/main/java/org/ansj/app/crf/Model.java b/src/main/java/org/ansj/app/crf/Model.java index 0c6b0fde..be9b4c84 100755 --- a/src/main/java/org/ansj/app/crf/Model.java +++ b/src/main/java/org/ansj/app/crf/Model.java @@ -3,6 +3,7 @@ import java.io.FileNotFoundException; import java.io.FileOutputStream; import java.io.IOException; +import java.io.InputStream; import java.io.ObjectOutputStream; import java.util.Map; import java.util.Map.Entry; @@ -15,12 +16,11 @@ import org.nlpcn.commons.lang.tire.domain.SmartForest; import org.nlpcn.commons.lang.util.MapCount; import org.nlpcn.commons.lang.util.logging.Log; +import org.nlpcn.commons.lang.util.logging.LogFactory; public abstract class Model { - public static final Log logger = MyStaticValue.getLog() ; - - protected String name; + public static final Log logger = LogFactory.getLog(Model.class); protected Config config; @@ -30,10 +30,6 @@ public abstract class Model { public int allFeatureCount = 0; - public Model(String name) { - this.name = name; - }; - /** * 判断当前数据流是否是本实例 * @@ -50,32 +46,44 @@ public Model(String name) { * @return * @throws Exception */ - public static Model load(String name, String modelPath) throws Exception { - Model model = new CRFModel(name); + public static Model load(String modelPath) throws Exception { + Model model = new CRFModel(); if (model.checkModel(modelPath)) { - model.loadModel(modelPath); - return model; + return model.loadModel(modelPath); } - model = new CRFppTxtModel(name); + model = new CRFppTxtModel(); if (model.checkModel(modelPath)) { - model.loadModel(modelPath); - return model; + return model.loadModel(modelPath); } - model = new WapitiCRFModel(name); + model = new WapitiCRFModel(); if (model.checkModel(modelPath)) { - model.loadModel(modelPath); - return model; + return model.loadModel(modelPath); } throw new Exception("I did not know what type of model by file " + modelPath); } + /** + * 模型读取 + * + * @param path + * @return + * @return + * @throws Exception + */ + public static Model load(Class c, InputStream is) throws Exception { + Model model = c.newInstance(); + return model.loadModel(is); + } + /** * 不同的模型实现自己的加载模型类 * * @throws Exception */ - public abstract void loadModel(String modelPath) throws Exception; + public abstract Model loadModel(String modelPath) throws Exception; + + public abstract Model loadModel(InputStream is) throws Exception; /** * 获得特征所在权重数组 @@ -95,10 +103,6 @@ public float[] getFeature(char... 
chars) { return sf.getParam(); } - public String getName() { - return this.name; - }; - public Config getConfig() { return this.config; } @@ -125,12 +129,10 @@ protected static void printFeatureTree(String cs, float[] tempW) { if (tempW.length == 4) { name = "U"; } - name += "*" + ((int) cs.charAt(cs.length() - 1) - Config.FEATURE_BEGIN + 1) + ":" - + cs.substring(0, cs.length() - 1); + name += "*" + ((int) cs.charAt(cs.length() - 1) - Config.FEATURE_BEGIN + 1) + ":" + cs.substring(0, cs.length() - 1); for (int i = 0; i < tempW.length; i++) { if (tempW[i] != 0) { - System.out.println( - name + "\t" + Config.getTagName(i / 4 - 1) + "\t" + Config.getTagName(i % 4) + "\t" + tempW[i]); + System.out.println(name + "\t" + Config.getTagName(i / 4 - 1) + "\t" + Config.getTagName(i % 4) + "\t" + tempW[i]); } } @@ -172,9 +174,9 @@ public void writeModel(String path) { oos.writeInt(0); oos.flush(); } catch (FileNotFoundException e) { - logger.warn("文件没有找到",e); + logger.warn("文件没有找到", e); } catch (IOException e) { - logger.warn("IO异常",e); + logger.warn("IO异常", e); } } } \ No newline at end of file diff --git a/src/main/java/org/ansj/app/crf/model/CRFModel.java b/src/main/java/org/ansj/app/crf/model/CRFModel.java index 22575cee..086c0824 100644 --- a/src/main/java/org/ansj/app/crf/model/CRFModel.java +++ b/src/main/java/org/ansj/app/crf/model/CRFModel.java @@ -23,16 +23,16 @@ public class CRFModel extends Model { public static final String version = "ansj1"; - public CRFModel(String name) { - super(name); - } - @Override - public void loadModel(String modelPath) throws Exception { - loadModel(IOUtil.getInputStream(modelPath)); + public CRFModel loadModel(String modelPath) throws Exception { + try (InputStream is = IOUtil.getInputStream(modelPath)) { + loadModel(is); + return this; + } } - public void loadModel(InputStream is) throws Exception { + @Override + public CRFModel loadModel(InputStream is) throws Exception { long start = System.currentTimeMillis(); try (ObjectInputStream ois = new ObjectInputStream(new GZIPInputStream(is))) { ois.readUTF(); @@ -58,6 +58,7 @@ public void loadModel(InputStream is) throws Exception { } while (win == 0 || size == 0); logger.info("load crf model ok ! use time :" + (System.currentTimeMillis() - start)); } + return this; } @Override diff --git a/src/main/java/org/ansj/app/crf/model/CRFppModel.java b/src/main/java/org/ansj/app/crf/model/CRFppModel.java deleted file mode 100644 index 833034b8..00000000 --- a/src/main/java/org/ansj/app/crf/model/CRFppModel.java +++ /dev/null @@ -1,77 +0,0 @@ -//package org.ansj.app.crf.model; -// -//import java.io.DataInputStream; -//import java.io.FileInputStream; -// -//import org.ansj.app.crf.Model; -// -///** -// * 加载CRF+生成的crf二进制模型,测试使用的CRF++版本为:CRF++-0.58 -// * -// * 下载地址:https://taku910.github.io/crfpp/#download 在这里感谢作者所做的工作. 
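
The Model refactor above drops the name field and makes loading chainable: load(String) still probes the three on-disk formats (CRFModel, CRFppTxtModel, WapitiCRFModel), while the new load(Class, InputStream) reads a known format straight from a stream. A short sketch with placeholder paths (the generic bound on the Class parameter is stripped in this rendering):

import java.io.InputStream;

import org.ansj.app.crf.Model;
import org.ansj.app.crf.SplitWord;
import org.ansj.app.crf.model.CRFModel;
import org.nlpcn.commons.lang.util.IOUtil;

public class CrfModelLoadSketch {
    public static void main(String[] args) throws Exception {
        // let Model.load probe the model format from a file path
        Model probed = Model.load("library/crf.model");
        System.out.println("loaded " + probed.getClass().getSimpleName());

        // load a known format directly from an InputStream
        try (InputStream is = IOUtil.getInputStream("library/crf.model")) {
            SplitWord crf = new SplitWord(Model.load(CRFModel.class, is));
            // the SplitWord wrapper is what CrfLibrary (added later in this diff) hands to the segmenter
        }
    }
}
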
-// * -// * @author Ansj -// * -// */ -//public class CRFppModel extends Model { -// -// public CRFppModel(String name) { -// super(name); -// } -// -// /** -// * 解析crf++生成的可可视文件 -// */ -// public void loadModel(String modelPath) throws Exception { -// -// FileInputStream fileInputStream = new FileInputStream(modelPath); -// -// DataInputStream dis = new DataInputStream(fileInputStream); -// -// System.out.println(); -// -// } -// -// public static void main(String[] args) throws Exception { -// new CRFppModel("test").loadModel("/Users/sunjian/Documents/src/CRF++-0.58/test/model"); -// -// // System.out.println("---------------------------"); -// // -// // int u = 1; -// // -// // byte[] b = new byte[4]; -// // -// // b[0] = (byte) (u); -// // b[1] = (byte) (u >> 8); -// // b[2] = (byte) (u >> 16); -// // b[3] = (byte) (u >> 24); -// // -// // System.out.println(Arrays.toString(b)); -// // -// // System.out.println("---------------------------"); -// // -// // b = new byte[4]; -// // -// // b[0] = -72; -// // b[1] = 36; -// // b[2] = 86; -// // b[3] = 0; -// // -// // System.out.println((int) (b[0] | b[1] << 8 | b[2] << 16 | b[3] << -// // 24)); -// // -// // System.out.println((char) 66); -// // System.out.println((char) 69); -// // System.out.println((char) 77); -// // System.out.println((char) 83); -// // System.out.println((char) 104); -// // -// // System.out.println((char) 85); -// // System.out.println((char) 48); -// // System.out.println((char) 49); -// // System.out.println((char) 58); -// // System.out.println((char) 37); -// // System.out.println((char) 120); -// } -// -//} diff --git a/src/main/java/org/ansj/app/crf/model/CRFppTxtModel.java b/src/main/java/org/ansj/app/crf/model/CRFppTxtModel.java index 4288231c..8ef2d3bf 100644 --- a/src/main/java/org/ansj/app/crf/model/CRFppTxtModel.java +++ b/src/main/java/org/ansj/app/crf/model/CRFppTxtModel.java @@ -1,6 +1,7 @@ package org.ansj.app.crf.model; import java.io.BufferedReader; +import java.io.FileInputStream; import java.io.IOException; import java.io.InputStream; import java.util.ArrayList; @@ -28,18 +29,23 @@ */ public class CRFppTxtModel extends Model { - public CRFppTxtModel(String name) { - super(name); - } - /** * 解析crf++生成的可可视txt文件 + * + * @return */ - public void loadModel(String modelPath) throws Exception { + public CRFppTxtModel loadModel(String modelPath) throws Exception { + try (InputStream is = new FileInputStream(modelPath)) { + loadModel(new FileInputStream(modelPath)); + return this; + } + } + @Override + public Model loadModel(InputStream is) throws Exception { long start = System.currentTimeMillis(); - BufferedReader reader = IOUtil.getReader(modelPath, IOUtil.UTF8); + BufferedReader reader = IOUtil.getReader(is, IOUtil.UTF8); reader.readLine();// version reader.readLine();// cost-factor @@ -54,11 +60,12 @@ public void loadModel(String modelPath) throws Exception { for (int[] t1 : config.getTemplate()) { sb.append(Arrays.toString(t1) + " "); } - logger.info("load template ok template : "+ sb); + logger.info("load template ok template : " + sb); TreeMap> featureNames = loadFeatureName(featureIndex, reader); - logger.info("load feature ok feature size : "+ featureNames.size()); + logger.info("load feature ok feature size : " + featureNames.size()); loadFeatureWeight(reader, statusCoven, featureNames); - logger.info("load crfpp model ok ! use time : "+ (System.currentTimeMillis() - start)); + logger.info("load crfpp model ok ! 
use time : " + (System.currentTimeMillis() - start)); + return this; } /** @@ -309,4 +316,5 @@ public boolean checkModel(String modelPath) { } return false; } + } diff --git a/src/main/java/org/ansj/app/crf/model/WapitiCRFModel.java b/src/main/java/org/ansj/app/crf/model/WapitiCRFModel.java index 0269180a..6a6e0da9 100644 --- a/src/main/java/org/ansj/app/crf/model/WapitiCRFModel.java +++ b/src/main/java/org/ansj/app/crf/model/WapitiCRFModel.java @@ -27,13 +27,14 @@ */ public class WapitiCRFModel extends Model { - public WapitiCRFModel(String name) { - super(name); + public WapitiCRFModel loadModel(String modelPath) throws Exception { + try (InputStream is = IOUtil.getInputStream(modelPath)) { + return loadModel(is); + } } - public void loadModel(String modelPath) throws Exception { - - BufferedReader br = IOUtil.getReader(modelPath, IOUtil.UTF8); + public WapitiCRFModel loadModel(InputStream is) throws Exception { + BufferedReader br = IOUtil.getReader(is, IOUtil.UTF8); long start = System.currentTimeMillis(); @@ -50,21 +51,21 @@ public void loadModel(String modelPath) throws Exception { sb.append(Arrays.toString(t1) + " "); } - logger.info("featureIndex is "+ featureIndex); - logger.info("load template ok template : "+ sb); + logger.info("featureIndex is " + featureIndex); + logger.info("load template ok template : " + sb); int[] statusCoven = loadTagCoven(br); List> loadFeatureName = loadFeatureName(featureIndex, br); - logger.info("load feature ok feature size : "+ loadFeatureName.size()); + logger.info("load feature ok feature size : " + loadFeatureName.size()); featureTree = new SmartForest(); loadFeatureWeight(br, statusCoven, loadFeatureName); - logger.info("load wapiti model ok ! use time :"+ (System.currentTimeMillis() - start)); - + logger.info("load wapiti model ok ! use time :" + (System.currentTimeMillis() - start)); + return this; } /** @@ -75,8 +76,7 @@ public void loadModel(String modelPath) throws Exception { * @param statusCoven * @throws Exception */ - private void loadFeatureWeight(BufferedReader br, int[] statusCoven, List> featureNames) - throws Exception { + private void loadFeatureWeight(BufferedReader br, int[] statusCoven, List> featureNames) throws Exception { int key = 0; @@ -97,14 +97,13 @@ private void loadFeatureWeight(BufferedReader br, int[] statusCoven, List pair : featureNames) { if (temp == null) { - logger.warn(pair.getValue0()+"\t"+pair.getValue1()+" not have any weight ,so skip it !"); + logger.warn(pair.getValue0() + "\t" + pair.getValue1() + " not have any weight ,so skip it !"); continue; } char fc = Character.toUpperCase(pair.getValue0().charAt(0)); - len = fc == 'B' ? Config.TAG_NUM * Config.TAG_NUM - : fc == 'U' ? Config.TAG_NUM : fc == '*' ? (Config.TAG_NUM + Config.TAG_NUM * Config.TAG_NUM) : 0; + len = fc == 'B' ? Config.TAG_NUM * Config.TAG_NUM : fc == 'U' ? Config.TAG_NUM : fc == '*' ? 
(Config.TAG_NUM + Config.TAG_NUM * Config.TAG_NUM) : 0; if (len == 0) { throw new Exception("unknow feature type " + pair.getValue0()); @@ -162,8 +161,7 @@ private void loadFeatureWeight(BufferedReader br, int[] statusCoven, List> loadFeatureName(Map featureIndex, BufferedReader br) - throws Exception { + private List> loadFeatureName(Map featureIndex, BufferedReader br) throws Exception { String temp = br.readLine();// #qrk#num int featureNum = ObjConver.getIntValue(StringUtil.matcherFirst("\\d+", temp)); // 找到特征个数 @@ -328,9 +326,9 @@ private Map loadConfig(BufferedReader br) throws IOException { } @Override - public boolean checkModel(String modelPath){ + public boolean checkModel(String modelPath) { - try (InputStream is = IOUtil.getInputStream(modelPath)){ + try (InputStream is = IOUtil.getInputStream(modelPath)) { byte[] bytes = new byte[100]; is.read(bytes); diff --git a/src/main/java/org/ansj/dic/DicReader.java b/src/main/java/org/ansj/dic/DicReader.java index 75add540..cc7abfca 100644 --- a/src/main/java/org/ansj/dic/DicReader.java +++ b/src/main/java/org/ansj/dic/DicReader.java @@ -5,8 +5,8 @@ import java.io.InputStreamReader; import java.io.UnsupportedEncodingException; -import org.ansj.util.MyStaticValue; import org.nlpcn.commons.lang.util.logging.Log; +import org.nlpcn.commons.lang.util.logging.LogFactory; /** * 加载词典用的类 @@ -15,7 +15,7 @@ */ public class DicReader { - private static final Log logger = MyStaticValue.getLog() ; + private static final Log logger = LogFactory.getLog(); public static BufferedReader getReader(String name) { // maven工程修改词典加载方式 diff --git a/src/main/java/org/ansj/dic/PathToStream.java b/src/main/java/org/ansj/dic/PathToStream.java new file mode 100644 index 00000000..c77414f2 --- /dev/null +++ b/src/main/java/org/ansj/dic/PathToStream.java @@ -0,0 +1,43 @@ +package org.ansj.dic; + +import java.io.InputStream; + +import org.ansj.dic.impl.File2Stream; +import org.ansj.dic.impl.Jar2Stream; +import org.ansj.dic.impl.Jdbc2Stream; +import org.ansj.dic.impl.Url2Stream; +import org.ansj.exception.LibraryException; + +/** + * 将路径转换为流,如果你需要实现自己的加载器请实现这个类,使用这个类可能需要自己依赖第三方包,比如jdbc连接和nutz + * + * @author ansj + * + */ +public abstract class PathToStream { + + public static InputStream stream(String path) { + try { + if (path.startsWith("file://")) { + return new File2Stream().toStream(path); + } else if (path.startsWith("jdbc://")) { + return new Jdbc2Stream().toStream(path); + } else if (path.startsWith("jar://")) { + return new Jar2Stream().toStream(path); + } else if (path.startsWith("class://")) { + ((PathToStream) Class.forName(path.substring(8).split("\\|")[0]).newInstance()).toStream(path); + } else if (path.startsWith("url://")) { + return new Url2Stream().toStream(path); + } else { + return new File2Stream().toStream(path); + } + } catch (Exception e) { + e.printStackTrace(); + throw new LibraryException(e); + } + throw new LibraryException("not find method type in path " + path); + } + + public abstract InputStream toStream(String path); + +} diff --git a/src/main/java/org/ansj/dic/impl/File2Stream.java b/src/main/java/org/ansj/dic/impl/File2Stream.java new file mode 100644 index 00000000..d868de95 --- /dev/null +++ b/src/main/java/org/ansj/dic/impl/File2Stream.java @@ -0,0 +1,95 @@ +package org.ansj.dic.impl; + +import java.io.File; +import java.io.FileInputStream; +import java.io.FileNotFoundException; +import java.io.FilenameFilter; +import java.io.InputStream; +import java.io.SequenceInputStream; +import java.util.Vector; + +import 
org.ansj.dic.PathToStream; +import org.ansj.exception.LibraryException; +import org.nlpcn.commons.lang.util.logging.Log; +import org.nlpcn.commons.lang.util.logging.LogFactory; + +/** + * 将文件转换为流 file://c:/dic.txt + * + * @author ansj + * + */ +public class File2Stream extends PathToStream { + + private static final Log LOG = LogFactory.getLog(File2Stream.class); + + @Override + public InputStream toStream(String path) { + LOG.info("path to stream " + path); + + if (path.startsWith("file://")) { + path = path.substring(7); + } + + File file = new File(path); + + if (file.exists() && file.canRead()) { + + try { + if (file.isDirectory()) { + return multiple(path); + } else { + return new FileInputStream(file); + } + } catch (Exception e) { + throw new LibraryException(e); + } + } + throw new LibraryException("file " + path + " not found or can not to read"); + + } + + private InputStream multiple(String path) throws FileNotFoundException { + File[] libs = new File[0]; + + File file = new File(path); + + if (file.exists() && file.canRead()) { + if (file.isFile()) { + libs = new File[1]; + libs[0] = file; + } else if (file.isDirectory()) { + File[] files = file.listFiles(new FilenameFilter() { + @Override + public boolean accept(File dir, String name) { + if (dir.canRead() && !dir.isHidden() && !dir.isDirectory()) { + return true; + } else { + return false; + } + } + }); + if (files != null && files.length > 0) { + libs = files; + } + } + } + + if (libs.length == 0) { + throw new LibraryException("not find any file in path : " + path); + } + + if (libs.length == 1) { + return new FileInputStream(libs[0]); + } + + Vector vector = new Vector<>(libs.length); + + for (int i = 0; i < libs.length; i++) { + vector.add(new FileInputStream(libs[i])); + } + + return new SequenceInputStream(vector.elements()); + } + +} diff --git a/src/main/java/org/ansj/dic/impl/Jar2Stream.java b/src/main/java/org/ansj/dic/impl/Jar2Stream.java new file mode 100644 index 00000000..4a48687f --- /dev/null +++ b/src/main/java/org/ansj/dic/impl/Jar2Stream.java @@ -0,0 +1,21 @@ +package org.ansj.dic.impl; + +import java.io.InputStream; + +import org.ansj.dic.DicReader; +import org.ansj.dic.PathToStream; + +/** + * 从系统jar包中读取文件,你们不能用,只有我能用 jar:// + * + * @author ansj + * + */ +public class Jar2Stream extends PathToStream { + + @Override + public InputStream toStream(String path) { + return DicReader.getInputStream(path.substring(6)); + } + +} diff --git a/src/main/java/org/ansj/dic/impl/Jdbc2Stream.java b/src/main/java/org/ansj/dic/impl/Jdbc2Stream.java new file mode 100644 index 00000000..1ac5f65d --- /dev/null +++ b/src/main/java/org/ansj/dic/impl/Jdbc2Stream.java @@ -0,0 +1,91 @@ +package org.ansj.dic.impl; + +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.io.InputStream; +import java.sql.Connection; +import java.sql.ResultSet; +import java.sql.SQLException; + +import org.ansj.dic.PathToStream; +import org.ansj.exception.LibraryException; +import org.nutz.dao.Dao; +import org.nutz.dao.Sqls; +import org.nutz.dao.impl.NutDao; +import org.nutz.dao.impl.SimpleDataSource; +import org.nutz.dao.sql.Sql; +import org.nutz.dao.sql.SqlCallback; + +/** + * jdbc:mysql://192.168.10.103:3306/infcn_mss?useUnicode=true&characterEncoding=utf-8&zeroDateTimeBehavior=convertToNull|username|password|select name as name,nature,freq from dic where type=1 + * + * @author ansj + * + */ +public class Jdbc2Stream extends PathToStream { + + private static final byte[] TAB = 
"\t".getBytes(); + + private static final byte[] LINE = "\n".getBytes(); + + @Override + public InputStream toStream(String path) { + path = path.substring(7); + + String[] split = path.split("\\|"); + + String jdbc = split[0]; + + String username = split[1]; + + String password = split[2]; + + String sqlStr = split[3]; + + SimpleDataSource ds = null; + + try { + ds = new SimpleDataSource(); + + ds.setJdbcUrl(jdbc); + ds.setUsername(username); + ds.setPassword(password); + + Dao dao = new NutDao(ds); + + Sql sql = Sqls.create(sqlStr); + + Sql execute = dao.execute(sql.setCallback(new SqlCallback() { + @Override + public byte[] invoke(Connection conn, ResultSet rs, Sql sql) throws SQLException { + ByteArrayOutputStream baos = new ByteArrayOutputStream(100 * 1024); + while (rs.next()) { + try { + baos.write(rs.getString(0).getBytes()); + baos.write(TAB); + baos.write(rs.getString(1).getBytes()); + baos.write(TAB); + baos.write(rs.getString(2).getBytes()); + baos.write(LINE); + } catch (IOException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + } + return baos.toByteArray(); + } + })); + + return new ByteArrayInputStream((byte[]) execute.getResult()); + } catch (Exception e) { + throw new LibraryException(e); + } finally { + if (ds != null) { + ds.close(); + } + } + + } + +} diff --git a/src/main/java/org/ansj/dic/impl/Url2Stream.java b/src/main/java/org/ansj/dic/impl/Url2Stream.java new file mode 100644 index 00000000..958c9de0 --- /dev/null +++ b/src/main/java/org/ansj/dic/impl/Url2Stream.java @@ -0,0 +1,24 @@ +package org.ansj.dic.impl; + +import java.io.InputStream; + +import org.ansj.dic.PathToStream; +import org.nutz.http.Http; +import org.nutz.http.Response; + +/** + * url://http://maven.nlpcn.org/down/library/default.dic + * + * @author ansj + * + */ +public class Url2Stream extends PathToStream { + + @Override + public InputStream toStream(String path) { + path = path.substring(6); + Response response = Http.get(path); + return response.getStream(); + } + +} diff --git a/src/main/java/org/ansj/domain/KV.java b/src/main/java/org/ansj/domain/KV.java new file mode 100644 index 00000000..e1eda4d6 --- /dev/null +++ b/src/main/java/org/ansj/domain/KV.java @@ -0,0 +1,17 @@ +package org.ansj.domain; + +public class KV { + + private K k; + + private V v; + + private KV(K k, V v) { + this.k = k; + this.v = v; + } + + public static KV with(K k, V v) { + return new KV(k, v); + } +} diff --git a/src/main/java/org/ansj/exception/LibraryException.java b/src/main/java/org/ansj/exception/LibraryException.java new file mode 100644 index 00000000..a182ed89 --- /dev/null +++ b/src/main/java/org/ansj/exception/LibraryException.java @@ -0,0 +1,15 @@ +package org.ansj.exception; + +public class LibraryException extends RuntimeException { + + private static final long serialVersionUID = 1L; + + public LibraryException(Exception e) { + super(e); + } + + public LibraryException(String message) { + super(message); + } + +} diff --git a/src/main/java/org/ansj/library/CrfLibrary.java b/src/main/java/org/ansj/library/CrfLibrary.java new file mode 100644 index 00000000..c0d91a1c --- /dev/null +++ b/src/main/java/org/ansj/library/CrfLibrary.java @@ -0,0 +1,118 @@ +package org.ansj.library; + +import java.io.InputStream; +import java.util.HashMap; +import java.util.Map; +import java.util.Set; + +import org.ansj.app.crf.Model; +import org.ansj.app.crf.SplitWord; +import org.ansj.app.crf.model.CRFModel; +import org.ansj.dic.PathToStream; +import org.nlpcn.commons.lang.tire.domain.Forest; 
+import org.nlpcn.commons.lang.util.logging.Log; +import org.nlpcn.commons.lang.util.logging.LogFactory; +import org.nlpcn.commons.lang.util.tuples.KeyValue; + +public class CrfLibrary { + + private static final Log LOG = LogFactory.getLog(); + + // CRF模型 + private static final Map> CRF = new HashMap<>(); + + public static final String DEFAULT = "crf_"; + + /** + * 根据key获取crf分词器 + * + * @param key + * @return crf分词器 + */ + public static SplitWord crf(String key) { + KeyValue kv = CRF.get(fix(key)); + + if (kv == null) { + LOG.warn("crf " + key + " not found in config "); + return null; + } + + SplitWord sw = (SplitWord) kv.getValue(); + if (sw == null) { + sw = initCRFModel(kv); + } + return sw; + } + + /** + * 加载CRF模型 + * + * @param modelPath + * @return + */ + private static synchronized SplitWord initCRFModel(KeyValue kv) { + try { + if (kv.getValue() != null) { + return kv.getValue(); + } + + long start = System.currentTimeMillis(); + LOG.info("begin init crf model!"); + try (InputStream is = PathToStream.stream(kv.getKey())) { + SplitWord crfSplitWord = new SplitWord(Model.load(CRFModel.class, is)); + kv.setValue(crfSplitWord); + LOG.info("load crf use time:" + (System.currentTimeMillis() - start) + " path is : " + kv.getKey()); + return crfSplitWord; + } + } catch (Exception e) { + LOG.error(kv + " load err " + e.getMessage()); + return null; + } + } + + /** + * 动态添加 + * + * @param dicDefault + * @param dicDefault2 + * @param dic2 + */ + public static void put(String key, String path) { + put(key, path, null); + } + + public static void put(String key, String path, SplitWord sw) { + CRF.put(key, KeyValue.with(path, sw)); + } + + /** + * 删除一个key + * + * @param key + * @return + */ + public static KeyValue remove(String key) { + return CRF.remove(key); + } + + /** + * 刷新一个,将值设置为null + * @param key + * @return + */ + public static KeyValue flush(String key) { + CRF.get(key).setValue(null); + } + + public static Set keys() { + return CRF.keySet(); + } + + private static String fix(String key) { + if (key.startsWith(DEFAULT)) { + return key; + } else { + return DEFAULT + key; + } + } +} diff --git a/src/main/java/org/ansj/library/DATDictionary.java b/src/main/java/org/ansj/library/DATDictionary.java index 94c0c69d..11b51b30 100644 --- a/src/main/java/org/ansj/library/DATDictionary.java +++ b/src/main/java/org/ansj/library/DATDictionary.java @@ -11,14 +11,14 @@ import org.ansj.domain.TermNature; import org.ansj.domain.TermNatures; import org.ansj.library.name.PersonAttrLibrary; -import org.ansj.util.MyStaticValue; import org.nlpcn.commons.lang.dat.DoubleArrayTire; import org.nlpcn.commons.lang.dat.Item; import org.nlpcn.commons.lang.util.logging.Log; +import org.nlpcn.commons.lang.util.logging.LogFactory; public class DATDictionary { - private static final Log logger = MyStaticValue.getLog(); + private static final Log LOG = LogFactory.getLog(DATDictionary.class); /** * 所有在词典中出现的词,并且承担简繁体转换的任务. 
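
The CrfLibrary registry above resolves model paths through the new PathToStream dispatcher (file://, jar://, url://, jdbc://, class://) and loads models lazily on first use. A usage sketch with placeholder paths and keys; note that bare keys are prefixed with "crf_" on lookup:

import java.io.InputStream;

import org.ansj.app.crf.SplitWord;
import org.ansj.dic.PathToStream;
import org.ansj.library.CrfLibrary;

public class PathAndCrfLibrarySketch {
    public static void main(String[] args) throws Exception {
        // resolve a path expression to a stream; the prefix selects the loader implementation
        try (InputStream is = PathToStream.stream("file://library/default.dic")) {
            // consume the dictionary stream ...
        }

        // register a CRF model by path; it is loaded on the first crf(...) call
        CrfLibrary.put("crf_weibo", "file://library/weibo.model");
        SplitWord crf = CrfLibrary.crf("weibo"); // looked up as "crf_weibo"
    }
}
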
@@ -64,16 +64,16 @@ private static DoubleArrayTire loadDAT() { } // 特殊字符标准化 IN_SYSTEM['%'] = '%'; - logger.info("init core library ok use time : " + (System.currentTimeMillis() - start)); + LOG.info("init core library ok use time : " + (System.currentTimeMillis() - start)); return dat; } catch (InstantiationException e) { - MyStaticValue.LIBRARYLOG.warn("无法实例化", e); + LOG.warn("无法实例化", e); } catch (IllegalAccessException e) { - MyStaticValue.LIBRARYLOG.warn("非法访问", e); + LOG.warn("非法访问", e); } catch (NumberFormatException e) { - MyStaticValue.LIBRARYLOG.warn("数字格式异常", e); + LOG.warn("数字格式异常", e); } catch (IOException e) { - MyStaticValue.LIBRARYLOG.warn("IO异常", e); + LOG.warn("IO异常", e); } return null; @@ -146,7 +146,7 @@ public static AnsjItem getItem(int index) { public static AnsjItem getItem(String str) { AnsjItem item = DAT.getItem(str); - if (item == null || item.getStatus()<2) { + if (item == null || item.getStatus() < 2) { return AnsjItem.NULL; } diff --git a/src/main/java/org/ansj/library/DicLibrary.java b/src/main/java/org/ansj/library/DicLibrary.java new file mode 100644 index 00000000..1b85e0be --- /dev/null +++ b/src/main/java/org/ansj/library/DicLibrary.java @@ -0,0 +1,224 @@ +package org.ansj.library; + +import java.io.BufferedReader; +import java.util.HashMap; +import java.util.Map; +import java.util.Set; + +import org.ansj.dic.PathToStream; +import org.ansj.util.MyStaticValue; +import org.nlpcn.commons.lang.tire.domain.Forest; +import org.nlpcn.commons.lang.tire.domain.Value; +import org.nlpcn.commons.lang.tire.library.Library; +import org.nlpcn.commons.lang.util.IOUtil; +import org.nlpcn.commons.lang.util.StringUtil; +import org.nlpcn.commons.lang.util.logging.Log; +import org.nlpcn.commons.lang.util.logging.LogFactory; +import org.nlpcn.commons.lang.util.tuples.KeyValue; + +public class DicLibrary { + + private static final Log LOG = LogFactory.getLog(); + + public static final String DEFAULT = "dic_"; + + public static final String DEFAULT_NATURE = "userDefine"; + + public static final Integer DEFAULT_FREQ = 1000; + + public static final String DEFAULT_FREQ_STR = "1000"; + + // 用户自定义词典 + private static final Map> DIC = new HashMap<>(); + + /** + * 关键词增加 + * + * @param keyword 所要增加的关键词 + * @param nature 关键词的词性 + * @param freq 关键词的词频 + */ + public static void insert(String key, String keyword, String nature, int freq) { + Forest dic = get(key); + + if (dic == null) { + dic = putIfAbsent(DEFAULT, DEFAULT, new Forest()); + } + + String[] paramers = new String[2]; + paramers[0] = nature; + paramers[1] = String.valueOf(freq); + Value value = new Value(keyword, paramers); + Library.insertWord(dic, value); + } + + /** + * 增加关键词 + * + * @param keyword + */ + public static void insert(String key, String keyword) { + insert(key, keyword, DEFAULT_NATURE, DEFAULT_FREQ); + } + + /** + * 删除关键词 + */ + public static void remove(String key, String word) { + Forest dic = get(key); + if (dic != null) { + Library.removeWord(dic, word); + } + } + + /** + * 将用户自定义词典清空 + */ + public static void clear(String key) { + get(key).clear(); + } + + public static Forest get() { + return get(DEFAULT); + } + + /** + * 根据模型名称获取crf模型 + * + * @param modelName + * @return + */ + public static Forest get(String key) { + KeyValue kv = DIC.get(fix(key)); + + if (kv == null) { + LOG.warn("dic " + key + " not found in config "); + return null; + } + Forest forest = kv.getValue(); + if (forest == null) { + forest = init(kv); + } + return forest; + + } + + /** + * 用户自定义词典加载 + * + * @param key + * @param path + * 
@return + */ + + private synchronized static Forest init(KeyValue kv) { + Forest forest = kv.getValue(); + if (forest != null) { + return forest; + } + try { + forest = new Forest(); + LOG.info("begin init dic !"); + long start = System.currentTimeMillis(); + String temp = null; + String[] strs = null; + Value value = null; + try (BufferedReader br = IOUtil.getReader(PathToStream.stream(kv.getKey()), "UTF-8")) { + while ((temp = br.readLine()) != null) { + if (StringUtil.isNotBlank(temp)) { + temp = StringUtil.trim(temp); + strs = temp.split("\t"); + strs[0] = strs[0].toLowerCase(); + // 如何核心辞典存在那么就放弃 + if (MyStaticValue.isSkipUserDefine && DATDictionary.getId(strs[0]) > 0) { + continue; + } + if (strs.length != 3) { + value = new Value(strs[0], DEFAULT_NATURE, DEFAULT_FREQ_STR); + } else { + value = new Value(strs[0], strs[1], strs[2]); + } + Library.insertWord(forest, value); + } + } + } + LOG.info("load dic use time:" + (System.currentTimeMillis() - start) + " path is : " + kv.getKey()); + kv.setValue(forest); + return forest; + } catch (Exception e) { + LOG.error("Init ambiguity library error :" + e.getMessage() + ", path: " + kv.getKey()); + return null; + } + } + + /** + * 动态添加词典 + * + * @param dicDefault + * @param dicDefault2 + * @param dic2 + */ + public static void put(String key, String path, Forest forest) { + DIC.put(key, KeyValue.with(path, forest)); + } + + /** + * 动态添加词典 + * + * @param dicDefault + * @param dicDefault2 + * @param dic2 + */ + public static void putIfAbsent(String key, String path) { + if (!DIC.containsKey(key)) { + DIC.put(key, KeyValue.with(path, (Forest) null)); + } + } + + /** + * 动态添加词典 + * + * @param dicDefault + * @param dicDefault2 + * @param dic2 + */ + public static void put(String key, String path) { + put(key, path, null); + } + + /** + * 动态添加词典 + * + * @param + * @param + * + * @param dicDefault + * @param dicDefault2 + * @param dic2 + */ + public static synchronized Forest putIfAbsent(String key, String path, Forest forest) { + KeyValue kv = DIC.get(key); + if (kv != null && kv.getValue() != null) { + return kv.getValue(); + } + put(key, path, forest); + return forest; + } + + public static KeyValue remove(String key) { + return DIC.remove(key); + } + + public static Set keys() { + return DIC.keySet(); + } + + private static String fix(String key) { + if (key.startsWith(DEFAULT)) { + return key; + } else { + return DEFAULT + key; + } + } + +} diff --git a/src/main/java/org/ansj/library/NatureLibrary.java b/src/main/java/org/ansj/library/NatureLibrary.java index 79acb92c..fafcab0d 100644 --- a/src/main/java/org/ansj/library/NatureLibrary.java +++ b/src/main/java/org/ansj/library/NatureLibrary.java @@ -9,6 +9,7 @@ import org.ansj.util.MyStaticValue; import org.nlpcn.commons.lang.util.StringUtil; import org.nlpcn.commons.lang.util.logging.Log; +import org.nlpcn.commons.lang.util.logging.LogFactory; /** * 这里封装了词性和词性之间的关系.以及词性的索引.这是个好东西. 里面数组是从ict里面找来的. 
不是很新.没有语料无法训练 @@ -18,7 +19,7 @@ */ public class NatureLibrary { - private static final Log logger = MyStaticValue.getLog(); + private static final Log logger = LogFactory.getLog(NatureLibrary.class); private static final int YI = 1; private static final int FYI = -1; diff --git a/src/main/java/org/ansj/library/NgramLibrary.java b/src/main/java/org/ansj/library/NgramLibrary.java index 4cb99478..06080645 100644 --- a/src/main/java/org/ansj/library/NgramLibrary.java +++ b/src/main/java/org/ansj/library/NgramLibrary.java @@ -2,6 +2,7 @@ import org.ansj.domain.Term; import org.ansj.util.MyStaticValue; +import org.nlpcn.commons.lang.util.logging.LogFactory; /** * 两个词之间的关联 @@ -13,7 +14,7 @@ public class NgramLibrary { static { long start = System.currentTimeMillis(); MyStaticValue.initBigramTables(); - MyStaticValue.LIBRARYLOG.info("init ngram ok use time :" + (System.currentTimeMillis() - start)); + LogFactory.getLog(NgramLibrary.class).info("init ngram ok use time :" + (System.currentTimeMillis() - start)); } /** diff --git a/src/main/java/org/ansj/library/UserDefineLibrary.java b/src/main/java/org/ansj/library/UserDefineLibrary.java index 22857fa0..b47f5cf3 100644 --- a/src/main/java/org/ansj/library/UserDefineLibrary.java +++ b/src/main/java/org/ansj/library/UserDefineLibrary.java @@ -1,18 +1,18 @@ package org.ansj.library; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.HashSet; +import java.util.List; +import java.util.Set; + import org.ansj.util.MyStaticValue; import org.nlpcn.commons.lang.tire.domain.Forest; import org.nlpcn.commons.lang.tire.domain.SmartForest; import org.nlpcn.commons.lang.tire.domain.Value; import org.nlpcn.commons.lang.tire.library.Library; -import org.nlpcn.commons.lang.util.IOUtil; import org.nlpcn.commons.lang.util.StringUtil; -import java.io.*; -import java.net.URL; - -import static org.ansj.util.MyStaticValue.LIBRARYLOG; - /** * 用户自定义词典操作类 * @@ -20,222 +20,111 @@ */ public class UserDefineLibrary { - public static final String DEFAULT_NATURE = "userDefine"; - - public static final Integer DEFAULT_FREQ = 1000; - - public static final String DEFAULT_FREQ_STR = "1000"; - - public static Forest FOREST = null; - - public static Forest ambiguityForest = null; - - static { - initUserLibrary(); - initAmbiguityLibrary(); - } - - /** - * 关键词增加 - * - * @param keyword 所要增加的关键词 - * @param nature 关键词的词性 - * @param freq 关键词的词频 - */ - public static void insertWord(String keyword, String nature, int freq) { - if (FOREST == null) { - FOREST = new Forest(); - } - String[] paramers = new String[2]; - paramers[0] = nature; - paramers[1] = String.valueOf(freq); - Value value = new Value(keyword, paramers); - Library.insertWord(FOREST, value); - } - - /** - * 增加关键词 - * - * @param keyword - */ - public static void insertWord(String keyword) { - insertWord(keyword, DEFAULT_NATURE, DEFAULT_FREQ); - } - - - /** - * 加载纠正词典 - */ - private static void initAmbiguityLibrary() { - - File[] lib = findLibrary(MyStaticValue.ambiguityLibrary); - - if (lib.length > 0) { - ambiguityForest = new Forest(); - for (File file : lib) { - try (BufferedReader br = IOUtil.getReader(file, "utf-8")) { - String temp; - while ((temp = br.readLine()) != null) { - if (StringUtil.isNotBlank(temp)) { - temp = StringUtil.trim(temp); - String[] split = temp.split("\t"); - StringBuilder sb = new StringBuilder(); - if (split.length % 2 != 0) { - LIBRARYLOG.error("init ambiguity error in line :" + temp + " format err !"); - } - for (int i = 0; i < split.length; i += 2) { - sb.append(split[i]); - } 
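
The bulk user-dictionary loading removed from UserDefineLibrary here is superseded by the keyed DicLibrary registry added earlier in this diff. A usage sketch with placeholder paths, keys, and words; like CrfLibrary, bare keys are prefixed ("dic_") on lookup:

import org.ansj.library.DicLibrary;
import org.nlpcn.commons.lang.tire.domain.Forest;

public class DicLibrarySketch {
    public static void main(String[] args) {
        // add a word to the default user dictionary (nature + frequency)
        DicLibrary.insert(DicLibrary.DEFAULT, "ansj中文分词", "n", 1000);

        // register an extra dictionary that is loaded lazily from a path
        DicLibrary.put("dic_it", "file://library/it.dic");
        Forest it = DicLibrary.get("it"); // looked up as "dic_it"

        // remove the word again
        DicLibrary.remove(DicLibrary.DEFAULT, "ansj中文分词");
    }
}
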
- ambiguityForest.addBranch(sb.toString(), split); - } - } - - } catch (UnsupportedEncodingException e) { - LIBRARYLOG.warn("不支持的编码", e); - } catch (IOException e) { - LIBRARYLOG.warn("Init ambiguity library error :"+ e.getMessage()+", path: "+ file.getPath()); - } - } - - LIBRARYLOG.info("Init ambiguity library ok!"); - - } else { - LIBRARYLOG.warn("Init ambiguity library warning :"+MyStaticValue.ambiguityLibrary+" because : file not found or failed to read !"); - } - - } - - /** - * 加载用户自定义词典和补充词典 - */ - private static void initUserLibrary() { - FOREST = MyStaticValue.getDicForest(); - } - - - /** - * 加载词典,传入一本词典的路径.或者目录.词典后缀必须为.dic 按文件名称顺序加载 - */ - public static void loadLibrary(Forest forest, String path) { - - File[] lib = findLibrary(path); - - if (lib.length > 0) { - for (File file : lib) { - String temp; - String[] strs; - Value value; - try (BufferedReader br = IOUtil.getReader(new FileInputStream(file), "UTF-8")) { - while ((temp = br.readLine()) != null) { - if (StringUtil.isNotBlank(temp)) { - temp = StringUtil.trim(temp); - strs = temp.split("\t"); - strs[0] = strs[0].toLowerCase(); - // 如何核心辞典存在那么就放弃 - if (MyStaticValue.isSkipUserDefine && DATDictionary.getId(strs[0]) > 0) { - continue; - } - if (strs.length != 3) { - value = new Value(strs[0], DEFAULT_NATURE, DEFAULT_FREQ_STR); - } else { - value = new Value(strs[0], strs[1], strs[2]); - } - Library.insertWord(forest, value); - } - } - } catch (UnsupportedEncodingException e) { - LIBRARYLOG.warn("不支持的编码", e); - } catch (IOException e) { - LIBRARYLOG.warn("Init user library error :"+e.getMessage()+", path: "+file.getPath()); - } - } - - LIBRARYLOG.info("Init user library ok!"); - - - } else { - LIBRARYLOG.warn("Init user library error :"+path+" because : not find that file !"); - } - - } - - /** - * 删除关键词 - */ - public static void removeWord(String word) { - Library.removeWord(FOREST, word); - } - - public static String[] getParams(String word) { - return getParams(FOREST, word); - } - - public static String[] getParams(Forest forest, String word) { - SmartForest temp = forest; - for (int i = 0; i < word.length(); i++) { - temp = temp.get(word.charAt(i)); - if (temp == null) { - return null; - } - } - if (temp.getStatus() > 1) { - return temp.getParam(); - } else { - return null; - } - } - - public static boolean contains(String word) { - return getParams(word) != null; - } - - /** - * 将用户自定义词典清空 - */ - public static void clear() { - FOREST.clear(); - } - - - /** - * Load files - * - * @param path file path - * @return File Array - */ - private static File[] findLibrary(String path) { - File[] libs = new File[0]; - File file = new File(path); - if (!file.exists()) { - // Try load from classpath - URL url = UserDefineLibrary.class.getClassLoader().getResource(path); - if (url != null) { - file = new File(url.getPath()); - } - } - - if (file.canRead()) { - - if (file.isFile()) { - libs = new File[1]; - libs[0] = file; - } else if (file.isDirectory()) { - File[] files = file.listFiles(new FilenameFilter() { - @Override - public boolean accept(File dir, String name) { - if (name.endsWith(".dic") && dir.canRead()) { - return true; - } else { - return false; - } - } - }); - if (files != null && files.length > 0) { - libs = files; - } - } - } - return libs; - } - + + /** + * 覆盖更新同义词 [中国, 中华, 我国] -> replace([中国,华夏]) -> [中国,华夏] + * + * @param words + */ + public void insert(String key, String[] words) { + + List list = new ArrayList<>(); + + for (String word : words) { + if (StringUtil.isBlank(word)) { + continue; + } + 
list.add(word); + } + + if (list.size() <= 1) { + LOG.warn(Arrays.toString(words) + " not have any change because it less than 2 word"); + return; + } + + Set set = findAllWords(words); + + for (String word : list) { + set.remove(word); + synonyms.add(word, list); + } + + for (String word : set) { //删除所有 + synonyms.remove(word); + synonyms.getBranch(word).setParam(null); + } + + } + + private Set findAllWords(String[] words) { + Set set = new HashSet<>(); + + for (String word : words) { + SmartForest> branch = synonyms.getBranch(word); + if (branch != null) { + List params = branch.getParam(); + if (params != null) { + set.addAll(params); + } + } + } + return set; + } + + /** + * 合并更新同义词 覆盖更新同义词 [中国, 中华, 我国] -> append([中国,华夏]) -> [中国, 中华, 我国 , 华夏] + * + * @param words + */ + public void append(String[] words) { + + Set set = new HashSet<>(); + + for (String word : words) { + if (StringUtil.isBlank(word)) { + continue; + } + set.add(word); + } + + if (set.size() <= 1) { + LOG.warn(Arrays.toString(words) + " not have any change because it less than 2 word"); + return; + } + + set.addAll(findAllWords(words)); + + List list = new ArrayList<>(set); + + for (String word : list) { + synonyms.addBranch(word, list); + } + } + + /** + * 从同义词组中删除掉一个词 [中国, 中华, 我国] -> remove(我国) -> [中国, 中华] + * + * @param words + */ + public void remove(String word) { + + SmartForest> branch = synonyms.getBranch(word); + + if (branch == null || branch.getStatus() < 2) { + return; + } + + List params = branch.getParam(); + + synonyms.remove(word); + branch.setParam(null); + params.remove(word); + + if (params.size() == 1) { //如果是1 个也删除 + synonyms.remove(params.get(0)); + params.remove(0); + } else { + params.remove(word); + } + } } diff --git a/src/main/java/org/ansj/library/company/CompanyAttrLibrary.java b/src/main/java/org/ansj/library/company/CompanyAttrLibrary.java index 5aa827f4..d5069a02 100644 --- a/src/main/java/org/ansj/library/company/CompanyAttrLibrary.java +++ b/src/main/java/org/ansj/library/company/CompanyAttrLibrary.java @@ -6,6 +6,7 @@ import org.ansj.util.MyStaticValue; import org.nlpcn.commons.lang.util.logging.Log; +import org.nlpcn.commons.lang.util.logging.LogFactory; /** * 机构名识别词典加载类 @@ -15,7 +16,7 @@ */ public class CompanyAttrLibrary { - private static final Log logger = MyStaticValue.getLog(); + private static final Log logger = LogFactory.getLog(); private static HashMap cnMap = null; @@ -31,7 +32,7 @@ public static HashMap getCompanyMap() { } // company_freq - + private static void init() { try (BufferedReader br = MyStaticValue.getCompanReader()) { cnMap = new HashMap(); diff --git a/src/main/java/org/ansj/library/name/PersonAttrLibrary.java b/src/main/java/org/ansj/library/name/PersonAttrLibrary.java index b7d67154..4a81982f 100644 --- a/src/main/java/org/ansj/library/name/PersonAttrLibrary.java +++ b/src/main/java/org/ansj/library/name/PersonAttrLibrary.java @@ -10,6 +10,7 @@ import org.ansj.domain.PersonNatureAttr; import org.ansj.util.MyStaticValue; import org.nlpcn.commons.lang.util.logging.Log; +import org.nlpcn.commons.lang.util.logging.LogFactory; /** * 人名标注所用的词典就是简单的hashmap简单方便谁用谁知道,只在加载词典的时候用 @@ -19,7 +20,7 @@ public class PersonAttrLibrary { - private static final Log logger = MyStaticValue.getLog(); + private static final Log logger = LogFactory.getLog(); private HashMap pnMap = null; diff --git a/src/main/java/org/ansj/recognition/arrimpl/UserDefineRecognition.java b/src/main/java/org/ansj/recognition/arrimpl/UserDefineRecognition.java index c2ecabb7..69c100e1 100644 --- 
a/src/main/java/org/ansj/recognition/arrimpl/UserDefineRecognition.java +++ b/src/main/java/org/ansj/recognition/arrimpl/UserDefineRecognition.java @@ -3,7 +3,6 @@ import org.ansj.domain.Term; import org.ansj.domain.TermNature; import org.ansj.domain.TermNatures; -import org.ansj.library.UserDefineLibrary; import org.ansj.recognition.TermArrRecognition; import org.ansj.util.MyStaticValue; import org.ansj.util.TermUtil; @@ -11,6 +10,7 @@ import org.nlpcn.commons.lang.tire.domain.Forest; import org.nlpcn.commons.lang.tire.domain.SmartForest; import org.nlpcn.commons.lang.util.logging.Log; +import org.nlpcn.commons.lang.util.logging.LogFactory; /** * 用户自定义词典.又称补充词典 @@ -20,11 +20,11 @@ */ public class UserDefineRecognition implements TermArrRecognition { - public static final Log logger = MyStaticValue.getLog(); + public static final Log logger = LogFactory.getLog(UserDefineRecognition.class); private Term[] terms = null; - private Forest[] forests = { UserDefineLibrary.FOREST }; + private Forest[] forests = { MyStaticValue.dic() }; private int offe = -1; private int endOffe = -1; diff --git a/src/main/java/org/ansj/recognition/impl/NatureRecognition.java b/src/main/java/org/ansj/recognition/impl/NatureRecognition.java index 65e5adce..4df53e9b 100644 --- a/src/main/java/org/ansj/recognition/impl/NatureRecognition.java +++ b/src/main/java/org/ansj/recognition/impl/NatureRecognition.java @@ -17,11 +17,11 @@ import org.ansj.recognition.arrimpl.ForeignPersonRecognition; import org.ansj.splitWord.analysis.ToAnalysis; import org.ansj.util.MathUtil; -import org.ansj.util.MyStaticValue; import org.nlpcn.commons.lang.tire.domain.Forest; import org.nlpcn.commons.lang.tire.domain.SmartForest; import org.nlpcn.commons.lang.util.WordAlert; import org.nlpcn.commons.lang.util.logging.Log; +import org.nlpcn.commons.lang.util.logging.LogFactory; /** * 词性标注工具类 @@ -32,7 +32,7 @@ public class NatureRecognition implements Recognition { private static final long serialVersionUID = 1L; - private static final Log logger = MyStaticValue.getLog() ; + private static final Log logger = LogFactory.getLog() ; private static final Forest SUFFIX_FOREST = new Forest(); diff --git a/src/main/java/org/ansj/recognition/impl/SynonymsRecgnition.java b/src/main/java/org/ansj/recognition/impl/SynonymsRecgnition.java index a9a66b9e..1d163c47 100644 --- a/src/main/java/org/ansj/recognition/impl/SynonymsRecgnition.java +++ b/src/main/java/org/ansj/recognition/impl/SynonymsRecgnition.java @@ -1,21 +1,12 @@ package org.ansj.recognition.impl; -import java.io.BufferedReader; -import java.io.File; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.HashSet; import java.util.List; -import java.util.Set; import org.ansj.domain.Result; import org.ansj.domain.Term; import org.ansj.recognition.Recognition; import org.ansj.util.MyStaticValue; import org.nlpcn.commons.lang.tire.domain.SmartForest; -import org.nlpcn.commons.lang.util.IOUtil; -import org.nlpcn.commons.lang.util.StringUtil; -import org.nlpcn.commons.lang.util.logging.Log; /** * 同义词功能 @@ -25,177 +16,18 @@ */ public class SynonymsRecgnition implements Recognition { - /** - * - */ private static final long serialVersionUID = 5961499108093950130L; - private static final Log LOG = MyStaticValue.getLog(); - - private static SmartForest> SYS_SYNONYMS = null; - - private SmartForest> synonyms = new SmartForest<>(); - - public SmartForest> initLibrary(String synonymsLibrary) { - - if (synonymsLibrary == null || !new File(synonymsLibrary).exists()) { - 
MyStaticValue.LIBRARYLOG.warn(synonymsLibrary + " not exists so set syn to empty!"); - } else { - try (BufferedReader reader = IOUtil.getReader(synonymsLibrary, IOUtil.UTF8)) { - String temp = null; - while ((temp = reader.readLine()) != null) { - if (StringUtil.isBlank(temp)) { - continue; - } - String[] split = temp.split("\t"); - - List list = new ArrayList<>(); - for (String word : split) { - if (StringUtil.isBlank(word)) { - continue; - } - list.add(word); - } - - if (split.length <= 1) { - MyStaticValue.LIBRARYLOG.warn(temp + " in synonymsLibrary not in to library !"); - continue; - } - - for (int i = 0; i < split.length; i++) { - synonyms.add(split[i], list); - } - } - } catch (Exception e) { - e.printStackTrace(); - } - } - - LOG.info("init library synonymsLibrary ok from " + new File(synonymsLibrary).getAbsolutePath()); - return synonyms; - } + private SmartForest> synonyms = null; public SynonymsRecgnition() { - if (SYS_SYNONYMS == null) { - synchronized (SynonymsRecgnition.class) { - if (SYS_SYNONYMS == null) { - SYS_SYNONYMS = initLibrary(MyStaticValue.synonymsLibrary); - } - } - } - synonyms = SYS_SYNONYMS; + this.synonyms = MyStaticValue.synonyms(); } - public SynonymsRecgnition(String synonymsLibrary) { - initLibrary(synonymsLibrary); + public SynonymsRecgnition(String key) { + this.synonyms = MyStaticValue.synonyms(key); } - /** - * 覆盖更新同义词 [中国, 中华, 我国] -> replace([中国,华夏]) -> [中国,华夏] - * - * @param words - */ - public void insert(String[] words) { - - List list = new ArrayList<>(); - - for (String word : words) { - if (StringUtil.isBlank(word)) { - continue; - } - list.add(word); - } - - if (list.size() <= 1) { - LOG.warn(Arrays.toString(words) + " not have any change because it less than 2 word"); - return; - } - - Set set = findAllWords(words); - - for (String word : list) { - set.remove(word); - synonyms.add(word, list); - } - - for (String word : set) { //删除所有 - synonyms.remove(word); - synonyms.getBranch(word).setParam(null); - } - - } - - private Set findAllWords(String[] words) { - Set set = new HashSet<>(); - - for (String word : words) { - SmartForest> branch = synonyms.getBranch(word); - if (branch != null) { - List params = branch.getParam(); - if (params != null) { - set.addAll(params); - } - } - } - return set; - } - - /** - * 合并更新同义词 覆盖更新同义词 [中国, 中华, 我国] -> append([中国,华夏]) -> [中国, 中华, 我国 , 华夏] - * - * @param words - */ - public void append(String[] words) { - - Set set = new HashSet<>(); - - for (String word : words) { - if (StringUtil.isBlank(word)) { - continue; - } - set.add(word); - } - - if (set.size() <= 1) { - LOG.warn(Arrays.toString(words) + " not have any change because it less than 2 word"); - return; - } - - set.addAll(findAllWords(words)); - - List list = new ArrayList<>(set); - - for (String word : list) { - synonyms.addBranch(word, list); - } - } - - /** - * 从同义词组中删除掉一个词 [中国, 中华, 我国] -> remove(我国) -> [中国, 中华] - * - * @param words - */ - public void remove(String word) { - - SmartForest> branch = synonyms.getBranch(word); - - if (branch == null || branch.getStatus() < 2) { - return; - } - - List params = branch.getParam(); - - synonyms.remove(word); - branch.setParam(null); - params.remove(word); - - if (params.size() == 1) { //如果是1 个也删除 - synonyms.remove(params.get(0)); - params.remove(0); - } else { - params.remove(word); - } - } @Override public void recognition(Result result) { diff --git a/src/main/java/org/ansj/recognition/impl/UserDicNatureRecognition.java b/src/main/java/org/ansj/recognition/impl/UserDicNatureRecognition.java index 
8c843f63..e7a18107 100644 --- a/src/main/java/org/ansj/recognition/impl/UserDicNatureRecognition.java +++ b/src/main/java/org/ansj/recognition/impl/UserDicNatureRecognition.java @@ -3,8 +3,8 @@ import org.ansj.domain.Nature; import org.ansj.domain.Result; import org.ansj.domain.Term; -import org.ansj.library.UserDefineLibrary; import org.ansj.recognition.Recognition; +import org.ansj.util.MyStaticValue; import org.nlpcn.commons.lang.tire.domain.Forest; import org.nlpcn.commons.lang.tire.domain.SmartForest; @@ -20,7 +20,7 @@ public class UserDicNatureRecognition implements Recognition { * */ private static final long serialVersionUID = 1L; - private Forest[] forests = new Forest[] { UserDefineLibrary.FOREST }; + private Forest[] forests = new Forest[] { MyStaticValue.dic() }; public UserDicNatureRecognition() { } diff --git a/src/main/java/org/ansj/util/MyStaticValue.java b/src/main/java/org/ansj/util/MyStaticValue.java index 150f6da2..f30e609e 100644 --- a/src/main/java/org/ansj/util/MyStaticValue.java +++ b/src/main/java/org/ansj/util/MyStaticValue.java @@ -2,25 +2,33 @@ import java.io.BufferedReader; import java.io.File; +import java.io.FileInputStream; import java.io.IOException; import java.io.InputStream; import java.io.ObjectInputStream; import java.io.UnsupportedEncodingException; import java.lang.reflect.Field; +import java.util.ArrayList; import java.util.HashMap; +import java.util.List; import java.util.Map; import java.util.PropertyResourceBundle; import java.util.ResourceBundle; -import java.util.logging.Logger; import org.ansj.app.crf.Model; import org.ansj.app.crf.SplitWord; import org.ansj.app.crf.model.CRFModel; import org.ansj.dic.DicReader; +import org.ansj.dic.PathToStream; import org.ansj.domain.AnsjItem; +import org.ansj.exception.LibraryException; import org.ansj.library.DATDictionary; +import org.ansj.library.DicLibrary; import org.ansj.library.UserDefineLibrary; import org.nlpcn.commons.lang.tire.domain.Forest; +import org.nlpcn.commons.lang.tire.domain.SmartForest; +import org.nlpcn.commons.lang.tire.domain.Value; +import org.nlpcn.commons.lang.tire.library.Library; import org.nlpcn.commons.lang.util.FileFinder; import org.nlpcn.commons.lang.util.IOUtil; import org.nlpcn.commons.lang.util.ObjConver; @@ -38,11 +46,12 @@ public class MyStaticValue { public static final Forest EMPTY_FOREST = new Forest(); - public static final Log LIBRARYLOG = getLog(); + private static final Log LOG = LogFactory.getLog(MyStaticValue.class); - public static final String DIC_DEFAULT = "dic"; - public static final String CRF_DEFAULT = "crf"; + public static final String AMBIGUITY_DEFAULT = "ambiguity_"; + + public static final String SYNONYMS_DEFAULT = "synonyms_"; // 是否开启人名识别 public static Boolean isNameRecognition = true; @@ -56,21 +65,21 @@ public class MyStaticValue { // 是否显示真实词语 public static Boolean isRealName = false; - // 用户自定义词典 - public static final Map DIC = new HashMap(); - // CRF模型 - public static final Map CRF = new HashMap(); + // 歧义词典 + public static final Map AMBIGUITY = new HashMap<>(); - /** - * 用户自定义词典的加载,如果是路径就扫描路径下的dic文件 - */ - public static String ambiguityLibrary = "library/ambiguity.dic"; + // 同义词典 + public static final Map SYNONYMS = new HashMap<>(); - /** - * 增加同义词词典路径变量 - */ - public static String synonymsLibrary = "library/synonyms.dic"; + //存放所有的词典 + private static final Map ALL = new HashMap<>(); + + //默认的词性 + public static final String DEFAULT_NATURE = "userDefine"; + + //默认的词频 + public static final String DEFAULT_FREQ_STR = "1000"; /** * 是否用户辞典不加载相同的词 @@ 
-89,10 +98,10 @@ public class MyStaticValue {
                File find = FileFinder.find("ansj_library.properties", 1);
                if (find != null && find.isFile()) {
                    rb = new PropertyResourceBundle(IOUtil.getReader(find.getAbsolutePath(), System.getProperty("file.encoding")));
-                   LIBRARYLOG.info("load ansj_library not find in classPath ! i find it in " + find.getAbsolutePath() + " make sure it is your config!");
+                   LOG.info("load ansj_library not find in classPath ! i find it in " + find.getAbsolutePath() + " make sure it is your config!");
                }
            } catch (Exception e1) {
-               LIBRARYLOG.warn("not find ansj_library.properties. and err {} i think it is a bug!", e1);
+               LOG.warn("not find ansj_library.properties. and err {} i think it is a bug!", e1);
            }
        }
@@ -104,54 +113,75 @@ public class MyStaticValue {
                File find = FileFinder.find("library.properties", 2);
                if (find != null && find.isFile()) {
                    rb = new PropertyResourceBundle(IOUtil.getReader(find.getAbsolutePath(), System.getProperty("file.encoding")));
-                   LIBRARYLOG.info("load library not find in classPath ! i find it in " + find.getAbsolutePath() + " make sure it is your config!");
+                   LOG.info("load library not find in classPath ! i find it in " + find.getAbsolutePath() + " make sure it is your config!");
                }
            } catch (Exception e1) {
-               LIBRARYLOG.warn("not find library.properties. and err {} i think it is a bug!", e1);
+               LOG.warn("not find library.properties. and err {} i think it is a bug!", e1);
            }
        }
    }
-
-       DIC.put(DIC_DEFAULT, "library/default.dic");
-
        if (rb == null) {
-           LIBRARYLOG.warn("not find library.properties in classpath use it by default !");
+           LOG.warn("not find library.properties in classpath use it by default !");
        } else {
            for (String key : rb.keySet()) {
                if (key.equals("dic")) {
-                   DIC.put(key, rb.getString(key));
+                   DicLibrary.put(DicLibrary.DEFAULT, rb.getString(key));
                } else if (key.equals("crf")) {
-                   CRF.put(key, rb.getString(key));
+                   CRF.put(CRF_DEFAULT, rb.getString(key));
+               } else if (key.equals("ambiguity")) {
+                   AMBIGUITY.put(AMBIGUITY_DEFAULT, rb.getString(key));
+               } else if (key.equals("synonyms")) {
+                   SYNONYMS.put(SYNONYMS_DEFAULT, rb.getString(key));
                } else if (key.startsWith("dic_")) {
-                   if (DIC.containsKey(key)) {
-                       LIBRARYLOG.warn(key + "{} dic config repeat definition now overwrite it !");
+                   if (DicLibrary.DIC.containsKey(key)) {
+                       LOG.warn(key + " dic config repeat definition now overwrite it !");
                    }
-                   DIC.put(key, rb.getString(key));
+                   DicLibrary.put(key, rb.getString(key));
                } else if (key.startsWith("crf_")) {
                    if (CRF.containsKey(key)) {
-                       LIBRARYLOG.warn(key + " crf config repeat definition now overwrite it !");
+                       LOG.warn(key + " crf config repeat definition now overwrite it !");
                    }
                    CRF.put(key, rb.getString(key));
+               } else if (key.startsWith("synonyms_")) {
+                   if (SYNONYMS.containsKey(key)) {
+                       LOG.warn(key + " synonyms config repeat definition now overwrite it !");
+                   }
+                   SYNONYMS.put(key, rb.getString(key));
+               } else if (key.startsWith("ambiguity_")) {
+                   if (AMBIGUITY.containsKey(key)) {
+                       LOG.warn(key + " ambiguity config repeat definition now overwrite it !");
+                   }
+                   AMBIGUITY.put(key, rb.getString(key));
                } else {
                    try {
                        Field field = MyStaticValue.class.getField(key);
                        field.set(null, ObjConver.conversion(rb.getString(key), field.getType()));
                    } catch (NoSuchFieldException e) {
-                       LIBRARYLOG.error("not find field by " + key);
+                       LOG.error("not find field by " + key);
                    } catch (SecurityException e) {
-                       LIBRARYLOG.error("安全异常", e);
+                       LOG.error("安全异常", e);
                    } catch (IllegalArgumentException e) {
-                       LIBRARYLOG.error("非法参数", e);
+                       LOG.error("非法参数", e);
                    } catch (IllegalAccessException e) {
-                       LIBRARYLOG.error("非法访问", e);
+                       LOG.error("非法访问", e);
                    }
                }
            }
        }
+
+       //如果没有设置则设置默认路径
+       DicLibrary.putIfAbsent(DicLibrary.DEFAULT, "library/default.dic");
+
+       CRF.putIfAbsent(CRF_DEFAULT, "jar://crf.model");
+
+       AMBIGUITY.putIfAbsent(AMBIGUITY_DEFAULT, "library/ambiguity.dic");
+
+       SYNONYMS.putIfAbsent(SYNONYMS_DEFAULT, "library/synonyms.dic");
    }
 
    /**
@@ -247,9 +277,9 @@ public static Map getPersonFreqMap() {
            ObjectInputStream objectInputStream = new ObjectInputStream(inputStream);
            map = (Map) objectInputStream.readObject();
        } catch (IOException e) {
-           LIBRARYLOG.warn("IO异常", e);
+           LOG.warn("IO异常", e);
        } catch (ClassNotFoundException e) {
-           LIBRARYLOG.warn("找不到类", e);
+           LOG.warn("找不到类", e);
        }
        return map;
    }
@@ -295,152 +325,149 @@ public static void initBigramTables() {
            }
        } catch (NumberFormatException e) {
-           LIBRARYLOG.warn("数字格式异常", e);
+           LOG.warn("数字格式异常", e);
        } catch (UnsupportedEncodingException e) {
-           LIBRARYLOG.warn("不支持的编码", e);
+           LOG.warn("不支持的编码", e);
        } catch (IOException e) {
-           LIBRARYLOG.warn("IO异常", e);
+           LOG.warn("IO异常", e);
        }
    }
 
-   /**
-    * 得到默认的模型
-    *
-    * @return
-    */
-   public static SplitWord getCRFSplitWord() {
-       return getCRFSplitWord(CRF_DEFAULT);
-   }
-
-   /**
-    * 根据模型名称获取crf模型
-    *
-    * @param key
-    * @return
-    */
-   public static SplitWord getCRFSplitWord(String key) {
-       Object temp = CRF.get(key);
-
-       if (temp == null) {
-           if (CRF_DEFAULT.equals(key)) { // 加载内置模型
-               return initDefaultModel();
-           } else {
-               LIBRARYLOG.warn("crf " + key + " not found in config ");
-               return null;
-           }
-       } else if (temp instanceof String) {
-           return initCRFModel(key, (String) temp);
-       } else {
-           return (SplitWord) temp;
-       }
-   }
+
    /**
-    * 加载默认的crf模型
+    * 加载歧义词典
     *
+    * @param modelName
     * @return
     */
-   private static synchronized SplitWord initDefaultModel() {
+   public static Forest ambiguity(String key) {
+       String path = AMBIGUITY.get(fix("ambiguity_", key));
 
-       Object obj = CRF.get(CRF_DEFAULT);
-       if (obj != null && obj instanceof SplitWord) {
-           return (SplitWord) obj;
+       if (path == null) {
+           LOG.warn("ambiguity " + key + " not found in config ");
+           return null;
        }
 
-       try {
-           LIBRARYLOG.info("init deafult crf model begin !");
-           CRFModel model = new CRFModel(CRF_DEFAULT);
-           model.loadModel(DicReader.getInputStream("crf.model"));
-           SplitWord splitWord = new SplitWord(model);
-           CRF.put(CRF_DEFAULT, splitWord);
-           return splitWord;
-       } catch (Exception e) {
-           LIBRARYLOG.error("init err!", e);
+       Forest forest = (Forest) ALL.get(path);
+       if (forest == null) {
+           forest = initAmbiguity(key, path);
        }
-       return null;
+       return forest;
+
    }
 
    /**
-    * 加载CRF模型
+    * 加载歧义词典
     *
-    * @param modelPath
+    * @param key
+    * @param path
     * @return
     */
-   private static synchronized SplitWord initCRFModel(String key, String modelPath) {
-       try {
-           Object obj = CRF.get(key);
-           if (obj != null && obj instanceof SplitWord) {
-               return (SplitWord) obj;
-           }
-           if (new File(modelPath).isFile() && new File(modelPath).exists()) {
-               long start = System.currentTimeMillis();
-               LIBRARYLOG.info("begin init crf model!");
-               SplitWord crfSplitWord = new SplitWord(Model.load(key, modelPath));
-               CRF.put(key, crfSplitWord);
-               LIBRARYLOG.info("load crf use time:" + (System.currentTimeMillis() - start) + " path is : " + modelPath);
-               return crfSplitWord;
-           } else {
-               LIBRARYLOG.info(key + " file not found ,please make sure it is exists : " + modelPath);
+   private synchronized static Forest initAmbiguity(String key, String path) {
+       Forest forest = (Forest) ALL.get(path);
+       if (forest != null) {
+           return forest;
+       }
+       forest = new Forest();
+       try 
(BufferedReader br = IOUtil.getReader(PathToStream.stream(path), "utf-8")) { + String temp; + LOG.info("begin init dic !"); + long start = System.currentTimeMillis(); + while ((temp = br.readLine()) != null) { + if (StringUtil.isNotBlank(temp)) { + temp = StringUtil.trim(temp); + String[] split = temp.split("\t"); + StringBuilder sb = new StringBuilder(); + if (split.length % 2 != 0) { + LOG.error("init ambiguity error in line :" + temp + " format err !"); + continue; + } + for (int i = 0; i < split.length; i += 2) { + sb.append(split[i]); + } + forest.addBranch(sb.toString(), split); + } } + LOG.info("load dic use time:" + (System.currentTimeMillis() - start) + " path is : " + path); + ALL.put(path, forest); + return forest; } catch (Exception e) { - LIBRARYLOG.info(key + " file : " + modelPath + " load err " + e.getMessage()); + LOG.error("Init ambiguity library error :" + e.getMessage() + ", path: " + path); + return null; } - return null; - } - - /** - * 得到默认的模型 - * - * @return - */ - public static Forest getDicForest() { - return getDicForest(DIC_DEFAULT); } /** - * 根据模型名称获取crf模型 + * 加载同义词典 * * @param modelName * @return */ - public static Forest getDicForest(String key) { - Object temp = DIC.get(key); - - if (temp == null) { - LIBRARYLOG.warn("dic " + key + " not found in config "); + public static SmartForest> synonyms(String key) { + String path = SYNONYMS.get(fix("synonyms_", key)); + if (path == null) { + LOG.warn("synonyms " + key + " not found in config "); return null; - } else if (temp instanceof String) { - return initForest(key, (String) temp); - } else { - return (Forest) temp; } + @SuppressWarnings("unchecked") + SmartForest> forest = (SmartForest>) ALL.get(path); + if (forest == null) { + forest = initSynonyms(key, path); + } + return forest; + } /** - * 用户自定义词典加载 + * 加载同义词典 * * @param key - * @param dicPath + * @param path * @return */ - private synchronized static Forest initForest(String key, String dicPath) { - Object obj = CRF.get(key); + private synchronized static SmartForest> initSynonyms(String key, String path) { + @SuppressWarnings("unchecked") + SmartForest> forest = (SmartForest>) ALL.get(path); + if (forest != null) { + return forest; + } + forest = new SmartForest<>(); + + LOG.info("begin init synonyms " + key); + long start = System.currentTimeMillis(); + + try (BufferedReader reader = IOUtil.getReader(PathToStream.stream(path), IOUtil.UTF8)) { + String temp = null; + while ((temp = reader.readLine()) != null) { + if (StringUtil.isBlank(temp)) { + continue; + } + String[] split = temp.split("\t"); + + List list = new ArrayList<>(); + for (String word : split) { + if (StringUtil.isBlank(word)) { + continue; + } + list.add(word); + } - if (obj != null && obj instanceof Forest) { - return (Forest) obj; + if (split.length <= 1) { + LOG.warn(temp + " in synonymsLibrary not in to library !"); + continue; + } + + for (int i = 0; i < split.length; i++) { + forest.add(split[i], list); + } + } + LOG.info("load synonyms use time:" + (System.currentTimeMillis() - start) + " path is : " + path); + return forest; + } catch (Exception e) { + LOG.error("Init synonyms library error :" + e.getMessage() + ", path: " + path); + return null; } - Forest forest = new Forest(); - UserDefineLibrary.loadLibrary(forest, dicPath); - DIC.put(key, forest); - return forest; - } - /** - * 获取log默认当前类,不支持android - * - * @return - */ - public static Log getLog() { - StackTraceElement[] sts = Thread.currentThread().getStackTrace(); - return LogFactory.getLog(sts[2].getClassName()); } + }
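Editor's note (not part of the patch): the hunks above replace the old per-purpose accessors (getDicForest, getCRFSplitWord, the file-parsing constructor of SynonymsRecgnition) with keyed registries (AMBIGUITY, SYNONYMS, DicLibrary) resolved lazily through MyStaticValue.ambiguity(String) and MyStaticValue.synonyms(String). A minimal usage sketch follows. It is hedged: the demo class name is invented, and the key values passed are assumptions based on the AMBIGUITY_DEFAULT/SYNONYMS_DEFAULT constants introduced here, since the fix(...) helper that normalizes keys is not shown in these hunks.

// Illustrative sketch only; assumes the "ambiguity_" / "synonyms_" default keys from this patch.
import java.util.List;

import org.ansj.util.MyStaticValue;
import org.nlpcn.commons.lang.tire.domain.Forest;
import org.nlpcn.commons.lang.tire.domain.SmartForest;

public class LibraryLookupSketch {

    public static void main(String[] args) {
        // Ambiguity dictionary: backed by the AMBIGUITY map, which the static initializer
        // now seeds with "library/ambiguity.dic" via putIfAbsent.
        Forest ambiguity = MyStaticValue.ambiguity("ambiguity_");
        System.out.println("ambiguity dictionary loaded: " + (ambiguity != null));

        // Synonyms dictionary: same pattern; the rewritten SynonymsRecgnition constructors
        // delegate to MyStaticValue.synonyms(...) instead of parsing the file themselves.
        SmartForest<List<String>> synonyms = MyStaticValue.synonyms("synonyms_");
        System.out.println("synonyms dictionary loaded: " + (synonyms != null));
    }
}

The design point this illustrates: every dictionary path is loaded once, cached in the ALL map keyed by path, and shared by name, so multiple analyzers configured through library.properties reuse the same in-memory tries.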