From 3bec38c8666c1d65b2d8f25b7be99e90b31100d3 Mon Sep 17 00:00:00 2001 From: yeyuelong Date: Thu, 22 Sep 2016 18:21:12 +0800 Subject: [PATCH] =?UTF-8?q?=E5=A2=9E=E5=8A=A0synonyms=E5=90=8C=E4=B9=89?= =?UTF-8?q?=E8=AF=8D=E8=AF=8D=E5=85=B8=E5=8A=9F=E8=83=BD=EF=BC=8C=E5=AE=9E?= =?UTF-8?q?=E7=8E=B0=E7=B1=BB=E4=BC=BCambiguity=E6=AD=A7=E4=B9=89=E8=AF=8D?= =?UTF-8?q?=E8=AF=8D=E5=85=B8?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- ansj_library.properties | 1 + library/synonyms.dic | 2 + pom.xml | 325 +++---- .../org/ansj/library/UserDefineLibrary.java | 38 + .../java/org/ansj/util/MyStaticValue.java | 879 +++++++++--------- 5 files changed, 646 insertions(+), 599 deletions(-) create mode 100644 library/synonyms.dic diff --git a/ansj_library.properties b/ansj_library.properties index 69f5a376..2f0cedbd 100644 --- a/ansj_library.properties +++ b/ansj_library.properties @@ -6,6 +6,7 @@ dic=library/default.dic #redress dic file path ambiguityLibrary=library/ambiguity.dic +synonymsLibrary=library/synonyms.dic #set real name isRealName=true diff --git a/library/synonyms.dic b/library/synonyms.dic new file mode 100644 index 00000000..a3f04b39 --- /dev/null +++ b/library/synonyms.dic @@ -0,0 +1,2 @@ +枇杷 苹果 香蕉 +中国 中华 华夏 diff --git a/pom.xml b/pom.xml index 7cea0303..1dc9299f 100644 --- a/pom.xml +++ b/pom.xml @@ -1,162 +1,163 @@ - - 4.0.0 - org.ansj - ansj_seg - jar - ansj_seg - 5.0.2 - best java chinese word seg ! - https://github.com/NLPchina/ansj_seg - - - The Apache Software License, Version 2.0 - http://www.apache.org/licenses/LICENSE-2.0.txt - repo - - - - - scm:git:git@github.com:ansjsun/ansj_seg.git - scm:git:git@github.com:ansjsun/ansj_seg.git - git@github.com:ansjsun/ansj_seg.git - - - - - - ansj - ansj - ansj-sun@163.com - - - - - UTF-8 - - - - - org.nlpcn - nlp-lang - 1.7 - compile - - - - org.slf4j - slf4j-api - 1.7.21 - - - - org.slf4j - slf4j-log4j12 - 1.7.21 - provided - - - - log4j - log4j - 1.2.16 - provided - - - - junit - junit - 4.8.1 - test - - - - - - - - net.orfjackal.retrolambda - retrolambda-maven-plugin - 2.0.6 - - - - process-main - - - - - 1.6 - false - false - - - - - maven-compiler-plugin - 2.3.2 - - 1.7 - 1.7 - UTF-8 - - - - - org.apache.maven.plugins - maven-source-plugin - - - attach-sources - - jar-no-fork - - - - - - true - - **/*.java - - - - - org.apache.maven.plugins - maven-javadoc-plugin - 2.10.3 - - -Xdoclint:none - - - - org.apache.maven.plugins - maven-gpg-plugin - 1.4 - - - sign-artifacts - verify - - sign - - - - - - - - - - sonatype-nexus-snapshots - Sonatype Nexus snapshot repository - https://oss.sonatype.org/content/repositories/snapshots - - - - sonatype-nexus-staging - Sonatype Nexus release repository - https://oss.sonatype.org/service/local/staging/deploy/maven2 - - - + + 4.0.0 + org.ansj + ansj_seg + jar + ansj_seg + 5.0.2 + best java chinese word seg ! + https://github.com/NLPchina/ansj_seg + + + The Apache Software License, Version 2.0 + http://www.apache.org/licenses/LICENSE-2.0.txt + repo + + + + + scm:git:git@github.com:ansjsun/ansj_seg.git + scm:git:git@github.com:ansjsun/ansj_seg.git + git@github.com:ansjsun/ansj_seg.git + + + + + + ansj + ansj + ansj-sun@163.com + + + + + UTF-8 + + + + + org.nlpcn + nlp-lang + 1.7 + compile + + + + org.slf4j + slf4j-api + 1.7.21 + + + + org.slf4j + slf4j-log4j12 + 1.7.21 + provided + + + + log4j + log4j + 1.2.16 + provided + + + + junit + junit + 4.8.1 + test + + + + + + + + net.orfjackal.retrolambda + retrolambda-maven-plugin + 2.0.6 + + + default + + process-main + + + + + 1.6 + false + false + + + + + maven-compiler-plugin + 2.3.2 + + 1.7 + 1.7 + UTF-8 + + + + + org.apache.maven.plugins + maven-source-plugin + + + attach-sources + + jar-no-fork + + + + + + true + + **/*.java + + + + + org.apache.maven.plugins + maven-javadoc-plugin + 2.10.3 + + -Xdoclint:none + + + + + + + + + + + + + + + + + + + + + + sonatype-nexus-snapshots + Sonatype Nexus snapshot repository + https://oss.sonatype.org/content/repositories/snapshots + + + + sonatype-nexus-staging + Sonatype Nexus release repository + https://oss.sonatype.org/service/local/staging/deploy/maven2 + + + diff --git a/src/main/java/org/ansj/library/UserDefineLibrary.java b/src/main/java/org/ansj/library/UserDefineLibrary.java index ca48eb2e..05bfc532 100644 --- a/src/main/java/org/ansj/library/UserDefineLibrary.java +++ b/src/main/java/org/ansj/library/UserDefineLibrary.java @@ -29,10 +29,12 @@ public class UserDefineLibrary { public static Forest FOREST = null; public static Forest ambiguityForest = null; + public static Forest synonymsForest = null; static { initUserLibrary(); initAmbiguityLibrary(); + initSynonymsLibrary(); } /** @@ -104,6 +106,42 @@ private static void initAmbiguityLibrary() { } } + + /** + * 初始化同义词词典 + */ + private static void initSynonymsLibrary() { + + File[] lib = findLibrary(MyStaticValue.synonymsLibrary); + + if (lib.length > 0) { + synonymsForest = new Forest(); + for (File file : lib) { + try (BufferedReader br = IOUtil.getReader(file, "utf-8")) { + String temp; + while ((temp = br.readLine()) != null) { + if (StringUtil.isNotBlank(temp)) { + temp = StringUtil.trim(temp); + String[] split = temp.split("\t"); + LIBRARYLOG.info("init synonyms in line :" + temp); + synonymsForest.addBranch(split[0], split); + } + } + + } catch (UnsupportedEncodingException e) { + LIBRARYLOG.warn("不支持的编码", e); + } catch (IOException e) { + LIBRARYLOG.warn("Init synonyms library error :{}, path: {}", e.getMessage(), file.getPath()); + } + } + + LIBRARYLOG.info("Init synonyms library ok!"); + + } else { + LIBRARYLOG.warn("Init synonyms library warning :{} because : file not found or failed to read !", MyStaticValue.synonymsLibrary); + } + + } /** * 加载用户自定义词典和补充词典 diff --git a/src/main/java/org/ansj/util/MyStaticValue.java b/src/main/java/org/ansj/util/MyStaticValue.java index c6cb472c..17e26d49 100644 --- a/src/main/java/org/ansj/util/MyStaticValue.java +++ b/src/main/java/org/ansj/util/MyStaticValue.java @@ -1,437 +1,442 @@ -package org.ansj.util; - -import java.io.BufferedReader; -import java.io.File; -import java.io.IOException; -import java.io.InputStream; -import java.io.ObjectInputStream; -import java.io.UnsupportedEncodingException; -import java.lang.reflect.Field; -import java.util.HashMap; -import java.util.Map; -import java.util.PropertyResourceBundle; -import java.util.ResourceBundle; - -import org.ansj.app.crf.Model; -import org.ansj.app.crf.SplitWord; -import org.ansj.app.crf.model.CRFModel; -import org.ansj.dic.DicReader; -import org.ansj.domain.AnsjItem; -import org.ansj.library.DATDictionary; -import org.ansj.library.UserDefineLibrary; -import org.nlpcn.commons.lang.tire.domain.Forest; -import org.nlpcn.commons.lang.util.FileFinder; -import org.nlpcn.commons.lang.util.IOUtil; -import org.nlpcn.commons.lang.util.ObjConver; -import org.nlpcn.commons.lang.util.StringUtil; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -/** - * 这个类储存一些公用变量. - * - * @author ansj - * - */ -public class MyStaticValue { - - public static final Forest EMPTY_FOREST = new Forest(); - - public static final Logger LIBRARYLOG = LoggerFactory.getLogger("DICLOG"); - - public static final String DIC_DEFAULT = "dic"; - - public static final String CRF_DEFAULT = "crf"; - - // 是否开启人名识别 - public static Boolean isNameRecognition = true; - - // 是否开启数字识别 - public static Boolean isNumRecognition = true; - - // 是否数字和量词合并 - public static Boolean isQuantifierRecognition = true; - - // 是否显示真实词语 - public static Boolean isRealName = false; - - // 用户自定义词典 - public static final Map DIC = new HashMap(); - - // CRF模型 - public static final Map CRF = new HashMap(); - - /** - * 用户自定义词典的加载,如果是路径就扫描路径下的dic文件 - */ - public static String ambiguityLibrary = "library/ambiguity.dic"; - - /** - * 是否用户辞典不加载相同的词 - */ - public static boolean isSkipUserDefine = false; - - static { - /** - * 配置文件变量 - */ - ResourceBundle rb = null; - try { - rb = ResourceBundle.getBundle("ansj_library"); - } catch (Exception e) { - try { - File find = FileFinder.find("ansj_library.properties", 1); - if (find != null && find.isFile()) { - rb = new PropertyResourceBundle( - IOUtil.getReader(find.getAbsolutePath(), System.getProperty("file.encoding"))); - LIBRARYLOG.info( - "load ansj_library not find in classPath ! i find it in {} make sure it is your config!", - find.getAbsolutePath()); - } - } catch (Exception e1) { - LIBRARYLOG.warn("not find ansj_library.properties. and err {} i think it is a bug!", e1.getMessage()); - } - } - - if (rb == null) { - try { - rb = ResourceBundle.getBundle("library"); - } catch (Exception e) { - try { - File find = FileFinder.find("library.properties", 2); - if (find != null && find.isFile()) { - rb = new PropertyResourceBundle( - IOUtil.getReader(find.getAbsolutePath(), System.getProperty("file.encoding"))); - LIBRARYLOG.info( - "load library not find in classPath ! i find it in {} make sure it is your config!", - find.getAbsolutePath()); - } - } catch (Exception e1) { - LIBRARYLOG.warn("not find library.properties. and err {} i think it is a bug!", e1.getMessage()); - } - } - } - - DIC.put(DIC_DEFAULT, "library/default.dic"); - - if (rb == null) { - LIBRARYLOG.warn("not find library.properties in classpath use it by default !"); - } else { - - for (String key : rb.keySet()) { - - if (key.equals("dic")) { - DIC.put(key, rb.getString(key)); - } else if (key.equals("crf")) { - CRF.put(key, rb.getString(key)); - } else if (key.startsWith("dic_")) { - if (DIC.containsKey(key)) { - LIBRARYLOG.warn("{} dic config repeat definition now overwrite it !", key); - } - DIC.put(key, rb.getString(key)); - } else if (key.startsWith("crf_")) { - if (CRF.containsKey(key)) { - LIBRARYLOG.warn("{} crf config repeat definition now overwrite it !", key); - } - CRF.put(key, rb.getString(key)); - } else { - try { - Field field = MyStaticValue.class.getField(key); - field.set(null, ObjConver.conversion(rb.getString(key), field.getType())); - } catch (NoSuchFieldException e) { - LIBRARYLOG.error("not find field by {}", key); - } catch (SecurityException e) { - LIBRARYLOG.error("安全异常", e); - } catch (IllegalArgumentException e) { - LIBRARYLOG.error("非法参数", e); - } catch (IllegalAccessException e) { - LIBRARYLOG.error("非法访问", e); - } - } - - } - - } - } - - /** - * 人名词典 - * - * @return - */ - public static BufferedReader getPersonReader() { - return DicReader.getReader("person/person.dic"); - } - - /** - * 机构名词典 - * - * @return - */ - public static BufferedReader getCompanReader() { - return DicReader.getReader("company/company.data"); - } - - /** - * 机构名词典 - * - * @return - */ - public static BufferedReader getNewWordReader() { - return DicReader.getReader("newWord/new_word_freq.dic"); - } - - /** - * 核心词典 - * - * @return - */ - public static BufferedReader getArraysReader() { - return DicReader.getReader("arrays.dic"); - } - - /** - * 数字词典 - * - * @return - */ - public static BufferedReader getNumberReader() { - return DicReader.getReader("numberLibrary.dic"); - } - - /** - * 英文词典 - * - * @return - */ - public static BufferedReader getEnglishReader() { - return DicReader.getReader("englishLibrary.dic"); - } - - /** - * 词性表 - * - * @return - */ - public static BufferedReader getNatureMapReader() { - return DicReader.getReader("nature/nature.map"); - } - - /** - * 词性关联表 - * - * @return - */ - public static BufferedReader getNatureTableReader() { - return DicReader.getReader("nature/nature.table"); - } - - /** - * 得道姓名单字的词频词典 - * - * @return - */ - public static BufferedReader getPersonFreqReader() { - return DicReader.getReader("person/name_freq.dic"); - } - - /** - * 名字词性对象反序列化 - * - * @return - */ - @SuppressWarnings("unchecked") - public static Map getPersonFreqMap() { - Map map = new HashMap(0); - try (InputStream inputStream = DicReader.getInputStream("person/asian_name_freq.data")) { - ObjectInputStream objectInputStream = new ObjectInputStream(inputStream); - map = (Map) objectInputStream.readObject(); - } catch (IOException e) { - LIBRARYLOG.warn("IO异常", e); - } catch (ClassNotFoundException e) { - LIBRARYLOG.warn("找不到类", e); - } - return map; - } - - /** - * 词与词之间的关联表数据 - * - * @return - */ - public static void initBigramTables() { - try (BufferedReader reader = IOUtil.getReader(DicReader.getInputStream("bigramdict.dic"), "UTF-8")){ - String temp = null; - String[] strs = null; - int freq = 0; - while ((temp = reader.readLine()) != null) { - if (StringUtil.isBlank(temp)) { - continue; - } - strs = temp.split("\t"); - freq = Integer.parseInt(strs[1]); - strs = strs[0].split("@"); - AnsjItem fromItem = DATDictionary.getItem(strs[0]); - - AnsjItem toItem = DATDictionary.getItem(strs[1]); - - if (fromItem == AnsjItem.NULL && strs[0].contains("#")) { - fromItem = AnsjItem.BEGIN; - } - - if (toItem == AnsjItem.NULL && strs[1].contains("#")) { - toItem = AnsjItem.END; - } - - if (fromItem == AnsjItem.NULL || toItem == AnsjItem.NULL) { - continue; - } - - if (fromItem.bigramEntryMap == null) { - fromItem.bigramEntryMap = new HashMap(); - } - - fromItem.bigramEntryMap.put(toItem.getIndex(), freq); - - } - } catch (NumberFormatException e) { - LIBRARYLOG.warn("数字格式异常", e); - } catch (UnsupportedEncodingException e) { - LIBRARYLOG.warn("不支持的编码", e); - } catch (IOException e) { - LIBRARYLOG.warn("IO异常", e); - } - } - - /** - * 得到默认的模型 - * - * @return - */ - public static SplitWord getCRFSplitWord() { - return getCRFSplitWord(CRF_DEFAULT); - } - - /** - * 根据模型名称获取crf模型 - * - * @param key - * @return - */ - public static SplitWord getCRFSplitWord(String key) { - Object temp = CRF.get(key); - - if (temp == null) { - if (CRF_DEFAULT.equals(key)) { // 加载内置模型 - return initDefaultModel(); - } else { - LIBRARYLOG.warn("crf {} not found in config ", key); - return null; - } - } else if (temp instanceof String) { - return initCRFModel(key, (String) temp); - } else { - return (SplitWord) temp; - } - } - - /** - * 加载默认的crf模型 - * - * @return - */ - private static synchronized SplitWord initDefaultModel() { - - Object obj = CRF.get(CRF_DEFAULT); - if (obj != null && obj instanceof SplitWord) { - return (SplitWord) obj; - } - try { - LIBRARYLOG.info("init deafult crf model begin !"); - CRFModel model = new CRFModel(CRF_DEFAULT); - model.loadModel(DicReader.getInputStream("crf.model")); - SplitWord splitWord = new SplitWord(model); - CRF.put(CRF_DEFAULT, splitWord); - return splitWord; - } catch (Exception e) { - LIBRARYLOG.error("init err!", e); - } - return null; - } - - /** - * 加载CRF模型 - * - * @param modelPath - * @return - */ - private static synchronized SplitWord initCRFModel(String key, String modelPath) { - try { - Object obj = CRF.get(key); - if (obj != null && obj instanceof SplitWord) { - return (SplitWord) obj; - } - if (new File(modelPath).isFile() && new File(modelPath).exists()) { - long start = System.currentTimeMillis(); - LIBRARYLOG.info("begin init crf model!"); - SplitWord crfSplitWord = new SplitWord(Model.load(key, modelPath)); - CRF.put(key, crfSplitWord); - LIBRARYLOG.info("load crf use time:{} path is : {}", System.currentTimeMillis() - start, modelPath); - return crfSplitWord; - } else { - LIBRARYLOG.info("{} file not found ,please make sure it is exists : {}", key, modelPath); - } - } catch (Exception e) { - LIBRARYLOG.info("{} file : {} load err {}", key, modelPath, e.getMessage()); - } - return null; - } - - /** - * 得到默认的模型 - * - * @return - */ - public static Forest getDicForest() { - return getDicForest(DIC_DEFAULT); - } - - /** - * 根据模型名称获取crf模型 - * - * @param modelName - * @return - */ - public static Forest getDicForest(String key) { - Object temp = DIC.get(key); - - if (temp == null) { - LIBRARYLOG.warn("dic {} not found in config ",key); - return null; - } else if (temp instanceof String) { - return initForest(key, (String) temp); - } else { - return (Forest) temp; - } - } - - /** - * 用户自定义词典加载 - * - * @param key - * @param dicPath - * @return - */ - private synchronized static Forest initForest(String key, String dicPath) { - Object obj = CRF.get(key); - - if (obj != null && obj instanceof Forest) { - return (Forest) obj; - } - Forest forest = new Forest(); - UserDefineLibrary.loadLibrary(forest, dicPath); - DIC.put(key, forest); - return forest; - } - -} +package org.ansj.util; + +import java.io.BufferedReader; +import java.io.File; +import java.io.IOException; +import java.io.InputStream; +import java.io.ObjectInputStream; +import java.io.UnsupportedEncodingException; +import java.lang.reflect.Field; +import java.util.HashMap; +import java.util.Map; +import java.util.PropertyResourceBundle; +import java.util.ResourceBundle; + +import org.ansj.app.crf.Model; +import org.ansj.app.crf.SplitWord; +import org.ansj.app.crf.model.CRFModel; +import org.ansj.dic.DicReader; +import org.ansj.domain.AnsjItem; +import org.ansj.library.DATDictionary; +import org.ansj.library.UserDefineLibrary; +import org.nlpcn.commons.lang.tire.domain.Forest; +import org.nlpcn.commons.lang.util.FileFinder; +import org.nlpcn.commons.lang.util.IOUtil; +import org.nlpcn.commons.lang.util.ObjConver; +import org.nlpcn.commons.lang.util.StringUtil; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * 这个类储存一些公用变量. + * + * @author ansj + * + */ +public class MyStaticValue { + + public static final Forest EMPTY_FOREST = new Forest(); + + public static final Logger LIBRARYLOG = LoggerFactory.getLogger("DICLOG"); + + public static final String DIC_DEFAULT = "dic"; + + public static final String CRF_DEFAULT = "crf"; + + // 是否开启人名识别 + public static Boolean isNameRecognition = true; + + // 是否开启数字识别 + public static Boolean isNumRecognition = true; + + // 是否数字和量词合并 + public static Boolean isQuantifierRecognition = true; + + // 是否显示真实词语 + public static Boolean isRealName = false; + + // 用户自定义词典 + public static final Map DIC = new HashMap(); + + // CRF模型 + public static final Map CRF = new HashMap(); + + /** + * 用户自定义词典的加载,如果是路径就扫描路径下的dic文件 + */ + public static String ambiguityLibrary = "library/ambiguity.dic"; + + /** + * 增加同义词词典路径变量 + */ + public static String synonymsLibrary = "library/synonyms.dic"; + + /** + * 是否用户辞典不加载相同的词 + */ + public static boolean isSkipUserDefine = false; + + static { + /** + * 配置文件变量 + */ + ResourceBundle rb = null; + try { + rb = ResourceBundle.getBundle("ansj_library"); + } catch (Exception e) { + try { + File find = FileFinder.find("ansj_library.properties", 1); + if (find != null && find.isFile()) { + rb = new PropertyResourceBundle( + IOUtil.getReader(find.getAbsolutePath(), System.getProperty("file.encoding"))); + LIBRARYLOG.info( + "load ansj_library not find in classPath ! i find it in {} make sure it is your config!", + find.getAbsolutePath()); + } + } catch (Exception e1) { + LIBRARYLOG.warn("not find ansj_library.properties. and err {} i think it is a bug!", e1.getMessage()); + } + } + + if (rb == null) { + try { + rb = ResourceBundle.getBundle("library"); + } catch (Exception e) { + try { + File find = FileFinder.find("library.properties", 2); + if (find != null && find.isFile()) { + rb = new PropertyResourceBundle( + IOUtil.getReader(find.getAbsolutePath(), System.getProperty("file.encoding"))); + LIBRARYLOG.info( + "load library not find in classPath ! i find it in {} make sure it is your config!", + find.getAbsolutePath()); + } + } catch (Exception e1) { + LIBRARYLOG.warn("not find library.properties. and err {} i think it is a bug!", e1.getMessage()); + } + } + } + + DIC.put(DIC_DEFAULT, "library/default.dic"); + + if (rb == null) { + LIBRARYLOG.warn("not find library.properties in classpath use it by default !"); + } else { + + for (String key : rb.keySet()) { + + if (key.equals("dic")) { + DIC.put(key, rb.getString(key)); + } else if (key.equals("crf")) { + CRF.put(key, rb.getString(key)); + } else if (key.startsWith("dic_")) { + if (DIC.containsKey(key)) { + LIBRARYLOG.warn("{} dic config repeat definition now overwrite it !", key); + } + DIC.put(key, rb.getString(key)); + } else if (key.startsWith("crf_")) { + if (CRF.containsKey(key)) { + LIBRARYLOG.warn("{} crf config repeat definition now overwrite it !", key); + } + CRF.put(key, rb.getString(key)); + } else { + try { + Field field = MyStaticValue.class.getField(key); + field.set(null, ObjConver.conversion(rb.getString(key), field.getType())); + } catch (NoSuchFieldException e) { + LIBRARYLOG.error("not find field by {}", key); + } catch (SecurityException e) { + LIBRARYLOG.error("安全异常", e); + } catch (IllegalArgumentException e) { + LIBRARYLOG.error("非法参数", e); + } catch (IllegalAccessException e) { + LIBRARYLOG.error("非法访问", e); + } + } + + } + + } + } + + /** + * 人名词典 + * + * @return + */ + public static BufferedReader getPersonReader() { + return DicReader.getReader("person/person.dic"); + } + + /** + * 机构名词典 + * + * @return + */ + public static BufferedReader getCompanReader() { + return DicReader.getReader("company/company.data"); + } + + /** + * 机构名词典 + * + * @return + */ + public static BufferedReader getNewWordReader() { + return DicReader.getReader("newWord/new_word_freq.dic"); + } + + /** + * 核心词典 + * + * @return + */ + public static BufferedReader getArraysReader() { + return DicReader.getReader("arrays.dic"); + } + + /** + * 数字词典 + * + * @return + */ + public static BufferedReader getNumberReader() { + return DicReader.getReader("numberLibrary.dic"); + } + + /** + * 英文词典 + * + * @return + */ + public static BufferedReader getEnglishReader() { + return DicReader.getReader("englishLibrary.dic"); + } + + /** + * 词性表 + * + * @return + */ + public static BufferedReader getNatureMapReader() { + return DicReader.getReader("nature/nature.map"); + } + + /** + * 词性关联表 + * + * @return + */ + public static BufferedReader getNatureTableReader() { + return DicReader.getReader("nature/nature.table"); + } + + /** + * 得道姓名单字的词频词典 + * + * @return + */ + public static BufferedReader getPersonFreqReader() { + return DicReader.getReader("person/name_freq.dic"); + } + + /** + * 名字词性对象反序列化 + * + * @return + */ + @SuppressWarnings("unchecked") + public static Map getPersonFreqMap() { + Map map = new HashMap(0); + try (InputStream inputStream = DicReader.getInputStream("person/asian_name_freq.data")) { + ObjectInputStream objectInputStream = new ObjectInputStream(inputStream); + map = (Map) objectInputStream.readObject(); + } catch (IOException e) { + LIBRARYLOG.warn("IO异常", e); + } catch (ClassNotFoundException e) { + LIBRARYLOG.warn("找不到类", e); + } + return map; + } + + /** + * 词与词之间的关联表数据 + * + * @return + */ + public static void initBigramTables() { + try (BufferedReader reader = IOUtil.getReader(DicReader.getInputStream("bigramdict.dic"), "UTF-8")){ + String temp = null; + String[] strs = null; + int freq = 0; + while ((temp = reader.readLine()) != null) { + if (StringUtil.isBlank(temp)) { + continue; + } + strs = temp.split("\t"); + freq = Integer.parseInt(strs[1]); + strs = strs[0].split("@"); + AnsjItem fromItem = DATDictionary.getItem(strs[0]); + + AnsjItem toItem = DATDictionary.getItem(strs[1]); + + if (fromItem == AnsjItem.NULL && strs[0].contains("#")) { + fromItem = AnsjItem.BEGIN; + } + + if (toItem == AnsjItem.NULL && strs[1].contains("#")) { + toItem = AnsjItem.END; + } + + if (fromItem == AnsjItem.NULL || toItem == AnsjItem.NULL) { + continue; + } + + if (fromItem.bigramEntryMap == null) { + fromItem.bigramEntryMap = new HashMap(); + } + + fromItem.bigramEntryMap.put(toItem.getIndex(), freq); + + } + } catch (NumberFormatException e) { + LIBRARYLOG.warn("数字格式异常", e); + } catch (UnsupportedEncodingException e) { + LIBRARYLOG.warn("不支持的编码", e); + } catch (IOException e) { + LIBRARYLOG.warn("IO异常", e); + } + } + + /** + * 得到默认的模型 + * + * @return + */ + public static SplitWord getCRFSplitWord() { + return getCRFSplitWord(CRF_DEFAULT); + } + + /** + * 根据模型名称获取crf模型 + * + * @param key + * @return + */ + public static SplitWord getCRFSplitWord(String key) { + Object temp = CRF.get(key); + + if (temp == null) { + if (CRF_DEFAULT.equals(key)) { // 加载内置模型 + return initDefaultModel(); + } else { + LIBRARYLOG.warn("crf {} not found in config ", key); + return null; + } + } else if (temp instanceof String) { + return initCRFModel(key, (String) temp); + } else { + return (SplitWord) temp; + } + } + + /** + * 加载默认的crf模型 + * + * @return + */ + private static synchronized SplitWord initDefaultModel() { + + Object obj = CRF.get(CRF_DEFAULT); + if (obj != null && obj instanceof SplitWord) { + return (SplitWord) obj; + } + try { + LIBRARYLOG.info("init deafult crf model begin !"); + CRFModel model = new CRFModel(CRF_DEFAULT); + model.loadModel(DicReader.getInputStream("crf.model")); + SplitWord splitWord = new SplitWord(model); + CRF.put(CRF_DEFAULT, splitWord); + return splitWord; + } catch (Exception e) { + LIBRARYLOG.error("init err!", e); + } + return null; + } + + /** + * 加载CRF模型 + * + * @param modelPath + * @return + */ + private static synchronized SplitWord initCRFModel(String key, String modelPath) { + try { + Object obj = CRF.get(key); + if (obj != null && obj instanceof SplitWord) { + return (SplitWord) obj; + } + if (new File(modelPath).isFile() && new File(modelPath).exists()) { + long start = System.currentTimeMillis(); + LIBRARYLOG.info("begin init crf model!"); + SplitWord crfSplitWord = new SplitWord(Model.load(key, modelPath)); + CRF.put(key, crfSplitWord); + LIBRARYLOG.info("load crf use time:{} path is : {}", System.currentTimeMillis() - start, modelPath); + return crfSplitWord; + } else { + LIBRARYLOG.info("{} file not found ,please make sure it is exists : {}", key, modelPath); + } + } catch (Exception e) { + LIBRARYLOG.info("{} file : {} load err {}", key, modelPath, e.getMessage()); + } + return null; + } + + /** + * 得到默认的模型 + * + * @return + */ + public static Forest getDicForest() { + return getDicForest(DIC_DEFAULT); + } + + /** + * 根据模型名称获取crf模型 + * + * @param modelName + * @return + */ + public static Forest getDicForest(String key) { + Object temp = DIC.get(key); + + if (temp == null) { + LIBRARYLOG.warn("dic {} not found in config ",key); + return null; + } else if (temp instanceof String) { + return initForest(key, (String) temp); + } else { + return (Forest) temp; + } + } + + /** + * 用户自定义词典加载 + * + * @param key + * @param dicPath + * @return + */ + private synchronized static Forest initForest(String key, String dicPath) { + Object obj = CRF.get(key); + + if (obj != null && obj instanceof Forest) { + return (Forest) obj; + } + Forest forest = new Forest(); + UserDefineLibrary.loadLibrary(forest, dicPath); + DIC.put(key, forest); + return forest; + } + +}