diff --git a/pom.xml b/pom.xml index 2e4f6b13..06619eec 100644 --- a/pom.xml +++ b/pom.xml @@ -5,7 +5,7 @@ ansj_seg jar ansj_seg - 5.1.1 + 5.1.2 best java chinese word seg ! https://github.com/NLPchina/ansj_seg diff --git a/src/main/java/org/ansj/dic/LearnTool.java b/src/main/java/org/ansj/dic/LearnTool.java index 829003b9..d1e31455 100644 --- a/src/main/java/org/ansj/dic/LearnTool.java +++ b/src/main/java/org/ansj/dic/LearnTool.java @@ -7,9 +7,12 @@ import org.ansj.app.crf.SplitWord; import org.ansj.domain.Nature; import org.ansj.domain.NewWord; +import org.ansj.domain.TermNatures; import org.ansj.recognition.arrimpl.AsianPersonRecognition; import org.ansj.recognition.arrimpl.ForeignPersonRecognition; +import org.ansj.recognition.impl.NatureRecognition; import org.ansj.util.Graph; +import org.nlpcn.commons.lang.tire.domain.Forest; import org.nlpcn.commons.lang.tire.domain.SmartForest; import org.nlpcn.commons.lang.util.CollectionUtil; @@ -40,15 +43,22 @@ public class LearnTool { */ private final SmartForest sf = new SmartForest(); + /** + * 学习新词排除用户自定义词典那中的词语 + */ + private Forest[] forests; + /** * 公司名称学习. * * @param graph */ - public void learn(Graph graph, SplitWord splitWord) { + public void learn(Graph graph, SplitWord splitWord, Forest... forests) { this.splitWord = splitWord; + this.forests = forests; + // 亚洲人名识别 if (isAsianName) { findAsianPerson(graph); @@ -76,7 +86,12 @@ private void addListToTerm(List newWords) { if (newWords.size() == 0) return; for (NewWord newWord : newWords) { - addTerm(newWord); + + TermNatures termNatures = new NatureRecognition(forests).getTermNatures(newWord.getName()); + + if (termNatures == TermNatures.NULL) { + addTerm(newWord); + } } } @@ -93,12 +108,12 @@ public void addTerm(NewWord newWord) { temp.update(newWord.getNature(), newWord.getAllFreq()); } else { count++; - if(splitWord==null){ + if (splitWord == null) { newWord.setScore(-1); - }else{ - newWord.setScore(-splitWord.cohesion(newWord.getName())); + } else { + newWord.setScore(-splitWord.cohesion(newWord.getName())); } - + synchronized (sf) { sf.add(newWord.getName(), newWord); } @@ -112,8 +127,7 @@ public SmartForest getForest() { /** * 返回学习到的新词. * - * @param num - * 返回数目.0为全部返回 + * @param num 返回数目.0为全部返回 * @return */ public List> getTopTree(int num) { @@ -138,7 +152,7 @@ public List> getTopTree(int num, Nature nature) { } private void valueResult(SmartForest smartForest, HashMap hm, Nature nature) { - + if (smartForest == null || smartForest.branches == null) { return; } diff --git a/src/main/java/org/ansj/splitWord/analysis/NlpAnalysis.java b/src/main/java/org/ansj/splitWord/analysis/NlpAnalysis.java index ea75d6e8..1c68da01 100644 --- a/src/main/java/org/ansj/splitWord/analysis/NlpAnalysis.java +++ b/src/main/java/org/ansj/splitWord/analysis/NlpAnalysis.java @@ -63,7 +63,7 @@ public List merger() { graph.walkPath(); - learn.learn(graph, splitWord); + learn.learn(graph, splitWord ,forests); // 姓名识别 if (graph.hasPerson && isNameRecognition) {