Skip to content

Commit

Permalink
dic 50%
Browse files Browse the repository at this point in the history
  • Loading branch information
孙健 committed Dec 3, 2016
1 parent 572aeb3 commit 9c246c9
Show file tree
Hide file tree
Showing 41 changed files with 1,209 additions and 804 deletions.
Binary file added crf.model
Binary file not shown.
3 changes: 2 additions & 1 deletion library/synonyms.dic
Original file line number Diff line number Diff line change
Expand Up @@ -2411,7 +2411,8 @@
农机 农械 农机具
工作母机 母机
蒸气机 汽机 蒸汽机
内燃机 摩托 热机
内燃机 热机
摩托 摩托车 机车
发动机 动力机 引擎
柴油机 狄塞耳机
发电机 电机
Expand Down
1 change: 1 addition & 0 deletions pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@
</properties>

<dependencies>

<dependency>
<groupId>org.nlpcn</groupId>
<artifactId>nlp-lang</artifactId>
Expand Down
20 changes: 18 additions & 2 deletions src/main/java/org/ansj/domain/KV.java
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,28 @@ public class KV<K, V> {

private V v;

private <K,V> KV<K,V>(K k, V v) {
private KV(K k, V v) {
this.k = k;
this.v = v;
}

public static <K, V> KV<K, V> with(K k, V v) {
return new KV(k, v);
return new KV<K, V>(k, v);
}

/**
 * Replaces the first component ({@code k}) of this pair.
 *
 * @param k the new first component
 */
public void setK(K k) {
this.k = k;
}

/**
 * Replaces the second component ({@code v}) of this pair.
 *
 * @param v the new second component
 */
public void setV(V v) {
this.v = v;
}

/**
 * Returns the first component ({@code k}) of this pair.
 *
 * @return the current first component
 */
public K getK() {
return k;
}

/**
 * Returns the second component ({@code v}) of this pair.
 *
 * @return the current second component
 */
public V getV() {
return v;
}
}
186 changes: 186 additions & 0 deletions src/main/java/org/ansj/library/AmbiguityLibrary.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,186 @@
package org.ansj.library;

import java.io.BufferedReader;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Map;
import java.util.Set;

import org.ansj.dic.PathToStream;
import org.ansj.domain.KV;
import org.nlpcn.commons.lang.tire.domain.Forest;
import org.nlpcn.commons.lang.tire.domain.Value;
import org.nlpcn.commons.lang.tire.library.Library;
import org.nlpcn.commons.lang.util.IOUtil;
import org.nlpcn.commons.lang.util.StringUtil;
import org.nlpcn.commons.lang.util.logging.Log;
import org.nlpcn.commons.lang.util.logging.LogFactory;

/**
 * Static registry of ambiguity dictionaries.
 *
 * <p>Every entry maps a normalized key (always prefixed with {@link #DEFAULT}) to a
 * {@link KV} that holds the dictionary's resource path and its {@link Forest}. When an
 * entry is registered without a forest, the forest is built from the file at that path
 * the first time it is requested through {@link #get(String)}.
 *
 * <p>NOTE(review): the registry is a plain {@code HashMap}; only the loading step in
 * {@code init} is synchronized. Confirm whether concurrent registration/removal is expected.
 */
public class AmbiguityLibrary {

    private static final Log LOG = LogFactory.getLog();

    // Ambiguity dictionaries: normalized key -> (resource path, forest or null until loaded).
    private static final Map<String, KV<String, Forest>> AMBIGUITY = new HashMap<>();

    /** Prefix carried by every registry key; on its own it is the key of the default dictionary. */
    public static final String DEFAULT = "ambiguity_";

    /**
     * Returns the dictionary registered under the bare {@link #DEFAULT} key.
     *
     * @return the default forest, or {@code null} under the same conditions as {@link #get(String)}
     */
    public static Forest get() {
        return get(DEFAULT);
    }

    /**
     * Returns the dictionary registered under {@code key}, loading it from its path on first use.
     *
     * @param key dictionary key, with or without the {@link #DEFAULT} prefix
     * @return the forest, or {@code null} if the key is not registered or loading failed
     */
    public static Forest get(String key) {
        key = fix(key);
        KV<String, Forest> kv = AMBIGUITY.get(key);

        if (kv == null) {
            LOG.warn("ambiguity " + key + " not found in config ");
            return null;
        }

        Forest forest = kv.getV();
        if (forest == null) {
            forest = init(kv);
        }
        return forest;
    }

    /**
     * Builds the forest for an entry from the file at {@code kv.getK()} and stores it in the entry.
     *
     * <p>Each non-blank line is trimmed and split on tab characters. Lines with an odd number
     * of fields are logged and skipped.
     *
     * @param kv registry entry holding the path and, after a successful load, the forest
     * @return the loaded forest, or {@code null} if reading the file threw an exception
     */
    private static synchronized Forest init(KV<String, Forest> kv) {
        Forest forest = kv.getV();
        if (forest != null) {
            // Already populated by the time this call acquired the class lock.
            return forest;
        }
        forest = new Forest();
        try (BufferedReader br = IOUtil.getReader(PathToStream.stream(kv.getK()), "utf-8")) {
            LOG.info("begin init dic !");
            long start = System.currentTimeMillis();
            String temp;
            while ((temp = br.readLine()) != null) {
                if (StringUtil.isNotBlank(temp)) {
                    temp = StringUtil.trim(temp);
                    String[] split = temp.split("\t");
                    if (split.length % 2 != 0) {
                        LOG.error("init ambiguity error in line :" + temp + " format err !");
                        continue;
                    }
                    forest.addBranch(branchKey(split), split);
                }
            }
            LOG.info("load dic use time:" + (System.currentTimeMillis() - start) + " path is : " + kv.getK());
            // Publish only a fully built forest; on failure the entry keeps its null value.
            kv.setV(forest);
            return forest;
        } catch (Exception e) {
            LOG.error("Init ambiguity library error :" + e.getMessage() + ", path: " + kv.getK());
            return null;
        }
    }

    /**
     * Concatenates the elements at even indices of {@code split}; the result is used as the
     * branch key under which the whole array is stored.
     *
     * <p>NOTE(review): presumably even indices are words and odd indices their tags - confirm
     * against the dictionary format.
     */
    private static String branchKey(String[] split) {
        StringBuilder sb = new StringBuilder();
        for (int i = 0; i < split.length; i += 2) {
            sb.append(split[i]);
        }
        return sb.toString();
    }

    /**
     * Adds one rule to the dictionary registered under {@code key}.
     *
     * <p>The call is logged and ignored when {@code split} has an odd number of elements or
     * when no forest is available for {@code key} (previously the latter case threw a
     * {@code NullPointerException}).
     *
     * @param key   dictionary key, with or without the {@link #DEFAULT} prefix
     * @param split the rule fields; must contain an even number of elements
     */
    public static void insert(String key, String... split) {
        Forest forest = get(key);
        if (split.length % 2 != 0) {
            LOG.error("init ambiguity error in line :" + Arrays.toString(split) + " format err !");
            return;
        }
        if (forest == null) {
            LOG.error("ambiguity " + key + " is not available, insert ignored : " + Arrays.toString(split));
            return;
        }
        forest.addBranch(branchKey(split), split);
    }

    /**
     * Inserts a prepared {@link Value} into the dictionary registered under {@code key}.
     *
     * <p>The call is logged and ignored when no forest is available for {@code key}
     * (previously this case threw a {@code NullPointerException}).
     *
     * @param key   dictionary key, with or without the {@link #DEFAULT} prefix
     * @param value the value handed to {@code Library.insertWord}
     */
    public static void insert(String key, Value value) {
        Forest forest = get(key);
        if (forest == null) {
            LOG.error("ambiguity " + key + " is not available, insert ignored");
            return;
        }
        Library.insertWord(forest, value);
    }

    /**
     * Registers a dictionary by path; its forest is loaded on the first {@link #get(String)}.
     *
     * @param key  dictionary key, with or without the {@link #DEFAULT} prefix
     * @param path location passed to {@code PathToStream.stream} when the dictionary is loaded
     */
    public static void put(String key, String path) {
        put(key, path, null);
    }

    /**
     * Registers a dictionary, replacing any existing entry under the same key.
     *
     * @param key   dictionary key, with or without the {@link #DEFAULT} prefix
     * @param path  location used if the forest has to be (re)loaded
     * @param value an already built forest, or {@code null} to load from {@code path} on first use
     */
    public static void put(String key, String path, Forest value) {
        key = fix(key);
        AMBIGUITY.put(key, KV.with(path, value));
    }

    /**
     * Removes the entry registered under {@code key}.
     *
     * <p>Declared static like every other member of this registry; calls made through an
     * instance expression still compile.
     *
     * @param key dictionary key, with or without the {@link #DEFAULT} prefix
     * @return the removed entry, or {@code null} if there was none
     */
    public static KV<String, Forest> remove(String key) {
        key = fix(key);
        return AMBIGUITY.remove(key);
    }

    /**
     * Discards the forest held for {@code key} and immediately loads it again from its path.
     *
     * <p>An unregistered key is logged and ignored (previously it threw a
     * {@code NullPointerException}).
     *
     * @param key dictionary key, with or without the {@link #DEFAULT} prefix
     */
    public static void reload(String key) {
        key = fix(key);
        KV<String, Forest> kv = AMBIGUITY.get(key);
        if (kv == null) {
            LOG.warn("ambiguity " + key + " not found in config ");
            return;
        }
        kv.setV(null);
        get(key);
    }

    /**
     * Returns the registered keys.
     *
     * @return the key set of the underlying map (a live view, not a copy)
     */
    public static Set<String> keys() {
        return AMBIGUITY.keySet();
    }

    /**
     * Normalizes a key so that it always starts with {@link #DEFAULT}.
     *
     * @param key raw key
     * @return {@code key} unchanged if it already has the prefix, otherwise the prefixed key
     */
    private static String fix(String key) {
        if (key.startsWith(DEFAULT)) {
            return key;
        }
        return DEFAULT + key;
    }

    /**
     * Registers a dictionary by path unless an entry already exists for the key.
     *
     * <p>The key is normalized first, as in every other method of this class; otherwise an
     * unprefixed key would be stored where {@link #get(String)} can never find it.
     *
     * @param key  dictionary key, with or without the {@link #DEFAULT} prefix
     * @param path location used when the dictionary is loaded
     */
    public static void putIfAbsent(String key, String path) {
        key = fix(key);
        if (!AMBIGUITY.containsKey(key)) {
            AMBIGUITY.put(key, KV.with(path, (Forest) null));
        }
    }

}
50 changes: 33 additions & 17 deletions src/main/java/org/ansj/library/CrfLibrary.java
Original file line number Diff line number Diff line change
Expand Up @@ -9,35 +9,39 @@
import org.ansj.app.crf.SplitWord;
import org.ansj.app.crf.model.CRFModel;
import org.ansj.dic.PathToStream;
import org.nlpcn.commons.lang.tire.domain.Forest;
import org.ansj.domain.KV;
import org.nlpcn.commons.lang.util.logging.Log;
import org.nlpcn.commons.lang.util.logging.LogFactory;
import org.nlpcn.commons.lang.util.tuples.KeyValue;

public class CrfLibrary {
public class CrfLibrary {

private static final Log LOG = LogFactory.getLog();

// CRF模型
private static final Map<String, KeyValue<String, SplitWord>> CRF = new HashMap<>();
private static final Map<String, KV<String, SplitWord>> CRF = new HashMap<>();

public static final String DEFAULT = "crf_";

/**
 * Looks up the entry registered under the bare {@link #DEFAULT} key.
 *
 * @return whatever {@link #get(String)} returns for {@code DEFAULT}
 */
public static SplitWord get() {
    return CrfLibrary.get(DEFAULT);
}

/**
* 根据key获取crf分词器
*
* @param key
* @return crf分词器
*/
public static SplitWord crf(String key) {
KeyValue<String, SplitWord> kv = CRF.get(fix(key));
public static SplitWord get(String key) {
key = fix(key);
KV<String, SplitWord> kv = CRF.get(key);

if (kv == null) {
LOG.warn("crf " + key + " not found in config ");
return null;
}

SplitWord sw = (SplitWord) kv.getValue();
SplitWord sw = (SplitWord) kv.getV();
if (sw == null) {
sw = initCRFModel(kv);
}
Expand All @@ -50,18 +54,18 @@ public static SplitWord crf(String key) {
* @param modelPath
* @return
*/
private static synchronized SplitWord initCRFModel(KeyValue<String, SplitWord> kv) {
private static synchronized SplitWord initCRFModel(KV<String, SplitWord> kv) {
try {
if (kv.getValue() != null) {
return kv.getValue();
if (kv.getV() != null) {
return kv.getV();
}

long start = System.currentTimeMillis();
LOG.info("begin init crf model!");
try (InputStream is = PathToStream.stream(kv.getKey())) {
try (InputStream is = PathToStream.stream(kv.getK())) {
SplitWord crfSplitWord = new SplitWord(Model.load(CRFModel.class, is));
kv.setValue(crfSplitWord);
LOG.info("load crf use time:" + (System.currentTimeMillis() - start) + " path is : " + kv.getKey());
kv.setV(crfSplitWord);
LOG.info("load crf use time:" + (System.currentTimeMillis() - start) + " path is : " + kv.getK());
return crfSplitWord;
}
} catch (Exception e) {
Expand All @@ -78,11 +82,13 @@ private static synchronized SplitWord initCRFModel(KeyValue<String, SplitWord> k
* @param dic2
*/
public static void put(String key, String path) {
    // Normalize once here, then delegate with no pre-built SplitWord.
    final String fixedKey = fix(key);
    put(fixedKey, path, null);
}

public static void put(String key, String path, SplitWord sw) {
CRF.put(key, KeyValue.with(path, sw));
key = fix(key);
CRF.put(key, KV.with(path, sw));
}

/**
Expand All @@ -91,17 +97,21 @@ public static void put(String key, String path, SplitWord sw) {
* @param key
* @return
*/
public static KeyValue<String, SplitWord> remove(String key) {
public KV<String, SplitWord> remove(String key) {
key = fix(key);
return CRF.remove(key);
}

/**
* 刷新一个,将值设置为null
*
* @param key
* @return
*/
public static KeyValue<String, SplitWord> flush(String key) {
CRF.get(key).setValue(null);
public static void reload(String key) {
key = fix(key);
CRF.get(key).setV(null);
get(key);
}

public static Set<String> keys() {
Expand All @@ -115,4 +125,10 @@ private static String fix(String key) {
return DEFAULT + key;
}
}

/**
 * Registers a model path unless an entry already exists for the key.
 *
 * <p>The key is normalized with {@code fix} first, matching {@code get}, {@code put},
 * {@code remove} and {@code reload}; otherwise an unprefixed key would be stored where
 * {@code get} can never find it.
 *
 * @param key  model key, with or without the {@link #DEFAULT} prefix
 * @param path location used when the model is loaded
 */
public static void putIfAbsent(String key, String path) {
    key = fix(key);
    if (!CRF.containsKey(key)) {
        CRF.put(key, KV.with(path, (SplitWord) null));
    }
}
}
Loading

0 comments on commit 9c246c9

Please sign in to comment.