Skip to content

Commit

Permalink
增加synonyms同义词词典功能,实现类似ambiguity歧义词词典
Browse files Browse the repository at this point in the history
  • Loading branch information
yeyuelong committed Sep 22, 2016
1 parent 6cf0dd0 commit 3bec38c
Show file tree
Hide file tree
Showing 5 changed files with 646 additions and 599 deletions.
1 change: 1 addition & 0 deletions ansj_library.properties
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ dic=library/default.dic

#redress dic file path
ambiguityLibrary=library/ambiguity.dic
synonymsLibrary=library/synonyms.dic

#set real name
isRealName=true
Expand Down
2 changes: 2 additions & 0 deletions library/synonyms.dic
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
枇杷 苹果 香蕉
中国 中华 华夏
325 changes: 163 additions & 162 deletions pom.xml
Original file line number Diff line number Diff line change
@@ -1,162 +1,163 @@
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>org.ansj</groupId>
<artifactId>ansj_seg</artifactId>
<packaging>jar</packaging>
<name>ansj_seg</name>
<version>5.0.2</version>
<description>best java chinese word seg ! </description>
<url>https://github.com/NLPchina/ansj_seg</url>
<licenses>
<license>
<name>The Apache Software License, Version 2.0</name>
<url>http://www.apache.org/licenses/LICENSE-2.0.txt</url>
<distribution>repo</distribution>
</license>
</licenses>

<scm>
<connection>scm:git:[email protected]:ansjsun/ansj_seg.git</connection>
<developerConnection>scm:git:[email protected]:ansjsun/ansj_seg.git</developerConnection>
<url>[email protected]:ansjsun/ansj_seg.git</url>
</scm>


<developers>
<developer>
<id>ansj</id>
<name>ansj</name>
<email>[email protected]</email>
</developer>
</developers>

<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
</properties>

<dependencies>
<dependency>
<groupId>org.nlpcn</groupId>
<artifactId>nlp-lang</artifactId>
<version>1.7</version>
<scope>compile</scope>
</dependency>

<dependency>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-api</artifactId>
<version>1.7.21</version>
</dependency>

<dependency>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-log4j12</artifactId>
<version>1.7.21</version>
<scope>provided</scope>
</dependency>

<dependency>
<groupId>log4j</groupId>
<artifactId>log4j</artifactId>
<version>1.2.16</version>
<scope>provided</scope>
</dependency>

<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<version>4.8.1</version>
<scope>test</scope>
</dependency>

</dependencies>

<build>
<plugins>
<plugin>
<groupId>net.orfjackal.retrolambda</groupId>
<artifactId>retrolambda-maven-plugin</artifactId>
<version>2.0.6</version>
<executions>
<execution>
<goals>
<goal>process-main</goal>
</goals>
</execution>
</executions>
<configuration>
<target>1.6</target>
<defaultMethods>false</defaultMethods>
<fork>false</fork>
</configuration>
</plugin>

<plugin>
<artifactId>maven-compiler-plugin</artifactId>
<version>2.3.2</version>
<configuration>
<source>1.7</source>
<target>1.7</target>
<encoding>UTF-8</encoding>
</configuration>
</plugin>

<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-source-plugin</artifactId>
<executions>
<execution>
<id>attach-sources</id>
<goals>
<goal>jar-no-fork</goal>
</goals>
</execution>
</executions>

<configuration>
<attach>true</attach>
<includes>
<include>**/*.java</include>
</includes>
</configuration>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-javadoc-plugin</artifactId>
<version>2.10.3</version>
<configuration>
<additionalparam>-Xdoclint:none</additionalparam>
</configuration>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-gpg-plugin</artifactId>
<version>1.4</version>
<executions>
<execution>
<id>sign-artifacts</id>
<phase>verify</phase>
<goals>
<goal>sign</goal>
</goals>
</execution>
</executions>
</plugin>
</plugins>
</build>

<distributionManagement>
<snapshotRepository>
<id>sonatype-nexus-snapshots</id>
<name>Sonatype Nexus snapshot repository</name>
<url>https://oss.sonatype.org/content/repositories/snapshots</url>
</snapshotRepository>

<repository>
<id>sonatype-nexus-staging</id>
<name>Sonatype Nexus release repository</name>
<url>https://oss.sonatype.org/service/local/staging/deploy/maven2</url>
</repository>
</distributionManagement>
</project>
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>org.ansj</groupId>
<artifactId>ansj_seg</artifactId>
<packaging>jar</packaging>
<name>ansj_seg</name>
<version>5.0.2</version>
<description>best java chinese word seg ! </description>
<url>https://github.com/NLPchina/ansj_seg</url>
<licenses>
<license>
<name>The Apache Software License, Version 2.0</name>
<url>http://www.apache.org/licenses/LICENSE-2.0.txt</url>
<distribution>repo</distribution>
</license>
</licenses>

<scm>
<connection>scm:git:[email protected]:ansjsun/ansj_seg.git</connection>
<developerConnection>scm:git:[email protected]:ansjsun/ansj_seg.git</developerConnection>
<url>[email protected]:ansjsun/ansj_seg.git</url>
</scm>


<developers>
<developer>
<id>ansj</id>
<name>ansj</name>
<email>[email protected]</email>
</developer>
</developers>

<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
</properties>

<dependencies>
<dependency>
<groupId>org.nlpcn</groupId>
<artifactId>nlp-lang</artifactId>
<version>1.7</version>
<scope>compile</scope>
</dependency>

<dependency>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-api</artifactId>
<version>1.7.21</version>
</dependency>

<dependency>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-log4j12</artifactId>
<version>1.7.21</version>
<scope>provided</scope>
</dependency>

<dependency>
<groupId>log4j</groupId>
<artifactId>log4j</artifactId>
<version>1.2.16</version>
<scope>provided</scope>
</dependency>

<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<version>4.8.1</version>
<scope>test</scope>
</dependency>

</dependencies>

<build>
<plugins>
<plugin>
  <!-- Post-processes the compiled main classes down to a 1.6 target (see <target> below);
       maven-compiler-plugin itself is configured for source/target 1.7. -->
  <groupId>net.orfjackal.retrolambda</groupId>
  <artifactId>retrolambda-maven-plugin</artifactId>
  <version>2.0.6</version>
  <executions>
    <execution>
      <!-- Intentionally no <phase>: "default" is the name of a lifecycle, not of a phase,
           so binding the execution to it would mean process-main never runs. Leaving the
           phase out lets the goal use its own default binding. -->
      <goals>
        <goal>process-main</goal>
      </goals>
    </execution>
  </executions>
  <configuration>
    <target>1.6</target>
    <defaultMethods>false</defaultMethods>
    <fork>false</fork>
  </configuration>
</plugin>

<plugin>
<artifactId>maven-compiler-plugin</artifactId>
<version>2.3.2</version>
<configuration>
<source>1.7</source>
<target>1.7</target>
<encoding>UTF-8</encoding>
</configuration>
</plugin>

<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-source-plugin</artifactId>
<executions>
<execution>
<id>attach-sources</id>
<goals>
<goal>jar-no-fork</goal>
</goals>
</execution>
</executions>

<configuration>
<attach>true</attach>
<includes>
<include>**/*.java</include>
</includes>
</configuration>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-javadoc-plugin</artifactId>
<version>2.10.3</version>
<configuration>
<additionalparam>-Xdoclint:none</additionalparam>
</configuration>
</plugin>
<!-- <plugin> -->
<!-- <groupId>org.apache.maven.plugins</groupId> -->
<!-- <artifactId>maven-gpg-plugin</artifactId> -->
<!-- <version>1.4</version> -->
<!-- <executions> -->
<!-- <execution> -->
<!-- <id>sign-artifacts</id> -->
<!-- <phase>verify</phase> -->
<!-- <goals> -->
<!-- <goal>sign</goal> -->
<!-- </goals> -->
<!-- </execution> -->
<!-- </executions> -->
<!-- </plugin> -->
</plugins>
</build>

<distributionManagement>
<snapshotRepository>
<id>sonatype-nexus-snapshots</id>
<name>Sonatype Nexus snapshot repository</name>
<url>https://oss.sonatype.org/content/repositories/snapshots</url>
</snapshotRepository>

<repository>
<id>sonatype-nexus-staging</id>
<name>Sonatype Nexus release repository</name>
<url>https://oss.sonatype.org/service/local/staging/deploy/maven2</url>
</repository>
</distributionManagement>
</project>
38 changes: 38 additions & 0 deletions src/main/java/org/ansj/library/UserDefineLibrary.java
Original file line number Diff line number Diff line change
Expand Up @@ -29,10 +29,12 @@ public class UserDefineLibrary {
// Main user dictionary forest.
// NOTE(review): presumably populated by initUserLibrary() — confirm against that method.
public static Forest FOREST = null;

// Ambiguity dictionary forest.
// NOTE(review): presumably populated by initAmbiguityLibrary() — confirm against that method.
public static Forest ambiguityForest = null;
// Synonyms dictionary forest. Assigned by initSynonymsLibrary(); it stays null when no
// synonyms library file is found.
public static Forest synonymsForest = null;

// Runs the three dictionary initializers once, when this class is initialized.
static {
initUserLibrary();
initAmbiguityLibrary();
initSynonymsLibrary();
}

/**
Expand Down Expand Up @@ -104,6 +106,42 @@ private static void initAmbiguityLibrary() {
}

}

/**
 * Initializes the synonyms dictionary.
 * <p>
 * Resolves {@code MyStaticValue.synonymsLibrary} through {@code findLibrary}. When at least
 * one file is returned, a new {@code Forest} is assigned to {@code synonymsForest} and every
 * non-blank line of every file is loaded into it: the line is trimmed and split on tab
 * characters, the first field becomes the branch key and the complete field array is stored
 * as the branch parameter. When no file is returned, {@code synonymsForest} is left
 * {@code null} and a warning is logged.
 * <p>
 * A read failure on one file is logged and does not stop the remaining files from loading.
 */
private static void initSynonymsLibrary() {

    File[] lib = findLibrary(MyStaticValue.synonymsLibrary);

    if (lib.length > 0) {
        synonymsForest = new Forest();
        for (File file : lib) {
            try (BufferedReader br = IOUtil.getReader(file, "utf-8")) {
                String temp;
                while ((temp = br.readLine()) != null) {
                    if (StringUtil.isNotBlank(temp)) {
                        temp = StringUtil.trim(temp);
                        // One synonym group per line, fields separated by a tab.
                        String[] split = temp.split("\t");
                        LIBRARYLOG.info("init synonyms in line :" + temp);
                        // The key word is also kept as element 0 of the stored group.
                        synonymsForest.addBranch(split[0], split);
                    }
                }

            } catch (UnsupportedEncodingException e) {
                LIBRARYLOG.warn("不支持的编码", e);
            } catch (IOException e) {
                // Pass the exception as the trailing argument so the stack trace is logged
                // too, not just the message text.
                LIBRARYLOG.warn("Init synonyms library error :{}, path: {}", e.getMessage(), file.getPath(), e);
            }
        }

        LIBRARYLOG.info("Init synonyms library ok!");

    } else {
        LIBRARYLOG.warn("Init synonyms library warning :{} because : file not found or failed to read !", MyStaticValue.synonymsLibrary);
    }

}

/**
* 加载用户自定义词典和补充词典
Expand Down
Loading

0 comments on commit 3bec38c

Please sign in to comment.