Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feature: the failure-jump pattern #66

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
99 changes: 99 additions & 0 deletions src/main/java/org/ahocorasick/trie/FailureJumpState.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
package org.ahocorasick.trie;

/**
 * Mutable cursor used while scanning a text in failure-jump mode.
 * <p>
 * It holds the automaton {@link State} the scan is currently "parked" on and
 * counts how many non-matching characters have been skipped (jumped over)
 * since the last reset. While a jump is in progress the previously reached
 * state is retained, so matching can resume once a matching character shows
 * up again, as long as no more than {@code maxFailureTimes} consecutive
 * characters are skipped.
 *
 * @author wangzhao
 */
public class FailureJumpState {
    /** Value of {@link #failureTimes} right after creation or {@link #reset(State)}. */
    private static final int INIT = -1;

    /** The state matching resumes from on the next character. */
    private State state;

    /**
     * Length of the jump currently in progress.
     * <ul>
     * <li>-1: INIT, nothing has matched since the last reset
     * <li> 0: READY, the last character matched and a jump may start
     * <li> N: number of consecutive characters skipped so far in this jump
     * </ul>
     */
    private int failureTimes;

    /**
     * Maximum number of consecutive characters a single jump may skip.
     */
    private final int maxFailureTimes;

    /**
     * Sum of the lengths of all jumps completed since the last reset.
     */
    private int total;

    private FailureJumpState(int maxFailureTimes) {
        if (maxFailureTimes <= 0) {
            throw new IllegalArgumentException("the maxFailureTimes must be larger than 0: " + maxFailureTimes);
        }
        this.maxFailureTimes = maxFailureTimes;
    }

    /**
     * Creates a cursor in INIT state positioned on {@code state}.
     *
     * @param state           the state to start from
     * @param maxFailureTimes maximum length of a single jump, must be positive
     * @return the new cursor
     * @throws IllegalArgumentException if {@code maxFailureTimes} is not positive
     */
    public static FailureJumpState createInstance(State state, int maxFailureTimes) {
        FailureJumpState failureJumpState = new FailureJumpState(maxFailureTimes);
        failureJumpState.reset(state);

        return failureJumpState;
    }

    /**
     * Puts the cursor back into INIT state: no jump in progress and no
     * skipped characters accumulated.
     *
     * @param state the state to continue from
     */
    public void reset(State state) {
        this.state = state;
        this.failureTimes = INIT;
        this.total = 0;
    }

    /**
     * Advances the cursor with the state reached for the current character.
     * A depth greater than 0 is treated as "the character matched", a depth
     * of 0 as "the character did not match".
     * <ul>
     * <li>INIT(-1): moves to READY on the first match; otherwise simply
     * follows {@code nextState}.
     * <li>READY(0) / PLUS(N): on a mismatch the jump counter grows and the
     * retained state is kept. Once the counter has already reached
     * {@link #maxFailureTimes} the jump is abandoned and the cursor is reset.
     * <li>TOTAL: on a match after a jump, the jump length is added to
     * {@link #getTotal()} and the cursor is READY again.
     * </ul>
     *
     * @param nextState the state reached by the automaton for the current character
     */
    public void nextState(State nextState) {
        final int depth = nextState.getDepth();

        if (this.failureTimes == INIT && depth > 0) {
            // init --> ready
            this.failureTimes = 0;
        } else if (this.failureTimes >= 0 && depth == 0) {
            // ready --> plus
            if (this.failureTimes < this.maxFailureTimes) {
                // keep the retained state while the jump is within its limit
                this.failureTimes++;
            } else {
                // The jump is too long: give up on the retained state and
                // restart from the depth-0 state just reached. Resetting to
                // the retained state instead would let a later character
                // resume the abandoned keyword with total == 0, exceeding
                // the limit and yielding a wrong start offset.
                this.reset(nextState);
            }
            return;
        } else if (depth > 0) {
            // plus --> total
            this.total += this.failureTimes;
            this.failureTimes = 0;
        }
        this.state = nextState;
    }

    /**
     * @return the number of characters skipped by completed jumps since the last reset
     */
    public int getTotal() {
        return this.total;
    }

    /**
     * @return the state matching resumes from
     */
    public State getState() {
        return state;
    }
}
167 changes: 147 additions & 20 deletions src/main/java/org/ahocorasick/trie/Trie.java
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,12 @@ public class Trie {

private final State rootState;

/**
* The maximum number of failure-jumps. It needs to be set before calling parseText.
* It has no effect unless the allowFailureJump config is set.
*/
private int failureTimes;

private Trie(final TrieConfig trieConfig) {
this.trieConfig = trieConfig;
this.rootState = new State();
Expand Down Expand Up @@ -106,6 +112,15 @@ private Token createMatch(Emit emit, String text) {
return new MatchToken(text.substring(emit.getStart(), emit.getEnd() + 1), emit);
}

/**
 * Sets the maximum number of failure-jumps used when parsing in
 * failure-jump mode.
 *
 * @param times the maximum, must be positive
 * @return this trie, to allow call chaining
 * @throws IllegalArgumentException if {@code times} is not positive
 */
public Trie failureTimes(int times) {
    if (times <= 0) {
        throw new IllegalArgumentException("the failureTimes must be a positive integer: " + times);
    }

    this.failureTimes = times;
    return this;
}

public Collection<Emit> parseText(final CharSequence text) {
    // Delegate to the handler-based overload with a fresh default handler.
    final DefaultEmitHandler defaultHandler = new DefaultEmitHandler();
    return parseText(text, defaultHandler);
}
Expand Down Expand Up @@ -137,10 +152,18 @@ public boolean containsMatch(final CharSequence text) {
}

/**
 * Scans {@code text} and reports matches to {@code emitHandler}. The
 * failure-jump scan is used only when it is both enabled in the config and
 * a positive failure-jump limit has been set; otherwise the plain scan runs.
 *
 * @param text        the text to scan
 * @param emitHandler receives the matches
 */
public void parseText(final CharSequence text, final EmitHandler emitHandler) {
    final boolean failureJumpActive = this.trieConfig.isAllowFailureJump() && this.failureTimes > 0;

    if (!failureJumpActive) {
        doParseText(text, emitHandler);
        return;
    }

    doParseTextWithFailureJump(text, emitHandler);
}

private void doParseText(final CharSequence text, final EmitHandler emitHandler) {
State currentState = getRootState();

for (int position = 0; position < text.length(); position++) {
Character character = text.charAt(position);
char character = text.charAt(position);

// TODO: Maybe lowercase the entire string at once?
if (trieConfig.isCaseInsensitive()) {
Expand All @@ -154,6 +177,30 @@ public void parseText(final CharSequence text, final EmitHandler emitHandler) {
}
}

/**
 * Failure-jump variant of the text scan: walks the text character by
 * character, reporting matches to {@code emitHandler}.
 *
 * @param text        the text to scan
 * @param emitHandler receives the matches
 */
private void doParseTextWithFailureJump(final CharSequence text, final EmitHandler emitHandler) {
    final FailureJumpState jumpState = FailureJumpState.createInstance(getRootState(), this.failureTimes);

    int position = 0;
    while (position < text.length()) {
        final char raw = text.charAt(position);

        // TODO: Maybe lowercase the entire string at once?
        final char character = trieConfig.isCaseInsensitive() ? Character.toLowerCase(raw) : raw;

        final State reached = getStateWithFailureJump(jumpState, character);

        if (storeEmitsWithFailureJump(position, jumpState, emitHandler)) {
            if (trieConfig.isStopOnHit()) {
                return;
            }
            // a match was reported: start counting jumps afresh
            jumpState.reset(reached);
        }

        position++;
    }
}

/**
* The first matching text sequence.
*
Expand All @@ -168,31 +215,39 @@ public Emit firstMatch(final CharSequence text) {
if (parseText != null && !parseText.isEmpty()) {
return parseText.iterator().next();
}
} else if (trieConfig.isAllowFailureJump()){
return doFirstMatchWithFailureJump(text);
} else {
// Fast path. Returns first match found.
State currentState = getRootState();
return doFirstMatch(text);
}

return null;
}

for (int position = 0; position < text.length(); position++) {
Character character = text.charAt(position);
private Emit doFirstMatch(final CharSequence text) {
// Fast path. Returns first match found.
State currentState = getRootState();

// TODO: Lowercase the entire string at once?
if (trieConfig.isCaseInsensitive()) {
character = Character.toLowerCase(character);
}
for (int position = 0; position < text.length(); position++) {
char character = text.charAt(position);

// TODO: Lowercase the entire string at once?
if (trieConfig.isCaseInsensitive()) {
character = Character.toLowerCase(character);
}

currentState = getState(currentState, character);
Collection<String> emitStrs = currentState.emit();

currentState = getState(currentState, character);
Collection<String> emitStrs = currentState.emit();

if (emitStrs != null && !emitStrs.isEmpty()) {
for (final String emitStr : emitStrs) {
final Emit emit = new Emit(position - emitStr.length() + 1, position, emitStr);
if (trieConfig.isOnlyWholeWords()) {
if (!isPartialMatch(text, emit)) {
return emit;
}
} else {
if (emitStrs != null && !emitStrs.isEmpty()) {
for (final String emitStr : emitStrs) {
final Emit emit = new Emit(position - emitStr.length() + 1, position, emitStr);
if (trieConfig.isOnlyWholeWords()) {
if (!isPartialMatch(text, emit)) {
return emit;
}
} else {
return emit;
}
}
}
Expand All @@ -201,6 +256,38 @@ public Emit firstMatch(final CharSequence text) {
return null;
}

/**
 * Failure-jump variant of the first-match fast path.
 * <p>
 * If no positive failure-jump limit has been set, this falls back to the
 * plain first-match scan, mirroring the fallback in
 * {@code parseText(CharSequence, EmitHandler)}.
 *
 * @param text the text to scan
 * @return the first match found, or {@code null} when there is none
 */
private Emit doFirstMatchWithFailureJump(final CharSequence text) {
    // Without a positive limit FailureJumpState.createInstance would throw,
    // so behave like the non-failure-jump path instead.
    if (this.failureTimes <= 0) {
        return doFirstMatch(text);
    }

    // Fast path. Returns first match found.
    State currentState = getRootState();
    FailureJumpState state = FailureJumpState.createInstance(currentState, this.failureTimes);

    for (int position = 0; position < text.length(); position++) {
        char character = text.charAt(position);

        // TODO: Lowercase the entire string at once?
        if (trieConfig.isCaseInsensitive()) {
            character = Character.toLowerCase(character);
        }

        currentState = getStateWithFailureJump(state, character);
        Collection<String> emitStrs = currentState.emit();

        for (final String emitStr : emitStrs) {
            // widen the match to the left by the characters that were jumped over
            int start = position - emitStr.length() - state.getTotal() + 1;
            final Emit emit = new Emit(start, position, emitStr);
            if (!trieConfig.isOnlyWholeWords() || !isPartialMatch(text, emit)) {
                return emit;
            }
        }
    }

    return null;
}

private boolean isPartialMatch(final CharSequence searchText, final Emit emit) {
return (emit.getStart() != 0 &&
Character.isAlphabetic(searchText.charAt(emit.getStart() - 1))) ||
Expand Down Expand Up @@ -250,6 +337,20 @@ private State getState(State currentState, final Character character) {
return newCurrentState;
}

/**
 * Computes the automaton transition for {@code c} starting from the state
 * held by {@code state}, following failure links until a transition exists,
 * then feeds the result into the failure-jump cursor.
 *
 * @param state the failure-jump cursor; updated via {@code nextState}
 * @param c     the current character
 * @return the state reached by the transition
 */
private State getStateWithFailureJump(FailureJumpState state, final Character c) {
    State probe = state.getState();
    State target;

    while ((target = probe.nextState(c)) == null) {
        probe = probe.failure();
    }

    state.nextState(target);

    return target;
}

private void constructFailureStates() {
final Queue<State> queue = new LinkedBlockingDeque<>();
final State startState = getRootState();
Expand Down Expand Up @@ -297,6 +398,22 @@ private boolean storeEmits(final int position, final State currentState, final E
return emitted;
}

/**
 * Reports every keyword emitted by the cursor's current state to
 * {@code emitHandler}, widening each match to the left by the number of
 * characters the cursor has jumped over.
 *
 * @param position     index of the last character of the match
 * @param currentState the failure-jump cursor
 * @param emitHandler  receives the matches
 * @return {@code true} if the handler accepted at least one emit
 */
private boolean storeEmitsWithFailureJump(final int position, final FailureJumpState currentState, final EmitHandler emitHandler) {
    final Collection<String> keywords = currentState.getState().emit();
    final int skipped = currentState.getTotal();
    boolean anyEmitted = false;

    for (final String keyword : keywords) {
        final int start = position - keyword.length() - skipped + 1;
        if (emitHandler.emit(new Emit(start, position, keyword))) {
            anyEmitted = true;
        }
        if (anyEmitted && trieConfig.isStopOnHit()) {
            break;
        }
    }

    return anyEmitted;
}

private boolean isCaseInsensitive() {
    // Convenience accessor for the config flag.
    return this.trieConfig.isCaseInsensitive();
}
Expand Down Expand Up @@ -441,5 +558,15 @@ public TrieBuilder caseInsensitive() {
public TrieBuilder removeOverlaps() {
    // Alias: simply forwards to ignoreOverlaps().
    final TrieBuilder builder = ignoreOverlaps();
    return builder;
}

/**
 * Enables the failure-jump pattern in the trie configuration, which lets
 * the scan jump over a few consecutive characters when the current
 * character does not match.
 *
 * @return this builder, to allow call chaining
 */
public TrieBuilder allowFailureJump() {
    final boolean enabled = true;
    this.trieConfig.setAllowFailureJump(enabled);
    return this;
}
}
}
13 changes: 13 additions & 0 deletions src/main/java/org/ahocorasick/trie/TrieConfig.java
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,11 @@ public class TrieConfig {

private boolean stopOnHit = false;

/**
* Allow failure-jumps when the current character does not match.
*/
private boolean allowFailureJump = false;

public boolean isStopOnHit() {
    // plain accessor for the stopOnHit flag
    return this.stopOnHit;
}
Expand Down Expand Up @@ -51,4 +56,12 @@ public boolean isCaseInsensitive() {
public void setCaseInsensitive(boolean caseInsensitive) {
    // plain mutator for the caseInsensitive flag
    final boolean requested = caseInsensitive;
    this.caseInsensitive = requested;
}

/**
 * @return whether the failure-jump pattern is enabled
 */
public boolean isAllowFailureJump() {
    return this.allowFailureJump;
}

/**
 * @param allowFailureJump {@code true} to enable the failure-jump pattern
 */
public void setAllowFailureJump(boolean allowFailureJump) {
    final boolean requested = allowFailureJump;
    this.allowFailureJump = requested;
}
}
22 changes: 22 additions & 0 deletions src/test/java/org/ahocorasick/trie/TrieTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -464,6 +464,28 @@ public void testLargeString() {
assertEquals(textSize / interval, emits.size());
}

/**
 * Parses a text containing the keyword verbatim and a text where the
 * keyword is interrupted by up to two consecutive foreign characters, and
 * checks that both yield exactly one match with the expected bounds.
 */
@Test
public void testParseTextWithFailureJump() {
    String text = "Hello Trump";
    String textForFailureJump = "Hello T!r**ump";

    Trie trie = Trie.builder()
            .allowFailureJump()
            .addKeyword("Trump")
            .build();

    Collection<Emit> emits = trie.failureTimes(2).parseText(text);
    Collection<Emit> emitsForFailureJump = trie.failureTimes(2).parseText(textForFailureJump);

    // Verbatim keyword: matched at its literal position.
    assertEquals(1, emits.size());
    Emit plain = emits.iterator().next();
    assertEquals("Trump", plain.getKeyword());
    assertEquals(6, plain.getStart());
    assertEquals(10, plain.getEnd());

    // Interrupted keyword: the match spans the three skipped characters too.
    assertEquals(1, emitsForFailureJump.size());
    Emit jumped = emitsForFailureJump.iterator().next();
    assertEquals("Trump", jumped.getKeyword());
    assertEquals(6, jumped.getStart());
    assertEquals(13, jumped.getEnd());
}

/**
* Generates a random sequence of ASCII numbers.
*
Expand Down