From 1f191f3c35f6a0a34dae046323e8507286d73231 Mon Sep 17 00:00:00 2001 From: cherry Date: Thu, 18 Apr 2019 23:45:51 +0800 Subject: [PATCH] finish feature: the failure-jump pattern --- .../ahocorasick/trie/FailureJumpState.java | 99 +++++++++++ src/main/java/org/ahocorasick/trie/Trie.java | 167 +++++++++++++++--- .../java/org/ahocorasick/trie/TrieConfig.java | 13 ++ .../java/org/ahocorasick/trie/TrieTest.java | 22 +++ 4 files changed, 281 insertions(+), 20 deletions(-) create mode 100644 src/main/java/org/ahocorasick/trie/FailureJumpState.java diff --git a/src/main/java/org/ahocorasick/trie/FailureJumpState.java b/src/main/java/org/ahocorasick/trie/FailureJumpState.java new file mode 100644 index 0000000..7edcf0e --- /dev/null +++ b/src/main/java/org/ahocorasick/trie/FailureJumpState.java @@ -0,0 +1,99 @@ +package org.ahocorasick.trie; + +/** + * The failure-jump state for failure-jump pattern. + *

+ * When the failure-jump config is enabled, matching may jump over a few consecutive characters + * when the current character does not match. + * + * @author wangzhao + */ +public class FailureJumpState { + private State state; + + /** + * the current failure times + * 

+ *

  • -1: INIT state + *
  • 0: READY to failure-jump + *
  • N: PLUS the failure-jump times + *
  • +N: TOTAL the current failure-jump times + */ + private int failureTimes; + /** + * the max failure-jump times allowed in a row; always larger than 0 + */ + private final int maxFailureTimes; + private int total; + + private FailureJumpState(int maxFailureTimes) { + if (maxFailureTimes > 0) { + this.maxFailureTimes = maxFailureTimes; + } else { + throw new IllegalArgumentException("the maxFailureTimes must be larger than 0: " + maxFailureTimes); + } + } + + public static FailureJumpState createInstance(State state, int maxFailureTimes) { + FailureJumpState failureJumpState = new FailureJumpState(maxFailureTimes); + failureJumpState.reset(state); + + return failureJumpState; + } + + /** + * reset to INIT state: stores the given state, sets failureTimes to -1 and total to 0 + * + * @param state the trie state to restart from + */ + public void reset(State state) { + this.state = state; + this.failureTimes = -1; + this.total = 0; + } + + /** + * advance the failure-jump state machine with the next trie state + * + * 
  • INIT(-1): the start state when a failure-jump state is created or reset. + * 
  • READY(0): INIT goes to READY once the current char is matched. READY + * remains until the current char does not match. + * 
  • PLUS(N): READY goes to PLUS when the current char does not match. It ends when + * the current failure-jump times exceed {@link #maxFailureTimes}. + * 
  • TOTAL(+N): when the current char matches again, PLUS is done and the current failureTimes is added to the total. + * Then go back to READY for the next failure-jump. + * + * @param nextState the trie state reached for the current character + */ + public void nextState(State nextState) { + if (this.failureTimes == -1 && nextState.getDepth() > 0) { + // init --> ready + this.failureTimes = 0; + } else if (this.failureTimes >= 0 && nextState.getDepth() == 0) { + // ready --> plus + if (this.failureTimes < this.maxFailureTimes) { + // keep the previous state (jump over this character) while + // the jump budget is not used up. + this.failureTimes++; + } else { + // jump budget used up: abandon the partial match and restart + // from the depth-0 state just reached, not from the stale state + this.reset(nextState); + } + return; + } else if (nextState.getDepth() > 0) { + // plus --> total + this.total += this.failureTimes; + this.failureTimes = 0; + } + this.state = nextState; + } + + public int getTotal() { + return this.total; + } + + public State getState() { + return state; + } +} diff --git a/src/main/java/org/ahocorasick/trie/Trie.java b/src/main/java/org/ahocorasick/trie/Trie.java index 25922db..0ef2e80 100644 --- a/src/main/java/org/ahocorasick/trie/Trie.java +++ b/src/main/java/org/ahocorasick/trie/Trie.java @@ -28,6 +28,12 @@ public class Trie { private final State rootState; + /** + * The max failure-jump times. It needs to be set before parseText. + * It has no effect unless the allowFailureJump config is set. 
+ */ + private int failureTimes; + private Trie(final TrieConfig trieConfig) { this.trieConfig = trieConfig; this.rootState = new State(); @@ -106,6 +112,15 @@ private Token createMatch(Emit emit, String text) { return new MatchToken(text.substring(emit.getStart(), emit.getEnd() + 1), emit); } + public Trie failureTimes(int times) { + if (times > 0) { + this.failureTimes = times; + return this; + } + + throw new IllegalArgumentException("the failureTimes must be a positive integer: " + times); + } + public Collection<Emit> parseText(final CharSequence text) { return parseText(text, new DefaultEmitHandler()); } @@ -137,10 +152,18 @@ public boolean containsMatch(final CharSequence text) { } public void parseText(final CharSequence text, final EmitHandler emitHandler) { + if (this.trieConfig.isAllowFailureJump() && this.failureTimes > 0) { + doParseTextWithFailureJump(text, emitHandler); + } else { + doParseText(text, emitHandler); + } + } + + private void doParseText(final CharSequence text, final EmitHandler emitHandler) { State currentState = getRootState(); for (int position = 0; position < text.length(); position++) { - Character character = text.charAt(position); + char character = text.charAt(position); // TODO: Maybe lowercase the entire string at once? if (trieConfig.isCaseInsensitive()) { @@ -154,6 +177,30 @@ public void parseText(final CharSequence text, final EmitHandler emitHandler) { } } + private void doParseTextWithFailureJump(final CharSequence text, final EmitHandler emitHandler) { + State currentState = getRootState(); + FailureJumpState state = FailureJumpState.createInstance(currentState, this.failureTimes); + + for (int position = 0; position < text.length(); position++) { + char character = text.charAt(position); + + // TODO: Maybe lowercase the entire string at once? 
+ if (trieConfig.isCaseInsensitive()) { + character = Character.toLowerCase(character); + } + + currentState = getStateWithFailureJump(state, character); + boolean emitted; + if ((emitted = storeEmitsWithFailureJump(position, state, emitHandler)) && trieConfig.isStopOnHit()) { + return; + } + + if (emitted) { + state.reset(currentState); + } + } + } + /** * The first matching text sequence. * @@ -168,31 +215,39 @@ public Emit firstMatch(final CharSequence text) { if (parseText != null && !parseText.isEmpty()) { return parseText.iterator().next(); } + } else if (trieConfig.isAllowFailureJump() && this.failureTimes > 0) { + return doFirstMatchWithFailureJump(text); } else { - // Fast path. Returns first match found. - State currentState = getRootState(); + return doFirstMatch(text); + } + + return null; + } - for (int position = 0; position < text.length(); position++) { - Character character = text.charAt(position); + private Emit doFirstMatch(final CharSequence text) { + // Fast path. Returns first match found. + State currentState = getRootState(); - // TODO: Lowercase the entire string at once? - if (trieConfig.isCaseInsensitive()) { - character = Character.toLowerCase(character); - } + for (int position = 0; position < text.length(); position++) { + char character = text.charAt(position); + + // TODO: Lowercase the entire string at once? 
+ if (trieConfig.isCaseInsensitive()) { + character = Character.toLowerCase(character); + } + + currentState = getState(currentState, character); + Collection<String> emitStrs = currentState.emit(); - currentState = getState(currentState, character); - Collection<String> emitStrs = currentState.emit(); - - if (emitStrs != null && !emitStrs.isEmpty()) { - for (final String emitStr : emitStrs) { - final Emit emit = new Emit(position - emitStr.length() + 1, position, emitStr); - if (trieConfig.isOnlyWholeWords()) { - if (!isPartialMatch(text, emit)) { - return emit; - } - } else { + if (emitStrs != null && !emitStrs.isEmpty()) { + for (final String emitStr : emitStrs) { + final Emit emit = new Emit(position - emitStr.length() + 1, position, emitStr); + if (trieConfig.isOnlyWholeWords()) { + if (!isPartialMatch(text, emit)) { return emit; } + } else { + return emit; } } } @@ -201,6 +256,38 @@ public Emit firstMatch(final CharSequence text) { return null; } + private Emit doFirstMatchWithFailureJump(final CharSequence text) { + // Fast path. Returns first match found. + State currentState = getRootState(); + FailureJumpState state = FailureJumpState.createInstance(currentState, this.failureTimes); + + for (int position = 0; position < text.length(); position++) { + char character = text.charAt(position); + + // TODO: Lowercase the entire string at once? 
+ if (trieConfig.isCaseInsensitive()) { + character = Character.toLowerCase(character); + } + + currentState = getStateWithFailureJump(state, character); + Collection<String> emitStrs = currentState.emit(); + + for (final String emitStr : emitStrs) { + int start = position - emitStr.length() - state.getTotal() + 1; + final Emit emit = new Emit(start, position, emitStr); + if (trieConfig.isOnlyWholeWords()) { + if (!isPartialMatch(text, emit)) { + return emit; + } + } else { + return emit; + } + } + } + + return null; + } + private boolean isPartialMatch(final CharSequence searchText, final Emit emit) { return (emit.getStart() != 0 && Character.isAlphabetic(searchText.charAt(emit.getStart() - 1))) || @@ -250,6 +337,20 @@ private State getState(State currentState, final Character character) { return newCurrentState; } + private State getStateWithFailureJump(FailureJumpState state, final Character c) { + State currentState = state.getState(); + State newCurrentState = currentState.nextState(c); + + while (newCurrentState == null) { + currentState = currentState.failure(); + newCurrentState = currentState.nextState(c); + } + + state.nextState(newCurrentState); + + return newCurrentState; + } + private void constructFailureStates() { final Queue<State> queue = new LinkedBlockingDeque<>(); final State startState = getRootState(); @@ -297,6 +398,22 @@ private boolean storeEmits(final int position, final State currentState, final E return emitted; } + private boolean storeEmitsWithFailureJump(final int position, final FailureJumpState currentState, final EmitHandler emitHandler) { + boolean emitted = false; + final Collection<String> emits = currentState.getState().emit(); + + // NOTE: no emptiness check here; an empty collection just skips the loop. 
+ for (final String emit : emits) { + int start = position - emit.length() - currentState.getTotal() + 1; + emitted = emitHandler.emit(new Emit(start, position, emit)) || emitted; + if (emitted && trieConfig.isStopOnHit()) { + break; + } + } + + return emitted; + } + private boolean isCaseInsensitive() { return trieConfig.isCaseInsensitive(); } @@ -441,5 +558,15 @@ public TrieBuilder caseInsensitive() { public TrieBuilder removeOverlaps() { return ignoreOverlaps(); } + + /** + * Enables the failure-jump pattern: matching may jump over a few consecutive characters + * when the current character does not match. + * @return this builder + */ + public TrieBuilder allowFailureJump() { + this.trieConfig.setAllowFailureJump(true); + return this; + } } } diff --git a/src/main/java/org/ahocorasick/trie/TrieConfig.java b/src/main/java/org/ahocorasick/trie/TrieConfig.java index f7487dd..2fe1a25 100644 --- a/src/main/java/org/ahocorasick/trie/TrieConfig.java +++ b/src/main/java/org/ahocorasick/trie/TrieConfig.java @@ -12,6 +12,11 @@ public class TrieConfig { private boolean stopOnHit = false; + /** + * whether failure-jump is allowed when the current character does not match + */ + private boolean allowFailureJump = false; + public boolean isStopOnHit() { return stopOnHit; } @@ -51,4 +56,12 @@ public boolean isCaseInsensitive() { public void setCaseInsensitive(boolean caseInsensitive) { this.caseInsensitive = caseInsensitive; } + + public boolean isAllowFailureJump() { + return allowFailureJump; + } + + public void setAllowFailureJump(boolean allowFailureJump) { + this.allowFailureJump = allowFailureJump; + } } diff --git a/src/test/java/org/ahocorasick/trie/TrieTest.java b/src/test/java/org/ahocorasick/trie/TrieTest.java index bf01589..e6eb399 100644 --- a/src/test/java/org/ahocorasick/trie/TrieTest.java +++ b/src/test/java/org/ahocorasick/trie/TrieTest.java @@ -464,6 +464,28 @@ public void testLargeString() { assertEquals(textSize / interval, emits.size()); } + @Test + public void testParseTextWithFailureJump() { + String text = "Hello Trump"; + 
+ String textForFailureJump = "Hello T!r**ump"; + + Trie trie = Trie.builder() + .allowFailureJump() + .addKeyword("Trump") + .build(); + + Collection<Emit> emits = trie.failureTimes(2).parseText(text); + Collection<Emit> emitsForFailureJump = trie.failureTimes(2).parseText(textForFailureJump); + + assertEquals(1, emits.size()); + assertEquals(6, emits.iterator().next().getStart()); + assertEquals(10, emits.iterator().next().getEnd()); + + assertEquals(1, emitsForFailureJump.size()); + assertEquals(6, emitsForFailureJump.iterator().next().getStart()); + assertEquals(13, emitsForFailureJump.iterator().next().getEnd()); + } + /** * Generates a random sequence of ASCII numbers. *