diff --git a/src/main/java/org/ahocorasick/trie/Trie.java b/src/main/java/org/ahocorasick/trie/Trie.java
index 8c8b58f..0b62c82 100644
--- a/src/main/java/org/ahocorasick/trie/Trie.java
+++ b/src/main/java/org/ahocorasick/trie/Trie.java
@@ -80,6 +80,10 @@ public Collection parseText(CharSequence text) {
             removePartialMatches(text, collectedEmits);
         }
 
+        if (trieConfig.isOnlyWholeWordsWhiteSpaceSeparated()) {
+            removePartialMatchesWhiteSpaceSeparated(text, collectedEmits);
+        }
+
         if (!trieConfig.isAllowOverlaps()) {
             IntervalTree intervalTree = new IntervalTree((List)(List)collectedEmits);
             intervalTree.removeOverlaps((List) (List) collectedEmits);
@@ -161,6 +165,28 @@ private void removePartialMatches(CharSequence searchText, List collectedE
         }
     }
 
+    /**
+     * Keeps only the emits that are delimited by whitespace on both sides. An emit that
+     * starts at index 0 or ends on the last character of the text counts as delimited
+     * on that side.
+     *
+     * @param searchText     the text the emits were collected from
+     * @param collectedEmits the emits to filter; modified in place
+     */
+    private void removePartialMatchesWhiteSpaceSeparated(CharSequence searchText, List<Emit> collectedEmits) {
+        int size = searchText.length();
+        List<Emit> wholeWordEmits = new ArrayList<>();
+        for (Emit emit : collectedEmits) {
+            if ((emit.getStart() == 0 || Character.isWhitespace(searchText.charAt(emit.getStart() - 1))) &&
+                    (emit.getEnd() + 1 == size || Character.isWhitespace(searchText.charAt(emit.getEnd() + 1)))) {
+                wholeWordEmits.add(emit);
+            }
+        }
+        // Rebuild the list in one pass; calling remove() per rejected emit rescans the list each time.
+        collectedEmits.clear();
+        collectedEmits.addAll(wholeWordEmits);
+    }
+
     private State getState(State currentState, Character character) {
         State newCurrentState = currentState.nextState(character);
         while (newCurrentState == null) {
@@ -237,6 +263,16 @@ public TrieBuilder onlyWholeWords() {
             return this;
         }
 
+        /**
+         * Restricts matches to keywords delimited by whitespace or by the start/end of the text.
+         *
+         * @return this builder
+         */
+        public TrieBuilder onlyWholeWordsWhiteSpaceSeparated() {
+            this.trieConfig.setOnlyWholeWordsWhiteSpaceSeparated(true);
+            return this;
+        }
+
         public TrieBuilder addKeyword(String keyword) {
             trie.addKeyword(keyword);
             return this;
diff --git a/src/main/java/org/ahocorasick/trie/TrieConfig.java b/src/main/java/org/ahocorasick/trie/TrieConfig.java
index 2d29788..f9f0125 100644
--- a/src/main/java/org/ahocorasick/trie/TrieConfig.java
+++ 
b/src/main/java/org/ahocorasick/trie/TrieConfig.java @@ -6,6 +6,8 @@ public class TrieConfig { private boolean onlyWholeWords = false; + private boolean onlyWholeWordsWhiteSpaceSeparated = false; + private boolean caseInsensitive = false; private boolean stopOnHit = false; @@ -30,6 +32,12 @@ public void setOnlyWholeWords(boolean onlyWholeWords) { this.onlyWholeWords = onlyWholeWords; } + public boolean isOnlyWholeWordsWhiteSpaceSeparated() { return onlyWholeWordsWhiteSpaceSeparated; } + + public void setOnlyWholeWordsWhiteSpaceSeparated(boolean onlyWholeWordsWhiteSpaceSeparated) { + this.onlyWholeWordsWhiteSpaceSeparated = onlyWholeWordsWhiteSpaceSeparated; + } + public boolean isCaseInsensitive() { return caseInsensitive; } diff --git a/src/test/java/org/ahocorasick/trie/TrieTest.java b/src/test/java/org/ahocorasick/trie/TrieTest.java index f4d3a6c..6a620f0 100644 --- a/src/test/java/org/ahocorasick/trie/TrieTest.java +++ b/src/test/java/org/ahocorasick/trie/TrieTest.java @@ -16,61 +16,61 @@ public class TrieTest { @Test public void keywordAndTextAreTheSame() { Trie trie = Trie.builder() - .addKeyword("abc") - .build(); + .addKeyword("abc") + .build(); Collection emits = trie.parseText("abc"); Iterator iterator = emits.iterator(); checkEmit(iterator.next(), 0, 2, "abc"); } - @Test + @Test public void keywordAndTextAreTheSameFirstMatch() { Trie trie = Trie.builder() .addKeyword("abc") .build(); - Emit firstMatch = trie.firstMatch("abc"); + Emit firstMatch = trie.firstMatch("abc"); checkEmit(firstMatch, 0, 2, "abc"); } @Test public void textIsLongerThanKeyword() { Trie trie = Trie.builder() - .addKeyword("abc") - .build(); + .addKeyword("abc") + .build(); Collection emits = trie.parseText(" abc"); Iterator iterator = emits.iterator(); checkEmit(iterator.next(), 1, 3, "abc"); } - @Test + @Test public void textIsLongerThanKeywordFirstMatch() { Trie trie = Trie.builder() .addKeyword("abc") .build(); - Emit firstMatch = trie.firstMatch(" abc"); + Emit firstMatch = 
trie.firstMatch(" abc"); checkEmit(firstMatch, 1, 3, "abc"); } @Test public void variousKeywordsOneMatch() { Trie trie = Trie.builder() - .addKeyword("abc") - .addKeyword("bcd") - .addKeyword("cde") - .build(); + .addKeyword("abc") + .addKeyword("bcd") + .addKeyword("cde") + .build(); Collection emits = trie.parseText("bcd"); Iterator iterator = emits.iterator(); checkEmit(iterator.next(), 0, 2, "bcd"); } - @Test + @Test public void variousKeywordsFirstMatch() { Trie trie = Trie.builder() .addKeyword("abc") .addKeyword("bcd") .addKeyword("cde") .build(); - Emit firstMatch = trie.firstMatch("bcd"); + Emit firstMatch = trie.firstMatch("bcd"); checkEmit(firstMatch, 0, 2, "bcd"); } @@ -125,15 +125,15 @@ public void ushersTestWithCapitalKeywords() { @Test public void ushersTestFirstMatch() { - Trie trie = Trie.builder() + Trie trie = Trie.builder() .addKeyword("hers") .addKeyword("his") .addKeyword("she") .addKeyword("he") .build(); - Emit firstMatch = trie.firstMatch("ushers"); - checkEmit(firstMatch, 2, 3, "he"); - } + Emit firstMatch = trie.firstMatch("ushers"); + checkEmit(firstMatch, 2, 3, "he"); + } @Test public void ushersTestByCallback() { @@ -163,30 +163,30 @@ public void emit(Emit emit) { @Test public void misleadingTest() { Trie trie = Trie.builder() - .addKeyword("hers") - .build(); + .addKeyword("hers") + .build(); Collection emits = trie.parseText("h he her hers"); Iterator iterator = emits.iterator(); checkEmit(iterator.next(), 9, 12, "hers"); } - @Test + @Test public void misleadingTestFirstMatch() { - Trie trie = Trie.builder() - .addKeyword("hers") - .build(); - Emit firstMatch = trie.firstMatch("h he her hers"); + Trie trie = Trie.builder() + .addKeyword("hers") + .build(); + Emit firstMatch = trie.firstMatch("h he her hers"); checkEmit(firstMatch, 9, 12, "hers"); } @Test public void recipes() { Trie trie = Trie.builder() - .addKeyword("veal") - .addKeyword("cauliflower") - .addKeyword("broccoli") - .addKeyword("tomatoes") - .build(); + 
.addKeyword("veal") + .addKeyword("cauliflower") + .addKeyword("broccoli") + .addKeyword("tomatoes") + .build(); Collection emits = trie.parseText("2 cauliflowers, 3 tomatoes, 4 slices of veal, 100g broccoli"); Iterator iterator = emits.iterator(); checkEmit(iterator.next(), 2, 12, "cauliflower"); @@ -195,15 +195,15 @@ public void recipes() { checkEmit(iterator.next(), 51, 58, "broccoli"); } - @Test + @Test public void recipesFirstMatch() { - Trie trie = Trie.builder() - .addKeyword("veal") - .addKeyword("cauliflower") - .addKeyword("broccoli") - .addKeyword("tomatoes") - .build(); - Emit firstMatch = trie.firstMatch("2 cauliflowers, 3 tomatoes, 4 slices of veal, 100g broccoli"); + Trie trie = Trie.builder() + .addKeyword("veal") + .addKeyword("cauliflower") + .addKeyword("broccoli") + .addKeyword("tomatoes") + .build(); + Emit firstMatch = trie.firstMatch("2 cauliflowers, 3 tomatoes, 4 slices of veal, 100g broccoli"); checkEmit(firstMatch, 2, 12, "cauliflower"); } @@ -228,10 +228,10 @@ public void longAndShortOverlappingMatch() { @Test public void nonOverlapping() { Trie trie = Trie.builder().removeOverlaps() - .addKeyword("ab") - .addKeyword("cba") - .addKeyword("ababc") - .build(); + .addKeyword("ab") + .addKeyword("cba") + .addKeyword("ababc") + .build(); Collection emits = trie.parseText("ababcbab"); assertEquals(2, emits.size()); Iterator iterator = emits.iterator(); @@ -242,40 +242,40 @@ public void nonOverlapping() { @Test public void nonOverlappingFirstMatch() { - Trie trie = Trie.builder().removeOverlaps() - .addKeyword("ab") - .addKeyword("cba") - .addKeyword("ababc") - .build(); - Emit firstMatch = trie.firstMatch("ababcbab"); + Trie trie = Trie.builder().removeOverlaps() + .addKeyword("ab") + .addKeyword("cba") + .addKeyword("ababc") + .build(); + Emit firstMatch = trie.firstMatch("ababcbab"); checkEmit(firstMatch, 0, 4, "ababc"); } @Test public void containsMatch() { - Trie trie = Trie.builder().removeOverlaps() - .addKeyword("ab") - 
.addKeyword("cba") - .addKeyword("ababc") - .build(); + Trie trie = Trie.builder().removeOverlaps() + .addKeyword("ab") + .addKeyword("cba") + .addKeyword("ababc") + .build(); assertTrue(trie.containsMatch("ababcbab")); } @Test public void startOfChurchillSpeech() { Trie trie = Trie.builder().removeOverlaps() - .addKeyword("T") - .addKeyword("u") - .addKeyword("ur") - .addKeyword("r") - .addKeyword("urn") - .addKeyword("ni") - .addKeyword("i") - .addKeyword("in") - .addKeyword("n") - .addKeyword("urning") - .build(); + .addKeyword("T") + .addKeyword("u") + .addKeyword("ur") + .addKeyword("r") + .addKeyword("urn") + .addKeyword("ni") + .addKeyword("i") + .addKeyword("in") + .addKeyword("n") + .addKeyword("urning") + .build(); Collection emits = trie.parseText("Turning"); assertEquals(2, emits.size()); } @@ -283,32 +283,32 @@ public void startOfChurchillSpeech() { @Test public void partialMatch() { Trie trie = Trie.builder() - .onlyWholeWords() - .addKeyword("sugar") - .build(); + .onlyWholeWords() + .addKeyword("sugar") + .build(); Collection emits = trie.parseText("sugarcane sugarcane sugar canesugar"); // left, middle, right test assertEquals(1, emits.size()); // Match must not be made checkEmit(emits.iterator().next(), 20, 24, "sugar"); } - @Test + @Test public void partialMatchFirstMatch() { Trie trie = Trie.builder() .onlyWholeWords() .addKeyword("sugar") .build(); - Emit firstMatch = trie.firstMatch("sugarcane sugarcane sugar canesugar"); // left, middle, right test - + Emit firstMatch = trie.firstMatch("sugarcane sugarcane sugar canesugar"); // left, middle, right test + checkEmit(firstMatch, 20, 24, "sugar"); } @Test public void tokenizeFullSentence() { Trie trie = Trie.builder() - .addKeyword("Alpha") - .addKeyword("Beta") - .addKeyword("Gamma") - .build(); + .addKeyword("Alpha") + .addKeyword("Beta") + .addKeyword("Gamma") + .build(); Collection tokens = trie.tokenize("Hear: Alpha team first, Beta from the rear, Gamma in reserve"); assertEquals(7, 
tokens.size()); Iterator tokensIt = tokens.iterator(); @@ -324,11 +324,11 @@ public void tokenizeFullSentence() { @Test public void bug5InGithubReportedByXCurry() { Trie trie = Trie.builder().caseInsensitive().onlyWholeWords() - .addKeyword("turning") - .addKeyword("once") - .addKeyword("again") - .addKeyword("börkü") - .build(); + .addKeyword("turning") + .addKeyword("once") + .addKeyword("again") + .addKeyword("börkü") + .build(); Collection emits = trie.parseText("TurninG OnCe AgAiN BÖRKÜ"); assertEquals(4, emits.size()); // Match must not be made Iterator it = emits.iterator(); @@ -341,11 +341,11 @@ public void bug5InGithubReportedByXCurry() { @Test public void caseInsensitive() { Trie trie = Trie.builder().caseInsensitive() - .addKeyword("turning") - .addKeyword("once") - .addKeyword("again") - .addKeyword("börkü") - .build(); + .addKeyword("turning") + .addKeyword("once") + .addKeyword("again") + .addKeyword("börkü") + .build(); Collection emits = trie.parseText("TurninG OnCe AgAiN BÖRKÜ"); assertEquals(4, emits.size()); // Match must not be made Iterator it = emits.iterator(); @@ -355,7 +355,7 @@ public void caseInsensitive() { checkEmit(it.next(), 19, 23, "börkü"); } - @Test + @Test public void caseInsensitiveFirstMatch() { Trie trie = Trie.builder().caseInsensitive() .addKeyword("turning") @@ -363,7 +363,7 @@ public void caseInsensitiveFirstMatch() { .addKeyword("again") .addKeyword("börkü") .build(); - Emit firstMatch = trie.firstMatch("TurninG OnCe AgAiN BÖRKÜ"); + Emit firstMatch = trie.firstMatch("TurninG OnCe AgAiN BÖRKÜ"); checkEmit(firstMatch, 0, 6, "turning"); } @@ -371,10 +371,10 @@ public void caseInsensitiveFirstMatch() { @Test public void tokenizeTokensInSequence() { Trie trie = Trie.builder() - .addKeyword("Alpha") - .addKeyword("Beta") - .addKeyword("Gamma") - .build(); + .addKeyword("Alpha") + .addKeyword("Beta") + .addKeyword("Gamma") + .build(); Collection tokens = trie.tokenize("Alpha Beta Gamma"); assertEquals(5, tokens.size()); } @@ 
-383,8 +383,8 @@ public void tokenizeTokensInSequence() { @Test public void zeroLengthTestBug7InGithubReportedByXCurry() { Trie trie = Trie.builder().removeOverlaps().onlyWholeWords().caseInsensitive() - .addKeyword("") - .build(); + .addKeyword("") + .build(); trie.tokenize("Try a natural lip and subtle bronzer to keep all the focus on those big bright eyes with NARS Eyeshadow Duo in Rated R And the winner is... Boots No7 Advanced Renewal Anti-ageing Glycolic Peel Kit ($25 amazon.com) won most-appealing peel."); } @@ -394,15 +394,15 @@ public void unicodeIssueBug8ReportedByDwyerk() { String target = "LİKE THIS"; // The second character ('İ') is Unicode, which was read by AC as a 2-byte char assertEquals("THIS", target.substring(5, 9)); // Java does it the right way Trie trie = Trie.builder().caseInsensitive().onlyWholeWords() - .addKeyword("this") - .build(); + .addKeyword("this") + .build(); Collection emits = trie.parseText(target); assertEquals(1, emits.size()); Iterator it = emits.iterator(); checkEmit(it.next(), 5, 8, "this"); } - @Test + @Test public void unicodeIssueBug8ReportedByDwyerkFirstMatch() { String target = "LİKE THIS"; // The second character ('İ') is Unicode, which was read by AC as a 2-byte char Trie trie = Trie.builder() @@ -411,13 +411,24 @@ public void unicodeIssueBug8ReportedByDwyerkFirstMatch() { .addKeyword("this") .build(); assertEquals("THIS", target.substring(5, 9)); // Java does it the right way - Emit firstMatch = trie.firstMatch(target); + Emit firstMatch = trie.firstMatch(target); checkEmit(firstMatch, 5, 8, "this"); } + @Test + public void partialMatchWhiteSpaces() { + Trie trie = Trie.builder() + .onlyWholeWordsWhiteSpaceSeparated() + .addKeyword("#sugar-123") + .build(); + Collection < Emit > emits = trie.parseText("#sugar-123 #sugar-1234"); // exact token first, then the keyword as a prefix of a longer token + assertEquals(1, emits.size()); // only the whitespace-delimited occurrence may match + checkEmit(emits.iterator().next(), 0, 9, "#sugar-123"); + } + private void checkEmit(Emit next, int 
expectedStart, int expectedEnd, String expectedKeyword) { - assertEquals("Start of emit should have been "+expectedStart, expectedStart, next.getStart()); - assertEquals("End of emit should have been "+expectedEnd, expectedEnd, next.getEnd()); + assertEquals("Start of emit should have been " + expectedStart, expectedStart, next.getStart()); + assertEquals("End of emit should have been " + expectedEnd, expectedEnd, next.getEnd()); assertEquals(expectedKeyword, next.getKeyword()); }