[BUGS#1251] fix and refactor: FindMatches

- Add an internal search method that handles both normal and segmented search conditions; it is also used for testing.
- Drop the threshold argument from the CalcMatchStatistics constructors.
- Make FindMatchesThread.finderSearch take a threshold argument for testing.
- Update search() callers accordingly.

Signed-off-by: Hiroshi Miura <[email protected]>
omegat-org · Nov 26, 2024 · f049239 · f049239
1 parent 711f747
commit f049239
Show file tree

Hide file tree

Showing 6 changed files with 130 additions and 96 deletions.
diff --git a/src/org/omegat/core/statistics/CalcMatchStatistics.java b/src/org/omegat/core/statistics/CalcMatchStatistics.java
@@ -51,7 +51,6 @@
 import org.omegat.core.threads.LongProcessThread;
 import org.omegat.util.OConsts;
 import org.omegat.util.OStrings;
-import org.omegat.util.Preferences;
 import org.omegat.util.StringUtil;
 import org.omegat.util.Token;
 import org.omegat.util.gui.TextUtil;
@@ -108,19 +107,15 @@ public class CalcMatchStatistics extends LongProcessThread {
     private final IProject project;
 
     public CalcMatchStatistics(IStatsConsumer callback, boolean perFile) {
-        this(Core.getProject(), Core.getSegmenter(), callback, perFile,
-                Preferences.getPreferenceDefault(Preferences.EXT_TMX_FUZZY_MATCH_THRESHOLD,
-                OConsts.FUZZY_MATCH_THRESHOLD));
+        this(Core.getProject(), Core.getSegmenter(), callback, perFile);
     }
 
-    public CalcMatchStatistics(IProject project, Segmenter segmenter, IStatsConsumer callback,
-                               boolean perFile, int threshold) {
+    public CalcMatchStatistics(IProject project, Segmenter segmenter, IStatsConsumer callback, boolean perFile) {
         this.project = project;
         this.callback = callback;
         this.perFile = perFile;
         finder = ThreadLocal.withInitial(
-                () -> new FindMatches(project, segmenter, OConsts.MAX_NEAR_STRINGS, true,
-                        false, false, threshold));
+                () -> new FindMatches(project, segmenter, OConsts.MAX_NEAR_STRINGS, false, -1));
     }
 
     @Override
@@ -313,7 +308,7 @@ Optional<MatchStatCounts> calcSimilarity(List<SourceTextEntry> untranslatedEntri
     int calcMaxSimilarity(SourceTextEntry ste) {
         String srcNoXmlTags = removeXmlTags(ste);
         FindMatches localFinder = finder.get();
-        List<NearString> nears = localFinder.search(srcNoXmlTags, true, false, this::isInterrupted);
+        List<NearString> nears = localFinder.search(srcNoXmlTags, false, this::isInterrupted);
         final Token[] strTokensStem = localFinder.tokenizeAll(ste.getSrcText());
         int maxSimilarity = 0;
         CACHE: for (NearString near : nears) {

diff --git a/src/org/omegat/core/statistics/FindMatches.java b/src/org/omegat/core/statistics/FindMatches.java
@@ -127,46 +127,68 @@ public class FindMatches {
     /** Tokens for original string, includes numbers and tags. */
     private Token[] strTokensAll;
 
-    // This finder used for search separate segment matches
-    private FindMatches separateSegmentMatcher;
-
     private final int fuzzyMatchThreshold;
 
-    private final boolean applyThreshold;
-
     private final Segmenter segmenter;
 
-    /**
-     * @param searchExactlyTheSame
-     *            allows to search similarities with the same text as source
-     *            segment. This mode used only for separate sentence match in
-     *            paragraph project, i.e. where source is just part of current
-     *            source.
-     */
+    @Deprecated
     public FindMatches(IProject project, int maxCount, boolean allowSeparateSegmentMatch,
             boolean searchExactlyTheSame) {
-        this(project, Core.getSegmenter(), maxCount, allowSeparateSegmentMatch, searchExactlyTheSame, true,
-                Preferences.getPreferenceDefault(Preferences.EXT_TMX_FUZZY_MATCH_THRESHOLD,
-                        OConsts.FUZZY_MATCH_THRESHOLD));
+        this(project, Core.getSegmenter(), maxCount, searchExactlyTheSame, Preferences.getPreferenceDefault(
+                Preferences.EXT_TMX_FUZZY_MATCH_THRESHOLD, OConsts.FUZZY_MATCH_THRESHOLD));
     }
 
-    public FindMatches(IProject project, Segmenter segmenter, int maxCount, boolean allowSeparateSegmentMatch,
-            boolean searchExactlyTheSame, boolean applyThreshold, int threshold) {
+    /**
+     * Constructor.
+     *
+     * @param project
+     *            OmegaT project.
+     * @param segmenter
+     *            Segmenter to use.
+     * @param maxCount
+     *            limit the maximum count of the results.
+     * @param searchExactlyTheSame
+     *            allows searching similarities with the same text as a source
+     *            segment. This mode is used only for separate sentence match in
+     *            a paragraph project, i.e. where a source is just part of the
+     *            current source.
+     * @param threshold
+     *            threshold to use.
+     */
+    public FindMatches(IProject project, Segmenter segmenter, int maxCount,
+            boolean searchExactlyTheSame, int threshold) {
         this.project = project;
         this.segmenter = segmenter;
         this.tok = project.getSourceTokenizer();
         this.srcLocale = project.getProjectProperties().getSourceLanguage().getLocale();
         this.maxCount = maxCount;
         this.searchExactlyTheSame = searchExactlyTheSame;
-        if (allowSeparateSegmentMatch && !project.getProjectProperties().isSentenceSegmentingEnabled()) {
-            separateSegmentMatcher = new FindMatches(project, segmenter, 1, false, true, true, threshold);
-        }
         this.fuzzyMatchThreshold = threshold;
-        this.applyThreshold = applyThreshold;
     }
 
-    public List<NearString> search(String searchText, boolean requiresTranslation, boolean fillSimilarityData,
+    public List<NearString> search(String searchText, boolean fillSimilarityData,
             IStopped stop) throws StoppedException {
+        return search(searchText, true, fillSimilarityData, stop, true, true);
+    }
+
+    /**
+     * Search Translation memories.
+     * <p>
+     * Internal method to handle search conditions.
+     * It is accessible as package-private for testing.
+     *
+     * @param searchText
+     * @param requiresTranslation
+     * @param fillSimilarityData
+     * @param stop
+     * @param allowSeparateSegmentMatch
+     * @param travelExternal
+     * @return
+     * @throws StoppedException
+     */
+    List<NearString> search(String searchText, boolean requiresTranslation,
+                            boolean fillSimilarityData, IStopped stop, boolean allowSeparateSegmentMatch,
+                            boolean travelExternal) throws StoppedException {
         result = new ArrayList<>(OConsts.MAX_NEAR_STRINGS + 1);
         srcText = searchText;
         removedText = "";
@@ -225,28 +247,31 @@ public List<NearString> search(String searchText, boolean requiresTranslation, b
          */
         int foreignPenalty = Preferences.getPreferenceDefault(Preferences.PENALTY_FOR_FOREIGN_MATCHES,
                 Preferences.PENALTY_FOR_FOREIGN_MATCHES_DEFAULT);
-        // travel by translation memories
-        for (Map.Entry<String, ExternalTMX> en : project.getTransMemories().entrySet()) {
-            int penalty = 0;
-            Matcher matcher = SEARCH_FOR_PENALTY.matcher(en.getKey());
-            if (matcher.find()) {
-                penalty = Integer.parseInt(matcher.group(1));
-            }
-            for (ITMXEntry tmen : en.getValue().getEntries()) {
-                checkStopped(stop);
-                if (tmen.getSourceText() == null) {
-                    // Not all TMX entries have a source; in that case there can
-                    // be no meaningful match, so skip.
-                    continue;
+        // travel by external translation memories only for a non-segmented
+        // search (see BUGS#1251)
+        if (travelExternal) {
+            for (Map.Entry<String, ExternalTMX> en : project.getTransMemories().entrySet()) {
+                int penalty = 0;
+                Matcher matcher = SEARCH_FOR_PENALTY.matcher(en.getKey());
+                if (matcher.find()) {
+                    penalty = Integer.parseInt(matcher.group(1));
                 }
-                if (requiresTranslation && tmen.getTranslationText() == null) {
-                    continue;
-                }
-                int tmenPenalty = penalty;
-                if (tmen.hasPropValue(ExternalTMFactory.TMXLoader.PROP_FOREIGN_MATCH, "true")) {
-                    tmenPenalty += foreignPenalty;
+                for (ITMXEntry tmen : en.getValue().getEntries()) {
+                    checkStopped(stop);
+                    if (tmen.getSourceText() == null) {
+                        // Not all TMX entries have a source; in that case there
+                        // can be no meaningful match, so skip the entry.
+                        continue;
+                    }
+                    if (requiresTranslation && tmen.getTranslationText() == null) {
+                        continue;
+                    }
+                    int tmenPenalty = penalty;
+                    if (tmen.hasPropValue(ExternalTMFactory.TMXLoader.PROP_FOREIGN_MATCH, "true")) {
+                        tmenPenalty += foreignPenalty;
+                    }
+                    processEntry(null, tmen, en.getKey(), NearString.MATCH_SOURCE.TM, false, tmenPenalty);
                 }
-                processEntry(null, tmen, en.getKey(), NearString.MATCH_SOURCE.TM, false, tmenPenalty);
             }
         }
         // travel by all entries for check source file translations
@@ -260,7 +285,9 @@ public List<NearString> search(String searchText, boolean requiresTranslation, b
                         ste.isSourceTranslationFuzzy(), 0);
             }
         }
-        if (separateSegmentMatcher != null) {
+        if (allowSeparateSegmentMatch && !project.getProjectProperties().isSentenceSegmentingEnabled()) {
+            FindMatches separateSegmentMatcher = new FindMatches(project, segmenter, 1, true,
+                    fuzzyMatchThreshold);
             // split paragraph even when segmentation disabled, then find
             // matches for every segment
             List<StringBuilder> spaces = new ArrayList<>();
@@ -275,7 +302,7 @@ public List<NearString> search(String searchText, boolean requiresTranslation, b
                 for (String onesrc : segments) {
                     // find match for a separate segment
                     List<NearString> segmentMatch = separateSegmentMatcher.search(onesrc, requiresTranslation,
-                            false, stop);
+                            false, stop, false, false);
                     if (!segmentMatch.isEmpty()
                             && segmentMatch.get(0).scores[0].score >= SUBSEGMENT_MATCH_THRESHOLD) {
                         fsrc.add(segmentMatch.get(0).source);
@@ -385,7 +412,7 @@ public void processEntry(EntryKey key, ITMXEntry entry, String tmxName,
         }
 
         // BUGS#1236 - stat display does not use threshold config check
-        if (applyThreshold && similarityStem < fuzzyMatchThreshold
+        if (fuzzyMatchThreshold > 0 && similarityStem < fuzzyMatchThreshold
                 && similarityNoStem < fuzzyMatchThreshold && simAdjusted < fuzzyMatchThreshold) {
             return;
         }

diff --git a/src/org/omegat/gui/matches/FindMatchesThread.java b/src/org/omegat/gui/matches/FindMatchesThread.java
@@ -33,13 +33,16 @@
 import java.util.List;
 import java.util.logging.Logger;
 
+import org.omegat.core.Core;
 import org.omegat.core.data.IProject;
 import org.omegat.core.data.SourceTextEntry;
 import org.omegat.core.events.IStopped;
 import org.omegat.core.matching.NearString;
+import org.omegat.core.segmentation.Segmenter;
 import org.omegat.core.statistics.FindMatches;
 import org.omegat.gui.common.EntryInfoSearchThread;
 import org.omegat.util.OConsts;
+import org.omegat.util.Preferences;
 
 /**
  * Find matches in separate thread then show a result in the matches' pane.
@@ -82,7 +85,9 @@ protected List<NearString> search() throws Exception {
         long before = System.currentTimeMillis();
 
         try {
-            List<NearString> result = finderSearch(project, processedEntry.getSrcText(), this::isEntryChanged);
+            List<NearString> result = finderSearch(project, Core.getSegmenter(), processedEntry.getSrcText(),
+                    this::isEntryChanged, Preferences.getPreferenceDefault(
+                            Preferences.EXT_TMX_FUZZY_MATCH_THRESHOLD, OConsts.FUZZY_MATCH_THRESHOLD));
             LOGGER.finer(() -> "Time for find matches: " + (System.currentTimeMillis() - before));
             return result;
         } catch (FindMatches.StoppedException ex) {
@@ -97,8 +102,9 @@ protected List<NearString> search() throws Exception {
      * @param isEntryChanged stop and raise StopException when it returns true.
      * @return result as a list of NearString.
      */
-    protected static List<NearString> finderSearch(IProject project, String srcText, IStopped isEntryChanged) {
-        FindMatches finder = new FindMatches(project, OConsts.MAX_NEAR_STRINGS, true, false);
-        return finder.search(srcText, true, true, isEntryChanged);
+    protected static List<NearString> finderSearch(IProject project, Segmenter segmenter, String srcText,
+                                                   IStopped isEntryChanged, int threshold) {
+        FindMatches finder = new FindMatches(project, segmenter, OConsts.MAX_NEAR_STRINGS, false, threshold);
+        return finder.search(srcText, true, isEntryChanged);
     }
 }
diff --git a/test/src/org/omegat/core/statistics/CalcMatchStatisticsTest.java b/test/src/org/omegat/core/statistics/CalcMatchStatisticsTest.java
@@ -25,6 +25,7 @@
 
 package org.omegat.core.statistics;
 
+import java.io.IOException;
 import java.nio.file.Path;
 import java.nio.file.Paths;
 import java.util.ArrayList;
@@ -35,6 +36,7 @@
 
 import org.junit.Assert;
 import org.junit.Before;
+import org.junit.BeforeClass;
 import org.junit.Test;
 
 import org.omegat.core.Core;
@@ -48,6 +50,7 @@
 import org.omegat.core.data.ProtectedPart;
 import org.omegat.core.data.SourceTextEntry;
 import org.omegat.core.data.TMXEntry;
+import org.omegat.core.segmentation.SRX;
 import org.omegat.core.segmentation.Segmenter;
 import org.omegat.filters2.FilterContext;
 import org.omegat.filters2.IFilter;
@@ -64,13 +67,17 @@
 
 public class CalcMatchStatisticsTest {
 
+    @BeforeClass
+    public static void setup() throws IOException {
+        TestPreferencesInitializer.init();
+    }
+
     @Test
     public void testCalcMatchStatics() throws Exception {
         TestProject project = new TestProject(new ProjectPropertiesTest());
         IStatsConsumer callback = new TestStatsConsumer();
-        Segmenter segmenter = new Segmenter(Preferences.getSRX());
-        CalcMatchStatisticsMock calcMatchStatistics = new CalcMatchStatisticsMock(project, segmenter,
-                callback, 30);
+        Segmenter segmenter = new Segmenter(SRX.getDefault());
+        CalcMatchStatisticsMock calcMatchStatistics = new CalcMatchStatisticsMock(project, segmenter, callback);
         calcMatchStatistics.start();
         try {
             calcMatchStatistics.join();
@@ -123,7 +130,8 @@ public void testCalcMatchStatics() throws Exception {
         Assert.assertEquals("5699", result[7][4]);
 
         // change threshold
-        calcMatchStatistics = new CalcMatchStatisticsMock(project, segmenter, callback, 70);
+        Preferences.setPreference(Preferences.EXT_TMX_FUZZY_MATCH_THRESHOLD, 70);
+        calcMatchStatistics = new CalcMatchStatisticsMock(project, segmenter, callback);
         calcMatchStatistics.start();
         try {
             calcMatchStatistics.join();
@@ -362,9 +370,8 @@ static class CalcMatchStatisticsMock extends CalcMatchStatistics {
         private MatchStatCounts result;
         private final IStatsConsumer callback;
 
-        CalcMatchStatisticsMock(IProject project, Segmenter segmenter, IStatsConsumer callback,
-                                int threshold) {
-            super(project, segmenter, callback, false, threshold);
+        CalcMatchStatisticsMock(IProject project, Segmenter segmenter, IStatsConsumer callback) {
+            super(project, segmenter, callback, false);
             this.project = project;
             this.callback = callback;
         }

diff --git a/test/src/org/omegat/core/statistics/FindMatchesTest.java b/test/src/org/omegat/core/statistics/FindMatchesTest.java
@@ -111,22 +111,22 @@ public void testSegmented() throws Exception {
                 + "han passat prou temps al lloc web per a convertir-se en usuaris bàsics."
                 + " Una comunitat vibrant necessita una entrada regular de nouvinguts que hi participen habitualment"
                 + " i aporten veus noves a les converses.\n";
-        FindMatches finder = new FindMatches(project, segmenter, OConsts.MAX_NEAR_STRINGS, false, false,
-                true, 30);
-        List<NearString> result = finder.search(srcText, true, true, iStopped);
+        FindMatches finder = new FindMatches(project, segmenter, OConsts.MAX_NEAR_STRINGS, false, 30);
+        // search without a separated segment match.
+        List<NearString> result = finder.search(srcText, true, true, iStopped, false, true);
         assertEquals(OConsts.MAX_NEAR_STRINGS, result.size());
         assertEquals(65, result.get(0).scores[0].score);
         assertEquals(62, result.get(0).scores[0].scoreNoStem);
         assertEquals(62, result.get(0).scores[0].adjustedScore);
         assertEquals(expectFirst, result.get(0).translation);
         assertEquals(expectNear, result.get(1).translation);
-        //
+        // search with a segmented match.
         List<StringBuilder> spaces = new ArrayList<>();
         List<Rule> brules = new ArrayList<>();
         List<String> segments = segmenter.segment(prop.getSourceLanguage(), srcText, spaces, brules);
         assertEquals(3, segments.size());
-        finder = new FindMatches(project, segmenter, OConsts.MAX_NEAR_STRINGS, true, false, true, 30);
-        result = finder.search(srcText, true, true, iStopped);
+        finder = new FindMatches(project, segmenter, OConsts.MAX_NEAR_STRINGS, false, 30);
+        result = finder.search(srcText, false, iStopped);
         assertEquals(OConsts.MAX_NEAR_STRINGS, result.size());
         assertEquals("Hit with segmented tmx record", 100, result.get(0).scores[0].score);
         assertEquals(100, result.get(0).scores[0].scoreNoStem);
@@ -167,9 +167,8 @@ public void testSearchRFE1578() throws Exception {
         IProject project = new TestProject(prop, null, TMX_EN_US_SR, new LuceneEnglishTokenizer(),
                 new DefaultTokenizer(), segmenter);
         IStopped iStopped = () -> false;
-        FindMatches finder = new FindMatches(project, segmenter, OConsts.MAX_NEAR_STRINGS, true, false,
-                true, 30);
-        List<NearString> result = finder.search("XXX", true, true, iStopped);
+        FindMatches finder = new FindMatches(project, segmenter, OConsts.MAX_NEAR_STRINGS, false, 30);
+        List<NearString> result = finder.search("XXX", false, iStopped);
         // Without the fix, the result has two entries, but it should one.
         assertEquals(1, result.size());
         assertEquals("XXX", result.get(0).source);
@@ -206,10 +205,9 @@ public void testSearchRFE1578_2() throws Exception {
         IProject project = new TestProject(prop, null, TMX_EN_US_GB_SR, new LuceneEnglishTokenizer(),
                 new DefaultTokenizer(), segmenter);
         IStopped iStopped = () -> false;
-        FindMatches finder = new FindMatches(project, segmenter, OConsts.MAX_NEAR_STRINGS, true, false,
-                true, 30);
+        FindMatches finder = new FindMatches(project, segmenter, OConsts.MAX_NEAR_STRINGS, false, 30);
         // Search source "XXx" in en-US
-        List<NearString> result = finder.search("XXX", true, true, iStopped);
+        List<NearString> result = finder.search("XXX", false, iStopped);
         // There should be three entries.
         assertEquals(3, result.size());
         assertEquals("XXx", result.get(0).source); // should be en-US.
@@ -237,9 +235,8 @@ public void testSearchBUGS1251() throws Exception {
         List<String> segments = segmenter.segment(sourceLanguage, srcText, spaces, brules);
         assertEquals(2, segments.size());
         IStopped iStopped = () -> false;
-        FindMatches finder = new FindMatches(project, segmenter, OConsts.MAX_NEAR_STRINGS, true, false,
-                true, 30);
-        List<NearString> result = finder.search(srcText, true, true, iStopped);
+        FindMatches finder = new FindMatches(project, segmenter, OConsts.MAX_NEAR_STRINGS, false, 30);
+        List<NearString> result = finder.search(srcText, false, iStopped);
         assertEquals(srcText, result.get(0).source);
         assertEquals(1, result.size());
         assertEquals("TM", result.get(0).comesFrom.name());