diff --git a/src/org/omegat/core/statistics/CalcMatchStatistics.java b/src/org/omegat/core/statistics/CalcMatchStatistics.java index f48f2dcf57..2aa2e4fc7a 100644 --- a/src/org/omegat/core/statistics/CalcMatchStatistics.java +++ b/src/org/omegat/core/statistics/CalcMatchStatistics.java @@ -51,7 +51,6 @@ import org.omegat.core.threads.LongProcessThread; import org.omegat.util.OConsts; import org.omegat.util.OStrings; -import org.omegat.util.Preferences; import org.omegat.util.StringUtil; import org.omegat.util.Token; import org.omegat.util.gui.TextUtil; @@ -108,19 +107,15 @@ public class CalcMatchStatistics extends LongProcessThread { private final IProject project; public CalcMatchStatistics(IStatsConsumer callback, boolean perFile) { - this(Core.getProject(), Core.getSegmenter(), callback, perFile, - Preferences.getPreferenceDefault(Preferences.EXT_TMX_FUZZY_MATCH_THRESHOLD, - OConsts.FUZZY_MATCH_THRESHOLD)); + this(Core.getProject(), Core.getSegmenter(), callback, perFile); } - public CalcMatchStatistics(IProject project, Segmenter segmenter, IStatsConsumer callback, - boolean perFile, int threshold) { + public CalcMatchStatistics(IProject project, Segmenter segmenter, IStatsConsumer callback, boolean perFile) { this.project = project; this.callback = callback; this.perFile = perFile; finder = ThreadLocal.withInitial( - () -> new FindMatches(project, segmenter, OConsts.MAX_NEAR_STRINGS, true, - false, false, threshold)); + () -> new FindMatches(project, segmenter, OConsts.MAX_NEAR_STRINGS, false, -1)); } @Override @@ -313,7 +308,7 @@ Optional calcSimilarity(List untranslatedEntri int calcMaxSimilarity(SourceTextEntry ste) { String srcNoXmlTags = removeXmlTags(ste); FindMatches localFinder = finder.get(); - List nears = localFinder.search(srcNoXmlTags, true, false, this::isInterrupted); + List nears = localFinder.search(srcNoXmlTags, false, this::isInterrupted); final Token[] strTokensStem = localFinder.tokenizeAll(ste.getSrcText()); int maxSimilarity = 0; CACHE: 
for (NearString near : nears) { diff --git a/src/org/omegat/core/statistics/FindMatches.java b/src/org/omegat/core/statistics/FindMatches.java index e6cbfcedd9..45de07abd8 100644 --- a/src/org/omegat/core/statistics/FindMatches.java +++ b/src/org/omegat/core/statistics/FindMatches.java @@ -127,46 +127,68 @@ public class FindMatches { /** Tokens for original string, includes numbers and tags. */ private Token[] strTokensAll; - // This finder used for search separate segment matches - private FindMatches separateSegmentMatcher; - private final int fuzzyMatchThreshold; - private final boolean applyThreshold; - private final Segmenter segmenter; - /** - * @param searchExactlyTheSame - * allows to search similarities with the same text as source - * segment. This mode used only for separate sentence match in - * paragraph project, i.e. where source is just part of current - * source. - */ + @Deprecated public FindMatches(IProject project, int maxCount, boolean allowSeparateSegmentMatch, boolean searchExactlyTheSame) { - this(project, Core.getSegmenter(), maxCount, allowSeparateSegmentMatch, searchExactlyTheSame, true, - Preferences.getPreferenceDefault(Preferences.EXT_TMX_FUZZY_MATCH_THRESHOLD, - OConsts.FUZZY_MATCH_THRESHOLD)); + this(project, Core.getSegmenter(), maxCount, searchExactlyTheSame, Preferences.getPreferenceDefault( + Preferences.EXT_TMX_FUZZY_MATCH_THRESHOLD, OConsts.FUZZY_MATCH_THRESHOLD)); } - public FindMatches(IProject project, Segmenter segmenter, int maxCount, boolean allowSeparateSegmentMatch, - boolean searchExactlyTheSame, boolean applyThreshold, int threshold) { + /** + * Constructor. + * + * @param project + * OmegaT project. + * @param segmenter + * Segmenter to use. + * @param maxCount + * limit the maximum count of the results. + * @param searchExactlyTheSame + * allows searching similarities with the same text as a source + * segment. This mode is used only for separate sentence match in + * a paragraph project, i.e. 
where a source is just part of the + * current source. + * @param threshold + * threshold to use. + */ + public FindMatches(IProject project, Segmenter segmenter, int maxCount, + boolean searchExactlyTheSame, int threshold) { this.project = project; this.segmenter = segmenter; this.tok = project.getSourceTokenizer(); this.srcLocale = project.getProjectProperties().getSourceLanguage().getLocale(); this.maxCount = maxCount; this.searchExactlyTheSame = searchExactlyTheSame; - if (allowSeparateSegmentMatch && !project.getProjectProperties().isSentenceSegmentingEnabled()) { - separateSegmentMatcher = new FindMatches(project, segmenter, 1, false, true, true, threshold); - } this.fuzzyMatchThreshold = threshold; - this.applyThreshold = applyThreshold; } - public List search(String searchText, boolean requiresTranslation, boolean fillSimilarityData, + public List search(String searchText, boolean fillSimilarityData, IStopped stop) throws StoppedException { + return search(searchText, true, fillSimilarityData, stop, true, true); + } + + /** + * Search Translation memories. + *

+ * Internal method to handle search conditions. + * It is accessible as package-private for testing. + * + * @param searchText + * @param requiresTranslation + * @param fillSimilarityData + * @param stop + * @param allowSeparateSegmentMatch + * @param travelExternal + * @return + * @throws StoppedException + */ + List search(String searchText, boolean requiresTranslation, + boolean fillSimilarityData, IStopped stop, boolean allowSeparateSegmentMatch, + boolean travelExternal) throws StoppedException { result = new ArrayList<>(OConsts.MAX_NEAR_STRINGS + 1); srcText = searchText; removedText = ""; @@ -225,28 +247,31 @@ public List search(String searchText, boolean requiresTranslation, b */ int foreignPenalty = Preferences.getPreferenceDefault(Preferences.PENALTY_FOR_FOREIGN_MATCHES, Preferences.PENALTY_FOR_FOREIGN_MATCHES_DEFAULT); - // travel by translation memories - for (Map.Entry en : project.getTransMemories().entrySet()) { - int penalty = 0; - Matcher matcher = SEARCH_FOR_PENALTY.matcher(en.getKey()); - if (matcher.find()) { - penalty = Integer.parseInt(matcher.group(1)); - } - for (ITMXEntry tmen : en.getValue().getEntries()) { - checkStopped(stop); - if (tmen.getSourceText() == null) { - // Not all TMX entries have a source; in that case there can - // be no meaningful match, so skip. 
- continue; + // travel by external translation memories, when non-segmented search + // (see BUGS#1251) + if (travelExternal) { + for (Map.Entry en : project.getTransMemories().entrySet()) { + int penalty = 0; + Matcher matcher = SEARCH_FOR_PENALTY.matcher(en.getKey()); + if (matcher.find()) { + penalty = Integer.parseInt(matcher.group(1)); } - if (requiresTranslation && tmen.getTranslationText() == null) { - continue; - } - int tmenPenalty = penalty; - if (tmen.hasPropValue(ExternalTMFactory.TMXLoader.PROP_FOREIGN_MATCH, "true")) { - tmenPenalty += foreignPenalty; + for (ITMXEntry tmen : en.getValue().getEntries()) { + checkStopped(stop); + if (tmen.getSourceText() == null) { + // Not all TMX entries have a source; in that case there can + // be no meaningful match, so skip. + continue; + } + if (requiresTranslation && tmen.getTranslationText() == null) { + continue; + } + int tmenPenalty = penalty; + if (tmen.hasPropValue(ExternalTMFactory.TMXLoader.PROP_FOREIGN_MATCH, "true")) { + tmenPenalty += foreignPenalty; + } + processEntry(null, tmen, en.getKey(), NearString.MATCH_SOURCE.TM, false, tmenPenalty); } - processEntry(null, tmen, en.getKey(), NearString.MATCH_SOURCE.TM, false, tmenPenalty); } } // travel by all entries for check source file translations @@ -260,7 +285,9 @@ public List search(String searchText, boolean requiresTranslation, b ste.isSourceTranslationFuzzy(), 0); } } - if (separateSegmentMatcher != null) { + if (allowSeparateSegmentMatch && !project.getProjectProperties().isSentenceSegmentingEnabled()) { + FindMatches separateSegmentMatcher = new FindMatches(project, segmenter, 1, true, + fuzzyMatchThreshold); // split paragraph even when segmentation disabled, then find // matches for every segment List spaces = new ArrayList<>(); @@ -275,7 +302,7 @@ public List search(String searchText, boolean requiresTranslation, b for (String onesrc : segments) { // find match for a separate segment List segmentMatch = separateSegmentMatcher.search(onesrc, 
requiresTranslation, - false, stop); + false, stop, false, false); if (!segmentMatch.isEmpty() && segmentMatch.get(0).scores[0].score >= SUBSEGMENT_MATCH_THRESHOLD) { fsrc.add(segmentMatch.get(0).source); @@ -385,7 +412,7 @@ public void processEntry(EntryKey key, ITMXEntry entry, String tmxName, } // BUGS#1236 - stat display does not use threshold config check - if (applyThreshold && similarityStem < fuzzyMatchThreshold + if (fuzzyMatchThreshold > 0 && similarityStem < fuzzyMatchThreshold && similarityNoStem < fuzzyMatchThreshold && simAdjusted < fuzzyMatchThreshold) { return; } diff --git a/src/org/omegat/gui/matches/FindMatchesThread.java b/src/org/omegat/gui/matches/FindMatchesThread.java index 2e9b5746ab..029c6c94dd 100644 --- a/src/org/omegat/gui/matches/FindMatchesThread.java +++ b/src/org/omegat/gui/matches/FindMatchesThread.java @@ -33,13 +33,16 @@ import java.util.List; import java.util.logging.Logger; +import org.omegat.core.Core; import org.omegat.core.data.IProject; import org.omegat.core.data.SourceTextEntry; import org.omegat.core.events.IStopped; import org.omegat.core.matching.NearString; +import org.omegat.core.segmentation.Segmenter; import org.omegat.core.statistics.FindMatches; import org.omegat.gui.common.EntryInfoSearchThread; import org.omegat.util.OConsts; +import org.omegat.util.Preferences; /** * Find matches in separate thread then show a result in the matches' pane. 
@@ -82,7 +85,9 @@ protected List search() throws Exception { long before = System.currentTimeMillis(); try { - List result = finderSearch(project, processedEntry.getSrcText(), this::isEntryChanged); + List result = finderSearch(project, Core.getSegmenter(), processedEntry.getSrcText(), + this::isEntryChanged, Preferences.getPreferenceDefault( + Preferences.EXT_TMX_FUZZY_MATCH_THRESHOLD, OConsts.FUZZY_MATCH_THRESHOLD)); LOGGER.finer(() -> "Time for find matches: " + (System.currentTimeMillis() - before)); return result; } catch (FindMatches.StoppedException ex) { @@ -97,8 +102,9 @@ protected List search() throws Exception { * @param isEntryChanged stop and raise StopException when it returns true. * @return result as a list of NearString. */ - protected static List finderSearch(IProject project, String srcText, IStopped isEntryChanged) { - FindMatches finder = new FindMatches(project, OConsts.MAX_NEAR_STRINGS, true, false); - return finder.search(srcText, true, true, isEntryChanged); + protected static List finderSearch(IProject project, Segmenter segmenter, String srcText, + IStopped isEntryChanged, int threshold) { + FindMatches finder = new FindMatches(project, segmenter, OConsts.MAX_NEAR_STRINGS, false, threshold); + return finder.search(srcText, true, isEntryChanged); } } diff --git a/test/src/org/omegat/core/statistics/CalcMatchStatisticsTest.java b/test/src/org/omegat/core/statistics/CalcMatchStatisticsTest.java index 5da4dd3ffc..25ab768787 100644 --- a/test/src/org/omegat/core/statistics/CalcMatchStatisticsTest.java +++ b/test/src/org/omegat/core/statistics/CalcMatchStatisticsTest.java @@ -25,6 +25,7 @@ package org.omegat.core.statistics; +import java.io.IOException; import java.nio.file.Path; import java.nio.file.Paths; import java.util.ArrayList; @@ -35,6 +36,7 @@ import org.junit.Assert; import org.junit.Before; +import org.junit.BeforeClass; import org.junit.Test; import org.omegat.core.Core; @@ -48,6 +50,7 @@ import org.omegat.core.data.ProtectedPart; 
import org.omegat.core.data.SourceTextEntry; import org.omegat.core.data.TMXEntry; +import org.omegat.core.segmentation.SRX; import org.omegat.core.segmentation.Segmenter; import org.omegat.filters2.FilterContext; import org.omegat.filters2.IFilter; @@ -64,13 +67,17 @@ public class CalcMatchStatisticsTest { + @BeforeClass + public static void setup() throws IOException { + TestPreferencesInitializer.init(); + } + @Test public void testCalcMatchStatics() throws Exception { TestProject project = new TestProject(new ProjectPropertiesTest()); IStatsConsumer callback = new TestStatsConsumer(); - Segmenter segmenter = new Segmenter(Preferences.getSRX()); - CalcMatchStatisticsMock calcMatchStatistics = new CalcMatchStatisticsMock(project, segmenter, - callback, 30); + Segmenter segmenter = new Segmenter(SRX.getDefault()); + CalcMatchStatisticsMock calcMatchStatistics = new CalcMatchStatisticsMock(project, segmenter, callback); calcMatchStatistics.start(); try { calcMatchStatistics.join(); @@ -123,7 +130,8 @@ public void testCalcMatchStatics() throws Exception { Assert.assertEquals("5699", result[7][4]); // change threshold - calcMatchStatistics = new CalcMatchStatisticsMock(project, segmenter, callback, 70); + Preferences.setPreference(Preferences.EXT_TMX_FUZZY_MATCH_THRESHOLD, 70); + calcMatchStatistics = new CalcMatchStatisticsMock(project, segmenter, callback); calcMatchStatistics.start(); try { calcMatchStatistics.join(); @@ -362,9 +370,8 @@ static class CalcMatchStatisticsMock extends CalcMatchStatistics { private MatchStatCounts result; private final IStatsConsumer callback; - CalcMatchStatisticsMock(IProject project, Segmenter segmenter, IStatsConsumer callback, - int threshold) { - super(project, segmenter, callback, false, threshold); + CalcMatchStatisticsMock(IProject project, Segmenter segmenter, IStatsConsumer callback) { + super(project, segmenter, callback, false); this.project = project; this.callback = callback; } diff --git 
a/test/src/org/omegat/core/statistics/FindMatchesTest.java b/test/src/org/omegat/core/statistics/FindMatchesTest.java index d846118915..c7df514eaf 100644 --- a/test/src/org/omegat/core/statistics/FindMatchesTest.java +++ b/test/src/org/omegat/core/statistics/FindMatchesTest.java @@ -111,22 +111,22 @@ public void testSegmented() throws Exception { + "han passat prou temps al lloc web per a convertir-se en usuaris bàsics." + " Una comunitat vibrant necessita una entrada regular de nouvinguts que hi participen habitualment" + " i aporten veus noves a les converses.\n"; - FindMatches finder = new FindMatches(project, segmenter, OConsts.MAX_NEAR_STRINGS, false, false, - true, 30); - List result = finder.search(srcText, true, true, iStopped); + FindMatches finder = new FindMatches(project, segmenter, OConsts.MAX_NEAR_STRINGS, false, 30); + // search without a separated segment match. + List result = finder.search(srcText, true, true, iStopped, false, true); assertEquals(OConsts.MAX_NEAR_STRINGS, result.size()); assertEquals(65, result.get(0).scores[0].score); assertEquals(62, result.get(0).scores[0].scoreNoStem); assertEquals(62, result.get(0).scores[0].adjustedScore); assertEquals(expectFirst, result.get(0).translation); assertEquals(expectNear, result.get(1).translation); - // + // search with a segmented match. 
List spaces = new ArrayList<>(); List brules = new ArrayList<>(); List segments = segmenter.segment(prop.getSourceLanguage(), srcText, spaces, brules); assertEquals(3, segments.size()); - finder = new FindMatches(project, segmenter, OConsts.MAX_NEAR_STRINGS, true, false, true, 30); - result = finder.search(srcText, true, true, iStopped); + finder = new FindMatches(project, segmenter, OConsts.MAX_NEAR_STRINGS, false, 30); + result = finder.search(srcText, false, iStopped); assertEquals(OConsts.MAX_NEAR_STRINGS, result.size()); assertEquals("Hit with segmented tmx record", 100, result.get(0).scores[0].score); assertEquals(100, result.get(0).scores[0].scoreNoStem); @@ -167,9 +167,8 @@ public void testSearchRFE1578() throws Exception { IProject project = new TestProject(prop, null, TMX_EN_US_SR, new LuceneEnglishTokenizer(), new DefaultTokenizer(), segmenter); IStopped iStopped = () -> false; - FindMatches finder = new FindMatches(project, segmenter, OConsts.MAX_NEAR_STRINGS, true, false, - true, 30); - List result = finder.search("XXX", true, true, iStopped); + FindMatches finder = new FindMatches(project, segmenter, OConsts.MAX_NEAR_STRINGS, false, 30); + List result = finder.search("XXX", false, iStopped); // Without the fix, the result has two entries, but it should one. assertEquals(1, result.size()); assertEquals("XXX", result.get(0).source); @@ -206,10 +205,9 @@ public void testSearchRFE1578_2() throws Exception { IProject project = new TestProject(prop, null, TMX_EN_US_GB_SR, new LuceneEnglishTokenizer(), new DefaultTokenizer(), segmenter); IStopped iStopped = () -> false; - FindMatches finder = new FindMatches(project, segmenter, OConsts.MAX_NEAR_STRINGS, true, false, - true, 30); + FindMatches finder = new FindMatches(project, segmenter, OConsts.MAX_NEAR_STRINGS, false, 30); // Search source "XXx" in en-US - List result = finder.search("XXX", true, true, iStopped); + List result = finder.search("XXX", false, iStopped); // There should be three entries. 
assertEquals(3, result.size()); assertEquals("XXx", result.get(0).source); // should be en-US. @@ -237,9 +235,8 @@ public void testSearchBUGS1251() throws Exception { List segments = segmenter.segment(sourceLanguage, srcText, spaces, brules); assertEquals(2, segments.size()); IStopped iStopped = () -> false; - FindMatches finder = new FindMatches(project, segmenter, OConsts.MAX_NEAR_STRINGS, true, false, - true, 30); - List result = finder.search(srcText, true, true, iStopped); + FindMatches finder = new FindMatches(project, segmenter, OConsts.MAX_NEAR_STRINGS, false, 30); + List result = finder.search(srcText, false, iStopped); assertEquals(srcText, result.get(0).source); assertEquals(1, result.size()); assertEquals("TM", result.get(0).comesFrom.name()); diff --git a/test/src/org/omegat/gui/matches/FindMatchesThreadTest.java b/test/src/org/omegat/gui/matches/FindMatchesThreadTest.java index e7f5660d01..3af345ef1c 100644 --- a/test/src/org/omegat/gui/matches/FindMatchesThreadTest.java +++ b/test/src/org/omegat/gui/matches/FindMatchesThreadTest.java @@ -26,11 +26,26 @@ package org.omegat.gui.matches; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertTrue; + +import java.io.File; +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; +import java.util.Map; +import java.util.TreeMap; + import org.apache.commons.io.FileUtils; import org.junit.AfterClass; import org.junit.Before; import org.junit.BeforeClass; import org.junit.Test; + import org.omegat.core.Core; import org.omegat.core.data.EntryKey; import org.omegat.core.data.ExternalTMFactory; @@ -52,20 +67,6 @@ import org.omegat.util.Preferences; import org.omegat.util.TestPreferencesInitializer; -import java.io.File; -import java.io.IOException; -import java.nio.file.Files; -import java.nio.file.Path; -import 
java.util.ArrayList; -import java.util.Collections; -import java.util.List; -import java.util.Map; -import java.util.TreeMap; - -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertFalse; -import static org.junit.Assert.assertTrue; - public class FindMatchesThreadTest { private static final File TMX_SEGMENT = new File("test/data/tmx/penalty-010/segment_1.tmx"); private static final String SOURCE_TEXT = "地力の搾取と浪費が現われる。(1)"; @@ -97,8 +98,9 @@ public void testSearchBUGS1248() throws Exception { prop.setSentenceSegmentingEnabled(false); IProject project = new TestProject(prop, TMX_SEGMENT, new LuceneCJKTokenizer(), new LuceneFrenchTokenizer()); Core.setProject(project); - Core.setSegmenter(new Segmenter(SRX.getDefault())); - List result = FindMatchesThread.finderSearch(project, SOURCE_TEXT, () -> false); + Segmenter segmenter = new Segmenter(SRX.getDefault()); + List result = FindMatchesThread.finderSearch(project, segmenter, SOURCE_TEXT, () -> false, + 30); assertEquals(SOURCE_TEXT, result.get(0).source); assertEquals("TM", result.get(0).comesFrom.name()); assertEquals(1, result.size());