From e3ca7b4474082080c4425ac2487281280d0bb32c Mon Sep 17 00:00:00 2001 From: Hiroshi Miura Date: Fri, 16 Feb 2024 15:16:30 +0900 Subject: [PATCH 1/5] [BUGS#1251] feat: add regression test - Add FindMatchesThreadTest to reproduce BUGS#1251 - Add a test case to test FindMatches with the case of BUGS#1251. Signed-off-by: Hiroshi Miura --- .../omegat/gui/matches/FindMatchesThread.java | 26 ++- test/data/tmx/penalty-010/segment_1.tmx | 16 ++ test/data/tmx/test-multiple-entries.tmx | 46 +++++ .../core/statistics/FindMatchesTest.java | 68 ++++++- .../gui/matches/FindMatchesThreadTest.java | 171 ++++++++++++++++++ 5 files changed, 320 insertions(+), 7 deletions(-) create mode 100644 test/data/tmx/penalty-010/segment_1.tmx create mode 100644 test/data/tmx/test-multiple-entries.tmx create mode 100644 test/src/org/omegat/gui/matches/FindMatchesThreadTest.java diff --git a/src/org/omegat/gui/matches/FindMatchesThread.java b/src/org/omegat/gui/matches/FindMatchesThread.java index a83b1704f3..2e9b5746ab 100644 --- a/src/org/omegat/gui/matches/FindMatchesThread.java +++ b/src/org/omegat/gui/matches/FindMatchesThread.java @@ -7,6 +7,7 @@ 2008 Alex Buloichik 2012 Thomas Cordonnier, Martin Fleurke 2013 Aaron Madlon-Kay + 2024 Hiroshi Miura Home page: https://www.omegat.org/ Support center: https://omegat.org/support @@ -34,15 +35,17 @@ import org.omegat.core.data.IProject; import org.omegat.core.data.SourceTextEntry; +import org.omegat.core.events.IStopped; import org.omegat.core.matching.NearString; import org.omegat.core.statistics.FindMatches; import org.omegat.gui.common.EntryInfoSearchThread; import org.omegat.util.OConsts; /** - * Find matches in separate thread then show result in the matches pane. + * Find matches in separate thread then show a result in the matches' pane. 
* * @author Alex Buloichik (alex73mail@gmail.com) + * @author Hiroshi Miura */ public class FindMatchesThread extends EntryInfoSearchThread> { private static final Logger LOGGER = Logger.getLogger(FindMatchesThread.class.getName()); @@ -52,9 +55,9 @@ public class FindMatchesThread extends EntryInfoSearchThread> { /** * Entry which is processed currently. - * - * If entry in controller was changed, it means user has moved to another entry, and there is no sense to - * continue. + *

+ * If entry in controller was changed, it means the user has moved to + * another entry, and there is no sense to continue. */ private final SourceTextEntry processedEntry; @@ -79,12 +82,23 @@ protected List search() throws Exception { long before = System.currentTimeMillis(); try { - FindMatches finder = new FindMatches(project, OConsts.MAX_NEAR_STRINGS, true, false); - List result = finder.search(processedEntry.getSrcText(), true, true, this::isEntryChanged); + List result = finderSearch(project, processedEntry.getSrcText(), this::isEntryChanged); LOGGER.finer(() -> "Time for find matches: " + (System.currentTimeMillis() - before)); return result; } catch (FindMatches.StoppedException ex) { throw new EntryChangedException(); } } + + /** + * Search matches (static for test purpose). + * @param project OmegaT project. + * @param srcText source text to look for. + * @param isEntryChanged stop and raise StopException when it returns true. + * @return result as a list of NearString. + */ + protected static List finderSearch(IProject project, String srcText, IStopped isEntryChanged) { + FindMatches finder = new FindMatches(project, OConsts.MAX_NEAR_STRINGS, true, false); + return finder.search(srcText, true, true, isEntryChanged); + } } diff --git a/test/data/tmx/penalty-010/segment_1.tmx b/test/data/tmx/penalty-010/segment_1.tmx new file mode 100644 index 0000000000..18e55fa724 --- /dev/null +++ b/test/data/tmx/penalty-010/segment_1.tmx @@ -0,0 +1,16 @@ + + + + +

+ + + + weird behavior + + + 地力の搾取と浪費が現われる。(1) + + + + diff --git a/test/data/tmx/test-multiple-entries.tmx b/test/data/tmx/test-multiple-entries.tmx new file mode 100644 index 0000000000..b2b9008c56 --- /dev/null +++ b/test/data/tmx/test-multiple-entries.tmx @@ -0,0 +1,46 @@ + + + +
+ + + + + Other + + + Altre + + + + + For installation on Linux. + + + Per l’installazioni nant’à i sistemi Linux. + + + + + For installation on other operating systems (such as FreeBSD and Solaris). + + + Per l’installazioni nant’à d’altri sistemi (cum’è FreeBSD è Solaris). + + + + + website/download.html + For installation on Linux. + For installation on other operating systems (such as FreeBSD and Solaris).<br0/> + + Other + + + Altri + + + + diff --git a/test/src/org/omegat/core/statistics/FindMatchesTest.java b/test/src/org/omegat/core/statistics/FindMatchesTest.java index 46c1349c8c..34a712f3c9 100644 --- a/test/src/org/omegat/core/statistics/FindMatchesTest.java +++ b/test/src/org/omegat/core/statistics/FindMatchesTest.java @@ -3,7 +3,7 @@ with fuzzy matching, translation memory, keyword search, glossaries, and translation leveraging into updated projects. - Copyright (C) 2021 Hiroshi Miura + Copyright (C) 2021-2024 Hiroshi Miura Home page: https://www.omegat.org/ Support center: https://omegat.org/support @@ -26,6 +26,8 @@ package org.omegat.core.statistics; import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertNotNull; +import static org.junit.Assert.assertNull; import static org.junit.Assert.assertTrue; import java.io.File; @@ -55,10 +57,13 @@ import org.omegat.core.events.IStopped; import org.omegat.core.matching.NearString; import org.omegat.core.segmentation.Rule; +import org.omegat.core.segmentation.SRX; import org.omegat.core.segmentation.Segmenter; import org.omegat.tokenizer.DefaultTokenizer; import org.omegat.tokenizer.ITokenizer; +import org.omegat.tokenizer.LuceneCJKTokenizer; import org.omegat.tokenizer.LuceneEnglishTokenizer; +import org.omegat.tokenizer.LuceneFrenchTokenizer; import org.omegat.util.Language; import org.omegat.util.Log; import org.omegat.util.OConsts; @@ -71,6 +76,8 @@ public class FindMatchesTest { private static final File TMX_MATCH_EN_CA = new File("test/data/tmx/test-match-stat-en-ca.tmx"); 
private static final File TMX_EN_US_SR = new File("test/data/tmx/en-US_sr.tmx"); private static final File TMX_EN_US_GB_SR = new File("test/data/tmx/en-US_en-GB_fr_sr.tmx"); + private static final File TMX_SEGMENT = new File("test/data/tmx/penalty-010/segment_1.tmx"); + private static final File TMX_MULTI = new File("test/data/tmx/test-multiple-entries.tmx"); private static Path tmpDir; @@ -214,6 +221,59 @@ public void testSearchRFE1578_2() throws Exception { assertEquals("ZZZ", result.get(2).translation); // sr } + @Test + public void testSearchBUGS1251() throws Exception { + ProjectProperties prop = new ProjectProperties(tmpDir.toFile()); + prop.setSourceLanguage("ja"); + prop.setTargetLanguage("fr"); + prop.setSupportDefaultTranslations(true); + prop.setSentenceSegmentingEnabled(false); + Segmenter segmenter = new Segmenter(SRX.getDefault()); + IProject project = new TestProject(prop, null, TMX_SEGMENT, new LuceneCJKTokenizer(), + new LuceneFrenchTokenizer(), segmenter); + Core.setProject(project); + SourceTextEntry ste = project.getAllEntries().get(1); + Language sourceLanguage = prop.getSourceLanguage(); + String srcText = ste.getSrcText(); + List spaces = new ArrayList<>(); + List brules = new ArrayList<>(); + List segments = segmenter.segment(sourceLanguage, srcText, spaces, brules); + assertEquals(2, segments.size()); + IStopped iStopped = () -> false; + FindMatches finder = new FindMatches(project, segmenter, OConsts.MAX_NEAR_STRINGS, true, false, + true, 30); + List result = finder.search(srcText, true, true, iStopped); + assertEquals(srcText, result.get(0).source); + assertEquals(1, result.size()); + assertEquals("TM", result.get(0).comesFrom.name()); + assertEquals(90, result.get(0).scores[0].score); + assertEquals("weird behavior", result.get(0).translation); + } + + @Test + public void testSearchMulti() throws Exception { + ProjectProperties prop = new ProjectProperties(tmpDir.toFile()); + prop.setSourceLanguage("en-US"); + 
prop.setTargetLanguage("co"); + prop.setSupportDefaultTranslations(true); + prop.setSentenceSegmentingEnabled(true); + Segmenter segmenter = new Segmenter(SRX.getDefault()); + IProject project = new TestProject(prop, TMX_MULTI, null, new LuceneEnglishTokenizer(), + new DefaultTokenizer(), segmenter); + IStopped iStopped = () -> false; + FindMatches finder = new FindMatches(project, segmenter, OConsts.MAX_NEAR_STRINGS, true, 85); + List result = finder.search("Other", false, iStopped); + assertEquals(3, result.size()); + assertEquals("Other", result.get(0).source); + assertEquals("Altre", result.get(0).translation); // default + assertNull(result.get(0).key); + assertEquals("Altri", result.get(1).translation); // alternative + assertNotNull(result.get(1).key); + assertEquals("website/download.html", result.get(1).key.file); + assertEquals("Other", result.get(2).translation); // source translation + } + + @BeforeClass public static void setUpClass() throws Exception { tmpDir = Files.createTempDirectory("omegat"); @@ -309,6 +369,12 @@ public List getAllEntries() { List ste = new ArrayList<>(); ste.add(new SourceTextEntry(new EntryKey("source.txt", "XXX", null, "", "", null), 1, null, null, Collections.emptyList())); + ste.add(new SourceTextEntry(new EntryKey("source.txt", "地力の搾取と浪費が現われる。(1)", null, "", "", null), + 1, null, null, Collections.emptyList())); + ste.add(new SourceTextEntry(new EntryKey("website/download.html", "Other", "id", + "For installation on Linux.", + "For installation on other operating systems (such as FreeBSD and Solaris).<br0/>", + null), 1, null, "Other", Collections.emptyList())); return ste; } diff --git a/test/src/org/omegat/gui/matches/FindMatchesThreadTest.java b/test/src/org/omegat/gui/matches/FindMatchesThreadTest.java new file mode 100644 index 0000000000..e7f5660d01 --- /dev/null +++ b/test/src/org/omegat/gui/matches/FindMatchesThreadTest.java @@ -0,0 +1,171 @@ 
+/******************************************************************************* + + OmegaT - Computer Assisted Translation (CAT) tool + with fuzzy matching, translation memory, keyword search, + glossaries, and translation leveraging into updated projects. + + Copyright (C) 2024 Hiroshi Miura + Home page: https://www.omegat.org/ + Support center: https://omegat.org/support + + This file is part of OmegaT. + + OmegaT is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + OmegaT is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . 
+ ******************************************************************************/ + +package org.omegat.gui.matches; + +import org.apache.commons.io.FileUtils; +import org.junit.AfterClass; +import org.junit.Before; +import org.junit.BeforeClass; +import org.junit.Test; +import org.omegat.core.Core; +import org.omegat.core.data.EntryKey; +import org.omegat.core.data.ExternalTMFactory; +import org.omegat.core.data.ExternalTMX; +import org.omegat.core.data.IProject; +import org.omegat.core.data.NotLoadedProject; +import org.omegat.core.data.ProjectProperties; +import org.omegat.core.data.ProjectTMX; +import org.omegat.core.data.SourceTextEntry; +import org.omegat.core.matching.NearString; +import org.omegat.core.segmentation.SRX; +import org.omegat.core.segmentation.Segmenter; +import org.omegat.tokenizer.DefaultTokenizer; +import org.omegat.tokenizer.ITokenizer; +import org.omegat.tokenizer.LuceneCJKTokenizer; +import org.omegat.tokenizer.LuceneEnglishTokenizer; +import org.omegat.tokenizer.LuceneFrenchTokenizer; +import org.omegat.util.Language; +import org.omegat.util.Preferences; +import org.omegat.util.TestPreferencesInitializer; + +import java.io.File; +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; +import java.util.Map; +import java.util.TreeMap; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertTrue; + +public class FindMatchesThreadTest { + private static final File TMX_SEGMENT = new File("test/data/tmx/penalty-010/segment_1.tmx"); + private static final String SOURCE_TEXT = "地力の搾取と浪費が現われる。(1)"; + private static Path tmpDir; + + @BeforeClass + public static void setUpClass() throws Exception { + tmpDir = Files.createTempDirectory("omegat"); + assertTrue(tmpDir.toFile().isDirectory()); + } + + @Before + public void setUp() throws Exception { + 
Core.initializeConsole(new TreeMap<>()); + TestPreferencesInitializer.init(); + Preferences.setPreference(Preferences.EXT_TMX_SHOW_LEVEL2, false); + Preferences.setPreference(Preferences.EXT_TMX_USE_SLASH, false); + Preferences.setPreference(Preferences.EXT_TMX_KEEP_FOREIGN_MATCH, true); + Core.registerTokenizerClass(DefaultTokenizer.class); + Core.registerTokenizerClass(LuceneEnglishTokenizer.class); + } + + @Test + public void testSearchBUGS1248() throws Exception { + ProjectProperties prop = new ProjectProperties(tmpDir.toFile()); + prop.setSourceLanguage("ja"); + prop.setTargetLanguage("fr"); + prop.setSupportDefaultTranslations(true); + prop.setSentenceSegmentingEnabled(false); + IProject project = new TestProject(prop, TMX_SEGMENT, new LuceneCJKTokenizer(), new LuceneFrenchTokenizer()); + Core.setProject(project); + Core.setSegmenter(new Segmenter(SRX.getDefault())); + List result = FindMatchesThread.finderSearch(project, SOURCE_TEXT, () -> false); + assertEquals(SOURCE_TEXT, result.get(0).source); + assertEquals("TM", result.get(0).comesFrom.name()); + assertEquals(1, result.size()); + assertEquals(90, result.get(0).scores[0].score); + assertEquals("weird behavior", result.get(0).translation); + } + + static class TestProject extends NotLoadedProject implements IProject { + private final ProjectProperties prop; + private final File testTmx; + private final ITokenizer sourceTokenizer; + private final ITokenizer targetTokenizer; + + TestProject(ProjectProperties prop, File testTmx) { + this(prop, testTmx, new LuceneEnglishTokenizer(), new DefaultTokenizer()); + } + + TestProject(ProjectProperties prop, File testTmx, ITokenizer source, ITokenizer target) { + this.prop = prop; + this.testTmx = testTmx; + sourceTokenizer = source; + targetTokenizer = target; + } + + @Override + public ProjectProperties getProjectProperties() { + return prop; + } + + @Override + public List getAllEntries() { + List ste = new ArrayList<>(); + ste.add(new SourceTextEntry(new 
EntryKey("source.txt", SOURCE_TEXT, null, "", "", null), + 1, null, null, Collections.emptyList())); + return ste; + } + + @Override + public ITokenizer getSourceTokenizer() { + return sourceTokenizer; + }; + + @Override + public ITokenizer getTargetTokenizer() { + return targetTokenizer; + } + + @Override + public Map getOtherTargetLanguageTMs() { + return Collections.emptyMap(); + } + + @Override + public Map getTransMemories() { + Map transMemories = new TreeMap<>(); + try { + ExternalTMX newTMX = ExternalTMFactory.load(testTmx); + transMemories.put(testTmx.getPath(), newTMX); + } catch (Exception ignored) { + } + return Collections.unmodifiableMap(transMemories); + } + } + + @AfterClass + public static void tearDown() throws IOException { + FileUtils.deleteDirectory(tmpDir.toFile()); + assertFalse(tmpDir.toFile().exists()); + } +} From 2c44aed053f1c92e3e870d7a4019e9a32a84fa65 Mon Sep 17 00:00:00 2001 From: Hiroshi Miura Date: Tue, 26 Nov 2024 13:06:52 +0900 Subject: [PATCH 2/5] [BUGS#1251] refactor: FindMatches - add internal search method to handle normal and segmented search conditions, also use for test purpose - drop threshold arguments for CalcMatchStatistics usage - FindMatchesThread.finderSearch to take threshold argument for testing. - update search() callers accordingly. 
Signed-off-by: Hiroshi Miura --- .../core/statistics/CalcMatchStatistics.java | 13 +-- .../omegat/core/statistics/FindMatches.java | 85 +++++++++++-------- .../omegat/gui/matches/FindMatchesThread.java | 14 ++- .../statistics/CalcMatchStatisticsTest.java | 21 +++-- .../core/statistics/FindMatchesTest.java | 27 +++--- .../gui/matches/FindMatchesThreadTest.java | 37 ++++---- 6 files changed, 110 insertions(+), 87 deletions(-) diff --git a/src/org/omegat/core/statistics/CalcMatchStatistics.java b/src/org/omegat/core/statistics/CalcMatchStatistics.java index f48f2dcf57..2aa2e4fc7a 100644 --- a/src/org/omegat/core/statistics/CalcMatchStatistics.java +++ b/src/org/omegat/core/statistics/CalcMatchStatistics.java @@ -51,7 +51,6 @@ import org.omegat.core.threads.LongProcessThread; import org.omegat.util.OConsts; import org.omegat.util.OStrings; -import org.omegat.util.Preferences; import org.omegat.util.StringUtil; import org.omegat.util.Token; import org.omegat.util.gui.TextUtil; @@ -108,19 +107,15 @@ public class CalcMatchStatistics extends LongProcessThread { private final IProject project; public CalcMatchStatistics(IStatsConsumer callback, boolean perFile) { - this(Core.getProject(), Core.getSegmenter(), callback, perFile, - Preferences.getPreferenceDefault(Preferences.EXT_TMX_FUZZY_MATCH_THRESHOLD, - OConsts.FUZZY_MATCH_THRESHOLD)); + this(Core.getProject(), Core.getSegmenter(), callback, perFile); } - public CalcMatchStatistics(IProject project, Segmenter segmenter, IStatsConsumer callback, - boolean perFile, int threshold) { + public CalcMatchStatistics(IProject project, Segmenter segmenter, IStatsConsumer callback, boolean perFile) { this.project = project; this.callback = callback; this.perFile = perFile; finder = ThreadLocal.withInitial( - () -> new FindMatches(project, segmenter, OConsts.MAX_NEAR_STRINGS, true, - false, false, threshold)); + () -> new FindMatches(project, segmenter, OConsts.MAX_NEAR_STRINGS, false, -1)); } @Override @@ -313,7 +308,7 @@ Optional 
calcSimilarity(List untranslatedEntri int calcMaxSimilarity(SourceTextEntry ste) { String srcNoXmlTags = removeXmlTags(ste); FindMatches localFinder = finder.get(); - List nears = localFinder.search(srcNoXmlTags, true, false, this::isInterrupted); + List nears = localFinder.search(srcNoXmlTags, false, this::isInterrupted); final Token[] strTokensStem = localFinder.tokenizeAll(ste.getSrcText()); int maxSimilarity = 0; CACHE: for (NearString near : nears) { diff --git a/src/org/omegat/core/statistics/FindMatches.java b/src/org/omegat/core/statistics/FindMatches.java index 614cb2c8f4..ebffbd8f3b 100644 --- a/src/org/omegat/core/statistics/FindMatches.java +++ b/src/org/omegat/core/statistics/FindMatches.java @@ -126,27 +126,15 @@ public class FindMatches { /** Tokens for original string, includes numbers and tags. */ private Token[] strTokensAll; - // This finder used for search separate segment matches - private FindMatches separateSegmentMatcher; - private final int fuzzyMatchThreshold; - private final boolean applyThreshold; - private final Segmenter segmenter; - /** - * @param searchExactlyTheSame - * allows to search similarities with the same text as source - * segment. This mode used only for separate sentence match in - * paragraph project, i.e. where source is just part of current - * source. 
- */ + @Deprecated(since = "6.1.0") public FindMatches(IProject project, int maxCount, boolean allowSeparateSegmentMatch, boolean searchExactlyTheSame) { - this(project, Core.getSegmenter(), maxCount, allowSeparateSegmentMatch, searchExactlyTheSame, true, - Preferences.getPreferenceDefault(Preferences.EXT_TMX_FUZZY_MATCH_THRESHOLD, - OConsts.FUZZY_MATCH_THRESHOLD)); + this(project, Core.getSegmenter(), maxCount, searchExactlyTheSame, Preferences.getPreferenceDefault( + Preferences.EXT_TMX_FUZZY_MATCH_THRESHOLD, OConsts.FUZZY_MATCH_THRESHOLD)); } /** @@ -166,19 +154,21 @@ public FindMatches(IProject project, int maxCount, boolean allowSeparateSegmentM * @param threshold * threshold to use. */ - public FindMatches(IProject project, Segmenter segmenter, int maxCount, boolean allowSeparateSegmentMatch, - boolean searchExactlyTheSame, boolean applyThreshold, int threshold) { + public FindMatches(IProject project, Segmenter segmenter, int maxCount, + boolean searchExactlyTheSame, int threshold) { this.project = project; this.segmenter = segmenter; this.tok = project.getSourceTokenizer(); this.srcLocale = project.getProjectProperties().getSourceLanguage().getLocale(); this.maxCount = maxCount; this.searchExactlyTheSame = searchExactlyTheSame; - if (allowSeparateSegmentMatch && !project.getProjectProperties().isSentenceSegmentingEnabled()) { - separateSegmentMatcher = new FindMatches(project, segmenter, 1, false, true, true, threshold); - } this.fuzzyMatchThreshold = threshold; - this.applyThreshold = applyThreshold; + } + + @Deprecated(since = "6.1.0") + public List search(final String searchText, final boolean requiresTranslation, + final boolean fillSimilarityData, final IStopped stop) throws StoppedException { + return search(searchText, fillSimilarityData, stop); } /** @@ -195,8 +185,33 @@ public FindMatches(IProject project, Segmenter segmenter, int maxCount, boolean * @throws StoppedException * raised when stopped during a search process. 
*/ - public List search(String searchText, boolean requiresTranslation, boolean fillSimilarityData, - IStopped stop) throws StoppedException { + public List search(String searchText, boolean fillSimilarityData, IStopped stop) + throws StoppedException { + return search(searchText, fillSimilarityData, stop, + !project.getProjectProperties().isSentenceSegmentingEnabled()); + } + + /** + * Search Translation memories. + *

+ * Internal method to handle search conditions. + * It is accessible as package-private for testing. + * + * @param searchText + * target segment or term to search. + * @param fillSimilarityData + * fill similarity data into the result of NearString objects. + * @param stop + * IStopped callback object to indicate cancel operation. + * @param runSeparateSegmentMatch + * Also search with segmented terms search. + * @return + * List of NearString objects. + * @throws StoppedException + * When stopped the process during search. + */ + List search(String searchText, boolean fillSimilarityData, IStopped stop, + boolean runSeparateSegmentMatch) throws StoppedException { result = new ArrayList<>(OConsts.MAX_NEAR_STRINGS + 1); srcText = searchText; removedText = ""; @@ -226,7 +241,7 @@ public List search(String searchText, boolean requiresTranslation, b // skip original==original entry comparison return; } - if (requiresTranslation && trans.translation == null) { + if (trans.translation == null) { return; } String fileName = project.isOrphaned(source) ? ORPHANED_FILE_NAME : null; @@ -241,7 +256,7 @@ public List search(String searchText, boolean requiresTranslation, b // skip original==original entry comparison return; } - if (requiresTranslation && trans.translation == null) { + if (trans.translation == null) { return; } String fileName = project.isOrphaned(source) ? 
ORPHANED_FILE_NAME : null; @@ -255,7 +270,6 @@ public List search(String searchText, boolean requiresTranslation, b */ int foreignPenalty = Preferences.getPreferenceDefault(Preferences.PENALTY_FOR_FOREIGN_MATCHES, Preferences.PENALTY_FOR_FOREIGN_MATCHES_DEFAULT); - // travel by translation memories for (Map.Entry en : project.getTransMemories().entrySet()) { int penalty = 0; Matcher matcher = SEARCH_FOR_PENALTY.matcher(en.getKey()); @@ -265,11 +279,11 @@ public List search(String searchText, boolean requiresTranslation, b for (ITMXEntry tmen : en.getValue().getEntries()) { checkStopped(stop); if (tmen.getSourceText() == null) { - // Not all TMX entries have a source; in that case there can - // be no meaningful match, so skip. + // Not all TMX entries have a source; skip it in + // the case, because of no meaningful. continue; } - if (requiresTranslation && tmen.getTranslationText() == null) { + if (tmen.getTranslationText() == null) { continue; } int tmenPenalty = penalty; @@ -290,7 +304,9 @@ public List search(String searchText, boolean requiresTranslation, b ste.isSourceTranslationFuzzy(), 0); } } - if (separateSegmentMatcher != null) { + if (runSeparateSegmentMatch) { + FindMatches separateSegmentMatcher = new FindMatches(project, segmenter, 1, true, + fuzzyMatchThreshold); // split paragraph even when segmentation disabled, then find // matches for every segment List spaces = new ArrayList<>(); @@ -303,9 +319,10 @@ public List search(String searchText, boolean requiresTranslation, b List ftrans = new ArrayList<>(segments.size()); // multiple segments for (String onesrc : segments) { - // find match for a separate segment - List segmentMatch = separateSegmentMatcher.search(onesrc, requiresTranslation, - false, stop); + // find match for a separate segment. + // WARN: the 5th argument should be + // `false` to avoid an infinite-loop. 
+ List segmentMatch = separateSegmentMatcher.search(onesrc, false, stop, false); if (!segmentMatch.isEmpty() && segmentMatch.get(0).scores[0].score >= SUBSEGMENT_MATCH_THRESHOLD) { fsrc.add(segmentMatch.get(0).source); @@ -415,7 +432,7 @@ public void processEntry(EntryKey key, ITMXEntry entry, String tmxName, } // BUGS#1236 - stat display does not use threshold config check - if (applyThreshold && similarityStem < fuzzyMatchThreshold + if (fuzzyMatchThreshold > 0 && similarityStem < fuzzyMatchThreshold && similarityNoStem < fuzzyMatchThreshold && simAdjusted < fuzzyMatchThreshold) { return; } diff --git a/src/org/omegat/gui/matches/FindMatchesThread.java b/src/org/omegat/gui/matches/FindMatchesThread.java index 2e9b5746ab..029c6c94dd 100644 --- a/src/org/omegat/gui/matches/FindMatchesThread.java +++ b/src/org/omegat/gui/matches/FindMatchesThread.java @@ -33,13 +33,16 @@ import java.util.List; import java.util.logging.Logger; +import org.omegat.core.Core; import org.omegat.core.data.IProject; import org.omegat.core.data.SourceTextEntry; import org.omegat.core.events.IStopped; import org.omegat.core.matching.NearString; +import org.omegat.core.segmentation.Segmenter; import org.omegat.core.statistics.FindMatches; import org.omegat.gui.common.EntryInfoSearchThread; import org.omegat.util.OConsts; +import org.omegat.util.Preferences; /** * Find matches in separate thread then show a result in the matches' pane. 
@@ -82,7 +85,9 @@ protected List search() throws Exception { long before = System.currentTimeMillis(); try { - List result = finderSearch(project, processedEntry.getSrcText(), this::isEntryChanged); + List result = finderSearch(project, Core.getSegmenter(), processedEntry.getSrcText(), + this::isEntryChanged, Preferences.getPreferenceDefault( + Preferences.EXT_TMX_FUZZY_MATCH_THRESHOLD, OConsts.FUZZY_MATCH_THRESHOLD)); LOGGER.finer(() -> "Time for find matches: " + (System.currentTimeMillis() - before)); return result; } catch (FindMatches.StoppedException ex) { @@ -97,8 +102,9 @@ protected List search() throws Exception { * @param isEntryChanged stop and raise StopException when it returns true. * @return result as a list of NearString. */ - protected static List finderSearch(IProject project, String srcText, IStopped isEntryChanged) { - FindMatches finder = new FindMatches(project, OConsts.MAX_NEAR_STRINGS, true, false); - return finder.search(srcText, true, true, isEntryChanged); + protected static List finderSearch(IProject project, Segmenter segmenter, String srcText, + IStopped isEntryChanged, int threshold) { + FindMatches finder = new FindMatches(project, segmenter, OConsts.MAX_NEAR_STRINGS, false, threshold); + return finder.search(srcText, true, isEntryChanged); } } diff --git a/test/src/org/omegat/core/statistics/CalcMatchStatisticsTest.java b/test/src/org/omegat/core/statistics/CalcMatchStatisticsTest.java index 5da4dd3ffc..25ab768787 100644 --- a/test/src/org/omegat/core/statistics/CalcMatchStatisticsTest.java +++ b/test/src/org/omegat/core/statistics/CalcMatchStatisticsTest.java @@ -25,6 +25,7 @@ package org.omegat.core.statistics; +import java.io.IOException; import java.nio.file.Path; import java.nio.file.Paths; import java.util.ArrayList; @@ -35,6 +36,7 @@ import org.junit.Assert; import org.junit.Before; +import org.junit.BeforeClass; import org.junit.Test; import org.omegat.core.Core; @@ -48,6 +50,7 @@ import org.omegat.core.data.ProtectedPart; 
import org.omegat.core.data.SourceTextEntry; import org.omegat.core.data.TMXEntry; +import org.omegat.core.segmentation.SRX; import org.omegat.core.segmentation.Segmenter; import org.omegat.filters2.FilterContext; import org.omegat.filters2.IFilter; @@ -64,13 +67,17 @@ public class CalcMatchStatisticsTest { + @BeforeClass + public static void setup() throws IOException { + TestPreferencesInitializer.init(); + } + @Test public void testCalcMatchStatics() throws Exception { TestProject project = new TestProject(new ProjectPropertiesTest()); IStatsConsumer callback = new TestStatsConsumer(); - Segmenter segmenter = new Segmenter(Preferences.getSRX()); - CalcMatchStatisticsMock calcMatchStatistics = new CalcMatchStatisticsMock(project, segmenter, - callback, 30); + Segmenter segmenter = new Segmenter(SRX.getDefault()); + CalcMatchStatisticsMock calcMatchStatistics = new CalcMatchStatisticsMock(project, segmenter, callback); calcMatchStatistics.start(); try { calcMatchStatistics.join(); @@ -123,7 +130,8 @@ public void testCalcMatchStatics() throws Exception { Assert.assertEquals("5699", result[7][4]); // change threshold - calcMatchStatistics = new CalcMatchStatisticsMock(project, segmenter, callback, 70); + Preferences.setPreference(Preferences.EXT_TMX_FUZZY_MATCH_THRESHOLD, 70); + calcMatchStatistics = new CalcMatchStatisticsMock(project, segmenter, callback); calcMatchStatistics.start(); try { calcMatchStatistics.join(); @@ -362,9 +370,8 @@ static class CalcMatchStatisticsMock extends CalcMatchStatistics { private MatchStatCounts result; private final IStatsConsumer callback; - CalcMatchStatisticsMock(IProject project, Segmenter segmenter, IStatsConsumer callback, - int threshold) { - super(project, segmenter, callback, false, threshold); + CalcMatchStatisticsMock(IProject project, Segmenter segmenter, IStatsConsumer callback) { + super(project, segmenter, callback, false); this.project = project; this.callback = callback; } diff --git 
a/test/src/org/omegat/core/statistics/FindMatchesTest.java b/test/src/org/omegat/core/statistics/FindMatchesTest.java index 34a712f3c9..c1124e4a9c 100644 --- a/test/src/org/omegat/core/statistics/FindMatchesTest.java +++ b/test/src/org/omegat/core/statistics/FindMatchesTest.java @@ -114,22 +114,22 @@ public void testSegmented() throws Exception { + "han passat prou temps al lloc web per a convertir-se en usuaris bàsics." + " Una comunitat vibrant necessita una entrada regular de nouvinguts que hi participen habitualment" + " i aporten veus noves a les converses.\n"; - FindMatches finder = new FindMatches(project, segmenter, OConsts.MAX_NEAR_STRINGS, false, false, - true, 30); - List result = finder.search(srcText, true, true, iStopped); + FindMatches finder = new FindMatches(project, segmenter, OConsts.MAX_NEAR_STRINGS, false, 30); + // search without a separated segment match. + List result = finder.search(srcText, true, iStopped, false); assertEquals(OConsts.MAX_NEAR_STRINGS, result.size()); assertEquals(65, result.get(0).scores[0].score); assertEquals(62, result.get(0).scores[0].scoreNoStem); assertEquals(62, result.get(0).scores[0].adjustedScore); assertEquals(expectFirst, result.get(0).translation); assertEquals(expectNear, result.get(1).translation); - // + // search with a segmented match. 
List spaces = new ArrayList<>(); List brules = new ArrayList<>(); List segments = segmenter.segment(prop.getSourceLanguage(), srcText, spaces, brules); assertEquals(3, segments.size()); - finder = new FindMatches(project, segmenter, OConsts.MAX_NEAR_STRINGS, true, false, true, 30); - result = finder.search(srcText, true, true, iStopped); + finder = new FindMatches(project, segmenter, OConsts.MAX_NEAR_STRINGS, false, 30); + result = finder.search(srcText, false, iStopped); assertEquals(OConsts.MAX_NEAR_STRINGS, result.size()); assertEquals("Hit with segmented tmx record", 100, result.get(0).scores[0].score); assertEquals(100, result.get(0).scores[0].scoreNoStem); @@ -170,9 +170,8 @@ public void testSearchRFE1578() throws Exception { IProject project = new TestProject(prop, null, TMX_EN_US_SR, new LuceneEnglishTokenizer(), new DefaultTokenizer(), segmenter); IStopped iStopped = () -> false; - FindMatches finder = new FindMatches(project, segmenter, OConsts.MAX_NEAR_STRINGS, true, false, - true, 30); - List result = finder.search("XXX", true, true, iStopped); + FindMatches finder = new FindMatches(project, segmenter, OConsts.MAX_NEAR_STRINGS, false, 30); + List result = finder.search("XXX", false, iStopped); // Without the fix, the result has two entries, but it should one. assertEquals(1, result.size()); assertEquals("XXX", result.get(0).source); @@ -209,10 +208,9 @@ public void testSearchRFE1578_2() throws Exception { IProject project = new TestProject(prop, null, TMX_EN_US_GB_SR, new LuceneEnglishTokenizer(), new DefaultTokenizer(), segmenter); IStopped iStopped = () -> false; - FindMatches finder = new FindMatches(project, segmenter, OConsts.MAX_NEAR_STRINGS, true, false, - true, 30); + FindMatches finder = new FindMatches(project, segmenter, OConsts.MAX_NEAR_STRINGS, false, 30); // Search source "XXx" in en-US - List result = finder.search("XXX", true, true, iStopped); + List result = finder.search("XXX", false, iStopped); // There should be three entries. 
assertEquals(3, result.size()); assertEquals("XXx", result.get(0).source); // should be en-US. @@ -240,9 +238,8 @@ public void testSearchBUGS1251() throws Exception { List segments = segmenter.segment(sourceLanguage, srcText, spaces, brules); assertEquals(2, segments.size()); IStopped iStopped = () -> false; - FindMatches finder = new FindMatches(project, segmenter, OConsts.MAX_NEAR_STRINGS, true, false, - true, 30); - List result = finder.search(srcText, true, true, iStopped); + FindMatches finder = new FindMatches(project, segmenter, OConsts.MAX_NEAR_STRINGS, false, 30); + List result = finder.search(srcText, false, iStopped); assertEquals(srcText, result.get(0).source); assertEquals(1, result.size()); assertEquals("TM", result.get(0).comesFrom.name()); diff --git a/test/src/org/omegat/gui/matches/FindMatchesThreadTest.java b/test/src/org/omegat/gui/matches/FindMatchesThreadTest.java index e7f5660d01..fce9961136 100644 --- a/test/src/org/omegat/gui/matches/FindMatchesThreadTest.java +++ b/test/src/org/omegat/gui/matches/FindMatchesThreadTest.java @@ -1,5 +1,4 @@ /******************************************************************************* - OmegaT - Computer Assisted Translation (CAT) tool with fuzzy matching, translation memory, keyword search, glossaries, and translation leveraging into updated projects. 
@@ -26,11 +25,26 @@ package org.omegat.gui.matches; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertTrue; + +import java.io.File; +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; +import java.util.Map; +import java.util.TreeMap; + import org.apache.commons.io.FileUtils; import org.junit.AfterClass; import org.junit.Before; import org.junit.BeforeClass; import org.junit.Test; + import org.omegat.core.Core; import org.omegat.core.data.EntryKey; import org.omegat.core.data.ExternalTMFactory; @@ -52,20 +66,6 @@ import org.omegat.util.Preferences; import org.omegat.util.TestPreferencesInitializer; -import java.io.File; -import java.io.IOException; -import java.nio.file.Files; -import java.nio.file.Path; -import java.util.ArrayList; -import java.util.Collections; -import java.util.List; -import java.util.Map; -import java.util.TreeMap; - -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertFalse; -import static org.junit.Assert.assertTrue; - public class FindMatchesThreadTest { private static final File TMX_SEGMENT = new File("test/data/tmx/penalty-010/segment_1.tmx"); private static final String SOURCE_TEXT = "地力の搾取と浪費が現われる。(1)"; @@ -97,11 +97,12 @@ public void testSearchBUGS1248() throws Exception { prop.setSentenceSegmentingEnabled(false); IProject project = new TestProject(prop, TMX_SEGMENT, new LuceneCJKTokenizer(), new LuceneFrenchTokenizer()); Core.setProject(project); - Core.setSegmenter(new Segmenter(SRX.getDefault())); - List result = FindMatchesThread.finderSearch(project, SOURCE_TEXT, () -> false); + Segmenter segmenter = new Segmenter(SRX.getDefault()); + List result = FindMatchesThread.finderSearch(project, segmenter, SOURCE_TEXT, () -> false, + 30); + assertEquals(2, result.size()); assertEquals(SOURCE_TEXT, 
result.get(0).source); assertEquals("TM", result.get(0).comesFrom.name()); - assertEquals(1, result.size()); assertEquals(90, result.get(0).scores[0].score); assertEquals("weird behavior", result.get(0).translation); } From f2de1ee5d7c901a63aeb8469308e449dbe6e4260 Mon Sep 17 00:00:00 2001 From: Hiroshi Miura Date: Wed, 18 Dec 2024 13:31:38 +0900 Subject: [PATCH 3/5] refactor: ignore regression test It should be enabled when the fix is proposed. Signed-off-by: Hiroshi Miura --- test/src/org/omegat/core/statistics/FindMatchesTest.java | 2 ++ test/src/org/omegat/gui/matches/FindMatchesThreadTest.java | 2 ++ 2 files changed, 4 insertions(+) diff --git a/test/src/org/omegat/core/statistics/FindMatchesTest.java b/test/src/org/omegat/core/statistics/FindMatchesTest.java index c1124e4a9c..128be5dc9c 100644 --- a/test/src/org/omegat/core/statistics/FindMatchesTest.java +++ b/test/src/org/omegat/core/statistics/FindMatchesTest.java @@ -42,6 +42,7 @@ import org.junit.Before; import org.junit.BeforeClass; +import org.junit.Ignore; import org.junit.Test; import org.omegat.core.Core; @@ -219,6 +220,7 @@ public void testSearchRFE1578_2() throws Exception { assertEquals("ZZZ", result.get(2).translation); // sr } + @Ignore("Should be enalbed when the bug fix proposed.") @Test public void testSearchBUGS1251() throws Exception { ProjectProperties prop = new ProjectProperties(tmpDir.toFile()); diff --git a/test/src/org/omegat/gui/matches/FindMatchesThreadTest.java b/test/src/org/omegat/gui/matches/FindMatchesThreadTest.java index fce9961136..81246276e8 100644 --- a/test/src/org/omegat/gui/matches/FindMatchesThreadTest.java +++ b/test/src/org/omegat/gui/matches/FindMatchesThreadTest.java @@ -43,6 +43,7 @@ import org.junit.AfterClass; import org.junit.Before; import org.junit.BeforeClass; +import org.junit.Ignore; import org.junit.Test; import org.omegat.core.Core; @@ -88,6 +89,7 @@ public void setUp() throws Exception { Core.registerTokenizerClass(LuceneEnglishTokenizer.class); } + 
@Ignore("Should be enalbed when the bug fix proposed.") @Test public void testSearchBUGS1248() throws Exception { ProjectProperties prop = new ProjectProperties(tmpDir.toFile()); From bf0938f91516cfd6c2e200833a75de5ee3aaaea7 Mon Sep 17 00:00:00 2001 From: Hiroshi Miura Date: Wed, 18 Dec 2024 13:31:38 +0900 Subject: [PATCH 4/5] refactor: fix merge error when split PR#963 Signed-off-by: Hiroshi Miura --- .../core/statistics/CalcMatchStatistics.java | 10 +++++++--- .../statistics/CalcMatchStatisticsTest.java | 18 ++++++------------ 2 files changed, 13 insertions(+), 15 deletions(-) diff --git a/src/org/omegat/core/statistics/CalcMatchStatistics.java b/src/org/omegat/core/statistics/CalcMatchStatistics.java index 2aa2e4fc7a..674e4ddabd 100644 --- a/src/org/omegat/core/statistics/CalcMatchStatistics.java +++ b/src/org/omegat/core/statistics/CalcMatchStatistics.java @@ -51,6 +51,7 @@ import org.omegat.core.threads.LongProcessThread; import org.omegat.util.OConsts; import org.omegat.util.OStrings; +import org.omegat.util.Preferences; import org.omegat.util.StringUtil; import org.omegat.util.Token; import org.omegat.util.gui.TextUtil; @@ -107,15 +108,18 @@ public class CalcMatchStatistics extends LongProcessThread { private final IProject project; public CalcMatchStatistics(IStatsConsumer callback, boolean perFile) { - this(Core.getProject(), Core.getSegmenter(), callback, perFile); + this(Core.getProject(), Core.getSegmenter(), callback, perFile, + Preferences.getPreferenceDefault(Preferences.EXT_TMX_FUZZY_MATCH_THRESHOLD, + OConsts.FUZZY_MATCH_THRESHOLD)); } - public CalcMatchStatistics(IProject project, Segmenter segmenter, IStatsConsumer callback, boolean perFile) { + public CalcMatchStatistics(IProject project, Segmenter segmenter, IStatsConsumer callback, + boolean perFile, int threshold) { this.project = project; this.callback = callback; this.perFile = perFile; finder = ThreadLocal.withInitial( - () -> new FindMatches(project, segmenter, OConsts.MAX_NEAR_STRINGS, 
false, -1)); + () -> new FindMatches(project, segmenter, OConsts.MAX_NEAR_STRINGS, false, threshold)); } @Override diff --git a/test/src/org/omegat/core/statistics/CalcMatchStatisticsTest.java b/test/src/org/omegat/core/statistics/CalcMatchStatisticsTest.java index 25ab768787..e90729d4c6 100644 --- a/test/src/org/omegat/core/statistics/CalcMatchStatisticsTest.java +++ b/test/src/org/omegat/core/statistics/CalcMatchStatisticsTest.java @@ -25,7 +25,6 @@ package org.omegat.core.statistics; -import java.io.IOException; import java.nio.file.Path; import java.nio.file.Paths; import java.util.ArrayList; @@ -36,7 +35,6 @@ import org.junit.Assert; import org.junit.Before; -import org.junit.BeforeClass; import org.junit.Test; import org.omegat.core.Core; @@ -67,17 +65,13 @@ public class CalcMatchStatisticsTest { - @BeforeClass - public static void setup() throws IOException { - TestPreferencesInitializer.init(); - } - @Test public void testCalcMatchStatics() throws Exception { TestProject project = new TestProject(new ProjectPropertiesTest()); IStatsConsumer callback = new TestStatsConsumer(); Segmenter segmenter = new Segmenter(SRX.getDefault()); - CalcMatchStatisticsMock calcMatchStatistics = new CalcMatchStatisticsMock(project, segmenter, callback); + CalcMatchStatisticsMock calcMatchStatistics = new CalcMatchStatisticsMock(project, segmenter, + callback, 30); calcMatchStatistics.start(); try { calcMatchStatistics.join(); @@ -130,8 +124,7 @@ public void testCalcMatchStatics() throws Exception { Assert.assertEquals("5699", result[7][4]); // change threshold - Preferences.setPreference(Preferences.EXT_TMX_FUZZY_MATCH_THRESHOLD, 70); - calcMatchStatistics = new CalcMatchStatisticsMock(project, segmenter, callback); + calcMatchStatistics = new CalcMatchStatisticsMock(project, segmenter, callback, 70); calcMatchStatistics.start(); try { calcMatchStatistics.join(); @@ -370,8 +363,9 @@ static class CalcMatchStatisticsMock extends CalcMatchStatistics { private MatchStatCounts 
result; private final IStatsConsumer callback; - CalcMatchStatisticsMock(IProject project, Segmenter segmenter, IStatsConsumer callback) { - super(project, segmenter, callback, false); + CalcMatchStatisticsMock(IProject project, Segmenter segmenter, IStatsConsumer callback, + int threshold) { + super(project, segmenter, callback, false, threshold); this.project = project; this.callback = callback; } From 7ac17f443ff806cd1a4543ef8bec89fa7eda386b Mon Sep 17 00:00:00 2001 From: Hiroshi Miura Date: Wed, 18 Dec 2024 23:41:45 +0900 Subject: [PATCH 5/5] fix: when disabling threshold, set value <= 0 - CalcMatchStatisticsTest checks with the threshold disabled, so set it to -1 Signed-off-by: Hiroshi Miura --- .../src/org/omegat/core/statistics/CalcMatchStatisticsTest.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/src/org/omegat/core/statistics/CalcMatchStatisticsTest.java b/test/src/org/omegat/core/statistics/CalcMatchStatisticsTest.java index e90729d4c6..75e95739d8 100644 --- a/test/src/org/omegat/core/statistics/CalcMatchStatisticsTest.java +++ b/test/src/org/omegat/core/statistics/CalcMatchStatisticsTest.java @@ -124,7 +124,7 @@ public void testCalcMatchStatics() throws Exception { Assert.assertEquals("5699", result[7][4]); // change threshold - calcMatchStatistics = new CalcMatchStatisticsMock(project, segmenter, callback, 70); + calcMatchStatistics = new CalcMatchStatisticsMock(project, segmenter, callback, -1); calcMatchStatistics.start(); try { calcMatchStatistics.join();