diff --git a/release/changes.txt b/release/changes.txt index b5334b8d63..128962eaad 100644 --- a/release/changes.txt +++ b/release/changes.txt @@ -2,7 +2,7 @@ OmegaT 6.0.2 ---------------------------------------------------------------------- 0 Enhancement - 4 Bug fixes + 5 Bug fixes 0 Localisation updates ---------------------------------------------------------------------- @@ -11,6 +11,11 @@ - Preferences/CutomColorSelector don't show notification immediately when modifying color https://sourceforge.net/p/omegat/bugs/1273/ + - tm/penalty-nnn value is not respected + - FindMatches always searches for separate segment matches without penalty + https://sourceforge.net/p/omegat/bugs/1248/ + https://sourceforge.net/p/omegat/bugs/1251/ + - [Revert regression] List of issues stays hidden in the background https://sourceforge.net/p/omegat/bugs/1225/ diff --git a/src/org/omegat/core/data/ProjectTMX.java b/src/org/omegat/core/data/ProjectTMX.java index 061b96f9b6..65c33e8d2a 100644 --- a/src/org/omegat/core/data/ProjectTMX.java +++ b/src/org/omegat/core/data/ProjectTMX.java @@ -74,14 +74,14 @@ public class ProjectTMX { * * It must be used with synchronization around ProjectTMX. */ - Map defaults; + protected Map defaults; /** * Storage for alternative translations for current project. * * It must be used with synchronization around ProjectTMX. 
*/ - Map alternatives; + protected Map alternatives; final CheckOrphanedCallback checkOrphanedCallback; diff --git a/src/org/omegat/core/matching/NearString.java b/src/org/omegat/core/matching/NearString.java index 8f6df79f36..69aa587e9d 100644 --- a/src/org/omegat/core/matching/NearString.java +++ b/src/org/omegat/core/matching/NearString.java @@ -47,7 +47,7 @@ */ public class NearString { public enum MATCH_SOURCE { - MEMORY, TM, FILES + MEMORY, TM, FILES, TM_SUBSEG }; public enum SORT_KEY { @@ -55,15 +55,23 @@ public enum SORT_KEY { } public NearString(final EntryKey key, final String source, final String translation, MATCH_SOURCE comesFrom, - final boolean fuzzyMark, final int nearScore, final int nearScoreNoStem, final int adjustedScore, - final byte[] nearData, final String projName, final String creator, final long creationDate, + final boolean fuzzyMark, final int nearScore, final int nearScoreNoStem, final int adjustedScore, + final byte[] nearData, final String projName, final String creator, final long creationDate, + final String changer, final long changedDate, final List props) { + this(key, source, translation, comesFrom, fuzzyMark, new Scores(nearScore, nearScoreNoStem, + adjustedScore, 0), nearData, projName, creator, creationDate, changer, changedDate, props); + } + + public NearString(final EntryKey key, final String source, final String translation, MATCH_SOURCE comesFrom, + final boolean fuzzyMark, final Scores scores, final byte[] nearData, final String projName, + final String creator, final long creationDate, final String changer, final long changedDate, final List props) { this.key = key; this.source = source; this.translation = translation; this.comesFrom = comesFrom; this.fuzzyMark = fuzzyMark; - this.scores = new Scores[] { new Scores(nearScore, nearScoreNoStem, adjustedScore) }; + this.scores = new Scores[] { scores }; this.attr = nearData; this.projs = new String[] { projName == null ? 
"" : projName }; this.props = props; @@ -77,27 +85,35 @@ public static NearString merge(NearString ns, final EntryKey key, final String s MATCH_SOURCE comesFrom, final boolean fuzzyMark, final int nearScore, final int nearScoreNoStem, final int adjustedScore, final byte[] nearData, final String projName, final String creator, final long creationDate, final String changer, final long changedDate, final List props) { + return merge(ns, key, source, translation, comesFrom, fuzzyMark, new Scores(nearScore, + nearScoreNoStem, adjustedScore, 0), nearData, projName, creator, creationDate, changer, + changedDate, props); + } + + public static NearString merge(NearString ns, final EntryKey key, final String source, final String translation, + MATCH_SOURCE comesFrom, final boolean fuzzyMark, final Scores scores, + final byte[] nearData, final String projName, final String creator, + final long creationDate, final String changer, final long changedDate, final List props) { List projs = new ArrayList<>(); - List scores = new ArrayList<>(); + List mergedScores = new ArrayList<>(); projs.addAll(Arrays.asList(ns.projs)); - scores.addAll(Arrays.asList(ns.scores)); + mergedScores.addAll(Arrays.asList(ns.scores)); NearString merged; - if (nearScore > ns.scores[0].score) { - merged = new NearString(key, source, translation, comesFrom, fuzzyMark, nearScore, - nearScoreNoStem, adjustedScore, nearData, null, creator, creationDate, changer, changedDate, props); + if (scores.score > ns.scores[0].score) { + merged = new NearString(key, source, translation, comesFrom, fuzzyMark, scores, + nearData, null, creator, creationDate, changer, changedDate, props); projs.add(0, projName); - scores.add(0, merged.scores[0]); + mergedScores.add(0, merged.scores[0]); } else { - merged = new NearString(ns.key, ns.source, ns.translation, ns.comesFrom, ns.fuzzyMark, nearScore, - nearScoreNoStem, adjustedScore, ns.attr, null, ns.creator, ns.creationDate, ns.changer, - ns.changedDate, ns.props); + merged = 
new NearString(ns.key, ns.source, ns.translation, ns.comesFrom, ns.fuzzyMark, scores, + ns.attr, null, ns.creator, ns.creationDate, ns.changer, ns.changedDate, ns.props); projs.add(projName); - scores.add(merged.scores[0]); + mergedScores.add(merged.scores[0]); } merged.projs = projs.toArray(new String[projs.size()]); - merged.scores = scores.toArray(new Scores[scores.size()]); + merged.scores = mergedScores.toArray(new Scores[mergedScores.size()]); return merged; } @@ -130,11 +146,13 @@ public static class Scores { public final int scoreNoStem; /** adjusted similarity score for match including all tokens */ public final int adjustedScore; + public final int penalty; - public Scores(int score, int scoreNoStem, int adjustedScore) { + public Scores(int score, int scoreNoStem, int adjustedScore, int penalty) { this.score = score; this.scoreNoStem = scoreNoStem; this.adjustedScore = adjustedScore; + this.penalty = penalty; } public String toString() { diff --git a/src/org/omegat/core/statistics/FindMatches.java b/src/org/omegat/core/statistics/FindMatches.java index 203d34d151..6293da9403 100644 --- a/src/org/omegat/core/statistics/FindMatches.java +++ b/src/org/omegat/core/statistics/FindMatches.java @@ -30,10 +30,12 @@ import java.util.ArrayList; import java.util.HashMap; +import java.util.HashSet; import java.util.List; import java.util.Locale; import java.util.Map; import java.util.Objects; +import java.util.Set; import java.util.regex.Matcher; import java.util.regex.Pattern; @@ -273,6 +275,8 @@ public void iterate(EntryKey source, TMXEntry trans) { if (separateSegmentMatcher != null) { // split paragraph even when segmentation disabled, then find // matches for every segment + int maxPenalty = 0; + Set tmxNames = new HashSet<>(); List spaces = new ArrayList(); List brules = new ArrayList(); Language sourceLang = project.getProjectProperties().getSourceLanguage(); @@ -292,6 +296,14 @@ public void iterate(EntryKey source, TMXEntry trans) { && 
segmentMatch.get(0).scores[0].score >= SUBSEGMENT_MATCH_THRESHOLD) { fsrc.add(segmentMatch.get(0).source); ftrans.add(segmentMatch.get(0).translation); + segmentMatch.stream().filter(match -> !match.projs[0].isEmpty()) + .map(match -> match.projs[0]).forEach(tmxNames::add); + if (segmentMatch.get(0).fuzzyMark) { + if (maxPenalty < PENALTY_FOR_FUZZY) { + maxPenalty = PENALTY_FOR_FUZZY; + } + } + maxPenalty = Math.max(maxPenalty, segmentMatch.get(0).scores[0].penalty); } else { fsrc.add(""); ftrans.add(""); @@ -301,8 +313,8 @@ public void iterate(EntryKey source, TMXEntry trans) { String foundSrc = Core.getSegmenter().glue(sourceLang, sourceLang, fsrc, spaces, brules); // glue found translations String foundTrans = Core.getSegmenter().glue(sourceLang, targetLang, ftrans, spaces, brules); - processEntry(null, foundSrc, foundTrans, NearString.MATCH_SOURCE.TM, false, 0, "", "", 0, "", - 0, null); + processEntry(null, foundSrc, foundTrans, NearString.MATCH_SOURCE.TM_SUBSEG, false, maxPenalty, + String.join(",", tmxNames), "", 0, "", 0, null); } } @@ -421,8 +433,9 @@ protected void processEntry(EntryKey key, String source, String translation, return; } - addNearString(key, source, translation, comesFrom, fuzzy, similarityStem, similarityNoStem, - simAdjusted, null, tmxName, creator, creationDate, changer, changedDate, props); + addNearString(key, source, translation, comesFrom, fuzzy, new NearString.Scores(similarityStem, + similarityNoStem, simAdjusted, penalty), + null, tmxName, creator, creationDate, changer, changedDate, props); } /** @@ -457,8 +470,7 @@ protected boolean haveChanceToAdd(final int simStem, final int simNoStem, final * "similarity,simAdjusted" */ protected void addNearString(final EntryKey key, final String source, final String translation, - NearString.MATCH_SOURCE comesFrom, final boolean fuzzy, final int similarity, - final int similarityNoStem, final int simAdjusted, final byte[] similarityData, + NearString.MATCH_SOURCE comesFrom, final boolean 
fuzzy, NearString.Scores scores, final byte[] similarityData, final String tmxName, final String creator, final long creationDate, final String changer, final long changedDate, final List tuProperties) { // find position for new data @@ -470,25 +482,25 @@ protected void addNearString(final EntryKey key, final String source, final Stri // single NearString with // multiple project entries. result.set(i, - NearString.merge(st, key, source, translation, comesFrom, fuzzy, similarity, - similarityNoStem, simAdjusted, similarityData, tmxName, creator, creationDate, + NearString.merge(st, key, source, translation, comesFrom, fuzzy, + scores, similarityData, tmxName, creator, creationDate, changer, changedDate, tuProperties)); return; } - if (st.scores[0].score < similarity) { + if (st.scores[0].score < scores.score) { break; } - if (st.scores[0].score == similarity) { - if (st.scores[0].scoreNoStem < similarityNoStem) { + if (st.scores[0].score == scores.score) { + if (st.scores[0].scoreNoStem < scores.scoreNoStem) { break; } - if (st.scores[0].scoreNoStem == similarityNoStem) { - if (st.scores[0].adjustedScore < simAdjusted) { + if (st.scores[0].scoreNoStem == scores.scoreNoStem) { + if (st.scores[0].adjustedScore < scores.adjustedScore) { break; } // Patch contributed by Antonio Vilei // text with the same case has precedence - if (similarity == 100 && !st.source.equals(srcText) && source.equals(srcText)) { + if (scores.score == 100 && !st.source.equals(srcText) && source.equals(srcText)) { break; } } @@ -497,9 +509,8 @@ protected void addNearString(final EntryKey key, final String source, final Stri } result.add(pos, - new NearString(key, source, translation, comesFrom, fuzzy, similarity, similarityNoStem, - simAdjusted, similarityData, tmxName, creator, creationDate, changer, changedDate, - tuProperties)); + new NearString(key, source, translation, comesFrom, fuzzy, scores, similarityData, tmxName, creator, + creationDate, changer, changedDate, tuProperties)); if 
(result.size() > maxCount) { result.remove(result.size() - 1); } @@ -508,9 +519,9 @@ protected void addNearString(final EntryKey key, final String source, final Stri /* * Methods for tokenize strings with caching. */ - Map tokenizeStemCache = new HashMap(); - Map tokenizeNoStemCache = new HashMap(); - Map tokenizeAllCache = new HashMap(); + Map tokenizeStemCache = new HashMap<>(); + Map tokenizeNoStemCache = new HashMap<>(); + Map tokenizeAllCache = new HashMap<>(); public Token[] tokenizeStem(String str) { Token[] tokens = tokenizeStemCache.get(str); diff --git a/test/data/tmx/penalty-010/segment_1.tmx b/test/data/tmx/penalty-010/segment_1.tmx new file mode 100644 index 0000000000..18e55fa724 --- /dev/null +++ b/test/data/tmx/penalty-010/segment_1.tmx @@ -0,0 +1,16 @@ + + + + +
+ + + + weird behavior + + + 地力の搾取と浪費が現われる。(1) + + + +
diff --git a/test/data/tmx/test-multiple-entries.tmx b/test/data/tmx/test-multiple-entries.tmx new file mode 100644 index 0000000000..b2b9008c56 --- /dev/null +++ b/test/data/tmx/test-multiple-entries.tmx @@ -0,0 +1,46 @@ + + + +
+ + + + + Other + + + Altre + + + + + For installation on Linux. + + + Per l’installazioni nant’à i sistemi Linux. + + + + + For installation on other operating systems (such as FreeBSD and Solaris). + + + Per l’installazioni nant’à d’altri sistemi (cum’è FreeBSD è Solaris). + + + + + website/download.html + For installation on Linux. + For installation on other operating systems (such as FreeBSD and Solaris).<br0/> + + Other + + + Altri + + + + diff --git a/test/src/org/omegat/core/statistics/FindMatchesTest.java b/test/src/org/omegat/core/statistics/FindMatchesTest.java index f9dcceeaf2..2c5ac9e9a0 100644 --- a/test/src/org/omegat/core/statistics/FindMatchesTest.java +++ b/test/src/org/omegat/core/statistics/FindMatchesTest.java @@ -27,6 +27,8 @@ import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertNotNull; +import static org.junit.Assert.assertNull; import static org.junit.Assert.assertTrue; import java.io.File; @@ -37,6 +39,7 @@ import java.util.Collections; import java.util.List; import java.util.Map; +import java.util.Set; import java.util.TreeMap; import org.apache.commons.io.FileUtils; @@ -54,14 +57,19 @@ import org.omegat.core.data.ProjectProperties; import org.omegat.core.data.ProjectTMX; import org.omegat.core.data.SourceTextEntry; +import org.omegat.core.data.TMXEntry; import org.omegat.core.events.IStopped; import org.omegat.core.matching.NearString; +import org.omegat.core.segmentation.Rule; import org.omegat.core.segmentation.SRX; import org.omegat.core.segmentation.Segmenter; import org.omegat.tokenizer.DefaultTokenizer; import org.omegat.tokenizer.ITokenizer; +import org.omegat.tokenizer.LuceneCJKTokenizer; import org.omegat.tokenizer.LuceneEnglishTokenizer; +import org.omegat.tokenizer.LuceneFrenchTokenizer; import org.omegat.util.Language; +import org.omegat.util.Log; import org.omegat.util.OConsts; import org.omegat.util.Preferences; import 
org.omegat.util.TestPreferencesInitializer; @@ -71,6 +79,8 @@ public class FindMatchesTest { private static final File TMX_EN_US_SR = new File("test/data/tmx/en-US_sr.tmx"); private static final File TMX_EN_US_GB_SR = new File("test/data/tmx/en-US_en-GB_fr_sr.tmx"); + private static final File TMX_SEGMENT = new File("test/data/tmx/penalty-010/segment_1.tmx"); + private static final File TMX_MULTI = new File("test/data/tmx/test-multiple-entries.tmx"); private static Path tmpDir; /** @@ -96,7 +106,7 @@ public void testSearchRFE1578() throws Exception { prop.setTargetLanguage("cnr"); prop.setSupportDefaultTranslations(true); prop.setSentenceSegmentingEnabled(false); - IProject project = new TestProject(prop, TMX_EN_US_SR); + IProject project = new TestProject(prop, null, TMX_EN_US_SR, new LuceneEnglishTokenizer(), new DefaultTokenizer()); Core.setProject(project); Core.setSegmenter(new Segmenter(new SRX())); IStopped iStopped = () -> false; @@ -131,7 +141,8 @@ public void testSearchRFE1578_2() throws Exception { prop.setTargetLanguage("cnr"); prop.setSupportDefaultTranslations(true); prop.setSentenceSegmentingEnabled(false); - IProject project = new TestProject(prop, TMX_EN_US_GB_SR); + IProject project = new TestProject(prop, null, TMX_EN_US_GB_SR, new LuceneEnglishTokenizer(), + new DefaultTokenizer()); Core.setProject(project); Core.setSegmenter(new Segmenter(new SRX())); IStopped iStopped = () -> false; @@ -146,6 +157,63 @@ public void testSearchRFE1578_2() throws Exception { assertEquals(3, result.size()); } + @Test + public void testSearchBUGS1251() throws Exception { + ProjectProperties prop = new ProjectProperties(tmpDir.toFile()); + prop.setSourceLanguage("ja"); + prop.setTargetLanguage("fr"); + prop.setSupportDefaultTranslations(true); + prop.setSentenceSegmentingEnabled(false); + Segmenter segmenter = new Segmenter(SRX.getDefault()); + IProject project = new FindMatchesTest.TestProject(prop, null, TMX_SEGMENT, new LuceneCJKTokenizer(), + new 
LuceneFrenchTokenizer()); + Core.setProject(project); + SourceTextEntry ste = project.getAllEntries().get(1); + Language sourceLanguage = prop.getSourceLanguage(); + String srcText = ste.getSrcText(); + List spaces = new ArrayList<>(); + List brules = new ArrayList<>(); + List segments = segmenter.segment(sourceLanguage, srcText, spaces, brules); + assertEquals(2, segments.size()); + IStopped iStopped = () -> false; + FindMatches finder = new FindMatches(project, OConsts.MAX_NEAR_STRINGS, true, true, true); + List result = finder.search(srcText, false, false, iStopped); + assertEquals(srcText, result.get(0).source); + assertEquals(2, result.size()); + // match normal + assertEquals("TM", result.get(0).comesFrom.name()); + assertEquals(90, result.get(0).scores[0].score); + assertEquals("weird behavior", result.get(0).translation); + assertTrue(result.get(0).projs[0].contains("penalty-010")); + // match segmented, with penalty + assertEquals("TM_SUBSEG", result.get(1).comesFrom.name()); + assertEquals(90, result.get(1).scores[0].score); + assertTrue(result.get(1).projs[0].contains("penalty-010")); + } + + @Test + public void testSearchMulti() throws Exception { + ProjectProperties prop = new ProjectProperties(tmpDir.toFile()); + prop.setSourceLanguage("en-US"); + prop.setTargetLanguage("co"); + prop.setSupportDefaultTranslations(true); + prop.setSentenceSegmentingEnabled(true); + IProject project = new TestProject(prop, TMX_MULTI, null, new LuceneEnglishTokenizer(), + new DefaultTokenizer()); + IStopped iStopped = () -> false; + FindMatches finder = new FindMatches(project, OConsts.MAX_NEAR_STRINGS, true, true, true); + List result = finder.search("Other", false, false, iStopped); + assertEquals(3, result.size()); + assertEquals("Other", result.get(0).source); + assertEquals("Altre", result.get(0).translation); // default + assertNull(result.get(0).key); + assertEquals("Altri", result.get(1).translation); // alternative + assertNotNull(result.get(1).key); + 
assertEquals("website/download.html", result.get(1).key.file); + assertEquals("Other", result.get(2).translation); // source translation + } + + @BeforeClass public static void setUpClass() throws Exception { tmpDir = Files.createTempDirectory("omegat"); @@ -161,15 +229,74 @@ public void setUp() throws Exception { Preferences.setPreference(Preferences.EXT_TMX_KEEP_FOREIGN_MATCH, true); Core.registerTokenizerClass(DefaultTokenizer.class); Core.registerTokenizerClass(LuceneEnglishTokenizer.class); + Core.registerTokenizerClass(LuceneFrenchTokenizer.class); + Core.registerTokenizerClass(LuceneCJKTokenizer.class); + Core.setSegmenter(new Segmenter(SRX.getDefault())); } static class TestProject extends NotLoadedProject implements IProject { - private ProjectProperties prop; - private File testTmx; + private final ProjectProperties prop; + private final File externalTmx; + private final ITokenizer sourceTokenizer; + private final ITokenizer targetTokenizer; + private ProjectTMXMock projectTMX; - public TestProject(final ProjectProperties prop, final File testTmx) { + public TestProject(ProjectProperties prop, File testTmx, File externalTmx, + ITokenizer sourceTokenizer, ITokenizer targetTokenizer) { this.prop = prop; - this.testTmx = testTmx; + this.externalTmx = externalTmx; + this.sourceTokenizer = sourceTokenizer; + this.targetTokenizer = targetTokenizer; + if (testTmx != null) { + try { + projectTMX = new ProjectTMXMock(prop.getSourceLanguage(), prop.getTargetLanguage(), + prop.isSentenceSegmentingEnabled(), testTmx, checkOrphanedCallback); + } catch (Exception e) { + Log.log(e); + } + } + } + + final ProjectTMX.CheckOrphanedCallback checkOrphanedCallback = new ProjectTMX.CheckOrphanedCallback() { + public boolean existSourceInProject(String src) { + return false; + } + public boolean existEntryInProject(EntryKey key) { + return false; + } + }; + + public void iterateByDefaultTranslations(DefaultTranslationsIterator it) { + if (projectTMX == null) { + return; + } + 
Map.Entry[] entries; + synchronized (checkOrphanedCallback) { + entries = entrySetToArray(projectTMX.getDefaultsMap().entrySet()); + } + for (Map.Entry en : entries) { + it.iterate(en.getKey(), en.getValue()); + } + } + + public void iterateByMultipleTranslations(MultipleTranslationsIterator it) { + if (projectTMX == null) { + return; + } + Map.Entry[] entries; + synchronized (checkOrphanedCallback) { + entries = entrySetToArray(projectTMX.getAlternativesMap().entrySet()); + } + for (Map.Entry en : entries) { + it.iterate(en.getKey(), en.getValue()); + } + } + + @SuppressWarnings({ "unchecked", "rawtypes" }) + private Map.Entry[] entrySetToArray(Set> set) { + // Assign to variable to facilitate suppressing the rawtypes warning + Map.Entry[] a = new Map.Entry[set.size()]; + return set.toArray(a); } @Override @@ -182,17 +309,25 @@ public List getAllEntries() { List ste = new ArrayList<>(); ste.add(new SourceTextEntry(new EntryKey("source.txt", "XXX", null, "", "", null), 1, null, null, new ArrayList<>())); + ste.add(new SourceTextEntry(new EntryKey("source.txt", + "\u5730\u529B\u306E\u643E\u53D6\u3068\u6D6A\u8CBB\u304C\u73FE\u308F\u308C\u308B\u3002(1)", + null, "", "", null), + 1, null, null, Collections.emptyList())); + ste.add(new SourceTextEntry(new EntryKey("website/download.html", "Other", "id", + "For installation on Linux.", + "For installation on other operating systems (such as FreeBSD and Solaris).<br0/>", + null), 1, null, "Other", Collections.emptyList())); return ste; } @Override public ITokenizer getSourceTokenizer() { - return new LuceneEnglishTokenizer(); + return sourceTokenizer; }; @Override public ITokenizer getTargetTokenizer() { - return new DefaultTokenizer(); + return targetTokenizer; } @Override @@ -204,8 +339,8 @@ public Map getOtherTargetLanguageTMs() { public Map getTransMemories() { Map transMemories = new TreeMap<>(); try { - ExternalTMX newTMX = ExternalTMFactory.load(testTmx); - transMemories.put(testTmx.getPath(), newTMX); + 
ExternalTMX newTMX = ExternalTMFactory.load(externalTmx); + transMemories.put(externalTmx.getPath(), newTMX); } catch (Exception ignored) { } return Collections.unmodifiableMap(transMemories); @@ -217,4 +352,21 @@ public static void tearDown() throws IOException { FileUtils.deleteDirectory(tmpDir.toFile()); assertFalse(tmpDir.toFile().exists()); } + + public static class ProjectTMXMock extends ProjectTMX { + + public ProjectTMXMock(Language sourceLanguage, Language targetLanguage, + boolean isSentenceSegmentingEnabled, + File file, CheckOrphanedCallback callback) throws Exception { + super(sourceLanguage, targetLanguage, isSentenceSegmentingEnabled, file, callback); + } + + public Map getDefaultsMap() { + return defaults; + }; + + public Map getAlternativesMap() { + return alternatives; + } + } }