Skip to content

Commit

Permalink
[BUGS#1251] fix and refactor: FindMatches
Browse files Browse the repository at this point in the history
- add internal search method to handle normal and segmented search conditions, also use for test purpose
- drop threshold arguments for CalcMatchStatistics usage
- FindMatchesThread.finderSearch to take threshold argument for testing.
- update search() callers accordingly.

Signed-off-by: Hiroshi Miura <[email protected]>
  • Loading branch information
miurahr committed Nov 26, 2024
1 parent 711f747 commit f049239
Show file tree
Hide file tree
Showing 6 changed files with 130 additions and 96 deletions.
13 changes: 4 additions & 9 deletions src/org/omegat/core/statistics/CalcMatchStatistics.java
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,6 @@
import org.omegat.core.threads.LongProcessThread;
import org.omegat.util.OConsts;
import org.omegat.util.OStrings;
import org.omegat.util.Preferences;
import org.omegat.util.StringUtil;
import org.omegat.util.Token;
import org.omegat.util.gui.TextUtil;
Expand Down Expand Up @@ -108,19 +107,15 @@ public class CalcMatchStatistics extends LongProcessThread {
private final IProject project;

public CalcMatchStatistics(IStatsConsumer callback, boolean perFile) {
this(Core.getProject(), Core.getSegmenter(), callback, perFile,
Preferences.getPreferenceDefault(Preferences.EXT_TMX_FUZZY_MATCH_THRESHOLD,
OConsts.FUZZY_MATCH_THRESHOLD));
this(Core.getProject(), Core.getSegmenter(), callback, perFile);
}

public CalcMatchStatistics(IProject project, Segmenter segmenter, IStatsConsumer callback,
boolean perFile, int threshold) {
public CalcMatchStatistics(IProject project, Segmenter segmenter, IStatsConsumer callback, boolean perFile) {
this.project = project;
this.callback = callback;
this.perFile = perFile;
finder = ThreadLocal.withInitial(
() -> new FindMatches(project, segmenter, OConsts.MAX_NEAR_STRINGS, true,
false, false, threshold));
() -> new FindMatches(project, segmenter, OConsts.MAX_NEAR_STRINGS, false, -1));
}

@Override
Expand Down Expand Up @@ -313,7 +308,7 @@ Optional<MatchStatCounts> calcSimilarity(List<SourceTextEntry> untranslatedEntri
int calcMaxSimilarity(SourceTextEntry ste) {
String srcNoXmlTags = removeXmlTags(ste);
FindMatches localFinder = finder.get();
List<NearString> nears = localFinder.search(srcNoXmlTags, true, false, this::isInterrupted);
List<NearString> nears = localFinder.search(srcNoXmlTags, false, this::isInterrupted);
final Token[] strTokensStem = localFinder.tokenizeAll(ste.getSrcText());
int maxSimilarity = 0;
CACHE: for (NearString near : nears) {
Expand Down
117 changes: 72 additions & 45 deletions src/org/omegat/core/statistics/FindMatches.java
Original file line number Diff line number Diff line change
Expand Up @@ -127,46 +127,68 @@ public class FindMatches {
/** Tokens for original string, includes numbers and tags. */
private Token[] strTokensAll;

// This finder used for search separate segment matches
private FindMatches separateSegmentMatcher;

private final int fuzzyMatchThreshold;

private final boolean applyThreshold;

private final Segmenter segmenter;

/**
* @param searchExactlyTheSame
* allows to search similarities with the same text as source
* segment. This mode used only for separate sentence match in
* paragraph project, i.e. where source is just part of current
* source.
*/
@Deprecated
public FindMatches(IProject project, int maxCount, boolean allowSeparateSegmentMatch,
boolean searchExactlyTheSame) {
this(project, Core.getSegmenter(), maxCount, allowSeparateSegmentMatch, searchExactlyTheSame, true,
Preferences.getPreferenceDefault(Preferences.EXT_TMX_FUZZY_MATCH_THRESHOLD,
OConsts.FUZZY_MATCH_THRESHOLD));
this(project, Core.getSegmenter(), maxCount, searchExactlyTheSame, Preferences.getPreferenceDefault(
Preferences.EXT_TMX_FUZZY_MATCH_THRESHOLD, OConsts.FUZZY_MATCH_THRESHOLD));
}

public FindMatches(IProject project, Segmenter segmenter, int maxCount, boolean allowSeparateSegmentMatch,
boolean searchExactlyTheSame, boolean applyThreshold, int threshold) {
/**
* Constructor.
*
* @param project
* OmegaT project.
* @param segmenter
* Segmenter to use.
* @param maxCount
* limit the maximum count of the results.
* @param searchExactlyTheSame
* allows searching similarities with the same text as a source
* segment. This mode is used only for separate sentence match in
* a paragraph project, i.e. where a source is just part of the
* current source.
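* @param threshold
* fuzzy match threshold; an entry is dropped when all of its
* similarity scores are below this value. A value of 0 or less
* disables the threshold check.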
*/
public FindMatches(IProject project, Segmenter segmenter, int maxCount,
boolean searchExactlyTheSame, int threshold) {
this.project = project;
this.segmenter = segmenter;
this.tok = project.getSourceTokenizer();
this.srcLocale = project.getProjectProperties().getSourceLanguage().getLocale();
this.maxCount = maxCount;
this.searchExactlyTheSame = searchExactlyTheSame;
if (allowSeparateSegmentMatch && !project.getProjectProperties().isSentenceSegmentingEnabled()) {
separateSegmentMatcher = new FindMatches(project, segmenter, 1, false, true, true, threshold);
}
this.fuzzyMatchThreshold = threshold;
this.applyThreshold = applyThreshold;
}

public List<NearString> search(String searchText, boolean requiresTranslation, boolean fillSimilarityData,
public List<NearString> search(String searchText, boolean fillSimilarityData,
IStopped stop) throws StoppedException {
return search(searchText, true, fillSimilarityData, stop, true, true);
}

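/**
* Search translation memories.
* <p>
* Internal method to handle search conditions.
* It is accessible as package-private for testing.
*
* @param searchText
* source text to find matches for.
* @param requiresTranslation
* when true, TMX entries whose translation text is null are
* skipped.
* @param fillSimilarityData
* presumably whether similarity data is computed for the
* results (TODO confirm).
* @param stop
* stop condition that is checked while iterating over entries.
* @param allowSeparateSegmentMatch
* when true and sentence segmenting is disabled in the project,
* the text is split into segments and each segment is searched
* separately.
* @param travelExternal
* when true, the external translation memories of the project
* are searched.
* @return list of found matches.
* @throws StoppedException
* when the search is stopped through the stop condition.
*/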
List<NearString> search(String searchText, boolean requiresTranslation,
boolean fillSimilarityData, IStopped stop, boolean allowSeparateSegmentMatch,
boolean travelExternal) throws StoppedException {
result = new ArrayList<>(OConsts.MAX_NEAR_STRINGS + 1);
srcText = searchText;
removedText = "";
Expand Down Expand Up @@ -225,28 +247,31 @@ public List<NearString> search(String searchText, boolean requiresTranslation, b
*/
int foreignPenalty = Preferences.getPreferenceDefault(Preferences.PENALTY_FOR_FOREIGN_MATCHES,
Preferences.PENALTY_FOR_FOREIGN_MATCHES_DEFAULT);
// travel by translation memories
for (Map.Entry<String, ExternalTMX> en : project.getTransMemories().entrySet()) {
int penalty = 0;
Matcher matcher = SEARCH_FOR_PENALTY.matcher(en.getKey());
if (matcher.find()) {
penalty = Integer.parseInt(matcher.group(1));
}
for (ITMXEntry tmen : en.getValue().getEntries()) {
checkStopped(stop);
if (tmen.getSourceText() == null) {
// Not all TMX entries have a source; in that case there can
// be no meaningful match, so skip.
continue;
// travel by external translation memories; this is skipped when
// searching for a separate segment match (see BUGS#1251)
if (travelExternal) {
for (Map.Entry<String, ExternalTMX> en : project.getTransMemories().entrySet()) {
int penalty = 0;
Matcher matcher = SEARCH_FOR_PENALTY.matcher(en.getKey());
if (matcher.find()) {
penalty = Integer.parseInt(matcher.group(1));
}
if (requiresTranslation && tmen.getTranslationText() == null) {
continue;
}
int tmenPenalty = penalty;
if (tmen.hasPropValue(ExternalTMFactory.TMXLoader.PROP_FOREIGN_MATCH, "true")) {
tmenPenalty += foreignPenalty;
for (ITMXEntry tmen : en.getValue().getEntries()) {
checkStopped(stop);
if (tmen.getSourceText() == null) {
// Not all TMX entries have a source; in that case there
// can be no meaningful match, so skip it.
continue;
}
if (requiresTranslation && tmen.getTranslationText() == null) {
continue;
}
int tmenPenalty = penalty;
if (tmen.hasPropValue(ExternalTMFactory.TMXLoader.PROP_FOREIGN_MATCH, "true")) {
tmenPenalty += foreignPenalty;
}
processEntry(null, tmen, en.getKey(), NearString.MATCH_SOURCE.TM, false, tmenPenalty);
}
processEntry(null, tmen, en.getKey(), NearString.MATCH_SOURCE.TM, false, tmenPenalty);
}
}
// travel by all entries for check source file translations
Expand All @@ -260,7 +285,9 @@ public List<NearString> search(String searchText, boolean requiresTranslation, b
ste.isSourceTranslationFuzzy(), 0);
}
}
if (separateSegmentMatcher != null) {
if (allowSeparateSegmentMatch && !project.getProjectProperties().isSentenceSegmentingEnabled()) {
FindMatches separateSegmentMatcher = new FindMatches(project, segmenter, 1, true,
fuzzyMatchThreshold);
// split paragraph even when segmentation disabled, then find
// matches for every segment
List<StringBuilder> spaces = new ArrayList<>();
Expand All @@ -275,7 +302,7 @@ public List<NearString> search(String searchText, boolean requiresTranslation, b
for (String onesrc : segments) {
// find match for a separate segment
List<NearString> segmentMatch = separateSegmentMatcher.search(onesrc, requiresTranslation,
false, stop);
false, stop, false, false);
if (!segmentMatch.isEmpty()
&& segmentMatch.get(0).scores[0].score >= SUBSEGMENT_MATCH_THRESHOLD) {
fsrc.add(segmentMatch.get(0).source);
Expand Down Expand Up @@ -385,7 +412,7 @@ public void processEntry(EntryKey key, ITMXEntry entry, String tmxName,
}

// BUGS#1236 - stat display does not use threshold config check
if (applyThreshold && similarityStem < fuzzyMatchThreshold
if (fuzzyMatchThreshold > 0 && similarityStem < fuzzyMatchThreshold
&& similarityNoStem < fuzzyMatchThreshold && simAdjusted < fuzzyMatchThreshold) {
return;
}
Expand Down
14 changes: 10 additions & 4 deletions src/org/omegat/gui/matches/FindMatchesThread.java
Original file line number Diff line number Diff line change
Expand Up @@ -33,13 +33,16 @@
import java.util.List;
import java.util.logging.Logger;

import org.omegat.core.Core;
import org.omegat.core.data.IProject;
import org.omegat.core.data.SourceTextEntry;
import org.omegat.core.events.IStopped;
import org.omegat.core.matching.NearString;
import org.omegat.core.segmentation.Segmenter;
import org.omegat.core.statistics.FindMatches;
import org.omegat.gui.common.EntryInfoSearchThread;
import org.omegat.util.OConsts;
import org.omegat.util.Preferences;

/**
* Find matches in separate thread then show a result in the matches' pane.
Expand Down Expand Up @@ -82,7 +85,9 @@ protected List<NearString> search() throws Exception {
long before = System.currentTimeMillis();

try {
List<NearString> result = finderSearch(project, processedEntry.getSrcText(), this::isEntryChanged);
List<NearString> result = finderSearch(project, Core.getSegmenter(), processedEntry.getSrcText(),
this::isEntryChanged, Preferences.getPreferenceDefault(
Preferences.EXT_TMX_FUZZY_MATCH_THRESHOLD, OConsts.FUZZY_MATCH_THRESHOLD));
LOGGER.finer(() -> "Time for find matches: " + (System.currentTimeMillis() - before));
return result;
} catch (FindMatches.StoppedException ex) {
Expand All @@ -97,8 +102,9 @@ protected List<NearString> search() throws Exception {
* @param isEntryChanged stop and raise StopException when it returns true.
* @return result as a list of NearString.
*/
protected static List<NearString> finderSearch(IProject project, String srcText, IStopped isEntryChanged) {
FindMatches finder = new FindMatches(project, OConsts.MAX_NEAR_STRINGS, true, false);
return finder.search(srcText, true, true, isEntryChanged);
protected static List<NearString> finderSearch(IProject project, Segmenter segmenter, String srcText,
IStopped isEntryChanged, int threshold) {
FindMatches finder = new FindMatches(project, segmenter, OConsts.MAX_NEAR_STRINGS, false, threshold);
return finder.search(srcText, true, isEntryChanged);
}
}
21 changes: 14 additions & 7 deletions test/src/org/omegat/core/statistics/CalcMatchStatisticsTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@

package org.omegat.core.statistics;

import java.io.IOException;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.ArrayList;
Expand All @@ -35,6 +36,7 @@

import org.junit.Assert;
import org.junit.Before;
import org.junit.BeforeClass;
import org.junit.Test;

import org.omegat.core.Core;
Expand All @@ -48,6 +50,7 @@
import org.omegat.core.data.ProtectedPart;
import org.omegat.core.data.SourceTextEntry;
import org.omegat.core.data.TMXEntry;
import org.omegat.core.segmentation.SRX;
import org.omegat.core.segmentation.Segmenter;
import org.omegat.filters2.FilterContext;
import org.omegat.filters2.IFilter;
Expand All @@ -64,13 +67,17 @@

public class CalcMatchStatisticsTest {

/**
 * Calls TestPreferencesInitializer.init() once before the tests of this
 * class run.
 *
 * @throws IOException
 *             declared for the initializer call.
 */
@BeforeClass
public static void setup() throws IOException {
TestPreferencesInitializer.init();
}

@Test
public void testCalcMatchStatics() throws Exception {
TestProject project = new TestProject(new ProjectPropertiesTest());
IStatsConsumer callback = new TestStatsConsumer();
Segmenter segmenter = new Segmenter(Preferences.getSRX());
CalcMatchStatisticsMock calcMatchStatistics = new CalcMatchStatisticsMock(project, segmenter,
callback, 30);
Segmenter segmenter = new Segmenter(SRX.getDefault());
CalcMatchStatisticsMock calcMatchStatistics = new CalcMatchStatisticsMock(project, segmenter, callback);
calcMatchStatistics.start();
try {
calcMatchStatistics.join();
Expand Down Expand Up @@ -123,7 +130,8 @@ public void testCalcMatchStatics() throws Exception {
Assert.assertEquals("5699", result[7][4]);

// change threshold
calcMatchStatistics = new CalcMatchStatisticsMock(project, segmenter, callback, 70);
Preferences.setPreference(Preferences.EXT_TMX_FUZZY_MATCH_THRESHOLD, 70);
calcMatchStatistics = new CalcMatchStatisticsMock(project, segmenter, callback);
calcMatchStatistics.start();
try {
calcMatchStatistics.join();
Expand Down Expand Up @@ -362,9 +370,8 @@ static class CalcMatchStatisticsMock extends CalcMatchStatistics {
private MatchStatCounts result;
private final IStatsConsumer callback;

CalcMatchStatisticsMock(IProject project, Segmenter segmenter, IStatsConsumer callback,
int threshold) {
super(project, segmenter, callback, false, threshold);
CalcMatchStatisticsMock(IProject project, Segmenter segmenter, IStatsConsumer callback) {
super(project, segmenter, callback, false);
this.project = project;
this.callback = callback;
}
Expand Down
27 changes: 12 additions & 15 deletions test/src/org/omegat/core/statistics/FindMatchesTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -111,22 +111,22 @@ public void testSegmented() throws Exception {
+ "han passat prou temps al lloc web per a convertir-se en usuaris bàsics."
+ " Una comunitat vibrant necessita una entrada regular de nouvinguts que hi participen habitualment"
+ " i aporten veus noves a les converses.\n";
FindMatches finder = new FindMatches(project, segmenter, OConsts.MAX_NEAR_STRINGS, false, false,
true, 30);
List<NearString> result = finder.search(srcText, true, true, iStopped);
FindMatches finder = new FindMatches(project, segmenter, OConsts.MAX_NEAR_STRINGS, false, 30);
// search without a separated segment match.
List<NearString> result = finder.search(srcText, true, true, iStopped, false, true);
assertEquals(OConsts.MAX_NEAR_STRINGS, result.size());
assertEquals(65, result.get(0).scores[0].score);
assertEquals(62, result.get(0).scores[0].scoreNoStem);
assertEquals(62, result.get(0).scores[0].adjustedScore);
assertEquals(expectFirst, result.get(0).translation);
assertEquals(expectNear, result.get(1).translation);
//
// search with a segmented match.
List<StringBuilder> spaces = new ArrayList<>();
List<Rule> brules = new ArrayList<>();
List<String> segments = segmenter.segment(prop.getSourceLanguage(), srcText, spaces, brules);
assertEquals(3, segments.size());
finder = new FindMatches(project, segmenter, OConsts.MAX_NEAR_STRINGS, true, false, true, 30);
result = finder.search(srcText, true, true, iStopped);
finder = new FindMatches(project, segmenter, OConsts.MAX_NEAR_STRINGS, false, 30);
result = finder.search(srcText, false, iStopped);
assertEquals(OConsts.MAX_NEAR_STRINGS, result.size());
assertEquals("Hit with segmented tmx record", 100, result.get(0).scores[0].score);
assertEquals(100, result.get(0).scores[0].scoreNoStem);
Expand Down Expand Up @@ -167,9 +167,8 @@ public void testSearchRFE1578() throws Exception {
IProject project = new TestProject(prop, null, TMX_EN_US_SR, new LuceneEnglishTokenizer(),
new DefaultTokenizer(), segmenter);
IStopped iStopped = () -> false;
FindMatches finder = new FindMatches(project, segmenter, OConsts.MAX_NEAR_STRINGS, true, false,
true, 30);
List<NearString> result = finder.search("XXX", true, true, iStopped);
FindMatches finder = new FindMatches(project, segmenter, OConsts.MAX_NEAR_STRINGS, false, 30);
List<NearString> result = finder.search("XXX", false, iStopped);
// Without the fix, the result has two entries, but it should have one.
assertEquals(1, result.size());
assertEquals("XXX", result.get(0).source);
Expand Down Expand Up @@ -206,10 +205,9 @@ public void testSearchRFE1578_2() throws Exception {
IProject project = new TestProject(prop, null, TMX_EN_US_GB_SR, new LuceneEnglishTokenizer(),
new DefaultTokenizer(), segmenter);
IStopped iStopped = () -> false;
FindMatches finder = new FindMatches(project, segmenter, OConsts.MAX_NEAR_STRINGS, true, false,
true, 30);
FindMatches finder = new FindMatches(project, segmenter, OConsts.MAX_NEAR_STRINGS, false, 30);
// Search source "XXx" in en-US
List<NearString> result = finder.search("XXX", true, true, iStopped);
List<NearString> result = finder.search("XXX", false, iStopped);
// There should be three entries.
assertEquals(3, result.size());
assertEquals("XXx", result.get(0).source); // should be en-US.
Expand Down Expand Up @@ -237,9 +235,8 @@ public void testSearchBUGS1251() throws Exception {
List<String> segments = segmenter.segment(sourceLanguage, srcText, spaces, brules);
assertEquals(2, segments.size());
IStopped iStopped = () -> false;
FindMatches finder = new FindMatches(project, segmenter, OConsts.MAX_NEAR_STRINGS, true, false,
true, 30);
List<NearString> result = finder.search(srcText, true, true, iStopped);
FindMatches finder = new FindMatches(project, segmenter, OConsts.MAX_NEAR_STRINGS, false, 30);
List<NearString> result = finder.search(srcText, false, iStopped);
assertEquals(srcText, result.get(0).source);
assertEquals(1, result.size());
assertEquals("TM", result.get(0).comesFrom.name());
Expand Down
Loading

0 comments on commit f049239

Please sign in to comment.