Skip to content

Commit

Permalink
Merge pull request #1222 from omegat-org/topic/miurahr/matches/refact…
Browse files Browse the repository at this point in the history
…or-find-matches-class

refactor: FindMatches class
  • Loading branch information
miurahr authored Dec 21, 2024
2 parents 228bfd3 + 7ac17f4 commit ffc7616
Show file tree
Hide file tree
Showing 8 changed files with 397 additions and 59 deletions.
7 changes: 3 additions & 4 deletions src/org/omegat/core/statistics/CalcMatchStatistics.java
Original file line number Diff line number Diff line change
Expand Up @@ -110,7 +110,7 @@ public class CalcMatchStatistics extends LongProcessThread {
public CalcMatchStatistics(IStatsConsumer callback, boolean perFile) {
this(Core.getProject(), Core.getSegmenter(), callback, perFile,
Preferences.getPreferenceDefault(Preferences.EXT_TMX_FUZZY_MATCH_THRESHOLD,
OConsts.FUZZY_MATCH_THRESHOLD));
OConsts.FUZZY_MATCH_THRESHOLD));
}

public CalcMatchStatistics(IProject project, Segmenter segmenter, IStatsConsumer callback,
Expand All @@ -119,8 +119,7 @@ public CalcMatchStatistics(IProject project, Segmenter segmenter, IStatsConsumer
this.callback = callback;
this.perFile = perFile;
finder = ThreadLocal.withInitial(
() -> new FindMatches(project, segmenter, OConsts.MAX_NEAR_STRINGS, true,
false, false, threshold));
() -> new FindMatches(project, segmenter, OConsts.MAX_NEAR_STRINGS, false, threshold));
}

@Override
Expand Down Expand Up @@ -313,7 +312,7 @@ Optional<MatchStatCounts> calcSimilarity(List<SourceTextEntry> untranslatedEntri
int calcMaxSimilarity(SourceTextEntry ste) {
String srcNoXmlTags = removeXmlTags(ste);
FindMatches localFinder = finder.get();
List<NearString> nears = localFinder.search(srcNoXmlTags, true, false, this::isInterrupted);
List<NearString> nears = localFinder.search(srcNoXmlTags, false, this::isInterrupted);
final Token[] strTokensStem = localFinder.tokenizeAll(ste.getSrcText());
int maxSimilarity = 0;
CACHE: for (NearString near : nears) {
Expand Down
85 changes: 51 additions & 34 deletions src/org/omegat/core/statistics/FindMatches.java
Original file line number Diff line number Diff line change
Expand Up @@ -126,27 +126,15 @@ public class FindMatches {
/** Tokens for original string, includes numbers and tags. */
private Token[] strTokensAll;

// This finder used for search separate segment matches
private FindMatches separateSegmentMatcher;

private final int fuzzyMatchThreshold;

private final boolean applyThreshold;

private final Segmenter segmenter;

/**
* @param searchExactlyTheSame
* allows to search similarities with the same text as source
* segment. This mode used only for separate sentence match in
* paragraph project, i.e. where source is just part of current
* source.
*/
@Deprecated(since = "6.1.0")
public FindMatches(IProject project, int maxCount, boolean allowSeparateSegmentMatch,
boolean searchExactlyTheSame) {
this(project, Core.getSegmenter(), maxCount, allowSeparateSegmentMatch, searchExactlyTheSame, true,
Preferences.getPreferenceDefault(Preferences.EXT_TMX_FUZZY_MATCH_THRESHOLD,
OConsts.FUZZY_MATCH_THRESHOLD));
this(project, Core.getSegmenter(), maxCount, searchExactlyTheSame, Preferences.getPreferenceDefault(
Preferences.EXT_TMX_FUZZY_MATCH_THRESHOLD, OConsts.FUZZY_MATCH_THRESHOLD));
}

/**
Expand All @@ -166,19 +154,21 @@ public FindMatches(IProject project, int maxCount, boolean allowSeparateSegmentM
* @param threshold
* threshold to use.
*/
public FindMatches(IProject project, Segmenter segmenter, int maxCount, boolean allowSeparateSegmentMatch,
boolean searchExactlyTheSame, boolean applyThreshold, int threshold) {
public FindMatches(IProject project, Segmenter segmenter, int maxCount,
boolean searchExactlyTheSame, int threshold) {
this.project = project;
this.segmenter = segmenter;
this.tok = project.getSourceTokenizer();
this.srcLocale = project.getProjectProperties().getSourceLanguage().getLocale();
this.maxCount = maxCount;
this.searchExactlyTheSame = searchExactlyTheSame;
if (allowSeparateSegmentMatch && !project.getProjectProperties().isSentenceSegmentingEnabled()) {
separateSegmentMatcher = new FindMatches(project, segmenter, 1, false, true, true, threshold);
}
this.fuzzyMatchThreshold = threshold;
this.applyThreshold = applyThreshold;
}

/**
* Search for matches using the legacy four-argument signature.
*
* @param searchText
* text to search for; passed through unchanged.
* @param requiresTranslation
* ignored; this overload does not pass it on.
* @param fillSimilarityData
* passed through unchanged.
* @param stop
* passed through unchanged.
* @return the result of {@link #search(String, boolean, IStopped)}.
* @throws StoppedException
* propagated from {@link #search(String, boolean, IStopped)}.
* @deprecated use {@link #search(String, boolean, IStopped)} instead.
*/
@Deprecated(since = "6.1.0")
public List<NearString> search(final String searchText, final boolean requiresTranslation,
final boolean fillSimilarityData, final IStopped stop) throws StoppedException {
return search(searchText, fillSimilarityData, stop);
}

/**
Expand All @@ -195,8 +185,33 @@ public FindMatches(IProject project, Segmenter segmenter, int maxCount, boolean
* @throws StoppedException
* raised when stopped during a search process.
*/
public List<NearString> search(String searchText, boolean requiresTranslation, boolean fillSimilarityData,
IStopped stop) throws StoppedException {
public List<NearString> search(String searchText, boolean fillSimilarityData, IStopped stop)
throws StoppedException {
// Delegate to the four-argument overload; its last argument
// (runSeparateSegmentMatch) is true only when sentence segmenting
// is disabled for the project.
return search(searchText, fillSimilarityData, stop,
!project.getProjectProperties().isSentenceSegmentingEnabled());
}

/**
* Search Translation memories.
* <p>
* Internal method to handle search conditions.
* It is accessible as package-private for testing.
*
* @param searchText
* target segment or term to search.
* @param fillSimilarityData
* fill similarity data into the result of NearString objects.
* @param stop
* IStopped callback object to indicate cancel operation.
* @param runSeparateSegmentMatch
* when true, also split the text into segments and search for matches of each segment.
* @return
* List of NearString objects.
* @throws StoppedException
* when the search is stopped while in progress.
*/
List<NearString> search(String searchText, boolean fillSimilarityData, IStopped stop,
boolean runSeparateSegmentMatch) throws StoppedException {
result = new ArrayList<>(OConsts.MAX_NEAR_STRINGS + 1);
srcText = searchText;
removedText = "";
Expand Down Expand Up @@ -226,7 +241,7 @@ public List<NearString> search(String searchText, boolean requiresTranslation, b
// skip original==original entry comparison
return;
}
if (requiresTranslation && trans.translation == null) {
if (trans.translation == null) {
return;
}
String fileName = project.isOrphaned(source) ? ORPHANED_FILE_NAME : null;
Expand All @@ -241,7 +256,7 @@ public List<NearString> search(String searchText, boolean requiresTranslation, b
// skip original==original entry comparison
return;
}
if (requiresTranslation && trans.translation == null) {
if (trans.translation == null) {
return;
}
String fileName = project.isOrphaned(source) ? ORPHANED_FILE_NAME : null;
Expand All @@ -255,7 +270,6 @@ public List<NearString> search(String searchText, boolean requiresTranslation, b
*/
int foreignPenalty = Preferences.getPreferenceDefault(Preferences.PENALTY_FOR_FOREIGN_MATCHES,
Preferences.PENALTY_FOR_FOREIGN_MATCHES_DEFAULT);
// travel by translation memories
for (Map.Entry<String, ExternalTMX> en : project.getTransMemories().entrySet()) {
int penalty = 0;
Matcher matcher = SEARCH_FOR_PENALTY.matcher(en.getKey());
Expand All @@ -265,11 +279,11 @@ public List<NearString> search(String searchText, boolean requiresTranslation, b
for (ITMXEntry tmen : en.getValue().getEntries()) {
checkStopped(stop);
if (tmen.getSourceText() == null) {
// Not all TMX entries have a source; in that case there can
// be no meaningful match, so skip.
// Not all TMX entries have a source; skip such entries
// because no meaningful match is possible.
continue;
}
if (requiresTranslation && tmen.getTranslationText() == null) {
if (tmen.getTranslationText() == null) {
continue;
}
int tmenPenalty = penalty;
Expand All @@ -290,7 +304,9 @@ public List<NearString> search(String searchText, boolean requiresTranslation, b
ste.isSourceTranslationFuzzy(), 0);
}
}
if (separateSegmentMatcher != null) {
if (runSeparateSegmentMatch) {
FindMatches separateSegmentMatcher = new FindMatches(project, segmenter, 1, true,
fuzzyMatchThreshold);
// split paragraph even when segmentation disabled, then find
// matches for every segment
List<StringBuilder> spaces = new ArrayList<>();
Expand All @@ -304,9 +320,10 @@ public List<NearString> search(String searchText, boolean requiresTranslation, b
int maxPenalty = 0;
// multiple segments
for (String onesrc : segments) {
// find match for a separate segment
List<NearString> segmentMatch = separateSegmentMatcher.search(onesrc, requiresTranslation,
false, stop);
// find match for a separate segment.
// WARN: the 4th argument (runSeparateSegmentMatch) must be
// `false` to avoid infinite recursion.
List<NearString> segmentMatch = separateSegmentMatcher.search(onesrc, false, stop, false);
if (!segmentMatch.isEmpty()
&& segmentMatch.get(0).scores[0].score >= SUBSEGMENT_MATCH_THRESHOLD) {
fsrc.add(segmentMatch.get(0).source);
Expand Down Expand Up @@ -428,7 +445,7 @@ public void processEntry(EntryKey key, ITMXEntry entry, String tmxName,
}

// BUGS#1236 - stat display does not use threshold config check
if (applyThreshold && similarityStem < fuzzyMatchThreshold
if (fuzzyMatchThreshold > 0 && similarityStem < fuzzyMatchThreshold
&& similarityNoStem < fuzzyMatchThreshold && simAdjusted < fuzzyMatchThreshold) {
return;
}
Expand Down
32 changes: 26 additions & 6 deletions src/org/omegat/gui/matches/FindMatchesThread.java
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
2008 Alex Buloichik
2012 Thomas Cordonnier, Martin Fleurke
2013 Aaron Madlon-Kay
2024 Hiroshi Miura
Home page: https://www.omegat.org/
Support center: https://omegat.org/support
Expand All @@ -32,17 +33,22 @@
import java.util.List;
import java.util.logging.Logger;

import org.omegat.core.Core;
import org.omegat.core.data.IProject;
import org.omegat.core.data.SourceTextEntry;
import org.omegat.core.events.IStopped;
import org.omegat.core.matching.NearString;
import org.omegat.core.segmentation.Segmenter;
import org.omegat.core.statistics.FindMatches;
import org.omegat.gui.common.EntryInfoSearchThread;
import org.omegat.util.OConsts;
import org.omegat.util.Preferences;

/**
* Find matches in separate thread then show result in the matches pane.
* Finds matches in a separate thread, then shows the result in the matches pane.
*
* @author Alex Buloichik ([email protected])
* @author Hiroshi Miura
*/
public class FindMatchesThread extends EntryInfoSearchThread<List<NearString>> {
private static final Logger LOGGER = Logger.getLogger(FindMatchesThread.class.getName());
Expand All @@ -52,9 +58,9 @@ public class FindMatchesThread extends EntryInfoSearchThread<List<NearString>> {

/**
* Entry which is processed currently.
*
* If entry in controller was changed, it means user has moved to another entry, and there is no sense to
* continue.
* <p>
* If the entry in the controller has changed, the user has moved to
* another entry, and there is no point in continuing.
*/
private final SourceTextEntry processedEntry;

Expand All @@ -79,12 +85,26 @@ protected List<NearString> search() throws Exception {
long before = System.currentTimeMillis();

try {
FindMatches finder = new FindMatches(project, OConsts.MAX_NEAR_STRINGS, true, false);
List<NearString> result = finder.search(processedEntry.getSrcText(), true, true, this::isEntryChanged);
List<NearString> result = finderSearch(project, Core.getSegmenter(), processedEntry.getSrcText(),
this::isEntryChanged, Preferences.getPreferenceDefault(
Preferences.EXT_TMX_FUZZY_MATCH_THRESHOLD, OConsts.FUZZY_MATCH_THRESHOLD));
LOGGER.finer(() -> "Time for find matches: " + (System.currentTimeMillis() - before));
return result;
} catch (FindMatches.StoppedException ex) {
throw new EntryChangedException();
}
}

/**
* Search matches (static for test purpose).
* <p>
* Creates a {@link FindMatches} with a maximum count of
* {@code OConsts.MAX_NEAR_STRINGS} and runs a search that fills
* similarity data into the results.
*
* @param project OmegaT project.
* @param segmenter segmenter passed to the {@link FindMatches} constructor.
* @param srcText source text to look for.
* @param isEntryChanged stop callback passed to the search; presumably the
* search raises {@code FindMatches.StoppedException} when it signals a
* stop (confirm against {@code IStopped}).
* @param threshold fuzzy match threshold passed to the {@link FindMatches} constructor.
* @return result as a list of NearString.
*/
protected static List<NearString> finderSearch(IProject project, Segmenter segmenter, String srcText,
IStopped isEntryChanged, int threshold) {
FindMatches finder = new FindMatches(project, segmenter, OConsts.MAX_NEAR_STRINGS, false, threshold);
return finder.search(srcText, true, isEntryChanged);
}
}
16 changes: 16 additions & 0 deletions test/data/tmx/penalty-010/segment_1.tmx
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE tmx PUBLIC "-//LISA OSCAR:1998//DTD for Translation Memory eXchange//EN" "tmx14.dtd">

<tmx version="1.4">
<header creationtoolversion="0.1" adminlang="en" segtype="paragraph" creationdate="20230930T155211Z" datatype="unknown" srclang="ja" creationtool="txt2tmx" o-tmf="TextEdit"></header>
<body>
<tu>
<tuv xml:lang="fr">
<seg>weird behavior</seg>
</tuv>
<tuv xml:lang="ja">
<seg>地力の搾取と浪費が現われる。(1)</seg>
</tuv>
</tu>
</body>
</tmx>
46 changes: 46 additions & 0 deletions test/data/tmx/test-multiple-entries.tmx
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE tmx SYSTEM "tmx14.dtd">
<tmx version="1.4">
<header datatype="plaintext" srclang="en-US" adminlang="EN-US" o-tmf="OmegaT TMX" segtype="sentence"
creationtoolversion="test" creationtool="test"/>
<body>
<!-- Default translations -->
<tu>
<tuv lang="en-US">
<seg>Other</seg>
</tuv>
<tuv lang="co" changeid="id" changedate="20200523T143256Z">
<seg>Altre</seg>
</tuv>
</tu>
<tu>
<tuv lang="en-US">
<seg>For installation on Linux.</seg>
</tuv>
<tuv lang="co" changeid="id" changedate="20200526T131725Z" creationid="id" creationdate="20200526T131725Z">
<seg>Per l’installazioni nant’à i sistemi Linux.</seg>
</tuv>
</tu>
<tu>
<tuv lang="en-US">
<seg>For installation on other operating systems (such as FreeBSD and Solaris).</seg>
</tuv>
<tuv lang="co" changeid="id" changedate="20200526T131840Z" creationid="id"
creationdate="20200526T131840Z">
<seg>Per l’installazioni nant’à d’altri sistemi (cum’è FreeBSD è Solaris).</seg>
</tuv>
</tu>
<!-- Alternative translations -->
<tu>
<prop type="file">website/download.html</prop>
<prop type="prev">For installation on Linux.</prop>
<prop type="next">For installation on other operating systems (such as FreeBSD and Solaris).&lt;br0/></prop>
<tuv lang="en-US">
<seg>Other</seg>
</tuv>
<tuv lang="co" changeid="id" changedate="20200526T131742Z" creationid="id" creationdate="20200526T131742Z">
<seg>Altri</seg>
</tuv>
</tu>
</body>
</tmx>
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@
import org.omegat.core.data.ProtectedPart;
import org.omegat.core.data.SourceTextEntry;
import org.omegat.core.data.TMXEntry;
import org.omegat.core.segmentation.SRX;
import org.omegat.core.segmentation.Segmenter;
import org.omegat.filters2.FilterContext;
import org.omegat.filters2.IFilter;
Expand All @@ -68,7 +69,7 @@ public class CalcMatchStatisticsTest {
public void testCalcMatchStatics() throws Exception {
TestProject project = new TestProject(new ProjectPropertiesTest());
IStatsConsumer callback = new TestStatsConsumer();
Segmenter segmenter = new Segmenter(Preferences.getSRX());
Segmenter segmenter = new Segmenter(SRX.getDefault());
CalcMatchStatisticsMock calcMatchStatistics = new CalcMatchStatisticsMock(project, segmenter,
callback, 30);
calcMatchStatistics.start();
Expand Down Expand Up @@ -123,7 +124,7 @@ public void testCalcMatchStatics() throws Exception {
Assert.assertEquals("5699", result[7][4]);

// change threshold
calcMatchStatistics = new CalcMatchStatisticsMock(project, segmenter, callback, 70);
calcMatchStatistics = new CalcMatchStatisticsMock(project, segmenter, callback, -1);
calcMatchStatistics.start();
try {
calcMatchStatistics.join();
Expand Down
Loading

0 comments on commit ffc7616

Please sign in to comment.