Skip to content

Commit

Permalink
fix: improve map rule parser
Browse files Browse the repository at this point in the history
Signed-off-by: Hiroshi Miura <[email protected]>
  • Loading branch information
miurahr committed Oct 31, 2024
1 parent c0defb6 commit 7fe46d0
Show file tree
Hide file tree
Showing 4 changed files with 121 additions and 34 deletions.
42 changes: 39 additions & 3 deletions src/org/omegat/core/segmentation/LanguageCodes.java
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ public final class LanguageCodes {
/** Private constructor: prevents instantiation from outside this class. */
private LanguageCodes() {
}

// Language Codes
// Codes of "languagerulename".
public static final String CATALAN_CODE = "Catalan";
public static final String CZECH_CODE = "Czech";
public static final String GERMAN_CODE = "German";
Expand Down Expand Up @@ -83,8 +83,25 @@ private LanguageCodes() {
public static final String F_TEXT_KEY = "CORE_SRX_RULES_FORMATTING_TEXT";
public static final String F_HTML_KEY = "CORE_SRX_RULES_FORMATTING_HTML";

private static final String CATALAN_ISO = "CA";
private static final String CZECH_ISO = "CS";
private static final String GERMAN_ISO = "DE";
private static final String ENGLISH_ISO = "EN";
private static final String SPANISH_ISO = "ES";
private static final String FINNISH_ISO = "FI";
private static final String FRENCH_ISO = "FR";
private static final String ITALIAN_ISO = "IT";
private static final String JAPANESE_ISO = "JA";
private static final String DUTCH_ISO = "NL";
private static final String POLISH_ISO = "PL";
private static final String RUSSIAN_ISO = "RU";
private static final String SWEDISH_ISO = "SV";
private static final String SLOVAK_ISO = "SK";
private static final String CHINESE_ISO = "ZH";

/** A Map from language codes to language keys. */
private static Map<String, String> codeKeyHash = new HashMap<>();
private static final Map<String, String> codeKeyHash = new HashMap<>();
private static final Map<String, String> isoKeyHash = new HashMap<>();

static {
codeKeyHash.put(CATALAN_CODE, CATALAN_KEY);
Expand All @@ -105,6 +122,21 @@ private LanguageCodes() {
codeKeyHash.put(DEFAULT_CODE, DEFAULT_KEY);
codeKeyHash.put(F_TEXT_CODE, F_TEXT_KEY);
codeKeyHash.put(F_HTML_CODE, F_HTML_KEY);
isoKeyHash.put(CATALAN_ISO, CATALAN_CODE);
isoKeyHash.put(CZECH_ISO, CZECH_CODE);
isoKeyHash.put(GERMAN_ISO, GERMAN_CODE);
isoKeyHash.put(ENGLISH_ISO, ENGLISH_CODE);
isoKeyHash.put(SPANISH_ISO, SPANISH_CODE);
isoKeyHash.put(FINNISH_ISO, FINNISH_CODE);
isoKeyHash.put(FRENCH_ISO, FRENCH_CODE);
isoKeyHash.put(ITALIAN_ISO, ITALIAN_CODE);
isoKeyHash.put(JAPANESE_ISO, JAPANESE_CODE);
isoKeyHash.put(DUTCH_ISO, DUTCH_CODE);
isoKeyHash.put(POLISH_ISO, POLISH_CODE);
isoKeyHash.put(RUSSIAN_ISO, RUSSIAN_CODE);
isoKeyHash.put(SWEDISH_ISO, SWEDISH_CODE);
isoKeyHash.put(SLOVAK_ISO, SLOVAK_CODE);
isoKeyHash.put(CHINESE_ISO, CHINESE_CODE);
}

/**
Expand All @@ -126,11 +158,15 @@ public static boolean isLanguageCodeKnown(String code) {
}

public static String getLanguageCodeByName(String name) {
for (Map.Entry<String, String> entry: codeKeyHash.entrySet()) {
for (Map.Entry<String, String> entry : codeKeyHash.entrySet()) {
if (OStrings.getString(entry.getValue()).equals(name)) {
return entry.getKey();
}
}
return null;
}

/**
 * Looks up the language code registered for an ISO language code.
 * <p>
 * The lookup is an exact key match against {@code isoKeyHash}, whose keys
 * are registered as two-letter upper-case strings such as {@code "EN"} or
 * {@code "JA"}.
 *
 * @param isoCode
 *            ISO language code to look up
 * @return the matching language code constant (for example
 *         {@code ENGLISH_CODE}), or {@code null} when no entry is
 *         registered for {@code isoCode}
 */
public static String getLanguageCodeByISO(String isoCode) {
return isoKeyHash.get(isoCode);
}
}
72 changes: 56 additions & 16 deletions src/org/omegat/core/segmentation/MapRule.java
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,6 @@
import java.util.regex.PatternSyntaxException;

import gen.core.segmentation.Languagemap;
import org.omegat.util.Log;
import org.omegat.util.StringUtil;

/**
Expand All @@ -51,45 +50,85 @@ public MapRule() {

/** creates an initialized MapRule */
public MapRule(String language, String pattern, List<Rule> rules) {
this.setLanguage(language);
this.setLanguage(language, pattern);
this.setPattern(pattern);
this.setRules(rules);
}

/** Language Name */
private String languageName;

/** Language Code */
private String languageCode;

public MapRule(Languagemap languagemap, List<Rule> rules) {
this.setLanguage(languagemap.getLanguagerulename());
this.setPattern(languagemap.getLanguagepattern());
String pat = languagemap.getLanguagepattern();
this.setLanguage(languagemap.getLanguagerulename(), pat);
this.setPattern(pat);
this.setRules(rules);
}

/** Returns Language Name (to display it in a dialog). */
public String getLanguageName() {
String res = LanguageCodes.getLanguageName(languageCode);
return StringUtil.isEmpty(res) ? languageCode : res;
return languageName;
}

/** Sets Language Code */
public void setLanguage(String code) {
if (!LanguageCodes.isLanguageCodeKnown(code)) {
String alt = LanguageCodes.getLanguageCodeByName(code);
if (alt != null) {
languageCode = alt;
return;
if (LanguageCodes.isLanguageCodeKnown(code)) {
languageCode = code;
languageName = LanguageCodes.getLanguageName(code);
} else {
String code1 = LanguageCodes.getLanguageCodeByName(code);
if (code1 != null) {
languageName = code;
languageCode = code1;
} else {
languageName = code;
languageCode = code;
}
}
}

/** Sets language code from human-readable name */
public void setLanguageByName(String name) {
if (LanguageCodes.isLanguageCodeKnown(name)) {
// call with SRX standard language code.
languageCode = name;
languageName = LanguageCodes.getLanguageName(name);
} else {
String code = LanguageCodes.getLanguageCodeByName(name);
if (code != null) {
languageName = name;
languageCode = code;
} else {
Log.logWarningRB("CORE_SRX_RULES_UNKNOWN_LANGUAGE_CODE", code);
languageName = name;
languageCode = name;
}
}
languageCode = code;
}

/**
 * Returns Language Code for programmatic usage.
 *
 * @return the current value of the {@code languageCode} field
 */
public String getLanguage() {
return languageCode;
}

/**
 * Sets the language code and name, preferring the language pattern over the
 * rule name.
 * <p>
 * When the pattern is longer than three characters and ends with
 * {@code ".*"}, the text before that suffix is looked up as an ISO code via
 * {@link LanguageCodes#getLanguageCodeByISO(String)}. On a hit, the
 * resulting code is stored and the name is taken from
 * {@link LanguageCodes#getLanguageName(String)}, falling back to the code
 * itself when that name is empty. In every other case the rule name is
 * handed to {@link #setLanguageByName(String)}.
 *
 * @param languageRuleName
 *            value of the "languagerulename" attribute
 * @param languagePattern
 *            value of the "languagepattern" attribute; must not be
 *            {@code null}
 */
private void setLanguage(String languageRuleName, String languagePattern) {
    final String wildcardSuffix = ".*";
    String resolvedCode = null;
    if (languagePattern.length() > 3 && languagePattern.endsWith(wildcardSuffix)) {
        int prefixLength = languagePattern.length() - wildcardSuffix.length();
        String isoPrefix = languagePattern.substring(0, prefixLength);
        resolvedCode = LanguageCodes.getLanguageCodeByISO(isoPrefix);
    }
    if (resolvedCode == null) {
        // Either the rule is "Text", "Default" or "HTML", or the
        // languagepattern or its ISO code is unknown.
        setLanguageByName(languageRuleName);
        return;
    }
    String displayName = LanguageCodes.getLanguageName(resolvedCode);
    languageName = StringUtil.isEmpty(displayName) ? resolvedCode : displayName;
    languageCode = resolvedCode;
}

/** Pattern for the language/country ISO code (of a form LL-CC). */
private Pattern pattern;

Expand Down Expand Up @@ -148,8 +187,7 @@ public boolean equals(Object obj) {
return false;
}
MapRule that = (MapRule) obj;
return this.getPattern().equals(that.getPattern())
&& this.getLanguage().equals(that.getLanguage())
return this.getPattern().equals(that.getPattern()) && this.getLanguage().equals(that.getLanguage())
&& this.getRules().equals(that.getRules());
}

Expand All @@ -158,7 +196,9 @@ public int hashCode() {
return this.getPattern().hashCode() + this.getLanguage().hashCode() + this.getRules().hashCode();
}

/** Returns a string representation of the MapRule for debugging purposes. */
/**
* Returns a string representation of the MapRule for debugging purposes.
*/
@Override
public String toString() {
    // Layout: <language code> (<pattern>) <rules>
    return getLanguage() + " (" + getPattern() + ") " + getRules().toString();
}
Expand Down
39 changes: 25 additions & 14 deletions src/org/omegat/core/segmentation/Segmenter.java
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,8 @@ public SRX getSRX() {
* @param paragraph
* the paragraph text
* @param spaces
* list to store information about spaces between sentences (can be null)
* list to store information about spaces between sentences (can
* be null)
* @param brules
* list to store rules that account to breaks (can be null)
* @return list of sentences (String objects)
Expand Down Expand Up @@ -117,11 +118,13 @@ public List<String> segment(Language lang, String paragraph, List<StringBuilder>
}

/**
* Returns pre-sentences (sentences with spaces between), computed by breaking paragraph into chunks of
* text. Also returns the list with "the reasons" why the breaks were made, i.e. the list of break rules
* that contributed to each of the breaks made.
* Returns pre-sentences (sentences with spaces between), computed by
* breaking paragraph into chunks of text. Also returns the list with "the
* reasons" why the breaks were made, i.e. the list of break rules that
* contributed to each of the breaks made.
* <p>
* If glued back together, these strings form the same paragraph text as this function was fed.
* If glued back together, these strings form the same paragraph text as
* this function was fed.
*
* @param paragraph
* the paragraph text
Expand Down Expand Up @@ -244,7 +247,8 @@ static class BreakPosition implements Comparable<BreakPosition> {
}

/**
* Other BreakPosition is "equal to" this one iff it has the same position.
* Other BreakPosition is "equal to" this one iff it has the same
* position.
*/
public boolean equals(Object obj) {
if (obj == null) {
Expand All @@ -266,10 +270,12 @@ public int hashCode() {
/**
* Compares this break position with another.
*
* @return a negative integer if its position is less than the another's, zero if they are equal, or a
* positive integer as its position is greater than the another's.
* @return a negative integer if its position is less than the
* another's, zero if they are equal, or a positive integer as
* its position is greater than the another's.
* @throws ClassCastException
* if the specified object's type prevents it from being compared to this Object.
* if the specified object's type prevents it from being
* compared to this Object.
*/
public int compareTo(BreakPosition that) {
return this.position - that.position;
Expand Down Expand Up @@ -323,14 +329,15 @@ public String glue(Language sourceLang, Language targetLang, List<String> senten
Matcher matcher = LINE_BREAK_OR_TAB_PATTERN.matcher(sp.toString());
if (matcher.find()) {
// If we found line break or tab, trim left spaces.
// Right spaces are left for indentation of the next line.
// Right spaces are left for indentation of the next
// line.
String leftSpaces = matcher.group(1);
if (!leftSpaces.isEmpty()) {
sp.replace(0, leftSpaces.length(), "");
}
} else if ((lastChar != '.')
&& (!PatternConsts.SPACY_REGEX.matcher(rule.getBeforebreak()).matches()
|| !PatternConsts.SPACY_REGEX.matcher(rule.getAfterbreak()).matches())) {
|| !PatternConsts.SPACY_REGEX.matcher(rule.getAfterbreak()).matches())) {
sp.setLength(0);
}
}
Expand All @@ -347,10 +354,13 @@ public String glue(Language sourceLang, Language targetLang, List<String> senten
* Segment source and target entries from TMX when counts are equals.
*/
public void segmentEntries(boolean needResegment, Language sourceLang, String sourceEntry,
Language targetLang, String targetEntry, List<String> sourceSegments, List<String> targetSegments) {
Language targetLang, String targetEntry, List<String> sourceSegments,
List<String> targetSegments) {
if (needResegment) {
List<String> srcSegments = segment(sourceLang, sourceEntry, null, null);
if (targetEntry != null) { // There is no translation for this entry, because for instance it's a note
if (targetEntry != null) { // There is no translation for this
// entry, because for instance it's a
// note
// on an untranslated entry
List<String> tarSegments = segment(targetLang, targetEntry, null, null);

Expand All @@ -361,7 +371,8 @@ public void segmentEntries(boolean needResegment, Language sourceLang, String so
}
}
}
// No need to resegment, or segments counts not equals, or no translation
// No need to resegment, or segments counts not equals, or no
// translation
sourceSegments.add(sourceEntry);
targetSegments.add(targetEntry);

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,7 @@ public void setValueAt(Object aValue, int rowIndex, int columnIndex) {
if (code != null) {
maprule.setLanguage(code);
} else {
maprule.setLanguage(target);
maprule.setLanguageByName(target);
}
break;
case 1:
Expand Down

0 comments on commit 7fe46d0

Please sign in to comment.