Skip to content

Commit

Permalink
fix: workaround for an unknown language code
Browse files Browse the repository at this point in the history
- the rule name for text in the German translation was changed in v5.5
- when reading a "segmentation.conf" generated before v5.4,
  the migration fails.
- Add a workaround to detect the old rule name

Signed-off-by: Hiroshi Miura <[email protected]>
  • Loading branch information
miurahr committed Nov 2, 2024
1 parent e962706 commit a69a418
Show file tree
Hide file tree
Showing 5 changed files with 190 additions and 81 deletions.
114 changes: 75 additions & 39 deletions src/org/omegat/core/segmentation/LanguageCodes.java
Original file line number Diff line number Diff line change
Expand Up @@ -43,48 +43,65 @@ public final class LanguageCodes {
private LanguageCodes() {
    // Intentionally empty and private: this class only exposes static
    // constants and static lookup methods, so it is never instantiated.
}

// Language Codes
public static final String CATALAN_CODE = "Catalan";
public static final String CZECH_CODE = "Czech";
public static final String GERMAN_CODE = "German";
public static final String ENGLISH_CODE = "English";
public static final String SPANISH_CODE = "Spanish";
public static final String FINNISH_CODE = "Finnish";
public static final String FRENCH_CODE = "French";
public static final String ITALIAN_CODE = "Italian";
public static final String JAPANESE_CODE = "Japanese";
public static final String DUTCH_CODE = "Dutch";
public static final String POLISH_CODE = "Polish";
public static final String RUSSIAN_CODE = "Russian";
public static final String SWEDISH_CODE = "Swedish";
public static final String SLOVAK_CODE = "Slovak";
public static final String CHINESE_CODE = "Chinese";
public static final String DEFAULT_CODE = "Default";
public static final String F_TEXT_CODE = "Text";
public static final String F_HTML_CODE = "HTML";
// Codes of "languagerulename".
static final String CATALAN_CODE = "Catalan";
static final String CZECH_CODE = "Czech";
static final String GERMAN_CODE = "German";
static final String ENGLISH_CODE = "English";
static final String SPANISH_CODE = "Spanish";
static final String FINNISH_CODE = "Finnish";
static final String FRENCH_CODE = "French";
static final String ITALIAN_CODE = "Italian";
static final String JAPANESE_CODE = "Japanese";
static final String DUTCH_CODE = "Dutch";
static final String POLISH_CODE = "Polish";
static final String RUSSIAN_CODE = "Russian";
static final String SWEDISH_CODE = "Swedish";
static final String SLOVAK_CODE = "Slovak";
static final String CHINESE_CODE = "Chinese";
static final String DEFAULT_CODE = "Default";
static final String F_TEXT_CODE = "Text";
static final String F_HTML_CODE = "HTML";

// Language Keys from Resource Bundle
public static final String CATALAN_KEY = "CORE_SRX_RULES_LANG_CATALAN";
public static final String CZECH_KEY = "CORE_SRX_RULES_LANG_CZECH";
public static final String GERMAN_KEY = "CORE_SRX_RULES_LANG_GERMAN";
public static final String ENGLISH_KEY = "CORE_SRX_RULES_LANG_ENGLISH";
public static final String SPANISH_KEY = "CORE_SRX_RULES_LANG_SPANISH";
public static final String FINNISH_KEY = "CORE_SRX_RULES_LANG_FINNISH";
public static final String FRENCH_KEY = "CORE_SRX_RULES_LANG_FRENCH";
public static final String ITALIAN_KEY = "CORE_SRX_RULES_LANG_ITALIAN";
public static final String JAPANESE_KEY = "CORE_SRX_RULES_LANG_JAPANESE";
public static final String DUTCH_KEY = "CORE_SRX_RULES_LANG_DUTCH";
public static final String POLISH_KEY = "CORE_SRX_RULES_LANG_POLISH";
public static final String RUSSIAN_KEY = "CORE_SRX_RULES_LANG_RUSSIAN";
public static final String SWEDISH_KEY = "CORE_SRX_RULES_LANG_SWEDISH";
public static final String SLOVAK_KEY = "CORE_SRX_RULES_LANG_SLOVAK";
public static final String CHINESE_KEY = "CORE_SRX_RULES_LANG_CHINESE";
public static final String DEFAULT_KEY = "CORE_SRX_RULES_LANG_DEFAULT";
public static final String F_TEXT_KEY = "CORE_SRX_RULES_FORMATTING_TEXT";
public static final String F_HTML_KEY = "CORE_SRX_RULES_FORMATTING_HTML";
static final String CATALAN_KEY = "CORE_SRX_RULES_LANG_CATALAN";
static final String CZECH_KEY = "CORE_SRX_RULES_LANG_CZECH";
static final String GERMAN_KEY = "CORE_SRX_RULES_LANG_GERMAN";
static final String ENGLISH_KEY = "CORE_SRX_RULES_LANG_ENGLISH";
static final String SPANISH_KEY = "CORE_SRX_RULES_LANG_SPANISH";
static final String FINNISH_KEY = "CORE_SRX_RULES_LANG_FINNISH";
static final String FRENCH_KEY = "CORE_SRX_RULES_LANG_FRENCH";
static final String ITALIAN_KEY = "CORE_SRX_RULES_LANG_ITALIAN";
static final String JAPANESE_KEY = "CORE_SRX_RULES_LANG_JAPANESE";
static final String DUTCH_KEY = "CORE_SRX_RULES_LANG_DUTCH";
static final String POLISH_KEY = "CORE_SRX_RULES_LANG_POLISH";
static final String RUSSIAN_KEY = "CORE_SRX_RULES_LANG_RUSSIAN";
static final String SWEDISH_KEY = "CORE_SRX_RULES_LANG_SWEDISH";
static final String SLOVAK_KEY = "CORE_SRX_RULES_LANG_SLOVAK";
static final String CHINESE_KEY = "CORE_SRX_RULES_LANG_CHINESE";
static final String DEFAULT_KEY = "CORE_SRX_RULES_LANG_DEFAULT";
static final String F_TEXT_KEY = "CORE_SRX_RULES_FORMATTING_TEXT";
static final String F_HTML_KEY = "CORE_SRX_RULES_FORMATTING_HTML";

private static final String CATALAN_PATTERN = "CA.*";
private static final String CZECH_PATTERN = "CS.*";
private static final String GERMAN_PATTERN = "DE.*";
private static final String ENGLISH_PATTERN = "EN.*";
private static final String SPANISH_PATTERN = "ES.*";
private static final String FINNISH_PATTERN = "FI.*";
private static final String FRENCH_PATTERN = "FR.*";
private static final String ITALIAN_PATTERN = "IT.*";
private static final String JAPANESE_PATTERN = "JA.*";
private static final String DUTCH_PATTERN = "NL.*";
private static final String POLISH_PATTERN = "PL.*";
private static final String RUSSIAN_PATTERN = "RU.*";
private static final String SWEDISH_PATTERN = "SV.*";
private static final String SLOVAK_PATTERN = "SK.*";
private static final String CHINESE_PATTERN = "ZH.*";

/** A Map from language codes to language keys. */
private static Map<String, String> codeKeyHash = new HashMap<>();
private static final Map<String, String> codeKeyHash = new HashMap<>();
private static final Map<String, String> patternHash = new HashMap<>();

static {
codeKeyHash.put(CATALAN_CODE, CATALAN_KEY);
Expand All @@ -105,6 +122,21 @@ private LanguageCodes() {
codeKeyHash.put(DEFAULT_CODE, DEFAULT_KEY);
codeKeyHash.put(F_TEXT_CODE, F_TEXT_KEY);
codeKeyHash.put(F_HTML_CODE, F_HTML_KEY);
patternHash.put(CATALAN_PATTERN, CATALAN_CODE);
patternHash.put(CZECH_PATTERN, CZECH_CODE);
patternHash.put(GERMAN_PATTERN, GERMAN_CODE);
patternHash.put(ENGLISH_PATTERN, ENGLISH_CODE);
patternHash.put(SPANISH_PATTERN, SPANISH_CODE);
patternHash.put(FINNISH_PATTERN, FINNISH_CODE);
patternHash.put(FRENCH_PATTERN, FRENCH_CODE);
patternHash.put(ITALIAN_PATTERN, ITALIAN_CODE);
patternHash.put(JAPANESE_PATTERN, JAPANESE_CODE);
patternHash.put(DUTCH_PATTERN, DUTCH_CODE);
patternHash.put(POLISH_PATTERN, POLISH_CODE);
patternHash.put(RUSSIAN_PATTERN, RUSSIAN_CODE);
patternHash.put(SWEDISH_PATTERN, SWEDISH_CODE);
patternHash.put(SLOVAK_PATTERN, SLOVAK_CODE);
patternHash.put(CHINESE_PATTERN, CHINESE_CODE);
}

/**
Expand All @@ -126,11 +158,15 @@ public static boolean isLanguageCodeKnown(String code) {
}

public static String getLanguageCodeByName(String name) {
for (Map.Entry<String, String> entry: codeKeyHash.entrySet()) {
for (Map.Entry<String, String> entry : codeKeyHash.entrySet()) {
if (OStrings.getString(entry.getValue()).equals(name)) {
return entry.getKey();
}
}
return null;
}

/**
 * Looks up the language code registered for a language pattern.
 * <p>
 * The lookup is an exact string match against the keys of
 * {@code patternHash} (for example "DE.*" maps to the German code).
 *
 * @param pattern
 *            language pattern string such as "EN.*"
 * @return the matching language code, or {@code null} when the pattern
 *         has no entry in the map
 */
public static String getLanguageCodeByPattern(String pattern) {
    final String languageCode = patternHash.get(pattern);
    return languageCode;
}
}
110 changes: 85 additions & 25 deletions src/org/omegat/core/segmentation/MapRule.java
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
glossaries, and translation leveraging into updated projects.
Copyright (C) 2000-2006 Keith Godfrey and Maxym Mykhalchuk
2024 Hiroshi Miura
Home page: https://www.omegat.org/
Support center: https://omegat.org/support
Expand Down Expand Up @@ -31,7 +32,8 @@
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;

import gen.core.segmentation.Languagemap;
import tokyo.northside.logging.ILogger;

import org.omegat.util.Log;
import org.omegat.util.StringUtil;

Expand All @@ -44,56 +46,99 @@
public class MapRule implements Serializable {

private static final long serialVersionUID = -5868132953113679291L;
private static final ILogger LOGGER = Log.getLogger(MapRule.class);

/** Language Name */
private String languageCode;

/**
 * Creates a new empty MapRule. This constructor performs no initialization
 * itself; language, pattern and rules are expected to be supplied
 * afterwards through the corresponding setters.
 */
public MapRule() {
}

/** creates an initialized MapRule */
/**
* Create initialized MapRule object.
*
* @param language
* localized language name (from segmentation.conf), or language
* code (from SRX)
* @param pattern
* language pattern such as "EN.*" or ".*"
* @param rules
* segmentation rules.
*/
public MapRule(String language, String pattern, List<Rule> rules) {
this.setLanguage(language);
String code = LanguageCodes.getLanguageCodeByPattern(pattern);
this.setLanguage(code != null ? code : language);
this.setPattern(pattern);
this.setRules(rules);
}

/** Language Name */
private String languageCode;

public MapRule(Languagemap languagemap, List<Rule> rules) {
this.setLanguage(languagemap.getLanguagerulename());
this.setPattern(languagemap.getLanguagepattern());
this.setRules(rules);
}

/**
 * Returns the language name to display in a dialog.
 * <p>
 * The stored language code is handed to
 * {@code LanguageCodes.getLanguageName}; when that lookup produces an empty
 * result, the stored value itself is returned.
 */
public String getLanguageName() {
    /*
     * After migration to an SRX store, languageCode holds one of the names
     * defined as "LanguageCodes.*_CODE". An object decoded from a legacy
     * "segmentation.conf" java beans file may instead carry a localized
     * language name, for which the lookup yields nothing; in that case the
     * stored value is shown unchanged.
     */
    String displayName = LanguageCodes.getLanguageName(languageCode);
    if (StringUtil.isEmpty(displayName)) {
        return languageCode;
    }
    return displayName;
}

/** Sets Language Code */
public void setLanguage(String code) {
/*
* setLanguage method is called from XmlDecoder of a Java beans library
* when migrating from "segmentation.conf" beans file. An argument will
* be localized name of language. When the object is created from a
* standard SRX file, the argument will be standard language name,
* defined as "LanguageCodes.*_CODE". The behavior was changed in OmegaT
* 6.0.0 release in 2023. We first detect whether the argument is
* standard code. If the code is not a standard code, then try to find a
* localized name of the language name. When you read the comment long
* after OmegaT 6.x, and you believe all the OmegaT 4.x and 5.x users
* are migrated to OmegaT 6.x or later, you may want to remove the chunk
* below.
*/
if (!LanguageCodes.isLanguageCodeKnown(code)) {
String alt = LanguageCodes.getLanguageCodeByName(code);
if (alt != null) {
languageCode = alt;
return;
} else {
Log.logWarningRB("CORE_SRX_RULES_UNKNOWN_LANGUAGE_CODE", code);
// migration heuristics: the German translation changed in v5.5.
// See:
// https://github.com/omegat-org/omegat/pull/1158#issuecomment-2448788253
if (code != null && code.contains("Textdateien")) {
languageCode = LanguageCodes.F_TEXT_CODE;
} else {
LOGGER.atDebug().setMessageRB("CORE_SRX_RULES_UNKNOWN_LANGUAGE_CODE").addArgument(code)
.log();
languageCode = code;
}
}
return;
}
languageCode = code;
}

/** Returns Language Code for programmatic usage. */
/**
* Returns Language Code for programmatic usage.
*/
public String getLanguage() {
    // Plain accessor: hands back the stored code without any lookup.
    return this.languageCode;
}

/** Pattern for the language/country ISO code (of a form LL-CC). */
/*
* Pattern for the language/country ISO code (of a form LL-CC). It is like
* "EN.*".
*/
private Pattern pattern;

/** Returns Pattern for the language/country ISO code (of a form LL-CC). */
/**
* Returns Pattern for the language/country ISO code (of a form LL-CC).
*/
public String getPattern() {
if (pattern != null) {
return pattern.pattern();
Expand All @@ -110,14 +155,24 @@ public Pattern getCompiledPattern() {
return pattern;
}

/** Sets Pattern for the language/country ISO code (of a form LL-CC). */
/**
* Sets Pattern for the language/country ISO code (of a form LL-CC).
*
* @param pattern
* pattern string such as "EN.*"
*/
public void setPattern(String pattern) throws PatternSyntaxException {
    // Fix for bug [1643500]: the language code in a segmentation rule was
    // matched case-sensitively, so the pattern is compiled with the
    // case-insensitive flag. Correction contributed by Tiago Saboga.
    final Pattern compiled = Pattern.compile(pattern, Pattern.CASE_INSENSITIVE);
    this.pattern = compiled;
}

/**
* Deep copy of the object, mandatory for java beans.
*
* @return new MapRule object
*/
public MapRule copy() {
MapRule result = new MapRule();
result.languageCode = languageCode;
Expand All @@ -142,23 +197,28 @@ public void setRules(List<Rule> rules) {
this.rules = rules;
}

/** Indicates whether some other MapRule is "equal to" this one. */
/**
* Indicates whether some other MapRule is "equal to" this one.
*/
public boolean equals(Object obj) {
if (obj == null || !(obj instanceof MapRule)) {
if (!(obj instanceof MapRule)) {
return false;
}
MapRule that = (MapRule) obj;
return this.getPattern().equals(that.getPattern())
&& this.getLanguage().equals(that.getLanguage())
return this.getPattern().equals(that.getPattern()) && this.getLanguage().equals(that.getLanguage())
&& this.getRules().equals(that.getRules());
}

/** Returns a hash code value for the object. */
/**
* Returns a hash code value for the object.
*/
public int hashCode() {
    // Sum of the hash codes of the same three properties that equals()
    // compares: pattern string, language code and rule list.
    final int patternPart = this.getPattern().hashCode();
    final int languagePart = this.getLanguage().hashCode();
    final int rulesPart = this.getRules().hashCode();
    return patternPart + languagePart + rulesPart;
}

/** Returns a string representation of the MapRule for debugging purposes. */
/**
* Returns a string representation of the MapRule for debugging purposes.
*/
public String toString() {
    // Rendered as: <language code> (<pattern>) <rules>
    StringBuilder text = new StringBuilder();
    text.append(getLanguage());
    text.append(" (").append(getPattern()).append(") ");
    text.append(getRules().toString());
    return text.toString();
}
Expand Down
4 changes: 3 additions & 1 deletion src/org/omegat/core/segmentation/SRX.java
Original file line number Diff line number Diff line change
Expand Up @@ -287,7 +287,9 @@ private static SRX loadSrxInputStream(InputStream io) throws IOException {
res.setCascade(!"no".equalsIgnoreCase(srx.getHeader().getCascade()));
res.setVersion(srx.getVersion());
res.setMappingRules(srx.getBody().getMaprules().getLanguagemap().stream()
.map(s -> new MapRule(s, mapping.get(s.getLanguagerulename()))).collect(Collectors.toList()));
.map(languagemap -> new MapRule(languagemap.getLanguagerulename(),
languagemap.getLanguagepattern(), mapping.get(languagemap.getLanguagerulename())))
.collect(Collectors.toList()));
return res;
}

Expand Down
Loading

0 comments on commit a69a418

Please sign in to comment.