Skip to content

Commit

Permalink
fix: o.o.c.segmentation.SRX to load conf and save srx in more robust …
Browse files Browse the repository at this point in the history
…way and remove warning message (#1159)

* chore: update tests

- refactor SRXTest class
- Add German locale conf file built from OmegaT 5.4.0 as test data
- refactor SRX class to help testing
- Load resource bundle in specified test locale

Signed-off-by: Hiroshi Miura <[email protected]>

* feat: SRX.saveToSrx to use standard name

- Harden the save method to be robust against localized language names.
- Even when a MapRule has a localized language code, it detects the language from the language pattern and writes the standard name.

Signed-off-by: Hiroshi Miura <[email protected]>

* fix: workaround for an unknown language code

- the rule name for text files in the German translation was changed in v5.5
- when reading a "segmentation.conf" generated before v5.4,
  the migration fails.
- Add a workaround to detect the old rule name

Signed-off-by: Hiroshi Miura <[email protected]>

* Update src/org/omegat/core/segmentation/MapRule.java

Use non localized message for debug level

* refactor: address review feedback

- Update LanguageCodes.getLanguageCodeByName
    - add a null check first
    - move the migration heuristic code from MapRule
- Update MapRule javadoc descriptions

Signed-off-by: Hiroshi Miura <[email protected]>

* revert unrelated changes

Signed-off-by: Hiroshi Miura <[email protected]>

* fix: remove a warning message in Bundle.properties

Signed-off-by: Hiroshi Miura <[email protected]>

---------

Signed-off-by: Hiroshi Miura <[email protected]>
  • Loading branch information
miurahr authored Nov 12, 2024
1 parent 28dcc86 commit 2469dd8
Show file tree
Hide file tree
Showing 8 changed files with 28,539 additions and 353 deletions.
2 changes: 0 additions & 2 deletions src/org/omegat/Bundle.properties
Original file line number Diff line number Diff line change
Expand Up @@ -1734,8 +1734,6 @@ CORE_SRX_RULES_LANG_DEFAULT=Default
CORE_SRX_RULES_FORMATTING_TEXT=Text files segmentation
CORE_SRX_RULES_FORMATTING_HTML=HTML, XHTML, ODF and Infix segmentation

CORE_SRX_RULES_UNKNOWN_LANGUAGE_CODE=Unknown language code {0} specified

# org.omegat.core.spellchecker.SpellCheckerManager
CORE_SPELLCHECKER_NO_ENGINE=No active spell checker engine found

Expand Down
340 changes: 169 additions & 171 deletions src/org/omegat/Bundle_it.properties

Large diffs are not rendered by default.

2 changes: 0 additions & 2 deletions src/org/omegat/Bundle_nl.properties
Original file line number Diff line number Diff line change
Expand Up @@ -1555,8 +1555,6 @@ CORE_SRX_RULES_LANG_DEFAULT=Standaard
CORE_SRX_RULES_FORMATTING_TEXT=Segmentatie van tekstbestanden
CORE_SRX_RULES_FORMATTING_HTML=segmentatie voor HTML, XHTML, ODF en Infix

CORE_SRX_RULES_UNKNOWN_LANGUAGE_CODE=Onbekende taalcode {0} gespecificeerd

# org.omegat.core.spellchecker.SpellCheckerManager
CORE_SPELLCHECKER_NO_ENGINE=Geen actief programma voor spellingscontrole gevonden

Expand Down
123 changes: 84 additions & 39 deletions src/org/omegat/core/segmentation/LanguageCodes.java
Original file line number Diff line number Diff line change
Expand Up @@ -43,48 +43,65 @@ public final class LanguageCodes {
private LanguageCodes() {
}

// Language Codes
public static final String CATALAN_CODE = "Catalan";
public static final String CZECH_CODE = "Czech";
public static final String GERMAN_CODE = "German";
public static final String ENGLISH_CODE = "English";
public static final String SPANISH_CODE = "Spanish";
public static final String FINNISH_CODE = "Finnish";
public static final String FRENCH_CODE = "French";
public static final String ITALIAN_CODE = "Italian";
public static final String JAPANESE_CODE = "Japanese";
public static final String DUTCH_CODE = "Dutch";
public static final String POLISH_CODE = "Polish";
public static final String RUSSIAN_CODE = "Russian";
public static final String SWEDISH_CODE = "Swedish";
public static final String SLOVAK_CODE = "Slovak";
public static final String CHINESE_CODE = "Chinese";
public static final String DEFAULT_CODE = "Default";
public static final String F_TEXT_CODE = "Text";
public static final String F_HTML_CODE = "HTML";
// Codes of "languagerulename".
static final String CATALAN_CODE = "Catalan";
static final String CZECH_CODE = "Czech";
static final String GERMAN_CODE = "German";
static final String ENGLISH_CODE = "English";
static final String SPANISH_CODE = "Spanish";
static final String FINNISH_CODE = "Finnish";
static final String FRENCH_CODE = "French";
static final String ITALIAN_CODE = "Italian";
static final String JAPANESE_CODE = "Japanese";
static final String DUTCH_CODE = "Dutch";
static final String POLISH_CODE = "Polish";
static final String RUSSIAN_CODE = "Russian";
static final String SWEDISH_CODE = "Swedish";
static final String SLOVAK_CODE = "Slovak";
static final String CHINESE_CODE = "Chinese";
static final String DEFAULT_CODE = "Default";
static final String F_TEXT_CODE = "Text";
static final String F_HTML_CODE = "HTML";

// Language Keys from Resource Bundle
public static final String CATALAN_KEY = "CORE_SRX_RULES_LANG_CATALAN";
public static final String CZECH_KEY = "CORE_SRX_RULES_LANG_CZECH";
public static final String GERMAN_KEY = "CORE_SRX_RULES_LANG_GERMAN";
public static final String ENGLISH_KEY = "CORE_SRX_RULES_LANG_ENGLISH";
public static final String SPANISH_KEY = "CORE_SRX_RULES_LANG_SPANISH";
public static final String FINNISH_KEY = "CORE_SRX_RULES_LANG_FINNISH";
public static final String FRENCH_KEY = "CORE_SRX_RULES_LANG_FRENCH";
public static final String ITALIAN_KEY = "CORE_SRX_RULES_LANG_ITALIAN";
public static final String JAPANESE_KEY = "CORE_SRX_RULES_LANG_JAPANESE";
public static final String DUTCH_KEY = "CORE_SRX_RULES_LANG_DUTCH";
public static final String POLISH_KEY = "CORE_SRX_RULES_LANG_POLISH";
public static final String RUSSIAN_KEY = "CORE_SRX_RULES_LANG_RUSSIAN";
public static final String SWEDISH_KEY = "CORE_SRX_RULES_LANG_SWEDISH";
public static final String SLOVAK_KEY = "CORE_SRX_RULES_LANG_SLOVAK";
public static final String CHINESE_KEY = "CORE_SRX_RULES_LANG_CHINESE";
public static final String DEFAULT_KEY = "CORE_SRX_RULES_LANG_DEFAULT";
public static final String F_TEXT_KEY = "CORE_SRX_RULES_FORMATTING_TEXT";
public static final String F_HTML_KEY = "CORE_SRX_RULES_FORMATTING_HTML";
static final String CATALAN_KEY = "CORE_SRX_RULES_LANG_CATALAN";
static final String CZECH_KEY = "CORE_SRX_RULES_LANG_CZECH";
static final String GERMAN_KEY = "CORE_SRX_RULES_LANG_GERMAN";
static final String ENGLISH_KEY = "CORE_SRX_RULES_LANG_ENGLISH";
static final String SPANISH_KEY = "CORE_SRX_RULES_LANG_SPANISH";
static final String FINNISH_KEY = "CORE_SRX_RULES_LANG_FINNISH";
static final String FRENCH_KEY = "CORE_SRX_RULES_LANG_FRENCH";
static final String ITALIAN_KEY = "CORE_SRX_RULES_LANG_ITALIAN";
static final String JAPANESE_KEY = "CORE_SRX_RULES_LANG_JAPANESE";
static final String DUTCH_KEY = "CORE_SRX_RULES_LANG_DUTCH";
static final String POLISH_KEY = "CORE_SRX_RULES_LANG_POLISH";
static final String RUSSIAN_KEY = "CORE_SRX_RULES_LANG_RUSSIAN";
static final String SWEDISH_KEY = "CORE_SRX_RULES_LANG_SWEDISH";
static final String SLOVAK_KEY = "CORE_SRX_RULES_LANG_SLOVAK";
static final String CHINESE_KEY = "CORE_SRX_RULES_LANG_CHINESE";
static final String DEFAULT_KEY = "CORE_SRX_RULES_LANG_DEFAULT";
static final String F_TEXT_KEY = "CORE_SRX_RULES_FORMATTING_TEXT";
static final String F_HTML_KEY = "CORE_SRX_RULES_FORMATTING_HTML";

private static final String CATALAN_PATTERN = "CA.*";
private static final String CZECH_PATTERN = "CS.*";
private static final String GERMAN_PATTERN = "DE.*";
private static final String ENGLISH_PATTERN = "EN.*";
private static final String SPANISH_PATTERN = "ES.*";
private static final String FINNISH_PATTERN = "FI.*";
private static final String FRENCH_PATTERN = "FR.*";
private static final String ITALIAN_PATTERN = "IT.*";
private static final String JAPANESE_PATTERN = "JA.*";
private static final String DUTCH_PATTERN = "NL.*";
private static final String POLISH_PATTERN = "PL.*";
private static final String RUSSIAN_PATTERN = "RU.*";
private static final String SWEDISH_PATTERN = "SV.*";
private static final String SLOVAK_PATTERN = "SK.*";
private static final String CHINESE_PATTERN = "ZH.*";

/** A Map from language codes to language keys. */
private static Map<String, String> codeKeyHash = new HashMap<>();
private static final Map<String, String> codeKeyHash = new HashMap<>();
private static final Map<String, String> patternHash = new HashMap<>();

static {
codeKeyHash.put(CATALAN_CODE, CATALAN_KEY);
Expand All @@ -105,6 +122,21 @@ private LanguageCodes() {
codeKeyHash.put(DEFAULT_CODE, DEFAULT_KEY);
codeKeyHash.put(F_TEXT_CODE, F_TEXT_KEY);
codeKeyHash.put(F_HTML_CODE, F_HTML_KEY);
patternHash.put(CATALAN_PATTERN, CATALAN_CODE);
patternHash.put(CZECH_PATTERN, CZECH_CODE);
patternHash.put(GERMAN_PATTERN, GERMAN_CODE);
patternHash.put(ENGLISH_PATTERN, ENGLISH_CODE);
patternHash.put(SPANISH_PATTERN, SPANISH_CODE);
patternHash.put(FINNISH_PATTERN, FINNISH_CODE);
patternHash.put(FRENCH_PATTERN, FRENCH_CODE);
patternHash.put(ITALIAN_PATTERN, ITALIAN_CODE);
patternHash.put(JAPANESE_PATTERN, JAPANESE_CODE);
patternHash.put(DUTCH_PATTERN, DUTCH_CODE);
patternHash.put(POLISH_PATTERN, POLISH_CODE);
patternHash.put(RUSSIAN_PATTERN, RUSSIAN_CODE);
patternHash.put(SWEDISH_PATTERN, SWEDISH_CODE);
patternHash.put(SLOVAK_PATTERN, SLOVAK_CODE);
patternHash.put(CHINESE_PATTERN, CHINESE_CODE);
}

/**
Expand All @@ -126,11 +158,24 @@ public static boolean isLanguageCodeKnown(String code) {
}

/**
 * Resolves a language name to its language code.
 * <p>
 * Each resource key stored in the code/key map is resolved through
 * OStrings.getString and compared with the given name; the code of the
 * first exact match is returned.
 * </p>
 *
 * @param name
 *            language name to look up, may be null
 * @return the matching language code, F_TEXT_CODE for a name containing
 *         "Textdateien", or null when name is null or nothing matches
 */
public static String getLanguageCodeByName(String name) {
for (Map.Entry<String, String> entry: codeKeyHash.entrySet()) {
if (name == null) {
return null;
}
for (Map.Entry<String, String> entry : codeKeyHash.entrySet()) {
if (OStrings.getString(entry.getValue()).equals(name)) {
return entry.getKey();
}
}
// Migration heuristic: the German translation changed in v5.5, so a
// name containing "Textdateien" is mapped to the text formatting code.
// See:
// https://github.com/omegat-org/omegat/pull/1158#issuecomment-2448788253
if (name.contains("Textdateien")) {
return LanguageCodes.F_TEXT_CODE;
}
return null;
}

/**
 * Looks up the language code registered for a language pattern.
 * <p>
 * The lookup is an exact key match against the pattern map, so the
 * pattern must be spelled exactly as it was registered.
 * </p>
 *
 * @param pattern
 *            language pattern string such as "EN.*"
 * @return the code registered for the pattern, or null when the pattern
 *         is not registered
 */
public static String getLanguageCodeByPattern(String pattern) {
    final String registeredCode = patternHash.get(pattern);
    return registeredCode;
}
}
106 changes: 81 additions & 25 deletions src/org/omegat/core/segmentation/MapRule.java
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
glossaries, and translation leveraging into updated projects.
Copyright (C) 2000-2006 Keith Godfrey and Maxym Mykhalchuk
2024 Hiroshi Miura
Home page: https://www.omegat.org/
Support center: https://omegat.org/support
Expand Down Expand Up @@ -31,7 +32,8 @@
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;

import gen.core.segmentation.Languagemap;
import tokyo.northside.logging.ILogger;

import org.omegat.util.Log;
import org.omegat.util.StringUtil;

Expand All @@ -44,56 +46,95 @@
public class MapRule implements Serializable {

private static final long serialVersionUID = -5868132953113679291L;
private static final ILogger LOGGER = Log.getLogger(MapRule.class);

/** Language Name */
private String languageCode;

/** creates a new empty MapRule */
/**
* Creates a new empty MapRule.
* <p>
* When SRX.loadSrxFile loads segmentation.conf, java.beans.XMLDecoder
* create an empty object, then calls setLanguage and setPattern methods.
* </p>
*/
public MapRule() {
}

/** creates an initialized MapRule */
/**
* Create initialized MapRule object.
*
* @param language
* localized language name (from segmentation.conf), or language
* code (from SRX)
* @param pattern
* language pattern such as "EN.*" or ".*"
* @param rules
* segmentation rules.
*/
public MapRule(String language, String pattern, List<Rule> rules) {
this.setLanguage(language);
// Prefer the standard code registered for the pattern; fall back to the
// given language when LanguageCodes has no code for this pattern.
String code = LanguageCodes.getLanguageCodeByPattern(pattern);
this.setLanguage(code != null ? code : language);
this.setPattern(pattern);
this.setRules(rules);
}

/** Language Name */
private String languageCode;

public MapRule(Languagemap languagemap, List<Rule> rules) {
this.setLanguage(languagemap.getLanguagerulename());
this.setPattern(languagemap.getLanguagepattern());
this.setRules(rules);
}

/** Returns Language Name (to display it in a dialog). */
public String getLanguageName() {
    /*
     * languageCode usually holds one of the "LanguageCodes.*_CODE" names,
     * but setLanguage stores a value it cannot recognize unchanged. Ask
     * LanguageCodes for a name first and fall back to the raw stored
     * value when that lookup yields nothing.
     */
    final String displayName = LanguageCodes.getLanguageName(languageCode);
    if (StringUtil.isEmpty(displayName)) {
        return languageCode;
    }
    return displayName;
}

/** Sets Language Code */
public void setLanguage(String code) {
/*
 * This setter is called by the XMLDecoder of the Java beans library
 * when migrating from a "segmentation.conf" beans file; in that case
 * the argument is the localized name of the language. When the object
 * is created from a standard SRX file, the argument is a standard
 * language name, defined as "LanguageCodes.*_CODE". The behavior was
 * changed in the OmegaT 6.0.0 release in 2023. We first check whether
 * the argument is a standard code. If it is not, we try to map it from
 * a localized language name back to its code; if that also fails, the
 * value is stored unchanged. Once all OmegaT 4.x and 5.x users can be
 * assumed to have migrated to OmegaT 6.x or later, this workaround may
 * be removed.
 */
if (!LanguageCodes.isLanguageCodeKnown(code)) {
String alt = LanguageCodes.getLanguageCodeByName(code);
if (alt != null) {
languageCode = alt;
return;
} else {
Log.logWarningRB("CORE_SRX_RULES_UNKNOWN_LANGUAGE_CODE", code);
LOGGER.atDebug().setMessage("Unknown languagerulename '{}'").addArgument(code).log();
}
}
languageCode = code;
}

/** Returns Language Code for programmatic usage. */
/**
* Returns Language Code for programmatic usage.
*/
public String getLanguage() {
    // Hands back the stored value as-is, without any name lookup.
    return this.languageCode;
}

/** Pattern for the language/country ISO code (of a form LL-CC). */
/*
* Pattern for the language/country ISO code (of a form LL-CC). It is like
* "EN.*".
*/
private Pattern pattern;

/** Returns Pattern for the language/country ISO code (of a form LL-CC). */
/**
* Returns Pattern for the language/country ISO code (of a form LL-CC).
*/
public String getPattern() {
if (pattern != null) {
return pattern.pattern();
Expand All @@ -110,14 +151,24 @@ public Pattern getCompiledPattern() {
return pattern;
}

/** Sets Pattern for the language/country ISO code (of a form LL-CC). */
/**
* Sets Pattern for the language/country ISO code (of a form LL-CC).
*
* @param pattern
* pattern string such as "EN.*"
*/
public void setPattern(String pattern) throws PatternSyntaxException {
    // Fix for bug [1643500]: the language code in a segmentation rule
    // was case-sensitive, so the pattern is compiled with the
    // CASE_INSENSITIVE flag.
    // Correction contributed by Tiago Saboga.
    final Pattern compiled = Pattern.compile(pattern, Pattern.CASE_INSENSITIVE);
    this.pattern = compiled;
}

/**
* Deep copy of the object, mandatory for java beans.
*
* @return new MapRule object
*/
public MapRule copy() {
MapRule result = new MapRule();
result.languageCode = languageCode;
Expand All @@ -142,23 +193,28 @@ public void setRules(List<Rule> rules) {
this.rules = rules;
}

/** Indicates whether some other MapRule is "equal to" this one. */
/**
* Indicates whether some other MapRule is "equal to" this one.
*/
public boolean equals(Object obj) {
if (obj == null || !(obj instanceof MapRule)) {
// instanceof evaluates to false for null, so it also covers the null case.
if (!(obj instanceof MapRule)) {
return false;
}
MapRule that = (MapRule) obj;
// Equal only when the pattern string, the language and the rules all match.
return this.getPattern().equals(that.getPattern())
&& this.getLanguage().equals(that.getLanguage())
return this.getPattern().equals(that.getPattern()) && this.getLanguage().equals(that.getLanguage())
&& this.getRules().equals(that.getRules());
}

/** Returns a hash code value for the object. */
/**
* Returns a hash code value for the object.
*/
@Override
public int hashCode() {
    // Built from the same three properties that equals() compares:
    // pattern string, language and rules.
    return this.getPattern().hashCode() + this.getLanguage().hashCode() + this.getRules().hashCode();
}

/** Returns a string representation of the MapRule for debugging purposes. */
/**
* Returns a string representation of the MapRule for debugging purposes.
*/
@Override
public String toString() {
    // Format: "<language> (<pattern>) <rules>"
    return getLanguage() + " (" + getPattern() + ") " + getRules().toString();
}
Expand Down
Loading

0 comments on commit 2469dd8

Please sign in to comment.