Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: o.o.c.segmentation.SRX to load conf and save srx in more robust way and remove warning message #1159

Merged
merged 7 commits into from
Nov 12, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 0 additions & 2 deletions src/org/omegat/Bundle.properties
Original file line number Diff line number Diff line change
Expand Up @@ -1734,8 +1734,6 @@ CORE_SRX_RULES_LANG_DEFAULT=Default
CORE_SRX_RULES_FORMATTING_TEXT=Text files segmentation
CORE_SRX_RULES_FORMATTING_HTML=HTML, XHTML, ODF and Infix segmentation

CORE_SRX_RULES_UNKNOWN_LANGUAGE_CODE=Unknown language code {0} specified

# org.omegat.core.spellchecker.SpellCheckerManager
CORE_SPELLCHECKER_NO_ENGINE=No active spell checker engine found

Expand Down
340 changes: 169 additions & 171 deletions src/org/omegat/Bundle_it.properties

Large diffs are not rendered by default.

2 changes: 0 additions & 2 deletions src/org/omegat/Bundle_nl.properties
Original file line number Diff line number Diff line change
Expand Up @@ -1555,8 +1555,6 @@ CORE_SRX_RULES_LANG_DEFAULT=Standaard
CORE_SRX_RULES_FORMATTING_TEXT=Segmentatie van tekstbestanden
CORE_SRX_RULES_FORMATTING_HTML=segmentatie voor HTML, XHTML, ODF en Infix

CORE_SRX_RULES_UNKNOWN_LANGUAGE_CODE=Onbekende taalcode {0} gespecificeerd

# org.omegat.core.spellchecker.SpellCheckerManager
CORE_SPELLCHECKER_NO_ENGINE=Geen actief programma voor spellingscontrole gevonden

Expand Down
123 changes: 84 additions & 39 deletions src/org/omegat/core/segmentation/LanguageCodes.java
Original file line number Diff line number Diff line change
Expand Up @@ -43,48 +43,65 @@ public final class LanguageCodes {
/** Private constructor; prevents instantiation of this final class of static members. */
private LanguageCodes() {
}

// Language Codes
public static final String CATALAN_CODE = "Catalan";
public static final String CZECH_CODE = "Czech";
public static final String GERMAN_CODE = "German";
public static final String ENGLISH_CODE = "English";
public static final String SPANISH_CODE = "Spanish";
public static final String FINNISH_CODE = "Finnish";
public static final String FRENCH_CODE = "French";
public static final String ITALIAN_CODE = "Italian";
public static final String JAPANESE_CODE = "Japanese";
public static final String DUTCH_CODE = "Dutch";
public static final String POLISH_CODE = "Polish";
public static final String RUSSIAN_CODE = "Russian";
public static final String SWEDISH_CODE = "Swedish";
public static final String SLOVAK_CODE = "Slovak";
public static final String CHINESE_CODE = "Chinese";
public static final String DEFAULT_CODE = "Default";
public static final String F_TEXT_CODE = "Text";
public static final String F_HTML_CODE = "HTML";
// Codes of "languagerulename".
static final String CATALAN_CODE = "Catalan";
static final String CZECH_CODE = "Czech";
static final String GERMAN_CODE = "German";
static final String ENGLISH_CODE = "English";
static final String SPANISH_CODE = "Spanish";
static final String FINNISH_CODE = "Finnish";
static final String FRENCH_CODE = "French";
static final String ITALIAN_CODE = "Italian";
static final String JAPANESE_CODE = "Japanese";
static final String DUTCH_CODE = "Dutch";
static final String POLISH_CODE = "Polish";
static final String RUSSIAN_CODE = "Russian";
static final String SWEDISH_CODE = "Swedish";
static final String SLOVAK_CODE = "Slovak";
static final String CHINESE_CODE = "Chinese";
static final String DEFAULT_CODE = "Default";
static final String F_TEXT_CODE = "Text";
static final String F_HTML_CODE = "HTML";

// Language Keys from Resource Bundle
public static final String CATALAN_KEY = "CORE_SRX_RULES_LANG_CATALAN";
public static final String CZECH_KEY = "CORE_SRX_RULES_LANG_CZECH";
public static final String GERMAN_KEY = "CORE_SRX_RULES_LANG_GERMAN";
public static final String ENGLISH_KEY = "CORE_SRX_RULES_LANG_ENGLISH";
public static final String SPANISH_KEY = "CORE_SRX_RULES_LANG_SPANISH";
public static final String FINNISH_KEY = "CORE_SRX_RULES_LANG_FINNISH";
public static final String FRENCH_KEY = "CORE_SRX_RULES_LANG_FRENCH";
public static final String ITALIAN_KEY = "CORE_SRX_RULES_LANG_ITALIAN";
public static final String JAPANESE_KEY = "CORE_SRX_RULES_LANG_JAPANESE";
public static final String DUTCH_KEY = "CORE_SRX_RULES_LANG_DUTCH";
public static final String POLISH_KEY = "CORE_SRX_RULES_LANG_POLISH";
public static final String RUSSIAN_KEY = "CORE_SRX_RULES_LANG_RUSSIAN";
public static final String SWEDISH_KEY = "CORE_SRX_RULES_LANG_SWEDISH";
public static final String SLOVAK_KEY = "CORE_SRX_RULES_LANG_SLOVAK";
public static final String CHINESE_KEY = "CORE_SRX_RULES_LANG_CHINESE";
public static final String DEFAULT_KEY = "CORE_SRX_RULES_LANG_DEFAULT";
public static final String F_TEXT_KEY = "CORE_SRX_RULES_FORMATTING_TEXT";
public static final String F_HTML_KEY = "CORE_SRX_RULES_FORMATTING_HTML";
static final String CATALAN_KEY = "CORE_SRX_RULES_LANG_CATALAN";
static final String CZECH_KEY = "CORE_SRX_RULES_LANG_CZECH";
static final String GERMAN_KEY = "CORE_SRX_RULES_LANG_GERMAN";
static final String ENGLISH_KEY = "CORE_SRX_RULES_LANG_ENGLISH";
static final String SPANISH_KEY = "CORE_SRX_RULES_LANG_SPANISH";
static final String FINNISH_KEY = "CORE_SRX_RULES_LANG_FINNISH";
static final String FRENCH_KEY = "CORE_SRX_RULES_LANG_FRENCH";
static final String ITALIAN_KEY = "CORE_SRX_RULES_LANG_ITALIAN";
static final String JAPANESE_KEY = "CORE_SRX_RULES_LANG_JAPANESE";
static final String DUTCH_KEY = "CORE_SRX_RULES_LANG_DUTCH";
static final String POLISH_KEY = "CORE_SRX_RULES_LANG_POLISH";
static final String RUSSIAN_KEY = "CORE_SRX_RULES_LANG_RUSSIAN";
static final String SWEDISH_KEY = "CORE_SRX_RULES_LANG_SWEDISH";
static final String SLOVAK_KEY = "CORE_SRX_RULES_LANG_SLOVAK";
static final String CHINESE_KEY = "CORE_SRX_RULES_LANG_CHINESE";
static final String DEFAULT_KEY = "CORE_SRX_RULES_LANG_DEFAULT";
static final String F_TEXT_KEY = "CORE_SRX_RULES_FORMATTING_TEXT";
static final String F_HTML_KEY = "CORE_SRX_RULES_FORMATTING_HTML";

// Language patterns; the static initializer registers each of them in
// patternHash together with its "languagerulename" code.
private static final String CATALAN_PATTERN = "CA.*";
private static final String CZECH_PATTERN = "CS.*";
private static final String GERMAN_PATTERN = "DE.*";
private static final String ENGLISH_PATTERN = "EN.*";
private static final String SPANISH_PATTERN = "ES.*";
private static final String FINNISH_PATTERN = "FI.*";
private static final String FRENCH_PATTERN = "FR.*";
private static final String ITALIAN_PATTERN = "IT.*";
private static final String JAPANESE_PATTERN = "JA.*";
private static final String DUTCH_PATTERN = "NL.*";
private static final String POLISH_PATTERN = "PL.*";
private static final String RUSSIAN_PATTERN = "RU.*";
private static final String SWEDISH_PATTERN = "SV.*";
private static final String SLOVAK_PATTERN = "SK.*";
private static final String CHINESE_PATTERN = "ZH.*";

/** A Map from language codes to language keys. */
private static Map<String, String> codeKeyHash = new HashMap<>();
private static final Map<String, String> codeKeyHash = new HashMap<>();
private static final Map<String, String> patternHash = new HashMap<>();

static {
codeKeyHash.put(CATALAN_CODE, CATALAN_KEY);
Expand All @@ -105,6 +122,21 @@ private LanguageCodes() {
codeKeyHash.put(DEFAULT_CODE, DEFAULT_KEY);
codeKeyHash.put(F_TEXT_CODE, F_TEXT_KEY);
codeKeyHash.put(F_HTML_CODE, F_HTML_KEY);
patternHash.put(CATALAN_PATTERN, CATALAN_CODE);
patternHash.put(CZECH_PATTERN, CZECH_CODE);
patternHash.put(GERMAN_PATTERN, GERMAN_CODE);
patternHash.put(ENGLISH_PATTERN, ENGLISH_CODE);
patternHash.put(SPANISH_PATTERN, SPANISH_CODE);
patternHash.put(FINNISH_PATTERN, FINNISH_CODE);
patternHash.put(FRENCH_PATTERN, FRENCH_CODE);
patternHash.put(ITALIAN_PATTERN, ITALIAN_CODE);
patternHash.put(JAPANESE_PATTERN, JAPANESE_CODE);
patternHash.put(DUTCH_PATTERN, DUTCH_CODE);
patternHash.put(POLISH_PATTERN, POLISH_CODE);
patternHash.put(RUSSIAN_PATTERN, RUSSIAN_CODE);
patternHash.put(SWEDISH_PATTERN, SWEDISH_CODE);
patternHash.put(SLOVAK_PATTERN, SLOVAK_CODE);
patternHash.put(CHINESE_PATTERN, CHINESE_CODE);
}

/**
Expand All @@ -126,11 +158,24 @@ public static boolean isLanguageCodeKnown(String code) {
}

public static String getLanguageCodeByName(String name) {
for (Map.Entry<String, String> entry: codeKeyHash.entrySet()) {
if (name == null) {
return null;
}
for (Map.Entry<String, String> entry : codeKeyHash.entrySet()) {
if (OStrings.getString(entry.getValue()).equals(name)) {
return entry.getKey();
}
}
// migration heuristics: the German translation changed in v5.5.
// See:
// https://github.com/omegat-org/omegat/pull/1158#issuecomment-2448788253
if (name.contains("Textdateien")) {
return LanguageCodes.F_TEXT_CODE;
}
return null;
}

/**
 * Looks up the language code registered for a language pattern.
 *
 * @param pattern
 *            language pattern string such as "EN.*"; it is used as an
 *            exact map key
 * @return the language code mapped to the pattern, or {@code null} when
 *         the pattern is not registered
 */
public static String getLanguageCodeByPattern(String pattern) {
    final String code = patternHash.get(pattern);
    return code;
}
}
106 changes: 81 additions & 25 deletions src/org/omegat/core/segmentation/MapRule.java
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
glossaries, and translation leveraging into updated projects.

Copyright (C) 2000-2006 Keith Godfrey and Maxym Mykhalchuk
2024 Hiroshi Miura
Home page: https://www.omegat.org/
Support center: https://omegat.org/support

Expand Down Expand Up @@ -31,7 +32,8 @@
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;

import gen.core.segmentation.Languagemap;
import tokyo.northside.logging.ILogger;

import org.omegat.util.Log;
import org.omegat.util.StringUtil;

Expand All @@ -44,56 +46,95 @@
public class MapRule implements Serializable {

private static final long serialVersionUID = -5868132953113679291L;
private static final ILogger LOGGER = Log.getLogger(MapRule.class);

/** Language Name */
private String languageCode;

/** creates a new empty MapRule */
/**
 * Creates a new, empty MapRule.
 * <p>
 * When SRX.loadSrxFile loads segmentation.conf, java.beans.XMLDecoder
 * creates an empty object and then calls the setLanguage and setPattern
 * methods on it.
 * </p>
 */
public MapRule() {
}

/** creates an initialized MapRule */
/**
* Create initialized MapRule object.
*
* @param language
* localized language name (from segmentation.conf), or language
* code (from SRX)
* @param pattern
* language pattern such as "EN.*" or ".*"
* @param rules
* segmentation rules.
*/
public MapRule(String language, String pattern, List<Rule> rules) {
this.setLanguage(language);
String code = LanguageCodes.getLanguageCodeByPattern(pattern);
this.setLanguage(code != null ? code : language);
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We derive the language from the pattern: if the pattern is "EN.*", we know the rule is for English.

this.setPattern(pattern);
this.setRules(rules);
}

/** Language Name */
private String languageCode;

public MapRule(Languagemap languagemap, List<Rule> rules) {
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I removed the constructor that takes a Languagemap argument. This reduces a dependency between classes; the fields of the Languagemap are now retrieved in the caller.

this.setLanguage(languagemap.getLanguagerulename());
this.setPattern(languagemap.getLanguagepattern());
this.setRules(rules);
}

/**
 * Returns the language name for display in a dialog.
 *
 * @return the name resolved by {@code LanguageCodes.getLanguageName} for
 *         the stored language code, or the stored value itself when
 *         that lookup gives an empty result
 */
public String getLanguageName() {
    /*
     * languageCode is normally one of the "LanguageCodes.*_CODE" values,
     * but it can still hold a raw name (for example one read from a
     * legacy "segmentation.conf" beans file) that setLanguage could not
     * map to a code. In that case the lookup is presumably empty, and the
     * stored value is shown as is.
     */
    String displayName = LanguageCodes.getLanguageName(languageCode);
    if (StringUtil.isEmpty(displayName)) {
        return languageCode;
    }
    return displayName;
}

/** Sets Language Code */
public void setLanguage(String code) {
/*
* setLanguage method is called from XmlDecoder of a Java beans library
* when migrating from "segmentation.conf" beans file. An argument will
* be localized name of language. When the object is created from a
* standard SRX file, the argument will be standard language name,
* defined as "LanguageCodes.*_CODE". The behavior was changed in OmegaT
* 6.0.0 release in 2023. We first detect whether the argument is
* standard code. If the code is not a standard code, then try to find a
* localized name of the language name. When you believe all the OmegaT
* 4.x and 5.x users are migrated to OmegaT 6.x or later, you may want
* to remove the workaround here.
*/
if (!LanguageCodes.isLanguageCodeKnown(code)) {
String alt = LanguageCodes.getLanguageCodeByName(code);
if (alt != null) {
languageCode = alt;
return;
} else {
Log.logWarningRB("CORE_SRX_RULES_UNKNOWN_LANGUAGE_CODE", code);
LOGGER.atDebug().setMessage("Unknown languagerulename '{}'").addArgument(code).log();
}
}
languageCode = code;
}

/**
 * Returns the language code for programmatic usage.
 *
 * @return the stored language code; {@code null} if none has been set
 */
public String getLanguage() {
    return this.languageCode;
}

/** Pattern for the language/country ISO code (of a form LL-CC). */
/*
* Pattern for the language/country ISO code (of a form LL-CC). It is like
* "EN.*".
*/
private Pattern pattern;

/** Returns Pattern for the language/country ISO code (of a form LL-CC). */
/**
* Returns Pattern for the language/country ISO code (of a form LL-CC).
*/
public String getPattern() {
if (pattern != null) {
return pattern.pattern();
Expand All @@ -110,14 +151,24 @@ public Pattern getCompiledPattern() {
return pattern;
}

/**
 * Sets the pattern for the language/country ISO code (of a form LL-CC).
 *
 * @param pattern
 *            pattern string such as "EN.*"
 * @throws PatternSyntaxException
 *             if the pattern string is not a valid regular expression
 */
public void setPattern(String pattern) throws PatternSyntaxException {
    // Fix for bug [1643500], "language code in segmentation rule is
    // case-sensitive": the pattern is compiled case-insensitively.
    // Correction contributed by Tiago Saboga.
    final Pattern compiled = Pattern.compile(pattern, Pattern.CASE_INSENSITIVE);
    this.pattern = compiled;
}

/**
* Deep copy of the object, mandatory for java beans.
*
* @return new MapRule object
*/
public MapRule copy() {
MapRule result = new MapRule();
result.languageCode = languageCode;
Expand All @@ -142,23 +193,28 @@ public void setRules(List<Rule> rules) {
this.rules = rules;
}

/** Indicates whether some other MapRule is "equal to" this one. */
/**
* Indicates whether some other MapRule is "equal to" this one.
*/
public boolean equals(Object obj) {
if (obj == null || !(obj instanceof MapRule)) {
if (!(obj instanceof MapRule)) {
return false;
}
MapRule that = (MapRule) obj;
return this.getPattern().equals(that.getPattern())
&& this.getLanguage().equals(that.getLanguage())
return this.getPattern().equals(that.getPattern()) && this.getLanguage().equals(that.getLanguage())
&& this.getRules().equals(that.getRules());
}

/**
 * Returns a hash code value for the object.
 * <p>
 * The value is the sum of the hash codes of the pattern string, the
 * language code and the rules. A component that is {@code null} (for
 * example the language code of a MapRule made with the no-argument
 * constructor) contributes 0 instead of raising a NullPointerException;
 * for non-null components the result is unchanged.
 * </p>
 *
 * @return the combined hash code
 */
@Override
public int hashCode() {
    return java.util.Objects.hashCode(this.getPattern())
            + java.util.Objects.hashCode(this.getLanguage())
            + java.util.Objects.hashCode(this.getRules());
}

/**
 * Returns a string representation of the MapRule for debugging purposes.
 *
 * @return the language code, the pattern in parentheses, and the rules,
 *         in that order
 */
public String toString() {
    StringBuilder text = new StringBuilder();
    text.append(getLanguage());
    text.append(" (").append(getPattern()).append(") ");
    text.append(getRules().toString());
    return text.toString();
}
Expand Down
Loading
Loading