From a69a4187fb30877238049d81a6fe70b69e471de8 Mon Sep 17 00:00:00 2001
From: Hiroshi Miura <miurahr@linux.com>
Date: Thu, 31 Oct 2024 20:51:25 +0900
Subject: [PATCH] fix: workaround for an unknown language code

- rulename for text in German was changed in v5.5
- when reading "segmentation.conf" generated before v5.4,
  migration fails.
- Add workaround to detect ancient rulename

Signed-off-by: Hiroshi Miura <miurahr@linux.com>
---
 .../core/segmentation/LanguageCodes.java      | 114 ++++++++++++------
 src/org/omegat/core/segmentation/MapRule.java | 110 +++++++++++++----
 src/org/omegat/core/segmentation/SRX.java     |   4 +-
 .../omegat/core/segmentation/Segmenter.java   |  39 +++---
 .../datamodels/MappingRulesModel.java         |   4 +-
 5 files changed, 190 insertions(+), 81 deletions(-)

diff --git a/src/org/omegat/core/segmentation/LanguageCodes.java b/src/org/omegat/core/segmentation/LanguageCodes.java
index 20f126bac9..8deaff107e 100644
--- a/src/org/omegat/core/segmentation/LanguageCodes.java
+++ b/src/org/omegat/core/segmentation/LanguageCodes.java
@@ -43,48 +43,65 @@ public final class LanguageCodes {
     private LanguageCodes() {
     }
 
-    // Language Codes
-    public static final String CATALAN_CODE = "Catalan";
-    public static final String CZECH_CODE = "Czech";
-    public static final String GERMAN_CODE = "German";
-    public static final String ENGLISH_CODE = "English";
-    public static final String SPANISH_CODE = "Spanish";
-    public static final String FINNISH_CODE = "Finnish";
-    public static final String FRENCH_CODE = "French";
-    public static final String ITALIAN_CODE = "Italian";
-    public static final String JAPANESE_CODE = "Japanese";
-    public static final String DUTCH_CODE = "Dutch";
-    public static final String POLISH_CODE = "Polish";
-    public static final String RUSSIAN_CODE = "Russian";
-    public static final String SWEDISH_CODE = "Swedish";
-    public static final String SLOVAK_CODE = "Slovak";
-    public static final String CHINESE_CODE = "Chinese";
-    public static final String DEFAULT_CODE = "Default";
-    public static final String F_TEXT_CODE = "Text";
-    public static final String F_HTML_CODE = "HTML";
+    // Codes of "languagerulename".
+    static final String CATALAN_CODE = "Catalan";
+    static final String CZECH_CODE = "Czech";
+    static final String GERMAN_CODE = "German";
+    static final String ENGLISH_CODE = "English";
+    static final String SPANISH_CODE = "Spanish";
+    static final String FINNISH_CODE = "Finnish";
+    static final String FRENCH_CODE = "French";
+    static final String ITALIAN_CODE = "Italian";
+    static final String JAPANESE_CODE = "Japanese";
+    static final String DUTCH_CODE = "Dutch";
+    static final String POLISH_CODE = "Polish";
+    static final String RUSSIAN_CODE = "Russian";
+    static final String SWEDISH_CODE = "Swedish";
+    static final String SLOVAK_CODE = "Slovak";
+    static final String CHINESE_CODE = "Chinese";
+    static final String DEFAULT_CODE = "Default";
+    static final String F_TEXT_CODE = "Text";
+    static final String F_HTML_CODE = "HTML";
 
     // Language Keys from Resource Bundle
-    public static final String CATALAN_KEY = "CORE_SRX_RULES_LANG_CATALAN";
-    public static final String CZECH_KEY = "CORE_SRX_RULES_LANG_CZECH";
-    public static final String GERMAN_KEY = "CORE_SRX_RULES_LANG_GERMAN";
-    public static final String ENGLISH_KEY = "CORE_SRX_RULES_LANG_ENGLISH";
-    public static final String SPANISH_KEY = "CORE_SRX_RULES_LANG_SPANISH";
-    public static final String FINNISH_KEY = "CORE_SRX_RULES_LANG_FINNISH";
-    public static final String FRENCH_KEY = "CORE_SRX_RULES_LANG_FRENCH";
-    public static final String ITALIAN_KEY = "CORE_SRX_RULES_LANG_ITALIAN";
-    public static final String JAPANESE_KEY = "CORE_SRX_RULES_LANG_JAPANESE";
-    public static final String DUTCH_KEY = "CORE_SRX_RULES_LANG_DUTCH";
-    public static final String POLISH_KEY = "CORE_SRX_RULES_LANG_POLISH";
-    public static final String RUSSIAN_KEY = "CORE_SRX_RULES_LANG_RUSSIAN";
-    public static final String SWEDISH_KEY = "CORE_SRX_RULES_LANG_SWEDISH";
-    public static final String SLOVAK_KEY = "CORE_SRX_RULES_LANG_SLOVAK";
-    public static final String CHINESE_KEY = "CORE_SRX_RULES_LANG_CHINESE";
-    public static final String DEFAULT_KEY = "CORE_SRX_RULES_LANG_DEFAULT";
-    public static final String F_TEXT_KEY = "CORE_SRX_RULES_FORMATTING_TEXT";
-    public static final String F_HTML_KEY = "CORE_SRX_RULES_FORMATTING_HTML";
+    static final String CATALAN_KEY = "CORE_SRX_RULES_LANG_CATALAN";
+    static final String CZECH_KEY = "CORE_SRX_RULES_LANG_CZECH";
+    static final String GERMAN_KEY = "CORE_SRX_RULES_LANG_GERMAN";
+    static final String ENGLISH_KEY = "CORE_SRX_RULES_LANG_ENGLISH";
+    static final String SPANISH_KEY = "CORE_SRX_RULES_LANG_SPANISH";
+    static final String FINNISH_KEY = "CORE_SRX_RULES_LANG_FINNISH";
+    static final String FRENCH_KEY = "CORE_SRX_RULES_LANG_FRENCH";
+    static final String ITALIAN_KEY = "CORE_SRX_RULES_LANG_ITALIAN";
+    static final String JAPANESE_KEY = "CORE_SRX_RULES_LANG_JAPANESE";
+    static final String DUTCH_KEY = "CORE_SRX_RULES_LANG_DUTCH";
+    static final String POLISH_KEY = "CORE_SRX_RULES_LANG_POLISH";
+    static final String RUSSIAN_KEY = "CORE_SRX_RULES_LANG_RUSSIAN";
+    static final String SWEDISH_KEY = "CORE_SRX_RULES_LANG_SWEDISH";
+    static final String SLOVAK_KEY = "CORE_SRX_RULES_LANG_SLOVAK";
+    static final String CHINESE_KEY = "CORE_SRX_RULES_LANG_CHINESE";
+    static final String DEFAULT_KEY = "CORE_SRX_RULES_LANG_DEFAULT";
+    static final String F_TEXT_KEY = "CORE_SRX_RULES_FORMATTING_TEXT";
+    static final String F_HTML_KEY = "CORE_SRX_RULES_FORMATTING_HTML";
+
+    private static final String CATALAN_PATTERN = "CA.*";
+    private static final String CZECH_PATTERN = "CS.*";
+    private static final String GERMAN_PATTERN = "DE.*";
+    private static final String ENGLISH_PATTERN = "EN.*";
+    private static final String SPANISH_PATTERN = "ES.*";
+    private static final String FINNISH_PATTERN = "FI.*";
+    private static final String FRENCH_PATTERN = "FR.*";
+    private static final String ITALIAN_PATTERN = "IT.*";
+    private static final String JAPANESE_PATTERN = "JA.*";
+    private static final String DUTCH_PATTERN = "NL.*";
+    private static final String POLISH_PATTERN = "PL.*";
+    private static final String RUSSIAN_PATTERN = "RU.*";
+    private static final String SWEDISH_PATTERN = "SV.*";
+    private static final String SLOVAK_PATTERN = "SK.*";
+    private static final String CHINESE_PATTERN = "ZH.*";
 
     /** A Map from language codes to language keys. */
-    private static Map<String, String> codeKeyHash = new HashMap<>();
+    private static final Map<String, String> codeKeyHash = new HashMap<>();
+    private static final Map<String, String> patternHash = new HashMap<>();
 
     static {
         codeKeyHash.put(CATALAN_CODE, CATALAN_KEY);
@@ -105,6 +122,21 @@ private LanguageCodes() {
         codeKeyHash.put(DEFAULT_CODE, DEFAULT_KEY);
         codeKeyHash.put(F_TEXT_CODE, F_TEXT_KEY);
         codeKeyHash.put(F_HTML_CODE, F_HTML_KEY);
+        patternHash.put(CATALAN_PATTERN, CATALAN_CODE);
+        patternHash.put(CZECH_PATTERN, CZECH_CODE);
+        patternHash.put(GERMAN_PATTERN, GERMAN_CODE);
+        patternHash.put(ENGLISH_PATTERN, ENGLISH_CODE);
+        patternHash.put(SPANISH_PATTERN, SPANISH_CODE);
+        patternHash.put(FINNISH_PATTERN, FINNISH_CODE);
+        patternHash.put(FRENCH_PATTERN, FRENCH_CODE);
+        patternHash.put(ITALIAN_PATTERN, ITALIAN_CODE);
+        patternHash.put(JAPANESE_PATTERN, JAPANESE_CODE);
+        patternHash.put(DUTCH_PATTERN, DUTCH_CODE);
+        patternHash.put(POLISH_PATTERN, POLISH_CODE);
+        patternHash.put(RUSSIAN_PATTERN, RUSSIAN_CODE);
+        patternHash.put(SWEDISH_PATTERN, SWEDISH_CODE);
+        patternHash.put(SLOVAK_PATTERN, SLOVAK_CODE);
+        patternHash.put(CHINESE_PATTERN, CHINESE_CODE);
     }
 
     /**
@@ -126,11 +158,15 @@ public static boolean isLanguageCodeKnown(String code) {
     }
 
     public static String getLanguageCodeByName(String name) {
-        for (Map.Entry<String, String> entry: codeKeyHash.entrySet()) {
+        for (Map.Entry<String, String> entry : codeKeyHash.entrySet()) {
             if (OStrings.getString(entry.getValue()).equals(name)) {
                 return entry.getKey();
             }
         }
         return null;
     }
+
+    public static String getLanguageCodeByPattern(String pattern) {
+        return patternHash.get(pattern);
+    }
 }
diff --git a/src/org/omegat/core/segmentation/MapRule.java b/src/org/omegat/core/segmentation/MapRule.java
index d3dbfb4cb2..63053d32b1 100644
--- a/src/org/omegat/core/segmentation/MapRule.java
+++ b/src/org/omegat/core/segmentation/MapRule.java
@@ -4,6 +4,7 @@
           glossaries, and translation leveraging into updated projects.
 
  Copyright (C) 2000-2006 Keith Godfrey and Maxym Mykhalchuk
+               2024 Hiroshi Miura
                Home page: https://www.omegat.org/
                Support center: https://omegat.org/support
 
@@ -31,7 +32,8 @@
 import java.util.regex.Pattern;
 import java.util.regex.PatternSyntaxException;
 
-import gen.core.segmentation.Languagemap;
+import tokyo.northside.logging.ILogger;
+
 import org.omegat.util.Log;
 import org.omegat.util.StringUtil;
 
@@ -44,56 +46,99 @@
 public class MapRule implements Serializable {
 
     private static final long serialVersionUID = -5868132953113679291L;
+    private static final ILogger LOGGER = Log.getLogger(MapRule.class);
+
+    /** Language Name */
+    private String languageCode;
 
     /** creates a new empty MapRule */
     public MapRule() {
     }
 
-    /** creates an initialized MapRule */
+    /**
+     * Create initialized MapRule object.
+     * 
+     * @param language
+     *            localized language name (from segmentation.conf), or language
+     *            code (from SRX)
+     * @param pattern
+     *            language pattern such as "EN.*" or ".*"
+     * @param rules
+     *            segmentation rules.
+     */
     public MapRule(String language, String pattern, List<Rule> rules) {
-        this.setLanguage(language);
+        String code = LanguageCodes.getLanguageCodeByPattern(pattern);
+        this.setLanguage(code != null ? code : language);
         this.setPattern(pattern);
         this.setRules(rules);
     }
 
-    /** Language Name */
-    private String languageCode;
-
-    public MapRule(Languagemap languagemap, List<Rule> rules) {
-        this.setLanguage(languagemap.getLanguagerulename());
-        this.setPattern(languagemap.getLanguagepattern());
-        this.setRules(rules);
-    }
-
     /** Returns Language Name (to display it in a dialog). */
     public String getLanguageName() {
+        /*
+         * When the rules have already been migrated to an SRX file store,
+         * the languageCode field holds a name defined as
+         * "LanguageCodes.*_CODE". Otherwise, the MapRule object was created
+         * from a "segmentation.conf" java beans file, so it holds a localized
+         * language name. We first assume the latter. If res is empty, the
+         * object was created from an SRX file, so return languageCode itself.
+         */
         String res = LanguageCodes.getLanguageName(languageCode);
         return StringUtil.isEmpty(res) ? languageCode : res;
     }
 
     /** Sets Language Code */
     public void setLanguage(String code) {
+        /*
+         * The setLanguage method is called by the XMLDecoder of the Java beans
+         * library when migrating from a "segmentation.conf" beans file. In that
+         * case the argument is a localized language name. When the object is
+         * created from a standard SRX file, the argument is a standard language
+         * name, defined as "LanguageCodes.*_CODE". The behavior was changed in
+         * the OmegaT 6.0.0 release in 2023. We first check whether the argument
+         * is a standard code. If it is not, we try to find the standard code
+         * matching the localized language name. If you read this comment long
+         * after OmegaT 6.x, and you believe all OmegaT 4.x and 5.x users have
+         * migrated to OmegaT 6.x or later, you may want to remove the chunk
+         * below.
+         */
         if (!LanguageCodes.isLanguageCodeKnown(code)) {
             String alt = LanguageCodes.getLanguageCodeByName(code);
             if (alt != null) {
                 languageCode = alt;
-                return;
             } else {
-                Log.logWarningRB("CORE_SRX_RULES_UNKNOWN_LANGUAGE_CODE", code);
+                // migration heuristics: German translation changed in v5.5.
+                // See:
+                // https://github.com/omegat-org/omegat/pull/1158#issuecomment-2448788253
+                if (code != null && code.contains("Textdateien")) {
+                    languageCode = LanguageCodes.F_TEXT_CODE;
+                } else {
+                    LOGGER.atDebug().setMessageRB("CORE_SRX_RULES_UNKNOWN_LANGUAGE_CODE").addArgument(code)
+                            .log();
+                    languageCode = code;
+                }
             }
+            return;
         }
         languageCode = code;
     }
 
-    /** Returns Language Code for programmatic usage. */
+    /**
+     * Returns Language Code for programmatic usage.
+     */
     public String getLanguage() {
         return languageCode;
     }
 
-    /** Pattern for the language/country ISO code (of a form LL-CC). */
+    /*
+     * Pattern for the language/country ISO code (of a form LL-CC). It is like
+     * "EN.*".
+     */
     private Pattern pattern;
 
-    /** Returns Pattern for the language/country ISO code (of a form LL-CC). */
+    /**
+     * Returns Pattern for the language/country ISO code (of a form LL-CC).
+     */
     public String getPattern() {
         if (pattern != null) {
             return pattern.pattern();
@@ -110,14 +155,24 @@ public Pattern getCompiledPattern() {
         return pattern;
     }
 
-    /** Sets Pattern for the language/country ISO code (of a form LL-CC). */
+    /**
+     * Sets Pattern for the language/country ISO code (of a form LL-CC).
+     * 
+     * @param pattern
+     *            pattern string such as "EN.*"
+     */
     public void setPattern(String pattern) throws PatternSyntaxException {
         // Fix for bug [1643500]
-        // language code in segmentation rule is case sensitive
+        // language code in segmentation rule is case-sensitive
         // Correction contributed by Tiago Saboga.
         this.pattern = Pattern.compile(pattern, Pattern.CASE_INSENSITIVE);
     }
 
+    /**
+     * Deep copy of the object, mandatory for java beans.
+     * 
+     * @return new MapRule object
+     */
     public MapRule copy() {
         MapRule result = new MapRule();
         result.languageCode = languageCode;
@@ -142,23 +197,28 @@ public void setRules(List<Rule> rules) {
         this.rules = rules;
     }
 
-    /** Indicates whether some other MapRule is "equal to" this one. */
+    /**
+     * Indicates whether some other MapRule is "equal to" this one.
+     */
     public boolean equals(Object obj) {
-        if (obj == null || !(obj instanceof MapRule)) {
+        if (!(obj instanceof MapRule)) {
             return false;
         }
         MapRule that = (MapRule) obj;
-        return this.getPattern().equals(that.getPattern())
-                && this.getLanguage().equals(that.getLanguage())
+        return this.getPattern().equals(that.getPattern()) && this.getLanguage().equals(that.getLanguage())
                 && this.getRules().equals(that.getRules());
     }
 
-    /** Returns a hash code value for the object. */
+    /**
+     * Returns a hash code value for the object.
+     */
     public int hashCode() {
         return this.getPattern().hashCode() + this.getLanguage().hashCode() + this.getRules().hashCode();
     }
 
-    /** Returns a string representation of the MapRule for debugging purposes. */
+    /**
+     * Returns a string representation of the MapRule for debugging purposes.
+     */
     public String toString() {
         return getLanguage() + " (" + getPattern() + ") " + getRules().toString();
     }
diff --git a/src/org/omegat/core/segmentation/SRX.java b/src/org/omegat/core/segmentation/SRX.java
index 96cb6950ea..06cbe6b672 100644
--- a/src/org/omegat/core/segmentation/SRX.java
+++ b/src/org/omegat/core/segmentation/SRX.java
@@ -287,7 +287,9 @@ private static SRX loadSrxInputStream(InputStream io) throws IOException {
         res.setCascade(!"no".equalsIgnoreCase(srx.getHeader().getCascade()));
         res.setVersion(srx.getVersion());
         res.setMappingRules(srx.getBody().getMaprules().getLanguagemap().stream()
-                .map(s -> new MapRule(s, mapping.get(s.getLanguagerulename()))).collect(Collectors.toList()));
+                .map(languagemap -> new MapRule(languagemap.getLanguagerulename(),
+                        languagemap.getLanguagepattern(), mapping.get(languagemap.getLanguagerulename())))
+                .collect(Collectors.toList()));
         return res;
     }
 
diff --git a/src/org/omegat/core/segmentation/Segmenter.java b/src/org/omegat/core/segmentation/Segmenter.java
index 8b1c774133..e587e3766c 100644
--- a/src/org/omegat/core/segmentation/Segmenter.java
+++ b/src/org/omegat/core/segmentation/Segmenter.java
@@ -68,7 +68,8 @@ public SRX getSRX() {
      * @param paragraph
      *            the paragraph text
      * @param spaces
-     *            list to store information about spaces between sentences (can be null)
+     *            list to store information about spaces between sentences (can
+     *            be null)
      * @param brules
      *            list to store rules that account to breaks (can be null)
      * @return list of sentences (String objects)
@@ -117,11 +118,13 @@ public List<String> segment(Language lang, String paragraph, List<StringBuilder>
     }
 
     /**
-     * Returns pre-sentences (sentences with spaces between), computed by breaking paragraph into chunks of
-     * text. Also returns the list with "the reasons" why the breaks were made, i.e. the list of break rules
-     * that contributed to each of the breaks made.
+     * Returns pre-sentences (sentences with spaces between), computed by
+     * breaking paragraph into chunks of text. Also returns the list with "the
+     * reasons" why the breaks were made, i.e. the list of break rules that
+     * contributed to each of the breaks made.
      * <p>
-     * If glued back together, these strings form the same paragraph text as this function was fed.
+     * If glued back together, these strings form the same paragraph text as
+     * this function was fed.
      *
      * @param paragraph
      *            the paragraph text
@@ -244,7 +247,8 @@ static class BreakPosition implements Comparable<BreakPosition> {
         }
 
         /**
-         * Other BreakPosition is "equal to" this one iff it has the same position.
+         * Other BreakPosition is "equal to" this one iff it has the same
+         * position.
          */
         public boolean equals(Object obj) {
             if (obj == null) {
@@ -266,10 +270,12 @@ public int hashCode() {
         /**
          * Compares this break position with another.
          *
-         * @return a negative integer if its position is less than the another's, zero if they are equal, or a
-         *         positive integer as its position is greater than the another's.
+         * @return a negative integer if its position is less than the
+         *         other's, zero if they are equal, or a positive integer if
+         *         its position is greater than the other's.
          * @throws ClassCastException
-         *             if the specified object's type prevents it from being compared to this Object.
+         *             if the specified object's type prevents it from being
+         *             compared to this Object.
          */
         public int compareTo(BreakPosition that) {
             return this.position - that.position;
@@ -323,14 +329,15 @@ public String glue(Language sourceLang, Language targetLang, List<String> senten
                     Matcher matcher = LINE_BREAK_OR_TAB_PATTERN.matcher(sp.toString());
                     if (matcher.find()) {
                         // If we found line break or tab, trim left spaces.
-                        // Right spaces are left for indentation of the next line.
+                        // Right spaces are left for indentation of the next
+                        // line.
                         String leftSpaces = matcher.group(1);
                         if (!leftSpaces.isEmpty()) {
                             sp.replace(0, leftSpaces.length(), "");
                         }
                     } else if ((lastChar != '.')
                             && (!PatternConsts.SPACY_REGEX.matcher(rule.getBeforebreak()).matches()
-                            || !PatternConsts.SPACY_REGEX.matcher(rule.getAfterbreak()).matches())) {
+                                    || !PatternConsts.SPACY_REGEX.matcher(rule.getAfterbreak()).matches())) {
                         sp.setLength(0);
                     }
                 }
@@ -347,10 +354,13 @@ public String glue(Language sourceLang, Language targetLang, List<String> senten
      * Segment source and target entries from TMX when counts are equals.
      */
     public void segmentEntries(boolean needResegment, Language sourceLang, String sourceEntry,
-            Language targetLang, String targetEntry, List<String> sourceSegments, List<String> targetSegments) {
+            Language targetLang, String targetEntry, List<String> sourceSegments,
+            List<String> targetSegments) {
         if (needResegment) {
             List<String> srcSegments = segment(sourceLang, sourceEntry, null, null);
-            if (targetEntry != null) { // There is no translation for this entry, because for instance it's a note
+            if (targetEntry != null) { // There is no translation for this
+                                       // entry, because for instance it's a
+                                       // note
                                        // on an untranslated entry
                 List<String> tarSegments = segment(targetLang, targetEntry, null, null);
 
@@ -361,7 +371,8 @@ public void segmentEntries(boolean needResegment, Language sourceLang, String so
                 }
             }
         }
-        // No need to resegment, or segments counts not equals, or no translation
+        // No need to resegment, or segment counts not equal, or no
+        // translation
         sourceSegments.add(sourceEntry);
         targetSegments.add(targetEntry);
 
diff --git a/src/org/omegat/core/segmentation/datamodels/MappingRulesModel.java b/src/org/omegat/core/segmentation/datamodels/MappingRulesModel.java
index 0929df2b40..856d2f3c39 100644
--- a/src/org/omegat/core/segmentation/datamodels/MappingRulesModel.java
+++ b/src/org/omegat/core/segmentation/datamodels/MappingRulesModel.java
@@ -118,8 +118,8 @@ public Class<?> getColumnClass(int columnIndex) {
     /** Adds a new empty mapping rule. */
     public int addRow() {
         int rows = srx.getMappingRules().size();
-        srx.getMappingRules().add(
-                new MapRule(OStrings.getString("SEG_NEW_LN_CO"), "LN-CO", new ArrayList<Rule>()));
+        srx.getMappingRules()
+                .add(new MapRule(OStrings.getString("SEG_NEW_LN_CO"), "LN-CO", new ArrayList<Rule>()));
         fireTableRowsInserted(rows, rows);
         return rows;
     }