fix: improve map rule parser

Signed-off-by: Hiroshi Miura <[email protected]>
omegat-org · Oct 31, 2024 · 7fe46d0 · 7fe46d0
1 parent c0defb6
commit 7fe46d0
Show file tree

Hide file tree

Showing 4 changed files with 121 additions and 34 deletions.
diff --git a/src/org/omegat/core/segmentation/LanguageCodes.java b/src/org/omegat/core/segmentation/LanguageCodes.java
@@ -43,7 +43,7 @@ public final class LanguageCodes {
     private LanguageCodes() {
     }
 
-    // Language Codes
+    // Codes of "languagerulename".
     public static final String CATALAN_CODE = "Catalan";
     public static final String CZECH_CODE = "Czech";
     public static final String GERMAN_CODE = "German";
@@ -83,8 +83,25 @@ private LanguageCodes() {
     public static final String F_TEXT_KEY = "CORE_SRX_RULES_FORMATTING_TEXT";
     public static final String F_HTML_KEY = "CORE_SRX_RULES_FORMATTING_HTML";
 
+    private static final String CATALAN_ISO = "CA";
+    private static final String CZECH_ISO = "CS";
+    private static final String GERMAN_ISO = "DE";
+    private static final String ENGLISH_ISO = "EN";
+    private static final String SPANISH_ISO = "ES";
+    private static final String FINNISH_ISO = "FI";
+    private static final String FRENCH_ISO = "FR";
+    private static final String ITALIAN_ISO = "IT";
+    private static final String JAPANESE_ISO = "JA";
+    private static final String DUTCH_ISO = "NL";
+    private static final String POLISH_ISO = "PL";
+    private static final String RUSSIAN_ISO = "RU";
+    private static final String SWEDISH_ISO = "SV";
+    private static final String SLOVAK_ISO = "SK";
+    private static final String CHINESE_ISO = "ZH";
+
     /** A Map from language codes to language keys. */
-    private static Map<String, String> codeKeyHash = new HashMap<>();
+    private static final Map<String, String> codeKeyHash = new HashMap<>();
+    private static final Map<String, String> isoKeyHash = new HashMap<>();
 
     static {
         codeKeyHash.put(CATALAN_CODE, CATALAN_KEY);
@@ -105,6 +122,21 @@ private LanguageCodes() {
         codeKeyHash.put(DEFAULT_CODE, DEFAULT_KEY);
         codeKeyHash.put(F_TEXT_CODE, F_TEXT_KEY);
         codeKeyHash.put(F_HTML_CODE, F_HTML_KEY);
+        isoKeyHash.put(CATALAN_ISO, CATALAN_CODE);
+        isoKeyHash.put(CZECH_ISO, CZECH_CODE);
+        isoKeyHash.put(GERMAN_ISO, GERMAN_CODE);
+        isoKeyHash.put(ENGLISH_ISO, ENGLISH_CODE);
+        isoKeyHash.put(SPANISH_ISO, SPANISH_CODE);
+        isoKeyHash.put(FINNISH_ISO, FINNISH_CODE);
+        isoKeyHash.put(FRENCH_ISO, FRENCH_CODE);
+        isoKeyHash.put(ITALIAN_ISO, ITALIAN_CODE);
+        isoKeyHash.put(JAPANESE_ISO, JAPANESE_CODE);
+        isoKeyHash.put(DUTCH_ISO, DUTCH_CODE);
+        isoKeyHash.put(POLISH_ISO, POLISH_CODE);
+        isoKeyHash.put(RUSSIAN_ISO, RUSSIAN_CODE);
+        isoKeyHash.put(SWEDISH_ISO, SWEDISH_CODE);
+        isoKeyHash.put(SLOVAK_ISO, SLOVAK_CODE);
+        isoKeyHash.put(CHINESE_ISO, CHINESE_CODE);
     }
 
     /**
@@ -126,11 +158,15 @@ public static boolean isLanguageCodeKnown(String code) {
     }
 
     public static String getLanguageCodeByName(String name) {
-        for (Map.Entry<String, String> entry: codeKeyHash.entrySet()) {
+        for (Map.Entry<String, String> entry : codeKeyHash.entrySet()) {
             if (OStrings.getString(entry.getValue()).equals(name)) {
                 return entry.getKey();
             }
         }
         return null;
     }
+
+    public static String getLanguageCodeByISO(String isoCode) {
+        return isoKeyHash.get(isoCode);
+    }
 }
diff --git a/src/org/omegat/core/segmentation/MapRule.java b/src/org/omegat/core/segmentation/MapRule.java
@@ -32,7 +32,6 @@
 import java.util.regex.PatternSyntaxException;
 
 import gen.core.segmentation.Languagemap;
-import org.omegat.util.Log;
 import org.omegat.util.StringUtil;
 
 /**
@@ -51,45 +50,85 @@ public MapRule() {
 
     /** creates an initialized MapRule */
     public MapRule(String language, String pattern, List<Rule> rules) {
-        this.setLanguage(language);
+        this.setLanguage(language, pattern);
         this.setPattern(pattern);
         this.setRules(rules);
     }
 
     /** Language Name */
+    private String languageName;
+
+    /** Language Code */
     private String languageCode;
 
     public MapRule(Languagemap languagemap, List<Rule> rules) {
-        this.setLanguage(languagemap.getLanguagerulename());
-        this.setPattern(languagemap.getLanguagepattern());
+        String pat = languagemap.getLanguagepattern();
+        this.setLanguage(languagemap.getLanguagerulename(), pat);
+        this.setPattern(pat);
         this.setRules(rules);
     }
 
     /** Returns Language Name (to display it in a dialog). */
     public String getLanguageName() {
-        String res = LanguageCodes.getLanguageName(languageCode);
-        return StringUtil.isEmpty(res) ? languageCode : res;
+        return languageName;
     }
 
     /** Sets Language Code */
     public void setLanguage(String code) {
-        if (!LanguageCodes.isLanguageCodeKnown(code)) {
-            String alt = LanguageCodes.getLanguageCodeByName(code);
-            if (alt != null) {
-                languageCode = alt;
-                return;
+        if (LanguageCodes.isLanguageCodeKnown(code)) {
+            languageCode = code;
+            languageName = LanguageCodes.getLanguageName(code);
+        } else {
+            String code1 = LanguageCodes.getLanguageCodeByName(code);
+            if (code1 != null) {
+                languageName = code;
+                languageCode = code1;
+            } else {
+                languageName = code;
+                languageCode = code;
+            }
+        }
+    }
+
+    /** Sets language code from human-readable name */
+    public void setLanguageByName(String name) {
+        if (LanguageCodes.isLanguageCodeKnown(name)) {
+            // Called with an SRX standard language code rather than a name.
+            languageCode = name;
+            languageName = LanguageCodes.getLanguageName(name);
+        } else {
+            String code = LanguageCodes.getLanguageCodeByName(name);
+            if (code != null) {
+                languageName = name;
+                languageCode = code;
             } else {
-                Log.logWarningRB("CORE_SRX_RULES_UNKNOWN_LANGUAGE_CODE", code);
+                languageName = name;
+                languageCode = name;
             }
         }
-        languageCode = code;
     }
 
     /** Returns Language Code for programmatic usage. */
     public String getLanguage() {
         return languageCode;
     }
 
+    private void setLanguage(String languageRuleName, String languagePattern) {
+        if (languagePattern.length() > 3 && languagePattern.endsWith(".*")) {
+            String lang = languagePattern.substring(0, languagePattern.length() - 2);
+            String code = LanguageCodes.getLanguageCodeByISO(lang);
+            if (code != null) {
+                String res = LanguageCodes.getLanguageName(code);
+                languageName = StringUtil.isEmpty(res) ? code : res;
+                languageCode = code;
+                return;
+            }
+        }
+        // It is "Text", "Default", "HTML" or
+        // unknown languagepattern or unknown ISO code.
+        setLanguageByName(languageRuleName);
+    }
+
     /** Pattern for the language/country ISO code (of a form LL-CC). */
     private Pattern pattern;
 
@@ -148,8 +187,7 @@ public boolean equals(Object obj) {
             return false;
         }
         MapRule that = (MapRule) obj;
-        return this.getPattern().equals(that.getPattern())
-                && this.getLanguage().equals(that.getLanguage())
+        return this.getPattern().equals(that.getPattern()) && this.getLanguage().equals(that.getLanguage())
                 && this.getRules().equals(that.getRules());
     }
 
@@ -158,7 +196,9 @@ public int hashCode() {
         return this.getPattern().hashCode() + this.getLanguage().hashCode() + this.getRules().hashCode();
     }
 
-    /** Returns a string representation of the MapRule for debugging purposes. */
+    /**
+     * Returns a string representation of the MapRule for debugging purposes.
+     */
     public String toString() {
         return getLanguage() + " (" + getPattern() + ") " + getRules().toString();
     }

diff --git a/src/org/omegat/core/segmentation/Segmenter.java b/src/org/omegat/core/segmentation/Segmenter.java
@@ -68,7 +68,8 @@ public SRX getSRX() {
      * @param paragraph
      *            the paragraph text
      * @param spaces
-     *            list to store information about spaces between sentences (can be null)
+     *            list to store information about spaces between sentences (can
+     *            be null)
      * @param brules
      *            list to store rules that account to breaks (can be null)
      * @return list of sentences (String objects)
@@ -117,11 +118,13 @@ public List<String> segment(Language lang, String paragraph, List<StringBuilder>
     }
 
     /**
-     * Returns pre-sentences (sentences with spaces between), computed by breaking paragraph into chunks of
-     * text. Also returns the list with "the reasons" why the breaks were made, i.e. the list of break rules
-     * that contributed to each of the breaks made.
+     * Returns pre-sentences (sentences with spaces between), computed by
+     * breaking paragraph into chunks of text. Also returns the list with "the
+     * reasons" why the breaks were made, i.e. the list of break rules that
+     * contributed to each of the breaks made.
      * <p>
-     * If glued back together, these strings form the same paragraph text as this function was fed.
+     * If glued back together, these strings form the same paragraph text as
+     * this function was fed.
      *
      * @param paragraph
      *            the paragraph text
@@ -244,7 +247,8 @@ static class BreakPosition implements Comparable<BreakPosition> {
         }
 
         /**
-         * Other BreakPosition is "equal to" this one iff it has the same position.
+         * Other BreakPosition is "equal to" this one iff it has the same
+         * position.
          */
         public boolean equals(Object obj) {
             if (obj == null) {
@@ -266,10 +270,12 @@ public int hashCode() {
         /**
          * Compares this break position with another.
          *
-         * @return a negative integer if its position is less than the another's, zero if they are equal, or a
-         *         positive integer as its position is greater than the another's.
+         * @return a negative integer if its position is less than the
+         *         other's, zero if they are equal, or a positive integer if
+         *         its position is greater than the other's.
          * @throws ClassCastException
-         *             if the specified object's type prevents it from being compared to this Object.
+         *             if the specified object's type prevents it from being
+         *             compared to this Object.
          */
         public int compareTo(BreakPosition that) {
             return this.position - that.position;
@@ -323,14 +329,15 @@ public String glue(Language sourceLang, Language targetLang, List<String> senten
                     Matcher matcher = LINE_BREAK_OR_TAB_PATTERN.matcher(sp.toString());
                     if (matcher.find()) {
                         // If we found line break or tab, trim left spaces.
-                        // Right spaces are left for indentation of the next line.
+                        // Right spaces are left for indentation of the next
+                        // line.
                         String leftSpaces = matcher.group(1);
                         if (!leftSpaces.isEmpty()) {
                             sp.replace(0, leftSpaces.length(), "");
                         }
                     } else if ((lastChar != '.')
                             && (!PatternConsts.SPACY_REGEX.matcher(rule.getBeforebreak()).matches()
-                            || !PatternConsts.SPACY_REGEX.matcher(rule.getAfterbreak()).matches())) {
+                                    || !PatternConsts.SPACY_REGEX.matcher(rule.getAfterbreak()).matches())) {
                         sp.setLength(0);
                     }
                 }
@@ -347,10 +354,13 @@ public String glue(Language sourceLang, Language targetLang, List<String> senten
      * Segment source and target entries from TMX when counts are equals.
      */
     public void segmentEntries(boolean needResegment, Language sourceLang, String sourceEntry,
-            Language targetLang, String targetEntry, List<String> sourceSegments, List<String> targetSegments) {
+            Language targetLang, String targetEntry, List<String> sourceSegments,
+            List<String> targetSegments) {
         if (needResegment) {
             List<String> srcSegments = segment(sourceLang, sourceEntry, null, null);
-            if (targetEntry != null) { // There is no translation for this entry, because for instance it's a note
+            if (targetEntry != null) { // There is no translation for this
+                                       // entry, because for instance it's a
+                                       // note
                                        // on an untranslated entry
                 List<String> tarSegments = segment(targetLang, targetEntry, null, null);
 
@@ -361,7 +371,8 @@ public void segmentEntries(boolean needResegment, Language sourceLang, String so
                 }
             }
         }
-        // No need to resegment, or segments counts not equals, or no translation
+        // No need to resegment, or segment counts are not equal, or no
+        // translation
         sourceSegments.add(sourceEntry);
         targetSegments.add(targetEntry);
 

diff --git a/src/org/omegat/core/segmentation/datamodels/MappingRulesModel.java b/src/org/omegat/core/segmentation/datamodels/MappingRulesModel.java
@@ -96,7 +96,7 @@ public void setValueAt(Object aValue, int rowIndex, int columnIndex) {
             if (code != null) {
                 maprule.setLanguage(code);
             } else {
-                maprule.setLanguage(target);
+                maprule.setLanguageByName(target);
             }
             break;
         case 1: