From a69a4187fb30877238049d81a6fe70b69e471de8 Mon Sep 17 00:00:00 2001 From: Hiroshi Miura Date: Thu, 31 Oct 2024 20:51:25 +0900 Subject: [PATCH] fix: workaournd for an unknown language code - rulename for text in Germany was changed in v5.5 - when reading "segmentation.conf" generated before v5.4, migration is failed. - Add workaround to detect ancient rulename Signed-off-by: Hiroshi Miura --- .../core/segmentation/LanguageCodes.java | 114 ++++++++++++------ src/org/omegat/core/segmentation/MapRule.java | 110 +++++++++++++---- src/org/omegat/core/segmentation/SRX.java | 4 +- .../omegat/core/segmentation/Segmenter.java | 39 +++--- .../datamodels/MappingRulesModel.java | 4 +- 5 files changed, 190 insertions(+), 81 deletions(-) diff --git a/src/org/omegat/core/segmentation/LanguageCodes.java b/src/org/omegat/core/segmentation/LanguageCodes.java index 20f126bac9..8deaff107e 100644 --- a/src/org/omegat/core/segmentation/LanguageCodes.java +++ b/src/org/omegat/core/segmentation/LanguageCodes.java @@ -43,48 +43,65 @@ public final class LanguageCodes { private LanguageCodes() { } - // Language Codes - public static final String CATALAN_CODE = "Catalan"; - public static final String CZECH_CODE = "Czech"; - public static final String GERMAN_CODE = "German"; - public static final String ENGLISH_CODE = "English"; - public static final String SPANISH_CODE = "Spanish"; - public static final String FINNISH_CODE = "Finnish"; - public static final String FRENCH_CODE = "French"; - public static final String ITALIAN_CODE = "Italian"; - public static final String JAPANESE_CODE = "Japanese"; - public static final String DUTCH_CODE = "Dutch"; - public static final String POLISH_CODE = "Polish"; - public static final String RUSSIAN_CODE = "Russian"; - public static final String SWEDISH_CODE = "Swedish"; - public static final String SLOVAK_CODE = "Slovak"; - public static final String CHINESE_CODE = "Chinese"; - public static final String DEFAULT_CODE = "Default"; - public static final String F_TEXT_CODE = "Text"; - public static final String F_HTML_CODE = "HTML"; + // Codes of "languagerulename". + static final String CATALAN_CODE = "Catalan"; + static final String CZECH_CODE = "Czech"; + static final String GERMAN_CODE = "German"; + static final String ENGLISH_CODE = "English"; + static final String SPANISH_CODE = "Spanish"; + static final String FINNISH_CODE = "Finnish"; + static final String FRENCH_CODE = "French"; + static final String ITALIAN_CODE = "Italian"; + static final String JAPANESE_CODE = "Japanese"; + static final String DUTCH_CODE = "Dutch"; + static final String POLISH_CODE = "Polish"; + static final String RUSSIAN_CODE = "Russian"; + static final String SWEDISH_CODE = "Swedish"; + static final String SLOVAK_CODE = "Slovak"; + static final String CHINESE_CODE = "Chinese"; + static final String DEFAULT_CODE = "Default"; + static final String F_TEXT_CODE = "Text"; + static final String F_HTML_CODE = "HTML"; // Language Keys from Resource Bundle - public static final String CATALAN_KEY = "CORE_SRX_RULES_LANG_CATALAN"; - public static final String CZECH_KEY = "CORE_SRX_RULES_LANG_CZECH"; - public static final String GERMAN_KEY = "CORE_SRX_RULES_LANG_GERMAN"; - public static final String ENGLISH_KEY = "CORE_SRX_RULES_LANG_ENGLISH"; - public static final String SPANISH_KEY = "CORE_SRX_RULES_LANG_SPANISH"; - public static final String FINNISH_KEY = "CORE_SRX_RULES_LANG_FINNISH"; - public static final String FRENCH_KEY = "CORE_SRX_RULES_LANG_FRENCH"; - public static final String ITALIAN_KEY = "CORE_SRX_RULES_LANG_ITALIAN"; - public static final String JAPANESE_KEY = "CORE_SRX_RULES_LANG_JAPANESE"; - public static final String DUTCH_KEY = "CORE_SRX_RULES_LANG_DUTCH"; - public static final String POLISH_KEY = "CORE_SRX_RULES_LANG_POLISH"; - public static final String RUSSIAN_KEY = "CORE_SRX_RULES_LANG_RUSSIAN"; - public static final String SWEDISH_KEY = "CORE_SRX_RULES_LANG_SWEDISH"; - public static final String SLOVAK_KEY = "CORE_SRX_RULES_LANG_SLOVAK"; - public static final String CHINESE_KEY = "CORE_SRX_RULES_LANG_CHINESE"; - public static final String DEFAULT_KEY = "CORE_SRX_RULES_LANG_DEFAULT"; - public static final String F_TEXT_KEY = "CORE_SRX_RULES_FORMATTING_TEXT"; - public static final String F_HTML_KEY = "CORE_SRX_RULES_FORMATTING_HTML"; + static final String CATALAN_KEY = "CORE_SRX_RULES_LANG_CATALAN"; + static final String CZECH_KEY = "CORE_SRX_RULES_LANG_CZECH"; + static final String GERMAN_KEY = "CORE_SRX_RULES_LANG_GERMAN"; + static final String ENGLISH_KEY = "CORE_SRX_RULES_LANG_ENGLISH"; + static final String SPANISH_KEY = "CORE_SRX_RULES_LANG_SPANISH"; + static final String FINNISH_KEY = "CORE_SRX_RULES_LANG_FINNISH"; + static final String FRENCH_KEY = "CORE_SRX_RULES_LANG_FRENCH"; + static final String ITALIAN_KEY = "CORE_SRX_RULES_LANG_ITALIAN"; + static final String JAPANESE_KEY = "CORE_SRX_RULES_LANG_JAPANESE"; + static final String DUTCH_KEY = "CORE_SRX_RULES_LANG_DUTCH"; + static final String POLISH_KEY = "CORE_SRX_RULES_LANG_POLISH"; + static final String RUSSIAN_KEY = "CORE_SRX_RULES_LANG_RUSSIAN"; + static final String SWEDISH_KEY = "CORE_SRX_RULES_LANG_SWEDISH"; + static final String SLOVAK_KEY = "CORE_SRX_RULES_LANG_SLOVAK"; + static final String CHINESE_KEY = "CORE_SRX_RULES_LANG_CHINESE"; + static final String DEFAULT_KEY = "CORE_SRX_RULES_LANG_DEFAULT"; + static final String F_TEXT_KEY = "CORE_SRX_RULES_FORMATTING_TEXT"; + static final String F_HTML_KEY = "CORE_SRX_RULES_FORMATTING_HTML"; + + private static final String CATALAN_PATTERN = "CA.*"; + private static final String CZECH_PATTERN = "CS.*"; + private static final String GERMAN_PATTERN = "DE.*"; + private static final String ENGLISH_PATTERN = "EN.*"; + private static final String SPANISH_PATTERN = "ES.*"; + private static final String FINNISH_PATTERN = "FI.*"; + private static final String FRENCH_PATTERN = "FR.*"; + private static final String ITALIAN_PATTERN = "IT.*"; + private static final String JAPANESE_PATTERN = "JA.*"; + private static final String DUTCH_PATTERN = "NL.*"; + private static final String POLISH_PATTERN = "PL.*"; + private static final String RUSSIAN_PATTERN = "RU.*"; + private static final String SWEDISH_PATTERN = "SV.*"; + private static final String SLOVAK_PATTERN = "SK.*"; + private static final String CHINESE_PATTERN = "ZH.*"; /** A Map from language codes to language keys. */ - private static Map codeKeyHash = new HashMap<>(); + private static final Map codeKeyHash = new HashMap<>(); + private static final Map patternHash = new HashMap<>(); static { codeKeyHash.put(CATALAN_CODE, CATALAN_KEY); @@ -105,6 +122,21 @@ private LanguageCodes() { codeKeyHash.put(DEFAULT_CODE, DEFAULT_KEY); codeKeyHash.put(F_TEXT_CODE, F_TEXT_KEY); codeKeyHash.put(F_HTML_CODE, F_HTML_KEY); + patternHash.put(CATALAN_PATTERN, CATALAN_CODE); + patternHash.put(CZECH_PATTERN, CZECH_CODE); + patternHash.put(GERMAN_PATTERN, GERMAN_CODE); + patternHash.put(ENGLISH_PATTERN, ENGLISH_CODE); + patternHash.put(SPANISH_PATTERN, SPANISH_CODE); + patternHash.put(FINNISH_PATTERN, FINNISH_CODE); + patternHash.put(FRENCH_PATTERN, FRENCH_CODE); + patternHash.put(ITALIAN_PATTERN, ITALIAN_CODE); + patternHash.put(JAPANESE_PATTERN, JAPANESE_CODE); + patternHash.put(DUTCH_PATTERN, DUTCH_CODE); + patternHash.put(POLISH_PATTERN, POLISH_CODE); + patternHash.put(RUSSIAN_PATTERN, RUSSIAN_CODE); + patternHash.put(SWEDISH_PATTERN, SWEDISH_CODE); + patternHash.put(SLOVAK_PATTERN, SLOVAK_CODE); + patternHash.put(CHINESE_PATTERN, CHINESE_CODE); } /** @@ -126,11 +158,15 @@ public static boolean isLanguageCodeKnown(String code) { } public static String getLanguageCodeByName(String name) { - for (Map.Entry entry: codeKeyHash.entrySet()) { + for (Map.Entry entry : codeKeyHash.entrySet()) { if (OStrings.getString(entry.getValue()).equals(name)) { return entry.getKey(); } } return null; } + + public static String getLanguageCodeByPattern(String pattern) { + return patternHash.get(pattern); + } } diff --git a/src/org/omegat/core/segmentation/MapRule.java b/src/org/omegat/core/segmentation/MapRule.java index d3dbfb4cb2..63053d32b1 100644 --- a/src/org/omegat/core/segmentation/MapRule.java +++ b/src/org/omegat/core/segmentation/MapRule.java @@ -4,6 +4,7 @@ glossaries, and translation leveraging into updated projects. Copyright (C) 2000-2006 Keith Godfrey and Maxym Mykhalchuk + 2024 Hiroshi Miura Home page: https://www.omegat.org/ Support center: https://omegat.org/support @@ -31,7 +32,8 @@ import java.util.regex.Pattern; import java.util.regex.PatternSyntaxException; -import gen.core.segmentation.Languagemap; +import tokyo.northside.logging.ILogger; + import org.omegat.util.Log; import org.omegat.util.StringUtil; @@ -44,56 +46,99 @@ public class MapRule implements Serializable { private static final long serialVersionUID = -5868132953113679291L; + private static final ILogger LOGGER = Log.getLogger(MapRule.class); + + /** Language Name */ + private String languageCode; /** creates a new empty MapRule */ public MapRule() { } - /** creates an initialized MapRule */ + /** + * Create initialized MapRule object. + * + * @param language + * localized language name (from segmentation.conf), or language + * code (from SRX) + * @param pattern + * language pattern such as "EN.*" or ".*" + * @param rules + * segmentation rules. + */ public MapRule(String language, String pattern, List rules) { - this.setLanguage(language); + String code = LanguageCodes.getLanguageCodeByPattern(pattern); + this.setLanguage(code != null ? code : language); this.setPattern(pattern); this.setRules(rules); } - /** Language Name */ - private String languageCode; - - public MapRule(Languagemap languagemap, List rules) { - this.setLanguage(languagemap.getLanguagerulename()); - this.setPattern(languagemap.getLanguagepattern()); - this.setRules(rules); - } - /** Returns Language Name (to display it in a dialog). */ public String getLanguageName() { + /* + * When there has already migrated a SRX file store, languageCode fields + * has a name defined as "LanguageCodes.*_CODE". Otherwise, MapRule + * object is created from "segmentation.conf" java beans file, so it is + * localized name of language. We first assume the latter. If res is + * empty, the object is created from a SRX file, then return + * languageCode itself. + */ String res = LanguageCodes.getLanguageName(languageCode); return StringUtil.isEmpty(res) ? languageCode : res; } /** Sets Language Code */ public void setLanguage(String code) { + /* + * setLanguage method is called from XmlDecoder of a Java beans library + * when migrating from "segmentation.conf" beans file. An argument will + * be localized name of language. When the object is created from a + * standard SRX file, the argument will be standard language name, + * defined as "LanguageCodes.*_CODE". The behavior was changed in OmegaT + * 6.0.0 release in 2023. We first detect whether the argument is + * standard code. If the code is not a standard code, then try to find a + * localized name of the language name. When you read the comment long + * after OmegaT 6.x, and you believe all the OmegaT 4.x and 5.x users + * are migrated to OmegaT 6.x or later, you may want to remove the chunk + * below. + */ if (!LanguageCodes.isLanguageCodeKnown(code)) { String alt = LanguageCodes.getLanguageCodeByName(code); if (alt != null) { languageCode = alt; - return; } else { - Log.logWarningRB("CORE_SRX_RULES_UNKNOWN_LANGUAGE_CODE", code); + // migration heuristics: Germany translation changed in v5.5. + // See: + // https://github.com/omegat-org/omegat/pull/1158#issuecomment-2448788253 + if (code != null && code.contains("Textdateien")) { + languageCode = LanguageCodes.F_TEXT_CODE; + } else { + LOGGER.atDebug().setMessageRB("CORE_SRX_RULES_UNKNOWN_LANGUAGE_CODE").addArgument(code) + .log(); + languageCode = code; + } } + return; } languageCode = code; } - /** Returns Language Code for programmatic usage. */ + /** + * Returns Language Code for programmatic usage. + */ public String getLanguage() { return languageCode; } - /** Pattern for the language/country ISO code (of a form LL-CC). */ + /* + * Pattern for the language/country ISO code (of a form LL-CC). It is like + * "EN.*". + */ private Pattern pattern; - /** Returns Pattern for the language/country ISO code (of a form LL-CC). */ + /** + * Returns Pattern for the language/country ISO code (of a form LL-CC). + */ public String getPattern() { if (pattern != null) { return pattern.pattern(); @@ -110,14 +155,24 @@ public Pattern getCompiledPattern() { return pattern; } - /** Sets Pattern for the language/country ISO code (of a form LL-CC). */ + /** + * Sets Pattern for the language/country ISO code (of a form LL-CC). + * + * @param pattern + * pattern string such as "EN.*" + */ public void setPattern(String pattern) throws PatternSyntaxException { // Fix for bug [1643500] - // language code in segmentation rule is case sensitive + // language code in segmentation rule is a case-sensitive // Correction contributed by Tiago Saboga. this.pattern = Pattern.compile(pattern, Pattern.CASE_INSENSITIVE); } + /** + * Deep copy of the object, mandatory for java beans. + * + * @return new MapRule object + */ public MapRule copy() { MapRule result = new MapRule(); result.languageCode = languageCode; @@ -142,23 +197,28 @@ public void setRules(List rules) { this.rules = rules; } - /** Indicates whether some other MapRule is "equal to" this one. */ + /** + * Indicates whether some other MapRule is "equal to" this one. + */ public boolean equals(Object obj) { - if (obj == null || !(obj instanceof MapRule)) { + if (!(obj instanceof MapRule)) { return false; } MapRule that = (MapRule) obj; - return this.getPattern().equals(that.getPattern()) - && this.getLanguage().equals(that.getLanguage()) + return this.getPattern().equals(that.getPattern()) && this.getLanguage().equals(that.getLanguage()) && this.getRules().equals(that.getRules()); } - /** Returns a hash code value for the object. */ + /** + * Returns a hash code value for the object. + */ public int hashCode() { return this.getPattern().hashCode() + this.getLanguage().hashCode() + this.getRules().hashCode(); } - /** Returns a string representation of the MapRule for debugging purposes. */ + /** + * Returns a string representation of the MapRule for debugging purposes. + */ public String toString() { return getLanguage() + " (" + getPattern() + ") " + getRules().toString(); } diff --git a/src/org/omegat/core/segmentation/SRX.java b/src/org/omegat/core/segmentation/SRX.java index 96cb6950ea..06cbe6b672 100644 --- a/src/org/omegat/core/segmentation/SRX.java +++ b/src/org/omegat/core/segmentation/SRX.java @@ -287,7 +287,9 @@ private static SRX loadSrxInputStream(InputStream io) throws IOException { res.setCascade(!"no".equalsIgnoreCase(srx.getHeader().getCascade())); res.setVersion(srx.getVersion()); res.setMappingRules(srx.getBody().getMaprules().getLanguagemap().stream() - .map(s -> new MapRule(s, mapping.get(s.getLanguagerulename()))).collect(Collectors.toList())); + .map(languagemap -> new MapRule(languagemap.getLanguagerulename(), + languagemap.getLanguagepattern(), mapping.get(languagemap.getLanguagerulename()))) + .collect(Collectors.toList())); return res; } diff --git a/src/org/omegat/core/segmentation/Segmenter.java b/src/org/omegat/core/segmentation/Segmenter.java index 8b1c774133..e587e3766c 100644 --- a/src/org/omegat/core/segmentation/Segmenter.java +++ b/src/org/omegat/core/segmentation/Segmenter.java @@ -68,7 +68,8 @@ public SRX getSRX() { * @param paragraph * the paragraph text * @param spaces - * list to store information about spaces between sentences (can be null) + * list to store information about spaces between sentences (can + * be null) * @param brules * list to store rules that account to breaks (can be null) * @return list of sentences (String objects) @@ -117,11 +118,13 @@ public List segment(Language lang, String paragraph, List } /** - * Returns pre-sentences (sentences with spaces between), computed by breaking paragraph into chunks of - * text. Also returns the list with "the reasons" why the breaks were made, i.e. the list of break rules - * that contributed to each of the breaks made. + * Returns pre-sentences (sentences with spaces between), computed by + * breaking paragraph into chunks of text. Also returns the list with "the + * reasons" why the breaks were made, i.e. the list of break rules that + * contributed to each of the breaks made. *

- * If glued back together, these strings form the same paragraph text as this function was fed. + * If glued back together, these strings form the same paragraph text as + * this function was fed. * * @param paragraph * the paragraph text @@ -244,7 +247,8 @@ static class BreakPosition implements Comparable { } /** - * Other BreakPosition is "equal to" this one iff it has the same position. + * Other BreakPosition is "equal to" this one iff it has the same + * position. */ public boolean equals(Object obj) { if (obj == null) { @@ -266,10 +270,12 @@ public int hashCode() { /** * Compares this break position with another. * - * @return a negative integer if its position is less than the another's, zero if they are equal, or a - * positive integer as its position is greater than the another's. + * @return a negative integer if its position is less than the + * another's, zero if they are equal, or a positive integer as + * its position is greater than the another's. * @throws ClassCastException - * if the specified object's type prevents it from being compared to this Object. + * if the specified object's type prevents it from being + * compared to this Object. */ public int compareTo(BreakPosition that) { return this.position - that.position; @@ -323,14 +329,15 @@ public String glue(Language sourceLang, Language targetLang, List senten Matcher matcher = LINE_BREAK_OR_TAB_PATTERN.matcher(sp.toString()); if (matcher.find()) { // If we found line break or tab, trim left spaces. - // Right spaces are left for indentation of the next line. + // Right spaces are left for indentation of the next + // line. String leftSpaces = matcher.group(1); if (!leftSpaces.isEmpty()) { sp.replace(0, leftSpaces.length(), ""); } } else if ((lastChar != '.') && (!PatternConsts.SPACY_REGEX.matcher(rule.getBeforebreak()).matches() - || !PatternConsts.SPACY_REGEX.matcher(rule.getAfterbreak()).matches())) { + || !PatternConsts.SPACY_REGEX.matcher(rule.getAfterbreak()).matches())) { sp.setLength(0); } } @@ -347,10 +354,13 @@ public String glue(Language sourceLang, Language targetLang, List senten * Segment source and target entries from TMX when counts are equals. */ public void segmentEntries(boolean needResegment, Language sourceLang, String sourceEntry, - Language targetLang, String targetEntry, List sourceSegments, List targetSegments) { + Language targetLang, String targetEntry, List sourceSegments, + List targetSegments) { if (needResegment) { List srcSegments = segment(sourceLang, sourceEntry, null, null); - if (targetEntry != null) { // There is no translation for this entry, because for instance it's a note + if (targetEntry != null) { // There is no translation for this + // entry, because for instance it's a + // note // on an untranslated entry List tarSegments = segment(targetLang, targetEntry, null, null); @@ -361,7 +371,8 @@ public void segmentEntries(boolean needResegment, Language sourceLang, String so } } } - // No need to resegment, or segments counts not equals, or no translation + // No need to resegment, or segments counts not equals, or no + // translation sourceSegments.add(sourceEntry); targetSegments.add(targetEntry); diff --git a/src/org/omegat/core/segmentation/datamodels/MappingRulesModel.java b/src/org/omegat/core/segmentation/datamodels/MappingRulesModel.java index 0929df2b40..856d2f3c39 100644 --- a/src/org/omegat/core/segmentation/datamodels/MappingRulesModel.java +++ b/src/org/omegat/core/segmentation/datamodels/MappingRulesModel.java @@ -118,8 +118,8 @@ public Class getColumnClass(int columnIndex) { /** Adds a new empty mapping rule. */ public int addRow() { int rows = srx.getMappingRules().size(); - srx.getMappingRules().add( - new MapRule(OStrings.getString("SEG_NEW_LN_CO"), "LN-CO", new ArrayList())); + srx.getMappingRules() + .add(new MapRule(OStrings.getString("SEG_NEW_LN_CO"), "LN-CO", new ArrayList())); fireTableRowsInserted(rows, rows); return rows; }