diff --git a/.gitattributes b/.gitattributes index 83b2e161..d538a26a 100644 --- a/.gitattributes +++ b/.gitattributes @@ -336,7 +336,6 @@ DotifyTranslator/build-properties.xml -text DotifyTranslator/build.xml -text DotifyTranslator/lib/icu4j-3_8.jar -text DotifyTranslator/lib/junit-4.7.jar -text -DotifyTranslator/lib/org.daisy.util.jar -text DotifyTranslator/lib/texhyphj.jar -text DotifyTranslator/logging.properties -text DotifyTranslator/src/META-INF/services/org.daisy.dotify.hyphenator.HyphenatorFactory -text @@ -515,6 +514,7 @@ DotifyTranslator/src/org/daisy/dotify/text/FilterFactory.java -text svneol=unset DotifyTranslator/src/org/daisy/dotify/text/FilterLocale.java -text svneol=unset#text/plain DotifyTranslator/src/org/daisy/dotify/text/IdentityFilter.java -text DotifyTranslator/src/org/daisy/dotify/text/RegexFilter.java -text svneol=unset#text/plain +DotifyTranslator/src/org/daisy/dotify/text/SimpleCharReplacer.java -text DotifyTranslator/src/org/daisy/dotify/text/SplitResult.java -text svneol=unset#text/plain DotifyTranslator/src/org/daisy/dotify/text/StringFilter.java -text svneol=unset#text/plain DotifyTranslator/src/org/daisy/dotify/text/StringSplitter.java -text svneol=unset#text/plain diff --git a/DotifyTranslator/.classpath b/DotifyTranslator/.classpath index d7657b14..de971715 100644 --- a/DotifyTranslator/.classpath +++ b/DotifyTranslator/.classpath @@ -5,6 +5,6 @@ - + diff --git a/DotifyTranslator/build.xml b/DotifyTranslator/build.xml index ad8d7dfd..692d5b58 100644 --- a/DotifyTranslator/build.xml +++ b/DotifyTranslator/build.xml @@ -112,7 +112,7 @@ - + diff --git a/DotifyTranslator/lib/org.daisy.util.jar b/DotifyTranslator/lib/org.daisy.util.jar deleted file mode 100644 index b1f04e80..00000000 Binary files a/DotifyTranslator/lib/org.daisy.util.jar and /dev/null differ diff --git a/DotifyTranslator/src/org/daisy/dotify/text/CharFilter.java b/DotifyTranslator/src/org/daisy/dotify/text/CharFilter.java index 8db972c9..f5a5fbd8 100644 --- a/DotifyTranslator/src/org/daisy/dotify/text/CharFilter.java +++ b/DotifyTranslator/src/org/daisy/dotify/text/CharFilter.java @@ -2,8 +2,6 @@ import java.net.URL; -import org.daisy.util.i18n.UCharReplacer; - /** * Implements StringFilter using UCharReplacer. * @@ -12,14 +10,14 @@ * @since 1.0 */ public class CharFilter implements StringFilter { - private UCharReplacer ucr; + private final SimpleCharReplacer ucr; /** * Create a new CharFilter * @param table relative path to replacement table, see UCharReplacement for more information */ public CharFilter(URL table) { - this.ucr = new UCharReplacer(); + this.ucr = new SimpleCharReplacer(); try { this.ucr.addSubstitutionTable(table); } catch (Exception e) { @@ -27,6 +25,10 @@ public CharFilter(URL table) { } } + public CharFilter(SimpleCharReplacer replacer) { + this.ucr = replacer; + } + public String filter(String str) { return ucr.replace(str).toString(); } diff --git a/DotifyTranslator/src/org/daisy/dotify/text/SimpleCharReplacer.java b/DotifyTranslator/src/org/daisy/dotify/text/SimpleCharReplacer.java new file mode 100644 index 00000000..ac06cd49 --- /dev/null +++ b/DotifyTranslator/src/org/daisy/dotify/text/SimpleCharReplacer.java @@ -0,0 +1,163 @@ +package org.daisy.dotify.text; + +import java.io.IOException; +import java.net.URL; +import java.util.HashMap; +import java.util.Iterator; +import java.util.Map; +import java.util.Properties; +import java.util.Set; + +import com.ibm.icu.text.Normalizer; +import com.ibm.icu.text.UCharacterIterator; + +/** + *

+ * Provides substitution for unicode characters with replacement strings. + *

+ * + *

+ * This is a much simplified version of UCharReplacer by Markus Gylling from the + * org.daisy.util package. + *

+ * + *

+ * The use of this class may result in a change in unicode character + * composition between input and output. If you need a certain normalization + * form, normalize after the use of this class. + *

+ * + *

+ * Usage example: + *

+ *
+ * SimpleCharReplacer ucr = new SimpleCharReplacer();
+ * ucr.addSubstitutionTable(fileURL);
+ * ucr.addSubstitutionTable(fileURL2);
+ * String ret = ucr.replace(input);
+ * 
+ * + *

+ * The translation table file is using the same xml format as that of + * java.util.Properties [1][2], using the HEX representation (without the + * characteristic 0x-prefix!) of a unicode character as the key + * attribute and the replacement string as value of the entry element. + *

+ * + *

+ * If the key attribute contains exactly one unicode codepoint (one + * character) it will be treated literally. It will not be interpreted as a HEX + * representation of another character, even if theoretically possible. E.g. if + * the key is "a", it will be treated as 0x0061 rather than as 0x000a + *

+ * + *

+ * Note - there is a significant difference between a unicode codepoint (32 bit + * int) and a UTF16 codeunit (=char) - a codepoint consists of one or two + * codeunits. + *

+ *

+ * To make sure an int represents a codepoint and not a codeunit, use for + * example com.ibm.icu.text.Normalizer to NFC compose, followed by + * com.ibm.icu.text.UCharacterIterator to retrieve possibly non-BMP + * codepoints from a string. + *

+ * + * @see [1] http://java.sun.com/j2se/1.5.0/docs/api/java/util/Properties.html + * @see [2] http://java.sun.com/dtd/properties.dtd + * + * @author Joel HÃ¥kansson + * @author Markus Gylling (UCharReplacer) + */ +public class SimpleCharReplacer { + private Map mSubstitutionTable = null; + + public SimpleCharReplacer() { + mSubstitutionTable = new HashMap(); + } + + public void addSubstitutionTable(URL table) throws IOException { + try { + loadTable(table); + } catch (Exception e) { + throw new IOException(e.getMessage()); + } + } + + public CharSequence replace(String input) { + int codePoint; + + StringBuilder sb = new StringBuilder(input.length()); + + // icu4j version + // normalize to eliminate any ambiguities vis-a-vis the user tables + Normalizer.normalize(input, Normalizer.NFC); + + // Java 1.6 SDK version + // Normalizer.normalize(input, Normalizer.Form.NFC); + + // icu4j version + // iterate over each code point in the input string + UCharacterIterator uci = UCharacterIterator.getInstance(input.toString()); + while ((codePoint = uci.nextCodePoint()) != UCharacterIterator.DONE) { + CharSequence substitution = substitute(codePoint); + if (null != substitution && substitution.length() > 0) { + // a replacement occurred + sb.append(substitution); + } else { + // a replacement didn't occur + sb.appendCodePoint(codePoint); + } + } + + /* + * Java 1.5 SDK version + * // iterate over each code point in the input string + * final int length = input.length(); + * for (int offset = 0; offset < length;) { + * codePoint = input.codePointAt(offset); + * CharSequence substitution = substitute(codePoint); + * if (null != substitution && substitution.length() > 0) { + * // a replacement occurred + * sb.append(substitution); + * } else { + * // a replacement didn't occur + * sb.appendCodePoint(codePoint); + * } + * offset += Character.charCount(codePoint); + * } + */ + + return sb; + } + + /** + * Loads a table using the Properties class. + */ + private void loadTable(URL tableURL) throws IOException { + Properties props = new Properties(); + props.loadFromXML(tableURL.openStream()); + Set keys = props.keySet(); + for (Iterator it = keys.iterator(); it.hasNext();) { + String key = (String) it.next(); + if (key.codePointCount(0, key.length()) == 1) { + mSubstitutionTable.put(key.codePointAt(0), props.getProperty(key)); + } else { + try { + mSubstitutionTable.put(Integer.decode("0x" + key), props.getProperty(key)); + } catch (NumberFormatException e) { + System.err.println("error in translation table " + tableURL.toString() + ": attribute key=\"" + key + "\" is not a hex number."); + } + } + } + } + + /** + * @return a substite string if available in tables, or null if not + * available + */ + private String substitute(int codePoint) { + return mSubstitutionTable.get(Integer.valueOf(codePoint)); + } + +}