[script.module.unidecode] 1.3.6 (#2494)

xbmc · Aug 18, 2023 · cce14b0 · cce14b0
1 parent d332aa9
commit cce14b0
Show file tree

Hide file tree

Showing 135 changed files with 5,719 additions and 5,484 deletions.
diff --git a/script.module.unidecode/addon.xml b/script.module.unidecode/addon.xml
@@ -1,19 +1,18 @@
 <?xml version="1.0" encoding="UTF-8" standalone="yes"?>
-<addon id="script.module.unidecode" name="unidecode" version="1.1.1+matrix.2" provider-name="Tomaz Solc ([email protected])">
-	<requires>
-		<import addon="xbmc.python" version="3.0.0"/>
-	</requires>
-	<extension point="xbmc.python.module" library="lib" />
-	<extension point="xbmc.addon.metadata">
-		<platform>all</platform>
-		<summary lang="en_GB">ASCII transliterations of Unicode text by Sean M. Burke and Tomaz Solc</summary>
-		<description lang="en_GB">ASCII transliterations of Unicode text by Sean M. Burke and Tomaz Solc</description>
-		<disclaimer lang="en_GB">Code taken from https://pypi.org/project/Unidecode</disclaimer>
-		<license>GPL-2.0-or-later</license>
-		<website>https://pypi.org/project/Unidecode</website>
-		<source>https://pypi.org/project/Unidecode</source>
-		<assets>
-			<icon>icon.png</icon>
-		</assets>
-	</extension>
+<addon id="script.module.unidecode" name="Unidecode" version="1.3.6" provider-name="Tomaz Solc">
+  <requires>
+    <import addon="xbmc.python" version="3.0.0"/>
+  </requires>
+  <extension point="xbmc.python.module" library="lib" />
+  <extension point="xbmc.addon.metadata">
+    <summary lang="en_GB">ASCII transliterations of Unicode text</summary>
+    <description lang="en_GB">Unidecode, lossy ASCII transliterations of Unicode text</description>
+    <license>GPL-2.0-or-later</license>
+    <platform>all</platform>
+    <website>https://pypi.org/project/Unidecode</website>
+    <source>https://pypi.org/project/Unidecode</source>
+    <assets>
+      <icon>resources/icon.png</icon>
+    </assets>
+  </extension>
 </addon>
diff --git a/script.module.unidecode/lib/unidecode/__init__.py b/script.module.unidecode/lib/unidecode/__init__.py
@@ -3,101 +3,136 @@
 """Transliterate Unicode text into plain 7-bit ASCII.
 
 Example usage:
+
 >>> from unidecode import unidecode
->>> unidecode(u"\u5317\u4EB0")
+>>> unidecode("\u5317\u4EB0")
 "Bei Jing "
 
 The transliteration uses a straightforward map, and doesn't have alternatives
 for the same character based on language, position, or anything else.
 
-In Python 3, a standard string object will be returned. If you need bytes, use:
+A standard string object will be returned. If you need bytes, use:
+
 >>> unidecode("Κνωσός").encode("ascii")
 b'Knosos'
 """
 import warnings
-from sys import version_info
+from typing import Dict, Optional, Sequence
 
-Cache = {}
+Cache = {} # type: Dict[int, Optional[Sequence[Optional[str]]]]
 
+class UnidecodeError(ValueError):
+    def __init__(self, message: str, index: Optional[int] = None) -> None:
+        """Raised for Unidecode-related errors.
 
-def _warn_if_not_unicode(string):
-    if version_info[0] < 3 and not isinstance(string, unicode):
-        warnings.warn(  "Argument %r is not an unicode object. "
-                        "Passing an encoded string will likely have "
-                        "unexpected results." % (type(string),),
-                        RuntimeWarning, 2)
+        The index attribute contains the index of the character that caused
+        the error.
+        """
+        super(UnidecodeError, self).__init__(message)
+        self.index = index
 
 
-def unidecode_expect_ascii(string):
+def unidecode_expect_ascii(string: str, errors: str = 'ignore', replace_str: str = '?') -> str:
     """Transliterate an Unicode object into an ASCII string
 
-    >>> unidecode(u"\u5317\u4EB0")
+    >>> unidecode("\u5317\u4EB0")
     "Bei Jing "
 
     This function first tries to convert the string using ASCII codec.
     If it fails (because of non-ASCII characters), it falls back to
     transliteration using the character tables.
 
     This is approx. five times faster if the string only contains ASCII
-    characters, but slightly slower than using unidecode directly if non-ASCII
-    chars are present.
+    characters, but slightly slower than unidecode_expect_nonascii if
+    non-ASCII characters are present.
+
+    errors specifies what to do with characters that have not been
+    found in replacement tables. The default is 'ignore' which ignores
+    the character. 'strict' raises a UnidecodeError. 'replace'
+    substitutes the character with replace_str (default is '?').
+    'preserve' keeps the original character.
+
+    Note that if 'preserve' is used the returned string might not be
+    ASCII!
     """
 
-    _warn_if_not_unicode(string)
     try:
         bytestring = string.encode('ASCII')
     except UnicodeEncodeError:
-        return _unidecode(string)
-    if version_info[0] >= 3:
-        return string
+        pass
     else:
-        return bytestring
+        return string
 
-def unidecode_expect_nonascii(string):
+    return _unidecode(string, errors, replace_str)
+
+def unidecode_expect_nonascii(string: str, errors: str = 'ignore', replace_str: str = '?') -> str:
     """Transliterate an Unicode object into an ASCII string
 
-    >>> unidecode(u"\u5317\u4EB0")
+    >>> unidecode("\u5317\u4EB0")
     "Bei Jing "
+
+    See unidecode_expect_ascii.
     """
 
-    _warn_if_not_unicode(string)
-    return _unidecode(string)
+    return _unidecode(string, errors, replace_str)
 
 unidecode = unidecode_expect_ascii
 
-def _unidecode(string):
-    retval = []
+def _get_repl_str(char: str) -> Optional[str]:
+    codepoint = ord(char)
 
-    for char in string:
-        codepoint = ord(char)
+    if codepoint < 0x80:
+        # Already ASCII
+        return str(char)
 
-        if codepoint < 0x80: # Basic ASCII
-            retval.append(str(char))
-            continue
-
-        if codepoint > 0xeffff:
-            continue # Characters in Private Use Area and above are ignored
+    if codepoint > 0xeffff:
+        # No data on characters in Private Use Area and above.
+        return None
 
-        if 0xd800 <= codepoint <= 0xdfff:
-            warnings.warn(  "Surrogate character %r will be ignored. "
-                            "You might be using a narrow Python build." % (char,),
-                            RuntimeWarning, 2)
+    if 0xd800 <= codepoint <= 0xdfff:
+        warnings.warn(  "Surrogate character %r will be ignored. "
+                        "You might be using a narrow Python build." % (char,),
+                        RuntimeWarning, 2)
 
-        section = codepoint >> 8   # Chop off the last two hex digits
-        position = codepoint % 256 # Last two hex digits
+    section = codepoint >> 8   # Chop off the last two hex digits
+    position = codepoint % 256 # Last two hex digits
 
+    try:
+        table = Cache[section]
+    except KeyError:
         try:
-            table = Cache[section]
-        except KeyError:
-            try:
-                mod = __import__('unidecode.x%03x'%(section), globals(), locals(), ['data'])
-            except ImportError:
-                Cache[section] = None
-                continue   # No match: ignore this character and carry on.
-
-            Cache[section] = table = mod.data
-
-        if table and len(table) > position:
-            retval.append( table[position] )
+            mod = __import__('unidecode.x%03x'%(section), globals(), locals(), ['data'])
+        except ImportError:
+            # No data on this character
+            Cache[section] = None
+            return None
+
+        Cache[section] = table = mod.data
+
+    if table and len(table) > position:
+        return table[position]
+    else:
+        return None
+
+def _unidecode(string: str, errors: str, replace_str:str) -> str:
+    retval = []
+
+    for index, char in enumerate(string):
+        repl = _get_repl_str(char)
+
+        if repl is None:
+            if errors == 'ignore':
+                repl = ''
+            elif errors == 'strict':
+                raise UnidecodeError('no replacement found for character %r '
+                        'in position %d' % (char, index), index)
+            elif errors == 'replace':
+                repl = replace_str
+            elif errors == 'preserve':
+                repl = char
+            else:
+                raise UnidecodeError('invalid value for errors parameter %r' % (errors,))
+
+        retval.append(repl)
 
     return ''.join(retval)
diff --git a/script.module.unidecode/lib/unidecode/__main__.py b/script.module.unidecode/lib/unidecode/__main__.py
diff --git a/script.module.unidecode/lib/unidecode/util.py b/script.module.unidecode/lib/unidecode/util.py
diff --git a/script.module.unidecode/lib/unidecode/x000.py b/script.module.unidecode/lib/unidecode/x000.py
@@ -76,9 +76,9 @@
 '1',    # 0xb9
 'o',    # 0xba
 '>>',    # 0xbb
-' 1/4 ',    # 0xbc
-' 1/2 ',    # 0xbd
-' 3/4 ',    # 0xbe
+' 1/4',    # 0xbc
+' 1/2',    # 0xbd
+' 3/4',    # 0xbe
 '?',    # 0xbf
 'A',    # 0xc0
 'A',    # 0xc1

diff --git a/script.module.unidecode/lib/unidecode/x002.py b/script.module.unidecode/lib/unidecode/x002.py
@@ -64,8 +64,8 @@
 'T',    # 0x3e
 's',    # 0x3f
 'z',    # 0x40
-'[?]',    # 0x41
-'[?]',    # 0x42
+None,    # 0x41
+None,    # 0x42
 'B',    # 0x43
 'U',    # 0x44
 '^',    # 0x45
@@ -238,20 +238,20 @@
 'V',    # 0xec
 '=',    # 0xed
 '"',    # 0xee
-'[?]',    # 0xef
-'[?]',    # 0xf0
-'[?]',    # 0xf1
-'[?]',    # 0xf2
-'[?]',    # 0xf3
-'[?]',    # 0xf4
-'[?]',    # 0xf5
-'[?]',    # 0xf6
-'[?]',    # 0xf7
-'[?]',    # 0xf8
-'[?]',    # 0xf9
-'[?]',    # 0xfa
-'[?]',    # 0xfb
-'[?]',    # 0xfc
-'[?]',    # 0xfd
-'[?]',    # 0xfe
+None,    # 0xef
+None,    # 0xf0
+None,    # 0xf1
+None,    # 0xf2
+None,    # 0xf3
+None,    # 0xf4
+None,    # 0xf5
+None,    # 0xf6
+None,    # 0xf7
+None,    # 0xf8
+None,    # 0xf9
+None,    # 0xfa
+None,    # 0xfb
+None,    # 0xfc
+None,    # 0xfd
+None,    # 0xfe
 )