Python 3

amrael · amrael · commit 1d18884260bc · 2017-04-10T14:20:59.000Z
diff --git a/requirements.txt b/requirements.txt
@@ -1,3 +1,4 @@
 argparse==1.2.1
 PyYAML==3.10
 beautifulsoup4==4.2.0
+six==1.10.0
diff --git a/tests.py b/tests.py
@@ -1,11 +1,12 @@
 # encoding=utf-8
 
+import six
 import twitter_text, sys, os, json, argparse, re
 from twitter_text.unicode import force_unicode
 
 narrow_build = True
 try:
-    unichr(0x20000)
+    six.unichr(0x20000)
     narrow_build = False
 except:
     pass
@@ -177,4 +178,4 @@ def assert_equal(result, test):
 
 sys.stdout.write(u'\033[0m-------\n\033[92m%d tests passed.\033[0m\n' % attempted)
 sys.stdout.flush()
-sys.exit(os.EX_OK)
+sys.exit(os.EX_OK)
diff --git a/twitter_text/autolink.py b/twitter_text/autolink.py
@@ -1,5 +1,5 @@
 # encoding=utf-8
-
+from __future__ import unicode_literals
 import re, cgi
 
 from twitter_text.regex import REGEXEN
@@ -113,7 +113,7 @@ def auto_link_entities(self, entities = [], options = {}):
             return self.text
 
         # NOTE deprecate these attributes not options keys in options hash, then use html_attrs
-        options = dict(DEFAULT_OPTIONS.items() + options.items())
+        options = dict(list(DEFAULT_OPTIONS.items()) + list(options.items()))
         options['html_attrs'] = self._extract_html_attrs_from_options(options)
         if not options.get('suppress_no_follow', False):
             options['html_attrs']['rel'] = "nofollow"
@@ -306,12 +306,12 @@ def _link_url_with_entity(self, entity, options = {}):
         expanded_url = entity.get('expanded_url')
         invisible_tag_attrs = options.get('invisible_tag_attrs', DEFAULT_INVISIBLE_TAG_ATTRS)
 
-        display_url_sans_ellipses = re.sub(ur'…', u'', display_url)
+        display_url_sans_ellipses = re.sub(r'…', '', display_url)
 
         if expanded_url.find(display_url_sans_ellipses) > -1:
             before_display_url, after_display_url = expanded_url.split(display_url_sans_ellipses, 2)
-            preceding_ellipsis = re.search(ur'\A…', display_url)
-            following_ellipsis = re.search(ur'…\z', display_url)
+            preceding_ellipsis = re.search(r'\A…', display_url)
+            following_ellipsis = re.search(r'…\z', display_url)
             if preceding_ellipsis is not None:
                 preceding_ellipsis = preceding_ellipsis.group()
             else:
@@ -356,13 +356,13 @@ def _link_to_hashtag(self, entity, chars, options = {}):
         if REGEXEN['rtl_chars'].search(hashtag):
             hashtag_class += ' rtl'
 
-        href = options.get('hashtag_url_transform', lambda ht: u'%s%s' % (options.get('hashtag_url_base'), ht))(hashtag)
+        href = options.get('hashtag_url_transform', lambda ht: '%s%s' % (options.get('hashtag_url_base'), ht))(hashtag)
 
         html_attrs = {}
         html_attrs.update(options.get('html_attrs', {}))
         html_attrs = {
             'class':    hashtag_class,
-            'title':    u'#%s' % hashtag,
+            'title':    '#%s' % hashtag,
         }
 
         link = self._link_to_text_with_symbol(entity, hashchar, hashtag, href, html_attrs, options)
@@ -372,19 +372,19 @@ def _link_to_cashtag(self, entity, chars, options = {}):
         dollar = chars[entity['indices'][0]]
         cashtag = entity['cashtag']
 
-        href = options.get('cashtag_url_transform', lambda ct: u'%s%s' % (options.get('cashtag_url_base'), ct))(cashtag)
+        href = options.get('cashtag_url_transform', lambda ct: '%s%s' % (options.get('cashtag_url_base'), ct))(cashtag)
 
         html_attrs = {
             'class': options.get('cashtag_class'),
-            'title': u'$%s' % cashtag
+            'title': '$%s' % cashtag
         }
         html_attrs.update(options.get('html_attrs', {}))
 
         link = self._link_to_text_with_symbol(entity, dollar, cashtag, href, html_attrs, options)
         return chars[:entity['indices'][0]] + link + chars[entity['indices'][1]:]
 
     def _link_to_screen_name(self, entity, chars, options = {}):
-        name = u'%s%s' % (entity['screen_name'], entity.get('list_slug') or '')
+        name = '%s%s' % (entity['screen_name'], entity.get('list_slug') or '')
         chunk = options.get('link_text_transform', default_transform)(entity, name)
         name = name.lower()
 
@@ -395,30 +395,30 @@ def _link_to_screen_name(self, entity, chars, options = {}):
             del(html_attrs['title'])
 
         if entity.get('list_slug') and not options.get('supress_lists'):
-            href = options.get('list_url_transform', lambda sn: u'%s%s' % (options.get('list_url_base'), sn))(name)
+            href = options.get('list_url_transform', lambda sn: '%s%s' % (options.get('list_url_base'), sn))(name)
             html_attrs['class'] = options.get('list_class')
         else:
-            href = options.get('username_url_transform', lambda sn: u'%s%s' % (options.get('username_url_base'), sn))(name)
+            href = options.get('username_url_transform', lambda sn: '%s%s' % (options.get('username_url_base'), sn))(name)
             html_attrs['class'] = options.get('username_class')
 
         link = self._link_to_text_with_symbol(entity, at, chunk, href, html_attrs, options)
         return chars[:entity['indices'][0]] + link + chars[entity['indices'][1]:]
 
     def _link_to_text_with_symbol(self, entity, symbol, text, href, attributes = {}, options = {}):
-        tagged_symbol = u'<%s>%s</%s>' % (options.get('symbol_tag'), symbol, options.get('symbol_tag')) if options.get('symbol_tag') else symbol
+        tagged_symbol = '<%s>%s</%s>' % (options.get('symbol_tag'), symbol, options.get('symbol_tag')) if options.get('symbol_tag') else symbol
         text = self._html_escape(text)
-        tagged_text = u'<%s>%s</%s>' % (options.get('text_with_symbol_tag'), text, options.get('text_with_symbol_tag')) if options.get('text_with_symbol_tag') else text
+        tagged_text = '<%s>%s</%s>' % (options.get('text_with_symbol_tag'), text, options.get('text_with_symbol_tag')) if options.get('text_with_symbol_tag') else text
         if options.get('username_include_symbol') or not REGEXEN['at_signs'].match(symbol):
-            return u'%s' % self._link_to_text(entity, tagged_symbol + tagged_text, href, attributes, options)
+            return '%s' % self._link_to_text(entity, tagged_symbol + tagged_text, href, attributes, options)
         else:
-            return u'%s%s' % (tagged_symbol, self._link_to_text(entity, tagged_text, href, attributes, options))
+            return '%s%s' % (tagged_symbol, self._link_to_text(entity, tagged_text, href, attributes, options))
 
     def _link_to_text(self, entity, text, href, attributes = {}, options = {}):
         attributes['href'] = href
-        if options.get('link_attributes_transform'):
-            attributes = options.get('link_attributes_transform')(entity, attributes)
+        if options.get('link_attribute_transform'):
+            attributes = options.get('link_attribute_transform')(entity, attributes)
         text = options.get('link_text_transform', default_transform)(entity, text)
-        return u'<a %s>%s</a>' % (self._tag_attrs(attributes), text)
+        return '<a %s>%s</a>' % (self._tag_attrs(attributes), text)
 
     def _tag_attrs(self, attributes = {}):
         attrs = []
@@ -428,7 +428,7 @@ def _tag_attrs(self, attributes = {}):
                 attrs.append(key)
                 continue
             if type(value) == list:
-                value = u' '.join(value)
-            attrs.append(u'%s="%s"' % (self._html_escape(key), self._html_escape(value)))
+                value = ' '.join(value)
+            attrs.append('%s="%s"' % (self._html_escape(key), self._html_escape(value)))
 
-        return u' '.join(attrs)
+        return ' '.join(attrs)
diff --git a/twitter_text/highlighter.py b/twitter_text/highlighter.py
@@ -1,15 +1,15 @@
 # encoding=utf-8
 
 import re
-from HTMLParser import HTMLParser
+from six.moves import html_parser
 
 from twitter_text.regex import UNICODE_SPACES
 from twitter_text.unicode import force_unicode
 
 DEFAULT_HIGHLIGHT_TAG = 'em'
 
 # from http://stackoverflow.com/questions/753052/strip-html-from-strings-in-python
-class MLStripper(HTMLParser):
+class MLStripper(html_parser.HTMLParser):
     def __init__(self):
         self.reset()
         self.fed = []
@@ -80,4 +80,4 @@ def hit_highlight(self, hits = [], **kwargs):
             else:
                 result.append(chunk)
         self.text = u''.join(result)
-        return self.text
+        return self.text
diff --git a/twitter_text/regex.py b/twitter_text/regex.py
@@ -4,35 +4,36 @@
 # list is frozen at load time to ensure immutability. These reular expressions are
 # used throughout the Twitter classes. Special care has been taken to make
 # sure these reular expressions work with Tweets in all languages.
-import re, string
+import re, string, six
+from six.moves import reduce
 
 REGEXEN = {} # :nodoc:
 
 def regex_range(start, end = None):
     if end:
-        return u'%s-%s' % (unichr(start), unichr(end))
+        return u'%s-%s' % (six.unichr(start), six.unichr(end))
     else:
-        return u'%s' % unichr(start)
+        return u'%s' % six.unichr(start)
 
 # Space is more than %20, U+3000 for example is the full-width space used with Kanji. Provide a short-hand
 # to access both the list of characters and a pattern suitible for use with String#split
 #  Taken from: ActiveSupport::Multibyte::Handlers::UTF8Handler::UNICODE_WHITESPACE
 UNICODE_SPACES = []
 for space in reduce(lambda x,y: x + y if type(y) == list else x + [y], [
-        range(0x0009, 0x000D),  # White_Space # Cc   [5] <control-0009>..<control-000D>
+        list(range(0x0009, 0x000D)),  # White_Space # Cc   [5] <control-0009>..<control-000D>
         0x0020,                 # White_Space # Zs       SPACE
         0x0085,                 # White_Space # Cc       <control-0085>
         0x00A0,                 # White_Space # Zs       NO-BREAK SPACE
         0x1680,                 # White_Space # Zs       OGHAM SPACE MARK
         0x180E,                 # White_Space # Zs       MONGOLIAN VOWEL SEPARATOR
-        range(0x2000, 0x200A),  # White_Space # Zs  [11] EN QUAD..HAIR SPACE
+        list(range(0x2000, 0x200A)),  # White_Space # Zs  [11] EN QUAD..HAIR SPACE
         0x2028,                 # White_Space # Zl       LINE SEPARATOR
         0x2029,                 # White_Space # Zp       PARAGRAPH SEPARATOR
         0x202F,                 # White_Space # Zs       NARROW NO-BREAK SPACE
         0x205F,                 # White_Space # Zs       MEDIUM MATHEMATICAL SPACE
         0x3000,                 # White_Space # Zs       IDEOGRAPHIC SPACE
     ]):
-    UNICODE_SPACES.append(unichr(space))
+    UNICODE_SPACES.append(six.unichr(space))
 REGEXEN['spaces'] = re.compile(ur''.join(UNICODE_SPACES))
 
 # Characters not allowed in Tweets
@@ -41,7 +42,7 @@ def regex_range(start, end = None):
     0xFFFF,                                 # Special
     0x202A, 0x202B, 0x202C, 0x202D, 0x202E, # Directional change
 ]
-REGEXEN['invalid_control_characters']   =   [unichr(x) for x in INVALID_CHARACTERS]
+REGEXEN['invalid_control_characters']   =   [six.unichr(x) for x in INVALID_CHARACTERS]
 
 REGEXEN['list_name'] = re.compile(ur'^[a-zA-Z][a-zA-Z0-9_\-\u0080-\u00ff]{0,24}$')
 
diff --git a/twitter_text/unicode.py b/twitter_text/unicode.py
@@ -1,6 +1,7 @@
-import types, datetime
+import types, datetime, six
 from decimal import Decimal
 
+
 # borrowed from django.utils.encoding
 class TwitterTextUnicodeDecodeError(UnicodeDecodeError):
     def __init__(self, obj, *args):
@@ -18,46 +19,41 @@ def is_protected_type(obj):
     Objects of protected types are preserved as-is when passed to
     force_unicode(strings_only=True).
     """
-    return isinstance(obj, (
-        types.NoneType,
-        int, long,
+    return isinstance(obj, six.integer_types + (
+        type(None),
         datetime.datetime, datetime.date, datetime.time,
         float, Decimal)
     )
 
 def force_unicode(s, encoding='utf-8', strings_only=False, errors='strict'):
     """
-    Similar to smart_unicode, except that lazy instances are resolved to
+    Similar to smart_text, except that lazy instances are resolved to
     strings, rather than kept as lazy objects.
 
     If strings_only is True, don't convert (some) non-string-like objects.
     """
+    # Handle the common case first for performance reasons.
+    if issubclass(type(s), six.text_type):
+        return s
     if strings_only and is_protected_type(s):
         return s
     try:
-        if not isinstance(s, basestring,):
-            if hasattr(s, '__unicode__'):
-                s = unicode(s)
+        if not issubclass(type(s), six.string_types):
+            if six.PY3:
+                if isinstance(s, bytes):
+                    s = six.text_type(s, encoding, errors)
+                else:
+                    s = six.text_type(s)
+            elif hasattr(s, '__unicode__'):
+                s = six.text_type(s)
             else:
-                try:
-                    s = unicode(str(s), encoding, errors)
-                except UnicodeEncodeError:
-                    if not isinstance(s, Exception):
-                        raise
-                    # If we get to here, the caller has passed in an Exception
-                    # subclass populated with non-ASCII data without special
-                    # handling to display as a string. We need to handle this
-                    # without raising a further exception. We do an
-                    # approximation to what the Exception's standard str()
-                    # output should be.
-                    s = ' '.join([force_unicode(arg, encoding, strings_only,
-                            errors) for arg in s])
-        elif not isinstance(s, unicode):
-            # Note: We use .decode() here, instead of unicode(s, encoding,
-            # errors), so that if s is a SafeString, it ends up being a
-            # SafeUnicode at the end.
+                s = six.text_type(bytes(s), encoding, errors)
+        else:
+            # Note: We use .decode() here, instead of six.text_type(s, encoding,
+            # errors), so that if s is a SafeBytes, it ends up being a
+            # SafeText at the end.
             s = s.decode(encoding, errors)
-    except UnicodeDecodeError, e:
+    except UnicodeDecodeError as e:
         if not isinstance(s, Exception):
             raise TwitterTextUnicodeDecodeError(s, *e.args)
         else:
@@ -66,6 +62,7 @@ def force_unicode(s, encoding='utf-8', strings_only=False, errors='strict'):
             # working unicode method. Try to handle this without raising a
             # further exception by individually forcing the exception args
             # to unicode.
-            s = ' '.join([force_unicode(arg, encoding, strings_only,
-                    errors) for arg in s])
+            s = ' '.join(force_unicode(arg, encoding, strings_only, errors)
+                         for arg in s.args)  # exception instances are not iterable on Python 3; .args works on 2 and 3
     return s
+