diff --git a/.gitignore b/.gitignore
index 17b1f10..20a1408 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,4 +1,5 @@
*.pyc
build*
*.egg*
-dist
\ No newline at end of file
+dist
+/.cache
diff --git a/.gitmodules b/.gitmodules
index f051160..e5d7799 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -1,3 +1,3 @@
-[submodule "twitter-text-conformance"]
- path = twitter-text-conformance
- url = https://github.com/dryan/twitter-text-conformance.git
+[submodule "twitter-text"]
+ path = twitter-text
+ url = https://github.com/twitter/twitter-text.git
diff --git a/.travis.yml b/.travis.yml
index b954140..b29bfb2 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -1,11 +1,11 @@
+sudo: false
language: python
python:
- - "2.6"
- "2.7"
install:
- - "pip install . --use-mirrors"
- - "pip install -r requirements.txt --use-mirrors"
-script: "python ./tests.py"
+ - "pip install ."
+ - "pip install -r requirements.txt"
+script: "py.test"
notifications:
email: false
diff --git a/README.md b/README.md
index fb6d3cd..1c0537d 100644
--- a/README.md
+++ b/README.md
@@ -1,6 +1,6 @@
A port of the Ruby gem [twitter-text-rb](https://github.com/twitter/twitter-text-rb) to Python.
-[![Build Status](https://travis-ci.org/dryan/twitter-text-py.svg?branch=master)](https://travis-ci.org/dryan/twitter-text-py)
+[![Build Status](https://travis-ci.org/muckrack/twitter-text-py.svg?branch=master)](https://travis-ci.org/muckrack/twitter-text-py)
# Changes in 2.0
diff --git a/conftest.py b/conftest.py
new file mode 100644
index 0000000..221c885
--- /dev/null
+++ b/conftest.py
@@ -0,0 +1,177 @@
+# encoding=utf-8
+
+from __future__ import unicode_literals
+
+import json
+import os
+
+import pytest
+import yaml
+
+import twitter_text
+from twitter_text.encoding import force_text, smart_bytes
+
+
+narrow_build = True
+try:
+ unichr(0x20000)
+ narrow_build = False
+except ValueError:
+    # unichr() rejects non-BMP code points on narrow builds
+    pass
+
+
+try:
+ from bs4 import BeautifulSoup
+except ImportError:
+ raise Exception('You need to install BeautifulSoup4 to run the tests')
+
+
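+# pytest collection hook: each conformance .yml file from the twitter-text
+# submodule is collected as a YamlFile and expanded into individual test items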
+def pytest_collect_file(parent, path):
+ if path.ext == '.yml':
+ return YamlFile(path, parent)
+
+
+class YamlException(Exception):
+ """ custom exception for error reporting. """
+
+
+class YamlFile(pytest.File):
+ def collect(self):
+ filename = os.path.splitext(os.path.basename(self.fspath.strpath))[0]
+ if filename not in TEST_MAP:
+ print "Skipping {}; not supported".format(filename)
+ return
+ if TEST_MAP[filename].get('requires_wide_build') and narrow_build:
+ print "Skipping {} due to narrow build".format(filename)
+ return
+        with self.fspath.open() as f:
+            raw = yaml.safe_load(force_text(f.read()))
+ if 'tests' not in raw:
+ return
+ for section, specs in raw['tests'].items():
+ for spec in specs:
+ yield YamlItem(self, filename, section, spec)
+
+
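+# Maps each conformance file name to the twitter_text class under test, the
+# method that handles each test section (optionally paired with extra kwargs),
+# and the sections whose HTML output is compared ignoring attribute order.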
+TEST_MAP = {
+ 'autolink': {
+ 'cls': twitter_text.autolink.Autolink,
+ 'options': {'suppress_no_follow': True},
+ 'methods': {
+ 'usernames': 'auto_link_usernames_or_lists',
+ 'cashtags': 'auto_link_cashtags',
+ 'urls': 'auto_link_urls',
+ 'hashtags': 'auto_link_hashtags',
+ 'all': 'auto_link',
+ 'lists': 'auto_link_usernames_or_lists',
+ 'json': 'auto_link_with_json',
+ },
+ 'ignore_attribute_order': set([
+ 'usernames',
+ 'cashtags',
+ 'urls',
+ 'hashtags',
+ 'all',
+ 'lists',
+ 'json',
+ ])
+ },
+ 'extract': {
+ 'cls': twitter_text.extractor.Extractor,
+ 'methods': {
+ 'mentions': 'extract_mentioned_screen_names',
+ 'mentions_with_indices': 'extract_mentioned_screen_names_with_indices',
+ 'mentions_or_lists_with_indices': 'extract_mentions_or_lists_with_indices',
+ 'replies': 'extract_reply_screen_name',
+ 'urls': 'extract_urls',
+ 'urls_with_indices': 'extract_urls_with_indices',
+ 'hashtags': 'extract_hashtags',
+ 'cashtags': 'extract_cashtags',
+ 'hashtags_with_indices': 'extract_hashtags_with_indices',
+ 'cashtags_with_indices': 'extract_cashtags_with_indices',
+ },
+ },
+ 'hit_highlighting': {
+ 'cls': twitter_text.highlighter.HitHighlighter,
+ 'methods': {
+ 'plain_text': 'hit_highlight',
+ 'with_links': 'hit_highlight',
+ },
+ 'ignore_attribute_order': set([
+ 'with_links',
+ ])
+ },
+ 'validate': {
+ 'cls': twitter_text.validation.Validation,
+ 'requires_wide_build': True,
+ 'methods': {
+ 'tweets': 'valid_tweet_text',
+ 'usernames': 'valid_username',
+ 'lists': 'valid_list',
+ 'hashtags': 'valid_hashtag',
+ 'urls': 'valid_url',
+ 'urls_without_protocol': ('valid_url', {'require_protocol': False}),
+ 'lengths': 'tweet_length',
+ },
+ }
+}
+
+
+class YamlItem(pytest.Item):
+ def __init__(self, parent, filename, section, spec):
+ self.section = section
+ self.filename = filename
+ self.spec = spec
+ name = "{}:{}:{}".format(filename, section, spec['description'])
+ super(YamlItem, self).__init__(name, parent)
+
+ def _equal_without_attribute_order(self, result, expected):
+ # Beautiful Soup sorts the attributes for us so we can skip all the hoops the ruby version jumps through
+ return BeautifulSoup(result, "lxml") == BeautifulSoup(expected, "lxml")
+
+ def runtest(self):
+        if self.filename not in TEST_MAP:
+            raise YamlException("{} file not supported".format(self.filename))
+        if self.section not in TEST_MAP[self.filename]['methods']:
+            raise YamlException("{}:{} section not supported".format(self.filename, self.section))
+ cls = TEST_MAP[self.filename]['cls']
+ instance = cls(self.spec['text'])
+ args = []
+        method_spec = TEST_MAP[self.filename]['methods'][self.section]
+        if isinstance(method_spec, tuple):
+            # e.g. ('valid_url', {'require_protocol': False})
+            method_name, kwargs = method_spec
+            kwargs = kwargs.copy()
+        else:
+            method_name = method_spec
+            kwargs = {}
+ if 'json' in self.spec:
+ args.append(json.loads(self.spec['json']))
+ if 'options' in TEST_MAP[self.filename]:
+ kwargs['options'] = TEST_MAP[self.filename]['options']
+ if 'hits' in self.spec:
+ kwargs['hits'] = self.spec['hits']
+ result = getattr(instance, method_name)(*args, **kwargs)
+ if self.section in TEST_MAP[self.filename].get('ignore_attribute_order', ()):
+ equal = self._equal_without_attribute_order(result, self.spec['expected'])
+ else:
+ equal = result == self.spec['expected']
+ if not equal:
+ raise YamlException("{} != {}".format(result, self.spec['expected']))
+
+ def repr_failure(self, excinfo):
+ """ called when self.runtest() raises an exception. """
+ if isinstance(excinfo.value, YamlException):
+ return smart_bytes("\n".join([
+ "usecase execution failed",
+ " {}".format(*excinfo.value.args)
+ ]))
+
+ def reportinfo(self):
+ return self.fspath, 0, smart_bytes("usecase: %s" % self.name)
diff --git a/requirements.txt b/requirements.txt
index 0ac3552..3cfae74 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,3 +1,8 @@
argparse==1.2.1
PyYAML==3.10
-beautifulsoup4==4.2.0
+beautifulsoup4==4.4.1
+Django==1.9.6
+lxml==3.4.4
+pytest==2.9.1
+py==1.4.29
+regex==2016.04.25
diff --git a/setup.py b/setup.py
index fcdabb2..bb27c76 100644
--- a/setup.py
+++ b/setup.py
@@ -1,5 +1,5 @@
from setuptools import setup, find_packages
-
+
setup(
name='twitter-text-py',
version='2.0.2',
@@ -19,5 +19,5 @@
],
include_package_data=True,
install_requires=['setuptools'],
- license = "BSD"
+ license="BSD"
)
diff --git a/tests.py b/tests.py
deleted file mode 100644
index 891b35e..0000000
--- a/tests.py
+++ /dev/null
@@ -1,180 +0,0 @@
-# encoding=utf-8
-
-import twitter_text, sys, os, json, argparse, re
-from twitter_text.unicode import force_unicode
-
-narrow_build = True
-try:
- unichr(0x20000)
- narrow_build = False
-except:
- pass
-
-parser = argparse.ArgumentParser(description = u'Run the integration tests for twitter_text')
-parser.add_argument('--ignore-narrow-errors', '-i', help = u'Ignore errors caused by narrow builds', default = False, action = 'store_true')
-args = parser.parse_args()
-
-try:
- import yaml
-except ImportError:
- raise Exception('You need to install pyaml to run the tests')
-# from http://stackoverflow.com/questions/2890146/how-to-force-pyyaml-to-load-strings-as-unicode-objects
-from yaml import Loader, SafeLoader
-def construct_yaml_str(self, node):
- return self.construct_scalar(node)
-Loader.add_constructor(u'tag:yaml.org,2002:str', construct_yaml_str)
-SafeLoader.add_constructor(u'tag:yaml.org,2002:str', construct_yaml_str)
-
-try:
- from bs4 import BeautifulSoup
-except ImportError:
- try:
- from BeautifulSoup import BeautifulSoup
- except ImportError:
- raise Exception('You need to install BeautifulSoup to run the tests')
-
-def success(text):
- return (u'\033[92m%s\033[0m\n' % text).encode('utf-8')
-
-def error(text):
- return (u'\033[91m%s\033[0m\n' % text).encode('utf-8')
-
-attempted = 0
-
-def assert_equal_without_attribute_order(result, test, failure_message = None):
- global attempted
- attempted += 1
- # Beautiful Soup sorts the attributes for us so we can skip all the hoops the ruby version jumps through
- assert BeautifulSoup(result) == BeautifulSoup(test.get('expected')), error(u'Test %d Failed: %s' % (attempted, test.get('description')))
- sys.stdout.write(success(u'Test %d Passed: %s' % (attempted, test.get('description'))))
- sys.stdout.flush()
-
-def assert_equal(result, test):
- global attempted
- attempted += 1
- assert result == test.get('expected'), error(u'\nTest %d Failed: %s%s' % (attempted, test.get('description'), u'\n%s' % test.get('hits') if test.get('hits') else ''))
- sys.stdout.write(success(u'Test %d Passed: %s' % (attempted, test.get('description'))))
- sys.stdout.flush()
-
-# extractor section
-extractor_file = open(os.path.join('twitter-text-conformance', 'extract.yml'), 'r')
-extractor_tests = yaml.load(force_unicode(extractor_file.read()))
-extractor_file.close()
-
-sys.stdout.write('Testing Extractor\n')
-sys.stdout.flush()
-
-for section in extractor_tests.get('tests'):
- sys.stdout.write('\nTesting Extractor: %s\n' % section)
- sys.stdout.flush()
- for test in extractor_tests.get('tests').get(section):
- if (args.ignore_narrow_errors or narrow_build) and section in ['hashtags'] and test.get('description') in ['Hashtag with ideographic iteration mark']:
- sys.stdout.write('Skipping: %s\n' % test.get('description'))
- sys.stdout.flush()
- continue
- extractor = twitter_text.extractor.Extractor(test.get('text'))
- if section == 'mentions':
- assert_equal(extractor.extract_mentioned_screen_names(), test)
- elif section == 'mentions_with_indices':
- assert_equal(extractor.extract_mentioned_screen_names_with_indices(), test)
- elif section == 'mentions_or_lists_with_indices':
- assert_equal(extractor.extract_mentions_or_lists_with_indices(), test)
- elif section == 'replies':
- assert_equal(extractor.extract_reply_screen_name(), test)
- elif section == 'urls':
- assert_equal(extractor.extract_urls(), test)
- elif section == 'urls_with_indices':
- assert_equal(extractor.extract_urls_with_indices(), test)
- elif section == 'hashtags':
- assert_equal(extractor.extract_hashtags(), test)
- elif section == 'cashtags':
- assert_equal(extractor.extract_cashtags(), test)
- elif section == 'hashtags_with_indices':
- assert_equal(extractor.extract_hashtags_with_indices(), test)
- elif section == 'cashtags_with_indices':
- assert_equal(extractor.extract_cashtags_with_indices(), test)
-
-# autolink section
-autolink_file = open(os.path.join('twitter-text-conformance', 'autolink.yml'), 'r')
-autolink_tests = yaml.load(force_unicode(autolink_file.read()))
-autolink_file.close()
-
-sys.stdout.write('\nTesting Autolink\n')
-sys.stdout.flush()
-
-autolink_options = {'suppress_no_follow': True}
-
-for section in autolink_tests.get('tests'):
- sys.stdout.write('\nTesting Autolink: %s\n' % section)
- for test in autolink_tests.get('tests').get(section):
- if (args.ignore_narrow_errors or narrow_build) and section in ['hashtags'] and test.get('description') in ['Autolink a hashtag containing ideographic iteration mark']:
- sys.stdout.write('Skipping: %s\n' % test.get('description'))
- sys.stdout.flush()
- continue
- autolink = twitter_text.autolink.Autolink(test.get('text'))
- if section == 'usernames':
- assert_equal_without_attribute_order(autolink.auto_link_usernames_or_lists(autolink_options), test)
- elif section == 'cashtags':
- assert_equal_without_attribute_order(autolink.auto_link_cashtags(autolink_options), test)
- elif section == 'urls':
- assert_equal_without_attribute_order(autolink.auto_link_urls(autolink_options), test)
- elif section == 'hashtags':
- assert_equal_without_attribute_order(autolink.auto_link_hashtags(autolink_options), test)
- elif section == 'all':
- assert_equal_without_attribute_order(autolink.auto_link(autolink_options), test)
- elif section == 'lists':
- assert_equal_without_attribute_order(autolink.auto_link_usernames_or_lists(autolink_options), test)
- elif section == 'json':
- assert_equal_without_attribute_order(autolink.auto_link_with_json(json.loads(test.get('json')), autolink_options), test)
-
-# hit_highlighting section
-hit_highlighting_file = open(os.path.join('twitter-text-conformance', 'hit_highlighting.yml'), 'r')
-hit_highlighting_tests = yaml.load(force_unicode(hit_highlighting_file.read()))
-hit_highlighting_file.close()
-
-sys.stdout.write('\nTesting Hit Highlighting\n')
-sys.stdout.flush()
-
-for section in hit_highlighting_tests.get('tests'):
- sys.stdout.write('\nTesting Hit Highlighting: %s\n' % section)
- for test in hit_highlighting_tests.get('tests').get(section):
- hit_highlighter = twitter_text.highlighter.HitHighlighter(test.get('text'))
- if section == 'plain_text':
- assert_equal(hit_highlighter.hit_highlight(hits = test.get('hits')), test)
- elif section == 'with_links':
- assert_equal_without_attribute_order(hit_highlighter.hit_highlight(hits = test.get('hits')), test)
-
-# validation section
-validation_tested = False
-validate_tests = None
-try:
- validate_file = open(os.path.join('twitter-text-conformance', 'validate.yml'), 'r')
- validate_file_contents = validate_file.read()
- validate_tests = yaml.load(re.sub(ur'\\n', '\n', validate_file_contents.encode('unicode-escape')))
- validate_file.close()
-except ValueError:
- sys.stdout.write('\nValidation tests were skipped because of wide character issues\n')
- sys.stdout.flush()
-
-if validate_tests:
- sys.stdout.write('\nTesting Validation\n')
- sys.stdout.flush()
-
- for section in validate_tests.get('tests'):
- sys.stdout.write('\nTesting Validation: %s\n' % section)
- for test in validate_tests.get('tests').get(section):
- validator = twitter_text.validation.Validation(test.get('text'))
- if section == 'tweets':
- assert_equal(not validator.tweet_invalid(), test)
- elif section == 'usernames':
- assert_equal(validator.valid_username(), test)
- elif section == 'lists':
- assert_equal(validator.valid_list(), test)
- elif section == 'hashtags':
- assert_equal(validator.valid_hashtag(), test)
- elif section == 'urls':
- assert_equal(validator.valid_url(), test)
-
-sys.stdout.write(u'\033[0m-------\n\033[92m%d tests passed.\033[0m\n' % attempted)
-sys.stdout.flush()
-sys.exit(os.EX_OK)
\ No newline at end of file
diff --git a/twitter-text b/twitter-text
new file mode 160000
index 0000000..fb07f2e
--- /dev/null
+++ b/twitter-text
@@ -0,0 +1 @@
+Subproject commit fb07f2e30c1d3d053cf2bb2ad6971a3bcfc9b568
diff --git a/twitter-text-conformance b/twitter-text-conformance
deleted file mode 160000
index 9b58c44..0000000
--- a/twitter-text-conformance
+++ /dev/null
@@ -1 +0,0 @@
-Subproject commit 9b58c44302c4ab5bab261f6cfaf6ca89b5a6cf35
diff --git a/twitter_text/__init__.py b/twitter_text/__init__.py
index bb06120..e267dac 100644
--- a/twitter_text/__init__.py
+++ b/twitter_text/__init__.py
@@ -4,35 +4,41 @@
from twitter_text.extractor import Extractor
from twitter_text.highlighter import HitHighlighter
from twitter_text.validation import Validation
-from twitter_text.unicode import force_unicode
+from twitter_text.encoding import force_text
+
class TwitterText(object):
def __init__(self, text):
- self.text = force_unicode(text) # this will get modified by some functions
- self.original_text = self.text # this never changes; use it as a fallback or for comparison
+ # this will get modified by some functions
+ self.text = force_text(text)
+ # this never changes; use it as a fallback or for comparison
+ self.original_text = self.text
self.has_been_linked = False
- self.tweet_length = None # gets changed by validation method
- self.tweet_is_valid = None # gets changed by validation method
- self.validation_error = None # gets changed by validation method
-
+ # gets changed by validation method
+ self.tweet_length = None
+ # gets changed by validation method
+ self.tweet_is_valid = None
+ # gets changed by validation method
+ self.validation_error = None
+
def __unicode__(self):
return self.text
-
+
def __repr__(self):
return self.__unicode__()
-
+
@property
def autolink(self):
- return Autolink(self.text, parent = self)
-
+ return Autolink(self.text, parent=self)
+
@property
def extractor(self):
return Extractor(self.text)
-
+
@property
def highlighter(self):
- return HitHighlighter(self.text, parent = self)
-
+ return HitHighlighter(self.text, parent=self)
+
@property
def validation(self):
- return Validation(self.text, parent = self)
\ No newline at end of file
+ return Validation(self.text, parent=self)
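+
+
+# Illustrative usage (assumed, not part of the public API surface):
+#   tt = TwitterText('Loving #python at http://example.com')
+#   tt.extractor.extract_hashtags()  # [u'python']
+#   tt.autolink.auto_link()          # text with <a href=...> markup added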
diff --git a/twitter_text/autolink.py b/twitter_text/autolink.py
index 821d042..7b69f83 100644
--- a/twitter_text/autolink.py
+++ b/twitter_text/autolink.py
@@ -1,9 +1,9 @@
# encoding=utf-8
-import re, cgi
+import re
from twitter_text.regex import REGEXEN
-from twitter_text.unicode import force_unicode
+from twitter_text.encoding import force_text
from twitter_text.extractor import Extractor
# Default CSS class for auto-linked lists
@@ -28,17 +28,17 @@
DEFAULT_INVISIBLE_TAG_ATTRS = "style='position:absolute;left:-9999px;'"
DEFAULT_OPTIONS = {
- 'list_class': DEFAULT_LIST_CLASS,
- 'username_class': DEFAULT_USERNAME_CLASS,
- 'hashtag_class': DEFAULT_HASHTAG_CLASS,
- 'cashtag_class': DEFAULT_CASHTAG_CLASS,
+ 'list_class': DEFAULT_LIST_CLASS,
+ 'username_class': DEFAULT_USERNAME_CLASS,
+ 'hashtag_class': DEFAULT_HASHTAG_CLASS,
+ 'cashtag_class': DEFAULT_CASHTAG_CLASS,
- 'username_url_base': DEFAULT_USERNAME_URL_BASE,
- 'list_url_base': DEFAULT_LIST_URL_BASE,
- 'hashtag_url_base': DEFAULT_HASHTAG_URL_BASE,
- 'cashtag_url_base': DEFAULT_CASHTAG_URL_BASE,
+ 'username_url_base': DEFAULT_USERNAME_URL_BASE,
+ 'list_url_base': DEFAULT_LIST_URL_BASE,
+ 'hashtag_url_base': DEFAULT_HASHTAG_URL_BASE,
+ 'cashtag_url_base': DEFAULT_CASHTAG_URL_BASE,
- 'invisible_tag_attrs': DEFAULT_INVISIBLE_TAG_ATTRS,
+ 'invisible_tag_attrs': DEFAULT_INVISIBLE_TAG_ATTRS,
}
OPTIONS_NOT_ATTRIBUTES = (
@@ -69,30 +69,32 @@
)
HTML_ENTITIES = {
-    '&': '&amp;',
-    '>': '&gt;',
-    '<': '&lt;',
-    '"': '&quot;',
-    "'": '&#39;',
+    '&': '&amp;',
+    '>': '&gt;',
+    '<': '&lt;',
+    '"': '&quot;',
+    "'": '&#39;',
}
BOOLEAN_ATTRIBUTES = (
- 'disabled',
+ 'disabled',
'readonly',
'multiple',
'checked',
)
+
def default_transform(entity, text):
return text
+
class Autolink(object):
def __init__(self, text, **kwargs):
- self.text = force_unicode(text)
+ self.text = force_text(text)
self.parent = kwargs.get('parent', False)
self.extractor = Extractor(self.text)
- def auto_link_with_json(self, json_obj, options = {}):
+ def auto_link_with_json(self, json_obj, options={}):
# concantenate entities
entities = []
if 'entities' in json_obj:
@@ -108,7 +110,7 @@ def auto_link_with_json(self, json_obj, options = {}):
return self.auto_link_entities(entities, options)
- def auto_link_entities(self, entities = [], options = {}):
+ def auto_link_entities(self, entities=[], options={}):
if not self.text:
return self.text
@@ -118,7 +120,7 @@ def auto_link_entities(self, entities = [], options = {}):
if not options.get('suppress_no_follow', False):
options['html_attrs']['rel'] = "nofollow"
- entities.sort(key = lambda entity: entity['indices'][0], reverse = True)
+ entities.sort(key=lambda entity: entity['indices'][0], reverse=True)
chars = self.text
for entity in entities:
@@ -133,7 +135,7 @@ def auto_link_entities(self, entities = [], options = {}):
return chars
- def auto_link(self, options = {}):
+ def auto_link(self, options={}):
"""
        Add <a></a> tags around the usernames, lists, hashtags and URLs in the provided text.
        The <a> tags can be controlled with the following entries in the options hash.
@@ -161,7 +163,7 @@ def auto_link(self, options = {}):
"""
return self.auto_link_entities(self.extractor.extract_entities_with_indices({'extract_url_without_protocol': False}), options)
- def auto_link_usernames_or_lists(self, options = {}):
+ def auto_link_usernames_or_lists(self, options={}):
"""
        Add <a></a> tags around the usernames and lists in the provided text. The
        <a> tags can be controlled with the following entries in the options hash.
@@ -182,7 +184,7 @@ def auto_link_usernames_or_lists(self, options = {}):
"""
return self.auto_link_entities(self.extractor.extract_mentions_or_lists_with_indices(), options)
- def auto_link_hashtags(self, options = {}):
+ def auto_link_hashtags(self, options={}):
"""
        Add <a></a> tags around the hashtags in the provided text.
        The <a> tags can be controlled with the following entries in the options hash.
@@ -199,7 +201,7 @@ def auto_link_hashtags(self, options = {}):
"""
return self.auto_link_entities(self.extractor.extract_hashtags_with_indices(), options)
- def auto_link_cashtags(self, options = {}):
+ def auto_link_cashtags(self, options={}):
"""
        Add <a></a> tags around the cashtags in the provided text.
        The <a> tags can be controlled with the following entries in the options hash.
@@ -216,7 +218,7 @@ def auto_link_cashtags(self, options = {}):
"""
return self.auto_link_entities(self.extractor.extract_cashtags_with_indices(), options)
- def auto_link_urls(self, options = {}):
+ def auto_link_urls(self, options={}):
"""
        Add <a></a> tags around the URLs in the provided text.
        The <a> tags can be controlled with the following entries in the options hash.
@@ -240,13 +242,13 @@ def _html_escape(self, text):
text = text.replace(char, HTML_ENTITIES[char])
return text
- def _extract_html_attrs_from_options(self, options = {}):
+ def _extract_html_attrs_from_options(self, options={}):
html_attrs = options.get('html_attrs', {})
options = options.copy()
if 'html_attrs' in options:
del(options['html_attrs'])
for option in options.keys():
- if not option in OPTIONS_NOT_ATTRIBUTES:
+ if option not in OPTIONS_NOT_ATTRIBUTES:
html_attrs[option] = options[option]
return html_attrs
@@ -256,7 +258,7 @@ def _url_entities_hash(self, url_entities):
entities[entity.get('url')] = entity
return entities
- def _link_to_url(self, entity, chars, options = {}):
+ def _link_to_url(self, entity, chars, options={}):
url = entity.get('url')
href = options.get('link_url_transform', lambda x: x)(url)
@@ -284,7 +286,7 @@ def _link_to_url(self, entity, chars, options = {}):
link = self._link_to_text(entity, link_text, href, html_attrs, options)
return chars[:entity['indices'][0]] + link + chars[entity['indices'][1]:]
- def _link_url_with_entity(self, entity, options = {}):
+ def _link_url_with_entity(self, entity, options={}):
"""
Goal: If a user copies and pastes a tweet containing t.co'ed link, the resulting paste
should contain the full original URL (expanded_url), not the display URL.
@@ -348,7 +350,7 @@ def _link_url_with_entity(self, entity, options = {}):
else:
return self._html_escape(display_url)
- def _link_to_hashtag(self, entity, chars, options = {}):
+ def _link_to_hashtag(self, entity, chars, options={}):
hashchar = chars[entity['indices'][0]]
hashtag = entity['hashtag']
hashtag_class = options.get('hashtag_class')
@@ -368,7 +370,7 @@ def _link_to_hashtag(self, entity, chars, options = {}):
link = self._link_to_text_with_symbol(entity, hashchar, hashtag, href, html_attrs, options)
return chars[:entity['indices'][0]] + link + chars[entity['indices'][1]:]
- def _link_to_cashtag(self, entity, chars, options = {}):
+ def _link_to_cashtag(self, entity, chars, options={}):
dollar = chars[entity['indices'][0]]
cashtag = entity['cashtag']
@@ -383,10 +385,9 @@ def _link_to_cashtag(self, entity, chars, options = {}):
link = self._link_to_text_with_symbol(entity, dollar, cashtag, href, html_attrs, options)
return chars[:entity['indices'][0]] + link + chars[entity['indices'][1]:]
- def _link_to_screen_name(self, entity, chars, options = {}):
+ def _link_to_screen_name(self, entity, chars, options={}):
name = u'%s%s' % (entity['screen_name'], entity.get('list_slug') or '')
chunk = options.get('link_text_transform', default_transform)(entity, name)
- name = name.lower()
at = chars[entity['indices'][0]]
@@ -404,7 +405,7 @@ def _link_to_screen_name(self, entity, chars, options = {}):
link = self._link_to_text_with_symbol(entity, at, chunk, href, html_attrs, options)
return chars[:entity['indices'][0]] + link + chars[entity['indices'][1]:]
- def _link_to_text_with_symbol(self, entity, symbol, text, href, attributes = {}, options = {}):
+ def _link_to_text_with_symbol(self, entity, symbol, text, href, attributes={}, options={}):
        tagged_symbol = u'<%s>%s</%s>' % (options.get('symbol_tag'), symbol, options.get('symbol_tag')) if options.get('symbol_tag') else symbol
        text = self._html_escape(text)
        tagged_text = u'<%s>%s</%s>' % (options.get('text_with_symbol_tag'), text, options.get('text_with_symbol_tag')) if options.get('text_with_symbol_tag') else text
@@ -413,14 +414,14 @@ def _link_to_text_with_symbol(self, entity, symbol, text, href, attributes = {},
else:
return u'%s%s' % (tagged_symbol, self._link_to_text(entity, tagged_text, href, attributes, options))
- def _link_to_text(self, entity, text, href, attributes = {}, options = {}):
+ def _link_to_text(self, entity, text, href, attributes={}, options={}):
attributes['href'] = href
if options.get('link_attribute_transform'):
attributes = options.get('link_attribute_transform')(entity, attributes)
text = options.get('link_text_transform', default_transform)(entity, text)
        return u'<a%s>%s</a>' % (self._tag_attrs(attributes), text)
- def _tag_attrs(self, attributes = {}):
+ def _tag_attrs(self, attributes={}):
attrs = []
for key in sorted(attributes.keys()):
value = attributes[key]
@@ -431,4 +432,4 @@ def _tag_attrs(self, attributes = {}):
value = u' '.join(value)
attrs.append(u'%s="%s"' % (self._html_escape(key), self._html_escape(value)))
- return u' '.join(attrs)
\ No newline at end of file
+ return u' '.join(attrs)
diff --git a/twitter_text/encoding.py b/twitter_text/encoding.py
new file mode 100644
index 0000000..bde3ce7
--- /dev/null
+++ b/twitter_text/encoding.py
@@ -0,0 +1,239 @@
+# flake8: noqa
+# Taken from django.utils.encoding
+from __future__ import unicode_literals
+
+import codecs
+import datetime
+from decimal import Decimal
+import locale
+
+from django.utils.functional import Promise
+from django.utils import six
+from django.utils.six.moves.urllib.parse import quote
+
+
+class DjangoUnicodeDecodeError(UnicodeDecodeError):
+ def __init__(self, obj, *args):
+ self.obj = obj
+ UnicodeDecodeError.__init__(self, *args)
+
+ def __str__(self):
+ original = UnicodeDecodeError.__str__(self)
+ return '%s. You passed in %r (%s)' % (original, self.obj,
+ type(self.obj))
+
+
+def python_2_unicode_compatible(klass):
+ """
+ A decorator that defines __unicode__ and __str__ methods under Python 2.
+ Under Python 3 it does nothing.
+
+ To support Python 2 and 3 with a single code base, define a __str__ method
+ returning text and apply this decorator to the class.
+ """
+ if six.PY2:
+ if '__str__' not in klass.__dict__:
+ raise ValueError("@python_2_unicode_compatible cannot be applied "
+ "to %s because it doesn't define __str__()." %
+ klass.__name__)
+ klass.__unicode__ = klass.__str__
+ klass.__str__ = lambda self: self.__unicode__().encode('utf-8')
+ return klass
+
+
+def smart_text(s, encoding='utf-8', strings_only=False, errors='strict'):
+ """
+ Returns a text object representing 's' -- unicode on Python 2 and str on
+ Python 3. Treats bytestrings using the 'encoding' codec.
+
+ If strings_only is True, don't convert (some) non-string-like objects.
+ """
+ if isinstance(s, Promise):
+ # The input is the result of a gettext_lazy() call.
+ return s
+ return force_text(s, encoding, strings_only, errors)
+
+
+def is_protected_type(obj):
+ """Determine if the object instance is of a protected type.
+
+ Objects of protected types are preserved as-is when passed to
+ force_text(strings_only=True).
+ """
+ return isinstance(obj, six.integer_types + (type(None), float, Decimal,
+ datetime.datetime, datetime.date, datetime.time))
+
+
+def force_text(s, encoding='utf-8', strings_only=False, errors='strict'):
+ """
+ Similar to smart_text, except that lazy instances are resolved to
+ strings, rather than kept as lazy objects.
+
+ If strings_only is True, don't convert (some) non-string-like objects.
+ """
+ # Handle the common case first for performance reasons.
+ if isinstance(s, six.text_type):
+ return s
+ if strings_only and is_protected_type(s):
+ return s
+ try:
+ if not isinstance(s, six.string_types):
+ if six.PY3:
+ if isinstance(s, bytes):
+ s = six.text_type(s, encoding, errors)
+ else:
+ s = six.text_type(s)
+ elif hasattr(s, '__unicode__'):
+ s = six.text_type(s)
+ else:
+ s = six.text_type(bytes(s), encoding, errors)
+ else:
+ # Note: We use .decode() here, instead of six.text_type(s, encoding,
+ # errors), so that if s is a SafeBytes, it ends up being a
+ # SafeText at the end.
+ s = s.decode(encoding, errors)
+ except UnicodeDecodeError as e:
+ if not isinstance(s, Exception):
+ raise DjangoUnicodeDecodeError(s, *e.args)
+ else:
+ # If we get to here, the caller has passed in an Exception
+ # subclass populated with non-ASCII bytestring data without a
+ # working unicode method. Try to handle this without raising a
+ # further exception by individually forcing the exception args
+ # to unicode.
+ s = ' '.join([force_text(arg, encoding, strings_only,
+ errors) for arg in s])
+ return s
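+# e.g. force_text(b'caf\xc3\xa9') returns u'caf\xe9' on Python 2 and 3 alike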
+
+
+def smart_bytes(s, encoding='utf-8', strings_only=False, errors='strict'):
+ """
+ Returns a bytestring version of 's', encoded as specified in 'encoding'.
+
+ If strings_only is True, don't convert (some) non-string-like objects.
+ """
+ if isinstance(s, Promise):
+ # The input is the result of a gettext_lazy() call.
+ return s
+ return force_bytes(s, encoding, strings_only, errors)
+
+
+def force_bytes(s, encoding='utf-8', strings_only=False, errors='strict'):
+ """
+ Similar to smart_bytes, except that lazy instances are resolved to
+ strings, rather than kept as lazy objects.
+
+ If strings_only is True, don't convert (some) non-string-like objects.
+ """
+ # Handle the common case first for performance reasons.
+ if isinstance(s, bytes):
+ if encoding == 'utf-8':
+ return s
+ else:
+ return s.decode('utf-8', errors).encode(encoding, errors)
+ if strings_only and is_protected_type(s):
+ return s
+ if isinstance(s, six.memoryview):
+ return bytes(s)
+ if isinstance(s, Promise):
+ return six.text_type(s).encode(encoding, errors)
+ if not isinstance(s, six.string_types):
+ try:
+ if six.PY3:
+ return six.text_type(s).encode(encoding)
+ else:
+ return bytes(s)
+ except UnicodeEncodeError:
+ if isinstance(s, Exception):
+ # An Exception subclass containing non-ASCII data that doesn't
+ # know how to print itself properly. We shouldn't raise a
+ # further exception.
+ return b' '.join([force_bytes(arg, encoding, strings_only,
+ errors) for arg in s])
+ return six.text_type(s).encode(encoding, errors)
+ else:
+ return s.encode(encoding, errors)
+
+if six.PY3:
+ smart_str = smart_text
+ force_str = force_text
+else:
+ smart_str = smart_bytes
+ force_str = force_bytes
+ # backwards compatibility for Python 2
+ smart_unicode = smart_text
+ force_unicode = force_text
+
+smart_str.__doc__ = """
+Apply smart_text in Python 3 and smart_bytes in Python 2.
+
+This is suitable for writing to sys.stdout (for instance).
+"""
+
+force_str.__doc__ = """
+Apply force_text in Python 3 and force_bytes in Python 2.
+"""
+
+
+def iri_to_uri(iri):
+ """
+ Convert an Internationalized Resource Identifier (IRI) portion to a URI
+ portion that is suitable for inclusion in a URL.
+
+ This is the algorithm from section 3.1 of RFC 3987. However, since we are
+ assuming input is either UTF-8 or unicode already, we can simplify things a
+ little from the full method.
+
+ Returns an ASCII string containing the encoded result.
+ """
+ # The list of safe characters here is constructed from the "reserved" and
+ # "unreserved" characters specified in sections 2.2 and 2.3 of RFC 3986:
+ # reserved = gen-delims / sub-delims
+ # gen-delims = ":" / "/" / "?" / "#" / "[" / "]" / "@"
+ # sub-delims = "!" / "$" / "&" / "'" / "(" / ")"
+ # / "*" / "+" / "," / ";" / "="
+ # unreserved = ALPHA / DIGIT / "-" / "." / "_" / "~"
+ # Of the unreserved characters, urllib.quote already considers all but
+ # the ~ safe.
+ # The % character is also added to the list of safe characters here, as the
+ # end of section 3.1 of RFC 3987 specifically mentions that % must not be
+ # converted.
+ if iri is None:
+ return iri
+ return quote(force_bytes(iri), safe=b"/#%[]=:;$&()+,!?*@'~")
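+# e.g. iri_to_uri(u'/I \u2665 Django/') == '/I%20%E2%99%A5%20Django/'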
+
+
+def filepath_to_uri(path):
+ """Convert a file system path to a URI portion that is suitable for
+ inclusion in a URL.
+
+ We are assuming input is either UTF-8 or unicode already.
+
+ This method will encode certain chars that would normally be recognized as
+ special chars for URIs. Note that this method does not encode the '
+ character, as it is a valid character within URIs. See
+ encodeURIComponent() JavaScript function for more details.
+
+ Returns an ASCII string containing the encoded result.
+ """
+ if path is None:
+ return path
+ # I know about `os.sep` and `os.altsep` but I want to leave
+ # some flexibility for hardcoding separators.
+ return quote(force_bytes(path).replace(b"\\", b"/"), safe=b"/~!*()'")
+
+
+def get_system_encoding():
+ """
+ The encoding of the default system locale but falls back to the given
+ fallback encoding if the encoding is unsupported by python or could
+ not be determined. See tickets #10335 and #5846
+ """
+ try:
+ encoding = locale.getdefaultlocale()[1] or 'ascii'
+ codecs.lookup(encoding)
+ except Exception:
+ encoding = 'ascii'
+ return encoding
+
+DEFAULT_LOCALE_ENCODING = get_system_encoding()
diff --git a/twitter_text/extractor.py b/twitter_text/extractor.py
index 1015b8c..1f51fce 100644
--- a/twitter_text/extractor.py
+++ b/twitter_text/extractor.py
@@ -1,16 +1,17 @@
# encoding=utf-8
from twitter_text.regex import REGEXEN
-from twitter_text.unicode import force_unicode
+from twitter_text.encoding import force_text
+
class Extractor(object):
"""
A module for including Tweet parsing in a class. This module provides function for the extraction and processing
of usernames, lists, URLs and hashtags.
"""
-
+
def __init__(self, text):
- self.text = force_unicode(text)
+ self.text = force_text(text)
def _remove_overlapping_entities(self, entities):
"""
@@ -19,18 +20,18 @@ def _remove_overlapping_entities(self, entities):
"""
# sort by start index
- entities.sort(key = lambda entity: entity['indices'][0])
+ entities.sort(key=lambda entity: entity['indices'][0])
# remove duplicates
- prev = None
+ prev = None
for entity in [e for e in entities]:
if prev and prev['indices'][1] > entity['indices'][0]:
entities.remove(entity)
else:
- prev = entity
+ prev = entity
return entities
- def extract_entities_with_indices(self, options = {}, transform = lambda x: x):
+ def extract_entities_with_indices(self, options={}, transform=lambda x: x):
"""
Extracts all usernames, lists, hashtags and URLs in the Tweet text
along with the indices for where the entity ocurred
@@ -43,19 +44,21 @@ def extract_entities_with_indices(self, options = {}, transform = lambda x: x):
return []
# extract all entities
- entities = self.extract_urls_with_indices(options) + \
- self.extract_hashtags_with_indices({'check_url_overlap': False}) + \
- self.extract_mentions_or_lists_with_indices() + \
- self.extract_cashtags_with_indices()
+ entities = (
+ self.extract_urls_with_indices(options) +
+ self.extract_hashtags_with_indices({'check_url_overlap': False}) +
+ self.extract_mentions_or_lists_with_indices() +
+ self.extract_cashtags_with_indices()
+ )
- entities = self._remove_overlapping_entities(entities)
+ entities = self._remove_overlapping_entities(entities)
for entity in entities:
- entity = transform(entity)
+ entity = transform(entity)
return entities
- def extract_mentioned_screen_names(self, transform = lambda x: x):
+ def extract_mentioned_screen_names(self, transform=lambda x: x):
"""
Extracts a list of all usernames mentioned in the Tweet text. If the
text is None or contains no username mentions an empty list
@@ -65,7 +68,7 @@ def extract_mentioned_screen_names(self, transform = lambda x: x):
"""
return [transform(mention['screen_name']) for mention in self.extract_mentioned_screen_names_with_indices()]
- def extract_mentioned_screen_names_with_indices(self, transform = lambda x: x):
+ def extract_mentioned_screen_names_with_indices(self, transform=lambda x: x):
"""
Extracts a list of all usernames mentioned in the Tweet text
along with the indices for where the mention ocurred. If the
@@ -87,7 +90,7 @@ def extract_mentioned_screen_names_with_indices(self, transform = lambda x: x):
})
return possible_screen_names
- def extract_mentions_or_lists_with_indices(self, transform = lambda x: x):
+ def extract_mentions_or_lists_with_indices(self, transform=lambda x: x):
"""
Extracts a list of all usernames or lists mentioned in the Tweet text
along with the indices for where the mention ocurred. If the
@@ -101,7 +104,7 @@ def extract_mentions_or_lists_with_indices(self, transform = lambda x: x):
if not REGEXEN['at_signs'].search(self.text):
return []
- possible_entries = []
+ possible_entries = []
for match in REGEXEN['valid_mention_or_list'].finditer(self.text):
try:
after = self.text[match.end()]
@@ -117,8 +120,8 @@ def extract_mentions_or_lists_with_indices(self, transform = lambda x: x):
})
return possible_entries
-
- def extract_reply_screen_name(self, transform = lambda x: x):
+
+ def extract_reply_screen_name(self, transform=lambda x: x):
"""
Extracts the username username replied to in the Tweet text. If the
text is None or is not a reply None will be returned.
@@ -135,8 +138,8 @@ def extract_reply_screen_name(self, transform = lambda x: x):
else:
possible_screen_name = transform(possible_screen_name.group(1))
return possible_screen_name
-
- def extract_urls(self, transform = lambda x: x):
+
+ def extract_urls(self, transform=lambda x: x):
"""
Extracts a list of all URLs included in the Tweet text. If the
text is None or contains no URLs an empty list
@@ -145,8 +148,8 @@ def extract_urls(self, transform = lambda x: x):
If a transform is given then it will be called for each URL.
"""
return [transform(url['url']) for url in self.extract_urls_with_indices()]
-
- def extract_urls_with_indices(self, options = {'extract_url_without_protocol': True}):
+
+ def extract_urls_with_indices(self, options={'extract_url_without_protocol': True}):
"""
Extracts a list of all URLs included in the Tweet text along
with the indices. If the text is None or contains no
@@ -170,10 +173,12 @@ def extract_urls_with_indices(self, options = {'extract_url_without_protocol': T
ascii_domain = ascii_domain.group()
last_url = {
'url': ascii_domain,
- 'indices': [start_position - len(before or '') + complete.find(ascii_domain), start_position - len(before or '') + complete.find(ascii_domain) + len(ascii_domain)]
+ 'indices': [start_position - len(before or '') + complete.find(ascii_domain),
+ start_position - len(before or '') + complete.find(ascii_domain) + len(ascii_domain)]
}
- last_url_invalid_match = REGEXEN['invalid_short_domain'].search(ascii_domain) is not None
- if not last_url_invalid_match:
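+                        # keep short domains that carry a path or use a special
+                        # short ccTLD (e.g. t.co); drop other bare short domains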
+ if (path or
+ REGEXEN['valid_special_short_domain'].search(ascii_domain) or
+ not REGEXEN['invalid_short_domain'].search(ascii_domain)):
urls.append(last_url)
# no ASCII-only domain found. Skip the entire URL
if not last_url:
@@ -192,8 +197,8 @@ def extract_urls_with_indices(self, options = {'extract_url_without_protocol': T
'indices': [start_position, end_position]
})
return urls
-
- def extract_hashtags(self, transform = lambda x: x):
+
+ def extract_hashtags(self, transform=lambda x: x):
"""
Extracts a list of all hashtags included in the Tweet text. If the
text is None or contains no hashtags an empty list
@@ -203,8 +208,8 @@ def extract_hashtags(self, transform = lambda x: x):
If a block is given then it will be called for each hashtag.
"""
return [transform(hashtag['hashtag']) for hashtag in self.extract_hashtags_with_indices()]
-
- def extract_hashtags_with_indices(self, options = {'check_url_overlap': True}, transform = lambda x: x):
+
+ def extract_hashtags_with_indices(self, options={'check_url_overlap': True}, transform=lambda x: x):
"""
Extracts a list of all hashtags included in the Tweet text. If the
text is None or contains no hashtags an empty list
@@ -234,7 +239,7 @@ def extract_hashtags_with_indices(self, options = {'check_url_overlap': True}, t
return tags
- def extract_cashtags(self, transform = lambda x: x):
+ def extract_cashtags(self, transform=lambda x: x):
"""
Extracts a list of all cashtags included in the Tweet text. If the
text is None or contains no cashtags an empty list
@@ -245,7 +250,7 @@ def extract_cashtags(self, transform = lambda x: x):
"""
return [cashtag['cashtag'] for cashtag in self.extract_cashtags_with_indices()]
- def extract_cashtags_with_indices(self, transform = lambda x: x):
+ def extract_cashtags_with_indices(self, transform=lambda x: x):
"""
Extracts a list of all cashtags included in the Tweet text. If the
text is None or contains no cashtags an empty list
@@ -267,4 +272,4 @@ def extract_cashtags_with_indices(self, transform = lambda x: x):
'indices': [start_position, end_position]
})
- return tags
\ No newline at end of file
+ return tags
diff --git a/twitter_text/highlighter.py b/twitter_text/highlighter.py
index ec128ca..3311c29 100644
--- a/twitter_text/highlighter.py
+++ b/twitter_text/highlighter.py
@@ -3,37 +3,41 @@
import re
from HTMLParser import HTMLParser
-from twitter_text.regex import UNICODE_SPACES
-from twitter_text.unicode import force_unicode
+from twitter_text.encoding import force_text
DEFAULT_HIGHLIGHT_TAG = 'em'
+
# from http://stackoverflow.com/questions/753052/strip-html-from-strings-in-python
class MLStripper(HTMLParser):
def __init__(self):
self.reset()
self.fed = []
+
def handle_data(self, d):
self.fed.append(d)
+
def get_data(self):
return ''.join(self.fed)
+
def strip_tags(html):
s = MLStripper()
s.feed(html)
return s.get_data()
+
class HitHighlighter(object):
def __init__(self, text, **kwargs):
- self.text = force_unicode(text)
+ self.text = force_text(text)
self.parent = kwargs.get('parent', False)
- def hit_highlight(self, hits = [], **kwargs):
+ def hit_highlight(self, hits=[], **kwargs):
if not hits and not kwargs.get('query'):
return self.text
if not hits and kwargs.get('query'):
- stripped_text = strip_tags(self.text)
+ stripped_text = strip_tags(self.text)
for match in re.finditer(ur'%s' % kwargs.get('query'), stripped_text):
hits.append(match.span())
@@ -49,7 +53,7 @@ def hit_highlight(self, hits = [], **kwargs):
for index, chunk in enumerate(chunks):
if not index % 2:
text_chunks.append(chunk)
- for hit in sorted(hits, key = lambda chunk: chunk[1], reverse = True):
+ for hit in sorted(hits, key=lambda chunk: chunk[1], reverse=True):
hit_start, hit_end = hit
placed = 0
for index, chunk in enumerate(chunks):
@@ -80,4 +84,4 @@ def hit_highlight(self, hits = [], **kwargs):
else:
result.append(chunk)
self.text = u''.join(result)
- return self.text
\ No newline at end of file
+ return self.text
diff --git a/twitter_text/regex.py b/twitter_text/regex.py
index c136f80..3129e51 100644
--- a/twitter_text/regex.py
+++ b/twitter_text/regex.py
@@ -3,45 +3,60 @@
# A collection of regular expressions for parsing Tweet text. The regular expression
# list is frozen at load time to ensure immutability. These reular expressions are
# used throughout the Twitter classes. Special care has been taken to make
-# sure these reular expressions work with Tweets in all languages.
-import re, string
+# sure these regular expressions work with Tweets in all languages.
+from __future__ import absolute_import
+import os
-REGEXEN = {} # :nodoc:
+import regex as re
+import yaml
-def regex_range(start, end = None):
+from twitter_text.encoding import force_text
+
+REGEXEN = {} # :nodoc:
+
+
+def regex_range(start, end=None):
if end:
return u'%s-%s' % (unichr(start), unichr(end))
else:
return u'%s' % unichr(start)
+
+
+# TLD lists shipped with the twitter-text conformance data (git submodule)
+with open(os.path.join(
+        os.path.dirname(os.path.dirname(__file__)),
+        'twitter-text',
+        'conformance',
+        'tld_lib.yml'
+)) as _tld_file:
+    TLDS = yaml.safe_load(force_text(_tld_file.read()))
+
+
# Space is more than %20, U+3000 for example is the full-width space used with Kanji. Provide a short-hand
# to access both the list of characters and a pattern suitible for use with String#split
# Taken from: ActiveSupport::Multibyte::Handlers::UTF8Handler::UNICODE_WHITESPACE
-UNICODE_SPACES = []
-for space in reduce(lambda x,y: x + y if type(y) == list else x + [y], [
- range(0x0009, 0x000D), # White_Space # Cc [5] ..
- 0x0020, # White_Space # Zs SPACE
- 0x0085, # White_Space # Cc
- 0x00A0, # White_Space # Zs NO-BREAK SPACE
- 0x1680, # White_Space # Zs OGHAM SPACE MARK
- 0x180E, # White_Space # Zs MONGOLIAN VOWEL SEPARATOR
- range(0x2000, 0x200A), # White_Space # Zs [11] EN QUAD..HAIR SPACE
- 0x2028, # White_Space # Zl LINE SEPARATOR
- 0x2029, # White_Space # Zp PARAGRAPH SEPARATOR
- 0x202F, # White_Space # Zs NARROW NO-BREAK SPACE
- 0x205F, # White_Space # Zs MEDIUM MATHEMATICAL SPACE
- 0x3000, # White_Space # Zs IDEOGRAPHIC SPACE
- ]):
- UNICODE_SPACES.append(unichr(space))
+# NB: Python ranges exclude their end point, so each upper bound below is one
+# past the last character of the documented White_Space run ([5] and [11]).
+UNICODE_SPACES = [unichr(space) for space in reduce(lambda x, y: x + y if isinstance(y, list) else x + [y], [
+    range(0x0009, 0x000E),  # White_Space # Cc [5] <tab>..<carriage return>
+    0x0020,  # White_Space # Zs SPACE
+    0x0085,  # White_Space # Cc <next line>
+    0x00A0,  # White_Space # Zs NO-BREAK SPACE
+    0x1680,  # White_Space # Zs OGHAM SPACE MARK
+    0x180E,  # White_Space # Zs MONGOLIAN VOWEL SEPARATOR
+    range(0x2000, 0x200B),  # White_Space # Zs [11] EN QUAD..HAIR SPACE
+    0x2028,  # White_Space # Zl LINE SEPARATOR
+    0x2029,  # White_Space # Zp PARAGRAPH SEPARATOR
+    0x202F,  # White_Space # Zs NARROW NO-BREAK SPACE
+    0x205F,  # White_Space # Zs MEDIUM MATHEMATICAL SPACE
+    0x3000,  # White_Space # Zs IDEOGRAPHIC SPACE
+])]
REGEXEN['spaces'] = re.compile(ur''.join(UNICODE_SPACES))
# Characters not allowed in Tweets
-INVALID_CHARACTERS = [
- 0xFFFE, 0xFEFF, # BOM
- 0xFFFF, # Special
- 0x202A, 0x202B, 0x202C, 0x202D, 0x202E, # Directional change
+INVALID_CHARACTERS = [
+ 0xFFFE, 0xFEFF, # BOM
+ 0xFFFF, # Special
+ 0x202A, 0x202B, 0x202C, 0x202D, 0x202E, # Directional change
]
-REGEXEN['invalid_control_characters'] = [unichr(x) for x in INVALID_CHARACTERS]
+REGEXEN['invalid_control_characters'] = [unichr(x) for x in INVALID_CHARACTERS]
REGEXEN['list_name'] = re.compile(ur'^[a-zA-Z][a-zA-Z0-9_\-\u0080-\u00ff]{0,24}$')
@@ -71,98 +86,33 @@ def regex_range(start, end = None):
LATIN_ACCENTS = u''.join(LATIN_ACCENTS)
RTL_CHARACTERS = ''.join([
- regex_range(0x0600,0x06FF),
- regex_range(0x0750,0x077F),
- regex_range(0x0590,0x05FF),
- regex_range(0xFE70,0xFEFF)
+ regex_range(0x0600, 0x06FF),
+ regex_range(0x0750, 0x077F),
+ regex_range(0x0590, 0x05FF),
+ regex_range(0xFE70, 0xFEFF)
])
-NON_LATIN_HASHTAG_CHARS = ''.join([
- # Cyrillic (Russian, Ukrainian, etc.)
- regex_range(0x0400, 0x04ff), # Cyrillic
- regex_range(0x0500, 0x0527), # Cyrillic Supplement
- regex_range(0x2de0, 0x2dff), # Cyrillic Extended A
- regex_range(0xa640, 0xa69f), # Cyrillic Extended B
- regex_range(0x0591, 0x05bf), # Hebrew
- regex_range(0x05c1, 0x05c2),
- regex_range(0x05c4, 0x05c5),
- regex_range(0x05c7),
- regex_range(0x05d0, 0x05ea),
- regex_range(0x05f0, 0x05f4),
- regex_range(0xfb12, 0xfb28), # Hebrew Presentation Forms
- regex_range(0xfb2a, 0xfb36),
- regex_range(0xfb38, 0xfb3c),
- regex_range(0xfb3e),
- regex_range(0xfb40, 0xfb41),
- regex_range(0xfb43, 0xfb44),
- regex_range(0xfb46, 0xfb4f),
- regex_range(0x0610, 0x061a), # Arabic
- regex_range(0x0620, 0x065f),
- regex_range(0x066e, 0x06d3),
- regex_range(0x06d5, 0x06dc),
- regex_range(0x06de, 0x06e8),
- regex_range(0x06ea, 0x06ef),
- regex_range(0x06fa, 0x06fc),
- regex_range(0x06ff),
- regex_range(0x0750, 0x077f), # Arabic Supplement
- regex_range(0x08a0), # Arabic Extended A
- regex_range(0x08a2, 0x08ac),
- regex_range(0x08e4, 0x08fe),
- regex_range(0xfb50, 0xfbb1), # Arabic Pres. Forms A
- regex_range(0xfbd3, 0xfd3d),
- regex_range(0xfd50, 0xfd8f),
- regex_range(0xfd92, 0xfdc7),
- regex_range(0xfdf0, 0xfdfb),
- regex_range(0xfe70, 0xfe74), # Arabic Pres. Forms B
- regex_range(0xfe76, 0xfefc),
- regex_range(0x200c, 0x200c), # Zero-Width Non-Joiner
- regex_range(0x0e01, 0x0e3a), # Thai
- regex_range(0x0e40, 0x0e4e), # Hangul (Korean)
- regex_range(0x1100, 0x11ff), # Hangul Jamo
- regex_range(0x3130, 0x3185), # Hangul Compatibility Jamo
- regex_range(0xA960, 0xA97F), # Hangul Jamo Extended-A
- regex_range(0xAC00, 0xD7AF), # Hangul Syllables
- regex_range(0xD7B0, 0xD7FF), # Hangul Jamo Extended-B
- regex_range(0xFFA1, 0xFFDC) # Half-width Hangul
-])
-
-CJ_HASHTAG_CHARACTERS = ''.join([
- regex_range(0x30A1, 0x30FA), regex_range(0x30FC, 0x30FE), # Katakana (full-width)
- regex_range(0xFF66, 0xFF9F), # Katakana (half-width)
- regex_range(0xFF10, 0xFF19), regex_range(0xFF21, 0xFF3A), regex_range(0xFF41, 0xFF5A), # Latin (full-width)
- regex_range(0x3041, 0x3096), regex_range(0x3099, 0x309E), # Hiragana
- regex_range(0x3400, 0x4DBF), # Kanji (CJK Extension A)
- regex_range(0x4E00, 0x9FFF), # Kanji (Unified)
-])
-
-try:
- CJ_HASHTAG_CHARACTERS = ''.join([
- CJ_HASHTAG_CHARACTERS,
- regex_range(0x20000, 0x2A6DF), # Kanji (CJK Extension B)
- regex_range(0x2A700, 0x2B73F), # Kanji (CJK Extension C)
- regex_range(0x2B740, 0x2B81F), # Kanji (CJK Extension D)
- regex_range(0x2F800, 0x2FA1F), regex_range(0x3003), regex_range(0x3005), regex_range(0x303B) # Kanji (CJK supplement)
- ])
-except ValueError:
- # this is a narrow python build so these extended Kanji characters won't work
- pass
-
PUNCTUATION_CHARS = ur'!"#$%&\'()*+,-./:;<=>?@\[\]^_\`{|}~'
SPACE_CHARS = ur" \t\n\x0B\f\r"
CTRL_CHARS = ur"\x00-\x1F\x7F"
# A hashtag must contain latin characters, numbers and underscores, but not all numbers.
-HASHTAG_ALPHA = ur'[a-z_%s]' % (LATIN_ACCENTS + NON_LATIN_HASHTAG_CHARS + CJ_HASHTAG_CHARACTERS)
-HASHTAG_ALPHANUMERIC = ur'[a-z0-9_%s]' % (LATIN_ACCENTS + NON_LATIN_HASHTAG_CHARS + CJ_HASHTAG_CHARACTERS)
-HASHTAG_BOUNDARY = ur'\A|\z|\[|[^&a-z0-9_%s]' % (LATIN_ACCENTS + NON_LATIN_HASHTAG_CHARS + CJ_HASHTAG_CHARACTERS)
+HASHTAG_ALPHA = ur'[\p{L}\p{M}]'
+HASHTAG_ALPHANUMERIC = ur'[\p{L}\p{M}\p{Nd}_\u200c\u200d\u0482\ua673\ua67e\u05be\u05f3\u05f4\uff5e\u301c\u309b\u309c\u30a0\u30fb\u3003\u0f0b\u0f0c\u00b7]'
+HASHTAG_BOUNDARY = ur'\A|\z|[^&\p{L}\p{M}\p{Nd}_\u200c\u200d\u0482\ua673\ua67e\u05be\u05f3\u05f4\u309b\u309c\u30a0\u30fb\u3003\u0f0b\u0f0c\u00b7]'
-HASHTAG = re.compile(ur'(%s)(#|#)(%s*%s%s*)' % (HASHTAG_BOUNDARY, HASHTAG_ALPHANUMERIC, HASHTAG_ALPHA, HASHTAG_ALPHANUMERIC), re.IGNORECASE)
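+# \p{L}/\p{M}/\p{Nd} property classes need the third-party `regex` module
+# imported above; the stdlib `re` does not support them. The (?!\ufe0f|\u20e3)
+# lookahead stops "#" followed by an emoji variation selector or keycap
+# combining mark from being parsed as a hashtag.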
+HASHTAG = re.compile(ur'(%s)(#|#)(?!\ufe0f|\u20e3)(%s*%s%s*)' % (
+ HASHTAG_BOUNDARY,
+ HASHTAG_ALPHANUMERIC,
+ HASHTAG_ALPHA,
+ HASHTAG_ALPHANUMERIC,
+), re.IGNORECASE)
REGEXEN['valid_hashtag'] = HASHTAG
REGEXEN['end_hashtag_match'] = re.compile(ur'\A(?:[##]|:\/\/)', re.IGNORECASE | re.UNICODE)
REGEXEN['numeric_only'] = re.compile(ur'^[\d]+$')
-REGEXEN['valid_mention_preceding_chars'] = re.compile(r'(?:[^a-zA-Z0-9_!#\$%&*@@]|^|RT:?)')
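+# an "RT"/"rt" retweet marker (with optional ":") now only counts as a valid
+# preceding token when it starts the text or follows a non-name character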
+REGEXEN['valid_mention_preceding_chars'] = re.compile(r'(?:[^a-zA-Z0-9_!#\$%&*@@]|^|(?:^|[^a-zA-Z0-9_+~.-])[rR][tT]:?)')
REGEXEN['at_signs'] = re.compile(ur'[@@]')
REGEXEN['valid_mention_or_list'] = re.compile(
ur'(%s)' % REGEXEN['valid_mention_preceding_chars'].pattern.decode('utf-8') + # preceding character
@@ -171,7 +121,7 @@ def regex_range(start, end = None):
ur'(\/[a-zA-Z][a-zA-Z0-9_\-]{0,24})?' # list (optional)
)
REGEXEN['valid_reply'] = re.compile(ur'^(?:[%s])*%s([a-zA-Z0-9_]{1,20})' % (REGEXEN['spaces'].pattern, REGEXEN['at_signs'].pattern), re.IGNORECASE | re.UNICODE)
- # Used in Extractor for final filtering
+# Used in Extractor for final filtering
REGEXEN['end_mention_match'] = re.compile(ur'\A(?:%s|[%s]|:\/\/)' % (REGEXEN['at_signs'].pattern, REGEXEN['latin_accents'].pattern), re.IGNORECASE | re.UNICODE)
# URL related hash regex collection
@@ -179,11 +129,24 @@ def regex_range(start, end = None):
REGEXEN['invalid_url_without_protocol_preceding_chars'] = re.compile(ur'[-_.\/]$')
DOMAIN_VALID_CHARS = ur'[^%s%s%s%s%s]' % (PUNCTUATION_CHARS, SPACE_CHARS, CTRL_CHARS, ur''.join(REGEXEN['invalid_control_characters']), ur''.join(UNICODE_SPACES))
REGEXEN['valid_subdomain'] = re.compile(ur'(?:(?:%s(?:[_-]|%s)*)?%s\.)' % (DOMAIN_VALID_CHARS, DOMAIN_VALID_CHARS, DOMAIN_VALID_CHARS), re.IGNORECASE | re.UNICODE)
-REGEXEN['valid_domain_name'] = re.compile(ur'(?:(?:%s(?:[-]|%s)*)?%s\.)' % (DOMAIN_VALID_CHARS, DOMAIN_VALID_CHARS, DOMAIN_VALID_CHARS), re.IGNORECASE | re.UNICODE)
-REGEXEN['valid_gTLD'] = re.compile(ur'(?:(?:academy|actor|aero|agency|arpa|asia|bar|bargains|berlin|best|bid|bike|biz|blue|boutique|build|builders|buzz|cab|camera|camp|cards|careers|cat|catering|center|ceo|cheap|christmas|cleaning|clothing|club|codes|coffee|com|community|company|computer|construction|contractors|cool|coop|cruises|dance|dating|democrat|diamonds|directory|domains|edu|education|email|enterprises|equipment|estate|events|expert|exposed|farm|fish|flights|florist|foundation|futbol|gallery|gift|glass|gov|graphics|guitars|guru|holdings|holiday|house|immobilien|industries|info|institute|int|international|jobs|kaufen|kim|kitchen|kiwi|koeln|kred|land|lighting|limo|link|luxury|management|mango|marketing|menu|mil|mobi|moda|monash|museum|nagoya|name|net|neustar|ninja|okinawa|onl|org|partners|parts|photo|photography|photos|pics|pink|plumbing|post|pro|productions|properties|pub|qpon|recipes|red|rentals|repair|report|reviews|rich|ruhr|sexy|shiksha|shoes|singles|social|solar|solutions|supplies|supply|support|systems|tattoo|technology|tel|tienda|tips|today|tokyo|tools|training|travel|uno|vacations|ventures|viajes|villas|vision|vote|voting|voto|voyage|wang|watch|wed|wien|wiki|works|xxx|xyz|zone|дети|онлайн|орг|сайт|بازار|شبكة|みんな|中信|中文网|公司|公>益|在线|我爱你|政务|游戏|移动|网络|集团|삼성)(?=[^0-9a-z]|$))', re.IGNORECASE | re.UNICODE)
-REGEXEN['valid_ccTLD'] = re.compile(ur'(?:(?:ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bl|bm|bn|bo|bq|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cu|cv|cw|cx|cy|cz|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mf|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|um|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|za|zm|zw|мон|рф|срб|укр|қаз|الاردن|الجزائر|السعودية|المغرب|امارات|ایران|بھارت|تونس|سودان|سورية|عمان|فلسطين|قطر|مصر|مليسيا|پاکستان|भारत|বাংলা|ভারত|ਭਾਰਤ|ભારત|இந்தியா|இலங்கை|சிங்கப்பூர்|భారత్|ලංකා|ไทย|გე|中国|中加坡|湾|台灣|新香港|한국)(?=[^0-9a-z]|$))', re.IGNORECASE | re.UNICODE)
+REGEXEN['valid_domain_name'] = re.compile(
+ ur'(?:(?:%s(?:[-]|%s)*)?%s\.)' % (
+ DOMAIN_VALID_CHARS,
+ DOMAIN_VALID_CHARS,
+ DOMAIN_VALID_CHARS
+ ), re.IGNORECASE | re.UNICODE)
+REGEXEN['valid_gTLD'] = re.compile(
+ ur'(?:(?:%s)(?=[^0-9a-z@]|$))' % (
+ '|'.join(TLDS['generic']),
+ ), re.IGNORECASE | re.UNICODE)
+REGEXEN['valid_ccTLD'] = re.compile(
+ ur'(?:(?:%s)(?=[^0-9a-z@]|$))' % (
+ '|'.join(TLDS['country']),
+ ), re.IGNORECASE | re.UNICODE)
REGEXEN['valid_punycode'] = re.compile(ur'(?:xn--[0-9a-z]+)', re.IGNORECASE | re.UNICODE)
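+# ccTLDs that are valid even as bare short domains without a path (e.g. t.co)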
+REGEXEN['valid_special_cctld'] = re.compile(ur'(?:(?:co|tv)(?=[^0-9a-z@]|$))')
+
REGEXEN['valid_domain'] = re.compile(ur'(?:%s*%s(?:%s|%s|%s))' % (REGEXEN['valid_subdomain'].pattern, REGEXEN['valid_domain_name'].pattern, REGEXEN['valid_gTLD'].pattern, REGEXEN['valid_ccTLD'].pattern, REGEXEN['valid_punycode'].pattern), re.IGNORECASE | re.UNICODE)
# This is used in Extractor
@@ -194,18 +157,40 @@ def regex_range(start, end = None):
# This is used in Extractor to filter out unwanted URLs.
REGEXEN['invalid_short_domain'] = re.compile(ur'\A%s%s\Z' % (REGEXEN['valid_domain_name'].pattern, REGEXEN['valid_ccTLD'].pattern), re.IGNORECASE | re.UNICODE)
+REGEXEN['valid_special_short_domain'] = re.compile(ur'\A%s%s\Z' % (REGEXEN['valid_domain_name'].pattern, REGEXEN['valid_special_cctld'].pattern))
REGEXEN['valid_port_number'] = re.compile(ur'[0-9]+')
-REGEXEN['valid_general_url_path_chars'] = re.compile(ur"[a-z0-9!\*';:=\+\,\.\$\/%%#\[\]\-_~&|@%s]" % LATIN_ACCENTS, re.IGNORECASE | re.UNICODE)
+# NOTE: Python's re module has no \p{Cyrillic}; the U+0400-U+052F range below
+# (Cyrillic plus Cyrillic Supplement blocks) is an approximation of it.
+REGEXEN['valid_general_url_path_chars'] = re.compile(
+    ur"[a-z\u0400-\u052f0-9!\*';:=\+\,\.\$\/%%#\[\]\-_~&|@%s]" % (
+        LATIN_ACCENTS,
+    ), re.IGNORECASE | re.UNICODE)
# Allow URL paths to contain balanced parens
# 1. Used in Wikipedia URLs like /Primer_(film)
# 2. Used in IIS sessions like /S(dfd346)/
-REGEXEN['valid_url_balanced_parens'] = re.compile(ur'\(%s+\)' % REGEXEN['valid_general_url_path_chars'].pattern, re.IGNORECASE | re.UNICODE)
+# Allow one nested level of balanced parentheses
+REGEXEN['valid_url_balanced_parens'] = re.compile(
+ ur'\((?:%s+|(?:%s*\(%s+\)%s*))\)' % (
+ REGEXEN['valid_general_url_path_chars'].pattern,
+ REGEXEN['valid_general_url_path_chars'].pattern,
+ REGEXEN['valid_general_url_path_chars'].pattern,
+ REGEXEN['valid_general_url_path_chars'].pattern,
+ ), re.IGNORECASE | re.UNICODE)
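+# e.g. matches '(film)' and '(S(dfd346))' (one nested level) but not
+# '(a(b(c)))' (two nested levels).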
# Valid end-of-path characters (so /foo. does not gobble the period).
# 1. Allow = for empty URL parameters and other URL-join artifacts
-REGEXEN['valid_url_path_ending_chars'] = re.compile(ur'[a-z0-9=_#\/\+\-%s]|(?:%s)' % (LATIN_ACCENTS, REGEXEN['valid_url_balanced_parens'].pattern), re.IGNORECASE | re.UNICODE)
-REGEXEN['valid_url_path'] = re.compile(ur'(?:(?:%s*(?:%s %s*)*%s)|(?:%s+\/))' % (REGEXEN['valid_general_url_path_chars'].pattern, REGEXEN['valid_url_balanced_parens'].pattern, REGEXEN['valid_general_url_path_chars'].pattern, REGEXEN['valid_url_path_ending_chars'].pattern, REGEXEN['valid_general_url_path_chars'].pattern), re.IGNORECASE | re.UNICODE)
+REGEXEN['valid_url_path_ending_chars'] = re.compile(
+    ur'[a-z\u0400-\u052f0-9=_#\/\+\-%s]|(?:%s)' % (  # same \p{Cyrillic} stand-in as above
+        LATIN_ACCENTS,
+        REGEXEN['valid_url_balanced_parens'].pattern
+    ), re.IGNORECASE | re.UNICODE)
+# NOTE: the Ruby original is a free-spacing (/x) regex, so the space it carries
+# between the balanced-parens and path-chars parts is insignificant there; in
+# Python's re it would demand a literal space, so the parts are joined directly.
+REGEXEN['valid_url_path'] = re.compile(
+    ur'(?:(?:%s*(?:%s%s*)*%s)|(?:%s+\/))' % (
+        REGEXEN['valid_general_url_path_chars'].pattern,
+        REGEXEN['valid_url_balanced_parens'].pattern,
+        REGEXEN['valid_general_url_path_chars'].pattern,
+        REGEXEN['valid_url_path_ending_chars'].pattern,
+        REGEXEN['valid_general_url_path_chars'].pattern
+    ), re.IGNORECASE | re.UNICODE)
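+# e.g. for 'http://example.com/foo.' the trailing '.' is left out of the path:
+# '.' is a general path char but not an ending char.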
REGEXEN['valid_url_query_chars'] = re.compile(ur"[a-z0-9!?\*'\(\);:&=\+\$\/%#\[\]\-_\.,~|@]", re.IGNORECASE | re.UNICODE)
REGEXEN['valid_url_query_ending_chars'] = re.compile(ur'[a-z0-9_&=#\/]', re.IGNORECASE | re.UNICODE)
@@ -231,7 +216,7 @@ def regex_range(start, end = None):
REGEXEN['valid_cashtag'] = re.compile(ur'(^|[%s])(\$|$|﹩)(%s)(?=$|\s|[%s])' % (REGEXEN['spaces'].pattern, REGEXEN['cashtag'].pattern, PUNCTUATION_CHARS), re.IGNORECASE)
# These URL validation pattern strings are based on the ABNF from RFC 3986
-REGEXEN['validate_url_unreserved'] = re.compile(ur'[a-z0-9\-._~]', re.IGNORECASE | re.UNICODE)
+# (again, \u0400-\u052f stands in for \p{Cyrillic}, which Python's re lacks)
+REGEXEN['validate_url_unreserved'] = re.compile(ur'[a-z\u0400-\u052f0-9\-._~]', re.IGNORECASE | re.UNICODE)
REGEXEN['validate_url_pct_encoded'] = re.compile(ur'(?:%[0-9a-f]{2})', re.IGNORECASE | re.UNICODE)
REGEXEN['validate_url_sub_delims'] = re.compile(ur"[!$&'()*+,;=]", re.IGNORECASE | re.UNICODE)
REGEXEN['validate_url_pchar'] = re.compile(ur'(?:%s|%s|%s|[:\|@])' % (REGEXEN['validate_url_unreserved'].pattern, REGEXEN['validate_url_pct_encoded'].pattern, REGEXEN['validate_url_sub_delims'].pattern), re.IGNORECASE | re.UNICODE)
diff --git a/twitter_text/templatetags/twitterize.py b/twitter_text/templatetags/twitterize.py
index 01db63d..b58779a 100644
--- a/twitter_text/templatetags/twitterize.py
+++ b/twitter_text/templatetags/twitterize.py
@@ -8,15 +8,16 @@
register = Library()
-@register.filter(name = 'twitter_text')
+
+@register.filter(name='twitter_text')
@stringfilter
-def twitter_text(text, search_query = False):
+def twitter_text(text, search_query=False):
"""
    Parses a text string through the TwitterText auto_link method and, if search_query is passed, through the hit_highlight method first.
"""
tt = TwitterText(text)
if search_query:
- tt.text = tt.highlighter.hit_highlight(query = search_query)
- tt.text = tt.autolink.auto_link()
+ tt.text = tt.highlighter.hit_highlight(query=search_query)
+ tt.text = tt.autolink.auto_link()
return tt.text
-twitter_text.is_safe = True
\ No newline at end of file
+twitter_text.is_safe = True
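+# Template usage (illustrative):
+#   {% load twitterize %}
+#   {{ tweet.text|twitter_text }}          {# auto-link only #}
+#   {{ tweet.text|twitter_text:query }}    {# highlight 'query', then auto-link #}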
diff --git a/twitter_text/unicode.py b/twitter_text/unicode.py
index 4e17267..e67238c 100644
--- a/twitter_text/unicode.py
+++ b/twitter_text/unicode.py
@@ -1,6 +1,8 @@
-import types, datetime
+import datetime
+import types
from decimal import Decimal
+
# borrowed from django.utils.encoding
class TwitterTextUnicodeDecodeError(UnicodeDecodeError):
def __init__(self, obj, *args):
@@ -10,7 +12,8 @@ def __init__(self, obj, *args):
def __str__(self):
original = UnicodeDecodeError.__str__(self)
return '%s. You passed in %r (%s)' % (original, self.obj,
- type(self.obj))
+ type(self.obj))
+
def is_protected_type(obj):
"""Determine if the object instance is of a protected type.
@@ -25,6 +28,7 @@ def is_protected_type(obj):
float, Decimal)
)
+
def force_unicode(s, encoding='utf-8', strings_only=False, errors='strict'):
"""
Similar to smart_unicode, except that lazy instances are resolved to
@@ -50,8 +54,8 @@ def force_unicode(s, encoding='utf-8', strings_only=False, errors='strict'):
# without raising a further exception. We do an
# approximation to what the Exception's standard str()
# output should be.
- s = ' '.join([force_unicode(arg, encoding, strings_only,
- errors) for arg in s])
+ s = ' '.join([force_unicode(arg, encoding, strings_only, errors)
+ for arg in s])
elif not isinstance(s, unicode):
# Note: We use .decode() here, instead of unicode(s, encoding,
# errors), so that if s is a SafeString, it ends up being a
@@ -66,6 +70,6 @@ def force_unicode(s, encoding='utf-8', strings_only=False, errors='strict'):
# working unicode method. Try to handle this without raising a
# further exception by individually forcing the exception args
# to unicode.
- s = ' '.join([force_unicode(arg, encoding, strings_only,
- errors) for arg in s])
+ s = ' '.join([force_unicode(arg, encoding, strings_only, errors)
+ for arg in s])
return s
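+# e.g. force_unicode(b'caf\xc3\xa9') -> u'caf\xe9'; exception args that cannot
+# be decoded are forced to unicode one by one above (illustrative).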
diff --git a/twitter_text/validation.py b/twitter_text/validation.py
index 6dea5f9..3990ddf 100644
--- a/twitter_text/validation.py
+++ b/twitter_text/validation.py
@@ -2,24 +2,25 @@
import re
-from twitter_text.unicode import force_unicode
+from twitter_text.encoding import force_text
from twitter_text.extractor import Extractor
from twitter_text.regex import REGEXEN
MAX_LENGTH = 140
DEFAULT_TCO_URL_LENGTHS = {
- 'short_url_length': 22,
- 'short_url_length_https': 23,
- 'characters_reserved_per_media': 22,
+ 'short_url_length': 23,
+ 'short_url_length_https': 23,
+ 'characters_reserved_per_media': 22,
}
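+# NOTE: http links now cost 23 characters, same as https; these defaults are
+# assumed to track the values served by Twitter's help/configuration endpoint.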
+
class Validation(object):
def __init__(self, text, **kwargs):
- self.text = force_unicode(text)
+ self.text = force_text(text)
self.parent = kwargs.get('parent', False)
-
- def tweet_length(self, options = {}):
+
+    def tweet_length(self, options=None):
"""
        Returns the length of the string as it would be displayed. This is equivalent to the length of the Unicode NFC
(See: http://www.unicode.org/reports/tr15). This is needed in order to consistently calculate the length of a
@@ -34,14 +35,14 @@ def tweet_length(self, options = {}):
The string could also contain U+00E9 already, in which case the canonicalization will not change the value.
"""
- assert (not self.parent or not getattr(self.parent, 'has_been_linked', False) ), 'The validator should only be run on text before it has been modified.'
+        assert (not self.parent or not getattr(self.parent, 'has_been_linked', False)), 'The validator should only be run on text before it has been modified.'
+        # completes the mutable-default fix in the signature above
+        if options is None:
+            options = {}
for key in DEFAULT_TCO_URL_LENGTHS:
- if not key in options:
+ if key not in options:
options[key] = DEFAULT_TCO_URL_LENGTHS[key]
length = len(self.text)
- # thanks force_unicode for making this so much simpler than the ruby version
+ # thanks force_text for making this so much simpler than the ruby version
for url in Extractor(self.text).extract_urls_with_indices():
            # remove the length of the original URL
@@ -52,21 +53,22 @@ def tweet_length(self, options = {}):
if self.parent and hasattr(self.parent, 'tweet_length'):
self.parent.tweet_length = length
return length
-
+
def tweet_invalid(self):
"""
Check the text for any reason that it may not be valid as a Tweet. This is meant as a pre-validation
before posting to api.twitter.com. There are several server-side reasons for Tweets to fail but this pre-validation
will allow quicker feedback.
-
+
        Returns False if this text is valid. Otherwise one of the following strings is returned:
-
+
"Too long":: if the text is too long
"Empty text":: if the text is empty
"Invalid characters":: if the text contains non-Unicode or any of the disallowed Unicode characters
"""
- valid = True # optimism
+ # optimism
+ valid = True
validation_error = None
if not self.tweet_length():
@@ -77,7 +79,7 @@ def tweet_invalid(self):
if re.search(ur''.join(REGEXEN['invalid_control_characters']), self.text):
valid, validation_error = False, 'Invalid characters'
-
+
if self.parent and hasattr(self.parent, 'tweet_is_valid'):
self.parent.tweet_is_valid = valid
if self.parent and hasattr(self.parent, 'tweet_validation_error'):
@@ -108,7 +110,7 @@ def valid_hashtag(self):
return len(extracted) == 1 and extracted[0] == self.text[1:]
- def valid_url(self, unicode_domains = True, require_protocol = True):
+ def valid_url(self, unicode_domains=True, require_protocol=True):
if not self.text:
return False
@@ -121,38 +123,34 @@ def valid_url(self, unicode_domains = True, require_protocol = True):
if not (
(
- not require_protocol
- or (
- self._valid_match(scheme, REGEXEN['validate_url_scheme'])
- and re.compile(ur'^https?$', re.IGNORECASE).match(scheme)
+ not require_protocol or (
+ self._valid_match(scheme, REGEXEN['validate_url_scheme']) and
+ re.compile(ur'^https?$', re.IGNORECASE).match(scheme)
)
- )
- and (
- path == ''
- or self._valid_match(path, REGEXEN['validate_url_path'])
- )
- and self._valid_match(query, REGEXEN['validate_url_query'], True)
- and self._valid_match(fragment, REGEXEN['validate_url_fragment'], True)
+ ) and (
+ path == '' or
+ self._valid_match(path, REGEXEN['validate_url_path'])
+ ) and
+ self._valid_match(query, REGEXEN['validate_url_query'], True) and
+ self._valid_match(fragment, REGEXEN['validate_url_fragment'], True)
):
return False
return bool(
(
- unicode_domains
- and self._valid_match(authority, REGEXEN['validate_url_unicode_authority'])
- and REGEXEN['validate_url_unicode_authority'].match(authority).string == authority
- )
- or (
- not unicode_domains
- and self._valid_match(authority, REGEXEN['validate_url_authority'])
- and REGEXEN['validate_url_authority'].match(authority).string == authority
+ unicode_domains and
+ self._valid_match(authority, REGEXEN['validate_url_unicode_authority']) and
+ REGEXEN['validate_url_unicode_authority'].match(authority).string == authority
+ ) or (
+ not unicode_domains and
+ self._valid_match(authority, REGEXEN['validate_url_authority']) and
+ REGEXEN['validate_url_authority'].match(authority).string == authority
)
)
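+        # e.g. valid_url() should pass 'http://example.com' and, with the
+        # default require_protocol=True, reject bare 'example.com' (illustrative).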
- def _valid_match(self, string, re_obj, optional = False):
- if optional and string is None:
- return True
- match = re_obj.match(string)
+    def _valid_match(self, string, re_obj, optional=False):
+        # treat a None/empty component as "no match" so `match` is always
+        # bound for the checks below (the unguarded version left it undefined)
+        match = re_obj.match(string) if string else None
if optional:
return not (string and (match is None or not match.string[match.span()[0]:match.span()[1]] == string))
else: