diff --git a/.gitignore b/.gitignore index 17b1f10..20a1408 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,5 @@ *.pyc build* *.egg* -dist \ No newline at end of file +dist +/.cache diff --git a/.gitmodules b/.gitmodules index f051160..e5d7799 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,3 +1,3 @@ -[submodule "twitter-text-conformance"] - path = twitter-text-conformance - url = https://github.com/dryan/twitter-text-conformance.git +[submodule "twitter-text"] + path = twitter-text + url = https://github.com/twitter/twitter-text.git diff --git a/.travis.yml b/.travis.yml index b954140..b29bfb2 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,11 +1,11 @@ +sudo: false language: python python: - - "2.6" - "2.7" install: - - "pip install . --use-mirrors" - - "pip install -r requirements.txt --use-mirrors" -script: "python ./tests.py" + - "pip install ." + - "pip install -r requirements.txt" +script: "py.test" notifications: email: false diff --git a/README.md b/README.md index fb6d3cd..1c0537d 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ A port of the Ruby gem [twitter-text-rb](https://github.com/twitter/twitter-text-rb) to Python. -[![Build Status](https://travis-ci.org/dryan/twitter-text-py.png?branch=master)](https://travis-ci.org/dryan/twitter-text-py) +[![Build Status](https://travis-ci.org/muckrack/twitter-text-py.svg?branch=master)](https://travis-ci.org/muckrack/twitter-text-py) # Changes in 2.0 diff --git a/conftest.py b/conftest.py new file mode 100644 index 0000000..221c885 --- /dev/null +++ b/conftest.py @@ -0,0 +1,177 @@ +# encoding=utf-8 + +from __future__ import unicode_literals + +import json +import os + +import pytest +import yaml + +import twitter_text +from twitter_text.encoding import force_text, smart_bytes + +# from http://stackoverflow.com/questions/2890146/how-to-force-pyyaml-to-load-strings-as-unicode-objects +#from yaml import Loader, SafeLoader + + +narrow_build = True +try: + unichr(0x20000) + narrow_build = False +except: + pass + + +#def construct_yaml_str(self, node): +# return self.construct_scalar(node) +#Loader.add_constructor(u'tag:yaml.org,2002:str', construct_yaml_str) +#SafeLoader.add_constructor(u'tag:yaml.org,2002:str', construct_yaml_str) + + +try: + from bs4 import BeautifulSoup +except ImportError: + raise Exception('You need to install BeautifulSoup4 to run the tests') + + +def pytest_collect_file(parent, path): + if path.ext == '.yml': + return YamlFile(path, parent) + + +class YamlException(Exception): + """ custom exception for error reporting. 
""" + + +class YamlFile(pytest.File): + def collect(self): + filename = os.path.splitext(os.path.basename(self.fspath.strpath))[0] + if filename not in TEST_MAP: + print "Skipping {}; not supported".format(filename) + return + if TEST_MAP[filename].get('requires_wide_build') and narrow_build: + print "Skipping {} due to narrow build".format(filename) + return + raw = yaml.safe_load(force_text(self.fspath.open().read())) + if 'tests' not in raw: + return + for section, specs in raw['tests'].items(): + for spec in specs: + yield YamlItem(self, filename, section, spec) + + +TEST_MAP = { + 'autolink': { + 'cls': twitter_text.autolink.Autolink, + 'options': {'suppress_no_follow': True}, + 'methods': { + 'usernames': 'auto_link_usernames_or_lists', + 'cashtags': 'auto_link_cashtags', + 'urls': 'auto_link_urls', + 'hashtags': 'auto_link_hashtags', + 'all': 'auto_link', + 'lists': 'auto_link_usernames_or_lists', + 'json': 'auto_link_with_json', + }, + 'ignore_attribute_order': set([ + 'usernames', + 'cashtags', + 'urls', + 'hashtags', + 'all', + 'lists', + 'json', + ]) + }, + 'extract': { + 'cls': twitter_text.extractor.Extractor, + 'methods': { + 'mentions': 'extract_mentioned_screen_names', + 'mentions_with_indices': 'extract_mentioned_screen_names_with_indices', + 'mentions_or_lists_with_indices': 'extract_mentions_or_lists_with_indices', + 'replies': 'extract_reply_screen_name', + 'urls': 'extract_urls', + 'urls_with_indices': 'extract_urls_with_indices', + 'hashtags': 'extract_hashtags', + 'cashtags': 'extract_cashtags', + 'hashtags_with_indices': 'extract_hashtags_with_indices', + 'cashtags_with_indices': 'extract_cashtags_with_indices', + }, + }, + 'hit_highlighting': { + 'cls': twitter_text.highlighter.HitHighlighter, + 'methods': { + 'plain_text': 'hit_highlight', + 'with_links': 'hit_highlight', + }, + 'ignore_attribute_order': set([ + 'with_links', + ]) + }, + 'validate': { + 'cls': twitter_text.validation.Validation, + 'requires_wide_build': True, + 'methods': { + 'tweets': 'valid_tweet_text', + 'usernames': 'valid_username', + 'lists': 'valid_list', + 'hashtags': 'valid_hashtag', + 'urls': 'valid_url', + 'urls_without_protocol': ('valid_url', {'require_protocol': False}), + 'lengths': 'tweet_length', + }, + } +} + + +class YamlItem(pytest.Item): + def __init__(self, parent, filename, section, spec): + self.section = section + self.filename = filename + self.spec = spec + name = "{}:{}:{}".format(filename, section, spec['description']) + super(YamlItem, self).__init__(name, parent) + + def _equal_without_attribute_order(self, result, expected): + # Beautiful Soup sorts the attributes for us so we can skip all the hoops the ruby version jumps through + return BeautifulSoup(result, "lxml") == BeautifulSoup(expected, "lxml") + + def runtest(self): + if self.filename not in TEST_MAP: + raise YamlException("{} file not supported".format(self.section)) + if self.section not in TEST_MAP[self.filename]['methods']: + raise YamlException("{}:{} section not supported".format(self.section)) + cls = TEST_MAP[self.filename]['cls'] + instance = cls(self.spec['text']) + args = [] + try: + method_name, kwargs = TEST_MAP[self.filename]['methods'][self.section] + kwargs = kwargs.copy() + except ValueError: + kwargs = {} + method_name = TEST_MAP[self.filename]['methods'][self.section] + if 'json' in self.spec: + args.append(json.loads(self.spec['json'])) + if 'options' in TEST_MAP[self.filename]: + kwargs['options'] = TEST_MAP[self.filename]['options'] + if 'hits' in self.spec: + kwargs['hits'] = 
self.spec['hits'] + result = getattr(instance, method_name)(*args, **kwargs) + if self.section in TEST_MAP[self.filename].get('ignore_attribute_order', ()): + equal = self._equal_without_attribute_order(result, self.spec['expected']) + else: + equal = result == self.spec['expected'] + if not equal: + raise YamlException("{} != {}".format(result, self.spec['expected'])) + + def repr_failure(self, excinfo): + """ called when self.runtest() raises an exception. """ + if isinstance(excinfo.value, YamlException): + return smart_bytes("\n".join([ + "usecase execution failed", + " {}".format(*excinfo.value.args) + ])) + + def reportinfo(self): + return self.fspath, 0, smart_bytes("usecase: %s" % self.name) diff --git a/requirements.txt b/requirements.txt index 0ac3552..3cfae74 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,8 @@ argparse==1.2.1 PyYAML==3.10 -beautifulsoup4==4.2.0 +beautifulsoup4==4.4.1 +Django==1.9.6 +lxml==3.4.4 +pytest==2.9.1 +py==1.4.29 +regex==2016.04.25 diff --git a/setup.py b/setup.py index fcdabb2..bb27c76 100644 --- a/setup.py +++ b/setup.py @@ -1,5 +1,5 @@ from setuptools import setup, find_packages - + setup( name='twitter-text-py', version='2.0.2', @@ -19,5 +19,5 @@ ], include_package_data=True, install_requires=['setuptools'], - license = "BSD" + license="BSD" ) diff --git a/tests.py b/tests.py deleted file mode 100644 index 891b35e..0000000 --- a/tests.py +++ /dev/null @@ -1,180 +0,0 @@ -# encoding=utf-8 - -import twitter_text, sys, os, json, argparse, re -from twitter_text.unicode import force_unicode - -narrow_build = True -try: - unichr(0x20000) - narrow_build = False -except: - pass - -parser = argparse.ArgumentParser(description = u'Run the integration tests for twitter_text') -parser.add_argument('--ignore-narrow-errors', '-i', help = u'Ignore errors caused by narrow builds', default = False, action = 'store_true') -args = parser.parse_args() - -try: - import yaml -except ImportError: - raise Exception('You need to install pyaml to run the tests') -# from http://stackoverflow.com/questions/2890146/how-to-force-pyyaml-to-load-strings-as-unicode-objects -from yaml import Loader, SafeLoader -def construct_yaml_str(self, node): - return self.construct_scalar(node) -Loader.add_constructor(u'tag:yaml.org,2002:str', construct_yaml_str) -SafeLoader.add_constructor(u'tag:yaml.org,2002:str', construct_yaml_str) - -try: - from bs4 import BeautifulSoup -except ImportError: - try: - from BeautifulSoup import BeautifulSoup - except ImportError: - raise Exception('You need to install BeautifulSoup to run the tests') - -def success(text): - return (u'\033[92m%s\033[0m\n' % text).encode('utf-8') - -def error(text): - return (u'\033[91m%s\033[0m\n' % text).encode('utf-8') - -attempted = 0 - -def assert_equal_without_attribute_order(result, test, failure_message = None): - global attempted - attempted += 1 - # Beautiful Soup sorts the attributes for us so we can skip all the hoops the ruby version jumps through - assert BeautifulSoup(result) == BeautifulSoup(test.get('expected')), error(u'Test %d Failed: %s' % (attempted, test.get('description'))) - sys.stdout.write(success(u'Test %d Passed: %s' % (attempted, test.get('description')))) - sys.stdout.flush() - -def assert_equal(result, test): - global attempted - attempted += 1 - assert result == test.get('expected'), error(u'\nTest %d Failed: %s%s' % (attempted, test.get('description'), u'\n%s' % test.get('hits') if test.get('hits') else '')) - sys.stdout.write(success(u'Test %d Passed: %s' % (attempted, 
test.get('description')))) - sys.stdout.flush() - -# extractor section -extractor_file = open(os.path.join('twitter-text-conformance', 'extract.yml'), 'r') -extractor_tests = yaml.load(force_unicode(extractor_file.read())) -extractor_file.close() - -sys.stdout.write('Testing Extractor\n') -sys.stdout.flush() - -for section in extractor_tests.get('tests'): - sys.stdout.write('\nTesting Extractor: %s\n' % section) - sys.stdout.flush() - for test in extractor_tests.get('tests').get(section): - if (args.ignore_narrow_errors or narrow_build) and section in ['hashtags'] and test.get('description') in ['Hashtag with ideographic iteration mark']: - sys.stdout.write('Skipping: %s\n' % test.get('description')) - sys.stdout.flush() - continue - extractor = twitter_text.extractor.Extractor(test.get('text')) - if section == 'mentions': - assert_equal(extractor.extract_mentioned_screen_names(), test) - elif section == 'mentions_with_indices': - assert_equal(extractor.extract_mentioned_screen_names_with_indices(), test) - elif section == 'mentions_or_lists_with_indices': - assert_equal(extractor.extract_mentions_or_lists_with_indices(), test) - elif section == 'replies': - assert_equal(extractor.extract_reply_screen_name(), test) - elif section == 'urls': - assert_equal(extractor.extract_urls(), test) - elif section == 'urls_with_indices': - assert_equal(extractor.extract_urls_with_indices(), test) - elif section == 'hashtags': - assert_equal(extractor.extract_hashtags(), test) - elif section == 'cashtags': - assert_equal(extractor.extract_cashtags(), test) - elif section == 'hashtags_with_indices': - assert_equal(extractor.extract_hashtags_with_indices(), test) - elif section == 'cashtags_with_indices': - assert_equal(extractor.extract_cashtags_with_indices(), test) - -# autolink section -autolink_file = open(os.path.join('twitter-text-conformance', 'autolink.yml'), 'r') -autolink_tests = yaml.load(force_unicode(autolink_file.read())) -autolink_file.close() - -sys.stdout.write('\nTesting Autolink\n') -sys.stdout.flush() - -autolink_options = {'suppress_no_follow': True} - -for section in autolink_tests.get('tests'): - sys.stdout.write('\nTesting Autolink: %s\n' % section) - for test in autolink_tests.get('tests').get(section): - if (args.ignore_narrow_errors or narrow_build) and section in ['hashtags'] and test.get('description') in ['Autolink a hashtag containing ideographic iteration mark']: - sys.stdout.write('Skipping: %s\n' % test.get('description')) - sys.stdout.flush() - continue - autolink = twitter_text.autolink.Autolink(test.get('text')) - if section == 'usernames': - assert_equal_without_attribute_order(autolink.auto_link_usernames_or_lists(autolink_options), test) - elif section == 'cashtags': - assert_equal_without_attribute_order(autolink.auto_link_cashtags(autolink_options), test) - elif section == 'urls': - assert_equal_without_attribute_order(autolink.auto_link_urls(autolink_options), test) - elif section == 'hashtags': - assert_equal_without_attribute_order(autolink.auto_link_hashtags(autolink_options), test) - elif section == 'all': - assert_equal_without_attribute_order(autolink.auto_link(autolink_options), test) - elif section == 'lists': - assert_equal_without_attribute_order(autolink.auto_link_usernames_or_lists(autolink_options), test) - elif section == 'json': - assert_equal_without_attribute_order(autolink.auto_link_with_json(json.loads(test.get('json')), autolink_options), test) - -# hit_highlighting section -hit_highlighting_file = 
open(os.path.join('twitter-text-conformance', 'hit_highlighting.yml'), 'r') -hit_highlighting_tests = yaml.load(force_unicode(hit_highlighting_file.read())) -hit_highlighting_file.close() - -sys.stdout.write('\nTesting Hit Highlighting\n') -sys.stdout.flush() - -for section in hit_highlighting_tests.get('tests'): - sys.stdout.write('\nTesting Hit Highlighting: %s\n' % section) - for test in hit_highlighting_tests.get('tests').get(section): - hit_highlighter = twitter_text.highlighter.HitHighlighter(test.get('text')) - if section == 'plain_text': - assert_equal(hit_highlighter.hit_highlight(hits = test.get('hits')), test) - elif section == 'with_links': - assert_equal_without_attribute_order(hit_highlighter.hit_highlight(hits = test.get('hits')), test) - -# validation section -validation_tested = False -validate_tests = None -try: - validate_file = open(os.path.join('twitter-text-conformance', 'validate.yml'), 'r') - validate_file_contents = validate_file.read() - validate_tests = yaml.load(re.sub(ur'\\n', '\n', validate_file_contents.encode('unicode-escape'))) - validate_file.close() -except ValueError: - sys.stdout.write('\nValidation tests were skipped because of wide character issues\n') - sys.stdout.flush() - -if validate_tests: - sys.stdout.write('\nTesting Validation\n') - sys.stdout.flush() - - for section in validate_tests.get('tests'): - sys.stdout.write('\nTesting Validation: %s\n' % section) - for test in validate_tests.get('tests').get(section): - validator = twitter_text.validation.Validation(test.get('text')) - if section == 'tweets': - assert_equal(not validator.tweet_invalid(), test) - elif section == 'usernames': - assert_equal(validator.valid_username(), test) - elif section == 'lists': - assert_equal(validator.valid_list(), test) - elif section == 'hashtags': - assert_equal(validator.valid_hashtag(), test) - elif section == 'urls': - assert_equal(validator.valid_url(), test) - -sys.stdout.write(u'\033[0m-------\n\033[92m%d tests passed.\033[0m\n' % attempted) -sys.stdout.flush() -sys.exit(os.EX_OK) \ No newline at end of file diff --git a/twitter-text b/twitter-text new file mode 160000 index 0000000..fb07f2e --- /dev/null +++ b/twitter-text @@ -0,0 +1 @@ +Subproject commit fb07f2e30c1d3d053cf2bb2ad6971a3bcfc9b568 diff --git a/twitter-text-conformance b/twitter-text-conformance deleted file mode 160000 index 9b58c44..0000000 --- a/twitter-text-conformance +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 9b58c44302c4ab5bab261f6cfaf6ca89b5a6cf35 diff --git a/twitter_text/__init__.py b/twitter_text/__init__.py index bb06120..e267dac 100644 --- a/twitter_text/__init__.py +++ b/twitter_text/__init__.py @@ -4,35 +4,41 @@ from twitter_text.extractor import Extractor from twitter_text.highlighter import HitHighlighter from twitter_text.validation import Validation -from twitter_text.unicode import force_unicode +from twitter_text.encoding import force_text + class TwitterText(object): def __init__(self, text): - self.text = force_unicode(text) # this will get modified by some functions - self.original_text = self.text # this never changes; use it as a fallback or for comparison + # this will get modified by some functions + self.text = force_text(text) + # this never changes; use it as a fallback or for comparison + self.original_text = self.text self.has_been_linked = False - self.tweet_length = None # gets changed by validation method - self.tweet_is_valid = None # gets changed by validation method - self.validation_error = None # gets changed by validation method - + # gets 
changed by validation method + self.tweet_length = None + # gets changed by validation method + self.tweet_is_valid = None + # gets changed by validation method + self.validation_error = None + def __unicode__(self): return self.text - + def __repr__(self): return self.__unicode__() - + @property def autolink(self): - return Autolink(self.text, parent = self) - + return Autolink(self.text, parent=self) + @property def extractor(self): return Extractor(self.text) - + @property def highlighter(self): - return HitHighlighter(self.text, parent = self) - + return HitHighlighter(self.text, parent=self) + @property def validation(self): - return Validation(self.text, parent = self) \ No newline at end of file + return Validation(self.text, parent=self) diff --git a/twitter_text/autolink.py b/twitter_text/autolink.py index 821d042..7b69f83 100644 --- a/twitter_text/autolink.py +++ b/twitter_text/autolink.py @@ -1,9 +1,9 @@ # encoding=utf-8 -import re, cgi +import re from twitter_text.regex import REGEXEN -from twitter_text.unicode import force_unicode +from twitter_text.encoding import force_text from twitter_text.extractor import Extractor # Default CSS class for auto-linked lists @@ -28,17 +28,17 @@ DEFAULT_INVISIBLE_TAG_ATTRS = "style='position:absolute;left:-9999px;'" DEFAULT_OPTIONS = { - 'list_class': DEFAULT_LIST_CLASS, - 'username_class': DEFAULT_USERNAME_CLASS, - 'hashtag_class': DEFAULT_HASHTAG_CLASS, - 'cashtag_class': DEFAULT_CASHTAG_CLASS, + 'list_class': DEFAULT_LIST_CLASS, + 'username_class': DEFAULT_USERNAME_CLASS, + 'hashtag_class': DEFAULT_HASHTAG_CLASS, + 'cashtag_class': DEFAULT_CASHTAG_CLASS, - 'username_url_base': DEFAULT_USERNAME_URL_BASE, - 'list_url_base': DEFAULT_LIST_URL_BASE, - 'hashtag_url_base': DEFAULT_HASHTAG_URL_BASE, - 'cashtag_url_base': DEFAULT_CASHTAG_URL_BASE, + 'username_url_base': DEFAULT_USERNAME_URL_BASE, + 'list_url_base': DEFAULT_LIST_URL_BASE, + 'hashtag_url_base': DEFAULT_HASHTAG_URL_BASE, + 'cashtag_url_base': DEFAULT_CASHTAG_URL_BASE, - 'invisible_tag_attrs': DEFAULT_INVISIBLE_TAG_ATTRS, + 'invisible_tag_attrs': DEFAULT_INVISIBLE_TAG_ATTRS, } OPTIONS_NOT_ATTRIBUTES = ( @@ -69,30 +69,32 @@ ) HTML_ENTITIES = { - '&': '&', - '>': '>', - '<': '<', - '"': '"', - "'": ''', + '&': '&', + '>': '>', + '<': '<', + '"': '"', + "'": ''', } BOOLEAN_ATTRIBUTES = ( - 'disabled', + 'disabled', 'readonly', 'multiple', 'checked', ) + def default_transform(entity, text): return text + class Autolink(object): def __init__(self, text, **kwargs): - self.text = force_unicode(text) + self.text = force_text(text) self.parent = kwargs.get('parent', False) self.extractor = Extractor(self.text) - def auto_link_with_json(self, json_obj, options = {}): + def auto_link_with_json(self, json_obj, options={}): # concantenate entities entities = [] if 'entities' in json_obj: @@ -108,7 +110,7 @@ def auto_link_with_json(self, json_obj, options = {}): return self.auto_link_entities(entities, options) - def auto_link_entities(self, entities = [], options = {}): + def auto_link_entities(self, entities=[], options={}): if not self.text: return self.text @@ -118,7 +120,7 @@ def auto_link_entities(self, entities = [], options = {}): if not options.get('suppress_no_follow', False): options['html_attrs']['rel'] = "nofollow" - entities.sort(key = lambda entity: entity['indices'][0], reverse = True) + entities.sort(key=lambda entity: entity['indices'][0], reverse=True) chars = self.text for entity in entities: @@ -133,7 +135,7 @@ def auto_link_entities(self, entities = [], options = {}): return 
chars - def auto_link(self, options = {}): + def auto_link(self, options={}): """ Add tags around the usernames, lists, hashtags and URLs in the provided text. The tags can be controlled with the following entries in the options hash. @@ -161,7 +163,7 @@ def auto_link(self, options = {}): """ return self.auto_link_entities(self.extractor.extract_entities_with_indices({'extract_url_without_protocol': False}), options) - def auto_link_usernames_or_lists(self, options = {}): + def auto_link_usernames_or_lists(self, options={}): """ Add tags around the usernames and lists in the provided text. The tags can be controlled with the following entries in the options hash. @@ -182,7 +184,7 @@ def auto_link_usernames_or_lists(self, options = {}): """ return self.auto_link_entities(self.extractor.extract_mentions_or_lists_with_indices(), options) - def auto_link_hashtags(self, options = {}): + def auto_link_hashtags(self, options={}): """ Add tags around the hashtags in the provided text. The tags can be controlled with the following entries in the options hash. @@ -199,7 +201,7 @@ def auto_link_hashtags(self, options = {}): """ return self.auto_link_entities(self.extractor.extract_hashtags_with_indices(), options) - def auto_link_cashtags(self, options = {}): + def auto_link_cashtags(self, options={}): """ Add tags around the cashtags in the provided text. The tags can be controlled with the following entries in the options hash. @@ -216,7 +218,7 @@ def auto_link_cashtags(self, options = {}): """ return self.auto_link_entities(self.extractor.extract_cashtags_with_indices(), options) - def auto_link_urls(self, options = {}): + def auto_link_urls(self, options={}): """ Add tags around the URLs in the provided text. The tags can be controlled with the following entries in the options hash. @@ -240,13 +242,13 @@ def _html_escape(self, text): text = text.replace(char, HTML_ENTITIES[char]) return text - def _extract_html_attrs_from_options(self, options = {}): + def _extract_html_attrs_from_options(self, options={}): html_attrs = options.get('html_attrs', {}) options = options.copy() if 'html_attrs' in options: del(options['html_attrs']) for option in options.keys(): - if not option in OPTIONS_NOT_ATTRIBUTES: + if option not in OPTIONS_NOT_ATTRIBUTES: html_attrs[option] = options[option] return html_attrs @@ -256,7 +258,7 @@ def _url_entities_hash(self, url_entities): entities[entity.get('url')] = entity return entities - def _link_to_url(self, entity, chars, options = {}): + def _link_to_url(self, entity, chars, options={}): url = entity.get('url') href = options.get('link_url_transform', lambda x: x)(url) @@ -284,7 +286,7 @@ def _link_to_url(self, entity, chars, options = {}): link = self._link_to_text(entity, link_text, href, html_attrs, options) return chars[:entity['indices'][0]] + link + chars[entity['indices'][1]:] - def _link_url_with_entity(self, entity, options = {}): + def _link_url_with_entity(self, entity, options={}): """ Goal: If a user copies and pastes a tweet containing t.co'ed link, the resulting paste should contain the full original URL (expanded_url), not the display URL. 
@@ -348,7 +350,7 @@ def _link_url_with_entity(self, entity, options = {}): else: return self._html_escape(display_url) - def _link_to_hashtag(self, entity, chars, options = {}): + def _link_to_hashtag(self, entity, chars, options={}): hashchar = chars[entity['indices'][0]] hashtag = entity['hashtag'] hashtag_class = options.get('hashtag_class') @@ -368,7 +370,7 @@ def _link_to_hashtag(self, entity, chars, options = {}): link = self._link_to_text_with_symbol(entity, hashchar, hashtag, href, html_attrs, options) return chars[:entity['indices'][0]] + link + chars[entity['indices'][1]:] - def _link_to_cashtag(self, entity, chars, options = {}): + def _link_to_cashtag(self, entity, chars, options={}): dollar = chars[entity['indices'][0]] cashtag = entity['cashtag'] @@ -383,10 +385,9 @@ def _link_to_cashtag(self, entity, chars, options = {}): link = self._link_to_text_with_symbol(entity, dollar, cashtag, href, html_attrs, options) return chars[:entity['indices'][0]] + link + chars[entity['indices'][1]:] - def _link_to_screen_name(self, entity, chars, options = {}): + def _link_to_screen_name(self, entity, chars, options={}): name = u'%s%s' % (entity['screen_name'], entity.get('list_slug') or '') chunk = options.get('link_text_transform', default_transform)(entity, name) - name = name.lower() at = chars[entity['indices'][0]] @@ -404,7 +405,7 @@ def _link_to_screen_name(self, entity, chars, options = {}): link = self._link_to_text_with_symbol(entity, at, chunk, href, html_attrs, options) return chars[:entity['indices'][0]] + link + chars[entity['indices'][1]:] - def _link_to_text_with_symbol(self, entity, symbol, text, href, attributes = {}, options = {}): + def _link_to_text_with_symbol(self, entity, symbol, text, href, attributes={}, options={}): tagged_symbol = u'<%s>%s' % (options.get('symbol_tag'), symbol, options.get('symbol_tag')) if options.get('symbol_tag') else symbol text = self._html_escape(text) tagged_text = u'<%s>%s' % (options.get('text_with_symbol_tag'), text, options.get('text_with_symbol_tag')) if options.get('text_with_symbol_tag') else text @@ -413,14 +414,14 @@ def _link_to_text_with_symbol(self, entity, symbol, text, href, attributes = {}, else: return u'%s%s' % (tagged_symbol, self._link_to_text(entity, tagged_text, href, attributes, options)) - def _link_to_text(self, entity, text, href, attributes = {}, options = {}): + def _link_to_text(self, entity, text, href, attributes={}, options={}): attributes['href'] = href if options.get('link_attribute_transform'): attributes = options.get('link_attribute_transform')(entity, attributes) text = options.get('link_text_transform', default_transform)(entity, text) return u'%s' % (self._tag_attrs(attributes), text) - def _tag_attrs(self, attributes = {}): + def _tag_attrs(self, attributes={}): attrs = [] for key in sorted(attributes.keys()): value = attributes[key] @@ -431,4 +432,4 @@ def _tag_attrs(self, attributes = {}): value = u' '.join(value) attrs.append(u'%s="%s"' % (self._html_escape(key), self._html_escape(value))) - return u' '.join(attrs) \ No newline at end of file + return u' '.join(attrs) diff --git a/twitter_text/encoding.py b/twitter_text/encoding.py new file mode 100644 index 0000000..bde3ce7 --- /dev/null +++ b/twitter_text/encoding.py @@ -0,0 +1,239 @@ +# flake8: noqa +# Taken from django.utils.encoding +from __future__ import unicode_literals + +import codecs +import datetime +from decimal import Decimal +import locale + +from django.utils.functional import Promise +from django.utils import six +from 
django.utils.six.moves.urllib.parse import quote + + +class DjangoUnicodeDecodeError(UnicodeDecodeError): + def __init__(self, obj, *args): + self.obj = obj + UnicodeDecodeError.__init__(self, *args) + + def __str__(self): + original = UnicodeDecodeError.__str__(self) + return '%s. You passed in %r (%s)' % (original, self.obj, + type(self.obj)) + + +def python_2_unicode_compatible(klass): + """ + A decorator that defines __unicode__ and __str__ methods under Python 2. + Under Python 3 it does nothing. + + To support Python 2 and 3 with a single code base, define a __str__ method + returning text and apply this decorator to the class. + """ + if six.PY2: + if '__str__' not in klass.__dict__: + raise ValueError("@python_2_unicode_compatible cannot be applied " + "to %s because it doesn't define __str__()." % + klass.__name__) + klass.__unicode__ = klass.__str__ + klass.__str__ = lambda self: self.__unicode__().encode('utf-8') + return klass + + +def smart_text(s, encoding='utf-8', strings_only=False, errors='strict'): + """ + Returns a text object representing 's' -- unicode on Python 2 and str on + Python 3. Treats bytestrings using the 'encoding' codec. + + If strings_only is True, don't convert (some) non-string-like objects. + """ + if isinstance(s, Promise): + # The input is the result of a gettext_lazy() call. + return s + return force_text(s, encoding, strings_only, errors) + + +def is_protected_type(obj): + """Determine if the object instance is of a protected type. + + Objects of protected types are preserved as-is when passed to + force_text(strings_only=True). + """ + return isinstance(obj, six.integer_types + (type(None), float, Decimal, + datetime.datetime, datetime.date, datetime.time)) + + +def force_text(s, encoding='utf-8', strings_only=False, errors='strict'): + """ + Similar to smart_text, except that lazy instances are resolved to + strings, rather than kept as lazy objects. + + If strings_only is True, don't convert (some) non-string-like objects. + """ + # Handle the common case first for performance reasons. + if isinstance(s, six.text_type): + return s + if strings_only and is_protected_type(s): + return s + try: + if not isinstance(s, six.string_types): + if six.PY3: + if isinstance(s, bytes): + s = six.text_type(s, encoding, errors) + else: + s = six.text_type(s) + elif hasattr(s, '__unicode__'): + s = six.text_type(s) + else: + s = six.text_type(bytes(s), encoding, errors) + else: + # Note: We use .decode() here, instead of six.text_type(s, encoding, + # errors), so that if s is a SafeBytes, it ends up being a + # SafeText at the end. + s = s.decode(encoding, errors) + except UnicodeDecodeError as e: + if not isinstance(s, Exception): + raise DjangoUnicodeDecodeError(s, *e.args) + else: + # If we get to here, the caller has passed in an Exception + # subclass populated with non-ASCII bytestring data without a + # working unicode method. Try to handle this without raising a + # further exception by individually forcing the exception args + # to unicode. + s = ' '.join([force_text(arg, encoding, strings_only, + errors) for arg in s]) + return s + + +def smart_bytes(s, encoding='utf-8', strings_only=False, errors='strict'): + """ + Returns a bytestring version of 's', encoded as specified in 'encoding'. + + If strings_only is True, don't convert (some) non-string-like objects. + """ + if isinstance(s, Promise): + # The input is the result of a gettext_lazy() call. 
+ return s + return force_bytes(s, encoding, strings_only, errors) + + +def force_bytes(s, encoding='utf-8', strings_only=False, errors='strict'): + """ + Similar to smart_bytes, except that lazy instances are resolved to + strings, rather than kept as lazy objects. + + If strings_only is True, don't convert (some) non-string-like objects. + """ + # Handle the common case first for performance reasons. + if isinstance(s, bytes): + if encoding == 'utf-8': + return s + else: + return s.decode('utf-8', errors).encode(encoding, errors) + if strings_only and is_protected_type(s): + return s + if isinstance(s, six.memoryview): + return bytes(s) + if isinstance(s, Promise): + return six.text_type(s).encode(encoding, errors) + if not isinstance(s, six.string_types): + try: + if six.PY3: + return six.text_type(s).encode(encoding) + else: + return bytes(s) + except UnicodeEncodeError: + if isinstance(s, Exception): + # An Exception subclass containing non-ASCII data that doesn't + # know how to print itself properly. We shouldn't raise a + # further exception. + return b' '.join([force_bytes(arg, encoding, strings_only, + errors) for arg in s]) + return six.text_type(s).encode(encoding, errors) + else: + return s.encode(encoding, errors) + +if six.PY3: + smart_str = smart_text + force_str = force_text +else: + smart_str = smart_bytes + force_str = force_bytes + # backwards compatibility for Python 2 + smart_unicode = smart_text + force_unicode = force_text + +smart_str.__doc__ = """ +Apply smart_text in Python 3 and smart_bytes in Python 2. + +This is suitable for writing to sys.stdout (for instance). +""" + +force_str.__doc__ = """ +Apply force_text in Python 3 and force_bytes in Python 2. +""" + + +def iri_to_uri(iri): + """ + Convert an Internationalized Resource Identifier (IRI) portion to a URI + portion that is suitable for inclusion in a URL. + + This is the algorithm from section 3.1 of RFC 3987. However, since we are + assuming input is either UTF-8 or unicode already, we can simplify things a + little from the full method. + + Returns an ASCII string containing the encoded result. + """ + # The list of safe characters here is constructed from the "reserved" and + # "unreserved" characters specified in sections 2.2 and 2.3 of RFC 3986: + # reserved = gen-delims / sub-delims + # gen-delims = ":" / "/" / "?" / "#" / "[" / "]" / "@" + # sub-delims = "!" / "$" / "&" / "'" / "(" / ")" + # / "*" / "+" / "," / ";" / "=" + # unreserved = ALPHA / DIGIT / "-" / "." / "_" / "~" + # Of the unreserved characters, urllib.quote already considers all but + # the ~ safe. + # The % character is also added to the list of safe characters here, as the + # end of section 3.1 of RFC 3987 specifically mentions that % must not be + # converted. + if iri is None: + return iri + return quote(force_bytes(iri), safe=b"/#%[]=:;$&()+,!?*@'~") + + +def filepath_to_uri(path): + """Convert a file system path to a URI portion that is suitable for + inclusion in a URL. + + We are assuming input is either UTF-8 or unicode already. + + This method will encode certain chars that would normally be recognized as + special chars for URIs. Note that this method does not encode the ' + character, as it is a valid character within URIs. See + encodeURIComponent() JavaScript function for more details. + + Returns an ASCII string containing the encoded result. + """ + if path is None: + return path + # I know about `os.sep` and `os.altsep` but I want to leave + # some flexibility for hardcoding separators. 
+ return quote(force_bytes(path).replace(b"\\", b"/"), safe=b"/~!*()'") + + +def get_system_encoding(): + """ + The encoding of the default system locale but falls back to the given + fallback encoding if the encoding is unsupported by python or could + not be determined. See tickets #10335 and #5846 + """ + try: + encoding = locale.getdefaultlocale()[1] or 'ascii' + codecs.lookup(encoding) + except Exception: + encoding = 'ascii' + return encoding + +DEFAULT_LOCALE_ENCODING = get_system_encoding() diff --git a/twitter_text/extractor.py b/twitter_text/extractor.py index 1015b8c..1f51fce 100644 --- a/twitter_text/extractor.py +++ b/twitter_text/extractor.py @@ -1,16 +1,17 @@ # encoding=utf-8 from twitter_text.regex import REGEXEN -from twitter_text.unicode import force_unicode +from twitter_text.encoding import force_text + class Extractor(object): """ A module for including Tweet parsing in a class. This module provides function for the extraction and processing of usernames, lists, URLs and hashtags. """ - + def __init__(self, text): - self.text = force_unicode(text) + self.text = force_text(text) def _remove_overlapping_entities(self, entities): """ @@ -19,18 +20,18 @@ def _remove_overlapping_entities(self, entities): """ # sort by start index - entities.sort(key = lambda entity: entity['indices'][0]) + entities.sort(key=lambda entity: entity['indices'][0]) # remove duplicates - prev = None + prev = None for entity in [e for e in entities]: if prev and prev['indices'][1] > entity['indices'][0]: entities.remove(entity) else: - prev = entity + prev = entity return entities - def extract_entities_with_indices(self, options = {}, transform = lambda x: x): + def extract_entities_with_indices(self, options={}, transform=lambda x: x): """ Extracts all usernames, lists, hashtags and URLs in the Tweet text along with the indices for where the entity ocurred @@ -43,19 +44,21 @@ def extract_entities_with_indices(self, options = {}, transform = lambda x: x): return [] # extract all entities - entities = self.extract_urls_with_indices(options) + \ - self.extract_hashtags_with_indices({'check_url_overlap': False}) + \ - self.extract_mentions_or_lists_with_indices() + \ - self.extract_cashtags_with_indices() + entities = ( + self.extract_urls_with_indices(options) + + self.extract_hashtags_with_indices({'check_url_overlap': False}) + + self.extract_mentions_or_lists_with_indices() + + self.extract_cashtags_with_indices() + ) - entities = self._remove_overlapping_entities(entities) + entities = self._remove_overlapping_entities(entities) for entity in entities: - entity = transform(entity) + entity = transform(entity) return entities - def extract_mentioned_screen_names(self, transform = lambda x: x): + def extract_mentioned_screen_names(self, transform=lambda x: x): """ Extracts a list of all usernames mentioned in the Tweet text. If the text is None or contains no username mentions an empty list @@ -65,7 +68,7 @@ def extract_mentioned_screen_names(self, transform = lambda x: x): """ return [transform(mention['screen_name']) for mention in self.extract_mentioned_screen_names_with_indices()] - def extract_mentioned_screen_names_with_indices(self, transform = lambda x: x): + def extract_mentioned_screen_names_with_indices(self, transform=lambda x: x): """ Extracts a list of all usernames mentioned in the Tweet text along with the indices for where the mention ocurred. 
If the
@@ -87,7 +90,7 @@ def extract_mentioned_screen_names_with_indices(self, transform=lambda x: x):
             })
         return possible_screen_names
 
-    def extract_mentions_or_lists_with_indices(self, transform = lambda x: x):
+    def extract_mentions_or_lists_with_indices(self, transform=lambda x: x):
         """
         Extracts a list of all usernames or lists mentioned in the Tweet text
         along with the indices for where the mention occurred. If the
@@ -101,7 +104,7 @@ def extract_mentions_or_lists_with_indices(self, transform=lambda x: x):
         if not REGEXEN['at_signs'].search(self.text):
             return []
 
-        possible_entries = []
+        possible_entries = []
         for match in REGEXEN['valid_mention_or_list'].finditer(self.text):
             try:
                 after = self.text[match.end()]
@@ -117,8 +120,8 @@ def extract_mentions_or_lists_with_indices(self, transform=lambda x: x):
             })
         return possible_entries
-
-    def extract_reply_screen_name(self, transform = lambda x: x):
+
+    def extract_reply_screen_name(self, transform=lambda x: x):
         """
         Extracts the username replied to in the Tweet text. If the
         text is None or is not a reply None will be returned.
@@ -135,8 +138,8 @@ def extract_reply_screen_name(self, transform=lambda x: x):
         else:
             possible_screen_name = transform(possible_screen_name.group(1))
         return possible_screen_name
-
-    def extract_urls(self, transform = lambda x: x):
+
+    def extract_urls(self, transform=lambda x: x):
         """
         Extracts a list of all URLs included in the Tweet text. If the
         text is None or contains no URLs an empty list
@@ -145,8 +148,8 @@ def extract_urls(self, transform=lambda x: x):
         If a transform is given then it will be called for each URL.
         """
         return [transform(url['url']) for url in self.extract_urls_with_indices()]
-
-    def extract_urls_with_indices(self, options = {'extract_url_without_protocol': True}):
+
+    def extract_urls_with_indices(self, options={'extract_url_without_protocol': True}):
         """
         Extracts a list of all URLs included in the Tweet text along
         with the indices. If the text is None or contains no
@@ -170,10 +173,12 @@ def extract_urls_with_indices(self, options={'extract_url_without_protocol': T
                 ascii_domain = ascii_domain.group()
                 last_url = {
                     'url': ascii_domain,
-                    'indices': [start_position - len(before or '') + complete.find(ascii_domain), start_position - len(before or '') + complete.find(ascii_domain) + len(ascii_domain)]
+                    'indices': [start_position - len(before or '') + complete.find(ascii_domain),
+                                start_position - len(before or '') + complete.find(ascii_domain) + len(ascii_domain)]
                 }
-                last_url_invalid_match = REGEXEN['invalid_short_domain'].search(ascii_domain) is not None
-                if not last_url_invalid_match:
+                if (path or
+                        REGEXEN['valid_special_short_domain'].search(ascii_domain) or
+                        not REGEXEN['invalid_short_domain'].search(ascii_domain)):
                     urls.append(last_url)
             # no ASCII-only domain found. Skip the entire URL
             if not last_url:
@@ -192,8 +197,8 @@ def extract_urls_with_indices(self, options={'extract_url_without_protocol': T
                 'indices': [start_position, end_position]
             })
         return urls
-
-    def extract_hashtags(self, transform = lambda x: x):
+
+    def extract_hashtags(self, transform=lambda x: x):
         """
         Extracts a list of all hashtags included in the Tweet text. If the
         text is None or contains no hashtags an empty list
@@ -203,8 +208,8 @@ def extract_hashtags(self, transform = lambda x: x):
         If a block is given then it will be called for each hashtag.
""" return [transform(hashtag['hashtag']) for hashtag in self.extract_hashtags_with_indices()] - - def extract_hashtags_with_indices(self, options = {'check_url_overlap': True}, transform = lambda x: x): + + def extract_hashtags_with_indices(self, options={'check_url_overlap': True}, transform=lambda x: x): """ Extracts a list of all hashtags included in the Tweet text. If the text is None or contains no hashtags an empty list @@ -234,7 +239,7 @@ def extract_hashtags_with_indices(self, options = {'check_url_overlap': True}, t return tags - def extract_cashtags(self, transform = lambda x: x): + def extract_cashtags(self, transform=lambda x: x): """ Extracts a list of all cashtags included in the Tweet text. If the text is None or contains no cashtags an empty list @@ -245,7 +250,7 @@ def extract_cashtags(self, transform = lambda x: x): """ return [cashtag['cashtag'] for cashtag in self.extract_cashtags_with_indices()] - def extract_cashtags_with_indices(self, transform = lambda x: x): + def extract_cashtags_with_indices(self, transform=lambda x: x): """ Extracts a list of all cashtags included in the Tweet text. If the text is None or contains no cashtags an empty list @@ -267,4 +272,4 @@ def extract_cashtags_with_indices(self, transform = lambda x: x): 'indices': [start_position, end_position] }) - return tags \ No newline at end of file + return tags diff --git a/twitter_text/highlighter.py b/twitter_text/highlighter.py index ec128ca..3311c29 100644 --- a/twitter_text/highlighter.py +++ b/twitter_text/highlighter.py @@ -3,37 +3,41 @@ import re from HTMLParser import HTMLParser -from twitter_text.regex import UNICODE_SPACES -from twitter_text.unicode import force_unicode +from twitter_text.encoding import force_text DEFAULT_HIGHLIGHT_TAG = 'em' + # from http://stackoverflow.com/questions/753052/strip-html-from-strings-in-python class MLStripper(HTMLParser): def __init__(self): self.reset() self.fed = [] + def handle_data(self, d): self.fed.append(d) + def get_data(self): return ''.join(self.fed) + def strip_tags(html): s = MLStripper() s.feed(html) return s.get_data() + class HitHighlighter(object): def __init__(self, text, **kwargs): - self.text = force_unicode(text) + self.text = force_text(text) self.parent = kwargs.get('parent', False) - def hit_highlight(self, hits = [], **kwargs): + def hit_highlight(self, hits=[], **kwargs): if not hits and not kwargs.get('query'): return self.text if not hits and kwargs.get('query'): - stripped_text = strip_tags(self.text) + stripped_text = strip_tags(self.text) for match in re.finditer(ur'%s' % kwargs.get('query'), stripped_text): hits.append(match.span()) @@ -49,7 +53,7 @@ def hit_highlight(self, hits = [], **kwargs): for index, chunk in enumerate(chunks): if not index % 2: text_chunks.append(chunk) - for hit in sorted(hits, key = lambda chunk: chunk[1], reverse = True): + for hit in sorted(hits, key=lambda chunk: chunk[1], reverse=True): hit_start, hit_end = hit placed = 0 for index, chunk in enumerate(chunks): @@ -80,4 +84,4 @@ def hit_highlight(self, hits = [], **kwargs): else: result.append(chunk) self.text = u''.join(result) - return self.text \ No newline at end of file + return self.text diff --git a/twitter_text/regex.py b/twitter_text/regex.py index c136f80..3129e51 100644 --- a/twitter_text/regex.py +++ b/twitter_text/regex.py @@ -3,45 +3,60 @@ # A collection of regular expressions for parsing Tweet text. The regular expression # list is frozen at load time to ensure immutability. 
These reular expressions are # used throughout the Twitter classes. Special care has been taken to make -# sure these reular expressions work with Tweets in all languages. -import re, string +# sure these regular expressions work with Tweets in all languages. +from __future__ import absolute_import +import os -REGEXEN = {} # :nodoc: +import regex as re +import yaml -def regex_range(start, end = None): +from twitter_text.encoding import force_text + +REGEXEN = {} # :nodoc: + + +def regex_range(start, end=None): if end: return u'%s-%s' % (unichr(start), unichr(end)) else: return u'%s' % unichr(start) +TLDS = yaml.safe_load(force_text( + open(os.path.join( + os.path.dirname(os.path.dirname(__file__)), + 'twitter-text', + 'conformance', + 'tld_lib.yml' + )).read() +)) + + # Space is more than %20, U+3000 for example is the full-width space used with Kanji. Provide a short-hand # to access both the list of characters and a pattern suitible for use with String#split # Taken from: ActiveSupport::Multibyte::Handlers::UTF8Handler::UNICODE_WHITESPACE -UNICODE_SPACES = [] -for space in reduce(lambda x,y: x + y if type(y) == list else x + [y], [ - range(0x0009, 0x000D), # White_Space # Cc [5] .. - 0x0020, # White_Space # Zs SPACE - 0x0085, # White_Space # Cc - 0x00A0, # White_Space # Zs NO-BREAK SPACE - 0x1680, # White_Space # Zs OGHAM SPACE MARK - 0x180E, # White_Space # Zs MONGOLIAN VOWEL SEPARATOR - range(0x2000, 0x200A), # White_Space # Zs [11] EN QUAD..HAIR SPACE - 0x2028, # White_Space # Zl LINE SEPARATOR - 0x2029, # White_Space # Zp PARAGRAPH SEPARATOR - 0x202F, # White_Space # Zs NARROW NO-BREAK SPACE - 0x205F, # White_Space # Zs MEDIUM MATHEMATICAL SPACE - 0x3000, # White_Space # Zs IDEOGRAPHIC SPACE - ]): - UNICODE_SPACES.append(unichr(space)) +UNICODE_SPACES = [unichr(space) for space in reduce(lambda x, y: x + y if type(y) == list else x + [y], [ + range(0x0009, 0x000D), # White_Space # Cc [5] .. + 0x0020, # White_Space # Zs SPACE + 0x0085, # White_Space # Cc + 0x00A0, # White_Space # Zs NO-BREAK SPACE + 0x1680, # White_Space # Zs OGHAM SPACE MARK + 0x180E, # White_Space # Zs MONGOLIAN VOWEL SEPARATOR + range(0x2000, 0x200A), # White_Space # Zs [11] EN QUAD..HAIR SPACE + 0x2028, # White_Space # Zl LINE SEPARATOR + 0x2029, # White_Space # Zp PARAGRAPH SEPARATOR + 0x202F, # White_Space # Zs NARROW NO-BREAK SPACE + 0x205F, # White_Space # Zs MEDIUM MATHEMATICAL SPACE + 0x3000, # White_Space # Zs IDEOGRAPHIC SPACE +])] REGEXEN['spaces'] = re.compile(ur''.join(UNICODE_SPACES)) # Characters not allowed in Tweets -INVALID_CHARACTERS = [ - 0xFFFE, 0xFEFF, # BOM - 0xFFFF, # Special - 0x202A, 0x202B, 0x202C, 0x202D, 0x202E, # Directional change +INVALID_CHARACTERS = [ + 0xFFFE, 0xFEFF, # BOM + 0xFFFF, # Special + 0x202A, 0x202B, 0x202C, 0x202D, 0x202E, # Directional change ] -REGEXEN['invalid_control_characters'] = [unichr(x) for x in INVALID_CHARACTERS] +REGEXEN['invalid_control_characters'] = [unichr(x) for x in INVALID_CHARACTERS] REGEXEN['list_name'] = re.compile(ur'^[a-zA-Z][a-zA-Z0-9_\-\u0080-\u00ff]{0,24}$') @@ -71,98 +86,33 @@ def regex_range(start, end = None): LATIN_ACCENTS = u''.join(LATIN_ACCENTS) RTL_CHARACTERS = ''.join([ - regex_range(0x0600,0x06FF), - regex_range(0x0750,0x077F), - regex_range(0x0590,0x05FF), - regex_range(0xFE70,0xFEFF) + regex_range(0x0600, 0x06FF), + regex_range(0x0750, 0x077F), + regex_range(0x0590, 0x05FF), + regex_range(0xFE70, 0xFEFF) ]) -NON_LATIN_HASHTAG_CHARS = ''.join([ - # Cyrillic (Russian, Ukrainian, etc.) 
- regex_range(0x0400, 0x04ff), # Cyrillic - regex_range(0x0500, 0x0527), # Cyrillic Supplement - regex_range(0x2de0, 0x2dff), # Cyrillic Extended A - regex_range(0xa640, 0xa69f), # Cyrillic Extended B - regex_range(0x0591, 0x05bf), # Hebrew - regex_range(0x05c1, 0x05c2), - regex_range(0x05c4, 0x05c5), - regex_range(0x05c7), - regex_range(0x05d0, 0x05ea), - regex_range(0x05f0, 0x05f4), - regex_range(0xfb12, 0xfb28), # Hebrew Presentation Forms - regex_range(0xfb2a, 0xfb36), - regex_range(0xfb38, 0xfb3c), - regex_range(0xfb3e), - regex_range(0xfb40, 0xfb41), - regex_range(0xfb43, 0xfb44), - regex_range(0xfb46, 0xfb4f), - regex_range(0x0610, 0x061a), # Arabic - regex_range(0x0620, 0x065f), - regex_range(0x066e, 0x06d3), - regex_range(0x06d5, 0x06dc), - regex_range(0x06de, 0x06e8), - regex_range(0x06ea, 0x06ef), - regex_range(0x06fa, 0x06fc), - regex_range(0x06ff), - regex_range(0x0750, 0x077f), # Arabic Supplement - regex_range(0x08a0), # Arabic Extended A - regex_range(0x08a2, 0x08ac), - regex_range(0x08e4, 0x08fe), - regex_range(0xfb50, 0xfbb1), # Arabic Pres. Forms A - regex_range(0xfbd3, 0xfd3d), - regex_range(0xfd50, 0xfd8f), - regex_range(0xfd92, 0xfdc7), - regex_range(0xfdf0, 0xfdfb), - regex_range(0xfe70, 0xfe74), # Arabic Pres. Forms B - regex_range(0xfe76, 0xfefc), - regex_range(0x200c, 0x200c), # Zero-Width Non-Joiner - regex_range(0x0e01, 0x0e3a), # Thai - regex_range(0x0e40, 0x0e4e), # Hangul (Korean) - regex_range(0x1100, 0x11ff), # Hangul Jamo - regex_range(0x3130, 0x3185), # Hangul Compatibility Jamo - regex_range(0xA960, 0xA97F), # Hangul Jamo Extended-A - regex_range(0xAC00, 0xD7AF), # Hangul Syllables - regex_range(0xD7B0, 0xD7FF), # Hangul Jamo Extended-B - regex_range(0xFFA1, 0xFFDC) # Half-width Hangul -]) - -CJ_HASHTAG_CHARACTERS = ''.join([ - regex_range(0x30A1, 0x30FA), regex_range(0x30FC, 0x30FE), # Katakana (full-width) - regex_range(0xFF66, 0xFF9F), # Katakana (half-width) - regex_range(0xFF10, 0xFF19), regex_range(0xFF21, 0xFF3A), regex_range(0xFF41, 0xFF5A), # Latin (full-width) - regex_range(0x3041, 0x3096), regex_range(0x3099, 0x309E), # Hiragana - regex_range(0x3400, 0x4DBF), # Kanji (CJK Extension A) - regex_range(0x4E00, 0x9FFF), # Kanji (Unified) -]) - -try: - CJ_HASHTAG_CHARACTERS = ''.join([ - CJ_HASHTAG_CHARACTERS, - regex_range(0x20000, 0x2A6DF), # Kanji (CJK Extension B) - regex_range(0x2A700, 0x2B73F), # Kanji (CJK Extension C) - regex_range(0x2B740, 0x2B81F), # Kanji (CJK Extension D) - regex_range(0x2F800, 0x2FA1F), regex_range(0x3003), regex_range(0x3005), regex_range(0x303B) # Kanji (CJK supplement) - ]) -except ValueError: - # this is a narrow python build so these extended Kanji characters won't work - pass - PUNCTUATION_CHARS = ur'!"#$%&\'()*+,-./:;<=>?@\[\]^_\`{|}~' SPACE_CHARS = ur" \t\n\x0B\f\r" CTRL_CHARS = ur"\x00-\x1F\x7F" # A hashtag must contain latin characters, numbers and underscores, but not all numbers. 
-HASHTAG_ALPHA = ur'[a-z_%s]' % (LATIN_ACCENTS + NON_LATIN_HASHTAG_CHARS + CJ_HASHTAG_CHARACTERS) -HASHTAG_ALPHANUMERIC = ur'[a-z0-9_%s]' % (LATIN_ACCENTS + NON_LATIN_HASHTAG_CHARS + CJ_HASHTAG_CHARACTERS) -HASHTAG_BOUNDARY = ur'\A|\z|\[|[^&a-z0-9_%s]' % (LATIN_ACCENTS + NON_LATIN_HASHTAG_CHARS + CJ_HASHTAG_CHARACTERS) +HASHTAG_ALPHA = ur'[\p{L}\p{M}]' +HASHTAG_ALPHANUMERIC = ur'[\p{L}\p{M}\p{Nd}_\u200c\u200d\u0482\ua673\ua67e\u05be\u05f3\u05f4\uff5e\u301c\u309b\u309c\u30a0\u30fb\u3003\u0f0b\u0f0c\u00b7]' +HASHTAG_BOUNDARY = ur'\A|\z|[^&\p{L}\p{M}\p{Nd}_\u200c\u200d\u0482\ua673\ua67e\u05be\u05f3\u05f4\u309b\u309c\u30a0\u30fb\u3003\u0f0b\u0f0c\u00b7]' -HASHTAG = re.compile(ur'(%s)(#|#)(%s*%s%s*)' % (HASHTAG_BOUNDARY, HASHTAG_ALPHANUMERIC, HASHTAG_ALPHA, HASHTAG_ALPHANUMERIC), re.IGNORECASE) +HASHTAG = re.compile(ur'(%s)(#|#)(?!\ufe0f|\u20e3)(%s*%s%s*)' % ( + HASHTAG_BOUNDARY, + HASHTAG_ALPHANUMERIC, + HASHTAG_ALPHA, + HASHTAG_ALPHANUMERIC, +), re.IGNORECASE) REGEXEN['valid_hashtag'] = HASHTAG REGEXEN['end_hashtag_match'] = re.compile(ur'\A(?:[##]|:\/\/)', re.IGNORECASE | re.UNICODE) REGEXEN['numeric_only'] = re.compile(ur'^[\d]+$') -REGEXEN['valid_mention_preceding_chars'] = re.compile(r'(?:[^a-zA-Z0-9_!#\$%&*@@]|^|RT:?)') +REGEXEN['valid_mention_preceding_chars'] = re.compile(r'(?:[^a-zA-Z0-9_!#\$%&*@@]|^|(?:^|[^a-zA-Z0-9_+~.-])[rR][tT]:?)') REGEXEN['at_signs'] = re.compile(ur'[@@]') REGEXEN['valid_mention_or_list'] = re.compile( ur'(%s)' % REGEXEN['valid_mention_preceding_chars'].pattern.decode('utf-8') + # preceding character @@ -171,7 +121,7 @@ def regex_range(start, end = None): ur'(\/[a-zA-Z][a-zA-Z0-9_\-]{0,24})?' # list (optional) ) REGEXEN['valid_reply'] = re.compile(ur'^(?:[%s])*%s([a-zA-Z0-9_]{1,20})' % (REGEXEN['spaces'].pattern, REGEXEN['at_signs'].pattern), re.IGNORECASE | re.UNICODE) - # Used in Extractor for final filtering +# Used in Extractor for final filtering REGEXEN['end_mention_match'] = re.compile(ur'\A(?:%s|[%s]|:\/\/)' % (REGEXEN['at_signs'].pattern, REGEXEN['latin_accents'].pattern), re.IGNORECASE | re.UNICODE) # URL related hash regex collection @@ -179,11 +129,24 @@ def regex_range(start, end = None): REGEXEN['invalid_url_without_protocol_preceding_chars'] = re.compile(ur'[-_.\/]$') DOMAIN_VALID_CHARS = ur'[^%s%s%s%s%s]' % (PUNCTUATION_CHARS, SPACE_CHARS, CTRL_CHARS, ur''.join(REGEXEN['invalid_control_characters']), ur''.join(UNICODE_SPACES)) REGEXEN['valid_subdomain'] = re.compile(ur'(?:(?:%s(?:[_-]|%s)*)?%s\.)' % (DOMAIN_VALID_CHARS, DOMAIN_VALID_CHARS, DOMAIN_VALID_CHARS), re.IGNORECASE | re.UNICODE) -REGEXEN['valid_domain_name'] = re.compile(ur'(?:(?:%s(?:[-]|%s)*)?%s\.)' % (DOMAIN_VALID_CHARS, DOMAIN_VALID_CHARS, DOMAIN_VALID_CHARS), re.IGNORECASE | re.UNICODE) -REGEXEN['valid_gTLD'] = 
re.compile(ur'(?:(?:academy|actor|aero|agency|arpa|asia|bar|bargains|berlin|best|bid|bike|biz|blue|boutique|build|builders|buzz|cab|camera|camp|cards|careers|cat|catering|center|ceo|cheap|christmas|cleaning|clothing|club|codes|coffee|com|community|company|computer|construction|contractors|cool|coop|cruises|dance|dating|democrat|diamonds|directory|domains|edu|education|email|enterprises|equipment|estate|events|expert|exposed|farm|fish|flights|florist|foundation|futbol|gallery|gift|glass|gov|graphics|guitars|guru|holdings|holiday|house|immobilien|industries|info|institute|int|international|jobs|kaufen|kim|kitchen|kiwi|koeln|kred|land|lighting|limo|link|luxury|management|mango|marketing|menu|mil|mobi|moda|monash|museum|nagoya|name|net|neustar|ninja|okinawa|onl|org|partners|parts|photo|photography|photos|pics|pink|plumbing|post|pro|productions|properties|pub|qpon|recipes|red|rentals|repair|report|reviews|rich|ruhr|sexy|shiksha|shoes|singles|social|solar|solutions|supplies|supply|support|systems|tattoo|technology|tel|tienda|tips|today|tokyo|tools|training|travel|uno|vacations|ventures|viajes|villas|vision|vote|voting|voto|voyage|wang|watch|wed|wien|wiki|works|xxx|xyz|zone|дети|онлайн|орг|сайт|بازار|شبكة|みんな|中信|中文网|公司|公>益|在线|我爱你|政务|游戏|移动|网络|集团|삼성)(?=[^0-9a-z]|$))', re.IGNORECASE | re.UNICODE) -REGEXEN['valid_ccTLD'] = re.compile(ur'(?:(?:ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bl|bm|bn|bo|bq|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cu|cv|cw|cx|cy|cz|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mf|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|um|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|za|zm|zw|мон|рф|срб|укр|қаз|الاردن|الجزائر|السعودية|المغرب|امارات|ایران|بھارت|تونس|سودان|سورية|عمان|فلسطين|قطر|مصر|مليسيا|پاکستان|भारत|বাংলা|ভারত|ਭਾਰਤ|ભારત|இந்தியா|இலங்கை|சிங்கப்பூர்|భారత్|ලංකා|ไทย|გე|中国|中加坡|湾|台灣|新香港|한국)(?=[^0-9a-z]|$))', re.IGNORECASE | re.UNICODE) +REGEXEN['valid_domain_name'] = re.compile( + ur'(?:(?:%s(?:[-]|%s)*)?%s\.)' % ( + DOMAIN_VALID_CHARS, + DOMAIN_VALID_CHARS, + DOMAIN_VALID_CHARS + ), re.IGNORECASE | re.UNICODE) +REGEXEN['valid_gTLD'] = re.compile( + ur'(?:(?:%s)(?=[^0-9a-z@]|$))' % ( + '|'.join(TLDS['generic']), + ), re.IGNORECASE | re.UNICODE) +REGEXEN['valid_ccTLD'] = re.compile( + ur'(?:(?:%s)(?=[^0-9a-z@]|$))' % ( + '|'.join(TLDS['country']), + ), re.IGNORECASE | re.UNICODE) REGEXEN['valid_punycode'] = re.compile(ur'(?:xn--[0-9a-z]+)', re.IGNORECASE | re.UNICODE) +REGEXEN['valid_special_cctld'] = re.compile(ur'(?:(?:co|tv)(?=[^0-9a-z@]|$))') + REGEXEN['valid_domain'] = re.compile(ur'(?:%s*%s(?:%s|%s|%s))' % (REGEXEN['valid_subdomain'].pattern, REGEXEN['valid_domain_name'].pattern, REGEXEN['valid_gTLD'].pattern, REGEXEN['valid_ccTLD'].pattern, REGEXEN['valid_punycode'].pattern), re.IGNORECASE | re.UNICODE) # This is used in Extractor @@ -194,18 +157,40 @@ def regex_range(start, end = None): # This is used in Extractor to filter out unwanted URLs. 
 REGEXEN['invalid_short_domain'] = re.compile(ur'\A%s%s\Z' % (REGEXEN['valid_domain_name'].pattern, REGEXEN['valid_ccTLD'].pattern), re.IGNORECASE | re.UNICODE)
+REGEXEN['valid_special_short_domain'] = re.compile(ur'\A%s%s\Z' % (REGEXEN['valid_domain_name'].pattern, REGEXEN['valid_special_cctld'].pattern))
 
 REGEXEN['valid_port_number'] = re.compile(ur'[0-9]+')
 
-REGEXEN['valid_general_url_path_chars'] = re.compile(ur"[a-z0-9!\*';:=\+\,\.\$\/%%#\[\]\-_~&|@%s]" % LATIN_ACCENTS, re.IGNORECASE | re.UNICODE)
+REGEXEN['valid_general_url_path_chars'] = re.compile(
+    ur"[a-z\p{Cyrillic}0-9!\*';:=\+\,\.\$\/%%#\[\]\-_~&|@%s]" % (
+        LATIN_ACCENTS,
+    ), re.IGNORECASE | re.UNICODE)
 # Allow URL paths to contain balanced parens
 # 1. Used in Wikipedia URLs like /Primer_(film)
 # 2. Used in IIS sessions like /S(dfd346)/
-REGEXEN['valid_url_balanced_parens'] = re.compile(ur'\(%s+\)' % REGEXEN['valid_general_url_path_chars'].pattern, re.IGNORECASE | re.UNICODE)
+# Allow one nested level of balanced parentheses
+REGEXEN['valid_url_balanced_parens'] = re.compile(
+    ur'\((?:%s+|(?:%s*\(%s+\)%s*))\)' % (
+        REGEXEN['valid_general_url_path_chars'].pattern,
+        REGEXEN['valid_general_url_path_chars'].pattern,
+        REGEXEN['valid_general_url_path_chars'].pattern,
+        REGEXEN['valid_general_url_path_chars'].pattern,
+    ), re.IGNORECASE | re.UNICODE)
 # Valid end-of-path characters (so /foo. does not gobble the period).
 # 1. Allow =&# for empty URL parameters and other URL-join artifacts
-REGEXEN['valid_url_path_ending_chars'] = re.compile(ur'[a-z0-9=_#\/\+\-%s]|(?:%s)' % (LATIN_ACCENTS, REGEXEN['valid_url_balanced_parens'].pattern), re.IGNORECASE | re.UNICODE)
-REGEXEN['valid_url_path'] = re.compile(ur'(?:(?:%s*(?:%s %s*)*%s)|(?:%s+\/))' % (REGEXEN['valid_general_url_path_chars'].pattern, REGEXEN['valid_url_balanced_parens'].pattern, REGEXEN['valid_general_url_path_chars'].pattern, REGEXEN['valid_url_path_ending_chars'].pattern, REGEXEN['valid_general_url_path_chars'].pattern), re.IGNORECASE | re.UNICODE)
+REGEXEN['valid_url_path_ending_chars'] = re.compile(
+    ur'[a-z\p{Cyrillic}0-9=_#\/\+\-%s]|(?:%s)' % (
+        LATIN_ACCENTS,
+        REGEXEN['valid_url_balanced_parens'].pattern
+    ), re.IGNORECASE | re.UNICODE)
+REGEXEN['valid_url_path'] = re.compile(
+    ur'(?:(?:%s*(?:%s %s*)*%s)|(?:%s+\/))' % (
+        REGEXEN['valid_general_url_path_chars'].pattern,
+        REGEXEN['valid_url_balanced_parens'].pattern,
+        REGEXEN['valid_general_url_path_chars'].pattern,
+        REGEXEN['valid_url_path_ending_chars'].pattern,
+        REGEXEN['valid_general_url_path_chars'].pattern
+    ), re.IGNORECASE | re.UNICODE)
 
 REGEXEN['valid_url_query_chars'] = re.compile(ur"[a-z0-9!?\*'\(\);:&=\+\$\/%#\[\]\-_\.,~|@]", re.IGNORECASE | re.UNICODE)
 REGEXEN['valid_url_query_ending_chars'] = re.compile(ur'[a-z0-9_&=#\/]', re.IGNORECASE | re.UNICODE)
@@ -231,7 +216,7 @@ def regex_range(start, end = None):
 REGEXEN['valid_cashtag'] = re.compile(ur'(^|[%s])(\$|$|﹩)(%s)(?=$|\s|[%s])' % (REGEXEN['spaces'].pattern, REGEXEN['cashtag'].pattern, PUNCTUATION_CHARS), re.IGNORECASE)
 
 # These URL validation pattern strings are based on the ABNF from RFC 3986
-REGEXEN['validate_url_unreserved'] = re.compile(ur'[a-z0-9\-._~]', re.IGNORECASE | re.UNICODE)
+REGEXEN['validate_url_unreserved'] = re.compile(ur'[a-z\p{Cyrillic}0-9\-._~]', re.IGNORECASE | re.UNICODE)
 REGEXEN['validate_url_pct_encoded'] = re.compile(ur'(?:%[0-9a-f]{2})', re.IGNORECASE | re.UNICODE)
 REGEXEN['validate_url_sub_delims'] = re.compile(ur"[!$&'()*+,;=]", re.IGNORECASE | re.UNICODE)
 REGEXEN['validate_url_pchar'] =
diff --git a/twitter_text/templatetags/twitterize.py b/twitter_text/templatetags/twitterize.py
index 01db63d..b58779a 100644
--- a/twitter_text/templatetags/twitterize.py
+++ b/twitter_text/templatetags/twitterize.py
@@ -8,15 +8,16 @@
 register = Library()
 
-@register.filter(name = 'twitter_text')
+
+@register.filter(name='twitter_text')
 @stringfilter
-def twitter_text(text, search_query = False):
+def twitter_text(text, search_query=False):
     """
     Parses a text string through the TwitterText auto_link method and, if search_query is passed, through the hit_highlight method.
     """
     tt = TwitterText(text)
     if search_query:
-        tt.text = tt.highlighter.hit_highlight(query = search_query)
-    tt.text = tt.autolink.auto_link()
+        tt.text = tt.highlighter.hit_highlight(query=search_query)
+    tt.text = tt.autolink.auto_link()
     return tt.text
-twitter_text.is_safe = True
\ No newline at end of file
+twitter_text.is_safe = True
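Review note: since the filter's signature changes here, a quick usage sketch for context. Calling it directly assumes Django is importable (the module instantiates Library() at import time); in a template the equivalent is {% load twitterize %} followed by {{ tweet.body|twitter_text }} or {{ tweet.body|twitter_text:query }}. The sample strings are illustrative:

    # Exercise the filter function outside the template engine.
    from twitter_text.templatetags.twitterize import twitter_text

    linked = twitter_text(u'Loving #python, thanks @twitter!')
    # With a search query, matching terms are hit-highlighted first,
    # then the whole string is auto-linked.
    highlighted = twitter_text(u'Loving #python!', search_query=u'python')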
diff --git a/twitter_text/unicode.py b/twitter_text/unicode.py
index 4e17267..e67238c 100644
--- a/twitter_text/unicode.py
+++ b/twitter_text/unicode.py
@@ -1,6 +1,8 @@
-import types, datetime
+import datetime
+import types
 from decimal import Decimal
 
+
 # borrowed from django.utils.encoding
 class TwitterTextUnicodeDecodeError(UnicodeDecodeError):
     def __init__(self, obj, *args):
@@ -10,7 +12,8 @@ def __init__(self, obj, *args):
     def __str__(self):
         original = UnicodeDecodeError.__str__(self)
         return '%s. You passed in %r (%s)' % (original, self.obj,
-            type(self.obj))
+                                              type(self.obj))
+
 
 def is_protected_type(obj):
     """Determine if the object instance is of a protected type.
@@ -25,6 +28,7 @@ def is_protected_type(obj):
         float, Decimal)
     )
 
+
 def force_unicode(s, encoding='utf-8', strings_only=False, errors='strict'):
     """
     Similar to smart_unicode, except that lazy instances are resolved to
@@ -50,8 +54,8 @@ def force_unicode(s, encoding='utf-8', strings_only=False, errors='strict'):
                 # without raising a further exception. We do an
                 # approximation to what the Exception's standard str()
                 # output should be.
-                s = ' '.join([force_unicode(arg, encoding, strings_only,
-                    errors) for arg in s])
+                s = ' '.join([force_unicode(arg, encoding, strings_only, errors)
+                              for arg in s])
         elif not isinstance(s, unicode):
             # Note: We use .decode() here, instead of unicode(s, encoding,
             # errors), so that if s is a SafeString, it ends up being a
@@ -66,6 +70,6 @@ def force_unicode(s, encoding='utf-8', strings_only=False, errors='strict'):
             # working unicode method. Try to handle this without raising a
             # further exception by individually forcing the exception args
             # to unicode.
-            s = ' '.join([force_unicode(arg, encoding, strings_only,
-                errors) for arg in s])
+            s = ' '.join([force_unicode(arg, encoding, strings_only, errors)
+                          for arg in s])
     return s
diff --git a/twitter_text/validation.py b/twitter_text/validation.py
index 6dea5f9..3990ddf 100644
--- a/twitter_text/validation.py
+++ b/twitter_text/validation.py
@@ -2,24 +2,25 @@
 
 import re
 
-from twitter_text.unicode import force_unicode
+from twitter_text.encoding import force_text
 from twitter_text.extractor import Extractor
 from twitter_text.regex import REGEXEN
 
 MAX_LENGTH = 140
 
 DEFAULT_TCO_URL_LENGTHS = {
-  'short_url_length': 22,
-  'short_url_length_https': 23,
-  'characters_reserved_per_media': 22,
+    'short_url_length': 23,
+    'short_url_length_https': 23,
+    'characters_reserved_per_media': 22,
 }
 
+
 class Validation(object):
     def __init__(self, text, **kwargs):
-        self.text = force_unicode(text)
+        self.text = force_text(text)
         self.parent = kwargs.get('parent', False)
-
-    def tweet_length(self, options = {}):
+
+    def tweet_length(self, options={}):
         """
         Returns the length of the string as it would be displayed. This is equivalent to the length of the Unicode NFC
         (See: http://www.unicode.org/reports/tr15). This is needed in order to consistently calculate the length of a
@@ -34,14 +35,14 @@ def tweet_length(self, options = {}):
         The string could also contain U+00E9 already, in which case the canonicalization will not change the value.
         """
-        assert (not self.parent or not getattr(self.parent, 'has_been_linked', False) ), 'The validator should only be run on text before it has been modified.'
+        assert (not self.parent or not getattr(self.parent, 'has_been_linked', False)), 'The validator should only be run on text before it has been modified.'
 
         for key in DEFAULT_TCO_URL_LENGTHS:
-            if not key in options:
+            if key not in options:
                 options[key] = DEFAULT_TCO_URL_LENGTHS[key]
 
         length = len(self.text)
-        # thanks force_unicode for making this so much simpler than the ruby version
+        # thanks force_text for making this so much simpler than the ruby version
 
         for url in Extractor(self.text).extract_urls_with_indices():
             # remove the link of the original URL
@@ -52,21 +53,22 @@ def tweet_length(self, options = {}):
 
         if self.parent and hasattr(self.parent, 'tweet_length'):
             self.parent.tweet_length = length
         return length
-    
+
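Review note: the short_url_length bump to 23 tracks t.co's current reservation for http links, which means every URL now weighs the same regardless of its literal length. One nit on the signature: options={} is a mutable default, and the loop above writes the defaults back into it; that is benign today because only the same default values are ever inserted, but copying the dict would be safer. A sketch of the weighting, assuming the elided loop body swaps each URL's literal length for the reserved one:

    # A long http URL still counts as short_url_length (23) characters.
    from twitter_text.validation import Validation

    text = u'Read this: http://example.com/a/very/long/path/indeed'
    # 'Read this: ' is 11 characters; the URL is replaced by its t.co weight.
    assert Validation(text).tweet_length() == 11 + 23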
     def tweet_invalid(self):
         """
         Check the text for any reason that it may not be valid as a Tweet. This is meant as a pre-validation
         before posting to api.twitter.com. There are several server-side reasons for Tweets to fail but this pre-validation
         will allow quicker feedback.
-        
+
         Returns false if this text is valid. Otherwise one of the following Symbols will be returned:
 
             "Too long":: if the text is too long
             "Empty text":: if the text is empty
             "Invalid characters":: if the text contains non-Unicode or any of the disallowed Unicode characters
         """
-        valid = True # optimism
+        # optimism
+        valid = True
         validation_error = None
 
         if not self.tweet_length():
@@ -77,7 +79,7 @@ def tweet_invalid(self):
 
         if re.search(ur''.join(REGEXEN['invalid_control_characters']), self.text):
             valid, validation_error = False, 'Invalid characters'
-        
+
         if self.parent and hasattr(self.parent, 'tweet_is_valid'):
             self.parent.tweet_is_valid = valid
         if self.parent and hasattr(self.parent, 'tweet_validation_error'):
@@ -108,7 +110,7 @@ def valid_hashtag(self):
 
         return len(extracted) == 1 and extracted[0] == self.text[1:]
 
-    def valid_url(self, unicode_domains = True, require_protocol = True):
+    def valid_url(self, unicode_domains=True, require_protocol=True):
         if not self.text:
             return False
 
@@ -121,38 +123,34 @@ def valid_url(self, unicode_domains = True, require_protocol = True):
 
         if not (
             (
-                not require_protocol
-                or (
-                    self._valid_match(scheme, REGEXEN['validate_url_scheme'])
-                    and re.compile(ur'^https?$', re.IGNORECASE).match(scheme)
+                not require_protocol or (
+                    self._valid_match(scheme, REGEXEN['validate_url_scheme']) and
+                    re.compile(ur'^https?$', re.IGNORECASE).match(scheme)
                 )
-            )
-            and (
-                path == ''
-                or self._valid_match(path, REGEXEN['validate_url_path'])
-            )
-            and self._valid_match(query, REGEXEN['validate_url_query'], True)
-            and self._valid_match(fragment, REGEXEN['validate_url_fragment'], True)
+            ) and (
+                path == '' or
+                self._valid_match(path, REGEXEN['validate_url_path'])
+            ) and
+            self._valid_match(query, REGEXEN['validate_url_query'], True) and
+            self._valid_match(fragment, REGEXEN['validate_url_fragment'], True)
         ):
             return False
 
         return bool(
             (
-                unicode_domains
-                and self._valid_match(authority, REGEXEN['validate_url_unicode_authority'])
-                and REGEXEN['validate_url_unicode_authority'].match(authority).string == authority
-            )
-            or (
-                not unicode_domains
-                and self._valid_match(authority, REGEXEN['validate_url_authority'])
-                and REGEXEN['validate_url_authority'].match(authority).string == authority
+                unicode_domains and
+                self._valid_match(authority, REGEXEN['validate_url_unicode_authority']) and
+                REGEXEN['validate_url_unicode_authority'].match(authority).string == authority
+            ) or (
+                not unicode_domains and
+                self._valid_match(authority, REGEXEN['validate_url_authority']) and
+                REGEXEN['validate_url_authority'].match(authority).string == authority
             )
         )
 
-    def _valid_match(self, string, re_obj, optional = False):
-        if optional and string is None:
-            return True
-        match = re_obj.match(string)
+    def _valid_match(self, string, re_obj, optional=False):
+        if string:
+            match = re_obj.match(string)
         if optional:
             return not (string and (match is None or not match.string[match.span()[0]:match.span()[1]] == string))
         else:
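Review note: the diff truncates inside the new _valid_match, so the else branch is not visible here. Assuming it keeps the exact-match semantics and returns a falsy value when string is empty (which the new "if string:" guard relies on to avoid referencing an unbound match), the public surface can be sketched as follows; the sample URLs are illustrative:

    # Pre-validate URLs, with and without requiring a protocol.
    from twitter_text.validation import Validation

    assert Validation(u'http://example.com/path').valid_url()
    assert not Validation(u'example.com').valid_url()  # protocol required by default
    assert Validation(u'example.com').valid_url(require_protocol=False)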