diff --git a/v2ex/templatetags/filters.py b/v2ex/templatetags/filters.py
index 1ac2077..d41d66e 100644
--- a/v2ex/templatetags/filters.py
+++ b/v2ex/templatetags/filters.py
@@ -8,25 +8,6 @@ import urllib, hashlib
 
 register = template.Library()
 
-# Configuration for urlize() function
-LEADING_PUNCTUATION = ['(', '<', '&lt;']
-TRAILING_PUNCTUATION = ['.', ',', ')', '>', '\n', '&gt;']
-
-# list of possible strings used for bullets in bulleted lists
-DOTS = ['&middot;', '*', '\xe2\x80\xa2', '&#149;', '&bull;', '&#8226;']
-
-unencoded_ampersands_re = re.compile(r'&(?!(\w+|#\d+);)')
-word_split_re = re.compile(r'(\s+)')
-punctuation_re = re.compile('^(?P<lead>(?:%s)*)(?P<middle>.*?)(?P<trail>(?:%s)*)$' % \
-    ('|'.join([re.escape(x) for x in LEADING_PUNCTUATION]),
-    '|'.join([re.escape(x) for x in TRAILING_PUNCTUATION])))
-simple_email_re = re.compile(r'^\S+@[a-zA-Z0-9._-]+\.[a-zA-Z0-9._-]+$')
-link_target_attribute_re = re.compile(r'(<a [^>]*?)target=[^\s>]+')
-html_gunk_re = re.compile(r'(?:<br clear="all">|<i><\/i>|<b><\/b>|<em><\/em>|<strong><\/strong>|<\/?smallcaps>|<\/?uppercase>)', re.IGNORECASE)
-hard_coded_bullets_re = re.compile(r'((?:<p>(?:%s).*?[a-zA-Z].*?</p>\s*)+)' % '|'.join([re.escape(x) for x in DOTS]), re.DOTALL)
-trailing_empty_content_re = re.compile(r'(?:<p>(?:&nbsp;|\s|<br \/>)*?</p>\s*)+\Z')
-del x # Temporary variable
-
 def timezone(value, offset):
     if offset > 12:
         offset = 12 - offset
@@ -37,36 +18,90 @@ def autolink2(text):
     return bleach.linkify(text)
 
 register.filter(autolink2)
 
-def autolink(text, trim_url_limit=None, nofollow=False):
-    """
-    Converts any URLs in text into clickable links. Works on http://, https:// and
-    www. links. Links can have trailing punctuation (periods, commas, close-parens)
-    and leading punctuation (opening parens) and it'll still do the right thing.
+_XHTML_ESCAPE_RE = re.compile('[&<>"]')
+_XHTML_ESCAPE_DICT = {'&': '&amp;', '<': '&lt;', '>': '&gt;', '"': '&quot;'}
+_URL_RE = re.compile(ur"""\b((?:([\w-]+):(/{1,3})|www[.])(?:(?:(?:[^\s&()]|&amp;|&quot;)*(?:[^!"#$%&'()*+,.:;<=>?@\[\]^`{|}~\s]))|(?:\((?:[^\s&()]|&amp;|&quot;)*\)))+)""")
 
-    If trim_url_limit is not None, the URLs in link text will be limited to
-    trim_url_limit characters.
+def xhtml_escape(value):
+    return _XHTML_ESCAPE_RE.sub(lambda match: _XHTML_ESCAPE_DICT[match.group(0)],
+                                to_basestring(value))
 
-    If nofollow is True, the URLs in link text will get a rel="nofollow" attribute.
+def to_unicode(value):
+    if isinstance(value, (unicode, type(None))):
+        return value
+    assert isinstance(value, bytes)
+    return value.decode("utf-8")
+
+def to_basestring(value):
+    if isinstance(value, (basestring, type(None))):
+        return value
+    assert isinstance(value, bytes)
+    return value.decode("utf-8")
+
+def autolink(text, shorten=False, extra_params="",
+             require_protocol=False, permitted_protocols=["http", "https", "mailto"]):
+    """Converts plain text into HTML with links.
+    For example: ``linkify("Hello http://www.v2ex.com!")`` would return
+    ``Hello <a href="http://www.v2ex.com">http://www.v2ex.com</a>!``
+    Parameters:
+    shorten: Long urls will be shortened for display.
+    extra_params: Extra text to include in the link tag,
+        e.g. linkify(text, extra_params='rel="nofollow" class="external"')
+    require_protocol: Only linkify urls which include a protocol. If this is
+        False, urls such as www.facebook.com will also be linkified.
+    permitted_protocols: List (or set) of protocols which should be linkified,
+        e.g. linkify(text, permitted_protocols=["http", "ftp", "mailto"]).
+        It is very unsafe to include protocols such as "javascript".
     """
-    trim_url = lambda x, limit=trim_url_limit: limit is not None and (x[:limit] + (len(x) >=limit and '...' or '')) or x
-    words = word_split_re.split(text)
-    nofollow_attr = nofollow and ' rel="nofollow"' or ''
-    for i, word in enumerate(words):
-        match = punctuation_re.match(word)
-        if match:
-            lead, middle, trail = match.groups()
-            if middle.startswith('www.') or ('@' not in middle and not middle.startswith('http://') and not middle.startswith('https://') and \
-                    len(middle) > 0 and middle[0] in string.letters + string.digits and \
-                    (middle.endswith('.org') or middle.endswith('.net') or middle.endswith('.com'))):
-                middle = '<a href="http://%s"%s>%s</a>' % (middle, nofollow_attr, trim_url(middle))
-            if middle.startswith('http://') or middle.startswith('https://'):
-                middle = '<a href="%s"%s>%s</a>' % (middle, nofollow_attr, trim_url(middle))
-            if '@' in middle and not middle.startswith('www.') and not ':' in middle \
-                and simple_email_re.match(middle):
-                middle = '<a href="mailto:%s">%s</a>' % (middle, middle)
-            if lead + middle + trail != word:
-                words[i] = lead + middle + trail
-    return ''.join(words)
+    if extra_params:
+        extra_params = " " + extra_params.strip()
+    def make_link(m):
+        url = m.group(1)
+        proto = m.group(2)
+        if require_protocol and not proto:
+            return url  # not protocol, no linkify
+        if proto and proto not in permitted_protocols:
+            return url  # bad protocol, no linkify
+        href = m.group(1)
+        if not proto:
+            href = "http://" + href  # no proto specified, use http
+        params = extra_params
+        # clip long urls. max_len is just an approximation
+        max_len = 30
+        if shorten and len(url) > max_len:
+            before_clip = url
+            if proto:
+                proto_len = len(proto) + 1 + len(m.group(3) or "")  # +1 for :
+            else:
+                proto_len = 0
+            parts = url[proto_len:].split("/")
+            if len(parts) > 1:
+                # Grab the whole host part plus the first bit of the path
+                # The path is usually not that interesting once shortened
+                # (no more slug, etc), so it really just provides a little
+                # extra indication of shortening.
+                url = url[:proto_len] + parts[0] + "/" + \
+                    parts[1][:8].split('?')[0].split('.')[0]
+            if len(url) > max_len * 1.5:  # still too long
+                url = url[:max_len]
+            if url != before_clip:
+                amp = url.rfind('&')
+                # avoid splitting html char entities
+                if amp > max_len - 5:
+                    url = url[:amp]
+                url += "..."
+                if len(url) >= len(before_clip):
+                    url = before_clip
+                else:
+                    # full url is visible on mouse-over (for those who don't
+                    # have a status bar, such as Safari by default)
+                    params += ' title="%s"' % href
+        return u'<a href="%s"%s>%s</a>' % (href, params, url)
+    # First HTML-escape so that our strings are all safe.
+    # The regex is modified to avoid character entities other than &amp; so
+    # that we won't pick up &quot;, etc.
+    text = to_unicode(xhtml_escape(text))
+    return _URL_RE.sub(make_link, text)
 
 register.filter(autolink)
 
 # auto convert img.ly/abcd links to image tags
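A quick sketch of how the new filter behaves, e.g. from a Django shell (Python 2, matching the ur"" literal above); the sample text, flags, and template variable here are illustrative assumptions, not taken from the patch:

    # Hypothetical usage sketch; input text and parameters are examples only.
    from v2ex.templatetags.filters import autolink

    text = u'see www.v2ex.com/t/1000 and http://www.example.com/some/rather/long/path?q=1'

    # require_protocol defaults to False, so the bare www. host is linkified too;
    # its href gets an "http://" prefix while the visible text stays as typed.
    print autolink(text)

    # shorten=True clips the display text to roughly max_len (30) characters and
    # keeps the full href in a title attribute; extra_params lands inside the <a> tag.
    print autolink(text, shorten=True, extra_params='rel="nofollow"')

In a template the filter is applied the same way as autolink2, e.g. {{ topic.content|autolink }} (the variable name is only an example).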