livid · alexzhan · Apr 6, 2012
diff --git a/v2ex/templatetags/filters.py b/v2ex/templatetags/filters.py
@@ -8,25 +8,6 @@
 import urllib, hashlib
 register = template.Library()
 
-# Configuration for urlize() function
-LEADING_PUNCTUATION  = ['(', '<', '&lt;']
-TRAILING_PUNCTUATION = ['.', ',', ')', '>', '\n', '&gt;']
-
-# list of possible strings used for bullets in bulleted lists
-DOTS = ['&middot;', '*', '\xe2\x80\xa2', '&#149;', '&bull;', '&#8226;']
-
-unencoded_ampersands_re = re.compile(r'&(?!(\w+|#\d+);)')
-word_split_re = re.compile(r'(\s+)')
-punctuation_re = re.compile('^(?P<lead>(?:%s)*)(?P<middle>.*?)(?P<trail>(?:%s)*)$' % \
-    ('|'.join([re.escape(x) for x in LEADING_PUNCTUATION]),
-    '|'.join([re.escape(x) for x in TRAILING_PUNCTUATION])))
-simple_email_re = re.compile(r'^\S+@[a-zA-Z0-9._-]+\.[a-zA-Z0-9._-]+$')
-link_target_attribute_re = re.compile(r'(<a [^>]*?)target=[^\s>]+')
-html_gunk_re = re.compile(r'(?:<br clear="all">|<i><\/i>|<b><\/b>|<em><\/em>|<strong><\/strong>|<\/?smallcaps>|<\/?uppercase>)', re.IGNORECASE)
-hard_coded_bullets_re = re.compile(r'((?:<p>(?:%s).*?[a-zA-Z].*?</p>\s*)+)' % '|'.join([re.escape(x) for x in DOTS]), re.DOTALL)
-trailing_empty_content_re = re.compile(r'(?:<p>(?:&nbsp;|\s|<br \/>)*?</p>\s*)+\Z')
-del x # Temporary variable
-
 def timezone(value, offset):
     if offset > 12:
         offset = 12 - offset
@@ -37,36 +18,90 @@ def autolink2(text):
     return bleach.linkify(text)
 register.filter(autolink2)
 
-def autolink(text, trim_url_limit=None, nofollow=False):
-    """
-    Converts any URLs in text into clickable links. Works on http://, https:// and
-    www. links. Links can have trailing punctuation (periods, commas, close-parens)
-    and leading punctuation (opening parens) and it'll still do the right thing.
+_XHTML_ESCAPE_RE = re.compile('[&<>"]')
+_XHTML_ESCAPE_DICT = {'&': '&amp;', '<': '&lt;', '>': '&gt;', '"': '&quot;'}
+_URL_RE = re.compile(ur"""\b((?:([\w-]+):(/{1,3})|www[.])(?:(?:(?:[^\s&()]|&amp;|&quot;)*(?:[^!"#$%&'()*+,.:;<=>?@\[\]^`{|}~\s]))|(?:\((?:[^\s&()]|&amp;|&quot;)*\)))+)""")
 
-    If trim_url_limit is not None, the URLs in link text will be limited to
-    trim_url_limit characters.
+def xhtml_escape(value):
+    return _XHTML_ESCAPE_RE.sub(lambda match: _XHTML_ESCAPE_DICT[match.group(0)],
+            to_basestring(value))
 
-    If nofollow is True, the URLs in link text will get a rel="nofollow" attribute.
+def to_unicode(value):
+    if isinstance(value, (unicode, type(None))):
+        return value
+    assert isinstance(value, bytes)
+    return value.decode("utf-8")
+
+def to_basestring(value):
+    if isinstance(value, (basestring, type(None))):
+        return value
+    assert isinstance(value, bytes)
+    return value.decode("utf-8")
+
+def autolink(text, shorten=False, extra_params="",
+        require_protocol=False, permitted_protocols=["http", "https", "mailto"]):
+    """Converts plain text into HTML with links.
+    For example: ``linkify("Hello http://www.v2ex.com!")`` would return
+    ``Hello <a href="http://www.v2ex.com">http://www.v2ex.com</a>!``
+    Parameters:
+    shorten: Long urls will be shortened for display.
+    extra_params: Extra text to include in the link tag,
+        e.g. linkify(text, extra_params='rel="nofollow" class="external"')
+    require_protocol: Only linkify urls which include a protocol. If this is
+        False, urls such as www.facebook.com will also be linkified.
+    permitted_protocols: List (or set) of protocols which should be linkified,
+        e.g. linkify(text, permitted_protocols=["http", "ftp", "mailto"]).
+        It is very unsafe to include protocols such as "javascript".
     """
-    trim_url = lambda x, limit=trim_url_limit: limit is not None and (x[:limit] + (len(x) >=limit and '...' or ''))  or x
-    words = word_split_re.split(text)
-    nofollow_attr = nofollow and ' rel="nofollow"' or ''
-    for i, word in enumerate(words):
-        match = punctuation_re.match(word)
-        if match:
-            lead, middle, trail = match.groups()
-            if middle.startswith('www.') or ('@' not in middle and not middle.startswith('http://') and not middle.startswith('https://') and \
-                    len(middle) > 0 and middle[0] in string.letters + string.digits and \
-                    (middle.endswith('.org') or middle.endswith('.net') or middle.endswith('.com'))):
-                middle = '<a href="http://%s"%s target="_blank">%s</a>' % (middle, nofollow_attr, trim_url(middle))
-            if middle.startswith('http://') or middle.startswith('https://'):
-                middle = '<a href="%s"%s target="_blank">%s</a>' % (middle, nofollow_attr, trim_url(middle))
-            if '@' in middle and not middle.startswith('www.') and not ':' in middle \
-                and simple_email_re.match(middle):
-                middle = '<a href="mailto:%s">%s</a>' % (middle, middle)
-            if lead + middle + trail != word:
-                words[i] = lead + middle + trail
-    return ''.join(words)
+    if extra_params:
+        extra_params = " " + extra_params.strip()
+    def make_link(m):
+        url = m.group(1)
+        proto = m.group(2)
+        if require_protocol and not proto:
+            return url  # not protocol, no linkify
+        if proto and proto not in permitted_protocols:
+            return url  # bad protocol, no linkify
+        href = m.group(1)
+        if not proto:
+            href = "http://" + href   # no proto specified, use http
+        params = extra_params
+        # clip long urls. max_len is just an approximation
+        max_len = 30
+        if shorten and len(url) > max_len:
+            before_clip = url
+            if proto:
+                proto_len = len(proto) + 1 + len(m.group(3) or "")  # +1 for :
+            else:
+                proto_len = 0
+            parts = url[proto_len:].split("/")
+            if len(parts) > 1:
+                # Grab the whole host part plus the first bit of the path
+                # The path is usually not that interesting once shortened
+                # (no more slug, etc), so it really just provides a little
+                # extra indication of shortening.
+                url = url[:proto_len] + parts[0] + "/" + \
+                        parts[1][:8].split('?')[0].split('.')[0]
+            if len(url) > max_len * 1.5:  # still too long
+                url = url[:max_len]
+            if url != before_clip:
+                amp = url.rfind('&')
+                # avoid splitting html char entities
+                if amp > max_len - 5:
+                    url = url[:amp]
+                url += "..."
+                if len(url) >= len(before_clip):
+                    url = before_clip
+                else:
+                    # full url is visible on mouse-over (for those who don't
+                    # have a status bar, such as Safari by default)
+                    params += ' title="%s"' % href
+        return u'<a href="%s"%s target="_blank">%s</a>' % (href, params, url)
+    # First HTML-escape so that our strings are all safe.
+    # The regex is modified to avoid character entites other than &amp; so
+    # that we won't pick up &quot;, etc.
+    text = to_unicode(xhtml_escape(text))
+    return _URL_RE.sub(make_link, text)
 register.filter(autolink)
 
 # auto convert img.ly/abcd links to image tags