Skip to content
This repository has been archived by the owner on Dec 25, 2020. It is now read-only.

Modify autolink with regular expression #26

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
127 changes: 81 additions & 46 deletions v2ex/templatetags/filters.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,25 +8,6 @@
import urllib, hashlib
register = template.Library()

# Configuration for urlize() function
LEADING_PUNCTUATION = ['(', '<', '&lt;']
TRAILING_PUNCTUATION = ['.', ',', ')', '>', '\n', '&gt;']

# list of possible strings used for bullets in bulleted lists
DOTS = ['&middot;', '*', '\xe2\x80\xa2', '&#149;', '&bull;', '&#8226;']

unencoded_ampersands_re = re.compile(r'&(?!(\w+|#\d+);)')
word_split_re = re.compile(r'(\s+)')
punctuation_re = re.compile('^(?P<lead>(?:%s)*)(?P<middle>.*?)(?P<trail>(?:%s)*)$' % \
('|'.join([re.escape(x) for x in LEADING_PUNCTUATION]),
'|'.join([re.escape(x) for x in TRAILING_PUNCTUATION])))
simple_email_re = re.compile(r'^\S+@[a-zA-Z0-9._-]+\.[a-zA-Z0-9._-]+$')
link_target_attribute_re = re.compile(r'(<a [^>]*?)target=[^\s>]+')
html_gunk_re = re.compile(r'(?:<br clear="all">|<i><\/i>|<b><\/b>|<em><\/em>|<strong><\/strong>|<\/?smallcaps>|<\/?uppercase>)', re.IGNORECASE)
hard_coded_bullets_re = re.compile(r'((?:<p>(?:%s).*?[a-zA-Z].*?</p>\s*)+)' % '|'.join([re.escape(x) for x in DOTS]), re.DOTALL)
trailing_empty_content_re = re.compile(r'(?:<p>(?:&nbsp;|\s|<br \/>)*?</p>\s*)+\Z')
del x # Temporary variable

def timezone(value, offset):
if offset > 12:
offset = 12 - offset
Expand All @@ -37,36 +18,90 @@ def autolink2(text):
return bleach.linkify(text)
register.filter(autolink2)

def autolink(text, trim_url_limit=None, nofollow=False):
"""
Converts any URLs in text into clickable links. Works on http://, https:// and
www. links. Links can have trailing punctuation (periods, commas, close-parens)
and leading punctuation (opening parens) and it'll still do the right thing.
_XHTML_ESCAPE_RE = re.compile('[&<>"]')
_XHTML_ESCAPE_DICT = {'&': '&amp;', '<': '&lt;', '>': '&gt;', '"': '&quot;'}
_URL_RE = re.compile(ur"""\b((?:([\w-]+):(/{1,3})|www[.])(?:(?:(?:[^\s&()]|&amp;|&quot;)*(?:[^!"#$%&'()*+,.:;<=>?@\[\]^`{|}~\s]))|(?:\((?:[^\s&()]|&amp;|&quot;)*\)))+)""")

If trim_url_limit is not None, the URLs in link text will be limited to
trim_url_limit characters.
def xhtml_escape(value):
return _XHTML_ESCAPE_RE.sub(lambda match: _XHTML_ESCAPE_DICT[match.group(0)],
to_basestring(value))

If nofollow is True, the URLs in link text will get a rel="nofollow" attribute.
def to_unicode(value):
if isinstance(value, (unicode, type(None))):
return value
assert isinstance(value, bytes)
return value.decode("utf-8")

def to_basestring(value):
if isinstance(value, (basestring, type(None))):
return value
assert isinstance(value, bytes)
return value.decode("utf-8")

def autolink(text, shorten=False, extra_params="",
require_protocol=False, permitted_protocols=["http", "https", "mailto"]):
"""Converts plain text into HTML with links.
For example: ``linkify("Hello http://www.v2ex.com!")`` would return
``Hello <a href="http://www.v2ex.com">http://www.v2ex.com</a>!``
Parameters:
shorten: Long urls will be shortened for display.
extra_params: Extra text to include in the link tag,
e.g. linkify(text, extra_params='rel="nofollow" class="external"')
require_protocol: Only linkify urls which include a protocol. If this is
False, urls such as www.facebook.com will also be linkified.
permitted_protocols: List (or set) of protocols which should be linkified,
e.g. linkify(text, permitted_protocols=["http", "ftp", "mailto"]).
It is very unsafe to include protocols such as "javascript".
"""
trim_url = lambda x, limit=trim_url_limit: limit is not None and (x[:limit] + (len(x) >=limit and '...' or '')) or x
words = word_split_re.split(text)
nofollow_attr = nofollow and ' rel="nofollow"' or ''
for i, word in enumerate(words):
match = punctuation_re.match(word)
if match:
lead, middle, trail = match.groups()
if middle.startswith('www.') or ('@' not in middle and not middle.startswith('http://') and not middle.startswith('https://') and \
len(middle) > 0 and middle[0] in string.letters + string.digits and \
(middle.endswith('.org') or middle.endswith('.net') or middle.endswith('.com'))):
middle = '<a href="http://%s"%s target="_blank">%s</a>' % (middle, nofollow_attr, trim_url(middle))
if middle.startswith('http://') or middle.startswith('https://'):
middle = '<a href="%s"%s target="_blank">%s</a>' % (middle, nofollow_attr, trim_url(middle))
if '@' in middle and not middle.startswith('www.') and not ':' in middle \
and simple_email_re.match(middle):
middle = '<a href="mailto:%s">%s</a>' % (middle, middle)
if lead + middle + trail != word:
words[i] = lead + middle + trail
return ''.join(words)
if extra_params:
extra_params = " " + extra_params.strip()
def make_link(m):
url = m.group(1)
proto = m.group(2)
if require_protocol and not proto:
return url # not protocol, no linkify
if proto and proto not in permitted_protocols:
return url # bad protocol, no linkify
href = m.group(1)
if not proto:
href = "http://" + href # no proto specified, use http
params = extra_params
# clip long urls. max_len is just an approximation
max_len = 30
if shorten and len(url) > max_len:
before_clip = url
if proto:
proto_len = len(proto) + 1 + len(m.group(3) or "") # +1 for :
else:
proto_len = 0
parts = url[proto_len:].split("/")
if len(parts) > 1:
# Grab the whole host part plus the first bit of the path
# The path is usually not that interesting once shortened
# (no more slug, etc), so it really just provides a little
# extra indication of shortening.
url = url[:proto_len] + parts[0] + "/" + \
parts[1][:8].split('?')[0].split('.')[0]
if len(url) > max_len * 1.5: # still too long
url = url[:max_len]
if url != before_clip:
amp = url.rfind('&')
# avoid splitting html char entities
if amp > max_len - 5:
url = url[:amp]
url += "..."
if len(url) >= len(before_clip):
url = before_clip
else:
# full url is visible on mouse-over (for those who don't
# have a status bar, such as Safari by default)
params += ' title="%s"' % href
return u'<a href="%s"%s target="_blank">%s</a>' % (href, params, url)
# First HTML-escape so that our strings are all safe.
# The regex is modified to avoid character entites other than &amp; so
# that we won't pick up &quot;, etc.
text = to_unicode(xhtml_escape(text))
return _URL_RE.sub(make_link, text)
register.filter(autolink)

# auto convert img.ly/abcd links to image tags
Expand Down