Added some proper logic to the HTML being compressed. This fixes #1

SoulAuctioneer · Sep 19, 2011 · 3beb05a · 3beb05a
1 parent 5400d4c
commit 3beb05a
Showing 1 changed file with 89 additions and 29 deletions.
diff --git a/jinja2htmlcompress.py b/jinja2htmlcompress.py
@@ -16,62 +16,114 @@
 
 
 _tag_re = re.compile(r'(?:<(/?)([a-zA-Z0-9_-]+)\s*|(>\s*))(?s)')
+_ws_normalize_re = re.compile(r'[ \t\r\n]+')
 
 
-class HTMLCompress(Extension):
-    isolated_tags = frozenset(['script', 'style', 'pre', 'textarea'])
+class StreamProcessContext(object):
+    """State shared by the stream filters while they walk a token stream.
+
+    `stream` is the token stream being filtered, `token` the data token
+    currently being normalized (``None`` until a filter assigns one) and
+    `stack` the list of currently open HTML tag names, innermost last.
+    """
+
+    def __init__(self, stream):
+        self.stream = stream
+        self.token = None
+        self.stack = []
+
+    def fail(self, message):
+        """Raise a `TemplateSyntaxError` carrying `message`.
+
+        The line number is taken from the current data token.  When no
+        data token has been assigned yet, the stream's current token is
+        used instead, so the caller still gets a `TemplateSyntaxError`
+        rather than an `AttributeError` on ``None``.
+        """
+        token = self.token if self.token is not None else self.stream.current
+        raise TemplateSyntaxError(message, token.lineno,
+                                  self.stream.name, self.stream.filename)
+
+
+def _make_dict_from_listing(listing):
+    """Flatten ``(keys, value)`` pairs into one dict.
+
+    Every key in `keys` is mapped to the shared `value`; when a key shows
+    up in more than one pair, the later pair wins.
+    """
+    mapping = {}
+    for names, shared_value in listing:
+        mapping.update(dict.fromkeys(names, shared_value))
+    return mapping
 
-    def isolated(self, stack):
+
+class HTMLCompress(Extension):
+    # Elements whose content is left untouched: while one of them is open,
+    # data is written out without stripping or whitespace normalization.
+    isolated_elements = set(['script', 'style', 'noscript', 'textarea'])
+    # Elements that are never pushed onto the open-tag stack when entered.
+    void_elements = set(['br', 'img', 'area', 'hr', 'param', 'input',
+                         'embed', 'col'])
+    # Elements matched by the '#block' placeholder used in breaking_rules.
+    # ('tr' is listed twice; harmless because this is a set.)
+    block_elements = set(['div', 'p', 'form', 'ul', 'ol', 'li', 'table', 'tr',
+                          'tbody', 'thead', 'tfoot', 'tr', 'td', 'th', 'dl',
+                          'dt', 'dd', 'blockquote', 'h1', 'h2', 'h3', 'h4',
+                          'h5', 'h6', 'pre'])
+    # Maps an open tag to the set of tags whose opening implicitly closes
+    # it; '#block' stands for any member of block_elements.
+    breaking_rules = _make_dict_from_listing([
+        (['p'], set(['#block'])),
+        (['li'], set(['li'])),
+        (['td', 'th'], set(['td', 'th', 'tr', 'tbody', 'thead', 'tfoot'])),
+        (['tr'], set(['tr', 'tbody', 'thead', 'tfoot'])),
+        (['thead', 'tbody', 'tfoot'], set(['thead', 'tbody', 'tfoot'])),
+        (['dd', 'dt'], set(['dl', 'dt', 'dd']))
+    ])
+
+    def is_isolated(self, stack):
         for tag in reversed(stack):
-            if tag in self.isolated_tags:
+            if tag in self.isolated_elements:
                 return True
         return False
 
-    def normalize(self, token, stack, stream):
+    def is_breaking(self, tag, other_tag):
+        """Tell whether opening `tag` implicitly closes an open `other_tag`.
+
+        The result is falsy when `other_tag` has no breaking rules, and a
+        boolean otherwise.
+        """
+        rules = self.breaking_rules.get(other_tag)
+        if not rules:
+            return rules
+        if tag in rules:
+            return True
+        # '#block' in a rule set means "any block-level element".
+        return '#block' in rules and tag in self.block_elements
+
+    def enter_tag(self, tag, ctx):
+        """Register that `tag` was opened.
+
+        Open tags that `tag` implicitly closes are left first, innermost
+        one first; `tag` itself is then pushed onto the stack unless it is
+        a void element.
+        """
+        while ctx.stack:
+            innermost = ctx.stack[-1]
+            if not self.is_breaking(tag, innermost):
+                break
+            self.leave_tag(innermost, ctx)
+        if tag in self.void_elements:
+            return
+        ctx.stack.append(tag)
+
+    def leave_tag(self, tag, ctx):
+        """Remove `tag` from the stack of open tags.
+
+        Tags opened after `tag` that have breaking rules are treated as
+        implicitly closed and removed together with it.  If a tag without
+        breaking rules is met before `tag` is found, the stack is left
+        untouched.  Fails through `ctx.fail` when no tag is open at all.
+        """
+        if not ctx.stack:
+            ctx.fail('Tried to leave "%s" but something closed '
+                     'it already' % tag)
+        # Walk from the innermost open tag outwards.
+        for idx, other_tag in enumerate(reversed(ctx.stack)):
+            if other_tag == tag:
+                # Drop the tag together with everything opened after it.
+                del ctx.stack[-(idx + 1):]
+                # Return right away: iterating further over the stack that
+                # was just shortened could match an outer tag of the same
+                # name and try to remove more entries than are left.
+                return
+            if not self.breaking_rules.get(other_tag):
+                break
+
+    def normalize(self, ctx):
         pos = 0
         buffer = []
         def write_data(value):
-            if not self.isolated(stack):
-                value = value.strip()
+            if not self.is_isolated(ctx.stack):
+                value = _ws_normalize_re.sub(' ', value.strip())
             buffer.append(value)
 
-        for match in _tag_re.finditer(token.value):
+        for match in _tag_re.finditer(ctx.token.value):
             closes, tag, sole = match.groups()
-            preamble = token.value[pos:match.start()]
+            preamble = ctx.token.value[pos:match.start()]
             write_data(preamble)
             if sole:
                 write_data(sole)
             else:
                 buffer.append(match.group())
-                if closes:
-                    if stack.pop() != tag:
-                        raise TemplateSyntaxError('HTML has to be balanced '
-                            'when htmlcompress extension is active',
-                            token.lineno, stream.name, stream.filename)
-                else:
-                    stack.append(tag)
+                (closes and self.leave_tag or self.enter_tag)(tag, ctx)
             pos = match.end()
 
-        write_data(token.value[pos:])
+        write_data(ctx.token.value[pos:])
         return u''.join(buffer)
 
     def filter_stream(self, stream):
-        stack = []
+        ctx = StreamProcessContext(stream)
         for token in stream:
             if token.type != 'data':
                 yield token
                 continue
-            value = self.normalize(token, stack, stream)
+            ctx.token = token
+            value = self.normalize(ctx)
             yield Token(token.lineno, 'data', value)
 
 
 class SelectiveHTMLCompress(HTMLCompress):
 
     def filter_stream(self, stream):
-        def fail(msg):
-            raise TemplateSyntaxError(msg, stream.current.lineno,
-                                      stream.name, stream.filename)
-        stack = []
+        ctx = StreamProcessContext(stream)
         strip_depth = 0
         while 1:
             if stream.current.type == 'block_begin':
@@ -83,14 +135,15 @@ def fail(msg):
                     else:
                         strip_depth -= 1
                         if strip_depth < 0:
-                            fail('Unexpected tag endstrip')
+                            ctx.fail('Unexpected tag endstrip')
                     stream.skip()
                     if stream.current.type != 'block_end':
-                        fail('expected end of block, got %s' %
-                             describe_token(stream.current))
+                        ctx.fail('expected end of block, got %s' %
+                                 describe_token(stream.current))
                     stream.skip()
             if strip_depth > 0 and stream.current.type == 'data':
-                value = self.normalize(stream.current, stack, stream)
+                ctx.token = stream.current
+                value = self.normalize(ctx)
                 yield Token(stream.current.lineno, 'data', value)
             else:
                 yield stream.current
@@ -111,7 +164,8 @@ def test():
             }
           </script>
           <body>
-            <li><a href="{{ href }}">{{ title }}</a></li>
+            <li><a href="{{ href }}">{{ title }}</a><br>Test   Foo
+            <li><a href="{{ href }}">{{ title }}</a><img src=test.png>
           </body>
         </html>
     ''')
@@ -122,8 +176,14 @@ def test():
         Normal   <span>  unchanged </span> stuff
         {% strip %}Stripped <span class=foo  >   test   </span>
         <a href="foo">  test </a> {{ foo }}
-        {% endstrip %}
         Normal <stuff>   again {{ foo }}  </stuff>
+        <p>
+          Foo<br>Bar
+          Baz
+        <p>
+          Moep    <span>Test</span>    Moep
+        </p>
+        {% endstrip %}
     ''')
     print tmpl.render(foo=42)