Fix an HTML comment parsing case that can cause an infinite loop

facelessuser · waylan · commit 5354daf61814 · 2025-09-26T10:52:21.000-04:00
Fixes #1554
diff --git a/docs/changelog.md b/docs/changelog.md
@@ -10,6 +10,12 @@ and this project adheres to the
 [Python Version Specification](https://packaging.python.org/en/latest/specifications/version-specifiers/).
 See the [Contributing Guide](contributing.md) for details.
 
+## [Unreleased]
+
+### Fixed
+
+* Fix an HTML comment parsing case in some Python versions that can cause an infinite loop (#1554).
+
 ## [3.9.0] - 2025-09-04
 
 ### Changed
diff --git a/markdown/htmlparser.py b/markdown/htmlparser.py
@@ -33,6 +33,9 @@
 if TYPE_CHECKING:  # pragma: no cover
     from markdown import Markdown
 
+# Included for Python versions whose `html.parser` does not have the current comment fix
+commentclose = re.compile(r'--!?>')
+commentabruptclose = re.compile(r'-?>')
 
 # Import a copy of the html.parser lib as `htmlparser` so we can monkeypatch it.
 # Users can still do `from html import parser` and get the default behavior.
@@ -302,6 +305,22 @@ def parse_pi(self, i: int) -> int:
         self.handle_data('<?')
         return i + 2
 
+    if not hasattr(htmlparser, 'commentabruptclose'):
+        # Internal -- parse comment, return index just past its end or -1 if not terminated
+        # see https://html.spec.whatwg.org/multipage/parsing.html#comment-start-state
+        def parse_comment(self, i, report=True):
+            rawdata = self.rawdata
+            assert rawdata.startswith('<!--', i), 'unexpected call to parse_comment()'
+            match = commentclose.search(rawdata, i+4)
+            if not match:
+                match = commentabruptclose.match(rawdata, i+4)
+                if not match:
+                    return -1
+            if report:
+                j = match.start()
+                self.handle_comment(rawdata[i+4: j])
+            return match.end()
+
     def parse_html_declaration(self, i: int) -> int:
         if self.at_line_start() or self.intail:
             if self.rawdata[i:i+3] == '<![' and not self.rawdata[i:i+9] == '<![CDATA[':
diff --git a/tests/test_syntax/blocks/test_html_blocks.py b/tests/test_syntax/blocks/test_html_blocks.py
@@ -1018,7 +1018,7 @@ def test_comment_in_code_block(self):
     # Note: This is a change in behavior. Previously, Python-Markdown interpreted this in the same manner
     # as browsers and all text after the opening comment tag was considered to be in a comment. However,
     # that did not match the reference implementation. The new behavior does.
-    def test_unclosed_comment_(self):
+    def test_unclosed_comment(self):
         self.assertMarkdownRenders(
             self.dedent(
                 """
@@ -1035,6 +1035,22 @@ def test_unclosed_comment_(self):
             )
         )
 
+    def test_invalid_comment_end(self):
+        self.assertMarkdownRenders(
+            self.dedent(
+                """
+                <!-- This comment is malformed and never closes -- >
+                Some content after the bad comment.
+                """
+            ),
+            self.dedent(
+                """
+                <p>&lt;!-- This comment is malformed and never closes -- &gt;
+                Some content after the bad comment.</p>
+                """
+            )
+        )
+
     def test_raw_processing_instruction_one_line(self):
         self.assertMarkdownRenders(
             "<?php echo '>'; ?>",