python · serhiy-storchaka · Jun 18, 2025
@@ -29,7 +29,8 @@
 starttagopen = re.compile('<[a-zA-Z]')
 endtagopen = re.compile('</[a-zA-Z]')
 piclose = re.compile('>')
-commentclose = re.compile(r'--\s*>')
+commentclose = re.compile(r'--!?>')  # '-->' or the incorrectly closed '--!>'
+commentabruptclose = re.compile(r'-?>')  # '>' or '->' (tried right after '<!--')
 # Note:
 #  1) if you change tagfind/attrfind remember to update locatestarttagend too;
 #  2) if you change tagfind/attrfind and/or locatestarttagend the parser will
@@ -309,6 +310,21 @@ def parse_html_declaration(self, i):
         else:
             return self.parse_bogus_comment(i)
 
+    # Internal -- parse comment, return length or -1 if not terminated
+    # see https://html.spec.whatwg.org/multipage/parsing.html#comment-start-state
+    def parse_comment(self, i, report=True):
+        rawdata = self.rawdata
+        assert rawdata.startswith('<!--', i), 'unexpected call to parse_comment()'
+        match = commentclose.search(rawdata, i+4)
+        if not match:
+            match = commentabruptclose.match(rawdata, i+4)
+            if not match:
+                return -1
+        if report:
+            j = match.start()
+            self.handle_comment(rawdata[i+4: j])
+        return match.end()
+
     # Internal -- parse bogus comment, return length or -1 if not terminated
     # see http://www.w3.org/TR/html5/tokenization.html#bogus-comment-state
     def parse_bogus_comment(self, i, report=1):

@@ -332,17 +332,41 @@ def test_comments(self):
         html = ("<!-- I'm a valid comment -->"
                 '<!--me too!-->'
                 '<!------>'
+                '<!----->'
                 '<!---->'
+                # abrupt-closing-of-empty-comment
+                '<!--->'
+                '<!-->'
                 '<!----I have many hyphens---->'
                 '<!-- I have a > in the middle -->'
-                '<!-- and I have -- in the middle! -->')
+                '<!-- and I have -- in the middle! -->'
+                '<!--incorrectly-closed-comment--!>'
+                '<!----!>'
+                '<!----!-->'
+                '<!---- >-->'
+                # nested-comment
+                '<!-- <!-- nested --> -->'
+                '<!--<!-->'
+                '<!--<!--!>'
+        )
         expected = [('comment', " I'm a valid comment "),
                     ('comment', 'me too!'),
                     ('comment', '--'),
+                    ('comment', '-'),
+                    ('comment', ''),
+                    ('comment', ''),
                     ('comment', ''),
                     ('comment', '--I have many hyphens--'),
                     ('comment', ' I have a > in the middle '),
-                    ('comment', ' and I have -- in the middle! ')]
+                    ('comment', ' and I have -- in the middle! '),
+                    ('comment', 'incorrectly-closed-comment'),
+                    ('comment', ''),
+                    ('comment', '--!'),
+                    ('comment', '-- >'),
+                    ('comment', ' <!-- nested '), ('data', ' -->'),
+                    ('comment', '<!'),
+                    ('comment', '<!'),
+        ]
         self._run_check(html, expected)
 
     def test_condcoms(self):

diff --git a/Misc/NEWS.d/next/Library/2025-06-18-13-28-08.gh-issue-135661.nADrzJ.rst b/Misc/NEWS.d/next/Library/2025-06-18-13-28-08.gh-issue-135661.nADrzJ.rst
@@ -0,0 +1,3 @@
+Fix comment parsing in :class:`html.parser.HTMLParser`. ``--!>`` now
+ends the comment. ``-- >`` no longer ends the comment. Support abnormally
+ended empty comments ``<!-->`` and ``<!--->``.