File CVE-2025-6069-quad-complex-HTMLParser.patch of Package python

From 2a6869c71a3132eff9c7be96db9bdca48b3636aa Mon Sep 17 00:00:00 2001
From: Serhiy Storchaka <storchaka@gmail.com>
Date: Fri, 13 Jun 2025 19:57:48 +0300
Subject: [PATCH] [3.9] gh-135462: Fix quadratic complexity in processing
 special input in HTMLParser (GH-135464)

End-of-file errors are now handled according to the HTML5 specs --
comments and declarations are automatically closed, tags are ignored.
(cherry picked from commit 6eb6c5dbfb528bd07d77b60fd71fd05d81d45c41)

Co-authored-by: Serhiy Storchaka <storchaka@gmail.com>
---
 Lib/HTMLParser.py                                                        |   43 +++-
 Lib/test/test_htmlparser.py                                              |   91 +++++++++-
 Misc/NEWS.d/next/Security/2025-06-13-15-55-22.gh-issue-135462.KBeJpc.rst |    4 
 3 files changed, 118 insertions(+), 20 deletions(-)
 create mode 100644 Misc/NEWS.d/next/Security/2025-06-13-15-55-22.gh-issue-135462.KBeJpc.rst

Index: Python-2.7.18/Lib/HTMLParser.py
===================================================================
--- Python-2.7.18.orig/Lib/HTMLParser.py	2020-04-19 23:13:39.000000000 +0200
+++ Python-2.7.18/Lib/HTMLParser.py	2025-07-17 23:53:30.394672529 +0200
@@ -20,13 +20,14 @@
 charref = re.compile('&#(?:[0-9]+|[xX][0-9a-fA-F]+)[^0-9a-fA-F]')
 
 starttagopen = re.compile('<[a-zA-Z]')
+endtagopen = re.compile('</[a-zA-Z]')
 piclose = re.compile('>')
 commentclose = re.compile(r'--\s*>')
 
 # see http://www.w3.org/TR/html5/tokenization.html#tag-open-state
 # and http://www.w3.org/TR/html5/tokenization.html#tag-name-state
 # note: if you change tagfind/attrfind remember to update locatestarttagend too
-tagfind = re.compile('([a-zA-Z][^\t\n\r\f />\x00]*)(?:\s|/(?!>))*')
+tagfind = re.compile(r'([a-zA-Z][^\t\n\r\f />\x00]*)(?:\s|/(?!>))*')
 # this regex is currently unused, but left for backward compatibility
 tagfind_tolerant = re.compile('[a-zA-Z][^\t\n\r\f />\x00]*')
 
@@ -51,7 +52,7 @@
 endendtag = re.compile('>')
 # the HTML 5 spec, section 8.1.2.2, doesn't allow spaces between
 # </ and the tag name, so maybe this should be fixed
-endtagfind = re.compile('</\s*([a-zA-Z][-.a-zA-Z0-9:_]*)\s*>')
+endtagfind = re.compile(r'</\s*([a-zA-Z][-.a-zA-Z0-9:_]*)\s*>')
 
 
 class HTMLParseError(Exception):
@@ -94,7 +95,6 @@
 
     CDATA_CONTENT_ELEMENTS = ("script", "style")
 
-
     def __init__(self):
         """Initialize and reset this instance."""
         self.reset()
@@ -167,7 +167,7 @@
                     k = self.parse_pi(i)
                 elif startswith("<!", i):
                     k = self.parse_html_declaration(i)
-                elif (i + 1) < n:
+                elif (i + 1) < n or end:
                     self.handle_data("<")
                     k = i + 1
                 else:
@@ -175,14 +175,35 @@
                 if k < 0:
                     if not end:
                         break
-                    k = rawdata.find('>', i + 1)
-                    if k < 0:
-                        k = rawdata.find('<', i + 1)
-                        if k < 0:
-                            k = i + 1
+                    if starttagopen.match(rawdata, i):  # < + letter
+                        pass
+                    elif startswith("</", i):
+                        if i + 2 == n:
+                            self.handle_data("</")
+                        elif endtagopen.match(rawdata, i):  # </ + letter
+                            pass
+                        else:
+                            # bogus comment
+                            self.handle_comment(rawdata[i+2:])
+                    elif startswith("<!--", i):
+                        j = n
+                        for suffix in ("--!", "--", "-"):
+                            if rawdata.endswith(suffix, i+4):
+                                j -= len(suffix)
+                                break
+                        self.handle_comment(rawdata[i+4:j])
+                    elif startswith("<![CDATA[", i):
+                        self.unknown_decl(rawdata[i+3:])
+                    elif rawdata[i:i+9].lower() == '<!doctype':
+                        self.handle_decl(rawdata[i+2:])
+                    elif startswith("<!", i):
+                        # bogus comment
+                        self.handle_comment(rawdata[i+2:])
+                    elif startswith("<?", i):
+                        self.handle_pi(rawdata[i+2:])
                     else:
-                        k += 1
-                    self.handle_data(rawdata[i:k])
+                        raise AssertionError("we should not get here!")
+                    k = n
                 i = self.updatepos(i, k)
             elif startswith("&#", i):
                 match = charref.match(rawdata, i)
Index: Python-2.7.18/Lib/test/test_htmlparser.py
===================================================================
--- Python-2.7.18.orig/Lib/test/test_htmlparser.py	2020-04-19 23:13:39.000000000 +0200
+++ Python-2.7.18/Lib/test/test_htmlparser.py	2025-07-17 23:18:31.191457324 +0200
@@ -5,6 +5,8 @@
 import unittest
 from test import test_support
 
+from test import support
+
 
 class EventCollector(HTMLParser.HTMLParser):
 
@@ -202,23 +204,31 @@
         self._run_check(["<!--abc-->", ""], output)
 
     def test_starttag_junk_chars(self):
+        self._run_check("<", [('data', '<')])
+        self._run_check("<>", [('data', '<>')])
+        self._run_check("< >", [('data', '< >')])
+        self._run_check("< ", [('data', '< ')])
         self._run_check("</>", [])
+        self._run_check("<$>", [('data', '<$>')])
         self._run_check("</$>", [('comment', '$')])
         self._run_check("</", [('data', '</')])
-        self._run_check("</a", [('data', '</a')])
+        self._run_check("</a", [])
+        self._run_check("</ a>", [('endtag', 'a')])
+        self._run_check("</ a", [('comment', ' a')])
         self._run_check("<a<a>", [('starttag', 'a<a', [])])
         self._run_check("</a<a>", [('endtag', 'a<a')])
-        self._run_check("<!", [('data', '<!')])
-        self._run_check("<a", [('data', '<a')])
-        self._run_check("<a foo='bar'", [('data', "<a foo='bar'")])
-        self._run_check("<a foo='bar", [('data', "<a foo='bar")])
-        self._run_check("<a foo='>'", [('data', "<a foo='>'")])
-        self._run_check("<a foo='>", [('data', "<a foo='>")])
+        self._run_check("<!", [('comment', '')])
+        self._run_check("<a", [])
+        self._run_check("<a foo='bar'", [])
+        self._run_check("<a foo='bar", [])
+        self._run_check("<a foo='>'", [])
+        self._run_check("<a foo='>", [])
         self._run_check("<a$>", [('starttag', 'a$', [])])
         self._run_check("<a$b>", [('starttag', 'a$b', [])])
         self._run_check("<a$b/>", [('startendtag', 'a$b', [])])
         self._run_check("<a$b  >", [('starttag', 'a$b', [])])
         self._run_check("<a$b  />", [('startendtag', 'a$b', [])])
+        self._run_check("</a$b>", [('endtag', 'a$b')])
 
     def test_valid_doctypes(self):
         # from http://www.w3.org/QA/2002/04/valid-dtd-list.html
@@ -553,13 +563,56 @@
                     ('comment', ' and I have -- in the middle! ')]
         self._run_check(html, expected)
 
-    def test_broken_comments(self):
-        html = ('<! not really a comment >'
+    def test_eof_in_comments(self):
+        data = [
+            ('<!--', [('comment', '')]),
+            ('<!---', [('comment', '')]),
+            ('<!----', [('comment', '')]),
+            ('<!-----', [('comment', '-')]),
+            ('<!------', [('comment', '--')]),
+            ('<!----!', [('comment', '')]),
+            ('<!---!', [('comment', '-!')]),
+            ('<!---!>', [('comment', '-!>')]),
+            ('<!--foo', [('comment', 'foo')]),
+            ('<!--foo-', [('comment', 'foo')]),
+            ('<!--foo--', [('comment', 'foo')]),
+            ('<!--foo--!', [('comment', 'foo')]),
+            ('<!--<!--', [('comment', '<!')]),
+            ('<!--<!--!', [('comment', '<!')]),
+        ]
+        for html, expected in data:
+            self._run_check(html, expected)
+
+    def test_eof_in_declarations(self):
+        data = [
+            ('<!', [('comment', '')]),
+            ('<!-', [('comment', '-')]),
+            ('<![', [('comment', '[')]),
+            ('<![CDATA[', [('unknown decl', 'CDATA[')]),
+            ('<![CDATA[x', [('unknown decl', 'CDATA[x')]),
+            ('<![CDATA[x]', [('unknown decl', 'CDATA[x]')]),
+            ('<![CDATA[x]]', [('unknown decl', 'CDATA[x]]')]),
+            ('<!DOCTYPE', [('decl', 'DOCTYPE')]),
+            ('<!DOCTYPE ', [('decl', 'DOCTYPE ')]),
+            ('<!DOCTYPE html', [('decl', 'DOCTYPE html')]),
+            ('<!DOCTYPE html ', [('decl', 'DOCTYPE html ')]),
+            ('<!DOCTYPE html PUBLIC', [('decl', 'DOCTYPE html PUBLIC')]),
+            ('<!DOCTYPE html PUBLIC "foo', [('decl', 'DOCTYPE html PUBLIC "foo')]),
+            ('<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN" "foo',
+             [('decl', 'DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN" "foo')]),
+        ]
+        for html, expected in data:
+            self._run_check(html, expected)
+
+    def test_bogus_comments(self):
+        html = ('<!ELEMENT br EMPTY>'
+                '<! not really a comment >'
                 '<! not a comment either -->'
                 '<! -- close enough -->'
                 '<!><!<-- this was an empty comment>'
                 '<!!! another bogus comment !!!>')
         expected = [
+            ('comment', 'ELEMENT br EMPTY'),
             ('comment', ' not really a comment '),
             ('comment', ' not a comment either --'),
             ('comment', ' -- close enough --'),
@@ -615,6 +668,26 @@
 def test_main():
     test_support.run_unittest(HTMLParserTestCase, AttributesTestCase)
 
+    @support.requires_resource('cpu')
+    def test_eof_no_quadratic_complexity(self):
+        # Each of these examples used to take about an hour.
+        # Now they take a fraction of a second.
+        def check(source):
+            parser = html.parser.HTMLParser()
+            parser.feed(source)
+            parser.close()
+        n = 120000
+        check("<a " * n)
+        check("<a a=" * n)
+        check("</a " * 14 * n)
+        check("</a a=" * 11 * n)
+        check("<!--" * 4 * n)
+        check("<!" * 60 * n)
+        check("<?" * 19 * n)
+        check("</$" * 15 * n)
+        check("<![CDATA[" * 9 * n)
+        check("<!doctype" * 35 * n)
+
 
 if __name__ == "__main__":
     test_main()
Index: Python-2.7.18/Misc/NEWS.d/next/Security/2025-06-13-15-55-22.gh-issue-135462.KBeJpc.rst
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ Python-2.7.18/Misc/NEWS.d/next/Security/2025-06-13-15-55-22.gh-issue-135462.KBeJpc.rst	2025-07-17 23:14:57.839336343 +0200
@@ -0,0 +1,4 @@
+Fix quadratic complexity in processing specially crafted input in
+:class:`html.parser.HTMLParser`. End-of-file errors are now handled according
+to the HTML5 specs -- comments and declarations are automatically closed,
+tags are ignored.
Places

File CVE-2025-6069-quad-complex-HTMLParser.patch of Package python

Places