File CVE-2025-6069-quad-complex-HTMLParser.patch of Package python313
From 9043edabc7e2f0dd655146e0a4571e2a0b2906af Mon Sep 17 00:00:00 2001
From: Serhiy Storchaka <storchaka@gmail.com>
Date: Fri, 13 Jun 2025 19:57:48 +0300
Subject: [PATCH] gh-135462: Fix quadratic complexity in processing special
input in HTMLParser (GH-135464)
End-of-file errors are now handled according to the HTML5 specs --
comments and declarations are automatically closed, tags are ignored.
(cherry picked from commit 6eb6c5dbfb528bd07d77b60fd71fd05d81d45c41)
Co-authored-by: Serhiy Storchaka <storchaka@gmail.com>
---
Lib/html/parser.py | 41 +++-
Lib/test/test_htmlparser.py | 97 +++++++---
Misc/NEWS.d/next/Security/2025-06-13-15-55-22.gh-issue-135462.KBeJpc.rst | 4
3 files changed, 111 insertions(+), 31 deletions(-)
create mode 100644 Misc/NEWS.d/next/Security/2025-06-13-15-55-22.gh-issue-135462.KBeJpc.rst
Index: Python-3.13.5/Lib/html/parser.py
===================================================================
--- Python-3.13.5.orig/Lib/html/parser.py 2025-06-11 17:36:57.000000000 +0200
+++ Python-3.13.5/Lib/html/parser.py 2025-07-02 16:49:52.020175099 +0200
@@ -27,6 +27,7 @@
attr_charref = re.compile(r'&(#[0-9]+|#[xX][0-9a-fA-F]+|[a-zA-Z][a-zA-Z0-9]*)[;=]?')
starttagopen = re.compile('<[a-zA-Z]')
+endtagopen = re.compile('</[a-zA-Z]')
piclose = re.compile('>')
commentclose = re.compile(r'--\s*>')
# Note:
@@ -195,7 +196,7 @@
k = self.parse_pi(i)
elif startswith("<!", i):
k = self.parse_html_declaration(i)
- elif (i + 1) < n:
+ elif (i + 1) < n or end:
self.handle_data("<")
k = i + 1
else:
@@ -203,17 +204,35 @@
if k < 0:
if not end:
break
- k = rawdata.find('>', i + 1)
- if k < 0:
- k = rawdata.find('<', i + 1)
- if k < 0:
- k = i + 1
+ if starttagopen.match(rawdata, i): # < + letter
+ pass
+ elif startswith("</", i):
+ if i + 2 == n:
+ self.handle_data("</")
+ elif endtagopen.match(rawdata, i): # </ + letter
+ pass
+ else:
+ # bogus comment
+ self.handle_comment(rawdata[i+2:])
+ elif startswith("<!--", i):
+ j = n
+ for suffix in ("--!", "--", "-"):
+ if rawdata.endswith(suffix, i+4):
+ j -= len(suffix)
+ break
+ self.handle_comment(rawdata[i+4:j])
+ elif startswith("<![CDATA[", i):
+ self.unknown_decl(rawdata[i+3:])
+ elif rawdata[i:i+9].lower() == '<!doctype':
+ self.handle_decl(rawdata[i+2:])
+ elif startswith("<!", i):
+ # bogus comment
+ self.handle_comment(rawdata[i+2:])
+ elif startswith("<?", i):
+ self.handle_pi(rawdata[i+2:])
else:
- k += 1
- if self.convert_charrefs and not self.cdata_elem:
- self.handle_data(unescape(rawdata[i:k]))
- else:
- self.handle_data(rawdata[i:k])
+ raise AssertionError("we should not get here!")
+ k = n
i = self.updatepos(i, k)
elif startswith("&#", i):
match = charref.match(rawdata, i)
Index: Python-3.13.5/Lib/test/test_htmlparser.py
===================================================================
--- Python-3.13.5.orig/Lib/test/test_htmlparser.py 2025-06-11 17:36:57.000000000 +0200
+++ Python-3.13.5/Lib/test/test_htmlparser.py 2025-07-02 16:49:52.020821697 +0200
@@ -5,6 +5,7 @@
import unittest
from unittest.mock import patch
+from test import support
class EventCollector(html.parser.HTMLParser):
@@ -430,28 +431,34 @@
('data', '<'),
('starttag', 'bc<', [('a', None)]),
('endtag', 'html'),
- ('data', '\n<img src="URL>'),
- ('comment', '/img'),
- ('endtag', 'html<')])
+ ('data', '\n')])
def test_starttag_junk_chars(self):
+ self._run_check("<", [('data', '<')])
+ self._run_check("<>", [('data', '<>')])
+ self._run_check("< >", [('data', '< >')])
+ self._run_check("< ", [('data', '< ')])
self._run_check("</>", [])
+ self._run_check("<$>", [('data', '<$>')])
self._run_check("</$>", [('comment', '$')])
self._run_check("</", [('data', '</')])
- self._run_check("</a", [('data', '</a')])
+ self._run_check("</a", [])
+ self._run_check("</ a>", [('endtag', 'a')])
+ self._run_check("</ a", [('comment', ' a')])
self._run_check("<a<a>", [('starttag', 'a<a', [])])
self._run_check("</a<a>", [('endtag', 'a<a')])
- self._run_check("<!", [('data', '<!')])
- self._run_check("<a", [('data', '<a')])
- self._run_check("<a foo='bar'", [('data', "<a foo='bar'")])
- self._run_check("<a foo='bar", [('data', "<a foo='bar")])
- self._run_check("<a foo='>'", [('data', "<a foo='>'")])
- self._run_check("<a foo='>", [('data', "<a foo='>")])
+ self._run_check("<!", [('comment', '')])
+ self._run_check("<a", [])
+ self._run_check("<a foo='bar'", [])
+ self._run_check("<a foo='bar", [])
+ self._run_check("<a foo='>'", [])
+ self._run_check("<a foo='>", [])
self._run_check("<a$>", [('starttag', 'a$', [])])
self._run_check("<a$b>", [('starttag', 'a$b', [])])
self._run_check("<a$b/>", [('startendtag', 'a$b', [])])
self._run_check("<a$b >", [('starttag', 'a$b', [])])
self._run_check("<a$b />", [('startendtag', 'a$b', [])])
+ self._run_check("</a$b>", [('endtag', 'a$b')])
def test_slashes_in_starttag(self):
self._run_check('<a foo="var"/>', [('startendtag', 'a', [('foo', 'var')])])
@@ -576,21 +583,50 @@
for html, expected in data:
self._run_check(html, expected)
- def test_EOF_in_comments_or_decls(self):
+ def test_eof_in_comments(self):
data = [
- ('<!', [('data', '<!')]),
- ('<!-', [('data', '<!-')]),
- ('<!--', [('data', '<!--')]),
- ('<![', [('data', '<![')]),
- ('<![CDATA[', [('data', '<![CDATA[')]),
- ('<![CDATA[x', [('data', '<![CDATA[x')]),
- ('<!DOCTYPE', [('data', '<!DOCTYPE')]),
- ('<!DOCTYPE HTML', [('data', '<!DOCTYPE HTML')]),
+ ('<!--', [('comment', '')]),
+ ('<!---', [('comment', '')]),
+ ('<!----', [('comment', '')]),
+ ('<!-----', [('comment', '-')]),
+ ('<!------', [('comment', '--')]),
+ ('<!----!', [('comment', '')]),
+ ('<!---!', [('comment', '-!')]),
+ ('<!---!>', [('comment', '-!>')]),
+ ('<!--foo', [('comment', 'foo')]),
+ ('<!--foo-', [('comment', 'foo')]),
+ ('<!--foo--', [('comment', 'foo')]),
+ ('<!--foo--!', [('comment', 'foo')]),
+ ('<!--<!--', [('comment', '<!')]),
+ ('<!--<!--!', [('comment', '<!')]),
]
for html, expected in data:
self._run_check(html, expected)
+
+ def test_eof_in_declarations(self):
+ data = [
+ ('<!', [('comment', '')]),
+ ('<!-', [('comment', '-')]),
+ ('<![', [('comment', '[')]),
+ ('<![CDATA[', [('unknown decl', 'CDATA[')]),
+ ('<![CDATA[x', [('unknown decl', 'CDATA[x')]),
+ ('<![CDATA[x]', [('unknown decl', 'CDATA[x]')]),
+ ('<![CDATA[x]]', [('unknown decl', 'CDATA[x]]')]),
+ ('<!DOCTYPE', [('decl', 'DOCTYPE')]),
+ ('<!DOCTYPE ', [('decl', 'DOCTYPE ')]),
+ ('<!DOCTYPE html', [('decl', 'DOCTYPE html')]),
+ ('<!DOCTYPE html ', [('decl', 'DOCTYPE html ')]),
+ ('<!DOCTYPE html PUBLIC', [('decl', 'DOCTYPE html PUBLIC')]),
+ ('<!DOCTYPE html PUBLIC "foo', [('decl', 'DOCTYPE html PUBLIC "foo')]),
+ ('<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN" "foo',
+ [('decl', 'DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN" "foo')]),
+ ]
+ for html, expected in data:
+ self._run_check(html, expected)
+
def test_bogus_comments(self):
- html = ('<! not really a comment >'
+ html = ('<!ELEMENT br EMPTY>'
+ '<! not really a comment >'
'<! not a comment either -->'
'<! -- close enough -->'
'<!><!<-- this was an empty comment>'
@@ -604,6 +640,7 @@
'<![CDATA]]>' # required '[' after CDATA
)
expected = [
+ ('comment', 'ELEMENT br EMPTY'),
('comment', ' not really a comment '),
('comment', ' not a comment either --'),
('comment', ' -- close enough --'),
@@ -684,6 +721,26 @@
('endtag', 'a'), ('data', ' bar & baz')]
)
+ @support.requires_resource('cpu')
+ def test_eof_no_quadratic_complexity(self):
+ # Each of these examples used to take about an hour.
+ # Now they take a fraction of a second.
+ def check(source):
+ parser = html.parser.HTMLParser()
+ parser.feed(source)
+ parser.close()
+ n = 120_000
+ check("<a " * n)
+ check("<a a=" * n)
+ check("</a " * 14 * n)
+ check("</a a=" * 11 * n)
+ check("<!--" * 4 * n)
+ check("<!" * 60 * n)
+ check("<?" * 19 * n)
+ check("</$" * 15 * n)
+ check("<![CDATA[" * 9 * n)
+ check("<!doctype" * 35 * n)
+
class AttributesTestCase(TestCaseBase):
Index: Python-3.13.5/Misc/NEWS.d/next/Security/2025-06-13-15-55-22.gh-issue-135462.KBeJpc.rst
===================================================================
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ Python-3.13.5/Misc/NEWS.d/next/Security/2025-06-13-15-55-22.gh-issue-135462.KBeJpc.rst 2025-07-02 16:49:52.021124951 +0200
@@ -0,0 +1,4 @@
+Fix quadratic complexity in processing specially crafted input in
+:class:`html.parser.HTMLParser`. End-of-file errors are now handled according
+to the HTML5 specs -- comments and declarations are automatically closed,
+tags are ignored.