File CVE-2025-4516-DecodeError-handler.patch of Package python313
From 0afcdf15f64273dc5e4ed12ea63a01f7f1136d71 Mon Sep 17 00:00:00 2001
From: Serhiy Storchaka <storchaka@gmail.com>
Date: Mon, 12 May 2025 20:42:23 +0300
Subject: [PATCH] [3.13] gh-133767: Fix use-after-free in the unicode-escape
decoder with an error handler (GH-129648)
If the error handler is used, a new bytes object is created to set as
the object attribute of UnicodeDecodeError, and that bytes object then
replaces the original data. A pointer to the decoded data will became invalid
after destroying that temporary bytes object. So we need other way to return
the first invalid escape from _PyUnicode_DecodeUnicodeEscapeInternal().
_PyBytes_DecodeEscape() does not have such issue, because it does not
use the error handlers registry, but it should be changed for compatibility
with _PyUnicode_DecodeUnicodeEscapeInternal().
(cherry picked from commit 9f69a58623bd01349a18ba0c7a9cb1dad6a51e8e)
Co-authored-by: Serhiy Storchaka <storchaka@gmail.com>
---
Include/internal/pycore_bytesobject.h | 5
Include/internal/pycore_unicodeobject.h | 12 +-
Lib/test/test_codeccallbacks.py | 39 +++++++
Lib/test/test_codecs.py | 52 ++++++++--
Misc/NEWS.d/next/Security/2025-05-09-20-22-54.gh-issue-133767.kN2i3Q.rst | 2
Objects/bytesobject.c | 41 ++++---
Objects/unicodeobject.c | 46 +++++---
Parser/string_parser.c | 26 +++--
8 files changed, 160 insertions(+), 63 deletions(-)
create mode 100644 Misc/NEWS.d/next/Security/2025-05-09-20-22-54.gh-issue-133767.kN2i3Q.rst
Index: Python-3.13.3/Include/internal/pycore_bytesobject.h
===================================================================
--- Python-3.13.3.orig/Include/internal/pycore_bytesobject.h 2025-04-08 13:54:08.000000000 +0000
+++ Python-3.13.3/Include/internal/pycore_bytesobject.h 2025-05-17 07:33:22.001814947 +0000
@@ -20,8 +20,9 @@
// Helper for PyBytes_DecodeEscape that detects invalid escape chars.
// Export for test_peg_generator.
-PyAPI_FUNC(PyObject*) _PyBytes_DecodeEscape(const char *, Py_ssize_t,
- const char *, const char **);
+PyAPI_FUNC(PyObject*) _PyBytes_DecodeEscape2(const char *, Py_ssize_t,
+ const char *,
+ int *, const char **);
// Substring Search.
Index: Python-3.13.3/Include/internal/pycore_unicodeobject.h
===================================================================
--- Python-3.13.3.orig/Include/internal/pycore_unicodeobject.h 2025-04-08 13:54:08.000000000 +0000
+++ Python-3.13.3/Include/internal/pycore_unicodeobject.h 2025-05-17 07:33:22.001974979 +0000
@@ -142,14 +142,18 @@
// Helper for PyUnicode_DecodeUnicodeEscape that detects invalid escape
// chars.
// Export for test_peg_generator.
-PyAPI_FUNC(PyObject*) _PyUnicode_DecodeUnicodeEscapeInternal(
+PyAPI_FUNC(PyObject*) _PyUnicode_DecodeUnicodeEscapeInternal2(
const char *string, /* Unicode-Escape encoded string */
Py_ssize_t length, /* size of string */
const char *errors, /* error handling */
Py_ssize_t *consumed, /* bytes consumed */
- const char **first_invalid_escape); /* on return, points to first
- invalid escaped char in
- string. */
+ int *first_invalid_escape_char, /* on return, if not -1, contain the first
+ invalid escaped char (<= 0xff) or invalid
+ octal escape (> 0xff) in string. */
+ const char **first_invalid_escape_ptr); /* on return, if not NULL, may
+ point to the first invalid escaped
+ char in string.
+ May be NULL if errors is not NULL. */
/* --- Raw-Unicode-Escape Codecs ---------------------------------------------- */
Index: Python-3.13.3/Lib/test/test_codeccallbacks.py
===================================================================
--- Python-3.13.3.orig/Lib/test/test_codeccallbacks.py 2025-04-08 13:54:08.000000000 +0000
+++ Python-3.13.3/Lib/test/test_codeccallbacks.py 2025-05-17 07:33:22.002185827 +0000
@@ -1,6 +1,7 @@
import codecs
import html.entities
import itertools
+import re
import sys
import unicodedata
import unittest
@@ -1124,7 +1125,7 @@
text = 'abc<def>ghi'*n
text.translate(charmap)
- def test_mutatingdecodehandler(self):
+ def test_mutating_decode_handler(self):
baddata = [
("ascii", b"\xff"),
("utf-7", b"++"),
@@ -1159,6 +1160,42 @@
for (encoding, data) in baddata:
self.assertEqual(data.decode(encoding, "test.mutating"), "\u4242")
+ def test_mutating_decode_handler_unicode_escape(self):
+ decode = codecs.unicode_escape_decode
+ def mutating(exc):
+ if isinstance(exc, UnicodeDecodeError):
+ r = data.get(exc.object[:exc.end])
+ if r is not None:
+ exc.object = r[0] + exc.object[exc.end:]
+ return ('\u0404', r[1])
+ raise AssertionError("don't know how to handle %r" % exc)
+
+ codecs.register_error('test.mutating2', mutating)
+ data = {
+ br'\x0': (b'\\', 0),
+ br'\x3': (b'xxx\\', 3),
+ br'\x5': (b'x\\', 1),
+ }
+ def check(input, expected, msg):
+ with self.assertWarns(DeprecationWarning) as cm:
+ self.assertEqual(decode(input, 'test.mutating2'), (expected, len(input)))
+ self.assertIn(msg, str(cm.warning))
+
+ check(br'\x0n\z', '\u0404\n\\z', r"invalid escape sequence '\z'")
+ check(br'\x0n\501', '\u0404\n\u0141', r"invalid octal escape sequence '\501'")
+ check(br'\x0z', '\u0404\\z', r"invalid escape sequence '\z'")
+
+ check(br'\x3n\zr', '\u0404\n\\zr', r"invalid escape sequence '\z'")
+ check(br'\x3zr', '\u0404\\zr', r"invalid escape sequence '\z'")
+ check(br'\x3z5', '\u0404\\z5', r"invalid escape sequence '\z'")
+ check(memoryview(br'\x3z5x')[:-1], '\u0404\\z5', r"invalid escape sequence '\z'")
+ check(memoryview(br'\x3z5xy')[:-2], '\u0404\\z5', r"invalid escape sequence '\z'")
+
+ check(br'\x5n\z', '\u0404\n\\z', r"invalid escape sequence '\z'")
+ check(br'\x5n\501', '\u0404\n\u0141', r"invalid octal escape sequence '\501'")
+ check(br'\x5z', '\u0404\\z', r"invalid escape sequence '\z'")
+ check(memoryview(br'\x5zy')[:-1], '\u0404\\z', r"invalid escape sequence '\z'")
+
# issue32583
def test_crashing_decode_handler(self):
# better generating one more character to fill the extra space slot
Index: Python-3.13.3/Lib/test/test_codecs.py
===================================================================
--- Python-3.13.3.orig/Lib/test/test_codecs.py 2025-04-08 13:54:08.000000000 +0000
+++ Python-3.13.3/Lib/test/test_codecs.py 2025-05-17 07:33:22.002507164 +0000
@@ -1196,23 +1196,39 @@
check(br"[\1010]", b"[A0]")
check(br"[\x41]", b"[A]")
check(br"[\x410]", b"[A0]")
+
+ def test_warnings(self):
+ decode = codecs.escape_decode
+ check = coding_checker(self, decode)
for i in range(97, 123):
b = bytes([i])
if b not in b'abfnrtvx':
- with self.assertWarns(DeprecationWarning):
+ with self.assertWarnsRegex(DeprecationWarning,
+ r"invalid escape sequence '\\%c'" % i):
check(b"\\" + b, b"\\" + b)
- with self.assertWarns(DeprecationWarning):
+ with self.assertWarnsRegex(DeprecationWarning,
+ r"invalid escape sequence '\\%c'" % (i-32)):
check(b"\\" + b.upper(), b"\\" + b.upper())
- with self.assertWarns(DeprecationWarning):
+ with self.assertWarnsRegex(DeprecationWarning,
+ r"invalid escape sequence '\\8'"):
check(br"\8", b"\\8")
with self.assertWarns(DeprecationWarning):
check(br"\9", b"\\9")
- with self.assertWarns(DeprecationWarning):
+ with self.assertWarnsRegex(DeprecationWarning,
+ r"invalid escape sequence '\\\xfa'") as cm:
check(b"\\\xfa", b"\\\xfa")
for i in range(0o400, 0o1000):
- with self.assertWarns(DeprecationWarning):
+ with self.assertWarnsRegex(DeprecationWarning,
+ r"invalid octal escape sequence '\\%o'" % i):
check(rb'\%o' % i, bytes([i & 0o377]))
+ with self.assertWarnsRegex(DeprecationWarning,
+ r"invalid escape sequence '\\z'"):
+ self.assertEqual(decode(br'\x\z', 'ignore'), (b'\\z', 4))
+ with self.assertWarnsRegex(DeprecationWarning,
+ r"invalid octal escape sequence '\\501'"):
+ self.assertEqual(decode(br'\x\501', 'ignore'), (b'A', 6))
+
def test_errors(self):
decode = codecs.escape_decode
self.assertRaises(ValueError, decode, br"\x")
@@ -2661,24 +2677,40 @@
check(br"[\x410]", "[A0]")
check(br"\u20ac", "\u20ac")
check(br"\U0001d120", "\U0001d120")
+
+ def test_decode_warnings(self):
+ decode = codecs.unicode_escape_decode
+ check = coding_checker(self, decode)
for i in range(97, 123):
b = bytes([i])
if b not in b'abfnrtuvx':
- with self.assertWarns(DeprecationWarning):
+ with self.assertWarnsRegex(DeprecationWarning,
+ r"invalid escape sequence '\\%c'" % i):
check(b"\\" + b, "\\" + chr(i))
if b.upper() not in b'UN':
- with self.assertWarns(DeprecationWarning):
+ with self.assertWarnsRegex(DeprecationWarning,
+ r"invalid escape sequence '\\%c'" % (i-32)):
check(b"\\" + b.upper(), "\\" + chr(i-32))
- with self.assertWarns(DeprecationWarning):
+ with self.assertWarnsRegex(DeprecationWarning,
+ r"invalid escape sequence '\\8'"):
check(br"\8", "\\8")
with self.assertWarns(DeprecationWarning):
check(br"\9", "\\9")
- with self.assertWarns(DeprecationWarning):
+ with self.assertWarnsRegex(DeprecationWarning,
+ r"invalid escape sequence '\\\xfa'") as cm:
check(b"\\\xfa", "\\\xfa")
for i in range(0o400, 0o1000):
- with self.assertWarns(DeprecationWarning):
+ with self.assertWarnsRegex(DeprecationWarning,
+ r"invalid octal escape sequence '\\%o'" % i):
check(rb'\%o' % i, chr(i))
+ with self.assertWarnsRegex(DeprecationWarning,
+ r"invalid escape sequence '\\z'"):
+ self.assertEqual(decode(br'\x\z', 'ignore'), ('\\z', 4))
+ with self.assertWarnsRegex(DeprecationWarning,
+ r"invalid octal escape sequence '\\501'"):
+ self.assertEqual(decode(br'\x\501', 'ignore'), ('\u0141', 6))
+
def test_decode_errors(self):
decode = codecs.unicode_escape_decode
for c, d in (b'x', 2), (b'u', 4), (b'U', 4):
Index: Python-3.13.3/Misc/NEWS.d/next/Security/2025-05-09-20-22-54.gh-issue-133767.kN2i3Q.rst
===================================================================
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ Python-3.13.3/Misc/NEWS.d/next/Security/2025-05-09-20-22-54.gh-issue-133767.kN2i3Q.rst 2025-05-17 07:33:22.002786511 +0000
@@ -0,0 +1,2 @@
+Fix use-after-free in the "unicode-escape" decoder with a non-"strict" error
+handler.
Index: Python-3.13.3/Objects/bytesobject.c
===================================================================
--- Python-3.13.3.orig/Objects/bytesobject.c 2025-04-08 13:54:08.000000000 +0000
+++ Python-3.13.3/Objects/bytesobject.c 2025-05-17 07:33:22.003325939 +0000
@@ -1065,10 +1065,11 @@
}
/* Unescape a backslash-escaped string. */
-PyObject *_PyBytes_DecodeEscape(const char *s,
+PyObject *_PyBytes_DecodeEscape2(const char *s,
Py_ssize_t len,
const char *errors,
- const char **first_invalid_escape)
+ int *first_invalid_escape_char,
+ const char **first_invalid_escape_ptr)
{
int c;
char *p;
@@ -1082,7 +1083,8 @@
return NULL;
writer.overallocate = 1;
- *first_invalid_escape = NULL;
+ *first_invalid_escape_char = -1;
+ *first_invalid_escape_ptr = NULL;
end = s + len;
while (s < end) {
@@ -1120,9 +1122,10 @@
c = (c<<3) + *s++ - '0';
}
if (c > 0377) {
- if (*first_invalid_escape == NULL) {
- *first_invalid_escape = s-3; /* Back up 3 chars, since we've
- already incremented s. */
+ if (*first_invalid_escape_char == -1) {
+ *first_invalid_escape_char = c;
+ /* Back up 3 chars, since we've already incremented s. */
+ *first_invalid_escape_ptr = s - 3;
}
}
*p++ = c;
@@ -1163,9 +1166,10 @@
break;
default:
- if (*first_invalid_escape == NULL) {
- *first_invalid_escape = s-1; /* Back up one char, since we've
- already incremented s. */
+ if (*first_invalid_escape_char == -1) {
+ *first_invalid_escape_char = (unsigned char)s[-1];
+ /* Back up one char, since we've already incremented s. */
+ *first_invalid_escape_ptr = s - 1;
}
*p++ = '\\';
s--;
@@ -1185,17 +1189,18 @@
Py_ssize_t Py_UNUSED(unicode),
const char *Py_UNUSED(recode_encoding))
{
- const char* first_invalid_escape;
- PyObject *result = _PyBytes_DecodeEscape(s, len, errors,
- &first_invalid_escape);
+ int first_invalid_escape_char;
+ const char *first_invalid_escape_ptr;
+ PyObject *result = _PyBytes_DecodeEscape2(s, len, errors,
+ &first_invalid_escape_char,
+ &first_invalid_escape_ptr);
if (result == NULL)
return NULL;
- if (first_invalid_escape != NULL) {
- unsigned char c = *first_invalid_escape;
- if ('4' <= c && c <= '7') {
+ if (first_invalid_escape_char != -1) {
+ if (first_invalid_escape_char > 0xff) {
if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
- "invalid octal escape sequence '\\%.3s'",
- first_invalid_escape) < 0)
+ "invalid octal escape sequence '\\%o'",
+ first_invalid_escape_char) < 0)
{
Py_DECREF(result);
return NULL;
@@ -1204,7 +1209,7 @@
else {
if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
"invalid escape sequence '\\%c'",
- c) < 0)
+ first_invalid_escape_char) < 0)
{
Py_DECREF(result);
return NULL;
Index: Python-3.13.3/Objects/unicodeobject.c
===================================================================
--- Python-3.13.3.orig/Objects/unicodeobject.c 2025-04-08 13:54:08.000000000 +0000
+++ Python-3.13.3/Objects/unicodeobject.c 2025-05-17 07:33:22.004748214 +0000
@@ -6177,13 +6177,15 @@
/* --- Unicode Escape Codec ----------------------------------------------- */
PyObject *
-_PyUnicode_DecodeUnicodeEscapeInternal(const char *s,
+_PyUnicode_DecodeUnicodeEscapeInternal2(const char *s,
Py_ssize_t size,
const char *errors,
Py_ssize_t *consumed,
- const char **first_invalid_escape)
+ int *first_invalid_escape_char,
+ const char **first_invalid_escape_ptr)
{
const char *starts = s;
+ const char *initial_starts = starts;
_PyUnicodeWriter writer;
const char *end;
PyObject *errorHandler = NULL;
@@ -6191,7 +6193,8 @@
_PyUnicode_Name_CAPI *ucnhash_capi;
// so we can remember if we've seen an invalid escape char or not
- *first_invalid_escape = NULL;
+ *first_invalid_escape_char = -1;
+ *first_invalid_escape_ptr = NULL;
if (size == 0) {
if (consumed) {
@@ -6279,9 +6282,12 @@
}
}
if (ch > 0377) {
- if (*first_invalid_escape == NULL) {
- *first_invalid_escape = s-3; /* Back up 3 chars, since we've
- already incremented s. */
+ if (*first_invalid_escape_char == -1) {
+ *first_invalid_escape_char = ch;
+ if (starts == initial_starts) {
+ /* Back up 3 chars, since we've already incremented s. */
+ *first_invalid_escape_ptr = s - 3;
+ }
}
}
WRITE_CHAR(ch);
@@ -6376,9 +6382,12 @@
goto error;
default:
- if (*first_invalid_escape == NULL) {
- *first_invalid_escape = s-1; /* Back up one char, since we've
- already incremented s. */
+ if (*first_invalid_escape_char == -1) {
+ *first_invalid_escape_char = c;
+ if (starts == initial_starts) {
+ /* Back up one char, since we've already incremented s. */
+ *first_invalid_escape_ptr = s - 1;
+ }
}
WRITE_ASCII_CHAR('\\');
WRITE_CHAR(c);
@@ -6423,18 +6432,19 @@
const char *errors,
Py_ssize_t *consumed)
{
- const char *first_invalid_escape;
- PyObject *result = _PyUnicode_DecodeUnicodeEscapeInternal(s, size, errors,
+ int first_invalid_escape_char;
+ const char *first_invalid_escape_ptr;
+ PyObject *result = _PyUnicode_DecodeUnicodeEscapeInternal2(s, size, errors,
consumed,
- &first_invalid_escape);
+ &first_invalid_escape_char,
+ &first_invalid_escape_ptr);
if (result == NULL)
return NULL;
- if (first_invalid_escape != NULL) {
- unsigned char c = *first_invalid_escape;
- if ('4' <= c && c <= '7') {
+ if (first_invalid_escape_char != -1) {
+ if (first_invalid_escape_char > 0xff) {
if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
- "invalid octal escape sequence '\\%.3s'",
- first_invalid_escape) < 0)
+ "invalid octal escape sequence '\\%o'",
+ first_invalid_escape_char) < 0)
{
Py_DECREF(result);
return NULL;
@@ -6443,7 +6453,7 @@
else {
if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
"invalid escape sequence '\\%c'",
- c) < 0)
+ first_invalid_escape_char) < 0)
{
Py_DECREF(result);
return NULL;
Index: Python-3.13.3/Parser/string_parser.c
===================================================================
--- Python-3.13.3.orig/Parser/string_parser.c 2025-04-08 13:54:08.000000000 +0000
+++ Python-3.13.3/Parser/string_parser.c 2025-05-17 07:33:22.005519309 +0000
@@ -184,15 +184,18 @@
len = (size_t)(p - buf);
s = buf;
- const char *first_invalid_escape;
- v = _PyUnicode_DecodeUnicodeEscapeInternal(s, (Py_ssize_t)len, NULL, NULL, &first_invalid_escape);
+ int first_invalid_escape_char;
+ const char *first_invalid_escape_ptr;
+ v = _PyUnicode_DecodeUnicodeEscapeInternal2(s, (Py_ssize_t)len, NULL, NULL,
+ &first_invalid_escape_char,
+ &first_invalid_escape_ptr);
// HACK: later we can simply pass the line no, since we don't preserve the tokens
// when we are decoding the string but we preserve the line numbers.
- if (v != NULL && first_invalid_escape != NULL && t != NULL) {
- if (warn_invalid_escape_sequence(parser, s, first_invalid_escape, t) < 0) {
- /* We have not decref u before because first_invalid_escape points
- inside u. */
+ if (v != NULL && first_invalid_escape_ptr != NULL && t != NULL) {
+ if (warn_invalid_escape_sequence(parser, s, first_invalid_escape_ptr, t) < 0) {
+ /* We have not decref u before because first_invalid_escape_ptr
+ points inside u. */
Py_XDECREF(u);
Py_DECREF(v);
return NULL;
@@ -205,14 +208,17 @@
static PyObject *
decode_bytes_with_escapes(Parser *p, const char *s, Py_ssize_t len, Token *t)
{
- const char *first_invalid_escape;
- PyObject *result = _PyBytes_DecodeEscape(s, len, NULL, &first_invalid_escape);
+ int first_invalid_escape_char;
+ const char *first_invalid_escape_ptr;
+ PyObject *result = _PyBytes_DecodeEscape2(s, len, NULL,
+ &first_invalid_escape_char,
+ &first_invalid_escape_ptr);
if (result == NULL) {
return NULL;
}
- if (first_invalid_escape != NULL) {
- if (warn_invalid_escape_sequence(p, s, first_invalid_escape, t) < 0) {
+ if (first_invalid_escape_ptr != NULL) {
+ if (warn_invalid_escape_sequence(p, s, first_invalid_escape_ptr, t) < 0) {
Py_DECREF(result);
return NULL;
}