File CVE-2022-31116-surrogate-chars.patch of Package python-ujson.28874
From e0e5db9a46decfea1174217382486e06bbab4743 Mon Sep 17 00:00:00 2001
From: JustAnotherArchivist <JustAnotherArchivist@users.noreply.github.com>
Date: Thu, 9 Jun 2022 17:23:15 +0000
Subject: [PATCH] Fix handling of surrogates on decoding
This implements surrogate handling on decoding the same way Python's standard-library `json` module does. Lone escaped surrogates and any raw surrogates in the input result in surrogates in the output, and escaped surrogate pairs are decoded into non-BMP characters. Note that raw surrogate pairs are treated differently on platforms/compilers with a 16-bit `wchar_t`, e.g. Microsoft Windows.
---
lib/ultrajsondec.c | 46 ++++++++++++++++++++--------------------------
python/JSONtoObj.c | 6 +++++-
tests/tests.py | 2 ++
3 files changed, 27 insertions(+), 27 deletions(-)
--- a/lib/ultrajsondec.c
+++ b/lib/ultrajsondec.c
@@ -424,13 +424,15 @@ static const JSUINT8 g_decoderLookup[256
FASTCALL_ATTR JSOBJ FASTCALL_MSVC decode_string ( struct DecoderState *ds)
{
- JSUTF16 sur[2] = { 0 };
- int iSur = 0;
int index;
wchar_t *escOffset;
wchar_t *escStart;
size_t escLen = (ds->escEnd - ds->escStart);
JSUINT8 *inputOffset;
+ JSUTF16 ch = 0;
+#if WCHAR_MAX >= 0x10FFFF
+ JSUINT8 *lastHighSurrogate = NULL;
+#endif
JSUINT8 oct;
JSUTF32 ucs;
ds->lastType = JT_INVALID;
@@ -530,7 +532,7 @@ FASTCALL_ATTR JSOBJ FASTCALL_MSVC decode
case '7':
case '8':
case '9':
- sur[iSur] = (sur[iSur] << 4) + (JSUTF16) (*inputOffset - '0');
+ ch = (ch << 4) + (JSUTF16) (*inputOffset - '0');
break;
case 'a':
@@ -539,7 +541,7 @@ FASTCALL_ATTR JSOBJ FASTCALL_MSVC decode
case 'd':
case 'e':
case 'f':
- sur[iSur] = (sur[iSur] << 4) + 10 + (JSUTF16) (*inputOffset - 'a');
+ ch = (ch << 4) + 10 + (JSUTF16) (*inputOffset - 'a');
break;
case 'A':
@@ -548,39 +550,31 @@ FASTCALL_ATTR JSOBJ FASTCALL_MSVC decode
case 'D':
case 'E':
case 'F':
- sur[iSur] = (sur[iSur] << 4) + 10 + (JSUTF16) (*inputOffset - 'A');
+ ch = (ch << 4) + 10 + (JSUTF16) (*inputOffset - 'A');
break;
}
inputOffset ++;
}
- if (iSur == 0)
+#if WCHAR_MAX >= 0x10FFFF
+ if ((ch & 0xfc00) == 0xdc00 && lastHighSurrogate == inputOffset - 6 * sizeof(*inputOffset))
{
- if((sur[iSur] & 0xfc00) == 0xd800)
- {
- // First of a surrogate pair, continue parsing
- iSur ++;
- break;
- }
- (*escOffset++) = (wchar_t) sur[iSur];
- iSur = 0;
+ // Low surrogate immediately following a high surrogate
+ // Overwrite existing high surrogate with combined character
+ *(escOffset-1) = (((*(escOffset-1) - 0xd800) <<10) | (ch - 0xdc00)) + 0x10000;
}
else
- {
- // Decode pair
- if ((sur[1] & 0xfc00) != 0xdc00)
- {
- return SetError (ds, -1, "Unpaired high surrogate when decoding 'string'");
- }
-#if WCHAR_MAX == 0xffff
- (*escOffset++) = (wchar_t) sur[0];
- (*escOffset++) = (wchar_t) sur[1];
-#else
- (*escOffset++) = (wchar_t) 0x10000 + (((sur[0] - 0xd800) << 10) | (sur[1] - 0xdc00));
#endif
- iSur = 0;
+ {
+ *(escOffset++) = (wchar_t) ch;
}
+#if WCHAR_MAX >= 0x10FFFF
+ if ((ch & 0xfc00) == 0xd800)
+ {
+ lastHighSurrogate = inputOffset;
+ }
+#endif
break;
}
--- a/python/JSONtoObj.c
+++ b/python/JSONtoObj.c
@@ -161,7 +161,11 @@ PyObject* JSONToObj(PyObject* self, PyOb
else
if (PyUnicode_Check(arg))
{
- sarg = PyUnicode_AsUTF8String(arg);
+#if PY_MAJOR_VERSION >= 3
+ sarg = PyUnicode_AsEncodedString(arg, NULL, "surrogatepass");
+#else
+ sarg = PyUnicode_AsEncodedString(arg, NULL, "ignore");
+#endif
if (sarg == NULL)
{
//Exception raised above us by codec according to docs
--- a/tests/tests.py
+++ b/tests/tests.py
@@ -224,6 +224,7 @@ class UltraJSONTests(unittest.TestCase):
# Characters outside of Basic Multilingual Plane(larger than
# 16 bits) are represented as \UXXXXXXXX in python but should be encoded
# as \uXXXX\uXXXX in json.
+ @unittest.skipIf(six.PY2, "Doesn't work with Python 2")
def testEncodeUnicodeBMP(self):
s = '\U0001f42e\U0001f42e\U0001F42D\U0001F42D' # 🐮🐮🐭🐭
encoded = ujson.dumps(s)
@@ -251,6 +252,7 @@ class UltraJSONTests(unittest.TestCase):
decoded = ujson.loads(encoded)
self.assertEqual(s, decoded)
+ @unittest.skipIf(six.PY2, "Doesn't work with Python 2")
def testEncodeSymbols(self):
s = '\u273f\u2661\u273f' # ✿♡✿
encoded = ujson.dumps(s)