File CVE-2022-31116-surrogate-chars.patch of Package python-ujson.28874

From e0e5db9a46decfea1174217382486e06bbab4743 Mon Sep 17 00:00:00 2001
From: JustAnotherArchivist <JustAnotherArchivist@users.noreply.github.com>
Date: Thu, 9 Jun 2022 17:23:15 +0000
Subject: [PATCH] Fix handling of surrogates on decoding

This implements surrogate handling on decoding the same way the `json` module in Python's standard library does it. Lone escaped surrogates and any raw surrogates in the input result in surrogates in the output, and escaped surrogate pairs get decoded into non-BMP characters. Note that raw surrogate pairs get treated differently on platforms/compilers with 16-bit `wchar_t`, e.g. Microsoft Windows.
---
 lib/ultrajsondec.c |   46 ++++++++++++++++++++--------------------------
 python/JSONtoObj.c |    6 +++++-
 tests/tests.py     |    2 ++
 3 files changed, 27 insertions(+), 27 deletions(-)

--- a/lib/ultrajsondec.c
+++ b/lib/ultrajsondec.c
@@ -424,13 +424,15 @@ static const JSUINT8 g_decoderLookup[256
 
 FASTCALL_ATTR JSOBJ FASTCALL_MSVC decode_string ( struct DecoderState *ds)
 {
-  JSUTF16 sur[2] = { 0 };
-  int iSur = 0;
   int index;
   wchar_t *escOffset;
   wchar_t *escStart;
   size_t escLen = (ds->escEnd - ds->escStart);
   JSUINT8 *inputOffset;
+  JSUTF16 ch = 0;
+#if WCHAR_MAX >= 0x10FFFF
+  JSUINT8 *lastHighSurrogate = NULL;
+#endif
   JSUINT8 oct;
   JSUTF32 ucs;
   ds->lastType = JT_INVALID;
@@ -530,7 +532,7 @@ FASTCALL_ATTR JSOBJ FASTCALL_MSVC decode
                 case '7':
                 case '8':
                 case '9':
-                  sur[iSur] = (sur[iSur] << 4) + (JSUTF16) (*inputOffset - '0');
+                  ch = (ch << 4) + (JSUTF16) (*inputOffset - '0');
                   break;
 
                 case 'a':
@@ -539,7 +541,7 @@ FASTCALL_ATTR JSOBJ FASTCALL_MSVC decode
                 case 'd':
                 case 'e':
                 case 'f':
-                  sur[iSur] = (sur[iSur] << 4) + 10 + (JSUTF16) (*inputOffset - 'a');
+                  ch = (ch << 4) + 10 + (JSUTF16) (*inputOffset - 'a');
                   break;
 
                 case 'A':
@@ -548,39 +550,31 @@ FASTCALL_ATTR JSOBJ FASTCALL_MSVC decode
                 case 'D':
                 case 'E':
                 case 'F':
-                  sur[iSur] = (sur[iSur] << 4) + 10 + (JSUTF16) (*inputOffset - 'A');
+                  ch = (ch << 4) + 10 + (JSUTF16) (*inputOffset - 'A');
                   break;
               }
 
               inputOffset ++;
             }
 
-            if (iSur == 0)
+#if WCHAR_MAX >= 0x10FFFF
+            if ((ch & 0xfc00) == 0xdc00 && lastHighSurrogate == inputOffset - 6 * sizeof(*inputOffset))
             {
-              if((sur[iSur] & 0xfc00) == 0xd800)
-              {
-                // First of a surrogate pair, continue parsing
-                iSur ++;
-                break;
-              }
-              (*escOffset++) = (wchar_t) sur[iSur];
-              iSur = 0;
+              // Low surrogate immediately following a high surrogate
+              // Overwrite existing high surrogate with combined character
+              *(escOffset-1) = (((*(escOffset-1) - 0xd800) <<10) | (ch - 0xdc00)) + 0x10000;
             }
             else
-            {
-              // Decode pair
-              if ((sur[1] & 0xfc00) != 0xdc00)
-              {
-                return SetError (ds, -1, "Unpaired high surrogate when decoding 'string'");
-              }
-#if WCHAR_MAX == 0xffff
-              (*escOffset++) = (wchar_t) sur[0];
-              (*escOffset++) = (wchar_t) sur[1];
-#else
-              (*escOffset++) = (wchar_t) 0x10000 + (((sur[0] - 0xd800) << 10) | (sur[1] - 0xdc00));
 #endif
-              iSur = 0;
+            {
+              *(escOffset++) = (wchar_t) ch;
             }
+#if WCHAR_MAX >= 0x10FFFF
+            if ((ch & 0xfc00) == 0xd800)
+            {
+              lastHighSurrogate = inputOffset;
+            }
+#endif
           break;
         }
 
--- a/python/JSONtoObj.c
+++ b/python/JSONtoObj.c
@@ -161,7 +161,11 @@ PyObject* JSONToObj(PyObject* self, PyOb
   else
   if (PyUnicode_Check(arg))
   {
-    sarg = PyUnicode_AsUTF8String(arg);
+#if PY_MAJOR_VERSION >= 3
+    sarg = PyUnicode_AsEncodedString(arg, NULL, "surrogatepass");
+#else
+    sarg = PyUnicode_AsEncodedString(arg, NULL, "ignore");
+#endif
     if (sarg == NULL)
     {
       //Exception raised above us by codec according to docs
--- a/tests/tests.py
+++ b/tests/tests.py
@@ -224,6 +224,7 @@ class UltraJSONTests(unittest.TestCase):
     # Characters outside of Basic Multilingual Plane(larger than
     # 16 bits) are represented as \UXXXXXXXX in python but should be encoded
     # as \uXXXX\uXXXX in json.
+    @unittest.skipIf(six.PY2, "Doesn't work with Python 2")
     def testEncodeUnicodeBMP(self):
         s = '\U0001f42e\U0001f42e\U0001F42D\U0001F42D'  # 🐮🐮🐭🐭
         encoded = ujson.dumps(s)
@@ -251,6 +252,7 @@ class UltraJSONTests(unittest.TestCase):
         decoded = ujson.loads(encoded)
         self.assertEqual(s, decoded)
 
+    @unittest.skipIf(six.PY2, "Doesn't work with Python 2")
     def testEncodeSymbols(self):
         s = '\u273f\u2661\u273f'  # ✿♡✿
         encoded = ujson.dumps(s)
Places

File CVE-2022-31116-surrogate-chars.patch of Package python-ujson.28874

Places