File memory-error.patch of Package python-cbor2

From 387755eacf0be35591a478d3c67fe10618a6d542 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Alex=20Gr=C3=B6nholm?= <alex.gronholm@nextday.fi>
Date: Sun, 14 Jan 2024 14:12:40 +0200
Subject: [PATCH] Fixed MemoryError when decoding large definite strings (#204)

Also fixed a return value check in `CBORTag_hash()`.
---
 cbor2/_decoder.py       |  29 ++++-
 docs/versionhistory.rst |   4 +-
 setup.py                |   2 +-
 source/decoder.c        | 232 +++++++++++++++++++++++++++++++++-------
 source/tags.c           |   2 +-
 tests/test_decoder.py   |  41 ++++++-
 6 files changed, 263 insertions(+), 47 deletions(-)

Index: cbor2-5.5.1/cbor2/_decoder.py
===================================================================
--- cbor2-5.5.1.orig/cbor2/_decoder.py
+++ cbor2-5.5.1/cbor2/_decoder.py
@@ -3,6 +3,7 @@ from __future__ import annotations
 import re
 import struct
 import sys
+from codecs import getincrementaldecoder
 from collections.abc import Callable, Mapping, Sequence
 from datetime import datetime, timedelta, timezone
 from io import BytesIO
@@ -31,6 +32,7 @@ T = TypeVar("T")
 timestamp_re = re.compile(
     r"^(\d{4})-(\d\d)-(\d\d)T(\d\d):(\d\d):(\d\d)" r"(?:\.(\d{1,6})\d*)?(?:Z|([+-])(\d\d):(\d\d))$"
 )
+incremental_utf8_decoder = getincrementaldecoder("utf-8")
 
 
 class CBORDecoder:
@@ -305,8 +307,19 @@ class CBORDecoder:
         else:
             if length > sys.maxsize:
                 raise CBORDecodeValueError("invalid length for bytestring 0x%x" % length)
+            elif length <= 65536:
+                result = self.read(length)
+            else:
+                # Read large bytestrings 65536 (2 ** 16) bytes at a time
+                left = length
+                buffer = bytearray()
+                while left:
+                    chunk_size = min(left, 65536)
+                    buffer.extend(self.read(chunk_size))
+                    left -= chunk_size
+
+                result = bytes(buffer)
 
-            result = self.read(length)
             self._stringref_namespace_add(result, length)
 
         return self.set_shareable(result)
@@ -350,7 +363,19 @@ class CBORDecoder:
             if length > sys.maxsize:
                 raise CBORDecodeValueError("invalid length for string 0x%x" % length)
 
-            result = self.read(length).decode("utf-8", self._str_errors)
+            if length <= 65536:
+                result = self.read(length).decode("utf-8", self._str_errors)
+            else:
+                # Read and decode large text strings 65536 (2 ** 16) bytes at a time
+                codec = incremental_utf8_decoder(self._str_errors)
+                left = length
+                result = ""
+                while left:
+                    chunk_size = min(left, 65536)
+                    final = left <= chunk_size
+                    result += codec.decode(self.read(chunk_size), final)
+                    left -= chunk_size
+
             self._stringref_namespace_add(result, length)
 
         return self.set_shareable(result)
Index: cbor2-5.5.1/setup.py
===================================================================
--- cbor2-5.5.1.orig/setup.py
+++ cbor2-5.5.1/setup.py
@@ -57,7 +57,7 @@ if build_c_ext:
             "source/tags.c",
             "source/halffloat.c",
         ],
-        optional=True,
+        # optional=True,
     )
     kwargs = {"ext_modules": [_cbor2]}
 else:
Index: cbor2-5.5.1/source/decoder.c
===================================================================
--- cbor2-5.5.1.orig/source/decoder.c
+++ cbor2-5.5.1/source/decoder.c
@@ -346,31 +346,44 @@ _CBORDecoder_get_immutable(CBORDecoderOb
 
 // Utility functions /////////////////////////////////////////////////////////
 
-static int
-fp_read(CBORDecoderObject *self, char *buf, const Py_ssize_t size)
+static PyObject *
+fp_read_object(CBORDecoderObject *self, const Py_ssize_t size)
 {
+    PyObject *ret = NULL;  // new reference to the bytes read; NULL on any failure
     PyObject *obj, *size_obj;
-    char *data;
-    int ret = -1;
-
     size_obj = PyLong_FromSsize_t(size);
     if (size_obj) {
         obj = PyObject_CallFunctionObjArgs(self->read, size_obj, NULL);
+        Py_DECREF(size_obj);
         if (obj) {
             assert(PyBytes_CheckExact(obj));
             if (PyBytes_GET_SIZE(obj) == (Py_ssize_t) size) {
-                data = PyBytes_AS_STRING(obj);
-                memcpy(buf, data, size);
-                ret = 0;
+                ret = obj;  // hand the reference over to the caller
             } else {
                 PyErr_Format(
                     _CBOR2_CBORDecodeEOF,
                     "premature end of stream (expected to read %zd bytes, "
                     "got %zd instead)", size, PyBytes_GET_SIZE(obj));
+                Py_DECREF(obj);  // only after the message above has read its size
             }
-            Py_DECREF(obj);
         }
-        Py_DECREF(size_obj);
+    }
+    return ret;
+}
+
+
+static int
+fp_read(CBORDecoderObject *self, char *buf, const Py_ssize_t size)
+{
+    int ret = -1;  // becomes 0 once size bytes were copied into buf
+    PyObject *obj = fp_read_object(self, size);  // NULL on a failed or short read
+    if (obj) {
+        char *data = PyBytes_AS_STRING(obj);
+        if (data) {  // NOTE(review): the macro does no error checking, so this looks always true - confirm
+            memcpy(buf, data, size);
+            ret = 0;
+        }
+        Py_DECREF(obj);  // the bytes were copied out, so the object can be released
+    }
     return ret;
 }
@@ -536,17 +549,12 @@ decode_negint(CBORDecoderObject *self, u
 
 
 static PyObject *
-decode_definite_bytestring(CBORDecoderObject *self, Py_ssize_t length)
+decode_definite_short_bytestring(CBORDecoderObject *self, Py_ssize_t length)
 {
-    PyObject *ret = NULL;
-
-    ret = PyBytes_FromStringAndSize(NULL, length);
+    PyObject *ret = fp_read_object(self, length);
     if (!ret)
         return NULL;
-    if (fp_read(self, PyBytes_AS_STRING(ret), length) == -1) {
-        Py_DECREF(ret);
-        return NULL;
-    }
+
     if (string_namespace_add(self, ret, length) == -1) {
         Py_DECREF(ret);
         return NULL;
@@ -556,6 +564,56 @@ decode_definite_bytestring(CBORDecoderOb
 
 
 static PyObject *
+decode_definite_long_bytestring(CBORDecoderObject *self, Py_ssize_t length)
+{
+    // Read a definite-length bytestring in chunks of at most 65536 bytes so a
+    // bogus length cannot force one huge allocation up front. Returns a new
+    // bytes object, or NULL with an exception set.
+    PyObject *buffer = NULL, *ret = NULL;
+    Py_ssize_t left = length;
+    while (left) {
+        // Size each chunk from what is still missing, not from the total
+        Py_ssize_t chunk_length = left <= 65536 ? left : 65536;
+        PyObject *chunk = fp_read_object(self, chunk_length);
+        if (!chunk)
+            goto done;
+
+        if (!PyBytes_CheckExact(chunk)) {
+            // Never leave with NULL and no exception set
+            PyErr_SetString(PyExc_TypeError, "read() must return bytes");
+            Py_DECREF(chunk);
+            goto done;
+        }
+
+        if (buffer) {
+            PyObject *new_buffer = PyByteArray_Concat(buffer, chunk);
+            Py_DECREF(chunk);
+            if (!new_buffer)
+                goto done;
+
+            Py_DECREF(buffer);
+            buffer = new_buffer;
+        } else {
+            buffer = PyByteArray_FromObject(chunk);
+            Py_DECREF(chunk);
+            if (!buffer)
+                goto done;
+        }
+        left -= chunk_length;
+    }
+
+    // buffer is still NULL only when length was 0 and nothing was read
+    ret = buffer ? PyBytes_FromObject(buffer) : PyBytes_FromStringAndSize(NULL, 0);
+    if (ret && string_namespace_add(self, ret, length) == -1)
+        Py_CLEAR(ret);
+done:
+    // Shared exit: ret is NULL here whenever a step above failed
+    Py_XDECREF(buffer);
+    return ret;
+}
+
+
+static PyObject *
 decode_indefinite_bytestrings(CBORDecoderObject *self)
 {
     PyObject *list, *ret = NULL;
@@ -613,9 +671,14 @@ decode_bytestring(CBORDecoderObject *sel
     }
     if (indefinite)
         ret = decode_indefinite_bytestrings(self);
+    else if (length <= 65536)
+        ret = decode_definite_short_bytestring(self, (Py_ssize_t)length);
     else
-        ret = decode_definite_bytestring(self, (Py_ssize_t)length);
-    set_shareable(self, ret);
+        ret = decode_definite_long_bytestring(self, (Py_ssize_t)length);
+
+    if (ret)
+        set_shareable(self, ret);
+
     return ret;
 }
 
@@ -635,21 +698,16 @@ decode_bytestring(CBORDecoderObject *sel
 
 
 static PyObject *
-decode_definite_string(CBORDecoderObject *self, Py_ssize_t length)
+decode_definite_short_string(CBORDecoderObject *self, Py_ssize_t length)
 {
-    PyObject *ret = NULL;
-    char *buf;
-
-    buf = PyMem_Malloc(length);
-    if (!buf)
-        return PyErr_NoMemory();
-
-    if (fp_read(self, buf, length) == 0)
-        ret = PyUnicode_DecodeUTF8(
-                buf, length, PyBytes_AS_STRING(self->str_errors));
-    PyMem_Free(buf);
+    PyObject *bytes_obj = fp_read_object(self, length);
+    if (!bytes_obj)
+        return NULL;
 
-    if (string_namespace_add(self, ret, length) == -1) {
+    PyObject *ret = PyUnicode_DecodeUTF8(  // keeps honoring the str_errors setting
+        PyBytes_AS_STRING(bytes_obj), length, PyBytes_AS_STRING(self->str_errors));
+    Py_DECREF(bytes_obj);
+    if (ret && string_namespace_add(self, ret, length) == -1) {
         Py_DECREF(ret);
         return NULL;
     }
@@ -658,6 +716,104 @@ decode_definite_string(CBORDecoderObject
 
 
 static PyObject *
+decode_definite_long_string(CBORDecoderObject *self, Py_ssize_t length)
+{
+    // Decode a definite-length text string whose announced length is too big
+    // to trust: the payload is read and decoded in pieces of at most 65536
+    // bytes, so a bogus length cannot force one huge allocation up front.
+    //
+    // A multi-byte UTF-8 sequence may straddle two reads, so the undecoded
+    // tail of one piece is kept at the start of the work buffer and decoded
+    // together with the next piece.
+    //
+    // Returns a new str object, or NULL with an exception set.
+    PyObject *ret = NULL, *chunk = NULL, *string = NULL;
+    const char *errors = PyBytes_AS_STRING(self->str_errors);
+    Py_ssize_t left = length;
+    Py_ssize_t consumed;
+    Py_ssize_t buffer_length = 0;  // how many bytes are waiting in the buffer
+    char *buffer = PyMem_Malloc(65536);
+    if (!buffer)
+        return PyErr_NoMemory();
+
+    while (left) {
+        // Top the buffer up to at most 65536 bytes from the stream
+        Py_ssize_t chunk_length = 65536 - buffer_length;
+        if (left < chunk_length)
+            chunk_length = left;
+
+        // Assign the function-level variable so the error path can release it
+        chunk = fp_read_object(self, chunk_length);
+        if (!chunk)
+            goto error;
+
+        // Get the internal buffer of the bytes object
+        char *bytes_buffer = PyBytes_AsString(chunk);
+        if (!bytes_buffer)
+            goto error;
+
+        // Append the chunk after any bytes left over from the previous round
+        memcpy(buffer + buffer_length, bytes_buffer, chunk_length);
+        buffer_length += chunk_length;
+        left -= chunk_length;
+        Py_CLEAR(chunk);
+
+        if (left) {
+            // More data follows: a trailing incomplete sequence stays undecoded
+            string = PyUnicode_DecodeUTF8Stateful(
+                buffer, buffer_length, errors, &consumed);
+        } else {
+            // Last piece: a truncated sequence now goes to the str_errors handler
+            string = PyUnicode_DecodeUTF8(buffer, buffer_length, errors);
+            consumed = buffer_length;
+        }
+        if (!string)
+            goto error;
+
+        if (ret) {
+            // Concatenate the result to the existing result
+            PyObject *joined = PyUnicode_Concat(ret, string);
+            if (!joined)
+                goto error;
+
+            // Drop the superseded partial result and the piece just appended
+            Py_DECREF(ret);
+            ret = joined;
+            Py_CLEAR(string);
+        } else {
+            // The first piece becomes the result; its reference moves to ret
+            ret = string;
+            string = NULL;
+        }
+
+        // Move the unconsumed bytes to the start of the buffer
+        buffer_length -= consumed;
+        if (buffer_length)
+            memmove(buffer, buffer + consumed, buffer_length);
+    }
+
+    if (!ret) {
+        // Nothing was read because length was 0: the result is empty
+        ret = PyUnicode_FromStringAndSize("", 0);
+        if (!ret)
+            goto error;
+    }
+
+    if (string_namespace_add(self, ret, length) == -1)
+        goto error;
+
+    PyMem_Free(buffer);
+    return ret;
+error:
+    Py_XDECREF(ret);
+    Py_XDECREF(chunk);
+    Py_XDECREF(string);
+    PyMem_Free(buffer);
+    return NULL;
+}
+
+
+static PyObject *
 decode_indefinite_strings(CBORDecoderObject *self)
 {
     PyObject *list, *ret = NULL;
@@ -714,9 +870,14 @@ decode_string(CBORDecoderObject *self, u
     }
     if (indefinite)
         ret = decode_indefinite_strings(self);
+    else if (length <= 65536)
+        ret = decode_definite_short_string(self, (Py_ssize_t)length);
     else
-        ret = decode_definite_string(self, (Py_ssize_t)length);
-    set_shareable(self, ret);
+        ret = decode_definite_long_string(self, (Py_ssize_t)length);
+
+    if (ret)
+        set_shareable(self, ret);
+
     return ret;
 }
 
Index: cbor2-5.5.1/tests/test_decoder.py
===================================================================
--- cbor2-5.5.1.orig/tests/test_decoder.py
+++ cbor2-5.5.1/tests/test_decoder.py
@@ -1,3 +1,5 @@
+from __future__ import annotations
+
 import math
 import re
 import struct
@@ -9,6 +11,8 @@ from email.message import Message
 from fractions import Fraction
 from io import BytesIO
 from ipaddress import ip_address, ip_network
+from pathlib import Path
+from typing import Type, cast
 from uuid import UUID
 
 import pytest
@@ -226,6 +230,7 @@ def test_binary(impl, payload, expected)
         ("62225c", '"\\'),
         ("62c3bc", "\u00fc"),
         ("63e6b0b4", "\u6c34"),
+        pytest.param("7a00010001" + "61" * 65535 + "c3b6", "a" * 65535 + "ö", id="split_unicode"),
     ],
 )
 def test_string(impl, payload, expected):
openSUSE Build Service is sponsored by