File memory-error.patch of Package python-cbor2
From 387755eacf0be35591a478d3c67fe10618a6d542 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Alex=20Gr=C3=B6nholm?= <alex.gronholm@nextday.fi>
Date: Sun, 14 Jan 2024 14:12:40 +0200
Subject: [PATCH] Fixed MemoryError when decoding large definite strings (#204)
Also fixed a return value check in `CBORTag_hash()`.
---
cbor2/_decoder.py | 29 ++++-
docs/versionhistory.rst | 4 +-
setup.py | 2 +-
source/decoder.c | 232 +++++++++++++++++++++++++++++++++-------
source/tags.c | 2 +-
tests/test_decoder.py | 41 ++++++-
6 files changed, 263 insertions(+), 47 deletions(-)
Index: cbor2-5.5.1/cbor2/_decoder.py
===================================================================
--- cbor2-5.5.1.orig/cbor2/_decoder.py
+++ cbor2-5.5.1/cbor2/_decoder.py
@@ -3,6 +3,7 @@ from __future__ import annotations
import re
import struct
import sys
+from codecs import getincrementaldecoder
from collections.abc import Callable, Mapping, Sequence
from datetime import datetime, timedelta, timezone
from io import BytesIO
@@ -31,6 +32,7 @@ T = TypeVar("T")
timestamp_re = re.compile(
r"^(\d{4})-(\d\d)-(\d\d)T(\d\d):(\d\d):(\d\d)" r"(?:\.(\d{1,6})\d*)?(?:Z|([+-])(\d\d):(\d\d))$"
)
+incremental_utf8_decoder = getincrementaldecoder("utf-8")
class CBORDecoder:
@@ -305,8 +307,19 @@ class CBORDecoder:
else:
if length > sys.maxsize:
raise CBORDecodeValueError("invalid length for bytestring 0x%x" % length)
+ elif length <= 65536:
+ result = self.read(length)
+ else:
+ # Read large bytestrings 65536 (2 ** 16) bytes at a time
+ left = length
+ buffer = bytearray()
+ while left:
+ chunk_size = min(left, 65536)
+ buffer.extend(self.read(chunk_size))
+ left -= chunk_size
+
+ result = bytes(buffer)
- result = self.read(length)
self._stringref_namespace_add(result, length)
return self.set_shareable(result)
@@ -350,7 +363,19 @@ class CBORDecoder:
if length > sys.maxsize:
raise CBORDecodeValueError("invalid length for string 0x%x" % length)
- result = self.read(length).decode("utf-8", self._str_errors)
+ if length <= 65536:
+ result = self.read(length).decode("utf-8", self._str_errors)
+ else:
+ # Read and decode large text strings 65536 (2 ** 16) bytes at a time
+ codec = incremental_utf8_decoder(self._str_errors)
+ left = length
+ result = ""
+ while left:
+ chunk_size = min(left, 65536)
+ final = left <= chunk_size
+ result += codec.decode(self.read(chunk_size), final)
+ left -= chunk_size
+
self._stringref_namespace_add(result, length)
return self.set_shareable(result)
Index: cbor2-5.5.1/setup.py
===================================================================
--- cbor2-5.5.1.orig/setup.py
+++ cbor2-5.5.1/setup.py
@@ -57,7 +57,7 @@ if build_c_ext:
"source/tags.c",
"source/halffloat.c",
],
- optional=True,
+ # optional=True,
)
kwargs = {"ext_modules": [_cbor2]}
else:
Index: cbor2-5.5.1/source/decoder.c
===================================================================
--- cbor2-5.5.1.orig/source/decoder.c
+++ cbor2-5.5.1/source/decoder.c
@@ -346,31 +346,50 @@ _CBORDecoder_get_immutable(CBORDecoderOb
// Utility functions /////////////////////////////////////////////////////////
-static int
-fp_read(CBORDecoderObject *self, char *buf, const Py_ssize_t size)
+// Call self->read(size) and return the bytes object it produced as a new
+// reference. Returns NULL with an exception set if the call fails, or with
+// CBORDecodeEOF raised if the stream returned a different number of bytes.
+static PyObject *
+fp_read_object(CBORDecoderObject *self, const Py_ssize_t size)
{
+ PyObject *ret = NULL;
PyObject *obj, *size_obj;
- char *data;
- int ret = -1;
-
size_obj = PyLong_FromSsize_t(size);
if (size_obj) {
obj = PyObject_CallFunctionObjArgs(self->read, size_obj, NULL);
+ Py_DECREF(size_obj);
if (obj) {
assert(PyBytes_CheckExact(obj));
if (PyBytes_GET_SIZE(obj) == (Py_ssize_t) size) {
- data = PyBytes_AS_STRING(obj);
- memcpy(buf, data, size);
- ret = 0;
+ ret = obj;
} else {
PyErr_Format(
_CBOR2_CBORDecodeEOF,
"premature end of stream (expected to read %zd bytes, "
"got %zd instead)", size, PyBytes_GET_SIZE(obj));
+ // Drop obj only after its size has been formatted into the message
+ Py_DECREF(obj);
}
- Py_DECREF(obj);
}
- Py_DECREF(size_obj);
+ }
+ return ret;
+}
+
+
+// Read exactly size bytes from the stream into buf.
+// Returns 0 on success, or -1 with an exception set.
+static int
+fp_read(CBORDecoderObject *self, char *buf, const Py_ssize_t size)
+{
+ int ret = -1;
+ PyObject *obj = fp_read_object(self, size);
+ if (obj) {
+ char *data = PyBytes_AS_STRING(obj);
+ if (data) {
+ memcpy(buf, data, size);
+ ret = 0;
+ }
+ Py_DECREF(obj);
}
return ret;
}
@@ -536,17 +549,12 @@ decode_negint(CBORDecoderObject *self, u
static PyObject *
-decode_definite_bytestring(CBORDecoderObject *self, Py_ssize_t length)
+decode_definite_short_bytestring(CBORDecoderObject *self, Py_ssize_t length)
{
- PyObject *ret = NULL;
-
- ret = PyBytes_FromStringAndSize(NULL, length);
+ PyObject *ret = fp_read_object(self, length);
if (!ret)
return NULL;
- if (fp_read(self, PyBytes_AS_STRING(ret), length) == -1) {
- Py_DECREF(ret);
- return NULL;
- }
+
if (string_namespace_add(self, ret, length) == -1) {
Py_DECREF(ret);
return NULL;
@@ -556,6 +564,62 @@ decode_definite_bytestring(CBORDecoderOb
static PyObject *
+decode_definite_long_bytestring(CBORDecoderObject *self, Py_ssize_t length)
+{
+    // Decode a definite-length byte string by reading it from the stream in
+    // chunks of at most 65536 bytes and accumulating them in a bytearray.
+    // Returns a new bytes object, or NULL with an exception set.
+    PyObject *buffer = NULL, *ret = NULL;
+    Py_ssize_t left = length;
+    while (left) {
+        // Size each read by what is still outstanding, not by the total length
+        Py_ssize_t chunk_length = left <= 65536 ? left : 65536;
+        PyObject *chunk = fp_read_object(self, chunk_length);
+        if (!chunk)
+            goto done;
+
+        if (!PyBytes_CheckExact(chunk)) {
+            Py_DECREF(chunk);
+            PyErr_SetString(
+                PyExc_TypeError, "read() did not return a bytes object");
+            goto done;
+        }
+
+        if (buffer) {
+            PyObject *new_buffer = PyByteArray_Concat(buffer, chunk);
+            Py_DECREF(chunk);
+            if (!new_buffer)
+                goto done;
+
+            // Concat hands back a new reference; release the old buffer
+            Py_DECREF(buffer);
+            buffer = new_buffer;
+        } else {
+            buffer = PyByteArray_FromObject(chunk);
+            Py_DECREF(chunk);
+            if (!buffer)
+                goto done;
+        }
+        left -= chunk_length;
+    }
+
+    // A zero length never enters the loop; return an empty bytes object then
+    if (buffer)
+        ret = PyBytes_FromObject(buffer);
+    else
+        ret = PyBytes_FromStringAndSize(NULL, 0);
+
+    if (ret && string_namespace_add(self, ret, length) == -1)
+        Py_CLEAR(ret);
+
+done:
+    // Reached on success and on failure; ret is still NULL after any failure
+    Py_XDECREF(buffer);
+    return ret;
+}
+
+
+static PyObject *
decode_indefinite_bytestrings(CBORDecoderObject *self)
{
PyObject *list, *ret = NULL;
@@ -613,9 +671,14 @@ decode_bytestring(CBORDecoderObject *sel
}
if (indefinite)
ret = decode_indefinite_bytestrings(self);
+ else if (length <= 65536)
+ ret = decode_definite_short_bytestring(self, (Py_ssize_t)length);
else
- ret = decode_definite_bytestring(self, (Py_ssize_t)length);
- set_shareable(self, ret);
+ ret = decode_definite_long_bytestring(self, (Py_ssize_t)length);
+
+ if (ret)
+ set_shareable(self, ret);
+
return ret;
}
@@ -635,21 +698,19 @@ decode_bytestring(CBORDecoderObject *sel
static PyObject *
-decode_definite_string(CBORDecoderObject *self, Py_ssize_t length)
+decode_definite_short_string(CBORDecoderObject *self, Py_ssize_t length)
{
- PyObject *ret = NULL;
- char *buf;
-
- buf = PyMem_Malloc(length);
- if (!buf)
- return PyErr_NoMemory();
-
- if (fp_read(self, buf, length) == 0)
- ret = PyUnicode_DecodeUTF8(
- buf, length, PyBytes_AS_STRING(self->str_errors));
- PyMem_Free(buf);
+ // Read `length` bytes in a single call and decode them as UTF-8 using
+ // the decoder's configured str_errors handler, as the replaced code did.
+ PyObject *bytes_obj = fp_read_object(self, length);
+ if (!bytes_obj)
+ return NULL;
- if (string_namespace_add(self, ret, length) == -1) {
+ const char *bytes = PyBytes_AS_STRING(bytes_obj);
+ PyObject *ret = PyUnicode_DecodeUTF8(
+ bytes, length, PyBytes_AS_STRING(self->str_errors));
+ Py_DECREF(bytes_obj);
+ if (ret && string_namespace_add(self, ret, length) == -1) {
Py_DECREF(ret);
return NULL;
}
@@ -658,6 +716,110 @@ decode_definite_string(CBORDecoderObject
static PyObject *
+decode_definite_long_string(CBORDecoderObject *self, Py_ssize_t length)
+{
+    // Decode a definite-length text string by reading it from the stream in
+    // chunks of at most 65536 bytes. A multi-byte UTF-8 sequence can straddle
+    // two chunks, so bytes the decoder leaves unconsumed are carried over in
+    // `buffer` and decoded together with the next chunk.
+    // Returns a new str object, or NULL with an exception set.
+    const char *errors = PyBytes_AS_STRING(self->str_errors);
+    PyObject *ret = NULL, *chunk = NULL, *string = NULL;
+    Py_ssize_t left = length;
+    Py_ssize_t buffer_size = 0;  // how many bytes are allocated for the buffer
+    Py_ssize_t buffer_length = 0;  // how many carried-over bytes it holds
+    char *buffer = NULL;
+    while (left) {
+        // Read up to 65536 bytes of data from the stream
+        Py_ssize_t chunk_length = left <= 65536 ? left : 65536;
+        chunk = fp_read_object(self, chunk_length);
+        if (!chunk)
+            goto error;
+
+        left -= chunk_length;
+        const char *source = PyBytes_AS_STRING(chunk);
+        Py_ssize_t source_length = chunk_length;
+        if (buffer_length) {
+            // Append the chunk to the carried-over bytes and decode from there
+            source_length += buffer_length;
+            if (source_length > buffer_size) {
+                char *new_buffer = PyMem_Realloc(buffer, source_length);
+                if (!new_buffer) {
+                    PyErr_NoMemory();
+                    goto error;
+                }
+
+                buffer = new_buffer;
+                buffer_size = source_length;
+            }
+            memcpy(buffer + buffer_length, source, chunk_length);
+            source = buffer;
+        }
+
+        // Pass `consumed` for every chunk but the last, so that only a
+        // truncated sequence at the very end is treated as a decoding error
+        Py_ssize_t consumed = source_length;
+        string = PyUnicode_DecodeUTF8Stateful(
+            source, source_length, errors, left ? &consumed : NULL);
+        if (!string)
+            goto error;
+
+        // Keep whatever the decoder did not consume for the next round
+        buffer_length = source_length - consumed;
+        if (buffer_length > buffer_size) {
+            char *new_buffer = PyMem_Realloc(buffer, buffer_length);
+            if (!new_buffer) {
+                PyErr_NoMemory();
+                goto error;
+            }
+
+            buffer = new_buffer;
+            buffer_size = buffer_length;
+        }
+        if (buffer_length) {
+            // memmove, because source may point into buffer itself
+            memmove(buffer, source + consumed, buffer_length);
+        }
+        Py_CLEAR(chunk);
+
+        if (ret) {
+            // Concatenate the result to the existing result
+            PyObject *joined = PyUnicode_Concat(ret, string);
+            Py_CLEAR(string);
+            if (!joined)
+                goto error;
+
+            Py_DECREF(ret);
+            ret = joined;
+        } else {
+            // Set the result to the decoded string
+            ret = string;
+            string = NULL;
+        }
+    }
+
+    // A zero length never enters the loop; return an empty string then
+    if (!ret) {
+        ret = PyUnicode_FromStringAndSize("", 0);
+        if (!ret)
+            goto error;
+    }
+
+    if (string_namespace_add(self, ret, length) == -1)
+        goto error;
+
+    PyMem_Free(buffer);
+    return ret;
+error:
+    Py_XDECREF(ret);
+    Py_XDECREF(chunk);
+    Py_XDECREF(string);
+    PyMem_Free(buffer);
+    return NULL;
+}
+
+
+static PyObject *
decode_indefinite_strings(CBORDecoderObject *self)
{
PyObject *list, *ret = NULL;
@@ -714,9 +870,14 @@ decode_string(CBORDecoderObject *self, u
}
if (indefinite)
ret = decode_indefinite_strings(self);
+ else if (length <= 65536)
+ ret = decode_definite_short_string(self, (Py_ssize_t)length);
else
- ret = decode_definite_string(self, (Py_ssize_t)length);
- set_shareable(self, ret);
+ ret = decode_definite_long_string(self, (Py_ssize_t)length);
+
+ if (ret)
+ set_shareable(self, ret);
+
return ret;
}
Index: cbor2-5.5.1/tests/test_decoder.py
===================================================================
--- cbor2-5.5.1.orig/tests/test_decoder.py
+++ cbor2-5.5.1/tests/test_decoder.py
@@ -1,3 +1,5 @@
+from __future__ import annotations
+
import math
import re
import struct
@@ -9,6 +11,8 @@ from email.message import Message
from fractions import Fraction
from io import BytesIO
from ipaddress import ip_address, ip_network
+from pathlib import Path
+from typing import Type, cast
from uuid import UUID
import pytest
@@ -226,6 +230,7 @@ def test_binary(impl, payload, expected)
("62225c", '"\\'),
("62c3bc", "\u00fc"),
("63e6b0b4", "\u6c34"),
+ pytest.param("7a00010001" + "61" * 65535 + "c3b6", "a" * 65535 + "ö", id="split_unicode"),
],
)
def test_string(impl, payload, expected):