diff --git a/Doc/library/stdtypes.rst b/Doc/library/stdtypes.rst index 2c5b721093d5c0..3f7ba196cf7019 100644 --- a/Doc/library/stdtypes.rst +++ b/Doc/library/stdtypes.rst @@ -3173,6 +3173,92 @@ objects. .. versionadded:: 3.14 + .. method:: take_bytes(n=None, /) + + Take the first *n* bytes as an immutable :class:`bytes`. Defaults to all + bytes. + + If *n* is negative indexes from the end and takes the first :func:`len` + plus *n* bytes. If *n* is out of bounds raises :exc:`IndexError`. + + Taking less than the full length will leave remaining bytes in the + :class:`bytearray` which requires a copy. If the remaining bytes should be + discarded use :func:`~bytearray.resize` or :keyword:`del` to truncate + then :func:`~bytearray.take_bytes` without a size. + + .. impl-detail:: + + Taking all bytes is a zero-copy operation. + + .. list-table:: Suggested Replacements + :header-rows: 1 + + * - Description + - Old + - New + + * - Return :class:`bytes` after working with :class:`bytearray` + - .. code:: python + + + def read() -> bytes: + buffer = bytearray(1024) + ... + return bytes(buffer) + - .. code:: python + + def read() -> bytes: + buffer = bytearray(1024) + ... + return buffer.take_bytes() + + * - Empty a buffer getting the bytes + - .. code:: python + + buffer = bytearray(1024) + ... + data = bytes(buffer) + buffer.clear() + - .. code:: python + + buffer = bytearray(1024) + ... + data = buffer.take_bytes() + + * - Split a buffer at a specific separator + - .. code:: python + + buffer = bytearray(b'abc\ndef') + n = buffer.find(b'\n') + data = bytes(buffer[:n + 1]) + del buffer[:n + 1] + assert buffer == bytearray(b'def') + + - .. code:: python + + buffer = bytearray(b'abc\ndef') + n = buffer.find(b'\n') + data = buffer.take_bytes(n + 1) + + * - Split a buffer at a specific separator; discard after the separator + - .. code:: python + + buffer = bytearray(b'abc\ndef') + n = buffer.find(b'\n') + data = bytes(buffer[:n]) + buffer.clear() + assert data == b'abc' + assert len(buffer) == 0 + + - .. code:: python + + buffer = bytearray(b'abc\ndef') + n = buffer.find(b'\n') + buffer.resize(n) + data = buffer.take_bytes() + + .. versionadded:: next + Since bytearray objects are sequences of integers (akin to a list), for a bytearray object *b*, ``b[0]`` will be an integer, while ``b[0:1]`` will be a bytearray object of length 1. (This contrasts with text strings, where diff --git a/Include/cpython/bytearrayobject.h b/Include/cpython/bytearrayobject.h index 4dddef713ce097..f116271ad655b5 100644 --- a/Include/cpython/bytearrayobject.h +++ b/Include/cpython/bytearrayobject.h @@ -9,6 +9,7 @@ typedef struct { char *ob_bytes; /* Physical backing buffer */ char *ob_start; /* Logical start inside ob_bytes */ Py_ssize_t ob_exports; /* How many buffer exports */ + PyObject *ob_bytes_object; /* PyBytes for zero-copy bytes conversion */ } PyByteArrayObject; PyAPI_DATA(char) _PyByteArray_empty_string[]; diff --git a/Lib/test/test_bytes.py b/Lib/test/test_bytes.py index f10e4041937f4f..23a95dc0d40b62 100644 --- a/Lib/test/test_bytes.py +++ b/Lib/test/test_bytes.py @@ -1390,6 +1390,16 @@ def test_clear(self): b.append(ord('p')) self.assertEqual(b, b'p') + # Cleared object should be empty. + b = bytearray(b'abc') + b.clear() + self.assertEqual(b.__alloc__(), 0) + base_size = sys.getsizeof(bytearray()) + self.assertEqual(sys.getsizeof(b), base_size) + c = b.copy() + self.assertEqual(c.__alloc__(), 0) + self.assertEqual(sys.getsizeof(c), base_size) + def test_copy(self): b = bytearray(b'abc') bb = b.copy() @@ -1451,6 +1461,61 @@ def test_resize(self): self.assertRaises(MemoryError, bytearray().resize, sys.maxsize) self.assertRaises(MemoryError, bytearray(1000).resize, sys.maxsize) + def test_take_bytes(self): + ba = bytearray(b'ab') + self.assertEqual(ba.take_bytes(), b'ab') + self.assertEqual(len(ba), 0) + self.assertEqual(ba, bytearray(b'')) + self.assertEqual(ba.__alloc__(), 0) + base_size = sys.getsizeof(bytearray()) + self.assertEqual(sys.getsizeof(ba), base_size) + + # Positive and negative slicing. + ba = bytearray(b'abcdef') + self.assertEqual(ba.take_bytes(1), b'a') + self.assertEqual(ba, bytearray(b'bcdef')) + self.assertEqual(len(ba), 5) + self.assertEqual(ba.take_bytes(-5), b'') + self.assertEqual(ba, bytearray(b'bcdef')) + self.assertEqual(len(ba), 5) + self.assertEqual(ba.take_bytes(-3), b'bc') + self.assertEqual(ba, bytearray(b'def')) + self.assertEqual(len(ba), 3) + self.assertEqual(ba.take_bytes(3), b'def') + self.assertEqual(ba, bytearray(b'')) + self.assertEqual(len(ba), 0) + + # Take nothing from emptiness. + self.assertEqual(ba.take_bytes(0), b'') + self.assertEqual(ba.take_bytes(), b'') + self.assertEqual(ba.take_bytes(None), b'') + + # Out of bounds, bad take value. + self.assertRaises(IndexError, ba.take_bytes, -1) + self.assertRaises(TypeError, ba.take_bytes, 3.14) + ba = bytearray(b'abcdef') + self.assertRaises(IndexError, ba.take_bytes, 7) + + # Offset between physical and logical start (ob_bytes != ob_start). + ba = bytearray(b'abcde') + del ba[:2] + self.assertEqual(ba, bytearray(b'cde')) + self.assertEqual(ba.take_bytes(), b'cde') + + # Overallocation at end. + ba = bytearray(b'abcde') + del ba[-2:] + self.assertEqual(ba, bytearray(b'abc')) + self.assertEqual(ba.take_bytes(), b'abc') + ba = bytearray(b'abcde') + ba.resize(4) + self.assertEqual(ba.take_bytes(), b'abcd') + + # Take of a bytearray with references should fail. + ba = bytearray(b'abc') + with memoryview(ba) as mv: + self.assertRaises(BufferError, ba.take_bytes) + self.assertEqual(ba.take_bytes(), b'abc') def test_setitem(self): def setitem_as_mapping(b, i, val): @@ -2557,6 +2622,11 @@ def zfill(b, a): c = a.zfill(0x400000) assert not c or c[-1] not in (0xdd, 0xcd) + def take_bytes(b, a): + b.wait() + c = a.take_bytes() + assert not c or c[0] == 48 # '0' + def check(funcs, a=None, *args): if a is None: a = bytearray(b'0' * 0x400000) @@ -2617,6 +2687,7 @@ def check(funcs, a=None, *args): check([clear] + [splitlines] * 10, bytearray(b'\n' * 0x400)) check([clear] + [startswith] * 10) check([clear] + [strip] * 10) + check([clear] + [take_bytes] * 10) check([clear] + [contains] * 10) check([clear] + [subscript] * 10) diff --git a/Lib/test/test_sys.py b/Lib/test/test_sys.py index 1198c6d35113c8..a65840d437ec62 100644 --- a/Lib/test/test_sys.py +++ b/Lib/test/test_sys.py @@ -1583,7 +1583,7 @@ def test_objecttypes(self): samples = [b'', b'u'*100000] for sample in samples: x = bytearray(sample) - check(x, vsize('n2Pi') + x.__alloc__()) + check(x, vsize('n2PiP') + x.__alloc__()) # bytearray_iterator check(iter(bytearray()), size('nP')) # bytes diff --git a/Misc/NEWS.d/next/Core_and_Builtins/2025-10-14-18-24-16.gh-issue-139871.SWtuUz.rst b/Misc/NEWS.d/next/Core_and_Builtins/2025-10-14-18-24-16.gh-issue-139871.SWtuUz.rst new file mode 100644 index 00000000000000..d4b8578afe3afc --- /dev/null +++ b/Misc/NEWS.d/next/Core_and_Builtins/2025-10-14-18-24-16.gh-issue-139871.SWtuUz.rst @@ -0,0 +1,2 @@ +Update :class:`bytearray` to use a :class:`bytes` under the hood as its buffer +and add :func:`bytearray.take_bytes` to take it out. diff --git a/Objects/bytearrayobject.c b/Objects/bytearrayobject.c index a73bfff340ce48..80ec5708b5bac0 100644 --- a/Objects/bytearrayobject.c +++ b/Objects/bytearrayobject.c @@ -141,22 +141,27 @@ PyByteArray_FromStringAndSize(const char *bytes, Py_ssize_t size) } new = PyObject_New(PyByteArrayObject, &PyByteArray_Type); - if (new == NULL) + if (new == NULL) { return NULL; + } if (size == 0) { + new->ob_bytes_object = NULL; new->ob_bytes = NULL; alloc = 0; } else { alloc = size + 1; - new->ob_bytes = PyMem_Malloc(alloc); - if (new->ob_bytes == NULL) { + new->ob_bytes_object = PyBytes_FromStringAndSize(NULL, alloc); + if (new->ob_bytes_object == NULL) { Py_DECREF(new); - return PyErr_NoMemory(); + return NULL; } - if (bytes != NULL && size > 0) + new->ob_bytes = PyBytes_AS_STRING(new->ob_bytes_object); + assert(new->ob_bytes); + if (bytes != NULL && size > 0) { memcpy(new->ob_bytes, bytes, size); + } new->ob_bytes[size] = '\0'; /* Trailing null byte */ } Py_SET_SIZE(new, size); @@ -189,7 +194,6 @@ static int bytearray_resize_lock_held(PyObject *self, Py_ssize_t requested_size) { _Py_CRITICAL_SECTION_ASSERT_OBJECT_LOCKED(self); - void *sval; PyByteArrayObject *obj = ((PyByteArrayObject *)self); /* All computations are done unsigned to avoid integer overflows (see issue #22335). */ @@ -220,6 +224,11 @@ bytearray_resize_lock_held(PyObject *self, Py_ssize_t requested_size) if (size < alloc / 2) { /* Major downsize; resize down to exact size */ alloc = size + 1; + + /* If new size is 0; don't need to allocate one byte for null. */ + if (size == 0) { + alloc = 0; + } } else { /* Minor downsize; quick exit */ @@ -235,7 +244,7 @@ bytearray_resize_lock_held(PyObject *self, Py_ssize_t requested_size) alloc = size + (size >> 3) + (size < 9 ? 3 : 6); } else { - /* Major upsize; resize up to exact size */ + /* Major upsize; resize up to exact size. Upsize always means size > 0 */ alloc = size + 1; } } @@ -244,25 +253,28 @@ bytearray_resize_lock_held(PyObject *self, Py_ssize_t requested_size) return -1; } + /* re-align data to the start of the allocation. */ if (logical_offset > 0) { - sval = PyMem_Malloc(alloc); - if (sval == NULL) { - PyErr_NoMemory(); + memmove(obj->ob_bytes, obj->ob_start, + Py_MIN(requested_size, Py_SIZE(self))); + } + + if (obj->ob_bytes_object == NULL) { + obj->ob_bytes_object = PyBytes_FromStringAndSize(NULL, alloc); + if (obj->ob_bytes_object == NULL) { return -1; } - memcpy(sval, PyByteArray_AS_STRING(self), - Py_MIN((size_t)requested_size, (size_t)Py_SIZE(self))); - PyMem_Free(obj->ob_bytes); } else { - sval = PyMem_Realloc(obj->ob_bytes, alloc); - if (sval == NULL) { - PyErr_NoMemory(); + if (_PyBytes_Resize(&obj->ob_bytes_object, alloc) == -1) { + Py_SET_SIZE(self, 0); + obj->ob_bytes = obj->ob_start = NULL; + FT_ATOMIC_STORE_SSIZE_RELAXED(obj->ob_alloc, 0); return -1; } } - obj->ob_bytes = obj->ob_start = sval; + obj->ob_bytes = obj->ob_start = PyBytes_AS_STRING(obj->ob_bytes_object); Py_SET_SIZE(self, size); FT_ATOMIC_STORE_SSIZE_RELAXED(obj->ob_alloc, alloc); obj->ob_bytes[size] = '\0'; /* Trailing null byte */ @@ -1169,9 +1181,7 @@ bytearray_dealloc(PyObject *op) "deallocated bytearray object has exported buffers"); PyErr_Print(); } - if (self->ob_bytes != 0) { - PyMem_Free(self->ob_bytes); - } + Py_CLEAR(self->ob_bytes_object); Py_TYPE(self)->tp_free((PyObject *)self); } @@ -1491,6 +1501,95 @@ bytearray_resize_impl(PyByteArrayObject *self, Py_ssize_t size) } +/*[clinic input] +@critical_section +bytearray.take_bytes + n: object = None + Bytes to take, negative indexes from end. None indicates all bytes. + / +Take *n* bytes from the bytearray and return them as a bytes object. +[clinic start generated code]*/ + +static PyObject * +bytearray_take_bytes_impl(PyByteArrayObject *self, PyObject *n) +/*[clinic end generated code: output=3147fbc0bbbe8d94 input=b15b5172cdc6deda]*/ +{ + Py_ssize_t to_take, original; + Py_ssize_t size = Py_SIZE(self); + if (Py_IsNone(n)) { + to_take = original = size; + } + // Integer index, from start (zero, positive) or end (negative). + else if (_PyIndex_Check(n)) { + to_take = original = PyNumber_AsSsize_t(n, PyExc_IndexError); + if (to_take == -1 && PyErr_Occurred()) { + return NULL; + } + if (to_take < 0) { + to_take += size; + } + } else { + PyErr_SetString(PyExc_TypeError, "n must be an integer or None"); + return NULL; + } + + if (to_take < 0 || to_take > size) { + PyErr_Format(PyExc_IndexError, + "can't take %zd(%zd) outside size %zd", + original, to_take, size); + return NULL; + } + + // Exports may change the contents, No mutable bytes allowed. + if (!_canresize(self)) { + return NULL; + } + + if (to_take == 0 || size == 0) { + return Py_GetConstant(Py_CONSTANT_EMPTY_BYTES); + } + + // Copy remaining bytes to a new bytes. + PyObject *remaining = NULL; + Py_ssize_t remaining_length = size - to_take; + if (remaining_length > 0) { + // +1 to copy across the null which always ends a bytearray. + remaining = PyBytes_FromStringAndSize(self->ob_start + to_take, + remaining_length + 1); + if (remaining == NULL) { + return NULL; + } + } + + // If the bytes are offset inside the buffer must first align. + if (self->ob_start != self->ob_bytes) { + memmove(self->ob_bytes, self->ob_start, to_take); + self->ob_start = self->ob_bytes; + } + + if (_PyBytes_Resize(&self->ob_bytes_object, to_take) == -1) { + Py_CLEAR(remaining); + return NULL; + } + + // Point the bytearray towards the buffer with the remaining data. + PyObject *result = self->ob_bytes_object; + self->ob_bytes_object = remaining; + if (remaining) { + self->ob_bytes = self->ob_start = PyBytes_AS_STRING(self->ob_bytes_object); + Py_SET_SIZE(self, size - to_take); + FT_ATOMIC_STORE_SSIZE_RELAXED(self->ob_alloc, size - to_take + 1); + } + else { + self->ob_bytes = self->ob_start = NULL; + Py_SET_SIZE(self, 0); + FT_ATOMIC_STORE_SSIZE_RELAXED(self->ob_alloc, 0); + } + + return result; +} + + /*[clinic input] @critical_section bytearray.translate @@ -2405,7 +2504,11 @@ static PyObject * bytearray_alloc(PyObject *op, PyObject *Py_UNUSED(ignored)) { PyByteArrayObject *self = _PyByteArray_CAST(op); - return PyLong_FromSsize_t(FT_ATOMIC_LOAD_SSIZE_RELAXED(self->ob_alloc)); + Py_ssize_t alloc = FT_ATOMIC_LOAD_SSIZE_RELAXED(self->ob_alloc); + if (alloc > 0) { + alloc += sizeof(PyBytesObject); + } + return PyLong_FromSsize_t(alloc); } /*[clinic input] @@ -2601,8 +2704,13 @@ static PyObject * bytearray_sizeof_impl(PyByteArrayObject *self) /*[clinic end generated code: output=738abdd17951c427 input=e27320fd98a4bc5a]*/ { - size_t res = _PyObject_SIZE(Py_TYPE(self)); - res += (size_t)FT_ATOMIC_LOAD_SSIZE_RELAXED(self->ob_alloc) * sizeof(char); + Py_ssize_t res = _PyObject_SIZE(Py_TYPE(self)); + Py_ssize_t alloc = FT_ATOMIC_LOAD_SSIZE_RELAXED(self->ob_alloc) * sizeof(char); + if (alloc > 0) { + res += sizeof(PyBytesObject); + res += alloc * sizeof(char); + } + return PyLong_FromSize_t(res); } @@ -2686,6 +2794,7 @@ static PyMethodDef bytearray_methods[] = { BYTEARRAY_STARTSWITH_METHODDEF BYTEARRAY_STRIP_METHODDEF {"swapcase", bytearray_swapcase, METH_NOARGS, _Py_swapcase__doc__}, + BYTEARRAY_TAKE_BYTES_METHODDEF {"title", bytearray_title, METH_NOARGS, _Py_title__doc__}, BYTEARRAY_TRANSLATE_METHODDEF {"upper", bytearray_upper, METH_NOARGS, _Py_upper__doc__}, diff --git a/Objects/clinic/bytearrayobject.c.h b/Objects/clinic/bytearrayobject.c.h index 6f13865177dde5..be704ccf68f669 100644 --- a/Objects/clinic/bytearrayobject.c.h +++ b/Objects/clinic/bytearrayobject.c.h @@ -631,6 +631,43 @@ bytearray_resize(PyObject *self, PyObject *arg) return return_value; } +PyDoc_STRVAR(bytearray_take_bytes__doc__, +"take_bytes($self, n=None, /)\n" +"--\n" +"\n" +"Take *n* bytes from the bytearray and return them as a bytes object.\n" +"\n" +" n\n" +" Bytes to take, negative indexes from end. None indicates all bytes."); + +#define BYTEARRAY_TAKE_BYTES_METHODDEF \ + {"take_bytes", _PyCFunction_CAST(bytearray_take_bytes), METH_FASTCALL, bytearray_take_bytes__doc__}, + +static PyObject * +bytearray_take_bytes_impl(PyByteArrayObject *self, PyObject *n); + +static PyObject * +bytearray_take_bytes(PyObject *self, PyObject *const *args, Py_ssize_t nargs) +{ + PyObject *return_value = NULL; + PyObject *n = Py_None; + + if (!_PyArg_CheckPositional("take_bytes", nargs, 0, 1)) { + goto exit; + } + if (nargs < 1) { + goto skip_optional; + } + n = args[0]; +skip_optional: + Py_BEGIN_CRITICAL_SECTION(self); + return_value = bytearray_take_bytes_impl((PyByteArrayObject *)self, n); + Py_END_CRITICAL_SECTION(); + +exit: + return return_value; +} + PyDoc_STRVAR(bytearray_translate__doc__, "translate($self, table, /, delete=b\'\')\n" "--\n" @@ -1796,4 +1833,4 @@ bytearray_sizeof(PyObject *self, PyObject *Py_UNUSED(ignored)) { return bytearray_sizeof_impl((PyByteArrayObject *)self); } -/*[clinic end generated code: output=fdfe41139c91e409 input=a9049054013a1b77]*/ +/*[clinic end generated code: output=5eddefde2a001ceb input=a9049054013a1b77]*/