From f1cffbb9a99b95f3753cd948389ece2968b09514 Mon Sep 17 00:00:00 2001 From: Victor Stinner Date: Thu, 23 May 2024 00:29:36 +0200 Subject: [PATCH] WIP: Add PyUnicodeWriter API TODO: update API doc. --- docs/changelog.rst | 13 +++ pythoncapi_compat.h | 153 ++++++++++++++++++++++++++++ tests/test_pythoncapi_compat_cext.c | 146 ++++++++++++++++++++++++++ 3 files changed, 312 insertions(+) diff --git a/docs/changelog.rst b/docs/changelog.rst index 4d0fb9d..1e7ba2a 100644 --- a/docs/changelog.rst +++ b/docs/changelog.rst @@ -1,6 +1,19 @@ Changelog ========= +* 2024-07-18: Add functions: + + * ``PyUnicodeWriter_Create()`` + * ``PyUnicodeWriter_Discard()`` + * ``PyUnicodeWriter_Finish()`` + * ``PyUnicodeWriter_WriteChar()`` + * ``PyUnicodeWriter_WriteUTF8()`` + * ``PyUnicodeWriter_WriteStr()`` + * ``PyUnicodeWriter_WriteRepr()`` + * ``PyUnicodeWriter_WriteSubstring()`` + * ``PyUnicodeWriter_WriteWideChar()`` + * ``PyUnicodeWriter_Format()`` + * 2024-06-03: Add ``PyLong_GetSign()``. * 2024-04-23: Drop Python 3.5 support. It cannot be tested anymore (pip fails). * 2024-04-02: Add ``PyDict_SetDefaultRef()`` function. diff --git a/pythoncapi_compat.h b/pythoncapi_compat.h index 51e8c0d..d45828f 100644 --- a/pythoncapi_compat.h +++ b/pythoncapi_compat.h @@ -1338,6 +1338,159 @@ PyDict_SetDefaultRef(PyObject *d, PyObject *key, PyObject *default_value, } #endif +#if PY_VERSION_HEX < 0x030E0000 && PY_VERSION_HEX >= 0x03060000 && !defined(PYPY_VERSION) +typedef struct PyUnicodeWriter PyUnicodeWriter; + +static inline void PyUnicodeWriter_Discard(PyUnicodeWriter *writer) +{ + _PyUnicodeWriter_Dealloc((_PyUnicodeWriter*)writer); + PyMem_Free(writer); +} + +static inline PyUnicodeWriter* PyUnicodeWriter_Create(Py_ssize_t length) +{ + if (length < 0) { + PyErr_SetString(PyExc_ValueError, + "length must be positive"); + return NULL; + } + + const size_t size = sizeof(_PyUnicodeWriter); + PyUnicodeWriter *pub_writer = (PyUnicodeWriter *)PyMem_Malloc(size); + if (pub_writer == _Py_NULL) { + PyErr_NoMemory(); + return _Py_NULL; + } + _PyUnicodeWriter *writer = (_PyUnicodeWriter *)pub_writer; + + _PyUnicodeWriter_Init(writer); + if (_PyUnicodeWriter_Prepare(writer, length, 127) < 0) { + PyUnicodeWriter_Discard(pub_writer); + return NULL; + } + writer->overallocate = 1; + return pub_writer; +} + +static inline PyObject* PyUnicodeWriter_Finish(PyUnicodeWriter *writer) +{ + PyObject *str = _PyUnicodeWriter_Finish((_PyUnicodeWriter*)writer); + assert(((_PyUnicodeWriter*)writer)->buffer == NULL); + PyMem_Free(writer); + return str; +} + +static inline int +PyUnicodeWriter_WriteChar(PyUnicodeWriter *writer, Py_UCS4 ch) +{ + if (ch > 0x10ffff) { + PyErr_SetString(PyExc_ValueError, + "character must be in range(0x110000)"); + return -1; + } + + return _PyUnicodeWriter_WriteChar((_PyUnicodeWriter*)writer, ch); +} + +int +PyUnicodeWriter_WriteStr(PyUnicodeWriter *writer, PyObject *obj) +{ + PyObject *str = PyObject_Str(obj); + if (str == NULL) { + return -1; + } + + int res = _PyUnicodeWriter_WriteStr((_PyUnicodeWriter*)writer, str); + Py_DECREF(str); + return res; +} + +int +PyUnicodeWriter_WriteRepr(PyUnicodeWriter *writer, PyObject *obj) +{ + PyObject *str = PyObject_Repr(obj); + if (str == NULL) { + return -1; + } + + int res = _PyUnicodeWriter_WriteStr((_PyUnicodeWriter*)writer, str); + Py_DECREF(str); + return res; +} + +static inline int +PyUnicodeWriter_WriteUTF8(PyUnicodeWriter *writer, + const char *str, Py_ssize_t size) +{ + if (size < 0) { + size = (Py_ssize_t)strlen(str); + } + + PyObject *str_obj = PyUnicode_FromStringAndSize(str, size); + if (str_obj == _Py_NULL) { + return -1; + } + + int res = _PyUnicodeWriter_WriteStr((_PyUnicodeWriter*)writer, str_obj); + Py_DECREF(str_obj); + return res; +} + +static inline int +PyUnicodeWriter_WriteWideChar(PyUnicodeWriter *writer, + const wchar_t *str, Py_ssize_t size) +{ + if (size < 0) { + size = (Py_ssize_t)wcslen(str); + } + + PyObject *str_obj = PyUnicode_FromWideChar(str, size); + if (str_obj == _Py_NULL) { + return -1; + } + + int res = _PyUnicodeWriter_WriteStr((_PyUnicodeWriter*)writer, str_obj); + Py_DECREF(str_obj); + return res; +} + +static inline int +PyUnicodeWriter_WriteSubstring(PyUnicodeWriter *writer, PyObject *str, + Py_ssize_t start, Py_ssize_t end) +{ + if (!PyUnicode_Check(str)) { + PyErr_Format(PyExc_TypeError, "expect str, not %T", str); + return -1; + } + if (start < 0 || start > end) { + PyErr_Format(PyExc_ValueError, "invalid start argument"); + return -1; + } + if (end > PyUnicode_GET_LENGTH(str)) { + PyErr_Format(PyExc_ValueError, "invalid end argument"); + return -1; + } + + return _PyUnicodeWriter_WriteSubstring((_PyUnicodeWriter*)writer, str, + start, end); +} + +static inline int +PyUnicodeWriter_Format(PyUnicodeWriter *writer, const char *format, ...) +{ + va_list vargs; + va_start(vargs, format); + PyObject *str = PyUnicode_FromFormatV(format, vargs); + va_end(vargs); + if (str == _Py_NULL) { + return -1; + } + + int res = _PyUnicodeWriter_WriteStr((_PyUnicodeWriter*)writer, str); + Py_DECREF(str); + return res; +} +#endif // PY_VERSION_HEX < 0x030E0000 // gh-116560 added PyLong_GetSign() to Python 3.14.0a0 #if PY_VERSION_HEX < 0x030E00A0 diff --git a/tests/test_pythoncapi_compat_cext.c b/tests/test_pythoncapi_compat_cext.c index aa7b206..f813548 100644 --- a/tests/test_pythoncapi_compat_cext.c +++ b/tests/test_pythoncapi_compat_cext.c @@ -1733,6 +1733,147 @@ test_get_constant(PyObject *Py_UNUSED(module), PyObject *Py_UNUSED(args)) } +#if PY_VERSION_HEX < 0x030E0000 && PY_VERSION_HEX >= 0x03060000 && !defined(PYPY_VERSION) +#define TEST_UNICODEWRITER 1 + +static PyObject * +test_unicodewriter(PyObject *Py_UNUSED(self), PyObject *Py_UNUSED(args)) +{ + PyUnicodeWriter *writer = PyUnicodeWriter_Create(0); + if (writer == NULL) { + return NULL; + } + int ret; + + // test PyUnicodeWriter_WriteStr() + PyObject *str = PyUnicode_FromString("var"); + if (str == NULL) { + goto error; + } + ret = PyUnicodeWriter_WriteStr(writer, str); + Py_CLEAR(str); + if (ret < 0) { + goto error; + } + + // test PyUnicodeWriter_WriteChar() + if (PyUnicodeWriter_WriteChar(writer, '=') < 0) { + goto error; + } + + // test PyUnicodeWriter_WriteSubstring() + str = PyUnicode_FromString("[long]"); + if (str == NULL) { + goto error; + } + ret = PyUnicodeWriter_WriteSubstring(writer, str, 1, 5); + Py_CLEAR(str); + if (ret < 0) { + goto error; + } + + // test PyUnicodeWriter_WriteUTF8() + if (PyUnicodeWriter_WriteUTF8(writer, " valu\xC3\xA9", -1) < 0) { + goto error; + } + if (PyUnicodeWriter_WriteChar(writer, ' ') < 0) { + goto error; + } + + // test PyUnicodeWriter_WriteRepr() + str = PyUnicode_FromString("repr"); + if (str == NULL) { + goto error; + } + if (PyUnicodeWriter_WriteRepr(writer, str) < 0) { + goto error; + } + Py_CLEAR(str); + + { + PyObject *result = PyUnicodeWriter_Finish(writer); + if (result == NULL) { + return NULL; + } + assert(PyUnicode_EqualToUTF8(result, "var=long valu\xC3\xA9 'repr'")); + Py_DECREF(result); + } + + Py_RETURN_NONE; + +error: + PyUnicodeWriter_Discard(writer); + return NULL; +} + + +static PyObject * +test_unicodewriter_widechar(PyObject *Py_UNUSED(self), PyObject *Py_UNUSED(args)) +{ + PyUnicodeWriter *writer = PyUnicodeWriter_Create(0); + if (writer == NULL) { + return NULL; + } + + // test PyUnicodeWriter_WriteWideChar() + int ret = PyUnicodeWriter_WriteWideChar(writer, L"euro=\u20AC", -1); + if (ret < 0) { + goto error; + } + + { + PyObject *result = PyUnicodeWriter_Finish(writer); + if (result == NULL) { + return NULL; + } + assert(PyUnicode_EqualToUTF8(result, "euro=\xe2\x82\xac")); + Py_DECREF(result); + } + + Py_RETURN_NONE; + +error: + PyUnicodeWriter_Discard(writer); + return NULL; +} + + +static PyObject * +test_unicodewriter_format(PyObject *Py_UNUSED(self), PyObject *Py_UNUSED(args)) +{ + PyUnicodeWriter *writer = PyUnicodeWriter_Create(0); + if (writer == NULL) { + return NULL; + } + + // test PyUnicodeWriter_Format() + if (PyUnicodeWriter_Format(writer, "%s %i", "Hello", 123) < 0) { + goto error; + } + + // test PyUnicodeWriter_WriteChar() + if (PyUnicodeWriter_WriteChar(writer, '.') < 0) { + goto error; + } + + { + PyObject *result = PyUnicodeWriter_Finish(writer); + if (result == NULL) { + return NULL; + } + assert(PyUnicode_EqualToUTF8(result, "Hello 123.")); + Py_DECREF(result); + } + + Py_RETURN_NONE; + +error: + PyUnicodeWriter_Discard(writer); + return NULL; +} +#endif + + static struct PyMethodDef methods[] = { {"test_object", test_object, METH_NOARGS, _Py_NULL}, {"test_py_is", test_py_is, METH_NOARGS, _Py_NULL}, @@ -1771,6 +1912,11 @@ static struct PyMethodDef methods[] = { {"test_time", test_time, METH_NOARGS, _Py_NULL}, #endif {"test_get_constant", test_get_constant, METH_NOARGS, _Py_NULL}, +#ifdef TEST_UNICODEWRITER + {"test_unicodewriter", test_unicodewriter, METH_NOARGS, _Py_NULL}, + {"test_unicodewriter_widechar", test_unicodewriter_widechar, METH_NOARGS, _Py_NULL}, + {"test_unicodewriter_format", test_unicodewriter_format, METH_NOARGS, _Py_NULL}, +#endif {_Py_NULL, _Py_NULL, 0, _Py_NULL} };