diff --git a/src/calibre/utils/icu_calibre_utils.h b/src/calibre/utils/icu_calibre_utils.h index c21578330c81..6de9e9e9c71d 100644 --- a/src/calibre/utils/icu_calibre_utils.h +++ b/src/calibre/utils/icu_calibre_utils.h @@ -22,11 +22,13 @@ #include #include -#if PY_VERSION_HEX >= 0x03030000 +#if PY_VERSION_HEX >= 0x03030000 #error Not implemented for python >= 3.3 #endif #define MIN(x, y) ((x)<(y)) ? (x) : (y) +#define IS_HIGH_SURROGATE(x) (0xd800 <= x && x <= 0xdbff) +#define IS_LOW_SURROGATE(x) (0xdc00 <= x && x <= 0xdfff) // Roundtripping will need to be implemented differently for python 3.3+ where strings are stored with variable widths @@ -42,21 +44,29 @@ static UChar* python_to_icu(PyObject *obj, int32_t *osz, uint8_t do_check) { PyErr_SetString(PyExc_TypeError, "Not a unicode string"); goto end; } + sz = PyUnicode_GET_SIZE(obj); #ifdef Py_UNICODE_WIDE // wide build (UCS 4) - sz = PyUnicode_GET_SIZE(obj); ans = (UChar*) calloc(2*(sz+1), sizeof(UChar)); // There can be no more than 2 UChars per character + ensure null termination if (ans == NULL) { PyErr_NoMemory(); goto end; } - u_strFromUTF32(ans, (int32_t)(2*(sz+1)), osz, (UChar32*)PyUnicode_AS_UNICODE(obj), (int32_t)sz, &status); + u_strFromUTF32WithSub(ans, (int32_t)(2*(sz+1)), osz, (UChar32*)PyUnicode_AS_UNICODE(obj), (int32_t)sz, 0xfffd, NULL, &status); if (U_FAILURE(status)) { PyErr_SetString(PyExc_ValueError, u_errorName(status)); free(ans); ans = NULL; goto end; } #else // narrow build (UTF-16) - sz = PyUnicode_GET_DATA_SIZE(obj); - ans = (UChar*) calloc(sz+2, 1); // Ensure null termination + ans = (UChar*) malloc((sz + 1) * sizeof(UChar)); if (ans == NULL) { PyErr_NoMemory(); goto end; } - memcpy(ans, PyUnicode_AS_UNICODE(obj), sz); - if (osz != NULL) *osz = (int32_t)PyUnicode_GET_SIZE(obj); + for (Py_ssize_t i = 0; i < sz; i++) { + UChar ch = PyUnicode_AS_UNICODE(obj)[i]; + if (IS_HIGH_SURROGATE(ch)) { + if (i >= sz - 1 || !IS_LOW_SURROGATE(PyUnicode_AS_UNICODE(obj)[i+1])) ans[i] = 0xfffd; + else { ans[i] = ch; ans[i+1] = PyUnicode_AS_UNICODE(obj)[i+1]; i++; } + } else if (IS_LOW_SURROGATE(ch)) { + ans[i] = 0xfffd; + } else ans[i] = ch; + } + ans[sz] = 0; // Ensure null termination + if (osz != NULL) *osz = (int32_t)sz; #endif end: return ans; @@ -104,5 +114,3 @@ static PyObject* icu_to_python(UChar *src, int32_t sz) { #endif } #endif - - diff --git a/src/calibre/utils/icu_test.py b/src/calibre/utils/icu_test.py index caa11d217a99..36cab51b4779 100644 --- a/src/calibre/utils/icu_test.py +++ b/src/calibre/utils/icu_test.py @@ -133,6 +133,8 @@ def test_roundtrip(self): ' Test roundtripping ' for r in (u'xxx\0\u2219\U0001f431xxx', u'\0', u'', u'simple'): self.ae(r, icu._icu.roundtrip(r)) + self.ae(icu._icu.roundtrip('\ud8e81'), '\ufffd1') + self.ae(icu._icu.roundtrip('\udc01\ud8e8'), '\ufffd\ufffd') for x, l in [('', 0), ('a', 1), ('\U0001f431', 1)]: self.ae(icu._icu.string_length(x), l) for x, l in [('', 0), ('a', 1), ('\U0001f431', 2)]: @@ -218,6 +220,6 @@ def test_build(): if not result.wasSuccessful(): raise SystemExit(1) + if __name__ == '__main__': run(verbosity=4) -