diff --git a/Include/unicodeobject.h b/Include/unicodeobject.h --- a/Include/unicodeobject.h +++ b/Include/unicodeobject.h @@ -384,32 +384,11 @@ PyAPI_DATA(PyTypeObject) PyUnicodeIter_T ((PyASCIIObject*)op)->length : \ ((PyCompactUnicodeObject*)op)->wstr_length) -/* Returns the deprecated Py_UNICODE representation's size in code units - (this includes surrogate pairs as 2 units). - If the Py_UNICODE representation is not available, it will be computed - on request. Use PyUnicode_GET_LENGTH() for the length in code points. */ - -#define PyUnicode_GET_SIZE(op) \ - (assert(PyUnicode_Check(op)), \ - (((PyASCIIObject *)(op))->wstr) ? \ - PyUnicode_WSTR_LENGTH(op) : \ - ((void)PyUnicode_AsUnicode((PyObject *)(op)), \ - assert(((PyASCIIObject *)(op))->wstr), \ - PyUnicode_WSTR_LENGTH(op))) - -#define PyUnicode_GET_DATA_SIZE(op) \ - (PyUnicode_GET_SIZE(op) * Py_UNICODE_SIZE) - /* Alias for PyUnicode_AsUnicode(). This will create a wchar_t/Py_UNICODE representation on demand. Using this macro is very inefficient now, try to port your code to use the new PyUnicode_*BYTE_DATA() macros or use PyUnicode_WRITE() and PyUnicode_READ(). */ -#define PyUnicode_AS_UNICODE(op) \ - (assert(PyUnicode_Check(op)), \ - (((PyASCIIObject *)(op))->wstr) ? (((PyASCIIObject *)(op))->wstr) : \ - PyUnicode_AsUnicode((PyObject *)(op))) - #define PyUnicode_AS_DATA(op) \ ((const char *)(PyUnicode_AS_UNICODE(op))) @@ -710,6 +689,25 @@ PyAPI_FUNC(Py_UCS4*) PyUnicode_AsUCS4Cop PyAPI_FUNC(Py_UNICODE *) PyUnicode_AsUnicode( PyObject *unicode /* Unicode object */ ); + +PyAPI_FUNC(Py_UNICODE *) PyUnicode_AS_UNICODE( + PyObject *unicode /* Unicode object */ + ); + +/* Returns the deprecated Py_UNICODE representation's size in code units + (this includes surrogate pairs as 2 units). + If the Py_UNICODE representation is not available, it will be computed + on request. + Return -1 on error. + + Use PyUnicode_GET_LENGTH() for the length in code points. 
*/ + +PyAPI_FUNC(Py_ssize_t) PyUnicode_GET_SIZE( + PyObject *unicode /* Unicode object */ + ); + +#define PyUnicode_GET_DATA_SIZE(op) \ + (PyUnicode_GET_SIZE(op) * Py_UNICODE_SIZE) #endif /* Return a read-only pointer to the Unicode object's internal @@ -862,7 +860,7 @@ PyAPI_FUNC(void) _Py_ReleaseInternedUnic The buffer is copied into the new object. */ PyAPI_FUNC(PyObject*) PyUnicode_FromWideChar( - register const wchar_t *w, /* wchar_t buffer */ + const wchar_t *w, /* wchar_t buffer */ Py_ssize_t size /* size of buffer */ ); @@ -880,12 +878,12 @@ PyAPI_FUNC(PyObject*) PyUnicode_FromWide PyAPI_FUNC(Py_ssize_t) PyUnicode_AsWideChar( PyObject *unicode, /* Unicode object */ - register wchar_t *w, /* wchar_t buffer */ + wchar_t *w, /* wchar_t buffer */ Py_ssize_t size /* size of buffer */ ); /* Convert the Unicode object to a wide character string. The output string - always ends with a nul character. If size is not NULL, write the number of + always ends with a null character. If size is not NULL, write the number of wide characters (excluding the null character) into *size. Returns a buffer allocated by PyMem_Alloc() (use PyMem_Free() to free it) @@ -897,6 +895,23 @@ PyAPI_FUNC(wchar_t*) PyUnicode_AsWideCha Py_ssize_t *size /* number of characters of the result */ ); +/* Convert the Unicode object to a wide character string. The output string + always ends with a null character. If size is not NULL, write the number of + wide characters (excluding the null character) into *size. + + This function caches the wide character string in the Unicode object + and subsequent calls will return the same string. The memory is released + when the Unicode object is deallocated. + + On error, returns NULL, *size is undefined and an exception is raised. 
*/ + +#ifndef Py_LIMITED_API +PyAPI_FUNC(wchar_t*) PyUnicode_AsWideCharAndSize( + PyObject *unicode, /* Unicode object */ + Py_ssize_t *size /* number of characters of the result */ + ); +#endif + #ifndef Py_LIMITED_API PyAPI_FUNC(void*) _PyUnicode_AsKind(PyObject *s, unsigned int kind); #endif @@ -2020,7 +2035,7 @@ PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strrc Py_UNICODE c ); -/* Create a copy of a unicode string ending with a nul character. Return NULL +/* Create a copy of a unicode string ending with a null character. Return NULL and raise a MemoryError exception on memory allocation failure, otherwise return a new allocated buffer (use PyMem_Free() to free the buffer). */ diff --git a/Lib/test/test_capi.py b/Lib/test/test_capi.py --- a/Lib/test/test_capi.py +++ b/Lib/test/test_capi.py @@ -8,6 +8,7 @@ import subprocess import sys import time import unittest +import warnings from test import support try: import threading @@ -179,10 +180,16 @@ def test_main(): support.run_unittest(CAPITest, TestPendingCalls, Test6012, EmbeddingTest) for name in dir(_testcapi): - if name.startswith('test_'): - test = getattr(_testcapi, name) - if support.verbose: - print("internal", name) + if not name.startswith('test_'): + continue + test = getattr(_testcapi, name) + if support.verbose: + print("internal", name) + if name in ('test_u_code', 'test_Z_code'): + with warnings.catch_warnings(): + warnings.simplefilter("ignore", DeprecationWarning) + test() + else: test() # some extra thread-state tests driven via _testcapi diff --git a/Lib/test/test_getargs2.py b/Lib/test/test_getargs2.py --- a/Lib/test/test_getargs2.py +++ b/Lib/test/test_getargs2.py @@ -1,4 +1,5 @@ import unittest +import warnings from test import support from _testcapi import getargs_keywords @@ -400,39 +401,51 @@ class Bytes_TestCase(unittest.TestCase): class Unicode_TestCase(unittest.TestCase): def test_u(self): from _testcapi import getargs_u - self.assertEqual(getargs_u('abc\xe9'), 'abc\xe9') - 
self.assertRaises(TypeError, getargs_u, 'nul:\0') - self.assertRaises(TypeError, getargs_u, b'bytes') - self.assertRaises(TypeError, getargs_u, bytearray(b'bytearray')) - self.assertRaises(TypeError, getargs_u, memoryview(b'memoryview')) - self.assertRaises(TypeError, getargs_u, None) + with warnings.catch_warnings(): + warnings.simplefilter("ignore", DeprecationWarning) + + self.assertEqual(getargs_u('abc\xe9'), 'abc\xe9') + self.assertRaises(TypeError, getargs_u, 'nul:\0') + self.assertRaises(TypeError, getargs_u, b'bytes') + self.assertRaises(TypeError, getargs_u, bytearray(b'bytearray')) + self.assertRaises(TypeError, getargs_u, memoryview(b'memoryview')) + self.assertRaises(TypeError, getargs_u, None) def test_u_hash(self): from _testcapi import getargs_u_hash - self.assertEqual(getargs_u_hash('abc\xe9'), 'abc\xe9') - self.assertEqual(getargs_u_hash('nul:\0'), 'nul:\0') - self.assertRaises(TypeError, getargs_u_hash, b'bytes') - self.assertRaises(TypeError, getargs_u_hash, bytearray(b'bytearray')) - self.assertRaises(TypeError, getargs_u_hash, memoryview(b'memoryview')) - self.assertRaises(TypeError, getargs_u_hash, None) + with warnings.catch_warnings(): + warnings.simplefilter("ignore", DeprecationWarning) + + self.assertEqual(getargs_u_hash('abc\xe9'), 'abc\xe9') + self.assertEqual(getargs_u_hash('nul:\0'), 'nul:\0') + self.assertRaises(TypeError, getargs_u_hash, b'bytes') + self.assertRaises(TypeError, getargs_u_hash, bytearray(b'bytearray')) + self.assertRaises(TypeError, getargs_u_hash, memoryview(b'memoryview')) + self.assertRaises(TypeError, getargs_u_hash, None) def test_Z(self): from _testcapi import getargs_Z - self.assertEqual(getargs_Z('abc\xe9'), 'abc\xe9') - self.assertRaises(TypeError, getargs_Z, 'nul:\0') - self.assertRaises(TypeError, getargs_Z, b'bytes') - self.assertRaises(TypeError, getargs_Z, bytearray(b'bytearray')) - self.assertRaises(TypeError, getargs_Z, memoryview(b'memoryview')) - self.assertIsNone(getargs_Z(None)) + with 
warnings.catch_warnings(): + warnings.simplefilter("ignore", DeprecationWarning) + + self.assertEqual(getargs_Z('abc\xe9'), 'abc\xe9') + self.assertRaises(TypeError, getargs_Z, 'nul:\0') + self.assertRaises(TypeError, getargs_Z, b'bytes') + self.assertRaises(TypeError, getargs_Z, bytearray(b'bytearray')) + self.assertRaises(TypeError, getargs_Z, memoryview(b'memoryview')) + self.assertIsNone(getargs_Z(None)) def test_Z_hash(self): from _testcapi import getargs_Z_hash - self.assertEqual(getargs_Z_hash('abc\xe9'), 'abc\xe9') - self.assertEqual(getargs_Z_hash('nul:\0'), 'nul:\0') - self.assertRaises(TypeError, getargs_Z_hash, b'bytes') - self.assertRaises(TypeError, getargs_Z_hash, bytearray(b'bytearray')) - self.assertRaises(TypeError, getargs_Z_hash, memoryview(b'memoryview')) - self.assertIsNone(getargs_Z_hash(None)) + with warnings.catch_warnings(): + warnings.simplefilter("ignore", DeprecationWarning) + + self.assertEqual(getargs_Z_hash('abc\xe9'), 'abc\xe9') + self.assertEqual(getargs_Z_hash('nul:\0'), 'nul:\0') + self.assertRaises(TypeError, getargs_Z_hash, b'bytes') + self.assertRaises(TypeError, getargs_Z_hash, bytearray(b'bytearray')) + self.assertRaises(TypeError, getargs_Z_hash, memoryview(b'memoryview')) + self.assertIsNone(getargs_Z_hash(None)) def test_main(): diff --git a/Lib/test/test_unicode.py b/Lib/test/test_unicode.py --- a/Lib/test/test_unicode.py +++ b/Lib/test/test_unicode.py @@ -1808,29 +1808,36 @@ class UnicodeTest(string_tests.CommonTes def test_encode_decimal(self): from _testcapi import unicode_encodedecimal - self.assertEqual(unicode_encodedecimal('123'), - b'123') - self.assertEqual(unicode_encodedecimal('\u0663.\u0661\u0664'), - b'3.14') - self.assertEqual(unicode_encodedecimal("\N{EM SPACE}3.14\N{EN SPACE}"), - b' 3.14 ') - self.assertRaises(UnicodeEncodeError, - unicode_encodedecimal, "123\u20ac", "strict") - self.assertRaisesRegex( - ValueError, - "^'decimal' codec can't encode character", - unicode_encodedecimal, "123\u20ac", 
"replace") + with warnings.catch_warnings(): + warnings.simplefilter("ignore", DeprecationWarning) + + self.assertEqual(unicode_encodedecimal('123'), + b'123') + self.assertEqual(unicode_encodedecimal('\u0663.\u0661\u0664'), + b'3.14') + self.assertEqual(unicode_encodedecimal("\N{EM SPACE}3.14\N{EN SPACE}"), + b' 3.14 ') + self.assertRaises(UnicodeEncodeError, + unicode_encodedecimal, "123\u20ac", "strict") + self.assertRaisesRegex( + ValueError, + "^'decimal' codec can't encode character", + unicode_encodedecimal, "123\u20ac", "replace") def test_transform_decimal(self): from _testcapi import unicode_transformdecimaltoascii as transform_decimal - self.assertEqual(transform_decimal('123'), - '123') - self.assertEqual(transform_decimal('\u0663.\u0661\u0664'), - '3.14') - self.assertEqual(transform_decimal("\N{EM SPACE}3.14\N{EN SPACE}"), - "\N{EM SPACE}3.14\N{EN SPACE}") - self.assertEqual(transform_decimal('123\u20ac'), - '123\u20ac') + with warnings.catch_warnings(): + warnings.simplefilter("ignore", DeprecationWarning) + + + self.assertEqual(transform_decimal('123'), + '123') + self.assertEqual(transform_decimal('\u0663.\u0661\u0664'), + '3.14') + self.assertEqual(transform_decimal("\N{EM SPACE}3.14\N{EN SPACE}"), + "\N{EM SPACE}3.14\N{EN SPACE}") + self.assertEqual(transform_decimal('123\u20ac'), + '123\u20ac') def test_getnewargs(self): text = 'abc' diff --git a/Modules/_codecsmodule.c b/Modules/_codecsmodule.c --- a/Modules/_codecsmodule.c +++ b/Modules/_codecsmodule.c @@ -687,18 +687,15 @@ unicode_internal_encode(PyObject *self, return NULL; if (PyUnicode_Check(obj)) { - Py_UNICODE *u; + wchar_t *wstr; - if (PyUnicode_READY(obj) < 0) + wstr = PyUnicode_AsWideCharAndSize(obj, &len); + if (wstr == NULL) return NULL; - - u = PyUnicode_AsUnicodeAndSize(obj, &len); - if (u == NULL) - return NULL; - if (len> PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) + if (len> PY_SSIZE_T_MAX / sizeof(wchar_t)) return PyErr_NoMemory(); - size = len * sizeof(Py_UNICODE); - return 
codec_tuple(PyBytes_FromStringAndSize((const char*)u, size), + size = len * sizeof(wchar_t); + return codec_tuple(PyBytes_FromStringAndSize((const char*)wstr, size), PyUnicode_GET_LENGTH(obj)); } else { diff --git a/Modules/_ctypes/_ctypes.c b/Modules/_ctypes/_ctypes.c --- a/Modules/_ctypes/_ctypes.c +++ b/Modules/_ctypes/_ctypes.c @@ -1142,7 +1142,7 @@ static int WCharArray_set_value(CDataObject *self, PyObject *value) { Py_ssize_t result = 0; - Py_UNICODE *wstr; + wchar_t *wstr; Py_ssize_t len; if (value == NULL) { @@ -1158,7 +1158,7 @@ WCharArray_set_value(CDataObject *self, } else Py_INCREF(value); - wstr = PyUnicode_AsUnicodeAndSize(value, &len); + wstr = PyUnicode_AsWideCharAndSize(value, &len); if (wstr == NULL) return -1; if ((unsigned)len> self->b_size/sizeof(wchar_t)) { @@ -3130,10 +3130,10 @@ _validate_paramflags(PyTypeObject *type, for (i = 0; i < len; ++i) { PyObject *item = PyTuple_GET_ITEM(paramflags, i); int flag; - char *name; + PyObject *nameobj; PyObject *defval; PyObject *typ; - if (!PyArg_ParseTuple(item, "i|ZO", &flag, &name, &defval)) { + if (!PyArg_ParseTuple(item, "i|UO", &flag, &nameobj, &defval)) { PyErr_SetString(PyExc_TypeError, "paramflags must be a sequence of (int [,string [,value]]) tuples"); return 0; diff --git a/Modules/_ctypes/callproc.c b/Modules/_ctypes/callproc.c --- a/Modules/_ctypes/callproc.c +++ b/Modules/_ctypes/callproc.c @@ -1234,7 +1234,7 @@ static PyObject *load_library(PyObject * if (!PyArg_ParseTuple(args, "O|O:LoadLibrary", &nameobj, &ignored)) return NULL; - name = PyUnicode_AsUnicode(nameobj); + name = PyUnicode_AsWideCharAndSize(nameobj, NULL); if (!name) return NULL; diff --git a/Modules/_ctypes/cfield.c b/Modules/_ctypes/cfield.c --- a/Modules/_ctypes/cfield.c +++ b/Modules/_ctypes/cfield.c @@ -1259,7 +1259,7 @@ U_get(void *ptr, Py_ssize_t size) static PyObject * U_set(void *ptr, PyObject *value, Py_ssize_t length) { - Py_UNICODE *wstr; + wchar_t *wstr; Py_ssize_t size; /* It's easier to calculate in 
characters than in bytes */ @@ -1273,7 +1273,7 @@ U_set(void *ptr, PyObject *value, Py_ssi } else Py_INCREF(value); - wstr = PyUnicode_AsUnicodeAndSize(value, &size); + wstr = PyUnicode_AsWideCharAndSize(value, &size); if (wstr == NULL) return NULL; if (size> length) { @@ -1477,7 +1477,7 @@ BSTR_set(void *ptr, PyObject *value, Py_ if (value) { wchar_t* wvalue; Py_ssize_t size; - wvalue = PyUnicode_AsUnicodeAndSize(value, &size); + wvalue = PyUnicode_AsWideCharAndSize(value, &size); if (wvalue == NULL) return NULL; if ((unsigned) size != size) { diff --git a/Modules/_testcapimodule.c b/Modules/_testcapimodule.c --- a/Modules/_testcapimodule.c +++ b/Modules/_testcapimodule.c @@ -1502,6 +1502,7 @@ unicode_aswidecharstring(PyObject *self, static PyObject * unicode_encodedecimal(PyObject *self, PyObject *args) { + PyObject *obj; Py_UNICODE *unicode; Py_ssize_t length; char *errors = NULL; @@ -1509,7 +1510,11 @@ unicode_encodedecimal(PyObject *self, Py Py_ssize_t decimal_length, new_length; int res; - if (!PyArg_ParseTuple(args, "u#|s", &unicode, &length, &errors)) + if (!PyArg_ParseTuple(args, "U|s", &obj, &errors)) + return NULL; + + unicode = PyUnicode_AsWideCharAndSize(obj, &length); + if (unicode == NULL) return NULL; decimal_length = length * 7; /* len('€') */ diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -1639,14 +1639,14 @@ get_latin1_char(unsigned char ch) return unicode; } -PyObject * -PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size) +static PyObject * +unicode_fromwidechar(const Py_UNICODE *wstr, Py_ssize_t size) { PyObject *unicode; Py_UCS4 maxchar = 0; Py_ssize_t num_surrogates; - if (u == NULL) + if (wstr == NULL) return (PyObject*)_PyUnicode_New(size); /* If the Unicode data is known at construction time, we can apply @@ -1660,12 +1660,12 @@ PyUnicode_FromUnicode(const Py_UNICODE * /* Single character Unicode objects in the Latin-1 range are shared when using this 
constructor */ - if (size == 1 && *u < 256) - return get_latin1_char((unsigned char)*u); + if (size == 1 && wstr[0] < 256) + return get_latin1_char((unsigned char)wstr[0]); /* If not empty and not single character, copy the Unicode data into the new object */ - if (find_maxchar_surrogates(u, u + size, + if (find_maxchar_surrogates(wstr, wstr + size, &maxchar, &num_surrogates) == -1) return NULL; @@ -1674,27 +1674,30 @@ PyUnicode_FromUnicode(const Py_UNICODE * if (!unicode) return NULL; - switch (PyUnicode_KIND(unicode)) { + switch (PyUnicode_KIND(unicode)) + { case PyUnicode_1BYTE_KIND: _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char, - u, u + size, PyUnicode_1BYTE_DATA(unicode)); + wstr, + wstr + size, PyUnicode_1BYTE_DATA(unicode)); break; case PyUnicode_2BYTE_KIND: #if Py_UNICODE_SIZE == 2 - Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2); + Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), wstr, size * 2); #else _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2, - u, u + size, PyUnicode_2BYTE_DATA(unicode)); + wstr, + wstr + size, PyUnicode_2BYTE_DATA(unicode)); #endif break; case PyUnicode_4BYTE_KIND: #if SIZEOF_WCHAR_T == 2 /* This is the only case which has to process surrogates, thus a simple copy loop is not enough and we need a function. 
*/ - unicode_convert_wchar_to_ucs4(u, u + size, unicode); + unicode_convert_wchar_to_ucs4(wstr, wstr + size, unicode); #else assert(num_surrogates == 0); - Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4); + Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), wstr, size * 4); #endif break; default: @@ -1705,6 +1708,17 @@ PyUnicode_FromUnicode(const Py_UNICODE * } PyObject * +PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size) +{ + if (PyErr_WarnEx(PyExc_DeprecationWarning, + "PyUnicode_FromUnicode() has been deprecated", + 1)) + return NULL; + + return unicode_fromwidechar(u, size); +} + +PyObject * PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size) { if (size < 0) { @@ -2113,7 +2127,7 @@ PyUnicode_AsUCS4Copy(PyObject *string) #ifdef HAVE_WCHAR_H PyObject * -PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size) +PyUnicode_FromWideChar(const wchar_t *w, Py_ssize_t size) { if (w == NULL) { if (size == 0) @@ -2126,7 +2140,7 @@ PyUnicode_FromWideChar(register const wc size = wcslen(w); } - return PyUnicode_FromUnicode(w, size); + return unicode_fromwidechar(w, size); } #endif /* HAVE_WCHAR_H */ @@ -2729,7 +2743,7 @@ unicode_aswidechar(PyObject *unicode, Py_ssize_t res; const wchar_t *wstr; - wstr = PyUnicode_AsUnicodeAndSize(unicode, &res); + wstr = PyUnicode_AsWideCharAndSize(unicode, &res); if (wstr == NULL) return -1; @@ -3421,8 +3435,8 @@ static int unicode_as_unicode_calls = 0; #endif -Py_UNICODE * -PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size) +wchar_t* +PyUnicode_AsWideCharAndSize(PyObject *unicode, Py_ssize_t *size) { const unsigned char *one_byte; #if SIZEOF_WCHAR_T == 4 @@ -3439,105 +3453,150 @@ PyUnicode_AsUnicodeAndSize(PyObject *uni PyErr_BadArgument(); return NULL; } - if (_PyUnicode_WSTR(unicode) == NULL) { - /* Non-ASCII compact unicode object */ - assert(_PyUnicode_KIND(unicode) != 0); - assert(PyUnicode_IS_READY(unicode)); + + if (_PyUnicode_WSTR(unicode) != NULL) { + if (size != NULL) + *size = 
PyUnicode_WSTR_LENGTH(unicode); + return _PyUnicode_WSTR(unicode); + } + + /* Non-ASCII compact unicode object */ + assert(_PyUnicode_KIND(unicode) != 0); + assert(PyUnicode_IS_READY(unicode)); #ifdef Py_DEBUG - ++unicode_as_unicode_calls; -#endif - - if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) { + ++unicode_as_unicode_calls; +#endif + + if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) { #if SIZEOF_WCHAR_T == 2 - four_bytes = PyUnicode_4BYTE_DATA(unicode); - ucs4_end = four_bytes + _PyUnicode_LENGTH(unicode); - num_surrogates = 0; - - for (; four_bytes < ucs4_end; ++four_bytes) { - if (*four_bytes> 0xFFFF) - ++num_surrogates; - } - - _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC( - sizeof(wchar_t) * (_PyUnicode_LENGTH(unicode) + 1 + num_surrogates)); - if (!_PyUnicode_WSTR(unicode)) { - PyErr_NoMemory(); - return NULL; - } - _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode) + num_surrogates; - - w = _PyUnicode_WSTR(unicode); - wchar_end = w + _PyUnicode_WSTR_LENGTH(unicode); - four_bytes = PyUnicode_4BYTE_DATA(unicode); - for (; four_bytes < ucs4_end; ++four_bytes, ++w) { - if (*four_bytes> 0xFFFF) { - assert(*four_bytes <= 0x10FFFF); - /* encode surrogate pair in this case */ - *w++ = Py_UNICODE_HIGH_SURROGATE(*four_bytes); - *w = Py_UNICODE_LOW_SURROGATE(*four_bytes); - } - else - *w = *four_bytes; - - if (w> wchar_end) { - assert(0 && "Miscalculated string end"); - } - } + four_bytes = PyUnicode_4BYTE_DATA(unicode); + ucs4_end = four_bytes + _PyUnicode_LENGTH(unicode); + num_surrogates = 0; + + for (; four_bytes < ucs4_end; ++four_bytes) { + if (*four_bytes> 0xFFFF) + ++num_surrogates; + } + + _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC( + sizeof(wchar_t) * (_PyUnicode_LENGTH(unicode) + 1 + num_surrogates)); + if (!_PyUnicode_WSTR(unicode)) { + PyErr_NoMemory(); + return NULL; + } + _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode) + num_surrogates; + + w = _PyUnicode_WSTR(unicode); + wchar_end = w + 
_PyUnicode_WSTR_LENGTH(unicode); + four_bytes = PyUnicode_4BYTE_DATA(unicode); + for (; four_bytes < ucs4_end; ++four_bytes, ++w) { + if (*four_bytes> 0xFFFF) { + assert(*four_bytes <= 0x10FFFF); + /* encode surrogate pair in this case */ + *w++ = Py_UNICODE_HIGH_SURROGATE(*four_bytes); + *w = Py_UNICODE_LOW_SURROGATE(*four_bytes); + } + else + *w = *four_bytes; + + if (w> wchar_end) { + assert(0 && "Miscalculated string end"); + } + } + *w = 0; +#else + /* sizeof(wchar_t) == 4 */ + Py_FatalError("Impossible unicode object state, wstr and str " + "should share memory already."); + return NULL; +#endif + } + else { + _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) * + (_PyUnicode_LENGTH(unicode) + 1)); + if (!_PyUnicode_WSTR(unicode)) { + PyErr_NoMemory(); + return NULL; + } + if (!PyUnicode_IS_COMPACT_ASCII(unicode)) + _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode); + w = _PyUnicode_WSTR(unicode); + wchar_end = w + _PyUnicode_LENGTH(unicode); + + if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) { + one_byte = PyUnicode_1BYTE_DATA(unicode); + for (; w < wchar_end; ++one_byte, ++w) + *w = *one_byte; + /* null-terminate the wstr */ + *w = 0; + } + else if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) { +#if SIZEOF_WCHAR_T == 4 + two_bytes = PyUnicode_2BYTE_DATA(unicode); + for (; w < wchar_end; ++two_bytes, ++w) + *w = *two_bytes; + /* null-terminate the wstr */ *w = 0; #else - /* sizeof(wchar_t) == 4 */ - Py_FatalError("Impossible unicode object state, wstr and str " - "should share memory already."); + /* sizeof(wchar_t) == 2 */ + PyObject_FREE(_PyUnicode_WSTR(unicode)); + _PyUnicode_WSTR(unicode) = NULL; + Py_FatalError("Impossible unicode object state, wstr " + "and str should share memory already."); return NULL; #endif } else { - _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) * - (_PyUnicode_LENGTH(unicode) + 1)); - if (!_PyUnicode_WSTR(unicode)) { - PyErr_NoMemory(); - return NULL; - } - if 
(!PyUnicode_IS_COMPACT_ASCII(unicode)) - _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode); - w = _PyUnicode_WSTR(unicode); - wchar_end = w + _PyUnicode_LENGTH(unicode); - - if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) { - one_byte = PyUnicode_1BYTE_DATA(unicode); - for (; w < wchar_end; ++one_byte, ++w) - *w = *one_byte; - /* null-terminate the wstr */ - *w = 0; - } - else if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) { -#if SIZEOF_WCHAR_T == 4 - two_bytes = PyUnicode_2BYTE_DATA(unicode); - for (; w < wchar_end; ++two_bytes, ++w) - *w = *two_bytes; - /* null-terminate the wstr */ - *w = 0; -#else - /* sizeof(wchar_t) == 2 */ - PyObject_FREE(_PyUnicode_WSTR(unicode)); - _PyUnicode_WSTR(unicode) = NULL; - Py_FatalError("Impossible unicode object state, wstr " - "and str should share memory already."); - return NULL; -#endif - } - else { - assert(0 && "This should never happen."); - } - } - } + assert(0 && "This should never happen."); + } + } + if (size != NULL) *size = PyUnicode_WSTR_LENGTH(unicode); return _PyUnicode_WSTR(unicode); } Py_UNICODE * +PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size) +{ + if (PyErr_WarnEx(PyExc_DeprecationWarning, + "PyUnicode_AsUnicodeAndSize() has been deprecated", + 1)) + return NULL; + return PyUnicode_AsWideCharAndSize(unicode, size); +} + +Py_UNICODE * +PyUnicode_AS_UNICODE(PyObject *unicode) +{ + return PyUnicode_AsUnicodeAndSize(unicode, NULL); +} + +Py_ssize_t +PyUnicode_GET_SIZE(PyObject *unicode) +{ + Py_UNICODE *u; + assert(PyUnicode_Check(unicode)); + + if (PyErr_WarnEx(PyExc_DeprecationWarning, + "PyUnicode_GET_SIZE() has been deprecated", + 1)) + return -1; + + if (_PyUnicode_WSTR(unicode)) + return PyUnicode_WSTR_LENGTH(unicode); + u = PyUnicode_AsUnicode(unicode); +#ifdef Py_DEBUG + assert(u != NULL); +#else + if (u == NULL) + return -1; +#endif + return PyUnicode_WSTR_LENGTH(unicode); +} + +Py_UNICODE * PyUnicode_AsUnicode(PyObject *unicode) { return 
PyUnicode_AsUnicodeAndSize(unicode, NULL);