[Python-checkins] gh-80480: array: Add 'w' typecode. (#105242)

Sun Jun 4 12:45:07 EDT 2023

https://github.com/python/cpython/commit/1237fb6a4b177ce8f750949b9006c58f9f22942e
commit: 1237fb6a4b177ce8f750949b9006c58f9f22942e
branch: main
author: Inada Naoki <songofacandy at gmail.com>
committer: methane <songofacandy at gmail.com>
date: 2023-06-04T16:45:00Z
summary:
gh-80480: array: Add 'w' typecode. (#105242)
files:
A Misc/NEWS.d/next/Library/2023-06-02-23-32-17.gh-issue-80480.savBw9.rst
M Doc/faq/programming.rst
M Doc/library/array.rst
M Doc/whatsnew/3.13.rst
M Lib/test/test_array.py
M Lib/test/test_csv.py
M Modules/arraymodule.c

diff --git a/Doc/faq/programming.rst b/Doc/faq/programming.rst
index ab5618db84f77..6e1812504a184 100644
--- a/Doc/faq/programming.rst
+++ b/Doc/faq/programming.rst
@@ -924,12 +924,12 @@ module::
 'Hello, there!'
 
 >>> import array
- >>> a = array.array('u', s)
+ >>> a = array.array('w', s)
 >>> print(a)
- array('u', 'Hello, world')
+ array('w', 'Hello, world')
 >>> a[0] = 'y'
 >>> print(a)
- array('u', 'yello, world')
+ array('w', 'yello, world')
 >>> a.tounicode()
 'yello, world'
 
diff --git a/Doc/library/array.rst b/Doc/library/array.rst
index 1f8fec6ea5539..1f5810b35d2d8 100644
--- a/Doc/library/array.rst
+++ b/Doc/library/array.rst
@@ -24,6 +24,8 @@ defined:
 +-----------+--------------------+-------------------+-----------------------+-------+
 | ``'u'`` | wchar_t | Unicode character | 2 | \(1) |
 +-----------+--------------------+-------------------+-----------------------+-------+
+| ``'w'`` | Py_UCS4 | Unicode character | 4 | |
++-----------+--------------------+-------------------+-----------------------+-------+
 | ``'h'`` | signed short | int | 2 | |
 +-----------+--------------------+-------------------+-----------------------+-------+
 | ``'H'`` | unsigned short | int | 2 | |
@@ -56,6 +58,7 @@ Notes:
 ``Py_UNICODE`` is alias of ``wchar_t`` since Python 3.3.
 
 .. deprecated-removed:: 3.3 4.0
+ Please migrate to ``'w'`` typecode.
 
 
 The actual representation of values is determined by the machine architecture
@@ -174,9 +177,9 @@ The module defines the following type:
 
 .. method:: fromunicode(s)
 
- Extends this array with data from the given unicode string. The array must
- be a type ``'u'`` array; otherwise a :exc:`ValueError` is raised. Use
- ``array.frombytes(unicodestring.encode(enc))`` to append Unicode data to an
+ Extends this array with data from the given unicode string.
+ The array must have type code ``'u'`` or ``'w'``; otherwise a :exc:`ValueError` is raised.
+ Use ``array.frombytes(unicodestring.encode(enc))`` to append Unicode data to an
 array of some other type.
 
 
@@ -236,21 +239,22 @@ The module defines the following type:
 
 .. method:: tounicode()
 
- Convert the array to a unicode string. The array must be a type ``'u'`` array;
+ Convert the array to a unicode string. The array must have a type ``'u'`` or ``'w'``;
 otherwise a :exc:`ValueError` is raised. Use ``array.tobytes().decode(enc)`` to
 obtain a unicode string from an array of some other type.
 
 
 When an array object is printed or converted to a string, it is represented as
 ``array(typecode, initializer)``. The *initializer* is omitted if the array is
-empty, otherwise it is a string if the *typecode* is ``'u'``, otherwise it is a
-list of numbers. The string is guaranteed to be able to be converted back to an
+empty, otherwise it is a string if the *typecode* is ``'u'`` or ``'w'``,
+otherwise it is a list of numbers.
+The string is guaranteed to be able to be converted back to an
 array with the same type and value using :func:`eval`, so long as the
 :class:`~array.array` class has been imported using ``from array import array``.
 Examples::
 
 array('l')
- array('u', 'hello \u2641')
+ array('w', 'hello \u2641')
 array('l', [1, 2, 3, 4, 5])
 array('d', [1.0, 2.0, 3.14])
 
diff --git a/Doc/whatsnew/3.13.rst b/Doc/whatsnew/3.13.rst
index 04ac34e09264b..ff7772ef1ffa3 100644
--- a/Doc/whatsnew/3.13.rst
+++ b/Doc/whatsnew/3.13.rst
@@ -87,6 +87,13 @@ New Modules
 Improved Modules
 ================
 
+array
+-----
+
+* Add ``'w'`` type code that can be used for Unicode strings.
+ It can be used instead of ``'u'`` type code, which is deprecated.
+ (Contributed by Inada Naoki in :gh:`80480`.)
+
 io
 --
 
diff --git a/Lib/test/test_array.py b/Lib/test/test_array.py
index 5b2c107a6044b..a94d04f6515e2 100755
--- a/Lib/test/test_array.py
+++ b/Lib/test/test_array.py
@@ -27,7 +27,7 @@ class ArraySubclassWithKwargs(array.array):
 def __init__(self, typecode, newarg=None):
 array.array.__init__(self)
 
-typecodes = 'ubBhHiIlLfdqQ'
+typecodes = 'uwbBhHiIlLfdqQ'
 
 class MiscTest(unittest.TestCase):
 
@@ -186,11 +186,12 @@ def test_unicode(self):
 )
 for testcase in testcases:
 mformat_code, encoding = testcase
- a = array.array('u', teststr)
- b = array_reconstructor(
- array.array, 'u', mformat_code, teststr.encode(encoding))
- self.assertEqual(a, b,
- msg="{0!r} != {1!r}; testcase={2!r}".format(a, b, testcase))
+ for c in 'uw':
+ a = array.array(c, teststr)
+ b = array_reconstructor(
+ array.array, c, mformat_code, teststr.encode(encoding))
+ self.assertEqual(a, b,
+ msg="{0!r} != {1!r}; testcase={2!r}".format(a, b, testcase))
 
 
 class BaseTest:
@@ -234,7 +235,7 @@ def test_buffer_info(self):
 self.assertEqual(bi[1], len(a))
 
 def test_byteswap(self):
- if self.typecode == 'u':
+ if self.typecode in ('u', 'w'):
 example = '\U00100100'
 else:
 example = self.example
@@ -1079,7 +1080,7 @@ def test_buffer(self):
 self.assertEqual(m.tobytes(), expected)
 self.assertRaises(BufferError, a.frombytes, a.tobytes())
 self.assertEqual(m.tobytes(), expected)
- if self.typecode == 'u':
+ if self.typecode in ('u', 'w'):
 self.assertRaises(BufferError, a.fromunicode, a.tounicode())
 self.assertEqual(m.tobytes(), expected)
 self.assertRaises(BufferError, operator.imul, a, 2)
@@ -1135,16 +1136,17 @@ def test_sizeof_without_buffer(self):
 support.check_sizeof(self, a, basesize)
 
 def test_initialize_with_unicode(self):
- if self.typecode != 'u':
+ if self.typecode not in ('u', 'w'):
 with self.assertRaises(TypeError) as cm:
 a = array.array(self.typecode, 'foo')
 self.assertIn("cannot use a str", str(cm.exception))
 with self.assertRaises(TypeError) as cm:
- a = array.array(self.typecode, array.array('u', 'foo'))
+ a = array.array(self.typecode, array.array('w', 'foo'))
 self.assertIn("cannot use a unicode array", str(cm.exception))
 else:
 a = array.array(self.typecode, "foo")
 a = array.array(self.typecode, array.array('u', 'foo'))
+ a = array.array(self.typecode, array.array('w', 'foo'))
 
 @support.cpython_only
 def test_obsolete_write_lock(self):
@@ -1171,40 +1173,45 @@ class UnicodeTest(StringTest, unittest.TestCase):
 smallerexample = '\x01\u263a\x00\ufefe'
 biggerexample = '\x01\u263a\x01\ufeff'
 outside = str('\x33')
- minitemsize = 2
+ minitemsize = sizeof_wchar
 
 def test_unicode(self):
 self.assertRaises(TypeError, array.array, 'b', 'foo')
 
- a = array.array('u', '\xa0\xc2\u1234')
+ a = array.array(self.typecode, '\xa0\xc2\u1234')
 a.fromunicode(' ')
 a.fromunicode('')
 a.fromunicode('')
 a.fromunicode('\x11abc\xff\u1234')
 s = a.tounicode()
 self.assertEqual(s, '\xa0\xc2\u1234 \x11abc\xff\u1234')
- self.assertEqual(a.itemsize, sizeof_wchar)
+ self.assertEqual(a.itemsize, self.minitemsize)
 
 s = '\x00="\'a\\b\x80\xff\u0000\u0001\u1234'
- a = array.array('u', s)
+ a = array.array(self.typecode, s)
 self.assertEqual(
 repr(a),
- "array('u', '\\x00=\"\\'a\\\\b\\x80\xff\\x00\\x01\u1234')")
+ f"array('{self.typecode}', '\\x00=\"\\'a\\\\b\\x80\xff\\x00\\x01\u1234')")
 
 self.assertRaises(TypeError, a.fromunicode)
 
 def test_issue17223(self):
- # this used to crash
- if sizeof_wchar == 4:
- # U+FFFFFFFF is an invalid code point in Unicode 6.0
- invalid_str = b'\xff\xff\xff\xff'
- else:
+ if self.typecode == 'u' and sizeof_wchar == 2:
 # PyUnicode_FromUnicode() cannot fail with 16-bit wchar_t
 self.skipTest("specific to 32-bit wchar_t")
- a = array.array('u', invalid_str)
+
+ # this used to crash
+ # U+FFFFFFFF is an invalid code point in Unicode 6.0
+ invalid_str = b'\xff\xff\xff\xff'
+
+ a = array.array(self.typecode, invalid_str)
 self.assertRaises(ValueError, a.tounicode)
 self.assertRaises(ValueError, str, a)
 
+class UCS4Test(UnicodeTest):
+ typecode = 'w'
+ minitemsize = 4
+
 class NumberTest(BaseTest):
 
 def test_extslice(self):
diff --git a/Lib/test/test_csv.py b/Lib/test/test_csv.py
index de7ac97d72cb8..6a4180e6d1b0a 100644
--- a/Lib/test/test_csv.py
+++ b/Lib/test/test_csv.py
@@ -955,7 +955,7 @@ def test_float_write(self):
 
 def test_char_write(self):
 import array, string
- a = array.array('u', string.ascii_letters)
+ a = array.array('w', string.ascii_letters)
 
 with TemporaryFile("w+", encoding="utf-8", newline='') as fileobj:
 writer = csv.writer(fileobj, dialect="excel")
diff --git a/Misc/NEWS.d/next/Library/2023-06-02-23-32-17.gh-issue-80480.savBw9.rst b/Misc/NEWS.d/next/Library/2023-06-02-23-32-17.gh-issue-80480.savBw9.rst
new file mode 100644
index 0000000000000..fd87efe9bde0c
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2023-06-02-23-32-17.gh-issue-80480.savBw9.rst
@@ -0,0 +1 @@
+:mod:`array`: Add ``'w'`` typecode that represents ``Py_UCS4``.
diff --git a/Modules/arraymodule.c b/Modules/arraymodule.c
index 999b848f9adbd..16e3739eb26fc 100644
--- a/Modules/arraymodule.c
+++ b/Modules/arraymodule.c
@@ -13,6 +13,7 @@
 #include "pycore_bytesobject.h" // _PyBytes_Repeat
 #include "structmember.h" // PyMemberDef
 #include <stddef.h> // offsetof()
+#include <stdbool.h>
 
 /*[clinic input]
 module array
@@ -279,6 +280,31 @@ u_setitem(arrayobject *ap, Py_ssize_t i, PyObject *v)
 return 0;
 }
 
+static PyObject *
+w_getitem(arrayobject *ap, Py_ssize_t i)
+{
+ return PyUnicode_FromOrdinal(((Py_UCS4 *) ap->ob_item)[i]);
+}
+
+static int
+w_setitem(arrayobject *ap, Py_ssize_t i, PyObject *v)
+{
+ PyObject *u;
+ if (!PyArg_Parse(v, "U;array item must be unicode character", &u)) {
+ return -1;
+ }
+
+ if (PyUnicode_GetLength(u) != 1) {
+ PyErr_SetString(PyExc_TypeError,
+ "array item must be unicode character");
+ return -1;
+ }
+
+ if (i >= 0) {
+ ((Py_UCS4 *)ap->ob_item)[i] = PyUnicode_READ_CHAR(u, 0);
+ }
+ return 0;
+}
 
 static PyObject *
 h_getitem(arrayobject *ap, Py_ssize_t i)
@@ -543,6 +569,7 @@ d_setitem(arrayobject *ap, Py_ssize_t i, PyObject *v)
 DEFINE_COMPAREITEMS(b, signed char)
 DEFINE_COMPAREITEMS(BB, unsigned char)
 DEFINE_COMPAREITEMS(u, wchar_t)
+DEFINE_COMPAREITEMS(w, Py_UCS4)
 DEFINE_COMPAREITEMS(h, short)
 DEFINE_COMPAREITEMS(HH, unsigned short)
 DEFINE_COMPAREITEMS(i, int)
@@ -561,6 +588,7 @@ static const struct arraydescr descriptors[] = {
 {'b', 1, b_getitem, b_setitem, b_compareitems, "b", 1, 1},
 {'B', 1, BB_getitem, BB_setitem, BB_compareitems, "B", 1, 0},
 {'u', sizeof(wchar_t), u_getitem, u_setitem, u_compareitems, "u", 0, 0},
+ {'w', sizeof(Py_UCS4), w_getitem, w_setitem, w_compareitems, "w", 0, 0,},
 {'h', sizeof(short), h_getitem, h_setitem, h_compareitems, "h", 1, 1},
 {'H', sizeof(short), HH_getitem, HH_setitem, HH_compareitems, "H", 1, 0},
 {'i', sizeof(int), i_getitem, i_setitem, i_compareitems, "i", 1, 1},
@@ -1716,25 +1744,46 @@ static PyObject *
 array_array_fromunicode_impl(arrayobject *self, PyObject *ustr)
 /*[clinic end generated code: output=24359f5e001a7f2b input=025db1fdade7a4ce]*/
 {
- if (self->ob_descr->typecode != 'u') {
+ int typecode = self->ob_descr->typecode;
+ if (typecode != 'u' && typecode != 'w') {
 PyErr_SetString(PyExc_ValueError,
 "fromunicode() may only be called on "
- "unicode type arrays");
+ "unicode type arrays ('u' or 'w')");
 return NULL;
 }
 
- Py_ssize_t ustr_length = PyUnicode_AsWideChar(ustr, NULL, 0);
- assert(ustr_length > 0);
- if (ustr_length > 1) {
- ustr_length--; /* trim trailing NUL character */
+ if (typecode == 'u') {
+ Py_ssize_t ustr_length = PyUnicode_AsWideChar(ustr, NULL, 0);
+ assert(ustr_length > 0);
+ if (ustr_length > 1) {
+ ustr_length--; /* trim trailing NUL character */
+ Py_ssize_t old_size = Py_SIZE(self);
+ if (array_resize(self, old_size + ustr_length) == -1) {
+ return NULL;
+ }
+
+ // must not fail
+ PyUnicode_AsWideChar(
+ ustr, ((wchar_t *)self->ob_item) + old_size, ustr_length);
+ }
+ }
+ else { // typecode == 'w'
+ Py_ssize_t ustr_length = PyUnicode_GetLength(ustr);
 Py_ssize_t old_size = Py_SIZE(self);
- if (array_resize(self, old_size + ustr_length) == -1) {
+ Py_ssize_t new_size = old_size + ustr_length;
+
+ if (new_size < 0 || (size_t)new_size > PY_SSIZE_T_MAX / sizeof(Py_UCS4)) {
+ return PyErr_NoMemory();
+ }
+ if (array_resize(self, new_size) == -1) {
 return NULL;
 }
 
 // must not fail
- PyUnicode_AsWideChar(
- ustr, ((wchar_t *)self->ob_item) + old_size, ustr_length);
+ Py_UCS4 *u = PyUnicode_AsUCS4(ustr, ((Py_UCS4*)self->ob_item) + old_size,
+ ustr_length, 0);
+ assert(u != NULL);
+ (void)u; // Suppress unused_variable warning.
 }
 
 Py_RETURN_NONE;
@@ -1754,12 +1803,20 @@ static PyObject *
 array_array_tounicode_impl(arrayobject *self)
 /*[clinic end generated code: output=08e442378336e1ef input=127242eebe70b66d]*/
 {
- if (self->ob_descr->typecode != 'u') {
+ int typecode = self->ob_descr->typecode;
+ if (typecode != 'u' && typecode != 'w') {
 PyErr_SetString(PyExc_ValueError,
- "tounicode() may only be called on unicode type arrays");
+ "tounicode() may only be called on unicode type arrays ('u' or 'w')");
 return NULL;
 }
- return PyUnicode_FromWideChar((wchar_t *) self->ob_item, Py_SIZE(self));
+ if (typecode == 'u') {
+ return PyUnicode_FromWideChar((wchar_t *) self->ob_item, Py_SIZE(self));
+ }
+ else { // typecode == 'w'
+ int byteorder = 0; // native byteorder
+ return PyUnicode_DecodeUTF32((const char *) self->ob_item, Py_SIZE(self) * 4,
+ NULL, &byteorder);
+ }
 }
 
 /*[clinic input]
@@ -1838,6 +1895,9 @@ typecode_to_mformat_code(char typecode)
 }
 return UNKNOWN_FORMAT;
 
+ case 'w':
+ return UTF32_LE + is_big_endian;
+
 case 'f':
 if (sizeof(float) == 4) {
 const float y = 16711938.0;
@@ -2314,7 +2374,7 @@ array_repr(arrayobject *a)
 return PyUnicode_FromFormat("%s('%c')",
 _PyType_Name(Py_TYPE(a)), (int)typecode);
 }
- if (typecode == 'u') {
+ if (typecode == 'u' || typecode == 'w') {
 v = array_array_tounicode_impl(a);
 } else {
 v = array_array_tolist_impl(a);
@@ -2619,17 +2679,21 @@ array_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
 return NULL;
 }
 
- if (initial && c != 'u') {
+ bool is_unicode = c == 'u' || c == 'w';
+
+ if (initial && !is_unicode) {
 if (PyUnicode_Check(initial)) {
 PyErr_Format(PyExc_TypeError, "cannot use a str to initialize "
 "an array with typecode '%c'", c);
 return NULL;
 }
- else if (array_Check(initial, state) &&
- ((arrayobject*)initial)->ob_descr->typecode == 'u') {
- PyErr_Format(PyExc_TypeError, "cannot use a unicode array to "
- "initialize an array with typecode '%c'", c);
- return NULL;
+ else if (array_Check(initial, state)) {
+ int ic = ((arrayobject*)initial)->ob_descr->typecode;
+ if (ic == 'u' || ic == 'w') {
+ PyErr_Format(PyExc_TypeError, "cannot use a unicode array to "
+ "initialize an array with typecode '%c'", c);
+ return NULL;
+ }
 }
 }
 
@@ -2637,7 +2701,7 @@ array_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
 || PyByteArray_Check(initial)
 || PyBytes_Check(initial)
 || PyTuple_Check(initial)
- || ((c=='u') && PyUnicode_Check(initial))
+ || (is_unicode && PyUnicode_Check(initial))
 || (array_Check(initial, state)
 && c == ((arrayobject*)initial)->ob_descr->typecode))) {
 it = PyObject_GetIter(initial);
@@ -2697,14 +2761,31 @@ array_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
 Py_DECREF(v);
 }
 else if (initial != NULL && PyUnicode_Check(initial)) {
- Py_ssize_t n;
- wchar_t *ustr = PyUnicode_AsWideCharString(initial, &n);
- if (ustr == NULL) {
- Py_DECREF(a);
- return NULL;
+ if (c == 'u') {
+ Py_ssize_t n;
+ wchar_t *ustr = PyUnicode_AsWideCharString(initial, &n);
+ if (ustr == NULL) {
+ Py_DECREF(a);
+ return NULL;
+ }
+
+ if (n > 0) {
+ arrayobject *self = (arrayobject *)a;
+ // self->ob_item may be NULL but it is safe.
+ PyMem_Free(self->ob_item);
+ self->ob_item = (char *)ustr;
+ Py_SET_SIZE(self, n);
+ self->allocated = n;
+ }
 }
+ else { // c == 'w'
+ Py_ssize_t n = PyUnicode_GET_LENGTH(initial);
+ Py_UCS4 *ustr = PyUnicode_AsUCS4Copy(initial);
+ if (ustr == NULL) {
+ Py_DECREF(a);
+ return NULL;
+ }
 
- if (n > 0) {
 arrayobject *self = (arrayobject *)a;
 // self->ob_item may be NULL but it is safe.
 PyMem_Free(self->ob_item);