Skip to content

Commit

Permalink
gh-80480: array: Add 'w' typecode. (#105242)
Browse files Browse the repository at this point in the history
  • Loading branch information
methane authored Jun 4, 2023
1 parent 5a5ed7a commit 1237fb6
Show file tree
Hide file tree
Showing 7 changed files with 158 additions and 58 deletions.
6 changes: 3 additions & 3 deletions Doc/faq/programming.rst
Original file line number Diff line number Diff line change
Expand Up @@ -924,12 +924,12 @@ module::
'Hello, there!'

>>> import array
>>> a = array.array('u', s)
>>> a = array.array('w', s)
>>> print(a)
array('u', 'Hello, world')
array('w', 'Hello, world')
>>> a[0] = 'y'
>>> print(a)
array('u', 'yello, world')
array('w', 'yello, world')
>>> a.tounicode()
'yello, world'

Expand Down
18 changes: 11 additions & 7 deletions Doc/library/array.rst
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,8 @@ defined:
+-----------+--------------------+-------------------+-----------------------+-------+
| ``'u'`` | wchar_t | Unicode character | 2 | \(1) |
+-----------+--------------------+-------------------+-----------------------+-------+
| ``'w'`` | Py_UCS4 | Unicode character | 4 | |
+-----------+--------------------+-------------------+-----------------------+-------+
| ``'h'`` | signed short | int | 2 | |
+-----------+--------------------+-------------------+-----------------------+-------+
| ``'H'`` | unsigned short | int | 2 | |
Expand Down Expand Up @@ -56,6 +58,7 @@ Notes:
``Py_UNICODE`` has been an alias of ``wchar_t`` since Python 3.3.

.. deprecated-removed:: 3.3 4.0
Please migrate to the ``'w'`` type code.


The actual representation of values is determined by the machine architecture
Expand Down Expand Up @@ -174,9 +177,9 @@ The module defines the following type:

.. method:: fromunicode(s)

Extends this array with data from the given unicode string. The array must
be a type ``'u'`` array; otherwise a :exc:`ValueError` is raised. Use
``array.frombytes(unicodestring.encode(enc))`` to append Unicode data to an
Extends this array with data from the given unicode string.
The array must have type code ``'u'`` or ``'w'``; otherwise a :exc:`ValueError` is raised.
Use ``array.frombytes(unicodestring.encode(enc))`` to append Unicode data to an
array of some other type.


Expand Down Expand Up @@ -236,21 +239,22 @@ The module defines the following type:

.. method:: tounicode()

Convert the array to a unicode string. The array must be a type ``'u'`` array;
Convert the array to a unicode string. The array must have type code ``'u'`` or ``'w'``;
otherwise a :exc:`ValueError` is raised. Use ``array.tobytes().decode(enc)`` to
obtain a unicode string from an array of some other type.


When an array object is printed or converted to a string, it is represented as
``array(typecode, initializer)``. The *initializer* is omitted if the array is
empty, otherwise it is a string if the *typecode* is ``'u'``, otherwise it is a
list of numbers. The string is guaranteed to be able to be converted back to an
empty, otherwise it is a string if the *typecode* is ``'u'`` or ``'w'``,
otherwise it is a list of numbers.
The string is guaranteed to be able to be converted back to an
array with the same type and value using :func:`eval`, so long as the
:class:`~array.array` class has been imported using ``from array import array``.
Examples::

array('l')
array('u', 'hello \u2641')
array('w', 'hello \u2641')
array('l', [1, 2, 3, 4, 5])
array('d', [1.0, 2.0, 3.14])

Expand Down
7 changes: 7 additions & 0 deletions Doc/whatsnew/3.13.rst
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,13 @@ New Modules
Improved Modules
================

array
-----

* Add the ``'w'`` type code, which can be used for Unicode strings.
It can be used instead of the ``'u'`` type code, which is deprecated.
(Contributed by Inada Naoki in :gh:`80480`.)

io
--

Expand Down
49 changes: 28 additions & 21 deletions Lib/test/test_array.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ class ArraySubclassWithKwargs(array.array):
def __init__(self, typecode, newarg=None):
array.array.__init__(self)

typecodes = 'ubBhHiIlLfdqQ'
typecodes = 'uwbBhHiIlLfdqQ'

class MiscTest(unittest.TestCase):

Expand Down Expand Up @@ -186,11 +186,12 @@ def test_unicode(self):
)
for testcase in testcases:
mformat_code, encoding = testcase
a = array.array('u', teststr)
b = array_reconstructor(
array.array, 'u', mformat_code, teststr.encode(encoding))
self.assertEqual(a, b,
msg="{0!r} != {1!r}; testcase={2!r}".format(a, b, testcase))
for c in 'uw':
a = array.array(c, teststr)
b = array_reconstructor(
array.array, c, mformat_code, teststr.encode(encoding))
self.assertEqual(a, b,
msg="{0!r} != {1!r}; testcase={2!r}".format(a, b, testcase))


class BaseTest:
Expand Down Expand Up @@ -234,7 +235,7 @@ def test_buffer_info(self):
self.assertEqual(bi[1], len(a))

def test_byteswap(self):
if self.typecode == 'u':
if self.typecode in ('u', 'w'):
example = '\U00100100'
else:
example = self.example
Expand Down Expand Up @@ -1079,7 +1080,7 @@ def test_buffer(self):
self.assertEqual(m.tobytes(), expected)
self.assertRaises(BufferError, a.frombytes, a.tobytes())
self.assertEqual(m.tobytes(), expected)
if self.typecode == 'u':
if self.typecode in ('u', 'w'):
self.assertRaises(BufferError, a.fromunicode, a.tounicode())
self.assertEqual(m.tobytes(), expected)
self.assertRaises(BufferError, operator.imul, a, 2)
Expand Down Expand Up @@ -1135,16 +1136,17 @@ def test_sizeof_without_buffer(self):
support.check_sizeof(self, a, basesize)

def test_initialize_with_unicode(self):
if self.typecode != 'u':
if self.typecode not in ('u', 'w'):
with self.assertRaises(TypeError) as cm:
a = array.array(self.typecode, 'foo')
self.assertIn("cannot use a str", str(cm.exception))
with self.assertRaises(TypeError) as cm:
a = array.array(self.typecode, array.array('u', 'foo'))
a = array.array(self.typecode, array.array('w', 'foo'))
self.assertIn("cannot use a unicode array", str(cm.exception))
else:
a = array.array(self.typecode, "foo")
a = array.array(self.typecode, array.array('u', 'foo'))
a = array.array(self.typecode, array.array('w', 'foo'))

@support.cpython_only
def test_obsolete_write_lock(self):
Expand All @@ -1171,40 +1173,45 @@ class UnicodeTest(StringTest, unittest.TestCase):
smallerexample = '\x01\u263a\x00\ufefe'
biggerexample = '\x01\u263a\x01\ufeff'
outside = str('\x33')
minitemsize = 2
minitemsize = sizeof_wchar

def test_unicode(self):
self.assertRaises(TypeError, array.array, 'b', 'foo')

a = array.array('u', '\xa0\xc2\u1234')
a = array.array(self.typecode, '\xa0\xc2\u1234')
a.fromunicode(' ')
a.fromunicode('')
a.fromunicode('')
a.fromunicode('\x11abc\xff\u1234')
s = a.tounicode()
self.assertEqual(s, '\xa0\xc2\u1234 \x11abc\xff\u1234')
self.assertEqual(a.itemsize, sizeof_wchar)
self.assertEqual(a.itemsize, self.minitemsize)

s = '\x00="\'a\\b\x80\xff\u0000\u0001\u1234'
a = array.array('u', s)
a = array.array(self.typecode, s)
self.assertEqual(
repr(a),
"array('u', '\\x00=\"\\'a\\\\b\\x80\xff\\x00\\x01\u1234')")
f"array('{self.typecode}', '\\x00=\"\\'a\\\\b\\x80\xff\\x00\\x01\u1234')")

self.assertRaises(TypeError, a.fromunicode)

def test_issue17223(self):
# this used to crash
if sizeof_wchar == 4:
# U+FFFFFFFF is an invalid code point in Unicode 6.0
invalid_str = b'\xff\xff\xff\xff'
else:
if self.typecode == 'u' and sizeof_wchar == 2:
# PyUnicode_FromUnicode() cannot fail with 16-bit wchar_t
self.skipTest("specific to 32-bit wchar_t")
a = array.array('u', invalid_str)

# this used to crash
# U+FFFFFFFF is an invalid code point in Unicode 6.0
invalid_str = b'\xff\xff\xff\xff'

a = array.array(self.typecode, invalid_str)
self.assertRaises(ValueError, a.tounicode)
self.assertRaises(ValueError, str, a)

class UCS4Test(UnicodeTest):
typecode = 'w'
minitemsize = 4

class NumberTest(BaseTest):

def test_extslice(self):
Expand Down
2 changes: 1 addition & 1 deletion Lib/test/test_csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -955,7 +955,7 @@ def test_float_write(self):

def test_char_write(self):
import array, string
a = array.array('u', string.ascii_letters)
a = array.array('w', string.ascii_letters)

with TemporaryFile("w+", encoding="utf-8", newline='') as fileobj:
writer = csv.writer(fileobj, dialect="excel")
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
:mod:`array`: Add ``'w'`` typecode that represents ``Py_UCS4``.
Loading

0 comments on commit 1237fb6

Please sign in to comment.