[Python-checkins] r84655 - in python/branches/py3k: Lib/test/test_builtin.py Lib/test/test_codeccallbacks.py Misc/NEWS Python/codecs.c

Thu Sep 9 22:30:24 CEST 2010

Author: antoine.pitrou
Date: Thu Sep 9 22:30:23 2010
New Revision: 84655
Log:
Issue #9804: ascii() now always represents unicode surrogate pairs as
a single `\UXXXXXXXX`, regardless of whether the character is printable
or not. Also, the "backslashreplace" error handler now joins surrogate
pairs into a single character on UCS-2 builds.
Modified:
 python/branches/py3k/Lib/test/test_builtin.py
 python/branches/py3k/Lib/test/test_codeccallbacks.py
 python/branches/py3k/Misc/NEWS
 python/branches/py3k/Python/codecs.c
Modified: python/branches/py3k/Lib/test/test_builtin.py
==============================================================================

--- python/branches/py3k/Lib/test/test_builtin.py	(original)
+++ python/branches/py3k/Lib/test/test_builtin.py	Thu Sep 9 22:30:23 2010
@@ -179,6 +179,28 @@
 a = {}
 a[0] = a
 self.assertEqual(ascii(a), '{0: {...}}')
+ # Advanced checks for unicode strings
+ def _check_uni(s):
+ self.assertEqual(ascii(s), repr(s))
+ _check_uni("'")
+ _check_uni('"')
+ _check_uni('"\'')
+ _check_uni('\0')
+ _check_uni('\r\n\t .')
+ # Unprintable non-ASCII characters
+ _check_uni('\x85')
+ _check_uni('\u1fff')
+ _check_uni('\U00012fff')
+ # Lone surrogates
+ _check_uni('\ud800')
+ _check_uni('\udfff')
+ # Issue #9804: surrogates should be joined even for printable
+ # wide characters (UCS-2 builds).
+ self.assertEqual(ascii('\U0001d121'), "'\\U0001d121'")
+ # All together
+ s = "'\0\"\n\r\t abcd\x85é\U00012fff\uD800\U0001D121xxx."
+ self.assertEqual(ascii(s),
+ r"""'\'\x00"\n\r\t abcd\x85\xe9\U00012fff\ud800\U0001d121xxx.'""")
 
 def test_neg(self):
 x = -sys.maxsize-1
Modified: python/branches/py3k/Lib/test/test_codeccallbacks.py
==============================================================================
--- python/branches/py3k/Lib/test/test_codeccallbacks.py	(original)
+++ python/branches/py3k/Lib/test/test_codeccallbacks.py	Thu Sep 9 22:30:23 2010
@@ -577,17 +577,31 @@
 UnicodeEncodeError("ascii", "\uffff", 0, 1, "ouch")),
 ("\\uffff", 1)
 )
- if sys.maxunicode>0xffff:
- self.assertEquals(
- codecs.backslashreplace_errors(
- UnicodeEncodeError("ascii", "\U00010000", 0, 1, "ouch")),
- ("\\U00010000", 1)
- )
- self.assertEquals(
- codecs.backslashreplace_errors(
- UnicodeEncodeError("ascii", "\U0010ffff", 0, 1, "ouch")),
- ("\\U0010ffff", 1)
- )
+ # 1 on UCS-4 builds, 2 on UCS-2
+ len_wide = len("\U00010000")
+ self.assertEquals(
+ codecs.backslashreplace_errors(
+ UnicodeEncodeError("ascii", "\U00010000",
+ 0, len_wide, "ouch")),
+ ("\\U00010000", len_wide)
+ )
+ self.assertEquals(
+ codecs.backslashreplace_errors(
+ UnicodeEncodeError("ascii", "\U0010ffff",
+ 0, len_wide, "ouch")),
+ ("\\U0010ffff", len_wide)
+ )
+ # Lone surrogates (regardless of unicode width)
+ self.assertEquals(
+ codecs.backslashreplace_errors(
+ UnicodeEncodeError("ascii", "\ud800", 0, 1, "ouch")),
+ ("\\ud800", 1)
+ )
+ self.assertEquals(
+ codecs.backslashreplace_errors(
+ UnicodeEncodeError("ascii", "\udfff", 0, 1, "ouch")),
+ ("\\udfff", 1)
+ )
 
 def test_badhandlerresults(self):
 results = ( 42, "foo", (1,2,3), ("foo", 1, 3), ("foo", None), ("foo",), ("foo", 1, 3), ("foo", None), ("foo",) )
Modified: python/branches/py3k/Misc/NEWS
==============================================================================
--- python/branches/py3k/Misc/NEWS	(original)
+++ python/branches/py3k/Misc/NEWS	Thu Sep 9 22:30:23 2010
@@ -10,6 +10,11 @@
 Core and Builtins
 -----------------
 
+- Issue #9804: ascii() now always represents unicode surrogate pairs as
+ a single ``\UXXXXXXXX``, regardless of whether the character is printable
+ or not. Also, the "backslashreplace" error handler now joins surrogate
+ pairs into a single character on UCS-2 builds.
+
 - Issue #9757: memoryview objects get a release() method to release the
 underlying buffer (previously this was only done when deallocating the
 memoryview), and gain support for the context management protocol.
Modified: python/branches/py3k/Python/codecs.c
==============================================================================
--- python/branches/py3k/Python/codecs.c	(original)
+++ python/branches/py3k/Python/codecs.c	Thu Sep 9 22:30:23 2010
@@ -678,6 +678,13 @@
 
 PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc)
 {
+#ifndef Py_UNICODE_WIDE
+#define IS_SURROGATE_PAIR(p, end) \
+ (*p >= 0xD800 && *p <= 0xDBFF && (p + 1) < end && \
+ *(p + 1) >= 0xDC00 && *(p + 1) <= 0xDFFF)
+#else
+#define IS_SURROGATE_PAIR(p, end) 0
+#endif
 if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) {
 PyObject *restuple;
 PyObject *object;
@@ -702,7 +709,12 @@
 else
 #endif
 if (*p >= 0x100) {
- ressize += 1+1+4;
+ if (IS_SURROGATE_PAIR(p, startp+end)) {
+ ressize += 1+1+8;
+ ++p;
+ }
+ else
+ ressize += 1+1+4;
 }
 else
 ressize += 1+1+2;
@@ -712,9 +724,12 @@
 return NULL;
 for (p = startp+start, outp = PyUnicode_AS_UNICODE(res);
 p < startp+end; ++p) {
- Py_UNICODE c = *p;
+ Py_UCS4 c = (Py_UCS4) *p;
 *outp++ = '\\';
-#ifdef Py_UNICODE_WIDE
+ if (IS_SURROGATE_PAIR(p, startp+end)) {
+ c = ((*p & 0x3FF) << 10) + (*(p + 1) & 0x3FF) + 0x10000;
+ ++p;
+ }
 if (c >= 0x00010000) {
 *outp++ = 'U';
 *outp++ = hexdigits[(c>>28)&0xf];
@@ -724,9 +739,7 @@
 *outp++ = hexdigits[(c>>12)&0xf];
 *outp++ = hexdigits[(c>>8)&0xf];
 }
- else
-#endif
- if (c >= 0x100) {
+ else if (c >= 0x100) {
 *outp++ = 'u';
 *outp++ = hexdigits[(c>>12)&0xf];
 *outp++ = hexdigits[(c>>8)&0xf];
@@ -746,6 +759,7 @@
 wrong_exception_type(exc);
 return NULL;
 }
+#undef IS_SURROGATE_PAIR
 }
 
 /* This handler is declared static until someone demonstrates