[Python-checkins] cpython (2.7): Issue #15866: The xmlcharrefreplace error handler no more produces two XML

Tue Aug 6 15:57:04 CEST 2013

http://hg.python.org/cpython/rev/719ee60fc5e2
changeset: 85051:719ee60fc5e2
branch: 2.7
parent: 85041:395ac61ebe1a
user: Serhiy Storchaka <storchaka at gmail.com>
date: Tue Aug 06 16:56:26 2013 +0300
summary:
 Issue #15866: The xmlcharrefreplace error handler no more produces two XML
entities for a non-BMP character on narrow build.
files:
 Lib/test/test_codeccallbacks.py | 25 +++++-
 Lib/test/test_unicode.py | 12 +++
 Misc/NEWS | 3 +
 Modules/_testcapimodule.c | 2 +-
 Objects/unicodeobject.c | 82 +++++++++++++++-----
 Python/codecs.c | 66 +++++++++-------
 6 files changed, 135 insertions(+), 55 deletions(-)

diff --git a/Lib/test/test_codeccallbacks.py b/Lib/test/test_codeccallbacks.py
--- a/Lib/test/test_codeccallbacks.py
+++ b/Lib/test/test_codeccallbacks.py
@@ -66,15 +66,34 @@
 # replace unencodable characters which numeric character entities.
 # For ascii, latin-1 and charmaps this is completely implemented
 # in C and should be reasonably fast.
- s = u"\u30b9\u30d1\u30e2 \xe4nd eggs"
+ s = u"\u30b9\u30d1\u30e2 \xe4nd egg\u0161"
 self.assertEqual(
 s.encode("ascii", "xmlcharrefreplace"),
- "&#12473;&#12497;&#12514; &#228;nd eggs"
+ "&#12473;&#12497;&#12514; &#228;nd egg&#353;"
 )
 self.assertEqual(
 s.encode("latin-1", "xmlcharrefreplace"),
- "&#12473;&#12497;&#12514; \xe4nd eggs"
+ "&#12473;&#12497;&#12514; \xe4nd egg&#353;"
 )
+ self.assertEqual(
+ s.encode("iso-8859-15", "xmlcharrefreplace"),
+ "&#12473;&#12497;&#12514; \xe4nd egg\xa8"
+ )
+
+ def test_xmlcharrefreplace_with_surrogates(self):
+ tests = [(u'\U0001f49d', '&#128157;'),
+ (u'\ud83d', '&#55357;'),
+ (u'\udc9d', '&#56477;'),
+ (u'\ud83d\udc9d', '&#128157;' if len(u'\U0001f49d') > 1 else
+ '&#55357;&#56477;'),
+ ]
+ for encoding in ['ascii', 'latin1', 'iso-8859-15']:
+ for s, exp in tests:
+ self.assertEqual(s.encode(encoding, 'xmlcharrefreplace'),
+ exp, msg='%r.encode(%r)' % (s, encoding))
+ self.assertEqual((s+'X').encode(encoding, 'xmlcharrefreplace'),
+ exp+'X',
+ msg='%r.encode(%r)' % (s + 'X', encoding))
 
 def test_xmlcharnamereplace(self):
 # This time use a named character entity for unencodable
diff --git a/Lib/test/test_unicode.py b/Lib/test/test_unicode.py
--- a/Lib/test/test_unicode.py
+++ b/Lib/test/test_unicode.py
@@ -1658,6 +1658,18 @@
 self.assertEqual(unicode_encodedecimal(u"123\u20ac\u0660", "replace"),
 b'123?0')
 
+ def test_encode_decimal_with_surrogates(self):
+ from _testcapi import unicode_encodedecimal
+ tests = [(u'\U0001f49d', '&#128157;'),
+ (u'\ud83d', '&#55357;'),
+ (u'\udc9d', '&#56477;'),
+ (u'\ud83d\udc9d', '&#128157;' if len(u'\U0001f49d') > 1 else
+ '&#55357;&#56477;'),
+ ]
+ for s, exp in tests:
+ self.assertEqual(
+ unicode_encodedecimal(u"123" + s, "xmlcharrefreplace"),
+ '123' + exp)
 
 def test_main():
 test_support.run_unittest(__name__)
diff --git a/Misc/NEWS b/Misc/NEWS
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -9,6 +9,9 @@
 Core and Builtins
 -----------------
 
+- Issue #15866: The xmlcharrefreplace error handler no more produces two XML
+ entities for a non-BMP character on narrow build.
+
 - Issue #18184: PyUnicode_FromFormat() and PyUnicode_FromFormatV() now raise
 OverflowError when an argument of %c format is out of range.
 
diff --git a/Modules/_testcapimodule.c b/Modules/_testcapimodule.c
--- a/Modules/_testcapimodule.c
+++ b/Modules/_testcapimodule.c
@@ -1118,7 +1118,7 @@
 if (!PyArg_ParseTuple(args, "u#|s", &unicode, &length, &errors))
 return NULL;
 
- decimal_length = length * 7; /* len('&#8364;') */
+ decimal_length = length * 10; /* len('&#1114111;') */
 decimal = PyBytes_FromStringAndSize(NULL, decimal_length);
 if (decimal == NULL)
 return NULL;
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -547,6 +547,37 @@
 return PyUnicode_FromStringAndSize(u, size);
 }
 
+/* _Py_UNICODE_NEXT is a private macro used to retrieve the character pointed
+ * by 'ptr', possibly combining surrogate pairs on narrow builds.
+ * 'ptr' and 'end' must be Py_UNICODE*, with 'ptr' pointing at the character
+ * that should be returned and 'end' pointing to the end of the buffer.
+ * ('end' is used on narrow builds to detect a lone surrogate at the
+ * end of the buffer that should be returned unchanged.)
+ * The ptr and end arguments should be side-effect free and ptr must be an lvalue.
+ * The type of the returned char is always Py_UCS4.
+ *
+ * Note: the macro advances ptr to next char, so it might have side-effects
+ * (especially if used with other macros).
+ */
+
+/* helper macros used by _Py_UNICODE_NEXT */
+#define _Py_UNICODE_IS_HIGH_SURROGATE(ch) (0xD800 <= ch && ch <= 0xDBFF)
+#define _Py_UNICODE_IS_LOW_SURROGATE(ch) (0xDC00 <= ch && ch <= 0xDFFF)
+/* Join two surrogate characters and return a single Py_UCS4 value. */
+#define _Py_UNICODE_JOIN_SURROGATES(high, low) \
+ (((((Py_UCS4)(high) & 0x03FF) << 10) | \
+ ((Py_UCS4)(low) & 0x03FF)) + 0x10000)
+
+#ifdef Py_UNICODE_WIDE
+#define _Py_UNICODE_NEXT(ptr, end) *(ptr)++
+#else
+#define _Py_UNICODE_NEXT(ptr, end) \
+ (((_Py_UNICODE_IS_HIGH_SURROGATE(*(ptr)) && (ptr) < (end)) && \
+ _Py_UNICODE_IS_LOW_SURROGATE((ptr)[1])) ? \
+ ((ptr) += 2,_Py_UNICODE_JOIN_SURROGATES((ptr)[-2], (ptr)[-1])) : \
+ (Py_UCS4)*(ptr)++)
+#endif
+
 #ifdef HAVE_WCHAR_H
 
 #if (Py_UNICODE_SIZE == 2) && defined(SIZEOF_WCHAR_T) && (SIZEOF_WCHAR_T == 4)
@@ -3642,26 +3673,22 @@
 case 4: /* xmlcharrefreplace */
 respos = str-PyString_AS_STRING(res);
 /* determine replacement size (temporarily (mis)uses p) */
- for (p = collstart, repsize = 0; p < collend; ++p) {
- if (*p<10)
+ for (p = collstart, repsize = 0; p < collend;) {
+ Py_UCS4 ch = _Py_UNICODE_NEXT(p, collend);
+ if (ch < 10)
 repsize += 2+1+1;
- else if (*p<100)
+ else if (ch < 100)
 repsize += 2+2+1;
- else if (*p<1000)
+ else if (ch < 1000)
 repsize += 2+3+1;
- else if (*p<10000)
+ else if (ch < 10000)
 repsize += 2+4+1;
-#ifndef Py_UNICODE_WIDE
- else
+ else if (ch < 100000)
 repsize += 2+5+1;
-#else
- else if (*p<100000)
- repsize += 2+5+1;
- else if (*p<1000000)
+ else if (ch < 1000000)
 repsize += 2+6+1;
 else
 repsize += 2+7+1;
-#endif
 }
 requiredsize = respos+repsize+(endp-collend);
 if (requiredsize > ressize) {
@@ -3673,8 +3700,9 @@
 ressize = requiredsize;
 }
 /* generate replacement (temporarily (mis)uses p) */
- for (p = collstart; p < collend; ++p) {
- str += sprintf(str, "&#%d;", (int)*p);
+ for (p = collstart; p < collend;) {
+ Py_UCS4 ch = _Py_UNICODE_NEXT(p, collend);
+ str += sprintf(str, "&#%d;", (int)ch);
 }
 p = collend;
 break;
@@ -4649,11 +4677,20 @@
 *inpos = collendpos;
 break;
 case 4: /* xmlcharrefreplace */
- /* generate replacement (temporarily (mis)uses p) */
- for (collpos = collstartpos; collpos < collendpos; ++collpos) {
+ /* generate replacement */
+ for (collpos = collstartpos; collpos < collendpos;) {
 char buffer[2+29+1+1];
 char *cp;
- sprintf(buffer, "&#%d;", (int)p[collpos]);
+ Py_UCS4 ch = p[collpos++];
+#ifndef Py_UNICODE_WIDE
+ if ((0xD800 <= ch && ch <= 0xDBFF) &&
+ (collpos < collendpos) &&
+ (0xDC00 <= p[collpos] && p[collpos] <= 0xDFFF)) {
+ ch = ((((ch & 0x03FF) << 10) |
+ ((Py_UCS4)p[collpos++] & 0x03FF)) + 0x10000);
+ }
+#endif
+ sprintf(buffer, "&#%d;", (int)ch);
 for (cp = buffer; *cp; ++cp) {
 x = charmapencode_output(*cp, mapping, res, respos);
 if (x==enc_EXCEPTION)
@@ -5068,10 +5105,11 @@
 break;
 case 4: /* xmlcharrefreplace */
 /* generate replacement (temporarily (mis)uses p) */
- for (p = collstart; p < collend; ++p) {
+ for (p = collstart; p < collend;) {
 char buffer[2+29+1+1];
 char *cp;
- sprintf(buffer, "&#%d;", (int)*p);
+ Py_UCS4 ch = _Py_UNICODE_NEXT(p, collend);
+ sprintf(buffer, "&#%d;", (int)ch);
 if (charmaptranslate_makespace(&res, &str,
 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
 goto onError;
@@ -5222,8 +5260,10 @@
 break;
 case 4: /* xmlcharrefreplace */
 /* generate replacement (temporarily (mis)uses p) */
- for (p = collstart; p < collend; ++p)
- output += sprintf(output, "&#%d;", (int)*p);
+ for (p = collstart; p < collend;) {
+ Py_UCS4 ch = _Py_UNICODE_NEXT(p, collend);
+ output += sprintf(output, "&#%d;", ch);
+ }
 p = collend;
 break;
 default:
diff --git a/Python/codecs.c b/Python/codecs.c
--- a/Python/codecs.c
+++ b/Python/codecs.c
@@ -556,6 +556,7 @@
 PyObject *res;
 Py_UNICODE *p;
 Py_UNICODE *startp;
+ Py_UNICODE *e;
 Py_UNICODE *outp;
 int ressize;
 if (PyUnicodeEncodeError_GetStart(exc, &start))
@@ -565,26 +566,31 @@
 if (!(object = PyUnicodeEncodeError_GetObject(exc)))
 return NULL;
 startp = PyUnicode_AS_UNICODE(object);
- for (p = startp+start, ressize = 0; p < startp+end; ++p) {
- if (*p<10)
+ e = startp + end;
+ for (p = startp+start, ressize = 0; p < e;) {
+ Py_UCS4 ch = *p++;
+#ifndef Py_UNICODE_WIDE
+ if ((0xD800 <= ch && ch <= 0xDBFF) &&
+ (p < e) &&
+ (0xDC00 <= *p && *p <= 0xDFFF)) {
+ ch = ((((ch & 0x03FF) << 10) |
+ ((Py_UCS4)*p++ & 0x03FF)) + 0x10000);
+ }
+#endif
+ if (ch < 10)
 ressize += 2+1+1;
- else if (*p<100)
+ else if (ch < 100)
 ressize += 2+2+1;
- else if (*p<1000)
+ else if (ch < 1000)
 ressize += 2+3+1;
- else if (*p<10000)
+ else if (ch < 10000)
 ressize += 2+4+1;
-#ifndef Py_UNICODE_WIDE
- else
+ else if (ch < 100000)
 ressize += 2+5+1;
-#else
- else if (*p<100000)
- ressize += 2+5+1;
- else if (*p<1000000)
+ else if (ch < 1000000)
 ressize += 2+6+1;
 else
 ressize += 2+7+1;
-#endif
 }
 /* allocate replacement */
 res = PyUnicode_FromUnicode(NULL, ressize);
@@ -593,40 +599,41 @@
 return NULL;
 }
 /* generate replacement */
- for (p = startp+start, outp = PyUnicode_AS_UNICODE(res);
- p < startp+end; ++p) {
- Py_UNICODE c = *p;
+ for (p = startp+start, outp = PyUnicode_AS_UNICODE(res); p < e;) {
 int digits;
 int base;
+ Py_UCS4 ch = *p++;
+#ifndef Py_UNICODE_WIDE
+ if ((0xD800 <= ch && ch <= 0xDBFF) &&
+ (p < startp+end) &&
+ (0xDC00 <= *p && *p <= 0xDFFF)) {
+ ch = ((((ch & 0x03FF) << 10) |
+ ((Py_UCS4)*p++ & 0x03FF)) + 0x10000);
+ }
+#endif
 *outp++ = '&';
 *outp++ = '#';
- if (*p<10) {
+ if (ch < 10) {
 digits = 1;
 base = 1;
 }
- else if (*p<100) {
+ else if (ch < 100) {
 digits = 2;
 base = 10;
 }
- else if (*p<1000) {
+ else if (ch < 1000) {
 digits = 3;
 base = 100;
 }
- else if (*p<10000) {
+ else if (ch < 10000) {
 digits = 4;
 base = 1000;
 }
-#ifndef Py_UNICODE_WIDE
- else {
+ else if (ch < 100000) {
 digits = 5;
 base = 10000;
 }
-#else
- else if (*p<100000) {
- digits = 5;
- base = 10000;
- }
- else if (*p<1000000) {
+ else if (ch < 1000000) {
 digits = 6;
 base = 100000;
 }
@@ -634,10 +641,9 @@
 digits = 7;
 base = 1000000;
 }
-#endif
 while (digits-->0) {
- *outp++ = '0' + c/base;
- c %= base;
+ *outp++ = '0' + ch/base;
+ ch %= base;
 base /= 10;
 }
 *outp++ = ';';
-- 
Repository URL: http://hg.python.org/cpython