changeset: 81329:4677c5f6fcf7 branch: 3.2 parent: 81327:bf347198fbaf user: Serhiy Storchaka date: Tue Jan 08 23:14:24 2013 +0200 files: Lib/test/test_codecs.py Misc/NEWS Objects/unicodeobject.c description: Issue #11461: Fix the incremental UTF-16 decoder. Original patch by Amaury Forgeot d'Arc. Added tests for partial decoding of non-BMP characters. diff -r bf347198fbaf -r 4677c5f6fcf7 Lib/test/test_codecs.py --- a/Lib/test/test_codecs.py Tue Jan 08 22:45:42 2013 +0200 +++ b/Lib/test/test_codecs.py Tue Jan 08 23:14:24 2013 +0200 @@ -313,7 +313,7 @@ def test_partial(self): self.check_partial( - "\x00\xff\u0100\uffff", + "\x00\xff\u0100\uffff\U00010000", [ "", # first byte of BOM read "", # second byte of BOM read @@ -335,6 +335,10 @@ "\x00\xff\u0100", "\x00\xff\u0100", "\x00\xff\u0100\uffff", + "\x00\xff\u0100\uffff", + "\x00\xff\u0100\uffff", + "\x00\xff\u0100\uffff", + "\x00\xff\u0100\uffff\U00010000", ] ) @@ -369,7 +373,7 @@ def test_partial(self): self.check_partial( - "\x00\xff\u0100\uffff", + "\x00\xff\u0100\uffff\U00010000", [ "", "", @@ -387,6 +391,10 @@ "\x00\xff\u0100", "\x00\xff\u0100", "\x00\xff\u0100\uffff", + "\x00\xff\u0100\uffff", + "\x00\xff\u0100\uffff", + "\x00\xff\u0100\uffff", + "\x00\xff\u0100\uffff\U00010000", ] ) @@ -409,7 +417,7 @@ def test_partial(self): self.check_partial( - "\x00\xff\u0100\uffff", + "\x00\xff\u0100\uffff\U00010000", [ "", "", @@ -427,6 +435,10 @@ "\x00\xff\u0100", "\x00\xff\u0100", "\x00\xff\u0100\uffff", + "\x00\xff\u0100\uffff", + "\x00\xff\u0100\uffff", + "\x00\xff\u0100\uffff", + "\x00\xff\u0100\uffff\U00010000", ] ) @@ -477,7 +489,7 @@ def test_partial(self): self.check_partial( - "\x00\xff\u0100\uffff", + "\x00\xff\u0100\uffff\U00010000", [ "", # first byte of BOM read "", # second byte of BOM read => byteorder known @@ -489,6 +501,10 @@ "\x00\xff\u0100", "\x00\xff\u0100", "\x00\xff\u0100\uffff", + "\x00\xff\u0100\uffff", + "\x00\xff\u0100\uffff", + "\x00\xff\u0100\uffff", + "\x00\xff\u0100\uffff\U00010000", ] ) @@ 
-526,7 +542,7 @@ def test_partial(self): self.check_partial( - "\x00\xff\u0100\uffff", + "\x00\xff\u0100\uffff\U00010000", [ "", "\x00", @@ -536,6 +552,10 @@ "\x00\xff\u0100", "\x00\xff\u0100", "\x00\xff\u0100\uffff", + "\x00\xff\u0100\uffff", + "\x00\xff\u0100\uffff", + "\x00\xff\u0100\uffff", + "\x00\xff\u0100\uffff\U00010000", ] ) @@ -565,7 +585,7 @@ def test_partial(self): self.check_partial( - "\x00\xff\u0100\uffff", + "\x00\xff\u0100\uffff\U00010000", [ "", "\x00", @@ -575,6 +595,10 @@ "\x00\xff\u0100", "\x00\xff\u0100", "\x00\xff\u0100\uffff", + "\x00\xff\u0100\uffff", + "\x00\xff\u0100\uffff", + "\x00\xff\u0100\uffff", + "\x00\xff\u0100\uffff\U00010000", ] ) @@ -604,7 +628,7 @@ def test_partial(self): self.check_partial( - "\x00\xff\u07ff\u0800\uffff", + "\x00\xff\u07ff\u0800\uffff\U00010000", [ "\x00", "\x00", @@ -617,6 +641,10 @@ "\x00\xff\u07ff\u0800", "\x00\xff\u07ff\u0800", "\x00\xff\u07ff\u0800\uffff", + "\x00\xff\u07ff\u0800\uffff", + "\x00\xff\u07ff\u0800\uffff", + "\x00\xff\u07ff\u0800\uffff", + "\x00\xff\u07ff\u0800\uffff\U00010000", ] ) @@ -694,7 +722,7 @@ def test_partial(self): self.check_partial( - "\ufeff\x00\xff\u07ff\u0800\uffff", + "\ufeff\x00\xff\u07ff\u0800\uffff\U00010000", [ "", "", @@ -713,6 +741,10 @@ "\ufeff\x00\xff\u07ff\u0800", "\ufeff\x00\xff\u07ff\u0800", "\ufeff\x00\xff\u07ff\u0800\uffff", + "\ufeff\x00\xff\u07ff\u0800\uffff", + "\ufeff\x00\xff\u07ff\u0800\uffff", + "\ufeff\x00\xff\u07ff\u0800\uffff", + "\ufeff\x00\xff\u07ff\u0800\uffff\U00010000", ] ) diff -r bf347198fbaf -r 4677c5f6fcf7 Misc/NEWS --- a/Misc/NEWS Tue Jan 08 22:45:42 2013 +0200 +++ b/Misc/NEWS Tue Jan 08 23:14:24 2013 +0200 @@ -10,6 +10,9 @@ Core and Builtins ----------------- +- Issue #11461: Fix the incremental UTF-16 decoder. Original patch by + Amaury Forgeot d'Arc. + - Issue #16367: Fix FileIO.readall() on Windows for files larger than 2 GB. 
- Issue #16455: On FreeBSD and Solaris, if the locale is C, the diff -r bf347198fbaf -r 4677c5f6fcf7 Objects/unicodeobject.c --- a/Objects/unicodeobject.c Tue Jan 08 22:45:42 2013 +0200 +++ b/Objects/unicodeobject.c Tue Jan 08 23:14:24 2013 +0200 @@ -3573,8 +3573,11 @@ /* UTF-16 code pair: */ if (e - q < 2) { + q -= 2; + if (consumed) + break; errmsg = "unexpected end of data"; - startinpos = (((const char *)q) - 2) - starts; + startinpos = ((const char *)q) - starts; endinpos = ((const char *)e) - starts; goto utf16Error; }

Page converted by AltStyle (-> original) /