[Python-checkins] CVS: python/dist/src/Lib/test test_unicode.py,1.48,1.49

2002年2月07日 03:33:51 -0800

Update of /cvsroot/python/python/dist/src/Lib/test
In directory usw-pr-cvs1:/tmp/cvs-serv8617/Lib/test
Modified Files:
	test_unicode.py 
Log Message:
Fix to the UTF-8 encoder: it failed on 0-length input strings.
Fix for the UTF-8 decoder: it will now accept isolated surrogates
(previously it raised an exception, which caused round-trips to
fail).
Added new tests for UTF-8 round-trip safety (we rely on UTF-8 for
marshalling Unicode objects, so we had better make sure it works for
all Unicode code points, including isolated surrogates).
Bumped the PYC magic in a non-standard way -- please review. This
was needed because the old PYC format used illegal UTF-8 sequences
for isolated high surrogates which now raise an exception.
Index: test_unicode.py
===================================================================
RCS file: /cvsroot/python/python/dist/src/Lib/test/test_unicode.py,v
retrieving revision 1.48
retrieving revision 1.49
diff -C2 -d -r1.48 -r1.49
*** test_unicode.py	6 Feb 2002 18:09:02 -0000	1.48
--- test_unicode.py	7 Feb 2002 11:33:49 -0000	1.49
***************
*** 24,42 ****
 verify(repr(u"'") == '''u"'"''')
 verify(repr(u'"') == """u'"'""")
! verify(repr(u''.join(map(unichr, range(256)))) ==
! "u'\\x00\\x01\\x02\\x03\\x04\\x05\\x06\\x07\\x08\\t\\n\\x0b\\x0c\\r"
! "\\x0e\\x0f\\x10\\x11\\x12\\x13\\x14\\x15\\x16\\x17\\x18\\x19\\x1a"
! "\\x1b\\x1c\\x1d\\x1e\\x1f !\"#$%&\\'()*+,-./0123456789:;<=>?@ABCDEFGHI"
! "JKLMNOPQRSTUVWXYZ[\\\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\\x7f"
! "\\x80\\x81\\x82\\x83\\x84\\x85\\x86\\x87\\x88\\x89\\x8a\\x8b\\x8c\\x8d"
! "\\x8e\\x8f\\x90\\x91\\x92\\x93\\x94\\x95\\x96\\x97\\x98\\x99\\x9a\\x9b"
! "\\x9c\\x9d\\x9e\\x9f\\xa0\\xa1\\xa2\\xa3\\xa4\\xa5\\xa6\\xa7\\xa8\\xa9"
! "\\xaa\\xab\\xac\\xad\\xae\\xaf\\xb0\\xb1\\xb2\\xb3\\xb4\\xb5\\xb6\\xb7"
! "\\xb8\\xb9\\xba\\xbb\\xbc\\xbd\\xbe\\xbf\\xc0\\xc1\\xc2\\xc3\\xc4\\xc5"
! "\\xc6\\xc7\\xc8\\xc9\\xca\\xcb\\xcc\\xcd\\xce\\xcf\\xd0\\xd1\\xd2\\xd3"
! "\\xd4\\xd5\\xd6\\xd7\\xd8\\xd9\\xda\\xdb\\xdc\\xdd\\xde\\xdf\\xe0\\xe1"
! "\\xe2\\xe3\\xe4\\xe5\\xe6\\xe7\\xe8\\xe9\\xea\\xeb\\xec\\xed\\xee\\xef"
! "\\xf0\\xf1\\xf2\\xf3\\xf4\\xf5\\xf6\\xf7\\xf8\\xf9\\xfa\\xfb\\xfc\\xfd"
! "\\xfe\\xff'")
 
 def test(method, input, output, *args):
--- 24,44 ----
 verify(repr(u"'") == '''u"'"''')
 verify(repr(u'"') == """u'"'""")
! latin1repr = (
! "u'\\x00\\x01\\x02\\x03\\x04\\x05\\x06\\x07\\x08\\t\\n\\x0b\\x0c\\r"
! "\\x0e\\x0f\\x10\\x11\\x12\\x13\\x14\\x15\\x16\\x17\\x18\\x19\\x1a"
! "\\x1b\\x1c\\x1d\\x1e\\x1f !\"#$%&\\'()*+,-./0123456789:;<=>?@ABCDEFGHI"
! "JKLMNOPQRSTUVWXYZ[\\\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\\x7f"
! "\\x80\\x81\\x82\\x83\\x84\\x85\\x86\\x87\\x88\\x89\\x8a\\x8b\\x8c\\x8d"
! "\\x8e\\x8f\\x90\\x91\\x92\\x93\\x94\\x95\\x96\\x97\\x98\\x99\\x9a\\x9b"
! "\\x9c\\x9d\\x9e\\x9f\\xa0\\xa1\\xa2\\xa3\\xa4\\xa5\\xa6\\xa7\\xa8\\xa9"
! "\\xaa\\xab\\xac\\xad\\xae\\xaf\\xb0\\xb1\\xb2\\xb3\\xb4\\xb5\\xb6\\xb7"
! "\\xb8\\xb9\\xba\\xbb\\xbc\\xbd\\xbe\\xbf\\xc0\\xc1\\xc2\\xc3\\xc4\\xc5"
! "\\xc6\\xc7\\xc8\\xc9\\xca\\xcb\\xcc\\xcd\\xce\\xcf\\xd0\\xd1\\xd2\\xd3"
! "\\xd4\\xd5\\xd6\\xd7\\xd8\\xd9\\xda\\xdb\\xdc\\xdd\\xde\\xdf\\xe0\\xe1"
! "\\xe2\\xe3\\xe4\\xe5\\xe6\\xe7\\xe8\\xe9\\xea\\xeb\\xec\\xed\\xee\\xef"
! "\\xf0\\xf1\\xf2\\xf3\\xf4\\xf5\\xf6\\xf7\\xf8\\xf9\\xfa\\xfb\\xfc\\xfd"
! "\\xfe\\xff'")
! testrepr = repr(u''.join(map(unichr, range(256))))
! verify(testrepr == latin1repr)
 
 def test(method, input, output, *args):
***************
*** 496,499 ****
--- 498,502 ----
 
 # UTF-8 specific encoding tests:
+ verify(u''.encode('utf-8') == '')
 verify(u'\u20ac'.encode('utf-8') == '\xe2\x82\xac')
 verify(u'\ud800\udc02'.encode('utf-8') == '\xf0\x90\x80\x82')
***************
*** 553,564 ****
 verify(unicode(u.encode(encoding),encoding) == u)
 
! # Roundtrip safety for non-BMP (just a few chars)
! u = u'\U00010001\U00020002\U00030003\U00040004\U00050005'
! for encoding in ('utf-8',
! 'utf-16', 'utf-16-le', 'utf-16-be',
! #'raw_unicode_escape',
! 'unicode_escape', 'unicode_internal'):
! verify(unicode(u.encode(encoding),encoding) == u)
! 
 u = u''.join(map(unichr, range(256)))
 for encoding in (
--- 556,560 ----
 verify(unicode(u.encode(encoding),encoding) == u)
 
! # Roundtrip safety for BMP (just the first 256 chars)
 u = u''.join(map(unichr, range(256)))
 for encoding in (
***************
*** 572,575 ****
--- 568,572 ----
 print '*** codec for "%s" failed: %s' % (encoding, why)
 
+ # Roundtrip safety for BMP (just the first 128 chars)
 u = u''.join(map(unichr, range(128)))
 for encoding in (
***************
*** 582,585 ****
--- 579,595 ----
 except ValueError,why:
 print '*** codec for "%s" failed: %s' % (encoding, why)
+ 
+ # Roundtrip safety for non-BMP (just a few chars)
+ u = u'\U00010001\U00020002\U00030003\U00040004\U00050005'
+ for encoding in ('utf-8',
+ 'utf-16', 'utf-16-le', 'utf-16-be',
+ #'raw_unicode_escape',
+ 'unicode_escape', 'unicode_internal'):
+ verify(unicode(u.encode(encoding),encoding) == u)
+ 
+ # UTF-8 must be roundtrip safe for all UCS-2 code points
+ u = u''.join(map(unichr, range(0x10000)))
+ for encoding in ('utf-8',):
+ verify(unicode(u.encode(encoding),encoding) == u)
 
 print 'done.'