[Python-checkins] r67714 - in python/branches/release30-maint: Lib/test/test_tokenize.py Lib/tokenize.py Misc/NEWS

benjamin.peterson python-checkins at python.org
Fri Dec 12 02:40:01 CET 2008


Author: benjamin.peterson
Date: Fri Dec 12 02:40:00 2008
New Revision: 67714
Log:
Merged revisions 67711 via svnmerge from 
svn+ssh://pythondev@svn.python.org/python/branches/py3k
........
 r67711 | benjamin.peterson | 2008-12-11 19:25:05 -0600 (Thu, 11 Dec 2008) | 1 line
 
 raise a SyntaxError in detect_encoding() when a codec lookup fails like the builtin parser #4021
........
Modified:
 python/branches/release30-maint/ (props changed)
 python/branches/release30-maint/Lib/test/test_tokenize.py
 python/branches/release30-maint/Lib/tokenize.py
 python/branches/release30-maint/Misc/NEWS
Modified: python/branches/release30-maint/Lib/test/test_tokenize.py
==============================================================================
--- python/branches/release30-maint/Lib/test/test_tokenize.py	(original)
+++ python/branches/release30-maint/Lib/test/test_tokenize.py	Fri Dec 12 02:40:00 2008
@@ -795,6 +795,8 @@
 self.assertEquals(encoding, 'utf-8')
 self.assertEquals(consumed_lines, [])
 
+ readline = self.get_readline((b'# coding: bad\n',))
+ self.assertRaises(SyntaxError, detect_encoding, readline)
 
 class TestTokenize(TestCase):
 
Modified: python/branches/release30-maint/Lib/tokenize.py
==============================================================================
--- python/branches/release30-maint/Lib/tokenize.py	(original)
+++ python/branches/release30-maint/Lib/tokenize.py	Fri Dec 12 02:40:00 2008
@@ -26,7 +26,7 @@
 
 import re, string, sys
 from token import *
-from codecs import lookup
+from codecs import lookup, BOM_UTF8
 from itertools import chain, repeat
 cookie_re = re.compile("coding[:=]\s*([-\w.]+)")
 
@@ -251,11 +251,11 @@
 
 It detects the encoding from the presence of a utf-8 bom or an encoding
 cookie as specified in pep-0263. If both a bom and a cookie are present,
- but disagree, a SyntaxError will be raised.
+ but disagree, a SyntaxError will be raised. If the encoding cookie is an
+ invalid charset, raise a SyntaxError.
 
 If no encoding is specified, then the default of 'utf-8' will be returned.
 """
- utf8_bom = b'\xef\xbb\xbf'
 bom_found = False
 encoding = None
 def read_or_stop():
@@ -268,18 +268,25 @@
 try:
 line_string = line.decode('ascii')
 except UnicodeDecodeError:
- pass
- else:
- matches = cookie_re.findall(line_string)
- if matches:
- encoding = matches[0]
- if bom_found and lookup(encoding).name != 'utf-8':
- # This behaviour mimics the Python interpreter
- raise SyntaxError('encoding problem: utf-8')
- return encoding
+ return None
+
+ matches = cookie_re.findall(line_string)
+ if not matches:
+ return None
+ encoding = matches[0]
+ try:
+ codec = lookup(encoding)
+ except LookupError:
+ # This behaviour mimics the Python interpreter
+ raise SyntaxError("unknown encoding: " + encoding)
+
+ if bom_found and codec.name != 'utf-8':
+ # This behaviour mimics the Python interpreter
+ raise SyntaxError('encoding problem: utf-8')
+ return encoding
 
 first = read_or_stop()
- if first.startswith(utf8_bom):
+ if first.startswith(BOM_UTF8):
 bom_found = True
 first = first[3:]
 if not first:
Modified: python/branches/release30-maint/Misc/NEWS
==============================================================================
--- python/branches/release30-maint/Misc/NEWS	(original)
+++ python/branches/release30-maint/Misc/NEWS	Fri Dec 12 02:40:00 2008
@@ -35,6 +35,9 @@
 Library
 -------
 
+- Issue #4021: tokenize.detect_encoding() now raises a SyntaxError when the
+ codec cannot be found. This is for compatibility with the builtin behavior.
+
 - Issue #4084: Fix max, min, max_mag and min_mag Decimal methods to
 give correct results in the case where one argument is a quiet NaN
 and the other is a finite number that requires rounding.


More information about the Python-checkins mailing list
