[Python-checkins] [3.7] bpo-33899: Make tokenize module mirror end-of-file is end-of-line behavior (GH-7891) (GH-8132)

Tal Einat webhook-mailer at python.org
Fri Jul 6 06:21:09 EDT 2018


https://github.com/python/cpython/commit/ab75d9e4244ee24bc96ea9d52362899e3bf365a2
commit: ab75d9e4244ee24bc96ea9d52362899e3bf365a2
branch: 3.7
author: Ammar Askar <ammar_askar at hotmail.com>
committer: Tal Einat <taleinat+github at gmail.com>
date: 2018-07-06T13:21:05+03:00
summary:
[3.7] bpo-33899: Make tokenize module mirror end-of-file is end-of-line behavior (GH-7891) (GH-8132)

Most of the change involves fixing up the test suite, which previously made
the assumption that there wouldn't be a new line if the input didn't end in
one.

Contributed by Ammar Askar.
(cherry picked from commit c4ef4896eac86a6759901c8546e26de4695a1389)
files:
A Misc/NEWS.d/next/Library/2018-06-24-01-57-14.bpo-33899.IaOcAr.rst
M Lib/test/test_tokenize.py
M Lib/tokenize.py
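
As a quick illustration (a minimal sketch, not taken from the patch itself), the
observable change is that tokenizing source without a trailing newline now yields
an implicit NEWLINE token just before ENDMARKER, mirroring the new
test_implicit_newline test added below:

    import io
    from tokenize import tokenize, NEWLINE, ENDMARKER

    # Source with no trailing '\n'.
    tokens = list(tokenize(io.BytesIO(b"x = 1").readline))

    # After this change the tokenizer appends a synthesized NEWLINE (with an
    # empty string) before ENDMARKER, as the C tokenizer has always done.
    assert tokens[-2].type == NEWLINE
    assert tokens[-1].type == ENDMARKER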
diff --git a/Lib/test/test_tokenize.py b/Lib/test/test_tokenize.py
index 3520a67bd42b..d0db77995acf 100644
--- a/Lib/test/test_tokenize.py
+++ b/Lib/test/test_tokenize.py
@@ -1,7 +1,8 @@
 from test import support
 from tokenize import (tokenize, _tokenize, untokenize, NUMBER, NAME, OP,
                      STRING, ENDMARKER, ENCODING, tok_name, detect_encoding,
-                     open as tokenize_open, Untokenizer)
+                     open as tokenize_open, Untokenizer, generate_tokens,
+                     NEWLINE)
 from io import BytesIO
 import unittest
 from unittest import TestCase, mock
@@ -11,27 +12,51 @@
 import token
 
 
+# Converts a source string into a list of textual representation
+# of the tokens such as:
+# `    NAME       'if'          (1, 0) (1, 2)`
+# to make writing tests easier.
+def stringify_tokens_from_source(token_generator, source_string):
+    result = []
+    num_lines = len(source_string.splitlines())
+    missing_trailing_nl = source_string[-1] not in '\r\n'
+
+    for type, token, start, end, line in token_generator:
+        if type == ENDMARKER:
+            break
+        # Ignore the new line on the last line if the input lacks one
+        if missing_trailing_nl and type == NEWLINE and end[0] == num_lines:
+            continue
+        type = tok_name[type]
+        result.append(f"    {type:10} {token!r:13} {start} {end}")
+
+    return result
+
 class TokenizeTest(TestCase):
     # Tests for the tokenize module.
 
     # The tests can be really simple. Given a small fragment of source
-    # code, print out a table with tokens. The ENDMARKER is omitted for
-    # brevity.
+    # code, print out a table with tokens. The ENDMARKER, ENCODING and
+    # final NEWLINE are omitted for brevity.
 
     def check_tokenize(self, s, expected):
         # Format the tokens in s in a table format.
-        # The ENDMARKER is omitted.
-        result = []
+        # The ENDMARKER and final NEWLINE are omitted.
         f = BytesIO(s.encode('utf-8'))
-        for type, token, start, end, line in tokenize(f.readline):
-            if type == ENDMARKER:
-                break
-            type = tok_name[type]
-            result.append(f"    {type:10} {token!r:13} {start} {end}")
+        result = stringify_tokens_from_source(tokenize(f.readline), s)
+
         self.assertEqual(result,
                          ["    ENCODING   'utf-8'       (0, 0) (0, 0)"] +
                          expected.rstrip().splitlines())
 
+    def test_implicit_newline(self):
+        # Make sure that the tokenizer puts in an implicit NEWLINE
+        # when the input lacks a trailing new line.
+        f = BytesIO("x".encode('utf-8'))
+        tokens = list(tokenize(f.readline))
+        self.assertEqual(tokens[-2].type, NEWLINE)
+        self.assertEqual(tokens[-1].type, ENDMARKER)
+
     def test_basic(self):
         self.check_tokenize("1 + 1", """\
     NUMBER     '1'           (1, 0) (1, 1)
@@ -1009,8 +1034,8 @@ def readline():
             else:
                 return b''
 
-        # skip the initial encoding token and the end token
-        tokens = list(_tokenize(readline, encoding='utf-8'))[1:-1]
+        # skip the initial encoding token and the end tokens
+        tokens = list(_tokenize(readline, encoding='utf-8'))[1:-2]
         expected_tokens = [(3, '"ЉЊЈЁЂ"', (1, 0), (1, 7), '"ЉЊЈЁЂ"')]
         self.assertEqual(tokens, expected_tokens,
                          "bytes not decoded with encoding")
@@ -1026,8 +1051,8 @@ def readline():
             else:
                 return b''
 
-        # skip the end token
-        tokens = list(_tokenize(readline, encoding=None))[:-1]
+        # skip the end tokens
+        tokens = list(_tokenize(readline, encoding=None))[:-2]
         expected_tokens = [(3, '"ЉЊЈЁЂ"', (1, 0), (1, 7), '"ЉЊЈЁЂ"')]
         self.assertEqual(tokens, expected_tokens,
                          "string not tokenized when encoding is None")
@@ -1338,18 +1363,21 @@ def test_oneline_defs(self):
 
         # Test that 500 consequent, one-line defs is OK
         toks = list(tokenize(BytesIO(buf.encode('utf-8')).readline))
-        self.assertEqual(toks[-2].string, 'OK') # [-1] is always ENDMARKER
+        self.assertEqual(toks[-3].string, 'OK') # [-1] is always ENDMARKER
+                                                # [-2] is always NEWLINE
 
     def assertExactTypeEqual(self, opstr, *optypes):
         tokens = list(tokenize(BytesIO(opstr.encode('utf-8')).readline))
         num_optypes = len(optypes)
-        self.assertEqual(len(tokens), 2 + num_optypes)
+        self.assertEqual(len(tokens), 3 + num_optypes)
         self.assertEqual(tok_name[tokens[0].exact_type],
                          tok_name[ENCODING])
         for i in range(num_optypes):
             self.assertEqual(tok_name[tokens[i + 1].exact_type],
                              tok_name[optypes[i]])
         self.assertEqual(tok_name[tokens[1 + num_optypes].exact_type],
+                         tok_name[token.NEWLINE])
+        self.assertEqual(tok_name[tokens[2 + num_optypes].exact_type],
                          tok_name[token.ENDMARKER])
 
     def test_exact_type(self):
@@ -1502,7 +1530,7 @@ def test_roundtrip(self):
         self.check_roundtrip("if x == 1:\n"
                              "    print(x)\n")
         self.check_roundtrip("# This is a comment\n"
-                             "# This also")
+                             "# This also\n")
 
         # Some people use different formatting conventions, which makes
         # untokenize a little trickier. Note that this test involves trailing
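
A usage note (not from the patch; the helper name stringify below is made up for
illustration): the filtering the new stringify_tokens_from_source helper performs
can be reproduced standalone. It drops the now-implicit final NEWLINE when the
source has no trailing newline, so the pre-existing expected-token tables keep
matching:

    import io
    from tokenize import tokenize, tok_name, ENDMARKER, NEWLINE

    def stringify(source_string):
        # Render each token as "    TYPE       'string'      (row, col) (row, col)"
        # and skip the implicit final NEWLINE for input without a trailing newline.
        num_lines = len(source_string.splitlines())
        missing_trailing_nl = source_string[-1] not in '\r\n'
        result = []
        readline = io.BytesIO(source_string.encode('utf-8')).readline
        for type, token, start, end, line in tokenize(readline):
            if type == ENDMARKER:
                break
            if missing_trailing_nl and type == NEWLINE and end[0] == num_lines:
                continue
            result.append(f"    {tok_name[type]:10} {token!r:13} {start} {end}")
        return result

    print("\n".join(stringify("1 + 1")))
    # Expected (roughly):
    #     ENCODING   'utf-8'       (0, 0) (0, 0)
    #     NUMBER     '1'           (1, 0) (1, 1)
    #     OP         '+'           (1, 2) (1, 3)
    #     NUMBER     '1'           (1, 4) (1, 5)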
diff --git a/Lib/tokenize.py b/Lib/tokenize.py
index 6528b9006128..0eccc9b08d44 100644
--- a/Lib/tokenize.py
+++ b/Lib/tokenize.py
@@ -496,8 +496,15 @@ def _tokenize(readline, encoding):
             # BOM will already have been stripped.
             encoding = "utf-8"
         yield TokenInfo(ENCODING, encoding, (0, 0), (0, 0), '')
-    while True:                                # loop over lines in stream
+    last_line = b''
+    line = b''
+    while True:                                # loop over lines in stream
         try:
+            # We capture the value of the line variable here because
+            # readline uses the empty string '' to signal end of input,
+            # hence `line` itself will always be overwritten at the end
+            # of this loop.
+            last_line = line
             line = readline()
         except StopIteration:
             line = b''
@@ -652,6 +659,9 @@ def _tokenize(readline, encoding):
                                    (lnum, pos), (lnum, pos+1), line)
                 pos += 1
 
+    # Add an implicit NEWLINE if the input doesn't end in one
+    if last_line and last_line[-1] not in '\r\n':
+        yield TokenInfo(NEWLINE, '', (lnum - 1, len(last_line)), (lnum - 1, len(last_line) + 1), '')
     for indent in indents[1:]:                 # pop remaining indent levels
         yield TokenInfo(DEDENT, '', (lnum, 0), (lnum, 0), '')
     yield TokenInfo(ENDMARKER, '', (lnum, 0), (lnum, 0), '')
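
On the tokenize.py side, the reason last_line is captured before each readline()
call is that readline signals end of input with an empty string, so by the time
the loop exits only last_line still holds the final line of real input; the
synthesized NEWLINE then starts at column len(last_line) on row lnum - 1, since
lnum has already been advanced by the EOF read. A purely illustrative sketch of
that arithmetic (implicit_newline is a made-up name, not part of the module):

    import io
    from tokenize import TokenInfo, NEWLINE

    def implicit_newline(last_line, lnum):
        # Mirrors the added check: synthesize a NEWLINE when the last real
        # line does not end in '\r' or '\n'.
        if last_line and last_line[-1] not in '\r\n':
            col = len(last_line)
            # lnum was already advanced past the EOF read, hence lnum - 1.
            return TokenInfo(NEWLINE, '', (lnum - 1, col), (lnum - 1, col + 1), '')
        return None

    readline = io.BytesIO(b"x = 1").readline
    last_line = line = b''
    lnum = 0
    while True:
        last_line, line = line, readline()   # capture before overwriting
        lnum += 1
        if not line:
            break

    print(implicit_newline(last_line.decode(), lnum))
    # -> a NEWLINE token spanning (1, 5)-(1, 6): row 1, just past the
    #    five-character last line "x = 1".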
diff --git a/Misc/NEWS.d/next/Library/2018-06-24-01-57-14.bpo-33899.IaOcAr.rst b/Misc/NEWS.d/next/Library/2018-06-24-01-57-14.bpo-33899.IaOcAr.rst
new file mode 100644
index 000000000000..21c909599363
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2018-06-24-01-57-14.bpo-33899.IaOcAr.rst
@@ -0,0 +1,3 @@
+Tokenize module now implicitly emits a NEWLINE when provided with input that
+does not have a trailing new line. This behavior now matches what the C
+tokenizer does internally. Contributed by Ammar Askar.

