[Python-checkins] r79494 - in python/trunk: Lib/test/test_unicodedata.py Misc/NEWS Objects/unicodeobject.c Objects/unicodetype_db.h Tools/unicode/makeunicodedata.py

Tue Mar 30 10:24:06 CEST 2010

Author: florent.xicluna
Date: Tue Mar 30 10:24:06 2010
New Revision: 79494
Log:
#7643: Unicode codepoints VT (0x0B) and FF (0x0C) are linebreaks according to Unicode Standard Annex #14.
Modified:
 python/trunk/Lib/test/test_unicodedata.py
 python/trunk/Misc/NEWS
 python/trunk/Objects/unicodeobject.c
 python/trunk/Objects/unicodetype_db.h
 python/trunk/Tools/unicode/makeunicodedata.py
Modified: python/trunk/Lib/test/test_unicodedata.py
==============================================================================

--- python/trunk/Lib/test/test_unicodedata.py	(original)
+++ python/trunk/Lib/test/test_unicodedata.py	Tue Mar 30 10:24:06 2010
@@ -24,7 +24,7 @@
 
 def test_method_checksum(self):
 h = hashlib.sha1()
- for i in range(65536):
+ for i in range(0x10000):
 char = unichr(i)
 data = [
 # Predicates (single char)
@@ -282,6 +282,17 @@
 self.assertEqual(u"\u01c5".title(), u"\u01c5")
 self.assertEqual(u"\u01c6".title(), u"\u01c5")
 
+ def test_linebreak_7643(self):
+ for i in range(0x10000):
+ lines = (unichr(i) + u'A').splitlines()
+ if i in (0x0a, 0x0b, 0x0c, 0x0d, 0x85,
+ 0x1c, 0x1d, 0x1e, 0x2028, 0x2029):
+ self.assertEqual(len(lines), 2,
+ r"\u%.4x should be a linebreak" % i)
+ else:
+ self.assertEqual(len(lines), 1,
+ r"\u%.4x should not be a linebreak" % i)
+
 def test_main():
 test.test_support.run_unittest(
 UnicodeMiscTest,
Modified: python/trunk/Misc/NEWS
==============================================================================
--- python/trunk/Misc/NEWS	(original)
+++ python/trunk/Misc/NEWS	Tue Mar 30 10:24:06 2010
@@ -32,6 +32,10 @@
 Library
 -------
 
+- Issue #7643: Unicode codepoints VT (0x0B) and FF (0x0C) are linebreaks
+ according to Unicode Standard Annex #14.
+ http://www.unicode.org/reports/tr14/
+
 - Comparisons using one of <, <=, >, >= between a complex instance and
 a Fractions instance now raise TypeError instead of returning
 True/False. This makes Fraction <=> complex comparisons consistent with
Modified: python/trunk/Objects/unicodeobject.c
==============================================================================
--- python/trunk/Objects/unicodeobject.c	(original)
+++ python/trunk/Objects/unicodeobject.c	Tue Mar 30 10:24:06 2010
@@ -115,9 +115,9 @@
 /* Fast detection of the most frequent whitespace characters */
 const unsigned char _Py_ascii_whitespace[] = {
 0, 0, 0, 0, 0, 0, 0, 0,
-/* case 0x0009: * HORIZONTAL TABULATION */
+/* case 0x0009: * CHARACTER TABULATION */
 /* case 0x000A: * LINE FEED */
-/* case 0x000B: * VERTICAL TABULATION */
+/* case 0x000B: * LINE TABULATION */
 /* case 0x000C: * FORM FEED */
 /* case 0x000D: * CARRIAGE RETURN */
 0, 1, 1, 1, 1, 1, 0, 0,
@@ -147,8 +147,10 @@
 static unsigned char ascii_linebreak[] = {
 0, 0, 0, 0, 0, 0, 0, 0,
 /* 0x000A, * LINE FEED */
+/* 0x000B, * LINE TABULATION */
+/* 0x000C, * FORM FEED */
 /* 0x000D, * CARRIAGE RETURN */
- 0, 0, 1, 0, 0, 1, 0, 0,
+ 0, 0, 1, 1, 1, 1, 0, 0,
 0, 0, 0, 0, 0, 0, 0, 0,
 /* 0x001C, * FILE SEPARATOR */
 /* 0x001D, * GROUP SEPARATOR */
Modified: python/trunk/Objects/unicodetype_db.h
==============================================================================
--- python/trunk/Objects/unicodetype_db.h	(original)
+++ python/trunk/Objects/unicodetype_db.h	Tue Mar 30 10:24:06 2010
@@ -661,7 +661,7 @@
 };
 
 static unsigned char index2[] = {
- 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 3, 2, 2, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 3, 3, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 
 1, 1, 1, 1, 3, 3, 3, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 
 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 1, 1, 1, 1, 1, 1, 1, 14, 14, 14, 14, 
 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 
@@ -3313,13 +3313,16 @@
 #endif
 }
 
-/* Returns 1 for Unicode characters having the category 'Zl',
- * 'Zp' or type 'B', 0 otherwise.
+/* Returns 1 for Unicode characters having the line break
+ * property 'BK', 'CR', 'LF' or 'NL' or having bidirectional
+ * type 'B', 0 otherwise.
 */
 int _PyUnicode_IsLinebreak(register const Py_UNICODE ch)
 {
 switch (ch) {
 case 0x000A:
+ case 0x000B:
+ case 0x000C:
 case 0x000D:
 case 0x001C:
 case 0x001D:
Modified: python/trunk/Tools/unicode/makeunicodedata.py
==============================================================================
--- python/trunk/Tools/unicode/makeunicodedata.py	(original)
+++ python/trunk/Tools/unicode/makeunicodedata.py	Tue Mar 30 10:24:06 2010
@@ -36,6 +36,7 @@
 EASTASIAN_WIDTH = "EastAsianWidth%s.txt"
 UNIHAN = "Unihan%s.txt"
 DERIVEDNORMALIZATION_PROPS = "DerivedNormalizationProps%s.txt"
+LINE_BREAK = "LineBreak%s.txt"
 
 old_versions = ["3.2.0"]
 
@@ -50,6 +51,8 @@
 
 EASTASIANWIDTH_NAMES = [ "F", "H", "W", "Na", "A", "N" ]
 
+MANDATORY_LINE_BREAKS = [ "BK", "CR", "LF", "NL" ]
+
 # note: should match definitions in Objects/unicodectype.c
 ALPHA_MASK = 0x01
 DECIMAL_MASK = 0x02
@@ -71,7 +74,8 @@
 COMPOSITION_EXCLUSIONS % version,
 EASTASIAN_WIDTH % version,
 UNIHAN % version,
- DERIVEDNORMALIZATION_PROPS % version)
+ DERIVEDNORMALIZATION_PROPS % version,
+ LINE_BREAK % version)
 
 print len(filter(None, unicode.table)), "characters"
 
@@ -113,7 +117,7 @@
 bidirectional = BIDIRECTIONAL_NAMES.index(record[4])
 mirrored = record[9] == "Y"
 eastasianwidth = EASTASIANWIDTH_NAMES.index(record[15])
- normalizationquickcheck = record[16]
+ normalizationquickcheck = record[17]
 item = (
 category, combining, bidirectional, mirrored, eastasianwidth,
 normalizationquickcheck
@@ -365,13 +369,14 @@
 # extract database properties
 category = record[2]
 bidirectional = record[4]
+ properties = record[16]
 flags = 0
 delta = True
 if category in ["Lm", "Lt", "Lu", "Ll", "Lo"]:
 flags |= ALPHA_MASK
 if category == "Ll":
 flags |= LOWER_MASK
- if category == "Zl" or bidirectional == "B":
+ if 'Line_Break' in properties or bidirectional == "B":
 flags |= LINEBREAK_MASK
 linebreaks.append(char)
 if category == "Zs" or bidirectional in ("WS", "B", "S"):
@@ -524,8 +529,9 @@
 print >>fp
 
 # Generate code for _PyUnicode_IsLinebreak()
- print >>fp, "/* Returns 1 for Unicode characters having the category 'Zl',"
- print >>fp, " * 'Zp' or type 'B', 0 otherwise."
+ print >>fp, "/* Returns 1 for Unicode characters having the line break"
+ print >>fp, " * property 'BK', 'CR', 'LF' or 'NL' or having bidirectional"
+ print >>fp, " * type 'B', 0 otherwise."
 print >>fp, " */"
 print >>fp, 'int _PyUnicode_IsLinebreak(register const Py_UNICODE ch)'
 print >>fp, '{'
@@ -787,6 +793,9 @@
 elif k == 14:
 # change to simple titlecase mapping; ignore
 pass
+ elif k == 16:
+ # change to properties; not handled yet, so ignore
+ pass
 else:
 class Difference(Exception):pass
 raise Difference, (hex(i), k, old.table[i], new.table[i])
@@ -803,9 +812,15 @@
 # load a unicode-data file from disk
 
 class UnicodeData:
+ # Record structure:
+ # [ID, name, category, combining, bidi, decomp, (6)
+ # decimal, digit, numeric, bidi-mirrored, Unicode-1-name, (11)
+ # ISO-comment, uppercase, lowercase, titlecase, ea-width, (16)
+ # properties] (17)
 
 def __init__(self, filename, exclusions, eastasianwidth, unihan,
- derivednormalizationprops=None, expand=1):
+ derivednormalizationprops=None, linebreakprops=None,
+ expand=1):
 self.changed = []
 file = open(filename)
 table = [None] * 0x110000
@@ -868,6 +883,23 @@
 for i in range(0, 0x110000):
 if table[i] is not None:
 table[i].append(widths[i])
+
+ for i in range(0, 0x110000):
+ if table[i] is not None:
+ table[i].append(set())
+ if linebreakprops:
+ for s in open(linebreakprops):
+ s = s.partition('#')[0]
+ s = [i.strip() for i in s.split(';')]
+ if len(s) < 2 or s[1] not in MANDATORY_LINE_BREAKS:
+ continue
+ if '..' not in s[0]:
+ first = last = int(s[0], 16)
+ else:
+ first, last = [int(c, 16) for c in s[0].split('..')]
+ for char in range(first, last+1):
+ table[char][-1].add('Line_Break')
+
 if derivednormalizationprops:
 quickchecks = [0] * 0x110000 # default is Yes
 qc_order = 'NFD_QC NFKD_QC NFC_QC NFKC_QC'.split()