[Python-checkins] r67746 - in python/trunk: Lib/test/test_textwrap.py Lib/textwrap.py Misc/NEWS

Sun Dec 14 00:12:31 CET 2008

Author: antoine.pitrou
Date: Sun Dec 14 00:12:30 2008
New Revision: 67746
Log:
Issue #4163: Use unicode-friendly word splitting in the textwrap functions when given an unicode string.
Modified:
 python/trunk/Lib/test/test_textwrap.py
 python/trunk/Lib/textwrap.py
 python/trunk/Misc/NEWS
Modified: python/trunk/Lib/test/test_textwrap.py
==============================================================================

--- python/trunk/Lib/test/test_textwrap.py	(original)
+++ python/trunk/Lib/test/test_textwrap.py	Sun Dec 14 00:12:30 2008
@@ -174,7 +174,7 @@
 text = ("Python 1.0.0 was released on 1994-01-26. Python 1.0.1 was\n"
 "released on 1994-02-15.")
 
- self.check_wrap(text, 30, ['Python 1.0.0 was released on',
+ self.check_wrap(text, 35, ['Python 1.0.0 was released on',
 '1994-01-26. Python 1.0.1 was',
 'released on 1994-02-15.'])
 self.check_wrap(text, 40, ['Python 1.0.0 was released on 1994-01-26.',
@@ -353,6 +353,14 @@
 otext = self.wrapper.fill(text)
 assert isinstance(otext, unicode)
 
+ def test_no_split_at_umlaut(self):
+ text = u"Die Empf\xe4nger-Auswahl"
+ self.check_wrap(text, 13, [u"Die", u"Empf\xe4nger-", u"Auswahl"])
+
+ def test_umlaut_followed_by_dash(self):
+ text = u"aa \xe4\xe4-\xe4\xe4"
+ self.check_wrap(text, 7, [u"aa \xe4\xe4-", u"\xe4\xe4"])
+
 def test_split(self):
 # Ensure that the standard _split() method works as advertised
 # in the comments
Modified: python/trunk/Lib/textwrap.py
==============================================================================
--- python/trunk/Lib/textwrap.py	(original)
+++ python/trunk/Lib/textwrap.py	Sun Dec 14 00:12:30 2008
@@ -84,16 +84,16 @@
 # splits into
 # Hello/ /there/ /--/ /you/ /goof-/ball,/ /use/ /the/ /-b/ /option!
 # (after stripping out empty strings).
- wordsep_re = re.compile(
+ wordsep_re = (
 r'(\s+|' # any whitespace
- r'[^\s\w]*\w+[a-zA-Z]-(?=\w+[a-zA-Z])|' # hyphenated words
+ r'[^\s\w]*\w+[^0-9\W]-(?=\w+[^0-9\W])|' # hyphenated words
 r'(?<=[\w\!\"\'\&\.\,\?])-{2,}(?=\w))') # em-dash
 
 # This less funky little regex just split on recognized spaces. E.g.
 # "Hello there -- you goof-ball, use the -b option!"
 # splits into
 # Hello/ /there/ /--/ /you/ /goof-ball,/ /use/ /the/ /-b/ /option!/
- wordsep_simple_re = re.compile(r'(\s+)')
+ wordsep_simple_re = r'(\s+)'
 
 # XXX this is not locale- or charset-aware -- string.lowercase
 # is US-ASCII only (and therefore English-only)
@@ -160,10 +160,12 @@
 'use', ' ', 'the', ' ', '-b', ' ', option!'
 otherwise.
 """
- if self.break_on_hyphens is True:
- chunks = self.wordsep_re.split(text)
+ flags = re.UNICODE if isinstance(text, unicode) else 0
+ if self.break_on_hyphens:
+ pat = self.wordsep_re
 else:
- chunks = self.wordsep_simple_re.split(text)
+ pat = self.wordsep_simple_re
+ chunks = re.compile(pat, flags).split(text)
 chunks = filter(None, chunks) # remove empty chunks
 return chunks
 
Modified: python/trunk/Misc/NEWS
==============================================================================
--- python/trunk/Misc/NEWS	(original)
+++ python/trunk/Misc/NEWS	Sun Dec 14 00:12:30 2008
@@ -74,6 +74,9 @@
 Library
 -------
 
+- Issue #4163: Use unicode-friendly word splitting in the textwrap functions
+ when given an unicode string.
+
 - Issue #4616: TarFile.utime(): Restore directory times on Windows.
 
 - Issue #4084: Fix max, min, max_mag and min_mag Decimal methods to