[Python-checkins] cpython (merge 3.2 -> default): #13358: merge with 3.2.

Fri Nov 18 17:03:12 CET 2011

http://hg.python.org/cpython/rev/e12d2b9c88ef
changeset: 73610:e12d2b9c88ef
parent: 73607:cb6614e3438b
parent: 73609:0a32e7e3aa1f
user: Ezio Melotti <ezio.melotti at gmail.com>
date: Fri Nov 18 18:02:59 2011 +0200
summary:
 #13358: merge with 3.2.
files:
 Lib/html/parser.py | 7 ++++---
 Lib/test/test_htmlparser.py | 20 ++++++++++++++++++++
 Misc/NEWS | 2 ++
 3 files changed, 26 insertions(+), 3 deletions(-)

diff --git a/Lib/html/parser.py b/Lib/html/parser.py
--- a/Lib/html/parser.py
+++ b/Lib/html/parser.py
@@ -14,7 +14,6 @@
 # Regular expressions used for parsing
 
 interesting_normal = re.compile('[&<]')
-interesting_cdata = re.compile(r'<(/|\Z)')
 incomplete = re.compile('&[a-zA-Z#]')
 
 entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
@@ -149,8 +148,8 @@
 return self.__starttag_text
 
 def set_cdata_mode(self, elem):
- self.interesting = interesting_cdata
 self.cdata_elem = elem.lower()
+ self.interesting = re.compile(r'</\s*%s\s*>' % self.cdata_elem, re.I)
 
 def clear_cdata_mode(self):
 self.interesting = interesting_normal
@@ -168,6 +167,8 @@
 if match:
 j = match.start()
 else:
+ if self.cdata_elem:
+ break
 j = n
 if i < j: self.handle_data(rawdata[i:j])
 i = self.updatepos(i, j)
@@ -250,7 +251,7 @@
 else:
 assert 0, "interesting.search() lied"
 # end while
- if end and i < n:
+ if end and i < n and not self.cdata_elem:
 self.handle_data(rawdata[i:n])
 i = self.updatepos(i, n)
 self.rawdata = rawdata[i:]
diff --git a/Lib/test/test_htmlparser.py b/Lib/test/test_htmlparser.py
--- a/Lib/test/test_htmlparser.py
+++ b/Lib/test/test_htmlparser.py
@@ -301,7 +301,27 @@
 ("data", content),
 ("endtag", element_lower)])
 
+ def test_cdata_with_closing_tags(self):
+ # see issue #13358
+ # make sure that HTMLParser calls handle_data only once for each CDATA.
+ # The normal event collector normalizes the events in get_events,
+ # so we override it to return the original list of events.
+ class Collector(EventCollector):
+ def get_events(self):
+ return self.events
 
+ content = """<!-- not a comment --> &not-an-entity-ref;
+ <a href="" /> </p><p> <span></span></style>
+ '</script' + '>'"""
+ for element in [' script', 'script ', ' script ',
+ '\nscript', 'script\n', '\nscript\n']:
+ element_lower = element.lower().strip()
+ s = '<script>{content}</{element}>'.format(element=element,
+ content=content)
+ self._run_check(s, [("starttag", element_lower, []),
+ ("data", content),
+ ("endtag", element_lower)],
+ collector=Collector())
 
 class HTMLParserTolerantTestCase(HTMLParserStrictTestCase):
 
diff --git a/Misc/NEWS b/Misc/NEWS
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -377,6 +377,8 @@
 Library
 -------
 
+- Issue #13358: HTMLParser now calls handle_data only once for each CDATA.
+
 - Issue #4147: minidom's toprettyxml no longer adds whitespace around a text
 node when it is the only child of an element. Initial patch by Dan
 Kenigsberg.
-- 
Repository URL: http://hg.python.org/cpython