changeset: 73159:41d41776aa6d branch: 3.2 parent: 73157:d1c2b62ff80c user: Ezio Melotti date: Fri Oct 28 13:21:09 2011 +0300 files: Lib/html/parser.py Lib/test/test_htmlparser.py Misc/NEWS description: #13273: fix a bug that prevented HTMLParser to properly detect some tags when strict=False. diff -r d1c2b62ff80c -r 41d41776aa6d Lib/html/parser.py --- a/Lib/html/parser.py Fri Oct 28 12:32:53 2011 +0300 +++ b/Lib/html/parser.py Fri Oct 28 13:21:09 2011 +0300 @@ -30,7 +30,7 @@ r'\s*([a-zA-Z_][-.:a-zA-Z_0-9]*)(\s*=\s*' r'(\'[^\']*\'|"[^"]*"|[^\s"\'=`]*))?') attrfind_tolerant = re.compile( - r'\s*([a-zA-Z_][-.:a-zA-Z_0-9]*)(\s*=\s*' + r',?\s*([a-zA-Z_][-.:a-zA-Z_0-9]*)(\s*=\s*' r'(\'[^\']*\'|"[^"]*"|[^>\s]*))?') locatestarttagend = re.compile(r""" <[a-zA-Z][-.a-zA-Z0-9:_]* # tag name @@ -277,12 +277,11 @@ assert match, 'unexpected call to parse_starttag()' k = match.end() self.lasttag = tag = rawdata[i+1:k].lower() - while k < endpos: if self.strict: m = attrfind.match(rawdata, k) else: - m = attrfind_tolerant.search(rawdata, k) + m = attrfind_tolerant.match(rawdata, k) if not m: break attrname, rest, attrvalue = m.group(1, 2, 3) diff -r d1c2b62ff80c -r 41d41776aa6d Lib/test/test_htmlparser.py --- a/Lib/test/test_htmlparser.py Fri Oct 28 12:32:53 2011 +0300 +++ b/Lib/test/test_htmlparser.py Fri Oct 28 13:21:09 2011 +0300 @@ -373,6 +373,39 @@ [('action', 'bogus|&#()value')])], collector = self.collector) + def test_issue13273(self): + html = ('
<div style=""    ><b>The <a href="some_url">rain</a> ' + '<br /> in <span>Spain</span></b></div>
') + expected = [ + ('starttag', 'div', [('style', '')]), + ('starttag', 'b', []), + ('data', 'The '), + ('starttag', 'a', [('href', 'some_url')]), + ('data', 'rain'), + ('endtag', 'a'), + ('data', ' '), + ('startendtag', 'br', []), + ('data', ' in '), + ('starttag', 'span', []), + ('data', 'Spain'), + ('endtag', 'span'), + ('endtag', 'b'), + ('endtag', 'div') + ] + self._run_check(html, expected, collector=self.collector) + + def test_issue13273_2(self): + html = '
<div style="", foo = "bar" ><b>The <a href="some_url">rain</a>' + expected = [ + ('starttag', 'div', [('style', ''), ('foo', 'bar')]), + ('starttag', 'b', []), + ('data', 'The '), + ('starttag', 'a', [('href', 'some_url')]), + ('data', 'rain'), + ('endtag', 'a'), + ] + self._run_check(html, expected, collector=self.collector) + def test_unescape_function(self): p = html.parser.HTMLParser() self.assertEqual(p.unescape('&#bad;'),'&#bad;') diff -r d1c2b62ff80c -r 41d41776aa6d Misc/NEWS --- a/Misc/NEWS Fri Oct 28 12:32:53 2011 +0300 +++ b/Misc/NEWS Fri Oct 28 13:21:09 2011 +0300 @@ -61,6 +61,9 @@ Library ------- +- Issue #13273: fix a bug that prevented HTMLParser to properly detect some + tags when strict=False. + - Issue #10332: multiprocessing: fix a race condition when a Pool is closed before all tasks have completed.

AltStyle によって変換されたページ (->オリジナル) /