[Python-checkins] cpython (3.2): #7311: fix html.parser to accept non-ASCII attribute values.

ezio.melotti python-checkins at python.org
Thu Apr 7 21:27:23 CEST 2011


http://hg.python.org/cpython/rev/225400cb6e84
changeset: 69189:225400cb6e84
branch: 3.2
parent: 69186:3d7c9b38fbfd
user: Ezio Melotti
date: Thu Apr 07 22:03:31 2011 +0300
summary:
 #7311: fix html.parser to accept non-ASCII attribute values.
files:
 Lib/html/parser.py | 2 +-
 Lib/test/test_htmlparser.py | 17 +++++++++++++++++
 Misc/NEWS | 2 ++
 3 files changed, 20 insertions(+), 1 deletions(-)
diff --git a/Lib/html/parser.py b/Lib/html/parser.py
--- a/Lib/html/parser.py
+++ b/Lib/html/parser.py
@@ -28,7 +28,7 @@
 # make it correctly strict without breaking backward compatibility.
 attrfind = re.compile(
 r'\s*([a-zA-Z_][-.:a-zA-Z_0-9]*)(\s*=\s*'
- r'(\'[^\']*\'|"[^"]*"|[-a-zA-Z0-9./,:;+*%?!&$\(\)_#=~@]*))?')
+ r'(\'[^\']*\'|"[^"]*"|[^\s"\'=<>`]*))?')
 attrfind_tolerant = re.compile(
 r'\s*([a-zA-Z_][-.:a-zA-Z_0-9]*)(\s*=\s*'
 r'(\'[^\']*\'|"[^"]*"|[^>\s]*))?')
diff --git a/Lib/test/test_htmlparser.py b/Lib/test/test_htmlparser.py
--- a/Lib/test/test_htmlparser.py
+++ b/Lib/test/test_htmlparser.py
@@ -217,6 +217,23 @@
 ("starttag", "a", [("href", "mailto:xyz@example.com")]),
 ])
 
+ def test_attr_nonascii(self):
+ # see issue 7311
+ self._run_check("<img src=/foo/bar.png alt=\u4e2d\u6587>", [
+ ("starttag", "img", [("src", "/foo/bar.png"),
+ ("alt", "\u4e2d\u6587")]),
+ ])
+ self._run_check("<a title='\u30c6\u30b9\u30c8' "
+ "href='\u30c6\u30b9\u30c8.html'>", [
+ ("starttag", "a", [("title", "\u30c6\u30b9\u30c8"),
+ ("href", "\u30c6\u30b9\u30c8.html")]),
+ ])
+ self._run_check('<a title="\u30c6\u30b9\u30c8" '
+ 'href="\u30c6\u30b9\u30c8.html">', [
+ ("starttag", "a", [("title", "\u30c6\u30b9\u30c8"),
+ ("href", "\u30c6\u30b9\u30c8.html")]),
+ ])
+
 def test_attr_entity_replacement(self):
 self._run_check("""<a b='&amp;&gt;&lt;&quot;&apos;'>""", [
 ("starttag", "a", [("b", "&><\"'")]),
diff --git a/Misc/NEWS b/Misc/NEWS
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -49,6 +49,8 @@
 Library
 -------
 
+- Issue #7311: fix html.parser to accept non-ASCII attribute values.
+
 - Issue #11605: email.parser.BytesFeedParser was incorrectly converting multipart
 subparts with an 8bit CTE into unicode instead of preserving the bytes.
 
-- 
Repository URL: http://hg.python.org/cpython


More information about the Python-checkins mailing list

AltStyle によって変換されたページ (->オリジナル) /