Index: Misc/NEWS =================================================================== --- Misc/NEWS (revision 64812) +++ Misc/NEWS (working copy) @@ -25,6 +25,11 @@ Library ------- +- Issue #2275: urllib2 header capitalization. Included a case-insensitive dict + lookup for headers interface. Headers sent to httplib will be .title()-ed + instead of capitalize()'d. Headers dictionary exposed and documented using + .headers and Request.get_header() + - Issue #2683: Fix inconsistency in subprocess.Popen.communicate(): the argument now must be a bytes object in any case. Index: Doc/library/urllib.request.rst =================================================================== --- Doc/library/urllib.request.rst (revision 64812) +++ Doc/library/urllib.request.rst (working copy) @@ -175,13 +175,16 @@ :func:`urllib.urlencode` function takes a mapping or sequence of 2-tuples and returns a string in this format. - *headers* should be a dictionary, and will be treated as if :meth:`add_header` - was called with each key and value as arguments. This is often used to "spoof" - the ``User-Agent`` header, which is used by a browser to identify itself -- - some HTTP servers only allow requests coming from common browsers as opposed - to scripts. For example, Mozilla Firefox may identify itself as ``"Mozilla/5.0 - (X11; U; Linux i686) Gecko/20071127 Firefox/2.0.0.11"``, while :mod:`urllib`'s - default user agent string is ``"Python-urllib/2.6"`` (on Python 2.6). + *headers* should be a dictionary, and will be treated as if + :meth:`add_header` was called with each key and value as arguments. + *headers* are internally stored as a special form of ``dict`` which provides + case-insensitive key lookup. Headers is often used to "spoof" the + ``User-Agent`` header, which is used by a browser to identify itself -- some + HTTP servers only allow requests coming from common browsers as opposed to + scripts. 
For example, Mozilla Firefox may identify itself as ``"Mozilla/5.0 + (X11; U; Linux i686) Gecko/20071127 Firefox/2.0.0.11"``, while + :mod:`urllib`'s default user agent string is ``"Python-urllib/2.6"`` (on + Python 2.6). The final two arguments are only of interest for correct handling of third-party HTTP cookies: @@ -1094,6 +1097,20 @@ :mailheader:`Content-Type` and :mailheader:`Host`) are added when the :class:`Request` is passed to :func:`urlopen` (or :meth:`OpenerDirector.open`). +Retrieving HTTP headers: + +To retrieve an HTTP header, either use the .headers attribute directly or use the +:meth:`Request.get_header` method. :: + +>>> from urllib.request import Request +>>> url = "http://example.com" +>>> req = Request(url, headers={"Spam-eggs": "blah"}) +>>> req.headers["Spam-eggs"] + 'blah' +>>> req.headers["Spam-Eggs"] + 'blah' +>>> req.get_header("Spam-Eggs") + 'blah' .. _urllib-examples: Here is an example session that uses the ``GET`` method to retrieve a URL Index: Lib/urllib/request.py =================================================================== --- Lib/urllib/request.py (revision 64812) +++ Lib/urllib/request.py (working copy) @@ -158,6 +158,37 @@ host = _cut_port_re.sub("", host, 1) return host.lower() +class CaseInsensitiveDict(dict): + def __init__(self, *args, **kwargs): + self.keystore = {} + d = dict(*args, **kwargs) + for k in d.keys(): + self.keystore[self._get_lower(k)] = k + return super(CaseInsensitiveDict, self).__init__(*args, **kwargs) + def __setitem__(self, k, v): + if hasattr(self,'keystore'): + self.keystore[self._get_lower(k)] = k + return super(CaseInsensitiveDict, self).__setitem__(k, v) + def __getitem__(self, k): + if hasattr(self,'keystore') and self._get_lower(k) in self.keystore: + k = self.keystore[self._get_lower(k)] + return super(CaseInsensitiveDict, self).__getitem__(k) + def __contains__(self, k): + if hasattr(self,'keystore') and self._get_lower(k) in self.keystore: + k = self.keystore[self._get_lower(k)] + return 
super(CaseInsensitiveDict, self).__contains__(k) + def get(self, k, failobj=None): + if hasattr(self,'keystore') and self._get_lower(k) in self.keystore: + k = self.keystore[self._get_lower(k)] + return super(CaseInsensitiveDict, self).get(k, failobj) + + @staticmethod + def _get_lower(k): + if isinstance(k, str): + return k.lower() + else: + return k + class Request: def __init__(self, url, data=None, headers={}, @@ -169,10 +200,10 @@ self.host = None self.port = None self.data = data - self.headers = {} + self.headers = CaseInsensitiveDict() for key, value in headers.items(): self.add_header(key, value) - self.unredirected_hdrs = {} + self.unredirected_hdrs = CaseInsensitiveDict() if origin_req_host is None: origin_req_host = request_host(self) self.origin_req_host = origin_req_host @@ -239,11 +270,11 @@ def add_header(self, key, val): # useful for something like authentication - self.headers[key.capitalize()] = val + self.headers[key.title()] = val def add_unredirected_header(self, key, val): # will not be added to a redirected request - self.unredirected_hdrs[key.capitalize()] = val + self.unredirected_hdrs[key.title()] = val def has_header(self, header_name): return (header_name in self.headers or @@ -1002,13 +1033,13 @@ if request.has_data(): # POST data = request.get_data() - if not request.has_header('Content-type'): + if not request.has_header('Content-Type'): request.add_unredirected_header( - 'Content-type', + 'Content-Type', 'application/x-www-form-urlencoded') - if not request.has_header('Content-length'): + if not request.has_header('Content-Length'): request.add_unredirected_header( - 'Content-length', '%d' % len(data)) + 'Content-Length', '%d' % len(data)) scheme, sel = splittype(request.get_selector()) sel_host, sel_path = splithost(sel) Index: Lib/test/test_urllib2.py =================================================================== --- Lib/test/test_urllib2.py (revision 64812) +++ Lib/test/test_urllib2.py (working copy) @@ -48,56 +48,55 @@ def 
test_request_headers_dict(): """ - The Request.headers dictionary is not a documented interface. It should - stay that way, because the complete set of headers are only accessible - through the .get_header(), .has_header(), .header_items() interface. - However, .headers pre-dates those methods, and so real code will be using - the dictionary. + Check case-insensitive dict lookup, so that any form of key (.capitalize() + or .title()) can look up the headers dict. - The introduction in 2.4 of those methods was a mistake for the same reason: - code that previously saw all (urllib2 user)-provided headers in .headers - now sees only a subset (and the function interface is ugly and incomplete). - A better change would have been to replace .headers dict with a dict - subclass (or UserDict.DictMixin instance?) that preserved the .headers - interface and also provided access to the "unredirected" headers. It's - probably too late to fix that, though. - - - Check .capitalize() case normalization: - >>> url = "http://example.com" >>> Request(url, headers={"Spam-eggs": "blah"}).headers["Spam-eggs"] 'blah' >>> Request(url, headers={"spam-EggS": "blah"}).headers["Spam-eggs"] 'blah' - - Currently, Request(url, "Spam-eggs").headers["Spam-Eggs"] raises KeyError, - but that could be changed in future. - +>>> Request(url, headers={"Spam-eggs":"blah"}).headers["Spam-Eggs"] + 'blah' +>>> Request(url, headers={"SpaM-EggS":"blah"}).headers["spam-eggs"] + 'blah' """ def test_request_headers_methods(): """ - Note the case normalization of header names here, to .capitalize()-case. - This should be preserved for backwards-compatibility. (In the HTTP case, - normalization to .title()-case is done by urllib2 before sending headers to - http.client). + Note the case normalization of header names here, to .title()-case + (Issue #2275). With the addition of case-insensitive dict lookup, backward + compatibility, as in retrieval using .capitalize() case, is maintained. 
>>> url = "http://example.com" >>> r = Request(url, headers={"Spam-eggs": "blah"}) ->>> r.has_header("Spam-eggs") +>>> r.has_header("Spam-Eggs") True >>> r.header_items() - [('Spam-eggs', 'blah')] + [('Spam-Eggs', 'blah')] >>> r.add_header("Foo-Bar", "baz") >>> items = sorted(r.header_items()) >>> items - [('Foo-bar', 'baz'), ('Spam-eggs', 'blah')] + [('Foo-Bar', 'baz'), ('Spam-Eggs', 'blah')] - Note that e.g. r.has_header("spam-EggS") is currently False, and - r.get_header("spam-EggS") returns None, but that could be changed in - future. + Examples below demonstrate Case Insensitive Dict lookup. +>>> r.has_header("Spam-eggs") # .capitalize() case + True +>>> r.has_header("spam-EggS") + True +>>> r.has_header("sPaM-EggS") + True +>>> r.get_header("Spam-eggs") # .capitalize() case + 'blah' +>>> r.get_header("spam-EGGS") + 'blah' +>>> r.get_header("sPaM-EggS") + 'blah' + + Invalid and Default value scenarios + + >>> r.has_header("Not-there") False >>> print(r.get_header("Not-there")) @@ -749,24 +748,24 @@ r = MockResponse(200, "OK", {}, "") newreq = h.do_request_(req) if data is None: # GET - self.assert_("Content-length" not in req.unredirected_hdrs) - self.assert_("Content-type" not in req.unredirected_hdrs) + self.assert_("Content-Length" not in req.unredirected_hdrs) + self.assert_("Content-Type" not in req.unredirected_hdrs) else: # POST - self.assertEqual(req.unredirected_hdrs["Content-length"], "0") - self.assertEqual(req.unredirected_hdrs["Content-type"], + self.assertEqual(req.unredirected_hdrs["Content-Length"], "0") + self.assertEqual(req.unredirected_hdrs["Content-Type"], "application/x-www-form-urlencoded") # XXX the details of Host could be better tested self.assertEqual(req.unredirected_hdrs["Host"], "example.com") self.assertEqual(req.unredirected_hdrs["Spam"], "eggs") # don't clobber existing headers - req.add_unredirected_header("Content-length", "foo") - req.add_unredirected_header("Content-type", "bar") + 
req.add_unredirected_header("Content-Length", "foo") + req.add_unredirected_header("Content-Type", "bar") req.add_unredirected_header("Host", "baz") req.add_unredirected_header("Spam", "foo") newreq = h.do_request_(req) - self.assertEqual(req.unredirected_hdrs["Content-length"], "foo") - self.assertEqual(req.unredirected_hdrs["Content-type"], "bar") + self.assertEqual(req.unredirected_hdrs["Content-Length"], "foo") + self.assertEqual(req.unredirected_hdrs["Content-Type"], "bar") self.assertEqual(req.unredirected_hdrs["Host"], "baz") self.assertEqual(req.unredirected_hdrs["Spam"], "foo") @@ -940,7 +939,7 @@ 407, 'Proxy-Authenticate: Basic realm="%s"\r\n\r\n' % realm) opener.add_handler(auth_handler) opener.add_handler(http_handler) - self._test_basic_auth(opener, auth_handler, "Proxy-authorization", + self._test_basic_auth(opener, auth_handler, "Proxy-Authorization", realm, http_handler, password_manager, "http://acme.example.com:3128/protected", "proxy.example.com:3128",