Commit 509ea39

miss-islington and serhiy-storchaka authored
[3.13] gh-88375, gh-111788: Fix parsing errors and normalization in robotparser (GH-138502) (GH-138549)
* Don't fail trying to parse weird patterns.
* Don't fail trying to decode non-UTF-8 "robots.txt" files.
* No longer ignore trailing "?" in patterns and URLs.
* Distinguish raw special characters "?", "=" and "&" from the percent-encoded ones.
* Remove tests that do nothing.

(cherry picked from commit cb7ef18)

Co-authored-by: Serhiy Storchaka <storchaka@gmail.com>
1 parent 5f61a0a commit 509ea39
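
The user-visible effect of the normalization changes is easiest to see with a small example modeled on the updated DisallowQueryStringTest below. This is a minimal sketch, assuming a Python 3.13 build that already contains this commit; the example.com URLs and the 'mybot' agent name are made up for illustration.

    import urllib.robotparser

    parser = urllib.robotparser.RobotFileParser()
    parser.parse([
        'User-agent: *',
        'Disallow: /some/path?name=value',
        'Disallow: /another/path?',
    ])

    agent = 'mybot'
    # A raw "=" in the rule no longer matches its percent-encoded form...
    print(parser.can_fetch(agent, 'http://example.com/some/path'))               # True
    print(parser.can_fetch(agent, 'http://example.com/some/path?name%3Dvalue'))  # True
    # ...while the literal query string and a trailing "?" are now disallowed.
    print(parser.can_fetch(agent, 'http://example.com/some/path?name=value'))    # False
    print(parser.can_fetch(agent, 'http://example.com/another/path?'))           # False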

File tree

4 files changed (+170, -31 lines)

Lib/test/test_robotparser.py

Lines changed: 141 additions & 20 deletions
@@ -16,6 +16,14 @@ class BaseRobotTest:
     bad = []
     site_maps = None
 
+    def __init_subclass__(cls):
+        super().__init_subclass__()
+        # Remove tests that do nothing.
+        if not cls.good:
+            cls.test_good_urls = None
+        if not cls.bad:
+            cls.test_bad_urls = None
+
     def setUp(self):
         lines = io.StringIO(self.robots_txt).readlines()
         self.parser = urllib.robotparser.RobotFileParser()
@@ -231,9 +239,16 @@ class DisallowQueryStringTest(BaseRobotTest, unittest.TestCase):
     robots_txt = """\
 User-agent: *
 Disallow: /some/path?name=value
+Disallow: /another/path?
+Disallow: /yet/one/path?name=value&more
 """
-    good = ['/some/path']
-    bad = ['/some/path?name=value']
+    good = ['/some/path', '/some/path?',
+            '/some/path%3Fname=value', '/some/path?name%3Dvalue',
+            '/another/path', '/another/path%3F',
+            '/yet/one/path?name=value%26more']
+    bad = ['/some/path?name=value'
+           '/another/path?', '/another/path?name=value',
+           '/yet/one/path?name=value&more']
 
 
 class UseFirstUserAgentWildcardTest(BaseRobotTest, unittest.TestCase):
@@ -249,15 +264,79 @@ class UseFirstUserAgentWildcardTest(BaseRobotTest, unittest.TestCase):
     bad = ['/some/path']
 
 
-class EmptyQueryStringTest(BaseRobotTest, unittest.TestCase):
-    # normalize the URL first (#17403)
+class PercentEncodingTest(BaseRobotTest, unittest.TestCase):
     robots_txt = """\
 User-agent: *
-Allow: /some/path?
-Disallow: /another/path?
-"""
-    good = ['/some/path?']
-    bad = ['/another/path?']
+Disallow: /a1/Z-._~ # unreserved characters
+Disallow: /a2/%5A%2D%2E%5F%7E # percent-encoded unreserved characters
+Disallow: /u1/%F0%9F%90%8D # percent-encoded ASCII Unicode character
+Disallow: /u2/%f0%9f%90%8d
+Disallow: /u3/\U0001f40d # raw non-ASCII Unicode character
+Disallow: /v1/%F0 # percent-encoded non-ASCII octet
+Disallow: /v2/%f0
+Disallow: /v3/\udcf0 # raw non-ASCII octet
+Disallow: /p1%xy # raw percent
+Disallow: /p2%
+Disallow: /p3%25xy # percent-encoded percent
+Disallow: /p4%2525xy # double percent-encoded percent
+Disallow: /john%20smith # space
+Disallow: /john doe
+Disallow: /trailingspace%20
+Disallow: /question%3Fq=v # not query
+Disallow: /hash%23f # not fragment
+Disallow: /dollar%24
+Disallow: /asterisk%2A
+Disallow: /sub/dir
+Disallow: /slash%2F
+Disallow: /query/question?q=%3F
+Disallow: /query/raw/question?q=?
+Disallow: /query/eq?q%3Dv
+Disallow: /query/amp?q=v%26a
+"""
+    good = [
+        '/u1/%F0', '/u1/%f0',
+        '/u2/%F0', '/u2/%f0',
+        '/u3/%F0', '/u3/%f0',
+        '/p1%2525xy', '/p2%f0', '/p3%2525xy', '/p4%xy', '/p4%25xy',
+        '/question?q=v',
+        '/dollar', '/asterisk',
+        '/query/eq?q=v',
+        '/query/amp?q=v&a',
+    ]
+    bad = [
+        '/a1/Z-._~', '/a1/%5A%2D%2E%5F%7E',
+        '/a2/Z-._~', '/a2/%5A%2D%2E%5F%7E',
+        '/u1/%F0%9F%90%8D', '/u1/%f0%9f%90%8d', '/u1/\U0001f40d',
+        '/u2/%F0%9F%90%8D', '/u2/%f0%9f%90%8d', '/u2/\U0001f40d',
+        '/u3/%F0%9F%90%8D', '/u3/%f0%9f%90%8d', '/u3/\U0001f40d',
+        '/v1/%F0', '/v1/%f0', '/v1/\udcf0', '/v1/\U0001f40d',
+        '/v2/%F0', '/v2/%f0', '/v2/\udcf0', '/v2/\U0001f40d',
+        '/v3/%F0', '/v3/%f0', '/v3/\udcf0', '/v3/\U0001f40d',
+        '/p1%xy', '/p1%25xy',
+        '/p2%', '/p2%25', '/p2%2525', '/p2%xy',
+        '/p3%xy', '/p3%25xy',
+        '/p4%2525xy',
+        '/john%20smith', '/john smith',
+        '/john%20doe', '/john doe',
+        '/trailingspace%20', '/trailingspace ',
+        '/question%3Fq=v',
+        '/hash#f', '/hash%23f',
+        '/dollar$', '/dollar%24',
+        '/asterisk*', '/asterisk%2A',
+        '/sub/dir', '/sub%2Fdir',
+        '/slash%2F', '/slash/',
+        '/query/question?q=?', '/query/question?q=%3F',
+        '/query/raw/question?q=?', '/query/raw/question?q=%3F',
+        '/query/eq?q%3Dv',
+        '/query/amp?q=v%26a',
+    ]
+    # other reserved characters
+    for c in ":/[]@!$&'()*+,;=":
+        robots_txt += f'Disallow: /raw{c}\nDisallow: /pc%{ord(c):02X}\n'
+        bad.append(f'/raw{c}')
+        bad.append(f'/raw%{ord(c):02X}')
+        bad.append(f'/pc{c}')
+        bad.append(f'/pc%{ord(c):02X}')
 
 
 class DefaultEntryTest(BaseRequestRateTest, unittest.TestCase):
@@ -299,26 +378,17 @@ def test_string_formatting(self):
         self.assertEqual(str(self.parser), self.expected_output)
 
 
-class RobotHandler(BaseHTTPRequestHandler):
-
-    def do_GET(self):
-        self.send_error(403, "Forbidden access")
-
-    def log_message(self, format, *args):
-        pass
-
-
 @unittest.skipUnless(
     support.has_socket_support,
     "Socket server requires working socket."
 )
-class PasswordProtectedSiteTestCase(unittest.TestCase):
+class BaseLocalNetworkTestCase:
 
     def setUp(self):
         # clear _opener global variable
         self.addCleanup(urllib.request.urlcleanup)
 
-        self.server = HTTPServer((socket_helper.HOST, 0), RobotHandler)
+        self.server = HTTPServer((socket_helper.HOST, 0), self.RobotHandler)
 
         self.t = threading.Thread(
             name='HTTPServer serving',
@@ -335,6 +405,57 @@ def tearDown(self):
         self.t.join()
         self.server.server_close()
 
+
+SAMPLE_ROBOTS_TXT = b'''\
+User-agent: test_robotparser
+Disallow: /utf8/\xf0\x9f\x90\x8d
+Disallow: /non-utf8/\xf0
+Disallow: //[spam]/path
+'''
+
+
+class LocalNetworkTestCase(BaseLocalNetworkTestCase, unittest.TestCase):
+    class RobotHandler(BaseHTTPRequestHandler):
+
+        def do_GET(self):
+            self.send_response(200)
+            self.end_headers()
+            self.wfile.write(SAMPLE_ROBOTS_TXT)
+
+        def log_message(self, format, *args):
+            pass
+
+    @threading_helper.reap_threads
+    def testRead(self):
+        # Test that reading a weird robots.txt doesn't fail.
+        addr = self.server.server_address
+        url = f'http://{socket_helper.HOST}:{addr[1]}'
+        robots_url = url + '/robots.txt'
+        parser = urllib.robotparser.RobotFileParser()
+        parser.set_url(robots_url)
+        parser.read()
+        # And it can even interpret the weird paths in some reasonable way.
+        agent = 'test_robotparser'
+        self.assertTrue(parser.can_fetch(agent, robots_url))
+        self.assertTrue(parser.can_fetch(agent, url + '/utf8/'))
+        self.assertFalse(parser.can_fetch(agent, url + '/utf8/\U0001f40d'))
+        self.assertFalse(parser.can_fetch(agent, url + '/utf8/%F0%9F%90%8D'))
+        self.assertFalse(parser.can_fetch(agent, url + '/utf8/\U0001f40d'))
+        self.assertTrue(parser.can_fetch(agent, url + '/non-utf8/'))
+        self.assertFalse(parser.can_fetch(agent, url + '/non-utf8/%F0'))
+        self.assertFalse(parser.can_fetch(agent, url + '/non-utf8/\U0001f40d'))
+        self.assertFalse(parser.can_fetch(agent, url + '/%2F[spam]/path'))
+
+
+class PasswordProtectedSiteTestCase(BaseLocalNetworkTestCase, unittest.TestCase):
+    class RobotHandler(BaseHTTPRequestHandler):
+
+        def do_GET(self):
+            self.send_error(403, "Forbidden access")
+
+        def log_message(self, format, *args):
+            pass
+
     @threading_helper.reap_threads
     def testPasswordProtectedSite(self):
         addr = self.server.server_address

Lib/urllib/robotparser.py

Lines changed: 22 additions & 11 deletions
@@ -11,6 +11,7 @@
 """
 
 import collections
+import re
 import urllib.error
 import urllib.parse
 import urllib.request
@@ -20,6 +21,19 @@
 RequestRate = collections.namedtuple("RequestRate", "requests seconds")
 
 
+def normalize(path):
+    unquoted = urllib.parse.unquote(path, errors='surrogateescape')
+    return urllib.parse.quote(unquoted, errors='surrogateescape')
+
+def normalize_path(path):
+    path, sep, query = path.partition('?')
+    path = normalize(path)
+    if sep:
+        query = re.sub(r'[^=&]+', lambda m: normalize(m[0]), query)
+        path += '?' + query
+    return path
+
+
 class RobotFileParser:
     """ This class provides a set of methods to read, parse and answer
     questions about a single robots.txt file.
@@ -55,7 +69,7 @@ def modified(self):
     def set_url(self, url):
         """Sets the URL referring to a robots.txt file."""
         self.url = url
-        self.host, self.path = urllib.parse.urlparse(url)[1:3]
+        self.host, self.path = urllib.parse.urlsplit(url)[1:3]
 
     def read(self):
         """Reads the robots.txt URL and feeds it to the parser."""
@@ -69,7 +83,7 @@ def read(self):
             err.close()
         else:
             raw = f.read()
-            self.parse(raw.decode("utf-8").splitlines())
+            self.parse(raw.decode("utf-8", "surrogateescape").splitlines())
 
     def _add_entry(self, entry):
         if "*" in entry.useragents:
@@ -113,7 +127,7 @@ def parse(self, lines):
             line = line.split(':', 1)
             if len(line) == 2:
                 line[0] = line[0].strip().lower()
-                line[1] = urllib.parse.unquote(line[1].strip())
+                line[1] = line[1].strip()
                 if line[0] == "user-agent":
                     if state == 2:
                         self._add_entry(entry)
@@ -167,10 +181,9 @@ def can_fetch(self, useragent, url):
             return False
         # search for given user agent matches
         # the first match counts
-        parsed_url = urllib.parse.urlparse(urllib.parse.unquote(url))
-        url = urllib.parse.urlunparse(('','',parsed_url.path,
-            parsed_url.params,parsed_url.query, parsed_url.fragment))
-        url = urllib.parse.quote(url)
+        parsed_url = urllib.parse.urlsplit(url)
+        url = urllib.parse.urlunsplit(('', '', *parsed_url[2:]))
+        url = normalize_path(url)
         if not url:
             url = "/"
         for entry in self.entries:
@@ -213,16 +226,14 @@ def __str__(self):
             entries = entries + [self.default_entry]
         return '\n\n'.join(map(str, entries))
 
-
 class RuleLine:
     """A rule line is a single "Allow:" (allowance==True) or "Disallow:"
     (allowance==False) followed by a path."""
     def __init__(self, path, allowance):
         if path == '' and not allowance:
             # an empty value means allow all
             allowance = True
-        path = urllib.parse.urlunparse(urllib.parse.urlparse(path))
-        self.path = urllib.parse.quote(path)
+        self.path = normalize_path(path)
         self.allowance = allowance
 
     def applies_to(self, filename):
@@ -268,7 +279,7 @@ def applies_to(self, useragent):
     def allowance(self, filename):
         """Preconditions:
         - our agent applies to this entry
-        - filename is URL decoded"""
+        - filename is URL encoded"""
         for line in self.rulelines:
             if line.applies_to(filename):
                 return line.allowance
Lines changed: 3 additions & 0 deletions

@@ -0,0 +1,3 @@
+Fix parsing errors in the :mod:`urllib.robotparser` module.
+Don't fail trying to parse weird paths.
+Don't fail trying to decode non-UTF-8 ``robots.txt`` files.
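
The same fix can be exercised locally without the test suite's HTTP server. The sketch below reuses the bytes of SAMPLE_ROBOTS_TXT from the new test and decodes them with errors='surrogateescape', which is what read() now does internally; example.com is only a placeholder host.

    import urllib.robotparser

    SAMPLE_ROBOTS_TXT = (b"User-agent: test_robotparser\n"
                         b"Disallow: /utf8/\xf0\x9f\x90\x8d\n"
                         b"Disallow: /non-utf8/\xf0\n"
                         b"Disallow: //[spam]/path\n")

    parser = urllib.robotparser.RobotFileParser()
    # Non-UTF-8 bytes and odd patterns such as "//[spam]/path" no longer
    # make parsing blow up.
    parser.parse(SAMPLE_ROBOTS_TXT.decode("utf-8", "surrogateescape").splitlines())

    agent = "test_robotparser"
    print(parser.can_fetch(agent, "http://example.com/utf8/"))              # True
    print(parser.can_fetch(agent, "http://example.com/utf8/%F0%9F%90%8D"))  # False
    print(parser.can_fetch(agent, "http://example.com/non-utf8/%F0"))       # False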
Lines changed: 4 additions & 0 deletions

@@ -0,0 +1,4 @@
+Fix normalization of the ``robots.txt`` rules and URLs in the
+:mod:`urllib.robotparser` module. No longer ignore trailing ``?``.
+Distinguish raw special characters ``?``, ``=`` and ``&`` from the
+percent-encoded ones.
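
These normalization rules are implemented by the new module-level helper normalize_path() in Lib/urllib/robotparser.py. It is a private implementation detail rather than a documented API, but a quick sketch (assuming a build that contains this commit) illustrates the rules described above:

    from urllib.robotparser import normalize_path  # private helper, not a public API

    print(normalize_path('/another/path?'))           # '/another/path?'  -- trailing "?" is kept
    print(normalize_path('/a2/%5A%2D%2E%5F%7E'))      # '/a2/Z-._~'       -- unreserved chars are decoded
    print(normalize_path('/some/path?name=value'))    # '/some/path?name=value'
    print(normalize_path('/some/path?name%3Dvalue'))  # '/some/path?name%3Dvalue' -- %3D stays distinct from "="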
