Commit 509ea39

miss-islington and serhiy-storchaka authored
[3.13] gh-88375, gh-111788: Fix parsing errors and normalization in robotparser (GH-138502) (GH-138549)
* Don't fail trying to parse weird patterns.
* Don't fail trying to decode non-UTF-8 "robots.txt" files.
* No longer ignore trailing "?" in patterns and URLs.
* Distinguish raw special characters "?", "=" and "&" from the percent-encoded ones.
* Remove tests that do nothing.

(cherry picked from commit cb7ef18)

Co-authored-by: Serhiy Storchaka <storchaka@gmail.com>
1 parent 5f61a0a commit 509ea39
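
The user-visible effect of the normalization changes is easiest to see with a small example modeled on the updated DisallowQueryStringTest below. This is a minimal sketch, assuming a Python 3.13 build that already contains this commit; the example.com URLs and the 'mybot' agent name are made up for illustration.

    import urllib.robotparser

    parser = urllib.robotparser.RobotFileParser()
    parser.parse([
        'User-agent: *',
        'Disallow: /some/path?name=value',
        'Disallow: /another/path?',
    ])

    agent = 'mybot'
    # A raw "=" in the rule no longer matches its percent-encoded form...
    print(parser.can_fetch(agent, 'http://example.com/some/path'))               # True
    print(parser.can_fetch(agent, 'http://example.com/some/path?name%3Dvalue'))  # True
    # ...while the literal query string and a trailing "?" are now disallowed.
    print(parser.can_fetch(agent, 'http://example.com/some/path?name=value'))    # False
    print(parser.can_fetch(agent, 'http://example.com/another/path?'))           # False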

File tree

4 files changed (+170, -31 lines)

Lib/test/test_robotparser.py

Lines changed: 141 additions & 20 deletions
@@ -16,6 +16,14 @@ class BaseRobotTest:
     bad = []
     site_maps = None
 
+    def __init_subclass__(cls):
+        super().__init_subclass__()
+        # Remove tests that do nothing.
+        if not cls.good:
+            cls.test_good_urls = None
+        if not cls.bad:
+            cls.test_bad_urls = None
+
     def setUp(self):
         lines = io.StringIO(self.robots_txt).readlines()
         self.parser = urllib.robotparser.RobotFileParser()
@@ -231,9 +239,16 @@ class DisallowQueryStringTest(BaseRobotTest, unittest.TestCase):
     robots_txt = """\
 User-agent: *
 Disallow: /some/path?name=value
+Disallow: /another/path?
+Disallow: /yet/one/path?name=value&more
 """
-    good = ['/some/path']
-    bad = ['/some/path?name=value']
+    good = ['/some/path', '/some/path?',
+            '/some/path%3Fname=value', '/some/path?name%3Dvalue',
+            '/another/path', '/another/path%3F',
+            '/yet/one/path?name=value%26more']
+    bad = ['/some/path?name=value'
+           '/another/path?', '/another/path?name=value',
+           '/yet/one/path?name=value&more']
 
 
 class UseFirstUserAgentWildcardTest(BaseRobotTest, unittest.TestCase):
@@ -249,15 +264,79 @@ class UseFirstUserAgentWildcardTest(BaseRobotTest, unittest.TestCase):
     bad = ['/some/path']
 
 
-class EmptyQueryStringTest(BaseRobotTest, unittest.TestCase):
-    # normalize the URL first (#17403)
+class PercentEncodingTest(BaseRobotTest, unittest.TestCase):
     robots_txt = """\
 User-agent: *
-Allow: /some/path?
-Disallow: /another/path?
-"""
-    good = ['/some/path?']
-    bad = ['/another/path?']
+Disallow: /a1/Z-._~ # unreserved characters
+Disallow: /a2/%5A%2D%2E%5F%7E # percent-encoded unreserved characters
+Disallow: /u1/%F0%9F%90%8D # percent-encoded ASCII Unicode character
+Disallow: /u2/%f0%9f%90%8d
+Disallow: /u3/\U0001f40d # raw non-ASCII Unicode character
+Disallow: /v1/%F0 # percent-encoded non-ASCII octet
+Disallow: /v2/%f0
+Disallow: /v3/\udcf0 # raw non-ASCII octet
+Disallow: /p1%xy # raw percent
+Disallow: /p2%
+Disallow: /p3%25xy # percent-encoded percent
+Disallow: /p4%2525xy # double percent-encoded percent
+Disallow: /john%20smith # space
+Disallow: /john doe
+Disallow: /trailingspace%20
+Disallow: /question%3Fq=v # not query
+Disallow: /hash%23f # not fragment
+Disallow: /dollar%24
+Disallow: /asterisk%2A
+Disallow: /sub/dir
+Disallow: /slash%2F
+Disallow: /query/question?q=%3F
+Disallow: /query/raw/question?q=?
+Disallow: /query/eq?q%3Dv
+Disallow: /query/amp?q=v%26a
+"""
+    good = [
+        '/u1/%F0', '/u1/%f0',
+        '/u2/%F0', '/u2/%f0',
+        '/u3/%F0', '/u3/%f0',
+        '/p1%2525xy', '/p2%f0', '/p3%2525xy', '/p4%xy', '/p4%25xy',
+        '/question?q=v',
+        '/dollar', '/asterisk',
+        '/query/eq?q=v',
+        '/query/amp?q=v&a',
+    ]
+    bad = [
+        '/a1/Z-._~', '/a1/%5A%2D%2E%5F%7E',
+        '/a2/Z-._~', '/a2/%5A%2D%2E%5F%7E',
+        '/u1/%F0%9F%90%8D', '/u1/%f0%9f%90%8d', '/u1/\U0001f40d',
+        '/u2/%F0%9F%90%8D', '/u2/%f0%9f%90%8d', '/u2/\U0001f40d',
+        '/u3/%F0%9F%90%8D', '/u3/%f0%9f%90%8d', '/u3/\U0001f40d',
+        '/v1/%F0', '/v1/%f0', '/v1/\udcf0', '/v1/\U0001f40d',
+        '/v2/%F0', '/v2/%f0', '/v2/\udcf0', '/v2/\U0001f40d',
+        '/v3/%F0', '/v3/%f0', '/v3/\udcf0', '/v3/\U0001f40d',
+        '/p1%xy', '/p1%25xy',
+        '/p2%', '/p2%25', '/p2%2525', '/p2%xy',
+        '/p3%xy', '/p3%25xy',
+        '/p4%2525xy',
+        '/john%20smith', '/john smith',
+        '/john%20doe', '/john doe',
+        '/trailingspace%20', '/trailingspace ',
+        '/question%3Fq=v',
+        '/hash#f', '/hash%23f',
+        '/dollar$', '/dollar%24',
+        '/asterisk*', '/asterisk%2A',
+        '/sub/dir', '/sub%2Fdir',
+        '/slash%2F', '/slash/',
+        '/query/question?q=?', '/query/question?q=%3F',
+        '/query/raw/question?q=?', '/query/raw/question?q=%3F',
+        '/query/eq?q%3Dv',
+        '/query/amp?q=v%26a',
+    ]
+    # other reserved characters
+    for c in ":/[]@!$&'()*+,;=":
+        robots_txt += f'Disallow: /raw{c}\nDisallow: /pc%{ord(c):02X}\n'
+        bad.append(f'/raw{c}')
+        bad.append(f'/raw%{ord(c):02X}')
+        bad.append(f'/pc{c}')
+        bad.append(f'/pc%{ord(c):02X}')
 
 
 class DefaultEntryTest(BaseRequestRateTest, unittest.TestCase):
@@ -299,26 +378,17 @@ def test_string_formatting(self):
         self.assertEqual(str(self.parser), self.expected_output)
 
 
-class RobotHandler(BaseHTTPRequestHandler):
-
-    def do_GET(self):
-        self.send_error(403, "Forbidden access")
-
-    def log_message(self, format, *args):
-        pass
-
-
 @unittest.skipUnless(
     support.has_socket_support,
     "Socket server requires working socket."
 )
-class PasswordProtectedSiteTestCase(unittest.TestCase):
+class BaseLocalNetworkTestCase:
 
     def setUp(self):
         # clear _opener global variable
         self.addCleanup(urllib.request.urlcleanup)
 
-        self.server = HTTPServer((socket_helper.HOST, 0), RobotHandler)
+        self.server = HTTPServer((socket_helper.HOST, 0), self.RobotHandler)
 
         self.t = threading.Thread(
             name='HTTPServer serving',
@@ -335,6 +405,57 @@ def tearDown(self):
         self.t.join()
         self.server.server_close()
 
+
+SAMPLE_ROBOTS_TXT = b'''\
+User-agent: test_robotparser
+Disallow: /utf8/\xf0\x9f\x90\x8d
+Disallow: /non-utf8/\xf0
+Disallow: //[spam]/path
+'''
+
+
+class LocalNetworkTestCase(BaseLocalNetworkTestCase, unittest.TestCase):
+    class RobotHandler(BaseHTTPRequestHandler):
+
+        def do_GET(self):
+            self.send_response(200)
+            self.end_headers()
+            self.wfile.write(SAMPLE_ROBOTS_TXT)
+
+        def log_message(self, format, *args):
+            pass
+
+    @threading_helper.reap_threads
+    def testRead(self):
+        # Test that reading a weird robots.txt doesn't fail.
+        addr = self.server.server_address
+        url = f'http://{socket_helper.HOST}:{addr[1]}'
+        robots_url = url + '/robots.txt'
+        parser = urllib.robotparser.RobotFileParser()
+        parser.set_url(robots_url)
+        parser.read()
+        # And it can even interpret the weird paths in some reasonable way.
+        agent = 'test_robotparser'
+        self.assertTrue(parser.can_fetch(agent, robots_url))
+        self.assertTrue(parser.can_fetch(agent, url + '/utf8/'))
+        self.assertFalse(parser.can_fetch(agent, url + '/utf8/\U0001f40d'))
+        self.assertFalse(parser.can_fetch(agent, url + '/utf8/%F0%9F%90%8D'))
+        self.assertFalse(parser.can_fetch(agent, url + '/utf8/\U0001f40d'))
+        self.assertTrue(parser.can_fetch(agent, url + '/non-utf8/'))
+        self.assertFalse(parser.can_fetch(agent, url + '/non-utf8/%F0'))
+        self.assertFalse(parser.can_fetch(agent, url + '/non-utf8/\U0001f40d'))
+        self.assertFalse(parser.can_fetch(agent, url + '/%2F[spam]/path'))
+
+
+class PasswordProtectedSiteTestCase(BaseLocalNetworkTestCase, unittest.TestCase):
+    class RobotHandler(BaseHTTPRequestHandler):
+
+        def do_GET(self):
+            self.send_error(403, "Forbidden access")
+
+        def log_message(self, format, *args):
+            pass
+
     @threading_helper.reap_threads
     def testPasswordProtectedSite(self):
         addr = self.server.server_address

Lib/urllib/robotparser.py

Lines changed: 22 additions & 11 deletions
@@ -11,6 +11,7 @@
 """
 
 import collections
+import re
 import urllib.error
 import urllib.parse
 import urllib.request
@@ -20,6 +21,19 @@
 RequestRate = collections.namedtuple("RequestRate", "requests seconds")
 
 
+def normalize(path):
+    unquoted = urllib.parse.unquote(path, errors='surrogateescape')
+    return urllib.parse.quote(unquoted, errors='surrogateescape')
+
+def normalize_path(path):
+    path, sep, query = path.partition('?')
+    path = normalize(path)
+    if sep:
+        query = re.sub(r'[^=&]+', lambda m: normalize(m[0]), query)
+        path += '?' + query
+    return path
+
+
 class RobotFileParser:
     """ This class provides a set of methods to read, parse and answer
     questions about a single robots.txt file.
@@ -55,7 +69,7 @@ def modified(self):
     def set_url(self, url):
         """Sets the URL referring to a robots.txt file."""
         self.url = url
-        self.host, self.path = urllib.parse.urlparse(url)[1:3]
+        self.host, self.path = urllib.parse.urlsplit(url)[1:3]
 
     def read(self):
         """Reads the robots.txt URL and feeds it to the parser."""
@@ -69,7 +83,7 @@ def read(self):
             err.close()
         else:
             raw = f.read()
-            self.parse(raw.decode("utf-8").splitlines())
+            self.parse(raw.decode("utf-8", "surrogateescape").splitlines())
 
     def _add_entry(self, entry):
         if "*" in entry.useragents:
@@ -113,7 +127,7 @@ def parse(self, lines):
             line = line.split(':', 1)
             if len(line) == 2:
                 line[0] = line[0].strip().lower()
-                line[1] = urllib.parse.unquote(line[1].strip())
+                line[1] = line[1].strip()
                 if line[0] == "user-agent":
                     if state == 2:
                         self._add_entry(entry)
@@ -167,10 +181,9 @@ def can_fetch(self, useragent, url):
             return False
         # search for given user agent matches
         # the first match counts
-        parsed_url = urllib.parse.urlparse(urllib.parse.unquote(url))
-        url = urllib.parse.urlunparse(('','',parsed_url.path,
-            parsed_url.params,parsed_url.query, parsed_url.fragment))
-        url = urllib.parse.quote(url)
+        parsed_url = urllib.parse.urlsplit(url)
+        url = urllib.parse.urlunsplit(('', '', *parsed_url[2:]))
+        url = normalize_path(url)
         if not url:
             url = "/"
         for entry in self.entries:
@@ -213,16 +226,14 @@ def __str__(self):
             entries = entries + [self.default_entry]
         return '\n\n'.join(map(str, entries))
 
-
 class RuleLine:
     """A rule line is a single "Allow:" (allowance==True) or "Disallow:"
     (allowance==False) followed by a path."""
     def __init__(self, path, allowance):
         if path == '' and not allowance:
             # an empty value means allow all
             allowance = True
-        path = urllib.parse.urlunparse(urllib.parse.urlparse(path))
-        self.path = urllib.parse.quote(path)
+        self.path = normalize_path(path)
         self.allowance = allowance
 
     def applies_to(self, filename):
@@ -268,7 +279,7 @@ def applies_to(self, useragent):
     def allowance(self, filename):
         """Preconditions:
         - our agent applies to this entry
-        - filename is URL decoded"""
+        - filename is URL encoded"""
         for line in self.rulelines:
             if line.applies_to(filename):
                 return line.allowance
Lines changed: 3 additions & 0 deletions

@@ -0,0 +1,3 @@
+Fix parsing errors in the :mod:`urllib.robotparser` module.
+Don't fail trying to parse weird paths.
+Don't fail trying to decode non-UTF-8 ``robots.txt`` files.
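
The same fix can be exercised locally without the test suite's HTTP server. The sketch below reuses the bytes of SAMPLE_ROBOTS_TXT from the new test and decodes them with errors='surrogateescape', which is what read() now does internally; example.com is only a placeholder host.

    import urllib.robotparser

    SAMPLE_ROBOTS_TXT = (b"User-agent: test_robotparser\n"
                         b"Disallow: /utf8/\xf0\x9f\x90\x8d\n"
                         b"Disallow: /non-utf8/\xf0\n"
                         b"Disallow: //[spam]/path\n")

    parser = urllib.robotparser.RobotFileParser()
    # Non-UTF-8 bytes and odd patterns such as "//[spam]/path" no longer
    # make parsing blow up.
    parser.parse(SAMPLE_ROBOTS_TXT.decode("utf-8", "surrogateescape").splitlines())

    agent = "test_robotparser"
    print(parser.can_fetch(agent, "http://example.com/utf8/"))              # True
    print(parser.can_fetch(agent, "http://example.com/utf8/%F0%9F%90%8D"))  # False
    print(parser.can_fetch(agent, "http://example.com/non-utf8/%F0"))       # False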
Lines changed: 4 additions & 0 deletions

@@ -0,0 +1,4 @@
+Fix normalization of the ``robots.txt`` rules and URLs in the
+:mod:`urllib.robotparser` module. No longer ignore trailing ``?``.
+Distinguish raw special characters ``?``, ``=`` and ``&`` from the
+percent-encoded ones.
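
These normalization rules are implemented by the new module-level helper normalize_path() in Lib/urllib/robotparser.py. It is a private implementation detail rather than a documented API, but a quick sketch (assuming a build that contains this commit) illustrates the rules described above:

    from urllib.robotparser import normalize_path  # private helper, not a public API

    print(normalize_path('/another/path?'))           # '/another/path?'  -- trailing "?" is kept
    print(normalize_path('/a2/%5A%2D%2E%5F%7E'))      # '/a2/Z-._~'       -- unreserved chars are decoded
    print(normalize_path('/some/path?name=value'))    # '/some/path?name=value'
    print(normalize_path('/some/path?name%3Dvalue'))  # '/some/path?name%3Dvalue' -- %3D stays distinct from "="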
