@@ -16,6 +16,14 @@ class BaseRobotTest:
16
16
bad = []
17
17
site_maps = None
18
18
19
def __init_subclass__(cls):
    """Disable inherited URL tests for subclasses with empty fixtures."""
    super().__init_subclass__()
    # A subclass that declares no good/bad URLs would inherit a test
    # method that iterates over nothing; null the method out instead so
    # it does not show up as a passing-but-empty test.
    for fixture in ('good', 'bad'):
        if not getattr(cls, fixture):
            setattr(cls, f'test_{fixture}_urls', None)
19
27
def setUp (self ):
20
28
lines = io .StringIO (self .robots_txt ).readlines ()
21
29
self .parser = urllib .robotparser .RobotFileParser ()
@@ -231,9 +239,16 @@ class DisallowQueryStringTest(BaseRobotTest, unittest.TestCase):
231
239
# robots.txt disallowing specific query strings; the query-less and
# percent-encoded variants of the same paths must remain fetchable.
robots_txt = """\
User-agent: *
Disallow: /some/path?name=value
Disallow: /another/path?
Disallow: /yet/one/path?name=value&more
"""
good = ['/some/path', '/some/path?',
        '/some/path%3Fname=value', '/some/path?name%3Dvalue',
        '/another/path', '/another/path%3F',
        '/yet/one/path?name=value%26more']
# BUG FIX: the original list was missing the comma after the first
# element, so '/some/path?name=value' and '/another/path?' were
# implicitly concatenated into the single bogus URL
# '/some/path?name=value/another/path?' and neither URL was tested.
bad = ['/some/path?name=value',
       '/another/path?', '/another/path?name=value',
       '/yet/one/path?name=value&more']
237
252
238
253
239
254
class UseFirstUserAgentWildcardTest (BaseRobotTest , unittest .TestCase ):
@@ -249,15 +264,79 @@ class UseFirstUserAgentWildcardTest(BaseRobotTest, unittest.TestCase):
249
264
bad = ['/some/path' ]
250
265
251
266
252
- class EmptyQueryStringTest (BaseRobotTest , unittest .TestCase ):
253
- # normalize the URL first (#17403)
267
class PercentEncodingTest(BaseRobotTest, unittest.TestCase):
    """Check matching of percent-encoded vs. raw characters in paths.

    Each Disallow rule below pairs with entries in ``good``/``bad`` that
    probe whether the parser normalizes percent-encodings (case of hex
    digits, encoded vs. raw unreserved characters, lone non-UTF-8
    octets, and reserved characters that must NOT be treated as
    equivalent to their encoded forms, e.g. ``/`` vs ``%2F``).
    """
    robots_txt = """\
User-agent: *
Disallow: /a1/Z-._~ # unreserved characters
Disallow: /a2/%5A%2D%2E%5F%7E # percent-encoded unreserved characters
Disallow: /u1/%F0%9F%90%8D # percent-encoded ASCII Unicode character
Disallow: /u2/%f0%9f%90%8d
Disallow: /u3/\U0001f40d # raw non-ASCII Unicode character
Disallow: /v1/%F0 # percent-encoded non-ASCII octet
Disallow: /v2/%f0
Disallow: /v3/\udcf0 # raw non-ASCII octet
Disallow: /p1%xy # raw percent
Disallow: /p2%
Disallow: /p3%25xy # percent-encoded percent
Disallow: /p4%2525xy # double percent-encoded percent
Disallow: /john%20smith # space
Disallow: /john doe
Disallow: /trailingspace%20
Disallow: /question%3Fq=v # not query
Disallow: /hash%23f # not fragment
Disallow: /dollar%24
Disallow: /asterisk%2A
Disallow: /sub/dir
Disallow: /slash%2F
Disallow: /query/question?q=%3F
Disallow: /query/raw/question?q=?
Disallow: /query/eq?q%3Dv
Disallow: /query/amp?q=v%26a
"""
    good = [
        '/u1/%F0', '/u1/%f0',
        '/u2/%F0', '/u2/%f0',
        '/u3/%F0', '/u3/%f0',
        '/p1%2525xy', '/p2%f0', '/p3%2525xy', '/p4%xy', '/p4%25xy',
        '/question?q=v',
        '/dollar', '/asterisk',
        '/query/eq?q=v',
        '/query/amp?q=v&a',
    ]
    bad = [
        '/a1/Z-._~', '/a1/%5A%2D%2E%5F%7E',
        '/a2/Z-._~', '/a2/%5A%2D%2E%5F%7E',
        '/u1/%F0%9F%90%8D', '/u1/%f0%9f%90%8d', '/u1/\U0001f40d',
        '/u2/%F0%9F%90%8D', '/u2/%f0%9f%90%8d', '/u2/\U0001f40d',
        '/u3/%F0%9F%90%8D', '/u3/%f0%9f%90%8d', '/u3/\U0001f40d',
        '/v1/%F0', '/v1/%f0', '/v1/\udcf0', '/v1/\U0001f40d',
        '/v2/%F0', '/v2/%f0', '/v2/\udcf0', '/v2/\U0001f40d',
        '/v3/%F0', '/v3/%f0', '/v3/\udcf0', '/v3/\U0001f40d',
        '/p1%xy', '/p1%25xy',
        '/p2%', '/p2%25', '/p2%2525', '/p2%xy',
        '/p3%xy', '/p3%25xy',
        '/p4%2525xy',
        '/john%20smith', '/john smith',
        '/john%20doe', '/john doe',
        '/trailingspace%20', '/trailingspace ',
        '/question%3Fq=v',
        '/hash#f', '/hash%23f',
        '/dollar$', '/dollar%24',
        '/asterisk*', '/asterisk%2A',
        '/sub/dir', '/sub%2Fdir',
        '/slash%2F', '/slash/',
        '/query/question?q=?', '/query/question?q=%3F',
        '/query/raw/question?q=?', '/query/raw/question?q=%3F',
        '/query/eq?q%3Dv',
        '/query/amp?q=v%26a',
    ]
    # Extend the fixtures with every other reserved character: both the
    # raw and the percent-encoded spelling of each rule must block both
    # the raw and the percent-encoded spelling of the URL.
    for c in ":/[]@!$&'()*+,;=":
        robots_txt += f'Disallow: /raw{c}\nDisallow: /pc%{ord(c):02X}\n'
        bad.extend([f'/raw{c}', f'/raw%{ord(c):02X}',
                    f'/pc{c}', f'/pc%{ord(c):02X}'])
261
340
262
341
263
342
class DefaultEntryTest (BaseRequestRateTest , unittest .TestCase ):
@@ -299,26 +378,17 @@ def test_string_formatting(self):
299
378
self .assertEqual (str (self .parser ), self .expected_output )
300
379
301
380
302
- class RobotHandler (BaseHTTPRequestHandler ):
303
-
304
- def do_GET (self ):
305
- self .send_error (403 , "Forbidden access" )
306
-
307
- def log_message (self , format , * args ):
308
- pass
309
-
310
-
311
381
@unittest .skipUnless (
312
382
support .has_socket_support ,
313
383
"Socket server requires working socket."
314
384
)
315
- class PasswordProtectedSiteTestCase ( unittest . TestCase ) :
385
+ class BaseLocalNetworkTestCase :
316
386
317
387
def setUp (self ):
318
388
# clear _opener global variable
319
389
self .addCleanup (urllib .request .urlcleanup )
320
390
321
- self .server = HTTPServer ((socket_helper .HOST , 0 ), RobotHandler )
391
+ self .server = HTTPServer ((socket_helper .HOST , 0 ), self . RobotHandler )
322
392
323
393
self .t = threading .Thread (
324
394
name = 'HTTPServer serving' ,
@@ -335,6 +405,57 @@ def tearDown(self):
335
405
self .t .join ()
336
406
self .server .server_close ()
337
407
408
+
409
# Deliberately weird robots.txt body served by LocalNetworkTestCase: it
# mixes raw UTF-8 bytes, a lone non-UTF-8 octet, and a netloc-looking
# double-slash path to exercise the parser's tolerance of odd input.
SAMPLE_ROBOTS_TXT = (
    b'User-agent: test_robotparser\n'
    b'Disallow: /utf8/\xf0\x9f\x90\x8d\n'  # raw UTF-8 snake emoji
    b'Disallow: /non-utf8/\xf0\n'          # undecodable lone octet
    b'Disallow: //[spam]/path\n'
)
415
+
416
+
417
class LocalNetworkTestCase(BaseLocalNetworkTestCase, unittest.TestCase):
    """Serve SAMPLE_ROBOTS_TXT over a local HTTP server and parse it."""

    class RobotHandler(BaseHTTPRequestHandler):

        def do_GET(self):
            # Always answer 200 with the sample robots.txt body.
            self.send_response(200)
            self.end_headers()
            self.wfile.write(SAMPLE_ROBOTS_TXT)

        def log_message(self, format, *args):
            # Silence per-request logging during the test run.
            pass

    @threading_helper.reap_threads
    def testRead(self):
        # Test that reading a weird robots.txt doesn't fail.
        addr = self.server.server_address
        url = f'http://{socket_helper.HOST}:{addr[1]}'
        robots_url = url + '/robots.txt'
        parser = urllib.robotparser.RobotFileParser()
        parser.set_url(robots_url)
        parser.read()
        # And it can even interpret the weird paths in some reasonable way.
        agent = 'test_robotparser'
        self.assertTrue(parser.can_fetch(agent, robots_url))
        self.assertTrue(parser.can_fetch(agent, url + '/utf8/'))
        self.assertFalse(parser.can_fetch(agent, url + '/utf8/\U0001f40d'))
        self.assertFalse(parser.can_fetch(agent, url + '/utf8/%F0%9F%90%8D'))
        # BUG FIX: this assertion previously duplicated the raw-character
        # check two lines above; test the lowercase percent-encoded form
        # instead (matching the /u1–/u2 case-insensitivity coverage in
        # PercentEncodingTest).
        self.assertFalse(parser.can_fetch(agent, url + '/utf8/%f0%9f%90%8d'))
        self.assertTrue(parser.can_fetch(agent, url + '/non-utf8/'))
        self.assertFalse(parser.can_fetch(agent, url + '/non-utf8/%F0'))
        self.assertFalse(parser.can_fetch(agent, url + '/non-utf8/\U0001f40d'))
        self.assertFalse(parser.can_fetch(agent, url + '/%2F[spam]/path'))
448
+
449
+
450
+ class PasswordProtectedSiteTestCase (BaseLocalNetworkTestCase , unittest .TestCase ):
451
class RobotHandler(BaseHTTPRequestHandler):
    """Refuse every request, simulating a password-protected site."""

    def do_GET(self):
        # Respond 403 to everything, including /robots.txt itself.
        self.send_error(403, "Forbidden access")

    def log_message(self, format, *args):
        # Keep the test output free of per-request log noise.
        pass
458
+
338
459
@threading_helper .reap_threads
339
460
def testPasswordProtectedSite (self ):
340
461
addr = self .server .server_address
0 commit comments