[Python-checkins] bpo-35922: Fix RobotFileParser when robots.txt has no relevant crawl delay or request rate (GH-11791)

Tal Einat webhook-mailer at python.org
Sun Jun 16 02:49:03 EDT 2019


https://github.com/python/cpython/commit/8047e0e1c620f69cc21f9ca48b24bf2cdd5c3668
commit: 8047e0e1c620f69cc21f9ca48b24bf2cdd5c3668
branch: master
author: Rémi Lapeyre <remi.lapeyre at henki.fr>
committer: Tal Einat <taleinat at gmail.com>
date: 2019-06-16T09:48:57+03:00
summary:
bpo-35922: Fix RobotFileParser when robots.txt has no relevant crawl delay or request rate (GH-11791)
Co-Authored-By: Tal Einat <taleinat+github at gmail.com>
files:
A Misc/NEWS.d/next/Library/2019-06-11-19-34-29.bpo-35922.rxpzWr.rst
M Lib/test/test_robotparser.py
M Lib/urllib/robotparser.py
diff --git a/Lib/test/test_robotparser.py b/Lib/test/test_robotparser.py
index 84a267ad9567..77cd7c4d29df 100644
--- a/Lib/test/test_robotparser.py
+++ b/Lib/test/test_robotparser.py
@@ -97,30 +97,38 @@ class RejectAllRobotsTest(BaseRobotTest, unittest.TestCase):
 
 
 class BaseRequestRateTest(BaseRobotTest):
+ request_rate = None
+ crawl_delay = None
 
 def test_request_rate(self):
+ parser = self.parser
 for url in self.good + self.bad:
 agent, url = self.get_agent_and_url(url)
 with self.subTest(url=url, agent=agent):
- if self.crawl_delay:
- self.assertEqual(
- self.parser.crawl_delay(agent), self.crawl_delay
- )
- if self.request_rate:
+ self.assertEqual(parser.crawl_delay(agent), self.crawl_delay)
+
+ parsed_request_rate = parser.request_rate(agent)
+ self.assertEqual(parsed_request_rate, self.request_rate)
+ if self.request_rate is not None:
 self.assertIsInstance(
- self.parser.request_rate(agent),
+ parsed_request_rate,
 urllib.robotparser.RequestRate
 )
 self.assertEqual(
- self.parser.request_rate(agent).requests,
+ parsed_request_rate.requests,
 self.request_rate.requests
 )
 self.assertEqual(
- self.parser.request_rate(agent).seconds,
+ parsed_request_rate.seconds,
 self.request_rate.seconds
 )
 
 
+class EmptyFileTest(BaseRequestRateTest, unittest.TestCase):
+ robots_txt = ''
+ good = ['/foo']
+
+
 class CrawlDelayAndRequestRateTest(BaseRequestRateTest, unittest.TestCase):
 robots_txt = """\
 User-agent: figtree
@@ -141,10 +149,6 @@ class CrawlDelayAndRequestRateTest(BaseRequestRateTest, unittest.TestCase):
 
 class DifferentAgentTest(CrawlDelayAndRequestRateTest):
 agent = 'FigTree Robot libwww-perl/5.04'
- # these are not actually tested, but we still need to parse it
- # in order to accommodate the input parameters
- request_rate = None
- crawl_delay = None
 
 
 class InvalidRequestRateTest(BaseRobotTest, unittest.TestCase):
diff --git a/Lib/urllib/robotparser.py b/Lib/urllib/robotparser.py
index 7089916a4f81..c58565e39451 100644
--- a/Lib/urllib/robotparser.py
+++ b/Lib/urllib/robotparser.py
@@ -186,7 +186,9 @@ def crawl_delay(self, useragent):
 for entry in self.entries:
 if entry.applies_to(useragent):
 return entry.delay
- return self.default_entry.delay
+ if self.default_entry:
+ return self.default_entry.delay
+ return None
 
 def request_rate(self, useragent):
 if not self.mtime():
@@ -194,7 +196,9 @@ def request_rate(self, useragent):
 for entry in self.entries:
 if entry.applies_to(useragent):
 return entry.req_rate
- return self.default_entry.req_rate
+ if self.default_entry:
+ return self.default_entry.req_rate
+ return None
 
 def site_maps(self):
 if not self.sitemaps:
diff --git a/Misc/NEWS.d/next/Library/2019-06-11-19-34-29.bpo-35922.rxpzWr.rst b/Misc/NEWS.d/next/Library/2019-06-11-19-34-29.bpo-35922.rxpzWr.rst
new file mode 100644
index 000000000000..5271a495624d
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2019-06-11-19-34-29.bpo-35922.rxpzWr.rst
@@ -0,0 +1,4 @@
+Fix :meth:`RobotFileParser.crawl_delay` and
+:meth:`RobotFileParser.request_rate` to return ``None`` rather than
+raise :exc:`AttributeError` when no relevant rule is defined in the
+robots.txt file. Patch by Rémi Lapeyre.


More information about the Python-checkins mailing list

AltStyle によって変換されたページ (->オリジナル) /