[Python-checkins] bpo-21475: Support the Sitemap extension in robotparser (GH-6883)

Ned Deily webhook-mailer at python.org
Wed May 16 10:52:15 EDT 2018


https://github.com/python/cpython/commit/5db5c0669e624767375593cc1a01f32092c91c58
commit: 5db5c0669e624767375593cc1a01f32092c91c58
branch: master
author: Christopher Beacham <mcscope at gmail.com>
committer: Ned Deily <nad at python.org>
date: 2018-05-16T10:52:07-04:00
summary:
bpo-21475: Support the Sitemap extension in robotparser (GH-6883)
files:
A Misc/NEWS.d/next/Library/2018-05-15-15-03-48.bpo-28612.E9dz39.rst
M Doc/library/urllib.robotparser.rst
M Lib/test/test_robotparser.py
M Lib/urllib/robotparser.py
M Misc/ACKS
diff --git a/Doc/library/urllib.robotparser.rst b/Doc/library/urllib.robotparser.rst
index e3b90e673caa..544f50273dd1 100644
--- a/Doc/library/urllib.robotparser.rst
+++ b/Doc/library/urllib.robotparser.rst
@@ -76,6 +76,15 @@ structure of :file:`robots.txt` files, see http://www.robotstxt.org/orig.html.
 
       .. versionadded:: 3.6
 
+   .. method:: site_maps()
+
+      Returns the contents of the ``Sitemap`` parameter from
+      ``robots.txt`` in the form of a :func:`list`. If there is no such
+      parameter or the ``robots.txt`` entry for this parameter has
+      invalid syntax, return ``None``.
+
+      .. versionadded:: 3.8
+
 
 The following example demonstrates basic use of the :class:`RobotFileParser`
 class::
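
As a quick illustration of the method documented in the hunk above, here is a
minimal usage sketch (the robots.txt URL is a placeholder, and site_maps() is
only available from Python 3.8 on):

    import urllib.robotparser

    rp = urllib.robotparser.RobotFileParser()
    rp.set_url('http://www.example.com/robots.txt')  # placeholder URL
    rp.read()
    # A list of Sitemap URLs, or None if robots.txt declares none.
    print(rp.site_maps())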
diff --git a/Lib/test/test_robotparser.py b/Lib/test/test_robotparser.py
index bee8d238be6b..84a267ad9567 100644
--- a/Lib/test/test_robotparser.py
+++ b/Lib/test/test_robotparser.py
@@ -12,6 +12,7 @@ class BaseRobotTest:
     agent = 'test_robotparser'
     good = []
     bad = []
+    site_maps = None
 
     def setUp(self):
         lines = io.StringIO(self.robots_txt).readlines()
@@ -36,6 +37,9 @@ def test_bad_urls(self):
             with self.subTest(url=url, agent=agent):
                 self.assertFalse(self.parser.can_fetch(agent, url))
 
+    def test_site_maps(self):
+        self.assertEqual(self.parser.site_maps(), self.site_maps)
+
 
 class UserAgentWildcardTest(BaseRobotTest, unittest.TestCase):
     robots_txt = """\
@@ -65,6 +69,23 @@ class CrawlDelayAndCustomAgentTest(BaseRobotTest, unittest.TestCase):
     bad = ['/cyberworld/map/index.html']
 
 
+class SitemapTest(BaseRobotTest, unittest.TestCase):
+    robots_txt = """\
+# robots.txt for http://www.example.com/
+
+User-agent: *
+Sitemap: http://www.gstatic.com/s2/sitemaps/profiles-sitemap.xml
+Sitemap: http://www.google.com/hostednews/sitemap_index.xml
+Request-rate: 3/15
+Disallow: /cyberworld/map/ # This is an infinite virtual URL space
+
+    """
+    good = ['/', '/test.html']
+    bad = ['/cyberworld/map/index.html']
+    site_maps = ['http://www.gstatic.com/s2/sitemaps/profiles-sitemap.xml',
+                 'http://www.google.com/hostednews/sitemap_index.xml']
+
+
 class RejectAllRobotsTest(BaseRobotTest, unittest.TestCase):
     robots_txt = """\
 # go away
diff --git a/Lib/urllib/robotparser.py b/Lib/urllib/robotparser.py
index 92e4efe6865e..7089916a4f81 100644
--- a/Lib/urllib/robotparser.py
+++ b/Lib/urllib/robotparser.py
@@ -27,6 +27,7 @@ class RobotFileParser:
 
     def __init__(self, url=''):
         self.entries = []
+        self.sitemaps = []
         self.default_entry = None
         self.disallow_all = False
         self.allow_all = False
@@ -141,6 +142,12 @@ def parse(self, lines):
                                 and numbers[1].strip().isdigit()):
                             entry.req_rate = RequestRate(int(numbers[0]), int(numbers[1]))
                         state = 2
+                elif line[0] == "sitemap":
+                    # According to http://www.sitemaps.org/protocol.html
+                    # "This directive is independent of the user-agent line,
+                    # so it doesn't matter where you place it in your file."
+                    # Therefore we do not change the state of the parser.
+                    self.sitemaps.append(line[1])
         if state == 2:
             self._add_entry(entry)
 
@@ -189,6 +196,11 @@ def request_rate(self, useragent):
                 return entry.req_rate
         return self.default_entry.req_rate
 
+    def site_maps(self):
+        if not self.sitemaps:
+            return None
+        return self.sitemaps
+
     def __str__(self):
         entries = self.entries
         if self.default_entry is not None:
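
A small sketch of the parsing behaviour added above: Sitemap lines are
collected wherever they appear, independent of any user-agent group, and
site_maps() returns the accumulated list (the robots.txt lines below are
illustrative):

    import urllib.robotparser

    lines = [
        'Sitemap: http://www.example.com/sitemap.xml',  # before any group
        'User-agent: *',
        'Disallow: /private/',
    ]

    rp = urllib.robotparser.RobotFileParser()
    rp.parse(lines)
    print(rp.site_maps())  # ['http://www.example.com/sitemap.xml']
    print(rp.can_fetch('*', 'http://www.example.com/index.html'))  # True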
diff --git a/Misc/ACKS b/Misc/ACKS
index 665b4dd7f43f..5c05ee7d5aa1 100644
--- a/Misc/ACKS
+++ b/Misc/ACKS
@@ -109,6 +109,7 @@ Anthony Baxter
 Mike Bayer
 Samuel L. Bayer
 Bo Bayles
+Christopher Beacham AKA Lady Red
 Tommy Beadle
 Donald Beaudry
 David Beazley
@@ -1760,6 +1761,7 @@ Dik Winter
 Blake Winton
 Jean-Claude Wippler
 Stéphane Wirtel
+Peter Wirtz
 Lars Wirzenius
 John Wiseman
 Chris Withers
diff --git a/Misc/NEWS.d/next/Library/2018-05-15-15-03-48.bpo-28612.E9dz39.rst b/Misc/NEWS.d/next/Library/2018-05-15-15-03-48.bpo-28612.E9dz39.rst
new file mode 100644
index 000000000000..e3e8f16eef07
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2018-05-15-15-03-48.bpo-28612.E9dz39.rst
@@ -0,0 +1,3 @@
+Added support for Site Maps to urllib's ``RobotFileParser`` as
+:meth:`RobotFileParser.site_maps() <urllib.robotparser.RobotFileParser.site_maps>`.
+Patch by Lady Red, based on patch by Peter Wirtz.


More information about the Python-checkins mailing list
