[Python-checkins] CVS: python/dist/src/Lib robotparser.py,1.10,1.11

Martin v. Löwis loewis@users.sourceforge.net
2002年2月28日 07:24:49 -0800


Update of /cvsroot/python/python/dist/src/Lib
In directory usw-pr-cvs1:/tmp/cvs-serv31559/Lib
Modified Files:
	robotparser.py 
Log Message:
Correct various errors: 
- Use substring search, not re search for user-agent and paths. 
- Consider * entry last. Unquote, then requote URLs. 
- Treat empty Disallow as "allow everything". 
Add test cases. Fixes #523041
Index: robotparser.py
===================================================================
RCS file: /cvsroot/python/python/dist/src/Lib/robotparser.py,v
retrieving revision 1.10
retrieving revision 1.11
diff -C2 -d -r1.10 -r1.11
*** robotparser.py	13 Aug 2001 14:43:43 -0000	1.10
--- robotparser.py	28 Feb 2002 15:24:47 -0000	1.11
***************
*** 23,26 ****
--- 23,27 ----
 def __init__(self, url=''):
 self.entries = []
+ self.default_entry = None
 self.disallow_all = 0
 self.allow_all = 0
***************
*** 73,77 ****
 state = 0
 elif state==2:
! self.entries.append(entry)
 entry = Entry()
 state = 0
--- 74,82 ----
 state = 0
 elif state==2:
! if "*" in entry.useragents:
! # the default entry is considered last
! self.default_entry = entry
! else:
! self.entries.append(entry)
 entry = Entry()
 state = 0
***************
*** 86,90 ****
 if len(line) == 2:
 line[0] = line[0].strip().lower()
! line[1] = line[1].strip()
 if line[0] == "user-agent":
 if state==2:
--- 91,95 ----
 if len(line) == 2:
 line[0] = line[0].strip().lower()
! line[1] = urllib.unquote(line[1].strip())
 if line[0] == "user-agent":
 if state==2:
***************
*** 129,136 ****
 # search for given user agent matches
 # the first match counts
! url = urllib.quote(urlparse.urlparse(url)[2]) or "/"
 for entry in self.entries:
 if entry.applies_to(useragent):
 return entry.allowance(url)
 # agent not found ==> access granted
 return 1
--- 134,144 ----
 # search for given user agent matches
 # the first match counts
! url = urllib.quote(urlparse.urlparse(urllib.unquote(url))[2]) or "/"
 for entry in self.entries:
 if entry.applies_to(useragent):
 return entry.allowance(url)
+ # try the default entry last
+ if self.default_entry:
+ return self.default_entry.allowance(url)
 # agent not found ==> access granted
 return 1
***************
*** 148,156 ****
 (allowance==0) followed by a path."""
 def __init__(self, path, allowance):
 self.path = urllib.quote(path)
 self.allowance = allowance
 
 def applies_to(self, filename):
! return self.path=="*" or re.match(self.path, filename)
 
 def __str__(self):
--- 156,167 ----
 (allowance==0) followed by a path."""
 def __init__(self, path, allowance):
+ if path == '' and not allowance:
+ # an empty value means allow all
+ allowance = 1
 self.path = urllib.quote(path)
 self.allowance = allowance
 
 def applies_to(self, filename):
! return self.path=="*" or filename.startswith(self.path)
 
 def __str__(self):
***************
*** 181,186 ****
 return 1
 agent = agent.lower()
! # don't forget to re.escape
! if re.search(re.escape(useragent), agent):
 return 1
 return 0
--- 192,196 ----
 return 1
 agent = agent.lower()
! if useragent.find(agent) != -1:
 return 1
 return 0

AltStyle によって変換されたページ (->オリジナル) /