I'm trying to run an example web crawler program from netinstructions.com, but it is not working. I run the program with:

spider("http://www.netinstructions.com/", "python", 50)

but it always returns

1 Visiting: http://www.netinstructions.com
Word never found

no matter what URL I enter. The code for the program is below:

from html.parser import HTMLParser
from urllib.request import urlopen
from urllib import parse

# We are going to create a class called LinkParser that inherits some
# methods from HTMLParser which is why it is passed into the definition
class LinkParser(HTMLParser):

    # This is a function that HTMLParser normally has
    # but we are adding some functionality to it
    def handle_starttag(self, tag, attrs):
        # We are looking for the beginning of a link. Links normally look
        # like <a href="www.someurl.com"></a>
        if tag == 'a':
            for (key, value) in attrs:
                if key == 'href':
                    # We are grabbing the new URL. We are also adding the
                    # base URL to it. For example:
                    # www.netinstructions.com is the base and
                    # somepage.html is the new URL (a relative URL)
                    #
                    # We combine a relative URL with the base URL to create
                    # an absolute URL like:
                    # www.netinstructions.com/somepage.html
                    newUrl = parse.urljoin(self.baseUrl, value)
                    # And add it to our collection of links:
                    self.links = self.links + [newUrl]

    # This is a new function that we are creating to get links
    # that our spider() function will call
    def getLinks(self, url):
        self.links = []
        # Remember the base URL which will be important when creating
        # absolute URLs
        self.baseUrl = url
        # Use the urlopen function from the standard Python 3 library
        response = urlopen(url)
        # Make sure that we are looking at HTML and not other things that
        # are floating around on the internet (such as
        # JavaScript files, CSS, or .PDFs for example)
        if response.getheader('Content-Type') == 'text/html':
            htmlBytes = response.read()
            # Note that feed() handles Strings well, but not bytes
            # (A change from Python 2.x to Python 3.x)
            htmlString = htmlBytes.decode("utf-8")
            self.feed(htmlString)
            return htmlString, self.links
        else:
            return "", []

# And finally here is our spider. It takes in an URL, a word to find,
# and the number of pages to search through before giving up
def spider(url, word, maxPages):
    pagesToVisit = [url]
    numberVisited = 0
    foundWord = False
    # The main loop. Create a LinkParser and get all the links on the page.
    # Also search the page for the word or string
    # In our getLinks function we return the web page
    # (this is useful for searching for the word)
    # and we return a set of links from that web page
    # (this is useful for where to go next)
    while numberVisited < maxPages and pagesToVisit != [] and not foundWord:
        numberVisited = numberVisited + 1
        # Start from the beginning of our collection of pages to visit:
        url = pagesToVisit[0]
        pagesToVisit = pagesToVisit[1:]
        try:
            print(numberVisited, "Visiting:", url)
            parser = LinkParser()
            data, links = parser.getLinks(url)
            if data.find(word) > -1:
                foundWord = True
                # Add the pages that we visited to the end of our collection
                # of pages to visit:
                pagesToVisit = pagesToVisit + links
                print(" **Success!**")
        except:
            print(" **Failed!**")
    if foundWord:
        print("The word", word, "was found at", url)
    else:
        print("Word never found")

Does anyone know what's going on? I'm using Python 3.5 (32-bit) and running on Windows 10.

asked Jul 5, 2016 at 23:50
2 Comments

  • Run for the hills: any tutorial that uses a blanket except is not one I would recommend. The error is clear if you use except Exception as e: print(e); it says 'LinkParser' object has no attribute 'getLinks', although the error is your fault: def getLinks(self, url): should be inside the class. I would recommend you check out requests and BeautifulSoup if you want two nice libraries for web scraping (a sketch follows these comments). Commented Jul 5, 2016 at 23:53
  • I forgot to indent the code but I've fixed that now. Commented Jul 6, 2016 at 0:00
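As the first comment suggests, requests and BeautifulSoup make this kind of scraping much simpler. Here is a minimal sketch of the same link-gathering logic using those two third-party packages (pip install requests beautifulsoup4); the function name fetch_links and the error handling are my own, not from the tutorial:

import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

def fetch_links(url):
    # Hypothetical helper mirroring getLinks(), not part of the original code
    response = requests.get(url)
    # Substring test, since the header is usually "text/html; charset=..."
    if 'text/html' not in response.headers.get('Content-Type', ''):
        return "", []
    soup = BeautifulSoup(response.text, 'html.parser')
    # Keep only anchor tags that actually carry an href attribute,
    # and resolve relative URLs against the page URL
    links = [urljoin(url, a['href']) for a in soup.find_all('a', href=True)]
    return response.text, links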

1 Answer

response.getheader('Content-Type') returns text/html; charset=utf-8, which is not equal to text/html, so you never get any links at all. You can instead check whether 'text/html' is contained in the string:

def getLinks(self, url):
    self.links = []
    # Remember the base URL which will be important when creating
    # absolute URLs
    self.baseUrl = url
    # Use the urlopen function from the standard Python 3 library
    response = urlopen(url)
    # Make sure that we are looking at HTML and not other things that
    # are floating around on the internet (such as
    # JavaScript files, CSS, or .PDFs for example)
    if 'text/html' in response.getheader('Content-Type'):
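Alternatively, if you prefer an exact comparison, the standard library can strip the charset parameter for you: the response's headers attribute is an email.message.Message, whose get_content_type() returns just the type/subtype. This is a suggestion of mine, not part of the original answer:

from urllib.request import urlopen

response = urlopen("http://www.netinstructions.com/")
print(response.getheader('Content-Type'))    # e.g. text/html; charset=utf-8
print(response.headers.get_content_type())   # text/html (parameters stripped)
if response.headers.get_content_type() == 'text/html':
    pass  # safe to parse the body as HTML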

Also, pagesToVisit = pagesToVisit + links should be outside the if; as written, you would only add the links when find() returned != -1, i.e. when the word had already been found. Make the following changes and your code will run:

def getLinks(self, url):
    self.links = []
    # Remember the base URL which will be important when creating
    # absolute URLs
    self.baseUrl = url
    # Use the urlopen function from the standard Python 3 library
    response = urlopen(url)
    # Make sure that we are looking at HTML and not other things that
    # are floating around on the internet (such as
    # JavaScript files, CSS, or .PDFs for example)
    print(response.getheader('Content-Type'))
    if 'text/html' in response.getheader('Content-Type'):
        htmlBytes = response.read()
        # Note that feed() handles Strings well, but not bytes
        # (A change from Python 2.x to Python 3.x)
        htmlString = htmlBytes.decode("utf-8")
        self.feed(htmlString)
        return htmlString, self.links
    return "", []

# And finally here is our spider. It takes in an URL, a word to find,
# and the number of pages to search through before giving up
def spider(url, word, maxPages):
    pagesToVisit = [url]
    foundWord = False
    # The main loop. Create a LinkParser and get all the links on the page.
    # Also search the page for the word or string
    # In our getLinks function we return the web page
    # (this is useful for searching for the word)
    # and we return a set of links from that web page
    # (this is useful for where to go next)
    for ind, url in enumerate(pagesToVisit, 1):
        if ind >= maxPages or foundWord:
            break
        try:
            print(ind, "Visiting:", url)
            parser = LinkParser()
            data, links = parser.getLinks(url)
            if data.find(word) > -1:
                foundWord = True
                print(" **Success!**")
            # Add the pages that we visited to the end of our collection
            # of pages to visit:
            pagesToVisit.extend(links)
        except Exception as e:
            print(" **Failed!**")
    if foundWord:
        print("The word", word, "was found at", url)
    else:
        print("Word never found")

spider("http://www.netinstructions.com/", "python", 50)
answered Jul 6, 2016 at 0:05

1 Comment

  • Congrats on 100k :)
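One follow-up worth noting about the answer's loop (my own observation, not from the answer itself): because pagesToVisit is extended while it is being iterated over, the same URL can be queued and visited several times. A small sketch of a visited-set guard on top of the answer's spider():

def spider(url, word, maxPages):
    pagesToVisit = [url]
    visited = set()
    foundWord = False
    for ind, url in enumerate(pagesToVisit, 1):
        if ind >= maxPages or foundWord:
            break
        if url in visited:
            continue  # already crawled this page, skip it
        visited.add(url)
        try:
            print(ind, "Visiting:", url)
            parser = LinkParser()
            data, links = parser.getLinks(url)
            if data.find(word) > -1:
                foundWord = True
                print(" **Success!**")
            # Queue only links we have not seen before
            pagesToVisit.extend(l for l in links if l not in visited)
        except Exception:
            print(" **Failed!**")
    if foundWord:
        print("The word", word, "was found at", url)
    else:
        print("Word never found")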
