HNews "ask section" page scraping Python script

Question 1

Here is a small script I wrote to get the HNews ask section and display them without using a web browser. I'm just looking for feedback on how to improve my style/coding logic/overall code.

#!/usr/bin/python
'''This script gets the headlines from hacker news ask section'''
import urllib2
import HTMLParser
import re
class HNParser(HTMLParser.HTMLParser):
def __init__(self):
 HTMLParser.HTMLParser.__init__(self)
 self.data=[]
 self.recording=0
def handle_starttag(self, tag, attribute):
 if tag!='a':
 return
 elif self.recording:
 self.recording +=1
 return
 for name, value in attribute:
 if name=='href' and value[0]=='i':
 break
 else:
 return
 self.recording=1
def handle_endtag(self, tag):
 if tag=='a' and self.recording:
 self.recording-=1
def handle_data(self, data):
 if self.recording:
 self.data.append(data)
HOST='http://news.ycombinator.com/ask'
parser=HNParser()
f=urllib2.urlopen(HOST)
rawHTML=f.read()
parser.feed(rawHTML)
i=0
for items in parser.data:
 try:
 print parser.data[2*i]
 i+=1
 except IndexError:
 break
 parser.close()

Question 2

You'll almost definitely get someone who points out that your indenting is incorrect ... wait that was me! Just check the preview before you submit it so that the code section looks the same as your file.

Question 3

This is how I would modify your script. Comments are inline. This version is PEP8-PyLint-PyFlakes approved. ;-)

#!/usr/bin/python
'''This script gets the headlines from hacker news ask section'''
import urllib2
import HTMLParser
# remove unused imports
class HNParser(HTMLParser.HTMLParser):
 # use class-level attribute instead of "magic" strings.
 tag_name = 'a'
 def __init__(self):
 HTMLParser.HTMLParser.__init__(self)
 self.data = []
 self.recording = 0
 def handle_starttag(self, tag, attribute):
 if tag != self.tag_name:
 return
 # clearer implicit loop, with no "break"
 if not any(name == 'href' and value.startswith('item')
 for name, value in attribute):
 return
 # no need to check if self.recording == 0, because in that case setting
 # self.recording = 1 is equivalent to incrementing self.recording!
 self.recording += 1
 def handle_endtag(self, tag):
 if tag == self.tag_name and self.recording:
 self.recording -= 1
 def handle_data(self, data):
 if self.recording:
 self.data.append(data)
def main():
 HOST = 'http://news.ycombinator.com/ask'
 parser = HNParser()
 f = urllib2.urlopen(HOST)
 rawHTML = f.read()
 parser.feed(rawHTML)
 try:
 # use fancy list indexing and omit last "Feature Request" link
 for item in parser.data[:-1:2]:
 print item
 finally:
 # close the parser even if exception is raised
 parser.close()
if __name__ == '__main__':
 main()

Question 4

I’m not convinced that the generic tag_name is more readable than 'a', on the contrary. I’m all for using an attribute here, but give it a proper name. The context suggets that it should be called starttag – or maybe linktag. Also, the try ... finally: parser.close() should be replaced with a with block.

Question 5

Instead of HTMLParser.HTMLParser.__init__(self) you should write super(HTMLParser.HTMLParser, self).__init__()

urllib2 and HTMLParser, though included with Python, are very limited. requests and lxml + cssselect are more powerful. With them, your script can be written as

import requests
from lxml.html import fromstring
content = requests.get('https://news.ycombinator.com/ask').content
html_doc = fromstring(content)
# Select all title links & ignore 
links = html_doc.cssselect('.title > a[href^=item]')
print '\n'.join(link.text for link in links)

lbolla lbolla 3882 silver badges7 bronze badges · Accepted Answer · 2011-12-22 15:31:04Z

This is how I would modify your script. Comments are inline. This version is PEP8-PyLint-PyFlakes approved. ;-)

#!/usr/bin/python
'''This script gets the headlines from hacker news ask section'''
import urllib2
import HTMLParser
# remove unused imports
class HNParser(HTMLParser.HTMLParser):
 # use class-level attribute instead of "magic" strings.
 tag_name = 'a'
 def __init__(self):
 HTMLParser.HTMLParser.__init__(self)
 self.data = []
 self.recording = 0
 def handle_starttag(self, tag, attribute):
 if tag != self.tag_name:
 return
 # clearer implicit loop, with no "break"
 if not any(name == 'href' and value.startswith('item')
 for name, value in attribute):
 return
 # no need to check if self.recording == 0, because in that case setting
 # self.recording = 1 is equivalent to incrementing self.recording!
 self.recording += 1
 def handle_endtag(self, tag):
 if tag == self.tag_name and self.recording:
 self.recording -= 1
 def handle_data(self, data):
 if self.recording:
 self.data.append(data)
def main():
 HOST = 'http://news.ycombinator.com/ask'
 parser = HNParser()
 f = urllib2.urlopen(HOST)
 rawHTML = f.read()
 parser.feed(rawHTML)
 try:
 # use fancy list indexing and omit last "Feature Request" link
 for item in parser.data[:-1:2]:
 print item
 finally:
 # close the parser even if exception is raised
 parser.close()
if __name__ == '__main__':
 main()

I’m not convinced that the generic tag_name is more readable than 'a', on the contrary. I’m all for using an attribute here, but give it a proper name. The context suggets that it should be called starttag – or maybe linktag. Also, the try ... finally: parser.close() should be replaced with a with block.

Stack Exchange Network

HNews "ask section" page scraping Python script

2 Answers 2

Hot Network Questions

HNews "ask section" page scraping Python script

2 Answers 2

Related

Hot Network Questions