Here is a small script I wrote to get the HNews ask section and display them without using a web browser. I'm just looking for feedback on how to improve my style/coding logic/overall code.
#!/usr/bin/python
'''This script gets the headlines from hacker news ask section'''
import urllib2
import HTMLParser
import re
class HNParser(HTMLParser.HTMLParser):
def __init__(self):
HTMLParser.HTMLParser.__init__(self)
self.data=[]
self.recording=0
def handle_starttag(self, tag, attribute):
if tag!='a':
return
elif self.recording:
self.recording +=1
return
for name, value in attribute:
if name=='href' and value[0]=='i':
break
else:
return
self.recording=1
def handle_endtag(self, tag):
if tag=='a' and self.recording:
self.recording-=1
def handle_data(self, data):
if self.recording:
self.data.append(data)
HOST='http://news.ycombinator.com/ask'
parser=HNParser()
f=urllib2.urlopen(HOST)
rawHTML=f.read()
parser.feed(rawHTML)
i=0
for items in parser.data:
try:
print parser.data[2*i]
i+=1
except IndexError:
break
parser.close()
-
\$\begingroup\$ You'll almost definitely get someone who points out that your indenting is incorrect ... wait that was me! Just check the preview before you submit it so that the code section looks the same as your file. \$\endgroup\$James Khoury– James Khoury2011年12月14日 00:53:22 +00:00Commented Dec 14, 2011 at 0:53
2 Answers 2
This is how I would modify your script. Comments are inline. This version is PEP8-PyLint-PyFlakes approved. ;-)
#!/usr/bin/python
'''This script gets the headlines from hacker news ask section'''
import urllib2
import HTMLParser
# remove unused imports
class HNParser(HTMLParser.HTMLParser):
# use class-level attribute instead of "magic" strings.
tag_name = 'a'
def __init__(self):
HTMLParser.HTMLParser.__init__(self)
self.data = []
self.recording = 0
def handle_starttag(self, tag, attribute):
if tag != self.tag_name:
return
# clearer implicit loop, with no "break"
if not any(name == 'href' and value.startswith('item')
for name, value in attribute):
return
# no need to check if self.recording == 0, because in that case setting
# self.recording = 1 is equivalent to incrementing self.recording!
self.recording += 1
def handle_endtag(self, tag):
if tag == self.tag_name and self.recording:
self.recording -= 1
def handle_data(self, data):
if self.recording:
self.data.append(data)
def main():
HOST = 'http://news.ycombinator.com/ask'
parser = HNParser()
f = urllib2.urlopen(HOST)
rawHTML = f.read()
parser.feed(rawHTML)
try:
# use fancy list indexing and omit last "Feature Request" link
for item in parser.data[:-1:2]:
print item
finally:
# close the parser even if exception is raised
parser.close()
if __name__ == '__main__':
main()
-
\$\begingroup\$ I’m not convinced that the generic
tag_name
is more readable than'a'
, on the contrary. I’m all for using an attribute here, but give it a proper name. The context suggets that it should be calledstarttag
– or maybelinktag
. Also, thetry ... finally: parser.close()
should be replaced with awith
block. \$\endgroup\$Konrad Rudolph– Konrad Rudolph2012年07月14日 20:22:46 +00:00Commented Jul 14, 2012 at 20:22
Instead of HTMLParser.HTMLParser.__init__(self)
you should write super(HTMLParser.HTMLParser, self).__init__()
urllib2
and HTMLParser
, though included with Python, are very limited. requests and lxml + cssselect are more powerful. With them, your script can be written as
import requests
from lxml.html import fromstring
content = requests.get('https://news.ycombinator.com/ask').content
html_doc = fromstring(content)
# Select all title links & ignore
links = html_doc.cssselect('.title > a[href^=item]')
print '\n'.join(link.text for link in links)