I'm using HTMLParser (python 2.7)to parse pages I pull down with urllib2,and am coming across AttributeError exceptions when I want to store my data into a list in feed method. But if comment out the __init__ method, the exception was gone
main.py
# -*- coding: utf-8 -*-
from HTMLParser import HTMLParser
import urllib2
import sys
reload(sys)
sys.setdefaultencoding('utf-8')
class MyHTMLParser(HTMLParser):
def __init__(self):
self.terms = []
self.definitions = []
def handle_starttag(self, tag, attrs):
# retrive the terms
if tag == 'div':
for attribute, value in attrs:
if value == 'word':
self.terms.append(attrs[1][1])
# retrive the definitions
if value == 'desc':
if attrs[1][1]:
self.definitions.append(attrs[1][1])
else:
self.definitions.append(None)
parser = MyHTMLParser()
# open page and retrive source page
response = urllib2.urlopen('http://localhost/')
html = response.read().decode('utf-8')
response.close()
# extract the terms and definitions
parser.feed(html)
Output
Traceback (most recent call last):
File "/Users/megachweng/Project/Anki-Youdao/combined.py", line 35, in <module>
parser.feed(html)
File "/usr/local/Cellar/python/2.7.13/Frameworks/Python.framework/Versions/2.7/lib/python2.7/HTMLParser.py", line 116, in feed
self.rawdata = self.rawdata + data
AttributeError: MyHTMLParser instance has no attribute 'rawdata'
asked Jun 15, 2017 at 11:00
Chweng Mega
1,6681 gold badge15 silver badges23 bronze badges
-
sorry, because of some reason I cant use any third-part packagesChweng Mega– Chweng Mega2017年06月15日 11:03:23 +00:00Commented Jun 15, 2017 at 11:03
2 Answers 2
I think that you don't initialize HTMLParser properly. Maybe you don't need to initialize it at all. This works for me:
# -*- coding: utf-8 -*-
from HTMLParser import HTMLParser
import urllib2
import sys
reload(sys)
sys.setdefaultencoding('utf-8')
class MyHTMLParser(HTMLParser):
def handle_starttag(self, tag, attrs):
print "Encountered a start tag:", tag
# retrive the terms
if tag == 'div':
for attribute, value in attrs:
if value == 'word':
self.terms.append(attrs[1][1])
# retrive the definitions
if value == 'desc':
if attrs[1][1]:
self.definitions.append(attrs[1][1])
else:
self.definitions.append(None)
parser = MyHTMLParser()
# open page and retrive source page
response = urllib2.urlopen('http://localhost/')
html = response.read().decode('utf-8')
response.close()
# extract the terms and definitions
parser.feed(html)
UPDATE
# -*- coding: utf-8 -*-
from HTMLParser import HTMLParser
import urllib2
import sys
reload(sys)
sys.setdefaultencoding('utf-8')
class MyHTMLParser(HTMLParser):
def __init__(self):
HTMLParser.__init__(self)
self.terms = []
self.definitions = []
def handle_starttag(self, tag, attrs):
# retrive the terms
for attribute in attrs:
if attribute[0] == 'align':
self.terms.append(attribute[1])
self.definitions.append(attribute[1])
parser = MyHTMLParser()
html = "<table align='center'><tr><td align='left'><p>ciao</p></td></tr></table>"
# extract the terms and definitions
parser.feed(html)
print parser.terms
print parser.definitions
Output:
['center', 'left']
['center', 'left']
Sign up to request clarification or add additional context in comments.
1 Comment
Chweng Mega
ya,it absolutely works, but any ideas how to store the attrs[1][1] into a list so I can access to it(print parser.terms --MyHTMLParser instance has no attribute 'terms')
OK I got the solution,super().__init__ cannot work, must hard code the name
def __init__(self):
HTMLParser.__init__(self)
answered Jun 15, 2017 at 12:01
Chweng Mega
1,6681 gold badge15 silver badges23 bronze badges
Comments
lang-py