Python HTMLParser: AttributeError

Question 1

I'm using HTMLParser (Python 2.7) to parse pages I pull down with urllib2, and I get an AttributeError from the feed method when I try to store my data in lists. If I comment out the __init__ method, the exception goes away.

main.py

# -*- coding: utf-8 -*-
from HTMLParser import HTMLParser
import urllib2
import sys
reload(sys)
sys.setdefaultencoding('utf-8')
class MyHTMLParser(HTMLParser):
    """Collect terms and definitions from ``<div>`` start tags.

    For a ``<div>`` in which any attribute value equals ``'word'``, the value
    of the tag's second attribute is appended to ``terms``. When any attribute
    value equals ``'desc'``, the second attribute's value is appended to
    ``definitions`` (``None`` is stored when that value is empty).
    """

    def __init__(self):
        # NOTE: HTMLParser.__init__ is not called here, so the parser's own
        # state (e.g. ``rawdata``) is never created. This is what makes
        # feed() raise the AttributeError shown in the traceback below.
        self.terms = []
        self.definitions = []

    def handle_starttag(self, tag, attrs):
        """Inspect one start tag; ``attrs`` is a list of (name, value) pairs."""
        # retrieve the terms
        if tag == 'div':
            for attribute, value in attrs:
                if value == 'word':
                    # attrs[1][1] is the value of the second attribute;
                    # a tag with fewer than two attributes raises IndexError.
                    self.terms.append(attrs[1][1])
                # retrieve the definitions
                if value == 'desc':
                    if attrs[1][1]:
                        self.definitions.append(attrs[1][1])
                    else:
                        self.definitions.append(None)
parser = MyHTMLParser()
# open page and retrieve source page
response = urllib2.urlopen('http://localhost/')
try:
    html = response.read().decode('utf-8')
finally:
    # release the connection even if reading or decoding fails
    response.close()
# extract the terms and definitions
parser.feed(html)

Output

Traceback (most recent call last):
 File "/Users/megachweng/Project/Anki-Youdao/combined.py", line 35, in <module>
 parser.feed(html)
 File "/usr/local/Cellar/python/2.7.13/Frameworks/Python.framework/Versions/2.7/lib/python2.7/HTMLParser.py", line 116, in feed
 self.rawdata = self.rawdata + data
AttributeError: MyHTMLParser instance has no attribute 'rawdata'

Question 2

Sorry, for certain reasons I can't use any third-party packages.

Question 3

I think that you don't initialize HTMLParser properly. Maybe you don't need to initialize it at all. This works for me:

# -*- coding: utf-8 -*-
from HTMLParser import HTMLParser
import urllib2
import sys
reload(sys)
sys.setdefaultencoding('utf-8')
class MyHTMLParser(HTMLParser):
    """Print every start tag and collect terms/definitions from ``<div>`` tags.

    NOTE: this class defines no ``__init__``, so ``terms`` and ``definitions``
    are never created by the class itself. Unless the caller assigns them
    first, the appends below raise AttributeError.
    """

    def handle_starttag(self, tag, attrs):
        """Inspect one start tag; ``attrs`` is a list of (name, value) pairs."""
        # Single-argument form: prints the same text as
        # `print "Encountered a start tag:", tag`.
        print("Encountered a start tag: %s" % tag)
        # retrieve the terms
        if tag == 'div':
            for attribute, value in attrs:
                if value == 'word':
                    # attrs[1][1] is the value of the second attribute;
                    # a tag with fewer than two attributes raises IndexError.
                    self.terms.append(attrs[1][1])
                # retrieve the definitions
                if value == 'desc':
                    if attrs[1][1]:
                        self.definitions.append(attrs[1][1])
                    else:
                        self.definitions.append(None)
parser = MyHTMLParser()
# open page and retrieve source page
response = urllib2.urlopen('http://localhost/')
try:
    html = response.read().decode('utf-8')
finally:
    # release the connection even if reading or decoding fails
    response.close()
# extract the terms and definitions
parser.feed(html)

UPDATE

# -*- coding: utf-8 -*-
from HTMLParser import HTMLParser
import urllib2
import sys
reload(sys)
sys.setdefaultencoding('utf-8')
class MyHTMLParser(HTMLParser):
    """Collect the value of every ``align`` attribute seen in a start tag."""

    def __init__(self):
        # Initialise the base parser first (by class name), then add the
        # lists this subclass fills in.
        HTMLParser.__init__(self)
        self.terms = []
        self.definitions = []

    def handle_starttag(self, tag, attrs):
        """Append each ``align`` value to both ``terms`` and ``definitions``."""
        # each entry of attrs is a (name, value) pair
        for attribute in attrs:
            if attribute[0] == 'align':
                self.terms.append(attribute[1])
                self.definitions.append(attribute[1])
# sample markup containing two 'align' attributes
html = "<table align='center'><tr><td align='left'><p>ciao</p></td></tr></table>"
parser = MyHTMLParser()
# run the parser over the markup, then show both collected lists
parser.feed(html)
print(parser.terms)
print(parser.definitions)

Output:

['center', 'left']
['center', 'left']

Question 4

Yes, it absolutely works, but any idea how to store attrs[1][1] in a list so that I can access it afterwards? (print parser.terms fails with: MyHTMLParser instance has no attribute 'terms')

Question 5

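OK, I found the solution: super().__init__() does not work here (in Python 2, HTMLParser is an old-style class, which super() does not support), so the base class has to be named explicitly: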

def __init__(self):
 # Call the base-class initialiser by class name; super() cannot be used here.
 HTMLParser.__init__(self)

Andrea 6036 silver badges15 bronze badges · Accepted Answer · 2017-06-15 11:30:00Z

I think that you don't initialize HTMLParser properly. Maybe you don't need to initialize it at all. This works for me:

# -*- coding: utf-8 -*-
from HTMLParser import HTMLParser
import urllib2
import sys
reload(sys)
sys.setdefaultencoding('utf-8')
class MyHTMLParser(HTMLParser):
    """Print every start tag and collect terms/definitions from ``<div>`` tags.

    NOTE: this class defines no ``__init__``, so ``terms`` and ``definitions``
    are never created by the class itself. Unless the caller assigns them
    first, the appends below raise AttributeError.
    """

    def handle_starttag(self, tag, attrs):
        """Inspect one start tag; ``attrs`` is a list of (name, value) pairs."""
        # Single-argument form: prints the same text as
        # `print "Encountered a start tag:", tag`.
        print("Encountered a start tag: %s" % tag)
        # retrieve the terms
        if tag == 'div':
            for attribute, value in attrs:
                if value == 'word':
                    # attrs[1][1] is the value of the second attribute;
                    # a tag with fewer than two attributes raises IndexError.
                    self.terms.append(attrs[1][1])
                # retrieve the definitions
                if value == 'desc':
                    if attrs[1][1]:
                        self.definitions.append(attrs[1][1])
                    else:
                        self.definitions.append(None)
parser = MyHTMLParser()
# open page and retrieve source page
response = urllib2.urlopen('http://localhost/')
try:
    html = response.read().decode('utf-8')
finally:
    # release the connection even if reading or decoding fails
    response.close()
# extract the terms and definitions
parser.feed(html)

UPDATE

# -*- coding: utf-8 -*-
from HTMLParser import HTMLParser
import urllib2
import sys
reload(sys)
sys.setdefaultencoding('utf-8')
class MyHTMLParser(HTMLParser):
    """Collect the value of every ``align`` attribute seen in a start tag."""

    def __init__(self):
        # Initialise the base parser first (by class name), then add the
        # lists this subclass fills in.
        HTMLParser.__init__(self)
        self.terms = []
        self.definitions = []

    def handle_starttag(self, tag, attrs):
        """Append each ``align`` value to both ``terms`` and ``definitions``."""
        # each entry of attrs is a (name, value) pair
        for attribute in attrs:
            if attribute[0] == 'align':
                self.terms.append(attribute[1])
                self.definitions.append(attribute[1])
# sample markup containing two 'align' attributes
html = "<table align='center'><tr><td align='left'><p>ciao</p></td></tr></table>"
parser = MyHTMLParser()
# run the parser over the markup, then show both collected lists
parser.feed(html)
print(parser.terms)
print(parser.definitions)

Output:

['center', 'left']
['center', 'left']

Yes, it absolutely works, but any idea how to store attrs[1][1] in a list so that I can access it afterwards? (print parser.terms fails with: MyHTMLParser instance has no attribute 'terms')

CollectivesTM on Stack Overflow

Python HTMLParser: AttributeError

main.py

Output

2 Answers 2

1 Comment

Comments

Your Answer

Sign up or log in

Post as a guest

Post as a guest

Hot Network Questions

CollectivesTM on Stack Overflow

main.py

Output

2 Answers 2

1 Comment

Comments

Your Answer

Sign up or log in

Post as a guest

Post as a guest

Related