"""A parser for HTML and XHTML."""
# This file is based on sgmllib.py, but the API is slightly different.
# XXX There should be a way to distinguish between PCDATA (parsed
# character data -- the normal case), RCDATA (replaceable character
# data -- only char and entity references and end tags are special)
# and CDATA (character data -- only end tags are special).
import markupbase
import re
# Regular expressions used for parsing
interesting_normal = re.compile('[&<]') interesting_cdata = re.compile(r'<(/|\z)') incomplete = re.compile('&[a-zA-Z#]') entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]') charref = re.compile('(?:[0-9]+|[xX][0-9a-fA-F]+)[^0-9a-fA-F]') starttagopen = re.compile('<[a-za-z]') piclose = re.compile('>')
commentclose = re.compile(r'--\s*>')
tagfind = re.compile('[a-zA-Z][-.a-zA-Z0-9:_]*')
attrfind = re.compile(
r'\s*([a-zA-Z_][-.:a-zA-Z_0-9]*)(\s*=\s*'
r'(\'[^\']*\'|"[^"]*"|[-a-zA-Z0-9./:;+*%?!&$\(\)_#=~]*))?')
locatestarttagend = re.compile(r"""
<[a-za-z][-.a-za-z0-9:_]* # tag name (?:\s+ # whitespace before attribute name (?:[a-zA-Z_][-.:a-zA-Z0-9_]* # attribute name (?:\s*=\s* # value indicator (?:'[^']*' # LITA-enclosed value |\"[^\"]*\" # LIT-enclosed value |[^'\">\s]+ # bare value
)
)?
)
)*
\s* # trailing whitespace
""", re.VERBOSE)
endendtag = re.compile('>')
endtagfind = re.compile('\s*([a-za-z][-.a-za-z0-9:_]*)\s*>')
class HTMLParseError(Exception):
"""Exception raised for all parse errors."""
def __init__(self, msg, position=(None, None)):
assert msg
self.msg = msg
self.lineno = position[0]
self.offset = position[1]
def __str__(self):
result = self.msg
if self.lineno is not None:
result = result + ", at line %d" % self.lineno
if self.offset is not None:
result = result + ", column %d" % (self.offset + 1)
return result
class HTMLParser(markupbase.ParserBase):
"""Find tags and other markup and call handler functions.
Usage:
p = HTMLParser()
p.feed(data)
...
p.close()
Start tags are handled by calling self.handle_starttag() or
self.handle_startendtag(); end tags by self.handle_endtag(). The
data between tags is passed from the parser to the derived class
by calling self.handle_data() with the data as argument (the data
may be split up in arbitrary chunks). Entity references are
passed by calling self.handle_entityref() with the entity
reference as the argument. Numeric character references are
passed to self.handle_charref() with the string containing the
reference as the argument.
"""
CDATA_CONTENT_ELEMENTS = ("script", "style")
def __init__(self):
"""Initialize and reset this instance."""
self.reset()
def reset(self):
"""Reset this instance. Loses all unprocessed data."""
self.rawdata = ''
self.lasttag = '???'
self.interesting = interesting_normal
markupbase.ParserBase.reset(self)
def feed(self, data):
"""Feed data to the parser.
Call this as often as you want, with as little or as much text
as you want (may include '\n').
"""
self.rawdata = self.rawdata + data
self.goahead(0)
def close(self):
"""Handle any buffered data."""
self.goahead(1)
def error(self, message):
raise HTMLParseError(message, self.getpos())
__starttag_text = None
def get_starttag_text(self):
"""Return full source of start tag: '<...>'."""
return self.__starttag_text
def set_cdata_mode(self):
self.interesting = interesting_cdata
def clear_cdata_mode(self):
self.interesting = interesting_normal
# Internal -- handle data as far as reasonable. May leave state
# and data to be processed by a subsequent call. If 'end' is
# true, force handling all data as if followed by EOF marker.
def goahead(self, end):
rawdata = self.rawdata
i = 0
n = len(rawdata)
while i < n: match = self.interesting.search(rawdata, i) # < or & if match: j = match.start() else: j = n if i < j: self.handle_data(rawdata[i:j]) i = self.updatepos(i, j) if i == n: break startswith = rawdata.startswith if startswith('<', i): if starttagopen.match(rawdata, i): # < + letter k = self.parse_starttag(i) elif startswith("", i): k = self.parse_endtag(i) if k>= 0:
self.clear_cdata_mode()
elif startswith("