class REXML::Parsers::BaseParser

Using the Pull Parser

This API is experimental, and subject to change.

parser = PullParser.new( "<a>text<b att='val'/>txet</a>" )
while parser.has_next?
 res = parser.next
 puts res[1]['att'] if res.start_tag? and res[0] == 'b'
end

See the PullEvent class for information on the content of the results. The data is identical to the arguments passed for the various events to the StreamListener API.

Notice that:

parser = PullParser.new( "<a>BAD DOCUMENT" )
while parser.has_next?
 res = parser.next
 raise res[1] if res.error?
end

Nat Price gave me some good ideas for the API.

Constants

ATTDEF
ATTDEF_RE
ATTLISTDECL_PATTERN
ATTLISTDECL_START
ATTRIBUTE_PATTERN
ATTTYPE
ATTVALUE
CDATA_END
CDATA_PATTERN
CDATA_START
CLOSE_MATCH
COMBININGCHAR
COMMENT_PATTERN
COMMENT_START
DEFAULTDECL
DEFAULT_ENTITIES
DIGIT
DOCTYPE_END
DOCTYPE_PATTERN
DOCTYPE_START
ELEMENTDECL_PATTERN
ELEMENTDECL_START
ENCODING
ENTITYDECL
ENTITYDEF
ENTITYVALUE
ENTITY_START
ENUMERATEDTYPE
ENUMERATION
EREFERENCE
EXTENDER
EXTERNALID
GEDECL
IDENTITY
INSTRUCTION_PATTERN
INSTRUCTION_START
LETTER
MISSING_ATTRIBUTE_QUOTES

These are patterns to identify common markup errors, to make the error messages more informative.

NAME
NAMECHAR
NAME_STR
NCNAME_STR
NDATADECL
NMTOKEN
NMTOKENS
NOTATIONDECL_START
NOTATIONTYPE
PEDECL
PEDEF
PEREFERENCE
PUBIDCHAR

Entity constants

PUBIDLITERAL
PUBLIC
REFERENCE
REFERENCE_RE
STANDALONE
SYSTEM
SYSTEMENTITY
SYSTEMLITERAL
TAG_MATCH
TEXT_PATTERN
UNAME_STR
VERSION
XMLDECL_PATTERN
XMLDECL_START

Attributes

source[R]

Public Class Methods

new( source ) click to toggle source
# File lib/rexml/parsers/baseparser.rb, line 116
# Builds a parser that reads from +source+.
#
# The source is installed through #stream=, and the parser starts out with
# an empty list of listeners.
def initialize(source)
  self.stream = source
  @listeners = Array.new
end

Public Instance Methods

add_listener( listener ) click to toggle source
# File lib/rexml/parsers/baseparser.rb, line 121
# Appends +listener+ to the list of registered listeners and returns that
# list.
def add_listener(listener)
  @listeners.push(listener)
end
empty?() click to toggle source

Returns true if there are no more events

# File lib/rexml/parsers/baseparser.rb, line 147
# True only when the source reports itself empty AND no events are waiting
# on the stack.
def empty?
  @source.empty? && @stack.empty?
end
entity( reference, entities ) click to toggle source
# File lib/rexml/parsers/baseparser.rb, line 448
# Looks up the replacement text for the entity named +reference+.
#
# +entities+ (may be nil) is consulted first; when it has no entry the
# third element of the matching DEFAULT_ENTITIES entry is used instead.
# The value found is passed through #unnormalize before being returned.
# Returns nil when the reference is unknown.
def entity(reference, entities)
  value = entities && entities[reference]
  unless value
    default = DEFAULT_ENTITIES[reference]
    value = default[2] if default
  end
  return nil unless value
  unnormalize(value, entities)
end
has_next?() click to toggle source

Returns true if there are more events. Synonymous with !empty?

# File lib/rexml/parsers/baseparser.rb, line 152
# True while the source still has input or events remain on the stack.
def has_next?
  !@source.empty? || !@stack.empty?
end
normalize( input, entities=nil, entity_filter=nil ) click to toggle source

Escapes all possible entities

# File lib/rexml/parsers/baseparser.rb, line 459
# Escapes +input+ and returns the escaped copy.
#
# input::         the string to escape; it is cloned, never modified.
# entities::      optional collection of entity name / replacement text
#                 pairs. Every occurrence of a replacement text is rewritten
#                 to <tt>&name;</tt>.
# entity_filter:: optional collection of entity names that must NOT be
#                 substituted.
#
# Whatever EREFERENCE matches is rewritten to <tt>&amp;</tt>, and the
# DEFAULT_ENTITIES patterns are applied last.
def normalize( input, entities=nil, entity_filter=nil )
  copy = input.clone
  # Doing it like this rather than in a loop improves the speed
  copy.gsub!( EREFERENCE, '&amp;' )
  if entities
    entities.each do |key, value|
      # The filter holds entity names, so it has to be checked against the
      # name currently being processed.
      next if entity_filter && entity_filter.include?( key )
      copy.gsub!( value, "&#{key};" )
    end
  end
  copy.gsub!( EREFERENCE, '&amp;' )
  DEFAULT_ENTITIES.each do |_name, patterns|
    # patterns[3] matches the raw character, patterns[1] is its escaped form.
    copy.gsub!( patterns[3], patterns[1] )
  end
  copy
end
peek(depth=0) click to toggle source

Peek at the event at position depth in the stack. The first element on the stack is at depth 0. If depth is -1, the parser reads to the end of the input stream and returns the last event, which is always :end_document. Be aware that this causes the stream to be parsed up to the requested depth, so you can effectively pre-parse the entire document (pull the entire thing into memory) using this method.

# File lib/rexml/parsers/baseparser.rb, line 168
# Returns the event at position +depth+ of the stack without consuming it.
#
# Events are pulled until the stack is deep enough (or, for a depth of -1,
# until the parser is empty) and are then stored back on the stack.
# Raises a RuntimeError for any depth below -1.
def peek depth=0
  raise %Q[Illegal argument "#{depth}"] if depth < -1
  pulled = []
  if depth == -1
    pulled << pull() until empty?
  else
    wanted = depth + 1
    pulled << pull() while @stack.size + pulled.size < wanted
  end
  # Pulling may drain the stack, so everything pulled goes back on as a
  # freshly built array.
  @stack = @stack + pulled unless pulled.empty?
  @stack[depth]
end
position() click to toggle source
# File lib/rexml/parsers/baseparser.rb, line 137
# Reports the source's position when the source exposes one, otherwise 0.
def position
  # FIXME
  return 0 unless @source.respond_to?(:position)
  @source.position
end
pull() click to toggle source

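Returns the next event after passing it to every registered listener. In this class the event is a plain array whose first element is the event type, for example [ :start_element, name, attributes ]; it is not wrapped in a PullEvent object.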

# File lib/rexml/parsers/baseparser.rb, line 183
# Fetches the next event from #pull_event, hands it to each registered
# listener through +receive+, and returns it.
def pull
  event = pull_event
  @listeners.each { |listener| listener.receive(event) }
  event
end
stream=( source ) click to toggle source
# File lib/rexml/parsers/baseparser.rb, line 127
# Points the parser at a new +source+ and resets all parsing state.
#
# The source is wrapped by SourceFactory.create_from; the pending closed
# tag and the document status are cleared, and the tag, event, entity and
# namespace stacks each start out as a fresh empty array.
def stream=(source)
  @source = SourceFactory.create_from(source)
  @closed = @document_status = nil
  @tags, @stack, @entities = [], [], []
  @nsstack = []
end
unnormalize( string, entities=nil, filter=nil ) click to toggle source

Unescapes all possible entities

# File lib/rexml/parsers/baseparser.rb, line 475
# Unescapes the character and entity references found in +string+ and
# returns the result.
#
# string::   text to unescape; it is cloned, never modified.
# entities:: optional entity table, handed to #entity for each lookup.
# filter::   optional collection of entity names to leave untouched.
#
# Line endings (\r\n and a lone \r) are first normalised to \n. When the
# text contains nothing matching REFERENCE_RE it is returned right away.
def unnormalize( string, entities=nil, filter=nil )
  rv = string.clone
  rv.gsub!( /\r\n?/, "\n" )
  matches = rv.scan( REFERENCE_RE )
  return rv if matches.size == 0
  # Numeric character references, decimal (&#65;) or hexadecimal (&#x41;).
  rv.gsub!( /&#0*((?:\d+)|(?:x[a-fA-F0-9]+));/ ) {
    m = $1
    # "x41" becomes "0x41" so that Integer() reads it as hexadecimal.
    m = "0#{m}" if m[0] == ?x
    [Integer(m)].pack('U*')
  }
  # Keep the first capture of every match and drop the nil entries
  # (presumably the first capture is the entity name and is nil for
  # numeric references -- confirm against REFERENCE_RE).
  matches.collect!{|x|x[0]}.compact!
  if matches.size > 0
    matches.each do |entity_reference|
      unless filter and filter.include?(entity_reference)
        entity_value = entity( entity_reference, entities )
        if entity_value
          re = /&#{entity_reference};/
          rv.gsub!( re, entity_value )
        else
          er = DEFAULT_ENTITIES[entity_reference]
          rv.gsub!( er[0], er[2] ) if er
        end
      end
    end
    rv.gsub!( /&amp;/, '&' )
  end
  rv
end
unshift(token) click to toggle source

Push an event back on the head of the stream. This method has (theoretically) infinite depth.

# File lib/rexml/parsers/baseparser.rb, line 158
# Puts +token+ back at the front of the event stack, so it is the next
# event handed out. Returns the stack.
def unshift token
  @stack.insert(0, token)
end

Private Instance Methods

need_source_encoding_update?(xml_declaration_encoding) click to toggle source
# File lib/rexml/parsers/baseparser.rb, line 505
# Decides whether the encoding named in the XML declaration should be
# applied to the source: false when no encoding was declared or when the
# declaration says exactly "UTF-16" (case-insensitive), true otherwise.
def need_source_encoding_update?(xml_declaration_encoding)
  !(xml_declaration_encoding.nil? || /\AUTF-16\z/i =~ xml_declaration_encoding)
end
pull_event() click to toggle source
# File lib/rexml/parsers/baseparser.rb, line 191
# Parses the next piece of the source and returns it as an event array
# whose first element is the event type.
#
# Events produced here:
#   [ :end_element, name ]            [ :end_document ]
#   [ :comment, text ]                [ :cdata, text ]
#   [ :xmldecl, version, encoding, standalone ]
#   [ :processing_instruction, ... ]  (captures 1 and 2 of INSTRUCTION_PATTERN)
#   [ :start_doctype, name, pub_sys, long_name, uri ]   [ :end_doctype ]
#   [ :externalentity, text ]         [ :elementdecl, text ]
#   [ :entitydecl, ... ]              [ :attlistdecl, element, pairs, contents ]
#   [ :notationdecl, ... ]
#   [ :start_element, name, attributes ]
#   [ :text, text ]
#
# Events already waiting on the stack are returned before any new input is
# read. Raises REXML::ParseException (or UndefinedNamespaceException) on
# malformed input; any other StandardError raised while parsing content is
# wrapped in a REXML::ParseException.
def pull_event
  # A self-closing tag was reported as :start_element on the previous call;
  # report its matching :end_element now.
  if @closed
    x, @closed = @closed, nil
    return [ :end_element, x ]
  end
  return [ :end_document ] if empty?
  return @stack.shift if @stack.size > 0
  #STDERR.puts @source.encoding
  @source.read if @source.buffer.size<2
  #STDERR.puts "BUFFER = #{@source.buffer.inspect}"

  # --- Prolog: nothing but whitespace/declarations seen so far -------------
  if @document_status == nil
    #@source.consume( /^\s*/um )
    word = @source.match( /^((?:\s+)|(?:<[^>]*>))/um )
    word = word[1] unless word.nil?
    #STDERR.puts "WORD = #{word.inspect}"
    case word
    when COMMENT_START
      return [ :comment, @source.match( COMMENT_PATTERN, true )[1] ]
    when XMLDECL_START
      #STDERR.puts "XMLDECL"
      results = @source.match( XMLDECL_PATTERN, true )[1]
      version = VERSION.match( results )
      version = version[1] unless version.nil?
      encoding = ENCODING.match(results)
      encoding = encoding[1] unless encoding.nil?
      if need_source_encoding_update?(encoding)
        @source.encoding = encoding
      end
      # No declared encoding but a UTF-16 BE/LE source: report plain UTF-16.
      if encoding.nil? and /\AUTF-16(?:BE|LE)\z/i =~ @source.encoding
        encoding = "UTF-16"
      end
      standalone = STANDALONE.match(results)
      standalone = standalone[1] unless standalone.nil?
      return [ :xmldecl, version, encoding, standalone ]
    when INSTRUCTION_START
      return [ :processing_instruction, *@source.match(INSTRUCTION_PATTERN, true)[1,2] ]
    when DOCTYPE_START
      md = @source.match( DOCTYPE_PATTERN, true )
      @nsstack.unshift(Set.new)
      identity = md[1]
      close = md[2]
      # The captures of IDENTITY are read through $1, $2, $4 and $6 below.
      identity =~ IDENTITY
      name = $1
      raise REXML::ParseException.new("DOCTYPE is missing a name") if name.nil?
      pub_sys = $2.nil? ? nil : $2.strip
      long_name = $4.nil? ? nil : $4.strip
      uri = $6.nil? ? nil : $6.strip
      args = [ :start_doctype, name, pub_sys, long_name, uri ]
      if close == ">"
        # DOCTYPE without an internal subset: queue :end_doctype right away.
        @document_status = :after_doctype
        @source.read if @source.buffer.size<2
        md = @source.match(/^\s*/um, true)
        @stack << [ :end_doctype ]
      else
        @document_status = :in_doctype
      end
      return args
    when /^\s+/
      # Leading whitespace only: stay in the prolog.
    else
      @document_status = :after_doctype
      @source.read if @source.buffer.size<2
      md = @source.match(/\s*/um, true)
      if @source.encoding == "UTF-8"
        @source.buffer.force_encoding(::Encoding::UTF_8)
      end
    end
  end

  # --- Inside the DOCTYPE internal subset -----------------------------------
  if @document_status == :in_doctype
    md = @source.match(/\s*(.*?>)/um)
    case md[1]
    when SYSTEMENTITY
      match = @source.match( SYSTEMENTITY, true )[1]
      return [ :externalentity, match ]
    when ELEMENTDECL_START
      return [ :elementdecl, @source.match( ELEMENTDECL_PATTERN, true )[1] ]
    when ENTITY_START
      match = @source.match( ENTITYDECL, true ).to_a.compact
      match[0] = :entitydecl
      ref = false
      if match[1] == '%'
        ref = true
        match.delete_at 1
      end
      # Now we have to sort out what kind of entity reference this is
      if match[2] == 'SYSTEM'
        # External reference
        match[3] = match[3][1..-2] # PUBID
        match.delete_at(4) if match.size > 4 # Chop out NDATA decl
        # match is [ :entity, name, SYSTEM, pubid(, ndata)? ]
      elsif match[2] == 'PUBLIC'
        # External reference
        match[3] = match[3][1..-2] # PUBID
        match[4] = match[4][1..-2] # HREF
        match.delete_at(5) if match.size > 5 # Chop out NDATA decl
        # match is [ :entity, name, PUBLIC, pubid, href(, ndata)? ]
      else
        match[2] = match[2][1..-2]
        match.pop if match.size == 4
        # match is [ :entity, name, value ]
      end
      match << '%' if ref
      return match
    when ATTLISTDECL_START
      md = @source.match( ATTLISTDECL_PATTERN, true )
      raise REXML::ParseException.new( "Bad ATTLIST declaration!", @source ) if md.nil?
      element = md[1]
      contents = md[0]
      pairs = {}
      values = md[0].scan( ATTDEF_RE )
      values.each do |attdef|
        unless attdef[3] == "#IMPLIED"
          attdef.compact!
          val = attdef[3]
          val = attdef[4] if val == "#FIXED "
          pairs[attdef[0]] = val
          # An xmlns:prefix attribute default records that prefix in the
          # current namespace set.
          if attdef[0] =~ /^xmlns:(.*)/
            @nsstack[0] << $1
          end
        end
      end
      return [ :attlistdecl, element, pairs, contents ]
    when NOTATIONDECL_START
      md = nil
      if @source.match( PUBLIC )
        md = @source.match( PUBLIC, true )
        vals = [md[1],md[2],md[4],md[6]]
      elsif @source.match( SYSTEM )
        md = @source.match( SYSTEM, true )
        vals = [md[1],md[2],nil,md[4]]
      else
        raise REXML::ParseException.new( "error parsing notation: no matching pattern", @source )
      end
      return [ :notationdecl, *vals ]
    when DOCTYPE_END
      @document_status = :after_doctype
      @source.match( DOCTYPE_END, true )
      return [ :end_doctype ]
    end
  end

  # --- Document content ------------------------------------------------------
  begin
    if @source.buffer[0] == ?<
      if @source.buffer[1] == ?/
        # End tag: must match the most recently opened tag.
        @nsstack.shift
        last_tag = @tags.pop
        #md = @source.match_to_consume( '>', CLOSE_MATCH)
        md = @source.match( CLOSE_MATCH, true )
        raise REXML::ParseException.new( "Missing end tag for "+
          "'#{last_tag}' (got \"#{md[1]}\")",
          @source) unless last_tag == md[1]
        return [ :end_element, last_tag ]
      elsif @source.buffer[1] == ?!
        md = @source.match(/\A(\s*[^>]*>)/um)
        #STDERR.puts "SOURCE BUFFER = #{source.buffer}, #{source.buffer.size}"
        raise REXML::ParseException.new("Malformed node", @source) unless md
        if md[0][2] == ?-
          md = @source.match( COMMENT_PATTERN, true )
          case md[1]
          when /--/, /-\z/
            raise REXML::ParseException.new("Malformed comment", @source)
          end
          return [ :comment, md[1] ] if md
        else
          md = @source.match( CDATA_PATTERN, true )
          return [ :cdata, md[1] ] if md
        end
        raise REXML::ParseException.new( "Declarations can only occur "+
          "in the doctype declaration.", @source)
      elsif @source.buffer[1] == ??
        md = @source.match( INSTRUCTION_PATTERN, true )
        return [ :processing_instruction, md[1], md[2] ] if md
        raise REXML::ParseException.new( "Bad instruction declaration",
          @source)
      else
        # Get the next tag
        md = @source.match(TAG_MATCH, true)
        unless md
          # Check for missing attribute quotes
          raise REXML::ParseException.new("missing attribute quote", @source) if @source.match(MISSING_ATTRIBUTE_QUOTES )
          raise REXML::ParseException.new("malformed XML: missing tag start", @source)
        end
        attributes = {}
        prefixes = Set.new
        prefixes << md[2] if md[2]
        @nsstack.unshift(curr_ns=Set.new)
        if md[4].size > 0
          attrs = md[4].scan( ATTRIBUTE_PATTERN )
          # $' is whatever follows the last attribute matched by the scan;
          # anything but whitespace there is unparseable attribute text.
          raise REXML::ParseException.new( "error parsing attributes: [#{attrs.join ', '}], excess = \"#$'\"", @source) if $' and $'.strip.size > 0
          attrs.each do |attr_name, prefix, local_part, quote, value|
            if prefix == "xmlns"
              if local_part == "xml"
                if value != "http://www.w3.org/XML/1998/namespace"
                  msg = "The 'xml' prefix must not be bound to any other namespace "+
                    "(http://www.w3.org/TR/REC-xml-names/#ns-decl)"
                  raise REXML::ParseException.new( msg, @source, self )
                end
              elsif local_part == "xmlns"
                msg = "The 'xmlns' prefix must not be declared "+
                  "(http://www.w3.org/TR/REC-xml-names/#ns-decl)"
                raise REXML::ParseException.new( msg, @source, self)
              end
              curr_ns << local_part
            elsif prefix
              prefixes << prefix unless prefix == "xml"
            end
            if attributes.has_key?(attr_name)
              msg = "Duplicate attribute #{attr_name.inspect}"
              raise REXML::ParseException.new(msg, @source, self)
            end
            attributes[attr_name] = value
          end
        end
        # Verify that all of the prefixes have been defined
        prefixes.each do |used_prefix|
          unless @nsstack.find{|k| k.member?(used_prefix)}
            raise UndefinedNamespaceException.new(used_prefix,@source,self)
          end
        end
        if md[6]
          # Self-closing tag: remember it so the next call emits :end_element.
          @closed = md[1]
          @nsstack.shift
        else
          @tags.push( md[1] )
        end
        return [ :start_element, md[1], attributes ]
      end
    else
      md = @source.match( TEXT_PATTERN, true )
      if md[0].length == 0
        @source.match( /(\s+)/, true )
      end
      #STDERR.puts "GOT #{md[1].inspect}" unless md[0].length == 0
      #return [ :text, "" ] if md[0].length == 0
      # unnormalized = Text::unnormalize( md[1], self )
      # return PullEvent.new( :text, md[1], unnormalized )
      return [ :text, md[1] ]
    end
  rescue REXML::UndefinedNamespaceException
    raise
  rescue REXML::ParseException
    raise
  rescue => error
    # Only StandardError is wrapped: signals, exit requests and other
    # non-StandardError exceptions must not be reported as parse errors.
    raise REXML::ParseException.new( "Exception parsing",
      @source, self, (error ? error : $!) )
  end
  return [ :dummy ]
end