class RDoc::Markup::Parser

A recursive-descent parser for RDoc markup.

The parser tokenizes an input string then parses the tokens into a Document. Documents can be converted into output formats by writing a visitor like RDoc::Markup::ToHTML.

The parser only handles the block-level constructs Paragraph, List, ListItem, Heading, Verbatim, BlankLine and Rule. Inline markup such as +blah+ is handled separately by RDoc::Markup::AttributeManager.

To see what markup the Parser implements read RDoc. To see how to use RDoc markup to format text in your program read RDoc::Markup.

Constants

LIST_TOKENS: Token types that start a list item (:BULLET, :LABEL, :LALPHA, :NOTE, :NUMBER and :UALPHA)

Attributes

debug[RW]

Enables display of debugging information

tokens[R]

The token stream produced by tokenize; tokens are removed from it as they are parsed

Public Class Methods

new() click to toggle source

Creates a new Parser. See also ::parse

# File lib/rdoc/markup/parser.rb, line 77
##
# Builds a parser with no input attached: debugging is off, the token
# stream is empty and the line counters start at zero.  Also records which
# String features the running Ruby offers so #char_pos can pick a strategy.

def initialize
  # capabilities of the running Ruby
  @have_byteslice = ''.respond_to? :byteslice
  @have_encoding  = Object.const_defined? :Encoding

  # scanner state, filled in by #setup_scanner
  @s              = nil
  @input          = nil
  @input_encoding = nil
  @binary_input   = nil
  @line_pos       = 0
  @line           = 0

  # parser state
  @debug          = false
  @current_token  = nil
  @tokens         = []
end

parse(str) click to toggle source

Parses str into a Document.

Use RDoc::Markup#parse instead of this method.

# File lib/rdoc/markup/parser.rb, line 58
##
# Tokenizes +str+ with a fresh parser, then parses the resulting tokens
# into a new RDoc::Markup::Document.  Returns whatever the instance-level
# #parse returns for that document.

def self.parse str
  markup_parser = new
  markup_parser.tokenize(str)

  document = RDoc::Markup::Document.new
  markup_parser.parse(document)
end

tokenize(str) click to toggle source

Returns a token stream for str, for testing

# File lib/rdoc/markup/parser.rb, line 68
##
# Tokenizes +str+ with a fresh parser and returns that parser's token
# array without parsing it.

def self.tokenize str
  lexer = new
  lexer.tokenize(str)
  lexer.tokens
end

Public Instance Methods

build_heading(level) click to toggle source

Builds a Heading of level

# File lib/rdoc/markup/parser.rb, line 94
##
# Builds an RDoc::Markup::Heading of the given +level+.
#
# The heading text is taken from the next token when it is a :TEXT token
# (the NEWLINE after it is consumed as well).  Any other token is put back
# on the stream and the heading gets an empty text.

def build_heading level
  token_type, heading_text, = get

  if token_type == :TEXT then
    skip :NEWLINE
  else
    # not heading text: leave the token for the caller
    unget
    heading_text = ''
  end

  RDoc::Markup::Heading.new level, heading_text
end

build_list(margin) click to toggle source

Builds a List flush to margin

# File lib/rdoc/markup/parser.rb, line 112
##
# Builds an RDoc::Markup::List whose item markers sit in column +margin+.
#
# Consumes list tokens until the stream ends, a non-list token is seen, a
# list token appears left of +margin+, or a list token of a different type
# than the first item appears.  The token that ends the list is put back on
# the stream.
#
# Returns the list, or nil when no item was built and there is no pending
# label (or the list type is not :LABEL / :NOTE).

def build_list margin
 p :list_start => margin if @debug
 list = RDoc::Markup::List.new
 # labels collected from consecutive LABEL/NOTE items with no description
 label = nil
 until @tokens.empty? do
 type, data, column, = get
 case type
 when *LIST_TOKENS then
 # a marker left of the margin or of another list type ends this list
 if column < margin || (list.type && list.type != type) then
 unget
 break
 end
 # the first item fixes the list type
 list.type = type
 # from here on +column+ is the column of the token after the marker;
 # it is used below as the indent of the item body
 peek_type, _, column, = peek_token
 case type
 when :NOTE, :LABEL then
 label = [] unless label
 if peek_type == :NEWLINE then
 # description not on the same line as LABEL/NOTE
 # skip the trailing newline & any blank lines below
 while peek_type == :NEWLINE
 get
 peek_type, _, column, = peek_token
 end
 # After the blank lines we may be:
 # - at end of stream
 # - at a column < margin, e.g. a "[text]" label followed by a less
 #   indented "blah blah blah" line
 # - at the same column, but with a different type of list item, e.g.
 #   "[text]" followed by "* blah blah"
 # - at the same column, with the same type of list item, e.g.
 #   "[one]" followed by "[two]"
 # In all of these cases the description is empty.
 # In the last case only, we continue collecting labels.
 if peek_type.nil? || column < margin then
 empty = true
 elsif column == margin then
 case peek_type
 when type
 empty = :continue
 when *LIST_TOKENS
 empty = true
 else
 empty = false
 end
 else
 empty = false
 end
 if empty then
 # remember this label; it is attached to the next item that has a
 # description, or handled after the loop
 label << data
 next if empty == :continue
 break
 end
 end
 else
 # bullets and ordered markers carry no label
 data = nil
 end
 # attach any labels collected so far to this item
 if label then
 data = label << data
 label = nil
 end
 list_item = RDoc::Markup::ListItem.new data
 # the item body is parsed at the column of the token after the marker
 parse list_item, column
 list << list_item
 else
 # not a list token: the list is finished
 unget
 break
 end
 end
 p :list_end => margin if @debug
 # only labels without any description were seen: emit a single item
 # holding the labels and a blank line
 # NOTE(review): when the list already has items, labels still pending in
 # +label+ at this point appear to be dropped -- confirm this is intended
 if list.empty? then
 return nil unless label
 return nil unless [:LABEL, :NOTE].include? list.type
 list_item = RDoc::Markup::ListItem.new label, RDoc::Markup::BlankLine.new
 list << list_item
 end
 list
end

build_paragraph(margin) click to toggle source

Builds a Paragraph that is flush to margin

# File lib/rdoc/markup/parser.rb, line 212
##
# Builds an RDoc::Markup::Paragraph from consecutive :TEXT tokens that
# start in column +margin+.
#
# Each line but the last gets a single trailing space in place of its
# NEWLINE.  Collection stops at the end of the stream, before a :BREAK
# token, or at the first token that is not :TEXT in column +margin+ (that
# token is put back on the stream).
#
# Returns the paragraph.  When no text token could be consumed the
# paragraph is returned without parts instead of raising NoMethodError.
#
# Raises ParseError (through #skip) when a text token is followed by
# something other than :BREAK or :NEWLINE.

def build_paragraph margin
  p :paragraph_start => margin if @debug

  paragraph = RDoc::Markup::Paragraph.new

  until @tokens.empty? do
    type, data, column, = get

    unless type == :TEXT && column == margin then
      unget
      break
    end

    paragraph << data

    break if peek_token.first == :BREAK

    # +data+ is the same String object that was just appended, so this
    # adds the joining space to the paragraph part itself
    data << ' ' if skip :NEWLINE
  end

  # drop the joining space left on the final line; there is no final line
  # when nothing was consumed
  last_part = paragraph.parts.last
  last_part.sub!(/ \z/, '') if last_part

  p :paragraph_end => margin if @debug

  paragraph
end

build_verbatim(margin) click to toggle source

Builds a Verbatim that is indented from margin.

The verbatim block is shifted left (the least indented lines start in column 0). Each part of the verbatim is one line of text, always terminated by a newline. Blank lines always consist of a single newline character, and there is never a single newline at the end of the verbatim.

# File lib/rdoc/markup/parser.rb, line 247
##
# Builds an RDoc::Markup::Verbatim from the tokens indented past +margin+.
#
# Tokens are turned back into source text line by line: list markers,
# header '=' runs and rule dashes are re-created from their token data and
# the spacing between tokens is rebuilt from their columns.  Collection
# stops at the end of the stream or at the first non-NEWLINE token whose
# column is <= +margin+ (that token is put back on the stream).
#
# Once collected, every line is shifted left by the smallest indent seen,
# then Verbatim#normalize is called.
#
# Returns the verbatim.  When no indented token was consumed there is
# nothing to shift, so the shift step is skipped instead of raising
# NoMethodError on a nil indent.

def build_verbatim margin
  p :verbatim_begin => margin if @debug
  verbatim = RDoc::Markup::Verbatim.new

  min_indent = nil                 # smallest indent (relative to margin) seen
  generate_leading_spaces = true   # true at the start of every output line
  line = ''

  until @tokens.empty? do
    type, data, column, = get

    if type == :NEWLINE then
      line << data
      verbatim << line
      line = ''
      generate_leading_spaces = true
      next
    end

    # back at (or left of) the margin: the verbatim block is over
    if column <= margin
      unget
      break
    end

    if generate_leading_spaces then
      indent = column - margin
      line << ' ' * indent
      min_indent = indent if min_indent.nil? || indent < min_indent
      generate_leading_spaces = false
    end

    case type
    when :HEADER then
      # +data+ is the number of '=' characters
      line << '=' * data
      _, _, peek_column, = peek_token
      peek_column ||= column + data
      indent = peek_column - column - data
      line << ' ' * indent
    when :RULE then
      # the tokenizer stores the dash count minus two
      width = 2 + data
      line << '-' * width
      _, _, peek_column, = peek_token
      peek_column ||= column + width
      indent = peek_column - column - width
      line << ' ' * indent
    when :BREAK, :TEXT then
      line << data
    else # *LIST_TOKENS
      list_marker = case type
                    when :BULLET then data
                    when :LABEL  then "[#{data}]"
                    when :NOTE   then "#{data}::"
                    else # :LALPHA, :NUMBER, :UALPHA
                      "#{data}."
                    end
      line << list_marker
      peek_type, _, peek_column = peek_token
      unless peek_type == :NEWLINE then
        peek_column ||= column + list_marker.length
        indent = peek_column - column - list_marker.length
        line << ' ' * indent
      end
    end
  end

  # flush a final line that had no terminating NEWLINE token
  verbatim << line << "\n" unless line.empty?

  # shift the block left; min_indent is nil when nothing was collected
  if min_indent && min_indent > 0 then
    verbatim.parts.each { |part| part.slice!(0, min_indent) unless part == "\n" }
  end

  verbatim.normalize

  p :verbatim_end => margin if @debug

  verbatim
end

char_pos(byte_offset) click to toggle source

The character offset for the input string at the given byte_offset

# File lib/rdoc/markup/parser.rb, line 325
##
# Converts +byte_offset+ into the input into a character offset.
#
# Uses String#byteslice when the running Ruby has it, otherwise re-tags a
# binary copy of the input with the input's encoding.  On a Ruby without
# encodings the byte offset is returned unchanged.

def char_pos byte_offset
  return @input.byteslice(0, byte_offset).length if @have_byteslice
  return byte_offset unless @have_encoding

  leading_bytes = @binary_input[0, byte_offset]
  leading_bytes.force_encoding @input_encoding
  leading_bytes.length
end

get() click to toggle source

Pulls the next token from the stream.

# File lib/rdoc/markup/parser.rb, line 340
##
# Removes the first token from the stream, remembers it as the current
# token (so #unget can put it back) and returns it.  Returns nil when the
# stream is empty.

def get
  token = @tokens.shift
  @current_token = token
  p :get => token if @debug
  token
end

parse(parent, indent = 0) click to toggle source

Parses the tokens into RDoc::Markup block objects (for example Heading, Rule, List, Verbatim and BlankLine) and appends them to the passed parent object.

Exits at the end of the token stream, or when it encounters a token in a column less than indent (unless it is a NEWLINE).

Returns parent.

# File lib/rdoc/markup/parser.rb, line 355
##
# Parses tokens into block-level objects and appends them to +parent+.
#
# A BREAK or NEWLINE token met here becomes a BlankLine.  Any other token
# is compared with +indent+: a smaller column ends this parse (the token is
# put back), a larger column starts a verbatim block, and an equal column
# is dispatched on the token type (heading, rule, text or list).
#
# Raises ParseError for a token type that is not handled.
#
# Returns +parent+.

def parse parent, indent = 0
  p :parse_start => indent if @debug

  until @tokens.empty? do
    type, data, column, = get

    # trailing newlines are skipped by the builders, so a BREAK or NEWLINE
    # seen at this level is a blank line
    if type == :BREAK || type == :NEWLINE then
      parent << RDoc::Markup::BlankLine.new
      skip :NEWLINE, false
      next
    end

    # dedent: this block is finished, let the caller handle the token
    if column < indent then
      unget
      break
    end

    # extra indent: verbatim section
    if column > indent then
      unget
      parent << build_verbatim(indent)
      next
    end

    # same indentation as the enclosing block
    case type
    when :HEADER then
      parent << build_heading(data)
    when :RULE then
      parent << RDoc::Markup::Rule.new(data)
      skip :NEWLINE
    when :TEXT then
      unget
      parse_text parent, indent
    when *LIST_TOKENS then
      unget
      parent << build_list(indent)
    else
      type, data, column, line = @current_token
      raise ParseError, "Unhandled token #{type} (#{data.inspect}) at #{line}:#{column}"
    end
  end

  p :parse_end => indent if @debug

  parent
end

peek_token() click to toggle source

Returns the next token on the stream without modifying the stream

# File lib/rdoc/markup/parser.rb, line 418
##
# Returns the first token of the stream without removing it, or an empty
# array when the stream is exhausted.

def peek_token
  upcoming = @tokens.first
  upcoming = [] unless upcoming
  p :peek => upcoming if @debug
  upcoming
end

setup_scanner(input) click to toggle source

Creates the StringScanner

# File lib/rdoc/markup/parser.rb, line 427
##
# Resets the line counters, keeps a private copy of +input+ and creates the
# StringScanner that #tokenize reads from.
#
# On a Ruby with encodings but without String#byteslice the input's
# encoding is recorded and the copy is re-tagged as binary for #char_pos.
#
# Returns the new StringScanner.

def setup_scanner input
  @line_pos = 0
  @line     = 0
  @input    = input.dup

  needs_binary_copy = @have_encoding && !@have_byteslice

  if needs_binary_copy then
    @input_encoding = @input.encoding
    # force_encoding re-tags @input in place, so both variables refer to
    # the same String
    @binary_input = @input.force_encoding Encoding::BINARY
  end

  @s = StringScanner.new input
end

skip(token_type, error = true) click to toggle source

Skips the next token if its type is token_type.

Optionally raises an error if the next token is not of the expected type.

# File lib/rdoc/markup/parser.rb, line 445
##
# Consumes the next token when its type is +token_type+ and returns it.
#
# Returns nil at the end of the stream.  When the next token has another
# type it is put back on the stream; ParseError is then raised if +error+
# is true, otherwise nil is returned.

def skip token_type, error = true
  next_type, = get

  return unless next_type # end of stream
  return @current_token if token_type == next_type

  unget
  return unless error

  raise ParseError, "expected #{token_type} got #{@current_token.inspect}"
end

token_pos(byte_offset) click to toggle source

Calculates the column (by character) and line of the current token based on byte_offset.

# File lib/rdoc/markup/parser.rb, line 541
##
# Returns [column, line] for the token that starts at +byte_offset+.  The
# column is the character offset (see #char_pos) relative to the start of
# the current line; the line is the current line counter.

def token_pos byte_offset
  char_offset = char_pos byte_offset
  column = char_offset - @line_pos

  [column, @line]
end

tokenize(input) click to toggle source

Turns text input into a stream of tokens

# File lib/rdoc/markup/parser.rb, line 456
##
# Turns text +input+ into a stream of tokens appended to the tokens array.
#
# Every token is an array of [type, data, column, line].  Runs of spaces
# produce no token of their own; they show up as the column of the next
# token.  A header produces a :HEADER token followed by a :TEXT token when
# text follows on the same line, and text that ends in the trailing-space
# pattern produces a :TEXT token followed by a :BREAK token.
#
# Scanner positions are byte offsets, so every offset computed here is
# kept in bytes and only converted to characters by #token_pos.
#
# Raises ParseError if an ordered-list label cannot be classified.
#
# Returns self.

def tokenize input
  setup_scanner input

  until @s.eos? do
    pos = @s.pos

    # leading spaces will be reflected by the column of the next token
    # the only thing we lose are trailing spaces at the end of the file
    next if @s.scan(/ +/)

    # note: after BULLET, LABEL, etc.,
    # indent will be the column of the next non-newline token

    @tokens << case
               # [CR]LF => :NEWLINE
               when @s.scan(/\r?\n/) then
                 token = [:NEWLINE, @s.matched, *token_pos(pos)]
                 @line_pos = char_pos @s.pos
                 @line += 1
                 token
               # === text => :HEADER then :TEXT
               when @s.scan(/(=+)(\s*)/) then
                 level = @s[1].length
                 header = [:HEADER, level, *token_pos(pos)]

                 if @s[2] =~ /^\r?\n/ then
                   # nothing but a line end after the '=': give the
                   # whitespace back so the NEWLINE is tokenized normally
                   @s.pos -= @s[2].bytesize
                   header
                 else
                   pos = @s.pos
                   @s.scan(/.*/)
                   @tokens << header
                   [:TEXT, @s.matched.sub(/\r$/, ''), *token_pos(pos)]
                 end
               # --- (at least 3) and nothing else on the line => :RULE
               when @s.scan(/(-{3,}) *\r?$/) then
                 [:RULE, @s[1].length - 2, *token_pos(pos)]
               # * or - followed by white space and text => :BULLET
               when @s.scan(/([*-]) +(\S)/) then
                 @s.pos -= @s[2].bytesize # unget \S
                 [:BULLET, @s[1], *token_pos(pos)]
               # A. text, a. text, 12. text => :UALPHA, :LALPHA, :NUMBER
               when @s.scan(/([a-z]|\d+)\. +(\S)/i) then
                 # FIXME if tab(s), the column will be wrong
                 # either support tabs everywhere by first expanding them to
                 # spaces, or assume that they will have been replaced
                 # before (and provide a check for that at least in debug
                 # mode)
                 list_label = @s[1]
                 @s.pos -= @s[2].bytesize # unget \S
                 list_type =
                   case list_label
                   when /[a-z]/ then :LALPHA
                   when /[A-Z]/ then :UALPHA
                   when /\d/    then :NUMBER
                   else
                     raise ParseError, "BUG token #{list_label}"
                   end
                 [list_type, list_label, *token_pos(pos)]
               # [text] followed by spaces or end of line => :LABEL
               when @s.scan(/\[(.*?)\]( +|\r?$)/) then
                 [:LABEL, @s[1], *token_pos(pos)]
               # text:: followed by spaces or end of line => :NOTE
               when @s.scan(/(.*?)::( +|\r?$)/) then
                 [:NOTE, @s[1], *token_pos(pos)]
               # anything else: :TEXT
               else
                 @s.scan(/(.*?)( )?\r?$/)
                 token = [:TEXT, @s[1], *token_pos(pos)]

                 if @s[2] then
                   @tokens << token
                   # pos is a byte offset, so advance by the byte size of
                   # the text (not its character count) to keep the BREAK
                   # column right after multibyte text
                   [:BREAK, @s[2], *token_pos(pos + @s[1].bytesize)]
                 else
                   token
                 end
               end
  end

  self
end

unget() click to toggle source

Returns the current token to the token stream

# File lib/rdoc/markup/parser.rb, line 550
##
# Puts the current token (the one most recently returned by #get) back at
# the front of the token stream.
#
# Does nothing when there is no current token, for example after #get hit
# the end of the stream; that case is no longer reported as a double unget.
#
# Raises Error when the current token is already at the front of the
# stream, i.e. it has been ungotten before.

def unget
  token = @current_token
  p :unget => token if @debug

  # nothing was consumed, so there is nothing to put back
  return unless token

  raise Error, 'too many #ungets' if token == @tokens.first

  @tokens.unshift token
end