7
\$\begingroup\$

I am implementing a lexer for an experimental language by the name "Phoenix". The language supports the four primary arithmetic operations for integers only (add, subtract, multiply, divide), variable assignment, and a print statement.

Typical input:

# this is a comment
value = 1 + 7 / 4 * (30)
print('Value = ', value)

lexer.go:

package lexer
import (
 "bytes"
 "fmt"
 "io"
 "os"
 "phoenix/log"
 "phoenix/token"
 "unicode"
)
// Lexer reads a Phoenix source file byte by byte and produces tokens
// on demand via NextToken.
type Lexer struct {
 SourceFileName string // path of the file being tokenised (used in error messages)
 fh *os.File // open handle to the source file; read one byte at a time
 UnScannedChar byte // most recently read byte that has not been turned into a token yet
 CurrentLine, CurrentCol uint64 // position of UnScannedChar; line is 1-based, col resets to 0 after each newline
}
// New initialises the lexer for the given source file. On failure to
// open the file it logs the problem and terminates the process, so
// callers may assume a usable Lexer afterwards.
func (l *Lexer) New(fileName string) {
	fh, err := os.Open(fileName)
	if err != nil {
		log.Error(fmt.Sprintf("couldn't open file '%v'", fileName))
		// Exit with a non-zero status: exiting with 0 would tell the
		// shell the run succeeded even though we could not open the file.
		os.Exit(1)
	}
	// Only install the handle once we know Open succeeded.
	l.fh = fh
	l.SourceFileName = fileName
	l.CurrentLine = 1 // lines are 1-based; CurrentCol keeps its zero value
}
// NextChar reads one byte from the source file into Lexer.UnScannedChar
// and advances the line/column counters. It returns io.EOF at end of
// input (leaving UnScannedChar untouched) and terminates the process on
// any other read error.
func (l *Lexer) NextChar() error {
	// A fixed-size stack array avoids the per-call heap allocation of
	// make([]byte, 1) on this very hot path.
	var buf [1]byte
	if _, err := l.fh.Read(buf[:]); err != nil {
		if err == io.EOF {
			return io.EOF
		}
		log.Error(fmt.Sprintf("error while reading from file '%v'",
			l.SourceFileName))
		os.Exit(1) // non-zero: this is a failure, not a normal exit
	}
	l.UnScannedChar = buf[0]
	if l.UnScannedChar == '\n' {
		l.CurrentLine++
		l.CurrentCol = 0
	} else {
		l.CurrentCol++
	}
	return nil
}
// isIdentifierStart reports whether the current unscanned character may
// begin an identifier: an underscore or any letter.
func (l *Lexer) isIdentifierStart() bool {
	if l.UnScannedChar == '_' {
		return true
	}
	return unicode.IsLetter(rune(l.UnScannedChar))
}
// isIdentifierPart reports whether the current unscanned character may
// appear inside an identifier: a digit or an identifier-start character.
func (l *Lexer) isIdentifierPart() bool {
	return unicode.IsNumber(rune(l.UnScannedChar)) || l.isIdentifierStart()
}
// scanInteger consumes a run of decimal digits starting at the current
// unscanned character and returns it as an INTEGER token.
func (l *Lexer) scanInteger() (newToken token.Token) {
	var buffer bytes.Buffer
	// WriteByte instead of WriteString(string(byte)): the string
	// conversion re-encodes bytes >127 as multi-byte UTF-8 and allocates.
	buffer.WriteByte(l.UnScannedChar)
	for {
		if l.NextChar() == io.EOF {
			// End of input terminates the literal; nothing to push back.
			// Previously the EOF error was ignored, so UnScannedChar kept
			// its last (digit) value and this loop never terminated.
			break
		}
		if !unicode.IsDigit(rune(l.UnScannedChar)) {
			// Push the terminating character back so the next NextToken()
			// call (which starts with NextChar) does not skip it — without
			// this, "1+7" lexed as 1 then 7, silently dropping the '+'.
			// NOTE(review): like scanID, this leaves CurrentLine/CurrentCol
			// one position ahead until the byte is re-read — confirm whether
			// that matters for error reporting.
			l.fh.Seek(-1, io.SeekCurrent)
			break
		}
		buffer.WriteByte(l.UnScannedChar)
	}
	newToken.Rep = token.INTEGER
	newToken.Data = buffer.String()
	return
}
// scanID consumes an identifier starting at the current unscanned
// character and returns it as an ID token, or as the PRINT keyword
// token when the text is "print".
func (l *Lexer) scanID() (newToken token.Token) {
	var buffer bytes.Buffer
	// WriteByte avoids the allocation and UTF-8 re-encoding done by
	// WriteString(string(byte)).
	buffer.WriteByte(l.UnScannedChar)
	for {
		if l.NextChar() == io.EOF {
			// Previously the EOF error was ignored, so UnScannedChar kept
			// its last identifier character and this loop never terminated
			// for a file ending mid-identifier. Stop; nothing to push back.
			break
		}
		if !l.isIdentifierPart() {
			/* Push the terminating character back so the next call of
			NextToken() (which starts with NextChar) does not skip it.
			io.SeekCurrent replaces the deprecated os.SEEK_CUR constant.
			Only seek back when a real character ended the loop — seeking
			back after EOF would re-read the final byte. */
			l.fh.Seek(-1, io.SeekCurrent)
			break
		}
		buffer.WriteByte(l.UnScannedChar)
	}
	newToken.Data = buffer.String()
	switch newToken.Data {
	case "print":
		newToken.Rep = token.PRINT
	default:
		newToken.Rep = token.ID
	}
	return
}
// scanString consumes a single-quoted string literal and returns it as
// a STRING token. The opening quote has already been read; the closing
// quote is consumed but not stored in the token data.
func (l *Lexer) scanString() (newToken token.Token) {
	var buffer bytes.Buffer
	for {
		if l.NextChar() == io.EOF {
			// Unterminated literal: previously the EOF error was ignored,
			// so UnScannedChar never became '\'' and this looped forever.
			// Report the problem and return what was collected.
			log.Error(fmt.Sprintf(
				"at file %v, line %v: unterminated string literal",
				l.SourceFileName, l.CurrentLine))
			break
		}
		if l.UnScannedChar == '\'' {
			break
		}
		// WriteByte avoids the allocation done by WriteString(string(byte)).
		buffer.WriteByte(l.UnScannedChar)
	}
	newToken.Rep = token.STRING
	newToken.Data = buffer.String()
	return
}
// NextToken scans forward from the current position and returns the
// next token, silently skipping whitespace and comments along the way.
func (l *Lexer) NextToken() (newToken token.Token) {
	// A loop instead of tail recursion: whitespace and comments simply
	// restart the scan on the following character.
	for {
		if l.NextChar() == io.EOF {
			newToken.Rep = token.EOF
			return
		}
		// Multi-character tokens are handed off to dedicated scanners.
		if unicode.IsDigit(rune(l.UnScannedChar)) {
			return l.scanInteger()
		}
		if l.isIdentifierStart() {
			return l.scanID()
		}
		if l.UnScannedChar == '\'' {
			return l.scanString()
		}
		if l.UnScannedChar == '#' {
			// A comment runs to the end of the line. The terminating
			// newline is consumed along with it, so a trailing comment
			// never yields a NEWLINE token.
			for l.UnScannedChar != '\n' {
				if l.NextChar() == io.EOF {
					newToken.Rep = token.EOF
					return
				}
			}
			continue
		}
		switch l.UnScannedChar {
		case '\f', '\t', '\r', ' ':
			continue // insignificant whitespace: rescan
		case '\n':
			newToken.Rep = token.NEWLINE
		case '(':
			newToken.Rep = token.LPAREN
		case ')':
			newToken.Rep = token.RPAREN
		case ',':
			newToken.Rep = token.COMMA
		case '+':
			newToken.Rep = token.ADD
		case '-':
			newToken.Rep = token.SUB
		case '*':
			newToken.Rep = token.MUL
		case '/':
			newToken.Rep = token.DIV
		case '=':
			newToken.Rep = token.ASSIGN
		default:
			log.Error(fmt.Sprintf(
				"at file %v, line %v: unknown character '%c'", l.SourceFileName,
				l.CurrentLine, l.UnScannedChar))
			newToken.Rep = token.UNKNOWN
		}
		return
	}
}

token.go:

package token
// Token kinds for the Phoenix language, declared as consecutive rune
// values via iota. The zero value is SINGLEQ. REAL is declared but not
// yet produced by the lexer.
const (
 SINGLEQ rune = iota // single-quote character (')
 LPAREN // (
 RPAREN // )
 COMMA // ,
 ASSIGN // =
 ADD // +
 SUB // -
 MUL // *
 DIV // /
 STRING // single-quoted string literal
 INTEGER // decimal integer literal
 REAL // reserved for real-number literals (unused so far)
 ID // identifier
 PRINT // the "print" keyword
 NEWLINE // end of a source line
 EOF // end of input
 UNKNOWN // any character the lexer does not recognise
)
// Token is a single lexical unit produced by the lexer.
type Token struct {
 Rep rune // token kind; one of the constants declared in this package
 Data string // literal text for STRING/INTEGER/ID/PRINT tokens; empty otherwise
}
// String returns a human-readable name for the token kind: literal
// punctuation tokens render as themselves, the rest as a descriptive
// word. Any unrecognised kind renders as "unknown".
func (t Token) String() string {
	switch t.Rep {
	case SINGLEQ:
		return "'"
	case LPAREN:
		return "("
	case RPAREN:
		return ")"
	case COMMA:
		return ","
	case ASSIGN:
		return "="
	case ADD:
		return "+"
	case SUB:
		return "-"
	case MUL:
		return "*"
	case DIV:
		return "/"
	case STRING:
		return "string"
	case INTEGER:
		return "integer"
	case REAL:
		// Previously missing: REAL fell through to "unknown" even though
		// the kind is declared in this package.
		return "real"
	case ID:
		return "identifier"
	case PRINT:
		return "print"
	case NEWLINE:
		return "newline"
	case EOF:
		return "EOF"
	case UNKNOWN:
		return "unknown"
	}
	return "unknown"
}

I need the lexer to be as fast as possible and the code should be flexible to implement more extensions/features for the language.

So how can it be improved?

asked Jun 28, 2016 at 15:51
\$\endgroup\$

2 Answers 2

2
\$\begingroup\$

This will not be a full review, but rather a couple of improvements you can make. Firstly, why do all those string <-> []byte conversions when writing into a buffer? You can just do e.g.

buffer.Write([]byte{l.UnScannedChar})

instead of

buffer.WriteString(string(l.UnScannedChar))

Also, since you're only using file's Read and Seek methods, why not change os.File to io.ReadSeeker?

answered Jun 29, 2016 at 11:58
\$\endgroup\$
2
\$\begingroup\$

You should definitely not call os.Exit() from any library function; return the error instead. I would not log unhandled errors either.

New has nothing to do with the Lexer as input, it is more like a constructor, so it could be a "static" method returning a Lexer instance.

answered Jul 3, 2016 at 22:29
\$\endgroup\$

Your Answer

Draft saved
Draft discarded

Sign up or log in

Sign up using Google
Sign up using Email and Password

Post as a guest

Required, but never shown

Post as a guest

Required, but never shown

By clicking "Post Your Answer", you agree to our terms of service and acknowledge you have read our privacy policy.

Start asking to get answers

Find the answer to your question by asking.

Ask question

Explore related questions

See similar questions with these tags.