7
\$\begingroup\$

I am implementing a lexer for an experimental language by the name "Phoenix". The language supports the four primary arithmetic operations for integers only (add, subtract, multiply, divide), variable assignment, and a print statement.

Typical input:

# this is a comment
value = 1 + 7 / 4 * (30)
print('Value = ', value)

lexer.go:

package lexer
import (
 "bytes"
 "fmt"
 "io"
 "os"
 "phoenix/log"
 "phoenix/token"
 "unicode"
)
// Lexer reads a Phoenix source file byte by byte and produces tokens
// on demand via NextToken.
type Lexer struct {
 SourceFileName string // path of the file being tokenised (used in error messages)
 fh *os.File // open handle to the source file; read one byte at a time
 UnScannedChar byte // most recently read byte that has not been turned into a token yet
 CurrentLine, CurrentCol uint64 // position of UnScannedChar; line is 1-based, col resets to 0 after each newline
}
// New initialises the lexer for the given source file. On failure to
// open the file it logs the problem and terminates the process, so
// callers may assume a usable Lexer afterwards.
func (l *Lexer) New(fileName string) {
	fh, err := os.Open(fileName)
	if err != nil {
		log.Error(fmt.Sprintf("couldn't open file '%v'", fileName))
		// Exit with a non-zero status: exiting with 0 would tell the
		// shell the run succeeded even though we could not open the file.
		os.Exit(1)
	}
	// Only install the handle once we know Open succeeded.
	l.fh = fh
	l.SourceFileName = fileName
	l.CurrentLine = 1 // lines are 1-based; CurrentCol keeps its zero value
}
// NextChar reads one byte from the source file into Lexer.UnScannedChar
// and advances the line/column counters. It returns io.EOF at end of
// input (leaving UnScannedChar untouched) and terminates the process on
// any other read error.
func (l *Lexer) NextChar() error {
	// A fixed-size stack array avoids the per-call heap allocation of
	// make([]byte, 1) on this very hot path.
	var buf [1]byte
	if _, err := l.fh.Read(buf[:]); err != nil {
		if err == io.EOF {
			return io.EOF
		}
		log.Error(fmt.Sprintf("error while reading from file '%v'",
			l.SourceFileName))
		os.Exit(1) // non-zero: this is a failure, not a normal exit
	}
	l.UnScannedChar = buf[0]
	if l.UnScannedChar == '\n' {
		l.CurrentLine++
		l.CurrentCol = 0
	} else {
		l.CurrentCol++
	}
	return nil
}
// isIdentifierStart reports whether the current unscanned character may
// begin an identifier: an underscore or any letter.
func (l *Lexer) isIdentifierStart() bool {
	if l.UnScannedChar == '_' {
		return true
	}
	return unicode.IsLetter(rune(l.UnScannedChar))
}
// isIdentifierPart reports whether the current unscanned character may
// appear inside an identifier: a digit or an identifier-start character.
func (l *Lexer) isIdentifierPart() bool {
	return unicode.IsNumber(rune(l.UnScannedChar)) || l.isIdentifierStart()
}
// scanInteger consumes a run of decimal digits starting at the current
// unscanned character and returns it as an INTEGER token.
func (l *Lexer) scanInteger() (newToken token.Token) {
	var buffer bytes.Buffer
	// WriteByte instead of WriteString(string(byte)): the string
	// conversion re-encodes bytes >127 as multi-byte UTF-8 and allocates.
	buffer.WriteByte(l.UnScannedChar)
	for {
		if l.NextChar() == io.EOF {
			// End of input terminates the literal; nothing to push back.
			// Previously the EOF error was ignored, so UnScannedChar kept
			// its last (digit) value and this loop never terminated.
			break
		}
		if !unicode.IsDigit(rune(l.UnScannedChar)) {
			// Push the terminating character back so the next NextToken()
			// call (which starts with NextChar) does not skip it — without
			// this, "1+7" lexed as 1 then 7, silently dropping the '+'.
			// NOTE(review): like scanID, this leaves CurrentLine/CurrentCol
			// one position ahead until the byte is re-read — confirm whether
			// that matters for error reporting.
			l.fh.Seek(-1, io.SeekCurrent)
			break
		}
		buffer.WriteByte(l.UnScannedChar)
	}
	newToken.Rep = token.INTEGER
	newToken.Data = buffer.String()
	return
}
// scanID consumes an identifier starting at the current unscanned
// character and returns it as an ID token, or as the PRINT keyword
// token when the text is "print".
func (l *Lexer) scanID() (newToken token.Token) {
	var buffer bytes.Buffer
	// WriteByte avoids the allocation and UTF-8 re-encoding done by
	// WriteString(string(byte)).
	buffer.WriteByte(l.UnScannedChar)
	for {
		if l.NextChar() == io.EOF {
			// Previously the EOF error was ignored, so UnScannedChar kept
			// its last identifier character and this loop never terminated
			// for a file ending mid-identifier. Stop; nothing to push back.
			break
		}
		if !l.isIdentifierPart() {
			/* Push the terminating character back so the next call of
			NextToken() (which starts with NextChar) does not skip it.
			io.SeekCurrent replaces the deprecated os.SEEK_CUR constant.
			Only seek back when a real character ended the loop — seeking
			back after EOF would re-read the final byte. */
			l.fh.Seek(-1, io.SeekCurrent)
			break
		}
		buffer.WriteByte(l.UnScannedChar)
	}
	newToken.Data = buffer.String()
	switch newToken.Data {
	case "print":
		newToken.Rep = token.PRINT
	default:
		newToken.Rep = token.ID
	}
	return
}
// scanString consumes a single-quoted string literal and returns it as
// a STRING token. The opening quote has already been read; the closing
// quote is consumed but not stored in the token data.
func (l *Lexer) scanString() (newToken token.Token) {
	var buffer bytes.Buffer
	for {
		if l.NextChar() == io.EOF {
			// Unterminated literal: previously the EOF error was ignored,
			// so UnScannedChar never became '\'' and this looped forever.
			// Report the problem and return what was collected.
			log.Error(fmt.Sprintf(
				"at file %v, line %v: unterminated string literal",
				l.SourceFileName, l.CurrentLine))
			break
		}
		if l.UnScannedChar == '\'' {
			break
		}
		// WriteByte avoids the allocation done by WriteString(string(byte)).
		buffer.WriteByte(l.UnScannedChar)
	}
	newToken.Rep = token.STRING
	newToken.Data = buffer.String()
	return
}
// NextToken scans forward from the current position and returns the
// next token, silently skipping whitespace and comments along the way.
func (l *Lexer) NextToken() (newToken token.Token) {
	// A loop instead of tail recursion: whitespace and comments simply
	// restart the scan on the following character.
	for {
		if l.NextChar() == io.EOF {
			newToken.Rep = token.EOF
			return
		}
		// Multi-character tokens are handed off to dedicated scanners.
		if unicode.IsDigit(rune(l.UnScannedChar)) {
			return l.scanInteger()
		}
		if l.isIdentifierStart() {
			return l.scanID()
		}
		if l.UnScannedChar == '\'' {
			return l.scanString()
		}
		if l.UnScannedChar == '#' {
			// A comment runs to the end of the line. The terminating
			// newline is consumed along with it, so a trailing comment
			// never yields a NEWLINE token.
			for l.UnScannedChar != '\n' {
				if l.NextChar() == io.EOF {
					newToken.Rep = token.EOF
					return
				}
			}
			continue
		}
		switch l.UnScannedChar {
		case '\f', '\t', '\r', ' ':
			continue // insignificant whitespace: rescan
		case '\n':
			newToken.Rep = token.NEWLINE
		case '(':
			newToken.Rep = token.LPAREN
		case ')':
			newToken.Rep = token.RPAREN
		case ',':
			newToken.Rep = token.COMMA
		case '+':
			newToken.Rep = token.ADD
		case '-':
			newToken.Rep = token.SUB
		case '*':
			newToken.Rep = token.MUL
		case '/':
			newToken.Rep = token.DIV
		case '=':
			newToken.Rep = token.ASSIGN
		default:
			log.Error(fmt.Sprintf(
				"at file %v, line %v: unknown character '%c'", l.SourceFileName,
				l.CurrentLine, l.UnScannedChar))
			newToken.Rep = token.UNKNOWN
		}
		return
	}
}

token.go:

package token
// Token kinds for the Phoenix language, declared as consecutive rune
// values via iota. The zero value is SINGLEQ. REAL is declared but not
// yet produced by the lexer.
const (
 SINGLEQ rune = iota // single-quote character (')
 LPAREN // (
 RPAREN // )
 COMMA // ,
 ASSIGN // =
 ADD // +
 SUB // -
 MUL // *
 DIV // /
 STRING // single-quoted string literal
 INTEGER // decimal integer literal
 REAL // reserved for real-number literals (unused so far)
 ID // identifier
 PRINT // the "print" keyword
 NEWLINE // end of a source line
 EOF // end of input
 UNKNOWN // any character the lexer does not recognise
)
// Token is a single lexical unit produced by the lexer.
type Token struct {
 Rep rune // token kind; one of the constants declared in this package
 Data string // literal text for STRING/INTEGER/ID/PRINT tokens; empty otherwise
}
// String returns a human-readable name for the token kind: literal
// punctuation tokens render as themselves, the rest as a descriptive
// word. Any unrecognised kind renders as "unknown".
func (t Token) String() string {
	switch t.Rep {
	case SINGLEQ:
		return "'"
	case LPAREN:
		return "("
	case RPAREN:
		return ")"
	case COMMA:
		return ","
	case ASSIGN:
		return "="
	case ADD:
		return "+"
	case SUB:
		return "-"
	case MUL:
		return "*"
	case DIV:
		return "/"
	case STRING:
		return "string"
	case INTEGER:
		return "integer"
	case REAL:
		// Previously missing: REAL fell through to "unknown" even though
		// the kind is declared in this package.
		return "real"
	case ID:
		return "identifier"
	case PRINT:
		return "print"
	case NEWLINE:
		return "newline"
	case EOF:
		return "EOF"
	case UNKNOWN:
		return "unknown"
	}
	return "unknown"
}

I need the lexer to be as fast as possible and the code should be flexible to implement more extensions/features for the language.

So how can it be improved?

asked Jun 28, 2016 at 15:51
\$\endgroup\$

2 Answers 2

2
\$\begingroup\$

This will not be a full review, but rather a couple of improvements you can make. Firstly, why do all those string <-> []byte conversions when writing into a buffer? You can just do e.g.

buffer.Write([]byte{l.UnScannedChar})

instead of

buffer.WriteString(string(l.UnScannedChar))

Also, since you're only using file's Read and Seek methods, why not change os.File to io.ReadSeeker?

answered Jun 29, 2016 at 11:58
\$\endgroup\$
2
\$\begingroup\$

You should definitely not call os.Exit() from any library function; return the error instead. I would not log unhandled errors either.

New has nothing to do with the Lexer as input, it is more like a constructor, so it could be a "static" method returning a Lexer instance.

answered Jul 3, 2016 at 22:29
\$\endgroup\$

Your Answer

Draft saved
Draft discarded

Sign up or log in

Sign up using Google
Sign up using Email and Password

Post as a guest

Required, but never shown

Post as a guest

Required, but never shown

By clicking "Post Your Answer", you agree to our terms of service and acknowledge you have read our privacy policy.

Start asking to get answers

Find the answer to your question by asking.

Ask question

Explore related questions

See similar questions with these tags.