Lexer for a small language in D

Question 1

I am writing an interpreter for a simple mathematical programming language.

Below is a simple program written in it that prints 41.22:

let $x = 35; # This is a comment
$x + 6.22;

Below is the code for the lexer. As this is my first attempt at writing an interpreter, I would appreciate any general suggestions for improvement. I am also new to the D programming language, so I would like to know if it could be made more idiomatic.

import std.stdio;
import std.format;
import std.ascii : isAlpha, isDigit;
void main() {
 Token token;
 Lexer lexer;
 lexer.fileName = "test";
 lexer.input("let $x = 35;\n$x + 6.22;");
 while (token.type != TokenType.eof) {
 try token = lexer.lex();
 catch (Panic) break;
 writeln(token);
 }
}
enum State {
 none,
 number,
 keyword,
 variable,
 comment
}
enum TokenType {
 number,
 keyword,
 operator,
 variable,
 semicolon,
 eof
}
struct Token {
 TokenType type;
 string value;
}
class Panic : Exception {
 // Thrown when the 'panic' method of 'Lexer' is called
 this(string msg) {
 super(msg);
 }
}
bool isWhitespace(int c) {
 return (c == ' ' || c == '\t');
}
bool isOperator(int c) {
 return (c == '=' || c == '+' ||
 c == '-' || c == '*' ||
 c == '/' || c == '%');
}
bool isDelimiter(int c) {
 return (c == '\n' || c == EOF || c == ';' ||
 isWhitespace(c) || isOperator(c));
}
struct Lexer {
 string code;
 string fileName;
 uint lineNum = 1;
 uint colNum = 1;
 int c;
 State state = State.none;
 void panic(string message) {
 writefln("Syntax ERROR in '%s' at line %d, column %d:",
 fileName, lineNum, colNum);
 writeln(message);
 throw new Panic("Syntax error");
 }
 void input(string code) {
 this.code = code;
 if (code == "") c = EOF;
 else c = code[0];
 }
 void nextChar() {
 colNum++;
 code = code[1..$];
 if (code == "") {
 c = EOF;
 } else {
 c = code[0];
 }
 }
 Token lex() {
 // Returns the next token
 Token token;
 while (1) {
 switch (state) {
 case State.none:
 if (c == EOF) {
 token.type = TokenType.eof;
 return token;
 } else if (isWhitespace(c)) {
 nextChar();
 } else if (c == '\n') {
 lineNum++;
 nextChar();
 colNum = 1;
 } else if (c == '#') {
 nextChar();
 state = State.comment;
 } else if (c == ';') {
 token.type = TokenType.semicolon;
 nextChar();
 return token;
 } else if (c == '$') {
 nextChar();
 state = State.variable;
 } else if (isAlpha(c)) {
 state = State.keyword;
 } else if (isOperator(c)) {
 token.value ~= c;
 token.type = TokenType.operator;
 nextChar();
 return token;
 } else if (isDigit(c)) {
 state = State.number;
 } else {
 panic(format("Unexpected character `%c`",
 cast(char)(c)));
 }
 break;
 case State.comment:
 while (c != '\n' && c != EOF) nextChar();
 state = State.none;
 break;
 case State.variable:
 while(!isDelimiter(c)) {
 token.value ~= c;
 nextChar();
 }
 state = State.none;
 token.type = TokenType.variable;
 return token;
 case State.keyword:
 while (!isDelimiter(c)) {
 token.value ~= c;
 nextChar();
 }
 state = State.none;
 token.type = TokenType.keyword;
 return token;
 case State.number:
 while (isDigit(c) || c == '.') {
 token.value ~= c;
 nextChar();
 }
 state = State.none;
 token.type = TokenType.number;
 return token;
 default:
 assert(0);
 }
 }
 }
}

Question 2

Looks very nice - clean code, readable, very good names.

You're sometimes using brackets, sometimes not (compare input and nextChar, e.g.). I suggest choosing one of these and sticking to it.
Range primitives could simplify and improve parts of your code, and is considered very idiomatic D.

a. Your code can be made Unicode compatible simply by using std.range.primitives.front and std.range.primitives.popFront in place of code[0] and code = code[1..$]:
```
void nextChar() {
 colNum++;
 code.popFront();
 if (code == "") c = EOF;
 else c = code.front;
}
```
b. c can be replaced by a wrapper around std.range.primitives.front and std.range.primitives.empty:
```
@property
dchar c() {
 if (code.empty) return EOF;
 return code.front;
}
```
c. Consider making Lexer a template that can take different range types. If you have done 2a and 2b, that should be as simple as:
```
struct Lexer(R) if (isInputRange!R && is(ElementType!R == dchar)) {
 R code;
 // Leave remaining code as is.
}
```
I'd place the comment on the Panic type itself, not on its constructor, since the comment describes the type, and one generally cares more about the type than the constructor.
isWhitespace, isOperator, and isDelimiter can be simplified with std.algorithm.comparison.among:
```
bool isOperator(int c) {
 return !!c.among('=', '+', '-', '*', '/', '%');
}
```
(among returns the index of c in the list of items. The double exclamation mark turns the number into a bool)
c is an int, but is always used as a dchar. Changing its type to dchar, I get an error message complaining that EOF can't be assigned to it. Consider redefining EOF as enum dchar EOF = dchar.init;, maybe?

One thing that would disappear if the above advice is followed, but might be worth pointing out still:
The second half of nextChar is identical to input. nextChar's body could be simplified to:
```
void nextChar() {
 colNum++;
 input(code[1..$]);
}
```

Thought of one more thing that might be worth considering:

panic exhibits a conflation of responsibilities. Consider putting more information in the exception, and handling the output of error information at lex's call site:

void main() {
 // ...
 try token = lexer.lex();
 catch (Panic p) {
 writeln(p);
 break;
 }
 // ...
}
class Panic : Exception {
 string fileName;
 uint lineNum;
 uint colNum;
 this(string msg, string file, uint line, uint col) {
 super(msg);
 fileName = file;
 lineNum = line;
 colNum = col;
 }
 override string toString() {
 return format("Syntax ERROR in '%s' at line %d, column %d:\n%s",
 fileName, lineNum, colNum, msg);
 }
}
struct Lexer {
 //...
 void panic(string message) {
 throw new Panic(message, fileName, lineNum, colNum);
 }
 //...
}

This way, the error information is available to other pieces of your code, and the lexer can be used in programs that don't use the console.

Related, consider a different name for the type Panic. In the code you've shown, it's a SyntaxErrorException, and the name Panic doesn't convey any information about what's actually gone wrong. The same argument applies the function panic.

BioTronic BioTronic 2411 silver badge3 bronze badges · Accepted Answer · 2018-05-28 09:05:15Z

Looks very nice - clean code, readable, very good names.

You're sometimes using brackets, sometimes not (compare input and nextChar, e.g.). I suggest choosing one of these and sticking to it.
Range primitives could simplify and improve parts of your code, and is considered very idiomatic D.

a. Your code can be made Unicode compatible simply by using std.range.primitives.front and std.range.primitives.popFront in place of code[0] and code = code[1..$]:
```
void nextChar() {
 colNum++;
 code.popFront();
 if (code == "") c = EOF;
 else c = code.front;
}
```
b. c can be replaced by a wrapper around std.range.primitives.front and std.range.primitives.empty:
```
@property
dchar c() {
 if (code.empty) return EOF;
 return code.front;
}
```
c. Consider making Lexer a template that can take different range types. If you have done 2a and 2b, that should be as simple as:
```
struct Lexer(R) if (isInputRange!R && is(ElementType!R == dchar)) {
 R code;
 // Leave remaining code as is.
}
```
I'd place the comment on the Panic type itself, not on its constructor, since the comment describes the type, and one generally cares more about the type than the constructor.
isWhitespace, isOperator, and isDelimiter can be simplified with std.algorithm.comparison.among:
```
bool isOperator(int c) {
 return !!c.among('=', '+', '-', '*', '/', '%');
}
```
(among returns the index of c in the list of items. The double exclamation mark turns the number into a bool)
c is an int, but is always used as a dchar. Changing its type to dchar, I get an error message complaining that EOF can't be assigned to it. Consider redefining EOF as enum dchar EOF = dchar.init;, maybe?

One thing that would disappear if the above advice is followed, but might be worth pointing out still:
The second half of nextChar is identical to input. nextChar's body could be simplified to:
```
void nextChar() {
 colNum++;
 input(code[1..$]);
}
```

Thought of one more thing that might be worth considering:

panic exhibits a conflation of responsibilities. Consider putting more information in the exception, and handling the output of error information at lex's call site:

void main() {
 // ...
 try token = lexer.lex();
 catch (Panic p) {
 writeln(p);
 break;
 }
 // ...
}
class Panic : Exception {
 string fileName;
 uint lineNum;
 uint colNum;
 this(string msg, string file, uint line, uint col) {
 super(msg);
 fileName = file;
 lineNum = line;
 colNum = col;
 }
 override string toString() {
 return format("Syntax ERROR in '%s' at line %d, column %d:\n%s",
 fileName, lineNum, colNum, msg);
 }
}
struct Lexer {
 //...
 void panic(string message) {
 throw new Panic(message, fileName, lineNum, colNum);
 }
 //...
}

This way, the error information is available to other pieces of your code, and the lexer can be used in programs that don't use the console.

Related, consider a different name for the type Panic. In the code you've shown, it's a SyntaxErrorException, and the name Panic doesn't convey any information about what's actually gone wrong. The same argument applies the function panic.

Stack Exchange Network

Lexer for a small language in D

1 Answer 1

Your Answer

Sign up or log in

Post as a guest

Post as a guest

Hot Network Questions