I am writing an interpreter for a simple mathematical programming language.
Below is a simple program written in it that prints 41.22
:
let $x = 35; # This is a comment
$x + 6.22;
Below is the code for the lexer. As this is my first attempt at writing an interpreter, I would appreciate any general suggestions for improvement. I am also new to the D programming language, so I would like to know if it could be made more idiomatic.
import std.stdio;
import std.format;
import std.ascii : isAlpha, isDigit;
void main() {
Token token;
Lexer lexer;
lexer.fileName = "test";
lexer.input("let $x = 35;\n$x + 6.22;");
while (token.type != TokenType.eof) {
try token = lexer.lex();
catch (Panic) break;
writeln(token);
}
}
enum State {
none,
number,
keyword,
variable,
comment
}
enum TokenType {
number,
keyword,
operator,
variable,
semicolon,
eof
}
struct Token {
TokenType type;
string value;
}
class Panic : Exception {
// Thrown when the 'panic' method of 'Lexer' is called
this(string msg) {
super(msg);
}
}
bool isWhitespace(int c) {
return (c == ' ' || c == '\t');
}
bool isOperator(int c) {
return (c == '=' || c == '+' ||
c == '-' || c == '*' ||
c == '/' || c == '%');
}
bool isDelimiter(int c) {
return (c == '\n' || c == EOF || c == ';' ||
isWhitespace(c) || isOperator(c));
}
struct Lexer {
string code;
string fileName;
uint lineNum = 1;
uint colNum = 1;
int c;
State state = State.none;
void panic(string message) {
writefln("Syntax ERROR in '%s' at line %d, column %d:",
fileName, lineNum, colNum);
writeln(message);
throw new Panic("Syntax error");
}
void input(string code) {
this.code = code;
if (code == "") c = EOF;
else c = code[0];
}
void nextChar() {
colNum++;
code = code[1..$];
if (code == "") {
c = EOF;
} else {
c = code[0];
}
}
Token lex() {
// Returns the next token
Token token;
while (1) {
switch (state) {
case State.none:
if (c == EOF) {
token.type = TokenType.eof;
return token;
} else if (isWhitespace(c)) {
nextChar();
} else if (c == '\n') {
lineNum++;
nextChar();
colNum = 1;
} else if (c == '#') {
nextChar();
state = State.comment;
} else if (c == ';') {
token.type = TokenType.semicolon;
nextChar();
return token;
} else if (c == '$') {
nextChar();
state = State.variable;
} else if (isAlpha(c)) {
state = State.keyword;
} else if (isOperator(c)) {
token.value ~= c;
token.type = TokenType.operator;
nextChar();
return token;
} else if (isDigit(c)) {
state = State.number;
} else {
panic(format("Unexpected character `%c`",
cast(char)(c)));
}
break;
case State.comment:
while (c != '\n' && c != EOF) nextChar();
state = State.none;
break;
case State.variable:
while(!isDelimiter(c)) {
token.value ~= c;
nextChar();
}
state = State.none;
token.type = TokenType.variable;
return token;
case State.keyword:
while (!isDelimiter(c)) {
token.value ~= c;
nextChar();
}
state = State.none;
token.type = TokenType.keyword;
return token;
case State.number:
while (isDigit(c) || c == '.') {
token.value ~= c;
nextChar();
}
state = State.none;
token.type = TokenType.number;
return token;
default:
assert(0);
}
}
}
}
1 Answer 1
Looks very nice - clean code, readable, very good names.
You're sometimes using brackets, sometimes not (compare
input
andnextChar
, e.g.). I suggest choosing one of these and sticking to it.Range primitives could simplify and improve parts of your code, and is considered very idiomatic D.
a. Your code can be made Unicode compatible simply by using
std.range.primitives.front
andstd.range.primitives.popFront
in place ofcode[0]
andcode = code[1..$]
:void nextChar() { colNum++; code.popFront(); if (code == "") c = EOF; else c = code.front; }
b.
c
can be replaced by a wrapper aroundstd.range.primitives.front
andstd.range.primitives.empty
:@property dchar c() { if (code.empty) return EOF; return code.front; }
c. Consider making
Lexer
a template that can take different range types. If you have done 2a and 2b, that should be as simple as:struct Lexer(R) if (isInputRange!R && is(ElementType!R == dchar)) { R code; // Leave remaining code as is. }
I'd place the comment on the
Panic
type itself, not on its constructor, since the comment describes the type, and one generally cares more about the type than the constructor.isWhitespace, isOperator, and isDelimiter can be simplified with
std.algorithm.comparison.among
:bool isOperator(int c) { return !!c.among('=', '+', '-', '*', '/', '%'); }
(
among
returns the index of c in the list of items. The double exclamation mark turns the number into a bool)c
is anint
, but is always used as adchar
. Changing its type todchar
, I get an error message complaining thatEOF
can't be assigned to it. Consider redefiningEOF
asenum dchar EOF = dchar.init;
, maybe?One thing that would disappear if the above advice is followed, but might be worth pointing out still:
The second half of
nextChar
is identical toinput
.nextChar
's body could be simplified to:void nextChar() { colNum++; input(code[1..$]); }
Thought of one more thing that might be worth considering:
panic
exhibits a conflation of responsibilities. Consider putting more information in the exception, and handling the output of error information atlex
's call site:void main() { // ... try token = lexer.lex(); catch (Panic p) { writeln(p); break; } // ... } class Panic : Exception { string fileName; uint lineNum; uint colNum; this(string msg, string file, uint line, uint col) { super(msg); fileName = file; lineNum = line; colNum = col; } override string toString() { return format("Syntax ERROR in '%s' at line %d, column %d:\n%s", fileName, lineNum, colNum, msg); } } struct Lexer { //... void panic(string message) { throw new Panic(message, fileName, lineNum, colNum); } //... }
This way, the error information is available to other pieces of your code, and the lexer can be used in programs that don't use the console.
Related, consider a different name for the type Panic
. In the code you've shown, it's a SyntaxErrorException
, and the name Panic
doesn't convey any information about what's actually gone wrong. The same argument applies the function panic
.