I had some spare time, so I decided to write a simple tokenizer in C# for my own programming language. The tokenizer is not finished, but it is working, and I would like to hear your opinion on things I could change, improve, or completely rework. It is still lacking a FloatLiteral token and it assumes the input is not empty, but I will add proper exception handling later. I have decided not to use Regex, as it kills all of the fun in creating this.
Language keywords
PRINT
VAR
LET
GOTO
IF
WHILE
Source Code
The source code is split into three files.
TokenType.cs
    using System;

    namespace Lang.Lexer
    {
        public enum TokenType : ushort
        {
            // keyword
            PrintKeyword = 1, // PRINT
            VarKeyword = 2, // VAR
            LetKeyword = 3, // LET
            GotoKeyword = 4, // GOTO
            IfKeyword = 5, // IF
            WhileKeyword = 6, // WHILE

            // literal
            IntegerLiteral = 10,

            // identifier
            Identifier = 20,

            // operator
            Assignment = 30, // =
            Plus = 31, // +
            Minus = 32, // -
            Multiplication = 33, // *
            Division = 34, // /
            Modulo = 35, // %
            EqualTo = 36, // ==
            NotEqualTo = 37, // !=
            GeaterThan = 38, // >
            LessThan = 39, // <
            GreaterThanOrEqualTo = 40, // >=
            LessThanOrEqualTo = 41, // <=

            // logical operators
            LogicalNOT = 42, // !
            LogicalAND = 43, // &&
            LogicalOR = 44, // ||

            // trivia
            Space = 60,
        }
    }
Token.cs
    namespace Lang.Lexer
    {
        public sealed class Token
        {
            public TokenType Type { get; private set; }
            public string Lexeme { get; private set; }
            public int CurrentLine { get; private set; }

            public Token(TokenType type, string lexeme, int currentLine)
            {
                Type = type;
                Lexeme = lexeme;
                CurrentLine = currentLine;
            }
        }
    }
Tokenizer.cs
    using System;
    using System.Collections.Generic;
    using System.Linq;
    using System.Text.RegularExpressions;

    namespace Lang.Lexer
    {
        public sealed class Tokenizer
        {
            readonly List<string> Lines;

            int Line = 0;
            int Position = 0;
            bool EOF = false;
            int lexemeLength = 0;

            public Tokenizer(string source)
            {
                Lines = new List<string>(Regex.Split(source, Environment.NewLine));
            }

            char GetChar()
            {
                if (EOF) return (char)0;
                char c = Lines[Line][Position];

                if (Position + 1 < Lines[Line].Length)
                {
                    Position++;
                }
                else
                {
                    if (Line + 1 < Lines.Count)
                    {
                        Line++;
                        Position = 0;
                    }
                    else
                    {
                        EOF = true;
                        Position++;
                    }
                }

                return c;
            }

            void UngetString(int count)
            {
                for (int i = 0; i < count; i++)
                {
                    UngetChar();
                }
            }

            void UngetChar()
            {
                if (Position != 0)
                {
                    if (!EOF)
                    {
                        Position--;
                    }
                    else
                    {
                        Position--;
                        EOF = false;
                    }
                }
                else
                {
                    Line--;
                    Position = Lines[Line].Length - 1;
                }
            }

            char PeekChar()
            {
                char c = GetChar();
                if (c != (char)0) UngetChar();
                return c;
            }

            public void Unget()
            {
                UngetString(lexemeLength);
            }

            public Token Peek()
            {
                Token token = Get();
                Unget();
                return token;
            }

            public Token Get()
            {
                if (EOF) return null;

                TokenType type;
                string lexeme = string.Empty;

                if ((type = IsSpace()) != 0)
                {
                    return new Token(type, lexeme, Line);
                }
                if ((type = IsOperator()) != 0)
                {
                    return new Token(type, lexeme, Line);
                }
                if ((type = IsKeyword()) != 0)
                {
                    return new Token(type, lexeme, Line);
                }

                Tuple<TokenType, String> identifier = IsIdentifier();
                if (identifier.Item1 != 0)
                {
                    return new Token(TokenType.Identifier, identifier.Item2, Line);
                }

                Tuple<TokenType, String> integerLiteral = IsIntegerLiteral();
                if (integerLiteral.Item1 != 0)
                {
                    return new Token(TokenType.IntegerLiteral, integerLiteral.Item2, Line);
                }

                // bad token
                return null;
            }

            Tuple<TokenType, String> IsIntegerLiteral()
            {
                if (!char.IsDigit(PeekChar()))
                    return new Tuple<TokenType, string>(0, string.Empty);

                string lexeme = GetChar().ToString();
                int count = 1;
                int line = Line;

                while (char.IsDigit(PeekChar()))
                {
                    lexeme = lexeme + GetChar();
                    count++;

                    if (line != Line)
                    {
                        UngetString(count);
                        return new Tuple<TokenType, string>(0, string.Empty);
                    }
                }

                lexemeLength = count;
                return new Tuple<TokenType, string>(TokenType.Identifier, lexeme);
            }

            TokenType IsKeyword()
            {
                if (!char.IsLetter(PeekChar())) return 0;

                string lexeme = GetChar().ToString();
                int count = 1;
                int line = Line;

                while (char.IsLetter(PeekChar()))
                {
                    lexeme = lexeme + GetChar();
                    count++;

                    if (line != Line) break;
                }

                switch (lexeme.ToUpper())
                {
                    case "PRINT":
                    {
                        lexemeLength = count;
                        return TokenType.PrintKeyword;
                    }
                    case "VAR":
                    {
                        lexemeLength = count;
                        return TokenType.VarKeyword;
                    }
                    case "LET":
                    {
                        lexemeLength = count;
                        return TokenType.LetKeyword;
                    }
                    case "GOTO":
                    {
                        lexemeLength = count;
                        return TokenType.GotoKeyword;
                    }
                    case "IF":
                    {
                        lexemeLength = count;
                        return TokenType.IfKeyword;
                    }
                    case "WHILE":
                    {
                        lexemeLength = count;
                        return TokenType.WhileKeyword;
                    }
                }

                UngetString(count);
                return 0;
            }

            Tuple<TokenType, String> IsIdentifier()
            {
                if (!(char.IsLetter(PeekChar()) || PeekChar() == '_'))
                    return new Tuple<TokenType, string>(0, string.Empty);

                string lexeme = GetChar().ToString();
                int count = 1;
                int line = Line;

                while (char.IsLetter(PeekChar()) || char.IsDigit(PeekChar()) || PeekChar() == '_')
                {
                    lexeme = lexeme + GetChar();
                    count++;

                    if (line != Line)
                    {
                        UngetString(count);
                        return new Tuple<TokenType, string>(0, string.Empty);
                    }
                }

                lexemeLength = count;
                return new Tuple<TokenType, string>(TokenType.Identifier, lexeme);
            }

            TokenType IsSpace()
            {
                if (char.IsWhiteSpace(PeekChar()))
                {
                    GetChar();
                    lexemeLength = 1;
                    return TokenType.Space;
                }

                return 0;
            }

            TokenType IsOperator()
            {
                char c = PeekChar();

                switch (c)
                {
                    case '=':
                    {
                        GetChar();
                        if (PeekChar() == '=')
                        {
                            GetChar();
                            lexemeLength = 2;
                            return TokenType.EqualTo;
                        }
                        lexemeLength = 1;
                        return TokenType.Assignment;
                    }
                    case '+':
                    {
                        GetChar();
                        lexemeLength = 1;
                        return TokenType.Plus;
                    }
                    case '-':
                    {
                        GetChar();
                        lexemeLength = 1;
                        return TokenType.Minus;
                    }
                    case '*':
                    {
                        GetChar();
                        lexemeLength = 1;
                        return TokenType.Multiplication;
                    }
                    case '/':
                    {
                        GetChar();
                        lexemeLength = 1;
                        return TokenType.Division;
                    }
                    case '%':
                    {
                        GetChar();
                        lexemeLength = 1;
                        return TokenType.Modulo;
                    }
                    case '!':
                    {
                        GetChar();
                        if (PeekChar() == '=')
                        {
                            GetChar();
                            lexemeLength = 2;
                            return TokenType.NotEqualTo;
                        }
                        lexemeLength = 1;
                        return TokenType.LogicalNOT;
                    }
                    case '>':
                    {
                        GetChar();
                        if (PeekChar() == '=')
                        {
                            GetChar();
                            lexemeLength = 2;
                            return TokenType.GreaterThanOrEqualTo;
                        }
                        lexemeLength = 1;
                        return TokenType.GeaterThan;
                    }
                    case '<':
                    {
                        GetChar();
                        if (PeekChar() == '=')
                        {
                            GetChar();
                            lexemeLength = 2;
                            return TokenType.LessThanOrEqualTo;
                        }
                        lexemeLength = 1;
                        return TokenType.LessThan;
                    }
                    case '&':
                    {
                        GetChar();
                        if (PeekChar() == '&')
                        {
                            GetChar();
                            lexemeLength = 2;
                            return TokenType.LogicalAND;
                        }
                        lexemeLength = 1;
                        return 0;
                    }
                    case '|':
                    {
                        GetChar();
                        if (PeekChar() == '|')
                        {
                            GetChar();
                            lexemeLength = 2;
                            return TokenType.LogicalOR;
                        }
                        lexemeLength = 1;
                        return 0;
                    }
                    default:
                    {
                        return 0;
                    }
                }
            }
        }
    }
3 Answers
Much has already been said, but I think I've still got something to add:
Whitespace
- Returning a separate token for each whitespace character is probably not very useful. If whitespace isn't significant in your language then you can simply ignore it (no tokens). Otherwise, why not combine consecutive whitespace characters into a single token? (A sketch follows after this list.)
- Your token type is `Space`, but it's used for all whitespace characters, not just space characters. `Whitespace` would be a better name.
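A minimal sketch of that combining idea, reusing the question's `PeekChar`/`GetChar` helpers and `lexemeLength` field:

    TokenType IsSpace()
    {
        // Consume a whole run of whitespace instead of a single character.
        int count = 0;
        while (char.IsWhiteSpace(PeekChar()))
        {
            GetChar();
            count++;
        }
        if (count == 0) return 0;
        lexemeLength = count;
        return TokenType.Space; // better named Whitespace, per the point above
    }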
Ungetting
- I'm not sure why calling code should be able to `Unget` a token. Surely calling code could buffer a few tokens, rather than letting the tokenizer repeat its work? (A sketch follows after this list.)
- It's buggy: `Unget` doesn't update `lexemeLength`, so ungetting twice or more can easily end up in the middle of a previous token.
- `UngetChar` doesn't check if the line is less than 0. Combined with the above problem, this means that it's possible to move to negative lines, resulting in an `ArgumentOutOfRangeException`.
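To illustrate the buffering alternative, a rough sketch in which the parser pushes tokens back onto its own stack instead of rewinding the tokenizer (`BufferedTokenizer` is a made-up name, not part of the posted code):

    public sealed class BufferedTokenizer
    {
        readonly Tokenizer tokenizer;
        readonly Stack<Token> buffer = new Stack<Token>();

        public BufferedTokenizer(Tokenizer tokenizer)
        {
            this.tokenizer = tokenizer;
        }

        public Token Get()
        {
            // Serve pushed-back tokens first, then fall through to the tokenizer.
            return buffer.Count > 0 ? buffer.Pop() : tokenizer.Get();
        }

        public void Unget(Token token)
        {
            buffer.Push(token); // no re-lexing or position rewinding required
        }
    }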
Bugs & unfinished business
- The tokenizer just stops after it encounters an unrecognized character. You may want to return an 'unknown' token instead, and continue with the rest. This allows calling code to report (more detailed) errors: 'expected X between A and B, but got Y'. (A sketch follows after this list.)
- `|a|b` and `&a&b` result in two identifier tokens - the `|` and `&` characters are ignored.
- Splitting the input on newlines makes tracking line numbers easier, but it also makes various other methods more complicated and error-prone. Personally I don't think it's worth the trade-off:
  - `1\r\n2` results in a single integer token: `12`. This is probably not what you want.
  - Likewise, `&\r\n&` results in a single logical-and token. Newline characters appear to be invisible to most tokenizing methods.
  - However, `foo\r\nbar` fails to tokenize: the first call to `Get` just returns `null`.
- `IsIntegerLiteral` returns `Identifier` as type - probably a copy/paste error. `Get` returns the correct type, so it currently doesn't cause problems, but it's a bug waiting to happen.
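For the 'unknown token' idea, the fallthrough at the end of `Get` could consume the offending character and keep going. A sketch, assuming a new `Unknown` member is added to `TokenType`:

    // At the end of Get(), instead of "return null" for a bad token:
    char bad = GetChar(); // consume the unrecognized character
    lexemeLength = 1;
    return new Token(TokenType.Unknown, bad.ToString(), Line);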
General remarks & suggestions
- You're already keeping track of the position within each line, so why not add that to `Token` as well?
- It's not immediately obvious whether `PeekChar` works correctly. The following should be easier to verify (it obviously always calls `Unget` after `Get`):

      if (EOF)
      {
          return 0;
      }
      else
      {
          var c = Get();
          Unget();
          return c;
      }

- Why not let the various type-specific methods return a `Token` directly? It would simplify the `Get` method: just return the result of the first method that doesn't return `null`.
- Keywords are just special identifiers. You should be able to combine them into a single method, with a keyword dictionary lookup afterwards. The above point (returning `Token`) would also make this easier to do.
- Modifying the `Peek` method to take an offset argument allows you to simplify `IsOperator`: instead of having to call `GetChar()` before another `Peek()` call, you could simply call `Peek(2)`.
- It could be helpful if the tokenizer would recognize negative numbers directly, instead of returning a `Minus` token and a positive integer token.
- Personally, I'd create a separate `CharacterStream` class to take care of character advancing and peeking. Maybe it's overkill, but it allows the tokenizer class to focus on a single responsibility. (A sketch follows after this list.)
- Finally, this is the kind of thing where I think investing in automated tests will quickly pay for itself.
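To make the `CharacterStream` idea a little more concrete, here is a rough sketch; the names and the offset-based `Peek` are illustrative assumptions, not a prescribed design:

    public sealed class CharacterStream
    {
        readonly string source;
        int position;

        public CharacterStream(string source)
        {
            this.source = source;
        }

        public int Line { get; private set; }

        public bool EndOfStream
        {
            get { return position >= source.Length; }
        }

        // Peek(1) looks at the current character, Peek(2) one further, etc.
        public char Peek(int offset = 1)
        {
            int index = position + offset - 1;
            return index < source.Length ? source[index] : '0円';
        }

        public char Read()
        {
            char c = source[position++]; // callers check EndOfStream first
            if (c == '\n') Line++;
            return c;
        }
    }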
These are just some quick thoughts off the top of my head.
Is there any particular functional reason for `TokenType` to be `ushort`? In general, you don't change the underlying type unless it's intended to be passed to a native API.

Make the comments on your `enum` values XML comments:

    PrintKeyword = 1, // PRINT

becomes

    /// <summary>
    /// PRINT
    /// </summary>
    PrintKeyword = 1,

for example. It assists IntelliSense and can be used to automatically generate documentation.

Excellent use of `sealed` on the `Token` class - kudos! A hint - since it's just a container for data rather than behavior, and if you're using C#6, you can remove the `private set;` bits of the properties. C#6 (in other words, Visual Studio 2015 and newer) supports read-only auto-properties. The `Lexeme` property would then look like:

    public string Lexeme { get; }

and the semantics of that class become even more evident: these properties will never change after constructor assignment.

`Tokenizer` class. This is a big one, and it seems to have a lot of moving parts. The first thing I'd suggest is to stick to C# idiomatic standards:

Do not depend on defaults for accessibility keywords. Add `private` to things that are private:

    private readonly List<string> Lines;

Use C# idiomatic casing rules: class members should be camelCased (or possibly _camelCased, or even _PascalCased). The example above for `Lines` (which is PascalCased) more closely matches how to name properties. So we might go with:

    private readonly List<string> _lines;

While I'm on that, I'd recommend declaring that type as `IList<string>`. It's good practice to develop to interfaces rather than concrete types.

For maintainability's sake, always wrap the body of an `if` statement in curly braces. Your future self, and anyone you work with, will appreciate it. Something like (note my class member is now renamed):

    if (this._eof) return (char)0;

becomes

    if (this._eof)
    {
        return (char)0;
    }

While we're on the subject, an easier way to write `(char)0` is `'0円'`. It's literally the null character constant. No casting, no fuss.

A couple of things on your `Tuple`s. One is that you should use the built-in C# language type `string` in your declarations rather than the CLR type `String`. Two, it's idiomatic to use `Tuple.Create(...)` rather than `new Tuple(...)`.

In your `switch` statements, you surround each `case` with curly braces. Not needed; the code between `case`s forms a block already. If you need the braces to scope variables, that's fine, but eliminate them otherwise.

I like `var` a lot and use it everywhere. But you might not be a fan. That's okay! However, if you can replace something generic like `Tuple<TokenType, string> integerLiteral = ...` with `var integerLiteral = ...`, I think it makes sense and removes boilerplate from the code.

In the `IsIntegerLiteral` method, I would recommend changing the `lexeme` variable from `string` to `StringBuilder`. The declaration becomes (yep, using `var`!):

    var lexeme = new StringBuilder(GetChar().ToString());

A few lines down, you then:

    lexeme.Append(GetChar());

and when you return the `Tuple` at the end, you finally (note `Tuple.Create`):

    return Tuple.Create(TokenType.Identifier, lexeme.ToString());

And you'd do similarly in the `IsKeyword` and `IsIdentifier` methods.
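Putting those pieces together (and returning `IntegerLiteral` instead of the `Identifier` copy/paste slip another answer pointed out), `IsIntegerLiteral` might end up roughly like this sketch, assuming a `using System.Text;` for `StringBuilder`:

    Tuple<TokenType, string> IsIntegerLiteral()
    {
        if (!char.IsDigit(PeekChar()))
        {
            return Tuple.Create(default(TokenType), string.Empty);
        }

        var lexeme = new StringBuilder(GetChar().ToString());
        var count = 1;
        var line = Line;

        while (char.IsDigit(PeekChar()))
        {
            lexeme.Append(GetChar());
            count++;

            if (line != Line)
            {
                UngetString(count);
                return Tuple.Create(default(TokenType), string.Empty);
            }
        }

        lexemeLength = count;
        return Tuple.Create(TokenType.IntegerLiteral, lexeme.ToString());
    }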
That should do for now.
I have some notes in addition to Jesse C. Slicer's answer.
In the `IsKeyword` method, the first line is

    if (!char.IsLetter(PeekChar())) return 0;

That looks like C style. If a method returns an enum, return the enum: define a member in `TokenType` like `NoToken = 0` and return it. Then compare the result in all the `if`s with `TokenType.NoToken` rather than with zero. Do this for all the methods where you currently return 0 as a `TokenType`.
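A sketch of what that change looks like at the enum and at one call site:

    public enum TokenType : ushort
    {
        NoToken = 0, // explicit "no match" marker instead of a bare 0
        PrintKeyword = 1,
        // ... the rest as before ...
    }

    // in Get():
    if ((type = IsSpace()) != TokenType.NoToken)
    {
        return new Token(type, lexeme, Line);
    }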
Instead of this

    switch (lexeme.ToUpper())
    {
        case "PRINT":
        {
            lexemeLength = count;
            return TokenType.PrintKeyword;
        }
        case "VAR":
        {
            lexemeLength = count;
            return TokenType.VarKeyword;
        }
        case "LET":
        {
            lexemeLength = count;
            return TokenType.LetKeyword;
        }
        case "GOTO":
        {
            lexemeLength = count;
            return TokenType.GotoKeyword;
        }
        case "IF":
        {
            lexemeLength = count;
            return TokenType.IfKeyword;
        }
        case "WHILE":
        {
            lexemeLength = count;
            return TokenType.WhileKeyword;
        }
    }
you can define a dictionary that maps lexemes to `TokenType`:
    private static readonly Dictionary<string, TokenType> _lexemeToTokenTypes =
        new Dictionary<string, TokenType>(StringComparer.OrdinalIgnoreCase)
        {
            ["PRINT"] = TokenType.PrintKeyword,
            ["VAR"] = TokenType.VarKeyword,
            ...
        };
And thus your `switch` will turn into
    TokenType tokenType;
    if (_lexemeToTokenTypes.TryGetValue(lexeme, out tokenType))
    {
        _lexemeLength = count;
        return tokenType;
    }
I also recommend defining all those lexemes as named string constants instead of literals.
In `IsOperator` you have a long `switch` too, and you can simplify it with dictionaries as well. (A sketch follows below.)
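Something along these lines could work; a rough sketch under the `NoToken`/`_lexemeLength` conventions above, with one table for two-character operators and one for single characters (the field names are illustrative):

    private static readonly Dictionary<string, TokenType> _twoCharOperators =
        new Dictionary<string, TokenType>
        {
            ["=="] = TokenType.EqualTo,
            ["!="] = TokenType.NotEqualTo,
            [">="] = TokenType.GreaterThanOrEqualTo,
            ["<="] = TokenType.LessThanOrEqualTo,
            ["&&"] = TokenType.LogicalAND,
            ["||"] = TokenType.LogicalOR,
        };

    private static readonly Dictionary<char, TokenType> _oneCharOperators =
        new Dictionary<char, TokenType>
        {
            ['='] = TokenType.Assignment,
            ['+'] = TokenType.Plus,
            ['-'] = TokenType.Minus,
            ['*'] = TokenType.Multiplication,
            ['/'] = TokenType.Division,
            ['%'] = TokenType.Modulo,
            ['!'] = TokenType.LogicalNOT,
            ['>'] = TokenType.GeaterThan, // sic - typo in the original enum
            ['<'] = TokenType.LessThan,
        };

    TokenType IsOperator()
    {
        char first = PeekChar();
        if (!_oneCharOperators.ContainsKey(first) && first != '&' && first != '|')
        {
            return TokenType.NoToken;
        }

        GetChar(); // consume the first character
        string pair = first.ToString() + PeekChar();

        TokenType type;
        if (_twoCharOperators.TryGetValue(pair, out type))
        {
            GetChar(); // consume the second character
            _lexemeLength = 2;
            return type;
        }

        if (_oneCharOperators.TryGetValue(first, out type))
        {
            _lexemeLength = 1;
            return type;
        }

        // a lone '&' or '|': same "bad token" outcome as the original code
        _lexemeLength = 1;
        return TokenType.NoToken;
    }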
In `Tokenizer`'s constructor, check the input for `null` and throw the appropriate exception:
    if (source == null)
        throw new ArgumentNullException(nameof(source));
Looking at these lines, some people will say that omitting curly braces is very, very, very bad. Although I don't mind placing them for every `if`, I decided to move away from this "rule" some years ago. I totally understand what problems curly braces are supposed to prevent, but I have to say that I have never had any issues with `if`s without them. So it is just a matter of taste. You can still use them, and if you feel you are not experienced enough, that is probably the better choice.
I recommend changing the return type of `GetChar()` from `char` to `char?` and returning `null` when there are no more chars. Then compare the result with `null` everywhere, rather than with `0` or `'0円'`. In my opinion, `null` expresses the end of input more explicitly than some char designated as a termination symbol: `'0円'` is a normal symbol, `null` is not.
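A sketch of that change, keeping the advancing logic of the original `GetChar`:

    char? GetChar()
    {
        if (EOF) return null; // was: return (char)0;

        char c = Lines[Line][Position];

        if (Position + 1 < Lines[Line].Length)
        {
            Position++;
        }
        else if (Line + 1 < Lines.Count)
        {
            Line++;
            Position = 0;
        }
        else
        {
            EOF = true;
            Position++;
        }

        return c;
    }

Call sites then test explicitly, e.g. `char? c = PeekChar(); if (c == null || !char.IsDigit(c.Value)) ...`.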