I had some spare time, so I decided to write a simple tokenizer in C# for my own programming language. The tokenizer is not finished, but it is working, and I would like to hear your opinion on things I could change, improve, or completely rework. It is still lacking a FloatLiteral token and it assumes the input is not empty, but I will add proper exception handling later. I have decided not to use Regex, as it kills all of the fun in creating this.
Language keywords
PRINT
VAR
LET
GOTO
IF
WHILE
Source Code
The source code is split into three files.
TokenType.cs
    using System;

    namespace Lang.Lexer
    {
        public enum TokenType : ushort
        {
            // keyword
            PrintKeyword = 1, // PRINT
            VarKeyword = 2, // VAR
            LetKeyword = 3, // LET
            GotoKeyword = 4, // GOTO
            IfKeyword = 5, // IF
            WhileKeyword = 6, // WHILE

            // literal
            IntegerLiteral = 10,

            // identifier
            Identifier = 20,

            // operator
            Assignment = 30, // =
            Plus = 31, // +
            Minus = 32, // -
            Multiplication = 33, // *
            Division = 34, // /
            Modulo = 35, // %
            EqualTo = 36, // ==
            NotEqualTo = 37, // !=
            GeaterThan = 38, // >
            LessThan = 39, // <
            GreaterThanOrEqualTo = 40, // >=
            LessThanOrEqualTo = 41, // <=

            // logical operators
            LogicalNOT = 42, // !
            LogicalAND = 43, // &&
            LogicalOR = 44, // ||

            // trivia
            Space = 60,
        }
    }
Token.cs
    namespace Lang.Lexer
    {
        public sealed class Token
        {
            public TokenType Type { get; private set; }
            public string Lexeme { get; private set; }
            public int CurrentLine { get; private set; }

            public Token(TokenType type, string lexeme, int currentLine)
            {
                Type = type;
                Lexeme = lexeme;
                CurrentLine = currentLine;
            }
        }
    }
Tokenizer.cs
    using System;
    using System.Collections.Generic;
    using System.Linq;
    using System.Text.RegularExpressions;

    namespace Lang.Lexer
    {
        public sealed class Tokenizer
        {
            readonly List<string> Lines;

            int Line = 0;
            int Position = 0;
            bool EOF = false;
            int lexemeLength = 0;

            public Tokenizer(string source)
            {
                Lines = new List<string>(Regex.Split(source, Environment.NewLine));
            }

            char GetChar()
            {
                if (EOF) return (char)0;
                char c = Lines[Line][Position];

                if (Position + 1 < Lines[Line].Length)
                {
                    Position++;
                }
                else
                {
                    if (Line + 1 < Lines.Count)
                    {
                        Line++;
                        Position = 0;
                    }
                    else
                    {
                        EOF = true;
                        Position++;
                    }
                }

                return c;
            }

            void UngetString(int count)
            {
                for (int i = 0; i < count; i++)
                {
                    UngetChar();
                }
            }

            void UngetChar()
            {
                if (Position != 0)
                {
                    if (!EOF)
                    {
                        Position--;
                    }
                    else
                    {
                        Position--;
                        EOF = false;
                    }
                }
                else
                {
                    Line--;
                    Position = Lines[Line].Length - 1;
                }
            }

            char PeekChar()
            {
                char c = GetChar();
                if (c != (char)0) UngetChar();
                return c;
            }

            public void Unget()
            {
                UngetString(lexemeLength);
            }

            public Token Peek()
            {
                Token token = Get();
                Unget();
                return token;
            }

            public Token Get()
            {
                if (EOF) return null;

                TokenType type;
                string lexeme = string.Empty;

                if ((type = IsSpace()) != 0)
                {
                    return new Token(type, lexeme, Line);
                }
                if ((type = IsOperator()) != 0)
                {
                    return new Token(type, lexeme, Line);
                }
                if ((type = IsKeyword()) != 0)
                {
                    return new Token(type, lexeme, Line);
                }

                Tuple<TokenType, String> identifier = IsIdentifier();
                if (identifier.Item1 != 0)
                {
                    return new Token(TokenType.Identifier, identifier.Item2, Line);
                }

                Tuple<TokenType, String> integerLiteral = IsIntegerLiteral();
                if (integerLiteral.Item1 != 0)
                {
                    return new Token(TokenType.IntegerLiteral, integerLiteral.Item2, Line);
                }

                // bad token
                return null;
            }

            Tuple<TokenType, String> IsIntegerLiteral()
            {
                if (!char.IsDigit(PeekChar()))
                    return new Tuple<TokenType, string>(0, string.Empty);

                string lexeme = GetChar().ToString();
                int count = 1;
                int line = Line;

                while (char.IsDigit(PeekChar()))
                {
                    lexeme = lexeme + GetChar();
                    count++;

                    if (line != Line)
                    {
                        UngetString(count);
                        return new Tuple<TokenType, string>(0, string.Empty);
                    }
                }

                lexemeLength = count;
                return new Tuple<TokenType, string>(TokenType.Identifier, lexeme);
            }

            TokenType IsKeyword()
            {
                if (!char.IsLetter(PeekChar())) return 0;

                string lexeme = GetChar().ToString();
                int count = 1;
                int line = Line;

                while (char.IsLetter(PeekChar()))
                {
                    lexeme = lexeme + GetChar();
                    count++;

                    if (line != Line) break;
                }

                switch (lexeme.ToUpper())
                {
                    case "PRINT":
                    {
                        lexemeLength = count;
                        return TokenType.PrintKeyword;
                    }
                    case "VAR":
                    {
                        lexemeLength = count;
                        return TokenType.VarKeyword;
                    }
                    case "LET":
                    {
                        lexemeLength = count;
                        return TokenType.LetKeyword;
                    }
                    case "GOTO":
                    {
                        lexemeLength = count;
                        return TokenType.GotoKeyword;
                    }
                    case "IF":
                    {
                        lexemeLength = count;
                        return TokenType.IfKeyword;
                    }
                    case "WHILE":
                    {
                        lexemeLength = count;
                        return TokenType.WhileKeyword;
                    }
                }

                UngetString(count);
                return 0;
            }

            Tuple<TokenType, String> IsIdentifier()
            {
                if (!(char.IsLetter(PeekChar()) || PeekChar() == '_'))
                    return new Tuple<TokenType, string>(0, string.Empty);

                string lexeme = GetChar().ToString();
                int count = 1;
                int line = Line;

                while (char.IsLetter(PeekChar()) || char.IsDigit(PeekChar()) || PeekChar() == '_')
                {
                    lexeme = lexeme + GetChar();
                    count++;

                    if (line != Line)
                    {
                        UngetString(count);
                        return new Tuple<TokenType, string>(0, string.Empty);
                    }
                }

                lexemeLength = count;
                return new Tuple<TokenType, string>(TokenType.Identifier, lexeme);
            }

            TokenType IsSpace()
            {
                if (char.IsWhiteSpace(PeekChar()))
                {
                    GetChar();
                    lexemeLength = 1;
                    return TokenType.Space;
                }

                return 0;
            }

            TokenType IsOperator()
            {
                char c = PeekChar();

                switch (c)
                {
                    case '=':
                    {
                        GetChar();
                        if (PeekChar() == '=')
                        {
                            GetChar();
                            lexemeLength = 2;
                            return TokenType.EqualTo;
                        }
                        lexemeLength = 1;
                        return TokenType.Assignment;
                    }
                    case '+':
                    {
                        GetChar();
                        lexemeLength = 1;
                        return TokenType.Plus;
                    }
                    case '-':
                    {
                        GetChar();
                        lexemeLength = 1;
                        return TokenType.Minus;
                    }
                    case '*':
                    {
                        GetChar();
                        lexemeLength = 1;
                        return TokenType.Multiplication;
                    }
                    case '/':
                    {
                        GetChar();
                        lexemeLength = 1;
                        return TokenType.Division;
                    }
                    case '%':
                    {
                        GetChar();
                        lexemeLength = 1;
                        return TokenType.Modulo;
                    }
                    case '!':
                    {
                        GetChar();
                        if (PeekChar() == '=')
                        {
                            GetChar();
                            lexemeLength = 2;
                            return TokenType.NotEqualTo;
                        }
                        lexemeLength = 1;
                        return TokenType.LogicalNOT;
                    }
                    case '>':
                    {
                        GetChar();
                        if (PeekChar() == '=')
                        {
                            GetChar();
                            lexemeLength = 2;
                            return TokenType.GreaterThanOrEqualTo;
                        }
                        lexemeLength = 1;
                        return TokenType.GeaterThan;
                    }
                    case '<':
                    {
                        GetChar();
                        if (PeekChar() == '=')
                        {
                            GetChar();
                            lexemeLength = 2;
                            return TokenType.LessThanOrEqualTo;
                        }
                        lexemeLength = 1;
                        return TokenType.LessThan;
                    }
                    case '&':
                    {
                        GetChar();
                        if (PeekChar() == '&')
                        {
                            GetChar();
                            lexemeLength = 2;
                            return TokenType.LogicalAND;
                        }
                        lexemeLength = 1;
                        return 0;
                    }
                    case '|':
                    {
                        GetChar();
                        if (PeekChar() == '|')
                        {
                            GetChar();
                            lexemeLength = 2;
                            return TokenType.LogicalOR;
                        }
                        lexemeLength = 1;
                        return 0;
                    }
                    default:
                    {
                        return 0;
                    }
                }
            }
        }
    }
3 Answers
Much has already been said, but I think I've still got something to add:
Whitespace
- Returning a separate token for each whitespace character is probably not very useful. If whitespace isn't significant in your language then you can simply ignore it (no tokens). Otherwise, why not combine consecutive whitespace characters into a single token? (A sketch follows after this list.)
- Your token type is `Space`, but it's used for all whitespace characters, not just space characters. `Whitespace` would be a better name.
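A minimal sketch of that combining idea, reusing the question's `PeekChar`/`GetChar` helpers and `lexemeLength` field:

    TokenType IsSpace()
    {
        // Consume a whole run of whitespace instead of a single character.
        int count = 0;
        while (char.IsWhiteSpace(PeekChar()))
        {
            GetChar();
            count++;
        }
        if (count == 0) return 0;
        lexemeLength = count;
        return TokenType.Space; // better named Whitespace, per the point above
    }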
Ungetting
- I'm not sure why calling code should be able to `Unget` a token. Surely calling code could buffer a few tokens, rather than letting the tokenizer repeat its work? (A sketch follows after this list.)
- It's buggy: `Unget` doesn't update `lexemeLength`, so ungetting twice or more can easily end up in the middle of a previous token.
- `UngetChar` doesn't check if the line is less than 0. Combined with the above problem, this means that it's possible to move to negative lines, resulting in an `ArgumentOutOfRangeException`.
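To illustrate the buffering alternative, a rough sketch in which the parser pushes tokens back onto its own stack instead of rewinding the tokenizer (`BufferedTokenizer` is a made-up name, not part of the posted code):

    public sealed class BufferedTokenizer
    {
        readonly Tokenizer tokenizer;
        readonly Stack<Token> buffer = new Stack<Token>();

        public BufferedTokenizer(Tokenizer tokenizer)
        {
            this.tokenizer = tokenizer;
        }

        public Token Get()
        {
            // Serve pushed-back tokens first, then fall through to the tokenizer.
            return buffer.Count > 0 ? buffer.Pop() : tokenizer.Get();
        }

        public void Unget(Token token)
        {
            buffer.Push(token); // no re-lexing or position rewinding required
        }
    }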
Bugs & unfinished business
- The tokenizer just stops after it encounters an unrecognized character. You may want to return an 'unknown' token instead, and continue with the rest. This allows calling code to report (more detailed) errors: 'expected X between A and B, but got Y'. (A sketch follows after this list.)
- `|a|b` and `&a&b` result in two identifier tokens - the `|` and `&` characters are ignored.
- Splitting the input on newlines makes tracking line numbers easier, but it also makes various other methods more complicated and error-prone. Personally I don't think it's worth the trade-off:
  - `1\r\n2` results in a single integer token: `12`. This is probably not what you want.
  - Likewise, `&\r\n&` results in a single logical-and token. Newline characters appear to be invisible to most tokenizing methods.
  - However, `foo\r\nbar` fails to tokenize: the first call to `Get` just returns `null`.
- `IsIntegerLiteral` returns `Identifier` as type - probably a copy/paste error. `Get` returns the correct type, so it currently doesn't cause problems, but it's a bug waiting to happen.
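For the 'unknown token' idea, the fallthrough at the end of `Get` could consume the offending character and keep going. A sketch, assuming a new `Unknown` member is added to `TokenType`:

    // At the end of Get(), instead of "return null" for a bad token:
    char bad = GetChar(); // consume the unrecognized character
    lexemeLength = 1;
    return new Token(TokenType.Unknown, bad.ToString(), Line);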
General remarks & suggestions
- You're already keeping track of the position within each line, so why not add that to `Token` as well?
- It's not immediately obvious whether `PeekChar` works correctly. The following should be easier to verify (it obviously always calls `Unget` after `Get`):

      if (EOF)
      {
          return 0;
      }
      else
      {
          var c = Get();
          Unget();
          return c;
      }

- Why not let the various type-specific methods return a `Token` directly? It would simplify the `Get` method: just return the result of the first method that doesn't return `null`.
- Keywords are just special identifiers. You should be able to combine them into a single method, with a keyword dictionary lookup afterwards. The above point (returning `Token`) would also make this easier to do.
- Modifying the `Peek` method to take an offset argument allows you to simplify `IsOperator`: instead of having to call `GetChar()` before another `Peek()` call, you could simply call `Peek(2)`.
- It could be helpful if the tokenizer would recognize negative numbers directly, instead of returning a `Minus` token and a positive integer token.
- Personally, I'd create a separate `CharacterStream` class to take care of character advancing and peeking. Maybe it's overkill, but it allows the tokenizer class to focus on a single responsibility. (A sketch follows after this list.)
- Finally, this is the kind of thing where I think investing in automated tests will quickly pay for itself.
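To make the `CharacterStream` idea a little more concrete, here is a rough sketch; the names and the offset-based `Peek` are illustrative assumptions, not a prescribed design:

    public sealed class CharacterStream
    {
        readonly string source;
        int position;

        public CharacterStream(string source)
        {
            this.source = source;
        }

        public int Line { get; private set; }

        public bool EndOfStream
        {
            get { return position >= source.Length; }
        }

        // Peek(1) looks at the current character, Peek(2) one further, etc.
        public char Peek(int offset = 1)
        {
            int index = position + offset - 1;
            return index < source.Length ? source[index] : '0円';
        }

        public char Read()
        {
            char c = source[position++]; // callers check EndOfStream first
            if (c == '\n') Line++;
            return c;
        }
    }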
These are just some quick thoughts off the top of my head.
Is there any particular functional reason for `TokenType` to be `ushort`? In general, you don't change the underlying type unless it's intended to be passed to a native API.

Make the comments on your `enum` values XML comments:

    PrintKeyword = 1, // PRINT

becomes

    /// <summary>
    /// PRINT
    /// </summary>
    PrintKeyword = 1,

for example. It assists IntelliSense and can be used to automatically generate documentation.

Excellent use of `sealed` on the `Token` class - kudos! A hint - since it's just a container for data rather than behavior, and if you're using C#6, you can remove the `private set;` bits of the properties. C#6 (in other words, Visual Studio 2015 and newer) supports read-only auto-properties. The `Lexeme` property would then look like:

    public string Lexeme { get; }

and the semantics of that class become even more evident: these properties will never change after constructor assignment.

`Tokenizer` class. This is a big one, and it seems to have a lot of moving parts. The first thing I'd suggest is to stick to C# idiomatic standards:

Do not depend on defaults for accessibility keywords. Add `private` to things that are private:

    private readonly List<string> Lines;

Use C# idiomatic casing rules: class members should be camelCased (or possibly _camelCased, or even _PascalCased). The example above for `Lines` (which is PascalCased) more closely matches how to name properties. So we might go with:

    private readonly List<string> _lines;

While I'm on that, I'd recommend declaring that type as `IList<string>`. It's good practice to develop to interfaces rather than concrete types.

For maintainability's sake, always wrap the body of an `if` statement in curly braces. Your future self, and anyone you work with, will appreciate it. Something like (note my class member is now renamed):

    if (this._eof) return (char)0;

becomes

    if (this._eof)
    {
        return (char)0;
    }

While we're on the subject, an easier way to write `(char)0` is `'0円'`. It's literally the null character constant. No casting, no fuss.

A couple of things on your `Tuple`s. One is that you should use the built-in C# language type `string` in your declarations rather than the CLR type `String`. Two, it's idiomatic to use `Tuple.Create(...)` rather than `new Tuple(...)`.

In your `switch` statements, you surround each `case` with curly braces. Not needed; the code between `case`s forms a block already. If you need the braces to scope variables, that's fine, but eliminate them otherwise.

I like `var` a lot and use it everywhere. But you might not be a fan. That's okay! However, if you can replace something generic like `Tuple<TokenType, string> integerLiteral = ...` with `var integerLiteral = ...`, I think it makes sense and removes boilerplate from the code.

In the `IsIntegerLiteral` method, I would recommend changing the `lexeme` variable from `string` to `StringBuilder`. The declaration becomes (yep, using `var`!):

    var lexeme = new StringBuilder(GetChar().ToString());

A few lines down, you then:

    lexeme.Append(GetChar());

and when you return the `Tuple` at the end, you finally (note `Tuple.Create`):

    return Tuple.Create(TokenType.Identifier, lexeme.ToString());

And you'd do similarly in the `IsKeyword` and `IsIdentifier` methods.
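Putting those pieces together (and returning `IntegerLiteral` instead of the `Identifier` copy/paste slip another answer pointed out), `IsIntegerLiteral` might end up roughly like this sketch, assuming a `using System.Text;` for `StringBuilder`:

    Tuple<TokenType, string> IsIntegerLiteral()
    {
        if (!char.IsDigit(PeekChar()))
        {
            return Tuple.Create(default(TokenType), string.Empty);
        }

        var lexeme = new StringBuilder(GetChar().ToString());
        var count = 1;
        var line = Line;

        while (char.IsDigit(PeekChar()))
        {
            lexeme.Append(GetChar());
            count++;

            if (line != Line)
            {
                UngetString(count);
                return Tuple.Create(default(TokenType), string.Empty);
            }
        }

        lexemeLength = count;
        return Tuple.Create(TokenType.IntegerLiteral, lexeme.ToString());
    }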
That should do for now.
I have some notes in addition to Jesse C. Slicer's answer.
In the `IsKeyword` method, the first line is

    if (!char.IsLetter(PeekChar())) return 0;

That looks like C style. If a method returns an enum, return the enum: define a member in `TokenType` like `NoToken = 0` and return it. Then compare the result in all the `if`s with `TokenType.NoToken` rather than with zero. Do this for all the methods where you currently return 0 as a `TokenType`.
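A sketch of what that change looks like at the enum and at one call site:

    public enum TokenType : ushort
    {
        NoToken = 0, // explicit "no match" marker instead of a bare 0
        PrintKeyword = 1,
        // ... the rest as before ...
    }

    // in Get():
    if ((type = IsSpace()) != TokenType.NoToken)
    {
        return new Token(type, lexeme, Line);
    }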
Instead of this

    switch (lexeme.ToUpper())
    {
        case "PRINT":
        {
            lexemeLength = count;
            return TokenType.PrintKeyword;
        }
        case "VAR":
        {
            lexemeLength = count;
            return TokenType.VarKeyword;
        }
        case "LET":
        {
            lexemeLength = count;
            return TokenType.LetKeyword;
        }
        case "GOTO":
        {
            lexemeLength = count;
            return TokenType.GotoKeyword;
        }
        case "IF":
        {
            lexemeLength = count;
            return TokenType.IfKeyword;
        }
        case "WHILE":
        {
            lexemeLength = count;
            return TokenType.WhileKeyword;
        }
    }
you can define a dictionary that maps lexemes to `TokenType`:
    private static readonly Dictionary<string, TokenType> _lexemeToTokenTypes =
        new Dictionary<string, TokenType>(StringComparer.OrdinalIgnoreCase)
        {
            ["PRINT"] = TokenType.PrintKeyword,
            ["VAR"] = TokenType.VarKeyword,
            ...
        };
And thus your `switch` will turn into
    TokenType tokenType;
    if (_lexemeToTokenTypes.TryGetValue(lexeme, out tokenType))
    {
        _lexemeLength = count;
        return tokenType;
    }
I also recommend defining all those lexemes as named string constants instead of literals.
In `IsOperator` you have a long `switch` too, and you can simplify it with dictionaries as well. (A sketch follows below.)
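Something along these lines could work; a rough sketch under the `NoToken`/`_lexemeLength` conventions above, with one table for two-character operators and one for single characters (the field names are illustrative):

    private static readonly Dictionary<string, TokenType> _twoCharOperators =
        new Dictionary<string, TokenType>
        {
            ["=="] = TokenType.EqualTo,
            ["!="] = TokenType.NotEqualTo,
            [">="] = TokenType.GreaterThanOrEqualTo,
            ["<="] = TokenType.LessThanOrEqualTo,
            ["&&"] = TokenType.LogicalAND,
            ["||"] = TokenType.LogicalOR,
        };

    private static readonly Dictionary<char, TokenType> _oneCharOperators =
        new Dictionary<char, TokenType>
        {
            ['='] = TokenType.Assignment,
            ['+'] = TokenType.Plus,
            ['-'] = TokenType.Minus,
            ['*'] = TokenType.Multiplication,
            ['/'] = TokenType.Division,
            ['%'] = TokenType.Modulo,
            ['!'] = TokenType.LogicalNOT,
            ['>'] = TokenType.GeaterThan, // sic - typo in the original enum
            ['<'] = TokenType.LessThan,
        };

    TokenType IsOperator()
    {
        char first = PeekChar();
        if (!_oneCharOperators.ContainsKey(first) && first != '&' && first != '|')
        {
            return TokenType.NoToken;
        }

        GetChar(); // consume the first character
        string pair = first.ToString() + PeekChar();

        TokenType type;
        if (_twoCharOperators.TryGetValue(pair, out type))
        {
            GetChar(); // consume the second character
            _lexemeLength = 2;
            return type;
        }

        if (_oneCharOperators.TryGetValue(first, out type))
        {
            _lexemeLength = 1;
            return type;
        }

        // a lone '&' or '|': same "bad token" outcome as the original code
        _lexemeLength = 1;
        return TokenType.NoToken;
    }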
In `Tokenizer`'s constructor, check the input for `null` and throw the appropriate exception:
    if (source == null)
        throw new ArgumentNullException(nameof(source));
Looking at these lines, some people will say that omitting curly braces is very, very, very bad. Although I don't mind placing them for every `if`, I decided to move away from this "rule" some years ago. I totally understand what problems curly braces are supposed to prevent, but I have to say that I have never had any issues with `if`s without them. So it is just a matter of taste. You can still use them, and if you feel you are not experienced enough, that is probably the better choice.
I recommend changing the return type of `GetChar()` from `char` to `char?` and returning `null` when there are no more chars. Then compare the result with `null` everywhere, rather than with `0` or `'0円'`. In my opinion, `null` expresses the end of input more explicitly than some char designated as a termination symbol: `'0円'` is a normal symbol, `null` is not.
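A sketch of that change, keeping the advancing logic of the original `GetChar`:

    char? GetChar()
    {
        if (EOF) return null; // was: return (char)0;

        char c = Lines[Line][Position];

        if (Position + 1 < Lines[Line].Length)
        {
            Position++;
        }
        else if (Line + 1 < Lines.Count)
        {
            Line++;
            Position = 0;
        }
        else
        {
            EOF = true;
            Position++;
        }

        return c;
    }

Call sites then test explicitly, e.g. `char? c = PeekChar(); if (c == null || !char.IsDigit(c.Value)) ...`.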