As we have our Community Oriented Development Extravaganza, Requiring Extreme Vigor Inciting Extraordinary Winners 2017 Challenge, I finally had some time to write a tokenizer for my own language. It is the most complex tokenizer I have written so far. The language consists of 32 keywords and supports most of the usual operators. Beyond that we have numeric literals, string literals, address literals (virtual memory pointer addresses) and character literals. I read a lot before writing this tokenizer, as many similar questions have been asked here on Code Review. And because this is my first attempt at implementing a programming language in C# (I did C before), I would like to know what I can improve or change, and whether there are any exceptions I did not notice.
The tokenizer generates tokens which I will use to generate an AST during parsing. I did not use Regex or any lexer generators as that was not the point for me.
In the tokens `sfunc` and `svar`, the `s` stands for `static`.
I didn't include the preprocessor here, as this is already a fairly big chunk of code.
CharStream.cs
using System;
using System.Collections.Generic;
using System.Text;
namespace Compiler.Lexer
{
/// <summary>
/// A forward character stream over source text split into lines.
/// Newline characters are never returned and empty lines are skipped
/// transparently. Supports single- and multi-character lookahead.
/// </summary>
public class CharStream
{
    private readonly IList<string> Lines;

    /// <summary>True once every character has been consumed.</summary>
    public bool EOF { get; private set; }

    // Setters are public because Tokenizer.Peek() rewinds the stream by
    // restoring these two values directly.
    public int CurrentLineNumber { get; set; }
    public int CurrentPosition { get; set; }

    /// <summary>
    /// Creates a stream over <paramref name="source"/>. Empty or null input,
    /// or input consisting only of empty lines, yields an immediately
    /// exhausted stream instead of throwing.
    /// </summary>
    public CharStream(string source)
    {
        // Handle the degenerate case up front so the normal flow stays flat.
        if (string.IsNullOrEmpty(source))
        {
            EOF = true;
            return;
        }
        Lines = Preprocessor.Utilities.GenerateList(source);

        // Skip leading empty lines; bounds-checked so that input made up
        // entirely of empty lines no longer raises ArgumentOutOfRangeException.
        int line = 0;
        while (line < Lines.Count && string.IsNullOrEmpty(Lines[line]))
        {
            line++;
        }
        if (line == Lines.Count)
        {
            EOF = true;
            return;
        }
        EOF = false;
        CurrentLineNumber = line;
        CurrentPosition = 0;
    }

    /// <summary>Returns the next character without consuming it, or null at EOF.</summary>
    public char? Peek()
    {
        char? c = Get();
        // Only rewind when something was actually read; a null result means
        // the stream position was not advanced past a readable character.
        if (c != null)
        {
            Unget();
        }
        return c;
    }

    /// <summary>
    /// Returns the next <paramref name="count"/> characters without consuming
    /// them, or null if fewer than <paramref name="count"/> remain.
    /// </summary>
    public string Peek(int count)
    {
        string peek = Get(count);
        if (peek != null)
        {
            Unget(count);
        }
        return peek;
    }

    /// <summary>Consumes and returns the next character, or null at EOF.</summary>
    public char? Get()
    {
        if (EOF)
        {
            return null;
        }
        // Defensive skip in case an externally restored position landed on an
        // empty line. The bounds check comes first: the original order indexed
        // past the end of the list when the remaining lines were all empty.
        while (CurrentLineNumber < Lines.Count
            && string.IsNullOrEmpty(Lines[CurrentLineNumber]))
        {
            CurrentLineNumber++;
        }
        if (CurrentLineNumber >= Lines.Count)
        {
            EOF = true;
            return null;
        }
        char c = Lines[CurrentLineNumber][CurrentPosition];
        if (CurrentPosition + 1 < Lines[CurrentLineNumber].Length)
        {
            CurrentPosition++;
        }
        else
        {
            // End of line: advance to the next non-empty line. If none
            // remains, the stream is exhausted; the position is deliberately
            // left on the character just returned so Unget() only needs to
            // clear the EOF flag.
            int nextLine = CurrentLineNumber + 1;
            while (nextLine < Lines.Count && string.IsNullOrEmpty(Lines[nextLine]))
            {
                nextLine++;
            }
            if (nextLine < Lines.Count)
            {
                CurrentLineNumber = nextLine;
                CurrentPosition = 0;
            }
            else
            {
                EOF = true;
            }
        }
        return c;
    }

    /// <summary>
    /// Consumes and returns the next <paramref name="count"/> characters.
    /// If fewer remain, the stream position is restored and null is returned.
    /// </summary>
    public string Get(int count)
    {
        var stringBuilder = new StringBuilder();
        for (int i = 0; i < count; i++)
        {
            char? c = Get();
            if (c == null)
            {
                // Roll back exactly what was consumed so a failed
                // multi-character read leaves the stream where it started.
                Unget(i);
                return null;
            }
            stringBuilder.Append(c);
        }
        return stringBuilder.ToString();
    }

    // Steps the stream back by one character.
    private void Unget()
    {
        if (EOF)
        {
            // Get() does not advance past the final character, so clearing
            // the flag is sufficient to "un-read" it.
            EOF = false;
        }
        else if (CurrentPosition > 0)
        {
            CurrentPosition--;
        }
        else if (CurrentLineNumber > 0)
        {
            // Step back over any empty lines to the last real character.
            do
            {
                CurrentLineNumber--;
            } while (string.IsNullOrEmpty(Lines[CurrentLineNumber]));
            CurrentPosition = Lines[CurrentLineNumber].Length - 1;
        }
    }

    // Steps the stream back by count characters.
    private void Unget(int count)
    {
        for (int i = 0; i < count; i++)
        {
            Unget();
        }
    }
}
}
TokenKind.cs
using System;
namespace Compiler.Lexer
{
// Token numbers are part of the tokenizer's contract and must not change:
// Tokenizer.IsOperator treats kinds in the range 100-199 (identifiers and
// literals) as completed operands when deciding between unary/binary +/-
// and prefix/postfix ++/--.
public enum TokenKind : ushort
{
    UnknownToken = 0,

    // Keywords: 1-99.
    AndKeyword = 1,
    BreakKeyword = 2,
    CaseKeyword = 3,
    CatchKeyword = 4,
    ClassKeyword = 5,
    ConstKeyword = 6,
    ContinueKeyword = 7,
    DefaultKeyword = 8,
    DoKeyword = 9,
    ElseKeyword = 10,
    EnumKeyword = 11,
    FalseKeyword = 12,
    ForKeyword = 13,
    FuncKeyword = 14,
    IfKeyword = 15,
    InKeyword = 16,
    IsKeyword = 17,
    NewKeyword = 18,
    NorKeyword = 19,
    NullKeyword = 20,
    ObjectKeyword = 21,
    OrKeyword = 22,
    PackageKeyword = 23,
    ReturnKeyword = 24,
    SfuncKeyword = 25,
    SvarKeyword = 26,
    SwitchKeyword = 27,
    ThisKeyword = 28,
    TrueKeyword = 29,
    TryKeyword = 30,
    VarKeyword = 31,
    WhileKeyword = 32,

    // Identifiers and literals: 100-199 (operands, see note above).
    Identifier = 100,
    CharacterLiteral = 101,
    StringLiteral = 102,
    IntegerLiteral = 103,
    RealLiteral = 104,
    AddressLiteral = 105,

    // Arithmetic, comparison and logical operators: 200-299.
    Assignment = 200,
    Addition = 201,
    Subtraction = 202,
    UnaryPlus = 203,
    UnaryMinus = 204,
    Multiplication = 205,
    Division = 206,
    Modulo = 207,
    EqualTo = 208,
    NotEqualTo = 209,
    GeaterThan = 210,           // misspelled; kept so existing references still compile
    GreaterThan = GeaterThan,   // correctly spelled alias for new code
    LessThan = 211,
    GreaterThanOrEqualTo = 212,
    LessThanOrEqualTo = 213,
    LogicalNOT = 214,
    LogicalAND = 215,
    LogicalOR = 216,
    PostfixPlus = 217,
    PostfixMinus = 218,
    PrefixPlus = 219,
    PrefixMinus = 220,

    // Bitwise operators: 300-399.
    BitwiseNOT = 300,
    BitwiseAND = 301,
    BitwiseOR = 302,
    BitwiseXOR = 303,
    BitwiseLeftShift = 304,
    BitwiseRightShift = 305,

    // Compound assignment operators: 400-499.
    AdditionAssignment = 400,
    SubtractionAssignment = 401,
    MultiplicationAssignment = 402,
    DivisionAssignment = 403,
    ModuloAssignment = 404,
    BitwiseANDAssignment = 405,
    BitwiseORAssignment = 406,
    BitwiseXORAssignment = 407,
    BitwiseLeftShiftAssignment = 408,
    BitwiseRightShiftAssignment = 409,

    // Punctuation symbols: 500-599.
    OpenParenthesis = 500,
    CloseParenthesis = 501,
    OpenBrace = 502,
    CloseBrace = 503,
    OpenBracket = 504,
    CloseBracket = 505,
    Colon = 506,
    Semicolon = 507,
    Comma = 508,
    Dot = 509,
    Question = 510,
}
}
Token.cs
using System;
namespace Compiler.Lexer
{
/// <summary>
/// An immutable token produced by the tokenizer: its classification, the raw
/// source text (lexeme), and the line/position at which it starts.
/// </summary>
public sealed class Token
{
    /// <summary>The classification of this token.</summary>
    public TokenKind Kind { get; }

    /// <summary>The raw source text of the token.</summary>
    public string Lexeme { get; }

    /// <summary>Zero-based line on which the token starts.</summary>
    public int LineNumber { get; }

    /// <summary>Zero-based position within the line at which the token starts.</summary>
    public int Position { get; }

    public Token(TokenKind kind, string lexeme, int lineNumber, int position)
    {
        Kind = kind;
        Lexeme = lexeme;
        LineNumber = lineNumber;
        Position = position;
    }

    /// <summary>
    /// Formats the token for diagnostics, e.g. "Identifier 'Main' (LN 0, POS 5)".
    /// Added so test and debug code no longer has to format tokens by hand.
    /// </summary>
    public override string ToString()
    {
        return Kind + " '" + Lexeme + "' (LN " + LineNumber + ", POS " + Position + ")";
    }
}
}
Tokenizer.cs
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
namespace Compiler.Lexer
{
/// <summary>
/// Hand-written tokenizer: turns raw source text into a stream of
/// <see cref="Token"/> values for the parser. The keyword, operator and
/// symbol lookup tables live in the other parts of this partial class.
/// </summary>
public partial class Tokenizer
{
// The character source all Is* methods read from.
private CharStream Stream;
// The token most recently produced. IsOperator inspects its kind to decide
// between unary/binary +/- and prefix/postfix ++/--.
// NOTE(review): every Is* method must remember to update this field, which
// is easy to forget; centralizing the assignment in Get() would be safer.
private Token LastToken;
public Tokenizer(string source)
{
Stream = new CharStream(source);
LastToken = new Token
(TokenKind.UnknownToken, string.Empty, 0, 0);
}
/// <summary>
/// Consumes and returns the next token, or null at end of input.
/// Unrecognized characters are returned as UnknownToken.
/// </summary>
public Token Get()
{
Token get;
// NOTE(review): captured before whitespace is skipped, so the fallback
// UnknownToken at the bottom reports the position of the leading
// whitespace rather than of the offending character itself.
int lineNumber = Stream.CurrentLineNumber;
int position = Stream.CurrentPosition;
// Skip whitespace, bailing out if that exhausts the stream.
do
{
if (Stream.EOF)
{
return null;
}
} while (IsWhiteSpace());
// Try each token category in order; the first recognizer that matches
// wins and has already consumed the lexeme from the stream.
get = IsSymbol();
if (get != null)
{
return get;
}
get = IsOperator();
if (get != null)
{
return get;
}
get = IsIdentifier();
if (get != null)
{
return get;
}
get = IsCharacterLiteral();
if (get != null)
{
return get;
}
get = IsStringLiteral();
if (get != null)
{
return get;
}
get = IsNumericLiteral();
if (get != null)
{
return get;
}
get = IsAddressLiteral();
if (get != null)
{
return get;
}
// Nothing matched: consume a single character and emit UnknownToken.
get = new Token(TokenKind.UnknownToken, Stream.Get().ToString(),
lineNumber, position);
return get;
}
/// <summary>
/// Returns the next token without consuming it, by saving and restoring
/// the stream position and the LastToken state around a call to Get().
/// </summary>
public Token Peek()
{
int line = Stream.CurrentLineNumber;
int position = Stream.CurrentPosition;
Token previousToken = LastToken;
Token peek = Get();
Stream.CurrentLineNumber = line;
Stream.CurrentPosition = position;
LastToken = previousToken;
return peek;
}
// Consumes any run of whitespace characters; returns true if at least one
// character was skipped.
private bool IsWhiteSpace()
{
bool isWhiteSpace = false;
while (Stream.Peek() != null && char.IsWhiteSpace((char)Stream.Peek()))
{
isWhiteSpace = true;
Stream.Get();
}
return isWhiteSpace;
}
// Recognizes single-character punctuation such as parentheses, braces,
// ':' and ';' via the SymbolToTokenKind table.
private Token IsSymbol()
{
int lineNumber = Stream.CurrentLineNumber;
int position = Stream.CurrentPosition;
// Peek() returns char?; when it is null, ToString() yields "", which
// simply fails the dictionary lookup below.
string lexeme = Stream.Peek().ToString();
TokenKind tokenKind;
if (SymbolToTokenKind.TryGetValue(lexeme, out tokenKind))
{
Stream.Get();
Token token = new Token(tokenKind, lexeme, lineNumber, position);
LastToken = token;
return token;
}
return null;
}
// Recognizes operators. '+'/'-' and '++'/'--' are rewritten into the
// synthetic lexemes "A+B", "+A", "A++", "++A" etc. so the lookup table can
// distinguish binary, unary, prefix and postfix forms.
private Token IsOperator()
{
int lineNumber = Stream.CurrentLineNumber;
int position = Stream.CurrentPosition;
var lexeme = new StringBuilder();
char? firstChar = null;
if (Stream.Peek() == '+' || Stream.Peek() == '-')
{
firstChar = Stream.Peek();
}
// NOTE(review): && binds tighter than ||, so the null and line-number
// guards only cover the first alternative of this condition; the '+'
// and '-' clauses are evaluated without them.
while (Stream.Peek() != null
&& lineNumber == Stream.CurrentLineNumber
&& OperatorToTokenKind.ContainsKey(Stream.Peek().ToString())
|| (Stream.Peek() == '+' && firstChar == '+')
|| (Stream.Peek() == '-' && firstChar == '-'))
{
lexeme.Append(Stream.Get());
}
// Kinds 100-199 are identifiers/literals, i.e. a completed operand, so
// a preceding operand makes +/- binary and ++/-- postfix.
// NOTE(review): the single '&' below is a non-short-circuiting logical
// AND; '&&' was probably intended. Also, the previous token alone cannot
// classify these reliably (consider "(a) + b"); this distinction
// normally belongs in the parser.
if ((lexeme.ToString() == "+" || lexeme.ToString() == "-")
& (ushort)LastToken.Kind > 99 && (ushort)LastToken.Kind < 200)
{
//A+B or A-B
lexeme.Insert(0, 'A');
lexeme.Insert(2, 'B');
}
else if (lexeme.ToString() == "+" || lexeme.ToString() == "-")
{
//+A or -A
lexeme.Insert(1, 'A');
}
if ((lexeme.ToString() == "++" || lexeme.ToString() == "--")
& (ushort)LastToken.Kind > 99 && (ushort)LastToken.Kind < 200)
{
//A++ or A--
lexeme.Insert(0, 'A');
}
else if (lexeme.ToString() == "++" || lexeme.ToString() == "--")
{
//++A or --A
lexeme.Insert(2, 'A');
}
TokenKind tokenKind;
if (OperatorToTokenKind.TryGetValue(lexeme.ToString(), out tokenKind))
{
Token token = new Token
(tokenKind, lexeme.ToString(), lineNumber, position);
LastToken = token;
return token;
}
return null;
}
// Recognizes keywords and identifiers: a letter or '_' followed by any
// number of letters, digits or '_', all on a single line.
private Token IsIdentifier()
{
int lineNumber = Stream.CurrentLineNumber;
int position = Stream.CurrentPosition;
var lexeme = new StringBuilder();
TokenKind tokenKind;
Token token;
if (Stream.Peek() == null
|| !(char.IsLetter((char)Stream.Peek()) || Stream.Peek() == '_'))
{
return null;
}
lexeme.Append(Stream.Get());
// NOTE(review): count is incremented but never read.
int count = 0;
while (Stream.Peek() != null
&& lineNumber == Stream.CurrentLineNumber
&& (char.IsLetter((char)Stream.Peek())
|| char.IsDigit((char)Stream.Peek())
|| Stream.Peek() == '_'))
{
count++;
lexeme.Append(Stream.Get());
}
// A lexeme found in the keyword table becomes a keyword token; anything
// else is a plain identifier.
if (KeywordToTokenKind.TryGetValue(lexeme.ToString(), out tokenKind))
{
token = new Token
(tokenKind, lexeme.ToString(), lineNumber, position);
LastToken = token;
return token;
}
token = new Token
(TokenKind.Identifier, lexeme.ToString(), lineNumber, position);
LastToken = token;
return token;
}
// Recognizes a character literal delimited by single quotes on one line.
// An escaped quote (\') bumps the expected-quote count so the following '
// does not terminate the literal.
// NOTE(review): the literal's length is not restricted to one character,
// and an unterminated literal returns null WITHOUT rewinding the stream,
// silently losing the characters that were consumed.
private Token IsCharacterLiteral()
{
int lineNumber = Stream.CurrentLineNumber;
int position = Stream.CurrentPosition;
var lexeme = new StringBuilder();
Token token;
if (Stream.Peek() != '\'')
{
return null;
}
// Two quotes are expected: the opening one and the closing one.
int singleQuoteCount = 2;
while (Stream.Peek() != null
&& singleQuoteCount > 0
&& lineNumber == Stream.CurrentLineNumber)
{
char c = (char)Stream.Get();
if (c == '\\' && Stream.Peek() == '\'')
{
singleQuoteCount++;
}
if (c == '\'')
{
singleQuoteCount--;
}
lexeme.Append(c);
}
if (singleQuoteCount != 0)
{
return null;
}
token = new Token
(TokenKind.CharacterLiteral, lexeme.ToString(), lineNumber, position);
LastToken = token;
return token;
}
// Recognizes a string literal delimited by double quotes, using the same
// quote-counting scheme as IsCharacterLiteral.
// NOTE(review): unlike character literals there is no line-number check
// here, so a string literal may span multiple lines - confirm intended.
// The no-rewind-on-failure issue from IsCharacterLiteral applies as well.
private Token IsStringLiteral()
{
int lineNumber = Stream.CurrentLineNumber;
int position = Stream.CurrentPosition;
var lexeme = new StringBuilder();
Token token;
if (Stream.Peek() != '\"')
{
return null;
}
int doubleQuoteCount = 2;
while (Stream.Peek() != null && doubleQuoteCount > 0)
{
char c = (char)Stream.Get();
if (c == '\\' && Stream.Peek() == '\"')
{
doubleQuoteCount++;
}
if (c == '\"')
{
doubleQuoteCount--;
}
lexeme.Append(c);
}
if (doubleQuoteCount != 0)
{
return null;
}
token = new Token
(TokenKind.StringLiteral, lexeme.ToString(), lineNumber, position);
LastToken = token;
return token;
}
// Recognizes integer and real numeric literals. A '.' followed by a digit
// switches the literal to a real; a second '.' terminates it.
// NOTE(review): 'M' is accepted anywhere inside the number (so "4MM4"
// lexes as one literal) - presumably a decimal/real suffix, verify.
// Also, Stream.Peek(2) returns null when fewer than two characters remain
// (e.g. input ending in "4."), making the [1] indexer throw.
private Token IsNumericLiteral()
{
int lineNumber = Stream.CurrentLineNumber;
int position = Stream.CurrentPosition;
var lexeme = new StringBuilder();
Token token;
bool isReal = false;
if (Stream.Peek() == null || !char.IsDigit((char)Stream.Peek()))
{
return null;
}
while (Stream.Peek() != null
&& lineNumber == Stream.CurrentLineNumber
&& (char.IsDigit((char)Stream.Peek())
|| Stream.Peek() == '.'
|| Stream.Peek() == 'M'))
{
if (Stream.Peek() == '.' && (!char.IsDigit(Stream.Peek(2)[1]) || isReal))
{
break;
}
if (Stream.Peek() == '.')
{
isReal = true;
}
lexeme.Append(Stream.Get());
}
if (isReal)
{
token = new Token
(TokenKind.RealLiteral, lexeme.ToString(), lineNumber, position);
}
else
{
token = new Token
(TokenKind.IntegerLiteral, lexeme.ToString(), lineNumber, position);
}
LastToken = token;
return token;
}
// Recognizes an address literal: '@' followed by digits.
// NOTE(review): for input such as "@a" the loop breaks before consuming
// anything, returning an empty-lexeme token without advancing the stream,
// so the caller keeps producing the same token forever. The Peek(2) null
// indexing hazard from IsNumericLiteral applies here too.
private Token IsAddressLiteral()
{
int lineNumber = Stream.CurrentLineNumber;
int position = Stream.CurrentPosition;
var lexeme = new StringBuilder();
Token token;
bool isAddress = false;
if (Stream.Peek() == null || Stream.Peek() != '@')
{
return null;
}
while (Stream.Peek() != null
&& lineNumber == Stream.CurrentLineNumber
&& (char.IsDigit((char)Stream.Peek())
|| Stream.Peek() == '@'))
{
if (Stream.Peek() == '@' && (!char.IsDigit(Stream.Peek(2)[1]) || isAddress))
{
break;
}
if (Stream.Peek() == '@')
{
isAddress = true;
}
lexeme.Append(Stream.Get());
}
token = new Token
(TokenKind.AddressLiteral, lexeme.ToString(), lineNumber, position);
LastToken = token;
return token;
}
}
}
Keywords.cs
using System;
using System.Collections.Generic;
namespace Compiler.Lexer
{
public partial class Tokenizer
{
    // Maps each reserved word of the language to its token kind. The lookup
    // is case-insensitive, so "if", "If" and "IF" are all the same keyword.
    private static readonly Dictionary<string, TokenKind> KeywordToTokenKind =
        new Dictionary<string, TokenKind>(StringComparer.OrdinalIgnoreCase)
        {
            { "and", TokenKind.AndKeyword },
            { "break", TokenKind.BreakKeyword },
            { "case", TokenKind.CaseKeyword },
            { "catch", TokenKind.CatchKeyword },
            { "class", TokenKind.ClassKeyword },
            { "const", TokenKind.ConstKeyword },
            { "continue", TokenKind.ContinueKeyword },
            { "default", TokenKind.DefaultKeyword },
            { "do", TokenKind.DoKeyword },
            { "else", TokenKind.ElseKeyword },
            { "enum", TokenKind.EnumKeyword },
            { "false", TokenKind.FalseKeyword },
            { "for", TokenKind.ForKeyword },
            { "func", TokenKind.FuncKeyword },
            { "if", TokenKind.IfKeyword },
            { "in", TokenKind.InKeyword },
            { "is", TokenKind.IsKeyword },
            { "new", TokenKind.NewKeyword },
            { "nor", TokenKind.NorKeyword },
            { "null", TokenKind.NullKeyword },
            { "object", TokenKind.ObjectKeyword },
            { "or", TokenKind.OrKeyword },
            { "package", TokenKind.PackageKeyword },
            { "return", TokenKind.ReturnKeyword },
            { "sfunc", TokenKind.SfuncKeyword },
            { "svar", TokenKind.SvarKeyword },
            { "switch", TokenKind.SwitchKeyword },
            { "this", TokenKind.ThisKeyword },
            { "true", TokenKind.TrueKeyword },
            { "try", TokenKind.TryKeyword },
            { "var", TokenKind.VarKeyword },
            { "while", TokenKind.WhileKeyword },
        };
}
}
Operators.cs
using System;
using System.Collections.Generic;
namespace Compiler.Lexer
{
public partial class Tokenizer
{
    // Maps operator lexemes to token kinds. The "A+B", "+A", "A++" and
    // "++A" style entries are synthetic forms: IsOperator inserts the 'A'
    // and 'B' placeholders so unary/binary and prefix/postfix variants of
    // '+', '-', '++' and '--' map to distinct kinds.
    private static readonly Dictionary<string, TokenKind> OperatorToTokenKind =
        new Dictionary<string, TokenKind>(StringComparer.OrdinalIgnoreCase)
        {
            { "=", TokenKind.Assignment },
            { "A+B", TokenKind.Addition },
            { "A-B", TokenKind.Subtraction },
            { "+A", TokenKind.UnaryPlus },
            { "-A", TokenKind.UnaryMinus },
            { "*", TokenKind.Multiplication },
            { "/", TokenKind.Division },
            { "%", TokenKind.Modulo },
            { "==", TokenKind.EqualTo },
            { "!=", TokenKind.NotEqualTo },
            { ">", TokenKind.GeaterThan },
            { "<", TokenKind.LessThan },
            { ">=", TokenKind.GreaterThanOrEqualTo },
            { "<=", TokenKind.LessThanOrEqualTo },
            { "!", TokenKind.LogicalNOT },
            { "&&", TokenKind.LogicalAND },
            { "||", TokenKind.LogicalOR },
            { "A++", TokenKind.PostfixPlus },
            { "A--", TokenKind.PostfixMinus },
            { "++A", TokenKind.PrefixPlus },
            { "--A", TokenKind.PrefixMinus },
            { "~", TokenKind.BitwiseNOT },
            { "&", TokenKind.BitwiseAND },
            { "|", TokenKind.BitwiseOR },
            { "^", TokenKind.BitwiseXOR },
            { "<<", TokenKind.BitwiseLeftShift },
            { ">>", TokenKind.BitwiseRightShift },
            { "+=", TokenKind.AdditionAssignment },
            { "-=", TokenKind.SubtractionAssignment },
            { "*=", TokenKind.MultiplicationAssignment },
            { "/=", TokenKind.DivisionAssignment },
            { "%=", TokenKind.ModuloAssignment },
            { "&=", TokenKind.BitwiseANDAssignment },
            { "|=", TokenKind.BitwiseORAssignment },
            { "^=", TokenKind.BitwiseXORAssignment },
            { "<<=", TokenKind.BitwiseLeftShiftAssignment },
            { ">>=", TokenKind.BitwiseRightShiftAssignment },
        };
}
}
Symbols.cs
using System;
using System.Collections.Generic;
namespace Compiler.Lexer
{
public partial class Tokenizer
{
    // Maps single-character punctuation to token kinds; consulted first by
    // Tokenizer.Get via IsSymbol.
    private static readonly Dictionary<string, TokenKind> SymbolToTokenKind =
        new Dictionary<string, TokenKind>(StringComparer.OrdinalIgnoreCase)
        {
            { "(", TokenKind.OpenParenthesis },
            { ")", TokenKind.CloseParenthesis },
            { "{", TokenKind.OpenBrace },
            { "}", TokenKind.CloseBrace },
            { "[", TokenKind.OpenBracket },
            { "]", TokenKind.CloseBracket },
            { ":", TokenKind.Colon },
            { ";", TokenKind.Semicolon },
            { ",", TokenKind.Comma },
            { ".", TokenKind.Dot },
            { "?", TokenKind.Question },
        };
}
}
Utilities.cs
using System;
using System.Collections.Generic;
using System.Linq;
using System.IO;
namespace Compiler.Preprocessor
{
/// <summary>
/// Preprocessor helpers shared by the lexer.
/// </summary>
public static class Utilities
{
    /// <summary>
    /// Splits <paramref name="source"/> into its lines. Handles Windows
    /// ("\r\n"), Unix ("\n") and classic-Mac ("\r") line endings; "\r\n" is
    /// listed first so it is never split into two separate breaks.
    /// Empty lines are preserved.
    /// </summary>
    public static List<string> GenerateList(string source)
    {
        return source.Split(new string[] { "\r\n", "\r", "\n" },
            StringSplitOptions.None).ToList();
    }
}
}
Test script
Tokenizer tokenizer = new Tokenizer(source);
Token token;
while ((token = tokenizer.Peek()) != null)
{
Console.WriteLine(token.Kind + " LEX: " + token.Lexeme + " POS: " + token.Position + " LN: " + token.LineNumber);
tokenizer.Get();
}
Samples
Hello World
func Main():
print("Hello World")
Generated tokens:
- FuncKeyword LEX: func POS: 0 LN: 0
- Identifier LEX: Main POS: 5 LN: 0
- OpenParenthesis LEX: ( POS: 9 LN: 0
- CloseParenthesis LEX: ) POS: 10 LN: 0
- Colon LEX: : POS: 11 LN: 0
- Identifier LEX: print POS: 4 LN: 1
- OpenParenthesis LEX: ( POS: 9 LN: 1
- StringLiteral LEX: "Hello World" POS: 10 LN: 1
- CloseParenthesis LEX: ) POS: 23 LN: 1
Fibonacci
func Fib(n):
if n == 1 or n == 2:
return 1
return fib(n - 1) + fib(n-1)
func Main():
print(fib(5))
Generated tokens:
- FuncKeyword LEX: func POS: 0 LN: 0
- Identifier LEX: Fib POS: 5 LN: 0
- OpenParenthesis LEX: ( POS: 8 LN: 0
- Identifier LEX: n POS: 9 LN: 0
- CloseParenthesis LEX: ) POS: 10 LN: 0
- Colon LEX: : POS: 11 LN: 0
- IfKeyword LEX: if POS: 4 LN: 1
- Identifier LEX: n POS: 7 LN: 1
- EqualTo LEX: == POS: 9 LN: 1
- IntegerLiteral LEX: 1 POS: 12 LN: 1
- OrKeyword LEX: or POS: 14 LN: 1
- Identifier LEX: n POS: 17 LN: 1
- EqualTo LEX: == POS: 19 LN: 1
- IntegerLiteral LEX: 2 POS: 22 LN: 1
- Colon LEX: : POS: 23 LN: 1
- ReturnKeyword LEX: return POS: 8 LN: 2
- IntegerLiteral LEX: 1 POS: 15 LN: 2
- ReturnKeyword LEX: return POS: 4 LN: 3
- Identifier LEX: fib POS: 11 LN: 3
- OpenParenthesis LEX: ( POS: 14 LN: 3
- Identifier LEX: n POS: 15 LN: 3
- Subtraction LEX: A-B POS: 17 LN: 3
- IntegerLiteral LEX: 1 POS: 19 LN: 3
- CloseParenthesis LEX: ) POS: 20 LN: 3
- Addition LEX: A+B POS: 22 LN: 3
- Identifier LEX: fib POS: 24 LN: 3
- OpenParenthesis LEX: ( POS: 27 LN: 3
- Identifier LEX: n POS: 28 LN: 3
- Subtraction LEX: A-B POS: 29 LN: 3
- IntegerLiteral LEX: 1 POS: 30 LN: 3
- CloseParenthesis LEX: ) POS: 31 LN: 3
- FuncKeyword LEX: func POS: 0 LN: 5
- Identifier LEX: Main POS: 5 LN: 5
- OpenParenthesis LEX: ( POS: 9 LN: 5
- CloseParenthesis LEX: ) POS: 10 LN: 5
- Colon LEX: : POS: 11 LN: 5
- Identifier LEX: print POS: 4 LN: 6
- OpenParenthesis LEX: ( POS: 9 LN: 6
- Identifier LEX: fib POS: 10 LN: 6
- OpenParenthesis LEX: ( POS: 13 LN: 6
- IntegerLiteral LEX: 5 POS: 14 LN: 6
- CloseParenthesis LEX: ) POS: 15 LN: 6
- CloseParenthesis LEX: ) POS: 16 LN: 6
-
\$\begingroup\$ Looks good to me! \$\endgroup\$J_H– J_H2017年08月13日 06:02:44 +00:00Commented Aug 13, 2017 at 6:02
-
\$\begingroup\$ OUCH! Looks great. \$\endgroup\$paparazzo– paparazzo2017年08月13日 06:15:16 +00:00Commented Aug 13, 2017 at 6:15
-
1\$\begingroup\$ I'd be great if you could add a sample fo the language you're tokenizing... \$\endgroup\$t3chb0t– t3chb0t2017年08月13日 13:41:45 +00:00Commented Aug 13, 2017 at 13:41
-
1\$\begingroup\$ Added samples and the output tokens \$\endgroup\$Michał Paszkowski– Michał Paszkowski2017年08月13日 14:50:39 +00:00Commented Aug 13, 2017 at 14:50
-
\$\begingroup\$ Prefix and Postfix plus should be called increment, instead of plus. \$\endgroup\$user98809– user988092017年08月13日 16:42:50 +00:00Commented Aug 13, 2017 at 16:42
4 Answers 4
This looks well structured, with responsibilities nicely separated, but as they say: 'the devil is in the details'. Let's have a look:
CharStream
- This class contains some surprising behavior:
Peek(int count)
returnsnull
if there are less thancount
characters left, instead of returning only the remaining characters. This is something that I expect to be documented. The same goes forGet
.- Newline characters are completely ignored. This makes things more complicated for the tokenizer: instead of only having to check characters (which it has to do anyway), it now also has to compare line numbers (which is easy to forget).
- Both
Peek(int count)
andGet(int count)
do not correctly (or at all) reset the stream position if they returnnull
. Instead of anUnget
method, you could simply store the previous location in local variables, and reset them before returning. Reset them in afinally
block to make sure an exception won't mess things up. - Line number and position have public setters. You should probably make those private. If you want them to be public, they should guard against invalid positions.
- The empty-line skipping loop in the constructor fails on input that consists of only empty lines (
ArgumentOutOfRangeException
). Personally I wouldn't ignore newline characters and skip empty lines at this level - the tokenizer can easily deal with it when it skips whitespace. - The
Get()
logic fails on input that ends with empty lines. Empty lines should be skipped as part of the stream position advancing code - after you fetch a character, and before you determine whether the end of the stream is reached. Unget
contains awhile
loop with an empty body. While it works as it should, it does make the code harder to read.
TokenKind
- There's a comment here that says that token numbers should not be changed, but it does not explain why.
- Personally I would add a category name comment above each group of token types, to improve readability.
Tokenizer
- Using partial classes strikes me as rather odd.
partial
is mostly used when generated and hand-written code need to be combined. I would probably use regions for this kind of separation, if at all, but I think t3chb0t's suggestion is much nicer: annotated enum values combined with a little initialization code. - Turning
IsWhitespace
into aSkipWhitespace
method would reduce clutter in yourGet
method. - Due to
CharStream
ignoring newlines, your tokenizer has to compare line numbers, but it doesn't do so consistently: identifiers, numbers and some operators are split up by a newline, while strings and some other operators (such as++
) are not. - The use of
LastToken
is questionable:- Every token-parsing method has to set it, which is easy to get wrong. It'd be better if
Get
did so, perhaps in afinally
block. - It's only used by
IsOperator
, to distinguish between unary and binary +/- and prefix and postfix ++/--. But the tokenizer does not have sufficient information to (reliably) do so: take(a) + b
anda() + b
for example. Another example would be<
and>
in C#: are they less-than and greater-than operators, or the start and end of a generic parameter list? Don't try to do this in the tokenizer, leave it for the parser.
- Every token-parsing method has to set it, which is easy to get wrong. It'd be better if
- There are a few
Stream.Peek().ToString()
calls without a null-check, and there are certain inputs where that causes problems (such as4.
and@
). - A lot of the
Is*
methods contain an early-out check, but they all do a little bit of work even before that check. That work is wasted if the check fails, so it should be done after the check. IsCharacterLiteral
andIsStringLiteral
can be simplified by skipping the first'
or"
(which was peeked already) and by creating a helper method for parsing escape sequences (such as\'
,\"
,\n
and various others). This allows you to simplify these methods and to get consistent escape sequence behavior across characters and strings.IsCharacterLiteral
does not limit character literals to a single character.- Why is
4MMM4
a valid numeric literal? What does theM
stand for? IsAddressLiteral
gets stuck on inputs like@a
- the tokenizer continues to return empty address tokens.
Utilities
- The name
GenerateList
doesn't really describe what the method does. Given how simple the method is, I'm not sure you even need this. I'm also not sure why it returns a list instead of an array -CharStream
doesn't seem to need it to be a list.
General
- Some documentation, and a few explanation comments in the more complex parsing methods, would be helpful.
- Some names could be a little clearer:
NextChar
andNextToken
are a bit more obvious than simplyGet
, andSplitLines
is more descriptive thanGenerateList
. - Private fields are normally written in camelCase, not PascalCase. It's also common to prefix them with a _.
- Declaring variables at the top of a method is very C-like. I usually declare variables as close to where they're actually used as possible.
- I don't know if you've got automated tests for this, but if not, it's probably a good idea to create some. It may also be useful to do some 'fuzzing': feeding randomly generated strings into your tokenizer may help you find other problems.
Looks pretty tidy, a few things:
Consider reversing the check for an empty
source
inCharStream
:if (string.IsNullOrEmpty(source)) { EOF = true; return; } Lines = Preprocessor.Utilities.GenerateList(source);
This deals with that special condition immediately and doesn't disrupt the logic flow (as opposed to - do something if valid, then handle empty case and then continue with normal case)
I don't know which language this is for but you should name you namespaces slightly less generic than
Compiler
- there are many compilers out there and your namespace should be more distinguishable.I'm not convinced that the implementation of
Peek
is ideal with theGet
andUnget
logic. I think it would be better to use temporary variables to update current positions and then discard them forPeek
or set them as current positions forGet
. Same as you've done in thePeek
method inTokenizer
.In the
Get
method ofTokenizer
the local variable should be namedtoken
orcurrentToken
.get
is an action and doesn't describe what the variable is meant to contain.The null coalescing operator (
??
) is quite useful in situations like in theTokenizer
. With its helpGet
can be shortened to:get = IsSymbol() ?? IsOperator() ?? IsIdentifier() ?? IsCharacterLiteral() ?? IsStringLiteral() ?? IsNumericLiteral() ?? IsAddressLiteral();
private static readonly Dictionary<string, TokenKind> KeywordToTokenKind = new Dictionary<string, TokenKind>(StringComparer.OrdinalIgnoreCase) { ["and"] = TokenKind.AndKeyword, .. };
I would create this and other similar dictionaries automatically with reflection by decorating the enums with some attribute and drop the suffixes (if possible):
public enum TokenKind : ushort
{
[Keyword("and")]
And = 1,
..
[Operator("A+B")]
Addition = 201,
..
}
You could get all values with a certain attribute and put them in a dictionary:
private static readonly Dictionary<string, TokenKind> KeywordToTokenKind =
Enum
.GetNames(typeof(TokenKind))
.Select(name => (Name: name, Keyword: typeof(TokenKind).GetField(name).GetCustomAttribute<KeywordAttribute>()?.ToString()))
.Where(t => t.Keyword != null)
.ToDictionary(t => t.Keyword, t => Enum.Parse(typeof(TokenKind), t.Name));
where:
public class KeywordAttribute : Attribute
{
private readonly string _value;
public KeywordAttribute(string value) => _value = value;
public override string ToString() => _value;
}
-
1\$\begingroup\$ +1, this is a great (and fun) suggestion, though I abhor the use of lambda syntax for non-returning (and non-trivial) methods/constructors (
KeywordAttribute
constructor)! \$\endgroup\$VisualMelon– VisualMelon2017年08月13日 09:06:22 +00:00Commented Aug 13, 2017 at 9:06
That's indeed a well made solution. There is one small thing that I would like to add that has not been mentioned so far: the optional char?
returned by Peek()
and Get()
. Instead, what I would use is a null character or '0円'
.
That way the expression
while (Stream.Peek() != null && char.IsWhiteSpace((char) Stream.Peek()))
can be simplified to
while (char.IsWhiteSpace(Stream.Peek()))
Which is equivalent, potentially faster as you don't call Stream.Peek()
twice, it avoids the dreaded null
, it avoids casting to char
and keeps the static analysis happy. I've run all the char.IsXyz(char)
with '0円'
and the only one that returned true
was the char.IsControl()
so it won't mess with the rest of the implementation -that's the point after all. Finally, if you hate magic strings/numbers/chars/whatever and you hunt them mercilessly, then you can declare a public const
:
public const char NullTerminating = '0円';
Second one other minor thing that was not mentioned so far, is that in IsIdentifier()
you can replace the
char.IsLetter((char)Stream.Peek()) || char.IsDigit((char)Stream.Peek())
with
char.IsLetterOrDigit((char)Stream.Peek())
which as you can guess is equivalent.
Finally, one other tiny bonus part that I saw while running with the test input you've provided: Tokenizer.IsOperator()
checks
if ((lexeme.ToString() == "++" || lexeme.ToString() == "--")
& (ushort) LastToken.Kind > 99 && (ushort) LastToken.Kind < 200)
Focusing on the second line, I would move that check to Token
class which has zero functions:
public bool IsLiteral()
{
return (ushort) Kind > 99 && (ushort) Kind < 200;
}
Having read Pieter Witvoet's answer who made a more thorough review than I did, I only want to focus on the notion of moving such checks in the class where they belong. That promotes readability as LastToken.IsLiteral()
is more readable than (ushort) Kind > 99 && (ushort) Kind < 200
. Also if for some twist of luck you will have to change the numbers in the TokenKind
enum
then you will have to make the change in one place and not countless.
-
1\$\begingroup\$ Are you absolutely 100% certain that there cannot be a nul character in the input stream? You don't want the parser to stop because of an errant, accidentally entered character (I'd expect some sort of error message). \$\endgroup\$1201ProgramAlarm– 1201ProgramAlarm2021年04月15日 17:55:36 +00:00Commented Apr 15, 2021 at 17:55
-
\$\begingroup\$ @1201ProgramAlarm Excellent question! And it has an answer in SO: However, the NUL byte has no valid reason for existing in a plain text file, and it may be considered at least a de facto standard for text files that they do not contain the NUL byte \$\endgroup\$Stelios Adamantidis– Stelios Adamantidis2021年04月15日 18:01:07 +00:00Commented Apr 15, 2021 at 18:01
-
\$\begingroup\$ It may not have a valid reason to be there, but that doesn't mean one won't find its way in. I've seen various problems over the years (and several questions on SO) related to extraneous/hidden characters in text streams. \$\endgroup\$1201ProgramAlarm– 1201ProgramAlarm2021年04月15日 18:04:33 +00:00Commented Apr 15, 2021 at 18:04
-
\$\begingroup\$ @1201ProgramAlarm indeed I understand your point it's valid. If we take it though to its extreme, even char itself is not safe: it will break surrogate pairs for some input that contains them. :) Try replacing
Main
of the example with𠈓
and see what happens ;) \$\endgroup\$Stelios Adamantidis– Stelios Adamantidis2021年04月15日 18:27:32 +00:00Commented Apr 15, 2021 at 18:27
Explore related questions
See similar questions with these tags.