I've been writing a compiler for a couple of months now; this is the tokenization part of the lexer.
I would like a code review to improve my coding style and learn new techniques to tidy up my code and make it easier to maintain.
Also, since I never formally studied compiler design, I don't know whether the structure makes sense at all, so feel free to give any kind of criticism.
This class is a compiler instance; today I will only discuss the tokenization part of it.
using System;
using System.Collections.Generic;
using System.Diagnostics;
using System.IO;
namespace ShirLanguageCompiler
{
    public class ShirEnvironment
    {
        public ShirEnvironment(string inputpath)
        {
            this.code = File.ReadAllText(inputpath);
            Console.WriteLine("Starting tokenizer");
            this.Tokenizer = new Tokenizer(this);
            Console.WriteLine("Starting parser");
            this.Parser = new Parser(this);
            Console.WriteLine("Starting Compiling Environment");
            this.Factory = new ILFactory(this);
            Console.WriteLine("Starting Virtual Machine");
            this.VirtualMachine = new VM(this);
        }
        public void Compile()
        {
            Tokenizer.Tokenize();
            Console.WriteLine(String.Join(Environment.NewLine, Tokens));
            Parser.Parse();
            Debug.Assert(Tokenizer.AsCode() == code);
            Factory.Generate();
            VirtualMachine.Execute();
        }
        public Tokenizer Tokenizer;
        public Parser Parser;
        public ILFactory Factory;
        public VM VirtualMachine;
        public ILEnv Intermidiate = new ILEnv();
        public List<Token> Tokens = new List<Token>();
        public List<SyntaxNode> Nodes = new List<SyntaxNode>();
        public ProgramNode Program = new ProgramNode();
        public string code;
    }
}
The Tokenizer:
using System.Collections.Generic;
using System.Linq;
using System.Text.RegularExpressions;
namespace ShirLanguageCompiler
{
    public class Tokenizer
    {
        public Tokenizer(ShirEnvironment _env)
        {
            this.env = _env;
        }
        ShirEnvironment env;
        int cursor;
        int line = 0, col = 0;
        private readonly Dictionary<string, Regex> Patterns = new Dictionary<string, Regex>()
        {
            {"CharPattern", new Regex("[\\$a-zA-Z]")},
            {"StringPattern", new Regex("[^\"]") },
            {"NumCharPattern", new Regex("[[0-9a-zA-Z]")},
            {"NumberPattern", new Regex("[0-9\\.]") }
        };
        private static readonly Dictionary<string, SyntaxKind> Keywords = new Dictionary<string, SyntaxKind>()
        {
            {"True", SyntaxKind.LiteralTrueKeyword },
            {"False", SyntaxKind.LiteralFalseKeyword },
            {"ref", SyntaxKind.RefKeyword },
            {"val", SyntaxKind.ValKeyword },
            {"return", SyntaxKind.ReturnKeyword },
            {"bind", SyntaxKind.BindKeyword },
            {"boolean", SyntaxKind.BooleanKeyword },
            {"number", SyntaxKind.NumberKeyword },
            {"letter", SyntaxKind.LetterKeyword },
            {"string", SyntaxKind.StringKeyword }
        };
        private static readonly Dictionary<SyntaxKind, Regex> Definitions = new Dictionary<SyntaxKind, Regex>()
        {
            { SyntaxKind.ColonToken, new Regex(":") },
            { SyntaxKind.SemiColonToken, new Regex(";") },
            { SyntaxKind.AssignmentToken, new Regex("=>") },
            { SyntaxKind.AccessorToken, new Regex("->") },
            { SyntaxKind.LiteralCharToken, new Regex("\'") },
            { SyntaxKind.LiteralStringToken, new Regex("\"") },
            { SyntaxKind.LiteralNumberToken, new Regex("[0-9]") },
            { SyntaxKind.OpenCurlyBracketToken, new Regex("\\{") },
            { SyntaxKind.CloseCurlyBracketToken, new Regex("\\}") },
            { SyntaxKind.OpenParenthesisToken, new Regex("\\(") },
            { SyntaxKind.CloseParenthesisToken, new Regex("\\)") },
            { SyntaxKind.OpenSquareBracketToken, new Regex("\\[") },
            { SyntaxKind.CloseSquareBracketToken, new Regex("\\]") },
            { SyntaxKind.CommaToken, new Regex(",") },
            { SyntaxKind.EOLToken, new Regex("[\\r\\n]") },
            { SyntaxKind.WhitespaceToken, new Regex("\\s") },
            { SyntaxKind.QuestionMarkToken, new Regex("\\?") },
            { SyntaxKind.PlusOperationToken, new Regex("\\+") },
            { SyntaxKind.MinusOperationToken, new Regex("\\-") },
            { SyntaxKind.MultiplyOperationToken, new Regex("\\*") },
            { SyntaxKind.PowerOperationToken, new Regex("\\*\\*") },
            { SyntaxKind.RootOperationToken, new Regex("\\/\\/") },
            { SyntaxKind.DivideOperationToken, new Regex("\\/") },
            { SyntaxKind.EqualToken, new Regex("==") },
            { SyntaxKind.InEqualToken, new Regex("!=") }
        };
        static readonly SyntaxKind[] LiteralTokens =
        {
            SyntaxKind.EOLToken,
            SyntaxKind.WhitespaceToken,
            SyntaxKind.QuestionMarkToken,
            SyntaxKind.ColonToken,
            SyntaxKind.SemiColonToken,
            SyntaxKind.CommaToken,
            SyntaxKind.OpenParenthesisToken,
            SyntaxKind.CloseParenthesisToken,
            SyntaxKind.OpenSquareBracketToken,
            SyntaxKind.CloseSquareBracketToken,
            SyntaxKind.OpenCurlyBracketToken,
            SyntaxKind.CloseCurlyBracketToken,
            SyntaxKind.PlusOperationToken,
            SyntaxKind.MinusOperationToken,
            SyntaxKind.MultiplyOperationToken,
            SyntaxKind.DivideOperationToken
        };
        /*
         * Implementing a generic tokenizer here.
         * This might seem non-standard, but I'm not following any standards here; that would be boring, wouldn't it?
         */
        private bool MatchesPattern(Regex expression, int size = 1) => env.code.Length >= cursor + size && expression.IsMatch(env.code.Substring(cursor, size));
        private bool MatchesDefition(SyntaxKind kind, int size = 1) => MatchesPattern(Definitions[kind], size);
        public void Tokenize()
        {
            for (cursor = 0; cursor < env.code.Length;)
            {
                int savecursor = cursor;
                if (MatchesDefition(SyntaxKind.PowerOperationToken, 2))
                {
                    MakeToken(SyntaxKind.PowerOperationToken, cursor, 2);
                    cursor += 2;
                    continue;
                }
                if (MatchesDefition(SyntaxKind.RootOperationToken, 2))
                {
                    MakeToken(SyntaxKind.RootOperationToken, cursor, 2);
                    cursor += 2;
                    continue;
                }
                if (MatchesDefition(SyntaxKind.EqualToken, 2))
                {
                    MakeToken(SyntaxKind.EqualToken, cursor, 2);
                    cursor += 2;
                    continue;
                }
                if (MatchesDefition(SyntaxKind.InEqualToken, 2))
                {
                    MakeToken(SyntaxKind.InEqualToken, cursor, 2);
                    cursor += 2;
                    continue;
                }
                if (MatchesDefition(SyntaxKind.AccessorToken, 2))
                {
                    MakeToken(SyntaxKind.AccessorToken, cursor, 2);
                    cursor += 2;
                    continue;
                }
                if (LiteralTokens.Any(n => MatchesDefition(n)))
                {
                    MakeToken(LiteralTokens.First(n => MatchesDefition(n)));
                    cursor++;
                    continue;
                }
                if (MatchesDefition(SyntaxKind.AssignmentToken, 2))
                {
                    MakeToken(SyntaxKind.AssignmentToken, cursor, 2);
                    cursor += 2;
                    continue;
                }
                if (MatchesDefition(SyntaxKind.LiteralCharToken))
                {
                    int oldcursor = cursor;
                    do
                    {
                        cursor++;
                    }
                    while (MatchesPattern(Patterns["NumberPattern"]));
                    if (MatchesDefition(SyntaxKind.LiteralCharToken))
                        MakeToken(SyntaxKind.LiteralCharToken, oldcursor, cursor - oldcursor + 1);
                    else
                        throw new ShirException.TokenizerException.CountNotTokenizeCharException($"char: {env.code.Substring(oldcursor, cursor - oldcursor + 1)} could not be tokenized");
                    cursor++;
                    continue;
                }
                if (MatchesDefition(SyntaxKind.LiteralStringToken))
                {
                    int oldcursor = cursor;
                    do
                    {
                        cursor++;
                    }
                    while (MatchesPattern(Patterns["StringPattern"]));
                    if (MatchesDefition(SyntaxKind.LiteralStringToken))
                        MakeToken(SyntaxKind.LiteralStringToken, oldcursor, cursor - oldcursor + 1);
                    else
                        throw new ShirException.TokenizerException.CountNotTokenizeCharException($"char: {env.code.Substring(oldcursor, cursor - oldcursor + 1)} could not be tokenized");
                    cursor++;
                    continue;
                }
                if (MatchesDefition(SyntaxKind.LiteralNumberToken))
                {
                    int oldcursor = cursor;
                    do
                    {
                        cursor++;
                    }
                    while (MatchesPattern(Patterns["NumberPattern"]));
                    MakeToken(SyntaxKind.LiteralNumberToken, oldcursor, cursor - oldcursor);
                    continue;
                }
                if (MatchesPattern(Patterns["CharPattern"]))
                {
                    int oldcursor = cursor;
                    do
                    {
                        cursor++;
                    }
                    while (MatchesPattern(Patterns["NumCharPattern"]));
                    int len = cursor - oldcursor;
                    string TokenString = env.code.Substring(oldcursor, len);
                    if (Keywords.ContainsKey(TokenString))
                        MakeToken(Keywords[TokenString], oldcursor, len);
                    else
                    {
                        char nextchar = env.code[cursor];
                        if (nextchar == '(')
                            MakeToken(SyntaxKind.FunctionNameToken, oldcursor, len);
                        else
                            MakeToken(SyntaxKind.VariableNameToken, oldcursor, len);
                    }
                    continue;
                }
                if (savecursor == cursor)
                    throw new ShirException.TokenizerException.CountNotTokenizeCharException($"char: {env.code[cursor]} could not be tokenized");
                cursor++;
            }
        }
        public string AsCode()
        {
            return string.Join("", env.Tokens.Select(n => n.GetValue()));
        }
        private void MakeToken(SyntaxKind type, int oldcursor, int length)
        {
            col += length;
            if (type == SyntaxKind.EOLToken)
            {
                line++;
                col = 0;
            }
            env.Tokens.Add(new Token(oldcursor, length, type, env, line, col));
        }
        private void MakeToken(SyntaxKind type, int length = 1)
        {
            col += length;
            if (type == SyntaxKind.EOLToken)
            {
                line++;
                col = 0;
            }
            env.Tokens.Add(new Token(cursor, length, type, env, line, col));
        }
    }
}
A Token:
using System.Text.RegularExpressions;
namespace ShirLanguageCompiler
{
    public enum SyntaxKind
    {
        // math operators
        PlusOperationToken,
        MinusOperationToken,
        DivideOperationToken,
        MultiplyOperationToken,
        PowerOperationToken,
        RootOperationToken,
        // boolean tokens
        EqualToken,
        InEqualToken,
        VariableNameToken,
        FunctionNameToken,
        // Variable type tokens
        NumberKeyword,
        BooleanKeyword,
        LetterKeyword,
        StringKeyword,
        // Function related tokens
        BindKeyword,
        ReturnKeyword,
        RefKeyword,
        ValKeyword,
        // Literal Values
        LiteralTrueKeyword,
        LiteralFalseKeyword,
        LiteralNumberToken,
        LiteralCharToken,
        LiteralStringToken,
        EOLToken,
        QuoteToken,
        ColonToken,
        SemiColonToken,
        CommaToken,
        QuestionMarkToken,
        WhitespaceToken,
        AssignmentToken,
        AccessorToken,
        OpenParenthesisToken,
        CloseParenthesisToken,
        OpenCurlyBracketToken,
        CloseCurlyBracketToken,
        OpenSquareBracketToken,
        CloseSquareBracketToken,
    }
    public class Token
    {
        ShirEnvironment env;
        public int start { get; private set; }
        public int length { get; private set; }
        public int line, col;
        public SyntaxKind type { get; private set; }
        public Token(int _start, int _length, SyntaxKind _type, ShirEnvironment _env, int line, int col)
        {
            this.start = _start;
            this.length = _length;
            this.type = _type;
            this.env = _env;
            this.line = line;
            this.col = col;
        }
        public string GetLocation() => $"<line:{line},col:{col}>";
        public override string ToString() => $"<{type}> start: {start} length: {length} value: {evaluate()}";
        public string evaluate() => $"'{Regex.Escape(env.code.Substring(start, length))}'";
        public string GetValue() => env.code.Substring(start, length); //type == SyntaxKind.EOLToken ? Environment.NewLine :
    }
}
- If you are interested in some modern and non-traditional approaches to compiler design, I recommend you look at Dotty (Compilers Are Databases talk, slides), Idris (and approaches to writing compilers in Haskell in general), Nanopass compilers, and also the Roslyn C♯ compiler. – Jörg W Mittag, Jun 13, 2017 at 12:54
- The more complex your language gets, the harder it will be to tokenize using Regex. Consider making your own InputStream and TokenStream with look-around and backtracking support. – dfhwze, May 26, 2019 at 16:10
4 Answers
One small comment about your API.
public void Tokenize()
I would expect a Tokenize method to return a stream (IEnumerable<Token>) of Tokens rather than modify state. This should be a pure and idempotent function, in my opinion.
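A minimal sketch of that shape (ReadNextToken is a hypothetical helper that would wrap the existing matching branches; it is not in the posted code):

public IEnumerable<Token> Tokenize(string code)
{
    int cursor = 0;
    while (cursor < code.Length)
    {
        // ReadNextToken would contain the current matching logic and
        // report how many characters it consumed
        (Token token, int consumed) = ReadNextToken(code, cursor);
        cursor += consumed;
        yield return token;
    }
}

Because iterator methods are lazy, callers can stream tokens straight into the parser, or materialize them with .ToList() when they need random access.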
- You're probably right... I think the structure of my compiler is really wrong. I wish I could read up on it. – downrep_nation, Jun 13, 2017 at 11:42
- This was one of the top hits when I googled for "compiler theory": diku.dk/~torbenm/Basics/basics_lulu2.pdf – RubberDuck, Jun 13, 2017 at 12:05
Your code looks very clean and organized.
Only some comments (without understanding the code in depth):
- You could probably use the regex option RegexOptions.Compiled to improve performance.
- The Tokenize method has lots of similar code fragments. You could use a list of SyntaxKind elements like you did with LiteralTokens.
- Here you call MatchesDefition twice:

if (LiteralTokens.Any(n => MatchesDefition(n)))
{
    MakeToken(LiteralTokens.First(n => MatchesDefition(n)));
    cursor++;
    continue;
}
It would be more performant to use the result from the first check like:
bool foundMatch = false;
foreach (var token in LiteralTokens)
{
    if (MatchesDefition(token))
    {
        MakeToken(token);
        cursor++;
        foundMatch = true;
        break;
    }
}
if (foundMatch) { continue; }
Sure, it is not so elegant, but IMHO for a tokenizer performance comes before elegance. Probably there is also a better solution ;)
- You are using a dictionary (Patterns) to access the different patterns. Why not use variables? It is faster and you don't need to work with strings to access them. (See the sketch after this list.)
- The field env can be readonly.
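A sketch of those last two points, with RegexOptions.Compiled from the first bullet thrown in (my illustration, not the poster's code):

private static readonly Regex CharPattern    = new Regex("[\\$a-zA-Z]", RegexOptions.Compiled);
private static readonly Regex StringPattern  = new Regex("[^\"]", RegexOptions.Compiled);
// note: the question's "[[0-9a-zA-Z]" has a doubled '[', which also matches a literal '['
private static readonly Regex NumCharPattern = new Regex("[0-9a-zA-Z]", RegexOptions.Compiled);
private static readonly Regex NumberPattern  = new Regex("[0-9\\.]", RegexOptions.Compiled);

private readonly ShirEnvironment env; // assigned only in the constructor

Call sites then read MatchesPattern(NumberPattern) instead of MatchesPattern(Patterns["NumberPattern"]).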
Really just a few small remarks... overall it's a nice piece of code - good work :)
- I think the answer from Nikita B will solve your point in a better way, but an alternative to your solution would be: var x = LiteralTokens.FirstOrDefault(n => MatchesDefition(n)); if (x != null) {...} – NiklasJ, Jun 15, 2017 at 8:02
- @NiklasJ: That was also my first thought. Unfortunately, LiteralTokens is an array of SyntaxKinds, which are enums. Therefore the default value is the first value of the enum, which is SyntaxKind.PlusOperationToken. It could be used if the SyntaxKind enum were extended with the value SyntaxKind.None = 0. – JanDotNet, Jun 15, 2017 at 19:25
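For reference, a sketch of the workaround JanDotNet describes (assuming the SyntaxKind enum gains an explicit None member):

public enum SyntaxKind
{
    None = 0, // sentinel: default(SyntaxKind) no longer collides with a real token
    PlusOperationToken,
    // ...rest unchanged
}

var match = LiteralTokens.FirstOrDefault(n => MatchesDefition(n));
if (match != SyntaxKind.None)
{
    MakeToken(match);
    cursor++;
    continue;
}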
A few other things:

- cursor += 2; and cursor++; are all over the place in your Tokenizer. This looks very error-prone. You need a class that will move the cursor automatically when you read a substring. Something similar to StringReader, but designed to better fit your task. Maybe:

//instead of returning strings you can return a complex object,
//that would also hold information about columns, lines, etc.
interface ICodeReader
{
    //reads substring without moving `Position`
    string Peek(int count);

    //reads substring and moves `Position` by `count`
    string Read(int count);

    long Length { get; } //code length
    long Position { get; set; } //cursor

    //other members?
}
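A minimal string-backed implementation of that interface could look like this (my sketch, not part of the answer; bounds checking omitted):

class StringCodeReader : ICodeReader
{
    private readonly string code;

    public StringCodeReader(string code) { this.code = code; }

    public long Length => code.Length;  //code length
    public long Position { get; set; }  //cursor

    //look ahead without consuming anything
    public string Peek(int count) => code.Substring((int)Position, count);

    //read and advance Position past what was read
    public string Read(int count)
    {
        string result = Peek(count);
        Position += count;
        return result;
    }
}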
- There is a lot of code repetition in the Tokenize method, which boils down to "check if this string is a token of type X and parse it, if it is". This sounds like an interface to me:

interface ITokenParser
{
    bool CanParse(ICodeReader reader, ....);
    IEnumerable<Token> Parse(ICodeReader reader, ....);
}
Implement one of those for every token (or every group of related tokens), and you should be able to rewrite Tokenize as:

var parsers = new ITokenParser[] {...}; //should be a field
var tokens = new List<Token>();
while (reader.Position < reader.Length)
{
    var parser = parsers.First(p => p.CanParse(reader, ...));
    tokens.AddRange(parser.Parse(reader, ...));
}
You will also be able to easily unit test every ITokenParser in isolation, which is a major benefit.
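For example, against the sketches above (assuming an NUnit-style test framework):

[Test]
public void CanParse_MatchesOnlyItsOwnText()
{
    var parser = new FixedTextTokenParser("**", SyntaxKind.PowerOperationToken);

    Assert.IsTrue(parser.CanParse(new StringCodeReader("**2")));  //power operator matches
    Assert.IsFalse(parser.CanParse(new StringCodeReader("*2")));  //plain multiply does not
}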
- Definitely a cool idea, I will see how I could implement such a thing. Thanks for the suggestions. – downrep_nation, Jun 14, 2017 at 8:39
You can use verbatim strings to reduce the slash chaos:
{ SyntaxKind.RootOperationToken, new Regex(@"\/\/") },
You can also use IReadOnlyDictionary to prevent your dictionaries from being modified. The readonly flag only guarantees that no one can write something like Keywords = null; the dictionary itself can currently still be modified, e.g. Keywords.Clear():
private static readonly IReadOnlyDictionary<string, SyntaxKind> Keywords = new Dictionary<string, SyntaxKind>()
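Declared that way, accidental mutation no longer compiles, while lookups keep working:

// Keywords.Clear();            //no longer compiles: IReadOnlyDictionary has no Clear()
// Keywords["while"] = default; //no longer compiles: the indexer is read-only
SyntaxKind kind = Keywords["return"]; //still works: SyntaxKind.ReturnKeyword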