What I'd like to see in your review, in order of relevance:
- Is there any bugs? (I see none, but...)
- Is the code efficient? (by whatever metric you'd like to use)
- Is the code easy to understand?
However any feedback is welcomed.
tags.lua
return {
tokens = {
num = 1,
add = 2,
sub = 3,
mul = 4,
div = 5,
rpa = 6,
lpa = 7,
eof = -1
},
nodes = {
num = 1,
neg = 2,
add = 3,
sub = 4,
mul = 5,
div = 6
}
}
tokenizer.lua
local tags = require 'tags'
local function tokenize (s)
local i = 1
local tokens = {}
local sub = string.sub
local function current ()
return sub(s, i, i)
end
local function is_digit ()
local c = current() or ' '
return c >= '0' and c <= '9'
end
while i <= #s do
if current() == '+' then
tokens[#tokens + 1] = { tag = tags.tokens.add, pos = i }
elseif current() == '-' then
tokens[#tokens + 1] = { tag = tags.tokens.sub, pos = i }
elseif current() == '*' then
tokens[#tokens + 1] = { tag = tags.tokens.mul, pos = i }
elseif current() == '/' then
tokens[#tokens + 1] = { tag = tags.tokens.div, pos = i }
elseif current() == '(' then
tokens[#tokens + 1] = { tag = tags.tokens.lpa, pos = i }
elseif current() == ')' then
tokens[#tokens + 1] = { tag = tags.tokens.rpa, pos = i }
elseif is_digit(current()) then
local j = i
while is_digit() do
i = i + 1
end
if current() == '.' then
i = i + 1
if not is_digit() then
error({ message = 'Expected digit after dot', pos = i })
end
while is_digit() do
i = i + 1
end
end
i = i - 1
tokens[#tokens + 1] = {
tag = tags.tokens.num,
lexeme = string.sub(s, j, i),
pos = j
}
elseif current() ~= ' ' then
error({ message = 'Symbol not recognized', pos = i })
end
i = i + 1
end
tokens[#tokens + 1] = { tag = tags.tokens.eof, pos = 1 }
return tokens
end
return { tokenize = tokenize }
parser.lua
local tags = require 'tags'
local tokenizer = require 'tokenizer'
local tremove = table.remove
local parse_expression_1
local context
local function advance()
context.i = context.i + 1
end
local function is_at(tag_name)
return context.tokens[context.i].tag == tags.tokens[tag_name]
end
local function add(node)
context.ast[#context.ast + 1] = node
end
local function pop()
return tremove(context.ast, #context.ast)
end
local function current()
return context.tokens[context.i]
end
local function report_syntax_error(message)
error({ message = message, pos = current().pos })
end
local function parse_expression_3 ()
if is_at('num') then
add({ tag = tags.nodes.num, lexeme = current().lexeme })
advance()
elseif is_at('sub') then
advance()
parse_expression_3()
add({ tag = tags.nodes.neg, expr = pop() })
elseif is_at('lpa') then
advance()
parse_expression_1()
if is_at('rpa') then
advance()
else
report_syntax_error('Expected right parenthese')
end
else
report_syntax_error('Expected number, left parenthese or minus sign')
end
end
local function parse_expression_2 ()
parse_expression_3()
while true do
local next_tag
if is_at('mul') then
next_tag = tags.nodes.mul
elseif is_at('div') then
next_tag = tags.nodes.div
else
return
end
advance()
parse_expression_3()
local right = pop()
local left = pop()
add({ tag = next_tag, left = left, right = right })
end
end
function parse_expression_1 ()
parse_expression_2()
while true do
local next_tag
if is_at('add') then
next_tag = tags.nodes.add
elseif is_at('sub') then
next_tag = tags.nodes.sub
else
return
end
advance()
parse_expression_2()
local right = pop()
local left = pop()
add({ tag = next_tag, left = left, right = right })
end
end
local function parse (raw_code)
context = { tokens = tokenizer.tokenize(raw_code), i = 1, ast = {} }
parse_expression_1()
if not is_at('eof') then
report_syntax_error('Expected end of input')
end
return context.ast[1]
end
return { parse = parse }
The following codes are not part of the parser, but you may find them handy in order to test the parser -- and you can review them too if you like.
interpreter.lua
local tags = require 'tags'
local function evaluate (ast)
if ast.tag == tags.nodes.num then
return tonumber(ast.lexeme)
end
if ast.tag == tags.nodes.neg then
return -evaluate(ast.expr)
end
if ast.tag == tags.nodes.add then
return evaluate(ast.left) + evaluate(ast.right)
end
if ast.tag == tags.nodes.sub then
return evaluate(ast.left) - evaluate(ast.right)
end
if ast.tag == tags.nodes.mul then
return evaluate(ast.left) * evaluate(ast.right)
end
if ast.tag == tags.nodes.div then
local left = evaluate(ast.left)
local right = evaluate(ast.right)
if right == 0 then
error({ message = 'Cannot divide by 0' })
end
return left / right
end
error('Internal error: Unexpected ast_tag')
end
return { evaluate = evaluate }
repl.lua
local parser = require 'parser'
local interpreter = require 'interpreter'
local function repl ()
print('type "exit" to exit')
while true do
io.write('> ')
local s = io.read()
if s == 'exit' then
break
end
local success, v = pcall(function ()
return interpreter.evaluate(parser.parse(s))
end)
if success then
print(v)
else
print('Error: ' .. v.message)
if v.pos then
print(s)
print(string.rep(' ', v.pos - 1) .. '^')
end
end
end
print()
print('Bye ;)')
end
repl()
1 Answer 1
Style
Your placement of the parameters in a function definition isn't consistent. The typical convention, across languages, is no space between the name and the (
. There is no "standard" style for Lua, though, so if you want to put a space before, go ahead! Just do it on all of your functions, and not just most of them.
A useful style is to use "trailing commas" whenever you have lists or tables split across multiple lines. This reduces the churn in line-based diffs (like what git
and hg
use) and overall can make the code look more regular.
tokens[#tokens + 1] = {
tag = tags.tokens.num,
lexeme = string.sub(s, j, i),
pos = j, -- you can add trailing comma here
}
"left parenthese" isn't correct spelling. Although it may look awkward, the correct word is "left parenthesis". One way to get around this (and also produce a more precise and concise diagnostic) is to just the symbol itself:
report_syntax_error('Expected number, `(` or `-`')
tags
Instead of using numbers for your tags, you can just use strings. ==
on strings in Lua is exactly as fast (and does essentially the same thing; comparing an integer rather than a float [if you're using Lua 5.1/5.2]) as ==
on numbers. This will mean your tables are much more interpretable when debugging and it simplifies the overall structure.
The advantage of some kind of "enum" approach is that you can more easily avoid typos (e.g. 'sbu'
instead of 'sub'
) because you have a central place to validate all of the strings you're using, and IDEs will be more able to help you. However, because this is so simple, and bugs here would almost surely be caught in simple tests, I would err on the side of simplicity and deletings tags.lua
altogether, just using the string constants as you need them.
One benefit that falls out of this is being able to use punctuation as tags, for example, the self explanatory '('
instead of 'lpu'
.
If you want to get value of tags.lua
, you might want to, for example, add a metatable to tokens
that throws an error if an unknown token is mentioned.
tokenizer
Making an alias local sub = string.sub
does help slightly with performance, but it's most likely not worth it. Following the same logic track, you should eliminate current
and simply repeat sub(s, i, i)
over and over. But actually, computing .
and entering functions is not where your code is spending most if its time, so it's not really worth optimizing.
This is especially true when there are other options that will save more time and clean up your code: For example, you don't need to call current()
so many times. Instead of having that function, you could instead make a local variable:
local current = s:sub(i, i)
if current == '+' then
By the way, since current()
always returned a string, current() or ' '
is redundant; the ' '
branch will never be used.
Instead of using while
loops for lexing numbers, you could use Lua's built in pattern matching. This will likely run faster, and is a lot less code.
local _, integer_stop = s:find("[0-9]+", i)
local number_stop = integer_stop
if s:sub(integer_stop + 1, integer_stop + 1) == "." then
local _, fraction_stop = s:find("[0-9]+", integer_stop + 2)
if not fraction_stop then
error({ message = 'Expected digit after dot', pos = integer_stop + 2 })
end
number_stop = fraction_stop
end
tokens[#tokens + 1] = {
pos = i,
lexeme = s:sub(i, number_stop),
}
i = number_stop - 1
parser
The same comment for localizing string.sub
goes for table.remove
; you shouldn't write code in a different way just to save a few cycles until everything else is perfect.
Your pop
function is unnecessary; table.remove
removes the last element from a list if you don't specify an argument. However, I find it inconsistent that you use t[#t + 1] =
to insert, but use table.remove
to remove. I would be consistent with my idioms, and use t[#t] = nil
to pop values off of a stack.
By the way, the typical name for what you call add
is push
. "add
" is an extra unfortunate name, since it's also the name of one of your tokens.
Using a global variable context
is an unnecessary source of complexity and ugliness, especially because the things that operate on it are so simple.
I would either
- eliminate
add
,pop
,is_at
, andcurrent
, inlining them into all of their uses - or make
context
an argument to each ofadd
,pop
,is_at
, andcurrent
.
You're not actually using your context.ast
stack though! Notice that you pop everything off at the end of every function. Instead of going through the stack, you should make your parse_
functions return the AST that they have just parsed. For example, here's what parse_expression_2
might look like (simplifying to only include '*'
and not '/'
):
local function parse_expression_2 ()
local ast = parse_expression_3()
while true do
if is_at('*') then
advance()
else
return ast
end
local right = parse_expression_3()
ast = {tag = '*', left = ast, right = right}
end
end
You could do a similar thing for the tokens (notice that is_at
is always paired with an advance()
immediately after!) where they return true
if it starts, and automatically advances, and otherwise returns false
. I would rename them to something like try_consume
in that case, to make it clearer what they do.
Instead of an 'eof' token, you could just use size of the token list; unlike, say, C, we know where the ends of our arrays are.
interpreter
The most obvious change to me is to use elseif
to compress the many cases. While you could always use that style when a branch return
s, it especially makes sense here since you are handling different cases of the same variable. Also, apply the suggestion to use raw strings rather than tokens, and you get something much cleaner:
local function evaluate (ast)
if ast.tag == 'num' then
return tonumber(ast.lexeme)
elseif ast.tag == 'u-' then
return -evaluate(ast.expr)
elseif ast.tag == '+' then
return evaluate(ast.left) + evaluate(ast.right)
end
repl
Your use of error
pcall
makes your code fairly simple, but it's also not a great design for large-scale projects. The best way to use error
in Lua is to mark programmer errors; ie, if your code throws an error
, you have a mistake in your code.
To mark bad input (like you're currently using error
for), it should return an error message. The Lua standard library typically does this by returning nil
and then the message:
return nil, "Cannot divide by 0"
However, doing this does make your code more complicated, since you have to check error values at every call. On the other hand, if you make a mistake in your program, your pcall
is going to try to read that and show it to the user like they put in bad input. (It will actually fail in a confusing way with 'Error: ' .. v.message
failing because v.message
will be nil
).
Not throwing error
s makes your code easier to embed in other code. If every caller of your library is supposed to surround every function call with a pcall
, you've not designed a very good interface!
On the other hand, if you can be certain that your code doesn't have any unexpected errors (or you catch and rethrow them, for example, by inspecting the type(v)
when pcall
returns unsuccessfully), you could keep the same internal structure, and just add a pcall
to return the error out of each exported function (just evaluate
, parse
).