This is a recursive descent parser for simple language with following grammar:
PROGRAM <- {STATEMENT ';'}* RETURN_STMT ';'
STATEMENT <- NAME_BINDING | TYPE_DECLARATION
TYPE_DECLARATION <- named_param identifier
NAME_BINDING <- identifier '=' EXPRESSION
EXPRESSION <- (simple arithmetics with (), +-*/ and unary minus, arguments can be constants, named parameters or identifiers)
RETURN_STMT = 'return' EXPRESSION {',' EXPRESSION}* ';'
Module entry point is parseGrammar, resulting AST represented as Program
module LangParser(parseGrammar, Token(..), Result(..)) where
import Control.Applicative
import Control.Monad
data Token =
Return |
Simple Char |
DecimalConst String |
HexConst String |
NamedParam String |
Identifier String |
LexError String |
EOF
deriving (Eq, Show)
data Op = ONop | OPlus | OMinus | OStar | OSlash | ONeg deriving (Show, Eq)
instance Ord Op where
compare ONeg _ = GT
compare _ ONeg = LT
compare OStar _ = GT
compare OSlash _ = GT
compare OPlus b
| b == OStar = LT
| b == OSlash = LT
| otherwise = GT
compare OMinus b
| b == OStar = LT
| b == OSlash = LT
| otherwise = GT
compare _ _ = error "invalid comparison"
data Expr =
EList Expr Expr |
EAdd Expr Expr |
ESub Expr Expr |
EMul Expr Expr |
EDiv Expr Expr |
ENeg Expr |
EIntConst Int |
EVariable String |
ENamedParam String
deriving Show
data Stmt =
SReturn [Expr] |
SDecl String String |
SBinding String Expr
deriving Show
type Program = [Stmt]
data ParserState = ParserState {
prog :: Program,
exprStack :: [Expr],
opStack :: [Op],
stmtIdent :: String,
stmtParam :: String,
rest :: [Token]
} deriving Show
data Result a = Error Token | State a deriving Show
instance Monad Result where
(Error e) >>= _ = Error e
(State a) >>= f = f a
return = State
instance Functor Result where
fmap _ (Error t) = Error t
fmap f (State a) = State (f a)
instance Applicative Result where
pure = return
(<*>) = ap
instance Alternative Result where
empty = Error EOF
Error _ <|> p = p
State x <|> _ = State x
epsilon :: ParserState -> Result ParserState
epsilon = State
term :: Token -> ParserState -> Result ParserState
term t state@ParserState {rest = (x:xs)}
| t == x = State state{rest = xs}
| otherwise = Error x
term _ ParserState {rest = []} = Error EOF
termS :: Char -> ParserState -> Result ParserState
termS c = term (Simple c)
termValue :: ParserState -> Result ParserState
termValue state@ParserState {rest = (DecimalConst x:xs)} = State state{exprStack = v : exprStack state, rest = xs}
where v = EIntConst (read x)
termValue state@ParserState {rest = (HexConst x:xs)} = State state{exprStack = v : exprStack state, rest = xs}
where v = EIntConst (read x)
termValue state@ParserState {rest = (NamedParam x:xs)} = State state{exprStack = ENamedParam x : exprStack state, rest = xs}
termValue state@ParserState {rest = (Identifier x:xs)} = State state{exprStack = EVariable x : exprStack state, rest = xs}
termValue ParserState {rest = (x:_)} = Error x
termValue ParserState {rest = []} = Error EOF
termIdent :: ParserState -> Result ParserState
termIdent state@ParserState {rest = (Identifier x:xs)} = State state{stmtIdent = x, rest = xs}
termIdent ParserState {rest = (x:_)} = Error x
termIdent ParserState {rest = []} = Error EOF
foldExpr :: Op -> ParserState -> Result ParserState
foldExpr _ state@ParserState{opStack = []} = State state
foldExpr stop state@ParserState{opStack = (op:_)} | op <= stop = State state
foldExpr stop state@ParserState{exprStack = (e:es), opStack = (ONeg:ops)}
= foldExpr stop state{ exprStack = ENeg e : es, opStack = ops }
foldExpr stop state@ParserState{exprStack = (e1:e2:es), opStack = (op:ops)}
= case op of
OPlus -> foldExpr stop state{ exprStack = EAdd e2 e1 : es, opStack = ops }
OMinus -> foldExpr stop state{ exprStack = ESub e2 e1 : es, opStack = ops }
OStar -> foldExpr stop state{ exprStack = EMul e2 e1 : es, opStack = ops }
OSlash -> foldExpr stop state{ exprStack = EDiv e2 e1 : es, opStack = ops }
_ -> error "invalid op in stack"
foldExpr _ _ = error "invalid state in fold"
exprS :: ParserState -> Result ParserState
exprS = expr >=> foldExpr ONop
expr :: ParserState -> Result ParserState
expr s = expr' s <|> (termNeg >=> expr') s
where
termNeg = termS '-' >=> (\state -> State state{opStack = ONeg : opStack state})
expr' st = (termValue >=> exprRest) st <|>
(termS '(' >=> exprS >=> termS ')' >=> exprRest) st
exprRest :: ParserState -> Result ParserState
exprRest s =
(termOp '+' >=> expr) s <|>
(termOp '-' >=> expr) s <|>
(termOp '*' >=> expr) s <|>
(termOp '/' >=> expr) s <|>
epsilon s
where
termOp c = termS c >=> foldExpr (op c) >=> (\state -> State state{opStack = op c : opStack state})
op '+' = OPlus
op '-' = OMinus
op '*' = OStar
op '/' = OSlash
op _ = error "bad op"
retexprs :: ParserState -> Result ParserState
retexprs = exprS >=> exprList
where
exprList s = (termS ',' >=> retexprs >=> action) s <|> epsilon s
action state@ParserState {exprStack = (e1:e2:es)} = State state{exprStack = EList e2 e1 : es}
action _ = error "2"
retStmt :: ParserState -> Result ParserState
retStmt = term Return >=> retexprs >=> termS ';' >=> action
where
action state@ParserState {exprStack = (e:es)} = State state{prog = SReturn (reverse $ unroll e []) : prog state, exprStack = es}
action _ = error "3"
unroll :: Expr -> [Expr] -> [Expr]
unroll (EList e1 e2) es = unroll e2 (e1:es)
unroll e es = e:es
nameBinding :: ParserState -> Result ParserState
nameBinding = termIdent >=> termS '=' >=> exprS >=> action
where
action state@ParserState {stmtIdent = ident, exprStack = (e:[])}
= State state{prog = SBinding ident e : prog state, exprStack = []}
action _ = error "invalid binding"
paramDecl :: ParserState -> Result ParserState
paramDecl = termNamedParam >=> termIdent >=> action
where
action state@ParserState{stmtParam = param, stmtIdent = ident}
= State state{prog = SDecl param ident: prog state}
termNamedParam state@ParserState {rest = (NamedParam x:xs)} = State state{stmtParam = x, rest = xs}
termNamedParam ParserState {rest = (x:_)} = Error x
termNamedParam ParserState {rest = []} = Error EOF
statements :: ParserState -> Result ParserState
statements s = (statement >=> termS ';' >=> statements) s <|> epsilon s
where statement st = paramDecl st <|> nameBinding st
parseProgram :: ParserState -> Result ParserState
parseProgram s = (statements >=> retStmt) s <|> retStmt s
parseGrammar :: [Token] -> Result ParserState
parseGrammar st = parseProgram (ParserState [] [] [] "" "" st)
I'm looking for code improvements. For example, repeated this lines looks ugly:
NNN ParserState {rest = (x:_)} = Error x
NNN ParserState {rest = []} = Error EOF
Also, I would like to separate parser states for parsing statements and expressions. Writing two separate types will require doubling number of epsilon/term/termIdent functions, which I hope to avoid.
1 Answer 1
A suggested change,
data MyType = TValue | TIdent | TNamedParam
termX :: MyType -> ParserState -> Result ParserState
termX t state = case (t, rest state) of
(TIdent, Identifier x:xs) -> vstate' x xs
(TNamedParam, NamedParam x:xs) -> vstate' x xs
(TValue, HexConst x:xs) -> vstate xs (v' x)
(TValue, DecimalConst x:xs) -> vstate xs (v' x)
(TValue, NamedParam x:xs) -> vstate xs (ENamedParam x)
(TValue, Identifier x:xs) -> vstate xs (EVariable x)
(_ , (x:_)) -> Error x
(_ , []) -> Error EOF
where v' = EIntConst . read
vstate xs v = State state{exprStack = v: exprStack state, rest = xs}
vstate' x xs = State state{stmtParam = x, rest = xs}
termValue = termX TValue
termIdent = termX TIdent
termNamedParam = termX TNamedParam
I think this is better than repeating the state@ again and again.
I think the below might also be nice since it is avoiding the repetition of foldexpression
foldExpr _ state@ParserState{opStack = []} = State state
foldExpr stop state@ParserState{exprStack = ex, opStack = (x:ops)} = case process ex x of
Just ev -> foldExpr stop state{ exprStack = ev, opStack = ops }
Nothing -> State state
where process (e:es) ONeg = Just $ ENeg e : es
process (e1:e2:es) OPlus = Just $ EAdd e2 e1 : es
process (e1:e2:es) OMinus = Just $ ESub e2 e1 : es
process (e1:e2:es) OStar = Just $ EMul e2 e1 : es
process (e1:e2:es) OSlash = Just $ EDiv e2 e1 : es
process e y | y <= stop = Nothing
A slightly terse version of exprRest. I think the do notation is clearer here.
exprRest :: ParserState -> Result ParserState
exprRest s = foldr (\op acc ->
(termOp op >=> expr) s <|> acc) (epsilon s) ['+', '-', '*', '/']
where
termOp c = termS c >=> foldExpr (op c) >=> \state ->
State state{opStack = op c : opStack state}
op '+' = OPlus
op '-' = OMinus
op '*' = OStar
op '/' = OSlash
op _ = error "bad op"
Here is the only change I made, Look at this pattern,
((termOp '+' >=> expr) s) <|>
((termOp '-' >=> expr) s) <|>
((termOp '*' >=> expr) s) <|>
((termOp '/' >=> expr) s) <|>
epsilon s
Which is same as
let myfn op = (termOp op >=> expr) s in
((myfn '+') <|> (myfn '-') <|> (myfn '*') <|> (myfn '/') <|>
epsilon s
This can be replaced with a fold
fold (\opfn acc -> opfn <|> acc) (epsilon s) [myfn +, myfn -, myfn *, myfn /]
equivalently, ...
So I used your question as an excuse to understand lenses. Here is the resulting code. See if you like it. (I don't claim it is good as I am still learning lenses)
{-# LANGUAGE TemplateHaskell, FlexibleContexts #-}
module LangParser(parseGrammar, Token(..), Result(..)) where
import Control.Applicative
import Control.Monad
import Data.Lens.Template (makeLenses)
import Data.Lens.Lazy
data Token =
Return |
Simple Char |
DecimalConst String |
HexConst String |
NamedParam String |
Identifier String |
LexError String |
EOF
deriving (Eq, Show)
data Op = ONop | OPlus | OMinus | OStar | OSlash | ONeg deriving (Show, Eq)
instance Ord Op where
compare ONeg _ = GT
compare _ ONeg = LT
compare OStar _ = GT
compare OSlash _ = GT
compare OPlus b
| b == OStar = LT
| b == OSlash = LT
| otherwise = GT
compare OMinus b
| b == OStar = LT
| b == OSlash = LT
| otherwise = GT
compare _ _ = error "invalid comparison"
data Expr = EList Expr Expr |
EAdd Expr Expr |
ESub Expr Expr |
EMul Expr Expr |
EDiv Expr Expr |
ENeg Expr |
EIntConst Int |
EVariable String |
ENamedParam String
deriving Show
data Stmt = SReturn [Expr] |
SDecl String String |
SBinding String Expr
deriving Show
type Program = [Stmt]
data ParserState = ParserState {
_prog :: Program,
_exprStack :: [Expr],
_opStack :: [Op],
_stmtIdent :: String,
_stmtParam :: String,
_rest :: [Token]
} deriving Show
$( makeLenses [''ParserState])
data Result a = Error Token | State a
deriving Show
instance Monad Result where
(Error e) >>= _ = Error e
(State a) >>= f = f a
return = State
instance Functor Result where
fmap _ (Error t) = Error t
fmap f (State a) = State (f a)
instance Applicative Result where
pure = return
(<*>) = ap
instance Alternative Result where
empty = Error EOF
Error _ <|> p = p
State x <|> _ = State x
epsilon :: ParserState -> Result ParserState
epsilon = State
term :: Token -> ParserState -> Result ParserState
term t state = case state ^. rest of
(x:xs) -> if t == x then State (rest ^= xs $ state) else Error x
[] -> Error EOF
termS :: Char -> ParserState -> Result ParserState
termS c = term (Simple c)
data MyType = TValue | TIdent | TNamedParam
termX :: MyType -> ParserState -> Result ParserState
termX t state = case (t, rest ^$ state) of
(TIdent, Identifier x:xs) -> vstate' x xs
(TNamedParam, NamedParam x:xs) -> vstate' x xs
(TValue, HexConst x:xs) -> vstate xs (v' x)
(TValue, DecimalConst x:xs) -> vstate xs (v' x)
(TValue, NamedParam x:xs) -> vstate xs (ENamedParam x)
(TValue, Identifier x:xs) -> vstate xs (EVariable x)
(_ , (x:_)) -> Error x
(_ , []) -> Error EOF
where v' = EIntConst . read
vstate xs v = State ( exprStack ^= v: (exprStack ^$ state) $ rest ^= xs $ state )
vstate' x xs = State ( stmtParam ^= x $ rest ^= xs $ state)
foldExpr :: Op -> ParserState -> Result ParserState
foldExpr stop state = case opStack ^$ state of
[] -> State state
(x:ops) -> case process ex x of
Just ev -> foldExpr stop (exprStack ^= ev $ opStack ^= ops $ state)
Nothing -> State state
where process (e:es) ONeg = Just $ ENeg e : es
process (e1:e2:es) OPlus = Just $ EAdd e2 e1 : es
process (e1:e2:es) OMinus = Just $ ESub e2 e1 : es
process (e1:e2:es) OStar = Just $ EMul e2 e1 : es
process (e1:e2:es) OSlash = Just $ EDiv e2 e1 : es
process e y | y <= stop = Nothing
ex = exprStack ^$ state
exprS :: ParserState -> Result ParserState
exprS = expr >=> foldExpr ONop
expr :: ParserState -> Result ParserState
expr s = expr' s <|> (termNeg >=> expr') s
where
termNeg s = termS '-' s >>= State . (opStack ^%= (ONeg :))
expr' st = (termX TValue >=> exprRest) st <|> (termS '(' >=> exprS >=> termS ')' >=> exprRest) st
unroll :: Expr -> [Expr] -> [Expr]
unroll (EList e1 e2) es = unroll e2 (e1:es)
unroll e es = e:es
exprRest :: ParserState -> Result ParserState
exprRest s =
(termOp '+' >=> expr) s <|>
(termOp '-' >=> expr) s <|>
(termOp '*' >=> expr) s <|>
(termOp '/' >=> expr) s <|>
epsilon s
where
termOp c = termS c >=> foldExpr (op c) >=> State . (opStack ^%= (op c :))
op '+' = OPlus
op '-' = OMinus
op '*' = OStar
op '/' = OSlash
op _ = error "bad op"
retexprs :: ParserState -> Result ParserState
retexprs s = do a <- exprS s
b <- termS ',' a
c <- retexprs b
action c <|> epsilon s
where
action state = case getES state of
(e1:e2:es) -> State (myprog state e1 e2 es)
_ -> error "2"
myprog state e1 e2 es = setES state (EList e2 e1 : es)
retStmt :: ParserState -> Result ParserState
retStmt s = do a <- term Return s
b <- retexprs a
c <- termS ';' b
action c
where
action :: ParserState -> Result ParserState
action state = case getES state of
(e:es) -> State (myprog state e es)
_ -> error "3"
myprog state e es = prog ^= SReturn (reverse $ unroll e []) : (state ^. prog) $ setES state es
nameBinding :: ParserState -> Result ParserState
nameBinding s = do a <- termX TIdent s
b <- termS '=' a
c <- exprS b
action c
where
action :: ParserState -> Result ParserState
action state = case getES state of
(e:[]) -> State (myprog state e)
_ -> error "invalid binding"
myprog state e = prog ^= SBinding (state ^. stmtIdent) e : (state ^. prog) $ setES state []
paramDecl :: ParserState -> Result ParserState
paramDecl s = do a <- termX TNamedParam s
b <- termX TIdent a
action b
where
action :: ParserState -> Result ParserState
action state = State (prog ^= SDecl (state ^. stmtParam) (state ^. stmtIdent) : (state ^. prog) $ state)
setES state es = (exprStack ^= es) state
getES state = state ^. exprStack
statements :: ParserState -> Result ParserState
statements s = do b <- statement s
c <- termS ';' b
statements c <|> epsilon s
where statement st = paramDecl st <|> nameBinding st
parseProgram :: ParserState -> Result ParserState
parseProgram s = do b <- statements s
retStmt b <|> retStmt s
parseGrammar :: [Token] -> Result ParserState
parseGrammar st = parseProgram (ParserState [] [] [] "" "" st)
-
\$\begingroup\$ It's exactly how it supposed to be. In comparison left operand is always on operations stack and right operand is just taken from token stream. Left-associative operations with equal precedence should compare as GT, right-associative as LT. This is required for arithmetically correct :) parsing. \$\endgroup\$blaze– blaze2012年06月15日 12:48:29 +00:00Commented Jun 15, 2012 at 12:48
-
\$\begingroup\$ And I can't derive from Ord because OPlus and OMinus have equal precedence, but are different. \$\endgroup\$blaze– blaze2012年06月15日 12:49:43 +00:00Commented Jun 15, 2012 at 12:49
-
\$\begingroup\$ So is it correct for Op to have an Ord instance? Isn't one of the requirements for an Ord class, a total ordering? I suppose, it may not lead to problems here, but I was taught to be very careful to only define instances of Typeclasses if you can provide the same guarantees of their interfaces. \$\endgroup\$Rahul Gopinath– Rahul Gopinath2012年06月15日 15:45:55 +00:00Commented Jun 15, 2012 at 15:45
-
\$\begingroup\$ I see your point. Will remove Ord and change to plain comparison function. \$\endgroup\$blaze– blaze2012年06月16日 22:08:46 +00:00Commented Jun 16, 2012 at 22:08
-
\$\begingroup\$ Could you explain change in exprRest? I don't get how it works. \$\endgroup\$blaze– blaze2012年06月16日 22:16:16 +00:00Commented Jun 16, 2012 at 22:16
foldExpr
,exprS
,expr
andexprRest
\$\endgroup\$