\$\begingroup\$
\$\endgroup\$
I ported a lexer I wrote in C++ over to Rust, as I'm starting to learn Rust. Since I'm very new, though, I don't know the idioms and good practices of Rust, so if anyone could point out some (probably obvious) issues, I'd be very thankful.
This is my code:
use std::cmp::PartialEq;
use std::fmt::Debug;
/// A single lexical token produced by [`get_token`].
///
/// Each variant is documented with the source text that `get_token` maps to it.
/// `Clone` is derived so tokens can be buffered or duplicated for look-ahead;
/// `Eq` cannot be derived because [`Token::FNum`] carries an `f64`.
#[derive(Debug, Clone, PartialEq)]
pub enum Token {
    /// The input is exhausted.
    EOF,
    /// An identifier: an alphabetic character followed by alphanumerics or `_`,
    /// which is not one of the keywords below.
    Ident(String),
    /// A double-quoted string literal with its escape sequences already resolved.
    Str(String),
    /// A numeric literal containing a `.`.
    FNum(f64),
    /// A numeric literal without a `.`.
    INum(u64),
    /// `=`
    Assign,
    /// `<<`
    BLsh,
    /// `>>`
    BRsh,
    /// `>>>`
    BURsh,
    /// Keyword `if`.
    If,
    /// Keyword `else`.
    Else,
    /// Keyword `elif`.
    Elif,
    /// Keyword `loop`.
    Loop,
    /// Keyword `stop`.
    Stop,
    /// Keyword `skip`.
    Skip,
    /// Keyword `yes`.
    Yes,
    /// Keyword `no`.
    No,
    /// Keyword `nope`.
    Nope,
    /// Keyword `fun`.
    Fun,
    /// Keyword `return`.
    Ret,
    /// Keyword `and`.
    And,
    /// Keyword `not`.
    Not,
    /// Keyword `or`.
    Or,
    /// `[`
    LBrack,
    /// `{`
    LBrace,
    /// `(`
    LPar,
    /// `]`
    RBrack,
    /// `}`
    RBrace,
    /// `)`
    RPar,
    /// `;`
    Semi,
    /// `,`
    Comma,
    /// `.` (when it does not appear inside a numeric literal)
    Get,
    /// `$`
    Concat,
    /// `//`
    IDiv,
    /// `/`
    FDiv,
    /// `+`
    Add,
    /// `-`
    Minus,
    /// `*`
    Mul,
    /// `%`
    Mod,
    /// `^`
    BXOr,
    /// `&`
    BAnd,
    /// `|`
    BOr,
    /// `?=`
    IE,
    /// `==`
    EQ,
    /// `!=`
    NE,
    /// `<`
    LT,
    /// `<=`
    LE,
    /// `>`
    GT,
    /// `>=`
    GE,
}
/// Evaluates to the next character of a peekable character iterator without
/// consuming it.
///
/// When the iterator is exhausted, the macro does not produce a value: it
/// makes the *enclosing function* return `Token::EOF`. It can therefore only
/// be used inside functions whose return type is `Token`.
macro_rules! peek_char {
    ($chars:expr) => {
        if let Some(&upcoming) = $chars.peek() {
            upcoming
        } else {
            return Token::EOF;
        }
    };
}
pub fn get_token(iterator: &mut std::iter::Peekable<std::str::Chars>) -> Token {
let mut next: char = peek_char!(iterator);
let mut current_token: String = String::new();
if next.is_whitespace() {
loop {
next = peek_char!(iterator);
if !next.is_whitespace() {
break;
}
iterator.next();
}
get_token(iterator)
} else if next == '#' {
loop {
next = peek_char!(iterator);
if next == '\n' {
break;
}
iterator.next();
}
get_token(iterator)
} else if next.is_alphabetic() {
loop {
next = match iterator.peek().cloned() {
Some(c) => c,
None => {
break;
}
};
if !(next.is_alphanumeric() || next == '_') {
break;
}
iterator.next();
current_token.push(next);
}
match current_token.as_str() {
"if" => Token::If,
"else" => Token::Else,
"elif" => Token::Elif,
"loop" => Token::Loop,
"stop" => Token::Stop,
"skip" => Token::Skip,
"yes" => Token::Yes,
"no" => Token::No,
"nope" => Token::Nope,
"fun" => Token::Fun,
"return" => Token::Ret,
"and" => Token::And,
"not" => Token::Not,
"or" => Token::Or,
_ => Token::Ident(current_token),
}
} else if next == '"' {
iterator.next();
loop {
next = peek_char!(iterator);
let to_add: char = if next == '\\' {
iterator.next();
next = peek_char!(iterator);
match next {
't' => '\t',
'b' => '\x08',
'n' => '\n',
'r' => '\r',
'f' => '\x0c',
'"' => '"',
'\\' => '\\',
_ => panic!("unknown escaped character"),
}
} else if next == '"' {
iterator.next();
break;
} else {
next
};
iterator.next();
current_token.push(to_add);
}
Token::Str(current_token)
} else if next.is_digit(10) {
loop {
next = match iterator.peek().cloned() {
Some(c) => c,
None => {
break;
}
};
if !(next.is_digit(10) || next == '.') {
break;
}
iterator.next();
if next == '.' && current_token.contains('.') {
panic!("multiple decimal points in number");
}
current_token.push(next);
}
if current_token.contains('.') {
Token::FNum(
current_token
.parse::<f64>()
.expect("error reading float literal"),
)
} else {
Token::INum(
u64::from_str_radix(¤t_token, 10).expect("error reading integer literal"),
)
}
} else if next == '/' {
iterator.next();
next = match iterator.peek().cloned() {
Some(c) => c,
None => {
return Token::FDiv;
}
};
if next == '/' {
iterator.next();
Token::IDiv
} else {
Token::FDiv
}
} else if next == '?' {
iterator.next();
next = peek_char!(iterator);
iterator.next();
if next == '=' {
Token::IE
} else {
panic!("unknown character");
}
} else if next == '!' {
iterator.next();
next = peek_char!(iterator);
iterator.next();
if next == '=' {
Token::NE
} else {
panic!("unknown character");
}
} else if next == '=' {
iterator.next();
next = match iterator.peek().cloned() {
Some(c) => c,
None => {
return Token::Assign;
}
};
if next == '=' {
iterator.next();
Token::EQ
} else {
Token::Assign
}
} else if next == '<' {
iterator.next();
next = match iterator.peek().cloned() {
Some(c) => c,
None => {
return Token::LT;
}
};
if next == '=' {
iterator.next();
Token::LE
} else if next == '<' {
iterator.next();
Token::BLsh
} else {
Token::LT
}
} else if next == '>' {
iterator.next();
next = match iterator.peek().cloned() {
Some(c) => c,
None => {
return Token::GT;
}
};
if next == '=' {
iterator.next();
Token::GE
} else if next == '>' {
iterator.next();
next = match iterator.peek().cloned() {
Some(c) => c,
None => {
return Token::BRsh;
}
};
if next == '>' {
iterator.next();
Token::BURsh
} else {
Token::BRsh
}
} else {
Token::GT
}
} else {
iterator.next();
match next {
'[' => Token::LBrack,
'{' => Token::LBrace,
'(' => Token::LPar,
']' => Token::RBrack,
'}' => Token::RBrace,
')' => Token::RPar,
';' => Token::Semi,
'.' => Token::Get,
',' => Token::Comma,
'+' => Token::Add,
'-' => Token::Minus,
'*' => Token::Mul,
'%' => Token::Mod,
'$' => Token::Concat,
'^' => Token::BXOr,
'&' => Token::BAnd,
'|' => Token::BOr,
_ => panic!("unknown character"),
}
}
}
AlexV
7,3532 gold badges24 silver badges47 bronze badges
asked Feb 10, 2020 at 23:36
1 Answer 1
\$\begingroup\$
\$\endgroup\$
A couple of general notes:
- I’m not sure why you used a macro for `peek_char!`. I’m relatively certain this could have been a standard function.
- Take some time to learn about slices.
- You have a single, several hundred line function there. It’s hard to read and follow. Extract lots of well named functions to improve readability.
- If you `panic` inside your function, there’s not really any way for the person calling your function to handle it. This should return a `Result<Token, ParseError>`.
answered Feb 12, 2020 at 10:12
Explore related questions
See similar questions with these tags.
lang-rust