
The other day I decided to build a top-down lexer in Rust, just for fun.
This is what I have so far:

use std::str::Chars;
use std::env;
use std::fs;
#[derive(Debug, Clone)]
enum Op {
    Plus,
    Minus,
    Multiply,
    Divide,
    Modulo,
    Equal,
    Not,
    NotEqual,
    Greater,
    GreaterOrEqual,
    Less,
    LessOrEqual,
}

#[derive(Debug, Clone)]
enum TokenKind {
    Opr(Op),
    Ident(String),
    Str(String),
    Num(String),
}

#[derive(Debug, Clone)]
struct Lexer<'a> {
    // string containing entire program
    source: &'a str,
    // the peekable character
    peek: Option<char>,
    // the characters iterator
    chars: Chars<'a>,
    // utf-8 byte position in the string
    pos: usize,
    // the bottom two are for error reporting/debugging
    row: usize,
    col: usize,
}
#[allow(unused)]
impl<'a> Lexer<'a> {
    fn new(source: &'a str) -> Self {
        let mut chars = source.chars();
        Self {
            source,
            peek: chars.next(),
            chars,
            pos: 0,
            row: 1,
            col: 1,
        }
    }

    #[inline]
    fn position(&self) -> (usize, usize) {
        (self.row, self.col)
    }

    #[inline]
    fn is_over(&self) -> bool {
        self.peek().is_none()
    }
    #[inline]
    fn bump(&mut self) {
        // Consume the currently peeked char (if any), then refill the peek.
        if let Some(ch) = self.peek {
            self.pos += ch.len_utf8();
            if ch == '\n' {
                self.col = 1;
                self.row += 1;
            } else {
                self.col += 1;
            }
            self.peek = self.chars.next();
        }
    }
    #[inline]
    fn peek(&self) -> Option<char> {
        self.peek
    }

    #[inline]
    fn unicode_escape(&mut self, ch: char) -> Option<char> {
        // e.g. \n \t \u{...}
        let res = match ch {
            'n' => '\n',
            't' => '\t',
            'r' => '\r',
            'u' => todo!("implement unicode character codes"),
            'x' => todo!("implement hex character codes"),
            other => other,
        };
        Some(res)
    }
    fn trim_ident(&mut self) -> TokenKind {
        let start_pos = self.pos;
        while let Some(s) = self.peek() {
            if !(s.is_alphanumeric() || s == '_') {
                break;
            }
            self.bump();
        }
        TokenKind::Ident(self.source[start_pos..self.pos].to_owned())
    }

    fn trim_number(&mut self) -> TokenKind {
        let start_pos = self.pos;
        while let Some(s) = self.peek() {
            if !s.is_numeric() {
                break;
            }
            self.bump();
        }
        TokenKind::Num(self.source[start_pos..self.pos].to_owned())
    }
    fn trim_string(&mut self) -> TokenKind {
        // skip the opening quote
        self.bump();
        let start_pos = self.pos;
        while let Some(s) = self.peek() {
            if s == '"' {
                break;
            }
            if s == '\\' {
                // skip the escaped character as well
                self.bump();
            }
            self.bump();
        }
        if self.is_over() {
            todo!("Could not lex string");
        }
        // skip the closing quote, then slice off everything but that quote
        self.bump();
        TokenKind::Str(self.source[start_pos..self.pos - 1].to_owned())
    }
 
    #[inline]
    fn trim_comment(&mut self) {
        // Skip to (but not past) the newline; it is later consumed as
        // whitespace, which keeps all row/col tracking inside bump.
        while let Some(s) = self.peek() {
            if s == '\n' {
                break;
            }
            self.bump();
        }
    }
    #[inline]
    fn trim_block_comment(&mut self) {
        // Depth of nested /* ... */ comments; we enter having consumed the
        // opening "/" with the "*" still peeked.
        let mut depth: usize = 1;
        while let Some(s) = self.next_char() {
            match s {
                '/' => {
                    if let Some('*') = self.peek() {
                        self.bump();
                        depth += 1;
                    }
                }
                '*' => {
                    if let Some('/') = self.peek() {
                        self.bump();
                        depth -= 1;
                    }
                }
                _ => {}
            }
            if depth == 0 {
                break;
            }
        }
    }
    #[inline]
    fn trim_whitespace(&mut self) {
        while let Some(s) = self.peek() {
            if !s.is_whitespace() {
                break;
            }
            self.bump();
        }
    }

    #[inline]
    fn next_char(&mut self) -> Option<char> {
        let peek = self.peek;
        self.bump();
        peek
    }
}
impl<'a> Iterator for Lexer<'a> {
    type Item = TokenKind;

    fn next(&mut self) -> Option<TokenKind> {
        loop {
            self.trim_whitespace();
            return match self.peek()? {
                x if x.is_alphabetic() => Some(self.trim_ident()),
                x if x.is_numeric() => Some(self.trim_number()),
                '"' => Some(self.trim_string()),
                '+' => {
                    self.bump();
                    Some(TokenKind::Opr(Op::Plus))
                }
                '-' => {
                    self.bump();
                    Some(TokenKind::Opr(Op::Minus))
                }
                '*' => {
                    self.bump();
                    Some(TokenKind::Opr(Op::Multiply))
                }
                '/' => {
                    self.bump();
                    if let Some('/') = self.peek() {
                        self.trim_comment();
                        continue;
                    }
                    if let Some('*') = self.peek() {
                        self.trim_block_comment();
                        continue;
                    }
                    Some(TokenKind::Opr(Op::Divide))
                }
                '%' => {
                    self.bump();
                    Some(TokenKind::Opr(Op::Modulo))
                }
                '=' => {
                    self.bump();
                    Some(TokenKind::Opr(Op::Equal))
                }
                '!' => {
                    self.bump();
                    if let Some('=') = self.peek() {
                        self.bump();
                        Some(TokenKind::Opr(Op::NotEqual))
                    } else {
                        Some(TokenKind::Opr(Op::Not))
                    }
                }
                '>' => {
                    self.bump();
                    if let Some('=') = self.peek() {
                        self.bump();
                        Some(TokenKind::Opr(Op::GreaterOrEqual))
                    } else {
                        Some(TokenKind::Opr(Op::Greater))
                    }
                }
                '<' => {
                    self.bump();
                    if let Some('=') = self.peek() {
                        self.bump();
                        Some(TokenKind::Opr(Op::LessOrEqual))
                    } else {
                        Some(TokenKind::Opr(Op::Less))
                    }
                }
                _ => None,
            };
        }
    }
}
macro_rules! elapsed {
    ($($code:tt)*) => {
        {
            let start = std::time::Instant::now();
            $($code)*;
            println!("Time elapsed is: {:?}", start.elapsed());
        }
    }
}

fn main() {
    // skip argv[0], the program name
    for arg in env::args().skip(1) {
        let file = fs::read_to_string(arg).expect("failed to read file");
        let lexer = Lexer::new(&file);
        elapsed!(for tok in lexer {
            //println!("{tok:?}");
        })
    }
}

HOWEVER, I am sure there are many ways I could improve this code, both readability- and performance-wise. For instance, the loop {} in Lexer::next should probably be removed, but doing so breaks the handling of comments. Also, as a side note: whenever I see other people implement lexers, their .peek() method clones the chars iterator and then calls .next(), whereas my version stores the peeked char in a separate field. Can you tell me which is better?
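
For reference, the clone-based peek I have seen elsewhere looks roughly like this (my own sketch, not code from any particular project):

 fn peek(&self) -> Option<char> {
     // Chars is cheap to clone: it is just a pointer range into the source,
     // so cloning it and calling next() does not advance the real iterator.
     self.chars.clone().next()
 }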

EXAMPLES

"Hello, World!" -> Str("Hello, World!")
36 -> Num("36")
/* /* nested block comment */ */ -> nothing
foo_bar_baz_23 -> Ident("foo_bar_baz_23")
+ - / * % = ! != < > <= >= -> Opr(Plus) Opr(Minus) Opr(Divide) Opr(Multiply) Opr(Modulo) ... 
asked Dec 17, 2022 at 8:18
3 Comments
  • Since no one has answered the question yet you can still edit the code to correct any errors. (Commented Dec 17, 2022 at 15:37)
  • Sample inputs and outputs would be nice (driver code/test suite). (Commented Dec 17, 2022 at 16:08)
  • why is nobody answering this question? (Commented Dec 18, 2022 at 1:30)

1 Answer


Instead of using .chars() and keeping track of pos, I suggest using char_indices(), which gives you an iterator over tuples of the byte position and the char.
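
For example, char_indices() pairs each char with the byte offset where it starts:

 for (pos, ch) in "héllo".char_indices() {
     // prints 0: h, 1: é, 3: l, 4: l, 5: o   (é occupies two bytes)
     println!("{pos}: {ch}");
 }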

Instead of carefully tracking row and col in bump as part of the lexer, I'd create a custom iterator on top of char_indices() which iterates over something like:

struct CharInfo {
    ch: char,
    pos: usize,
    row: usize,
    col: usize,
}

Instead of having a peek member in the lexer, I'd use a .peekable() on the CharInfo iterator to have an iterator that I could peek into.
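
A rough sketch of such a wrapper, assuming the CharInfo struct above (the name CharInfoIter is just a placeholder):

 use std::str::CharIndices;

 struct CharInfoIter<'a> {
     inner: CharIndices<'a>,
     row: usize,
     col: usize,
 }

 impl<'a> CharInfoIter<'a> {
     fn new(source: &'a str) -> Self {
         Self { inner: source.char_indices(), row: 1, col: 1 }
     }
 }

 impl<'a> Iterator for CharInfoIter<'a> {
     type Item = CharInfo;

     fn next(&mut self) -> Option<CharInfo> {
         let (pos, ch) = self.inner.next()?;
         // Report the position of the char being returned, then advance
         // row/col so the next call sees the position after it.
         let info = CharInfo { ch, pos, row: self.row, col: self.col };
         if ch == '\n' {
             self.row += 1;
             self.col = 1;
         } else {
             self.col += 1;
         }
         Some(info)
     }
 }

The lexer would then hold something like chars: Peekable<CharInfoIter<'a>>, built with CharInfoIter::new(source).peekable(), and the separate peek, pos, row and col fields all go away.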

Alternatively, I might not use a peeking technique at all. In this case your iterator is trivially cloneable, so you can rewind it by using a combination of .clone() and overwriting.

For example, with a peeking approach, you might use next_if to conditionally pull chars out of the iterator while they are whitespace:

while self.chars.next_if(|x| x.ch.is_whitespace()).is_some() {
    // keep consuming whitespace
}

With a cloning approach, you might do:

loop {
    let current = self.chars.clone();
    if !self.chars.next().map_or(false, |x| x.ch.is_whitespace()) {
        // not whitespace (or end of input): rewind and stop
        self.chars = current;
        break;
    }
}

But either way, you can get good mileage out of iterator methods, since you are working with an ordinary (peekable) iterator: the built-in ones, methods from itertools, or possibly other crates.
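
For instance, assuming the peekable CharInfo iterator sketched above is named chars, collecting an identifier could become a one-liner with itertools' peeking_take_while (a sketch; the std-only equivalent is a while let loop over next_if):

 use itertools::Itertools;

 // Pull chars off the front while they form an identifier, leaving the
 // first non-identifier char still peekable.
 let ident: String = chars
     .peeking_take_while(|c| c.ch.is_alphanumeric() || c.ch == '_')
     .map(|c| c.ch)
     .collect();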

answered Dec 23, 2022 at 4:50