
The other day I decided to build a top-down lexer in Rust, just for fun.
This is what I have so far:

use std::str::Chars;
use std::env;
use std::fs;
#[derive(Debug, Clone)]
enum Op {
    Plus,
    Minus,
    Multiply,
    Divide,
    Modulo,
    Equal,
    Not,
    NotEqual,
    Greater,
    GreaterOrEqual,
    Less,
    LessOrEqual,
}

#[derive(Debug, Clone)]
enum TokenKind {
    Opr(Op),
    Ident(String),
    Str(String),
    Num(String),
}

#[derive(Debug, Clone)]
struct Lexer<'a> {
    // string containing entire program
    source: &'a str,
    // the peekable character
    peek: Option<char>,
    // the characters iterator
    chars: Chars<'a>,
    // utf-8 byte position in the string
    pos: usize,
    // the bottom two are for error reporting/debugging
    row: usize,
    col: usize,
}
#[allow(unused)]
impl<'a> Lexer<'a> {
    fn new(source: &'a str) -> Self {
        let mut chars = source.chars();
        Self {
            source,
            peek: chars.next(),
            chars,
            pos: 0,
            row: 1,
            col: 1,
        }
    }

    #[inline]
    fn position(&self) -> (usize, usize) {
        (self.row, self.col)
    }

    #[inline]
    fn is_over(&self) -> bool {
        self.peek().is_none()
    }
    #[inline]
    fn bump(&mut self) {
        // Consume the currently peeked char (if any), then refill the peek.
        if let Some(ch) = self.peek {
            self.pos += ch.len_utf8();
            if ch == '\n' {
                self.col = 1;
                self.row += 1;
            } else {
                self.col += 1;
            }
            self.peek = self.chars.next();
        }
    }
    #[inline]
    fn peek(&self) -> Option<char> {
        self.peek
    }

    #[inline]
    fn unicode_escape(&mut self, ch: char) -> Option<char> {
        // e.g. \n \t \u{...}
        let res = match ch {
            'n' => '\n',
            't' => '\t',
            'r' => '\r',
            'u' => todo!("implement unicode character codes"),
            'x' => todo!("implement hex character codes"),
            other => other,
        };
        Some(res)
    }
    fn trim_ident(&mut self) -> TokenKind {
        let start_pos = self.pos;
        while let Some(s) = self.peek() {
            if !(s.is_alphanumeric() || s == '_') {
                break;
            }
            self.bump();
        }
        TokenKind::Ident(self.source[start_pos..self.pos].to_owned())
    }

    fn trim_number(&mut self) -> TokenKind {
        let start_pos = self.pos;
        while let Some(s) = self.peek() {
            if !s.is_numeric() {
                break;
            }
            self.bump();
        }
        TokenKind::Num(self.source[start_pos..self.pos].to_owned())
    }
    fn trim_string(&mut self) -> TokenKind {
        // skip the opening quote
        self.bump();
        let start_pos = self.pos;
        while let Some(s) = self.peek() {
            if s == '"' {
                break;
            }
            if s == '\\' {
                // skip the escaped character as well
                self.bump();
            }
            self.bump();
        }
        if self.is_over() {
            todo!("Could not lex string");
        }
        // skip the closing quote, then slice off everything but that quote
        self.bump();
        TokenKind::Str(self.source[start_pos..self.pos - 1].to_owned())
    }
 
    #[inline]
    fn trim_comment(&mut self) {
        // Skip to (but not past) the newline; it is later consumed as
        // whitespace, which keeps all row/col tracking inside bump.
        while let Some(s) = self.peek() {
            if s == '\n' {
                break;
            }
            self.bump();
        }
    }
    #[inline]
    fn trim_block_comment(&mut self) {
        // Depth of nested /* ... */ comments; we enter having consumed the
        // opening "/" with the "*" still peeked.
        let mut depth: usize = 1;
        while let Some(s) = self.next_char() {
            match s {
                '/' => {
                    if let Some('*') = self.peek() {
                        self.bump();
                        depth += 1;
                    }
                }
                '*' => {
                    if let Some('/') = self.peek() {
                        self.bump();
                        depth -= 1;
                    }
                }
                _ => {}
            }
            if depth == 0 {
                break;
            }
        }
    }
    #[inline]
    fn trim_whitespace(&mut self) {
        while let Some(s) = self.peek() {
            if !s.is_whitespace() {
                break;
            }
            self.bump();
        }
    }

    #[inline]
    fn next_char(&mut self) -> Option<char> {
        let peek = self.peek;
        self.bump();
        peek
    }
}
impl<'a> Iterator for Lexer<'a> {
    type Item = TokenKind;

    fn next(&mut self) -> Option<TokenKind> {
        loop {
            self.trim_whitespace();
            return match self.peek()? {
                x if x.is_alphabetic() => Some(self.trim_ident()),
                x if x.is_numeric() => Some(self.trim_number()),
                '"' => Some(self.trim_string()),
                '+' => {
                    self.bump();
                    Some(TokenKind::Opr(Op::Plus))
                }
                '-' => {
                    self.bump();
                    Some(TokenKind::Opr(Op::Minus))
                }
                '*' => {
                    self.bump();
                    Some(TokenKind::Opr(Op::Multiply))
                }
                '/' => {
                    self.bump();
                    if let Some('/') = self.peek() {
                        self.trim_comment();
                        continue;
                    }
                    if let Some('*') = self.peek() {
                        self.trim_block_comment();
                        continue;
                    }
                    Some(TokenKind::Opr(Op::Divide))
                }
                '%' => {
                    self.bump();
                    Some(TokenKind::Opr(Op::Modulo))
                }
                '=' => {
                    self.bump();
                    Some(TokenKind::Opr(Op::Equal))
                }
                '!' => {
                    self.bump();
                    if let Some('=') = self.peek() {
                        self.bump();
                        Some(TokenKind::Opr(Op::NotEqual))
                    } else {
                        Some(TokenKind::Opr(Op::Not))
                    }
                }
                '>' => {
                    self.bump();
                    if let Some('=') = self.peek() {
                        self.bump();
                        Some(TokenKind::Opr(Op::GreaterOrEqual))
                    } else {
                        Some(TokenKind::Opr(Op::Greater))
                    }
                }
                '<' => {
                    self.bump();
                    if let Some('=') = self.peek() {
                        self.bump();
                        Some(TokenKind::Opr(Op::LessOrEqual))
                    } else {
                        Some(TokenKind::Opr(Op::Less))
                    }
                }
                _ => None,
            };
        }
    }
}
macro_rules! elapsed {
    ($($code:tt)*) => {
        {
            let start = std::time::Instant::now();
            $($code)*;
            println!("Time elapsed is: {:?}", start.elapsed());
        }
    }
}

fn main() {
    // skip argv[0], the program name
    for arg in env::args().skip(1) {
        let file = fs::read_to_string(arg).expect("failed to read file");
        let lexer = Lexer::new(&file);
        elapsed!(for tok in lexer {
            //println!("{tok:?}");
        })
    }
}

HOWEVER, I am sure there are many ways I could improve this code, both readability- and performance-wise. For instance, the loop {} in Lexer::next should probably be removed, but doing so breaks the handling of comments. Also, as a side note: whenever I see other people implement lexers, their .peek() method clones the chars iterator and then calls .next(), whereas my version stores the peeked char in a separate field. Can you tell me which is better?
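
For reference, the clone-based peek I have seen elsewhere looks roughly like this (my own sketch, not code from any particular project):

 fn peek(&self) -> Option<char> {
     // Chars is cheap to clone: it is just a pointer range into the source,
     // so cloning it and calling next() does not advance the real iterator.
     self.chars.clone().next()
 }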

EXAMPLES

"Hello, World!" -> Str("Hello, World!")
36 -> Num("36")
/* /* nested block comment */ */ -> nothing
foo_bar_baz_23 -> Ident("foo_bar_baz_23")
+ - / * % = ! != < > <= >= -> Opr(Plus) Opr(Minus) Opr(Divide) Opr(Multiply) Opr(Modulo) ... 
asked Dec 17, 2022 at 8:18
3 Comments
  • Since no one has answered the question yet you can still edit the code to correct any errors. (Commented Dec 17, 2022 at 15:37)
  • Sample inputs and outputs would be nice (driver code/test suite). (Commented Dec 17, 2022 at 16:08)
  • why is nobody answering this question? (Commented Dec 18, 2022 at 1:30)

1 Answer


Instead of using .chars() and keeping track of pos, I suggest using char_indices(), which gives you an iterator over tuples of the byte position and the char.
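
For example, char_indices() pairs each char with the byte offset where it starts:

 for (pos, ch) in "héllo".char_indices() {
     // prints 0: h, 1: é, 3: l, 4: l, 5: o   (é occupies two bytes)
     println!("{pos}: {ch}");
 }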

Instead of carefully tracking row and col in bump as part of the lexer, I'd create a custom iterator on top of char_indices() which iterates over something like:

struct CharInfo {
    ch: char,
    pos: usize,
    row: usize,
    col: usize,
}

Instead of having a peek member in the lexer, I'd use a .peekable() on the CharInfo iterator to have an iterator that I could peek into.
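
A rough sketch of such a wrapper, assuming the CharInfo struct above (the name CharInfoIter is just a placeholder):

 use std::str::CharIndices;

 struct CharInfoIter<'a> {
     inner: CharIndices<'a>,
     row: usize,
     col: usize,
 }

 impl<'a> CharInfoIter<'a> {
     fn new(source: &'a str) -> Self {
         Self { inner: source.char_indices(), row: 1, col: 1 }
     }
 }

 impl<'a> Iterator for CharInfoIter<'a> {
     type Item = CharInfo;

     fn next(&mut self) -> Option<CharInfo> {
         let (pos, ch) = self.inner.next()?;
         // Report the position of the char being returned, then advance
         // row/col so the next call sees the position after it.
         let info = CharInfo { ch, pos, row: self.row, col: self.col };
         if ch == '\n' {
             self.row += 1;
             self.col = 1;
         } else {
             self.col += 1;
         }
         Some(info)
     }
 }

The lexer would then hold something like chars: Peekable<CharInfoIter<'a>>, built with CharInfoIter::new(source).peekable(), and the separate peek, pos, row and col fields all go away.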

Alternatively, I might not use a peeking technique at all. In this case your iterator is trivially cloneable, so you can rewind it by using a combination of .clone() and overwriting.

For example, with a peeking approach, you might use next_if to conditionally pull chars out of the iterator while they are whitespace:

while self.chars.next_if(|x| x.ch.is_whitespace()).is_some() {
    // keep consuming whitespace
}

With a cloning approach, you might do:

loop {
    let current = self.chars.clone();
    if !self.chars.next().map_or(false, |x| x.ch.is_whitespace()) {
        // not whitespace (or end of input): rewind and stop
        self.chars = current;
        break;
    }
}

But either way, you can get good mileage out of iterator methods, since you are working with an ordinary (peekable) iterator: the built-in ones, methods from itertools, or possibly other crates.
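
For instance, assuming the peekable CharInfo iterator sketched above is named chars, collecting an identifier could become a one-liner with itertools' peeking_take_while (a sketch; the std-only equivalent is a while let loop over next_if):

 use itertools::Itertools;

 // Pull chars off the front while they form an identifier, leaving the
 // first non-identifier char still peekable.
 let ident: String = chars
     .peeking_take_while(|c| c.ch.is_alphanumeric() || c.ch == '_')
     .map(|c| c.ch)
     .collect();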

answered Dec 23, 2022 at 4:50