Lexer for shell-like language [rust]

Question 1

I am working on writing a rust implementation of a sh-like language. Rather than posting 600 lines of code here, you can just go to the GitHub repository.

Is this a good approach for lexxing?

Here's the lex function:

pub fn lex(&mut self) {
 while self.input.has_next() {
 let c: char = self.input.next();
 match c {
 /// Any whitespace is ignored.
 ' ' | '\t' | '\n' | '\r' => {
 continue;
 }
 /// Comments can also be ignored.
 '#' => {
 while self.input.has_next() && self.input.next() != '\n' {
 self.input.next();
 }
 continue;
 }
 // Redirection
 '>' => {
 if self.input.has_next() {
 match self.input.lookahead(1) {
 '>' => {
 self.input.next();
 self.output.push(Token::new(TokenType::RedirAppend));
 }
 '|' => {
 self.input.next();
 self.output.push(Token::new(TokenType::ForceRedir));
 }
 _ => {
 // We shouldn't eat the next token if it's anything else,
 // it just means that the code either has a space afterwords
 // or it's (potentially) important text
 self.output.push(Token::new(TokenType::Redir));
 }
 }
 }
 }
 // Pipes
 '|' => {
 self.output.push(Token::new(TokenType::Pipe));
 }
 '{' => {
 self.output.push(Token::new(TokenType::LeftCurlyBrace));
 }
 '}' => {
 self.output.push(Token::new(TokenType::RightCurlyBrace));
 }
 '[' => {
 if self.input.has_next() && self.input.lookahead(1) == '[' {
 self.input.next();
 self.output
 .push(Token::new(TokenType::LeftDoubleSquareBracket));
 } else {
 self.output.push(Token::new(TokenType::LeftSquareBracket));
 }
 }
 ']' => {
 if self.input.has_next() && self.input.lookahead(1) == ']' {
 self.input.next();
 self.output
 .push(Token::new(TokenType::RightDoubleSquareBracket));
 } else {
 self.output.push(Token::new(TokenType::RightSquareBracket));
 }
 }
 '(' => {
 if self.input.has_next() && self.input.lookahead(1) == '(' {
 self.input.next();
 self.output.push(Token::new(TokenType::LeftDoubleParen));
 } else {
 self.output.push(Token::new(TokenType::LeftParen));
 }
 }
 ')' => {
 if self.input.has_next() && self.input.lookahead(1) == ')' {
 self.input.next();
 self.output.push(Token::new(TokenType::RightDoubleParen));
 } else {
 self.output.push(Token::new(TokenType::RightParen));
 }
 }
 // Keywords and identifiers
 'a'..='z' | 'A'..='Z' | '_' => {
 if self.input.has_next_multi(3)
 && c == 'c'
 && self.input.lookahead_multi(3) == "ase"
 {
 self.input.next();
 self.input.next();
 self.output.push(Token::new(TokenType::Case));
 } else if self.input.has_next_multi(5)
 && c == 'c'
 && self.input.lookahead_multi(5) == "oproc"
 {
 self.input.next();
 self.input.next();
 self.input.next();
 self.input.next();
 self.input.next();
 self.output.push(Token::new(TokenType::Coproc));
 } else if self.input.has_next() && c == 'd' && self.input.lookahead(1) == 'o' {
 self.input.next();
 self.output.push(Token::new(TokenType::Do));
 } else if self.input.has_next_multi(3)
 && c == 'd'
 && self.input.lookahead_multi(3) == "one"
 {
 self.input.next();
 self.input.next();
 self.input.next();
 self.output.push(Token::new(TokenType::Done));
 } else if self.input.has_next_multi(3)
 && c == 'e'
 && self.input.lookahead_multi(3) == "lif"
 {
 self.input.next();
 self.input.next();
 self.input.next();
 self.output.push(Token::new(TokenType::Elif));
 } else if self.input.has_next_multi(3)
 && c == 'e'
 && self.input.lookahead_multi(3) == "lse"
 {
 self.input.next();
 self.input.next();
 self.input.next();
 self.output.push(Token::new(TokenType::Else));
 } else if self.input.has_next_multi(3)
 && c == 'e'
 && self.input.lookahead_multi(3) == "sac"
 {
 self.input.next();
 self.input.next();
 self.input.next();
 self.output.push(Token::new(TokenType::Esac));
 } else if self.input.has_next() && c == 'f' && self.input.lookahead(1) == 'i' {
 self.input.next();
 self.output.push(Token::new(TokenType::Fi));
 } else if self.input.has_next_multi(2)
 && c == 'f'
 && self.input.lookahead_multi(2) == "or"
 {
 self.input.next();
 self.input.next();
 self.output.push(Token::new(TokenType::For));
 } else if self.input.has_next_multi(7)
 && c == 'f'
 && self.input.lookahead_multi(7) == "unction"
 {
 self.input.next();
 self.input.next();
 self.input.next();
 self.input.next();
 self.input.next();
 self.input.next();
 self.input.next();
 self.output.push(Token::new(TokenType::Function));
 } else if self.input.has_next() && c == 'i' && self.input.lookahead(1) == 'f' {
 self.input.next();
 self.output.push(Token::new(TokenType::If));
 } else if self.input.has_next() && c == 'i' && self.input.lookahead(1) == 'n' {
 self.input.next();
 self.output.push(Token::new(TokenType::In));
 } else if self.input.has_next_multi(5)
 && c == 's'
 && self.input.lookahead_multi(5) == "elect"
 {
 self.input.next();
 self.input.next();
 self.input.next();
 self.input.next();
 self.input.next();
 self.output.push(Token::new(TokenType::Select));
 } else if self.input.has_next_multi(3)
 && c == 't'
 && self.input.lookahead_multi(3) == "hen"
 {
 self.input.next();
 self.input.next();
 self.input.next();
 self.output.push(Token::new(TokenType::Then));
 } else if self.input.has_next_multi(3)
 && c == 't'
 && self.input.lookahead_multi(3) == "ime"
 {
 self.input.next();
 self.input.next();
 self.input.next();
 self.output.push(Token::new(TokenType::Time));
 } else if self.input.has_next_multi(4)
 && c == 'u'
 && self.input.lookahead_multi(4) == "ntil"
 {
 self.input.next();
 self.input.next();
 self.input.next();
 self.input.next();
 self.output.push(Token::new(TokenType::Until));
 } else if self.input.has_next_multi(4)
 && c == 'w'
 && self.input.lookahead_multi(4) == "hile"
 {
 self.input.next();
 self.input.next();
 self.input.next();
 self.input.next();
 self.output.push(Token::new(TokenType::While));
 } else {
 let mut tk: String = "".to_owned();
 while self.input.has_next() {
 match self.input.next() {
 'a'..='z' | 'A'..='Z' | '_' | '0'..='9' => {
 tk.push(c);
 self.input.next();
 }
 _ => {
 break;
 }
 }
 self.output.push(Token::new(TokenType::Text));
 }
 }
 }
 _ => {
 panic!("Can't handle character {}", c)
 }
 }
 }
}
```

Question 2

A method self.input.discard(n) would be very helpful. Consider

 } else if self.input.has_next_multi(5)
 && c == 'c'
 && self.input.lookahead_multi(5) == "oproc"
 {
 self.input.discard(5);
 self.output.push(Token::new(TokenType::Coproc));

I don't think has_next_multi has a reason to exist. lookahead_multi is in the position to fail if there is not enough characters in the stream.
Matching c looks like a premature optimization. Just lookahead for all possible alternative, like >>, >|, >, etc. It is also recommended to keep them in a table, and iterate over it.
I see no separators. An identifier case1 will be tokenized into a keyword case and an identifier 1. I don't think it is correct.
Along the same line, consider a redirection into a file named case. The code will recognize a file name as a keyword. Keep it in mind before diving into a parser.
There is no support for quotes, backslash, or any other escape mechanism.

vnp vnp 58.7k4 gold badges55 silver badges144 bronze badges · Answer 1 · 2022-06-05 21:52:40Z

A method self.input.discard(n) would be very helpful. Consider

 } else if self.input.has_next_multi(5)
 && c == 'c'
 && self.input.lookahead_multi(5) == "oproc"
 {
 self.input.discard(5);
 self.output.push(Token::new(TokenType::Coproc));

I don't think has_next_multi has a reason to exist. lookahead_multi is in the position to fail if there is not enough characters in the stream.
Matching c looks like a premature optimization. Just lookahead for all possible alternative, like >>, >|, >, etc. It is also recommended to keep them in a table, and iterate over it.
I see no separators. An identifier case1 will be tokenized into a keyword case and an identifier 1. I don't think it is correct.
Along the same line, consider a redirection into a file named case. The code will recognize a file name as a keyword. Keep it in mind before diving into a parser.
There is no support for quotes, backslash, or any other escape mechanism.

Stack Exchange Network

Lexer for shell-like language [rust]

1 Answer 1

Your Answer

Sign up or log in

Post as a guest

Post as a guest

Hot Network Questions

Lexer for shell-like language [rust]

1 Answer 1

Your Answer

Sign up or log in

Post as a guest

Post as a guest

Related

Hot Network Questions