1
\$\begingroup\$

I am working on writing a rust implementation of a sh-like language. Rather than posting 600 lines of code here, you can just go to the GitHub repository.

Is this a good approach for lexxing?

Here's the lex function:

pub fn lex(&mut self) {
 while self.input.has_next() {
 let c: char = self.input.next();
 match c {
 /// Any whitespace is ignored.
 ' ' | '\t' | '\n' | '\r' => {
 continue;
 }
 /// Comments can also be ignored.
 '#' => {
 while self.input.has_next() && self.input.next() != '\n' {
 self.input.next();
 }
 continue;
 }
 // Redirection
 '>' => {
 if self.input.has_next() {
 match self.input.lookahead(1) {
 '>' => {
 self.input.next();
 self.output.push(Token::new(TokenType::RedirAppend));
 }
 '|' => {
 self.input.next();
 self.output.push(Token::new(TokenType::ForceRedir));
 }
 _ => {
 // We shouldn't eat the next token if it's anything else,
 // it just means that the code either has a space afterwords
 // or it's (potentially) important text
 self.output.push(Token::new(TokenType::Redir));
 }
 }
 }
 }
 // Pipes
 '|' => {
 self.output.push(Token::new(TokenType::Pipe));
 }
 '{' => {
 self.output.push(Token::new(TokenType::LeftCurlyBrace));
 }
 '}' => {
 self.output.push(Token::new(TokenType::RightCurlyBrace));
 }
 '[' => {
 if self.input.has_next() && self.input.lookahead(1) == '[' {
 self.input.next();
 self.output
 .push(Token::new(TokenType::LeftDoubleSquareBracket));
 } else {
 self.output.push(Token::new(TokenType::LeftSquareBracket));
 }
 }
 ']' => {
 if self.input.has_next() && self.input.lookahead(1) == ']' {
 self.input.next();
 self.output
 .push(Token::new(TokenType::RightDoubleSquareBracket));
 } else {
 self.output.push(Token::new(TokenType::RightSquareBracket));
 }
 }
 '(' => {
 if self.input.has_next() && self.input.lookahead(1) == '(' {
 self.input.next();
 self.output.push(Token::new(TokenType::LeftDoubleParen));
 } else {
 self.output.push(Token::new(TokenType::LeftParen));
 }
 }
 ')' => {
 if self.input.has_next() && self.input.lookahead(1) == ')' {
 self.input.next();
 self.output.push(Token::new(TokenType::RightDoubleParen));
 } else {
 self.output.push(Token::new(TokenType::RightParen));
 }
 }
 // Keywords and identifiers
 'a'..='z' | 'A'..='Z' | '_' => {
 if self.input.has_next_multi(3)
 && c == 'c'
 && self.input.lookahead_multi(3) == "ase"
 {
 self.input.next();
 self.input.next();
 self.output.push(Token::new(TokenType::Case));
 } else if self.input.has_next_multi(5)
 && c == 'c'
 && self.input.lookahead_multi(5) == "oproc"
 {
 self.input.next();
 self.input.next();
 self.input.next();
 self.input.next();
 self.input.next();
 self.output.push(Token::new(TokenType::Coproc));
 } else if self.input.has_next() && c == 'd' && self.input.lookahead(1) == 'o' {
 self.input.next();
 self.output.push(Token::new(TokenType::Do));
 } else if self.input.has_next_multi(3)
 && c == 'd'
 && self.input.lookahead_multi(3) == "one"
 {
 self.input.next();
 self.input.next();
 self.input.next();
 self.output.push(Token::new(TokenType::Done));
 } else if self.input.has_next_multi(3)
 && c == 'e'
 && self.input.lookahead_multi(3) == "lif"
 {
 self.input.next();
 self.input.next();
 self.input.next();
 self.output.push(Token::new(TokenType::Elif));
 } else if self.input.has_next_multi(3)
 && c == 'e'
 && self.input.lookahead_multi(3) == "lse"
 {
 self.input.next();
 self.input.next();
 self.input.next();
 self.output.push(Token::new(TokenType::Else));
 } else if self.input.has_next_multi(3)
 && c == 'e'
 && self.input.lookahead_multi(3) == "sac"
 {
 self.input.next();
 self.input.next();
 self.input.next();
 self.output.push(Token::new(TokenType::Esac));
 } else if self.input.has_next() && c == 'f' && self.input.lookahead(1) == 'i' {
 self.input.next();
 self.output.push(Token::new(TokenType::Fi));
 } else if self.input.has_next_multi(2)
 && c == 'f'
 && self.input.lookahead_multi(2) == "or"
 {
 self.input.next();
 self.input.next();
 self.output.push(Token::new(TokenType::For));
 } else if self.input.has_next_multi(7)
 && c == 'f'
 && self.input.lookahead_multi(7) == "unction"
 {
 self.input.next();
 self.input.next();
 self.input.next();
 self.input.next();
 self.input.next();
 self.input.next();
 self.input.next();
 self.output.push(Token::new(TokenType::Function));
 } else if self.input.has_next() && c == 'i' && self.input.lookahead(1) == 'f' {
 self.input.next();
 self.output.push(Token::new(TokenType::If));
 } else if self.input.has_next() && c == 'i' && self.input.lookahead(1) == 'n' {
 self.input.next();
 self.output.push(Token::new(TokenType::In));
 } else if self.input.has_next_multi(5)
 && c == 's'
 && self.input.lookahead_multi(5) == "elect"
 {
 self.input.next();
 self.input.next();
 self.input.next();
 self.input.next();
 self.input.next();
 self.output.push(Token::new(TokenType::Select));
 } else if self.input.has_next_multi(3)
 && c == 't'
 && self.input.lookahead_multi(3) == "hen"
 {
 self.input.next();
 self.input.next();
 self.input.next();
 self.output.push(Token::new(TokenType::Then));
 } else if self.input.has_next_multi(3)
 && c == 't'
 && self.input.lookahead_multi(3) == "ime"
 {
 self.input.next();
 self.input.next();
 self.input.next();
 self.output.push(Token::new(TokenType::Time));
 } else if self.input.has_next_multi(4)
 && c == 'u'
 && self.input.lookahead_multi(4) == "ntil"
 {
 self.input.next();
 self.input.next();
 self.input.next();
 self.input.next();
 self.output.push(Token::new(TokenType::Until));
 } else if self.input.has_next_multi(4)
 && c == 'w'
 && self.input.lookahead_multi(4) == "hile"
 {
 self.input.next();
 self.input.next();
 self.input.next();
 self.input.next();
 self.output.push(Token::new(TokenType::While));
 } else {
 let mut tk: String = "".to_owned();
 while self.input.has_next() {
 match self.input.next() {
 'a'..='z' | 'A'..='Z' | '_' | '0'..='9' => {
 tk.push(c);
 self.input.next();
 }
 _ => {
 break;
 }
 }
 self.output.push(Token::new(TokenType::Text));
 }
 }
 }
 _ => {
 panic!("Can't handle character {}", c)
 }
 }
 }
}
```
asked Jun 5, 2022 at 19:41
\$\endgroup\$

1 Answer 1

2
\$\begingroup\$
  • A method self.input.discard(n) would be very helpful. Consider

     } else if self.input.has_next_multi(5)
     && c == 'c'
     && self.input.lookahead_multi(5) == "oproc"
     {
     self.input.discard(5);
     self.output.push(Token::new(TokenType::Coproc));
    
  • I don't think has_next_multi has a reason to exist. lookahead_multi is in the position to fail if there is not enough characters in the stream.

  • Matching c looks like a premature optimization. Just lookahead for all possible alternative, like >>, >|, >, etc. It is also recommended to keep them in a table, and iterate over it.

  • I see no separators. An identifier case1 will be tokenized into a keyword case and an identifier 1. I don't think it is correct.

  • Along the same line, consider a redirection into a file named case. The code will recognize a file name as a keyword. Keep it in mind before diving into a parser.

  • There is no support for quotes, backslash, or any other escape mechanism.

answered Jun 5, 2022 at 21:52
\$\endgroup\$

Your Answer

Draft saved
Draft discarded

Sign up or log in

Sign up using Google
Sign up using Email and Password

Post as a guest

Required, but never shown

Post as a guest

Required, but never shown

By clicking "Post Your Answer", you agree to our terms of service and acknowledge you have read our privacy policy.

Start asking to get answers

Find the answer to your question by asking.

Ask question

Explore related questions

See similar questions with these tags.