// src/main.rs as it stands after the patch "add lexer"
// (commit edec4a2323a91cbfe7f49ad54d9dbebdc495c15f, SimonFJ20, Tue, 14 Mar 2023),
// with the lexer defects found in review corrected.

use std::str::Chars;

/// Every kind of token the lexer can produce.
///
/// The first three variants are error tokens: the lexer never fails, it
/// reports malformed input in-band and keeps going.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum TokenType {
    InvalidChar,
    MalformedString,
    MalformedComment,

    Id,
    Int,
    Decimal,
    String,

    False,
    True,
    If,
    Else,
    While,
    For,
    In,
    Break,
    Continue,
    Function,
    Return,
    End,
    Underscore,

    Plus,
    Minus,
    Asterisk,
    Slash,
    Percent,
    PlusEqual,
    MinusEqual,
    AsteriskEqual,
    SlashEqual,
    PercentEqual,

    LParen,
    RParen,
    LBrace,
    RBrace,
    LBracket,
    RBracket,

    Dot,
    Comma,
    Colon,
    Semicolon,
}

/// A location in the source text.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
struct Position {
    /// Byte offset into the source text (usable for slicing the `&str`).
    index: usize,
    /// Line number, starting at 1.
    line: i32,
    /// Column number in characters, starting at 1.
    col: i32,
}

/// A lexed token: its kind, where it starts, and how many bytes it spans.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
struct Token {
    token_type: TokenType,
    pos: Position,
    length: usize,
}

/// Streaming lexer over a borrowed source string.
///
/// Tokens are pulled through the [`Iterator`] implementation.
struct Lexer<'a> {
    text: &'a str,
    chars: Chars<'a>,
    /// The character at `index`, or `None` once the input is exhausted.
    current_char: Option<char>,
    /// Byte offset of `current_char` within `text`.
    index: usize,
    line: i32,
    col: i32,
}

impl<'a> Lexer<'a> {
    /// Creates a lexer positioned at the first character of `text`.
    pub fn new(text: &'a str) -> Self {
        let mut chars = text.chars();
        let first_char = chars.next();
        Self {
            text,
            chars,
            current_char: first_char,
            index: 0,
            line: 1,
            col: 1,
        }
    }

    /// Returns the next token, or `None` at end of input.
    ///
    /// Whitespace and comments are skipped in a loop rather than by
    /// recursion, so long runs of comments cannot grow the call stack.
    fn next_token(&mut self) -> Option<Token> {
        loop {
            let start = self.pos();
            let token = match self.current_char? {
                ' ' | '\t' | '\r' | '\n' => {
                    self.skip_whitespace();
                    continue;
                }
                '0'..='9' => self.int_token(),
                'a'..='z' | 'A'..='Z' | '_' => self.id_token(),
                '"' => self.string_token(),
                '+' => {
                    self.single_or_double_char_token(TokenType::Plus, '=', TokenType::PlusEqual)
                }
                '-' => {
                    self.single_or_double_char_token(TokenType::Minus, '=', TokenType::MinusEqual)
                }
                '*' => self.single_or_double_char_token(
                    TokenType::Asterisk,
                    '=',
                    TokenType::AsteriskEqual,
                ),
                '/' => match self.slash_token() {
                    Some(token) => token,
                    // It was a comment; look for the next real token.
                    None => continue,
                },
                '%' => self.single_or_double_char_token(
                    TokenType::Percent,
                    '=',
                    TokenType::PercentEqual,
                ),
                '(' => self.step_and_token(TokenType::LParen, start),
                ')' => self.step_and_token(TokenType::RParen, start),
                '{' => self.step_and_token(TokenType::LBrace, start),
                '}' => self.step_and_token(TokenType::RBrace, start),
                '[' => self.step_and_token(TokenType::LBracket, start),
                ']' => self.step_and_token(TokenType::RBracket, start),
                '.' => self.dot_token(),
                ',' => self.step_and_token(TokenType::Comma, start),
                ':' => self.step_and_token(TokenType::Colon, start),
                ';' => self.step_and_token(TokenType::Semicolon, start),
                _ => self.step_and_token(TokenType::InvalidChar, start),
            };
            return Some(token);
        }
    }

    /// Advances past a run of spaces, tabs, carriage returns and newlines.
    fn skip_whitespace(&mut self) {
        self.step_while(|c| matches!(c, ' ' | '\t' | '\r' | '\n'));
    }

    /// Lexes a run of ASCII digits as an `Int`.
    ///
    /// A fractional part is not consumed here: `3.14` yields `Int` for `3`
    /// followed by `Decimal` for `.14` (see `dot_token`).
    fn int_token(&mut self) -> Token {
        let start = self.pos();
        self.step();
        self.step_while(|c| c.is_ascii_digit());
        self.token(TokenType::Int, start)
    }

    /// Lexes a double-quoted string; a backslash escapes the next character.
    ///
    /// Yields `MalformedString` if the input ends before the closing quote.
    fn string_token(&mut self) -> Token {
        let start = self.pos();
        self.step(); // opening quote
        let mut escaped = false;
        while let Some(c) = self.current_char {
            if c == '"' && !escaped {
                break;
            }
            escaped = c == '\\' && !escaped;
            self.step();
        }
        if self.done() {
            self.token(TokenType::MalformedString, start)
        } else {
            // Consume the closing quote so it is part of the token.
            self.step_and_token(TokenType::String, start)
        }
    }

    /// Lexes an identifier and classifies it as a keyword where applicable.
    fn id_token(&mut self) -> Token {
        let start = self.pos();
        self.step();
        self.step_while(|c| c.is_ascii_alphanumeric() || c == '_');
        let token_type = match &self.text[start.index..self.index] {
            "false" => TokenType::False,
            "true" => TokenType::True,
            "if" => TokenType::If,
            "else" => TokenType::Else,
            "while" => TokenType::While,
            "for" => TokenType::For,
            "in" => TokenType::In,
            "break" => TokenType::Break,
            "continue" => TokenType::Continue,
            "function" => TokenType::Function,
            "return" => TokenType::Return,
            "end" => TokenType::End,
            "_" => TokenType::Underscore,
            _ => TokenType::Id,
        };
        self.token(token_type, start)
    }

    /// Lexes an operator that is either one character (`single_type`) or that
    /// character followed by `double_char` (`double_type`), e.g. `+` / `+=`.
    fn single_or_double_char_token(
        &mut self,
        single_type: TokenType,
        double_char: char,
        double_type: TokenType,
    ) -> Token {
        let start = self.pos();
        self.step();
        if self.current_char == Some(double_char) {
            self.step_and_token(double_type, start)
        } else {
            self.token(single_type, start)
        }
    }

    /// Handles everything that starts with `/`.
    ///
    /// Returns `Some` for `/`, `/=` and an unterminated block comment
    /// (`MalformedComment`); returns `None` when a complete comment was
    /// skipped and the caller should continue with the following input.
    fn slash_token(&mut self) -> Option<Token> {
        let start = self.pos();
        self.step();
        match self.current_char {
            Some('/') => {
                // Line comment: runs up to, but not including, the newline.
                self.step_while(|c| c != '\n');
                None
            }
            Some('*') => {
                self.step();
                self.skip_block_comment(start)
            }
            Some('=') => Some(self.step_and_token(TokenType::SlashEqual, start)),
            _ => Some(self.token(TokenType::Slash, start)),
        }
    }

    /// Skips the remainder of a (nestable) block comment whose opening `/*`
    /// has already been consumed.
    ///
    /// Returns `None` when the comment is properly closed, or a
    /// `MalformedComment` token anchored at `start` when input ends first.
    fn skip_block_comment(&mut self, start: Position) -> Option<Token> {
        let mut depth = 1;
        let mut last_char: Option<char> = None;
        while let Some(c) = self.current_char {
            self.step();
            match (last_char, c) {
                (Some('/'), '*') => {
                    depth += 1;
                    // Forget the delimiter so its `*` cannot also close.
                    last_char = None;
                }
                (Some('*'), '/') => {
                    depth -= 1;
                    if depth == 0 {
                        return None;
                    }
                    // Forget the delimiter so its `/` cannot also open.
                    last_char = None;
                }
                _ => last_char = Some(c),
            }
        }
        Some(self.token(TokenType::MalformedComment, start))
    }

    /// Lexes `.`: a `Decimal` when digits follow (e.g. `.14`), else a `Dot`.
    fn dot_token(&mut self) -> Token {
        let start = self.pos();
        self.step();
        if matches!(self.current_char, Some('0'..='9')) {
            self.step_while(|c| c.is_ascii_digit());
            self.token(TokenType::Decimal, start)
        } else {
            self.token(TokenType::Dot, start)
        }
    }

    /// Consumes the current character, then builds a token from `start`.
    fn step_and_token(&mut self, token_type: TokenType, start: Position) -> Token {
        self.step();
        self.token(token_type, start)
    }

    /// Builds a token spanning from `start` up to the current position.
    fn token(&self, token_type: TokenType, start: Position) -> Token {
        Token {
            token_type,
            length: self.index - start.index,
            pos: start,
        }
    }

    /// Snapshot of the current position.
    fn pos(&self) -> Position {
        Position {
            index: self.index,
            line: self.line,
            col: self.col,
        }
    }

    /// True once the whole input has been consumed.
    fn done(&self) -> bool {
        self.current_char.is_none()
    }

    /// Steps for as long as the current character satisfies `pred`.
    fn step_while(&mut self, pred: impl Fn(char) -> bool) {
        while matches!(self.current_char, Some(c) if pred(c)) {
            self.step();
        }
    }

    /// Advances by one character, keeping `index`, `line` and `col` in sync.
    ///
    /// `index` moves by the character's UTF-8 width so that it stays a valid
    /// byte offset into `text`. Does nothing at end of input.
    fn step(&mut self) {
        if let Some(c) = self.current_char {
            self.index += c.len_utf8();
            if c == '\n' {
                self.line += 1;
                self.col = 1;
            } else {
                self.col += 1;
            }
            self.current_char = self.chars.next();
        }
    }
}

impl<'a> Iterator for Lexer<'a> {
    type Item = Token;

    fn next(&mut self) -> Option<Self::Item> {
        self.next_token()
    }
}

/// Expression node for the parser; nothing constructs it yet.
#[allow(dead_code)] // no parser exists in this file yet
enum Expr {
    Id(String),
    Int(i64),
    Float(f64),
    String(String),
}

fn main() {
    let text = "3.14 \"foo\" false true ( ) + += /* 1 /* 2 */ 3 */ // 4 \n 5";
    for token in Lexer::new(text) {
        println!("{:?}", token);
    }
}