From c9ed3333f1061f6a6d2a6eb9df7af48f71497079 Mon Sep 17 00:00:00 2001 From: SimonFJ20 Date: Wed, 15 Mar 2023 02:23:41 +0100 Subject: [PATCH] operator parsing --- src/main.rs | 640 ++++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 601 insertions(+), 39 deletions(-) diff --git a/src/main.rs b/src/main.rs index 3176dde..73244d7 100644 --- a/src/main.rs +++ b/src/main.rs @@ -2,17 +2,34 @@ use std::str::Chars; -#[derive(Debug)] +#[derive(Debug, Clone)] +struct Position { + pub index: usize, + pub line: i32, + pub col: i32, +} + +impl Position { + pub fn new(index: usize, line: i32, col: i32) -> Self { + Self { index, line, col } + } +} + +#[derive(Debug, PartialEq)] enum TokenType { InvalidChar, MalformedString, MalformedComment, + Id, Int, Decimal, String, + False, True, + Let, + Mut, If, Else, While, @@ -20,20 +37,36 @@ enum TokenType { In, Break, Continue, - Function, + Fn, Return, End, + Not, + And, + Or, Underscore, + Plus, Minus, Asterisk, Slash, Percent, + DoubleAsterisk, + Equal, + Exclamation, + LessThan, + GreaterThan, + PlusEqual, MinusEqual, AsteriskEqual, SlashEqual, PercentEqual, + DoubleAsteriskEqual, + EqualEqual, + ExclamationEqual, + LessThanEqual, + GreaterThanEqual, + LParen, RParen, LBrace, @@ -44,20 +77,14 @@ enum TokenType { Comma, Colon, Semicolon, -} - -#[derive(Debug)] -struct Position { - index: usize, - line: i32, - col: i32, + Ampersand, } #[derive(Debug)] struct Token { - token_type: TokenType, - pos: Position, - length: usize, + pub token_type: TokenType, + pub pos: Position, + pub length: usize, } struct Lexer<'a> { @@ -98,20 +125,42 @@ impl<'a> Lexer<'a> { '-' => { Some(self.single_or_double_char_token(TokenType::Minus, '=', TokenType::MinusEqual)) } - '*' => Some(self.single_or_double_char_token( - TokenType::Asterisk, - '=', - TokenType::AsteriskEqual, - )), + '*' => Some(self.asterisk_token()), '/' => self.slash_token(), '%' => Some(self.single_or_double_char_token( TokenType::Percent, '=', TokenType::PercentEqual, )), + '=' => { + Some(self.single_or_double_char_token(TokenType::Equal, '=', TokenType::EqualEqual)) + } + '!' => Some(self.single_or_double_char_token( + TokenType::Exclamation, + '=', + TokenType::ExclamationEqual, + )), + '<' => Some(self.single_or_double_char_token( + TokenType::LessThan, + '=', + TokenType::LessThanEqual, + )), + '>' => Some(self.single_or_double_char_token( + TokenType::GreaterThan, + '=', + TokenType::GreaterThanEqual, + )), '(' => Some(self.step_and_token(TokenType::LParen, self.pos())), - ')' => Some(self.step_and_token(TokenType::LParen, self.pos())), + ')' => Some(self.step_and_token(TokenType::RParen, self.pos())), + '{' => Some(self.step_and_token(TokenType::LBrace, self.pos())), + '}' => Some(self.step_and_token(TokenType::RBrace, self.pos())), + '[' => Some(self.step_and_token(TokenType::LBracket, self.pos())), + ']' => Some(self.step_and_token(TokenType::RBracket, self.pos())), '.' => Some(self.dot_token()), + ',' => Some(self.step_and_token(TokenType::Comma, self.pos())), + ':' => Some(self.step_and_token(TokenType::Colon, self.pos())), + ';' => Some(self.step_and_token(TokenType::Semicolon, self.pos())), + '&' => Some(self.step_and_token(TokenType::Ampersand, self.pos())), _ => Some(self.step_and_token(TokenType::InvalidChar, self.pos())), } } @@ -153,22 +202,27 @@ impl<'a> Lexer<'a> { while !self.done() && matches!(self.current(), 'a'..='z' | 'A'..='Z' | '0'..='9' | '_') { self.step(); } - match &self.text[start.index..self.index] { - "false" => self.token(TokenType::False, start), - "true" => self.token(TokenType::True, start), - "if" => self.token(TokenType::True, start), - "else" => self.token(TokenType::True, start), - "while" => self.token(TokenType::True, start), - "for" => self.token(TokenType::True, start), - "in" => self.token(TokenType::True, start), - "break" => self.token(TokenType::True, start), - "continue" => self.token(TokenType::True, start), - "function" => self.token(TokenType::True, start), - "return" => self.token(TokenType::True, start), - "end" => self.token(TokenType::True, start), - "underscore" => self.token(TokenType::True, start), - _ => self.token(TokenType::Id, start), - } + self.token( + match &self.text[start.index..self.index] { + "false" => TokenType::False, + "true" => TokenType::True, + "let" => TokenType::Let, + "mut" => TokenType::Mut, + "if" => TokenType::If, + "else" => TokenType::Else, + "while" => TokenType::While, + "for" => TokenType::For, + "in" => TokenType::In, + "break" => TokenType::Break, + "continue" => TokenType::Continue, + "fn" => TokenType::Fn, + "return" => TokenType::Return, + "end" => TokenType::End, + "_" => TokenType::Underscore, + _ => TokenType::Id, + }, + start, + ) } fn single_or_double_char_token( @@ -186,6 +240,23 @@ impl<'a> Lexer<'a> { } } + fn asterisk_token(&mut self) -> Token { + let start = self.pos(); + self.step(); + if !self.done() && self.current() == '*' { + self.step(); + if !self.done() && self.current() == '=' { + self.step_and_token(TokenType::DoubleAsteriskEqual, start) + } else { + self.token(TokenType::DoubleAsterisk, start) + } + } else if !self.done() && self.current() == '=' { + self.step_and_token(TokenType::AsteriskEqual, start) + } else { + self.token(TokenType::Asterisk, start) + } + } + fn slash_token(&mut self) -> Option { let start = self.pos(); self.step(); @@ -255,7 +326,7 @@ impl<'a> Lexer<'a> { } } - fn pos(&self) -> Position { + pub fn pos(&self) -> Position { Position { index: self.index, line: self.line, @@ -268,7 +339,7 @@ impl<'a> Lexer<'a> { } fn current(&self) -> char { - self.current_char.expect("done() not checked") + self.current_char.expect("done() checked") } fn step(&mut self) { @@ -293,17 +364,508 @@ impl<'a> Iterator for Lexer<'a> { } } +#[derive(Debug)] +struct Node { + pub value: T, + pub pos: Position, +} + +#[derive(Debug)] enum Expr { + Unit, Id(String), Int(i64), Float(f64), String(String), + Bool(bool), + Array(Vec>), + Object(Vec), + Tuple(Vec>), + + Member { + subject: Box>, + value: String, + }, + Index { + subject: Box>, + value: Box>, + }, + Call { + subject: Box>, + arguments: Vec>, + }, + Unary { + unary_type: UnaryType, + subject: Box>, + }, + Binary { + binary_type: BinaryType, + left: Box>, + right: Box>, + }, +} + +#[derive(Debug)] +enum ObjectEntry { + Pair(Box>, Box), +} + +#[derive(Debug)] +enum UnaryType { + Not, + Negate, + Reference, + ReferenceMut, + Dereference, +} + +#[derive(Debug)] +enum BinaryType { + Exponentiate, + Multiply, + Divide, + Modulo, + Add, + Subtract, + LT, + LTE, + GT, + GTE, + In, + Equal, + Inequal, + And, + Or, +} + +#[derive(Debug)] +struct ParserError { + pos: Position, + message: String, +} + +struct Parser<'a> { + text: &'a str, + lexer: Lexer<'a>, + current_token: Option, +} + +impl<'a> Parser<'a> { + pub fn new(text: &'a str, mut lexer: Lexer<'a>) -> Self { + Self { + text, + current_token: lexer.next(), + lexer, + } + } + + pub fn parse_expr(&mut self) -> Result, ParserError> { + self.parse_prec_or() + } + + fn parse_prec_or(&mut self) -> Result, ParserError> { + let mut left = self.parse_prec_and()?; + while !self.done() { + if self.current_is(TokenType::Or) { + self.step(); + let right = self.parse_prec_and()?; + left = self.node(Expr::Binary { + binary_type: BinaryType::Or, + left: Box::new(left), + right: Box::new(right), + }); + } else { + break; + } + } + Ok(left) + } + + fn parse_prec_and(&mut self) -> Result, ParserError> { + let mut left = self.parse_prec_equal_inequal()?; + while !self.done() { + if self.current_is(TokenType::And) { + self.step(); + let right = self.parse_prec_equal_inequal()?; + left = self.node(Expr::Binary { + binary_type: BinaryType::And, + left: Box::new(left), + right: Box::new(right), + }); + } else { + break; + } + } + Ok(left) + } + + fn parse_prec_equal_inequal(&mut self) -> Result, ParserError> { + let mut left = self.parse_prec_lt_lte_gt_gte_in()?; + while !self.done() { + if self.current_is(TokenType::EqualEqual) { + self.step(); + let right = self.parse_prec_lt_lte_gt_gte_in()?; + left = self.node(Expr::Binary { + binary_type: BinaryType::Equal, + left: Box::new(left), + right: Box::new(right), + }); + } else if self.current_is(TokenType::ExclamationEqual) { + self.step(); + let right = self.parse_prec_lt_lte_gt_gte_in()?; + left = self.node(Expr::Binary { + binary_type: BinaryType::Inequal, + left: Box::new(left), + right: Box::new(right), + }); + } else { + break; + } + } + Ok(left) + } + + fn parse_prec_lt_lte_gt_gte_in(&mut self) -> Result, ParserError> { + let mut left = self.parse_prec_add_subtract()?; + while !self.done() { + if self.current_is(TokenType::LessThan) { + self.step(); + let right = self.parse_prec_add_subtract()?; + left = self.node(Expr::Binary { + binary_type: BinaryType::LT, + left: Box::new(left), + right: Box::new(right), + }); + } else if self.current_is(TokenType::GreaterThan) { + self.step(); + let right = self.parse_prec_add_subtract()?; + left = self.node(Expr::Binary { + binary_type: BinaryType::GT, + left: Box::new(left), + right: Box::new(right), + }); + } else if self.current_is(TokenType::LessThanEqual) { + self.step(); + let right = self.parse_prec_add_subtract()?; + left = self.node(Expr::Binary { + binary_type: BinaryType::LTE, + left: Box::new(left), + right: Box::new(right), + }); + } else if self.current_is(TokenType::GreaterThanEqual) { + self.step(); + let right = self.parse_prec_add_subtract()?; + left = self.node(Expr::Binary { + binary_type: BinaryType::GTE, + left: Box::new(left), + right: Box::new(right), + }); + } else if self.current_is(TokenType::In) { + self.step(); + let right = self.parse_prec_add_subtract()?; + left = self.node(Expr::Binary { + binary_type: BinaryType::In, + left: Box::new(left), + right: Box::new(right), + }); + } else { + break; + } + } + Ok(left) + } + + fn parse_prec_add_subtract(&mut self) -> Result, ParserError> { + let mut left = self.parse_prec_multiply_divide_modulo()?; + while !self.done() { + if self.current_is(TokenType::Plus) { + self.step(); + let right = self.parse_prec_multiply_divide_modulo()?; + left = self.node(Expr::Binary { + binary_type: BinaryType::Add, + left: Box::new(left), + right: Box::new(right), + }); + } else if self.current_is(TokenType::Minus) { + self.step(); + let right = self.parse_prec_multiply_divide_modulo()?; + left = self.node(Expr::Binary { + binary_type: BinaryType::Subtract, + left: Box::new(left), + right: Box::new(right), + }); + } else { + break; + } + } + Ok(left) + } + + fn parse_prec_multiply_divide_modulo(&mut self) -> Result, ParserError> { + let mut left = self.parse_prec_unary()?; + while !self.done() { + if self.current_is(TokenType::Asterisk) { + self.step(); + let right = self.parse_prec_unary()?; + left = self.node(Expr::Binary { + binary_type: BinaryType::Multiply, + left: Box::new(left), + right: Box::new(right), + }); + } else if self.current_is(TokenType::Slash) { + self.step(); + let right = self.parse_prec_unary()?; + left = self.node(Expr::Binary { + binary_type: BinaryType::Divide, + left: Box::new(left), + right: Box::new(right), + }); + } else if self.current_is(TokenType::Percent) { + self.step(); + let right = self.parse_prec_unary()?; + left = self.node(Expr::Binary { + binary_type: BinaryType::Modulo, + left: Box::new(left), + right: Box::new(right), + }); + } else { + break; + } + } + Ok(left) + } + + fn parse_prec_unary(&mut self) -> Result, ParserError> { + if !self.done() && self.current_is(TokenType::Not) { + self.step(); + let subject = Box::new(self.parse_prec_unary()?); + self.ok_node(Expr::Unary { + unary_type: UnaryType::Not, + subject, + }) + } else if !self.done() && self.current_is(TokenType::Minus) { + self.step(); + let subject = Box::new(self.parse_prec_unary()?); + self.ok_node(Expr::Unary { + unary_type: UnaryType::Negate, + subject, + }) + } else if !self.done() && self.current_is(TokenType::Ampersand) { + self.step(); + if !self.done() && self.current_is(TokenType::Mut) { + self.step(); + let subject = Box::new(self.parse_prec_unary()?); + self.ok_node(Expr::Unary { + unary_type: UnaryType::ReferenceMut, + subject, + }) + } else { + let subject = Box::new(self.parse_prec_unary()?); + self.ok_node(Expr::Unary { + unary_type: UnaryType::Reference, + subject, + }) + } + } else if !self.done() && self.current_is(TokenType::Asterisk) { + self.step(); + let subject = Box::new(self.parse_prec_unary()?); + self.ok_node(Expr::Unary { + unary_type: UnaryType::Dereference, + subject, + }) + } else { + self.parse_prec_exponentiate() + } + } + + fn parse_prec_exponentiate(&mut self) -> Result, ParserError> { + let left = self.parse_prec_member_index_call()?; + if !self.done() && self.current_is(TokenType::AsteriskEqual) { + let right = self.parse_prec_exponentiate()?; + self.step_and_ok_node(Expr::Binary { + binary_type: BinaryType::Exponentiate, + left: Box::new(left), + right: Box::new(right), + }) + } else { + Ok(left) + } + } + + fn parse_prec_member_index_call(&mut self) -> Result, ParserError> { + let mut subject = self.parse_operand()?; + while !self.done() { + if self.current_is(TokenType::Dot) { + self.step(); + if self.done() || self.current_is(TokenType::Id) { + return self.error("expected identifier"); + } + let value = self.token_string(self.current()); + self.step(); + subject = self.node(Expr::Member { + subject: Box::new(subject), + value, + }); + } else if self.current_is(TokenType::LBracket) { + self.step(); + let value = self.parse_expr()?; + if self.done() || !self.current_is(TokenType::RBracket) { + return self.error("expected ']'"); + } + subject = self.node(Expr::Index { + subject: Box::new(subject), + value: Box::new(value), + }); + } else if self.current_is(TokenType::LParen) { + self.step(); + let mut arguments = Vec::>::new(); + if !self.done() && !self.current_is(TokenType::RParen) { + arguments.push(self.parse_expr()?); + while !self.done() && self.current_is(TokenType::Comma) { + self.step(); + if self.done() || self.current_is(TokenType::RParen) { + self.step(); + } + arguments.push(self.parse_expr()?); + } + } + if self.done() || !self.current_is(TokenType::RParen) { + return self.error("expected ')'"); + } + self.step(); + subject = self.node(Expr::Call { + subject: Box::new(subject), + arguments, + }) + } else { + break; + } + } + Ok(subject) + } + + fn parse_operand(&mut self) -> Result, ParserError> { + if self.done() { + return self.error("expected value, got eof"); + } + match self.current().token_type { + TokenType::Id => self.step_and_ok_node(Expr::Id(self.token_string(self.current()))), + TokenType::Int => { + let mut value_string = self.token_string(self.current()); + self.step(); + if !self.done() && self.current_is(TokenType::Decimal) { + value_string.push_str(&self.token_string(self.current())); + self.step_and_ok_node(Expr::Float( + value_string.parse::().expect("valid f64"), + )) + } else { + self.ok_node(Expr::Int((value_string).parse::().expect("valid i64"))) + } + } + TokenType::Decimal => self.step_and_ok_node(Expr::Float( + self.token_string(self.current()) + .parse::() + .expect("valid f64"), + )), + TokenType::False => self.step_and_ok_node(Expr::Bool(false)), + TokenType::True => self.step_and_ok_node(Expr::Bool(true)), + TokenType::LParen => self.parse_unit_group_or_tuple(), + TokenType::LBrace => self.parse_object(), + TokenType::LBracket => self.parse_array(), + TokenType::Fn => self.parse_function(), + _ => self.error("expected value"), + } + } + + fn parse_unit_group_or_tuple(&mut self) -> Result, ParserError> { + self.step(); + if !self.done() && !self.current_is(TokenType::LParen) { + todo!() + } else { + self.step_and_ok_node(Expr::Unit) + } + } + + fn parse_object(&mut self) -> Result, ParserError> { + todo!() + } + + fn parse_array(&mut self) -> Result, ParserError> { + todo!() + } + + fn parse_function(&mut self) -> Result, ParserError> { + todo!() + } + + fn token_string(&self, token: &Token) -> String { + self.text[token.pos.index..token.pos.index + token.length].to_string() + } + + fn step_and_ok_node(&mut self, value: T) -> Result, ParserError> { + self.step(); + self.ok_node(value) + } + + fn ok_node(&self, value: T) -> Result, ParserError> { + Ok(Node { + value, + pos: self.lexer.pos(), + }) + } + + fn step_and_node(&mut self, value: T) -> Node { + self.step(); + self.node(value) + } + + fn node(&self, value: T) -> Node { + Node { + value, + pos: self.lexer.pos(), + } + } + + fn step_and(&mut self, value: T) -> T { + self.step(); + value + } + + fn error(&self, message: &str) -> Result, ParserError> { + Err(ParserError { + pos: self.lexer.pos(), + message: message.to_string(), + }) + } + + fn done(&self) -> bool { + self.current_token.is_none() + } + + fn current_is(&self, token_type: TokenType) -> bool { + self.current().token_type == token_type + } + + fn current(&self) -> &Token { + self.current_token.as_ref().expect("done() checked") + } + + fn step(&mut self) { + self.current_token = self.lexer.next(); + } } fn main() { - let text = "3.14 \"foo\" false true ( ) + += /* 1 /* 2 */ 3 */ // 4 \n 5"; + println!("tokens = ["); + let text = "** 3.14 \"foo\" false true ( ) + += /* 1 /* 2 */ 3 */ // 4 \n 5"; let lexer = Lexer::new(text); lexer.for_each(|token| { - println!("{:?}", token); - }) + println!(" {:?},", token); + }); + println!("]"); + + let text2 = "1 + 2 * 3"; + let mut parser = Parser::new(text2, Lexer::new(text2)); + let expr = parser.parse_expr(); + println!("ast = {:#?}", expr); }