From 8705272a77592fabc3621f37ec3fe39d8647958c Mon Sep 17 00:00:00 2001 From: SimonFJ20 Date: Wed, 15 Mar 2023 16:23:49 +0100 Subject: [PATCH] extract into files and indirect lexer --- src/ast.rs | 75 +++++ src/lexer.rs | 281 +++++++++++++++++ src/main.rs | 859 +------------------------------------------------- src/parser.rs | 425 +++++++++++++++++++++++++ src/tokens.rs | 88 ++++++ 5 files changed, 875 insertions(+), 853 deletions(-) create mode 100644 src/ast.rs create mode 100644 src/lexer.rs create mode 100644 src/parser.rs create mode 100644 src/tokens.rs diff --git a/src/ast.rs b/src/ast.rs new file mode 100644 index 0000000..974d05a --- /dev/null +++ b/src/ast.rs @@ -0,0 +1,75 @@ +use crate::tokens::Position; + +#[derive(Debug)] +pub struct Node { + pub value: T, + pub pos: Position, +} + +#[derive(Debug)] +pub enum Expr { + Unit, + Id(String), + Int(i64), + Float(f64), + String(String), + Bool(bool), + Array(Vec>), + Object(Vec), + Tuple(Vec>), + + Member { + subject: Box>, + value: String, + }, + Index { + subject: Box>, + value: Box>, + }, + Call { + subject: Box>, + arguments: Vec>, + }, + Unary { + unary_type: UnaryType, + subject: Box>, + }, + Binary { + binary_type: BinaryType, + left: Box>, + right: Box>, + }, +} + +#[derive(Debug)] +pub enum ObjectEntry { + Pair(Box>, Box), +} + +#[derive(Debug)] +pub enum UnaryType { + Not, + Negate, + Reference, + ReferenceMut, + Dereference, +} + +#[derive(Debug)] +pub enum BinaryType { + Exponentiate, + Multiply, + Divide, + Modulo, + Add, + Subtract, + LT, + LTE, + GT, + GTE, + In, + Equal, + Inequal, + And, + Or, +} diff --git a/src/lexer.rs b/src/lexer.rs new file mode 100644 index 0000000..898c264 --- /dev/null +++ b/src/lexer.rs @@ -0,0 +1,281 @@ +use crate::tokens::{Position, PositionKnowing, Token, TokenType}; +use std::str::Chars; + +pub struct Lexer<'a> { + text: &'a str, + chars: Chars<'a>, + current_char: Option, + index: usize, + line: i32, + col: i32, +} + +impl<'a> Lexer<'a> { + pub fn new(text: &'a str) -> Self { + let mut chars = text.chars(); + let first_char = chars.next(); + Self { + text, + chars, + current_char: first_char, + index: 0, + line: 1, + col: 1, + } + } + + fn next_token(&mut self) -> Option { + if self.done() { + return None; + } + match self.current() { + ' ' | '\t' | '\r' | '\n' => self.skip_whitespace(), + '1'..='9' => Some(self.int_token()), + 'a'..='z' | 'A'..='Z' | '_' => Some(self.id_token()), + '"' => Some(self.string_token()), + '+' => { + Some(self.single_or_double_char_token(TokenType::Plus, '=', TokenType::PlusEqual)) + } + '-' => { + Some(self.single_or_double_char_token(TokenType::Minus, '=', TokenType::MinusEqual)) + } + '*' => Some(self.asterisk_token()), + '/' => self.slash_token(), + '%' => Some(self.single_or_double_char_token( + TokenType::Percent, + '=', + TokenType::PercentEqual, + )), + '=' => { + Some(self.single_or_double_char_token(TokenType::Equal, '=', TokenType::EqualEqual)) + } + '!' => Some(self.single_or_double_char_token( + TokenType::Exclamation, + '=', + TokenType::ExclamationEqual, + )), + '<' => Some(self.single_or_double_char_token( + TokenType::LessThan, + '=', + TokenType::LessThanEqual, + )), + '>' => Some(self.single_or_double_char_token( + TokenType::GreaterThan, + '=', + TokenType::GreaterThanEqual, + )), + '(' => Some(self.step_and_token(TokenType::LParen, self.pos())), + ')' => Some(self.step_and_token(TokenType::RParen, self.pos())), + '{' => Some(self.step_and_token(TokenType::LBrace, self.pos())), + '}' => Some(self.step_and_token(TokenType::RBrace, self.pos())), + '[' => Some(self.step_and_token(TokenType::LBracket, self.pos())), + ']' => Some(self.step_and_token(TokenType::RBracket, self.pos())), + '.' => Some(self.dot_token()), + ',' => Some(self.step_and_token(TokenType::Comma, self.pos())), + ':' => Some(self.step_and_token(TokenType::Colon, self.pos())), + ';' => Some(self.step_and_token(TokenType::Semicolon, self.pos())), + '&' => Some(self.step_and_token(TokenType::Ampersand, self.pos())), + _ => Some(self.step_and_token(TokenType::InvalidChar, self.pos())), + } + } + + fn skip_whitespace(&mut self) -> Option { + while !self.done() && matches!(self.current(), ' ' | '\t' | '\r' | '\n') { + self.step() + } + self.next_token() + } + + fn int_token(&mut self) -> Token { + let start = self.pos(); + self.step(); + while !self.done() && matches!(self.current(), '0'..='9') { + self.step(); + } + self.token(TokenType::Int, start) + } + + fn string_token(&mut self) -> Token { + let start = self.pos(); + self.step(); + let mut escaped = false; + while !self.done() && (self.current() != '"' || escaped) { + escaped = self.current() == '\\' && !escaped; + self.step(); + } + if self.done() || self.current() != '"' { + self.step_and_token(TokenType::MalformedString, start) + } else { + self.step_and_token(TokenType::String, start) + } + } + + fn id_token(&mut self) -> Token { + let start = self.pos(); + self.step(); + while !self.done() && matches!(self.current(), 'a'..='z' | 'A'..='Z' | '0'..='9' | '_') { + self.step(); + } + self.token( + match &self.text[start.index..self.index] { + "false" => TokenType::False, + "true" => TokenType::True, + "let" => TokenType::Let, + "mut" => TokenType::Mut, + "if" => TokenType::If, + "else" => TokenType::Else, + "while" => TokenType::While, + "for" => TokenType::For, + "in" => TokenType::In, + "break" => TokenType::Break, + "continue" => TokenType::Continue, + "fn" => TokenType::Fn, + "return" => TokenType::Return, + "end" => TokenType::End, + "_" => TokenType::Underscore, + _ => TokenType::Id, + }, + start, + ) + } + + fn single_or_double_char_token( + &mut self, + single_type: TokenType, + double_char: char, + double_type: TokenType, + ) -> Token { + let start = self.pos(); + self.step(); + if !self.done() && self.current() == double_char { + self.step_and_token(double_type, start) + } else { + self.token(single_type, start) + } + } + + fn asterisk_token(&mut self) -> Token { + let start = self.pos(); + self.step(); + if !self.done() && self.current() == '*' { + self.step(); + if !self.done() && self.current() == '=' { + self.step_and_token(TokenType::DoubleAsteriskEqual, start) + } else { + self.token(TokenType::DoubleAsterisk, start) + } + } else if !self.done() && self.current() == '=' { + self.step_and_token(TokenType::AsteriskEqual, start) + } else { + self.token(TokenType::Asterisk, start) + } + } + + fn slash_token(&mut self) -> Option { + let start = self.pos(); + self.step(); + if !self.done() && self.current() == '/' { + self.step(); + while !self.done() && self.current() != '\n' { + self.step(); + } + self.next_token() + } else if !self.done() && self.current() == '*' { + self.step(); + let mut depth = 1; + let mut last_char: Option = None; + while !self.done() { + match (last_char, self.current()) { + (Some('/'), '*') => { + depth += 1; + } + (Some('*'), '/') => { + depth -= 1; + if depth == 0 { + self.step(); + break; + } + } + _ => {} + } + last_char = Some(self.current()); + self.step(); + } + if depth != 0 { + Some(self.token(TokenType::MalformedComment, start)) + } else { + self.next_token() + } + } else if !self.done() && self.current() == '=' { + return Some(self.step_and_token(TokenType::SlashEqual, start)); + } else { + return Some(self.token(TokenType::Slash, start)); + } + } + + fn dot_token(&mut self) -> Token { + let start = self.pos(); + self.step(); + if !self.done() && matches!(self.current(), '0'..='9') { + self.step(); + while !self.done() && matches!(self.current(), '0'..='9') { + self.step(); + } + self.token(TokenType::Decimal, start) + } else { + self.token(TokenType::Dot, start) + } + } + + fn step_and_token(&mut self, token_type: TokenType, start: Position) -> Token { + self.step(); + self.token(token_type, start) + } + + fn token(&self, token_type: TokenType, start: Position) -> Token { + Token { + token_type, + length: self.index - start.index, + pos: start, + } + } + + fn done(&self) -> bool { + self.current_char.is_none() + } + + fn current(&self) -> char { + self.current_char.expect("done() checked") + } + + fn step(&mut self) { + self.index += 1; + if !self.done() { + if self.current() == '\n' { + self.line += 1; + self.col = 1; + } else { + self.col += 1; + } + } + self.current_char = self.chars.next(); + } +} + +impl<'a> Iterator for Lexer<'a> { + type Item = Token; + + fn next(&mut self) -> Option { + self.next_token() + } +} + +impl<'a> PositionKnowing for Lexer<'a> { + fn pos(&self) -> Position { + Position { + index: self.index, + line: self.line, + col: self.col, + } + } +} diff --git a/src/main.rs b/src/main.rs index 73244d7..9c63157 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,859 +1,12 @@ #![allow(dead_code)] -use std::str::Chars; +mod ast; +mod lexer; +mod parser; +mod tokens; -#[derive(Debug, Clone)] -struct Position { - pub index: usize, - pub line: i32, - pub col: i32, -} - -impl Position { - pub fn new(index: usize, line: i32, col: i32) -> Self { - Self { index, line, col } - } -} - -#[derive(Debug, PartialEq)] -enum TokenType { - InvalidChar, - MalformedString, - MalformedComment, - - Id, - Int, - Decimal, - String, - - False, - True, - Let, - Mut, - If, - Else, - While, - For, - In, - Break, - Continue, - Fn, - Return, - End, - Not, - And, - Or, - Underscore, - - Plus, - Minus, - Asterisk, - Slash, - Percent, - DoubleAsterisk, - Equal, - Exclamation, - LessThan, - GreaterThan, - - PlusEqual, - MinusEqual, - AsteriskEqual, - SlashEqual, - PercentEqual, - DoubleAsteriskEqual, - EqualEqual, - ExclamationEqual, - LessThanEqual, - GreaterThanEqual, - - LParen, - RParen, - LBrace, - RBrace, - LBracket, - RBracket, - Dot, - Comma, - Colon, - Semicolon, - Ampersand, -} - -#[derive(Debug)] -struct Token { - pub token_type: TokenType, - pub pos: Position, - pub length: usize, -} - -struct Lexer<'a> { - text: &'a str, - chars: Chars<'a>, - current_char: Option, - index: usize, - line: i32, - col: i32, -} - -impl<'a> Lexer<'a> { - pub fn new(text: &'a str) -> Self { - let mut chars = text.chars(); - let first_char = chars.next(); - Self { - text, - chars, - current_char: first_char, - index: 0, - line: 1, - col: 1, - } - } - - fn next_token(&mut self) -> Option { - if self.done() { - return None; - } - match self.current() { - ' ' | '\t' | '\r' | '\n' => self.skip_whitespace(), - '1'..='9' => Some(self.int_token()), - 'a'..='z' | 'A'..='Z' | '_' => Some(self.id_token()), - '"' => Some(self.string_token()), - '+' => { - Some(self.single_or_double_char_token(TokenType::Plus, '=', TokenType::PlusEqual)) - } - '-' => { - Some(self.single_or_double_char_token(TokenType::Minus, '=', TokenType::MinusEqual)) - } - '*' => Some(self.asterisk_token()), - '/' => self.slash_token(), - '%' => Some(self.single_or_double_char_token( - TokenType::Percent, - '=', - TokenType::PercentEqual, - )), - '=' => { - Some(self.single_or_double_char_token(TokenType::Equal, '=', TokenType::EqualEqual)) - } - '!' => Some(self.single_or_double_char_token( - TokenType::Exclamation, - '=', - TokenType::ExclamationEqual, - )), - '<' => Some(self.single_or_double_char_token( - TokenType::LessThan, - '=', - TokenType::LessThanEqual, - )), - '>' => Some(self.single_or_double_char_token( - TokenType::GreaterThan, - '=', - TokenType::GreaterThanEqual, - )), - '(' => Some(self.step_and_token(TokenType::LParen, self.pos())), - ')' => Some(self.step_and_token(TokenType::RParen, self.pos())), - '{' => Some(self.step_and_token(TokenType::LBrace, self.pos())), - '}' => Some(self.step_and_token(TokenType::RBrace, self.pos())), - '[' => Some(self.step_and_token(TokenType::LBracket, self.pos())), - ']' => Some(self.step_and_token(TokenType::RBracket, self.pos())), - '.' => Some(self.dot_token()), - ',' => Some(self.step_and_token(TokenType::Comma, self.pos())), - ':' => Some(self.step_and_token(TokenType::Colon, self.pos())), - ';' => Some(self.step_and_token(TokenType::Semicolon, self.pos())), - '&' => Some(self.step_and_token(TokenType::Ampersand, self.pos())), - _ => Some(self.step_and_token(TokenType::InvalidChar, self.pos())), - } - } - - fn skip_whitespace(&mut self) -> Option { - while !self.done() && matches!(self.current(), ' ' | '\t' | '\r' | '\n') { - self.step() - } - self.next_token() - } - - fn int_token(&mut self) -> Token { - let start = self.pos(); - self.step(); - while !self.done() && matches!(self.current(), '0'..='9') { - self.step(); - } - self.token(TokenType::Int, start) - } - - fn string_token(&mut self) -> Token { - let start = self.pos(); - self.step(); - let mut escaped = false; - while !self.done() && (self.current() != '"' || escaped) { - escaped = self.current() == '\\' && !escaped; - self.step(); - } - if self.done() || self.current() != '"' { - self.step_and_token(TokenType::MalformedString, start) - } else { - self.step_and_token(TokenType::String, start) - } - } - - fn id_token(&mut self) -> Token { - let start = self.pos(); - self.step(); - while !self.done() && matches!(self.current(), 'a'..='z' | 'A'..='Z' | '0'..='9' | '_') { - self.step(); - } - self.token( - match &self.text[start.index..self.index] { - "false" => TokenType::False, - "true" => TokenType::True, - "let" => TokenType::Let, - "mut" => TokenType::Mut, - "if" => TokenType::If, - "else" => TokenType::Else, - "while" => TokenType::While, - "for" => TokenType::For, - "in" => TokenType::In, - "break" => TokenType::Break, - "continue" => TokenType::Continue, - "fn" => TokenType::Fn, - "return" => TokenType::Return, - "end" => TokenType::End, - "_" => TokenType::Underscore, - _ => TokenType::Id, - }, - start, - ) - } - - fn single_or_double_char_token( - &mut self, - single_type: TokenType, - double_char: char, - double_type: TokenType, - ) -> Token { - let start = self.pos(); - self.step(); - if !self.done() && self.current() == double_char { - self.step_and_token(double_type, start) - } else { - self.token(single_type, start) - } - } - - fn asterisk_token(&mut self) -> Token { - let start = self.pos(); - self.step(); - if !self.done() && self.current() == '*' { - self.step(); - if !self.done() && self.current() == '=' { - self.step_and_token(TokenType::DoubleAsteriskEqual, start) - } else { - self.token(TokenType::DoubleAsterisk, start) - } - } else if !self.done() && self.current() == '=' { - self.step_and_token(TokenType::AsteriskEqual, start) - } else { - self.token(TokenType::Asterisk, start) - } - } - - fn slash_token(&mut self) -> Option { - let start = self.pos(); - self.step(); - if !self.done() && self.current() == '/' { - self.step(); - while !self.done() && self.current() != '\n' { - self.step(); - } - self.next_token() - } else if !self.done() && self.current() == '*' { - self.step(); - let mut depth = 1; - let mut last_char: Option = None; - while !self.done() { - match (last_char, self.current()) { - (Some('/'), '*') => { - depth += 1; - } - (Some('*'), '/') => { - depth -= 1; - if depth == 0 { - self.step(); - break; - } - } - _ => {} - } - last_char = Some(self.current()); - self.step(); - } - if depth != 0 { - Some(self.token(TokenType::MalformedComment, start)) - } else { - self.next_token() - } - } else if !self.done() && self.current() == '=' { - return Some(self.step_and_token(TokenType::SlashEqual, start)); - } else { - return Some(self.token(TokenType::Slash, start)); - } - } - - fn dot_token(&mut self) -> Token { - let start = self.pos(); - self.step(); - if !self.done() && matches!(self.current(), '0'..='9') { - self.step(); - while !self.done() && matches!(self.current(), '0'..='9') { - self.step(); - } - self.token(TokenType::Decimal, start) - } else { - self.token(TokenType::Dot, start) - } - } - - fn step_and_token(&mut self, token_type: TokenType, start: Position) -> Token { - self.step(); - self.token(token_type, start) - } - - fn token(&self, token_type: TokenType, start: Position) -> Token { - Token { - token_type, - length: self.index - start.index, - pos: start, - } - } - - pub fn pos(&self) -> Position { - Position { - index: self.index, - line: self.line, - col: self.col, - } - } - - fn done(&self) -> bool { - self.current_char.is_none() - } - - fn current(&self) -> char { - self.current_char.expect("done() checked") - } - - fn step(&mut self) { - self.index += 1; - if !self.done() { - if self.current() == '\n' { - self.line += 1; - self.col = 1; - } else { - self.col += 1; - } - } - self.current_char = self.chars.next(); - } -} - -impl<'a> Iterator for Lexer<'a> { - type Item = Token; - - fn next(&mut self) -> Option { - self.next_token() - } -} - -#[derive(Debug)] -struct Node { - pub value: T, - pub pos: Position, -} - -#[derive(Debug)] -enum Expr { - Unit, - Id(String), - Int(i64), - Float(f64), - String(String), - Bool(bool), - Array(Vec>), - Object(Vec), - Tuple(Vec>), - - Member { - subject: Box>, - value: String, - }, - Index { - subject: Box>, - value: Box>, - }, - Call { - subject: Box>, - arguments: Vec>, - }, - Unary { - unary_type: UnaryType, - subject: Box>, - }, - Binary { - binary_type: BinaryType, - left: Box>, - right: Box>, - }, -} - -#[derive(Debug)] -enum ObjectEntry { - Pair(Box>, Box), -} - -#[derive(Debug)] -enum UnaryType { - Not, - Negate, - Reference, - ReferenceMut, - Dereference, -} - -#[derive(Debug)] -enum BinaryType { - Exponentiate, - Multiply, - Divide, - Modulo, - Add, - Subtract, - LT, - LTE, - GT, - GTE, - In, - Equal, - Inequal, - And, - Or, -} - -#[derive(Debug)] -struct ParserError { - pos: Position, - message: String, -} - -struct Parser<'a> { - text: &'a str, - lexer: Lexer<'a>, - current_token: Option, -} - -impl<'a> Parser<'a> { - pub fn new(text: &'a str, mut lexer: Lexer<'a>) -> Self { - Self { - text, - current_token: lexer.next(), - lexer, - } - } - - pub fn parse_expr(&mut self) -> Result, ParserError> { - self.parse_prec_or() - } - - fn parse_prec_or(&mut self) -> Result, ParserError> { - let mut left = self.parse_prec_and()?; - while !self.done() { - if self.current_is(TokenType::Or) { - self.step(); - let right = self.parse_prec_and()?; - left = self.node(Expr::Binary { - binary_type: BinaryType::Or, - left: Box::new(left), - right: Box::new(right), - }); - } else { - break; - } - } - Ok(left) - } - - fn parse_prec_and(&mut self) -> Result, ParserError> { - let mut left = self.parse_prec_equal_inequal()?; - while !self.done() { - if self.current_is(TokenType::And) { - self.step(); - let right = self.parse_prec_equal_inequal()?; - left = self.node(Expr::Binary { - binary_type: BinaryType::And, - left: Box::new(left), - right: Box::new(right), - }); - } else { - break; - } - } - Ok(left) - } - - fn parse_prec_equal_inequal(&mut self) -> Result, ParserError> { - let mut left = self.parse_prec_lt_lte_gt_gte_in()?; - while !self.done() { - if self.current_is(TokenType::EqualEqual) { - self.step(); - let right = self.parse_prec_lt_lte_gt_gte_in()?; - left = self.node(Expr::Binary { - binary_type: BinaryType::Equal, - left: Box::new(left), - right: Box::new(right), - }); - } else if self.current_is(TokenType::ExclamationEqual) { - self.step(); - let right = self.parse_prec_lt_lte_gt_gte_in()?; - left = self.node(Expr::Binary { - binary_type: BinaryType::Inequal, - left: Box::new(left), - right: Box::new(right), - }); - } else { - break; - } - } - Ok(left) - } - - fn parse_prec_lt_lte_gt_gte_in(&mut self) -> Result, ParserError> { - let mut left = self.parse_prec_add_subtract()?; - while !self.done() { - if self.current_is(TokenType::LessThan) { - self.step(); - let right = self.parse_prec_add_subtract()?; - left = self.node(Expr::Binary { - binary_type: BinaryType::LT, - left: Box::new(left), - right: Box::new(right), - }); - } else if self.current_is(TokenType::GreaterThan) { - self.step(); - let right = self.parse_prec_add_subtract()?; - left = self.node(Expr::Binary { - binary_type: BinaryType::GT, - left: Box::new(left), - right: Box::new(right), - }); - } else if self.current_is(TokenType::LessThanEqual) { - self.step(); - let right = self.parse_prec_add_subtract()?; - left = self.node(Expr::Binary { - binary_type: BinaryType::LTE, - left: Box::new(left), - right: Box::new(right), - }); - } else if self.current_is(TokenType::GreaterThanEqual) { - self.step(); - let right = self.parse_prec_add_subtract()?; - left = self.node(Expr::Binary { - binary_type: BinaryType::GTE, - left: Box::new(left), - right: Box::new(right), - }); - } else if self.current_is(TokenType::In) { - self.step(); - let right = self.parse_prec_add_subtract()?; - left = self.node(Expr::Binary { - binary_type: BinaryType::In, - left: Box::new(left), - right: Box::new(right), - }); - } else { - break; - } - } - Ok(left) - } - - fn parse_prec_add_subtract(&mut self) -> Result, ParserError> { - let mut left = self.parse_prec_multiply_divide_modulo()?; - while !self.done() { - if self.current_is(TokenType::Plus) { - self.step(); - let right = self.parse_prec_multiply_divide_modulo()?; - left = self.node(Expr::Binary { - binary_type: BinaryType::Add, - left: Box::new(left), - right: Box::new(right), - }); - } else if self.current_is(TokenType::Minus) { - self.step(); - let right = self.parse_prec_multiply_divide_modulo()?; - left = self.node(Expr::Binary { - binary_type: BinaryType::Subtract, - left: Box::new(left), - right: Box::new(right), - }); - } else { - break; - } - } - Ok(left) - } - - fn parse_prec_multiply_divide_modulo(&mut self) -> Result, ParserError> { - let mut left = self.parse_prec_unary()?; - while !self.done() { - if self.current_is(TokenType::Asterisk) { - self.step(); - let right = self.parse_prec_unary()?; - left = self.node(Expr::Binary { - binary_type: BinaryType::Multiply, - left: Box::new(left), - right: Box::new(right), - }); - } else if self.current_is(TokenType::Slash) { - self.step(); - let right = self.parse_prec_unary()?; - left = self.node(Expr::Binary { - binary_type: BinaryType::Divide, - left: Box::new(left), - right: Box::new(right), - }); - } else if self.current_is(TokenType::Percent) { - self.step(); - let right = self.parse_prec_unary()?; - left = self.node(Expr::Binary { - binary_type: BinaryType::Modulo, - left: Box::new(left), - right: Box::new(right), - }); - } else { - break; - } - } - Ok(left) - } - - fn parse_prec_unary(&mut self) -> Result, ParserError> { - if !self.done() && self.current_is(TokenType::Not) { - self.step(); - let subject = Box::new(self.parse_prec_unary()?); - self.ok_node(Expr::Unary { - unary_type: UnaryType::Not, - subject, - }) - } else if !self.done() && self.current_is(TokenType::Minus) { - self.step(); - let subject = Box::new(self.parse_prec_unary()?); - self.ok_node(Expr::Unary { - unary_type: UnaryType::Negate, - subject, - }) - } else if !self.done() && self.current_is(TokenType::Ampersand) { - self.step(); - if !self.done() && self.current_is(TokenType::Mut) { - self.step(); - let subject = Box::new(self.parse_prec_unary()?); - self.ok_node(Expr::Unary { - unary_type: UnaryType::ReferenceMut, - subject, - }) - } else { - let subject = Box::new(self.parse_prec_unary()?); - self.ok_node(Expr::Unary { - unary_type: UnaryType::Reference, - subject, - }) - } - } else if !self.done() && self.current_is(TokenType::Asterisk) { - self.step(); - let subject = Box::new(self.parse_prec_unary()?); - self.ok_node(Expr::Unary { - unary_type: UnaryType::Dereference, - subject, - }) - } else { - self.parse_prec_exponentiate() - } - } - - fn parse_prec_exponentiate(&mut self) -> Result, ParserError> { - let left = self.parse_prec_member_index_call()?; - if !self.done() && self.current_is(TokenType::AsteriskEqual) { - let right = self.parse_prec_exponentiate()?; - self.step_and_ok_node(Expr::Binary { - binary_type: BinaryType::Exponentiate, - left: Box::new(left), - right: Box::new(right), - }) - } else { - Ok(left) - } - } - - fn parse_prec_member_index_call(&mut self) -> Result, ParserError> { - let mut subject = self.parse_operand()?; - while !self.done() { - if self.current_is(TokenType::Dot) { - self.step(); - if self.done() || self.current_is(TokenType::Id) { - return self.error("expected identifier"); - } - let value = self.token_string(self.current()); - self.step(); - subject = self.node(Expr::Member { - subject: Box::new(subject), - value, - }); - } else if self.current_is(TokenType::LBracket) { - self.step(); - let value = self.parse_expr()?; - if self.done() || !self.current_is(TokenType::RBracket) { - return self.error("expected ']'"); - } - subject = self.node(Expr::Index { - subject: Box::new(subject), - value: Box::new(value), - }); - } else if self.current_is(TokenType::LParen) { - self.step(); - let mut arguments = Vec::>::new(); - if !self.done() && !self.current_is(TokenType::RParen) { - arguments.push(self.parse_expr()?); - while !self.done() && self.current_is(TokenType::Comma) { - self.step(); - if self.done() || self.current_is(TokenType::RParen) { - self.step(); - } - arguments.push(self.parse_expr()?); - } - } - if self.done() || !self.current_is(TokenType::RParen) { - return self.error("expected ')'"); - } - self.step(); - subject = self.node(Expr::Call { - subject: Box::new(subject), - arguments, - }) - } else { - break; - } - } - Ok(subject) - } - - fn parse_operand(&mut self) -> Result, ParserError> { - if self.done() { - return self.error("expected value, got eof"); - } - match self.current().token_type { - TokenType::Id => self.step_and_ok_node(Expr::Id(self.token_string(self.current()))), - TokenType::Int => { - let mut value_string = self.token_string(self.current()); - self.step(); - if !self.done() && self.current_is(TokenType::Decimal) { - value_string.push_str(&self.token_string(self.current())); - self.step_and_ok_node(Expr::Float( - value_string.parse::().expect("valid f64"), - )) - } else { - self.ok_node(Expr::Int((value_string).parse::().expect("valid i64"))) - } - } - TokenType::Decimal => self.step_and_ok_node(Expr::Float( - self.token_string(self.current()) - .parse::() - .expect("valid f64"), - )), - TokenType::False => self.step_and_ok_node(Expr::Bool(false)), - TokenType::True => self.step_and_ok_node(Expr::Bool(true)), - TokenType::LParen => self.parse_unit_group_or_tuple(), - TokenType::LBrace => self.parse_object(), - TokenType::LBracket => self.parse_array(), - TokenType::Fn => self.parse_function(), - _ => self.error("expected value"), - } - } - - fn parse_unit_group_or_tuple(&mut self) -> Result, ParserError> { - self.step(); - if !self.done() && !self.current_is(TokenType::LParen) { - todo!() - } else { - self.step_and_ok_node(Expr::Unit) - } - } - - fn parse_object(&mut self) -> Result, ParserError> { - todo!() - } - - fn parse_array(&mut self) -> Result, ParserError> { - todo!() - } - - fn parse_function(&mut self) -> Result, ParserError> { - todo!() - } - - fn token_string(&self, token: &Token) -> String { - self.text[token.pos.index..token.pos.index + token.length].to_string() - } - - fn step_and_ok_node(&mut self, value: T) -> Result, ParserError> { - self.step(); - self.ok_node(value) - } - - fn ok_node(&self, value: T) -> Result, ParserError> { - Ok(Node { - value, - pos: self.lexer.pos(), - }) - } - - fn step_and_node(&mut self, value: T) -> Node { - self.step(); - self.node(value) - } - - fn node(&self, value: T) -> Node { - Node { - value, - pos: self.lexer.pos(), - } - } - - fn step_and(&mut self, value: T) -> T { - self.step(); - value - } - - fn error(&self, message: &str) -> Result, ParserError> { - Err(ParserError { - pos: self.lexer.pos(), - message: message.to_string(), - }) - } - - fn done(&self) -> bool { - self.current_token.is_none() - } - - fn current_is(&self, token_type: TokenType) -> bool { - self.current().token_type == token_type - } - - fn current(&self) -> &Token { - self.current_token.as_ref().expect("done() checked") - } - - fn step(&mut self) { - self.current_token = self.lexer.next(); - } -} +use crate::lexer::Lexer; +use crate::parser::Parser; fn main() { println!("tokens = ["); diff --git a/src/parser.rs b/src/parser.rs new file mode 100644 index 0000000..345ab56 --- /dev/null +++ b/src/parser.rs @@ -0,0 +1,425 @@ +use crate::ast::{BinaryType, Expr, Node, UnaryType}; +use crate::tokens::{Position, PositionKnowing, Token, TokenType}; + +#[derive(Debug)] +pub struct ParserError { + pos: Position, + message: String, +} + +pub struct Parser<'a, Tokens> +where + Tokens: PositionKnowing + Iterator, +{ + text: &'a str, + tokens: Tokens, + current_token: Option, +} + +impl<'a, Tokens> Parser<'a, Tokens> +where + Tokens: PositionKnowing + Iterator, +{ + pub fn new(text: &'a str, mut lexer: Tokens) -> Self { + Self { + text, + current_token: lexer.next(), + tokens: lexer, + } + } + + pub fn parse_expr(&mut self) -> Result, ParserError> { + self.parse_prec_or() + } + + fn parse_prec_or(&mut self) -> Result, ParserError> { + let mut left = self.parse_prec_and()?; + while !self.done() { + if self.current_is(TokenType::Or) { + self.step(); + let right = self.parse_prec_and()?; + left = self.node(Expr::Binary { + binary_type: BinaryType::Or, + left: Box::new(left), + right: Box::new(right), + }); + } else { + break; + } + } + Ok(left) + } + + fn parse_prec_and(&mut self) -> Result, ParserError> { + let mut left = self.parse_prec_equal_inequal()?; + while !self.done() { + if self.current_is(TokenType::And) { + self.step(); + let right = self.parse_prec_equal_inequal()?; + left = self.node(Expr::Binary { + binary_type: BinaryType::And, + left: Box::new(left), + right: Box::new(right), + }); + } else { + break; + } + } + Ok(left) + } + + fn parse_prec_equal_inequal(&mut self) -> Result, ParserError> { + let mut left = self.parse_prec_lt_lte_gt_gte_in()?; + while !self.done() { + if self.current_is(TokenType::EqualEqual) { + self.step(); + let right = self.parse_prec_lt_lte_gt_gte_in()?; + left = self.node(Expr::Binary { + binary_type: BinaryType::Equal, + left: Box::new(left), + right: Box::new(right), + }); + } else if self.current_is(TokenType::ExclamationEqual) { + self.step(); + let right = self.parse_prec_lt_lte_gt_gte_in()?; + left = self.node(Expr::Binary { + binary_type: BinaryType::Inequal, + left: Box::new(left), + right: Box::new(right), + }); + } else { + break; + } + } + Ok(left) + } + + fn parse_prec_lt_lte_gt_gte_in(&mut self) -> Result, ParserError> { + let mut left = self.parse_prec_add_subtract()?; + while !self.done() { + if self.current_is(TokenType::LessThan) { + self.step(); + let right = self.parse_prec_add_subtract()?; + left = self.node(Expr::Binary { + binary_type: BinaryType::LT, + left: Box::new(left), + right: Box::new(right), + }); + } else if self.current_is(TokenType::GreaterThan) { + self.step(); + let right = self.parse_prec_add_subtract()?; + left = self.node(Expr::Binary { + binary_type: BinaryType::GT, + left: Box::new(left), + right: Box::new(right), + }); + } else if self.current_is(TokenType::LessThanEqual) { + self.step(); + let right = self.parse_prec_add_subtract()?; + left = self.node(Expr::Binary { + binary_type: BinaryType::LTE, + left: Box::new(left), + right: Box::new(right), + }); + } else if self.current_is(TokenType::GreaterThanEqual) { + self.step(); + let right = self.parse_prec_add_subtract()?; + left = self.node(Expr::Binary { + binary_type: BinaryType::GTE, + left: Box::new(left), + right: Box::new(right), + }); + } else if self.current_is(TokenType::In) { + self.step(); + let right = self.parse_prec_add_subtract()?; + left = self.node(Expr::Binary { + binary_type: BinaryType::In, + left: Box::new(left), + right: Box::new(right), + }); + } else { + break; + } + } + Ok(left) + } + + fn parse_prec_add_subtract(&mut self) -> Result, ParserError> { + let mut left = self.parse_prec_multiply_divide_modulo()?; + while !self.done() { + if self.current_is(TokenType::Plus) { + self.step(); + let right = self.parse_prec_multiply_divide_modulo()?; + left = self.node(Expr::Binary { + binary_type: BinaryType::Add, + left: Box::new(left), + right: Box::new(right), + }); + } else if self.current_is(TokenType::Minus) { + self.step(); + let right = self.parse_prec_multiply_divide_modulo()?; + left = self.node(Expr::Binary { + binary_type: BinaryType::Subtract, + left: Box::new(left), + right: Box::new(right), + }); + } else { + break; + } + } + Ok(left) + } + + fn parse_prec_multiply_divide_modulo(&mut self) -> Result, ParserError> { + let mut left = self.parse_prec_unary()?; + while !self.done() { + if self.current_is(TokenType::Asterisk) { + self.step(); + let right = self.parse_prec_unary()?; + left = self.node(Expr::Binary { + binary_type: BinaryType::Multiply, + left: Box::new(left), + right: Box::new(right), + }); + } else if self.current_is(TokenType::Slash) { + self.step(); + let right = self.parse_prec_unary()?; + left = self.node(Expr::Binary { + binary_type: BinaryType::Divide, + left: Box::new(left), + right: Box::new(right), + }); + } else if self.current_is(TokenType::Percent) { + self.step(); + let right = self.parse_prec_unary()?; + left = self.node(Expr::Binary { + binary_type: BinaryType::Modulo, + left: Box::new(left), + right: Box::new(right), + }); + } else { + break; + } + } + Ok(left) + } + + fn parse_prec_unary(&mut self) -> Result, ParserError> { + if !self.done() && self.current_is(TokenType::Not) { + self.step(); + let subject = Box::new(self.parse_prec_unary()?); + self.ok_node(Expr::Unary { + unary_type: UnaryType::Not, + subject, + }) + } else if !self.done() && self.current_is(TokenType::Minus) { + self.step(); + let subject = Box::new(self.parse_prec_unary()?); + self.ok_node(Expr::Unary { + unary_type: UnaryType::Negate, + subject, + }) + } else if !self.done() && self.current_is(TokenType::Ampersand) { + self.step(); + if !self.done() && self.current_is(TokenType::Mut) { + self.step(); + let subject = Box::new(self.parse_prec_unary()?); + self.ok_node(Expr::Unary { + unary_type: UnaryType::ReferenceMut, + subject, + }) + } else { + let subject = Box::new(self.parse_prec_unary()?); + self.ok_node(Expr::Unary { + unary_type: UnaryType::Reference, + subject, + }) + } + } else if !self.done() && self.current_is(TokenType::Asterisk) { + self.step(); + let subject = Box::new(self.parse_prec_unary()?); + self.ok_node(Expr::Unary { + unary_type: UnaryType::Dereference, + subject, + }) + } else { + self.parse_prec_exponentiate() + } + } + + fn parse_prec_exponentiate(&mut self) -> Result, ParserError> { + let left = self.parse_prec_member_index_call()?; + if !self.done() && self.current_is(TokenType::AsteriskEqual) { + let right = self.parse_prec_exponentiate()?; + self.step_and_ok_node(Expr::Binary { + binary_type: BinaryType::Exponentiate, + left: Box::new(left), + right: Box::new(right), + }) + } else { + Ok(left) + } + } + + fn parse_prec_member_index_call(&mut self) -> Result, ParserError> { + let mut subject = self.parse_operand()?; + while !self.done() { + if self.current_is(TokenType::Dot) { + self.step(); + if self.done() || self.current_is(TokenType::Id) { + return self.error("expected identifier"); + } + let value = self.token_string(self.current()); + self.step(); + subject = self.node(Expr::Member { + subject: Box::new(subject), + value, + }); + } else if self.current_is(TokenType::LBracket) { + self.step(); + let value = self.parse_expr()?; + if self.done() || !self.current_is(TokenType::RBracket) { + return self.error("expected ']'"); + } + subject = self.node(Expr::Index { + subject: Box::new(subject), + value: Box::new(value), + }); + } else if self.current_is(TokenType::LParen) { + self.step(); + let mut arguments = Vec::>::new(); + if !self.done() && !self.current_is(TokenType::RParen) { + arguments.push(self.parse_expr()?); + while !self.done() && self.current_is(TokenType::Comma) { + self.step(); + if self.done() || self.current_is(TokenType::RParen) { + self.step(); + } + arguments.push(self.parse_expr()?); + } + } + if self.done() || !self.current_is(TokenType::RParen) { + return self.error("expected ')'"); + } + self.step(); + subject = self.node(Expr::Call { + subject: Box::new(subject), + arguments, + }) + } else { + break; + } + } + Ok(subject) + } + + fn parse_operand(&mut self) -> Result, ParserError> { + if self.done() { + return self.error("expected value, got eof"); + } + match self.current().token_type { + TokenType::Id => self.step_and_ok_node(Expr::Id(self.token_string(self.current()))), + TokenType::Int => { + let mut value_string = self.token_string(self.current()); + self.step(); + if !self.done() && self.current_is(TokenType::Decimal) { + value_string.push_str(&self.token_string(self.current())); + self.step_and_ok_node(Expr::Float( + value_string.parse::().expect("valid f64"), + )) + } else { + self.ok_node(Expr::Int((value_string).parse::().expect("valid i64"))) + } + } + TokenType::Decimal => self.step_and_ok_node(Expr::Float( + self.token_string(self.current()) + .parse::() + .expect("valid f64"), + )), + TokenType::False => self.step_and_ok_node(Expr::Bool(false)), + TokenType::True => self.step_and_ok_node(Expr::Bool(true)), + TokenType::LParen => self.parse_unit_group_or_tuple(), + TokenType::LBrace => self.parse_object(), + TokenType::LBracket => self.parse_array(), + TokenType::Fn => self.parse_function(), + _ => self.error("expected value"), + } + } + + fn parse_unit_group_or_tuple(&mut self) -> Result, ParserError> { + self.step(); + if !self.done() && !self.current_is(TokenType::LParen) { + todo!() + } else { + self.step_and_ok_node(Expr::Unit) + } + } + + fn parse_object(&mut self) -> Result, ParserError> { + todo!() + } + + fn parse_array(&mut self) -> Result, ParserError> { + todo!() + } + + fn parse_function(&mut self) -> Result, ParserError> { + todo!() + } + + fn token_string(&self, token: &Token) -> String { + self.text[token.pos.index..token.pos.index + token.length].to_string() + } + + fn step_and_ok_node(&mut self, value: T) -> Result, ParserError> { + self.step(); + self.ok_node(value) + } + + fn ok_node(&self, value: T) -> Result, ParserError> { + Ok(Node { + value, + pos: self.tokens.pos(), + }) + } + + fn step_and_node(&mut self, value: T) -> Node { + self.step(); + self.node(value) + } + + fn node(&self, value: T) -> Node { + Node { + value, + pos: self.tokens.pos(), + } + } + + fn step_and(&mut self, value: T) -> T { + self.step(); + value + } + + fn error(&self, message: &str) -> Result, ParserError> { + Err(ParserError { + pos: self.tokens.pos(), + message: message.to_string(), + }) + } + + fn done(&self) -> bool { + self.current_token.is_none() + } + + fn current_is(&self, token_type: TokenType) -> bool { + self.current().token_type == token_type + } + + fn current(&self) -> &Token { + self.current_token.as_ref().expect("done() checked") + } + + fn step(&mut self) { + self.current_token = self.tokens.next(); + } +} diff --git a/src/tokens.rs b/src/tokens.rs new file mode 100644 index 0000000..05a5f17 --- /dev/null +++ b/src/tokens.rs @@ -0,0 +1,88 @@ +#[derive(Debug, Clone)] +pub struct Position { + pub index: usize, + pub line: i32, + pub col: i32, +} + +impl Position { + pub fn new(index: usize, line: i32, col: i32) -> Self { + Self { index, line, col } + } +} + +#[derive(Debug, PartialEq)] +pub enum TokenType { + InvalidChar, + MalformedString, + MalformedComment, + + Id, + Int, + Decimal, + String, + + False, + True, + Let, + Mut, + If, + Else, + While, + For, + In, + Break, + Continue, + Fn, + Return, + End, + Not, + And, + Or, + Underscore, + + Plus, + Minus, + Asterisk, + Slash, + Percent, + DoubleAsterisk, + Equal, + Exclamation, + LessThan, + GreaterThan, + + PlusEqual, + MinusEqual, + AsteriskEqual, + SlashEqual, + PercentEqual, + DoubleAsteriskEqual, + EqualEqual, + ExclamationEqual, + LessThanEqual, + GreaterThanEqual, + + LParen, + RParen, + LBrace, + RBrace, + LBracket, + RBracket, + Dot, + Comma, + Colon, + Semicolon, + Ampersand, +} + +#[derive(Debug)] +pub struct Token { + pub token_type: TokenType, + pub pos: Position, + pub length: usize, +} + +pub trait PositionKnowing { + fn pos(&self) -> Position; +}