operator parsing

This commit is contained in:
SimonFJ20 2023-03-15 02:23:41 +01:00
parent edec4a2323
commit c9ed3333f1

View File

@ -2,17 +2,34 @@
use std::str::Chars; use std::str::Chars;
#[derive(Debug)] #[derive(Debug, Clone)]
/// A location in the source text.
struct Position {
    /// Offset into the source text (used to slice lexemes out of it).
    pub index: usize,
    /// Line number of the location.
    pub line: i32,
    /// Column number of the location.
    pub col: i32,
}

impl Position {
    /// Builds a position from its three components, stored as given.
    pub fn new(index: usize, line: i32, col: i32) -> Self {
        Position {
            index: index,
            line: line,
            col: col,
        }
    }
}
#[derive(Debug, PartialEq)]
enum TokenType { enum TokenType {
InvalidChar, InvalidChar,
MalformedString, MalformedString,
MalformedComment, MalformedComment,
Id, Id,
Int, Int,
Decimal, Decimal,
String, String,
False, False,
True, True,
Let,
Mut,
If, If,
Else, Else,
While, While,
@ -20,20 +37,36 @@ enum TokenType {
In, In,
Break, Break,
Continue, Continue,
Function, Fn,
Return, Return,
End, End,
Not,
And,
Or,
Underscore, Underscore,
Plus, Plus,
Minus, Minus,
Asterisk, Asterisk,
Slash, Slash,
Percent, Percent,
DoubleAsterisk,
Equal,
Exclamation,
LessThan,
GreaterThan,
PlusEqual, PlusEqual,
MinusEqual, MinusEqual,
AsteriskEqual, AsteriskEqual,
SlashEqual, SlashEqual,
PercentEqual, PercentEqual,
DoubleAsteriskEqual,
EqualEqual,
ExclamationEqual,
LessThanEqual,
GreaterThanEqual,
LParen, LParen,
RParen, RParen,
LBrace, LBrace,
@ -44,20 +77,14 @@ enum TokenType {
Comma, Comma,
Colon, Colon,
Semicolon, Semicolon,
} Ampersand,
#[derive(Debug)]
struct Position {
index: usize,
line: i32,
col: i32,
} }
#[derive(Debug)] #[derive(Debug)]
struct Token { struct Token {
token_type: TokenType, pub token_type: TokenType,
pos: Position, pub pos: Position,
length: usize, pub length: usize,
} }
struct Lexer<'a> { struct Lexer<'a> {
@ -98,20 +125,42 @@ impl<'a> Lexer<'a> {
'-' => { '-' => {
Some(self.single_or_double_char_token(TokenType::Minus, '=', TokenType::MinusEqual)) Some(self.single_or_double_char_token(TokenType::Minus, '=', TokenType::MinusEqual))
} }
'*' => Some(self.single_or_double_char_token( '*' => Some(self.asterisk_token()),
TokenType::Asterisk,
'=',
TokenType::AsteriskEqual,
)),
'/' => self.slash_token(), '/' => self.slash_token(),
'%' => Some(self.single_or_double_char_token( '%' => Some(self.single_or_double_char_token(
TokenType::Percent, TokenType::Percent,
'=', '=',
TokenType::PercentEqual, TokenType::PercentEqual,
)), )),
'=' => {
Some(self.single_or_double_char_token(TokenType::Equal, '=', TokenType::EqualEqual))
}
'!' => Some(self.single_or_double_char_token(
TokenType::Exclamation,
'=',
TokenType::ExclamationEqual,
)),
'<' => Some(self.single_or_double_char_token(
TokenType::LessThan,
'=',
TokenType::LessThanEqual,
)),
'>' => Some(self.single_or_double_char_token(
TokenType::GreaterThan,
'=',
TokenType::GreaterThanEqual,
)),
'(' => Some(self.step_and_token(TokenType::LParen, self.pos())), '(' => Some(self.step_and_token(TokenType::LParen, self.pos())),
')' => Some(self.step_and_token(TokenType::LParen, self.pos())), ')' => Some(self.step_and_token(TokenType::RParen, self.pos())),
'{' => Some(self.step_and_token(TokenType::LBrace, self.pos())),
'}' => Some(self.step_and_token(TokenType::RBrace, self.pos())),
'[' => Some(self.step_and_token(TokenType::LBracket, self.pos())),
']' => Some(self.step_and_token(TokenType::RBracket, self.pos())),
'.' => Some(self.dot_token()), '.' => Some(self.dot_token()),
',' => Some(self.step_and_token(TokenType::Comma, self.pos())),
':' => Some(self.step_and_token(TokenType::Colon, self.pos())),
';' => Some(self.step_and_token(TokenType::Semicolon, self.pos())),
'&' => Some(self.step_and_token(TokenType::Ampersand, self.pos())),
_ => Some(self.step_and_token(TokenType::InvalidChar, self.pos())), _ => Some(self.step_and_token(TokenType::InvalidChar, self.pos())),
} }
} }
@ -153,22 +202,27 @@ impl<'a> Lexer<'a> {
while !self.done() && matches!(self.current(), 'a'..='z' | 'A'..='Z' | '0'..='9' | '_') { while !self.done() && matches!(self.current(), 'a'..='z' | 'A'..='Z' | '0'..='9' | '_') {
self.step(); self.step();
} }
match &self.text[start.index..self.index] { self.token(
"false" => self.token(TokenType::False, start), match &self.text[start.index..self.index] {
"true" => self.token(TokenType::True, start), "false" => TokenType::False,
"if" => self.token(TokenType::True, start), "true" => TokenType::True,
"else" => self.token(TokenType::True, start), "let" => TokenType::Let,
"while" => self.token(TokenType::True, start), "mut" => TokenType::Mut,
"for" => self.token(TokenType::True, start), "if" => TokenType::If,
"in" => self.token(TokenType::True, start), "else" => TokenType::Else,
"break" => self.token(TokenType::True, start), "while" => TokenType::While,
"continue" => self.token(TokenType::True, start), "for" => TokenType::For,
"function" => self.token(TokenType::True, start), "in" => TokenType::In,
"return" => self.token(TokenType::True, start), "break" => TokenType::Break,
"end" => self.token(TokenType::True, start), "continue" => TokenType::Continue,
"underscore" => self.token(TokenType::True, start), "fn" => TokenType::Fn,
_ => self.token(TokenType::Id, start), "return" => TokenType::Return,
} "end" => TokenType::End,
"_" => TokenType::Underscore,
_ => TokenType::Id,
},
start,
)
} }
fn single_or_double_char_token( fn single_or_double_char_token(
@ -186,6 +240,23 @@ impl<'a> Lexer<'a> {
} }
} }
/// Lexes a token that begins with `*`.
///
/// Produces one of `Asterisk` (`*`), `AsteriskEqual` (`*=`),
/// `DoubleAsterisk` (`**`) or `DoubleAsteriskEqual` (`**=`), always
/// anchored at the position of the first `*`.
fn asterisk_token(&mut self) -> Token {
    let start = self.pos();
    // Consume the leading '*'.
    self.step();
    if self.done() {
        return self.token(TokenType::Asterisk, start);
    }
    match self.current() {
        '*' => {
            // Consume the second '*', then look for a trailing '='.
            self.step();
            if self.done() || self.current() != '=' {
                self.token(TokenType::DoubleAsterisk, start)
            } else {
                self.step_and_token(TokenType::DoubleAsteriskEqual, start)
            }
        }
        '=' => self.step_and_token(TokenType::AsteriskEqual, start),
        _ => self.token(TokenType::Asterisk, start),
    }
}
fn slash_token(&mut self) -> Option<Token> { fn slash_token(&mut self) -> Option<Token> {
let start = self.pos(); let start = self.pos();
self.step(); self.step();
@ -255,7 +326,7 @@ impl<'a> Lexer<'a> {
} }
} }
fn pos(&self) -> Position { pub fn pos(&self) -> Position {
Position { Position {
index: self.index, index: self.index,
line: self.line, line: self.line,
@ -268,7 +339,7 @@ impl<'a> Lexer<'a> {
} }
fn current(&self) -> char { fn current(&self) -> char {
self.current_char.expect("done() not checked") self.current_char.expect("done() checked")
} }
fn step(&mut self) { fn step(&mut self) {
@ -293,17 +364,508 @@ impl<'a> Iterator for Lexer<'a> {
} }
} }
/// A parsed value of type `T` paired with a source position.
///
/// The parser fills `pos` from the lexer's position at the moment the node
/// is constructed (see `Parser::node` / `Parser::ok_node`).
#[derive(Debug)]
struct Node<T> {
/// The wrapped syntax-tree value.
pub value: T,
/// Position recorded when the node was created.
pub pos: Position,
}
#[derive(Debug)]
enum Expr { enum Expr {
Unit,
Id(String), Id(String),
Int(i64), Int(i64),
Float(f64), Float(f64),
String(String), String(String),
Bool(bool),
Array(Vec<Node<Expr>>),
Object(Vec<ObjectEntry>),
Tuple(Vec<Node<Expr>>),
Member {
subject: Box<Node<Expr>>,
value: String,
},
Index {
subject: Box<Node<Expr>>,
value: Box<Node<Expr>>,
},
Call {
subject: Box<Node<Expr>>,
arguments: Vec<Node<Expr>>,
},
Unary {
unary_type: UnaryType,
subject: Box<Node<Expr>>,
},
Binary {
binary_type: BinaryType,
left: Box<Node<Expr>>,
right: Box<Node<Expr>>,
},
}
/// One entry of an object literal (`Expr::Object`).
///
/// NOTE(review): presumably `Pair(key, value)`; `parse_object` is still a
/// `todo!()`, so nothing constructs this yet. The second element is a bare
/// `Expr` without the `Node` wrapper the first one has; confirm that is intended.
#[derive(Debug)]
enum ObjectEntry {
Pair(Box<Node<Expr>>, Box<Expr>),
}
/// Prefix operators recognised by `Parser::parse_prec_unary`.
#[derive(Debug)]
enum UnaryType {
/// Produced for a leading `TokenType::Not` token.
Not,
/// Produced for a leading `TokenType::Minus` token.
Negate,
/// Produced for a leading `TokenType::Ampersand` not followed by `Mut`.
Reference,
/// Produced for `TokenType::Ampersand` followed by `TokenType::Mut`.
ReferenceMut,
/// Produced for a leading `TokenType::Asterisk` token.
Dereference,
}
/// Infix operators that can appear in `Expr::Binary`.
///
/// Each variant is produced by the `Parser::parse_prec_*` method whose name
/// mentions it; the comments give the token the parser maps to the variant.
#[derive(Debug)]
enum BinaryType {
/// Built by `parse_prec_exponentiate`.
Exponentiate,
/// `TokenType::Asterisk` in infix position.
Multiply,
/// `TokenType::Slash`.
Divide,
/// `TokenType::Percent`.
Modulo,
/// `TokenType::Plus`.
Add,
/// `TokenType::Minus` in infix position.
Subtract,
/// `TokenType::LessThan`.
LT,
/// `TokenType::LessThanEqual`.
LTE,
/// `TokenType::GreaterThan`.
GT,
/// `TokenType::GreaterThanEqual`.
GTE,
/// `TokenType::In`.
In,
/// `TokenType::EqualEqual`.
Equal,
/// `TokenType::ExclamationEqual`.
Inequal,
/// `TokenType::And`.
And,
/// `TokenType::Or`.
Or,
}
/// Error value returned by the parser when the input cannot be parsed.
#[derive(Debug)]
struct ParserError {
// Lexer position captured when the error was created (see `Parser::error`).
pos: Position,
// Human-readable description, e.g. "expected value".
message: String,
}
/// Recursive-descent expression parser driven by a `Lexer`.
struct Parser<'a> {
// Source text; `token_string` slices token lexemes out of it.
text: &'a str,
// Token source; also queried for positions when building nodes and errors.
lexer: Lexer<'a>,
// One-token lookahead; `None` once the lexer is exhausted (see `done`).
current_token: Option<Token>,
}
impl<'a> Parser<'a> {
pub fn new(text: &'a str, mut lexer: Lexer<'a>) -> Self {
Self {
text,
current_token: lexer.next(),
lexer,
}
}
/// Parses one expression, entering the precedence chain at its first level
/// (`parse_prec_or`). Returns the parsed node or the first `ParserError`.
pub fn parse_expr(&mut self) -> Result<Node<Expr>, ParserError> {
self.parse_prec_or()
}
/// Parses a left-associative chain of `or` operators whose operands are
/// parsed by `parse_prec_and`.
fn parse_prec_or(&mut self) -> Result<Node<Expr>, ParserError> {
    let mut lhs = self.parse_prec_and()?;
    while !self.done() && self.current_is(TokenType::Or) {
        // Consume the operator, then parse its right-hand operand.
        self.step();
        let rhs = self.parse_prec_and()?;
        lhs = self.node(Expr::Binary {
            binary_type: BinaryType::Or,
            left: Box::new(lhs),
            right: Box::new(rhs),
        });
    }
    Ok(lhs)
}
/// Parses a left-associative chain of `and` operators whose operands are
/// parsed by `parse_prec_equal_inequal`.
fn parse_prec_and(&mut self) -> Result<Node<Expr>, ParserError> {
    let mut lhs = self.parse_prec_equal_inequal()?;
    while !self.done() && self.current_is(TokenType::And) {
        // Consume the operator, then parse its right-hand operand.
        self.step();
        let rhs = self.parse_prec_equal_inequal()?;
        lhs = self.node(Expr::Binary {
            binary_type: BinaryType::And,
            left: Box::new(lhs),
            right: Box::new(rhs),
        });
    }
    Ok(lhs)
}
/// Parses a left-associative chain of `==` / `!=` comparisons whose operands
/// are parsed by `parse_prec_lt_lte_gt_gte_in`.
fn parse_prec_equal_inequal(&mut self) -> Result<Node<Expr>, ParserError> {
    let mut lhs = self.parse_prec_lt_lte_gt_gte_in()?;
    loop {
        if self.done() {
            break;
        }
        // Map the lookahead token to its operator; anything else ends the chain.
        let binary_type = match self.current().token_type {
            TokenType::EqualEqual => BinaryType::Equal,
            TokenType::ExclamationEqual => BinaryType::Inequal,
            _ => break,
        };
        self.step();
        let rhs = self.parse_prec_lt_lte_gt_gte_in()?;
        lhs = self.node(Expr::Binary {
            binary_type,
            left: Box::new(lhs),
            right: Box::new(rhs),
        });
    }
    Ok(lhs)
}
/// Parses a left-associative chain of `<`, `>`, `<=`, `>=` and `in`
/// operators whose operands are parsed by `parse_prec_add_subtract`.
fn parse_prec_lt_lte_gt_gte_in(&mut self) -> Result<Node<Expr>, ParserError> {
    let mut lhs = self.parse_prec_add_subtract()?;
    loop {
        if self.done() {
            break;
        }
        // Map the lookahead token to its operator; anything else ends the chain.
        let binary_type = match self.current().token_type {
            TokenType::LessThan => BinaryType::LT,
            TokenType::GreaterThan => BinaryType::GT,
            TokenType::LessThanEqual => BinaryType::LTE,
            TokenType::GreaterThanEqual => BinaryType::GTE,
            TokenType::In => BinaryType::In,
            _ => break,
        };
        self.step();
        let rhs = self.parse_prec_add_subtract()?;
        lhs = self.node(Expr::Binary {
            binary_type,
            left: Box::new(lhs),
            right: Box::new(rhs),
        });
    }
    Ok(lhs)
}
/// Parses a left-associative chain of `+` / `-` operators whose operands are
/// parsed by `parse_prec_multiply_divide_modulo`.
fn parse_prec_add_subtract(&mut self) -> Result<Node<Expr>, ParserError> {
    let mut lhs = self.parse_prec_multiply_divide_modulo()?;
    loop {
        if self.done() {
            break;
        }
        // Map the lookahead token to its operator; anything else ends the chain.
        let binary_type = match self.current().token_type {
            TokenType::Plus => BinaryType::Add,
            TokenType::Minus => BinaryType::Subtract,
            _ => break,
        };
        self.step();
        let rhs = self.parse_prec_multiply_divide_modulo()?;
        lhs = self.node(Expr::Binary {
            binary_type,
            left: Box::new(lhs),
            right: Box::new(rhs),
        });
    }
    Ok(lhs)
}
/// Parses a left-associative chain of `*`, `/` and `%` operators whose
/// operands are parsed by `parse_prec_unary`.
fn parse_prec_multiply_divide_modulo(&mut self) -> Result<Node<Expr>, ParserError> {
    let mut lhs = self.parse_prec_unary()?;
    loop {
        if self.done() {
            break;
        }
        // Map the lookahead token to its operator; anything else ends the chain.
        let binary_type = match self.current().token_type {
            TokenType::Asterisk => BinaryType::Multiply,
            TokenType::Slash => BinaryType::Divide,
            TokenType::Percent => BinaryType::Modulo,
            _ => break,
        };
        self.step();
        let rhs = self.parse_prec_unary()?;
        lhs = self.node(Expr::Binary {
            binary_type,
            left: Box::new(lhs),
            right: Box::new(rhs),
        });
    }
    Ok(lhs)
}
/// Parses a prefix operator (`not`, `-`, `&`, `& mut`, `*`) applied to
/// another unary-level expression; with no prefix operator present it
/// defers to `parse_prec_exponentiate`.
fn parse_prec_unary(&mut self) -> Result<Node<Expr>, ParserError> {
    if self.done() {
        return self.parse_prec_exponentiate();
    }
    let prefix = match self.current().token_type {
        TokenType::Not => UnaryType::Not,
        TokenType::Minus => UnaryType::Negate,
        TokenType::Ampersand => UnaryType::Reference,
        TokenType::Asterisk => UnaryType::Dereference,
        _ => return self.parse_prec_exponentiate(),
    };
    // Consume the operator token.
    self.step();
    // `&` immediately followed by `mut` upgrades to a mutable reference.
    let unary_type = match prefix {
        UnaryType::Reference if !self.done() && self.current_is(TokenType::Mut) => {
            self.step();
            UnaryType::ReferenceMut
        }
        other => other,
    };
    let subject = Box::new(self.parse_prec_unary()?);
    self.ok_node(Expr::Unary {
        unary_type,
        subject,
    })
}
/// Parses an optional right-associative exponentiation `base ** exponent`.
///
/// The base is parsed by `parse_prec_member_index_call`. When it is followed
/// by a `**` token (`TokenType::DoubleAsterisk`), the operator is consumed
/// and the exponent is parsed by recursing into this method, so
/// `a ** b ** c` groups as `a ** (b ** c)`. Otherwise the base is returned
/// unchanged.
///
/// Returns the first `ParserError` raised while parsing either operand.
fn parse_prec_exponentiate(&mut self) -> Result<Node<Expr>, ParserError> {
    let left = self.parse_prec_member_index_call()?;
    if self.done() || !self.current_is(TokenType::DoubleAsterisk) {
        return Ok(left);
    }
    // Consume `**` before parsing the exponent; otherwise the recursive call
    // would see the operator itself and fail with "expected value".
    self.step();
    let right = self.parse_prec_exponentiate()?;
    self.ok_node(Expr::Binary {
        binary_type: BinaryType::Exponentiate,
        left: Box::new(left),
        right: Box::new(right),
    })
}
/// Parses an operand followed by any number of postfix operations:
/// member access (`subject.name`), indexing (`subject[expr]`) and calls
/// (`subject(arg, ...)`), applied left to right.
///
/// Call argument lists accept an optional trailing comma.
///
/// Errors:
/// - "expected identifier" when `.` is not followed by an identifier,
/// - "expected ']'" when an index expression is not closed,
/// - "expected ')'" when an argument list is not closed,
/// - any error from parsing the operand or a nested expression.
fn parse_prec_member_index_call(&mut self) -> Result<Node<Expr>, ParserError> {
    let mut subject = self.parse_operand()?;
    while !self.done() {
        if self.current_is(TokenType::Dot) {
            self.step();
            // The token after `.` must be an identifier.
            if self.done() || !self.current_is(TokenType::Id) {
                return self.error("expected identifier");
            }
            let value = self.token_string(self.current());
            self.step();
            subject = self.node(Expr::Member {
                subject: Box::new(subject),
                value,
            });
        } else if self.current_is(TokenType::LBracket) {
            self.step();
            let value = self.parse_expr()?;
            if self.done() || !self.current_is(TokenType::RBracket) {
                return self.error("expected ']'");
            }
            // Consume the closing `]` so that following postfix operations
            // (or the caller) see the token after it.
            self.step();
            subject = self.node(Expr::Index {
                subject: Box::new(subject),
                value: Box::new(value),
            });
        } else if self.current_is(TokenType::LParen) {
            self.step();
            let mut arguments = Vec::<Node<Expr>>::new();
            if !self.done() && !self.current_is(TokenType::RParen) {
                arguments.push(self.parse_expr()?);
                while !self.done() && self.current_is(TokenType::Comma) {
                    self.step();
                    // Trailing comma (or premature end of input): stop
                    // collecting and let the `)` check below decide.
                    if self.done() || self.current_is(TokenType::RParen) {
                        break;
                    }
                    arguments.push(self.parse_expr()?);
                }
            }
            if self.done() || !self.current_is(TokenType::RParen) {
                return self.error("expected ')'");
            }
            self.step();
            subject = self.node(Expr::Call {
                subject: Box::new(subject),
                arguments,
            });
        } else {
            break;
        }
    }
    Ok(subject)
}
/// Parses a primary expression: identifier, integer or float literal,
/// boolean literal, or one of the bracketed/`fn` forms delegated to
/// `parse_unit_group_or_tuple`, `parse_object`, `parse_array` and
/// `parse_function`.
///
/// An `Int` token immediately followed by a `Decimal` token is combined into
/// a single float literal.
///
/// Errors:
/// - "expected value, got eof" at end of input,
/// - "invalid integer literal" / "invalid float literal" when the lexeme
///   cannot be converted (e.g. an integer that does not fit in `i64`);
///   these are reported as errors instead of panicking because the text
///   comes from user input,
/// - "expected value" for any other token.
///
/// NOTE(review): `TokenType::String` is not handled here even though
/// `Expr::String` exists; confirm whether string literals are meant to be
/// parsed at this level.
fn parse_operand(&mut self) -> Result<Node<Expr>, ParserError> {
    if self.done() {
        return self.error("expected value, got eof");
    }
    match self.current().token_type {
        TokenType::Id => {
            let name = self.token_string(self.current());
            self.step_and_ok_node(Expr::Id(name))
        }
        TokenType::Int => {
            let mut value_string = self.token_string(self.current());
            self.step();
            if !self.done() && self.current_is(TokenType::Decimal) {
                // Integer part followed by a fractional part: one float.
                value_string.push_str(&self.token_string(self.current()));
                match value_string.parse::<f64>() {
                    Ok(value) => self.step_and_ok_node(Expr::Float(value)),
                    Err(_) => self.error("invalid float literal"),
                }
            } else {
                match value_string.parse::<i64>() {
                    Ok(value) => self.ok_node(Expr::Int(value)),
                    Err(_) => self.error("invalid integer literal"),
                }
            }
        }
        TokenType::Decimal => match self.token_string(self.current()).parse::<f64>() {
            Ok(value) => self.step_and_ok_node(Expr::Float(value)),
            Err(_) => self.error("invalid float literal"),
        },
        TokenType::False => self.step_and_ok_node(Expr::Bool(false)),
        TokenType::True => self.step_and_ok_node(Expr::Bool(true)),
        TokenType::LParen => self.parse_unit_group_or_tuple(),
        TokenType::LBrace => self.parse_object(),
        TokenType::LBracket => self.parse_array(),
        TokenType::Fn => self.parse_function(),
        _ => self.error("expected value"),
    }
}
/// Parses the expression forms that start with `(`.
///
/// Called with the current token on the opening `(`. Currently only the
/// unit value `()` is supported: the opening parenthesis is consumed and, if
/// the next token is `)`, that is consumed too and `Expr::Unit` is returned.
///
/// Errors with "expected ')'" when the input ends right after `(`.
///
/// Panics (via `todo!()`) for any other content, since grouped expressions
/// and tuples are not implemented yet.
fn parse_unit_group_or_tuple(&mut self) -> Result<Node<Expr>, ParserError> {
    // Consume the opening `(`.
    self.step();
    if self.done() {
        return self.error("expected ')'");
    }
    if self.current_is(TokenType::RParen) {
        self.step_and_ok_node(Expr::Unit)
    } else {
        todo!()
    }
}
/// Placeholder for object-literal parsing; `parse_operand` dispatches here on
/// a `TokenType::LBrace` token. Not implemented: always panics via `todo!()`.
fn parse_object(&mut self) -> Result<Node<Expr>, ParserError> {
todo!()
}
/// Placeholder for array-literal parsing; `parse_operand` dispatches here on
/// a `TokenType::LBracket` token. Not implemented: always panics via `todo!()`.
fn parse_array(&mut self) -> Result<Node<Expr>, ParserError> {
todo!()
}
/// Placeholder for function-expression parsing; `parse_operand` dispatches
/// here on a `TokenType::Fn` token. Not implemented: always panics via `todo!()`.
fn parse_function(&mut self) -> Result<Node<Expr>, ParserError> {
todo!()
}
/// Returns an owned copy of the source text covered by `token`, i.e. the
/// `token.length` bytes starting at `token.pos.index`.
fn token_string(&self, token: &Token) -> String {
    let start = token.pos.index;
    let end = start + token.length;
    String::from(&self.text[start..end])
}
/// Advances to the next token, then wraps `value` in a `Node` (positioned at
/// the lexer's position after the advance) and returns it as `Ok`.
fn step_and_ok_node<T>(&mut self, value: T) -> Result<Node<T>, ParserError> {
    self.step();
    let wrapped = self.node(value);
    Ok(wrapped)
}
/// Wraps `value` in a `Node` at the lexer's current position and returns it
/// as `Ok`, for use as the tail expression of parsing methods.
fn ok_node<T>(&self, value: T) -> Result<Node<T>, ParserError> {
    let pos = self.lexer.pos();
    Ok(Node { value, pos })
}
fn step_and_node<T>(&mut self, value: T) -> Node<T> {
self.step();
self.node(value)
}
/// Wraps `value` in a `Node` stamped with the lexer's current position.
fn node<T>(&self, value: T) -> Node<T> {
    let pos = self.lexer.pos();
    Node { value, pos }
}
/// Advances to the next token and hands `value` back unchanged.
fn step_and<T>(&mut self, value: T) -> T {
    let passthrough = value;
    self.step();
    passthrough
}
/// Builds an `Err(ParserError)` carrying `message` and the lexer's current
/// position.
fn error(&self, message: &str) -> Result<Node<Expr>, ParserError> {
    let failure = ParserError {
        pos: self.lexer.pos(),
        message: String::from(message),
    };
    Err(failure)
}
/// Reports whether the token stream is exhausted (no lookahead token).
fn done(&self) -> bool {
    matches!(self.current_token, None)
}
/// Reports whether the lookahead token has type `token_type`.
///
/// Panics if there is no lookahead token; check `done()` first.
fn current_is(&self, token_type: TokenType) -> bool {
    let lookahead = self.current();
    lookahead.token_type == token_type
}
/// Borrows the lookahead token.
///
/// Panics with "done() checked" if the stream is exhausted; callers are
/// expected to have tested `done()` beforehand.
fn current(&self) -> &Token {
    match &self.current_token {
        Some(token) => token,
        None => panic!("done() checked"),
    }
}
/// Replaces the lookahead with the next token from the lexer (`None` once
/// the lexer is exhausted).
fn step(&mut self) {
    let upcoming = self.lexer.next();
    self.current_token = upcoming;
}
} }
fn main() { fn main() {
let text = "3.14 \"foo\" false true ( ) + += /* 1 /* 2 */ 3 */ // 4 \n 5"; println!("tokens = [");
let text = "** 3.14 \"foo\" false true ( ) + += /* 1 /* 2 */ 3 */ // 4 \n 5";
let lexer = Lexer::new(text); let lexer = Lexer::new(text);
lexer.for_each(|token| { lexer.for_each(|token| {
println!("{:?}", token); println!(" {:?},", token);
}) });
println!("]");
let text2 = "1 + 2 * 3";
let mut parser = Parser::new(text2, Lexer::new(text2));
let expr = parser.parse_expr();
println!("ast = {:#?}", expr);
} }