operator parsing

This commit is contained in:
SimonFJ20 2023-03-15 02:23:41 +01:00
parent edec4a2323
commit c9ed3333f1

View File

@ -2,17 +2,34 @@
use std::str::Chars;
/// A location in the source text.
///
/// `index` is the offset used to slice the source text (see
/// `Parser::token_string`); `line` and `col` are copied from the lexer's own
/// counters.
// Only one derive attribute is kept: deriving `Debug` twice is a conflicting
// implementation error, and `Clone` is needed so positions can be copied into
// tokens and nodes.
#[derive(Debug, Clone)]
struct Position {
    pub index: usize,
    pub line: i32,
    pub col: i32,
}
impl Position {
pub fn new(index: usize, line: i32, col: i32) -> Self {
Self { index, line, col }
}
}
#[derive(Debug, PartialEq)]
enum TokenType {
InvalidChar,
MalformedString,
MalformedComment,
Id,
Int,
Decimal,
String,
False,
True,
Let,
Mut,
If,
Else,
While,
@ -20,20 +37,36 @@ enum TokenType {
In,
Break,
Continue,
Function,
Fn,
Return,
End,
Not,
And,
Or,
Underscore,
Plus,
Minus,
Asterisk,
Slash,
Percent,
DoubleAsterisk,
Equal,
Exclamation,
LessThan,
GreaterThan,
PlusEqual,
MinusEqual,
AsteriskEqual,
SlashEqual,
PercentEqual,
DoubleAsteriskEqual,
EqualEqual,
ExclamationEqual,
LessThanEqual,
GreaterThanEqual,
LParen,
RParen,
LBrace,
@ -44,20 +77,14 @@ enum TokenType {
Comma,
Colon,
Semicolon,
}
#[derive(Debug)]
struct Position {
index: usize,
line: i32,
col: i32,
Ampersand,
}
#[derive(Debug)]
struct Token {
token_type: TokenType,
pos: Position,
length: usize,
pub token_type: TokenType,
pub pos: Position,
pub length: usize,
}
struct Lexer<'a> {
@ -98,20 +125,42 @@ impl<'a> Lexer<'a> {
'-' => {
Some(self.single_or_double_char_token(TokenType::Minus, '=', TokenType::MinusEqual))
}
'*' => Some(self.single_or_double_char_token(
TokenType::Asterisk,
'=',
TokenType::AsteriskEqual,
)),
'*' => Some(self.asterisk_token()),
'/' => self.slash_token(),
'%' => Some(self.single_or_double_char_token(
TokenType::Percent,
'=',
TokenType::PercentEqual,
)),
'=' => {
Some(self.single_or_double_char_token(TokenType::Equal, '=', TokenType::EqualEqual))
}
'!' => Some(self.single_or_double_char_token(
TokenType::Exclamation,
'=',
TokenType::ExclamationEqual,
)),
'<' => Some(self.single_or_double_char_token(
TokenType::LessThan,
'=',
TokenType::LessThanEqual,
)),
'>' => Some(self.single_or_double_char_token(
TokenType::GreaterThan,
'=',
TokenType::GreaterThanEqual,
)),
'(' => Some(self.step_and_token(TokenType::LParen, self.pos())),
')' => Some(self.step_and_token(TokenType::LParen, self.pos())),
')' => Some(self.step_and_token(TokenType::RParen, self.pos())),
'{' => Some(self.step_and_token(TokenType::LBrace, self.pos())),
'}' => Some(self.step_and_token(TokenType::RBrace, self.pos())),
'[' => Some(self.step_and_token(TokenType::LBracket, self.pos())),
']' => Some(self.step_and_token(TokenType::RBracket, self.pos())),
'.' => Some(self.dot_token()),
',' => Some(self.step_and_token(TokenType::Comma, self.pos())),
':' => Some(self.step_and_token(TokenType::Colon, self.pos())),
';' => Some(self.step_and_token(TokenType::Semicolon, self.pos())),
'&' => Some(self.step_and_token(TokenType::Ampersand, self.pos())),
_ => Some(self.step_and_token(TokenType::InvalidChar, self.pos())),
}
}
@ -153,22 +202,27 @@ impl<'a> Lexer<'a> {
while !self.done() && matches!(self.current(), 'a'..='z' | 'A'..='Z' | '0'..='9' | '_') {
self.step();
}
match &self.text[start.index..self.index] {
"false" => self.token(TokenType::False, start),
"true" => self.token(TokenType::True, start),
"if" => self.token(TokenType::True, start),
"else" => self.token(TokenType::True, start),
"while" => self.token(TokenType::True, start),
"for" => self.token(TokenType::True, start),
"in" => self.token(TokenType::True, start),
"break" => self.token(TokenType::True, start),
"continue" => self.token(TokenType::True, start),
"function" => self.token(TokenType::True, start),
"return" => self.token(TokenType::True, start),
"end" => self.token(TokenType::True, start),
"underscore" => self.token(TokenType::True, start),
_ => self.token(TokenType::Id, start),
}
self.token(
match &self.text[start.index..self.index] {
"false" => TokenType::False,
"true" => TokenType::True,
"let" => TokenType::Let,
"mut" => TokenType::Mut,
"if" => TokenType::If,
"else" => TokenType::Else,
"while" => TokenType::While,
"for" => TokenType::For,
"in" => TokenType::In,
"break" => TokenType::Break,
"continue" => TokenType::Continue,
"fn" => TokenType::Fn,
"return" => TokenType::Return,
"end" => TokenType::End,
"_" => TokenType::Underscore,
_ => TokenType::Id,
},
start,
)
}
fn single_or_double_char_token(
@ -186,6 +240,23 @@ impl<'a> Lexer<'a> {
}
}
/// Lexes one of `*`, `*=`, `**` or `**=`.
///
/// Dispatched to when the current character is `'*'`. Steps past that
/// character, then inspects up to two more characters to pick the token type.
fn asterisk_token(&mut self) -> Token {
    let start = self.pos();
    self.step();
    // Optional second `*`.
    let doubled = !self.done() && self.current() == '*';
    if doubled {
        self.step();
    }
    // Optional trailing `=` turns either form into its assignment variant.
    let assigns = !self.done() && self.current() == '=';
    match (doubled, assigns) {
        (true, true) => self.step_and_token(TokenType::DoubleAsteriskEqual, start),
        (true, false) => self.token(TokenType::DoubleAsterisk, start),
        (false, true) => self.step_and_token(TokenType::AsteriskEqual, start),
        (false, false) => self.token(TokenType::Asterisk, start),
    }
}
fn slash_token(&mut self) -> Option<Token> {
let start = self.pos();
self.step();
@ -255,7 +326,7 @@ impl<'a> Lexer<'a> {
}
}
fn pos(&self) -> Position {
pub fn pos(&self) -> Position {
Position {
index: self.index,
line: self.line,
@ -268,7 +339,7 @@ impl<'a> Lexer<'a> {
}
fn current(&self) -> char {
self.current_char.expect("done() not checked")
self.current_char.expect("done() checked")
}
fn step(&mut self) {
@ -293,17 +364,508 @@ impl<'a> Iterator for Lexer<'a> {
}
}
/// A value paired with a source position.
///
/// The parser builds nodes through `Parser::node` / `Parser::ok_node`, which
/// fill `pos` with whatever `Lexer::pos()` returns at the moment the node is
/// created.
#[derive(Debug)]
struct Node<T> {
/// The wrapped payload (an `Expr` for every node built in this file).
pub value: T,
/// Position recorded when the node was constructed.
pub pos: Position,
}
/// Expression forms of the syntax tree produced by `Parser`.
#[derive(Debug)]
enum Expr {
/// The unit value, produced by `parse_unit_group_or_tuple`.
Unit,
/// An identifier, holding the identifier's source text.
Id(String),
/// An integer literal.
Int(i64),
/// A floating-point literal.
Float(f64),
/// A string literal.
String(String),
/// `true` or `false`.
Bool(bool),
/// An array of element expressions.
Array(Vec<Node<Expr>>),
/// An object made of entries.
Object(Vec<ObjectEntry>),
/// A tuple of element expressions.
Tuple(Vec<Node<Expr>>),
/// Member access: `subject.value`.
Member {
subject: Box<Node<Expr>>,
value: String,
},
/// Indexing: `subject[value]`.
Index {
subject: Box<Node<Expr>>,
value: Box<Node<Expr>>,
},
/// Call: `subject(arguments...)`.
Call {
subject: Box<Node<Expr>>,
arguments: Vec<Node<Expr>>,
},
/// A prefix operator applied to `subject`.
Unary {
unary_type: UnaryType,
subject: Box<Node<Expr>>,
},
/// An infix operator applied to `left` and `right`.
Binary {
binary_type: BinaryType,
left: Box<Node<Expr>>,
right: Box<Node<Expr>>,
},
}
/// One entry of an `Expr::Object`.
#[derive(Debug)]
enum ObjectEntry {
/// A pair of two expressions, presumably key then value.
// NOTE(review): the second element is a bare `Expr` while the first is a
// `Node<Expr>` - confirm whether the missing position is intentional.
Pair(Box<Node<Expr>>, Box<Expr>),
}
/// Prefix operators recognised by `Parser::parse_prec_unary`.
#[derive(Debug)]
enum UnaryType {
/// Built for a `TokenType::Not` token.
Not,
/// Built for a leading `-`.
Negate,
/// Built for a leading `&`.
Reference,
/// Built for a leading `&` followed by `mut`.
ReferenceMut,
/// Built for a leading `*`.
Dereference,
}
/// Infix operators that can appear in `Expr::Binary`.
#[derive(Debug)]
enum BinaryType {
/// Exponentiation.
Exponentiate,
/// `*`
Multiply,
/// `/`
Divide,
/// `%`
Modulo,
/// `+`
Add,
/// `-`
Subtract,
/// `<`
LT,
/// `<=`
LTE,
/// `>`
GT,
/// `>=`
GTE,
/// `in`
In,
/// `==`
Equal,
/// `!=`
Inequal,
/// Built for a `TokenType::And` token.
And,
/// Built for a `TokenType::Or` token.
Or,
}
/// Error returned by the parser.
#[derive(Debug)]
struct ParserError {
// Lexer position at the time the error was created (see `Parser::error`).
pos: Position,
// Human-readable description of what was expected.
message: String,
}
/// Expression parser that pulls tokens from a `Lexer` one at a time.
struct Parser<'a> {
// Source text; sliced by `token_string` to recover a token's characters.
text: &'a str,
// Token source; also queried for positions when building nodes and errors.
lexer: Lexer<'a>,
// The token currently under inspection, or `None` once the lexer is exhausted.
current_token: Option<Token>,
}
impl<'a> Parser<'a> {
/// Creates a parser over `text`, immediately pulling the first token from
/// `lexer` so that `current_token` is primed.
pub fn new(text: &'a str, mut lexer: Lexer<'a>) -> Self {
    let current_token = lexer.next();
    Self {
        text,
        lexer,
        current_token,
    }
}
/// Parses one expression, starting at the `or` level, which is the outermost
/// level of the `parse_prec_*` chain.
pub fn parse_expr(&mut self) -> Result<Node<Expr>, ParserError> {
    let expression = self.parse_prec_or()?;
    Ok(expression)
}
/// Parses a chain of `Or`-separated operands, folding them to the left.
fn parse_prec_or(&mut self) -> Result<Node<Expr>, ParserError> {
    let mut lhs = self.parse_prec_and()?;
    while !self.done() && self.current_is(TokenType::Or) {
        self.step();
        let rhs = self.parse_prec_and()?;
        lhs = self.node(Expr::Binary {
            binary_type: BinaryType::Or,
            left: Box::new(lhs),
            right: Box::new(rhs),
        });
    }
    Ok(lhs)
}
/// Parses a chain of `And`-separated operands, folding them to the left.
fn parse_prec_and(&mut self) -> Result<Node<Expr>, ParserError> {
    let mut lhs = self.parse_prec_equal_inequal()?;
    while !self.done() && self.current_is(TokenType::And) {
        self.step();
        let rhs = self.parse_prec_equal_inequal()?;
        lhs = self.node(Expr::Binary {
            binary_type: BinaryType::And,
            left: Box::new(lhs),
            right: Box::new(rhs),
        });
    }
    Ok(lhs)
}
/// Parses a chain of `==` / `!=` comparisons, folding them to the left.
fn parse_prec_equal_inequal(&mut self) -> Result<Node<Expr>, ParserError> {
    let mut lhs = self.parse_prec_lt_lte_gt_gte_in()?;
    while !self.done() {
        // Map the current token to its operator; anything else ends the chain.
        let binary_type = match self.current().token_type {
            TokenType::EqualEqual => BinaryType::Equal,
            TokenType::ExclamationEqual => BinaryType::Inequal,
            _ => break,
        };
        self.step();
        let rhs = self.parse_prec_lt_lte_gt_gte_in()?;
        lhs = self.node(Expr::Binary {
            binary_type,
            left: Box::new(lhs),
            right: Box::new(rhs),
        });
    }
    Ok(lhs)
}
/// Parses a chain of `<`, `>`, `<=`, `>=` and `in` operators, folding them to
/// the left.
fn parse_prec_lt_lte_gt_gte_in(&mut self) -> Result<Node<Expr>, ParserError> {
    let mut lhs = self.parse_prec_add_subtract()?;
    while !self.done() {
        // Map the current token to its operator; anything else ends the chain.
        let binary_type = match self.current().token_type {
            TokenType::LessThan => BinaryType::LT,
            TokenType::GreaterThan => BinaryType::GT,
            TokenType::LessThanEqual => BinaryType::LTE,
            TokenType::GreaterThanEqual => BinaryType::GTE,
            TokenType::In => BinaryType::In,
            _ => break,
        };
        self.step();
        let rhs = self.parse_prec_add_subtract()?;
        lhs = self.node(Expr::Binary {
            binary_type,
            left: Box::new(lhs),
            right: Box::new(rhs),
        });
    }
    Ok(lhs)
}
/// Parses a chain of `+` / `-` operators, folding them to the left.
fn parse_prec_add_subtract(&mut self) -> Result<Node<Expr>, ParserError> {
    let mut lhs = self.parse_prec_multiply_divide_modulo()?;
    while !self.done() {
        // Map the current token to its operator; anything else ends the chain.
        let binary_type = match self.current().token_type {
            TokenType::Plus => BinaryType::Add,
            TokenType::Minus => BinaryType::Subtract,
            _ => break,
        };
        self.step();
        let rhs = self.parse_prec_multiply_divide_modulo()?;
        lhs = self.node(Expr::Binary {
            binary_type,
            left: Box::new(lhs),
            right: Box::new(rhs),
        });
    }
    Ok(lhs)
}
/// Parses a chain of `*`, `/` and `%` operators, folding them to the left.
fn parse_prec_multiply_divide_modulo(&mut self) -> Result<Node<Expr>, ParserError> {
    let mut lhs = self.parse_prec_unary()?;
    while !self.done() {
        // Map the current token to its operator; anything else ends the chain.
        let binary_type = match self.current().token_type {
            TokenType::Asterisk => BinaryType::Multiply,
            TokenType::Slash => BinaryType::Divide,
            TokenType::Percent => BinaryType::Modulo,
            _ => break,
        };
        self.step();
        let rhs = self.parse_prec_unary()?;
        lhs = self.node(Expr::Binary {
            binary_type,
            left: Box::new(lhs),
            right: Box::new(rhs),
        });
    }
    Ok(lhs)
}
/// Parses prefix operators (`Not`, `-`, `&`, `& mut`, `*`), recursing so they
/// can be stacked. Without a prefix operator it falls through to the
/// exponentiation level.
fn parse_prec_unary(&mut self) -> Result<Node<Expr>, ParserError> {
    if self.done() {
        return self.parse_prec_exponentiate();
    }
    let prefix = match self.current().token_type {
        TokenType::Not => UnaryType::Not,
        TokenType::Minus => UnaryType::Negate,
        TokenType::Ampersand => UnaryType::Reference,
        TokenType::Asterisk => UnaryType::Dereference,
        _ => return self.parse_prec_exponentiate(),
    };
    // Consume the operator token.
    self.step();
    // `&` immediately followed by `mut` upgrades to a mutable reference.
    let unary_type = match prefix {
        UnaryType::Reference if !self.done() && self.current_is(TokenType::Mut) => {
            self.step();
            UnaryType::ReferenceMut
        }
        other => other,
    };
    let subject = Box::new(self.parse_prec_unary()?);
    self.ok_node(Expr::Unary {
        unary_type,
        subject,
    })
}
/// Parses `left ** right`.
///
/// The right operand is parsed by recursing into this same level, so
/// `a ** b ** c` groups as `a ** (b ** c)`.
///
/// # Errors
///
/// Propagates any error from parsing either operand.
fn parse_prec_exponentiate(&mut self) -> Result<Node<Expr>, ParserError> {
    let left = self.parse_prec_member_index_call()?;
    // The operator is `**` (`DoubleAsterisk`), not `*=`.
    if self.done() || !self.current_is(TokenType::DoubleAsterisk) {
        return Ok(left);
    }
    // Consume `**` *before* parsing the right operand; otherwise the operand
    // parser would be handed the operator token itself.
    self.step();
    let right = self.parse_prec_exponentiate()?;
    self.ok_node(Expr::Binary {
        binary_type: BinaryType::Exponentiate,
        left: Box::new(left),
        right: Box::new(right),
    })
}
/// Parses an operand followed by any number of postfix forms:
/// member access (`.name`), indexing (`[expr]`) and calls (`(args, ...)`).
///
/// Call argument lists accept an optional trailing comma.
///
/// # Errors
///
/// Returns an error when `.` is not followed by an identifier, when an index
/// is not closed by `]`, when a call is not closed by `)`, or when any nested
/// expression fails to parse.
fn parse_prec_member_index_call(&mut self) -> Result<Node<Expr>, ParserError> {
    let mut subject = self.parse_operand()?;
    while !self.done() {
        if self.current_is(TokenType::Dot) {
            self.step();
            // Reject anything that is NOT an identifier after the dot.
            if self.done() || !self.current_is(TokenType::Id) {
                return self.error("expected identifier");
            }
            let value = self.token_string(self.current());
            self.step();
            subject = self.node(Expr::Member {
                subject: Box::new(subject),
                value,
            });
        } else if self.current_is(TokenType::LBracket) {
            self.step();
            let value = self.parse_expr()?;
            if self.done() || !self.current_is(TokenType::RBracket) {
                return self.error("expected ']'");
            }
            // Consume the closing `]` so the loop continues after it.
            self.step();
            subject = self.node(Expr::Index {
                subject: Box::new(subject),
                value: Box::new(value),
            });
        } else if self.current_is(TokenType::LParen) {
            self.step();
            let mut arguments = Vec::<Node<Expr>>::new();
            if !self.done() && !self.current_is(TokenType::RParen) {
                arguments.push(self.parse_expr()?);
                while !self.done() && self.current_is(TokenType::Comma) {
                    self.step();
                    // Trailing comma (or premature end of input): stop
                    // collecting and let the `)` check below decide.
                    if self.done() || self.current_is(TokenType::RParen) {
                        break;
                    }
                    arguments.push(self.parse_expr()?);
                }
            }
            if self.done() || !self.current_is(TokenType::RParen) {
                return self.error("expected ')'");
            }
            self.step();
            subject = self.node(Expr::Call {
                subject: Box::new(subject),
                arguments,
            })
        } else {
            break;
        }
    }
    Ok(subject)
}
/// Parses a primary expression: identifier, number, boolean, or one of the
/// bracketed / `fn` forms, which are delegated to their own parsers.
///
/// An `Int` token immediately followed by a `Decimal` token is combined into
/// a single float literal.
///
/// # Errors
///
/// Returns an error at end of input, on a token that cannot start a value,
/// or when a numeric literal's text cannot be converted (for example an
/// integer that does not fit in `i64`). Numeric conversion failures are
/// reported as errors rather than panics because the text comes from the
/// parsed input.
fn parse_operand(&mut self) -> Result<Node<Expr>, ParserError> {
    if self.done() {
        return self.error("expected value, got eof");
    }
    match self.current().token_type {
        TokenType::Id => self.step_and_ok_node(Expr::Id(self.token_string(self.current()))),
        TokenType::Int => {
            let mut value_string = self.token_string(self.current());
            self.step();
            if !self.done() && self.current_is(TokenType::Decimal) {
                value_string.push_str(&self.token_string(self.current()));
                match value_string.parse::<f64>() {
                    Ok(value) => self.step_and_ok_node(Expr::Float(value)),
                    Err(_) => self.error("invalid float literal"),
                }
            } else {
                match value_string.parse::<i64>() {
                    Ok(value) => self.ok_node(Expr::Int(value)),
                    Err(_) => self.error("invalid integer literal"),
                }
            }
        }
        TokenType::Decimal => match self.token_string(self.current()).parse::<f64>() {
            Ok(value) => self.step_and_ok_node(Expr::Float(value)),
            Err(_) => self.error("invalid float literal"),
        },
        TokenType::False => self.step_and_ok_node(Expr::Bool(false)),
        TokenType::True => self.step_and_ok_node(Expr::Bool(true)),
        TokenType::LParen => self.parse_unit_group_or_tuple(),
        TokenType::LBrace => self.parse_object(),
        TokenType::LBracket => self.parse_array(),
        TokenType::Fn => self.parse_function(),
        _ => self.error("expected value"),
    }
}
/// Parses the forms that start with `(`.
///
/// Only the unit value `()` is implemented; a parenthesised group or tuple
/// still hits `todo!()`.
///
/// # Errors
///
/// Returns an error if the input ends right after `(`.
///
/// # Panics
///
/// Panics via `todo!()` when `(` is followed by anything other than `)`.
fn parse_unit_group_or_tuple(&mut self) -> Result<Node<Expr>, ParserError> {
    // Skip the opening `(` that the caller dispatched on.
    self.step();
    if self.done() {
        return self.error("expected ')'");
    }
    // `()` is the unit value: the token to look for is the CLOSING paren.
    if self.current_is(TokenType::RParen) {
        return self.step_and_ok_node(Expr::Unit);
    }
    todo!()
}
/// Placeholder for object parsing; `parse_operand` dispatches here on `LBrace`.
///
/// # Panics
///
/// Always panics via `todo!()`: not implemented yet.
fn parse_object(&mut self) -> Result<Node<Expr>, ParserError> {
todo!()
}
/// Placeholder for array parsing; `parse_operand` dispatches here on `LBracket`.
///
/// # Panics
///
/// Always panics via `todo!()`: not implemented yet.
fn parse_array(&mut self) -> Result<Node<Expr>, ParserError> {
todo!()
}
/// Placeholder for function parsing; `parse_operand` dispatches here on `Fn`.
///
/// # Panics
///
/// Always panics via `todo!()`: not implemented yet.
fn parse_function(&mut self) -> Result<Node<Expr>, ParserError> {
todo!()
}
/// Returns an owned copy of the source text covered by `token`, i.e. the
/// `token.length` bytes starting at `token.pos.index`.
fn token_string(&self, token: &Token) -> String {
    let begin = token.pos.index;
    let end = begin + token.length;
    String::from(&self.text[begin..end])
}
fn step_and_ok_node<T>(&mut self, value: T) -> Result<Node<T>, ParserError> {
self.step();
self.ok_node(value)
}
/// Wraps `value` in an `Ok(Node)` stamped with the lexer's current position.
fn ok_node<T>(&self, value: T) -> Result<Node<T>, ParserError> {
    let pos = self.lexer.pos();
    Ok(Node { value, pos })
}
fn step_and_node<T>(&mut self, value: T) -> Node<T> {
self.step();
self.node(value)
}
/// Wraps `value` in a `Node` stamped with the lexer's current position.
fn node<T>(&self, value: T) -> Node<T> {
    let pos = self.lexer.pos();
    Node { value, pos }
}
/// Advances to the next token and hands `value` back unchanged.
fn step_and<T>(&mut self, value: T) -> T {
    let passthrough = value;
    self.step();
    passthrough
}
/// Builds an `Err(ParserError)` carrying `message` and the lexer's current
/// position.
fn error(&self, message: &str) -> Result<Node<Expr>, ParserError> {
    let pos = self.lexer.pos();
    let failure = ParserError {
        pos,
        message: message.to_string(),
    };
    Err(failure)
}
/// Reports whether the token stream is exhausted (no current token).
fn done(&self) -> bool {
    matches!(self.current_token, None)
}
/// Reports whether the current token has type `token_type`.
///
/// # Panics
///
/// Panics if there is no current token; check `done()` first.
fn current_is(&self, token_type: TokenType) -> bool {
    let actual = &self.current().token_type;
    *actual == token_type
}
/// Borrows the current token.
///
/// # Panics
///
/// Panics with `"done() checked"` if there is no current token; callers are
/// expected to have checked `done()` beforehand.
fn current(&self) -> &Token {
    match self.current_token.as_ref() {
        Some(token) => token,
        None => panic!("done() checked"),
    }
}
/// Replaces the current token with the next one from the lexer (`None` once
/// the lexer is exhausted).
fn step(&mut self) {
    let upcoming = self.lexer.next();
    self.current_token = upcoming;
}
}
fn main() {
let text = "3.14 \"foo\" false true ( ) + += /* 1 /* 2 */ 3 */ // 4 \n 5";
println!("tokens = [");
let text = "** 3.14 \"foo\" false true ( ) + += /* 1 /* 2 */ 3 */ // 4 \n 5";
let lexer = Lexer::new(text);
lexer.for_each(|token| {
println!("{:?}", token);
})
println!(" {:?},", token);
});
println!("]");
let text2 = "1 + 2 * 3";
let mut parser = Parser::new(text2, Lexer::new(text2));
let expr = parser.parse_expr();
println!("ast = {:#?}", expr);
}