From b88ea4bad1d5081b541cbd4429dcfe371e238d32 Mon Sep 17 00:00:00 2001 From: Theis Pieter Hollebeek Date: Tue, 7 Feb 2023 12:47:18 +0100 Subject: [PATCH] work-in-progress lexer rewrite --- src/bong/lexer.rs | 550 ++++++++++++++++++++-------------------------- 1 file changed, 237 insertions(+), 313 deletions(-) diff --git a/src/bong/lexer.rs b/src/bong/lexer.rs index ed12127..5119103 100644 --- a/src/bong/lexer.rs +++ b/src/bong/lexer.rs @@ -1,17 +1,15 @@ -#[derive(Debug, Clone, PartialEq)] -pub enum ErrorType { - UnexpectedToken(char), - InvalidConstructor, +#![allow(dead_code)] + +use std::iter::Peekable; + +#[derive(PartialEq, Eq, Debug)] +pub struct TokenError { + error: String, + line: usize, + col: usize, } -#[derive(Debug, Clone, PartialEq)] -pub struct Error { - error: ErrorType, - line: isize, - col: isize, -} - -#[derive(Debug, Clone, PartialEq)] +#[derive(PartialEq, Eq, Debug)] pub enum Token { Name(String), Id(String), @@ -19,326 +17,252 @@ pub enum Token { SlWhitespace(String), MlWhitespace(String), SlComment(String), - MlComment(String), // not implemented - Int(String), // not implemented - Float(String), // not implemented + MlComment(String), + Int(String), + Float(String), String(String), - Null(String), // not implemented - True(String), // not implemented - False(String), // not implemented + Null(String), + True(String), + False(String), LBrace(String), RBrace(String), LBracket(String), RBracket(String), - Equal(String), // not implemented - Colon(String), // not implemented - SemiColon(String), // not implemented - Comma(String), // not implemented + Equal(String), + Colon(String), + SemiColon(String), + Comma(String), + Error(TokenError), } -#[derive(PartialEq)] -enum Mode { - Name, - Class, - Id, - String, - EscapedString, - SlWhitespace, - MlWhitespace, - SlComment, -} - -impl Mode { - fn token_constructor(&self) -> Result Token>, ErrorType> { - match self { - Mode::Name => Ok(Box::new(Token::Name)), - Mode::Class => Ok(Box::new(Token::Class)), - Mode::String => Ok(Box::new(Token::String)), - Mode::SlWhitespace => Ok(Box::new(Token::SlWhitespace)), - Mode::MlWhitespace => Ok(Box::new(Token::MlWhitespace)), - Mode::SlComment => Ok(Box::new(Token::SlComment)), - Mode::Id => Ok(Box::new(Token::Id)), - Mode::EscapedString => Err(ErrorType::InvalidConstructor), +fn make_keyword_or_name>( + iter: &mut Peekable, + line: &mut usize, +) -> Token { + let mut result: Vec = Vec::new(); + loop { + match iter.peek() { + Some('A'..='Z' | 'a'..='z') => { + *line += 1; + let c = iter + .next() + .expect("iterator should not be mutated between peek & next"); + result.push(c); + } + Some('0'..='9') => { + // we assert instead of returning an error because this means the lexer is written incorrectly + assert_ne!(result.len(), 0); + *line += 1; + let c = iter + .next() + .expect("iterator should not be mutated between peek & next"); + result.push(c); + } + _ => { + break match String::from_iter(result).as_str() { + s @ "null" => Token::Null(s.to_string()), + s @ "true" => Token::True(s.to_string()), + s @ "false" => Token::False(s.to_string()), + name => Token::Name(name.to_string()), + } + } } } } -fn collect_into_token_and_push( - constructor: &dyn Fn(String) -> Token, - tokens: &mut Vec, - value: &mut Vec, -) { - let token = constructor(value.iter().collect()); - tokens.push(token); - value.clear(); -} - -#[allow(dead_code)] -pub fn lexer(code_to_lex: &str) -> Result, Error> { - let mut tokens = Vec::new(); - let mut value = Vec::new(); - let mut mode = Mode::SlWhitespace; - let mut line = 0; - let mut col = 0; - let position_map = move |error: ErrorType| Error { error, line, col }; - for current_char in code_to_lex.chars() { - match current_char { - v @ ('.' | '#') => { - match mode { - m @ (Mode::Name - | Mode::Class - | Mode::Id - | Mode::SlWhitespace - | Mode::MlWhitespace) => { - collect_into_token_and_push( - &m.token_constructor().map_err(position_map)?, - &mut tokens, - &mut value, - ); - mode = match v { - '.' => Mode::Class, - '#' => Mode::Id, - _ => panic!("race condition"), - }; - } - Mode::String | Mode::SlComment => {} - Mode::EscapedString => { - return Err(Error { - line, - col, - error: ErrorType::UnexpectedToken('.'), - }) - } - }; - value.push(v); +fn make_number>( + iter: &mut Peekable, + line: &mut usize, + col: &mut usize, +) -> Result { + let mut result: Vec = Vec::new(); + loop { + let next = iter.peek(); + match next { + Some('0'..='9') => { + *line += 1; + let c = iter + .next() + .expect("iterator should not be mutated between peek & next"); + result.push(c); } - '\\' => match mode { - Mode::String => { - value.push('\\'); - mode = Mode::EscapedString; - } - _ => { - return Err(Error { - line, - col, - error: ErrorType::UnexpectedToken('\\'), - }) - } - }, - '"' => { - match mode { - m @ Mode::String => { - mode = Mode::SlWhitespace; - value.push('"'); - - collect_into_token_and_push( - &m.token_constructor().map_err(position_map)?, - &mut tokens, - &mut value, - ); - } - m @ (Mode::SlWhitespace | Mode::MlWhitespace) => { - mode = Mode::String; - collect_into_token_and_push( - &m.token_constructor().map_err(position_map)?, - &mut tokens, - &mut value, - ); - value.push('"'); - } - Mode::EscapedString => { - value.push('"'); - mode = Mode::String; - } - Mode::SlComment => { - value.push('"'); - } - _ => { - return Err(Error { - line, - col, - error: ErrorType::UnexpectedToken('"'), - }) - } - }; - } - - v @ ('{' | '}' | '[' | ']') => match mode { - m @ (Mode::Name - | Mode::Class - | Mode::Id - | Mode::MlWhitespace - | Mode::SlWhitespace) => { - collect_into_token_and_push( - &m.token_constructor().map_err(position_map)?, - &mut tokens, - &mut value, - ); - mode = Mode::SlWhitespace; - let constructor = match v { - '{' => Token::LBrace, - '}' => Token::RBrace, - '[' => Token::LBracket, - ']' => Token::RBracket, - _ => panic!("race condition"), - }; - tokens.push(constructor(String::from(v))); - } - Mode::EscapedString => { - return Err(Error { - line, - col, - error: ErrorType::UnexpectedToken(v), - }) - } - Mode::String | Mode::SlComment => { - value.push(v); - } - }, - c @ (' ' | '\r') => { - match mode { - m @ (Mode::Name | Mode::Class | Mode::Id) => { - collect_into_token_and_push( - &m.token_constructor().map_err(position_map)?, - &mut tokens, - &mut value, - ); - mode = Mode::SlWhitespace; - } - Mode::String | Mode::SlComment | Mode::MlWhitespace | Mode::SlWhitespace => {} - Mode::EscapedString => { - return Err(Error { - line, - col, - error: ErrorType::UnexpectedToken(c), - }) - } - }; - value.push(c); - } - c @ '\n' => { - match mode { - m @ (Mode::Name | Mode::Class | Mode::Id | Mode::SlComment) => { - collect_into_token_and_push( - &m.token_constructor().map_err(position_map)?, - &mut tokens, - &mut value, - ); - mode = Mode::MlWhitespace; - } - Mode::MlWhitespace | Mode::SlWhitespace => { - mode = Mode::MlWhitespace; - } - Mode::String => {} - Mode::EscapedString => { - return Err(Error { - line, - col, - error: ErrorType::UnexpectedToken('\n'), - }) - } - }; - value.push(c); - line += 1; - col = -1; - } - '/' => { - match mode { - Mode::String | Mode::SlComment => {} - m @ (Mode::Name - | Mode::Class - | Mode::Id - | Mode::SlWhitespace - | Mode::MlWhitespace) => { - collect_into_token_and_push( - &m.token_constructor().map_err(position_map)?, - &mut tokens, - &mut value, - ); - mode = Mode::SlComment; - } - Mode::EscapedString => { - return Err(Error { - line, - col, - error: ErrorType::UnexpectedToken('/'), - }) - } - }; - value.push('/'); - } - v @ ('A'..='Z' | 'a'..='z' | '0'..='9') => { - match mode { - Mode::Name | Mode::Class | Mode::Id => { - if v.is_numeric() - && (value.is_empty() || mode == Mode::Id && value.len() == 1) - { - return Err(Error { - line, - col, - error: ErrorType::UnexpectedToken(v), - }); - } - } - Mode::String | Mode::SlComment => {} - m @ (Mode::SlWhitespace | Mode::MlWhitespace) => { - collect_into_token_and_push( - &m.token_constructor().map_err(position_map)?, - &mut tokens, - &mut value, - ); - mode = Mode::Name; - } - Mode::EscapedString => { - return Err(Error { - line, - col, - error: ErrorType::UnexpectedToken(v), - }) - } - }; - value.push(v); - } - unrecognized_char => match mode { - Mode::String => { - value.push(unrecognized_char); - } - _ => { - return Err(Error { - line, - col, - error: ErrorType::UnexpectedToken(unrecognized_char), + Some('.') => { + *line += 1; + if result.contains(&'.') { + iter.next(); + return Err(TokenError { + error: "unexpected token".to_string(), + col: *col, + line: *line, }); + } else { + iter.next() + .expect("iterator should not be mutated between peek & next"); + result.push('.'); } - }, + } + _ => { + break Ok(if result.contains(&'.') { + Token::Float(String::from_iter(result)) + } else { + Token::Int(String::from_iter(result)) + }) + } } - col += 1; } - - Ok(tokens) } -#[test] -fn test_example_1() { - let text = "text.title { +fn make_string>( + iter: &mut T, + line: &mut usize, + col: &mut usize, +) -> Result { + let mut result: Vec = Vec::new(); + let mut escaped = false; + iter.next().expect("opening quote should exist"); + loop { + let next = iter.next().ok_or(TokenError { + error: "unexpected end of string".to_string(), + line: *line, + col: *col, + })?; + match next { + '\\' => { + *line += 1; + escaped = !escaped; + result.push('\\'); + } + '"' => { + *line += 1; + if escaped { + result.push('"'); + escaped = false; + } else { + break Ok(Token::String(String::from_iter(result))); + } + } + '\n' => { + *line = 0; + *col += 1; + result.push('\n'); + } + c => { + *line += 1; + escaped = false; + result.push(c); + } + } + } +} + +fn lexer(code: &str) -> Vec { + let mut tokens = Vec::new(); + let mut iter = code.chars().peekable(); + let mut col = 0; + let mut line = 0; + loop { + let char = if let Some(c) = iter.peek() { + c + } else { + break tokens; + }; + + match char { + '"' => { + let token = match make_string(&mut iter, &mut line, &mut col) { + Ok(token) => token, + Err(err) => Token::Error(err), + }; + tokens.push(token); + } + '0'..='9' => { + let token = match make_number(&mut iter, &mut line, &mut col) { + Ok(token) => token, + Err(err) => Token::Error(err), + }; + tokens.push(token); + } + 'a'..='z' | 'A'..='Z' => { + let token = make_keyword_or_name(&mut iter, &mut line); + tokens.push(token); + } + ' ' => { + let mut result: Vec = Vec::new(); + let token = loop { + let next = iter.peek(); + match next { + Some(' ' | '\n' | '\r') => { + let c = iter + .next() + .expect("should not mutate between next & unwrap"); + result.push(c); + } + _ => { + break if result.contains(&'\n') { + Token::MlWhitespace(String::from_iter(result)) + } else { + Token::SlWhitespace(String::from_iter(result)) + } + } + }; + }; + tokens.push(token); + } + _ => todo!(), + } + } +} + +#[cfg(test)] +mod tests { + use crate::bong::lexer::{lexer, Token}; + + #[test] + fn name_keyword_number_whitespace() { + let result = lexer("abcd \n 1234 12.34 true false \"string\""); + + let space = |c: &str| -> Token { Token::SlWhitespace(c.to_string()) }; + + assert_eq!( + result, + vec![ + Token::Name("abcd".to_string()), + Token::MlWhitespace(" \n ".to_string()), + Token::Int("1234".to_string()), + space(" "), + Token::Float("12.34".to_string()), + space(" "), + Token::True("true".to_string()), + space(" "), + Token::False("false".to_string()), + space(" "), + Token::String("string".to_string()), + ] + ) + } + + #[test] + fn test_example_1() { + let text = "text.title { // text { \"hello world\" } \"hello world\" }"; - let tokens = lexer(text); - assert_eq!( - tokens, - Ok(vec![ - Token::SlWhitespace("".to_string()), - Token::Name("text".to_string()), - Token::Class(".title".to_string()), - Token::SlWhitespace(" ".to_string()), - Token::LBrace("{".to_string()), - Token::MlWhitespace("\n ".to_string()), - Token::SlComment("// text { \"hello world\" }".to_string()), - Token::MlWhitespace("\n ".to_string()), - Token::String("\"hello world\"".to_string()), - Token::MlWhitespace("\n".to_string()), - Token::RBrace("}".to_string()), - ]) - ) + let tokens = lexer(text); + assert_eq!( + tokens, + vec![ + Token::SlWhitespace("".to_string()), + Token::Name("text".to_string()), + Token::Class(".title".to_string()), + Token::SlWhitespace(" ".to_string()), + Token::LBrace("{".to_string()), + Token::MlWhitespace("\n ".to_string()), + Token::SlComment("// text { \"hello world\" }".to_string()), + Token::MlWhitespace("\n ".to_string()), + Token::String("\"hello world\"".to_string()), + Token::MlWhitespace("\n".to_string()), + Token::RBrace("}".to_string()), + ] + ) + } }