work-in-progress lexer rewrite

This commit is contained in:
Theis Pieter Hollebeek 2023-02-07 12:47:18 +01:00
parent 5f508eee12
commit b88ea4bad1

View File

@ -1,17 +1,15 @@
#[derive(Debug, Clone, PartialEq)] #![allow(dead_code)]
pub enum ErrorType {
UnexpectedToken(char), use std::iter::Peekable;
InvalidConstructor,
#[derive(PartialEq, Eq, Debug)]
pub struct TokenError {
error: String,
line: usize,
col: usize,
} }
#[derive(Debug, Clone, PartialEq)] #[derive(PartialEq, Eq, Debug)]
pub struct Error {
error: ErrorType,
line: isize,
col: isize,
}
#[derive(Debug, Clone, PartialEq)]
pub enum Token { pub enum Token {
Name(String), Name(String),
Id(String), Id(String),
@ -19,303 +17,228 @@ pub enum Token {
SlWhitespace(String), SlWhitespace(String),
MlWhitespace(String), MlWhitespace(String),
SlComment(String), SlComment(String),
MlComment(String), // not implemented MlComment(String),
Int(String), // not implemented Int(String),
Float(String), // not implemented Float(String),
String(String), String(String),
Null(String), // not implemented Null(String),
True(String), // not implemented True(String),
False(String), // not implemented False(String),
LBrace(String), LBrace(String),
RBrace(String), RBrace(String),
LBracket(String), LBracket(String),
RBracket(String), RBracket(String),
Equal(String), // not implemented Equal(String),
Colon(String), // not implemented Colon(String),
SemiColon(String), // not implemented SemiColon(String),
Comma(String), // not implemented Comma(String),
Error(TokenError),
} }
// NOTE(review): these lines look like a flattened two-column diff. The left-hand text is the
// removed `Mode` enum and its `token_constructor` method; the right-hand text is the start of
// the added `make_keyword_or_name` function, whose closing arms appear further down the page.
//
// `make_keyword_or_name` (right-hand text):
//   * peeks at `iter` and consumes ASCII letters into `result`;
//   * also consumes ASCII digits, but asserts that `result` is non-empty first, so a leading
//     digit panics rather than producing an error token;
//   * adds 1 to `*line` for every consumed character
//     (NOTE(review): a per-character counter named `line` looks like a column - confirm);
//   * on any other peeked value it stops and, per the arm further down, returns
//     `Token::Null` / `Token::True` / `Token::False` for "null" / "true" / "false" and
//     `Token::Name` for everything else.
//
// `Mode::token_constructor` (left-hand text) returns the boxed `Token` variant constructor
// matching the mode; `Mode::EscapedString` has none and yields `ErrorType::InvalidConstructor`.
#[derive(PartialEq)] fn make_keyword_or_name<T: Iterator<Item = char>>(
enum Mode { iter: &mut Peekable<T>,
Name, line: &mut usize,
Class, ) -> Token {
Id, let mut result: Vec<char> = Vec::new();
String, loop {
EscapedString, match iter.peek() {
SlWhitespace, Some('A'..='Z' | 'a'..='z') => {
MlWhitespace, *line += 1;
SlComment, let c = iter
.next()
.expect("iterator should not be mutated between peek & next");
result.push(c);
} }
Some('0'..='9') => {
impl Mode { // we assert instead of returning an error because this means the lexer is written incorrectly
fn token_constructor(&self) -> Result<Box<dyn Fn(String) -> Token>, ErrorType> { assert_ne!(result.len(), 0);
match self { *line += 1;
Mode::Name => Ok(Box::new(Token::Name)), let c = iter
Mode::Class => Ok(Box::new(Token::Class)), .next()
Mode::String => Ok(Box::new(Token::String)), .expect("iterator should not be mutated between peek & next");
Mode::SlWhitespace => Ok(Box::new(Token::SlWhitespace)), result.push(c);
Mode::MlWhitespace => Ok(Box::new(Token::MlWhitespace)),
Mode::SlComment => Ok(Box::new(Token::SlComment)),
Mode::Id => Ok(Box::new(Token::Id)),
Mode::EscapedString => Err(ErrorType::InvalidConstructor),
}
}
}
/// Turns the buffered characters into a token and appends it to `tokens`.
///
/// The characters currently held in `value` are joined into a `String`, passed to
/// `constructor`, and the token it returns is pushed onto `tokens`. `value` is empty when
/// the function returns, ready to accumulate the next lexeme.
fn collect_into_token_and_push(
    constructor: &dyn Fn(String) -> Token,
    tokens: &mut Vec<Token>,
    value: &mut Vec<char>,
) {
    // Draining hands every buffered character to the new string and leaves the buffer empty.
    let text: String = value.drain(..).collect();
    tokens.push(constructor(text));
}
#[allow(dead_code)]
pub fn lexer(code_to_lex: &str) -> Result<Vec<Token>, Error> {
let mut tokens = Vec::new();
let mut value = Vec::new();
let mut mode = Mode::SlWhitespace;
let mut line = 0;
let mut col = 0;
let position_map = move |error: ErrorType| Error { error, line, col };
for current_char in code_to_lex.chars() {
match current_char {
v @ ('.' | '#') => {
match mode {
m @ (Mode::Name
| Mode::Class
| Mode::Id
| Mode::SlWhitespace
| Mode::MlWhitespace) => {
collect_into_token_and_push(
&m.token_constructor().map_err(position_map)?,
&mut tokens,
&mut value,
);
mode = match v {
'.' => Mode::Class,
'#' => Mode::Id,
_ => panic!("race condition"),
};
}
Mode::String | Mode::SlComment => {}
Mode::EscapedString => {
return Err(Error {
line,
col,
error: ErrorType::UnexpectedToken('.'),
})
}
};
value.push(v);
}
'\\' => match mode {
Mode::String => {
value.push('\\');
mode = Mode::EscapedString;
} }
_ => { _ => {
return Err(Error { break match String::from_iter(result).as_str() {
line, s @ "null" => Token::Null(s.to_string()),
col, s @ "true" => Token::True(s.to_string()),
error: ErrorType::UnexpectedToken('\\'), s @ "false" => Token::False(s.to_string()),
name => Token::Name(name.to_string()),
}
}
}
}
}
/// Lexes an integer or float literal from the front of `iter`.
///
/// Characters are consumed while the peeked character is an ASCII digit or a `.`; the first
/// other character (or end of input) is left unconsumed and ends the literal. The result is
/// `Token::Float` when the collected text contains a `.`, otherwise `Token::Int`.
///
/// `*line` is increased by one for every digit or `.` seen; `*col` is only read, to fill in
/// the error position.
/// NOTE(review): a counter bumped per character and named `line` looks like a column
/// counter - confirm the intended meaning of `line` / `col` against the caller.
///
/// # Errors
/// Returns a `TokenError` with the message "unexpected token" when a second `.` is met;
/// that `.` is consumed before returning.
fn make_number<T: Iterator<Item = char>>(
iter: &mut Peekable<T>,
line: &mut usize,
col: &mut usize,
) -> Result<Token, TokenError> {
let mut result: Vec<char> = Vec::new();
loop {
let next = iter.peek();
match next {
Some('0'..='9') => {
*line += 1;
let c = iter
.next()
.expect("iterator should not be mutated between peek & next");
result.push(c);
}
Some('.') => {
*line += 1;
// A literal may hold at most one `.`; a second one is consumed and reported.
if result.contains(&'.') {
iter.next();
return Err(TokenError {
error: "unexpected token".to_string(),
col: *col,
line: *line,
});
} else {
iter.next()
.expect("iterator should not be mutated between peek & next");
result.push('.');
}
}
// Anything else ends the literal without being consumed.
// NOTE(review): a trailing `.` (e.g. `1.`) is accepted and produces a `Token::Float`.
_ => {
break Ok(if result.contains(&'.') {
Token::Float(String::from_iter(result))
} else {
Token::Int(String::from_iter(result))
}) })
} }
}, }
}
}
/// Lexes a double-quoted string literal from the front of `iter`.
///
/// The first item of `iter` is taken as the opening quote and discarded (the function panics
/// if the iterator is already empty). Characters are then collected until an unescaped `"`
/// is read; the returned `Token::String` holds the text between the quotes, with backslashes
/// and escaped quotes kept as written and the surrounding quotes left out.
///
/// Position bookkeeping: `*line` is increased for every backslash, quote and ordinary
/// character; a newline sets `*line` to 0 and increases `*col`.
/// NOTE(review): that is the behaviour one would expect from a column / line pair with the
/// names swapped - confirm against the caller.
/// NOTE(review): the newline arm does not clear `escaped`, so a backslash followed by a
/// newline still escapes a quote that comes right after the newline - confirm this is intended.
///
/// # Errors
/// Returns a `TokenError` with the message "unexpected end of string" when the input ends
/// before the closing quote.
fn make_string<T: Iterator<Item = char>>(
iter: &mut T,
line: &mut usize,
col: &mut usize,
) -> Result<Token, TokenError> {
let mut result: Vec<char> = Vec::new();
let mut escaped = false;
iter.next().expect("opening quote should exist");
loop {
let next = iter.next().ok_or(TokenError {
error: "unexpected end of string".to_string(),
line: *line,
col: *col,
})?;
match next {
// Toggling (rather than setting) lets a doubled backslash cancel the escape.
'\\' => {
*line += 1;
escaped = !escaped;
result.push('\\');
}
'"' => { '"' => {
match mode { *line += 1;
m @ Mode::String => { if escaped {
mode = Mode::SlWhitespace; result.push('"');
value.push('"'); escaped = false;
} else {
break Ok(Token::String(String::from_iter(result)));
}
}
'\n' => {
*line = 0;
*col += 1;
result.push('\n');
}
c => {
*line += 1;
escaped = false;
result.push(c);
}
}
}
}
// NOTE(review): these lines look like a flattened two-column diff. The left-hand text is part
// of the removed lexer's handling of `"`; the right-hand text is the added `lexer` function
// described below.
//
// `lexer` splits `code` into tokens and returns them as a `Vec<Token>`.
//
// It walks a peekable `char` iterator and dispatches on the peeked (not yet consumed)
// character until the input is exhausted:
//   * `"`              -> `make_string`; an `Err` is stored in the output as `Token::Error`.
//   * ASCII digit      -> `make_number`; an `Err` is stored in the output as `Token::Error`.
//   * ASCII letter     -> `make_keyword_or_name`.
//   * space            -> consumes a run of ' ', '\n' and '\r'; the run becomes
//                         `Token::MlWhitespace` if it contains '\n', else `Token::SlWhitespace`.
//   * anything else    -> `todo!()`, i.e. a panic.
//
// `line` and `col` start at 0 and are handed to the helpers by mutable reference; the
// whitespace arm itself does not update them.
// NOTE(review): a whitespace run is only entered on a space, so input whose next token starts
// with '\n' or '\r' reaches the `todo!()` arm - confirm whether that is intended for this
// work-in-progress state.
collect_into_token_and_push( fn lexer(code: &str) -> Vec<Token> {
&m.token_constructor().map_err(position_map)?, let mut tokens = Vec::new();
&mut tokens, let mut iter = code.chars().peekable();
&mut value, let mut col = 0;
); let mut line = 0;
loop {
let char = if let Some(c) = iter.peek() {
c
} else {
break tokens;
};
match char {
'"' => {
let token = match make_string(&mut iter, &mut line, &mut col) {
Ok(token) => token,
Err(err) => Token::Error(err),
};
tokens.push(token);
} }
m @ (Mode::SlWhitespace | Mode::MlWhitespace) => { '0'..='9' => {
mode = Mode::String; let token = match make_number(&mut iter, &mut line, &mut col) {
collect_into_token_and_push( Ok(token) => token,
&m.token_constructor().map_err(position_map)?, Err(err) => Token::Error(err),
&mut tokens, };
&mut value, tokens.push(token);
);
value.push('"');
} }
Mode::EscapedString => { 'a'..='z' | 'A'..='Z' => {
value.push('"'); let token = make_keyword_or_name(&mut iter, &mut line);
mode = Mode::String; tokens.push(token);
} }
Mode::SlComment => { ' ' => {
value.push('"'); let mut result: Vec<char> = Vec::new();
let token = loop {
let next = iter.peek();
match next {
Some(' ' | '\n' | '\r') => {
let c = iter
.next()
.expect("should not mutate between next & unwrap");
result.push(c);
} }
_ => { _ => {
return Err(Error { break if result.contains(&'\n') {
line, Token::MlWhitespace(String::from_iter(result))
col, } else {
error: ErrorType::UnexpectedToken('"'), Token::SlWhitespace(String::from_iter(result))
}) }
} }
}; };
};
tokens.push(token);
}
_ => todo!(),
}
}
} }
v @ ('{' | '}' | '[' | ']') => match mode { #[cfg(test)]
m @ (Mode::Name mod tests {
| Mode::Class use crate::bong::lexer::{lexer, Token};
| Mode::Id
| Mode::MlWhitespace
| Mode::SlWhitespace) => {
collect_into_token_and_push(
&m.token_constructor().map_err(position_map)?,
&mut tokens,
&mut value,
);
mode = Mode::SlWhitespace;
let constructor = match v {
'{' => Token::LBrace,
'}' => Token::RBrace,
'[' => Token::LBracket,
']' => Token::RBracket,
_ => panic!("race condition"),
};
tokens.push(constructor(String::from(v)));
}
Mode::EscapedString => {
return Err(Error {
line,
col,
error: ErrorType::UnexpectedToken(v),
})
}
Mode::String | Mode::SlComment => {
value.push(v);
}
},
c @ (' ' | '\r') => {
match mode {
m @ (Mode::Name | Mode::Class | Mode::Id) => {
collect_into_token_and_push(
&m.token_constructor().map_err(position_map)?,
&mut tokens,
&mut value,
);
mode = Mode::SlWhitespace;
}
Mode::String | Mode::SlComment | Mode::MlWhitespace | Mode::SlWhitespace => {}
Mode::EscapedString => {
return Err(Error {
line,
col,
error: ErrorType::UnexpectedToken(c),
})
}
};
value.push(c);
}
c @ '\n' => {
match mode {
m @ (Mode::Name | Mode::Class | Mode::Id | Mode::SlComment) => {
collect_into_token_and_push(
&m.token_constructor().map_err(position_map)?,
&mut tokens,
&mut value,
);
mode = Mode::MlWhitespace;
}
Mode::MlWhitespace | Mode::SlWhitespace => {
mode = Mode::MlWhitespace;
}
Mode::String => {}
Mode::EscapedString => {
return Err(Error {
line,
col,
error: ErrorType::UnexpectedToken('\n'),
})
}
};
value.push(c);
line += 1;
col = -1;
}
'/' => {
match mode {
Mode::String | Mode::SlComment => {}
m @ (Mode::Name
| Mode::Class
| Mode::Id
| Mode::SlWhitespace
| Mode::MlWhitespace) => {
collect_into_token_and_push(
&m.token_constructor().map_err(position_map)?,
&mut tokens,
&mut value,
);
mode = Mode::SlComment;
}
Mode::EscapedString => {
return Err(Error {
line,
col,
error: ErrorType::UnexpectedToken('/'),
})
}
};
value.push('/');
}
v @ ('A'..='Z' | 'a'..='z' | '0'..='9') => {
match mode {
Mode::Name | Mode::Class | Mode::Id => {
if v.is_numeric()
&& (value.is_empty() || mode == Mode::Id && value.len() == 1)
{
return Err(Error {
line,
col,
error: ErrorType::UnexpectedToken(v),
});
}
}
Mode::String | Mode::SlComment => {}
m @ (Mode::SlWhitespace | Mode::MlWhitespace) => {
collect_into_token_and_push(
&m.token_constructor().map_err(position_map)?,
&mut tokens,
&mut value,
);
mode = Mode::Name;
}
Mode::EscapedString => {
return Err(Error {
line,
col,
error: ErrorType::UnexpectedToken(v),
})
}
};
value.push(v);
}
unrecognized_char => match mode {
Mode::String => {
value.push(unrecognized_char);
}
_ => {
return Err(Error {
line,
col,
error: ErrorType::UnexpectedToken(unrecognized_char),
});
}
},
}
col += 1;
}
Ok(tokens) #[test]
// Lexes one sample containing a name, multi-line and single-line whitespace, an int, a float,
// the `true` / `false` keywords and a string, and checks the exact token sequence.
// The expected `Token::String` payload is the text without its surrounding quotes.
fn name_keyword_number_whitespace() {
let result = lexer("abcd \n 1234 12.34 true false \"string\"");
let space = |c: &str| -> Token { Token::SlWhitespace(c.to_string()) };
assert_eq!(
result,
vec![
Token::Name("abcd".to_string()),
Token::MlWhitespace(" \n ".to_string()),
Token::Int("1234".to_string()),
space(" "),
Token::Float("12.34".to_string()),
space(" "),
Token::True("true".to_string()),
space(" "),
Token::False("false".to_string()),
space(" "),
Token::String("string".to_string()),
]
)
} }
#[test] #[test]
@ -327,7 +250,7 @@ fn test_example_1() {
let tokens = lexer(text); let tokens = lexer(text);
assert_eq!( assert_eq!(
tokens, tokens,
Ok(vec![ vec![
Token::SlWhitespace("".to_string()), Token::SlWhitespace("".to_string()),
Token::Name("text".to_string()), Token::Name("text".to_string()),
Token::Class(".title".to_string()), Token::Class(".title".to_string()),
@ -339,6 +262,7 @@ fn test_example_1() {
Token::String("\"hello world\"".to_string()), Token::String("\"hello world\"".to_string()),
Token::MlWhitespace("\n".to_string()), Token::MlWhitespace("\n".to_string()),
Token::RBrace("}".to_string()), Token::RBrace("}".to_string()),
]) ]
) )
} }
}