//! Lexer for the Bong language (`web-stack-project/src/bong/lexer.rs`).
//!
//! Splits source text into a flat `Vec<Token>`; lexing errors are recorded
//! in-band as `Token::Error` entries rather than aborting.

#![allow(dead_code)]
const NO_MUT_PEEK_NEXT_MESSAGE: &str = "should not mutate between peek & next";
use std::iter::Peekable;
/// Position-tagged description of a lexing failure.
///
/// Carried inside [`Token::Error`] so errors travel in-band with the token
/// stream and the lexer can keep going after a bad character.
#[derive(PartialEq, Eq, Debug)]
pub struct TokenError {
    // Human-readable description of what went wrong.
    error: String,
    // Zero-based column at which the failure was detected.
    // NOTE(review): several helpers consume characters without advancing
    // the column counter, so reported columns may drift — verify before
    // relying on them for diagnostics.
    col: usize,
    // Zero-based line at which the failure was detected.
    line: usize,
}
/// A single lexical token.
///
/// Every variant stores the exact source text it was produced from, so the
/// original input can be reconstructed by concatenating token payloads.
#[derive(PartialEq, Eq, Debug)]
pub enum Token {
    /// Bare identifier, e.g. `text` (may contain digits after the first letter).
    Name(String),
    /// `#`-prefixed identifier, sigil included.
    Id(String),
    /// `.`-prefixed identifier, sigil included.
    Class(String),
    /// Whitespace run containing no newline ("single-line").
    SlWhitespace(String),
    /// Whitespace run containing at least one newline ("multi-line").
    MlWhitespace(String),
    /// `// ...` line comment, up to but not including the newline.
    SlComment(String),
    /// `/* ... */` block comment; nesting is supported by the lexer.
    MlComment(String),
    /// Integer literal (digits only).
    Int(String),
    /// Float literal (digits with a single `.`).
    Float(String),
    /// String literal, surrounding quotes included.
    String(String),
    /// The keyword `null` (payload is the literal text).
    Null(String),
    /// The keyword `true` (payload is the literal text).
    True(String),
    /// The keyword `false` (payload is the literal text).
    False(String),
    /// `{`
    LBrace(String),
    /// `}`
    RBrace(String),
    /// `[`
    LBracket(String),
    /// `]`
    RBracket(String),
    /// `=`
    Equal(String),
    /// `:`
    Colon(String),
    /// `;`
    SemiColon(String),
    /// `,`
    Comma(String),
    /// A lexing failure recorded in-band in the token stream.
    Error(TokenError),
}
/// Consume a run of `[A-Za-z][A-Za-z0-9]*` from `iter` and classify it.
///
/// Returns `Null`/`True`/`False` for the matching keywords, otherwise
/// `Name`. Advances `col` by one per consumed character. The caller must
/// have dispatched on a letter; a leading digit is a lexer bug (asserted).
fn make_keyword_or_name<T: Iterator<Item = char>>(
    iter: &mut Peekable<T>,
    col: &mut usize,
) -> Token {
    let mut word = String::new();
    while let Some(&c) = iter.peek() {
        if c.is_ascii_digit() {
            // We assert instead of returning an error because a digit in
            // first position means the lexer dispatched incorrectly.
            assert_ne!(word.len(), 0);
        } else if !c.is_ascii_alphabetic() {
            break;
        }
        *col += 1;
        iter.next().expect(NO_MUT_PEEK_NEXT_MESSAGE);
        word.push(c);
    }
    match word.as_str() {
        "null" => Token::Null(word),
        "true" => Token::True(word),
        "false" => Token::False(word),
        _ => Token::Name(word),
    }
}
/// Consume a `#id` or `.class` token (sigil plus `[A-Za-z][A-Za-z0-9]*`).
///
/// The caller dispatches on `#`/`.`, so the first character is consumed
/// unconditionally; any other leading character is a lexer bug (panics).
/// Advances `col` by one per consumed character.
fn make_id_or_class<T: Iterator<Item = char>>(iter: &mut Peekable<T>, col: &mut usize) -> Token {
    let mut result: Vec<char> = vec![iter.next().expect(NO_MUT_PEEK_NEXT_MESSAGE)];
    // FIX: the leading '#'/'.' was consumed without being counted, which
    // skewed every subsequent error position on the line.
    *col += 1;
    loop {
        match iter.peek() {
            Some('A'..='Z' | 'a'..='z') => {
                *col += 1;
                let c = iter.next().expect(NO_MUT_PEEK_NEXT_MESSAGE);
                result.push(c);
            }
            Some('0'..='9') => {
                // We assert instead of returning an error because this means the
                // lexer is written incorrectly: at least one character must be
                // the sigil and at least one must be A-Z | a-z before a digit.
                assert!(result.len() > 1);
                *col += 1;
                let c = iter.next().expect(NO_MUT_PEEK_NEXT_MESSAGE);
                result.push(c);
            }
            _ => {
                break if result.contains(&'#') {
                    Token::Id(String::from_iter(result))
                } else if result.contains(&'.') {
                    Token::Class(String::from_iter(result))
                } else {
                    panic!("should contain . or #")
                }
            }
        }
    }
}
/// Consume an `Int` (digits) or `Float` (digits containing one `.`).
///
/// A second `.` in the same literal is an error: the offending dot is
/// consumed and an error positioned at it is returned, leaving the caller
/// free to continue lexing. Advances `col` by one per consumed character.
fn make_number<T: Iterator<Item = char>>(
    iter: &mut Peekable<T>,
    col: &mut usize,
    line: &mut usize,
) -> Result<Token, TokenError> {
    let mut result: Vec<char> = Vec::new();
    loop {
        match iter.peek() {
            Some('0'..='9') => {
                *col += 1;
                let c = iter.next().expect(NO_MUT_PEEK_NEXT_MESSAGE);
                result.push(c);
            }
            Some('.') => {
                // Consume the dot and count it exactly once.
                // FIX: the column counter was previously bumped twice per
                // dot (once before the duplicate check and once after),
                // drifting error positions by one per '.'.
                iter.next().expect(NO_MUT_PEEK_NEXT_MESSAGE);
                *col += 1;
                if result.contains(&'.') {
                    // Two dots in one literal, e.g. "1.2.3".
                    return Err(TokenError {
                        error: "unexpected token".to_string(),
                        col: *col,
                        line: *line,
                    });
                }
                result.push('.');
            }
            _ => {
                break Ok(if result.contains(&'.') {
                    Token::Float(String::from_iter(result))
                } else {
                    Token::Int(String::from_iter(result))
                })
            }
        }
    }
}
/// Consume a double-quoted string literal, quotes included in the payload.
///
/// Supports `\"` (escaped quote) and `\\` (escaped backslash); raw newlines
/// are allowed inside the string and advance `line`. Reaching EOF before
/// the closing quote yields an error. The caller dispatches on the opening
/// quote, so the first character is consumed unconditionally.
fn make_string<T: Iterator<Item = char>>(
    iter: &mut T,
    col: &mut usize,
    line: &mut usize,
) -> Result<Token, TokenError> {
    let mut result: Vec<char> = vec![iter.next().expect(NO_MUT_PEEK_NEXT_MESSAGE)];
    // FIX: the opening quote was consumed without being counted.
    *col += 1;
    // True when the previous character was an unmatched backslash.
    let mut escaped = false;
    loop {
        let next = iter.next().ok_or(TokenError {
            error: "unexpected end of string".to_string(),
            col: *col,
            line: *line,
        })?;
        match next {
            '\\' => {
                *col += 1;
                // Toggling handles "\\": the second backslash is the escapee.
                escaped = !escaped;
                result.push('\\');
            }
            '"' => {
                *col += 1;
                result.push('"');
                if escaped {
                    escaped = false;
                } else {
                    break Ok(Token::String(String::from_iter(result)));
                }
            }
            '\n' => {
                *col = 0;
                *line += 1;
                // FIX: a backslash escapes only the character that follows
                // it; previously `escaped` survived past a raw newline, so
                // the next quote was wrongly treated as escaped.
                escaped = false;
                result.push('\n');
            }
            c => {
                *col += 1;
                escaped = false;
                result.push(c);
            }
        }
    }
}
/// Consume exactly one character and wrap it with `constructor`
/// (e.g. `Token::LBrace`), advancing `col` by one.
fn single_token<T: Iterator<Item = char>, U: Fn(String) -> Token>(
    iter: &mut Peekable<T>,
    constructor: U,
    col: &mut usize,
) -> Token {
    let c = iter.next().expect(NO_MUT_PEEK_NEXT_MESSAGE);
    *col += 1;
    constructor(String::from(c))
}
/// Consume a `// ...` line comment or a (nesting) `/* ... */` block comment.
///
/// The caller dispatches on the first `/`. A line comment runs up to, but
/// not including, the newline (or EOF). A block comment supports nested
/// `/* */` pairs and errors on EOF before the final `*/`.
///
/// NOTE(review): `col`/`line` are not updated while scanning a block
/// comment body, so positions after a multi-line comment drift — confirm
/// whether callers depend on this before tightening it.
fn make_comment<T: Iterator<Item = char>>(
    iter: &mut Peekable<T>,
    col: &mut usize,
    line: &mut usize,
) -> Result<Token, TokenError> {
    // Shared constructor for the repeated EOF error.
    fn eof_error(col: usize, line: usize) -> TokenError {
        TokenError {
            error: "unexpected EOF".to_string(),
            col,
            line,
        }
    }
    let first_slash = iter.next().expect(NO_MUT_PEEK_NEXT_MESSAGE);
    let second_character = match iter.next() {
        Some(c) => c,
        None => return Err(eof_error(*col, *line)),
    };
    let mut result = vec![first_slash, second_character];
    *col += 2;
    match second_character {
        '/' => loop {
            *col += 1;
            match iter.peek() {
                // The newline is left for the whitespace lexer.
                Some('\n') | None => break Ok(Token::SlComment(String::from_iter(result))),
                _ => result.push(iter.next().expect(NO_MUT_PEEK_NEXT_MESSAGE)),
            }
        },
        '*' => {
            // `current` is always one character ahead of `result`, so the
            // two-character delimiters can be matched against `peek()`.
            let mut current = match iter.next() {
                Some(c) => c,
                None => return Err(eof_error(*col, *line)),
            };
            // Depth of nested `/*` openers still awaiting their `*/`.
            let mut nesting = 0;
            loop {
                let next = match iter.peek() {
                    Some(&c) => c,
                    None => break Err(eof_error(*col, *line)),
                };
                if current == '/' && next == '*' {
                    // Nested comment opens.
                    result.push(current);
                    result.push(iter.next().expect(NO_MUT_PEEK_NEXT_MESSAGE));
                    // FIX: EOF directly after a nested opener used to panic
                    // via `expect`; report it as an error instead.
                    current = match iter.next() {
                        Some(c) => c,
                        None => return Err(eof_error(*col, *line)),
                    };
                    nesting += 1;
                } else if current == '*' && next == '/' {
                    // A `*/`: either closes the whole comment or one level.
                    result.push(current);
                    result.push(iter.next().expect(NO_MUT_PEEK_NEXT_MESSAGE));
                    if nesting == 0 {
                        break Ok(Token::MlComment(String::from_iter(result)));
                    }
                    nesting -= 1;
                    current = match iter.next() {
                        Some(c) => c,
                        None => return Err(eof_error(*col, *line)),
                    };
                } else {
                    // FIX: ordinary characters are consumed here. Previously
                    // a '/' not followed by '*' (or '*' not followed by '/')
                    // matched neither branch and nothing was consumed, so
                    // inputs like "/* a/b */" looped forever. The leftover
                    // debug `println!` has also been removed.
                    result.push(current);
                    current = iter.next().expect(NO_MUT_PEEK_NEXT_MESSAGE);
                }
            }
        }
        c => Err(TokenError {
            error: format!("unexpected token {c}"),
            col: *col,
            line: *line,
        }),
    }
}
/// Tokenize `code` into a flat list of tokens.
///
/// The lexer never fails as a whole: unrecognized input is recorded as
/// `Token::Error` entries in the output and lexing continues past them.
/// `col`/`line` are zero-based positions used only for error reporting.
fn lexer(code: &str) -> Vec<Token> {
    let mut tokens = Vec::new();
    let mut iter = code.chars().peekable();
    let mut col = 0;
    let mut line = 0;
    loop {
        // Dispatch on the next character without consuming it; each helper
        // performs its own consumption.
        let char = if let Some(c) = iter.peek() {
            c
        } else {
            break tokens;
        };
        match char {
            '"' => {
                let token = match make_string(&mut iter, &mut col, &mut line) {
                    Ok(token) => token,
                    Err(err) => Token::Error(err),
                };
                tokens.push(token);
            }
            '0'..='9' => {
                let token = match make_number(&mut iter, &mut col, &mut line) {
                    Ok(token) => token,
                    Err(err) => Token::Error(err),
                };
                tokens.push(token);
            }
            'a'..='z' | 'A'..='Z' => {
                let token = make_keyword_or_name(&mut iter, &mut col);
                tokens.push(token);
            }
            '{' => {
                tokens.push(single_token(&mut iter, Token::LBrace, &mut col));
            }
            '}' => {
                tokens.push(single_token(&mut iter, Token::RBrace, &mut col));
            }
            // FIX: these variants existed in `Token` but were never produced;
            // '[' ']' '=' ':' ';' ',' previously fell through to the
            // "unrecognized character" error arm below.
            '[' => {
                tokens.push(single_token(&mut iter, Token::LBracket, &mut col));
            }
            ']' => {
                tokens.push(single_token(&mut iter, Token::RBracket, &mut col));
            }
            '=' => {
                tokens.push(single_token(&mut iter, Token::Equal, &mut col));
            }
            ':' => {
                tokens.push(single_token(&mut iter, Token::Colon, &mut col));
            }
            ';' => {
                tokens.push(single_token(&mut iter, Token::SemiColon, &mut col));
            }
            ',' => {
                tokens.push(single_token(&mut iter, Token::Comma, &mut col));
            }
            '#' | '.' => {
                tokens.push(make_id_or_class(&mut iter, &mut col));
            }
            ' ' | '\n' | '\r' => {
                // Collapse a run of whitespace into one token; it becomes
                // MlWhitespace if it contains at least one newline.
                let mut result: Vec<char> = Vec::new();
                let token = loop {
                    match iter.peek() {
                        Some(' ' | '\n' | '\r') => {
                            let c = iter.next().expect(NO_MUT_PEEK_NEXT_MESSAGE);
                            // FIX: whitespace previously advanced neither
                            // `col` nor `line`, so `line` never increased
                            // outside strings and error positions were wrong.
                            if c == '\n' {
                                line += 1;
                                col = 0;
                            } else {
                                col += 1;
                            }
                            result.push(c);
                        }
                        _ => {
                            break if result.contains(&'\n') {
                                Token::MlWhitespace(String::from_iter(result))
                            } else {
                                Token::SlWhitespace(String::from_iter(result))
                            }
                        }
                    }
                };
                tokens.push(token);
            }
            '/' => {
                let token = match make_comment(&mut iter, &mut col, &mut line) {
                    Ok(token) => token,
                    Err(err) => Token::Error(err),
                };
                tokens.push(token);
            }
            c => {
                // Record the bad character in-band and keep lexing.
                tokens.push(Token::Error(TokenError {
                    error: format!("unrecognized character {c}"),
                    col,
                    line,
                }));
                iter.next();
                col += 1;
            }
        }
    }
}
#[cfg(test)]
mod tests {
use crate::bong::lexer::{lexer, Token};
// Lexes a mix of a name, whitespace runs, int/float literals, the boolean
// keywords and a string, pinning the exact expected token stream
// (including the whitespace payloads).
#[test]
fn name_keyword_number_whitespace() {
let result = lexer("abcd \n 1234 12.34 true false \"string\"");
// Shorthand for single-line (no-newline) whitespace tokens.
let space = |c: &str| -> Token { Token::SlWhitespace(c.to_string()) };
assert_eq!(
result,
vec![
Token::Name("abcd".to_string()),
Token::MlWhitespace(" \n ".to_string()),
Token::Int("1234".to_string()),
space(" "),
Token::Float("12.34".to_string()),
space(" "),
Token::True("true".to_string()),
space(" "),
Token::False("false".to_string()),
space(" "),
Token::String("\"string\"".to_string()),
]
)
}
// A small realistic document: element name + class, a line comment whose
// braces/quotes must NOT be tokenized, and a string body.
// NOTE(review): the expected MlWhitespace payloads below imply the source
// literal's lines were indented; leading whitespace inside the multi-line
// literal may have been lost in formatting — verify against the original.
#[test]
fn test_example_1() {
let text = "text.title {
// text { \"hello world\" }
\"hello world\"
}";
let tokens = lexer(text);
assert_eq!(
tokens,
vec![
Token::Name("text".to_string()),
Token::Class(".title".to_string()),
Token::SlWhitespace(" ".to_string()),
Token::LBrace("{".to_string()),
Token::MlWhitespace("\n ".to_string()),
Token::SlComment("// text { \"hello world\" }".to_string()),
Token::MlWhitespace("\n ".to_string()),
Token::String("\"hello world\"".to_string()),
Token::MlWhitespace("\n".to_string()),
Token::RBrace("}".to_string()),
]
)
}
// A block comment with no nesting becomes a single MlComment token.
#[test]
fn unnested_multiline_comment() {
let text = "/* hello */";
let tokens = lexer(text);
assert_eq!(tokens, vec![Token::MlComment("/* hello */".to_string()),])
}
// A nested /* /* */ */ comment is consumed as ONE MlComment token; the
// inner pair must not terminate the outer comment early.
#[test]
fn nested_multiline_comment() {
let text = "/* /* hello */ */";
let tokens = lexer(text);
assert_eq!(
tokens,
vec![Token::MlComment("/* /* hello */ */".to_string()),]
)
}
}