#![allow(dead_code)]
//! Lexer producing a flat, lossless token stream — whitespace and comments
//! are tokens too, each carrying its exact source text — while tracking
//! 0-based `col`/`line` positions for error reporting.

const NO_MUT_PEEK_NEXT_MESSAGE: &str = "should not mutate between peek & next";

use std::iter::Peekable;

/// A lexing error plus the position (0-based column and line) where it occurred.
#[derive(PartialEq, Eq, Debug)]
pub struct TokenError {
    error: String,
    col: usize,
    line: usize,
}

/// Every variant stores the original source text of the token so the input
/// can be reconstructed byte-for-byte from the token stream.
#[derive(PartialEq, Eq, Debug)]
pub enum Token {
    Name(String),
    Id(String),
    Class(String),
    /// Whitespace containing no newline.
    SlWhitespace(String),
    /// Whitespace containing at least one newline.
    MlWhitespace(String),
    /// `// …` comment (text up to, not including, the newline).
    SlComment(String),
    /// `/* … */` comment, including nested ones.
    MlComment(String),
    Int(String),
    Float(String),
    String(String),
    Null(String),
    True(String),
    False(String),
    LBrace(String),
    RBrace(String),
    LBracket(String),
    RBracket(String),
    Equal(String),
    Colon(String),
    SemiColon(String),
    Comma(String),
    Error(TokenError),
}

/// Lexes `[A-Za-z][A-Za-z0-9]*`, mapping the keywords `null`/`true`/`false`
/// to their dedicated variants and anything else to `Token::Name`.
fn make_keyword_or_name<T: Iterator<Item = char>>(
    iter: &mut Peekable<T>,
    col: &mut usize,
) -> Token {
    let mut result: Vec<char> = Vec::new();
    loop {
        match iter.peek() {
            Some('A'..='Z' | 'a'..='z') => {
                *col += 1;
                result.push(iter.next().expect(NO_MUT_PEEK_NEXT_MESSAGE));
            }
            Some('0'..='9') => {
                // A digit can never be the first char here: the lexer only
                // dispatches to this function on a letter. We assert instead
                // of returning an error because a violation means the lexer
                // itself is written incorrectly.
                assert_ne!(result.len(), 0);
                *col += 1;
                result.push(iter.next().expect(NO_MUT_PEEK_NEXT_MESSAGE));
            }
            _ => {
                break match String::from_iter(result).as_str() {
                    s @ "null" => Token::Null(s.to_string()),
                    s @ "true" => Token::True(s.to_string()),
                    s @ "false" => Token::False(s.to_string()),
                    name => Token::Name(name.to_string()),
                }
            }
        }
    }
}

/// Lexes `#name` into `Token::Id` or `.name` into `Token::Class`.
/// The caller guarantees the next char is `#` or `.`.
fn make_id_or_class<T: Iterator<Item = char>>(iter: &mut Peekable<T>, col: &mut usize) -> Token {
    // Consume the leading '#' or '.' the caller dispatched on.
    let mut result: Vec<char> = vec![iter.next().expect(NO_MUT_PEEK_NEXT_MESSAGE)];
    *col += 1;
    loop {
        match iter.peek() {
            Some('A'..='Z' | 'a'..='z') => {
                *col += 1;
                result.push(iter.next().expect(NO_MUT_PEEK_NEXT_MESSAGE));
            }
            Some('0'..='9') => {
                // we assert instead of returning an error because this means the lexer is written incorrectly
                // at least one character must be #/. and at least one character must be A-Z | a-z
                assert!(result.len() > 1);
                *col += 1;
                result.push(iter.next().expect(NO_MUT_PEEK_NEXT_MESSAGE));
            }
            _ => {
                break if result.contains(&'#') {
                    Token::Id(String::from_iter(result))
                } else if result.contains(&'.') {
                    Token::Class(String::from_iter(result))
                } else {
                    panic!("should contain . or #")
                }
            }
        }
    }
}

/// Lexes an integer or float literal. A second `.` in the same literal is a
/// `TokenError`; a trailing `.` (e.g. `"1."`) yields a `Float`, as before.
fn make_number<T: Iterator<Item = char>>(
    iter: &mut Peekable<T>,
    col: &mut usize,
    line: &mut usize,
) -> Result<Token, TokenError> {
    let mut result: Vec<char> = Vec::new();
    loop {
        match iter.peek() {
            Some('0'..='9') => {
                *col += 1;
                result.push(iter.next().expect(NO_MUT_PEEK_NEXT_MESSAGE));
            }
            Some('.') => {
                // Count the dot exactly once (the old code bumped `col` twice).
                *col += 1;
                iter.next().expect(NO_MUT_PEEK_NEXT_MESSAGE);
                if result.contains(&'.') {
                    // Second decimal point in one literal.
                    return Err(TokenError {
                        error: "unexpected token".to_string(),
                        col: *col,
                        line: *line,
                    });
                }
                result.push('.');
            }
            _ => {
                break Ok(if result.contains(&'.') {
                    Token::Float(String::from_iter(result))
                } else {
                    Token::Int(String::from_iter(result))
                })
            }
        }
    }
}

/// Lexes a double-quoted string literal, keeping both quotes and all escape
/// backslashes in the token text. EOF before the closing quote is an error.
fn make_string<T: Iterator<Item = char>>(
    iter: &mut T,
    col: &mut usize,
    line: &mut usize,
) -> Result<Token, TokenError> {
    // Consume the opening quote the caller peeked.
    let mut result: Vec<char> = vec![iter.next().expect(NO_MUT_PEEK_NEXT_MESSAGE)];
    *col += 1;
    let mut escaped = false;
    loop {
        let next = iter.next().ok_or_else(|| TokenError {
            error: "unexpected end of string".to_string(),
            col: *col,
            line: *line,
        })?;
        match next {
            '\\' => {
                *col += 1;
                // Toggling handles runs of backslashes: "\\\\" un-escapes.
                escaped = !escaped;
                result.push('\\');
            }
            '"' => {
                *col += 1;
                result.push('"');
                if escaped {
                    escaped = false;
                } else {
                    break Ok(Token::String(String::from_iter(result)));
                }
            }
            '\n' => {
                *col = 0;
                *line += 1;
                // NOTE(review): `escaped` deliberately survives a newline
                // (backslash-newline continuation), matching prior behavior.
                result.push('\n');
            }
            c => {
                *col += 1;
                escaped = false;
                result.push(c);
            }
        }
    }
}

/// Consumes exactly one char and wraps it with `constructor`
/// (used for all single-character punctuation tokens).
fn single_token<T: Iterator<Item = char>, U: Fn(String) -> Token>(
    iter: &mut Peekable<T>,
    constructor: U,
    col: &mut usize,
) -> Token {
    let c = iter.next().expect(NO_MUT_PEEK_NEXT_MESSAGE);
    *col += 1;
    constructor(c.to_string())
}

/// Lexes `// …` (up to but excluding the newline) or `/* … */` comments.
/// Multi-line comments nest: each inner `/*` must be matched by a `*/`.
/// Unterminated comments and `/` followed by anything else are errors.
fn make_comment<T: Iterator<Item = char>>(
    iter: &mut Peekable<T>,
    col: &mut usize,
    line: &mut usize,
) -> Result<Token, TokenError> {
    let first_slash = iter.next().expect(NO_MUT_PEEK_NEXT_MESSAGE);
    let second_character = match iter.next() {
        Some(c) => c,
        None => {
            return Err(TokenError {
                error: "unexpected EOF".to_string(),
                col: *col,
                line: *line,
            })
        }
    };
    let mut result = vec![first_slash, second_character];
    *col += 2;
    match second_character {
        '/' => loop {
            match iter.peek() {
                // The newline is left in the stream for the whitespace lexer.
                Some('\n') | None => break Ok(Token::SlComment(String::from_iter(result))),
                Some(_) => {
                    *col += 1;
                    result.push(iter.next().expect(NO_MUT_PEEK_NEXT_MESSAGE));
                }
            }
        },
        '*' => {
            // State machine over (previous char, current char). The old
            // two-char lookahead loop hung forever on inputs like "/* a*b */"
            // because '/' or '*' not starting a delimiter was never consumed.
            let mut nesting: usize = 0;
            let mut prev: Option<char> = None;
            loop {
                let c = match iter.next() {
                    Some(c) => c,
                    None => {
                        break Err(TokenError {
                            error: "unexpected EOF".to_string(),
                            col: *col,
                            line: *line,
                        })
                    }
                };
                if c == '\n' {
                    *col = 0;
                    *line += 1;
                } else {
                    *col += 1;
                }
                result.push(c);
                match (prev, c) {
                    (Some('/'), '*') => {
                        nesting += 1;
                        // Reset so this '*' can't also close a comment.
                        prev = None;
                    }
                    (Some('*'), '/') => {
                        if nesting == 0 {
                            break Ok(Token::MlComment(String::from_iter(result)));
                        }
                        nesting -= 1;
                        // Reset so this '/' can't also open a comment.
                        prev = None;
                    }
                    _ => prev = Some(c),
                }
            }
        }
        c => Err(TokenError {
            error: format!("unexpected token {c}"),
            col: *col,
            line: *line,
        }),
    }
}

/// Tokenizes `code` into a flat `Vec<Token>`. Never fails: unlexable spans
/// become `Token::Error` entries and lexing continues after them.
fn lexer(code: &str) -> Vec<Token> {
    let mut tokens = Vec::new();
    let mut iter = code.chars().peekable();
    let mut col = 0;
    let mut line = 0;
    loop {
        let c = match iter.peek() {
            Some(&c) => c,
            None => break tokens,
        };
        match c {
            '"' => {
                let token =
                    make_string(&mut iter, &mut col, &mut line).unwrap_or_else(Token::Error);
                tokens.push(token);
            }
            '0'..='9' => {
                let token =
                    make_number(&mut iter, &mut col, &mut line).unwrap_or_else(Token::Error);
                tokens.push(token);
            }
            'a'..='z' | 'A'..='Z' => {
                tokens.push(make_keyword_or_name(&mut iter, &mut col));
            }
            '{' => tokens.push(single_token(&mut iter, Token::LBrace, &mut col)),
            '}' => tokens.push(single_token(&mut iter, Token::RBrace, &mut col)),
            // Previously these six produced `Error` tokens even though the
            // enum variants and `single_token` existed for them.
            '[' => tokens.push(single_token(&mut iter, Token::LBracket, &mut col)),
            ']' => tokens.push(single_token(&mut iter, Token::RBracket, &mut col)),
            '=' => tokens.push(single_token(&mut iter, Token::Equal, &mut col)),
            ':' => tokens.push(single_token(&mut iter, Token::Colon, &mut col)),
            ';' => tokens.push(single_token(&mut iter, Token::SemiColon, &mut col)),
            ',' => tokens.push(single_token(&mut iter, Token::Comma, &mut col)),
            '#' | '.' => tokens.push(make_id_or_class(&mut iter, &mut col)),
            ' ' | '\n' | '\r' => {
                let mut result: Vec<char> = Vec::new();
                loop {
                    match iter.peek() {
                        Some('\n') => {
                            // Newlines advance `line` and reset `col`
                            // (previously position tracking skipped whitespace
                            // entirely, corrupting all later error positions).
                            col = 0;
                            line += 1;
                            result.push(iter.next().expect(NO_MUT_PEEK_NEXT_MESSAGE));
                        }
                        Some(' ' | '\r') => {
                            col += 1;
                            result.push(iter.next().expect(NO_MUT_PEEK_NEXT_MESSAGE));
                        }
                        _ => break,
                    }
                }
                let token = if result.contains(&'\n') {
                    Token::MlWhitespace(String::from_iter(result))
                } else {
                    Token::SlWhitespace(String::from_iter(result))
                };
                tokens.push(token);
            }
            '/' => {
                let token =
                    make_comment(&mut iter, &mut col, &mut line).unwrap_or_else(Token::Error);
                tokens.push(token);
            }
            c => {
                tokens.push(Token::Error(TokenError {
                    error: format!("unrecognized character {c}"),
                    col,
                    line,
                }));
                iter.next();
                col += 1;
            }
        }
    }
}

#[cfg(test)]
mod tests {
    // `super` instead of the crate-absolute path: equivalent here, and it
    // keeps the tests compiling if the module is ever moved.
    use super::{lexer, Token};

    #[test]
    fn name_keyword_number_whitespace() {
        let result = lexer("abcd \n 1234 12.34 true false \"string\"");
        let space = |c: &str| -> Token { Token::SlWhitespace(c.to_string()) };
        assert_eq!(
            result,
            vec![
                Token::Name("abcd".to_string()),
                Token::MlWhitespace(" \n ".to_string()),
                Token::Int("1234".to_string()),
                space(" "),
                Token::Float("12.34".to_string()),
                space(" "),
                Token::True("true".to_string()),
                space(" "),
                Token::False("false".to_string()),
                space(" "),
                Token::String("\"string\"".to_string()),
            ]
        )
    }

    #[test]
    fn test_example_1() {
        let text = "text.title {
    // text { \"hello world\" }
    \"hello world\"
}";
        let tokens = lexer(text);
        assert_eq!(
            tokens,
            vec![
                Token::Name("text".to_string()),
                Token::Class(".title".to_string()),
                Token::SlWhitespace(" ".to_string()),
                Token::LBrace("{".to_string()),
                Token::MlWhitespace("\n    ".to_string()),
                Token::SlComment("// text { \"hello world\" }".to_string()),
                Token::MlWhitespace("\n    ".to_string()),
                Token::String("\"hello world\"".to_string()),
                Token::MlWhitespace("\n".to_string()),
                Token::RBrace("}".to_string()),
            ]
        )
    }

    #[test]
    fn unnested_multiline_comment() {
        let text = "/* hello */";
        let tokens = lexer(text);
        assert_eq!(tokens, vec![Token::MlComment("/* hello */".to_string()),])
    }

    #[test]
    fn nested_multiline_comment() {
        let text = "/* /* hello */ */";
        let tokens = lexer(text);
        assert_eq!(
            tokens,
            vec![Token::MlComment("/* /* hello */ */".to_string()),]
        )
    }
}