add lexer

This commit is contained in:
SimonFJ20 2023-03-14 17:57:04 +01:00
parent 6ede00be64
commit edec4a2323

View File

@ -1,23 +1,59 @@
#![allow(dead_code)]
use std::str::Chars;
/// Every token category the lexer can produce.
///
/// Derives `Clone`/`Copy`/`PartialEq`/`Eq` so token kinds can be compared
/// and passed by value (the type is a plain C-like enum).
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum TokenType {
    // Error tokens: the lexer never fails, it emits these instead.
    InvalidChar,
    MalformedString,
    MalformedComment,
    // Literals and names.
    Id,
    Int,
    Decimal,
    String,
    False,
    True,
    // Keywords.
    If,
    Else,
    While,
    For,
    In,
    Break,
    Continue,
    Function,
    Return,
    End,
    Underscore,
    // Operators.
    Plus,
    Minus,
    Asterisk,
    Slash,
    Percent,
    PlusEqual,
    MinusEqual,
    AsteriskEqual,
    SlashEqual,
    PercentEqual,
    // Delimiters.
    LParen,
    RParen,
    LBrace,
    RBrace,
    LBracket,
    RBracket,
    Dot,
    Comma,
    Colon,
    Semicolon,
}
/// A location in the source text.
///
/// `index` is a *byte* offset (it is used to slice the source string),
/// while `line` and `col` are 1-based human-facing coordinates.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
struct Position {
    index: usize, // byte offset into the source text
    line: i32,    // 1-based line number
    col: i32,     // 1-based column number
}
#[derive(Debug)]
struct Token {
token_type: TokenType,
pos: Position,
@ -26,12 +62,226 @@ struct Token {
/// Streaming tokenizer over a borrowed source string.
///
/// Keeps one character of lookahead (`current_char`) and tracks both the
/// byte offset and 1-based line/column of the cursor.
/// NOTE(review): the stale `i: i32` field left over from the previous
/// revision is removed — the constructor never initialized it and nothing
/// reads it.
struct Lexer<'a> {
    text: &'a str,
    chars: Chars<'a>,           // iterator over the remaining characters
    current_char: Option<char>, // one-char lookahead; None at end of input
    index: usize,               // byte offset of `current_char` within `text`
    line: i32,                  // 1-based
    col: i32,                   // 1-based
}
impl<'a> Lexer<'a> {
pub fn new(text: &'a str) -> Self {
Self { text, i: 0 }
let mut chars = text.chars();
let first_char = chars.next();
Self {
text,
chars,
current_char: first_char,
index: 0,
line: 1,
col: 1,
}
}
fn next_token(&mut self) -> Option<Token> {
if self.done() {
return None;
}
match self.current() {
' ' | '\t' | '\r' | '\n' => self.skip_whitespace(),
'1'..='9' => Some(self.int_token()),
'a'..='z' | 'A'..='Z' | '_' => Some(self.id_token()),
'"' => Some(self.string_token()),
'+' => {
Some(self.single_or_double_char_token(TokenType::Plus, '=', TokenType::PlusEqual))
}
'-' => {
Some(self.single_or_double_char_token(TokenType::Minus, '=', TokenType::MinusEqual))
}
'*' => Some(self.single_or_double_char_token(
TokenType::Asterisk,
'=',
TokenType::AsteriskEqual,
)),
'/' => self.slash_token(),
'%' => Some(self.single_or_double_char_token(
TokenType::Percent,
'=',
TokenType::PercentEqual,
)),
'(' => Some(self.step_and_token(TokenType::LParen, self.pos())),
')' => Some(self.step_and_token(TokenType::LParen, self.pos())),
'.' => Some(self.dot_token()),
_ => Some(self.step_and_token(TokenType::InvalidChar, self.pos())),
}
}
fn skip_whitespace(&mut self) -> Option<Token> {
while !self.done() && matches!(self.current(), ' ' | '\t' | '\r' | '\n') {
self.step()
}
self.next_token()
}
fn int_token(&mut self) -> Token {
let start = self.pos();
self.step();
while !self.done() && matches!(self.current(), '0'..='9') {
self.step();
}
self.token(TokenType::Int, start)
}
fn string_token(&mut self) -> Token {
let start = self.pos();
self.step();
let mut escaped = false;
while !self.done() && (self.current() != '"' || escaped) {
escaped = self.current() == '\\' && !escaped;
self.step();
}
if self.done() || self.current() != '"' {
self.step_and_token(TokenType::MalformedString, start)
} else {
self.step_and_token(TokenType::String, start)
}
}
fn id_token(&mut self) -> Token {
let start = self.pos();
self.step();
while !self.done() && matches!(self.current(), 'a'..='z' | 'A'..='Z' | '0'..='9' | '_') {
self.step();
}
match &self.text[start.index..self.index] {
"false" => self.token(TokenType::False, start),
"true" => self.token(TokenType::True, start),
"if" => self.token(TokenType::True, start),
"else" => self.token(TokenType::True, start),
"while" => self.token(TokenType::True, start),
"for" => self.token(TokenType::True, start),
"in" => self.token(TokenType::True, start),
"break" => self.token(TokenType::True, start),
"continue" => self.token(TokenType::True, start),
"function" => self.token(TokenType::True, start),
"return" => self.token(TokenType::True, start),
"end" => self.token(TokenType::True, start),
"underscore" => self.token(TokenType::True, start),
_ => self.token(TokenType::Id, start),
}
}
fn single_or_double_char_token(
&mut self,
single_type: TokenType,
double_char: char,
double_type: TokenType,
) -> Token {
let start = self.pos();
self.step();
if !self.done() && self.current() == double_char {
self.step_and_token(double_type, start)
} else {
self.token(single_type, start)
}
}
fn slash_token(&mut self) -> Option<Token> {
let start = self.pos();
self.step();
if !self.done() && self.current() == '/' {
self.step();
while !self.done() && self.current() != '\n' {
self.step();
}
self.next_token()
} else if !self.done() && self.current() == '*' {
self.step();
let mut depth = 1;
let mut last_char: Option<char> = None;
while !self.done() {
match (last_char, self.current()) {
(Some('/'), '*') => {
depth += 1;
}
(Some('*'), '/') => {
depth -= 1;
if depth == 0 {
self.step();
break;
}
}
_ => {}
}
last_char = Some(self.current());
self.step();
}
if depth != 0 {
Some(self.token(TokenType::MalformedComment, start))
} else {
self.next_token()
}
} else if !self.done() && self.current() == '=' {
return Some(self.step_and_token(TokenType::SlashEqual, start));
} else {
return Some(self.token(TokenType::Slash, start));
}
}
fn dot_token(&mut self) -> Token {
let start = self.pos();
self.step();
if !self.done() && matches!(self.current(), '0'..='9') {
self.step();
while !self.done() && matches!(self.current(), '0'..='9') {
self.step();
}
self.token(TokenType::Decimal, start)
} else {
self.token(TokenType::Dot, start)
}
}
fn step_and_token(&mut self, token_type: TokenType, start: Position) -> Token {
self.step();
self.token(token_type, start)
}
fn token(&self, token_type: TokenType, start: Position) -> Token {
Token {
token_type,
length: self.index - start.index,
pos: start,
}
}
fn pos(&self) -> Position {
Position {
index: self.index,
line: self.line,
col: self.col,
}
}
fn done(&self) -> bool {
self.current_char.is_none()
}
fn current(&self) -> char {
self.current_char.expect("done() not checked")
}
fn step(&mut self) {
self.index += 1;
if !self.done() {
if self.current() == '\n' {
self.line += 1;
self.col = 1;
} else {
self.col += 1;
}
}
self.current_char = self.chars.next();
}
}
@ -39,10 +289,21 @@ impl<'a> Iterator for Lexer<'a> {
type Item = Token;
/// Iterator glue: yields tokens until `next_token` returns `None`.
/// (The stale `todo!()` left over from the previous revision is removed —
/// two tail expressions in one body cannot both stand.)
fn next(&mut self) -> Option<Self::Item> {
    self.next_token()
}
}
/// Expression node kinds — not referenced elsewhere in this file yet
/// (covered by the file-level `#![allow(dead_code)]`); presumably groundwork
/// for an upcoming parser — TODO confirm.
/// NOTE(review): the two stale lines of the pre-diff `fn main` that preceded
/// this enum are removed; they duplicated the real `main` defined below.
#[derive(Debug)]
enum Expr {
    Id(String),
    Int(i64),
    Float(f64),
    String(String),
}
/// Demo entry point: lexes a sample covering decimals, strings, keywords,
/// parens, operators, and nested/line comments, printing each token.
fn main() {
    let text = "3.14 \"foo\" false true ( ) + += /* 1 /* 2 */ 3 */ // 4 \n 5";
    let lexer = Lexer::new(text);
    for token in lexer {
        println!("{:?}", token);
    }
}