From 5f41342a160ae3e0f39c35cbbd4a3ae548ff1269 Mon Sep 17 00:00:00 2001 From: SimonFJ20 Date: Wed, 29 May 2024 15:08:07 +0200 Subject: [PATCH] hawd >~< --- example.yapp | 11 + src/checked.rs | 63 ++++++ src/checker.rs | 383 +++++++++++++++++++++++++++++++++++ src/hash.rs | 7 + src/itertewls.rs | 29 +++ src/lexer.rs | 279 ++++++++++++++++++++++++++ src/main.rs | 507 +---------------------------------------------- src/parsed.rs | 43 ++++ src/parser.rs | 478 ++++++++++++++++++++++++++++++++++++++++++++ src/sym.rs | 49 +++++ src/token.rs | 40 ++++ 11 files changed, 1391 insertions(+), 498 deletions(-) create mode 100644 example.yapp create mode 100644 src/checked.rs create mode 100644 src/checker.rs create mode 100644 src/hash.rs create mode 100644 src/itertewls.rs create mode 100644 src/lexer.rs create mode 100644 src/parsed.rs create mode 100644 src/parser.rs create mode 100644 src/sym.rs create mode 100644 src/token.rs diff --git a/example.yapp b/example.yapp new file mode 100644 index 0000000..30e5ac3 --- /dev/null +++ b/example.yapp @@ -0,0 +1,11 @@ + + +fn b() { + a(); +} + +fn a() { + +} + + diff --git a/src/checked.rs b/src/checked.rs new file mode 100644 index 0000000..ae77e83 --- /dev/null +++ b/src/checked.rs @@ -0,0 +1,63 @@ +#[derive(Clone, PartialEq, Debug)] +pub struct Node { + pub kind: NodeKind, + pub typ: Type, +} + +#[derive(Clone, PartialEq, Debug)] +pub enum NodeKind { + Error, + Id(u64), + Int(i64), + String(String), + Group(Box), + Block(Vec), + Call { + subject: Box, + args: Vec, + }, + If { + cond: Box, + truthy: Box, + falsy: Option>, + }, + Loop { + body: Box, + }, + Break, + Assign { + subject: Box, + value: Box, + }, + Let { + subject: Box, + value: Box, + }, + Fn { + subject: Box, + params: Vec, + return_typ: Box, + body: Box, + }, + Return { + value: Option>, + }, + Param { + subject: Box, + typ: Option, + }, +} + +#[derive(Clone, PartialEq, Debug)] +pub enum Type { + Error, + Unit, + I32, + U32, + String, + Fn { + id: u64, + params: Vec, + return_typ: Box, + }, +} diff --git a/src/checker.rs b/src/checker.rs new file mode 100644 index 0000000..42b1700 --- /dev/null +++ b/src/checker.rs @@ -0,0 +1,383 @@ +#![allow(unused_variables)] + +use crate::{ + checked::{Node, NodeKind, Type}, + hash::hash, + itertewls::Itertewls, + parsed, + sym::Syms, +}; + +pub struct Checker {} + +impl Checker { + pub fn new() -> Self { + Self {} + } + + pub fn check(&mut self, ast: &Vec) -> Vec { + let mut syms = Syms::new(); + ast.into_iter() + .map(|stmt| self.check_expr(stmt, &mut syms)) + .collect() + } + + fn fn_scan<'syms>( + &mut self, + ast: &Vec, + syms: &mut Syms<'syms>, + ) -> Result<(), ()> { + for node in ast { + match node { + parsed::Node::Fn { + subject, + params, + return_typ, + body, + } => { + let params = params.into_iter().map(|param| { + let parsed::Node::Param { subject, typ } = param else { unreachable!() }; + let parsed::Node::Id(id) = subject.as_ref() else { unreachable!() }; + let typ = self.check_type(typ.as_ref().ok_or(())?); + Ok((*id, self.node(NodeKind::Param { subject: Box::new(self.node(NodeKind::Id(*id), Type::Unit)), typ: Some(typ) }, Type::Unit))) + }).collect::, _>>()?; + + if let Some(id) = params.iter().map(|(id, _)| *id).find_first_duplicate() { + self.error("redefinition param"); + return Err(()); + } + + let parsed::Node::Id(id) = subject.as_ref() else { unreachable!() }; + + if syms.defined_locally(*id) { + self.error("redefinition fn"); + return Err(()); + } + //syms.define(*id, typ.clone()); + } + _ => {} + } + } + Ok(()) + } + + fn check_expr<'syms>(&mut self, node: &parsed::Node, syms: &mut Syms<'syms>) -> Node { + match node { + parsed::Node::Error => self.node(NodeKind::Error, Type::Unit), + parsed::Node::Id(id) => { + let Some(sym) = syms.get(*id) else { + self.error("undefined >~<"); + return self.node(NodeKind::Error, Type::Error); + }; + self.node(NodeKind::Id(*id), sym.typ) + } + parsed::Node::Int(value) => self.node( + NodeKind::Int(*value), + if *value as i32 > i32::MAX { + Type::U32 + } else { + Type::I32 + }, + ), + parsed::Node::String(value) => self.node(NodeKind::String(value.clone()), Type::String), + parsed::Node::Group(expr) => { + let expr = self.check_expr(expr, syms); + let typ = expr.typ.clone(); + self.node(NodeKind::Group(Box::new(expr)), typ) + } + parsed::Node::Block(stmts) => { + let mut child_syms = syms.child(); + let stmts = stmts + .into_iter() + .map(|stmt| self.check_expr(stmt, &mut child_syms)) + .collect::>(); + let typ = stmts + .last() + .map(|stmt| stmt.typ.clone()) + .unwrap_or(Type::Unit); + self.node(NodeKind::Block(stmts), typ) + } + parsed::Node::Call { subject, args } => { + let subject = Box::new(self.check_expr(subject, syms)); + + let args = args + .into_iter() + .map(|arg| self.check_expr(arg, syms)) + .collect::>(); + let typ = 'br: { + match subject.typ.clone() { + Type::Fn { + id: _, + params, + return_typ, + } => { + if args.len() != params.len() { + self.error("too few/many args"); + break 'br Type::Error; + } + if args + .iter() + .zip(params) + .map(|(arg, param)| self.compatible(&arg.typ, ¶m.typ)) + .any(|is_compatible| !is_compatible) + { + self.error("incorrect args"); + break 'br Type::Error; + } + *return_typ + } + _ => { + self.error("not a function"); + Type::Error + } + } + }; + self.node(NodeKind::Call { subject, args }, typ) + } + parsed::Node::If { + cond, + truthy, + falsy, + } => { + let cond = Box::new(self.check_expr(cond, syms)); + let truthy = Box::new(self.check_expr(truthy, syms)); + let falsy = falsy + .as_ref() + .map(|block| Box::new(self.check_expr(block, syms))); + let typ = 'br: { + match falsy.as_ref().map(|block| block.typ.clone()) { + Some(falsy_typ) => { + if !self.compatible(&truthy.typ, &falsy_typ) { + self.error("incompatible types #2"); + break 'br Type::Error; + } + falsy_typ + } + None => Type::Unit, + } + }; + self.node( + NodeKind::If { + cond, + truthy, + falsy, + }, + typ, + ) + } + parsed::Node::Loop { body } => { + let body = Box::new(self.check_expr(body, &mut syms.child())); + let typ = body.typ.clone(); + self.node(NodeKind::Loop { body }, typ) + } + parsed::Node::Break => self.node(NodeKind::Break, Type::Unit), + parsed::Node::Assign { subject, value } => { + let subject = Box::new(self.check_expr(subject, syms)); + let value = Box::new(self.check_expr(value, syms)); + + let typ = if !self.compatible(&subject.typ, &value.typ) { + self.error("incompatible types #3"); + Type::Error + } else { + subject.typ.clone() + }; + self.node(NodeKind::Assign { subject, value }, typ) + } + parsed::Node::Let { subject, value } => { + let (subject, subject_typ) = match subject.as_ref() { + parsed::Node::Param { subject, typ } => { + (subject, typ.as_ref().map(|typ| self.check_type(typ))) + } + _ => unreachable!(), + }; + + let value = Box::new(self.check_expr(value, syms)); + let typ = value.typ.clone(); + + if subject_typ + .as_ref() + .map(|subject_typ| !self.compatible(subject_typ, &typ)) + .unwrap_or(false) + { + self.error("incompatible types #1"); + return self.node(NodeKind::Error, Type::Error); + } + + let subject = match subject.as_ref() { + parsed::Node::Id(id) => { + if syms.defined_locally(*id) { + self.error("redefinition"); + return self.node(NodeKind::Error, Type::Error); + } + syms.define(*id, typ.clone()); + Box::new(self.node( + NodeKind::Param { + subject: Box::new(self.node(NodeKind::Id(*id), Type::Unit)), + typ: Some(Type::Unit), + }, + Type::Unit, + )) + } + _ => unreachable!(), + }; + + self.node(NodeKind::Let { subject, value }, typ) + } + parsed::Node::Fn { + subject, + params, + return_typ, + body, + } => { + todo!("symbol lookup"); + } + parsed::Node::Return { value } => { + let value = value + .as_ref() + .map(|value| Box::new(self.check_expr(value, syms))); + let typ = value + .as_ref() + .map(|value| value.typ.clone()) + .unwrap_or(Type::Unit); + self.node(NodeKind::Return { value }, typ) + } + parsed::Node::Param { .. } => unreachable!("handle elsewhere"), + } + } + + fn check_type(&self, node: &parsed::Node) -> Type { + match node { + parsed::Node::Error => Type::Error, + parsed::Node::Id(value) => { + if *value == hash("i32") { + Type::I32 + } else if *value == hash("u32") { + Type::U32 + } else { + todo!("symbol lookup idk") + } + } + _ => unreachable!(), + } + } + + fn compatible(&self, typ_a: &Type, typ_b: &Type) -> bool { + typ_a == typ_b + } + + fn node(&self, kind: NodeKind, typ: Type) -> Node { + Node { kind, typ } + } + + fn error>(&mut self, msg: S) { + let msg = msg.into(); + println!("checker error: {msg}"); + } +} + +#[test] +fn test_checker() { + use crate::parser::Parser; + use NodeKind::{Block, Id, Int, Let, Param}; + use Type::{Unit, I32}; + + let check = |text| Checker::new().check(&Parser::new(text).parse()); + + assert_eq!( + check("let a = 5; a;"), + vec![ + Node { + kind: Let { + subject: Box::new(Node { + kind: Param { + subject: Box::new(Node { + kind: Id(hash("a")), + typ: Unit + }), + typ: Some(Unit) + }, + typ: Unit + }), + value: Box::new(Node { + kind: Int(5), + typ: I32 + }) + }, + typ: I32 + }, + Node { + kind: Id(hash("a")), + typ: I32 + } + ] + ); + + assert_eq!( + check("let a = 5; a; { a; let b = 5; b; } a; b;"), + vec![ + Node { + kind: Let { + subject: Box::new(Node { + kind: Param { + subject: Box::new(Node { + kind: Id(hash("a")), + typ: Unit + }), + typ: Some(Unit) + }, + typ: Unit + }), + value: Box::new(Node { + kind: Int(5), + typ: I32 + }) + }, + typ: I32 + }, + Node { + kind: Id(hash("a")), + typ: I32 + }, + Node { + kind: Block(vec![ + Node { + kind: Id(hash("a")), + typ: I32 + }, + Node { + kind: Let { + subject: Box::new(Node { + kind: Param { + subject: Box::new(Node { + kind: Id(hash("b")), + typ: Unit + }), + typ: Some(Unit) + }, + typ: Unit + }), + value: Box::new(Node { + kind: Int(5), + typ: I32 + }) + }, + typ: I32 + }, + Node { + kind: Id(hash("b")), + typ: I32 + } + ]), + typ: I32 + }, + Node { + kind: Id(hash("a")), + typ: I32 + }, + Node { + kind: NodeKind::Error, + typ: Type::Error + } + ] + ); +} diff --git a/src/hash.rs b/src/hash.rs new file mode 100644 index 0000000..4164ef7 --- /dev/null +++ b/src/hash.rs @@ -0,0 +1,7 @@ +use std::hash::{DefaultHasher, Hash, Hasher}; + +pub fn hash(value: H) -> u64 { + let mut hasher = DefaultHasher::new(); + value.hash(&mut hasher); + hasher.finish() +} diff --git a/src/itertewls.rs b/src/itertewls.rs new file mode 100644 index 0000000..76db65f --- /dev/null +++ b/src/itertewls.rs @@ -0,0 +1,29 @@ +enum Duplicate { + None(std::collections::HashMap), + Found(T), +} + +pub trait Itertewls +where + Self: Iterator + Sized, +{ + fn find_first_duplicate(self) -> Option; +} + +impl Itertewls for I +where + I: Iterator + Sized, + Item: std::cmp::PartialEq + Clone, +{ + fn find_first_duplicate(mut self) -> Option { + self.try_fold(Vec::new(), |mut used, item| { + if used.contains(&item) { + Err(item) + } else { + used.push(item); + Ok(used) + } + }) + .err() + } +} diff --git a/src/lexer.rs b/src/lexer.rs new file mode 100644 index 0000000..e074eb5 --- /dev/null +++ b/src/lexer.rs @@ -0,0 +1,279 @@ +use crate::{ + hash::hash, + token::{Token, TokenKind, TokenValue}, +}; +use std::{collections::HashMap, str::Chars}; + +pub struct Lexer<'a> { + text: &'a str, + chars: Chars<'a>, + current: Option, + index: usize, + line: i32, + col: i32, + symbols: HashMap, + keywords: HashMap, +} + +impl<'a> Lexer<'a> { + pub fn new(text: &'a str) -> Self { + let mut chars = text.chars(); + let current = chars.next(); + Self { + text, + chars, + current, + index: 0, + line: 1, + col: 1, + symbols: HashMap::new(), + keywords: Self::make_keywords(), + } + } + + fn make_keywords() -> HashMap { + let mut keywords = HashMap::new(); + keywords.insert("if".to_string(), TokenKind::If); + keywords.insert("else".to_string(), TokenKind::Else); + keywords.insert("loop".to_string(), TokenKind::Loop); + keywords.insert("break".to_string(), TokenKind::Break); + keywords.insert("let".to_string(), TokenKind::Let); + keywords.insert("fn".to_string(), TokenKind::Fn); + keywords.insert("return".to_string(), TokenKind::Return); + keywords + } + + pub fn symbols(self) -> HashMap { + self.symbols + } + + fn next_token(&mut self) -> Option { + match self.current { + None => None, + Some(' ' | '\t' | '\n') => { + self.step(); + self.next_token() + } + Some(ch @ ('a'..='z' | 'A'..='Z' | '_')) => { + let mut value = String::new(); + value.push(ch); + self.step(); + loop { + match self.current { + Some(ch @ ('a'..='z' | 'A'..='Z' | '0'..='9' | '_')) => { + value.push(ch); + self.step(); + } + _ => { + if let Some(kind) = self.keywords.get(&value) { + return self.token(kind.clone()); + } + let id = hash(&value); + self.symbols.insert(id, value); + break self.token_with_value(TokenKind::Id, TokenValue::Id(id)); + } + } + } + } + Some(ch @ ('1'..='9')) => { + let mut value = String::new(); + value.push(ch); + self.step(); + loop { + match self.current { + Some(ch @ ('0'..='9' | '_')) => { + value.push(ch); + self.step(); + } + _ => { + let value = value.replace('_', "").parse::().unwrap(); + break self.token_with_value(TokenKind::Int, TokenValue::Int(value)); + } + } + } + } + Some('"') => { + self.step(); + let mut value = String::new(); + loop { + match self.current { + Some('\\') => { + self.step(); + match self.current { + Some('n') => value.push('\n'), + Some('r') => value.push('\r'), + Some('t') => value.push('\t'), + Some('0') => value.push('\0'), + Some(ch) => value.push(ch), + None => { + self.error("malformed string"); + break self.token(TokenKind::Error); + } + } + } + Some('"') => { + self.step(); + break self + .token_with_value(TokenKind::String, TokenValue::String(value)); + } + Some(ch) => { + value.push(ch); + } + _ => { + self.error("malformed string"); + break self.token(TokenKind::Error); + } + } + self.step() + } + } + Some('/') => { + self.step(); + match self.current { + Some('/') => { + self.step(); + loop { + match self.current { + None | Some('\n') => break self.next_token(), + _ => { + self.step(); + } + } + } + } + Some('*') => { + self.step(); + let mut lch = self.current; + loop { + match (lch, self.current) { + (Some('*'), Some('/')) => break self.next_token(), + (_, Some(ch)) => { + lch = Some(ch); + self.step(); + } + (_, None) => { + self.error("malformed /**/ comment"); + break self.token(TokenKind::Error); + } + } + } + } + _ => todo!(), + } + } + Some('-') => { + self.step(); + match self.current { + Some('>') => { + self.step(); + self.token(TokenKind::MinusLt) + } + _ => todo!(), + } + } + Some(ch @ ('0' | '(' | ')' | '{' | '}' | ':' | ',' | ';' | '=')) => { + self.step(); + match ch { + '0' => self.token_with_value(TokenKind::Int, TokenValue::Int(0)), + '(' => self.token(TokenKind::LParen), + ')' => self.token(TokenKind::RParen), + '{' => self.token(TokenKind::LBrace), + '}' => self.token(TokenKind::RBrace), + ':' => self.token(TokenKind::Colon), + ',' => self.token(TokenKind::Comma), + ';' => self.token(TokenKind::Semicolon), + '=' => self.token(TokenKind::Equal), + _ => unreachable!(), + } + } + Some(ch) => { + self.error(format!("unknown char '{ch}'")); + self.token(TokenKind::Error) + } + } + } + + fn step(&mut self) { + match self.current { + Some('\n') => { + self.line += 1; + self.col = 1; + } + Some(_) => { + self.col += 1; + } + _ => {} + } + self.current = self.chars.next(); + if self.current.is_some() { + self.index += 1; + } + } + + fn token(&self, kind: TokenKind) -> Option { + Some(Token { + kind, + value: TokenValue::None, + index: self.index, + line: self.line, + col: self.col, + }) + } + + fn token_with_value(&self, kind: TokenKind, value: TokenValue) -> Option { + Some(Token { + kind, + value, + index: self.index, + line: self.line, + col: self.col, + }) + } + + fn error>(&mut self, msg: S) { + let msg = msg.into(); + println!("lexer error: {msg}, line {}", self.line) + } + + fn done(&self) -> bool { + self.current.is_none() + } +} + +impl Iterator for Lexer<'_> { + type Item = Token; + + fn next(&mut self) -> Option { + self.next_token() + } +} + +#[test] +fn test_lexer() { + use TokenKind as TK; + use TokenValue as TV; + + let lex = |text| { + Lexer::new(text) + .map(|token| (token.kind, token.value)) + .collect::>() + }; + + assert_eq!(lex("abc"), vec![(TK::Id, TV::Id(hash("abc")))]); + assert_eq!(lex("123"), vec![(TK::Int, TV::Int(123))]); + assert_eq!(lex("\"\""), vec![(TK::String, TV::String("".to_string()))]); + assert_eq!( + lex("\"hello\""), + vec![(TK::String, TV::String("hello".to_string()))] + ); + assert_eq!( + lex("\"new\\nline\""), + vec![(TK::String, TV::String("new\nline".to_string()))] + ); + assert_eq!( + lex("\"backslash\\\\\""), + vec![(TK::String, TV::String("backslash\\".to_string()))] + ); + assert_eq!(lex("->"), vec![(TK::MinusLt, TV::None)]); + assert_eq!(lex("let"), vec![(TK::Let, TV::None)]); +} diff --git a/src/main.rs b/src/main.rs index eeaf413..b2fe783 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,503 +1,14 @@ #![allow(dead_code)] -use std::{collections::HashMap, str::Chars}; - -#[derive(Clone, PartialEq, Debug)] -enum Expr { - Error, - Id(u64), - Int(i64), - String(String), - Block(Vec), - Call { - subject: Box, - args: Vec, - }, - If { - cond: Box, - truthy: Box, - falsy: Option>, - }, - Loop { - body: Box, - }, - Break, - Assign { - subject: Box, - value: Box, - }, - Let { - subject: Box, - value: Box, - }, - Fn { - subject: Box, - value: Box, - }, - Return { - value: Option>, - }, -} - -struct Parser<'a> { - lexer: Lexer<'a>, - current: Option, -} - -impl<'a> Parser<'a> { - pub fn new(text: &'a str) -> Self { - let mut lexer = Lexer::new(text); - let current = lexer.next(); - Self { lexer, current } - } - - pub fn parse(&mut self) -> Vec { - self.parse_file() - } - - fn parse_file(&mut self) -> Vec { - let mut stmts = Vec::new(); - loop { - match self.current { - Some(_) => stmts.push(self.parse_stmt()), - None => break stmts, - } - } - } - - fn parse_stmt(&mut self) -> Expr { - match self.curr_kind() { - Some(TokenKind::If) => todo!(), - Some(TokenKind::Loop) => todo!(), - Some(TokenKind::Fn) => todo!(), - _ => { - let stmt = match self.curr_kind() { - Some(TokenKind::Let) => todo!(), - Some(TokenKind::Break) => { - self.step(); - Expr::Break - } - Some(TokenKind::Return) => { - self.step(); - let value = match self.curr_kind() { - Some(TokenKind::Semicolon) => None, - _ => Some(Box::new(self.parse_expr())), - }; - Expr::Return { value } - } - _ => self.parse_assign(), - }; - match self.curr_kind() { - Some(TokenKind::Semicolon) => { - self.step(); - stmt - } - _ => { - self.error("expected ';'"); - Expr::Error - } - } - } - } - } - - fn parse_assign(&mut self) -> Expr { - let subject = self.parse_expr(); - match self.curr_kind() { - Some(TokenKind::Equal) => { - self.step(); - let value = self.parse_expr(); - Expr::Assign { - subject: Box::new(subject), - value: Box::new(value), - } - } - _ => subject, - } - } - - fn parse_expr(&mut self) -> Expr { - self.parse_call() - } - - fn parse_call(&mut self) -> Expr { - let mut subject = self.parse_value(); - loop { - match self.curr_kind() { - Some(TokenKind::LParen) => { - self.step(); - let mut args = Vec::new(); - match self.curr_kind() { - None | Some(TokenKind::LParen) => {} - Some(_) => { - args.push(self.parse_expr()); - while let Some(TokenKind::Comma) = self.curr_kind() { - self.step(); - if let Some(TokenKind::RParen) = self.curr_kind() { - break; - } - args.push(self.parse_expr()); - } - } - } - match self.curr_kind() { - Some(TokenKind::RParen) => {} - _ => { - self.error("expected ')'"); - return Expr::Error; - } - } - self.step(); - subject = Expr::Call { - subject: Box::new(subject), - args, - }; - } - _ => break subject, - } - } - } - - fn parse_value(&mut self) -> Expr { - match self.curr_kind() { - Some(TokenKind::Id) => { - let Some(Token { - value: TokenValue::Id(value), - .. - }) = self.current - else { - unreachable!() - }; - self.step(); - Expr::Id(value) - } - Some(TokenKind::Int) => { - let Some(Token { - value: TokenValue::Int(value), - .. - }) = self.current - else { - unreachable!() - }; - self.step(); - Expr::Int(value) - } - Some(TokenKind::String) => { - self.step(); - let Some(Token { - value: TokenValue::String(value), - .. - }) = self.current.clone() - else { - unreachable!() - }; - Expr::String(value.clone()) - } - _ => { - self.step(); - self.error("expected value"); - Expr::Error - } - } - } - - fn error>(&mut self, msg: S) { - let msg = msg.into(); - println!( - "parser error: {msg}, line {}", - self.current - .as_ref() - .map(|t| t.line.to_string()) - .unwrap_or_else(|| "-".to_string()) - ) - } - - fn step(&mut self) { - self.current = self.lexer.next(); - } - - fn curr_kind(&self) -> Option { - self.current.as_ref().map(|t| t.kind.clone()) - } -} - -#[test] -fn test_parser() { - use Expr::*; - assert_eq!(Parser::new("abc;").parse(), vec![Id(0)]); - assert_eq!(Parser::new("123;").parse(), vec![Int(123)]); - assert_eq!(Parser::new("0;").parse(), vec![Int(0)]); - assert_eq!(Parser::new("0;abc;").parse(), vec![Int(0), Id(0)]); - assert_eq!( - Parser::new("add(mul(12, 34), 56);").parse(), - vec![Call { - subject: Box::new(Id(0)), - args: vec![ - Call { - subject: Box::new(Id(1)), - args: vec![Int(12), Int(34)] - }, - Int(56) - ] - }] - ); - assert_eq!(Parser::new("break;").parse(), vec![Break]); - assert_eq!(Parser::new("return;").parse(), vec![Return { value: None }]); - assert_eq!( - Parser::new("return add(1, 2);").parse(), - vec![Return { - value: Some(Box::new(Call { - subject: Box::new(Id(0)), - args: vec![Int(1), Int(2)] - })) - }] - ); -} - -#[derive(Clone, PartialEq, Debug)] -struct Token { - kind: TokenKind, - value: TokenValue, - index: usize, - line: i32, - col: i32, -} - -#[derive(Clone, PartialEq, Debug)] -enum TokenKind { - Error, - Id, - Int, - String, - If, - Loop, - Break, - Let, - Fn, - Return, - LParen, - RParen, - LBrace, - RBrace, - Comma, - Semicolon, - Equal, -} - -#[derive(Clone, PartialEq, Debug)] -enum TokenValue { - None, - Id(u64), - Int(i64), - String(String), -} - -struct Lexer<'a> { - text: &'a str, - chars: Chars<'a>, - current: Option, - index: usize, - line: i32, - col: i32, - symbol_counter: u64, - symbols: HashMap, - keywords: HashMap, -} - -impl<'a> Lexer<'a> { - pub fn new(text: &'a str) -> Self { - let mut chars = text.chars(); - let current = chars.next(); - Self { - text, - chars, - current, - index: 0, - line: 1, - col: 1, - symbol_counter: 0, - symbols: HashMap::new(), - keywords: Self::make_keywords(), - } - } - - fn make_keywords() -> HashMap { - let mut keywords = HashMap::new(); - keywords.insert("if".to_string(), TokenKind::If); - keywords.insert("loop".to_string(), TokenKind::Loop); - keywords.insert("break".to_string(), TokenKind::Break); - keywords.insert("let".to_string(), TokenKind::Let); - keywords.insert("fn".to_string(), TokenKind::Fn); - keywords.insert("return".to_string(), TokenKind::Return); - keywords - } - - fn next_token(&mut self) -> Option { - match self.current { - None => None, - Some(' ' | '\t' | '\n') => { - self.step(); - self.next_token() - } - Some(ch @ ('a'..='z' | 'A'..='Z' | '_')) => { - let mut value = String::new(); - value.push(ch); - self.step(); - loop { - match self.current { - Some(ch @ ('a'..='z' | 'A'..='Z' | '0'..='9' | '_')) => { - value.push(ch); - self.step(); - } - _ => { - if let Some(kind) = self.keywords.get(&value) { - return self.token(kind.clone()); - } - let id = self.symbol_counter; - self.symbol_counter += 1; - self.symbols.insert(value, id); - break self.token_with_value(TokenKind::Id, TokenValue::Id(id)); - } - } - } - } - Some(ch @ ('1'..='9')) => { - let mut value = String::new(); - value.push(ch); - self.step(); - loop { - match self.current { - Some(ch @ ('0'..='9' | '_')) => { - value.push(ch); - self.step(); - } - _ => { - let value = value.replace('_', "").parse::().unwrap(); - break self.token_with_value(TokenKind::Int, TokenValue::Int(value)); - } - } - } - } - Some('/') => { - self.step(); - match self.current { - Some('/') => { - self.step(); - loop { - match self.current { - None | Some('\n') => break self.next_token(), - _ => { - self.step(); - } - } - } - } - Some('*') => { - self.step(); - let mut lch = self.current; - loop { - match (lch, self.current) { - (Some('*'), Some('/')) => break self.next_token(), - (_, Some(ch)) => { - lch = Some(ch); - self.step(); - } - (_, None) => { - self.error("malformed /**/ comment"); - break self.token(TokenKind::Error); - } - } - } - } - _ => todo!(), - } - } - Some(ch @ ('0' | '(' | ')' | '{' | '}' | ',' | ';' | '=')) => { - self.step(); - match ch { - '0' => self.token_with_value(TokenKind::Int, TokenValue::Int(0)), - '(' => self.token(TokenKind::LParen), - ')' => self.token(TokenKind::RParen), - '{' => self.token(TokenKind::LBrace), - '}' => self.token(TokenKind::RBrace), - ',' => self.token(TokenKind::Comma), - ';' => self.token(TokenKind::Semicolon), - '=' => self.token(TokenKind::Equal), - _ => unreachable!(), - } - } - _ => todo!(), - } - } - - fn step(&mut self) { - match self.current { - Some('\n') => { - self.line += 1; - self.col = 1; - } - Some(_) => { - self.col += 1; - } - _ => {} - } - self.current = self.chars.next(); - if self.current.is_some() { - self.index += 1; - } - } - - fn token(&self, kind: TokenKind) -> Option { - Some(Token { - kind, - value: TokenValue::None, - index: self.index, - line: self.line, - col: self.col, - }) - } - - fn token_with_value(&self, kind: TokenKind, value: TokenValue) -> Option { - Some(Token { - kind, - value, - index: self.index, - line: self.line, - col: self.col, - }) - } - - fn error>(&mut self, msg: S) { - let msg = msg.into(); - println!("lexer error: {msg}, line {}", self.line) - } - - fn done(&self) -> bool { - self.current.is_none() - } -} - -impl Iterator for Lexer<'_> { - type Item = Token; - - fn next(&mut self) -> Option { - self.next_token() - } -} - -#[test] -fn test_lexer() { - assert_eq!( - Lexer::new("123").collect::>(), - vec![Token { - kind: TokenKind::Int, - value: TokenValue::Int(123), - index: 2, - line: 1, - col: 4 - }] - ); -} +mod checked; +mod checker; +mod hash; +mod itertewls; +mod lexer; +mod parsed; +mod parser; +mod sym; +mod token; fn main() { println!("Hello, world!"); diff --git a/src/parsed.rs b/src/parsed.rs new file mode 100644 index 0000000..7074005 --- /dev/null +++ b/src/parsed.rs @@ -0,0 +1,43 @@ +#[derive(Clone, PartialEq, Debug)] +pub enum Node { + Error, + Id(u64), + Int(i64), + String(String), + Group(Box), + Block(Vec), + Call { + subject: Box, + args: Vec, + }, + If { + cond: Box, + truthy: Box, + falsy: Option>, + }, + Loop { + body: Box, + }, + Break, + Assign { + subject: Box, + value: Box, + }, + Let { + subject: Box, + value: Box, + }, + Fn { + subject: Box, + params: Vec, + return_typ: Box, + body: Box, + }, + Return { + value: Option>, + }, + Param { + subject: Box, + typ: Option>, + }, +} diff --git a/src/parser.rs b/src/parser.rs new file mode 100644 index 0000000..82bd0a1 --- /dev/null +++ b/src/parser.rs @@ -0,0 +1,478 @@ +use std::collections::HashMap; + +use crate::{ + lexer::Lexer, + parsed::Node, + token::{Token, TokenKind, TokenValue}, +}; + +pub struct Parser<'a> { + lexer: Lexer<'a>, + current: Option, +} + +impl<'a> Parser<'a> { + pub fn new(text: &'a str) -> Self { + let mut lexer = Lexer::new(text); + let current = lexer.next(); + Self { lexer, current } + } + + pub fn parse(&mut self) -> Vec { + self.parse_file() + } + + pub fn symbols(self) -> HashMap { + self.lexer.symbols() + } + + fn parse_file(&mut self) -> Vec { + let mut stmts = Vec::new(); + loop { + match self.current { + Some(_) => stmts.push(self.parse_stmt()), + None => break stmts, + } + } + } + + fn parse_stmt(&mut self) -> Node { + match self.curr_kind() { + Some(TokenKind::LBrace) => self.parse_block(), + Some(TokenKind::If) => self.parse_if(), + Some(TokenKind::Loop) => self.parse_loop(), + Some(TokenKind::Fn) => self.parse_fn(), + _ => { + let stmt = match self.curr_kind() { + Some(TokenKind::Let) => self.parse_let(), + Some(TokenKind::Break) => { + self.step(); + Node::Break + } + Some(TokenKind::Return) => { + self.step(); + let value = match self.curr_kind() { + Some(TokenKind::Semicolon) => None, + _ => Some(Box::new(self.parse_expr())), + }; + Node::Return { value } + } + _ => self.parse_assign(), + }; + match self.curr_kind() { + Some(TokenKind::Semicolon) => { + self.step(); + stmt + } + _ => { + self.error("expected ';'"); + Node::Error + } + } + } + } + } + + fn parse_fn(&mut self) -> Node { + self.step(); + if !self.curr_is(TokenKind::Id) { + self.error("expected id"); + return Node::Error; + } + let subject = Box::new(self.parse_id()); + if !self.curr_is(TokenKind::LParen) { + self.error("expected '('"); + return Node::Error; + } + let params = match self.parse_fn_params() { + Ok(params) => params, + Err(expr) => return expr, + }; + if !self.curr_is(TokenKind::MinusLt) { + self.error("expected '->'"); + return Node::Error; + } + self.step(); + let return_typ = Box::new(self.parse_typ()); + if !self.curr_is(TokenKind::LBrace) { + self.error("expected '{'"); + return Node::Error; + } + let body = Box::new(self.parse_block()); + Node::Fn { + subject, + params, + return_typ, + body, + } + } + + fn parse_fn_params(&mut self) -> Result, Node> { + self.step(); + let mut params = Vec::new(); + if !self.curr_is(TokenKind::RParen) { + if !self.curr_is(TokenKind::RParen) { + self.error("expected ')'"); + return Err(Node::Error); + } + if !self.curr_is(TokenKind::Id) { + self.error("expected id"); + return Err(Node::Error); + } + params.push(self.parse_param()); + while let Some(TokenKind::Comma) = self.curr_kind() { + self.step(); + if self.curr_is(TokenKind::RParen) { + self.error("expected ')'"); + break; + } + params.push(self.parse_param()); + } + } + if !self.curr_is(TokenKind::RParen) { + self.error("expected ')'"); + return Err(Node::Error); + } + self.step(); + Ok(params) + } + + fn parse_let(&mut self) -> Node { + self.step(); + if !self.curr_is(TokenKind::Id) { + self.error("expected id"); + return Node::Error; + } + let subject = self.parse_param(); + if !self.curr_is(TokenKind::Equal) { + self.error("expected '='"); + return Node::Error; + } + self.step(); + let value = self.parse_expr(); + Node::Let { + subject: Box::new(subject), + value: Box::new(value), + } + } + + fn parse_param(&mut self) -> Node { + let subject = Box::new(self.parse_id()); + let typ = if let Some(TokenKind::Comma) = self.curr_kind() { + self.step(); + Some(Box::new(self.parse_typ())) + } else { + None + }; + Node::Param { subject, typ } + } + + fn parse_typ(&mut self) -> Node { + match self.curr_kind() { + Some(TokenKind::Id) => self.parse_id(), + _ => { + self.error("expected type"); + self.step(); + Node::Error + } + } + } + + fn parse_assign(&mut self) -> Node { + let subject = self.parse_expr(); + match self.curr_kind() { + Some(TokenKind::Equal) => { + self.step(); + let value = self.parse_expr(); + Node::Assign { + subject: Box::new(subject), + value: Box::new(value), + } + } + _ => subject, + } + } + + fn parse_expr(&mut self) -> Node { + self.parse_call() + } + + fn parse_call(&mut self) -> Node { + let mut subject = self.parse_value(); + loop { + match self.curr_kind() { + Some(TokenKind::LParen) => { + self.step(); + let mut args = Vec::new(); + match self.curr_kind() { + None | Some(TokenKind::LParen) => {} + Some(_) => { + args.push(self.parse_expr()); + while let Some(TokenKind::Comma) = self.curr_kind() { + self.step(); + if let Some(TokenKind::RParen) = self.curr_kind() { + break; + } + args.push(self.parse_expr()); + } + } + } + match self.curr_kind() { + Some(TokenKind::RParen) => {} + _ => { + self.error("expected ')'"); + return Node::Error; + } + } + self.step(); + subject = Node::Call { + subject: Box::new(subject), + args, + }; + } + _ => break subject, + } + } + } + + fn parse_value(&mut self) -> Node { + match self.curr_kind() { + Some(TokenKind::Id) => self.parse_id(), + Some(TokenKind::Int) => self.parse_int(), + Some(TokenKind::String) => self.parse_string(), + Some(TokenKind::LParen) => self.parse_group(), + Some(TokenKind::LBrace) => self.parse_block(), + Some(TokenKind::If) => self.parse_if(), + Some(TokenKind::Loop) => self.parse_loop(), + _ => { + self.error("expected value"); + self.step(); + Node::Error + } + } + } + + fn parse_id(&mut self) -> Node { + let Some(Token { + kind: TokenKind::Id, + value: TokenValue::Id(value), + .. + }) = self.current + else { + unreachable!() + }; + self.step(); + Node::Id(value) + } + + fn parse_int(&mut self) -> Node { + let Some(Token { + kind: TokenKind::Int, + value: TokenValue::Int(value), + .. + }) = self.current + else { + unreachable!() + }; + self.step(); + Node::Int(value) + } + + fn parse_string(&mut self) -> Node { + let Some(Token { + kind: TokenKind::String, + value: TokenValue::String(value), + .. + }) = self.current.clone() + else { + unreachable!() + }; + self.step(); + Node::String(value.clone()) + } + + fn parse_group(&mut self) -> Node { + self.step(); + let expr = Box::new(self.parse_expr()); + if !self.curr_is(TokenKind::RParen) { + self.error("expected ')'"); + return Node::Error; + } + self.step(); + Node::Group(expr) + } + + fn parse_block(&mut self) -> Node { + self.step(); + let mut stmts = Vec::new(); + loop { + match self.curr_kind() { + Some(TokenKind::RBrace) => { + self.step(); + break Node::Block(stmts); + } + _ => stmts.push(self.parse_stmt()), + } + } + } + + fn parse_if(&mut self) -> Node { + self.step(); + let cond = Box::new(self.parse_expr()); + if !self.curr_is(TokenKind::LBrace) { + self.error("expected '}'"); + return Node::Error; + } + let truthy = Box::new(self.parse_block()); + let falsy = match self.curr_kind() { + Some(TokenKind::Else) => { + self.step(); + if !self.curr_is(TokenKind::LBrace) { + self.error("expected '}'"); + return Node::Error; + } + Some(Box::new(self.parse_block())) + } + _ => None, + }; + Node::If { + cond, + truthy, + falsy, + } + } + + fn parse_loop(&mut self) -> Node { + self.step(); + if !self.curr_is(TokenKind::LBrace) { + self.error("expected '}'"); + return Node::Error; + } + let body = Box::new(self.parse_block()); + Node::Loop { body } + } + + fn error>(&mut self, msg: S) { + let msg = msg.into(); + println!( + "parser error: {msg}, line {}", + self.current + .as_ref() + .map(|t| t.line.to_string()) + .unwrap_or_else(|| "-".to_string()) + ) + } + + fn step(&mut self) { + self.current = self.lexer.next(); + } + + fn curr_is(&self, kind: TokenKind) -> bool { + self.curr_kind() == Some(kind) + } + + fn curr_kind(&self) -> Option { + self.current.as_ref().map(|t| t.kind.clone()) + } +} + +#[test] +fn test_parser() { + use crate::hash::hash; + use Node::*; + + let parse = |text| Parser::new(text).parse(); + + #[allow(non_snake_case)] + fn B(v: T) -> Box { + Box::new(v) + } + + assert_eq!(Parser::new("abc;").parse(), vec![Id(hash("abc"))]); + assert_eq!(Parser::new("123;").parse(), vec![Int(123)]); + assert_eq!( + Parser::new("\"hello\";").parse(), + vec![String("hello".to_string())] + ); + assert_eq!(Parser::new("0;").parse(), vec![Int(0)]); + assert_eq!(Parser::new("0;abc;").parse(), vec![Int(0), Id(hash("abc"))]); + assert_eq!( + parse("add(mul(12, 34), 56);"), + vec![Call { + subject: B(Id(hash("add"))), + args: vec![ + Call { + subject: B(Id(hash("mul"))), + args: vec![Int(12), Int(34)] + }, + Int(56) + ] + }] + ); + assert_eq!( + parse("a = 123;"), + vec![Assign { + subject: B(Id(hash("a"))), + value: B(Int(123)) + }] + ); + assert_eq!(parse("break;"), vec![Break]); + assert_eq!(parse("return;"), vec![Return { value: None }]); + assert_eq!( + parse("return add(1, 2);"), + vec![Return { + value: Some(B(Call { + subject: B(Id(hash("add"))), + args: vec![Int(1), Int(2)] + })) + }] + ); + assert_eq!( + parse("a = 5;"), + vec![Assign { + subject: B(Id(hash("a"))), + value: B(Int(5)) + }] + ); + assert_eq!( + parse("let a = 5;"), + vec![Let { + subject: B(Param { + subject: B(Id(hash("a"))), + typ: None + }), + value: B(Int(5)) + }] + ); + assert_eq!( + parse("fn test() -> i32 {}"), + vec![Fn { + subject: B(Id(hash("test"))), + params: vec![], + return_typ: B(Id(hash("i32"))), + body: B(Block(vec![])) + }] + ); + assert_eq!( + parse("if 0 {}"), + vec![If { + cond: B(Int(0)), + truthy: B(Block(vec![])), + falsy: None + }] + ); + assert_eq!( + parse("if 0 {} else {}"), + vec![If { + cond: B(Int(0)), + truthy: B(Block(vec![])), + falsy: Some(B(Block(vec![]))), + }] + ); + assert_eq!( + parse("loop {}"), + vec![Loop { + body: B(Block(vec![])), + }] + ); +} diff --git a/src/sym.rs b/src/sym.rs new file mode 100644 index 0000000..5b73808 --- /dev/null +++ b/src/sym.rs @@ -0,0 +1,49 @@ +#![allow(unused_variables)] +use std::collections::HashMap; + +use crate::checked::Type; + +#[derive(Clone, PartialEq, Debug)] +pub struct Sym { + pub id: u64, + pub typ: Type, +} + +pub struct Syms<'syms> { + parent: Option<&'syms Syms<'syms>>, + map: HashMap, +} + +impl<'syms> Syms<'syms> { + pub fn new() -> Self { + Self { + parent: None, + map: HashMap::new(), + } + } + + pub fn child(&'syms self) -> Self { + Self { + parent: Some(self), + map: HashMap::new(), + } + } + + pub fn get(&self, id: u64) -> Option { + if let Some(sym) = self.map.get(&id) { + return Some(sym.clone()); + } + if let Some(parent) = self.parent { + return parent.get(id); + } + None + } + + pub fn defined_locally(&self, id: u64) -> bool { + self.map.contains_key(&id) + } + + pub fn define(&mut self, id: u64, typ: Type) { + self.map.insert(id, Sym { id, typ }); + } +} diff --git a/src/token.rs b/src/token.rs new file mode 100644 index 0000000..a84f7c2 --- /dev/null +++ b/src/token.rs @@ -0,0 +1,40 @@ +#[derive(Clone, PartialEq, Debug)] +pub struct Token { + pub kind: TokenKind, + pub value: TokenValue, + pub index: usize, + pub line: i32, + pub col: i32, +} + +#[derive(Clone, PartialEq, Debug)] +pub enum TokenKind { + Error, + Id, + Int, + String, + If, + Else, + Loop, + Break, + Let, + Fn, + Return, + LParen, + RParen, + LBrace, + RBrace, + Colon, + Comma, + Semicolon, + Equal, + MinusLt, +} + +#[derive(Clone, PartialEq, Debug)] +pub enum TokenValue { + None, + Id(u64), + Int(i64), + String(String), +}