From 216dbf2fcea0d6e6cc6d644b68859c2acbfc6a9f Mon Sep 17 00:00:00 2001 From: SimonFJ20 Date: Thu, 4 Apr 2024 02:08:11 +0200 Subject: [PATCH] something interesting, jk --- parser.c | 343 ++++++++++++++++++++++++++++++++++++++++++++++++++++--- parser.h | 38 +++++- 2 files changed, 358 insertions(+), 23 deletions(-) diff --git a/parser.c b/parser.c index 9b7d8bc..89d2ede 100644 --- a/parser.c +++ b/parser.c @@ -11,6 +11,7 @@ void lexer_construct(Lexer* lexer, const char* text, size_t length) .index = 0, .line = 1, .col = 1, + .failed = false, }; } @@ -25,6 +26,36 @@ static inline bool is_id_char(char c) || (c >= '0' && c <= '9') || c == '_'; } +static inline Token skip_comment(Lexer* lexer) +{ + Pos pos = lexer_pos(lexer); + lexer_step(lexer); + if (lexer_current(lexer) == '/') { + while (!lexer_done(lexer) && lexer_current(lexer) != '\n') { + lexer_step(lexer); + } + return lexer_next(lexer); + } else if (lexer_current(lexer) == '*') { + lexer_step(lexer); + char last = '\0'; + while (!lexer_done(lexer) + && !(last == '*' && lexer_current(lexer) == '/')) { + last = lexer_current(lexer); + lexer_step(lexer); + } + if (lexer_done(lexer)) { + lexer->failed = true; + print_error("lexer: malformed multiline comment", pos); + return lexer_token(lexer, TokenType_Error, pos); + } + return lexer_next(lexer); + } else { + lexer->failed = true; + print_error("lexer: malformed comment", pos); + return lexer_token(lexer, TokenType_Error, pos); + } +} + struct MatchIdToTokenTypeCase { const char* keyword; TokenType token_type; @@ -41,6 +72,49 @@ static inline TokenType match_id_to_token_type( return TokenType_Id; } +static inline Token lex_id_or_keyword(Lexer* lexer) +{ + Pos pos = lexer_pos(lexer); + lexer_step(lexer); + while (!lexer_done(lexer) && is_id_char(lexer_current(lexer))) { + lexer_step(lexer); + } + size_t length = lexer->index - pos.index; + TokenType token_type + = match_id_to_token_type(&lexer->text[pos.index], length, + (struct MatchIdToTokenTypeCase[]) { + { 
"not", TokenType_Not }, + { "and", TokenType_And }, + { "or", TokenType_Or }, + { "if", TokenType_If }, + { "loop", TokenType_Loop }, + { "fn", TokenType_Fn }, + { "return", TokenType_Return }, + { "break", TokenType_Break }, + { NULL, TokenType_Id }, + }); + return lexer_token(lexer, token_type, pos); +} + +Token lex_single_char(Lexer* lexer, TokenType token_type) +{ + Pos pos = lexer_pos(lexer); + lexer_step(lexer); + return lexer_token(lexer, token_type, pos); +} + +Token lex_single_or_double_char( + Lexer* lexer, TokenType first, char c2, TokenType second) +{ + Pos pos = lexer_pos(lexer); + lexer_step(lexer); + if (lexer_done(lexer) || lexer_current(lexer) != c2) { + return lexer_token(lexer, first, pos); + } + lexer_step(lexer); + return lexer_token(lexer, second, pos); +} + Token lexer_next(Lexer* lexer) { Pos pos = lexer_pos(lexer); @@ -52,36 +126,70 @@ Token lexer_next(Lexer* lexer) lexer_step(lexer); return lexer_next(lexer); } + if (c == '/') { + return skip_comment(lexer); + } if (is_id_start_char(c)) { - lexer_step(lexer); - while (is_id_char(c)) { - lexer_step(lexer); - } - size_t length = lexer->index - pos.index; - TokenType token_type - = match_id_to_token_type(&lexer->text[pos.index], length, - (struct MatchIdToTokenTypeCase[]) { - { "not", TokenType_Not }, - { "and", TokenType_And }, - { "or", TokenType_Or }, - { "loop", TokenType_Loop }, - { "fn", TokenType_Fn }, - { "return", TokenType_Return }, - { "break", TokenType_Break }, - { NULL, TokenType_Id }, - }); - return lexer_token(lexer, token_type, pos); + return lex_id_or_keyword(lexer); } if (c >= '1' && c <= '9') { lexer_step(lexer); - while (c >= '1' && c <= '9') { + while (!lexer_done(lexer) && c >= '1' && c <= '9') { lexer_step(lexer); } return lexer_token(lexer, TokenType_Int, pos); } + switch (c) { + case '0': + return lex_single_char(lexer, TokenType_Int); + case '(': + return lex_single_char(lexer, TokenType_LParen); + case ')': + return lex_single_char(lexer, TokenType_RParen); + case 
'{': + return lex_single_char(lexer, TokenType_LBrace); + case '}': + return lex_single_char(lexer, TokenType_RBrace); + case '[': + return lex_single_char(lexer, TokenType_LBracket); + case ']': + return lex_single_char(lexer, TokenType_RBracket); + case ',': + return lex_single_char(lexer, TokenType_Comma); + case ';': + return lex_single_char(lexer, TokenType_Semicolon); + case '+': + return lex_single_or_double_char( + lexer, TokenType_Plus, '=', TokenType_PlusEqual); + case '-': + return lex_single_or_double_char( + lexer, TokenType_Minus, '=', TokenType_MinusEqual); + case '*': + return lex_single_or_double_char( + lexer, TokenType_Asterisk, '=', TokenType_AsteriskEqual); + case '=': + return lex_single_or_double_char( + lexer, TokenType_Equal, '=', TokenType_EqualEqual); + case '!': + return lex_single_or_double_char( + lexer, TokenType_Exclamation, '=', TokenType_ExclamationEqual); + case '<': + return lex_single_or_double_char( + lexer, TokenType_LT, '=', TokenType_LTEqual); + case '>': + return lex_single_or_double_char( + lexer, TokenType_GT, '=', TokenType_GTEqual); + case '|': + return lex_single_or_double_char( + lexer, TokenType_Pipe, '>', TokenType_PipeGT); + } + lexer->failed = true; + print_error("lexer: unrecognized character", pos); return lexer_token(lexer, TokenType_Error, pos); } +bool lexer_failed(const Lexer* lexer) { return lexer->failed; } + Token lexer_token(Lexer* lexer, TokenType token_type, Pos pos) { return (Token) { @@ -156,3 +264,200 @@ int ast_node_vec_push(ASTNodeVec* vec, ASTNode* item) vec->length += 1; return 0; } + +ASTNode* ast_node_new(ASTNodeType node_type, Pos pos, ASTNode spec_init) +{ + ASTNode* node = malloc(sizeof(ASTNode)); + if (node == NULL) { + return NULL; + } + *node = spec_init; + node->node_type = node_type; + node->pos = pos; + return node; +} + +void ast_node_free(ASTNode* node) +{ + if (node == NULL) { + return; + } + switch (node->node_type) { + case ASTNodeType_Error: + break; + case ASTNodeType_Id: + if 
(node->id_value != NULL) { + free(node->id_value); + } + break; + case ASTNodeType_Int: + break; + case ASTNodeType_Block: + for (size_t i = 0; i < node->statements.length; ++i) { + ast_node_free(node->statements.data[i]); + } + ast_node_vec_destroy(&node->statements); + break; + case ASTNodeType_If: + ast_node_free(node->if_node.condition); + ast_node_free(node->if_node.truthy); + ast_node_free(node->if_node.falsy); + break; + case ASTNodeType_Loop: + ast_node_free(node->loop_node.body); + break; + case ASTNodeType_Call: + ast_node_free(node->call_node.subject); + for (size_t i = 0; i < node->call_node.args.length; ++i) { + ast_node_free(node->call_node.args.data[i]); + } + ast_node_vec_destroy(&node->call_node.args); + break; + case ASTNodeType_Index: + ast_node_free(node->index_node.subject); + ast_node_free(node->index_node.value); + break; + case ASTNodeType_Unary: + ast_node_free(node->unary_node.subject); + break; + case ASTNodeType_Binary: + ast_node_free(node->binary_node.left); + ast_node_free(node->binary_node.right); + break; + case ASTNodeType_Assign: + ast_node_free(node->assign_node.subject); + ast_node_free(node->assign_node.value); + break; + case ASTNodeType_Let: + if (node->let_node.id != NULL) { + free(node->let_node.id); + } + ast_node_free(node->let_node.value); + break; + case ASTNodeType_Break: + break; + case ASTNodeType_Fn: + if (node->fn_node.id != NULL) { + free(node->fn_node.id); + } + for (size_t i = 0; i < node->fn_node.params.length; ++i) { + ast_node_free(node->fn_node.params.data[i]); + } + ast_node_vec_destroy(&node->fn_node.params); + ast_node_free(node->fn_node.body); + break; + } + free(node); +} + +void parser_construct(Parser* parser, const char* text, size_t text_length) +{ + *parser = (Parser) { + .text = text, + .text_length = text_length, + .lexer = { 0 }, + .current = { 0 }, + .failed = false, + }; + lexer_construct(&parser->lexer, text, text_length); + parser->current = lexer_next(&parser->lexer); +} + +bool 
parser_failed(const Parser* parser) { return parser->failed; } + +void parser_step(Parser* parser) +{ + parser->current = lexer_next(&parser->lexer); +} + +bool parser_done(const Parser* parser) +{ + return parser->current.token_type == TokenType_EOF; +} + +ASTNode* parser_parse(Parser* parser) { return parser_parse_expr(parser); } + +ASTNode* parser_parse_expr(Parser* parser) +{ + return parser_parse_operand(parser); +} + +ASTNode* parser_parse_operand(Parser* parser) +{ + Pos pos = parser->current.pos; + switch (parser->current.token_type) { + case TokenType_Error: + return ast_node_new(ASTNodeType_Error, pos, (ASTNode) { 0 }); + case TokenType_Id: + return parser_parse_id(parser); + case TokenType_Int: + return parser_parse_int(parser); + case TokenType_LParen: + case TokenType_LBrace: + case TokenType_If: + case TokenType_Loop: + default: + parser->failed = true; + print_error("expected operand", pos); + return ast_node_new(ASTNodeType_Error, pos, (ASTNode) { 0 }); + break; + } +} + +ASTNode* parser_parse_id(Parser* parser) +{ + Pos pos = parser->current.pos; + char* value = malloc(parser->current.length + 1); + value[parser->current.length] = '\0'; + strncpy(value, &parser->text[parser->current.pos.index], + parser->current.length); + parser_step(parser); + return ast_node_new(ASTNodeType_Id, pos, (ASTNode) { .id_value = value }); +} + +ASTNode* parser_parse_int(Parser* parser) +{ + Pos pos = parser->current.pos; + int value = (int)strtol(&parser->text[parser->current.pos.index], NULL, 10); + parser_step(parser); + return ast_node_new(ASTNodeType_Int, pos, (ASTNode) { .int_value = value }); +} + +ASTNode* parser_parse_group(Parser* parser) +{ + Pos pos = parser->current.pos; + parser_step(parser); + ASTNode* expr = parser_parse_expr(parser); + if (parser->current.token_type != TokenType_RParen) { + parser->failed = true; + print_error("parser: expected ')'", pos); + return ast_node_new(ASTNodeType_Error, pos, (ASTNode) { 0 }); + } + parser_step(parser); + return 
expr; +} + +ASTNode* parser_parse_block(Parser* parser) +{ + Pos pos = parser->current.pos; + parser_step(parser); + ASTNodeVec statements; + ast_node_vec_construct(&statements); + while (!parser_done(parser) + && parser->current.token_type != TokenType_RBrace) { + ASTNode* statement = parser_parse_statement(parser); + ast_node_vec_push(&statements, statement); + } + if (parser->current.token_type != TokenType_RBrace) { + parser->failed = true; + print_error("parser: expected '}'", pos); + return ast_node_new(ASTNodeType_Error, pos, (ASTNode) { 0 }); + } + parser_step(parser); + return ast_node_new( + ASTNodeType_Block, pos, (ASTNode) { .statements = statements }); +} + +ASTNode* parser_parse_if(Parser* parser); + +ASTNode* parser_parse_loop(Parser* parser); diff --git a/parser.h b/parser.h index 8d5594b..955cef0 100644 --- a/parser.h +++ b/parser.h @@ -10,6 +10,8 @@ typedef struct { int col; } Pos; +void print_error(const char* message, Pos pos); + typedef enum { TokenType_Error, TokenType_EOF, @@ -18,6 +20,7 @@ typedef enum { TokenType_Not, TokenType_And, TokenType_Or, + TokenType_If, TokenType_Loop, TokenType_Fn, TokenType_Return, @@ -36,13 +39,14 @@ typedef enum { TokenType_MinusEqual, TokenType_Asterisk, TokenType_AsteriskEqual, + TokenType_Equal, TokenType_EqualEqual, TokenType_Exclamation, TokenType_ExclamationEqual, TokenType_LT, - TokenType_LTE, + TokenType_LTEqual, TokenType_GT, - TokenType_GTE, + TokenType_GTEqual, TokenType_Pipe, TokenType_PipeGT, } TokenType; @@ -59,10 +63,12 @@ typedef struct { size_t index; int line; int col; + bool failed; } Lexer; void lexer_construct(Lexer* lexer, const char* text, size_t text_length); Token lexer_next(Lexer* lexer); +bool lexer_failed(const Lexer* lexer); Token lexer_token(Lexer* lexer, TokenType token_type, Pos pos); void lexer_step(Lexer* lexer); bool lexer_done(const Lexer* lexer); @@ -174,7 +180,7 @@ typedef struct { struct ASTNode { ASTNodeType node_type; - int line; + Pos pos; union { char* id_value; int 
int_value; @@ -191,6 +197,30 @@ struct ASTNode { }; }; -void ast_node_destroy(ASTNode* node); +ASTNode* ast_node_new(ASTNodeType node_type, Pos pos, ASTNode spec_init); +void ast_node_free(ASTNode* node); + +typedef struct { + const char* text; + size_t text_length; + Lexer lexer; + Token current; + bool failed; +} Parser; + +void parser_construct(Parser* parser, const char* text, size_t text_length); +bool parser_failed(const Parser* parser); +void parser_step(Parser* parser); +bool parser_done(const Parser* parser); +ASTNode* parser_parse(Parser* parser); +ASTNode* parser_parse_statement(Parser* parser); +ASTNode* parser_parse_expr(Parser* parser); +ASTNode* parser_parse_operand(Parser* parser); +ASTNode* parser_parse_id(Parser* parser); +ASTNode* parser_parse_int(Parser* parser); +ASTNode* parser_parse_group(Parser* parser); +ASTNode* parser_parse_block(Parser* parser); +ASTNode* parser_parse_if(Parser* parser); +ASTNode* parser_parse_loop(Parser* parser); #endif