#include "parser.h"

#include <stdbool.h>
#include <stdlib.h>
#include <string.h>

void lexer_construct(Lexer* lexer, const char* text, size_t length)
{
    *lexer = (Lexer) {
        .text = text,
        .text_length = length,
        .index = 0,
        .line = 1,
        .col = 1,
        .failed = false,
    };
}

/* True for characters that may begin an identifier. */
static inline bool is_id_start_char(char c)
{
    return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c == '_';
}

/* True for characters that may continue an identifier. */
static inline bool is_id_char(char c)
{
    return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z')
        || (c >= '0' && c <= '9') || c == '_';
}

/* Skips a "//" line comment or a slash-star block comment starting at the
 * current '/', then returns the next real token. A lone '/' is an error
 * (this lexer defines no division token). */
static inline Token skip_comment(Lexer* lexer)
{
    Pos pos = lexer_pos(lexer);
    lexer_step(lexer);
    if (lexer_current(lexer) == '/') {
        while (!lexer_done(lexer) && lexer_current(lexer) != '\n') {
            lexer_step(lexer);
        }
        return lexer_next(lexer);
    } else if (lexer_current(lexer) == '*') {
        lexer_step(lexer);
        char last = '\0';
        while (!lexer_done(lexer)
            && !(last == '*' && lexer_current(lexer) == '/')) {
            last = lexer_current(lexer);
            lexer_step(lexer);
        }
        if (lexer_done(lexer)) {
            lexer->failed = true;
            print_error("lexer: malformed multiline comment", pos);
            return lexer_token(lexer, TokenType_Error, pos);
        }
        /* Consume the terminating '/'; the loop above stops *on* it, and
         * leaving it unconsumed made lexer_next re-enter skip_comment and
         * misreport every valid block comment as malformed. */
        lexer_step(lexer);
        return lexer_next(lexer);
    } else {
        lexer->failed = true;
        print_error("lexer: malformed comment", pos);
        return lexer_token(lexer, TokenType_Error, pos);
    }
}

struct MatchIdToTokenTypeCase {
    const char* keyword;
    TokenType token_type;
};

/* Maps the identifier lexeme source[0..length) to a keyword token type, or
 * TokenType_Id if it matches no keyword. The cases array is terminated by a
 * NULL keyword. The keyword must match the whole lexeme: checking only
 * strncmp(source, keyword, length) accepted any identifier that is a prefix
 * of a keyword (e.g. "re" would lex as TokenType_Return). */
static inline TokenType match_id_to_token_type(
    const char* source, size_t length, struct MatchIdToTokenTypeCase cases[])
{
    for (size_t i = 0; cases[i].keyword != NULL; ++i) {
        if (strncmp(source, cases[i].keyword, length) == 0
            && cases[i].keyword[length] == '\0') {
            return cases[i].token_type;
        }
    }
    return TokenType_Id;
}

/* Lexes an identifier starting at the current char, then classifies it as a
 * keyword or a plain id. */
static inline Token lex_id_or_keyword(Lexer* lexer)
{
    Pos pos = lexer_pos(lexer);
    lexer_step(lexer);
    while (!lexer_done(lexer) && is_id_char(lexer_current(lexer))) {
        lexer_step(lexer);
    }
    size_t length = lexer->index - pos.index;
    TokenType token_type = match_id_to_token_type(&lexer->text[pos.index],
        length,
        (struct MatchIdToTokenTypeCase[]) {
            { "not", TokenType_Not },
            { "and", TokenType_And },
            { "or", TokenType_Or },
            { "if", TokenType_If },
            { "loop", TokenType_Loop },
            { "fn", TokenType_Fn },
            { "return", TokenType_Return },
            { "break", TokenType_Break },
            { NULL, TokenType_Id },
        });
    return lexer_token(lexer, token_type, pos);
}

/* Consumes exactly one char and emits token_type. */
Token lex_single_char(Lexer* lexer, TokenType token_type)
{
    Pos pos = lexer_pos(lexer);
    lexer_step(lexer);
    return lexer_token(lexer, token_type, pos);
}

/* Consumes one char and emits `first`, unless the next char is c2, in which
 * case both are consumed and `second` is emitted (e.g. '=' vs "=="). */
Token lex_single_or_double_char(
    Lexer* lexer, TokenType first, char c2, TokenType second)
{
    Pos pos = lexer_pos(lexer);
    lexer_step(lexer);
    if (lexer_done(lexer) || lexer_current(lexer) != c2) {
        return lexer_token(lexer, first, pos);
    }
    lexer_step(lexer);
    return lexer_token(lexer, second, pos);
}

/* Produces the next token, skipping whitespace and comments. Emits
 * TokenType_EOF at end of input and TokenType_Error (with lexer->failed set)
 * on unrecognized input. */
Token lexer_next(Lexer* lexer)
{
    Pos pos = lexer_pos(lexer);
    if (lexer_done(lexer)) {
        return lexer_token(lexer, TokenType_EOF, pos);
    }
    char c = lexer_current(lexer);
    if (c == ' ' || c == '\t' || c == '\n') {
        lexer_step(lexer);
        return lexer_next(lexer);
    }
    if (c == '/') {
        return skip_comment(lexer);
    }
    if (is_id_start_char(c)) {
        return lex_id_or_keyword(lexer);
    }
    if (c >= '1' && c <= '9') {
        lexer_step(lexer);
        /* Continue over following digits, re-reading the *current* char each
         * iteration; the original re-tested the first char `c`, which is
         * always in range, so the loop consumed the rest of the input. '0'
         * is allowed after the first digit (e.g. "10"). */
        while (!lexer_done(lexer) && lexer_current(lexer) >= '0'
            && lexer_current(lexer) <= '9') {
            lexer_step(lexer);
        }
        return lexer_token(lexer, TokenType_Int, pos);
    }
    switch (c) {
        case '0':
            return lex_single_char(lexer, TokenType_Int);
        case '(':
            return lex_single_char(lexer, TokenType_LParen);
        case ')':
            return lex_single_char(lexer, TokenType_RParen);
        case '{':
            return lex_single_char(lexer, TokenType_LBrace);
        case '}':
            return lex_single_char(lexer, TokenType_RBrace);
        case '[':
            return lex_single_char(lexer, TokenType_LBracket);
        case ']':
            return lex_single_char(lexer, TokenType_RBracket);
        case ',':
            return lex_single_char(lexer, TokenType_Comma);
        case ';':
            return lex_single_char(lexer, TokenType_Semicolon);
        case '+':
            return lex_single_or_double_char(
                lexer, TokenType_Plus, '=', TokenType_PlusEqual);
        case '-':
            return lex_single_or_double_char(
                lexer, TokenType_Minus, '=', TokenType_MinusEqual);
        case '*':
            return lex_single_or_double_char(
                lexer, TokenType_Asterisk, '=', TokenType_AsteriskEqual);
        case '=':
            return lex_single_or_double_char(
                lexer, TokenType_Equal, '=', TokenType_EqualEqual);
        case '!':
            return lex_single_or_double_char(
                lexer, TokenType_Exclamation, '=', TokenType_ExclamationEqual);
        case '<':
            return lex_single_or_double_char(
                lexer, TokenType_LT, '=', TokenType_LTEqual);
        case '>':
            return lex_single_or_double_char(
                lexer, TokenType_GT, '=', TokenType_GTEqual);
        case '|':
            return lex_single_or_double_char(
                lexer, TokenType_Pipe, '>', TokenType_PipeGT);
    }
    lexer->failed = true;
    print_error("lexer: unrecognized character", pos);
    return lexer_token(lexer, TokenType_Error, pos);
}

bool lexer_failed(const Lexer* lexer)
{
    return lexer->failed;
}

/* Builds a token spanning from pos to the lexer's current index. */
Token lexer_token(Lexer* lexer, TokenType token_type, Pos pos)
{
    return (Token) {
        .token_type = token_type,
        .pos = pos,
        .length = lexer->index - pos.index,
    };
}

/* Advances one char, maintaining line/col. The newline check must look at
 * the char being stepped *over* (before the index moves): the original
 * checked the char *after* stepping, which put the first char of every line
 * at col 2 instead of col 1. */
void lexer_step(Lexer* lexer)
{
    if (lexer_done(lexer)) {
        return;
    }
    if (lexer_current(lexer) == '\n') {
        lexer->line += 1;
        lexer->col = 1;
    } else {
        lexer->col += 1;
    }
    lexer->index += 1;
}

bool lexer_done(const Lexer* lexer)
{
    return lexer->index >= lexer->text_length;
}

char lexer_current(const Lexer* lexer)
{
    return lexer->text[lexer->index];
}

Pos lexer_pos(const Lexer* lexer)
{
    return (Pos) {
        .index = lexer->index,
        .line = lexer->line,
        .col = lexer->col,
    };
}

/* Initializes an empty vector with a small starting capacity.
 * Returns 0 on success, -1 on allocation failure. */
int ast_node_vec_construct(ASTNodeVec* vec)
{
    const size_t capacity_start = 4;
    *vec = (ASTNodeVec) {
        /* Allocate capacity_start *elements*; the original allocated
         * capacity_start *bytes*, overflowing on the first pushes. */
        .data = malloc(capacity_start * sizeof(ASTNode*)),
        .length = 0,
        .capacity = capacity_start,
    };
    if (vec->data == NULL) {
        return -1;
    }
    return 0;
}

/* Frees the vector's storage (not the pointed-to nodes). */
void ast_node_vec_destroy(ASTNodeVec* vec)
{
    free(vec->data); /* free(NULL) is a no-op */
}

/* Appends item, growing geometrically. Returns 0 on success, -1 on
 * allocation failure (the vector is left unchanged and still valid). */
int ast_node_vec_push(ASTNodeVec* vec, ASTNode* item)
{
    if (vec->length + 1 > vec->capacity) {
        size_t new_capacity = vec->capacity * 2;
        ASTNode** data = realloc(vec->data, new_capacity * sizeof(ASTNode*));
        if (data == NULL) {
            return -1;
        }
        vec->data = data;
        /* Only commit the new capacity after realloc succeeds. */
        vec->capacity = new_capacity;
    }
    vec->data[vec->length] = item;
    vec->length += 1;
    return 0;
}

/* Allocates a node from spec_init, overriding node_type and pos.
 * Returns NULL on allocation failure. Caller owns the node. */
ASTNode* ast_node_new(ASTNodeType node_type, Pos pos, ASTNode spec_init)
{
    ASTNode* node = malloc(sizeof(ASTNode));
    if (node == NULL) {
        return NULL;
    }
    *node = spec_init;
    node->node_type = node_type;
    node->pos = pos;
    return node;
}

/* Recursively frees a node and everything it owns. NULL is allowed. */
void ast_node_free(ASTNode* node)
{
    if (node == NULL) {
        return;
    }
    switch (node->node_type) {
        case ASTNodeType_Error:
            break;
        case ASTNodeType_Id:
            free(node->id_value);
            break;
        case ASTNodeType_Int:
            break;
        case ASTNodeType_Block:
            for (size_t i = 0; i < node->statements.length; ++i) {
                ast_node_free(node->statements.data[i]);
            }
            ast_node_vec_destroy(&node->statements);
            break;
        case ASTNodeType_If:
            ast_node_free(node->if_node.condition);
            ast_node_free(node->if_node.truthy);
            ast_node_free(node->if_node.falsy);
            break;
        case ASTNodeType_Loop:
            ast_node_free(node->loop_node.body);
            break;
        case ASTNodeType_Call:
            ast_node_free(node->call_node.subject);
            for (size_t i = 0; i < node->call_node.args.length; ++i) {
                ast_node_free(node->call_node.args.data[i]);
            }
            ast_node_vec_destroy(&node->call_node.args);
            break;
        case ASTNodeType_Index:
            ast_node_free(node->index_node.subject);
            ast_node_free(node->index_node.value);
            break;
        case ASTNodeType_Unary:
            ast_node_free(node->unary_node.subject);
            break;
        case ASTNodeType_Binary:
            ast_node_free(node->binary_node.left);
            ast_node_free(node->binary_node.right);
            break;
        case ASTNodeType_Assign:
            ast_node_free(node->assign_node.subject);
            ast_node_free(node->assign_node.value);
            break;
        case ASTNodeType_Let:
            free(node->let_node.id);
            ast_node_free(node->let_node.value);
            break;
        case ASTNodeType_Break:
            break;
        case ASTNodeType_Fn:
            free(node->fn_node.id);
            for (size_t i = 0; i < node->fn_node.params.length; ++i) {
                ast_node_free(node->fn_node.params.data[i]);
            }
            ast_node_vec_destroy(&node->fn_node.params);
            ast_node_free(node->fn_node.body);
            break;
    }
    free(node);
}

/* Initializes the parser over text and primes the first token. */
void parser_construct(Parser* parser, const char* text, size_t text_length)
{
    *parser = (Parser) {
        .text = text,
        .text_length = text_length,
        .lexer = { 0 },
        .current = { 0 },
        .failed = false,
    };
    lexer_construct(&parser->lexer, text, text_length);
    parser->current = lexer_next(&parser->lexer);
}

bool parser_failed(const Parser* parser)
{
    return parser->failed;
}

/* Advances to the next token. */
void parser_step(Parser* parser)
{
    parser->current = lexer_next(&parser->lexer);
}

bool parser_done(const Parser* parser)
{
    return parser->current.token_type == TokenType_EOF;
}

ASTNode* parser_parse(Parser* parser)
{
    return parser_parse_expr(parser);
}

ASTNode* parser_parse_expr(Parser* parser)
{
    return parser_parse_operand(parser);
}

/* Parses a primary operand. Group and block operands now dispatch to their
 * parse functions (they previously fell through to the error case even
 * though both functions exist in this file). */
ASTNode* parser_parse_operand(Parser* parser)
{
    Pos pos = parser->current.pos;
    switch (parser->current.token_type) {
        case TokenType_Error:
            return ast_node_new(ASTNodeType_Error, pos, (ASTNode) { 0 });
        case TokenType_Id:
            return parser_parse_id(parser);
        case TokenType_Int:
            return parser_parse_int(parser);
        case TokenType_LParen:
            return parser_parse_group(parser);
        case TokenType_LBrace:
            return parser_parse_block(parser);
        case TokenType_If:   /* TODO: parser_parse_if not implemented yet */
        case TokenType_Loop: /* TODO: parser_parse_loop not implemented yet */
        default:
            parser->failed = true;
            print_error("expected operand", pos);
            return ast_node_new(ASTNodeType_Error, pos, (ASTNode) { 0 });
    }
}

/* Parses an identifier, copying its lexeme into an owned string. */
ASTNode* parser_parse_id(Parser* parser)
{
    Pos pos = parser->current.pos;
    size_t length = parser->current.length;
    char* value = malloc(length + 1);
    if (value == NULL) {
        /* The original wrote through an unchecked malloc result (UB on OOM). */
        parser->failed = true;
        print_error("parser: allocation failed", pos);
        return ast_node_new(ASTNodeType_Error, pos, (ASTNode) { 0 });
    }
    memcpy(value, &parser->text[parser->current.pos.index], length);
    value[length] = '\0';
    parser_step(parser);
    return ast_node_new(ASTNodeType_Id, pos, (ASTNode) { .id_value = value });
}

/* Parses an integer literal. */
ASTNode* parser_parse_int(Parser* parser)
{
    Pos pos = parser->current.pos;
    /* Index the source with the token's position, not its length (the
     * original read from text[current.length], i.e. a near-constant small
     * offset into the file). strtol stops at the first non-digit, so no
     * explicit length bound is needed. */
    int value = (int)strtol(
        &parser->text[parser->current.pos.index], NULL, 10);
    parser_step(parser);
    return ast_node_new(ASTNodeType_Int, pos, (ASTNode) { .int_value = value });
}

/* Parses "(" expr ")". */
ASTNode* parser_parse_group(Parser* parser)
{
    Pos pos = parser->current.pos;
    parser_step(parser);
    ASTNode* expr = parser_parse_expr(parser);
    if (parser->current.token_type != TokenType_RParen) {
        parser->failed = true;
        print_error("parser: expected ')'", pos);
        ast_node_free(expr); /* don't leak the inner expression on error */
        return ast_node_new(ASTNodeType_Error, pos, (ASTNode) { 0 });
    }
    parser_step(parser);
    return expr;
}

/* Parses "{" statement* "}". */
ASTNode* parser_parse_block(Parser* parser)
{
    Pos pos = parser->current.pos;
    parser_step(parser);
    ASTNodeVec statements;
    if (ast_node_vec_construct(&statements) != 0) {
        parser->failed = true;
        print_error("parser: allocation failed", pos);
        return ast_node_new(ASTNodeType_Error, pos, (ASTNode) { 0 });
    }
    while (!parser_done(parser)
        && parser->current.token_type != TokenType_RBrace) {
        ASTNode* statement = parser_parse_statement(parser);
        ast_node_vec_push(&statements, statement);
    }
    if (parser->current.token_type != TokenType_RBrace) {
        parser->failed = true;
        print_error("parser: expected '}'", pos);
        /* Release everything parsed so far; the original leaked the vector
         * and all parsed statements on this path. */
        for (size_t i = 0; i < statements.length; ++i) {
            ast_node_free(statements.data[i]);
        }
        ast_node_vec_destroy(&statements);
        return ast_node_new(ASTNodeType_Error, pos, (ASTNode) { 0 });
    }
    parser_step(parser);
    return ast_node_new(
        ASTNodeType_Block, pos, (ASTNode) { .statements = statements });
}

ASTNode* parser_parse_if(Parser* parser);
ASTNode* parser_parse_loop(Parser* parser);