#include "compiler.h" #include #include #include #include void lexer_construct(Lexer* lexer, const char* text, size_t length) { *lexer = (Lexer) { .text = text, .text_length = length, .index = 0, .line = 1, .col = 1, .failed = false, }; } static inline bool is_id_start_char(char c) { return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c == '_'; } static inline bool is_id_char(char c) { return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9') || c == '_'; } static inline Token skip_comment(Lexer* lexer) { Pos pos = lexer_pos(lexer); lexer_step(lexer); if (lexer_current(lexer) == '/') { while (!lexer_done(lexer) && lexer_current(lexer) != '\n') { lexer_step(lexer); } return lexer_next(lexer); } else if (lexer_current(lexer) == '*') { lexer_step(lexer); char last = '\0'; while (!lexer_done(lexer) && !(last == '*' && lexer_current(lexer) == '/')) { last = lexer_current(lexer); lexer_step(lexer); } if (lexer_done(lexer)) { lexer->failed = true; print_error("lexer: malformed multiline comment", pos); return lexer_token(lexer, TokenType_Error, pos); } return lexer_next(lexer); } else { lexer->failed = true; print_error("lexer: malformed comment", pos); return lexer_token(lexer, TokenType_Error, pos); } } struct MatchIdToTokenTypeCase { const char* keyword; TokenType token_type; }; static inline TokenType match_id_to_token_type( const char* source, size_t length, struct MatchIdToTokenTypeCase cases[]) { for (size_t i = 0; cases[i].keyword != NULL; ++i) { if (strlen(cases[i].keyword) <= length && strncmp(source, cases[i].keyword, length) == 0) { return cases[i].token_type; } } return TokenType_Id; } static inline Token lex_id_or_keyword(Lexer* lexer) { Pos pos = lexer_pos(lexer); lexer_step(lexer); while (!lexer_done(lexer) && is_id_char(lexer_current(lexer))) { lexer_step(lexer); } size_t length = lexer->index - pos.index; TokenType token_type = match_id_to_token_type(&lexer->text[pos.index], length, (struct MatchIdToTokenTypeCase[]) { { "not", TokenType_Not }, { "and", TokenType_And }, { "or", TokenType_Or }, { "if", TokenType_If }, { "else", TokenType_Else }, { "loop", TokenType_Loop }, { "let", TokenType_Let }, { "fn", TokenType_Fn }, { "return", TokenType_Return }, { "break", TokenType_Break }, { NULL, TokenType_Id }, }); return lexer_token(lexer, token_type, pos); } Token lex_single_char(Lexer* lexer, TokenType token_type) { Pos pos = lexer_pos(lexer); lexer_step(lexer); return lexer_token(lexer, token_type, pos); } Token lex_single_or_double_char( Lexer* lexer, TokenType first, char c2, TokenType second) { Pos pos = lexer_pos(lexer); lexer_step(lexer); if (lexer_done(lexer) || lexer_current(lexer) != c2) { return lexer_token(lexer, first, pos); } lexer_step(lexer); return lexer_token(lexer, second, pos); } Token lexer_next(Lexer* lexer) { Pos pos = lexer_pos(lexer); if (lexer_done(lexer)) { return lexer_token(lexer, TokenType_EOF, pos); } char c = lexer_current(lexer); if (c == ' ' || c == '\t' || c == '\n') { lexer_step(lexer); return lexer_next(lexer); } if (c == '/') { return skip_comment(lexer); } if (is_id_start_char(c)) { return lex_id_or_keyword(lexer); } if (c >= '1' && c <= '9') { lexer_step(lexer); while (!lexer_done(lexer) && lexer_current(lexer) >= '0' && lexer_current(lexer) <= '9') { lexer_step(lexer); } return lexer_token(lexer, TokenType_Int, pos); } if (c == '-') { lexer_step(lexer); if (lexer_current(lexer) == '=') { lexer_step(lexer); return lexer_token(lexer, TokenType_MinusEqual, pos); } else if (lexer_current(lexer) == '>') { lexer_step(lexer); return lexer_token(lexer, TokenType_MinusGT, pos); } return lexer_token(lexer, TokenType_Minus, pos); } switch (c) { case '0': return lex_single_char(lexer, TokenType_Int); case '(': return lex_single_char(lexer, TokenType_LParen); case ')': return lex_single_char(lexer, TokenType_RParen); case '{': return lex_single_char(lexer, TokenType_LBrace); case '}': return lex_single_char(lexer, TokenType_RBrace); case '[': return lex_single_char(lexer, TokenType_LBracket); case ']': return lex_single_char(lexer, TokenType_RBracket); case ',': return lex_single_char(lexer, TokenType_Comma); case ';': return lex_single_char(lexer, TokenType_Semicolon); case '+': return lex_single_or_double_char( lexer, TokenType_Plus, '=', TokenType_PlusEqual); case '*': return lex_single_or_double_char( lexer, TokenType_Asterisk, '=', TokenType_AsteriskEqual); case '=': return lex_single_or_double_char( lexer, TokenType_Equal, '=', TokenType_EqualEqual); case '!': return lex_single_or_double_char( lexer, TokenType_Exclamation, '=', TokenType_ExclamationEqual); case '<': return lex_single_or_double_char( lexer, TokenType_LT, '=', TokenType_LTEqual); case '>': return lex_single_or_double_char( lexer, TokenType_GT, '=', TokenType_GTEqual); case '|': return lex_single_or_double_char( lexer, TokenType_Pipe, '>', TokenType_PipeGT); } lexer->failed = true; print_error("lexer: unrecognized character", pos); return lexer_token(lexer, TokenType_Error, pos); } bool lexer_failed(const Lexer* lexer) { return lexer->failed; } Token lexer_token(Lexer* lexer, TokenType token_type, Pos pos) { return (Token) { .token_type = token_type, .pos = pos, .length = lexer->index - pos.index, }; } void lexer_step(Lexer* lexer) { if (lexer_done(lexer)) { return; } lexer->index += 1; if (lexer_current(lexer) == '\n') { lexer->line += 1; lexer->col = 1; } else { lexer->col += 1; } } bool lexer_done(const Lexer* lexer) { return lexer->index >= lexer->text_length; } char lexer_current(const Lexer* lexer) { return lexer->text[lexer->index]; } Pos lexer_pos(const Lexer* lexer) { return (Pos) { .index = lexer->index, .line = lexer->line, .col = lexer->col, }; } int ast_node_vec_construct(ASTNodeVec* vec) { const size_t capacity_start = 4; *vec = (ASTNodeVec) { .data = malloc(capacity_start), .length = 0, .capacity = capacity_start, }; if (vec->data == NULL) { return -1; } return 0; } void ast_node_vec_destroy(ASTNodeVec* vec) { if (vec->data != NULL) { free(vec->data); } } int ast_node_vec_push(ASTNodeVec* vec, ASTNode* item) { if (vec->length + 1 > vec->capacity) { vec->capacity *= 2; ASTNode** data = realloc(vec->data, vec->capacity); if (data == NULL) { return -1; } vec->data = data; } vec->data[vec->length] = item; vec->length += 1; return 0; } ASTNode* ast_node_new(ASTNodeType node_type, Pos pos, ASTNode spec_init) { ASTNode* node = malloc(sizeof(ASTNode)); if (node == NULL) { return NULL; } *node = spec_init; node->node_type = node_type; node->pos = pos; return node; } void ast_node_free(ASTNode* node) { if (node == NULL) { return; } switch (node->node_type) { case ASTNodeType_Error: break; case ASTNodeType_Id: if (node->id_value != NULL) { free(node->id_value); } break; case ASTNodeType_Int: break; case ASTNodeType_Group: ast_node_free(node->group_value); break; case ASTNodeType_Statements: case ASTNodeType_Block: for (size_t i = 0; i < node->statements.length; ++i) { ast_node_free(node->statements.data[i]); } ast_node_vec_destroy(&node->statements); break; case ASTNodeType_If: ast_node_free(node->if_node.condition); ast_node_free(node->if_node.truthy); ast_node_free(node->if_node.falsy); break; case ASTNodeType_Loop: ast_node_free(node->loop_node.body); break; case ASTNodeType_Call: ast_node_free(node->call_node.subject); for (size_t i = 0; i < node->call_node.args.length; ++i) { ast_node_free(node->call_node.args.data[i]); } ast_node_vec_destroy(&node->call_node.args); break; case ASTNodeType_Index: ast_node_free(node->index_node.subject); ast_node_free(node->index_node.value); break; case ASTNodeType_Unary: ast_node_free(node->unary_node.subject); break; case ASTNodeType_Binary: ast_node_free(node->binary_node.left); ast_node_free(node->binary_node.right); break; case ASTNodeType_Assign: ast_node_free(node->assign_node.subject); ast_node_free(node->assign_node.value); break; case ASTNodeType_Let: ast_node_free(node->let_node.id); ast_node_free(node->let_node.value); break; case ASTNodeType_Break: break; case ASTNodeType_Fn: ast_node_free(node->fn_node.id); for (size_t i = 0; i < node->fn_node.params.length; ++i) { ast_node_free(node->fn_node.params.data[i]); } ast_node_vec_destroy(&node->fn_node.params); ast_node_free(node->fn_node.body); break; case ASTNodeType_Return: if (node->return_node.value != NULL) { ast_node_free(node->return_node.value); } break; } free(node); } void parser_construct(Parser* parser, const char* text, size_t text_length) { *parser = (Parser) { .text = text, .text_length = text_length, .lexer = { 0 }, .current = { 0 }, .failed = false, }; lexer_construct(&parser->lexer, text, text_length); parser_step(parser); } bool parser_failed(const Parser* parser) { return parser->failed; } void parser_step(Parser* parser) { parser->current = lexer_next(&parser->lexer); } bool parser_done(const Parser* parser) { return parser->current.token_type == TokenType_EOF; } ASTNode* parser_parse_statements(Parser* parser) { Pos pos = parser->current.pos; ASTNodeVec statements; ast_node_vec_construct(&statements); while (!parser_done(parser)) { ASTNode* statement = parser_parse_statement(parser); ast_node_vec_push(&statements, statement); } return ast_node_new( ASTNodeType_Statements, pos, (ASTNode) { .statements = statements }); } ASTNode* parser_parse_statement(Parser* parser) { if (parser->current.token_type == TokenType_Fn) { return parser_parse_fn(parser); } else if (parser->current.token_type == TokenType_If) { return parser_parse_if(parser); } else if (parser->current.token_type == TokenType_Loop) { return parser_parse_loop(parser); } else { ASTNode* statement = parser_parse_single_line_statement(parser); if (parser->current.token_type != TokenType_Semicolon) { parser->failed = true; print_error("parser: expected ';'", parser->current.pos); } parser_step(parser); return statement; } } ASTNode* parser_parse_fn(Parser* parser) { Pos pos = parser->current.pos; parser_step(parser); if (parser->current.token_type != TokenType_Id) { parser->failed = true; print_error("parser: expected 'id'", parser->current.pos); return ast_node_new(ASTNodeType_Error, pos, (ASTNode) { 0 }); } ASTNode* id = parser_parse_id(parser); if (parser->current.token_type != TokenType_LParen) { parser->failed = true; print_error("parser: expected '('", parser->current.pos); return ast_node_new(ASTNodeType_Error, pos, (ASTNode) { 0 }); } parser_step(parser); ASTNodeVec params; ast_node_vec_construct(¶ms); if (!parser_done(parser) && parser->current.token_type != TokenType_RParen) { if (parser->current.token_type != TokenType_Id) { parser->failed = true; print_error("parser: expected 'id'", parser->current.pos); return ast_node_new(ASTNodeType_Error, pos, (ASTNode) { 0 }); } ast_node_vec_push(¶ms, parser_parse_id(parser)); while (parser->current.token_type == TokenType_Comma) { parser_step(parser); if (parser->current.token_type == TokenType_RParen) { break; } if (parser->current.token_type != TokenType_Id) { parser->failed = true; print_error("parser: expected 'id'", parser->current.pos); return ast_node_new(ASTNodeType_Error, pos, (ASTNode) { 0 }); } ast_node_vec_push(¶ms, parser_parse_id(parser)); } } if (parser->current.token_type != TokenType_RParen) { parser->failed = true; print_error("parser: expected ')'", parser->current.pos); return ast_node_new(ASTNodeType_Error, pos, (ASTNode) { 0 }); } parser_step(parser); if (parser->current.token_type != TokenType_LBrace) { parser->failed = true; print_error("parser: expected '{'", parser->current.pos); return ast_node_new(ASTNodeType_Error, pos, (ASTNode) { 0 }); } ASTNode* body = parser_parse_block(parser); return ast_node_new(ASTNodeType_Fn, pos, (ASTNode) { .fn_node = (ASTFnNode) { .id = id, .params = params, .body = body, }, }); } ASTNode* parser_parse_single_line_statement(Parser* parser) { if (parser->current.token_type == TokenType_Let) { return parser_parse_let(parser); } else if (parser->current.token_type == TokenType_Return) { return parser_parse_return(parser); } else if (parser->current.token_type == TokenType_Break) { return parser_parse_break(parser); } else { return parser_parse_assign(parser); } } ASTNode* parser_parse_let(Parser* parser) { Pos pos = parser->current.pos; parser_step(parser); if (parser->current.token_type != TokenType_Id) { parser->failed = true; print_error("parser: expected 'id'", parser->current.pos); return ast_node_new(ASTNodeType_Error, pos, (ASTNode) { 0 }); } ASTNode* id = parser_parse_id(parser); if (parser->current.token_type != TokenType_Equal) { parser->failed = true; print_error("parser: expected '='", parser->current.pos); return ast_node_new(ASTNodeType_Error, pos, (ASTNode) { 0 }); } parser_step(parser); ASTNode* value = parser_parse_expr(parser); return ast_node_new(ASTNodeType_Let, pos, (ASTNode) { .let_node = (ASTLetNode) { .id = id, .value = value, }, }); } ASTNode* parser_parse_return(Parser* parser) { Pos pos = parser->current.pos; parser_step(parser); ASTNode* value = NULL; if (parser->current.token_type != TokenType_Semicolon) { value = parser_parse_expr(parser); } return ast_node_new(ASTNodeType_Return, pos, (ASTNode) { .return_node = (ASTReturnNode) { .value = value, }, }); } ASTNode* parser_parse_break(Parser* parser) { Pos pos = parser->current.pos; parser_step(parser); return ast_node_new(ASTNodeType_Return, pos, (ASTNode) { 0 }); } ASTNode* parser_parse_assign(Parser* parser) { Pos pos = parser->current.pos; ASTNode* subject = parser_parse_expr(parser); int assign_type = -1; if (parser->current.token_type == TokenType_Equal) { assign_type = AssignType_Assign; } else if (parser->current.token_type == TokenType_PlusEqual) { assign_type = AssignType_Add; } else if (parser->current.token_type == TokenType_MinusEqual) { assign_type = AssignType_Subtract; } else if (parser->current.token_type == TokenType_AsteriskEqual) { assign_type = AssignType_Multiply; } if (assign_type == -1) { return subject; } parser_step(parser); return ast_node_new(ASTNodeType_Assign, pos, (ASTNode) { .assign_node = (ASTAssignNode) { .assign_type = (AssignType)assign_type, .subject = subject, .value = parser_parse_expr(parser), }, }); } ASTNode* parser_parse_expr(Parser* parser) { return parser_parse_or(parser); } static inline ASTNode* binary_node( Pos pos, BinaryType binary_type, ASTNode* left, ASTNode* right) { return ast_node_new(ASTNodeType_Binary, pos, (ASTNode) { .binary_node = (ASTBinaryNode) { .binary_type = binary_type, .left = left, .right = right, }, }); } ASTNode* parser_parse_or(Parser* parser) { Pos pos = parser->current.pos; ASTNode* left = parser_parse_and(parser); while (parser->current.token_type == TokenType_Or) { parser_step(parser); left = binary_node(pos, BinaryType_Or, left, parser_parse_and(parser)); } return left; } ASTNode* parser_parse_and(Parser* parser) { Pos pos = parser->current.pos; ASTNode* left = parser_parse_equality(parser); while (parser->current.token_type == TokenType_And) { parser_step(parser); left = binary_node( pos, BinaryType_And, left, parser_parse_equality(parser)); } return left; } ASTNode* parser_parse_equality(Parser* parser) { Pos pos = parser->current.pos; ASTNode* left = parser_parse_comparison(parser); if (parser->current.token_type == TokenType_EqualEqual) { parser_step(parser); return binary_node( pos, BinaryType_EE, left, parser_parse_comparison(parser)); } else if (parser->current.token_type == TokenType_ExclamationEqual) { parser_step(parser); return binary_node( pos, BinaryType_NE, left, parser_parse_comparison(parser)); } else { return left; } } ASTNode* parser_parse_comparison(Parser* parser) { Pos pos = parser->current.pos; ASTNode* left = parser_parse_term(parser); while (!parser_done(parser)) { if (parser->current.token_type == TokenType_LT) { parser_step(parser); return binary_node( pos, BinaryType_LT, left, parser_parse_term(parser)); } else if (parser->current.token_type == TokenType_LTEqual) { parser_step(parser); return binary_node( pos, BinaryType_LTE, left, parser_parse_term(parser)); } else if (parser->current.token_type == TokenType_GT) { parser_step(parser); return binary_node( pos, BinaryType_GT, left, parser_parse_term(parser)); } else if (parser->current.token_type == TokenType_GTEqual) { parser_step(parser); return binary_node( pos, BinaryType_GTE, left, parser_parse_term(parser)); } else { break; } } return left; } ASTNode* parser_parse_term(Parser* parser) { Pos pos = parser->current.pos; ASTNode* left = parser_parse_factor(parser); while (!parser_done(parser)) { if (parser->current.token_type == TokenType_Plus) { parser_step(parser); return binary_node( pos, BinaryType_Add, left, parser_parse_factor(parser)); } else if (parser->current.token_type == TokenType_Minus) { parser_step(parser); return binary_node( pos, BinaryType_Subtract, left, parser_parse_factor(parser)); } else { break; } } return left; } ASTNode* parser_parse_factor(Parser* parser) { Pos pos = parser->current.pos; ASTNode* left = parser_parse_unary(parser); while (!parser_done(parser)) { if (parser->current.token_type == TokenType_Asterisk) { parser_step(parser); return binary_node( pos, BinaryType_Multiply, left, parser_parse_unary(parser)); } else { break; } } return left; } ASTNode* parser_parse_unary(Parser* parser) { Pos pos = parser->current.pos; if (parser->current.token_type == TokenType_Not) { parser_step(parser); return ast_node_new(ASTNodeType_Unary, pos, (ASTNode) { .unary_node = (ASTUnaryNode) { .unary_type = UnaryType_Not, .subject = parser_parse_unary(parser), }, }); } else if (parser->current.token_type == TokenType_Minus) { parser_step(parser); return ast_node_new(ASTNodeType_Unary, pos, (ASTNode) { .unary_node = (ASTUnaryNode) { .unary_type = UnaryType_Negate, .subject = parser_parse_unary(parser), }, }); } else { return parser_parse_index_call(parser); } } ASTNode* parser_parse_index_call(Parser* parser) { Pos pos = parser->current.pos; ASTNode* subject = parser_parse_operand(parser); while (!parser_done(parser)) { if (parser->current.token_type == TokenType_LParen) { parser_step(parser); ASTNodeVec args; ast_node_vec_construct(&args); if (!parser_done(parser) && parser->current.token_type != TokenType_RParen) { ast_node_vec_push(&args, parser_parse_expr(parser)); while (parser->current.token_type == TokenType_Comma) { parser_step(parser); if (parser->current.token_type == TokenType_RParen) { break; } ast_node_vec_push(&args, parser_parse_expr(parser)); } } if (parser->current.token_type != TokenType_RParen) { parser->failed = true; print_error("parser: expected ')'", parser->current.pos); return ast_node_new(ASTNodeType_Error, pos, (ASTNode) { 0 }); } parser_step(parser); subject = ast_node_new(ASTNodeType_Call, pos, (ASTNode) { .call_node = (ASTCallNode) { .subject = subject, .args = args, }, }); } else if (parser->current.token_type == TokenType_LBracket) { parser_step(parser); ASTNode* value = parser_parse_expr(parser); if (parser->current.token_type != TokenType_RBracket) { parser->failed = true; print_error("parser: expected ']'", parser->current.pos); return ast_node_new(ASTNodeType_Error, pos, (ASTNode) { 0 }); } parser_step(parser); subject = ast_node_new(ASTNodeType_Index, pos, (ASTNode) { .index_node = (ASTIndexNode) { .subject = subject, .value = value, }, }); } else { break; } } return subject; } ASTNode* parser_parse_operand(Parser* parser) { Pos pos = parser->current.pos; switch (parser->current.token_type) { case TokenType_Error: return ast_node_new(ASTNodeType_Error, pos, (ASTNode) { 0 }); case TokenType_Id: return parser_parse_id(parser); case TokenType_Int: return parser_parse_int(parser); case TokenType_LParen: return parser_parse_group(parser); case TokenType_LBrace: return parser_parse_block(parser); case TokenType_If: return parser_parse_if(parser); case TokenType_Loop: return parser_parse_loop(parser); default: parser->failed = true; print_error("parser: expected operand", pos); return ast_node_new(ASTNodeType_Error, pos, (ASTNode) { 0 }); } } ASTNode* parser_parse_id(Parser* parser) { Pos pos = parser->current.pos; char* value = malloc(parser->current.length + 1); value[parser->current.length] = '\0'; strncpy(value, &parser->text[parser->current.pos.index], parser->current.length); parser_step(parser); return ast_node_new(ASTNodeType_Id, pos, (ASTNode) { .id_value = value, .id_symbol = NULL, }); } ASTNode* parser_parse_int(Parser* parser) { Pos pos = parser->current.pos; int value = (int)strtol(&parser->text[parser->current.length], NULL, 10); parser_step(parser); return ast_node_new(ASTNodeType_Int, pos, (ASTNode) { .int_value = value }); } ASTNode* parser_parse_group(Parser* parser) { Pos pos = parser->current.pos; parser_step(parser); ASTNode* expr = parser_parse_expr(parser); if (parser->current.token_type != TokenType_RParen) { parser->failed = true; print_error("parser: expected ')'", parser->current.pos); return ast_node_new(ASTNodeType_Error, pos, (ASTNode) { 0 }); } parser_step(parser); return ast_node_new(ASTNodeType_Group, pos, (ASTNode) { .group_value = expr, }); } ASTNode* parser_parse_block(Parser* parser) { Pos pos = parser->current.pos; parser_step(parser); ASTNodeVec statements; ast_node_vec_construct(&statements); while (!parser_done(parser) && parser->current.token_type != TokenType_RBrace) { ASTNode* statement = parser_parse_statement(parser); ast_node_vec_push(&statements, statement); } if (parser->current.token_type != TokenType_RBrace) { parser->failed = true; print_error("parser: expected '}'", parser->current.pos); return ast_node_new(ASTNodeType_Error, pos, (ASTNode) { 0 }); } parser_step(parser); return ast_node_new( ASTNodeType_Block, pos, (ASTNode) { .statements = statements }); } ASTNode* parser_parse_if(Parser* parser) { Pos pos = parser->current.pos; parser_step(parser); ASTNode* condition = parser_parse_expr(parser); if (parser->current.token_type != TokenType_LBrace) { parser->failed = true; print_error("parser: expected '{'", parser->current.pos); return ast_node_new(ASTNodeType_Error, pos, (ASTNode) { 0 }); } ASTNode* truthy = parser_parse_block(parser); ASTNode* falsy = NULL; if (parser->current.token_type == TokenType_Else) { parser_step(parser); if (parser->current.token_type != TokenType_LBrace) { parser->failed = true; print_error("parser: expected '{'", parser->current.pos); return ast_node_new(ASTNodeType_Error, pos, (ASTNode) { 0 }); } falsy = parser_parse_block(parser); } return ast_node_new(ASTNodeType_If, pos, (ASTNode) { .if_node = (ASTIfNode) { .condition = condition, .truthy = truthy, .falsy = falsy, }, }); } ASTNode* parser_parse_loop(Parser* parser) { Pos pos = parser->current.pos; parser_step(parser); if (parser->current.token_type != TokenType_RBrace) { parser->failed = true; print_error("parser: expected '}'", parser->current.pos); return ast_node_new(ASTNodeType_Error, pos, (ASTNode) { 0 }); } ASTNode* body = parser_parse_block(parser); return ast_node_new(ASTNodeType_Loop, pos, (ASTNode) { .loop_node = (ASTLoopNode) { .body = body, }, }); }