diff --git a/parser.c b/parser.c
new file mode 100644
index 0000000..9b7d8bc
--- /dev/null
+++ b/parser.c
@@ -0,0 +1,162 @@
+#include "parser.h"
+#include <stdbool.h>
+#include <stdlib.h>
+#include <string.h>
+
+void lexer_construct(Lexer* lexer, const char* text, size_t length)
+{
+    *lexer = (Lexer) {
+        .text = text,
+        .text_length = length,
+        .index = 0,
+        .line = 1,
+        .col = 1,
+    };
+}
+
+static inline bool is_id_start_char(char c)
+{
+    return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c == '_';
+}
+
+static inline bool is_id_char(char c)
+{
+    return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z')
+        || (c >= '0' && c <= '9') || c == '_';
+}
+
+struct MatchIdToTokenTypeCase {
+    const char* keyword;
+    TokenType token_type;
+};
+
+static inline TokenType match_id_to_token_type(
+    const char* source, size_t length, struct MatchIdToTokenTypeCase cases[])
+{
+    for (size_t i = 0; cases[i].keyword != NULL; ++i) {
+        if (strlen(cases[i].keyword) == length
+            && strncmp(source, cases[i].keyword, length) == 0) {
+            return cases[i].token_type;
+        }
+    }
+    return TokenType_Id;
+}
+
+Token lexer_next(Lexer* lexer)
+{
+    Pos pos = lexer_pos(lexer);
+    if (lexer_done(lexer)) {
+        return lexer_token(lexer, TokenType_EOF, pos);
+    }
+    char c = lexer_current(lexer);
+    if (c == ' ' || c == '\t' || c == '\n') {
+        lexer_step(lexer);
+        return lexer_next(lexer);
+    }
+    if (is_id_start_char(c)) {
+        lexer_step(lexer);
+        while (!lexer_done(lexer) && is_id_char(lexer_current(lexer))) {
+            lexer_step(lexer);
+        }
+        size_t length = lexer->index - pos.index;
+        TokenType token_type
+            = match_id_to_token_type(&lexer->text[pos.index], length,
+                (struct MatchIdToTokenTypeCase[]) {
+                    { "not", TokenType_Not },
+                    { "and", TokenType_And },
+                    { "or", TokenType_Or },
+                    { "loop", TokenType_Loop },
+                    { "fn", TokenType_Fn },
+                    { "return", TokenType_Return },
+                    { "break", TokenType_Break },
+                    { NULL, TokenType_Id },
+                });
+        return lexer_token(lexer, token_type, pos);
+    }
+    if (c >= '0' && c <= '9') {
+        lexer_step(lexer);
+        while (!lexer_done(lexer) && lexer_current(lexer) >= '0'
+            && lexer_current(lexer) <= '9') {
+            lexer_step(lexer);
+        }
+        return lexer_token(lexer, TokenType_Int, pos);
+    }
+    lexer_step(lexer);
+    return lexer_token(lexer, TokenType_Error, pos);
+}
+
+Token lexer_token(Lexer* lexer, TokenType token_type, Pos pos)
+{
+    return (Token) {
+        .token_type = token_type,
+        .pos = pos,
+        .length = lexer->index - pos.index,
+    };
+}
+
+void lexer_step(Lexer* lexer)
+{
+    if (lexer_done(lexer)) {
+        return;
+    }
+    if (lexer_current(lexer) == '\n') {
+        lexer->line += 1;
+        lexer->col = 1;
+    } else {
+        lexer->col += 1;
+    }
+    lexer->index += 1;
+}
+
+bool lexer_done(const Lexer* lexer)
+{
+    return lexer->index >= lexer->text_length;
+}
+
+char lexer_current(const Lexer* lexer) { return lexer->text[lexer->index]; }
+
+Pos lexer_pos(const Lexer* lexer)
+{
+    return (Pos) {
+        .index = lexer->index,
+        .line = lexer->line,
+        .col = lexer->col,
+    };
+}
+
+int ast_node_vec_construct(ASTNodeVec* vec)
+{
+    const size_t capacity_start = 4;
+    *vec = (ASTNodeVec) {
+        .data = malloc(capacity_start * sizeof(ASTNode*)),
+        .length = 0,
+        .capacity = capacity_start,
+    };
+    if (vec->data == NULL) {
+        return -1;
+    }
+    return 0;
+}
+
+void ast_node_vec_destroy(ASTNodeVec* vec)
+{
+    if (vec->data != NULL) {
+        free(vec->data);
+    }
+}
+
+int ast_node_vec_push(ASTNodeVec* vec, ASTNode* item)
+{
+    if (vec->length + 1 > vec->capacity) {
+        size_t new_capacity = vec->capacity * 2;
+        ASTNode** data = realloc(vec->data, new_capacity * sizeof(ASTNode*));
+        if (data == NULL) {
+            return -1;
+        }
+        vec->data = data;
+        vec->capacity = new_capacity;
+    }
+    vec->data[vec->length] = item;
+    vec->length += 1;
+    return 0;
+}
diff --git a/parser.h b/parser.h
index 417c1b5..8d5594b 100644
--- a/parser.h
+++ b/parser.h
@@ -1,10 +1,18 @@
 #ifndef PARSER_H
 #define PARSER_H
 
+#include <stdbool.h>
 #include <stddef.h>
 
+typedef struct {
+    size_t index;
+    int line;
+    int col;
+} Pos;
+
 typedef enum {
     TokenType_Error,
+    TokenType_EOF,
     TokenType_Id,
     TokenType_Int,
     TokenType_Not,
@@ -12,6 +20,7 @@ typedef enum {
     TokenType_Or,
     TokenType_Loop,
     TokenType_Fn,
+    TokenType_Return,
     TokenType_Break,
     TokenType_LParen,
     TokenType_RParen,
@@ -21,25 +30,45 @@ typedef enum {
     TokenType_RBracket,
     TokenType_Comma,
     TokenType_Semicolon,
-    TokenType_Exclamation,
     TokenType_Plus,
+    TokenType_PlusEqual,
     TokenType_Minus,
+    TokenType_MinusEqual,
     TokenType_Asterisk,
-    TokenType_EE,
-    TokenType_NE,
+    TokenType_AsteriskEqual,
+    TokenType_EqualEqual,
+    TokenType_Exclamation,
+    TokenType_ExclamationEqual,
     TokenType_LT,
-    TokenType_GT,
     TokenType_LTE,
+    TokenType_GT,
     TokenType_GTE,
+    TokenType_Pipe,
+    TokenType_PipeGT,
 } TokenType;
 
 typedef struct {
-    TokenType token;
-    size_t index;
+    TokenType token_type;
+    Pos pos;
     size_t length;
-    int line;
 } Token;
 
+typedef struct {
+    const char* text;
+    size_t text_length;
+    size_t index;
+    int line;
+    int col;
+} Lexer;
+
+void lexer_construct(Lexer* lexer, const char* text, size_t text_length);
+Token lexer_next(Lexer* lexer);
+Token lexer_token(Lexer* lexer, TokenType token_type, Pos pos);
+void lexer_step(Lexer* lexer);
+bool lexer_done(const Lexer* lexer);
+char lexer_current(const Lexer* lexer);
+Pos lexer_pos(const Lexer* lexer);
+
 typedef enum {
     ASTNodeType_Error,
     ASTNodeType_Id,
@@ -65,9 +94,9 @@ typedef struct {
     size_t capacity;
 } ASTNodeVec;
 
-void ast_node_vec_construct(ASTNodeVec* vec);
+int ast_node_vec_construct(ASTNodeVec* vec);
 void ast_node_vec_destroy(ASTNodeVec* vec);
-void ast_node_vec_push(ASTNodeVec* vec, ASTNode* item);
+int ast_node_vec_push(ASTNodeVec* vec, ASTNode* item);
 
 typedef struct {
     ASTNode* condition;
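
/* Usage sketch, not part of the patch: a minimal driver for the Lexer API
 * declared above in parser.h, assuming it is linked against parser.c. The
 * source string and output format are illustrative only. */
#include "parser.h"
#include <stdio.h>
#include <string.h>

int main(void)
{
    const char* source = "fn loop counter 42 break";
    Lexer lexer;
    lexer_construct(&lexer, source, strlen(source));
    for (;;) {
        Token token = lexer_next(&lexer);
        if (token.token_type == TokenType_EOF) {
            break;
        }
        /* Each Token carries a start Pos and a byte length into the original
         * text, so the lexeme can be printed without copying it out. */
        printf("%d:%d\t%.*s\n", token.pos.line, token.pos.col,
            (int)token.length, &source[token.pos.index]);
    }
    return 0;
}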