something interesting, jk

SimonFJ20 2024-04-04 02:08:11 +02:00
parent 277471a743
commit 216dbf2fce
2 changed files with 358 additions and 23 deletions

parser.c
@@ -11,6 +11,7 @@ void lexer_construct(Lexer* lexer, const char* text, size_t length)
         .index = 0,
         .line = 1,
         .col = 1,
+        .failed = false,
     };
 }
@@ -25,6 +26,36 @@ static inline bool is_id_char(char c)
         || (c >= '0' && c <= '9') || c == '_';
 }
 
+static inline Token skip_comment(Lexer* lexer)
+{
+    Pos pos = lexer_pos(lexer);
+    lexer_step(lexer);
+    if (lexer_current(lexer) == '/') {
+        while (!lexer_done(lexer) && lexer_current(lexer) != '\n') {
+            lexer_step(lexer);
+        }
+        return lexer_next(lexer);
+    } else if (lexer_current(lexer) == '*') {
+        lexer_step(lexer);
+        char last = '\0';
+        while (!lexer_done(lexer)
+            && !(last == '*' && lexer_current(lexer) == '/')) {
+            last = lexer_current(lexer);
+            lexer_step(lexer);
+        }
+        if (lexer_done(lexer)) {
+            lexer->failed = true;
+            print_error("lexer: malformed multiline comment", pos);
+            return lexer_token(lexer, TokenType_Error, pos);
+        }
+        lexer_step(lexer); // consume the closing '/' of "*/"
+        return lexer_next(lexer);
+    } else {
+        lexer->failed = true;
+        print_error("lexer: malformed comment", pos);
+        return lexer_token(lexer, TokenType_Error, pos);
+    }
+}
+
 struct MatchIdToTokenTypeCase {
     const char* keyword;
     TokenType token_type;
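
A quick way to sanity-check the new comment handling is to run the lexer over input that mixes both comment styles. This is a minimal sketch, not part of the commit, assuming the header shown further down is named parser.h and is compiled alongside parser.c:

#include "parser.h" // assumed header name
#include <stdio.h>
#include <string.h>

int main(void)
{
    // Both comment forms are skipped; the first real token is "fn".
    const char* text = "// line comment\n/* block */ fn";
    Lexer lexer;
    lexer_construct(&lexer, text, strlen(text));
    Token token = lexer_next(&lexer);
    printf("first token type: %d\n", token.token_type);
    // A lone '/' (or an unterminated "/*") would instead produce
    // TokenType_Error and set the lexer's failed flag.
    return lexer_failed(&lexer) ? 1 : 0;
}
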
@@ -41,6 +72,49 @@ static inline TokenType match_id_to_token_type(
     return TokenType_Id;
 }
 
+static inline Token lex_id_or_keyword(Lexer* lexer)
+{
+    Pos pos = lexer_pos(lexer);
+    lexer_step(lexer);
+    while (!lexer_done(lexer) && is_id_char(lexer_current(lexer))) {
+        lexer_step(lexer);
+    }
+    size_t length = lexer->index - pos.index;
+    TokenType token_type
+        = match_id_to_token_type(&lexer->text[pos.index], length,
+            (struct MatchIdToTokenTypeCase[]) {
+                { "not", TokenType_Not },
+                { "and", TokenType_And },
+                { "or", TokenType_Or },
+                { "if", TokenType_If },
+                { "loop", TokenType_Loop },
+                { "fn", TokenType_Fn },
+                { "return", TokenType_Return },
+                { "break", TokenType_Break },
+                { NULL, TokenType_Id },
+            });
+    return lexer_token(lexer, token_type, pos);
+}
+
+Token lex_single_char(Lexer* lexer, TokenType token_type)
+{
+    Pos pos = lexer_pos(lexer);
+    lexer_step(lexer);
+    return lexer_token(lexer, token_type, pos);
+}
+
+Token lex_single_or_double_char(
+    Lexer* lexer, TokenType first, char c2, TokenType second)
+{
+    Pos pos = lexer_pos(lexer);
+    lexer_step(lexer);
+    if (lexer_done(lexer) || lexer_current(lexer) != c2) {
+        return lexer_token(lexer, first, pos);
+    }
+    lexer_step(lexer);
+    return lexer_token(lexer, second, pos);
+}
+
 Token lexer_next(Lexer* lexer)
 {
     Pos pos = lexer_pos(lexer);
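
The keyword table passed to match_id_to_token_type is terminated by the { NULL, TokenType_Id } sentinel rather than an explicit count. The body of match_id_to_token_type is outside this diff, so the following standalone sketch only illustrates the assumed pattern: walk entries until the NULL keyword, fall back to the identifier type.

#include <stddef.h>
#include <stdio.h>
#include <string.h>

// Hypothetical stand-ins for the real TokenType values.
enum { Tok_Id, Tok_Fn, Tok_Return };

struct Case { const char* keyword; int token_type; };

static int match(const char* id, size_t length, const struct Case* cases)
{
    for (size_t i = 0; cases[i].keyword != NULL; ++i) {
        if (strlen(cases[i].keyword) == length
            && strncmp(cases[i].keyword, id, length) == 0) {
            return cases[i].token_type;
        }
    }
    return Tok_Id; // not a keyword: plain identifier
}

int main(void)
{
    const struct Case cases[] = {
        { "fn", Tok_Fn },
        { "return", Tok_Return },
        { NULL, Tok_Id }, // sentinel terminates the scan
    };
    printf("%d %d\n", match("fn", 2, cases), match("foo", 3, cases));
}
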
@@ -52,36 +126,70 @@ Token lexer_next(Lexer* lexer)
         lexer_step(lexer);
         return lexer_next(lexer);
     }
+    if (c == '/') {
+        return skip_comment(lexer);
+    }
     if (is_id_start_char(c)) {
-        lexer_step(lexer);
-        while (is_id_char(c)) {
-            lexer_step(lexer);
-        }
-        size_t length = lexer->index - pos.index;
-        TokenType token_type
-            = match_id_to_token_type(&lexer->text[pos.index], length,
-                (struct MatchIdToTokenTypeCase[]) {
-                    { "not", TokenType_Not },
-                    { "and", TokenType_And },
-                    { "or", TokenType_Or },
-                    { "loop", TokenType_Loop },
-                    { "fn", TokenType_Fn },
-                    { "return", TokenType_Return },
-                    { "break", TokenType_Break },
-                    { NULL, TokenType_Id },
-                });
-        return lexer_token(lexer, token_type, pos);
+        return lex_id_or_keyword(lexer);
     }
     if (c >= '1' && c <= '9') {
         lexer_step(lexer);
-        while (c >= '1' && c <= '9') {
+        while (!lexer_done(lexer) && lexer_current(lexer) >= '0'
+            && lexer_current(lexer) <= '9') {
             lexer_step(lexer);
         }
         return lexer_token(lexer, TokenType_Int, pos);
     }
+    switch (c) {
+    case '0':
+        return lex_single_char(lexer, TokenType_Int);
+    case '(':
+        return lex_single_char(lexer, TokenType_LParen);
+    case ')':
+        return lex_single_char(lexer, TokenType_RParen);
+    case '{':
+        return lex_single_char(lexer, TokenType_LBrace);
+    case '}':
+        return lex_single_char(lexer, TokenType_RBrace);
+    case '[':
+        return lex_single_char(lexer, TokenType_LBracket);
+    case ']':
+        return lex_single_char(lexer, TokenType_RBracket);
+    case ',':
+        return lex_single_char(lexer, TokenType_Comma);
+    case ';':
+        return lex_single_char(lexer, TokenType_Semicolon);
+    case '+':
+        return lex_single_or_double_char(
+            lexer, TokenType_Plus, '=', TokenType_PlusEqual);
+    case '-':
+        return lex_single_or_double_char(
+            lexer, TokenType_Minus, '=', TokenType_MinusEqual);
+    case '*':
+        return lex_single_or_double_char(
+            lexer, TokenType_Asterisk, '=', TokenType_AsteriskEqual);
+    case '=':
+        return lex_single_or_double_char(
+            lexer, TokenType_Equal, '=', TokenType_EqualEqual);
+    case '!':
+        return lex_single_or_double_char(
+            lexer, TokenType_Exclamation, '=', TokenType_ExclamationEqual);
+    case '<':
+        return lex_single_or_double_char(
+            lexer, TokenType_LT, '=', TokenType_LTEqual);
+    case '>':
+        return lex_single_or_double_char(
+            lexer, TokenType_GT, '=', TokenType_GTEqual);
+    case '|':
+        return lex_single_or_double_char(
+            lexer, TokenType_Pipe, '>', TokenType_PipeGT);
+    }
+    lexer->failed = true;
+    print_error("lexer: unrecognized character", pos);
     return lexer_token(lexer, TokenType_Error, pos);
 }
 
+bool lexer_failed(const Lexer* lexer) { return lexer->failed; }
+
 Token lexer_token(Lexer* lexer, TokenType token_type, Pos pos)
 {
     return (Token) {
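
lex_single_or_double_char gives the lexer one character of lookahead, so two-character operators win over their one-character prefixes (maximal munch): "<=" becomes a single TokenType_LTEqual, while "<" followed by anything else stays TokenType_LT. A small sketch, again under the parser.h assumption:

#include "parser.h" // assumed header name
#include <stdio.h>

int main(void)
{
    Lexer lexer;
    lexer_construct(&lexer, "<= <", 4);
    Token first = lexer_next(&lexer);  // '<' then '=' -> TokenType_LTEqual
    Token second = lexer_next(&lexer); // '<' then ' ' -> TokenType_LT
    printf("%d %d\n", first.token_type == TokenType_LTEqual,
        second.token_type == TokenType_LT); // prints "1 1"
}
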
@@ -156,3 +264,200 @@ int ast_node_vec_push(ASTNodeVec* vec, ASTNode* item)
     vec->length += 1;
     return 0;
 }
+
+ASTNode* ast_node_new(ASTNodeType node_type, Pos pos, ASTNode spec_init)
+{
+    ASTNode* node = malloc(sizeof(ASTNode));
+    if (node == NULL) {
+        return NULL;
+    }
+    *node = spec_init;
+    node->node_type = node_type;
+    node->pos = pos;
+    return node;
+}
+
+void ast_node_free(ASTNode* node)
+{
+    if (node == NULL) {
+        return;
+    }
+    switch (node->node_type) {
+    case ASTNodeType_Error:
+        break;
+    case ASTNodeType_Id:
+        if (node->id_value != NULL) {
+            free(node->id_value);
+        }
+        break;
+    case ASTNodeType_Int:
+        break;
+    case ASTNodeType_Block:
+        for (size_t i = 0; i < node->statements.length; ++i) {
+            ast_node_free(node->statements.data[i]);
+        }
+        ast_node_vec_destroy(&node->statements);
+        break;
+    case ASTNodeType_If:
+        ast_node_free(node->if_node.condition);
+        ast_node_free(node->if_node.truthy);
+        ast_node_free(node->if_node.falsy);
+        break;
+    case ASTNodeType_Loop:
+        ast_node_free(node->loop_node.body);
+        break;
+    case ASTNodeType_Call:
+        ast_node_free(node->call_node.subject);
+        for (size_t i = 0; i < node->call_node.args.length; ++i) {
+            ast_node_free(node->call_node.args.data[i]);
+        }
+        ast_node_vec_destroy(&node->call_node.args);
+        break;
+    case ASTNodeType_Index:
+        ast_node_free(node->index_node.subject);
+        ast_node_free(node->index_node.value);
+        break;
+    case ASTNodeType_Unary:
+        ast_node_free(node->unary_node.subject);
+        break;
+    case ASTNodeType_Binary:
+        ast_node_free(node->binary_node.left);
+        ast_node_free(node->binary_node.right);
+        break;
+    case ASTNodeType_Assign:
+        ast_node_free(node->assign_node.subject);
+        ast_node_free(node->assign_node.value);
+        break;
+    case ASTNodeType_Let:
+        if (node->let_node.id != NULL) {
+            free(node->let_node.id);
+        }
+        ast_node_free(node->let_node.value);
+        break;
+    case ASTNodeType_Break:
+        break;
+    case ASTNodeType_Fn:
+        if (node->fn_node.id != NULL) {
+            free(node->fn_node.id);
+        }
+        for (size_t i = 0; i < node->fn_node.params.length; ++i) {
+            ast_node_free(node->fn_node.params.data[i]);
+        }
+        ast_node_vec_destroy(&node->fn_node.params);
+        ast_node_free(node->fn_node.body);
+        break;
+    }
+    free(node);
+}
+
+void parser_construct(Parser* parser, const char* text, size_t text_length)
+{
+    *parser = (Parser) {
+        .text = text,
+        .text_length = text_length,
+        .lexer = { 0 },
+        .current = { 0 },
+        .failed = false,
+    };
+    lexer_construct(&parser->lexer, text, text_length);
+    parser->current = lexer_next(&parser->lexer);
+}
+
+bool parser_failed(const Parser* parser) { return parser->failed; }
+
+void parser_step(Parser* parser)
+{
+    parser->current = lexer_next(&parser->lexer);
+}
+
+bool parser_done(const Parser* parser)
+{
+    return parser->current.token_type == TokenType_EOF;
+}
+
+ASTNode* parser_parse(Parser* parser) { return parser_parse_expr(parser); }
+
+ASTNode* parser_parse_expr(Parser* parser)
+{
+    return parser_parse_operand(parser);
+}
+
+ASTNode* parser_parse_operand(Parser* parser)
+{
+    Pos pos = parser->current.pos;
+    switch (parser->current.token_type) {
+    case TokenType_Error:
+        return ast_node_new(ASTNodeType_Error, pos, (ASTNode) { 0 });
+    case TokenType_Id:
+        return parser_parse_id(parser);
+    case TokenType_Int:
+        return parser_parse_int(parser);
+    case TokenType_LParen:
+    case TokenType_LBrace:
+    case TokenType_If:
+    case TokenType_Loop:
+    default:
+        parser->failed = true;
+        print_error("parser: expected operand", pos);
+        return ast_node_new(ASTNodeType_Error, pos, (ASTNode) { 0 });
+    }
+}
+
+ASTNode* parser_parse_id(Parser* parser)
+{
+    Pos pos = parser->current.pos;
+    char* value = malloc(parser->current.length + 1);
+    if (value == NULL) {
+        return NULL;
+    }
+    value[parser->current.length] = '\0';
+    strncpy(value, &parser->text[parser->current.pos.index],
+        parser->current.length);
+    parser_step(parser);
+    return ast_node_new(ASTNodeType_Id, pos, (ASTNode) { .id_value = value });
+}
+
+ASTNode* parser_parse_int(Parser* parser)
+{
+    Pos pos = parser->current.pos;
+    int value
+        = (int)strtol(&parser->text[parser->current.pos.index], NULL, 10);
+    parser_step(parser);
+    return ast_node_new(ASTNodeType_Int, pos, (ASTNode) { .int_value = value });
+}
+
+ASTNode* parser_parse_group(Parser* parser)
+{
+    Pos pos = parser->current.pos;
+    parser_step(parser);
+    ASTNode* expr = parser_parse_expr(parser);
+    if (parser->current.token_type != TokenType_RParen) {
+        parser->failed = true;
+        print_error("parser: expected ')'", pos);
+        ast_node_free(expr);
+        return ast_node_new(ASTNodeType_Error, pos, (ASTNode) { 0 });
+    }
+    parser_step(parser);
+    return expr;
+}
+
+ASTNode* parser_parse_block(Parser* parser)
+{
+    Pos pos = parser->current.pos;
+    parser_step(parser);
+    ASTNodeVec statements;
+    ast_node_vec_construct(&statements);
+    while (!parser_done(parser)
+        && parser->current.token_type != TokenType_RBrace) {
+        ASTNode* statement = parser_parse_statement(parser);
+        ast_node_vec_push(&statements, statement);
+    }
+    if (parser->current.token_type != TokenType_RBrace) {
+        parser->failed = true;
+        print_error("parser: expected '}'", pos);
+        for (size_t i = 0; i < statements.length; ++i) {
+            ast_node_free(statements.data[i]);
+        }
+        ast_node_vec_destroy(&statements);
+        return ast_node_new(ASTNodeType_Error, pos, (ASTNode) { 0 });
+    }
+    parser_step(parser);
+    return ast_node_new(
+        ASTNodeType_Block, pos, (ASTNode) { .statements = statements });
+}
+
+ASTNode* parser_parse_if(Parser* parser);
+
+ASTNode* parser_parse_loop(Parser* parser);
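
Putting the new Parser type together: a caller constructs it over a source buffer, asks for a parse (currently a single expression), checks the failed flag, and releases the tree. A minimal sketch under the same parser.h assumption; error reporting beyond the exit code is omitted.

#include "parser.h" // assumed header name
#include <string.h>

int main(void)
{
    const char* source = "some_id";
    Parser parser;
    parser_construct(&parser, source, strlen(source));
    ASTNode* ast = parser_parse(&parser); // yields an ASTNodeType_Id node
    int status = parser_failed(&parser) ? 1 : 0;
    ast_node_free(ast); // recursively releases children and owned strings
    return status;
}
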

parser.h

@@ -10,6 +10,8 @@ typedef struct {
     int col;
 } Pos;
 
+void print_error(const char* message, Pos pos);
+
 typedef enum {
     TokenType_Error,
     TokenType_EOF,
@@ -18,6 +20,7 @@ typedef enum {
     TokenType_Not,
     TokenType_And,
     TokenType_Or,
+    TokenType_If,
    TokenType_Loop,
     TokenType_Fn,
     TokenType_Return,
@@ -36,13 +39,14 @@ typedef enum {
     TokenType_MinusEqual,
     TokenType_Asterisk,
     TokenType_AsteriskEqual,
+    TokenType_Equal,
     TokenType_EqualEqual,
     TokenType_Exclamation,
     TokenType_ExclamationEqual,
     TokenType_LT,
-    TokenType_LTE,
+    TokenType_LTEqual,
     TokenType_GT,
-    TokenType_GTE,
+    TokenType_GTEqual,
     TokenType_Pipe,
     TokenType_PipeGT,
 } TokenType;
@@ -59,10 +63,12 @@ typedef struct {
     size_t index;
     int line;
     int col;
+    bool failed;
 } Lexer;
 
 void lexer_construct(Lexer* lexer, const char* text, size_t text_length);
 Token lexer_next(Lexer* lexer);
+bool lexer_failed(const Lexer* lexer);
 Token lexer_token(Lexer* lexer, TokenType token_type, Pos pos);
 void lexer_step(Lexer* lexer);
 bool lexer_done(const Lexer* lexer);
@@ -174,7 +180,7 @@ typedef struct {
 
 struct ASTNode {
     ASTNodeType node_type;
-    int line;
+    Pos pos;
     union {
         char* id_value;
         int int_value;
@@ -191,6 +197,30 @@ struct ASTNode {
     };
 };
 
-void ast_node_destroy(ASTNode* node);
+ASTNode* ast_node_new(ASTNodeType node_type, Pos pos, ASTNode spec_init);
+void ast_node_free(ASTNode* node);
+
+typedef struct {
+    const char* text;
+    size_t text_length;
+    Lexer lexer;
+    Token current;
+    bool failed;
+} Parser;
+
+void parser_construct(Parser* parser, const char* text, size_t text_length);
+bool parser_failed(const Parser* parser);
+void parser_step(Parser* parser);
+bool parser_done(const Parser* parser);
+
+ASTNode* parser_parse(Parser* parser);
+ASTNode* parser_parse_statement(Parser* parser);
+ASTNode* parser_parse_expr(Parser* parser);
+ASTNode* parser_parse_operand(Parser* parser);
+ASTNode* parser_parse_id(Parser* parser);
+ASTNode* parser_parse_int(Parser* parser);
+ASTNode* parser_parse_group(Parser* parser);
+ASTNode* parser_parse_block(Parser* parser);
+ASTNode* parser_parse_if(Parser* parser);
+ASTNode* parser_parse_loop(Parser* parser);
+
 #endif
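
The spec_init parameter of ast_node_new is what makes the constructor convenient: the call site passes a compound literal naming only the union member it cares about, and ast_node_new copies it before stamping node_type and pos on top. A sketch building the tree for "1 + 2" by hand; whether the binary node also carries an operator tag is not visible in this diff, so only left and right are set here.

#include "parser.h" // assumed header name

int main(void)
{
    // Pos field names follow their usage in parser.c.
    Pos pos = { .index = 0, .line = 1, .col = 1 };
    ASTNode* left
        = ast_node_new(ASTNodeType_Int, pos, (ASTNode) { .int_value = 1 });
    ASTNode* right
        = ast_node_new(ASTNodeType_Int, pos, (ASTNode) { .int_value = 2 });
    ASTNode* sum = ast_node_new(ASTNodeType_Binary, pos,
        (ASTNode) { .binary_node = { .left = left, .right = right } });
    ast_node_free(sum); // frees left and right too
    return 0;
}
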