semos/parser.c

#include "parser.h"
#include <stddef.h>
#include <stdlib.h>
#include <string.h>
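// Initializes a lexer over `text` (not copied; it must outlive the lexer),
// starting at index 0, line 1, column 1.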
void lexer_construct(Lexer* lexer, const char* text, size_t length)
{
*lexer = (Lexer) {
.text = text,
.text_length = length,
.index = 0,
.line = 1,
.col = 1,
.failed = false,
};
}
static inline bool is_id_start_char(char c)
{
return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c == '_';
}
static inline bool is_id_char(char c)
{
return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z')
|| (c >= '0' && c <= '9') || c == '_';
}
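// Skips a `//` line comment or a `/* ... */` block comment and returns the
// token that follows it. Reports an error if the comment is malformed or the
// block comment is unterminated.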
static inline Token skip_comment(Lexer* lexer)
{
Pos pos = lexer_pos(lexer);
lexer_step(lexer);
if (lexer_current(lexer) == '/') {
while (!lexer_done(lexer) && lexer_current(lexer) != '\n') {
lexer_step(lexer);
}
return lexer_next(lexer);
} else if (lexer_current(lexer) == '*') {
lexer_step(lexer);
char last = '\0';
while (!lexer_done(lexer)
&& !(last == '*' && lexer_current(lexer) == '/')) {
last = lexer_current(lexer);
lexer_step(lexer);
}
if (lexer_done(lexer)) {
lexer->failed = true;
print_error("lexer: malformed multiline comment", pos);
return lexer_token(lexer, TokenType_Error, pos);
}
lexer_step(lexer); // consume the '/' that closes the block comment
return lexer_next(lexer);
} else {
lexer->failed = true;
print_error("lexer: malformed comment", pos);
return lexer_token(lexer, TokenType_Error, pos);
}
}
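// Maps an identifier spelling to a keyword token type via a NULL-terminated
// table of cases; falls back to TokenType_Id if no keyword matches.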
struct MatchIdToTokenTypeCase {
const char* keyword;
TokenType token_type;
};
static inline TokenType match_id_to_token_type(
const char* source, size_t length, struct MatchIdToTokenTypeCase cases[])
{
for (size_t i = 0; cases[i].keyword != NULL; ++i) {
// require an exact length match so identifiers that are a prefix of a
// keyword (e.g. "an") are not misclassified
if (strlen(cases[i].keyword) == length
&& strncmp(source, cases[i].keyword, length) == 0) {
return cases[i].token_type;
}
}
return TokenType_Id;
}
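// Lexes an identifier and classifies it as a keyword where applicable.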
static inline Token lex_id_or_keyword(Lexer* lexer)
{
Pos pos = lexer_pos(lexer);
lexer_step(lexer);
while (!lexer_done(lexer) && is_id_char(lexer_current(lexer))) {
lexer_step(lexer);
}
size_t length = lexer->index - pos.index;
TokenType token_type
= match_id_to_token_type(&lexer->text[pos.index], length,
(struct MatchIdToTokenTypeCase[]) {
{ "not", TokenType_Not },
{ "and", TokenType_And },
{ "or", TokenType_Or },
{ "if", TokenType_If },
{ "loop", TokenType_Loop },
{ "fn", TokenType_Fn },
{ "return", TokenType_Return },
{ "break", TokenType_Break },
{ NULL, TokenType_Id },
});
return lexer_token(lexer, token_type, pos);
}
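// Helpers for one-character tokens and for tokens that may be extended by a
// specific second character (e.g. `=` vs `==`).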
Token lex_single_char(Lexer* lexer, TokenType token_type)
{
Pos pos = lexer_pos(lexer);
lexer_step(lexer);
return lexer_token(lexer, token_type, pos);
}
Token lex_single_or_double_char(
Lexer* lexer, TokenType first, char c2, TokenType second)
{
Pos pos = lexer_pos(lexer);
lexer_step(lexer);
if (lexer_done(lexer) || lexer_current(lexer) != c2) {
return lexer_token(lexer, first, pos);
}
lexer_step(lexer);
return lexer_token(lexer, second, pos);
}
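// Produces the next token, skipping whitespace and comments. Returns a
// TokenType_EOF token at end of input and a TokenType_Error token (with
// `failed` set) on invalid input.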
Token lexer_next(Lexer* lexer)
{
Pos pos = lexer_pos(lexer);
if (lexer_done(lexer)) {
return lexer_token(lexer, TokenType_EOF, pos);
}
char c = lexer_current(lexer);
if (c == ' ' || c == '\t' || c == '\n') {
lexer_step(lexer);
return lexer_next(lexer);
}
if (c == '/') {
return skip_comment(lexer);
}
if (is_id_start_char(c)) {
return lex_id_or_keyword(lexer);
}
if (c >= '1' && c <= '9') {
lexer_step(lexer);
// consume the remaining digits of the literal (any of 0-9)
while (!lexer_done(lexer) && lexer_current(lexer) >= '0'
&& lexer_current(lexer) <= '9') {
lexer_step(lexer);
}
return lexer_token(lexer, TokenType_Int, pos);
}
switch (c) {
case '0':
return lex_single_char(lexer, TokenType_Int);
case '(':
return lex_single_char(lexer, TokenType_LParen);
case ')':
return lex_single_char(lexer, TokenType_RParen);
case '{':
return lex_single_char(lexer, TokenType_LBrace);
case '}':
return lex_single_char(lexer, TokenType_RBrace);
case '[':
return lex_single_char(lexer, TokenType_LBracket);
case ']':
return lex_single_char(lexer, TokenType_RBracket);
case ',':
return lex_single_char(lexer, TokenType_Comma);
case ';':
return lex_single_char(lexer, TokenType_Semicolon);
case '+':
return lex_single_or_double_char(
lexer, TokenType_Plus, '=', TokenType_PlusEqual);
case '-':
return lex_single_or_double_char(
lexer, TokenType_Minus, '=', TokenType_MinusEqual);
case '*':
return lex_single_or_double_char(
lexer, TokenType_Asterisk, '=', TokenType_AsteriskEqual);
case '=':
return lex_single_or_double_char(
lexer, TokenType_Equal, '=', TokenType_EqualEqual);
case '!':
return lex_single_or_double_char(
lexer, TokenType_Exclamation, '=', TokenType_ExclamationEqual);
case '<':
return lex_single_or_double_char(
lexer, TokenType_LT, '=', TokenType_LTEqual);
case '>':
return lex_single_or_double_char(
lexer, TokenType_GT, '=', TokenType_GTEqual);
case '|':
return lex_single_or_double_char(
lexer, TokenType_Pipe, '>', TokenType_PipeGT);
}
lexer->failed = true;
print_error("lexer: unrecognized character", pos);
return lexer_token(lexer, TokenType_Error, pos);
}
bool lexer_failed(const Lexer* lexer) { return lexer->failed; }
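// Builds a token of the given type spanning from `pos` to the lexer's
// current index.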
Token lexer_token(Lexer* lexer, TokenType token_type, Pos pos)
{
return (Token) {
.token_type = token_type,
.pos = pos,
.length = lexer->index - pos.index,
};
}
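// Advances one character while tracking line and column numbers.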
void lexer_step(Lexer* lexer)
{
if (lexer_done(lexer)) {
return;
}
// update line/col for the character being consumed, then advance, so the
// index never moves past text_length before lexer_current is read
if (lexer_current(lexer) == '\n') {
lexer->line += 1;
lexer->col = 1;
} else {
lexer->col += 1;
}
lexer->index += 1;
}
bool lexer_done(const Lexer* lexer)
{
return lexer->index >= lexer->text_length;
}
char lexer_current(const Lexer* lexer) { return lexer->text[lexer->index]; }
Pos lexer_pos(const Lexer* lexer)
{
return (Pos) {
.index = lexer->index,
.line = lexer->line,
.col = lexer->col,
};
}
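// Growable vector of ASTNode pointers; starts with room for 4 elements and
// doubles its capacity when full.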
int ast_node_vec_construct(ASTNodeVec* vec)
{
const size_t capacity_start = 4;
*vec = (ASTNodeVec) {
.data = malloc(capacity_start * sizeof(ASTNode*)),
.length = 0,
.capacity = capacity_start,
};
if (vec->data == NULL) {
return -1;
}
return 0;
}
void ast_node_vec_destroy(ASTNodeVec* vec)
{
if (vec->data != NULL) {
free(vec->data);
}
}
int ast_node_vec_push(ASTNodeVec* vec, ASTNode* item)
{
if (vec->length + 1 > vec->capacity) {
size_t new_capacity = vec->capacity * 2;
ASTNode** data = realloc(vec->data, new_capacity * sizeof(ASTNode*));
if (data == NULL) {
return -1;
}
vec->data = data;
vec->capacity = new_capacity;
}
vec->data[vec->length] = item;
vec->length += 1;
return 0;
}
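// Allocates a node, copies the caller-provided designated initializer into
// it, and stamps the node type and position.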
ASTNode* ast_node_new(ASTNodeType node_type, Pos pos, ASTNode spec_init)
{
ASTNode* node = malloc(sizeof(ASTNode));
if (node == NULL) {
return NULL;
}
*node = spec_init;
node->node_type = node_type;
node->pos = pos;
return node;
}
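// Recursively frees a node, its owned strings, and all child nodes.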
void ast_node_free(ASTNode* node)
{
if (node == NULL) {
return;
}
switch (node->node_type) {
case ASTNodeType_Error:
break;
case ASTNodeType_Id:
if (node->id_value != NULL) {
free(node->id_value);
}
break;
case ASTNodeType_Int:
break;
case ASTNodeType_Block:
for (size_t i = 0; i < node->statements.length; ++i) {
ast_node_free(node->statements.data[i]);
}
ast_node_vec_destroy(&node->statements);
break;
case ASTNodeType_If:
ast_node_free(node->if_node.condition);
ast_node_free(node->if_node.truthy);
ast_node_free(node->if_node.falsy);
break;
case ASTNodeType_Loop:
ast_node_free(node->loop_node.body);
break;
case ASTNodeType_Call:
ast_node_free(node->call_node.subject);
for (size_t i = 0; i < node->call_node.args.length; ++i) {
ast_node_free(node->call_node.args.data[i]);
}
ast_node_vec_destroy(&node->call_node.args);
break;
case ASTNodeType_Index:
ast_node_free(node->index_node.subject);
ast_node_free(node->index_node.value);
break;
case ASTNodeType_Unary:
ast_node_free(node->unary_node.subject);
break;
case ASTNodeType_Binary:
ast_node_free(node->binary_node.left);
ast_node_free(node->binary_node.right);
break;
case ASTNodeType_Assign:
ast_node_free(node->assign_node.subject);
ast_node_free(node->assign_node.value);
break;
case ASTNodeType_Let:
if (node->let_node.id != NULL) {
free(node->let_node.id);
}
ast_node_free(node->let_node.value);
break;
case ASTNodeType_Break:
break;
case ASTNodeType_Fn:
if (node->fn_node.id != NULL) {
free(node->fn_node.id);
}
for (size_t i = 0; i < node->fn_node.params.length; ++i) {
ast_node_free(node->fn_node.params.data[i]);
}
ast_node_vec_destroy(&node->fn_node.params);
ast_node_free(node->fn_node.body);
break;
}
free(node);
}
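// Sets up a parser over `text` and primes it with the first token.
//
// Typical usage might look like the sketch below (a minimal example; the
// exact error-handling flow depends on the rest of the program):
//
//     Parser parser;
//     parser_construct(&parser, source, strlen(source));
//     ASTNode* ast = parser_parse(&parser);
//     if (parser_failed(&parser) || lexer_failed(&parser.lexer)) {
//         /* handle syntax errors */
//     }
//     ast_node_free(ast);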
void parser_construct(Parser* parser, const char* text, size_t text_length)
{
*parser = (Parser) {
.text = text,
.text_length = text_length,
.lexer = { 0 },
.current = { 0 },
.failed = false,
};
lexer_construct(&parser->lexer, text, text_length);
parser->current = lexer_next(&parser->lexer);
}
bool parser_failed(const Parser* parser) { return parser->failed; }
void parser_step(Parser* parser)
{
parser->current = lexer_next(&parser->lexer);
}
bool parser_done(const Parser* parser)
{
return parser->current.token_type == TokenType_EOF;
}
ASTNode* parser_parse(Parser* parser) { return parser_parse_expr(parser); }
ASTNode* parser_parse_expr(Parser* parser)
{
return parser_parse_operand(parser);
}
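// Parses a primary operand: an identifier, an integer literal, a
// parenthesized group, or a block; anything else is reported as an error.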
ASTNode* parser_parse_operand(Parser* parser)
{
Pos pos = parser->current.pos;
switch (parser->current.token_type) {
case TokenType_Error:
return ast_node_new(ASTNodeType_Error, pos, (ASTNode) { 0 });
case TokenType_Id:
return parser_parse_id(parser);
case TokenType_Int:
return parser_parse_int(parser);
case TokenType_LParen:
return parser_parse_group(parser);
case TokenType_LBrace:
return parser_parse_block(parser);
case TokenType_If:
case TokenType_Loop:
default:
parser->failed = true;
print_error("parser: expected operand", pos);
return ast_node_new(ASTNodeType_Error, pos, (ASTNode) { 0 });
}
}
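// Copies the identifier's spelling out of the source text into a
// heap-allocated, NUL-terminated string owned by the node.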
ASTNode* parser_parse_id(Parser* parser)
{
Pos pos = parser->current.pos;
char* value = malloc(parser->current.length + 1);
if (value == NULL) {
return NULL;
}
strncpy(value, &parser->text[parser->current.pos.index],
parser->current.length);
value[parser->current.length] = '\0';
parser_step(parser);
return ast_node_new(ASTNodeType_Id, pos, (ASTNode) { .id_value = value });
}
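// Converts the integer literal at the current token into an int value.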
ASTNode* parser_parse_int(Parser* parser)
{
Pos pos = parser->current.pos;
int value
= (int)strtol(&parser->text[parser->current.pos.index], NULL, 10);
parser_step(parser);
return ast_node_new(ASTNodeType_Int, pos, (ASTNode) { .int_value = value });
}
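// Parses a parenthesized expression and returns the inner expression node.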
ASTNode* parser_parse_group(Parser* parser)
{
Pos pos = parser->current.pos;
parser_step(parser);
ASTNode* expr = parser_parse_expr(parser);
if (parser->current.token_type != TokenType_RParen) {
parser->failed = true;
print_error("parser: expected ')'", pos);
ast_node_free(expr);
return ast_node_new(ASTNodeType_Error, pos, (ASTNode) { 0 });
}
parser_step(parser);
return expr;
}
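// Parses `{ statement* }` into a block node that owns its statement list.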
ASTNode* parser_parse_block(Parser* parser)
{
Pos pos = parser->current.pos;
parser_step(parser);
ASTNodeVec statements;
if (ast_node_vec_construct(&statements) != 0) {
return NULL;
}
while (!parser_done(parser)
&& parser->current.token_type != TokenType_RBrace) {
ASTNode* statement = parser_parse_statement(parser);
ast_node_vec_push(&statements, statement);
}
if (parser->current.token_type != TokenType_RBrace) {
parser->failed = true;
print_error("parser: expected '}'", pos);
// release any statements collected so far before bailing out
for (size_t i = 0; i < statements.length; ++i) {
ast_node_free(statements.data[i]);
}
ast_node_vec_destroy(&statements);
return ast_node_new(ASTNodeType_Error, pos, (ASTNode) { 0 });
}
parser_step(parser);
return ast_node_new(
ASTNodeType_Block, pos, (ASTNode) { .statements = statements });
}
ASTNode* parser_parse_if(Parser* parser);
ASTNode* parser_parse_loop(Parser* parser);