codebased/scirpt/lexer.c
2023-04-16 04:45:15 +02:00

360 lines
8.5 KiB
C

#include "lexer.h"
#include "common/stringmap.h"
#include "scirpt/lexer.h"
#include "scirpt/position.h"
#include "scirpt/token.h"
#include <stdlib.h>
#include <string.h>
#define TT(type) ScirptTokenType##type
static inline void step(ScirptLexer* lexer) { scirpt_lexer_step(lexer); }
static inline ScirptToken
token(const ScirptLexer* lexer, ScirptTokenType type, ScirptPosition start)
{
return scirpt_lexer_token(lexer, type, start);
}
static inline ScirptPosition pos(const ScirptLexer* lexer)
{
return scirpt_lexer_pos(lexer);
}
static inline bool current_is(const ScirptLexer* lexer, char value)
{
return scirpt_lexer_current_is(lexer, value);
}
static inline bool done(const ScirptLexer* lexer)
{
return scirpt_lexer_done(lexer);
}
static inline char current(const ScirptLexer* lexer)
{
return scirpt_lexer_current(lexer);
}
ScirptLexer* scirpt_lexer_new(const char* text, size_t text_length)
{
ScirptLexer* lexer = malloc(sizeof(ScirptLexer));
scirpt_lexer_create(lexer, text, text_length);
return lexer;
}
void scirpt_lexer_delete(ScirptLexer* lexer) { free(lexer); }
static inline void
add_keyword(StringMap* keywords, const char* key, ScirptTokenType value)
{
stringmap_set(keywords, key, strlen(key), value);
}
void scirpt_lexer_create(
ScirptLexer* lexer, const char* text, size_t text_length
)
{
StringMap* keywords = stringmap_new();
add_keyword(keywords, "null", TT(Null));
add_keyword(keywords, "false", TT(False));
add_keyword(keywords, "true", TT(True));
add_keyword(keywords, "not", TT(Not));
add_keyword(keywords, "and", TT(And));
add_keyword(keywords, "or", TT(Or));
add_keyword(keywords, "let", TT(Let));
add_keyword(keywords, "if", TT(If));
add_keyword(keywords, "else", TT(Else));
add_keyword(keywords, "while", TT(While));
add_keyword(keywords, "for", TT(For));
add_keyword(keywords, "in", TT(In));
add_keyword(keywords, "break", TT(Break));
add_keyword(keywords, "fn", TT(Fn));
add_keyword(keywords, "return", TT(Return));
*lexer = (ScirptLexer) {
.text = text,
.text_length = text_length,
.index = 0,
.line = 1,
.col = 1,
.keywords = keywords,
};
}
void scirpt_lexer_destroy(ScirptLexer* lexer)
{
stringmap_delete(lexer->keywords);
}
static inline bool is_whitespace(char value)
{
return value == ' ' || value == '\t' || value == '\r' || value == '\n';
}
static inline bool is_id_char_excluding_numbers(char value)
{
return (value >= 'a' && value <= 'z') || (value >= 'A' && value <= 'Z')
|| value == '_';
}
static inline bool is_int_char(char value)
{
return value >= '0' && value <= '9';
}
static inline bool is_id_char(char value)
{
return is_id_char_excluding_numbers(value) || is_int_char(value);
}
ScirptToken scirpt_lexer_next(ScirptLexer* lexer)
{
return scirpt_lexer_level_1(lexer);
}
ScirptToken scirpt_lexer_level_1(ScirptLexer* lexer)
{
if (done(lexer))
return token(lexer, TT(Eof), pos(lexer));
else if (is_whitespace(current(lexer)))
return scirpt_lexer_skip_whitespace(lexer);
else if (is_id_char_excluding_numbers(current(lexer)))
return scirpt_lexer_id_token(lexer);
else
return scirpt_lexer_level_2(lexer);
}
static inline ScirptToken single_token(ScirptLexer* lexer, ScirptTokenType type)
{
ScirptPosition start = pos(lexer);
step(lexer);
return token(lexer, type, start);
}
static inline ScirptToken single_or_double_token(
ScirptLexer* lexer,
ScirptTokenType first_type,
char second_char,
ScirptTokenType second_type
)
{
ScirptPosition start = pos(lexer);
step(lexer);
if (current_is(lexer, second_char)) {
step(lexer);
return token(lexer, second_type, start);
} else {
return token(lexer, first_type, start);
}
}
ScirptToken scirpt_lexer_level_2(ScirptLexer* lexer)
{
switch (current(lexer)) {
case '0':
return single_token(lexer, TT(Int));
case '"':
return scirpt_lexer_string_token(lexer);
case '(':
return single_token(lexer, TT(LParen));
case ')':
return single_token(lexer, TT(RParen));
case '{':
return single_token(lexer, TT(LBrace));
case '}':
return single_token(lexer, TT(RBrace));
case '[':
return single_token(lexer, TT(LBracket));
case ']':
return single_token(lexer, TT(RBracket));
case '.':
return scirpt_lexer_dot_token(lexer);
case ',':
return single_token(lexer, TT(Comma));
case ':':
return single_token(lexer, TT(Colon));
case ';':
return single_token(lexer, TT(Semicolon));
case '+':
return single_or_double_token(lexer, TT(Plus), '=', TT(PlusEqual));
case '-':
return single_or_double_token(
lexer, TT(Minus), '=', TT(MinusEqual)
);
case '*':
return single_or_double_token(
lexer, TT(Asterisk), '=', TT(AsteriskEqual)
);
case '/':
return single_token(lexer, TT(Slash));
case '%':
return single_or_double_token(
lexer, TT(Percent), '=', TT(PercentEqual)
);
case '=':
return single_or_double_token(
lexer, TT(Equal), '=', TT(EqualEqual)
);
case '!':
return single_or_double_token(
lexer, TT(Exclamation), '=', TT(ExclamationEqual)
);
case '<':
return single_or_double_token(lexer, TT(Lt), '=', TT(LtEqual));
case '>':
return single_or_double_token(lexer, TT(Gt), '=', TT(GtEqual));
default:
return scirpt_lexer_level_3(lexer);
}
}
ScirptToken scirpt_lexer_level_3(ScirptLexer* lexer)
{
if (is_int_char(current(lexer)))
return scirpt_lexer_int_token(lexer);
else
return single_token(lexer, TT(InvalidChar));
}
ScirptToken scirpt_lexer_skip_whitespace(ScirptLexer* lexer)
{
step(lexer);
while (!done(lexer) && is_whitespace(current(lexer)))
step(lexer);
return scirpt_lexer_next(lexer);
}
ScirptToken scirpt_lexer_id_token(ScirptLexer* lexer)
{
ScirptPosition start = pos(lexer);
step(lexer);
while (!done(lexer) && is_id_char(current(lexer)))
step(lexer);
size_t* found_keyword = stringmap_get(
lexer->keywords, &lexer->text[start.index], lexer->index - start.index
);
if (found_keyword)
return token(lexer, (ScirptTokenType)*found_keyword, start);
else
return token(lexer, TT(Id), start);
}
ScirptToken scirpt_lexer_int_token(ScirptLexer* lexer)
{
ScirptPosition start = pos(lexer);
step(lexer);
while (!done(lexer) && is_int_char(current(lexer)))
step(lexer);
return token(lexer, TT(Int), start);
}
ScirptToken scirpt_lexer_string_token(ScirptLexer* lexer)
{
ScirptPosition start = pos(lexer);
step(lexer);
while (!done(lexer) && current(lexer) != '\"') {
char first = current(lexer);
step(lexer);
if (!done(lexer) && first == '\\')
step(lexer);
}
if (!current_is(lexer, '"'))
return token(lexer, TT(MalformedString), start);
step(lexer);
return token(lexer, TT(String), start);
}
ScirptToken scirpt_lexer_dot_token(ScirptLexer* lexer)
{
ScirptPosition start = pos(lexer);
step(lexer);
if (current_is(lexer, '.')) {
step(lexer);
while (!done(lexer) && is_int_char(current(lexer)))
step(lexer);
return token(lexer, TT(Decimals), start);
} else {
return token(lexer, TT(Dot), start);
}
}
ScirptToken scirpt_lexer_slash_token(ScirptLexer* lexer)
{
ScirptPosition start = pos(lexer);
step(lexer);
if (current_is(lexer, '/')) {
step(lexer);
while (!done(lexer) && current(lexer) != '\n')
step(lexer);
return scirpt_lexer_next(lexer);
} else if (current_is(lexer, '*')) {
step(lexer);
int depth = 0;
char last_char = '\0';
while (!done(lexer)) {
if (last_char == '/' && current(lexer) == '*') {
depth++;
} else if (last_char == '*' && current(lexer) == '/') {
depth--;
if (depth == 0) {
step(lexer);
break;
}
}
last_char = current(lexer);
step(lexer);
}
if (depth != 0)
return token(lexer, TT(MalformedComment), start);
return scirpt_lexer_next(lexer);
} else if (current_is(lexer, '=')) {
step(lexer);
return token(lexer, TT(SlashEqual), start);
} else {
return token(lexer, TT(Slash), start);
}
}
void scirpt_lexer_step(ScirptLexer* lexer)
{
lexer->index++;
if (!done(lexer)) {
if (current(lexer) == '\n') {
lexer->line++;
lexer->col = 1;
} else {
lexer->col++;
}
}
}
ScirptPosition scirpt_lexer_pos(const ScirptLexer* lexer)
{
return (ScirptPosition) {
.index = lexer->index,
.line = lexer->line,
.col = lexer->col,
};
}
ScirptToken scirpt_lexer_token(
const ScirptLexer* lexer, ScirptTokenType type, ScirptPosition start
)
{
return (ScirptToken) {
.type = type,
.pos = start,
.length = lexer->index - start.index,
};
}
bool scirpt_lexer_current_is(const ScirptLexer* lexer, char value)
{
return !done(lexer) && current(lexer) == value;
}
bool scirpt_lexer_done(const ScirptLexer* lexer)
{
return lexer->index >= lexer->text_length;
}
char scirpt_lexer_current(const ScirptLexer* lexer)
{
return lexer->text[lexer->index];
}