/*
 * wacc/lexer.c — hand-written lexer for the wacc compiler.
 * Snapshot date: 2023-03-06 01:47:12 +01:00
 */
#include "lexer.h"
#include "utils.h"
#include <ctype.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
// Initialize a lexer over `text` (length `text_length`, not necessarily
// NUL-terminated). Cursor starts at index 0, line 1, column 1.
void lexer_create(Lexer* lexer, const char* text, size_t text_length)
{
    Lexer initial = {
        .text = text,
        .length = text_length,
        .line = 1,
        .column = 1,
    };
    *lexer = initial;
}
// Produce the next token from the input, skipping whitespace and comments.
// Returns a TokenTypeEof token once the input is exhausted.
Token lexer_next(Lexer* lexer)
{
    // Check for end-of-input BEFORE reading the current byte: the previous
    // version called lexer_current() first, which indexes text[length] at
    // EOF — out of bounds unless the buffer happens to be NUL-terminated.
    if (lexer_done(lexer))
        return lexer_token(lexer, TokenTypeEof, lexer_position(lexer));
    char c = lexer_current(lexer);
    // Cast to unsigned char: passing a negative char to <ctype.h> is UB.
    if (isspace((unsigned char)c))
        return lexer_skip_whitespace(lexer);
    // '0' is deliberately excluded here; it is dispatched through
    // lexer_make_static_token to handle "0x…", "0b…" and "0.…" forms.
    if (c >= '1' && c <= '9')
        return lexer_make_int_or_float(lexer);
    if (isalpha((unsigned char)c) || c == '_')
        return lexer_make_id(lexer);
    return lexer_make_static_token(lexer);
}
// Consume a run of whitespace (the current char is already known to be
// whitespace), then resume normal tokenization.
Token lexer_skip_whitespace(Lexer* lexer)
{
    do {
        lexer_step(lexer);
    } while (!lexer_done(lexer) && isspace(lexer_current(lexer)));
    return lexer_next(lexer);
}
// Lex a number starting with 1-9: an integer ("42"), a float ("4.2"),
// or an integer followed by a range operator ("4.." → TokenTypeIntDoubleDot,
// dots included in the span).
Token lexer_make_int_or_float(Lexer* lexer)
{
    Position begin = lexer_position(lexer);
    lexer_step(lexer);
    while (!lexer_done(lexer) && isdigit(lexer_current(lexer)))
        lexer_step(lexer);
    if (lexer_done(lexer) || lexer_current(lexer) != '.')
        return lexer_token(lexer, TokenTypeInt, begin);
    lexer_step(lexer); // consume the first '.'
    if (!lexer_done(lexer) && lexer_current(lexer) == '.') {
        lexer_step(lexer); // ".." after an int, e.g. "1..5"
        return lexer_token(lexer, TokenTypeIntDoubleDot, begin);
    }
    while (!lexer_done(lexer) && isdigit(lexer_current(lexer)))
        lexer_step(lexer);
    return lexer_token(lexer, TokenTypeFloat, begin);
}
// Lex an identifier or keyword (first char is already known to be a letter
// or '_'). A span consisting of exactly "_" becomes TokenTypeUnderscore.
Token lexer_make_id(Lexer* lexer)
{
    static const struct {
        const char* text;
        TokenType type;
    } keywords[] = {
        { "if", TokenTypeIf },
        { "else", TokenTypeElse },
        { "while", TokenTypeWhile },
        { "loop", TokenTypeLoop },
        { "for", TokenTypeFor },
        { "in", TokenTypeIn },
        { "break", TokenTypeBreak },
        { "let", TokenTypeLet },
        { "match", TokenTypeMatch },
        { "false", TokenTypeFalse },
        { "true", TokenTypeTrue },
        { "not", TokenTypeNot },
        { "and", TokenTypeAnd },
        { "or", TokenTypeOr },
        { "fn", TokenTypeFn },
        { "return", TokenTypeReturn },
        { "mut", TokenTypeMut },
        { "defer", TokenTypeDefer },
    };
    Position begin = lexer_position(lexer);
    lexer_step(lexer);
    while (!lexer_done(lexer)
        && (isalnum((unsigned char)lexer_current(lexer))
            || lexer_current(lexer) == '_'))
        lexer_step(lexer);
    // BUG FIX: the old code returned TokenTypeUnderscore whenever the token
    // was one character long, so any single-letter identifier ("a") was
    // mis-tokenized. Only the literal "_" is the underscore token.
    if (lexer_span_matches(lexer, begin, "_"))
        return lexer_token(lexer, TokenTypeUnderscore, begin);
    for (size_t i = 0; i < sizeof(keywords) / sizeof(keywords[0]); i++)
        if (lexer_span_matches(lexer, begin, keywords[i].text))
            return lexer_token(lexer, keywords[i].type, begin);
    return lexer_token(lexer, TokenTypeId, begin);
}
// True when the input span [begin.index, lexer->index) is byte-for-byte
// equal to the NUL-terminated string `value`.
bool lexer_span_matches(const Lexer* lexer, Position begin, const char* value)
{
    size_t span_length = lexer->index - begin.index;
    return span_length == strlen(value)
        && memcmp(&lexer->text[begin.index], value, span_length) == 0;
}
// Dispatch on the current character to lex operators, punctuation, and the
// literal forms not handled by lexer_next's fast paths ('0'-prefixed numbers,
// chars, strings). Caller guarantees the lexer is not done.
Token lexer_make_static_token(Lexer* lexer)
{
    switch (lexer_current(lexer)) {
    case '0':
        // "0", "0x…", "0b…", "0.…" all start here.
        return lexer_make_int_hex_binary_or_float(lexer);
    case '\'':
        return lexer_make_char(lexer);
    case '"':
        return lexer_make_string(lexer);
    case '(':
        return lexer_make_single_char_token(lexer, TokenTypeLParen);
    case ')':
        return lexer_make_single_char_token(lexer, TokenTypeRParen);
    case '{':
        return lexer_make_single_char_token(lexer, TokenTypeLBrace);
    case '}':
        return lexer_make_single_char_token(lexer, TokenTypeRBrace);
    case '[':
        return lexer_make_single_char_token(lexer, TokenTypeLBracket);
    case ']':
        return lexer_make_single_char_token(lexer, TokenTypeRBracket);
    case '.':
        // ".", "..", "..=", "..<", or a ".5"-style float.
        return lexer_make_dot_token(lexer);
    case ',':
        return lexer_make_single_char_token(lexer, TokenTypeComma);
    case ':':
        // ":", "::", or "::<".
        return lexer_make_colon_token(lexer);
    case ';':
        return lexer_make_single_char_token(lexer, TokenTypeSemicolon);
    case '&':
        return lexer_make_single_char_token(lexer, TokenTypeAmpersand);
    case '+':
        return lexer_make_single_or_double_char_token(
            lexer, TokenTypePlus, '=', TokenTypePlusEqual);
    case '-':
        return lexer_make_single_or_double_char_token(
            lexer, TokenTypeMinus, '=', TokenTypeMinusEqual);
    case '*':
        return lexer_make_single_or_double_char_token(
            lexer, TokenTypeAsterisk, '=', TokenTypeAsteriskEqual);
    case '/':
        // "/", "/=", or the start of a comment.
        return lexer_make_slash_token(lexer);
    case '%':
        return lexer_make_single_or_double_char_token(
            lexer, TokenTypePercent, '=', TokenTypePercentEqual);
    case '=':
        return lexer_make_single_or_double_char_token(
            lexer, TokenTypeEqual, '=', TokenTypeDoubleEqual);
    case '!':
        return lexer_make_single_or_double_char_token(
            lexer, TokenTypeExclamation, '=', TokenTypeExclamationEqual);
    case '<':
        return lexer_make_single_or_double_char_token(
            lexer, TokenTypeLt, '=', TokenTypeLtEqual);
    case '>':
        return lexer_make_single_or_double_char_token(
            lexer, TokenTypeGt, '=', TokenTypeGtEqual);
    default:
        return lexer_make_invalid_char(lexer);
    }
}
// Lex a number that starts with '0': "0.…" → float, "0x…"/"0X…" → hex,
// "0b…"/"0B…" → binary, plain "0" → int.
Token lexer_make_int_hex_binary_or_float(Lexer* lexer)
{
    Position begin = lexer_position(lexer);
    lexer_step(lexer); // consume the leading '0'
    if (lexer_done(lexer))
        return lexer_token(lexer, TokenTypeInt, begin);
    char marker = lexer_current(lexer);
    if (marker == '.') {
        lexer_step(lexer);
        while (!lexer_done(lexer) && isdigit(lexer_current(lexer)))
            lexer_step(lexer);
        return lexer_token(lexer, TokenTypeFloat, begin);
    }
    if (marker == 'x' || marker == 'X') {
        lexer_step(lexer);
        // isxdigit matches exactly 0-9, a-f, A-F — the same set the
        // original spelled out by hand.
        while (!lexer_done(lexer) && isxdigit(lexer_current(lexer)))
            lexer_step(lexer);
        return lexer_token(lexer, TokenTypeHex, begin);
    }
    if (marker == 'b' || marker == 'B') {
        lexer_step(lexer);
        while (!lexer_done(lexer)
            && (lexer_current(lexer) == '0' || lexer_current(lexer) == '1'))
            lexer_step(lexer);
        return lexer_token(lexer, TokenTypeBinary, begin);
    }
    return lexer_token(lexer, TokenTypeInt, begin);
}
// Lex a character literal: '<char or escape>'. Produces
// TokenTypeMalformedChar when the closing quote is missing.
Token lexer_make_char(Lexer* lexer)
{
    Position begin = lexer_position(lexer);
    lexer_step(lexer); // consume the opening '\''
    if (lexer_done(lexer))
        return lexer_token(lexer, TokenTypeMalformedChar, begin);
    lexer_skip_literal_char(lexer);
    // BUG FIX: this condition used `&&`, so an unclosed literal such as
    // 'ab was accepted as a valid char token (and, when done, the old code
    // still read lexer_current() past the end of the buffer).
    if (lexer_done(lexer) || lexer_current(lexer) != '\'')
        return lexer_token(lexer, TokenTypeMalformedChar, begin);
    lexer_step(lexer); // consume the closing '\''
    return lexer_token(lexer, TokenTypeChar, begin);
}
// Lex a string literal: "...". Produces TokenTypeMalformedString when the
// input ends before the closing quote.
Token lexer_make_string(Lexer* lexer)
{
    Position begin = lexer_position(lexer);
    lexer_step(lexer); // consume the opening '"'
    while (!lexer_done(lexer) && lexer_current(lexer) != '"')
        lexer_skip_literal_char(lexer);
    // The loop only exits when done or at the closing quote, so reaching the
    // end means the string is unterminated. (The old check additionally read
    // lexer_current() when done — an out-of-bounds access.)
    if (lexer_done(lexer))
        return lexer_token(lexer, TokenTypeMalformedString, begin);
    lexer_step(lexer); // consume the closing '"'
    return lexer_token(lexer, TokenTypeString, begin);
}
// Consume one logical character inside a char/string literal: either a
// plain character or a backslash escape (including multi-digit decimal
// "\123" and hex "\x1f" forms). Caller guarantees the lexer is not done.
void lexer_skip_literal_char(Lexer* lexer)
{
    char first = lexer_current(lexer);
    lexer_step(lexer);
    if (first != '\\' || lexer_done(lexer))
        return;
    char escape = lexer_current(lexer);
    lexer_step(lexer);
    if (escape >= '1' && escape <= '9') {
        // Decimal escape: consume the remaining digits.
        while (!lexer_done(lexer) && isdigit(lexer_current(lexer)))
            lexer_step(lexer);
    } else if (escape == 'x' || escape == 'X') {
        // Hex escape: consume the hex digits.
        while (!lexer_done(lexer)
            && (isdigit(lexer_current(lexer))
                || (lexer_current(lexer) >= 'a' && lexer_current(lexer) <= 'f')
                || (lexer_current(lexer) >= 'A'
                    && lexer_current(lexer) <= 'F')))
            lexer_step(lexer);
    }
}
// Emit a one-character token of the given type and advance past it.
Token lexer_make_single_char_token(Lexer* lexer, TokenType type)
{
    Position start = lexer_position(lexer);
    lexer_step(lexer);
    return lexer_token(lexer, type, start);
}
// Emit `double_type` when the current char is followed by `second_char`
// (e.g. "+=" vs "+"), otherwise `single_type`.
Token lexer_make_single_or_double_char_token(Lexer* lexer,
    TokenType single_type, char second_char, TokenType double_type)
{
    Position start = lexer_position(lexer);
    lexer_step(lexer);
    bool has_second
        = !lexer_done(lexer) && lexer_current(lexer) == second_char;
    if (has_second)
        lexer_step(lexer);
    return lexer_token(lexer, has_second ? double_type : single_type, start);
}
// Lex a token starting with '.': ".", "..", "..=", "..<", or a
// leading-dot float literal like ".5".
Token lexer_make_dot_token(Lexer* lexer)
{
    Position begin = lexer_position(lexer);
    lexer_step(lexer);
    if (lexer_done(lexer))
        return lexer_token(lexer, TokenTypeDot, begin);
    char next = lexer_current(lexer);
    if (next == '.') {
        lexer_step(lexer);
        TokenType type = TokenTypeDoubleDot;
        if (!lexer_done(lexer) && lexer_current(lexer) == '=') {
            lexer_step(lexer);
            type = TokenTypeDoubleDotEqual;
        } else if (!lexer_done(lexer) && lexer_current(lexer) == '<') {
            lexer_step(lexer);
            type = TokenTypeDoubleDotLt;
        }
        return lexer_token(lexer, type, begin);
    }
    if (isdigit(next)) {
        // ".5"-style float: consume the digit run after the dot.
        do {
            lexer_step(lexer);
        } while (!lexer_done(lexer) && isdigit(lexer_current(lexer)));
        return lexer_token(lexer, TokenTypeFloat, begin);
    }
    return lexer_token(lexer, TokenTypeDot, begin);
}
// Lex a token starting with ':': ":", "::", or "::<".
Token lexer_make_colon_token(Lexer* lexer)
{
    Position begin = lexer_position(lexer);
    lexer_step(lexer);
    if (lexer_done(lexer) || lexer_current(lexer) != ':')
        return lexer_token(lexer, TokenTypeColon, begin);
    lexer_step(lexer);
    if (!lexer_done(lexer) && lexer_current(lexer) == '<') {
        lexer_step(lexer);
        return lexer_token(lexer, TokenTypeDoubleColonLt, begin);
    }
    return lexer_token(lexer, TokenTypeDoubleColon, begin);
}
// Lex a token starting with '/': "/", "/=", or the start of a
// single-line ("//") or multi-line ("/*") comment.
Token lexer_make_slash_token(Lexer* lexer)
{
    Position begin = lexer_position(lexer);
    lexer_step(lexer);
    // BUG FIX: the old code switched on lexer_current() without checking
    // lexer_done() first, reading text[length] when '/' is the last byte.
    if (lexer_done(lexer))
        return lexer_token(lexer, TokenTypeSlash, begin);
    switch (lexer_current(lexer)) {
    case '/':
        return lexer_skip_singleline_comment(lexer);
    case '*':
        return lexer_skip_multiline_comment(lexer);
    case '=':
        lexer_step(lexer);
        return lexer_token(lexer, TokenTypeSlashEqual, begin);
    default:
        return lexer_token(lexer, TokenTypeSlash, begin);
    }
}
// Skip a "//" comment up to and including the newline, then resume
// tokenization.
Token lexer_skip_singleline_comment(Lexer* lexer)
{
    lexer_step(lexer); // consume the second '/'
    while (!lexer_done(lexer) && lexer_current(lexer) != '\n')
        lexer_step(lexer);
    // If the loop stopped before EOF, the current char is the newline.
    if (!lexer_done(lexer))
        lexer_step(lexer);
    return lexer_next(lexer);
}
// Skip a nestable "/* ... */" comment and resume tokenization, or emit
// TokenTypeMalformedMultilineComment when the input ends inside it.
Token lexer_skip_multiline_comment(Lexer* lexer)
{
    lexer_step(lexer); // consume the '*' of the opener
    int depth = 1;
    // BUG FIX: the old loop stepped once inside the '/'/'*' branches and
    // again at the bottom, consuming two chars per probe, so terminators
    // like "**/" were skipped over and "/* **/" was reported malformed.
    // Scan one char at a time, remembering the previous char instead.
    char previous = '\0';
    while (!lexer_done(lexer) && depth != 0) {
        char c = lexer_current(lexer);
        if (previous == '/' && c == '*') {
            depth += 1;
            c = '\0'; // this '*' may not also start a "*/"
        } else if (previous == '*' && c == '/') {
            depth -= 1;
            c = '\0'; // this '/' may not also start a "/*"
        }
        previous = c;
        lexer_step(lexer);
    }
    return depth != 0 ? lexer_token(
               lexer, TokenTypeMalformedMultilineComment, lexer_position(lexer))
                      : lexer_next(lexer);
}
// Consume one unrecognized character and emit an invalid-char token for it.
Token lexer_make_invalid_char(Lexer* lexer)
{
    Position start = lexer_position(lexer);
    lexer_step(lexer);
    return lexer_token(lexer, TokenTypeInvalidChar, start);
}
Position lexer_position(const Lexer* lexer)
{
return (Position) {
.index = lexer->index,
.line = lexer->line,
.column = lexer->column,
};
}
// Build a token of the given type spanning [begin.index, lexer->index).
Token lexer_token(const Lexer* lexer, TokenType type, Position begin)
{
    Token token = {
        .type = type,
        .position = begin,
        .length = lexer->index - begin.index,
    };
    return token;
}
// True once every byte of the input has been consumed.
bool lexer_done(const Lexer* lexer) { return lexer->index >= lexer->length; }
// The byte under the cursor; only valid while !lexer_done(lexer).
char lexer_current(const Lexer* lexer) { return lexer->text[lexer->index]; }
// Advance the cursor one byte, keeping the 1-based line/column counters in
// sync. No-op at end of input.
void lexer_step(Lexer* lexer)
{
    if (lexer_done(lexer))
        return;
    bool at_newline = lexer_current(lexer) == '\n';
    if (at_newline)
        lexer->line += 1;
    lexer->column = at_newline ? 1 : lexer->column + 1;
    lexer->index += 1;
}
// Heap-allocate a NUL-terminated copy of the token's text taken from the
// original input buffer. Caller owns the result and must free() it.
// Returns NULL on allocation failure.
char* token_string(const Token* token, const char* text)
{
    char* value = calloc(token->length + 1, sizeof(char));
    // The old code passed an unchecked calloc result straight to strncpy.
    if (!value)
        return NULL;
    // memcpy is enough: the span length is exact and calloc already
    // provided the terminating NUL.
    memcpy(value, &text[token->position.index], token->length);
    return value;
}
// Render a token as `(TypeName, "text")` for debugging. Caller owns the
// returned heap string and must free() it. Returns NULL on allocation
// failure.
char* token_to_string(const Token* token, const char* text)
{
    const char* type_string = token_type_to_string(token->type);
    char* value_string = token_string(token, text);
    // The old code never checked the allocations before using them.
    if (!value_string)
        return NULL;
    // '(' + ", \"" + "\")" + NUL = 7 bytes on top of the two strings.
    size_t size = token->length + strlen(type_string) + 7;
    char* value = calloc(size, sizeof(char));
    if (value)
        snprintf(value, size, "(%s, \"%s\")", type_string, value_string);
    free(value_string);
    return value;
}
// Convenience wrapper: extract a token's text from this lexer's own input
// buffer. Caller owns the returned string.
char* lexer_token_string(const Lexer* lexer, const Token* token)
{
    const char* source = lexer->text;
    return token_string(token, source);
}
// Map a TokenType to its enumerator name as a static string (for debug
// output). Falls through to ASSERT_EXHAUSTIVE_MATCH (utils.h) on an
// unknown value.
const char* token_type_to_string(TokenType type)
{
    switch (type) {
    case TokenTypeEof:
        return "TokenTypeEof";
    case TokenTypeInvalidChar:
        return "TokenTypeInvalidChar";
    case TokenTypeMalformedMultilineComment:
        return "TokenTypeMalformedMultilineComment";
    case TokenTypeMalformedChar:
        return "TokenTypeMalformedChar";
    case TokenTypeMalformedString:
        return "TokenTypeMalformedString";
    case TokenTypeId:
        return "TokenTypeId";
    case TokenTypeInt:
        return "TokenTypeInt";
    case TokenTypeIntDoubleDot:
        return "TokenTypeIntDoubleDot";
    case TokenTypeHex:
        return "TokenTypeHex";
    case TokenTypeBinary:
        return "TokenTypeBinary";
    case TokenTypeFloat:
        return "TokenTypeFloat";
    case TokenTypeChar:
        return "TokenTypeChar";
    case TokenTypeString:
        return "TokenTypeString";
    case TokenTypeIf:
        return "TokenTypeIf";
    case TokenTypeElse:
        return "TokenTypeElse";
    case TokenTypeLoop:
        return "TokenTypeLoop";
    case TokenTypeWhile:
        return "TokenTypeWhile";
    case TokenTypeFor:
        return "TokenTypeFor";
    case TokenTypeIn:
        return "TokenTypeIn";
    case TokenTypeBreak:
        return "TokenTypeBreak";
    case TokenTypeLet:
        return "TokenTypeLet";
    case TokenTypeMatch:
        return "TokenTypeMatch";
    case TokenTypeFalse:
        return "TokenTypeFalse";
    case TokenTypeTrue:
        return "TokenTypeTrue";
    case TokenTypeNot:
        return "TokenTypeNot";
    case TokenTypeAnd:
        return "TokenTypeAnd";
    case TokenTypeOr:
        return "TokenTypeOr";
    case TokenTypeFn:
        return "TokenTypeFn";
    case TokenTypeReturn:
        return "TokenTypeReturn";
    case TokenTypeMut:
        return "TokenTypeMut";
    case TokenTypeDefer:
        return "TokenTypeDefer";
    case TokenTypeLParen:
        return "TokenTypeLParen";
    case TokenTypeRParen:
        return "TokenTypeRParen";
    case TokenTypeLBrace:
        return "TokenTypeLBrace";
    case TokenTypeRBrace:
        return "TokenTypeRBrace";
    case TokenTypeLBracket:
        return "TokenTypeLBracket";
    case TokenTypeRBracket:
        return "TokenTypeRBracket";
    case TokenTypeComma:
        return "TokenTypeComma";
    case TokenTypeColon:
        return "TokenTypeColon";
    case TokenTypeDoubleColon:
        return "TokenTypeDoubleColon";
    case TokenTypeDoubleColonLt:
        return "TokenTypeDoubleColonLt";
    case TokenTypeSemicolon:
        return "TokenTypeSemicolon";
    case TokenTypeAmpersand:
        return "TokenTypeAmpersand";
    case TokenTypeUnderscore:
        return "TokenTypeUnderscore";
    case TokenTypeDot:
        return "TokenTypeDot";
    case TokenTypeDoubleDot:
        return "TokenTypeDoubleDot";
    case TokenTypeDoubleDotEqual:
        return "TokenTypeDoubleDotEqual";
    case TokenTypeDoubleDotLt:
        return "TokenTypeDoubleDotLt";
    case TokenTypePlusEqual:
        return "TokenTypePlusEqual";
    case TokenTypeMinusEqual:
        return "TokenTypeMinusEqual";
    case TokenTypeAsteriskEqual:
        return "TokenTypeAsteriskEqual";
    case TokenTypeSlashEqual:
        return "TokenTypeSlashEqual";
    case TokenTypePercentEqual:
        return "TokenTypePercentEqual";
    case TokenTypeDoubleEqual:
        return "TokenTypeDoubleEqual";
    case TokenTypeExclamationEqual:
        return "TokenTypeExclamationEqual";
    case TokenTypeLtEqual:
        return "TokenTypeLtEqual";
    case TokenTypeGtEqual:
        return "TokenTypeGtEqual";
    case TokenTypePlus:
        return "TokenTypePlus";
    case TokenTypeMinus:
        return "TokenTypeMinus";
    case TokenTypeAsterisk:
        return "TokenTypeAsterisk";
    case TokenTypeSlash:
        return "TokenTypeSlash";
    case TokenTypePercent:
        return "TokenTypePercent";
    case TokenTypeEqual:
        return "TokenTypeEqual";
    case TokenTypeExclamation:
        return "TokenTypeExclamation";
    case TokenTypeLt:
        return "TokenTypeLt";
    case TokenTypeGt:
        return "TokenTypeGt";
    default:
        ASSERT_EXHAUSTIVE_MATCH();
    }
}