#include "lexer.h"
#include "utils.h"
// NOTE(review): the original #include list was garbled (five bare
// `#include` directives). These are the standard headers this file
// actually uses — confirm against the build.
#include <ctype.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

// <ctype.h> classifiers have undefined behavior for negative plain-char
// values; these wrappers cast through unsigned char first.
static bool is_space_char(char c) { return isspace((unsigned char)c) != 0; }
static bool is_digit_char(char c) { return isdigit((unsigned char)c) != 0; }
static bool is_alpha_char(char c) { return isalpha((unsigned char)c) != 0; }

static bool is_hex_digit_char(char c)
{
    return is_digit_char(c) || (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F');
}

// Characters valid inside an identifier (after the first character).
static bool is_id_char(char c)
{
    return is_alpha_char(c) || is_digit_char(c) || c == '_';
}

// Initialize a lexer over `text` (not owned; must outlive the lexer).
// Lines and columns are 1-based.
void lexer_create(Lexer* lexer, const char* text, size_t text_length)
{
    *lexer = (Lexer) {
        .text = text,
        .length = text_length,
        .line = 1,
        .column = 1,
    };
}

// Produce the next token. Whitespace and comments are skipped; at end of
// input an Eof token is returned.
Token lexer_next(Lexer* lexer)
{
    // BUG FIX: check for end-of-input BEFORE reading the current char;
    // the original read text[length], an out-of-bounds access when the
    // buffer is not NUL-terminated.
    if (lexer_done(lexer))
        return lexer_token(lexer, TokenTypeEof, lexer_position(lexer));
    char c = lexer_current(lexer);
    if (is_space_char(c))
        return lexer_skip_whitespace(lexer);
    else if (c >= '1' && c <= '9')
        // '0' is handled by lexer_make_static_token (hex/binary/float).
        return lexer_make_int_or_float(lexer);
    else if (is_alpha_char(c) || c == '_')
        return lexer_make_id(lexer);
    else
        return lexer_make_static_token(lexer);
}

// Consume a run of whitespace, then continue lexing.
Token lexer_skip_whitespace(Lexer* lexer)
{
    lexer_step(lexer);
    while (!lexer_done(lexer) && is_space_char(lexer_current(lexer)))
        lexer_step(lexer);
    return lexer_next(lexer);
}

// Lex a decimal integer, a float ("12.34"), or an integer immediately
// followed by a range operator ("12.." -> IntDoubleDot, both dots consumed).
Token lexer_make_int_or_float(Lexer* lexer)
{
    Position begin = lexer_position(lexer);
    lexer_step(lexer);
    while (!lexer_done(lexer) && is_digit_char(lexer_current(lexer)))
        lexer_step(lexer);
    if (!lexer_done(lexer) && lexer_current(lexer) == '.') {
        lexer_step(lexer);
        if (!lexer_done(lexer) && lexer_current(lexer) == '.') {
            lexer_step(lexer);
            return lexer_token(lexer, TokenTypeIntDoubleDot, begin);
        } else {
            while (!lexer_done(lexer) && is_digit_char(lexer_current(lexer)))
                lexer_step(lexer);
            return lexer_token(lexer, TokenTypeFloat, begin);
        }
    } else {
        return lexer_token(lexer, TokenTypeInt, begin);
    }
}

// Lex an identifier, a keyword, or the lone-underscore token.
Token lexer_make_id(Lexer* lexer)
{
    Position begin = lexer_position(lexer);
    char first = lexer_current(lexer);
    lexer_step(lexer);
    // BUG FIX: the original returned TokenTypeUnderscore for ANY
    // single-character identifier (e.g. "x"); only a lone "_" is the
    // underscore token.
    if (first == '_'
        && (lexer_done(lexer) || !is_id_char(lexer_current(lexer)))) {
        return lexer_token(lexer, TokenTypeUnderscore, begin);
    }
    while (!lexer_done(lexer) && is_id_char(lexer_current(lexer)))
        lexer_step(lexer);
    // Keyword lookup table replaces the original 18-branch if-chain.
    static const struct {
        const char* text;
        TokenType type;
    } keywords[] = {
        { "if", TokenTypeIf },         { "else", TokenTypeElse },
        { "while", TokenTypeWhile },   { "loop", TokenTypeLoop },
        { "for", TokenTypeFor },       { "in", TokenTypeIn },
        { "break", TokenTypeBreak },   { "let", TokenTypeLet },
        { "match", TokenTypeMatch },   { "false", TokenTypeFalse },
        { "true", TokenTypeTrue },     { "not", TokenTypeNot },
        { "and", TokenTypeAnd },       { "or", TokenTypeOr },
        { "fn", TokenTypeFn },         { "return", TokenTypeReturn },
        { "mut", TokenTypeMut },       { "defer", TokenTypeDefer },
    };
    for (size_t i = 0; i < sizeof keywords / sizeof keywords[0]; i++) {
        if (lexer_span_matches(lexer, begin, keywords[i].text))
            return lexer_token(lexer, keywords[i].type, begin);
    }
    return lexer_token(lexer, TokenTypeId, begin);
}

// True if the text from `begin` to the current position equals `value`.
bool lexer_span_matches(const Lexer* lexer, Position begin, const char* value)
{
    size_t length = lexer->index - begin.index;
    if (length != strlen(value))
        return false;
    return strncmp(&lexer->text[begin.index], value, length) == 0;
}

// Dispatch on the current (non-space, non-digit, non-id) character.
Token lexer_make_static_token(Lexer* lexer)
{
    switch (lexer_current(lexer)) {
        case '0':
            return lexer_make_int_hex_binary_or_float(lexer);
        case '\'':
            return lexer_make_char(lexer);
        case '"':
            return lexer_make_string(lexer);
        case '(':
            return lexer_make_single_char_token(lexer, TokenTypeLParen);
        case ')':
            return lexer_make_single_char_token(lexer, TokenTypeRParen);
        case '{':
            return lexer_make_single_char_token(lexer, TokenTypeLBrace);
        case '}':
            return lexer_make_single_char_token(lexer, TokenTypeRBrace);
        case '[':
            return lexer_make_single_char_token(lexer, TokenTypeLBracket);
        case ']':
            return lexer_make_single_char_token(lexer, TokenTypeRBracket);
        case '.':
            return lexer_make_dot_token(lexer);
        case ',':
            return lexer_make_single_char_token(lexer, TokenTypeComma);
        case ':':
            return lexer_make_colon_token(lexer);
        case ';':
            return lexer_make_single_char_token(lexer, TokenTypeSemicolon);
        case '&':
            return lexer_make_single_char_token(lexer, TokenTypeAmpersand);
        case '+':
            return lexer_make_single_or_double_char_token(
                lexer, TokenTypePlus, '=', TokenTypePlusEqual);
        case '-':
            return lexer_make_single_or_double_char_token(
                lexer, TokenTypeMinus, '=', TokenTypeMinusEqual);
        case '*':
            return lexer_make_single_or_double_char_token(
                lexer, TokenTypeAsterisk, '=', TokenTypeAsteriskEqual);
        case '/':
            return lexer_make_slash_token(lexer);
        case '%':
            return lexer_make_single_or_double_char_token(
                lexer, TokenTypePercent, '=', TokenTypePercentEqual);
        case '=':
            return lexer_make_single_or_double_char_token(
                lexer, TokenTypeEqual, '=', TokenTypeDoubleEqual);
        case '!':
            return lexer_make_single_or_double_char_token(
                lexer, TokenTypeExclamation, '=', TokenTypeExclamationEqual);
        case '<':
            return lexer_make_single_or_double_char_token(
                lexer, TokenTypeLt, '=', TokenTypeLtEqual);
        case '>':
            return lexer_make_single_or_double_char_token(
                lexer, TokenTypeGt, '=', TokenTypeGtEqual);
        default:
            return lexer_make_invalid_char(lexer);
    }
}

// Lex a literal starting with '0': "0.5" (float), "0x1F" (hex),
// "0b101" (binary), or plain "0" (int).
Token lexer_make_int_hex_binary_or_float(Lexer* lexer)
{
    Position begin = lexer_position(lexer);
    lexer_step(lexer);
    if (!lexer_done(lexer) && lexer_current(lexer) == '.') {
        lexer_step(lexer);
        while (!lexer_done(lexer) && is_digit_char(lexer_current(lexer)))
            lexer_step(lexer);
        return lexer_token(lexer, TokenTypeFloat, begin);
    } else if (!lexer_done(lexer)
        && (lexer_current(lexer) == 'x' || lexer_current(lexer) == 'X')) {
        lexer_step(lexer);
        while (!lexer_done(lexer) && is_hex_digit_char(lexer_current(lexer)))
            lexer_step(lexer);
        return lexer_token(lexer, TokenTypeHex, begin);
    } else if (!lexer_done(lexer)
        && (lexer_current(lexer) == 'b' || lexer_current(lexer) == 'B')) {
        lexer_step(lexer);
        while (!lexer_done(lexer)
            && (lexer_current(lexer) == '0' || lexer_current(lexer) == '1'))
            lexer_step(lexer);
        return lexer_token(lexer, TokenTypeBinary, begin);
    } else {
        return lexer_token(lexer, TokenTypeInt, begin);
    }
}

// Lex a char literal: '<char or escape>'.
Token lexer_make_char(Lexer* lexer)
{
    Position begin = lexer_position(lexer);
    lexer_step(lexer); // consume opening '\''
    if (lexer_done(lexer))
        return lexer_token(lexer, TokenTypeMalformedChar, begin);
    lexer_skip_literal_char(lexer);
    // BUG FIX: was `done && current != '\''`, which accepted any
    // non-quote character as a terminator (e.g. "'ab" lexed as Char) and
    // read out of bounds at EOF. Require a present closing quote.
    if (lexer_done(lexer) || lexer_current(lexer) != '\'')
        return lexer_token(lexer, TokenTypeMalformedChar, begin);
    lexer_step(lexer); // consume closing '\''
    return lexer_token(lexer, TokenTypeChar, begin);
}

// Lex a string literal: "<chars or escapes>".
Token lexer_make_string(Lexer* lexer)
{
    Position begin = lexer_position(lexer);
    lexer_step(lexer); // consume opening '"'
    while (!lexer_done(lexer) && lexer_current(lexer) != '"')
        lexer_skip_literal_char(lexer);
    // BUG FIX: the loop exits either at the closing quote or at end of
    // input; the original `done && current != '"'` check read past the
    // buffer in the latter case.
    if (lexer_done(lexer))
        return lexer_token(lexer, TokenTypeMalformedString, begin);
    lexer_step(lexer); // consume closing '"'
    return lexer_token(lexer, TokenTypeString, begin);
}

// Consume one literal character, including multi-character escape
// sequences: "\<c>", decimal "\<digits>" and hex "\x<hexdigits>".
void lexer_skip_literal_char(Lexer* lexer)
{
    if (lexer_current(lexer) != '\\') {
        lexer_step(lexer);
        return;
    }
    lexer_step(lexer); // consume '\\'
    if (lexer_done(lexer))
        return;
    char previous = lexer_current(lexer);
    lexer_step(lexer);
    if (previous >= '1' && previous <= '9') {
        while (!lexer_done(lexer) && is_digit_char(lexer_current(lexer)))
            lexer_step(lexer);
    } else if (previous == 'x' || previous == 'X') {
        while (!lexer_done(lexer) && is_hex_digit_char(lexer_current(lexer)))
            lexer_step(lexer);
    }
}

// Emit a one-character token of the given type.
Token lexer_make_single_char_token(Lexer* lexer, TokenType type)
{
    Position begin = lexer_position(lexer);
    lexer_step(lexer);
    return lexer_token(lexer, type, begin);
}

// Emit `double_type` if the next char is `second_char`, else `single_type`.
Token lexer_make_single_or_double_char_token(
    Lexer* lexer, TokenType single_type, char second_char, TokenType double_type)
{
    Position begin = lexer_position(lexer);
    lexer_step(lexer);
    if (!lexer_done(lexer) && lexer_current(lexer) == second_char) {
        lexer_step(lexer);
        return lexer_token(lexer, double_type, begin);
    } else {
        return lexer_token(lexer, single_type, begin);
    }
}

// Lex tokens starting with '.': "..=", "..<", "..", ".<digits>" (float)
// or plain ".".
Token lexer_make_dot_token(Lexer* lexer)
{
    Position begin = lexer_position(lexer);
    lexer_step(lexer);
    if (!lexer_done(lexer) && lexer_current(lexer) == '.') {
        lexer_step(lexer);
        if (!lexer_done(lexer) && lexer_current(lexer) == '=') {
            lexer_step(lexer);
            return lexer_token(lexer, TokenTypeDoubleDotEqual, begin);
        } else if (!lexer_done(lexer) && lexer_current(lexer) == '<') {
            lexer_step(lexer);
            return lexer_token(lexer, TokenTypeDoubleDotLt, begin);
        } else {
            return lexer_token(lexer, TokenTypeDoubleDot, begin);
        }
    } else if (!lexer_done(lexer) && is_digit_char(lexer_current(lexer))) {
        lexer_step(lexer);
        while (!lexer_done(lexer) && is_digit_char(lexer_current(lexer)))
            lexer_step(lexer);
        return lexer_token(lexer, TokenTypeFloat, begin);
    } else {
        return lexer_token(lexer, TokenTypeDot, begin);
    }
}

// Lex tokens starting with ':': "::<", "::" or plain ":".
Token lexer_make_colon_token(Lexer* lexer)
{
    Position begin = lexer_position(lexer);
    lexer_step(lexer);
    if (!lexer_done(lexer) && lexer_current(lexer) == ':') {
        lexer_step(lexer);
        if (!lexer_done(lexer) && lexer_current(lexer) == '<') {
            lexer_step(lexer);
            return lexer_token(lexer, TokenTypeDoubleColonLt, begin);
        } else {
            return lexer_token(lexer, TokenTypeDoubleColon, begin);
        }
    } else {
        return lexer_token(lexer, TokenTypeColon, begin);
    }
}

// Lex tokens starting with '/': comments, "/=", or plain "/".
Token lexer_make_slash_token(Lexer* lexer)
{
    Position begin = lexer_position(lexer);
    lexer_step(lexer);
    // BUG FIX: guard against reading past the end of input when '/' is
    // the last character of the text.
    if (lexer_done(lexer))
        return lexer_token(lexer, TokenTypeSlash, begin);
    switch (lexer_current(lexer)) {
        case '/':
            return lexer_skip_singleline_comment(lexer);
        case '*':
            return lexer_skip_multiline_comment(lexer);
        case '=':
            lexer_step(lexer);
            return lexer_token(lexer, TokenTypeSlashEqual, begin);
        default:
            return lexer_token(lexer, TokenTypeSlash, begin);
    }
}

// Consume a "//" comment up to and including the newline, then continue.
Token lexer_skip_singleline_comment(Lexer* lexer)
{
    lexer_step(lexer);
    while (!lexer_done(lexer) && lexer_current(lexer) != '\n')
        lexer_step(lexer);
    if (!lexer_done(lexer))
        lexer_step(lexer); // consume the '\n'
    return lexer_next(lexer);
}

// Consume a nestable "/* ... */" comment, then continue lexing.
// Returns MalformedMultilineComment if the comment is unterminated.
Token lexer_skip_multiline_comment(Lexer* lexer)
{
    lexer_step(lexer); // consume the '*' of the opening "/*"
    int depth = 1;
    // BUG FIX: the original consumed two characters per '*' or '/' seen,
    // so valid comments such as "/***/" were reported as malformed.
    // Scan one character at a time, remembering the previous one.
    char previous = '\0';
    while (!lexer_done(lexer) && depth != 0) {
        char c = lexer_current(lexer);
        if (previous == '/' && c == '*') {
            depth += 1;
            c = '\0'; // this '*' may not also start a "*/"
        } else if (previous == '*' && c == '/') {
            depth -= 1;
            c = '\0'; // this '/' may not also start a "/*"
        }
        previous = c;
        lexer_step(lexer);
    }
    return depth != 0
        ? lexer_token(
              lexer, TokenTypeMalformedMultilineComment, lexer_position(lexer))
        : lexer_next(lexer);
}

// Emit an InvalidChar token covering the unrecognized character.
Token lexer_make_invalid_char(Lexer* lexer)
{
    Position begin = lexer_position(lexer);
    lexer_step(lexer);
    return lexer_token(lexer, TokenTypeInvalidChar, begin);
}

// Snapshot the current index/line/column.
Position lexer_position(const Lexer* lexer)
{
    return (Position) {
        .index = lexer->index,
        .line = lexer->line,
        .column = lexer->column,
    };
}

// Build a token spanning from `begin` to the current position.
Token lexer_token(const Lexer* lexer, TokenType type, Position begin)
{
    return (Token) {
        .type = type,
        .position = begin,
        .length = lexer->index - begin.index,
    };
}

// True when the whole input has been consumed.
bool lexer_done(const Lexer* lexer)
{
    return lexer->index >= lexer->length;
}

// Current character, or '\0' at end of input.
// BUG FIX: the original unconditionally read text[index], an
// out-of-bounds access at EOF for non-NUL-terminated buffers.
char lexer_current(const Lexer* lexer)
{
    return lexer_done(lexer) ? '\0' : lexer->text[lexer->index];
}

// Advance one character, maintaining 1-based line/column counters.
void lexer_step(Lexer* lexer)
{
    if (lexer_done(lexer))
        return;
    if (lexer_current(lexer) == '\n') {
        lexer->line += 1;
        lexer->column = 1;
    } else {
        lexer->column += 1;
    }
    lexer->index += 1;
}

// Heap-allocated copy of the token's lexeme (caller frees).
// Returns NULL on allocation failure.
char* token_string(const Token* token, const char* text)
{
    char* value = calloc(token->length + 1, sizeof(char));
    if (value == NULL)
        return NULL;
    memcpy(value, &text[token->position.index], token->length);
    return value;
}

// Heap-allocated "(<TypeName>, \"<lexeme>\")" debug string (caller frees).
// Returns NULL on allocation failure.
char* token_to_string(const Token* token, const char* text)
{
    const char* type_string = token_type_to_string(token->type);
    char* value_string = token_string(token, text);
    if (value_string == NULL)
        return NULL;
    // "(" + type + ", \"" + lexeme + "\")" + NUL  ->  +7 bytes.
    size_t size = token->length + strlen(type_string) + 7;
    char* value = calloc(size, sizeof(char));
    if (value != NULL)
        snprintf(value, size, "(%s, \"%s\")", type_string, value_string);
    free(value_string);
    return value;
}

// Convenience wrapper: lexeme of `token` within this lexer's text.
char* lexer_token_string(const Lexer* lexer, const Token* token)
{
    return token_string(token, lexer->text);
}

// Stable debug name for a token type.
const char* token_type_to_string(TokenType type)
{
    switch (type) {
        case TokenTypeEof: return "TokenTypeEof";
        case TokenTypeInvalidChar: return "TokenTypeInvalidChar";
        case TokenTypeMalformedMultilineComment: return "TokenTypeMalformedMultilineComment";
        case TokenTypeMalformedChar: return "TokenTypeMalformedChar";
        case TokenTypeMalformedString: return "TokenTypeMalformedString";
        case TokenTypeId: return "TokenTypeId";
        case TokenTypeInt: return "TokenTypeInt";
        case TokenTypeIntDoubleDot: return "TokenTypeIntDoubleDot";
        case TokenTypeHex: return "TokenTypeHex";
        case TokenTypeBinary: return "TokenTypeBinary";
        case TokenTypeFloat: return "TokenTypeFloat";
        case TokenTypeChar: return "TokenTypeChar";
        case TokenTypeString: return "TokenTypeString";
        case TokenTypeIf: return "TokenTypeIf";
        case TokenTypeElse: return "TokenTypeElse";
        case TokenTypeLoop: return "TokenTypeLoop";
        case TokenTypeWhile: return "TokenTypeWhile";
        case TokenTypeFor: return "TokenTypeFor";
        case TokenTypeIn: return "TokenTypeIn";
        case TokenTypeBreak: return "TokenTypeBreak";
        case TokenTypeLet: return "TokenTypeLet";
        case TokenTypeMatch: return "TokenTypeMatch";
        case TokenTypeFalse: return "TokenTypeFalse";
        case TokenTypeTrue: return "TokenTypeTrue";
        case TokenTypeNot: return "TokenTypeNot";
        case TokenTypeAnd: return "TokenTypeAnd";
        case TokenTypeOr: return "TokenTypeOr";
        case TokenTypeFn: return "TokenTypeFn";
        case TokenTypeReturn: return "TokenTypeReturn";
        case TokenTypeMut: return "TokenTypeMut";
        case TokenTypeDefer: return "TokenTypeDefer";
        case TokenTypeLParen: return "TokenTypeLParen";
        case TokenTypeRParen: return "TokenTypeRParen";
        case TokenTypeLBrace: return "TokenTypeLBrace";
        case TokenTypeRBrace: return "TokenTypeRBrace";
        case TokenTypeLBracket: return "TokenTypeLBracket";
        case TokenTypeRBracket: return "TokenTypeRBracket";
        case TokenTypeComma: return "TokenTypeComma";
        case TokenTypeColon: return "TokenTypeColon";
        case TokenTypeDoubleColon: return "TokenTypeDoubleColon";
        case TokenTypeDoubleColonLt: return "TokenTypeDoubleColonLt";
        case TokenTypeSemicolon: return "TokenTypeSemicolon";
        case TokenTypeAmpersand: return "TokenTypeAmpersand";
        case TokenTypeUnderscore: return "TokenTypeUnderscore";
        case TokenTypeDot: return "TokenTypeDot";
        case TokenTypeDoubleDot: return "TokenTypeDoubleDot";
        case TokenTypeDoubleDotEqual: return "TokenTypeDoubleDotEqual";
        case TokenTypeDoubleDotLt: return "TokenTypeDoubleDotLt";
        case TokenTypePlusEqual: return "TokenTypePlusEqual";
        case TokenTypeMinusEqual: return "TokenTypeMinusEqual";
        case TokenTypeAsteriskEqual: return "TokenTypeAsteriskEqual";
        case TokenTypeSlashEqual: return "TokenTypeSlashEqual";
        case TokenTypePercentEqual: return "TokenTypePercentEqual";
        case TokenTypeDoubleEqual: return "TokenTypeDoubleEqual";
        case TokenTypeExclamationEqual: return "TokenTypeExclamationEqual";
        case TokenTypeLtEqual: return "TokenTypeLtEqual";
        case TokenTypeGtEqual: return "TokenTypeGtEqual";
        case TokenTypePlus: return "TokenTypePlus";
        case TokenTypeMinus: return "TokenTypeMinus";
        case TokenTypeAsterisk: return "TokenTypeAsterisk";
        case TokenTypeSlash: return "TokenTypeSlash";
        case TokenTypePercent: return "TokenTypePercent";
        case TokenTypeEqual: return "TokenTypeEqual";
        case TokenTypeExclamation: return "TokenTypeExclamation";
        case TokenTypeLt: return "TokenTypeLt";
        case TokenTypeGt: return "TokenTypeGt";
        default: ASSERT_EXHAUSTIVE_MATCH();
    }
}