/* lexer.c — hand-written lexer: turns a source text buffer into Tokens
 * (keywords, identifiers, numeric/char/string literals, operators);
 * whitespace and comments are skipped. */

#include "lexer.h"
/* NOTE(review): the original header names were lost (empty #include lines);
 * reconstructed from the library functions used below — confirm. */
#include <ctype.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Abort loudly when a switch reaches a case the author believed impossible. */
#define ASSERT_EXHAUSTIVE_MATCH() \
    (fprintf(stderr, "unexhaustive match at %s:%d in %s()\n", __FILE__, __LINE__, __func__), \
        exit(1))

Token lexer_skip_whitespace(Lexer* lexer);
Token lexer_make_int_or_float(Lexer* lexer);
Token lexer_make_id(Lexer* lexer);
bool lexer_span_matches(const Lexer* lexer, Position begin, const char* value);
Token lexer_make_static_token(Lexer* lexer);
Token lexer_make_int_hex_binary_or_float(Lexer* lexer);
Token lexer_make_char(Lexer* lexer);
Token lexer_make_string(Lexer* lexer);
void lexer_skip_literal_char(Lexer* lexer);
Token lexer_make_single_char_token(Lexer* lexer, TokenType type);
Token lexer_make_slash_token(Lexer* lexer);
Token lexer_skip_singleline_comment(Lexer* lexer);
Token lexer_make_single_or_double_char_token(
    Lexer* lexer, TokenType single_type, char second_char, TokenType double_type);
Token lexer_skip_multiline_comment(Lexer* lexer);
Token lexer_make_invalid_char(Lexer* lexer);
Position lexer_position(const Lexer* lexer);
Token lexer_token(const Lexer* lexer, TokenType type, Position begin);
bool lexer_done(const Lexer* lexer);
char lexer_current(const Lexer* lexer);
void lexer_step(Lexer* lexer);

/* <ctype.h> classifiers require a value representable as unsigned char (or
 * EOF); passing a plain, possibly negative char is UB (CERT STR37-C). */
static bool is_space_char(char c) { return isspace((unsigned char)c) != 0; }
static bool is_digit_char(char c) { return isdigit((unsigned char)c) != 0; }
static bool is_alpha_char(char c) { return isalpha((unsigned char)c) != 0; }

/* [0-9a-fA-F] */
static bool is_hex_digit_char(char c)
{
    return is_digit_char(c) || (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F');
}

/* Character legal inside an identifier (after the first char). */
static bool is_id_char(char c)
{
    return is_alpha_char(c) || is_digit_char(c) || c == '_';
}

/* Initialize a lexer over text[0..text_length); line/column are 1-based.
 * The lexer borrows `text`; it does not copy or free it. */
void lexer_create(Lexer* lexer, const char* text, size_t text_length)
{
    *lexer = (Lexer) {
        .text = text,
        .length = text_length,
        .line = 1,
        .column = 1,
    };
}

/* Produce the next token (TokenTypeEof at end of input). Note: '0' is
 * deliberately routed through the static-token path so hex ("0x..") and
 * binary ("0b..") literals can be recognized. */
Token lexer_next(Lexer* lexer)
{
    /* BUG FIX: check for end-of-input BEFORE reading the current character;
     * the old code read text[length] at EOF. */
    if (lexer_done(lexer))
        return lexer_token(lexer, TokenTypeEof, lexer_position(lexer));
    char c = lexer_current(lexer);
    if (is_space_char(c))
        return lexer_skip_whitespace(lexer);
    else if (c >= '1' && c <= '9')
        return lexer_make_int_or_float(lexer);
    else if (is_alpha_char(c) || c == '_')
        return lexer_make_id(lexer);
    else
        return lexer_make_static_token(lexer);
}

/* Consume a run of whitespace, then lex whatever follows. */
Token lexer_skip_whitespace(Lexer* lexer)
{
    lexer_step(lexer);
    while (!lexer_done(lexer) && is_space_char(lexer_current(lexer)))
        lexer_step(lexer);
    return lexer_next(lexer);
}

/* Lex [1-9][0-9]* (Int), optionally followed by '.'[0-9]* (Float). */
Token lexer_make_int_or_float(Lexer* lexer)
{
    Position begin = lexer_position(lexer);
    lexer_step(lexer);
    while (!lexer_done(lexer) && is_digit_char(lexer_current(lexer)))
        lexer_step(lexer);
    if (!lexer_done(lexer) && lexer_current(lexer) == '.') {
        lexer_step(lexer);
        while (!lexer_done(lexer) && is_digit_char(lexer_current(lexer)))
            lexer_step(lexer);
        return lexer_token(lexer, TokenTypeFloat, begin);
    } else {
        return lexer_token(lexer, TokenTypeInt, begin);
    }
}

/* Lex an identifier, keyword, or the lone-underscore token. */
Token lexer_make_id(Lexer* lexer)
{
    Position begin = lexer_position(lexer);
    lexer_step(lexer);
    while (!lexer_done(lexer) && is_id_char(lexer_current(lexer)))
        lexer_step(lexer);
    /* BUG FIX: the old code returned TokenTypeUnderscore for ANY one-char
     * identifier (so "x" lexed as Underscore); only a lone "_" qualifies. */
    if (lexer_span_matches(lexer, begin, "_"))
        return lexer_token(lexer, TokenTypeUnderscore, begin);
    static const struct {
        const char* word;
        TokenType type;
    } keywords[] = {
        { "if", TokenTypeIf },
        { "else", TokenTypeElse },
        { "while", TokenTypeWhile },
        { "loop", TokenTypeLoop },
        { "for", TokenTypeFor },
        { "in", TokenTypeIn },
        { "break", TokenTypeBreak },
        { "let", TokenTypeLet },
        { "match", TokenTypeMatch },
        { "false", TokenTypeFalse },
        { "true", TokenTypeTrue },
        { "not", TokenTypeNot },
        { "and", TokenTypeAnd },
        { "or", TokenTypeOr },
        { "fn", TokenTypeFn },
        { "return", TokenTypeReturn },
    };
    for (size_t i = 0; i < sizeof keywords / sizeof keywords[0]; ++i)
        if (lexer_span_matches(lexer, begin, keywords[i].word))
            return lexer_token(lexer, keywords[i].type, begin);
    return lexer_token(lexer, TokenTypeId, begin);
}

/* True if the text from `begin` up to the current position equals `value`. */
bool lexer_span_matches(const Lexer* lexer, Position begin, const char* value)
{
    size_t length = lexer->index - begin.index;
    if (length != strlen(value))
        return false;
    return strncmp(&lexer->text[begin.index], value, length) == 0;
}

/* Dispatch on the current character for literals, punctuation and operators. */
Token lexer_make_static_token(Lexer* lexer)
{
    switch (lexer_current(lexer)) {
        case '0':
            return lexer_make_int_hex_binary_or_float(lexer);
        case '\'':
            return lexer_make_char(lexer);
        case '"':
            return lexer_make_string(lexer);
        case '(':
            return lexer_make_single_char_token(lexer, TokenTypeLParen);
        case ')':
            return lexer_make_single_char_token(lexer, TokenTypeRParen);
        case '{':
            return lexer_make_single_char_token(lexer, TokenTypeLBrace);
        case '}':
            return lexer_make_single_char_token(lexer, TokenTypeRBrace);
        case '[':
            return lexer_make_single_char_token(lexer, TokenTypeLBracket);
        case ']':
            return lexer_make_single_char_token(lexer, TokenTypeRBracket);
        case '.':
            return lexer_make_single_or_double_char_token(
                lexer, TokenTypeDot, '.', TokenTypeDoubleDot);
        case ',':
            return lexer_make_single_char_token(lexer, TokenTypeComma);
        case ':':
            return lexer_make_single_char_token(lexer, TokenTypeColon);
        case ';':
            return lexer_make_single_char_token(lexer, TokenTypeSemicolon);
        case '&':
            return lexer_make_single_char_token(lexer, TokenTypeAmpersand);
        case '+':
            return lexer_make_single_or_double_char_token(
                lexer, TokenTypePlus, '=', TokenTypePlusEqual);
        case '-':
            return lexer_make_single_or_double_char_token(
                lexer, TokenTypeMinus, '=', TokenTypeMinusEqual);
        case '*':
            return lexer_make_single_or_double_char_token(
                lexer, TokenTypeAsterisk, '=', TokenTypeAsteriskEqual);
        case '/':
            return lexer_make_slash_token(lexer);
        case '%':
            return lexer_make_single_or_double_char_token(
                lexer, TokenTypePercent, '=', TokenTypePercentEqual);
        case '=':
            return lexer_make_single_or_double_char_token(
                lexer, TokenTypeEqual, '=', TokenTypeDoubleEqual);
        case '!':
            return lexer_make_single_or_double_char_token(
                lexer, TokenTypeExclamation, '=', TokenTypeExclamationEqual);
        case '<':
            return lexer_make_single_or_double_char_token(
                lexer, TokenTypeLt, '=', TokenTypeLtEqual);
        case '>':
            return lexer_make_single_or_double_char_token(
                lexer, TokenTypeGt, '=', TokenTypeGtEqual);
        default:
            return lexer_make_invalid_char(lexer);
    }
}

/* Lex a literal starting with '0': "0.x" Float, "0x.." Hex, "0b.." Binary,
 * otherwise the Int "0". */
Token lexer_make_int_hex_binary_or_float(Lexer* lexer)
{
    Position begin = lexer_position(lexer);
    lexer_step(lexer);
    if (!lexer_done(lexer) && lexer_current(lexer) == '.') {
        lexer_step(lexer);
        while (!lexer_done(lexer) && is_digit_char(lexer_current(lexer)))
            lexer_step(lexer);
        return lexer_token(lexer, TokenTypeFloat, begin);
    } else if (!lexer_done(lexer)
        && (lexer_current(lexer) == 'x' || lexer_current(lexer) == 'X')) {
        /* BUG FIX: consume the 'x' — the old code skipped this step (unlike
         * the binary branch), so "0x2A" lexed as Hex "0" plus Id "x2A". */
        lexer_step(lexer);
        while (!lexer_done(lexer) && is_hex_digit_char(lexer_current(lexer)))
            lexer_step(lexer);
        return lexer_token(lexer, TokenTypeHex, begin);
    } else if (!lexer_done(lexer)
        && (lexer_current(lexer) == 'b' || lexer_current(lexer) == 'B')) {
        lexer_step(lexer);
        while (!lexer_done(lexer)
            && (lexer_current(lexer) == '0' || lexer_current(lexer) == '1'))
            lexer_step(lexer);
        return lexer_token(lexer, TokenTypeBinary, begin);
    } else {
        return lexer_token(lexer, TokenTypeInt, begin);
    }
}

/* Lex a single-quoted char literal, e.g. 'a' or '\n'. */
Token lexer_make_char(Lexer* lexer)
{
    Position begin = lexer_position(lexer);
    lexer_step(lexer); /* consume opening quote */
    if (lexer_done(lexer))
        return lexer_token(lexer, TokenTypeMalformedChar, begin);
    lexer_skip_literal_char(lexer);
    /* BUG FIX: was `done && current != '\''`, which read past the end of the
     * buffer at EOF and accepted literals missing their closing quote. */
    if (lexer_done(lexer) || lexer_current(lexer) != '\'')
        return lexer_token(lexer, TokenTypeMalformedChar, begin);
    lexer_step(lexer); /* consume closing quote */
    return lexer_token(lexer, TokenTypeChar, begin);
}

/* Lex a double-quoted string literal; escape sequences may contain '"'. */
Token lexer_make_string(Lexer* lexer)
{
    Position begin = lexer_position(lexer);
    lexer_step(lexer); /* consume opening quote */
    if (lexer_done(lexer))
        return lexer_token(lexer, TokenTypeMalformedString, begin);
    while (!lexer_done(lexer) && lexer_current(lexer) != '"')
        lexer_skip_literal_char(lexer);
    /* BUG FIX: was `done && current != '"'`, an out-of-bounds read at EOF;
     * the loop already guarantees current == '"' whenever we are not done. */
    if (lexer_done(lexer))
        return lexer_token(lexer, TokenTypeMalformedString, begin);
    lexer_step(lexer); /* consume closing quote */
    return lexer_token(lexer, TokenTypeString, begin);
}

/* Consume one logical character of a char/string literal: either a plain
 * character, or a backslash escape — "\<c>", "\<decimal digits>" (first
 * digit 1-9), or "\x<hex digits>". */
void lexer_skip_literal_char(Lexer* lexer)
{
    if (lexer_current(lexer) != '\\') {
        lexer_step(lexer);
        return;
    }
    lexer_step(lexer); /* consume the backslash */
    if (lexer_done(lexer))
        return;
    char previous = lexer_current(lexer);
    lexer_step(lexer);
    if (previous >= '1' && previous <= '9') {
        while (!lexer_done(lexer) && is_digit_char(lexer_current(lexer)))
            lexer_step(lexer);
    } else if (previous == 'x' || previous == 'X') {
        while (!lexer_done(lexer) && is_hex_digit_char(lexer_current(lexer)))
            lexer_step(lexer);
    }
}

/* Consume one character and emit a token of `type`. */
Token lexer_make_single_char_token(Lexer* lexer, TokenType type)
{
    Position begin = lexer_position(lexer);
    lexer_step(lexer);
    return lexer_token(lexer, type, begin);
}

/* Emit `double_type` if the next character is `second_char` (consuming it),
 * otherwise `single_type`. */
Token lexer_make_single_or_double_char_token(
    Lexer* lexer, TokenType single_type, char second_char, TokenType double_type)
{
    Position begin = lexer_position(lexer);
    lexer_step(lexer);
    if (!lexer_done(lexer) && lexer_current(lexer) == second_char) {
        lexer_step(lexer);
        return lexer_token(lexer, double_type, begin);
    } else {
        return lexer_token(lexer, single_type, begin);
    }
}

/* Lex '/': comment ("//" or "/*"), "/=" or plain Slash. */
Token lexer_make_slash_token(Lexer* lexer)
{
    Position begin = lexer_position(lexer);
    lexer_step(lexer);
    /* BUG FIX: guard against reading past the end when '/' is last. */
    if (lexer_done(lexer))
        return lexer_token(lexer, TokenTypeSlash, begin);
    switch (lexer_current(lexer)) {
        case '/':
            return lexer_skip_singleline_comment(lexer);
        case '*':
            return lexer_skip_multiline_comment(lexer);
        case '=':
            lexer_step(lexer);
            return lexer_token(lexer, TokenTypeSlashEqual, begin);
        default:
            return lexer_token(lexer, TokenTypeSlash, begin);
    }
}

/* Skip "//..." up to and including the newline, then lex what follows. */
Token lexer_skip_singleline_comment(Lexer* lexer)
{
    lexer_step(lexer);
    while (!lexer_done(lexer) && lexer_current(lexer) != '\n')
        lexer_step(lexer);
    if (!lexer_done(lexer)) /* loop leaves us on '\n' unless at EOF */
        lexer_step(lexer);
    return lexer_next(lexer);
}

/* Skip a (nestable) multiline comment. Emits MalformedMultilineComment when
 * the input ends before every opener is closed. */
Token lexer_skip_multiline_comment(Lexer* lexer)
{
    lexer_step(lexer); /* consume the '*' of the opening "/*" */
    int depth = 1;
    while (!lexer_done(lexer) && depth != 0) {
        /* BUG FIX: the old loop unconditionally stepped once more after
         * checking a '*'/'/', swallowing a character that should have been
         * re-examined — valid input like "/***" "/" was reported malformed. */
        char c = lexer_current(lexer);
        lexer_step(lexer);
        if (c == '/' && !lexer_done(lexer) && lexer_current(lexer) == '*') {
            lexer_step(lexer);
            depth += 1;
        } else if (c == '*' && !lexer_done(lexer) && lexer_current(lexer) == '/') {
            lexer_step(lexer);
            depth -= 1;
        }
    }
    return depth != 0
        ? lexer_token(lexer, TokenTypeMalformedMultilineComment, lexer_position(lexer))
        : lexer_next(lexer);
}

/* Consume one unrecognized character and emit InvalidChar. */
Token lexer_make_invalid_char(Lexer* lexer)
{
    Position begin = lexer_position(lexer);
    lexer_step(lexer);
    return lexer_token(lexer, TokenTypeInvalidChar, begin);
}

/* Snapshot the current index/line/column. */
Position lexer_position(const Lexer* lexer)
{
    return (Position) {
        .index = lexer->index,
        .line = lexer->line,
        .column = lexer->column,
    };
}

/* Build a token spanning from `begin` to the current index. */
Token lexer_token(const Lexer* lexer, TokenType type, Position begin)
{
    return (Token) {
        .type = type,
        .position = begin,
        .length = lexer->index - begin.index,
    };
}

/* True when the whole input has been consumed. */
bool lexer_done(const Lexer* lexer)
{
    return lexer->index >= lexer->length;
}

/* Current character. Precondition: !lexer_done(lexer). */
char lexer_current(const Lexer* lexer)
{
    return lexer->text[lexer->index];
}

/* Advance one character, maintaining 1-based line/column counters. */
void lexer_step(Lexer* lexer)
{
    if (lexer_done(lexer))
        return;
    if (lexer_current(lexer) == '\n') {
        lexer->line += 1;
        lexer->column = 1;
    } else {
        lexer->column += 1;
    }
    lexer->index += 1;
}

/* Heap-allocate a NUL-terminated copy of the token's lexeme from `text`.
 * Caller owns and frees the result. Returns NULL on allocation failure. */
char* token_string(const Token* token, const char* text)
{
    char* value = calloc(token->length + 1, sizeof(char));
    if (value != NULL)
        /* calloc already zeroed the buffer, so the terminator is in place. */
        memcpy(value, &text[token->position.index], token->length);
    return value;
}

/* Format a token as `(Type, "lexeme")` for debugging. Caller frees.
 * Returns NULL on allocation failure. */
char* token_to_string(const Token* token, const char* text)
{
    const char* type_string = token_type_to_string(token->type);
    char* value_string = token_string(token, text);
    if (value_string == NULL)
        return NULL;
    /* 7 = the fixed chars of "(%s, \"%s\")" — '(' ',' ' ' two '"' ')' — plus NUL. */
    size_t size = token->length + strlen(type_string) + 7;
    char* value = calloc(size, sizeof(char));
    if (value != NULL)
        snprintf(value, size, "(%s, \"%s\")", type_string, value_string);
    free(value_string);
    return value;
}

/* Convenience wrapper: copy a token's lexeme out of this lexer's text. */
char* lexer_token_string(const Lexer* lexer, const Token* token)
{
    return token_string(token, lexer->text);
}

/* Human-readable name for a token type (static storage; do not free). */
const char* token_type_to_string(TokenType type)
{
    switch (type) {
        case TokenTypeEof:
            return "Eof";
        case TokenTypeInvalidChar:
            return "InvalidChar";
        case TokenTypeMalformedMultilineComment:
            return "MalformedMultilineComment";
        case TokenTypeMalformedChar:
            return "MalformedChar";
        case TokenTypeMalformedString:
            return "MalformedString";
        case TokenTypeId:
            return "Id";
        case TokenTypeInt:
            return "Int";
        case TokenTypeHex:
            return "Hex";
        case TokenTypeBinary:
            return "Binary";
        case TokenTypeFloat:
            return "Float";
        case TokenTypeChar:
            return "Char";
        case TokenTypeString:
            return "String";
        case TokenTypeIf:
            return "If";
        case TokenTypeElse:
            return "Else";
        case TokenTypeWhile:
            return "While";
        /* BUG FIX: the cases below were missing, so valid token types fell
         * through to ASSERT_EXHAUSTIVE_MATCH() and exit(1)'d the process. */
        case TokenTypeLoop:
            return "Loop";
        case TokenTypeFor:
            return "For";
        case TokenTypeIn:
            return "In";
        case TokenTypeBreak:
            return "Break";
        case TokenTypeLet:
            return "Let";
        case TokenTypeMatch:
            return "Match";
        case TokenTypeFalse:
            return "False";
        case TokenTypeTrue:
            return "True";
        case TokenTypeNot:
            return "Not";
        case TokenTypeAnd:
            return "And";
        case TokenTypeOr:
            return "Or";
        case TokenTypeFn:
            return "Fn";
        case TokenTypeReturn:
            return "Return";
        case TokenTypeUnderscore:
            return "Underscore";
        case TokenTypeLParen:
            return "LParen";
        case TokenTypeRParen:
            return "RParen";
        case TokenTypeLBrace:
            return "LBrace";
        case TokenTypeRBrace:
            return "RBrace";
        case TokenTypeLBracket:
            return "LBracket";
        case TokenTypeRBracket:
            return "RBracket";
        case TokenTypeDot:
            return "Dot";
        case TokenTypeDoubleDot:
            return "DoubleDot";
        case TokenTypeComma:
            return "Comma";
        case TokenTypeColon:
            return "Colon";
        case TokenTypeSemicolon:
            return "Semicolon";
        case TokenTypeAmpersand:
            return "Ampersand";
        case TokenTypePlusEqual:
            return "PlusEqual";
        case TokenTypeMinusEqual:
            return "MinusEqual";
        case TokenTypeAsteriskEqual:
            return "AsteriskEqual";
        case TokenTypeSlashEqual:
            return "SlashEqual";
        case TokenTypePercentEqual:
            return "PercentEqual";
        case TokenTypeDoubleEqual:
            return "DoubleEqual";
        case TokenTypeExclamationEqual:
            return "ExclamationEqual";
        case TokenTypeLtEqual:
            return "LtEqual";
        case TokenTypeGtEqual:
            return "GtEqual";
        case TokenTypePlus:
            return "Plus";
        case TokenTypeMinus:
            return "Minus";
        case TokenTypeAsterisk:
            return "Asterisk";
        case TokenTypeSlash:
            return "Slash";
        case TokenTypePercent:
            return "Percent";
        case TokenTypeEqual:
            return "Equal";
        case TokenTypeExclamation:
            return "Exclamation";
        case TokenTypeLt:
            return "Lt";
        case TokenTypeGt:
            return "Gt";
        default:
            ASSERT_EXHAUSTIVE_MATCH();
    }
}