diff --git a/lexer.c b/lexer.c index eb9ecc5..9ec7c34 100644 --- a/lexer.c +++ b/lexer.c @@ -5,18 +5,16 @@ #include #include -struct Lexer { - const char* text; - size_t index, length; - int line, column; -}; +#define ASSERT_EXHAUSTIVE_MATCH() \ + (fprintf(stderr, "unexhaustive match at %s:%d in %s()\n", __FILE__, __LINE__, __func__), \ + exit(1)) Token lexer_skip_whitespace(Lexer* lexer); Token lexer_make_int_or_float(Lexer* lexer); Token lexer_make_id(Lexer* lexer); bool lexer_span_matches(const Lexer* lexer, Position begin, const char* value); Token lexer_make_static_token(Lexer* lexer); -Token lexer_make_int_hex_or_binary(Lexer* lexer); +Token lexer_make_int_hex_binary_or_float(Lexer* lexer); Token lexer_make_char(Lexer* lexer); Token lexer_make_string(Lexer* lexer); void lexer_skip_literal_char(Lexer* lexer); @@ -73,6 +71,7 @@ Token lexer_make_int_or_float(Lexer* lexer) while (!lexer_done(lexer) && isdigit(lexer_current(lexer))) lexer_step(lexer); if (!lexer_done(lexer) && lexer_current(lexer) == '.') { + lexer_step(lexer); while (!lexer_done(lexer) && isdigit(lexer_current(lexer))) lexer_step(lexer); return lexer_token(lexer, TokenTypeFloat, begin); @@ -85,6 +84,11 @@ Token lexer_make_id(Lexer* lexer) { Position begin = lexer_position(lexer); lexer_step(lexer); + if (lexer_done(lexer) + || (!isalpha(lexer_current(lexer)) && !isdigit(lexer_current(lexer)) + && lexer_current(lexer) != '_')) { + return lexer_token(lexer, TokenTypeUnderscore, begin); + } while (!lexer_done(lexer) && (isalpha(lexer_current(lexer)) || isdigit(lexer_current(lexer)) || lexer_current(lexer) == '_')) @@ -95,8 +99,32 @@ Token lexer_make_id(Lexer* lexer) return lexer_token(lexer, TokenTypeElse, begin); else if (lexer_span_matches(lexer, begin, "while")) return lexer_token(lexer, TokenTypeWhile, begin); + else if (lexer_span_matches(lexer, begin, "loop")) + return lexer_token(lexer, TokenTypeLoop, begin); + else if (lexer_span_matches(lexer, begin, "for")) + return lexer_token(lexer, 
TokenTypeFor, begin); + else if (lexer_span_matches(lexer, begin, "in")) + return lexer_token(lexer, TokenTypeIn, begin); else if (lexer_span_matches(lexer, begin, "break")) return lexer_token(lexer, TokenTypeBreak, begin); + else if (lexer_span_matches(lexer, begin, "let")) + return lexer_token(lexer, TokenTypeLet, begin); + else if (lexer_span_matches(lexer, begin, "match")) + return lexer_token(lexer, TokenTypeMatch, begin); + else if (lexer_span_matches(lexer, begin, "false")) + return lexer_token(lexer, TokenTypeFalse, begin); + else if (lexer_span_matches(lexer, begin, "true")) + return lexer_token(lexer, TokenTypeTrue, begin); + else if (lexer_span_matches(lexer, begin, "not")) + return lexer_token(lexer, TokenTypeNot, begin); + else if (lexer_span_matches(lexer, begin, "and")) + return lexer_token(lexer, TokenTypeAnd, begin); + else if (lexer_span_matches(lexer, begin, "or")) + return lexer_token(lexer, TokenTypeOr, begin); + else if (lexer_span_matches(lexer, begin, "fn")) + return lexer_token(lexer, TokenTypeFn, begin); + else if (lexer_span_matches(lexer, begin, "return")) + return lexer_token(lexer, TokenTypeReturn, begin); else return lexer_token(lexer, TokenTypeId, begin); } @@ -113,7 +141,7 @@ Token lexer_make_static_token(Lexer* lexer) { switch (lexer_current(lexer)) { case '0': - return lexer_make_int_hex_or_binary(lexer); + return lexer_make_int_hex_binary_or_float(lexer); case '\'': return lexer_make_char(lexer); case '"': @@ -131,13 +159,16 @@ Token lexer_make_static_token(Lexer* lexer) case ']': return lexer_make_single_char_token(lexer, TokenTypeRBracket); case '.': - return lexer_make_single_char_token(lexer, TokenTypeDot); + return lexer_make_single_or_double_char_token( + lexer, TokenTypeDot, '.', TokenTypeDoubleDot); case ',': return lexer_make_single_char_token(lexer, TokenTypeComma); case ':': return lexer_make_single_char_token(lexer, TokenTypeColon); case ';': return lexer_make_single_char_token(lexer, TokenTypeSemicolon); + case '&': 
+ return lexer_make_single_char_token(lexer, TokenTypeAmpersand); case '+': return lexer_make_single_or_double_char_token( lexer, TokenTypePlus, '=', TokenTypePlusEqual); @@ -169,18 +200,25 @@ Token lexer_make_static_token(Lexer* lexer) } } -Token lexer_make_int_hex_or_binary(Lexer* lexer) +Token lexer_make_int_hex_binary_or_float(Lexer* lexer) { Position begin = lexer_position(lexer); lexer_step(lexer); - if (!lexer_done(lexer) && (lexer_current(lexer) == 'x' || lexer_current(lexer) == 'X')) { + if (!lexer_done(lexer) && lexer_current(lexer) == '.') { + lexer_step(lexer); + while (!lexer_done(lexer) && isdigit(lexer_current(lexer))) + lexer_step(lexer); + return lexer_token(lexer, TokenTypeFloat, begin); + } else if (!lexer_done(lexer) && (lexer_current(lexer) == 'x' || lexer_current(lexer) == 'X')) { + lexer_step(lexer); while (!lexer_done(lexer) && (isdigit(lexer_current(lexer)) - || (lexer_current(lexer) >= 'a' || lexer_current(lexer) <= 'f') - || (lexer_current(lexer) >= 'A' || lexer_current(lexer) <= 'F'))) + || (lexer_current(lexer) >= 'a' && lexer_current(lexer) <= 'f') + || (lexer_current(lexer) >= 'A' && lexer_current(lexer) <= 'F'))) lexer_step(lexer); return lexer_token(lexer, TokenTypeHex, begin); } else if (!lexer_done(lexer) && (lexer_current(lexer) == 'b' || lexer_current(lexer) == 'B')) { + lexer_step(lexer); while (!lexer_done(lexer) && (lexer_current(lexer) == '0' || lexer_current(lexer) == '1')) lexer_step(lexer); return lexer_token(lexer, TokenTypeBinary, begin); @@ -213,7 +250,7 @@ Token lexer_make_string(Lexer* lexer) if (lexer_done(lexer) && lexer_current(lexer) != '\"') return lexer_token(lexer, TokenTypeMalformedString, begin); lexer_step(lexer); - return lexer_token(lexer, TokenTypeChar, begin); + return lexer_token(lexer, TokenTypeString, begin); } void lexer_skip_literal_char(Lexer* lexer) @@ -253,9 +290,9 @@ Token lexer_make_single_or_double_char_token( lexer_step(lexer); if (!lexer_done(lexer) && lexer_current(lexer) == second_char) { lexer_step(lexer); -
return lexer_token(lexer, single_type, begin); - } else { return lexer_token(lexer, double_type, begin); + } else { + return lexer_token(lexer, single_type, begin); } } @@ -290,14 +327,18 @@ Token lexer_skip_multiline_comment(Lexer* lexer) { lexer_step(lexer); int depth = 1; - while (!lexer_done(lexer)) { + while (!lexer_done(lexer) && depth != 0) { if (lexer_current(lexer) == '/') { lexer_step(lexer); - if (!lexer_done(lexer) && lexer_current(lexer) == '*') + if (lexer_done(lexer)) + break; + else if (lexer_current(lexer) == '*') depth += 1; } else if (lexer_current(lexer) == '*') { lexer_step(lexer); - if (lexer_done(lexer) && lexer_current(lexer) == '/') + if (lexer_done(lexer)) + break; + else if (lexer_current(lexer) == '/') depth -= 1; } lexer_step(lexer); @@ -360,13 +401,18 @@ char* token_to_string(const Token* token, const char* text) { const char* type_string = token_type_to_string(token->type); char* value_string = token_string(token, text); - size_t size = token->length + strlen(type_string) + 5; + size_t size = token->length + strlen(type_string) + 7; char* value = calloc(size, sizeof(char)); - snprintf(value, size, "(%s, %s)", type_string, value_string); + snprintf(value, size, "(%s, \"%s\")", type_string, value_string); free(value_string); return value; } +char* lexer_token_string(const Lexer* lexer, const Token* token) +{ + return token_string(token, lexer->text); +} + const char* token_type_to_string(TokenType type) { switch (type) { @@ -458,5 +504,7 @@ const char* token_type_to_string(TokenType type) return "Lt"; case TokenTypeGt: return "Gt"; + default: + ASSERT_EXHAUSTIVE_MATCH(); } } diff --git a/lexer.h b/lexer.h index 314b4ef..41c1c3b 100644 --- a/lexer.h +++ b/lexer.h @@ -21,8 +21,20 @@ typedef enum { TokenTypeIf, TokenTypeElse, + TokenTypeLoop, TokenTypeWhile, + TokenTypeFor, + TokenTypeIn, TokenTypeBreak, + TokenTypeLet, + TokenTypeMatch, + TokenTypeFalse, + TokenTypeTrue, + TokenTypeNot, + TokenTypeAnd, + TokenTypeOr, + TokenTypeFn, + 
TokenTypeReturn, TokenTypeLParen, TokenTypeRParen, @@ -34,6 +46,9 @@ typedef enum { TokenTypeComma, TokenTypeColon, TokenTypeSemicolon, + TokenTypeDoubleDot, + TokenTypeAmpersand, + TokenTypeUnderscore, TokenTypePlusEqual, TokenTypeMinusEqual, @@ -72,9 +87,14 @@ typedef struct { char* token_string(const Token* token, const char* text); char* token_to_string(const Token* token, const char* text); -typedef struct Lexer Lexer; +typedef struct { + const char* text; + size_t index, length; + int line, column; +} Lexer; void lexer_create(Lexer* lexer, const char* text, size_t text_length); Token lexer_next(Lexer* lexer); +char* lexer_token_string(const Lexer* lexer, const Token* token); #endif diff --git a/main.c b/main.c index cf7ef51..39f9629 100644 --- a/main.c +++ b/main.c @@ -1,9 +1,26 @@ +#include "lexer.h" #include <stdio.h> +#include <stdlib.h> +#include <string.h> int main(void) { - char text[] = "abc 123 0xFF 0b101 3.14 'a' '\\n' \"hello\" \"world\\\"\\n\" if else /* /* while */ */ " "while break (){}[].,:; += -= *= /= %= == != <= >= + - * / % % = ! 
< >"; + + printf("text = \"%s\"\n", text); + + Lexer lexer; + lexer_create(&lexer, text, strlen(text)); + + printf("tokens = [\n"); + Token token = lexer_next(&lexer); + while (token.type != TokenTypeEof) { + char* stringified = token_to_string(&token, text); + printf(" %s\n", stringified); + free(stringified); + token = lexer_next(&lexer); + } + printf("]\n"); } diff --git a/parser.c b/parser.c index c1dfa96..b0ece9a 100644 --- a/parser.c +++ b/parser.c @@ -1 +1,12 @@ #include "parser.h" +#include "lexer.h" + +void parser_create(Parser* parser, Lexer* lexer) +{ + *parser = (Parser) { + .lexer = lexer, + .current = lexer_next(lexer), + }; +} + +void parser_parse_expression(Parser* parser) { } diff --git a/parser.h b/parser.h index bee509b..4775dd4 100644 --- a/parser.h +++ b/parser.h @@ -1,4 +1,42 @@ #ifndef PARSER_H #define PARSER_H +#include "lexer.h" +#include + +typedef enum { + ParsedNodeTypeError, + ParsedNodeTypeInt, + ParsedNodeTypeFloat, + ParsedNodeTypeChar, + ParsedNodeTypeString, + ParsedNodeTypeBool, + ParsedNodeTypeArray, + ParsedNodeTypeDict, + ParsedNodeTypeIf, + ParsedNodeTypeWhile, + ParsedNodeTypeLoop, + ParsedNodeTypeFor, + ParsedNodeTypeLambda, + + ParsedNodeTypeCall, + ParsedNodeTypeAccess, + ParsedNodeTypeIndex, + ParsedNodeTypeUnary, + ParsedNodeTypeBinary, + ParsedNodeTypeAssign, +} ParsedNodeType; + +typedef struct ParsedNode { + ParsedNodeType node_type; +} ParsedNode; + +typedef struct { + Lexer* lexer; + Token current; +} Parser; + +void parser_create(Parser* parser, Lexer* lexer); +void parser_parse_expression(Parser* parser); + #endif