/*
 * Hand-written lexer: tokenizes '+', parentheses, integers, and
 * identifiers, tracking each token's byte index/length and 1-based
 * line/column start position.
 *
 * NOTE(review): reconstructed from a collapsed patch; lexer.h's type
 * declarations are inlined so this unit is self-contained.
 */
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

/* Token kinds produced by the lexer. */
typedef enum {
    TokenTypeInvalid,
    TokenTypeEof,
    TokenTypePlus,
    TokenTypeLParen,
    TokenTypeRParen,
    TokenTypeInt,
    TokenTypeId,
} TokenType;

/* A lexed token: kind plus its span in the input and start position. */
typedef struct {
    TokenType type;
    size_t index, length; /* byte offset of first char, span length */
    int line, column;     /* 1-based position of first char */
} Token;

/* Cursor state inside the source text. */
typedef struct {
    size_t index;
    int line, column;
} LexerPosition;

typedef struct {
    const char* text; /* not owned; must outlive the lexer */
    size_t length;
    LexerPosition position;
} Lexer;

/* Character classes recognized by the lexer. */
bool is_whitespace_char(char c) { return c == ' ' || c == '\t' || c == '\n'; }
bool is_int_char(char c) { return c >= '0' && c <= '9'; }
bool is_id_char(char c)
{
    return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c == '_'
        || is_int_char(c);
}

/* Human-readable name of a token type.
 * BUG FIX: the original body was empty — control fell off the end of a
 * non-void function (undefined behavior when the result is used). */
const char* token_type_to_string(TokenType type)
{
    switch (type) {
    case TokenTypeInvalid:
        return "Invalid";
    case TokenTypeEof:
        return "Eof";
    case TokenTypePlus:
        return "Plus";
    case TokenTypeLParen:
        return "LParen";
    case TokenTypeRParen:
        return "RParen";
    case TokenTypeInt:
        return "Int";
    case TokenTypeId:
        return "Id";
    default:
        return "Unknown";
    }
}

/* Formats `token` into `out`; always NUL-terminated when out_size > 0.
 * BUG FIX: the original emitted the fixed placeholder "Token { type }"
 * and ignored every field of the token. */
void token_to_string(char* out, size_t out_size, Token token)
{
    snprintf(out, out_size,
        "Token { type: %s, index: %zu, length: %zu, line: %d, column: %d }",
        token_type_to_string(token.type), token.index, token.length,
        token.line, token.column);
}

/* Initializes the lexer over text[0..length); the text is not copied. */
void lexer_construct(Lexer* self, const char* text, size_t length)
{
    *self = (Lexer) {
        .text = text,
        .length = length,
        .position = { .index = 0, .line = 1, .column = 1 },
    };
}

bool lexer_done(const Lexer* self)
{
    return self->position.index >= self->length;
}

/* Advances one character, maintaining line/column bookkeeping. */
void lexer_step(Lexer* self)
{
    if (self->text[self->position.index] == '\n') {
        self->position.line += 1;
        self->position.column = 1;
    } else {
        self->position.column += 1;
    }
    self->position.index += 1;
}

char lexer_current(const Lexer* self)
{
    return self->text[self->position.index];
}

LexerPosition lexer_position(const Lexer* self) { return self->position; }

/* Builds a token spanning from `start` up to the current position.
 * BUG FIX: .length was set to the absolute end index instead of the
 * span length (end - start). */
Token lexer_token_from(const Lexer* self, TokenType type, LexerPosition start)
{
    return (Token) {
        .type = type,
        .index = start.index,
        .length = self->position.index - start.index,
        .line = start.line,
        .column = start.column,
    };
}

/* Consumes the current (whitespace) char and any following whitespace. */
void lexer_skip_whitespace(Lexer* self)
{
    lexer_step(self);
    while (!lexer_done(self) && is_whitespace_char(lexer_current(self)))
        lexer_step(self);
}

/* Consumes a run of digits starting at the current char. */
Token lexer_make_int_token(Lexer* self)
{
    LexerPosition start = self->position;
    lexer_step(self);
    while (!lexer_done(self) && is_int_char(lexer_current(self)))
        lexer_step(self);
    return lexer_token_from(self, TokenTypeInt, start);
}

/* Consumes an identifier starting at the current char. */
Token lexer_make_id_token(Lexer* self)
{
    LexerPosition start = self->position;
    lexer_step(self);
    while (!lexer_done(self) && is_id_char(lexer_current(self)))
        lexer_step(self);
    return lexer_token_from(self, TokenTypeId, start);
}

/* Consumes exactly one char and wraps it in a token of `type`. */
Token lexer_make_single_char_token(Lexer* self, TokenType type)
{
    LexerPosition start = lexer_position(self);
    lexer_step(self);
    return lexer_token_from(self, type, start);
}

/* Punctuation and fallback.
 * BUG FIX: the original default case returned TokenTypeInvalid without
 * consuming the offending character, so the lexer could never make
 * progress past it (callers would loop forever). */
Token lexer_make_static_token(Lexer* self)
{
    switch (lexer_current(self)) {
    case '+':
        return lexer_make_single_char_token(self, TokenTypePlus);
    case '(':
        return lexer_make_single_char_token(self, TokenTypeLParen);
    case ')':
        return lexer_make_single_char_token(self, TokenTypeRParen);
    default:
        return lexer_make_single_char_token(self, TokenTypeInvalid);
    }
}

/* Produces the next token; TokenTypeEof once the input is exhausted.
 * BUG FIX: the identifier branch tested
 * is_id_char(is_int_char(lexer_current(self))) — a bool passed where a
 * char was expected — so identifiers were never matched. */
Token lexer_make_token(Lexer* self)
{
    if (lexer_done(self))
        return lexer_token_from(self, TokenTypeEof, lexer_position(self));
    if (is_whitespace_char(lexer_current(self))) {
        lexer_skip_whitespace(self);
        return lexer_make_token(self);
    }
    if (is_int_char(lexer_current(self)))
        return lexer_make_int_token(self);
    if (is_id_char(lexer_current(self)))
        return lexer_make_id_token(self);
    return lexer_make_static_token(self);
}

Token lexer_next(Lexer* self) { return lexer_make_token(self); }

/* One-token-lookahead iterator over a lexer. */
typedef struct {
    Lexer* lexer;
    Token current;
} TokenIterator;

/* Binds the iterator to `lexer` and primes it with the first token. */
void token_iterator(TokenIterator* self, Lexer* lexer)
{
    *self = (TokenIterator) {
        .lexer = lexer,
        .current = lexer_next(lexer),
    };
}

/* Advances to (and returns) the next token. */
Token token_step(TokenIterator* self)
{
    return self->current = lexer_next(self->lexer);
}

Token token_current(TokenIterator* self) { return self->current; }