make lexer

This commit is contained in:
Simon 2023-02-06 15:48:02 +01:00
parent 8e934c6959
commit 924d6d6fc1
3 changed files with 212 additions and 135 deletions

145
lexer.c Normal file
View File

@ -0,0 +1,145 @@
#include "lexer.h"
#include <stdbool.h>
#include <stdio.h>
/**
 * Returns a static, printable name for a token type.
 *
 * The previous body was empty, so control reached the closing brace of a
 * non-void function; using that result is undefined behavior.
 *
 * @param type  token type to name
 * @return pointer to a string literal (never NULL); values that are not one
 *         of the TokenType enumerators yield "<unknown>".
 */
const char* token_type_to_string(TokenType type)
{
    /* No default label: lets the compiler warn when an enumerator is added
       but not handled here. */
    switch (type) {
    case TokenTypeInvalid:
        return "Invalid";
    case TokenTypeEof:
        return "Eof";
    case TokenTypePlus:
        return "Plus";
    case TokenTypeLParen:
        return "LParen";
    case TokenTypeRParen:
        return "RParen";
    case TokenTypeInt:
        return "Int";
    case TokenTypeId:
        return "Id";
    }
    return "<unknown>";
}
/**
 * Writes a textual description of a token into a caller-supplied buffer.
 *
 * At present the output is the fixed placeholder text "Token { type }";
 * the token argument is not inspected.
 *
 * @param out       destination buffer
 * @param out_size  capacity of out, handed to snprintf as the write bound
 * @param token     token to describe (currently unused)
 */
void token_to_string(char* out, size_t out_size, Token token)
{
    (void)token;
    snprintf(out, out_size, "%s", "Token { type }");
}
/**
 * Initializes a lexer over a text buffer.
 *
 * Only the pointer is stored; the text is not copied. The cursor starts at
 * index 0, line 1, column 1.
 *
 * @param self    lexer to initialize (fully overwritten)
 * @param text    input characters
 * @param length  number of characters of text to scan
 */
void lexer_construct(Lexer* self, const char* text, size_t length)
{
    const LexerPosition origin = { .index = 0, .line = 1, .column = 1 };
    Lexer fresh = { .text = text, .length = length, .position = origin };
    *self = fresh;
}
/**
 * Produces the next token from the lexer.
 *
 * Thin public entry point that forwards to lexer_make_token.
 *
 * @param self  lexer to advance
 * @return the token produced by lexer_make_token
 */
Token lexer_next(Lexer* self)
{
    Token produced = lexer_make_token(self);
    return produced;
}
/**
 * Reports whether the lexer has run out of input.
 *
 * @param self  lexer to query
 * @return true when the cursor index is at or beyond the text length
 */
bool lexer_done(const Lexer* self)
{
    const size_t cursor = self->position.index;
    return !(cursor < self->length);
}
/**
 * Advances the cursor by one character, keeping line/column in sync.
 *
 * A '\n' under the cursor moves to the next line and resets the column to 1;
 * any other character bumps the column.
 *
 * Stepping when no input remains is now a no-op. Previously the function
 * read text[index] unconditionally, which reads past the buffer when the
 * cursor already sits at (or beyond) length.
 *
 * @param self  lexer whose cursor is advanced
 */
void lexer_step(Lexer* self)
{
    /* Nothing left to consume: do not touch text[length]. */
    if (self->position.index >= self->length) {
        return;
    }
    if (self->text[self->position.index] == '\n') {
        self->position.line += 1;
        self->position.column = 1;
    } else {
        self->position.column += 1;
    }
    self->position.index += 1;
}
/**
 * Returns the character under the cursor.
 *
 * No bounds check is performed here; the character is read straight from
 * text at the cursor index.
 *
 * @param self  lexer to read from
 * @return text[position.index]
 */
char lexer_current(const Lexer* self)
{
    const char* cursor = self->text + self->position.index;
    return *cursor;
}
/**
 * Returns a copy of the lexer's current cursor position.
 *
 * @param self  lexer to query
 * @return the index/line/column triple stored in the lexer
 */
LexerPosition lexer_position(const Lexer* self)
{
    LexerPosition snapshot = self->position;
    return snapshot;
}
/**
 * Builds a token that spans from a saved start position to the cursor.
 *
 * Fix: length is now the distance between start and the cursor. It used to
 * be assigned the cursor's absolute index, which is only correct for a token
 * that begins at offset 0.
 *
 * @param self   lexer whose cursor marks the end of the token (exclusive)
 * @param type   type to store in the token
 * @param start  position where the token began; its index must not be
 *               greater than the cursor index, otherwise the unsigned
 *               subtraction wraps
 * @return token carrying type, start index/line/column and the span length
 */
Token lexer_token_from(const Lexer* self, TokenType type, LexerPosition start)
{
    return (Token) {
        .type = type,
        .index = start.index,
        /* Span covered by the token, not the cursor's absolute offset. */
        .length = self->position.index - start.index,
        .line = start.line,
        .column = start.column,
    };
}
/**
 * Skips the character under the cursor and any whitespace that follows.
 *
 * The first step is unconditional; after that the cursor keeps moving while
 * input remains and the current character satisfies is_whitespace_char.
 *
 * @param self  lexer to advance
 */
void lexer_skip_whitespace(Lexer* self)
{
    do {
        lexer_step(self);
    } while (!lexer_done(self) && is_whitespace_char(lexer_current(self)));
}
/**
 * Lexes an integer token starting at the cursor.
 *
 * The current character is consumed unconditionally, followed by every
 * subsequent character accepted by is_int_char.
 *
 * @param self  lexer to advance
 * @return a TokenTypeInt token anchored at the position held on entry
 */
Token lexer_make_int_token(Lexer* self)
{
    const LexerPosition begin = self->position;
    do {
        lexer_step(self);
    } while (!lexer_done(self) && is_int_char(lexer_current(self)));
    return lexer_token_from(self, TokenTypeInt, begin);
}
/**
 * Lexes an identifier token starting at the cursor.
 *
 * The current character is consumed unconditionally, followed by every
 * subsequent character accepted by is_id_char.
 *
 * @param self  lexer to advance
 * @return a TokenTypeId token anchored at the position held on entry
 */
Token lexer_make_id_token(Lexer* self)
{
    const LexerPosition begin = self->position;
    for (lexer_step(self);
         !lexer_done(self) && is_id_char(lexer_current(self));
         lexer_step(self)) {
        /* all the work happens in the loop header */
    }
    return lexer_token_from(self, TokenTypeId, begin);
}
/**
 * Consumes exactly one character and wraps it in a token.
 *
 * @param self  lexer to advance by one step
 * @param type  type to give the resulting token
 * @return token of the given type anchored at the position held on entry
 */
Token lexer_make_single_char_token(Lexer* self, TokenType type)
{
    const LexerPosition origin = self->position;
    lexer_step(self);
    Token token = lexer_token_from(self, type, origin);
    return token;
}
/**
 * Lexes a fixed, single-character token ('+', '(' or ')').
 *
 * Any other character produces a TokenTypeInvalid token. Fix: the invalid
 * character is now consumed as well. Before, the Invalid token was returned
 * without advancing, so every later call saw the same character again and
 * the lexer could never make progress past it.
 *
 * @param self  lexer positioned on the character to classify
 * @return a one-character token of type Plus, LParen, RParen or Invalid
 */
Token lexer_make_static_token(Lexer* self)
{
    switch (lexer_current(self)) {
    case '+':
        return lexer_make_single_char_token(self, TokenTypePlus);
    case '(':
        return lexer_make_single_char_token(self, TokenTypeLParen);
    case ')':
        return lexer_make_single_char_token(self, TokenTypeRParen);
    default:
        /* Step over the offending character so the next call moves on. */
        return lexer_make_single_char_token(self, TokenTypeInvalid);
    }
}
/**
 * Produces the next token, skipping leading whitespace.
 *
 * Fixes:
 *  - the identifier test used is_id_char(is_int_char(c)), i.e. it classified
 *    the boolean result (0 or 1) rather than the character, so identifiers
 *    were never recognized; it now tests the character itself;
 *  - the stray ';' after the if/else chain is gone;
 *  - whitespace is skipped up front instead of through a recursive call.
 *
 * @param self  lexer to advance
 * @return Eof when no input remains, otherwise an Int, Id or static token
 */
Token lexer_make_token(Lexer* self)
{
    /* lexer_skip_whitespace consumes the whole run, so one pass suffices. */
    if (!lexer_done(self) && is_whitespace_char(lexer_current(self))) {
        lexer_skip_whitespace(self);
    }
    if (lexer_done(self)) {
        return lexer_token_from(self, TokenTypeEof, lexer_position(self));
    }

    const char current = lexer_current(self);
    /* Digits are tested first: is_id_char also accepts them, and a leading
       digit must start an integer, not an identifier. */
    if (is_int_char(current)) {
        return lexer_make_int_token(self);
    }
    if (is_id_char(current)) {
        return lexer_make_id_token(self);
    }
    return lexer_make_static_token(self);
}
/**
 * Initializes a token iterator over a lexer.
 *
 * The first token is fetched immediately via lexer_next and stored as the
 * iterator's current token.
 *
 * @param self   iterator to initialize (fully overwritten)
 * @param lexer  lexer to pull tokens from; only the pointer is stored
 */
void token_iterator(TokenIterator* self, Lexer* lexer)
{
    TokenIterator fresh;
    fresh.lexer = lexer;
    fresh.current = lexer_next(lexer);
    *self = fresh;
}
/**
 * Advances the iterator to the next token.
 *
 * @param self  iterator to advance
 * @return the freshly fetched token, which is also stored as current
 */
Token token_step(TokenIterator* self)
{
    self->current = lexer_next(self->lexer);
    return self->current;
}
/**
 * Returns the token the iterator is currently holding, without advancing.
 *
 * @param self  iterator to read
 * @return copy of the stored current token
 */
Token token_current(TokenIterator* self)
{
    const Token* held = &self->current;
    return *held;
}
/**
 * Tells whether a character is skipped as whitespace by the lexer.
 *
 * @param c  character to classify
 * @return true for space, horizontal tab and newline; false otherwise
 */
bool is_whitespace_char(char c)
{
    switch (c) {
    case ' ':
    case '\t':
    case '\n':
        return true;
    default:
        return false;
    }
}
/**
 * Tells whether a character is a decimal digit.
 *
 * @param c  character to classify
 * @return true for '0' through '9'; false otherwise
 */
bool is_int_char(char c)
{
    if (c < '0') {
        return false;
    }
    return c <= '9';
}
/**
 * Tells whether a character may appear in an identifier.
 *
 * @param c  character to classify
 * @return true for 'a'-'z', 'A'-'Z', '_' and anything is_int_char accepts
 */
bool is_id_char(char c)
{
    if (c >= 'a' && c <= 'z') {
        return true;
    }
    if (c >= 'A' && c <= 'Z') {
        return true;
    }
    if (c == '_') {
        return true;
    }
    return is_int_char(c);
}

66
lexer.h Normal file
View File

@ -0,0 +1,66 @@
#ifndef LEXER_H
#define LEXER_H
#include <stdbool.h>
#include <stdlib.h>
/* Kinds of token the lexer can report. */
typedef enum {
/* Produced by lexer_make_static_token for a character it does not recognize. */
TokenTypeInvalid,
/* Produced by lexer_make_token once lexer_done reports no input is left. */
TokenTypeEof,
/* The '+' character. */
TokenTypePlus,
/* The '(' character. */
TokenTypeLParen,
/* The ')' character. */
TokenTypeRParen,
/* Produced by lexer_make_int_token (starts with a digit). */
TokenTypeInt,
/* Produced by lexer_make_id_token. */
TokenTypeId,
} TokenType;
/* Declared to return a string for the given token type (presumably a
   printable name -- confirm against the definition in lexer.c). */
const char* token_type_to_string(TokenType type);
/* A lexed token: its kind plus where it was found in the source text. */
typedef struct {
TokenType type;
/* index: offset in the text where the token starts (copied from the start
   position by lexer_token_from). length: filled in by lexer_token_from. */
size_t index, length;
/* Line and column of the token start; the lexer begins counting both at 1. */
int line, column;
} Token;
/* Writes a textual description of token into out via snprintf, bounded by
   out_size. */
void token_to_string(char* out, size_t out_size, Token token);
/* Cursor location inside the text being lexed. */
typedef struct {
/* Offset of the cursor from the start of the text. */
size_t index;
/* Both start at 1 (see lexer_construct); lexer_step bumps line and resets
   column to 1 when it steps over a '\n'. */
int line, column;
} LexerPosition;
/* Lexer state: the input buffer and the current cursor. */
typedef struct {
/* Input characters; lexer_construct stores the pointer without copying. */
const char* text;
/* Number of characters in text; lexer_done compares the cursor against it. */
size_t length;
/* Current cursor (index, line, column). */
LexerPosition position;
} Lexer;
/* Initializes *self to scan text with the cursor at index 0, line 1, column 1. */
void lexer_construct(Lexer* self, const char* text, size_t length);
/* Produces the next token; forwards to lexer_make_token. */
Token lexer_next(Lexer* self);
/* True once the cursor index has reached or passed length. */
bool lexer_done(const Lexer* self);
/* Advances the cursor by one character, updating line and column. */
void lexer_step(Lexer* self);
/* Returns the character at the cursor. */
char lexer_current(const Lexer* self);
/* Returns a copy of the current cursor position. */
LexerPosition lexer_position(const Lexer* self);
/* Builds a token of the given type whose index, line and column come from start. */
Token lexer_token_from(const Lexer* self, TokenType type, LexerPosition start);
/* Steps once, then keeps stepping while the current character is whitespace. */
void lexer_skip_whitespace(Lexer* self);
/* Consumes the current character plus any following digits; returns a TokenTypeInt token. */
Token lexer_make_int_token(Lexer* self);
/* Consumes the current character plus any following identifier characters; returns a TokenTypeId token. */
Token lexer_make_id_token(Lexer* self);
/* Consumes exactly one character and returns a token of the given type. */
Token lexer_make_single_char_token(Lexer* self, TokenType type);
/* Handles '+', '(' and ')'; any other character yields TokenTypeInvalid. */
Token lexer_make_static_token(Lexer* self);
/* Dispatches on the current character to the matching lexer_make_* helper. */
Token lexer_make_token(Lexer* self);
/* Pairs a lexer with the token most recently pulled from it. */
typedef struct {
/* Lexer that tokens are fetched from; stored as a pointer only. */
Lexer* lexer;
/* Token returned by the latest lexer_next call made through the iterator. */
Token current;
} TokenIterator;
/* Initializes *self with lexer and loads the first token via lexer_next. */
void token_iterator(TokenIterator* self, Lexer* lexer);
/* Fetches the next token from the lexer, stores it as current and returns it. */
Token token_step(TokenIterator* self);
/* Returns the token currently stored in the iterator. */
Token token_current(TokenIterator* self);
/* True for ' ', '\t' and '\n'. */
bool is_whitespace_char(char c);
/* True for '0' through '9'. */
bool is_int_char(char c);
/* True for 'a'-'z', 'A'-'Z', '_' and digits. */
bool is_id_char(char c);
#endif

136
main.c
View File

@ -1,140 +1,6 @@
#include "lexer.h"
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
/* Kinds of token known to this lexer.
   NOTE(review): among the functions visible in this file only Eof, Int and
   Id are ever passed to lexer_token_from -- confirm whether Newline,
   Semicolon and Plus are produced elsewhere. */
typedef enum {
TokenTypeEof,
TokenTypeNewline,
TokenTypeSemicolon,
TokenTypePlus,
TokenTypeInt,
TokenTypeId,
} TokenType;
/* A lexed token: its kind plus where it was found in the source text. */
typedef struct {
TokenType type;
/* index: offset in the text where the token starts (copied from the start
   position by lexer_token_from). length: filled in by lexer_token_from. */
size_t index, length;
/* Line and column of the token start; the lexer begins counting both at 1. */
int line, column;
} Token;
/* Cursor location inside the text being lexed. */
typedef struct {
/* Offset of the cursor from the start of the text. */
size_t index;
/* Both start at 1 (see construct_lexer); lexer_step bumps line and resets
   column to 1 when it steps over a '\n'. */
int line, column;
} LexerPosition;
/* Lexer state: the input buffer and the current cursor. */
typedef struct {
/* Input characters; construct_lexer stores the pointer without copying. */
const char* text;
/* Number of characters in text; lexer_done compares the cursor against it. */
size_t length;
/* Current cursor (index, line, column). */
LexerPosition position;
} Lexer;
/**
 * Initializes a lexer over a text buffer.
 *
 * Only the pointer is stored; the text is not copied. The cursor starts at
 * index 0, line 1, column 1.
 *
 * @param self    lexer to initialize (every field is overwritten)
 * @param text    input characters
 * @param length  number of characters of text to scan
 */
void construct_lexer(Lexer* self, const char* text, size_t length)
{
    self->text = text;
    self->length = length;
    self->position.index = 0;
    self->position.line = 1;
    self->position.column = 1;
}
/**
 * Tells whether a character is skipped as whitespace by the lexer.
 *
 * Newline is deliberately not included in this version.
 *
 * @param c  character to classify
 * @return true for space and horizontal tab; false otherwise
 */
bool is_whitespace_char(char c)
{
    if (c == ' ') {
        return true;
    }
    return c == '\t';
}
/**
 * Tells whether a character is a decimal digit.
 *
 * @param c  character to classify
 * @return true for '0' through '9'; false otherwise
 */
bool is_int_char(char c)
{
    const bool outside = (c < '0') || (c > '9');
    return !outside;
}
/**
 * Tells whether a character may appear in an identifier.
 *
 * @param c  character to classify
 * @return true for 'a'-'z', 'A'-'Z', '_' and anything is_int_char accepts
 */
bool is_id_char(char c)
{
    const bool lower = (c >= 'a') && (c <= 'z');
    const bool upper = (c >= 'A') && (c <= 'Z');
    if (lower || upper || c == '_') {
        return true;
    }
    return is_int_char(c);
}
/**
 * Reports whether the lexer has run out of input.
 *
 * @param self  lexer to query
 * @return true when the cursor index is at or beyond the text length
 */
bool lexer_done(const Lexer* self)
{
    if (self->position.index < self->length) {
        return false;
    }
    return true;
}
/**
 * Advances the cursor by one character, keeping line/column in sync.
 *
 * A '\n' under the cursor moves to the next line and resets the column to 1;
 * any other character bumps the column.
 *
 * Stepping when no input remains is now a no-op. Previously the function
 * read text[index] unconditionally, which reads past the buffer when the
 * cursor already sits at (or beyond) length.
 *
 * @param self  lexer whose cursor is advanced
 */
void lexer_step(Lexer* self)
{
    /* Nothing left to consume: do not touch text[length]. */
    if (self->position.index >= self->length) {
        return;
    }
    if (self->text[self->position.index] == '\n') {
        self->position.line += 1;
        self->position.column = 1;
    } else {
        self->position.column += 1;
    }
    self->position.index += 1;
}
/**
 * Returns the character under the cursor.
 *
 * No bounds check is performed here; the character is read straight from
 * text at the cursor index.
 *
 * @param self  lexer to read from
 * @return text[position.index]
 */
char lexer_current(const Lexer* self)
{
    const size_t at = self->position.index;
    const char found = self->text[at];
    return found;
}
/**
 * Returns a copy of the lexer's current cursor position.
 *
 * @param self  lexer to query
 * @return the index/line/column triple stored in the lexer
 */
LexerPosition lexer_position(const Lexer* self)
{
    const LexerPosition* where = &self->position;
    return *where;
}
/**
 * Builds a token that spans from a saved start position to the cursor.
 *
 * Fix: length is now the distance between start and the cursor. It used to
 * be assigned the cursor's absolute index, which is only correct for a token
 * that begins at offset 0.
 *
 * @param self   lexer whose cursor marks the end of the token (exclusive)
 * @param type   type to store in the token
 * @param start  position where the token began; its index must not be
 *               greater than the cursor index, otherwise the unsigned
 *               subtraction wraps
 * @return token carrying type, start index/line/column and the span length
 */
Token lexer_token_from(const Lexer* self, TokenType type, LexerPosition start)
{
    return (Token) {
        .type = type,
        .index = start.index,
        /* Span covered by the token, not the cursor's absolute offset. */
        .length = self->position.index - start.index,
        .line = start.line,
        .column = start.column,
    };
}
/**
 * Skips the character under the cursor and any whitespace that follows.
 *
 * The first step is unconditional; after that the cursor keeps moving while
 * input remains and the current character satisfies is_whitespace_char.
 *
 * @param self  lexer to advance
 */
void lexer_skip_whitespace(Lexer* self)
{
    lexer_step(self);
    for (;;) {
        if (lexer_done(self)) {
            break;
        }
        if (!is_whitespace_char(lexer_current(self))) {
            break;
        }
        lexer_step(self);
    }
}
/**
 * Lexes an integer token starting at the cursor.
 *
 * The current character is consumed unconditionally, followed by every
 * subsequent character accepted by is_int_char.
 *
 * @param self  lexer to advance
 * @return a TokenTypeInt token anchored at the position held on entry
 */
Token lexer_make_int_token(Lexer* self)
{
    const LexerPosition first = self->position;
    lexer_step(self);
    for (;;) {
        if (lexer_done(self)) {
            break;
        }
        if (!is_int_char(lexer_current(self))) {
            break;
        }
        lexer_step(self);
    }
    return lexer_token_from(self, TokenTypeInt, first);
}
/**
 * Lexes an identifier token starting at the cursor.
 *
 * The current character is consumed unconditionally, followed by every
 * subsequent character accepted by is_id_char.
 *
 * @param self  lexer to advance
 * @return a TokenTypeId token anchored at the position held on entry
 */
Token lexer_make_id_token(Lexer* self)
{
    const LexerPosition first = self->position;
    do {
        lexer_step(self);
    } while (!lexer_done(self) && is_id_char(lexer_current(self)));
    return lexer_token_from(self, TokenTypeId, first);
}
/**
 * Handles a character that is neither whitespace, digit nor identifier.
 *
 * No static tokens are recognized in this version: '\n' and every other
 * character share the same handling, which prints a diagnostic to stdout
 * and terminates the process with EXIT_FAILURE. The function therefore
 * never returns to its caller.
 *
 * @param self  lexer positioned on the unrecognized character
 */
Token lexer_make_static_token(Lexer* self)
{
    const char offending = lexer_current(self);
    printf("unrecognized char '%c'\n", offending);
    exit(EXIT_FAILURE);
}
/**
 * Produces the next token, skipping leading whitespace.
 *
 * Fixes:
 *  - the identifier test used is_id_char(is_int_char(c)), i.e. it classified
 *    the boolean result (0 or 1) rather than the character, so identifiers
 *    were never recognized; it now tests the character itself;
 *  - the stray ';' after the if/else chain is gone;
 *  - whitespace is skipped up front instead of through a recursive call.
 *
 * @param self  lexer to advance
 * @return Eof when no input remains, otherwise whatever the Int, Id or
 *         static-token helper returns
 */
Token lexer_make_token(Lexer* self)
{
    /* lexer_skip_whitespace consumes the whole run, so one pass suffices. */
    if (!lexer_done(self) && is_whitespace_char(lexer_current(self))) {
        lexer_skip_whitespace(self);
    }
    if (lexer_done(self)) {
        return lexer_token_from(self, TokenTypeEof, lexer_position(self));
    }

    const char current = lexer_current(self);
    /* Digits are tested first: is_id_char also accepts them, and a leading
       digit must start an integer, not an identifier. */
    if (is_int_char(current)) {
        return lexer_make_int_token(self);
    }
    if (is_id_char(current)) {
        return lexer_make_id_token(self);
    }
    return lexer_make_static_token(self);
}
/**
 * Produces the next token from the lexer.
 *
 * Thin public entry point that forwards to lexer_make_token.
 *
 * @param self  lexer to advance
 * @return the token produced by lexer_make_token
 */
Token lexer_next(Lexer* self)
{
    Token produced = lexer_make_token(self);
    return produced;
}
/* Program entry point: prints a greeting line to stdout and exits with 0. */
int main()
{
    fputs("hello world\n", stdout);
    return 0;
}