diff --git a/' b/'
new file mode 100644
index 0000000..2ee068e
--- /dev/null
+++ b/'
@@ -0,0 +1,36 @@
+#ifndef LEXER_H
+#define LEXER_H
+
+#include <stddef.h>
+
+typedef enum {
+    Id,
+    Int,
+    Float,
+    String,
+
+    If,
+    Else,
+    While,
+    Break,
+
+    LParen,
+    RParen,
+    LBrace,
+    RBrace,
+
+    Plus,
+    Minus,
+} TokenType;
+
+typedef struct {
+    TokenType type;
+    size_t index, length;
+    int line, column;
+} Token;
+
+typedef struct Lexer Lexer;
+
+void lexer_create(Lexer* lexer, const char* text, size_t text_length);
+
+#endif
diff --git a/:w b/:w
new file mode 100644
index 0000000..eb0d0d5
--- /dev/null
+++ b/:w
@@ -0,0 +1,187 @@
+#include "lexer.h"
+#include <ctype.h>
+#include <stdbool.h>
+#include <stddef.h>
+#include <string.h>
+
+struct Lexer {
+    const char* text;
+    size_t index, length;
+    int line, column;
+};
+
+Token lexer_skip_whitespace(Lexer* lexer);
+Token lexer_make_int(Lexer* lexer);
+Token lexer_make_id(Lexer* lexer);
+bool lexer_span_matches(const Lexer* lexer, Position begin, const char* value);
+Token lexer_make_static(Lexer* lexer);
+Token make_single_char_token(Lexer* lexer, TokenType type);
+Token make_slash_token(Lexer* lexer);
+Token lexer_make_invalid_char(Lexer* lexer);
+Position lexer_position(const Lexer* lexer);
+Token lexer_token(const Lexer* lexer, TokenType type, Position begin);
+bool lexer_done(const Lexer* lexer);
+char lexer_current(const Lexer* lexer);
+void lexer_step(Lexer* lexer);
+
+void lexer_create(Lexer* lexer, const char* text, size_t text_length)
+{
+    *lexer = (Lexer) {
+        .text = text,
+        .length = text_length,
+        .line = 1,
+        .column = 1,
+    };
+}
+
+Token lexer_next(Lexer* lexer)
+{
+    char c = lexer_current(lexer);
+    if (lexer_done(lexer))
+        return lexer_token(lexer, TokenTypeEof, lexer_position(lexer));
+    else if (isspace(c))
+        return lexer_skip_whitespace(lexer);
+    else if (isdigit(c))
+        return lexer_make_int(lexer);
+    else if (isalpha(c) || c == '_')
+        return lexer_make_id(lexer);
+    else
+        return lexer_make_static(lexer);
+}
+
+Token lexer_skip_whitespace(Lexer* lexer)
+{
+    lexer_step(lexer);
+    while (!lexer_done(lexer) && isspace(lexer_current(lexer)))
+        lexer_step(lexer);
+    return lexer_next(lexer);
+}
+
+Token lexer_make_int(Lexer* lexer)
+{
+    Position begin = lexer_position(lexer);
+    lexer_step(lexer);
+    while (!lexer_done(lexer) && isdigit(lexer_current(lexer)))
+        lexer_step(lexer);
+    return lexer_token(lexer, TokenTypeInt, begin);
+}
+
+Token lexer_make_id(Lexer* lexer)
+{
+    Position begin = lexer_position(lexer);
+    lexer_step(lexer);
+    while (!lexer_done(lexer)
+        && (isalpha(lexer_current(lexer)) || isdigit(lexer_current(lexer))
+            || lexer_current(lexer) == '_'))
+        lexer_step(lexer);
+    if (lexer_span_matches(lexer, begin, "if"))
+        return lexer_token(lexer, TokenTypeIf, begin);
+    else if (lexer_span_matches(lexer, begin, "else"))
+        return lexer_token(lexer, TokenTypeElse, begin);
+    else if (lexer_span_matches(lexer, begin, "while"))
+        return lexer_token(lexer, TokenTypeWhile, begin);
+    else if (lexer_span_matches(lexer, begin, "break"))
+        return lexer_token(lexer, TokenTypeBreak, begin);
+    else
+        return lexer_token(lexer, TokenTypeId, begin);
+}
+
+bool lexer_span_matches(const Lexer* lexer, Position begin, const char* value)
+{
+    size_t length = lexer->index - begin.index;
+    if (length != strlen(value))
+        return false;
+    return strncmp(&lexer->text[begin.index], value, length) == 0;
+}
+
+Token lexer_make_static(Lexer* lexer)
+{
+    switch (lexer_current(lexer)) {
+    case '(':
+        return make_single_char_token(lexer, TokenTypeLParen);
+    case ')':
+        return make_single_char_token(lexer, TokenTypeRParen);
+    case '{':
+        return make_single_char_token(lexer, TokenTypeLBrace);
+    case '}':
+        return make_single_char_token(lexer, TokenTypeRBrace);
+    case ';':
+        return make_single_char_token(lexer, TokenTypeSemicolon);
+    case '+':
+        return make_single_char_token(lexer, TokenTypePlus);
+    case '-':
+        return make_single_char_token(lexer, TokenTypeMinus);
+    case '*':
+        return make_single_char_token(lexer, TokenTypeAsterisk);
+    case '/':
+        return make_slash_token(lexer);
+    case '%':
+        return make_single_char_token(lexer, TokenTypePercent);
+    default:
+        return lexer_make_invalid_char(lexer);
+    }
+}
+
+Token make_single_char_token(Lexer* lexer, TokenType type)
+{
+    Position begin = lexer_position(lexer);
+    lexer_step(lexer);
+    return lexer_token(lexer, type, begin);
+}
+
+Token skip_singleline_comment(Lexer* lexer);
+Token skip_multiline_comment(Lexer* lexer);
+
+Token make_slash_token(Lexer* lexer)
+{
+    Position begin = lexer_position(lexer);
+    lexer_step(lexer);
+    switch (lexer_current(lexer)) {
+    case '/':
+        return skip_singleline_comment(lexer);
+    default:
+        return lexer_token(lexer, TokenTypeSlash, begin);
+    }
+}
+
+Token lexer_make_invalid_char(Lexer* lexer)
+{
+    Position begin = lexer_position(lexer);
+    lexer_step(lexer);
+    return lexer_token(lexer, TokenTypeInvalidChar, begin);
+}
+
+Position lexer_position(const Lexer* lexer)
+{
+    return (Position) {
+        .index = lexer->index,
+        .line = lexer->line,
+        .column = lexer->column,
+    };
+}
+
+Token lexer_token(const Lexer* lexer, TokenType type, Position begin)
+{
+    return (Token) {
+        .type = type,
+        .position = begin,
+        .length = lexer->index - begin.index,
+    };
+}
+
+bool lexer_done(const Lexer* lexer) { return lexer->index >= lexer->length; }
+
+char lexer_current(const Lexer* lexer) { return lexer->text[lexer->index]; }
+
+void lexer_step(Lexer* lexer)
+{
+    if (lexer_done(lexer))
+        return;
+    if (lexer_current(lexer) == '\n') {
+        lexer->line += 1;
+        lexer->column = 1;
+    } else {
+        lexer->column += 1;
+    }
+    lexer->index += 1;
+}
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..ac5288e
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,19 @@
+
+CFLAGS = -std=c17 -Wall -Wextra -Wpedantic -Wconversion
+
+HEADERS = $(wildcard *.h)
+
+all: compile_flags.txt wacc
+
+wacc: main.o lexer.o
+	gcc $^ -o $@
+
+%.o: %.c $(HEADERS)
+	gcc $< -c -o $@ $(CFLAGS)
+
+clean:
+	rm -rf *.o wacc
+
+compile_flags.txt:
+	echo -xc $(CFLAGS) | sed 's/\s\+/\n/g' > compile_flags.txt
+
diff --git a/compile_flags.txt b/compile_flags.txt
new file mode 100644
index 0000000..083ed44
--- /dev/null
+++ b/compile_flags.txt
@@ -0,0 +1,6 @@
+-xc
+-std=c17
+-Wall
+-Wextra
+-Wpedantic
+-Wconversion
diff --git a/lexer.c b/lexer.c
new file mode 100644
index 0000000..297c410
--- /dev/null
+++ b/lexer.c
@@ -0,0 +1,353 @@
+// Hand-written lexer for the wacc language: turns a source text into a
+// stream of Tokens via repeated lexer_next() calls. Tracks byte index,
+// line and column for diagnostics. Malformed input yields explicit
+// error tokens (InvalidChar, MalformedChar, ...) rather than aborting.
+#include "lexer.h"
+#include <ctype.h>
+#include <stdbool.h>
+#include <stddef.h>
+#include <string.h>
+
+struct Lexer {
+    const char* text;
+    size_t index, length;
+    int line, column;
+};
+
+Token lexer_skip_whitespace(Lexer* lexer);
+Token lexer_make_int_or_float(Lexer* lexer);
+Token lexer_make_id(Lexer* lexer);
+bool lexer_span_matches(const Lexer* lexer, Position begin, const char* value);
+Token lexer_make_static_token(Lexer* lexer);
+Token lexer_make_int_hex_or_binary(Lexer* lexer);
+Token lexer_make_char(Lexer* lexer);
+Token lexer_make_string(Lexer* lexer);
+void lexer_skip_literal_char(Lexer* lexer);
+Token lexer_make_single_char_token(Lexer* lexer, TokenType type);
+Token lexer_make_slash_token(Lexer* lexer);
+Token lexer_skip_singleline_comment(Lexer* lexer);
+Token lexer_make_single_or_double_char_token(
+    Lexer* lexer, TokenType single_type, char second_char, TokenType double_type);
+Token lexer_skip_multiline_comment(Lexer* lexer);
+Token lexer_make_invalid_char(Lexer* lexer);
+Position lexer_position(const Lexer* lexer);
+Token lexer_token(const Lexer* lexer, TokenType type, Position begin);
+bool lexer_done(const Lexer* lexer);
+char lexer_current(const Lexer* lexer);
+void lexer_step(Lexer* lexer);
+
+void lexer_create(Lexer* lexer, const char* text, size_t text_length)
+{
+    *lexer = (Lexer) {
+        .text = text,
+        .length = text_length,
+        .line = 1,
+        .column = 1,
+    };
+}
+
+Token lexer_next(Lexer* lexer)
+{
+    char c = lexer_current(lexer); // safe at EOF: lexer_current yields '\0'
+    if (lexer_done(lexer))
+        return lexer_token(lexer, TokenTypeEof, lexer_position(lexer));
+    else if (isspace(c))
+        return lexer_skip_whitespace(lexer);
+    else if (c >= '1' && c <= '9') // '0' is routed through the static-token
+        // path so that "0x.."/"0b.." become hex/binary literals
+        return lexer_make_int_or_float(lexer);
+    else if (isalpha(c) || c == '_')
+        return lexer_make_id(lexer);
+    else
+        return lexer_make_static_token(lexer);
+}
+
+Token lexer_skip_whitespace(Lexer* lexer)
+{
+    lexer_step(lexer);
+    while (!lexer_done(lexer) && isspace(lexer_current(lexer)))
+        lexer_step(lexer);
+    return lexer_next(lexer);
+}
+
+Token lexer_make_int_or_float(Lexer* lexer)
+{
+    Position begin = lexer_position(lexer);
+    lexer_step(lexer);
+    while (!lexer_done(lexer) && isdigit(lexer_current(lexer)))
+        lexer_step(lexer);
+    if (!lexer_done(lexer) && lexer_current(lexer) == '.') {
+        lexer_step(lexer); // FIX: consume the '.'; it was never stepped
+        // over, so the fraction digits were never lexed
+        while (!lexer_done(lexer) && isdigit(lexer_current(lexer)))
+            lexer_step(lexer);
+        return lexer_token(lexer, TokenTypeFloat, begin);
+    } else {
+        return lexer_token(lexer, TokenTypeInt, begin);
+    }
+}
+
+Token lexer_make_id(Lexer* lexer)
+{
+    Position begin = lexer_position(lexer);
+    lexer_step(lexer);
+    while (!lexer_done(lexer)
+        && (isalpha(lexer_current(lexer)) || isdigit(lexer_current(lexer))
+            || lexer_current(lexer) == '_'))
+        lexer_step(lexer);
+    // Keywords are recognized after the fact by comparing the lexed span.
+    if (lexer_span_matches(lexer, begin, "if"))
+        return lexer_token(lexer, TokenTypeIf, begin);
+    else if (lexer_span_matches(lexer, begin, "else"))
+        return lexer_token(lexer, TokenTypeElse, begin);
+    else if (lexer_span_matches(lexer, begin, "while"))
+        return lexer_token(lexer, TokenTypeWhile, begin);
+    else if (lexer_span_matches(lexer, begin, "break"))
+        return lexer_token(lexer, TokenTypeBreak, begin);
+    else
+        return lexer_token(lexer, TokenTypeId, begin);
+}
+
+bool lexer_span_matches(const Lexer* lexer, Position begin, const char* value)
+{
+    size_t length = lexer->index - begin.index;
+    if (length != strlen(value))
+        return false;
+    return strncmp(&lexer->text[begin.index], value, length) == 0;
+}
+
+Token lexer_make_static_token(Lexer* lexer)
+{
+    switch (lexer_current(lexer)) {
+    case '0':
+        return lexer_make_int_hex_or_binary(lexer);
+    case '\'':
+        return lexer_make_char(lexer);
+    case '"':
+        return lexer_make_string(lexer);
+    case '(':
+        return lexer_make_single_char_token(lexer, TokenTypeLParen);
+    case ')':
+        return lexer_make_single_char_token(lexer, TokenTypeRParen);
+    case '{':
+        return lexer_make_single_char_token(lexer, TokenTypeLBrace);
+    case '}':
+        return lexer_make_single_char_token(lexer, TokenTypeRBrace);
+    case '[':
+        return lexer_make_single_char_token(lexer, TokenTypeLBracket);
+    case ']':
+        return lexer_make_single_char_token(lexer, TokenTypeRBracket);
+    case '.':
+        return lexer_make_single_char_token(lexer, TokenTypeDot);
+    case ',':
+        return lexer_make_single_char_token(lexer, TokenTypeComma);
+    case ':':
+        return lexer_make_single_char_token(lexer, TokenTypeColon);
+    case ';':
+        return lexer_make_single_char_token(lexer, TokenTypeSemicolon);
+    case '+':
+        return lexer_make_single_or_double_char_token(
+            lexer, TokenTypePlus, '=', TokenTypePlusEqual);
+    case '-':
+        return lexer_make_single_or_double_char_token(
+            lexer, TokenTypeMinus, '=', TokenTypeMinusEqual);
+    case '*':
+        return lexer_make_single_or_double_char_token(
+            lexer, TokenTypeAsterisk, '=', TokenTypeAsteriskEqual);
+    case '/':
+        return lexer_make_slash_token(lexer);
+    case '%':
+        return lexer_make_single_or_double_char_token(
+            lexer, TokenTypePercent, '=', TokenTypePercentEqual);
+    case '=':
+        return lexer_make_single_or_double_char_token(
+            lexer, TokenTypeEqual, '=', TokenTypeDoubleEqual);
+    case '!':
+        return lexer_make_single_or_double_char_token(
+            lexer, TokenTypeExclamation, '=', TokenTypeExclamationEqual);
+    case '<':
+        return lexer_make_single_or_double_char_token(
+            lexer, TokenTypeLt, '=', TokenTypeLtEqual);
+    case '>':
+        return lexer_make_single_or_double_char_token(
+            lexer, TokenTypeGt, '=', TokenTypeGtEqual);
+    default:
+        return lexer_make_invalid_char(lexer);
+    }
+}
+
+Token lexer_make_int_hex_or_binary(Lexer* lexer)
+{
+    Position begin = lexer_position(lexer);
+    lexer_step(lexer); // consume the leading '0'
+    if (!lexer_done(lexer) && (lexer_current(lexer) == 'x' || lexer_current(lexer) == 'X')) {
+        lexer_step(lexer); // FIX: consume 'x'; it was never stepped over
+        while (!lexer_done(lexer)
+            && (isdigit(lexer_current(lexer))
+                // FIX: was `>= 'a' || <= 'f'`, which is always true and
+                // accepted every character as a hex digit
+                || (lexer_current(lexer) >= 'a' && lexer_current(lexer) <= 'f')
+                || (lexer_current(lexer) >= 'A' && lexer_current(lexer) <= 'F')))
+            lexer_step(lexer);
+        return lexer_token(lexer, TokenTypeHex, begin);
+    } else if (!lexer_done(lexer) && (lexer_current(lexer) == 'b' || lexer_current(lexer) == 'B')) {
+        lexer_step(lexer); // FIX: consume 'b'; it was never stepped over
+        while (!lexer_done(lexer) && (lexer_current(lexer) == '0' || lexer_current(lexer) == '1'))
+            lexer_step(lexer);
+        return lexer_token(lexer, TokenTypeBinary, begin);
+    } else {
+        // FIX: also consume remaining decimal digits so "0123" is one
+        // Int token instead of Int "0" followed by Int "123".
+        while (!lexer_done(lexer) && isdigit(lexer_current(lexer)))
+            lexer_step(lexer);
+        return lexer_token(lexer, TokenTypeInt, begin);
+    }
+}
+
+Token lexer_make_char(Lexer* lexer)
+{
+    Position begin = lexer_position(lexer);
+    lexer_step(lexer); // consume opening '\''
+    if (lexer_done(lexer))
+        return lexer_token(lexer, TokenTypeMalformedChar, begin);
+    lexer_skip_literal_char(lexer);
+    // FIX: was `done && current != '\''`, which both misses the unclosed
+    // case and reads past the buffer when done.
+    if (lexer_done(lexer) || lexer_current(lexer) != '\'')
+        return lexer_token(lexer, TokenTypeMalformedChar, begin);
+    lexer_step(lexer); // consume closing '\''
+    return lexer_token(lexer, TokenTypeChar, begin);
+}
+
+Token lexer_make_string(Lexer* lexer)
+{
+    Position begin = lexer_position(lexer);
+    lexer_step(lexer); // consume opening '"'
+    if (lexer_done(lexer))
+        return lexer_token(lexer, TokenTypeMalformedString, begin);
+    while (!lexer_done(lexer) && lexer_current(lexer) != '\"')
+        lexer_skip_literal_char(lexer);
+    if (lexer_done(lexer)) // FIX: unclosed string; old test was `done && ...`
+        return lexer_token(lexer, TokenTypeMalformedString, begin);
+    lexer_step(lexer); // consume closing '"'
+    // FIX: was TokenTypeChar — strings were tagged as char literals.
+    return lexer_token(lexer, TokenTypeString, begin);
+}
+
+// Consumes one (possibly escaped) character inside a char/string literal:
+// plain char, \0, decimal escape, \x.. hex escape, or \<any>.
+void lexer_skip_literal_char(Lexer* lexer)
+{
+    if (lexer_current(lexer) == '\\') {
+        lexer_step(lexer);
+        if (!lexer_done(lexer) && lexer_current(lexer) == '0') {
+            lexer_step(lexer);
+        } else if (!lexer_done(lexer) && lexer_current(lexer) >= '1'
+            && lexer_current(lexer) <= '9') {
+            lexer_step(lexer);
+            while (!lexer_done(lexer) && isdigit(lexer_current(lexer)))
+                lexer_step(lexer);
+        } else if (!lexer_done(lexer)
+            && (lexer_current(lexer) == 'x' || lexer_current(lexer) == 'X')) {
+            lexer_step(lexer);
+            while (!lexer_done(lexer)
+                && (isdigit(lexer_current(lexer))
+                    || (lexer_current(lexer) >= 'a' && lexer_current(lexer) <= 'f')
+                    || (lexer_current(lexer) >= 'A' && lexer_current(lexer) <= 'F')))
+                lexer_step(lexer);
+        } else if (!lexer_done(lexer)) {
+            lexer_step(lexer);
+        }
+    } else {
+        lexer_step(lexer);
+    }
+}
+
+Token lexer_make_single_char_token(Lexer* lexer, TokenType type)
+{
+    Position begin = lexer_position(lexer);
+    lexer_step(lexer);
+    return lexer_token(lexer, type, begin);
+}
+
+Token lexer_make_single_or_double_char_token(
+    Lexer* lexer, TokenType single_type, char second_char, TokenType double_type)
+{
+    Position begin = lexer_position(lexer);
+    lexer_step(lexer);
+    if (!lexer_done(lexer) && lexer_current(lexer) == second_char) {
+        lexer_step(lexer);
+        // FIX: single_type/double_type were swapped: "+=" lexed as Plus
+        // and "+" as PlusEqual.
+        return lexer_token(lexer, double_type, begin);
+    } else {
+        return lexer_token(lexer, single_type, begin);
+    }
+}
+
+Token lexer_make_slash_token(Lexer* lexer)
+{
+    Position begin = lexer_position(lexer);
+    lexer_step(lexer);
+    switch (lexer_current(lexer)) { // '\0' at EOF falls through to default
+    case '/':
+        return lexer_skip_singleline_comment(lexer);
+    case '*':
+        return lexer_skip_multiline_comment(lexer);
+    case '=':
+        lexer_step(lexer);
+        return lexer_token(lexer, TokenTypeSlashEqual, begin);
+    default:
+        return lexer_token(lexer, TokenTypeSlash, begin);
+    }
+}
+
+Token lexer_skip_singleline_comment(Lexer* lexer)
+{
+    lexer_step(lexer);
+    while (!lexer_done(lexer) && lexer_current(lexer) != '\n')
+        lexer_step(lexer);
+    if (!lexer_done(lexer) && lexer_current(lexer) == '\n')
+        lexer_step(lexer);
+    return lexer_next(lexer);
+}
+
+// Nested multiline comments: depth counts unmatched "/*" openers.
+Token lexer_skip_multiline_comment(Lexer* lexer)
+{
+    lexer_step(lexer); // consume the '*' of the opening "/*"
+    int depth = 1;
+    // FIX: the loop previously never checked depth, so every multiline
+    // comment consumed the rest of the input; and the "*/" branch tested
+    // `done && ...` instead of `!done && ...`, so depth never decreased.
+    while (!lexer_done(lexer) && depth > 0) {
+        if (lexer_current(lexer) == '/') {
+            lexer_step(lexer);
+            if (!lexer_done(lexer) && lexer_current(lexer) == '*') {
+                lexer_step(lexer);
+                depth += 1;
+            }
+        } else if (lexer_current(lexer) == '*') {
+            lexer_step(lexer);
+            if (!lexer_done(lexer) && lexer_current(lexer) == '/') {
+                lexer_step(lexer);
+                depth -= 1;
+            }
+        } else {
+            lexer_step(lexer);
+        }
+    }
+    return depth != 0
+        ? lexer_token(lexer, TokenTypeMalformedMultilineComment, lexer_position(lexer))
+        : lexer_next(lexer);
+}
+
+Token lexer_make_invalid_char(Lexer* lexer)
+{
+    Position begin = lexer_position(lexer);
+    lexer_step(lexer);
+    return lexer_token(lexer, TokenTypeInvalidChar, begin);
+}
+
+Position lexer_position(const Lexer* lexer)
+{
+    return (Position) {
+        .index = lexer->index,
+        .line = lexer->line,
+        .column = lexer->column,
+    };
+}
+
+Token lexer_token(const Lexer* lexer, TokenType type, Position begin)
+{
+    return (Token) {
+        .type = type,
+        .position = begin,
+        .length = lexer->index - begin.index,
+    };
+}
+
+bool lexer_done(const Lexer* lexer) { return lexer->index >= lexer->length; }
+
+char lexer_current(const Lexer* lexer)
+{
+    // FIX: guard against reading text[length]; lexer_next and
+    // lexer_make_slash_token call this before checking lexer_done.
+    return lexer_done(lexer) ? '\0' : lexer->text[lexer->index];
+}
+
+void lexer_step(Lexer* lexer)
+{
+    if (lexer_done(lexer))
+        return;
+    if (lexer_current(lexer) == '\n') {
+        lexer->line += 1;
+        lexer->column = 1;
+    } else {
+        lexer->column += 1;
+    }
+    lexer->index += 1;
+}
diff --git a/lexer.h b/lexer.h
new file mode 100644
index 0000000..1efce31
--- /dev/null
+++ b/lexer.h
@@ -0,0 +1,76 @@
+#ifndef LEXER_H
+#define LEXER_H
+
+#include <stddef.h>
+
+typedef enum {
+    TokenTypeEof,
+
+    TokenTypeInvalidChar,
+    TokenTypeMalformedMultilineComment,
+    TokenTypeMalformedChar,
+    TokenTypeMalformedString,
+
+    TokenTypeId,
+    TokenTypeInt,
+    TokenTypeHex,
+    TokenTypeBinary,
+    TokenTypeFloat,
+    TokenTypeChar,
+    TokenTypeString,
+
+    TokenTypeIf,
+    TokenTypeElse,
+    TokenTypeWhile,
+    TokenTypeBreak,
+
+    TokenTypeLParen,
+    TokenTypeRParen,
+    TokenTypeLBrace,
+    TokenTypeRBrace,
+    TokenTypeLBracket,
+    TokenTypeRBracket,
+    TokenTypeDot,
+    TokenTypeComma,
+    TokenTypeColon,
+    TokenTypeSemicolon,
+
+    TokenTypePlusEqual,
+    TokenTypeMinusEqual,
+    TokenTypeAsteriskEqual,
+    TokenTypeSlashEqual,
+    TokenTypePercentEqual,
+    TokenTypeDoubleEqual,
+    TokenTypeExclamationEqual,
+    TokenTypeLtEqual,
+    TokenTypeGtEqual,
+
+    TokenTypePlus,
+    TokenTypeMinus,
+    TokenTypeAsterisk,
+    TokenTypeSlash,
+    TokenTypePercent,
+    TokenTypeEqual,
+    TokenTypeExclamation,
+    TokenTypeLt,
+    TokenTypeGt,
+
+} TokenType;
+
+typedef struct {
+    size_t index;
+    int line, column;
+} Position;
+
+typedef struct {
+    TokenType type;
+    Position position;
+    size_t length;
+} Token;
+
+typedef struct Lexer Lexer;
+
+void lexer_create(Lexer* lexer, const char* text, size_t text_length);
+Token lexer_next(Lexer* lexer);
+
+#endif
diff --git a/lexer.o b/lexer.o
new file mode 100644
index 0000000..e794c73
Binary files /dev/null and b/lexer.o differ
diff --git a/main.c b/main.c
new file mode 100644
index 0000000..50c765e
--- /dev/null
+++ b/main.c
@@ -0,0 +1,3 @@
+#include <stdio.h>
+
+int main(void) { printf("hello world\n"); }
diff --git a/main.o b/main.o
new file mode 100644
index 0000000..09c8eea
Binary files /dev/null and b/main.o differ
diff --git a/wacc b/wacc
new file mode 100755
index 0000000..8ad967f
Binary files /dev/null and b/wacc differ