lexer: add `..`/`..=`/`..<` and `::`/`::<` tokens, `mut`/`defer` keywords, leading-dot float literals, and full enum names in token_type_to_string

This commit is contained in:
SimonFJ20 2023-02-18 00:07:24 +01:00
parent 8c4d734af0
commit b71c2e5afb
5 changed files with 230 additions and 71 deletions

9
.clang-format Normal file
View File

@ -0,0 +1,9 @@
BasedOnStyle: WebKit
IndentWidth: 4
ColumnLimit: 80
IndentCaseLabels: true
BreakBeforeBraces: Custom
BraceWrapping:
AfterFunction: true
SplitEmptyFunction: false

231
lexer.c
View File

@ -5,8 +5,9 @@
#include <stdlib.h>
#include <string.h>
#define ASSERT_EXHAUSTIVE_MATCH() \
(fprintf(stderr, "unexhaustive match at %s:%d in %s()\n", __FILE__, __LINE__, __func__), \
#define ASSERT_EXHAUSTIVE_MATCH() \
(fprintf(stderr, "unexhaustive match at %s:%d in %s()\n", __FILE__, \
__LINE__, __func__), \
exit(1))
Token lexer_skip_whitespace(Lexer* lexer);
@ -19,10 +20,12 @@ Token lexer_make_char(Lexer* lexer);
Token lexer_make_string(Lexer* lexer);
void lexer_skip_literal_char(Lexer* lexer);
Token lexer_make_single_char_token(Lexer* lexer, TokenType type);
Token lexer_make_dot_token(Lexer* lexer);
Token lexer_make_colon_token(Lexer* lexer);
Token lexer_make_slash_token(Lexer* lexer);
Token lexer_skip_singleline_comment(Lexer* lexer);
Token lexer_make_single_or_double_char_token(
Lexer* lexer, TokenType single_type, char second_char, TokenType double_type);
Token lexer_make_single_or_double_char_token(Lexer* lexer,
TokenType single_type, char second_char, TokenType double_type);
Token lexer_skip_multiline_comment(Lexer* lexer);
Token lexer_make_invalid_char(Lexer* lexer);
Position lexer_position(const Lexer* lexer);
@ -72,9 +75,14 @@ Token lexer_make_int_or_float(Lexer* lexer)
lexer_step(lexer);
if (!lexer_done(lexer) && lexer_current(lexer) == '.') {
lexer_step(lexer);
while (!lexer_done(lexer) && isdigit(lexer_current(lexer)))
if (!lexer_done(lexer) && lexer_current(lexer) == '.') {
lexer_step(lexer);
return lexer_token(lexer, TokenTypeFloat, begin);
return lexer_token(lexer, TokenTypeIntDoubleDot, begin);
} else {
while (!lexer_done(lexer) && isdigit(lexer_current(lexer)))
lexer_step(lexer);
return lexer_token(lexer, TokenTypeFloat, begin);
}
} else {
return lexer_token(lexer, TokenTypeInt, begin);
}
@ -125,6 +133,10 @@ Token lexer_make_id(Lexer* lexer)
return lexer_token(lexer, TokenTypeFn, begin);
else if (lexer_span_matches(lexer, begin, "return"))
return lexer_token(lexer, TokenTypeReturn, begin);
else if (lexer_span_matches(lexer, begin, "mut"))
return lexer_token(lexer, TokenTypeMut, begin);
else if (lexer_span_matches(lexer, begin, "defer"))
return lexer_token(lexer, TokenTypeDefer, begin);
else
return lexer_token(lexer, TokenTypeId, begin);
}
@ -159,12 +171,11 @@ Token lexer_make_static_token(Lexer* lexer)
case ']':
return lexer_make_single_char_token(lexer, TokenTypeRBracket);
case '.':
return lexer_make_single_or_double_char_token(
lexer, TokenTypeDot, '.', TokenTypeDoubleDot);
return lexer_make_dot_token(lexer);
case ',':
return lexer_make_single_char_token(lexer, TokenTypeComma);
case ':':
return lexer_make_single_char_token(lexer, TokenTypeColon);
return lexer_make_colon_token(lexer);
case ';':
return lexer_make_single_char_token(lexer, TokenTypeSemicolon);
case '&':
@ -209,16 +220,21 @@ Token lexer_make_int_hex_binary_or_float(Lexer* lexer)
while (!lexer_done(lexer) && isdigit(lexer_current(lexer)))
lexer_step(lexer);
return lexer_token(lexer, TokenTypeFloat, begin);
} else if (!lexer_done(lexer) && (lexer_current(lexer) == 'x' || lexer_current(lexer) == 'X')) {
} else if (!lexer_done(lexer)
&& (lexer_current(lexer) == 'x' || lexer_current(lexer) == 'X')) {
lexer_step(lexer);
while (!lexer_done(lexer)
&& (isdigit(lexer_current(lexer))
|| (lexer_current(lexer) >= 'a' && lexer_current(lexer) <= 'f')
|| (lexer_current(lexer) >= 'A' && lexer_current(lexer) <= 'F')))
|| (lexer_current(lexer) >= 'A'
&& lexer_current(lexer) <= 'F')))
lexer_step(lexer);
return lexer_token(lexer, TokenTypeHex, begin);
} else if (!lexer_done(lexer) && (lexer_current(lexer) == 'b' || lexer_current(lexer) == 'B')) {
} else if (!lexer_done(lexer)
&& (lexer_current(lexer) == 'b' || lexer_current(lexer) == 'B')) {
lexer_step(lexer);
while (!lexer_done(lexer) && (lexer_current(lexer) == '0' || lexer_current(lexer) == '1'))
while (!lexer_done(lexer)
&& (lexer_current(lexer) == '0' || lexer_current(lexer) == '1'))
lexer_step(lexer);
return lexer_token(lexer, TokenTypeBinary, begin);
} else {
@ -271,7 +287,8 @@ void lexer_skip_literal_char(Lexer* lexer)
while (!lexer_done(lexer)
&& (isdigit(lexer_current(lexer))
|| (lexer_current(lexer) >= 'a' && lexer_current(lexer) <= 'f')
|| (lexer_current(lexer) >= 'A' && lexer_current(lexer) <= 'F')))
|| (lexer_current(lexer) >= 'A'
&& lexer_current(lexer) <= 'F')))
lexer_step(lexer);
}
}
@ -283,8 +300,8 @@ Token lexer_make_single_char_token(Lexer* lexer, TokenType type)
return lexer_token(lexer, type, begin);
}
Token lexer_make_single_or_double_char_token(
Lexer* lexer, TokenType single_type, char second_char, TokenType double_type)
Token lexer_make_single_or_double_char_token(Lexer* lexer,
TokenType single_type, char second_char, TokenType double_type)
{
Position begin = lexer_position(lexer);
lexer_step(lexer);
@ -296,6 +313,48 @@ Token lexer_make_single_or_double_char_token(
}
}
// Lexes a token that starts with '.'. Produces one of:
//   `..=` -> TokenTypeDoubleDotEqual
//   `..<` -> TokenTypeDoubleDotLt
//   `..`  -> TokenTypeDoubleDot
//   `.5`  -> TokenTypeFloat (leading-dot float literal)
//   `.`   -> TokenTypeDot
Token lexer_make_dot_token(Lexer* lexer)
{
    Position begin = lexer_position(lexer);
    lexer_step(lexer);
    if (lexer_done(lexer))
        return lexer_token(lexer, TokenTypeDot, begin);
    if (lexer_current(lexer) == '.') {
        lexer_step(lexer);
        if (!lexer_done(lexer) && lexer_current(lexer) == '=') {
            lexer_step(lexer);
            return lexer_token(lexer, TokenTypeDoubleDotEqual, begin);
        }
        if (!lexer_done(lexer) && lexer_current(lexer) == '<') {
            lexer_step(lexer);
            return lexer_token(lexer, TokenTypeDoubleDotLt, begin);
        }
        return lexer_token(lexer, TokenTypeDoubleDot, begin);
    }
    if (isdigit(lexer_current(lexer))) {
        // consume the fractional digits of a `.123`-style literal
        do
            lexer_step(lexer);
        while (!lexer_done(lexer) && isdigit(lexer_current(lexer)));
        return lexer_token(lexer, TokenTypeFloat, begin);
    }
    return lexer_token(lexer, TokenTypeDot, begin);
}
// Lexes a token that starts with ':'. Produces one of:
//   `::<` -> TokenTypeDoubleColonLt
//   `::`  -> TokenTypeDoubleColon
//   `:`   -> TokenTypeColon
Token lexer_make_colon_token(Lexer* lexer)
{
    Position begin = lexer_position(lexer);
    lexer_step(lexer);
    if (lexer_done(lexer) || lexer_current(lexer) != ':')
        return lexer_token(lexer, TokenTypeColon, begin);
    lexer_step(lexer);
    if (!lexer_done(lexer) && lexer_current(lexer) == '<') {
        lexer_step(lexer);
        return lexer_token(lexer, TokenTypeDoubleColonLt, begin);
    }
    return lexer_token(lexer, TokenTypeDoubleColon, begin);
}
Token lexer_make_slash_token(Lexer* lexer)
{
Position begin = lexer_position(lexer);
@ -343,9 +402,9 @@ Token lexer_skip_multiline_comment(Lexer* lexer)
}
lexer_step(lexer);
}
return depth != 0
? lexer_token(lexer, TokenTypeMalformedMultilineComment, lexer_position(lexer))
: lexer_next(lexer);
return depth != 0 ? lexer_token(
lexer, TokenTypeMalformedMultilineComment, lexer_position(lexer))
: lexer_next(lexer);
}
Token lexer_make_invalid_char(Lexer* lexer)
@ -417,93 +476,137 @@ const char* token_type_to_string(TokenType type)
{
switch (type) {
case TokenTypeEof:
return "Eof";
return "TokenTypeEof";
case TokenTypeInvalidChar:
return "InvalidChar";
return "TokenTypeInvalidChar";
case TokenTypeMalformedMultilineComment:
return "MalformedMultilineComment";
return "TokenTypeMalformedMultilineComment";
case TokenTypeMalformedChar:
return "MalformedChar";
return "TokenTypeMalformedChar";
case TokenTypeMalformedString:
return "MalformedString";
return "TokenTypeMalformedString";
case TokenTypeId:
return "Id";
return "TokenTypeId";
case TokenTypeInt:
return "Int";
return "TokenTypeInt";
case TokenTypeIntDoubleDot:
return "TokenTypeIntDoubleDot";
case TokenTypeHex:
return "Hex";
return "TokenTypeHex";
case TokenTypeBinary:
return "Binary";
return "TokenTypeBinary";
case TokenTypeFloat:
return "Float";
return "TokenTypeFloat";
case TokenTypeChar:
return "Char";
return "TokenTypeChar";
case TokenTypeString:
return "String";
return "TokenTypeString";
case TokenTypeIf:
return "If";
return "TokenTypeIf";
case TokenTypeElse:
return "Else";
return "TokenTypeElse";
case TokenTypeLoop:
return "TokenTypeLoop";
case TokenTypeWhile:
return "While";
return "TokenTypeWhile";
case TokenTypeFor:
return "TokenTypeFor";
case TokenTypeIn:
return "TokenTypeIn";
case TokenTypeBreak:
return "Break";
return "TokenTypeBreak";
case TokenTypeLet:
return "TokenTypeLet";
case TokenTypeMatch:
return "TokenTypeMatch";
case TokenTypeFalse:
return "TokenTypeFalse";
case TokenTypeTrue:
return "TokenTypeTrue";
case TokenTypeNot:
return "TokenTypeNot";
case TokenTypeAnd:
return "TokenTypeAnd";
case TokenTypeOr:
return "TokenTypeOr";
case TokenTypeFn:
return "TokenTypeFn";
case TokenTypeReturn:
return "TokenTypeReturn";
case TokenTypeMut:
return "TokenTypeMut";
case TokenTypeDefer:
return "TokenTypeDefer";
case TokenTypeLParen:
return "LParen";
return "TokenTypeLParen";
case TokenTypeRParen:
return "RParen";
return "TokenTypeRParen";
case TokenTypeLBrace:
return "LBrace";
return "TokenTypeLBrace";
case TokenTypeRBrace:
return "RBrace";
return "TokenTypeRBrace";
case TokenTypeLBracket:
return "LBracket";
return "TokenTypeLBracket";
case TokenTypeRBracket:
return "RBracket";
case TokenTypeDot:
return "Dot";
return "TokenTypeRBracket";
case TokenTypeComma:
return "Comma";
return "TokenTypeComma";
case TokenTypeColon:
return "Colon";
return "TokenTypeColon";
case TokenTypeDoubleColon:
return "TokenTypeDoubleColon";
case TokenTypeDoubleColonLt:
return "TokenTypeDoubleColonLt";
case TokenTypeSemicolon:
return "Semicolon";
return "TokenTypeSemicolon";
case TokenTypeAmpersand:
return "TokenTypeAmpersand";
case TokenTypeUnderscore:
return "TokenTypeUnderscore";
case TokenTypeDot:
return "TokenTypeDot";
case TokenTypeDoubleDot:
return "TokenTypeDoubleDot";
case TokenTypeDoubleDotEqual:
return "TokenTypeDoubleDotEqual";
case TokenTypeDoubleDotLt:
return "TokenTypeDoubleDotLt";
case TokenTypePlusEqual:
return "PlusEqual";
return "TokenTypePlusEqual";
case TokenTypeMinusEqual:
return "MinusEqual";
return "TokenTypeMinusEqual";
case TokenTypeAsteriskEqual:
return "AsteriskEqual";
return "TokenTypeAsteriskEqual";
case TokenTypeSlashEqual:
return "SlashEqual";
return "TokenTypeSlashEqual";
case TokenTypePercentEqual:
return "PercentEqual";
return "TokenTypePercentEqual";
case TokenTypeDoubleEqual:
return "DoubleEqual";
return "TokenTypeDoubleEqual";
case TokenTypeExclamationEqual:
return "ExclamationEqual";
return "TokenTypeExclamationEqual";
case TokenTypeLtEqual:
return "LtEqual";
return "TokenTypeLtEqual";
case TokenTypeGtEqual:
return "GtEqual";
return "TokenTypeGtEqual";
case TokenTypePlus:
return "Plus";
return "TokenTypePlus";
case TokenTypeMinus:
return "Minus";
return "TokenTypeMinus";
case TokenTypeAsterisk:
return "Asterisk";
return "TokenTypeAsterisk";
case TokenTypeSlash:
return "Slash";
return "TokenTypeSlash";
case TokenTypePercent:
return "Percent";
return "TokenTypePercent";
case TokenTypeEqual:
return "Equal";
return "TokenTypeEqual";
case TokenTypeExclamation:
return "Exclamation";
return "TokenTypeExclamation";
case TokenTypeLt:
return "Lt";
return "TokenTypeLt";
case TokenTypeGt:
return "Gt";
return "TokenTypeGt";
default:
ASSERT_EXHAUSTIVE_MATCH();
}

11
lexer.h
View File

@ -13,6 +13,7 @@ typedef enum {
TokenTypeId,
TokenTypeInt,
TokenTypeIntDoubleDot,
TokenTypeHex,
TokenTypeBinary,
TokenTypeFloat,
@ -35,6 +36,8 @@ typedef enum {
TokenTypeOr,
TokenTypeFn,
TokenTypeReturn,
TokenTypeMut,
TokenTypeDefer,
TokenTypeLParen,
TokenTypeRParen,
@ -42,13 +45,17 @@ typedef enum {
TokenTypeRBrace,
TokenTypeLBracket,
TokenTypeRBracket,
TokenTypeDot,
TokenTypeComma,
TokenTypeColon,
TokenTypeDoubleColon,
TokenTypeDoubleColonLt,
TokenTypeSemicolon,
TokenTypeDoubleMatch,
TokenTypeAmpersand,
TokenTypeUnderscore,
TokenTypeDot,
TokenTypeDoubleDot,
TokenTypeDoubleDotEqual,
TokenTypeDoubleDotLt,
TokenTypePlusEqual,
TokenTypeMinusEqual,

9
main.c
View File

@ -6,8 +6,13 @@
int main(void)
{
char text[]
= "abc 123 0xFF 0b101 3.14 'a' '\\n' \"hello\" \"world\\\"\\n\" if else /* /* while */ */ "
"while break (){}[].,:; += -= *= /= %= == != <= >= + - * / % % = ! < >";
= "abc 123 123.. 0xFF 0b101 .5 1. 3.14 'a' '\\n' \"hello\" "
"\"world\\\"\\n\" if else /* /* while */ */ "
"while for in // in \n break let match false true not and or fn "
"return mut "
"defer (){}[],: :: ::< ; & _ . .. ..= ..< += -= *= /= %= == != <= >= "
"+ - * / % "
"% = ! < >";
printf("text = \"%s\"\n", text);

View File

@ -2,10 +2,13 @@
#define PARSER_H
#include "lexer.h"
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
typedef enum {
ParsedNodeTypeError,
ParsedNodeTypeInt,
ParsedNodeTypeFloat,
ParsedNodeTypeChar,
@ -27,9 +30,41 @@ typedef enum {
ParsedNodeTypeAssign,
} ParsedNodeType;
typedef struct ParsedNode {
ParsedNodeType node_type;
} ParsedNode;
typedef struct KeyValuePair KeyValuePair;
typedef struct ParsedNode ParsedNode;
// AST node: a tagged union discriminated by `type`.
// Only the union member matching `type` is valid to read.
struct ParsedNode {
    ParsedNodeType type; // selects the active union member below
    union {
        int64_t int_value;
        double float_value;
        char char_value;
        struct {
            char* value;
            size_t length; // byte length; presumably not NUL-terminated — verify
        } string;
        bool bool_value;
        struct {
            ParsedNode* values; // element nodes — ownership unclear from here; TODO confirm who frees
            size_t length;
        } array;
        struct {
            KeyValuePair* pairs; // key/value entries of a dict literal
            size_t length;
        } dict;
        struct {
            ParsedNode* condition;
            ParsedNode* truthy;
            ParsedNode* falsy; // NOTE(review): presumably NULL when no else-branch — confirm in parser
        } if_node;
    };
};
// One key/value entry of a dict node (used by ParsedNode's dict member).
struct KeyValuePair {
    char* key;
    size_t key_length; // byte length of key; presumably not NUL-terminated — verify
    ParsedNode* value;
};
typedef struct {
Lexer* lexer;