more lexer stuff

SimonFJ20 2023-02-18 00:07:24 +01:00
parent 8c4d734af0
commit b71c2e5afb
5 changed files with 230 additions and 71 deletions

.clang-format (new file, 9 lines)

@@ -0,0 +1,9 @@
+BasedOnStyle: WebKit
+IndentWidth: 4
+ColumnLimit: 80
+IndentCaseLabels: true
+BreakBeforeBraces: Custom
+BraceWrapping:
+  AfterFunction: true
+  SplitEmptyFunction: false

lexer.c (223 lines changed)

@@ -6,7 +6,8 @@
 #include <string.h>
 
 #define ASSERT_EXHAUSTIVE_MATCH() \
-    (fprintf(stderr, "unexhaustive match at %s:%d in %s()\n", __FILE__, __LINE__, __func__), \
+    (fprintf(stderr, "unexhaustive match at %s:%d in %s()\n", __FILE__, \
+        __LINE__, __func__), \
         exit(1))
 
 Token lexer_skip_whitespace(Lexer* lexer);
@@ -19,10 +20,12 @@ Token lexer_make_char(Lexer* lexer);
 Token lexer_make_string(Lexer* lexer);
 void lexer_skip_literal_char(Lexer* lexer);
 Token lexer_make_single_char_token(Lexer* lexer, TokenType type);
+Token lexer_make_dot_token(Lexer* lexer);
+Token lexer_make_colon_token(Lexer* lexer);
 Token lexer_make_slash_token(Lexer* lexer);
 Token lexer_skip_singleline_comment(Lexer* lexer);
-Token lexer_make_single_or_double_char_token(
-    Lexer* lexer, TokenType single_type, char second_char, TokenType double_type);
+Token lexer_make_single_or_double_char_token(Lexer* lexer,
+    TokenType single_type, char second_char, TokenType double_type);
 Token lexer_skip_multiline_comment(Lexer* lexer);
 Token lexer_make_invalid_char(Lexer* lexer);
 Position lexer_position(const Lexer* lexer);
@@ -72,9 +75,14 @@ Token lexer_make_int_or_float(Lexer* lexer)
         lexer_step(lexer);
     if (!lexer_done(lexer) && lexer_current(lexer) == '.') {
         lexer_step(lexer);
-        while (!lexer_done(lexer) && isdigit(lexer_current(lexer)))
-            lexer_step(lexer);
-        return lexer_token(lexer, TokenTypeFloat, begin);
+        if (!lexer_done(lexer) && lexer_current(lexer) == '.') {
+            lexer_step(lexer);
+            return lexer_token(lexer, TokenTypeIntDoubleDot, begin);
+        } else {
+            while (!lexer_done(lexer) && isdigit(lexer_current(lexer)))
+                lexer_step(lexer);
+            return lexer_token(lexer, TokenTypeFloat, begin);
+        }
     } else {
         return lexer_token(lexer, TokenTypeInt, begin);
     }
@@ -125,6 +133,10 @@ Token lexer_make_id(Lexer* lexer)
         return lexer_token(lexer, TokenTypeFn, begin);
     else if (lexer_span_matches(lexer, begin, "return"))
         return lexer_token(lexer, TokenTypeReturn, begin);
+    else if (lexer_span_matches(lexer, begin, "mut"))
+        return lexer_token(lexer, TokenTypeMut, begin);
+    else if (lexer_span_matches(lexer, begin, "defer"))
+        return lexer_token(lexer, TokenTypeDefer, begin);
     else
         return lexer_token(lexer, TokenTypeId, begin);
 }
@@ -159,12 +171,11 @@ Token lexer_make_static_token(Lexer* lexer)
     case ']':
        return lexer_make_single_char_token(lexer, TokenTypeRBracket);
     case '.':
-        return lexer_make_single_or_double_char_token(
-            lexer, TokenTypeDot, '.', TokenTypeDoubleDot);
+        return lexer_make_dot_token(lexer);
     case ',':
         return lexer_make_single_char_token(lexer, TokenTypeComma);
     case ':':
-        return lexer_make_single_char_token(lexer, TokenTypeColon);
+        return lexer_make_colon_token(lexer);
     case ';':
         return lexer_make_single_char_token(lexer, TokenTypeSemicolon);
     case '&':
@@ -209,16 +220,21 @@ Token lexer_make_int_hex_binary_or_float(Lexer* lexer)
         while (!lexer_done(lexer) && isdigit(lexer_current(lexer)))
             lexer_step(lexer);
         return lexer_token(lexer, TokenTypeFloat, begin);
-    } else if (!lexer_done(lexer) && (lexer_current(lexer) == 'x' || lexer_current(lexer) == 'X')) {
+    } else if (!lexer_done(lexer)
+        && (lexer_current(lexer) == 'x' || lexer_current(lexer) == 'X')) {
+        lexer_step(lexer);
         while (!lexer_done(lexer)
             && (isdigit(lexer_current(lexer))
                 || (lexer_current(lexer) >= 'a' && lexer_current(lexer) <= 'f')
-                || (lexer_current(lexer) >= 'A' && lexer_current(lexer) <= 'F')))
+                || (lexer_current(lexer) >= 'A'
+                    && lexer_current(lexer) <= 'F')))
             lexer_step(lexer);
         return lexer_token(lexer, TokenTypeHex, begin);
-    } else if (!lexer_done(lexer) && (lexer_current(lexer) == 'b' || lexer_current(lexer) == 'B')) {
+    } else if (!lexer_done(lexer)
+        && (lexer_current(lexer) == 'b' || lexer_current(lexer) == 'B')) {
         lexer_step(lexer);
-        while (!lexer_done(lexer) && (lexer_current(lexer) == '0' || lexer_current(lexer) == '1'))
+        while (!lexer_done(lexer)
+            && (lexer_current(lexer) == '0' || lexer_current(lexer) == '1'))
             lexer_step(lexer);
         return lexer_token(lexer, TokenTypeBinary, begin);
     } else {
@@ -271,7 +287,8 @@ void lexer_skip_literal_char(Lexer* lexer)
         while (!lexer_done(lexer)
             && (isdigit(lexer_current(lexer))
                 || (lexer_current(lexer) >= 'a' && lexer_current(lexer) <= 'f')
-                || (lexer_current(lexer) >= 'A' && lexer_current(lexer) <= 'F')))
+                || (lexer_current(lexer) >= 'A'
+                    && lexer_current(lexer) <= 'F')))
             lexer_step(lexer);
     }
 }
@@ -283,8 +300,8 @@ Token lexer_make_single_char_token(Lexer* lexer, TokenType type)
     return lexer_token(lexer, type, begin);
 }
 
-Token lexer_make_single_or_double_char_token(
-    Lexer* lexer, TokenType single_type, char second_char, TokenType double_type)
+Token lexer_make_single_or_double_char_token(Lexer* lexer,
+    TokenType single_type, char second_char, TokenType double_type)
 {
     Position begin = lexer_position(lexer);
     lexer_step(lexer);
@@ -296,6 +313,48 @@ Token lexer_make_single_or_double_char_token(
     }
 }
 
+Token lexer_make_dot_token(Lexer* lexer)
+{
+    Position begin = lexer_position(lexer);
+    lexer_step(lexer);
+    if (!lexer_done(lexer) && lexer_current(lexer) == '.') {
+        lexer_step(lexer);
+        if (!lexer_done(lexer) && lexer_current(lexer) == '=') {
+            lexer_step(lexer);
+            return lexer_token(lexer, TokenTypeDoubleDotEqual, begin);
+        } else if (!lexer_done(lexer) && lexer_current(lexer) == '<') {
+            lexer_step(lexer);
+            return lexer_token(lexer, TokenTypeDoubleDotLt, begin);
+        } else {
+            return lexer_token(lexer, TokenTypeDoubleDot, begin);
+        }
+    } else if (!lexer_done(lexer) && isdigit(lexer_current(lexer))) {
+        lexer_step(lexer);
+        while (!lexer_done(lexer) && isdigit(lexer_current(lexer)))
+            lexer_step(lexer);
+        return lexer_token(lexer, TokenTypeFloat, begin);
+    } else {
+        return lexer_token(lexer, TokenTypeDot, begin);
+    }
+}
+
+Token lexer_make_colon_token(Lexer* lexer)
+{
+    Position begin = lexer_position(lexer);
+    lexer_step(lexer);
+    if (!lexer_done(lexer) && lexer_current(lexer) == ':') {
+        lexer_step(lexer);
+        if (!lexer_done(lexer) && lexer_current(lexer) == '<') {
+            lexer_step(lexer);
+            return lexer_token(lexer, TokenTypeDoubleColonLt, begin);
+        } else {
+            return lexer_token(lexer, TokenTypeDoubleColon, begin);
+        }
+    } else {
+        return lexer_token(lexer, TokenTypeColon, begin);
+    }
+}
+
 Token lexer_make_slash_token(Lexer* lexer)
 {
     Position begin = lexer_position(lexer);
@@ -343,8 +402,8 @@ Token lexer_skip_multiline_comment(Lexer* lexer)
         }
         lexer_step(lexer);
     }
-    return depth != 0
-        ? lexer_token(lexer, TokenTypeMalformedMultilineComment, lexer_position(lexer))
-        : lexer_next(lexer);
+    return depth != 0 ? lexer_token(
+        lexer, TokenTypeMalformedMultilineComment, lexer_position(lexer))
+                      : lexer_next(lexer);
 }
@@ -417,93 +476,137 @@ const char* token_type_to_string(TokenType type)
 {
     switch (type) {
     case TokenTypeEof:
-        return "Eof";
+        return "TokenTypeEof";
     case TokenTypeInvalidChar:
-        return "InvalidChar";
+        return "TokenTypeInvalidChar";
     case TokenTypeMalformedMultilineComment:
-        return "MalformedMultilineComment";
+        return "TokenTypeMalformedMultilineComment";
     case TokenTypeMalformedChar:
-        return "MalformedChar";
+        return "TokenTypeMalformedChar";
     case TokenTypeMalformedString:
-        return "MalformedString";
+        return "TokenTypeMalformedString";
     case TokenTypeId:
-        return "Id";
+        return "TokenTypeId";
     case TokenTypeInt:
-        return "Int";
+        return "TokenTypeInt";
+    case TokenTypeIntDoubleDot:
+        return "TokenTypeIntDoubleDot";
     case TokenTypeHex:
-        return "Hex";
+        return "TokenTypeHex";
     case TokenTypeBinary:
-        return "Binary";
+        return "TokenTypeBinary";
     case TokenTypeFloat:
-        return "Float";
+        return "TokenTypeFloat";
     case TokenTypeChar:
-        return "Char";
+        return "TokenTypeChar";
     case TokenTypeString:
-        return "String";
+        return "TokenTypeString";
     case TokenTypeIf:
-        return "If";
+        return "TokenTypeIf";
     case TokenTypeElse:
-        return "Else";
+        return "TokenTypeElse";
+    case TokenTypeLoop:
+        return "TokenTypeLoop";
     case TokenTypeWhile:
-        return "While";
+        return "TokenTypeWhile";
+    case TokenTypeFor:
+        return "TokenTypeFor";
+    case TokenTypeIn:
+        return "TokenTypeIn";
     case TokenTypeBreak:
-        return "Break";
+        return "TokenTypeBreak";
+    case TokenTypeLet:
+        return "TokenTypeLet";
+    case TokenTypeMatch:
+        return "TokenTypeMatch";
+    case TokenTypeFalse:
+        return "TokenTypeFalse";
+    case TokenTypeTrue:
+        return "TokenTypeTrue";
+    case TokenTypeNot:
+        return "TokenTypeNot";
+    case TokenTypeAnd:
+        return "TokenTypeAnd";
+    case TokenTypeOr:
+        return "TokenTypeOr";
+    case TokenTypeFn:
+        return "TokenTypeFn";
+    case TokenTypeReturn:
+        return "TokenTypeReturn";
+    case TokenTypeMut:
+        return "TokenTypeMut";
+    case TokenTypeDefer:
+        return "TokenTypeDefer";
     case TokenTypeLParen:
-        return "LParen";
+        return "TokenTypeLParen";
     case TokenTypeRParen:
-        return "RParen";
+        return "TokenTypeRParen";
     case TokenTypeLBrace:
-        return "LBrace";
+        return "TokenTypeLBrace";
     case TokenTypeRBrace:
-        return "RBrace";
+        return "TokenTypeRBrace";
     case TokenTypeLBracket:
-        return "LBracket";
+        return "TokenTypeLBracket";
     case TokenTypeRBracket:
-        return "RBracket";
-    case TokenTypeDot:
-        return "Dot";
+        return "TokenTypeRBracket";
     case TokenTypeComma:
-        return "Comma";
+        return "TokenTypeComma";
     case TokenTypeColon:
-        return "Colon";
+        return "TokenTypeColon";
+    case TokenTypeDoubleColon:
+        return "TokenTypeDoubleColon";
+    case TokenTypeDoubleColonLt:
+        return "TokenTypeDoubleColonLt";
     case TokenTypeSemicolon:
-        return "Semicolon";
+        return "TokenTypeSemicolon";
+    case TokenTypeAmpersand:
+        return "TokenTypeAmpersand";
+    case TokenTypeUnderscore:
+        return "TokenTypeUnderscore";
+    case TokenTypeDot:
+        return "TokenTypeDot";
+    case TokenTypeDoubleDot:
+        return "TokenTypeDoubleDot";
+    case TokenTypeDoubleDotEqual:
+        return "TokenTypeDoubleDotEqual";
+    case TokenTypeDoubleDotLt:
+        return "TokenTypeDoubleDotLt";
     case TokenTypePlusEqual:
-        return "PlusEqual";
+        return "TokenTypePlusEqual";
     case TokenTypeMinusEqual:
-        return "MinusEqual";
+        return "TokenTypeMinusEqual";
     case TokenTypeAsteriskEqual:
-        return "AsteriskEqual";
+        return "TokenTypeAsteriskEqual";
     case TokenTypeSlashEqual:
-        return "SlashEqual";
+        return "TokenTypeSlashEqual";
     case TokenTypePercentEqual:
-        return "PercentEqual";
+        return "TokenTypePercentEqual";
     case TokenTypeDoubleEqual:
-        return "DoubleEqual";
+        return "TokenTypeDoubleEqual";
     case TokenTypeExclamationEqual:
-        return "ExclamationEqual";
+        return "TokenTypeExclamationEqual";
     case TokenTypeLtEqual:
-        return "LtEqual";
+        return "TokenTypeLtEqual";
     case TokenTypeGtEqual:
-        return "GtEqual";
+        return "TokenTypeGtEqual";
     case TokenTypePlus:
-        return "Plus";
+        return "TokenTypePlus";
     case TokenTypeMinus:
-        return "Minus";
+        return "TokenTypeMinus";
     case TokenTypeAsterisk:
-        return "Asterisk";
+        return "TokenTypeAsterisk";
     case TokenTypeSlash:
-        return "Slash";
+        return "TokenTypeSlash";
     case TokenTypePercent:
-        return "Percent";
+        return "TokenTypePercent";
     case TokenTypeEqual:
-        return "Equal";
+        return "TokenTypeEqual";
     case TokenTypeExclamation:
-        return "Exclamation";
+        return "TokenTypeExclamation";
     case TokenTypeLt:
-        return "Lt";
+        return "TokenTypeLt";
     case TokenTypeGt:
-        return "Gt";
+        return "TokenTypeGt";
     default:
         ASSERT_EXHAUSTIVE_MATCH();
     }

lexer.h (11 lines changed)

@@ -13,6 +13,7 @@ typedef enum {
     TokenTypeId,
     TokenTypeInt,
+    TokenTypeIntDoubleDot,
     TokenTypeHex,
     TokenTypeBinary,
     TokenTypeFloat,
@@ -35,6 +36,8 @@ typedef enum {
     TokenTypeOr,
     TokenTypeFn,
     TokenTypeReturn,
+    TokenTypeMut,
+    TokenTypeDefer,
 
     TokenTypeLParen,
     TokenTypeRParen,
@@ -42,13 +45,17 @@ typedef enum {
     TokenTypeRBrace,
     TokenTypeLBracket,
     TokenTypeRBracket,
-    TokenTypeDot,
     TokenTypeComma,
     TokenTypeColon,
+    TokenTypeDoubleColon,
+    TokenTypeDoubleColonLt,
     TokenTypeSemicolon,
-    TokenTypeDoubleMatch,
     TokenTypeAmpersand,
     TokenTypeUnderscore,
+    TokenTypeDot,
+    TokenTypeDoubleDot,
+    TokenTypeDoubleDotEqual,
+    TokenTypeDoubleDotLt,
 
     TokenTypePlusEqual,
     TokenTypeMinusEqual,

main.c (9 lines changed)

@@ -6,8 +6,13 @@
 int main(void)
 {
     char text[]
-        = "abc 123 0xFF 0b101 3.14 'a' '\\n' \"hello\" \"world\\\"\\n\" if else /* /* while */ */ "
-          "while break (){}[].,:; += -= *= /= %= == != <= >= + - * / % % = ! < >";
+        = "abc 123 123.. 0xFF 0b101 .5 1. 3.14 'a' '\\n' \"hello\" "
+          "\"world\\\"\\n\" if else /* /* while */ */ "
+          "while for in // in \n break let match false true not and or fn "
+          "return mut "
+          "defer (){}[],: :: ::< ; & _ . .. ..= ..< += -= *= /= %= == != <= >= "
+          "+ - * / % "
+          "% = ! < >";
 
     printf("text = \"%s\"\n", text);

parser.h

@@ -2,10 +2,13 @@
 #define PARSER_H
 
 #include "lexer.h"
+#include <stdbool.h>
+#include <stddef.h>
 #include <stdint.h>
 
 typedef enum {
     ParsedNodeTypeError,
     ParsedNodeTypeInt,
     ParsedNodeTypeFloat,
     ParsedNodeTypeChar,
@@ -27,9 +30,41 @@ typedef enum {
     ParsedNodeTypeAssign,
 } ParsedNodeType;
 
-typedef struct ParsedNode {
-    ParsedNodeType node_type;
-} ParsedNode;
+typedef struct KeyValuePair KeyValuePair;
+typedef struct ParsedNode ParsedNode;
+
+struct ParsedNode {
+    ParsedNodeType type;
+    union {
+        int64_t int_value;
+        double float_value;
+        char char_value;
+        struct {
+            char* value;
+            size_t length;
+        } string;
+        bool bool_value;
+        struct {
+            ParsedNode* values;
+            size_t length;
+        } array;
+        struct {
+            KeyValuePair* pairs;
+            size_t length;
+        } dict;
+        struct {
+            ParsedNode* condition;
+            ParsedNode* truthy;
+            ParsedNode* falsy;
+        } if_node;
+    };
+};
+
+struct KeyValuePair {
+    char* key;
+    size_t key_length;
+    ParsedNode* value;
+};
 
 typedef struct {
     Lexer* lexer;