lexer: add keywords, 0-prefixed float/hex/binary literals, and parser scaffolding

This commit is contained in:
Simon 2023-02-13 14:42:46 +01:00
parent 84be2cbbba
commit 8c4d734af0
5 changed files with 156 additions and 22 deletions

88
lexer.c
View File

@ -5,18 +5,16 @@
#include <stdlib.h>
#include <string.h>
/* Lexer state: a cursor over the source text being tokenized.
 * NOTE(review): this commit also adds a public Lexer definition to
 * lexer.h; confirm this private definition is the one being removed,
 * otherwise the two definitions conflict. */
struct Lexer {
const char* text; /* source buffer (not owned by the lexer) */
size_t index, length; /* current byte offset and total text length */
int line, column; /* position for diagnostics — TODO confirm 0- or 1-based */
};
/* Abort with a diagnostic when a switch over an enum reaches a case the
 * code does not handle. Written as a comma expression (not do/while) so
 * it can be used in expression position, e.g. a switch default arm.
 * Fix: "unexhaustive" is not a word — the standard term is
 * "non-exhaustive". */
#define ASSERT_EXHAUSTIVE_MATCH() \
    (fprintf(stderr, "non-exhaustive match at %s:%d in %s()\n", __FILE__, __LINE__, __func__), \
        exit(1))
/* Internal token constructors; each consumes input starting at the
 * lexer's current position and returns one finished Token. */
Token lexer_skip_whitespace(Lexer* lexer);
Token lexer_make_int_or_float(Lexer* lexer);
Token lexer_make_id(Lexer* lexer);
/* Presumably compares the lexeme spanning [begin, current) against the
 * NUL-terminated value — used below for keyword recognition; confirm
 * against the definition. */
bool lexer_span_matches(const Lexer* lexer, Position begin, const char* value);
Token lexer_make_static_token(Lexer* lexer);
/* Renamed in this commit: the '0'-prefixed path now also recognizes
 * floats such as "0.5" in addition to hex and binary literals. */
Token lexer_make_int_hex_or_binary(Lexer* lexer);
Token lexer_make_int_hex_binary_or_float(Lexer* lexer);
Token lexer_make_char(Lexer* lexer);
Token lexer_make_string(Lexer* lexer);
void lexer_skip_literal_char(Lexer* lexer);
@ -73,6 +71,7 @@ Token lexer_make_int_or_float(Lexer* lexer)
while (!lexer_done(lexer) && isdigit(lexer_current(lexer)))
lexer_step(lexer);
if (!lexer_done(lexer) && lexer_current(lexer) == '.') {
lexer_step(lexer);
while (!lexer_done(lexer) && isdigit(lexer_current(lexer)))
lexer_step(lexer);
return lexer_token(lexer, TokenTypeFloat, begin);
@ -85,6 +84,11 @@ Token lexer_make_id(Lexer* lexer)
{
Position begin = lexer_position(lexer);
lexer_step(lexer);
if (lexer_done(lexer)
|| (!isalpha(lexer_current(lexer)) && !isdigit(lexer_current(lexer))
&& lexer_current(lexer) != '_')) {
return lexer_token(lexer, TokenTypeUnderscore, begin);
}
while (!lexer_done(lexer)
&& (isalpha(lexer_current(lexer)) || isdigit(lexer_current(lexer))
|| lexer_current(lexer) == '_'))
@ -95,8 +99,32 @@ Token lexer_make_id(Lexer* lexer)
return lexer_token(lexer, TokenTypeElse, begin);
else if (lexer_span_matches(lexer, begin, "while"))
return lexer_token(lexer, TokenTypeWhile, begin);
else if (lexer_span_matches(lexer, begin, "loop"))
return lexer_token(lexer, TokenTypeLoop, begin);
else if (lexer_span_matches(lexer, begin, "for"))
return lexer_token(lexer, TokenTypeFor, begin);
else if (lexer_span_matches(lexer, begin, "in"))
return lexer_token(lexer, TokenTypeIn, begin);
else if (lexer_span_matches(lexer, begin, "break"))
return lexer_token(lexer, TokenTypeBreak, begin);
else if (lexer_span_matches(lexer, begin, "let"))
return lexer_token(lexer, TokenTypeLet, begin);
else if (lexer_span_matches(lexer, begin, "match"))
return lexer_token(lexer, TokenTypeMatch, begin);
else if (lexer_span_matches(lexer, begin, "false"))
return lexer_token(lexer, TokenTypeFalse, begin);
else if (lexer_span_matches(lexer, begin, "true"))
return lexer_token(lexer, TokenTypeTrue, begin);
else if (lexer_span_matches(lexer, begin, "not"))
return lexer_token(lexer, TokenTypeNot, begin);
else if (lexer_span_matches(lexer, begin, "and"))
return lexer_token(lexer, TokenTypeAnd, begin);
else if (lexer_span_matches(lexer, begin, "or"))
return lexer_token(lexer, TokenTypeOr, begin);
else if (lexer_span_matches(lexer, begin, "fn"))
return lexer_token(lexer, TokenTypeFn, begin);
else if (lexer_span_matches(lexer, begin, "return"))
return lexer_token(lexer, TokenTypeReturn, begin);
else
return lexer_token(lexer, TokenTypeId, begin);
}
@ -113,7 +141,7 @@ Token lexer_make_static_token(Lexer* lexer)
{
switch (lexer_current(lexer)) {
case '0':
return lexer_make_int_hex_or_binary(lexer);
return lexer_make_int_hex_binary_or_float(lexer);
case '\'':
return lexer_make_char(lexer);
case '"':
@ -131,13 +159,16 @@ Token lexer_make_static_token(Lexer* lexer)
case ']':
return lexer_make_single_char_token(lexer, TokenTypeRBracket);
case '.':
return lexer_make_single_char_token(lexer, TokenTypeDot);
return lexer_make_single_or_double_char_token(
lexer, TokenTypeDot, '.', TokenTypeDoubleDot);
case ',':
return lexer_make_single_char_token(lexer, TokenTypeComma);
case ':':
return lexer_make_single_char_token(lexer, TokenTypeColon);
case ';':
return lexer_make_single_char_token(lexer, TokenTypeSemicolon);
case '&':
return lexer_make_single_char_token(lexer, TokenTypeAmpersand);
case '+':
return lexer_make_single_or_double_char_token(
lexer, TokenTypePlus, '=', TokenTypePlusEqual);
@ -169,18 +200,24 @@ Token lexer_make_static_token(Lexer* lexer)
}
}
Token lexer_make_int_hex_or_binary(Lexer* lexer)
Token lexer_make_int_hex_binary_or_float(Lexer* lexer)
{
Position begin = lexer_position(lexer);
lexer_step(lexer);
if (!lexer_done(lexer) && (lexer_current(lexer) == 'x' || lexer_current(lexer) == 'X')) {
if (!lexer_done(lexer) && lexer_current(lexer) == '.') {
lexer_step(lexer);
while (!lexer_done(lexer) && isdigit(lexer_current(lexer)))
lexer_step(lexer);
return lexer_token(lexer, TokenTypeFloat, begin);
} else if (!lexer_done(lexer) && (lexer_current(lexer) == 'x' || lexer_current(lexer) == 'X')) {
while (!lexer_done(lexer)
&& (isdigit(lexer_current(lexer))
|| (lexer_current(lexer) >= 'a' || lexer_current(lexer) <= 'f')
|| (lexer_current(lexer) >= 'A' || lexer_current(lexer) <= 'F')))
|| (lexer_current(lexer) >= 'a' && lexer_current(lexer) <= 'f')
|| (lexer_current(lexer) >= 'A' && lexer_current(lexer) <= 'F')))
lexer_step(lexer);
return lexer_token(lexer, TokenTypeHex, begin);
} else if (!lexer_done(lexer) && (lexer_current(lexer) == 'b' || lexer_current(lexer) == 'B')) {
lexer_step(lexer);
while (!lexer_done(lexer) && (lexer_current(lexer) == '0' || lexer_current(lexer) == '1'))
lexer_step(lexer);
return lexer_token(lexer, TokenTypeBinary, begin);
@ -213,7 +250,7 @@ Token lexer_make_string(Lexer* lexer)
if (lexer_done(lexer) && lexer_current(lexer) != '\"')
return lexer_token(lexer, TokenTypeMalformedString, begin);
lexer_step(lexer);
return lexer_token(lexer, TokenTypeChar, begin);
return lexer_token(lexer, TokenTypeString, begin);
}
void lexer_skip_literal_char(Lexer* lexer)
@ -253,9 +290,9 @@ Token lexer_make_single_or_double_char_token(
lexer_step(lexer);
if (!lexer_done(lexer) && lexer_current(lexer) == second_char) {
lexer_step(lexer);
return lexer_token(lexer, single_type, begin);
} else {
return lexer_token(lexer, double_type, begin);
} else {
return lexer_token(lexer, single_type, begin);
}
}
@ -290,14 +327,18 @@ Token lexer_skip_multiline_comment(Lexer* lexer)
{
lexer_step(lexer);
int depth = 1;
while (!lexer_done(lexer)) {
while (!lexer_done(lexer) && depth != 0) {
if (lexer_current(lexer) == '/') {
lexer_step(lexer);
if (!lexer_done(lexer) && lexer_current(lexer) == '*')
if (lexer_done(lexer))
break;
else if (lexer_current(lexer) == '*')
depth += 1;
} else if (lexer_current(lexer) == '*') {
lexer_step(lexer);
if (lexer_done(lexer) && lexer_current(lexer) == '/')
if (lexer_done(lexer))
break;
else if (lexer_current(lexer) == '/')
depth -= 1;
}
lexer_step(lexer);
@ -360,13 +401,18 @@ char* token_to_string(const Token* token, const char* text)
{
const char* type_string = token_type_to_string(token->type);
char* value_string = token_string(token, text);
size_t size = token->length + strlen(type_string) + 5;
size_t size = token->length + strlen(type_string) + 7;
char* value = calloc(size, sizeof(char));
snprintf(value, size, "(%s, %s)", type_string, value_string);
snprintf(value, size, "(%s, \"%s\")", type_string, value_string);
free(value_string);
return value;
}
/* Convenience wrapper: stringify a token's lexeme using this lexer's
 * own source buffer. Caller owns (and must free) the returned string —
 * TODO confirm against token_string's contract. */
char* lexer_token_string(const Lexer* lexer, const Token* token)
{
    const char* source_text = lexer->text;
    return token_string(token, source_text);
}
const char* token_type_to_string(TokenType type)
{
switch (type) {
@ -458,5 +504,7 @@ const char* token_type_to_string(TokenType type)
return "Lt";
case TokenTypeGt:
return "Gt";
default:
ASSERT_EXHAUSTIVE_MATCH();
}
}

22
lexer.h
View File

@ -21,8 +21,20 @@ typedef enum {
TokenTypeIf,
TokenTypeElse,
TokenTypeLoop,
TokenTypeWhile,
TokenTypeFor,
TokenTypeIn,
TokenTypeBreak,
TokenTypeLet,
TokenTypeMatch,
TokenTypeFalse,
TokenTypeTrue,
TokenTypeNot,
TokenTypeAnd,
TokenTypeOr,
TokenTypeFn,
TokenTypeReturn,
TokenTypeLParen,
TokenTypeRParen,
@ -34,6 +46,9 @@ typedef enum {
TokenTypeComma,
TokenTypeColon,
TokenTypeSemicolon,
TokenTypeDoubleMatch,
TokenTypeAmpersand,
TokenTypeUnderscore,
TokenTypePlusEqual,
TokenTypeMinusEqual,
@ -72,9 +87,14 @@ typedef struct {
char* token_string(const Token* token, const char* text);
char* token_to_string(const Token* token, const char* text);
typedef struct Lexer Lexer;
/* Lexer state, now defined publicly so callers can stack-allocate one.
 * NOTE(review): lexer.c contains an identical struct Lexer — confirm
 * the private definition is removed by this commit to avoid a
 * conflicting type. */
typedef struct {
const char* text; /* source buffer (not owned) */
size_t index, length; /* current byte offset and total text length */
int line, column; /* position for diagnostics */
} Lexer;
void lexer_create(Lexer* lexer, const char* text, size_t text_length);
Token lexer_next(Lexer* lexer);
char* lexer_token_string(const Lexer* lexer, const Token* token);
#endif

19
main.c
View File

@ -1,9 +1,26 @@
#include "lexer.h"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
int main(void)
{
char text[]
= "abc 123 0xFF 0b101 3.14 'a' '\\n' \"hello\" \"world\\\"\\n\" if else /* /* while */ */ "
"while break (){}[].,:; += -= *= /= %= == != <= >= + - * / % % = ! < >";
printf("text = \"%s\"\n", text);
Lexer lexer;
lexer_create(&lexer, text, strlen(text));
printf("tokens = [\n");
Token token = lexer_next(&lexer);
while (token.type != TokenTypeEof) {
char* stringified = token_to_string(&token, text);
printf(" %s\n", stringified);
free(stringified);
token = lexer_next(&lexer);
}
printf("]\n");
}

View File

@ -1 +1,12 @@
#include "parser.h"
#include "lexer.h"
/* Initialize a parser over the given lexer and prime one token of
 * lookahead. The lexer is borrowed, not owned. */
void parser_create(Parser* parser, Lexer* lexer)
{
    parser->lexer = lexer;
    parser->current = lexer_next(lexer);
}
/* TODO(review): stub — expression parsing not yet implemented. */
void parser_parse_expression(Parser* parser) { }

View File

@ -1,4 +1,42 @@
#ifndef PARSER_H
#define PARSER_H
#include "lexer.h"
#include <stdint.h>
/* Kinds of nodes the parser will produce; presumably mirrors the
 * language's expression grammar (literals, compounds, control flow,
 * calls, operators) — no parse functions emit these yet. */
typedef enum {
ParsedNodeTypeError,
ParsedNodeTypeInt,
ParsedNodeTypeFloat,
ParsedNodeTypeChar,
ParsedNodeTypeString,
ParsedNodeTypeBool,
ParsedNodeTypeArray,
ParsedNodeTypeDict,
ParsedNodeTypeIf,
ParsedNodeTypeWhile,
ParsedNodeTypeLoop,
ParsedNodeTypeFor,
ParsedNodeTypeLambda,
ParsedNodeTypeCall,
ParsedNodeTypeAccess,
ParsedNodeTypeIndex,
ParsedNodeTypeUnary,
ParsedNodeTypeBinary,
ParsedNodeTypeAssign,
} ParsedNodeType;
/* Common header of every parsed node; concrete node structs are
 * presumably intended to embed this as their first member — TODO
 * confirm once node definitions exist. */
typedef struct ParsedNode {
ParsedNodeType node_type;
} ParsedNode;
/* Parser state: the token source plus one token of lookahead. */
typedef struct {
Lexer* lexer;
Token current;
} Parser;
void parser_create(Parser* parser, Lexer* lexer);
void parser_parse_expression(Parser* parser);
#endif