lexer and stuff
This commit is contained in:
parent
84be2cbbba
commit
8c4d734af0
88
lexer.c
88
lexer.c
@ -5,18 +5,16 @@
|
|||||||
#include <stdlib.h>
|
#include <stdlib.h>
|
||||||
#include <string.h>
|
#include <string.h>
|
||||||
|
|
||||||
struct Lexer {
|
#define ASSERT_EXHAUSTIVE_MATCH() \
|
||||||
const char* text;
|
(fprintf(stderr, "unexhaustive match at %s:%d in %s()\n", __FILE__, __LINE__, __func__), \
|
||||||
size_t index, length;
|
exit(1))
|
||||||
int line, column;
|
|
||||||
};
|
|
||||||
|
|
||||||
Token lexer_skip_whitespace(Lexer* lexer);
|
Token lexer_skip_whitespace(Lexer* lexer);
|
||||||
Token lexer_make_int_or_float(Lexer* lexer);
|
Token lexer_make_int_or_float(Lexer* lexer);
|
||||||
Token lexer_make_id(Lexer* lexer);
|
Token lexer_make_id(Lexer* lexer);
|
||||||
bool lexer_span_matches(const Lexer* lexer, Position begin, const char* value);
|
bool lexer_span_matches(const Lexer* lexer, Position begin, const char* value);
|
||||||
Token lexer_make_static_token(Lexer* lexer);
|
Token lexer_make_static_token(Lexer* lexer);
|
||||||
Token lexer_make_int_hex_or_binary(Lexer* lexer);
|
Token lexer_make_int_hex_binary_or_float(Lexer* lexer);
|
||||||
Token lexer_make_char(Lexer* lexer);
|
Token lexer_make_char(Lexer* lexer);
|
||||||
Token lexer_make_string(Lexer* lexer);
|
Token lexer_make_string(Lexer* lexer);
|
||||||
void lexer_skip_literal_char(Lexer* lexer);
|
void lexer_skip_literal_char(Lexer* lexer);
|
||||||
@ -73,6 +71,7 @@ Token lexer_make_int_or_float(Lexer* lexer)
|
|||||||
while (!lexer_done(lexer) && isdigit(lexer_current(lexer)))
|
while (!lexer_done(lexer) && isdigit(lexer_current(lexer)))
|
||||||
lexer_step(lexer);
|
lexer_step(lexer);
|
||||||
if (!lexer_done(lexer) && lexer_current(lexer) == '.') {
|
if (!lexer_done(lexer) && lexer_current(lexer) == '.') {
|
||||||
|
lexer_step(lexer);
|
||||||
while (!lexer_done(lexer) && isdigit(lexer_current(lexer)))
|
while (!lexer_done(lexer) && isdigit(lexer_current(lexer)))
|
||||||
lexer_step(lexer);
|
lexer_step(lexer);
|
||||||
return lexer_token(lexer, TokenTypeFloat, begin);
|
return lexer_token(lexer, TokenTypeFloat, begin);
|
||||||
@ -85,6 +84,11 @@ Token lexer_make_id(Lexer* lexer)
|
|||||||
{
|
{
|
||||||
Position begin = lexer_position(lexer);
|
Position begin = lexer_position(lexer);
|
||||||
lexer_step(lexer);
|
lexer_step(lexer);
|
||||||
|
if (lexer_done(lexer)
|
||||||
|
|| (!isalpha(lexer_current(lexer)) && !isdigit(lexer_current(lexer))
|
||||||
|
&& lexer_current(lexer) != '_')) {
|
||||||
|
return lexer_token(lexer, TokenTypeUnderscore, begin);
|
||||||
|
}
|
||||||
while (!lexer_done(lexer)
|
while (!lexer_done(lexer)
|
||||||
&& (isalpha(lexer_current(lexer)) || isdigit(lexer_current(lexer))
|
&& (isalpha(lexer_current(lexer)) || isdigit(lexer_current(lexer))
|
||||||
|| lexer_current(lexer) == '_'))
|
|| lexer_current(lexer) == '_'))
|
||||||
@ -95,8 +99,32 @@ Token lexer_make_id(Lexer* lexer)
|
|||||||
return lexer_token(lexer, TokenTypeElse, begin);
|
return lexer_token(lexer, TokenTypeElse, begin);
|
||||||
else if (lexer_span_matches(lexer, begin, "while"))
|
else if (lexer_span_matches(lexer, begin, "while"))
|
||||||
return lexer_token(lexer, TokenTypeWhile, begin);
|
return lexer_token(lexer, TokenTypeWhile, begin);
|
||||||
|
else if (lexer_span_matches(lexer, begin, "loop"))
|
||||||
|
return lexer_token(lexer, TokenTypeLoop, begin);
|
||||||
|
else if (lexer_span_matches(lexer, begin, "for"))
|
||||||
|
return lexer_token(lexer, TokenTypeFor, begin);
|
||||||
|
else if (lexer_span_matches(lexer, begin, "in"))
|
||||||
|
return lexer_token(lexer, TokenTypeIn, begin);
|
||||||
else if (lexer_span_matches(lexer, begin, "break"))
|
else if (lexer_span_matches(lexer, begin, "break"))
|
||||||
return lexer_token(lexer, TokenTypeBreak, begin);
|
return lexer_token(lexer, TokenTypeBreak, begin);
|
||||||
|
else if (lexer_span_matches(lexer, begin, "let"))
|
||||||
|
return lexer_token(lexer, TokenTypeLet, begin);
|
||||||
|
else if (lexer_span_matches(lexer, begin, "match"))
|
||||||
|
return lexer_token(lexer, TokenTypeMatch, begin);
|
||||||
|
else if (lexer_span_matches(lexer, begin, "false"))
|
||||||
|
return lexer_token(lexer, TokenTypeFalse, begin);
|
||||||
|
else if (lexer_span_matches(lexer, begin, "true"))
|
||||||
|
return lexer_token(lexer, TokenTypeTrue, begin);
|
||||||
|
else if (lexer_span_matches(lexer, begin, "not"))
|
||||||
|
return lexer_token(lexer, TokenTypeNot, begin);
|
||||||
|
else if (lexer_span_matches(lexer, begin, "and"))
|
||||||
|
return lexer_token(lexer, TokenTypeAnd, begin);
|
||||||
|
else if (lexer_span_matches(lexer, begin, "or"))
|
||||||
|
return lexer_token(lexer, TokenTypeOr, begin);
|
||||||
|
else if (lexer_span_matches(lexer, begin, "fn"))
|
||||||
|
return lexer_token(lexer, TokenTypeFn, begin);
|
||||||
|
else if (lexer_span_matches(lexer, begin, "return"))
|
||||||
|
return lexer_token(lexer, TokenTypeReturn, begin);
|
||||||
else
|
else
|
||||||
return lexer_token(lexer, TokenTypeId, begin);
|
return lexer_token(lexer, TokenTypeId, begin);
|
||||||
}
|
}
|
||||||
@ -113,7 +141,7 @@ Token lexer_make_static_token(Lexer* lexer)
|
|||||||
{
|
{
|
||||||
switch (lexer_current(lexer)) {
|
switch (lexer_current(lexer)) {
|
||||||
case '0':
|
case '0':
|
||||||
return lexer_make_int_hex_or_binary(lexer);
|
return lexer_make_int_hex_binary_or_float(lexer);
|
||||||
case '\'':
|
case '\'':
|
||||||
return lexer_make_char(lexer);
|
return lexer_make_char(lexer);
|
||||||
case '"':
|
case '"':
|
||||||
@ -131,13 +159,16 @@ Token lexer_make_static_token(Lexer* lexer)
|
|||||||
case ']':
|
case ']':
|
||||||
return lexer_make_single_char_token(lexer, TokenTypeRBracket);
|
return lexer_make_single_char_token(lexer, TokenTypeRBracket);
|
||||||
case '.':
|
case '.':
|
||||||
return lexer_make_single_char_token(lexer, TokenTypeDot);
|
return lexer_make_single_or_double_char_token(
|
||||||
|
lexer, TokenTypeDot, '.', TokenTypeDoubleDot);
|
||||||
case ',':
|
case ',':
|
||||||
return lexer_make_single_char_token(lexer, TokenTypeComma);
|
return lexer_make_single_char_token(lexer, TokenTypeComma);
|
||||||
case ':':
|
case ':':
|
||||||
return lexer_make_single_char_token(lexer, TokenTypeColon);
|
return lexer_make_single_char_token(lexer, TokenTypeColon);
|
||||||
case ';':
|
case ';':
|
||||||
return lexer_make_single_char_token(lexer, TokenTypeSemicolon);
|
return lexer_make_single_char_token(lexer, TokenTypeSemicolon);
|
||||||
|
case '&':
|
||||||
|
return lexer_make_single_char_token(lexer, TokenTypeAmpersand);
|
||||||
case '+':
|
case '+':
|
||||||
return lexer_make_single_or_double_char_token(
|
return lexer_make_single_or_double_char_token(
|
||||||
lexer, TokenTypePlus, '=', TokenTypePlusEqual);
|
lexer, TokenTypePlus, '=', TokenTypePlusEqual);
|
||||||
@ -169,18 +200,24 @@ Token lexer_make_static_token(Lexer* lexer)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
Token lexer_make_int_hex_or_binary(Lexer* lexer)
|
Token lexer_make_int_hex_binary_or_float(Lexer* lexer)
|
||||||
{
|
{
|
||||||
Position begin = lexer_position(lexer);
|
Position begin = lexer_position(lexer);
|
||||||
lexer_step(lexer);
|
lexer_step(lexer);
|
||||||
if (!lexer_done(lexer) && (lexer_current(lexer) == 'x' || lexer_current(lexer) == 'X')) {
|
if (!lexer_done(lexer) && lexer_current(lexer) == '.') {
|
||||||
|
lexer_step(lexer);
|
||||||
|
while (!lexer_done(lexer) && isdigit(lexer_current(lexer)))
|
||||||
|
lexer_step(lexer);
|
||||||
|
return lexer_token(lexer, TokenTypeFloat, begin);
|
||||||
|
} else if (!lexer_done(lexer) && (lexer_current(lexer) == 'x' || lexer_current(lexer) == 'X')) {
|
||||||
while (!lexer_done(lexer)
|
while (!lexer_done(lexer)
|
||||||
&& (isdigit(lexer_current(lexer))
|
&& (isdigit(lexer_current(lexer))
|
||||||
|| (lexer_current(lexer) >= 'a' || lexer_current(lexer) <= 'f')
|
|| (lexer_current(lexer) >= 'a' && lexer_current(lexer) <= 'f')
|
||||||
|| (lexer_current(lexer) >= 'A' || lexer_current(lexer) <= 'F')))
|
|| (lexer_current(lexer) >= 'A' && lexer_current(lexer) <= 'F')))
|
||||||
lexer_step(lexer);
|
lexer_step(lexer);
|
||||||
return lexer_token(lexer, TokenTypeHex, begin);
|
return lexer_token(lexer, TokenTypeHex, begin);
|
||||||
} else if (!lexer_done(lexer) && (lexer_current(lexer) == 'b' || lexer_current(lexer) == 'B')) {
|
} else if (!lexer_done(lexer) && (lexer_current(lexer) == 'b' || lexer_current(lexer) == 'B')) {
|
||||||
|
lexer_step(lexer);
|
||||||
while (!lexer_done(lexer) && (lexer_current(lexer) == '0' || lexer_current(lexer) == '1'))
|
while (!lexer_done(lexer) && (lexer_current(lexer) == '0' || lexer_current(lexer) == '1'))
|
||||||
lexer_step(lexer);
|
lexer_step(lexer);
|
||||||
return lexer_token(lexer, TokenTypeBinary, begin);
|
return lexer_token(lexer, TokenTypeBinary, begin);
|
||||||
@ -213,7 +250,7 @@ Token lexer_make_string(Lexer* lexer)
|
|||||||
if (lexer_done(lexer) && lexer_current(lexer) != '\"')
|
if (lexer_done(lexer) && lexer_current(lexer) != '\"')
|
||||||
return lexer_token(lexer, TokenTypeMalformedString, begin);
|
return lexer_token(lexer, TokenTypeMalformedString, begin);
|
||||||
lexer_step(lexer);
|
lexer_step(lexer);
|
||||||
return lexer_token(lexer, TokenTypeChar, begin);
|
return lexer_token(lexer, TokenTypeString, begin);
|
||||||
}
|
}
|
||||||
|
|
||||||
void lexer_skip_literal_char(Lexer* lexer)
|
void lexer_skip_literal_char(Lexer* lexer)
|
||||||
@ -253,9 +290,9 @@ Token lexer_make_single_or_double_char_token(
|
|||||||
lexer_step(lexer);
|
lexer_step(lexer);
|
||||||
if (!lexer_done(lexer) && lexer_current(lexer) == second_char) {
|
if (!lexer_done(lexer) && lexer_current(lexer) == second_char) {
|
||||||
lexer_step(lexer);
|
lexer_step(lexer);
|
||||||
return lexer_token(lexer, single_type, begin);
|
|
||||||
} else {
|
|
||||||
return lexer_token(lexer, double_type, begin);
|
return lexer_token(lexer, double_type, begin);
|
||||||
|
} else {
|
||||||
|
return lexer_token(lexer, single_type, begin);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -290,14 +327,18 @@ Token lexer_skip_multiline_comment(Lexer* lexer)
|
|||||||
{
|
{
|
||||||
lexer_step(lexer);
|
lexer_step(lexer);
|
||||||
int depth = 1;
|
int depth = 1;
|
||||||
while (!lexer_done(lexer)) {
|
while (!lexer_done(lexer) && depth != 0) {
|
||||||
if (lexer_current(lexer) == '/') {
|
if (lexer_current(lexer) == '/') {
|
||||||
lexer_step(lexer);
|
lexer_step(lexer);
|
||||||
if (!lexer_done(lexer) && lexer_current(lexer) == '*')
|
if (lexer_done(lexer))
|
||||||
|
break;
|
||||||
|
else if (lexer_current(lexer) == '*')
|
||||||
depth += 1;
|
depth += 1;
|
||||||
} else if (lexer_current(lexer) == '*') {
|
} else if (lexer_current(lexer) == '*') {
|
||||||
lexer_step(lexer);
|
lexer_step(lexer);
|
||||||
if (lexer_done(lexer) && lexer_current(lexer) == '/')
|
if (lexer_done(lexer))
|
||||||
|
break;
|
||||||
|
else if (lexer_current(lexer) == '/')
|
||||||
depth -= 1;
|
depth -= 1;
|
||||||
}
|
}
|
||||||
lexer_step(lexer);
|
lexer_step(lexer);
|
||||||
@ -360,13 +401,18 @@ char* token_to_string(const Token* token, const char* text)
|
|||||||
{
|
{
|
||||||
const char* type_string = token_type_to_string(token->type);
|
const char* type_string = token_type_to_string(token->type);
|
||||||
char* value_string = token_string(token, text);
|
char* value_string = token_string(token, text);
|
||||||
size_t size = token->length + strlen(type_string) + 5;
|
size_t size = token->length + strlen(type_string) + 7;
|
||||||
char* value = calloc(size, sizeof(char));
|
char* value = calloc(size, sizeof(char));
|
||||||
snprintf(value, size, "(%s, %s)", type_string, value_string);
|
snprintf(value, size, "(%s, \"%s\")", type_string, value_string);
|
||||||
free(value_string);
|
free(value_string);
|
||||||
return value;
|
return value;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
char* lexer_token_string(const Lexer* lexer, const Token* token)
|
||||||
|
{
|
||||||
|
return token_string(token, lexer->text);
|
||||||
|
}
|
||||||
|
|
||||||
const char* token_type_to_string(TokenType type)
|
const char* token_type_to_string(TokenType type)
|
||||||
{
|
{
|
||||||
switch (type) {
|
switch (type) {
|
||||||
@ -458,5 +504,7 @@ const char* token_type_to_string(TokenType type)
|
|||||||
return "Lt";
|
return "Lt";
|
||||||
case TokenTypeGt:
|
case TokenTypeGt:
|
||||||
return "Gt";
|
return "Gt";
|
||||||
|
default:
|
||||||
|
ASSERT_EXHAUSTIVE_MATCH();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
22
lexer.h
22
lexer.h
@ -21,8 +21,20 @@ typedef enum {
|
|||||||
|
|
||||||
TokenTypeIf,
|
TokenTypeIf,
|
||||||
TokenTypeElse,
|
TokenTypeElse,
|
||||||
|
TokenTypeLoop,
|
||||||
TokenTypeWhile,
|
TokenTypeWhile,
|
||||||
|
TokenTypeFor,
|
||||||
|
TokenTypeIn,
|
||||||
TokenTypeBreak,
|
TokenTypeBreak,
|
||||||
|
TokenTypeLet,
|
||||||
|
TokenTypeMatch,
|
||||||
|
TokenTypeFalse,
|
||||||
|
TokenTypeTrue,
|
||||||
|
TokenTypeNot,
|
||||||
|
TokenTypeAnd,
|
||||||
|
TokenTypeOr,
|
||||||
|
TokenTypeFn,
|
||||||
|
TokenTypeReturn,
|
||||||
|
|
||||||
TokenTypeLParen,
|
TokenTypeLParen,
|
||||||
TokenTypeRParen,
|
TokenTypeRParen,
|
||||||
@ -34,6 +46,9 @@ typedef enum {
|
|||||||
TokenTypeComma,
|
TokenTypeComma,
|
||||||
TokenTypeColon,
|
TokenTypeColon,
|
||||||
TokenTypeSemicolon,
|
TokenTypeSemicolon,
|
||||||
|
TokenTypeDoubleMatch,
|
||||||
|
TokenTypeAmpersand,
|
||||||
|
TokenTypeUnderscore,
|
||||||
|
|
||||||
TokenTypePlusEqual,
|
TokenTypePlusEqual,
|
||||||
TokenTypeMinusEqual,
|
TokenTypeMinusEqual,
|
||||||
@ -72,9 +87,14 @@ typedef struct {
|
|||||||
char* token_string(const Token* token, const char* text);
|
char* token_string(const Token* token, const char* text);
|
||||||
char* token_to_string(const Token* token, const char* text);
|
char* token_to_string(const Token* token, const char* text);
|
||||||
|
|
||||||
typedef struct Lexer Lexer;
|
/*
 * Lexer state. Declared in the header so callers can place a Lexer on the
 * stack and hand it to lexer_create().
 */
typedef struct {
    /* Source text being lexed; not owned by the struct as far as this header shows. */
    const char* text;
    /* Presumably the current offset into `text` -- TODO confirm against lexer.c. */
    size_t index;
    /* Presumably the text_length passed to lexer_create() -- TODO confirm. */
    size_t length;
    /* Presumably the current source line, used for token positions -- TODO confirm. */
    int line;
    /* Presumably the current source column, used for token positions -- TODO confirm. */
    int column;
} Lexer;
|
||||||
|
|
||||||
void lexer_create(Lexer* lexer, const char* text, size_t text_length);
|
void lexer_create(Lexer* lexer, const char* text, size_t text_length);
|
||||||
Token lexer_next(Lexer* lexer);
|
Token lexer_next(Lexer* lexer);
|
||||||
|
char* lexer_token_string(const Lexer* lexer, const Token* token);
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
19
main.c
19
main.c
@ -1,9 +1,26 @@
|
|||||||
|
#include "lexer.h"
|
||||||
#include <stdio.h>
|
#include <stdio.h>
|
||||||
|
#include <stdlib.h>
|
||||||
|
#include <string.h>
|
||||||
|
|
||||||
int main(void)
|
int main(void)
|
||||||
{
|
{
|
||||||
|
|
||||||
char text[]
|
char text[]
|
||||||
= "abc 123 0xFF 0b101 3.14 'a' '\\n' \"hello\" \"world\\\"\\n\" if else /* /* while */ */ "
|
= "abc 123 0xFF 0b101 3.14 'a' '\\n' \"hello\" \"world\\\"\\n\" if else /* /* while */ */ "
|
||||||
"while break (){}[].,:; += -= *= /= %= == != <= >= + - * / % % = ! < >";
|
"while break (){}[].,:; += -= *= /= %= == != <= >= + - * / % % = ! < >";
|
||||||
|
|
||||||
|
printf("text = \"%s\"\n", text);
|
||||||
|
|
||||||
|
Lexer lexer;
|
||||||
|
lexer_create(&lexer, text, strlen(text));
|
||||||
|
|
||||||
|
printf("tokens = [\n");
|
||||||
|
Token token = lexer_next(&lexer);
|
||||||
|
while (token.type != TokenTypeEof) {
|
||||||
|
char* stringified = token_to_string(&token, text);
|
||||||
|
printf(" %s\n", stringified);
|
||||||
|
free(stringified);
|
||||||
|
token = lexer_next(&lexer);
|
||||||
|
}
|
||||||
|
printf("]\n");
|
||||||
}
|
}
|
||||||
|
11
parser.c
11
parser.c
@ -1 +1,12 @@
|
|||||||
#include "parser.h"
|
#include "parser.h"
|
||||||
|
#include "lexer.h"
|
||||||
|
|
||||||
|
/*
 * Initialise `parser` to read from `lexer`.
 *
 * The first token is pulled from the lexer immediately and stored in
 * parser->current; the whole struct is overwritten.
 */
void parser_create(Parser* parser, Lexer* lexer)
{
    Token first_token = lexer_next(lexer);

    *parser = (Parser) {
        .lexer = lexer,
        .current = first_token,
    };
}
|
||||||
|
|
||||||
|
/* Placeholder: expression parsing is not implemented yet, so this does nothing. */
void parser_parse_expression(Parser* parser)
{
    (void)parser;
}
|
||||||
|
38
parser.h
38
parser.h
@ -1,4 +1,42 @@
|
|||||||
#ifndef PARSER_H
|
#ifndef PARSER_H
|
||||||
#define PARSER_H
|
#define PARSER_H
|
||||||
|
|
||||||
|
#include "lexer.h"
|
||||||
|
#include <stdint.h>
|
||||||
|
|
||||||
|
/*
 * Discriminator stored in ParsedNode.node_type.
 *
 * Enumerators keep their implicit consecutive values starting at
 * ParsedNodeTypeError == 0; do not reorder them.
 */
typedef enum {
    ParsedNodeTypeError = 0,
    ParsedNodeTypeInt,
    ParsedNodeTypeFloat,
    ParsedNodeTypeChar,
    ParsedNodeTypeString,
    ParsedNodeTypeBool,
    ParsedNodeTypeArray,
    ParsedNodeTypeDict,
    ParsedNodeTypeIf,
    ParsedNodeTypeWhile,
    ParsedNodeTypeLoop,
    ParsedNodeTypeFor,
    ParsedNodeTypeLambda,

    /* Second group, separated by a blank line in the original as well. */
    ParsedNodeTypeCall,
    ParsedNodeTypeAccess,
    ParsedNodeTypeIndex,
    ParsedNodeTypeUnary,
    ParsedNodeTypeBinary,
    ParsedNodeTypeAssign,
} ParsedNodeType;
|
||||||
|
|
||||||
|
typedef struct ParsedNode {
|
||||||
|
ParsedNodeType node_type;
|
||||||
|
} ParsedNode;
|
||||||
|
|
||||||
|
typedef struct {
|
||||||
|
Lexer* lexer;
|
||||||
|
Token current;
|
||||||
|
} Parser;
|
||||||
|
|
||||||
|
void parser_create(Parser* parser, Lexer* lexer);
|
||||||
|
void parser_parse_expression(Parser* parser);
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
Loading…
Reference in New Issue
Block a user