roll own parser

This commit is contained in:
Simon 2024-07-19 01:46:30 +02:00
parent ff45df6bc1
commit a307f3578a
3 changed files with 301 additions and 0 deletions

5
.clangd Normal file
View File

@ -0,0 +1,5 @@
# Strip the GCC C++ modules flags from the compile command handed to clangd.
CompileFlags:
  Remove:
    - '-fmodules-ts'
    - '-fmodule-mapper=CMakeFiles/stela.dir/main.cpp.o.modmap'
    - '-fdeps-format=p1689r5'

154
parser.cpp Normal file
View File

@ -0,0 +1,154 @@
#include "parser.hpp"
#include <cstdlib>
using namespace stela;
/// Reports whether `ch` lies within the inclusive range [`begin`, `end`].
static inline auto in_range(char ch, char begin, char end) -> bool
{
    if (ch < begin) {
        return false;
    }
    return ch <= end;
}
/// Reports whether `ch` is a whitespace character the lexer skips.
///
/// Carriage return is included so that sources with CRLF line endings do not
/// produce an "unrecognized character" error at the end of every line.
static inline auto whitespace_char(char ch) -> bool
{
    switch (ch) {
    case ' ':
    case '\t':
    case '\n':
    case '\r':
        return true;
    default:
        return false;
    }
}
/// Reports whether `ch` may begin an identifier: an ASCII letter or '_'.
static inline auto id_start_char(char ch) -> bool
{
    if (ch == '_') {
        return true;
    }
    const bool is_lower = in_range(ch, 'a', 'z');
    const bool is_upper = in_range(ch, 'A', 'Z');
    return is_lower or is_upper;
}
/// Reports whether `ch` may appear after the first character of an
/// identifier: an ASCII letter, a decimal digit, or '_'.
static inline auto id_char(char ch) -> bool
{
    // The digit range must cover all of '0'..'9'; stopping at '1' would cut
    // an identifier such as `x2` short after the `x`.
    return in_range(ch, 'a', 'z') or in_range(ch, 'A', 'Z')
        or in_range(ch, '0', '9') or ch == '_';
}
/// Scans the input and returns the next token.
///
/// Whitespace, `#` line comments and `//` line comments are skipped.
/// Identifiers are looked up in `keyword_map`; a match yields the keyword's
/// token, otherwise the text is appended to `symbol_values` and an `Id` token
/// carrying its index is returned. Integer literals are appended to
/// `int_values` and returned as `Int` tokens carrying their index. Input that
/// cannot be tokenized records an `Error` and yields an `Error` token.
/// Once the input is exhausted every call returns an `Eof` token.
auto Lexer::next() -> Token
{
    // Loop instead of recursing so that long runs of whitespace or comment
    // lines cannot grow the call stack.
    while (true) {
        const auto pos = this->pos();
        if (done()) {
            return token(TokenType::Eof, pos);
        }
        const char ch = current();
        if (whitespace_char(ch)) {
            step();
            continue;
        }
        if (id_start_char(ch)) {
            std::string value;
            value.push_back(ch);
            step();
            while (not done() and id_char(current())) {
                value.push_back(current());
                step();
            }
            // Single lookup instead of contains() followed by operator[].
            const auto keyword = this->keyword_map.find(value);
            if (keyword != this->keyword_map.end()) {
                return token(keyword->second, pos);
            }
            const size_t id = this->symbol_values.size();
            this->symbol_values.push_back(value);
            return Token { TokenType::Id, pos, id };
        }
        if (in_range(ch, '0', '9')) {
            std::string digits;
            digits.push_back(ch);
            step();
            // A leading '0' is a complete literal on its own; any other digit
            // may be followed by more digits. The loop has to inspect the
            // character under the cursor, not the first digit.
            if (ch != '0') {
                while (not done() and in_range(current(), '0', '9')) {
                    digits.push_back(current());
                    step();
                }
            }
            // strtoll saturates on overflow; no error is reported for that.
            const int64_t int_value
                = std::strtoll(digits.c_str(), nullptr, 10);
            // Integer payloads live in int_values, so the id indexes that
            // table and the token is tagged Int rather than Id.
            const size_t id = this->int_values.size();
            this->int_values.push_back(int_value);
            return Token { TokenType::Int, pos, id };
        }
        if (ch == '"') {
            // TODO string
            // Until implemented, '"' falls through to the
            // "unrecognized character" error below.
        }
        if (ch == '#') {
            while (not done() and current() != '\n') {
                step();
            }
            continue;
        }
        if (ch == '/') {
            step();
            if (not done() and current() == '/') {
                while (not done() and current() != '\n') {
                    step();
                }
                continue;
            }
            return error_token(pos, "'/' not implemented");
        }
        if (ch == '-') {
            step();
            // "->" is represented by the token type named MinusLt.
            if (not done() and current() == '>') {
                step();
                return token(TokenType::MinusLt, pos);
            }
            return token(TokenType::Minus, pos);
        }
        if (ch == ':') {
            step();
            if (not done() and current() == ':') {
                step();
                return token(TokenType::ColonColon, pos);
            }
            return token(TokenType::Colon, pos);
        }
        switch (ch) {
        case '(':
            return single_token(TokenType::LParen, pos);
        case ')':
            return single_token(TokenType::RParen, pos);
        case '{':
            return single_token(TokenType::LBrace, pos);
        case '}':
            return single_token(TokenType::RBrace, pos);
        case '[':
            return single_token(TokenType::LBracket, pos);
        case ']':
            return single_token(TokenType::RBracket, pos);
        case '.':
            return single_token(TokenType::Dot, pos);
        case ',':
            return single_token(TokenType::Comma, pos);
        case ';':
            return single_token(TokenType::Semicolon, pos);
        }
        // Consume the offending character so the next call makes progress.
        step();
        return error_token(pos, "unrecognized character");
    }
}
auto Lexer::populate_keyword_map()
{
this->keyword_map["error"] = TokenType::Error;
this->keyword_map["eof"] = TokenType::Eof;
this->keyword_map["if"] = TokenType::If;
this->keyword_map["else"] = TokenType::Else;
this->keyword_map["return"] = TokenType::Return;
this->keyword_map["public"] = TokenType::Public;
this->keyword_map["private"] = TokenType::Private;
this->keyword_map["class"] = TokenType::Class;
this->keyword_map["derivable"] = TokenType::Derivable;
this->keyword_map["derives"] = TokenType::Derives;
this->keyword_map["enumeration"] = TokenType::Enumeration;
this->keyword_map["associate"] = TokenType::Associate;
this->keyword_map["attribute"] = TokenType::Attribute;
this->keyword_map["operation"] = TokenType::Operation;
this->keyword_map["state_machine"] = TokenType::StateMachine;
this->keyword_map["transition"] = TokenType::Transition;
this->keyword_map["initial"] = TokenType::Initial;
this->keyword_map["final"] = TokenType::Final;
this->keyword_map["entry"] = TokenType::Entry;
this->keyword_map["exit"] = TokenType::Exit;
}

142
parser.hpp Normal file
View File

@ -0,0 +1,142 @@
#pragma once
#include <cstdint>
#include <fstream>
#include <string>
#include <unordered_map>
#include <vector>
namespace stela {
/// A location in the lexer's input.
struct Pos {
// Number of characters consumed before this position (the lexer starts at 0).
size_t index;
// Line number; the lexer starts at 1 and increments it after each '\n'.
int64_t line;
// Column number; the lexer starts at 1 and resets it to 1 after each '\n'.
int64_t col;
};
/// A diagnostic: a message attached to the input position it refers to.
struct Error {
// Position the diagnostic refers to.
Pos pos;
// Human-readable description of the problem.
std::string message;
};
/// Kinds of token the lexer can hand out.
enum class TokenType {
// Returned by Lexer::error_token after an Error has been recorded.
Error,
// Returned by Lexer::next once the input is exhausted.
Eof,
// Keywords.
If,
Else,
Return,
Public,
Private,
Class,
Derivable,
Derives,
Enumeration,
Associate,
Attribute,
Operation,
StateMachine,
Transition,
Initial,
Final,
Entry,
Exit,
// Punctuation.
LParen,
RParen,
LBrace,
RBrace,
LBracket,
RBracket,
Dot,
Comma,
Semicolon,
Colon,
ColonColon,
Minus,
// Despite the name, the lexer emits this for the two-character token "->".
MinusLt,
Equal,
// Identifier and literal tokens.
Id,
Int,
};
/// A single token produced by the lexer.
struct Token {
// Kind of token.
TokenType type;
// Position of the token's first character.
Pos pos;
// Payload index assigned by the lexer, which keeps identifier text and
// integer values in side tables; 0 for tokens built without a payload.
uint64_t id;
};
/// Character-at-a-time lexer over an input file stream.
///
/// The lexer keeps one character of lookahead (`current_char`) together with
/// the position of that character, and appends diagnostics to an externally
/// owned error list.
class Lexer {
public:
    /// Binds the lexer to `file` and `errors` and reads the first character.
    ///
    /// Both arguments are stored as pointers, so they must outlive the lexer.
    Lexer(std::ifstream& file, std::vector<Error>& errors)
        : file(&file)
        , current_char(file.get())
        , errors(&errors)
    {
        // An empty or unreadable stream is exhausted before any step(); flag
        // it now so the first next() returns Eof instead of lexing the
        // end-of-file marker as a character.
        this->eof_reached = at_eof();
    }

    /// Returns the next token of the input (defined out of line).
    auto next() -> Token;

private:
    /// Reports whether the lookahead holds the stream's end-of-file marker.
    /// The comparison is done on the int value so a real 0xFF byte is not
    /// mistaken for end of input.
    auto at_eof() const -> bool
    {
        return this->current_char == std::ifstream::traits_type::eof();
    }

    /// Consumes the current character, updating line/column/index, and reads
    /// the next one. Does nothing once the input is exhausted.
    auto step()
    {
        if (this->eof_reached) {
            return;
        }
        if (this->current_char == '\n') {
            this->line += 1;
            this->col = 1;
        } else {
            this->col += 1;
        }
        this->current_char = this->file->get();
        this->index += 1;
        // Record exhaustion as soon as it is observed; otherwise done() stays
        // false for one extra call and the end-of-file marker is reported as
        // an "unrecognized character".
        this->eof_reached = at_eof();
    }

    /// Consumes the current character and returns a payload-free token of
    /// `type` located at `pos`.
    auto single_token(TokenType type, Pos pos) -> Token
    {
        step();
        return token(type, pos);
    }

    /// Records `message` at `pos` in the error list and returns an Error
    /// token for that position.
    auto error_token(Pos pos, std::string message) const -> Token
    {
        this->errors->push_back(Error { pos, message });
        return token(TokenType::Error, pos);
    }

    /// Builds a token of `type` at `pos` with no payload (id 0).
    auto token(TokenType type, Pos pos) const -> Token
    {
        return Token { type, pos, 0 };
    }

    /// Returns the position of the current lookahead character.
    auto pos() const -> Pos
    {
        return Pos {
            .index = this->index,
            .line = this->line,
            .col = this->col,
        };
    }

    /// Reports whether the input is exhausted.
    auto done() const -> bool { return this->eof_reached; }

    /// Returns the lookahead character; only meaningful while !done().
    auto current() const -> char
    {
        return static_cast<char>(this->current_char);
    }

    /// Fills `keyword_map` (defined out of line).
    /// NOTE(review): nothing in this class calls it, and its deduced return
    /// type makes it unusable before its definition. Confirm it is invoked
    /// somewhere; otherwise keyword_map stays empty and every keyword is
    /// lexed as an identifier.
    auto populate_keyword_map();

    // True once the end of the stream has been observed.
    bool eof_reached = false;
    // Position of the lookahead character.
    size_t index = 0;
    int64_t line = 1;
    int64_t col = 1;
    // Input stream; not owned.
    std::ifstream* file;
    // Lookahead as returned by get(): a character value or the eof marker.
    int current_char;
    // Keyword spelling -> token type.
    std::unordered_map<std::string, TokenType> keyword_map {};
    // Side tables indexed by Token::id.
    std::vector<std::string> symbol_values {};
    std::vector<int64_t> int_values {};
    // Diagnostics sink; not owned.
    std::vector<Error>* errors;
};
}