From a307f3578a386ca73d4eedf4cc7a178a7d71e0d9 Mon Sep 17 00:00:00 2001
From: Simon
Date: Fri, 19 Jul 2024 01:46:30 +0200
Subject: [PATCH] roll own parser

---
Review notes (free text between '---' and the first 'diff --git' line is not
part of the commit message):

- Layout: one patch row per line again; every added blank line is a lone '+'.
  Hunk sizes are unchanged (5 / 154 / 142 added lines).
- Angle-bracket tokens had been stripped from this copy. Template arguments
  and static_cast<char> are restored from how the members are used. The
  header names (<cstdlib>, <cstdint>, <fstream>, <string>, <unordered_map>,
  <vector>) are inferred from usage -- TODO confirm against the original
  commit. The author e-mail on the From: line is also missing and is NOT
  reconstructed here.
- The 'index' blob ids below are the original ones and no longer match the
  amended file contents.
- Lexer fixes, all line-count-neutral:
  * id_char() accepts the digits '0'..'9' (was '0'..'1').
  * The integer-literal loop tests current() instead of the stale first
    character, so it stops at the first non-digit.
  * Integer literals produce TokenType::Int with an index into int_values
    (was TokenType::Id with an index derived from symbol_values).
  * populate_keyword_map() is declared '-> void' and called from the
    constructor; previously it was never called, so keyword_map stayed empty
    and every keyword lexed as an identifier.
  * done() reports end of input as soon as the look-ahead character is EOF;
    previously one extra step() was needed, which produced a spurious
    "unrecognized character" error before the Eof token. eof_reached is
    still set by step() but is no longer read.

 .clangd    |   5 ++
 parser.cpp | 154 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 parser.hpp | 142 ++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 301 insertions(+)
 create mode 100644 .clangd
 create mode 100644 parser.cpp
 create mode 100644 parser.hpp

diff --git a/.clangd b/.clangd
new file mode 100644
index 0000000..d567e73
--- /dev/null
+++ b/.clangd
@@ -0,0 +1,5 @@
+CompileFlags:
+  Remove:
+    - '-fmodules-ts'
+    - '-fmodule-mapper=CMakeFiles/stela.dir/main.cpp.o.modmap'
+    - '-fdeps-format=p1689r5'
diff --git a/parser.cpp b/parser.cpp
new file mode 100644
index 0000000..6930f7c
--- /dev/null
+++ b/parser.cpp
@@ -0,0 +1,154 @@
+#include "parser.hpp"
+#include <cstdlib>
+
+using namespace stela;
+
+static inline auto in_range(char ch, char begin, char end) -> bool
+{
+    return ch >= begin && ch <= end;
+}
+
+static inline auto whitespace_char(char ch) -> bool
+{
+    return ch == ' ' or ch == '\t' or ch == '\n';
+}
+
+static inline auto id_start_char(char ch) -> bool
+{
+    return in_range(ch, 'a', 'z') or in_range(ch, 'A', 'Z') or ch == '_';
+}
+
+static inline auto id_char(char ch) -> bool
+{
+    return in_range(ch, 'a', 'z') or in_range(ch, 'A', 'Z')
+        or in_range(ch, '0', '9') or ch == '_';
+}
+
+auto Lexer::next() -> Token
+{
+    auto pos = this->pos();
+    if (done()) {
+        return token(TokenType::Eof, pos);
+    }
+    char ch = current();
+    if (whitespace_char(ch)) {
+        step();
+        return next();
+    }
+    if (id_start_char(ch)) {
+        std::string value;
+        value.push_back(ch);
+        step();
+        while (not done() and id_char(current())) {
+            value.push_back(current());
+            step();
+        }
+        if (this->keyword_map.contains(value)) {
+            return token(this->keyword_map[value], pos);
+        }
+        size_t id = this->symbol_values.size();
+        this->symbol_values.push_back(value);
+        return Token { TokenType::Id, pos, id };
+    }
+    if (in_range(ch, '1', '9')) {
+        std::string value;
+        value.push_back(ch);
+        step();
+        while (not done() and in_range(current(), '0', '9')) {
+            value.push_back(current());
+            step();
+        }
+        int64_t int_value = std::strtoll(value.c_str(), nullptr, 10);
+        size_t id = this->int_values.size();
+        this->int_values.push_back(int_value);
+        return Token { TokenType::Int, pos, id };
+    }
+    if (ch == '0') {
+        step();
+        int64_t int_value = 0;
+        size_t id = this->int_values.size();
+        this->int_values.push_back(int_value);
+        return Token { TokenType::Int, pos, id };
+    }
+    if (ch == '"') {
+        // TODO string
+    }
+    if (ch == '#') {
+        while (not done() and current() != '\n') {
+            step();
+        }
+        return next();
+    }
+    if (ch == '/') {
+        step();
+        if (current() == '/') {
+            while (not done() and current() != '\n') {
+                step();
+            }
+            return next();
+        }
+        return error_token(pos, "'/' not implemented");
+    }
+    if (ch == '-') {
+        step();
+        if (not done() and current() == '>') {
+            step();
+            return token(TokenType::MinusLt, pos);
+        }
+        return token(TokenType::Minus, pos);
+    }
+    if (ch == ':') {
+        step();
+        if (current() == ':') {
+            step();
+            return token(TokenType::ColonColon, pos);
+        }
+        return token(TokenType::Colon, pos);
+    }
+    switch (ch) {
+    case '(':
+        return single_token(TokenType::LParen, pos);
+    case ')':
+        return single_token(TokenType::RParen, pos);
+    case '{':
+        return single_token(TokenType::LBrace, pos);
+    case '}':
+        return single_token(TokenType::RBrace, pos);
+    case '[':
+        return single_token(TokenType::LBracket, pos);
+    case ']':
+        return single_token(TokenType::RBracket, pos);
+    case '.':
+        return single_token(TokenType::Dot, pos);
+    case ',':
+        return single_token(TokenType::Comma, pos);
+    case ';':
+        return single_token(TokenType::Semicolon, pos);
+    }
+    step();
+    return error_token(pos, "unrecognized character");
+}
+
+auto Lexer::populate_keyword_map() -> void
+{
+    this->keyword_map["error"] = TokenType::Error;
+    this->keyword_map["eof"] = TokenType::Eof;
+    this->keyword_map["if"] = TokenType::If;
+    this->keyword_map["else"] = TokenType::Else;
+    this->keyword_map["return"] = TokenType::Return;
+    this->keyword_map["public"] = TokenType::Public;
+    this->keyword_map["private"] = TokenType::Private;
+    this->keyword_map["class"] = TokenType::Class;
+    this->keyword_map["derivable"] = TokenType::Derivable;
+    this->keyword_map["derives"] = TokenType::Derives;
+    this->keyword_map["enumeration"] = TokenType::Enumeration;
+    this->keyword_map["associate"] = TokenType::Associate;
+    this->keyword_map["attribute"] = TokenType::Attribute;
+    this->keyword_map["operation"] = TokenType::Operation;
+    this->keyword_map["state_machine"] = TokenType::StateMachine;
+    this->keyword_map["transition"] = TokenType::Transition;
+    this->keyword_map["initial"] = TokenType::Initial;
+    this->keyword_map["final"] = TokenType::Final;
+    this->keyword_map["entry"] = TokenType::Entry;
+    this->keyword_map["exit"] = TokenType::Exit;
+}
diff --git a/parser.hpp b/parser.hpp
new file mode 100644
index 0000000..02d6964
--- /dev/null
+++ b/parser.hpp
@@ -0,0 +1,142 @@
+#pragma once
+
+#include <cstdint>
+#include <fstream>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+namespace stela {
+
+struct Pos {
+    size_t index;
+    int64_t line;
+    int64_t col;
+};
+
+struct Error {
+    Pos pos;
+    std::string message;
+};
+
+enum class TokenType {
+    Error,
+    Eof,
+    If,
+    Else,
+    Return,
+    Public,
+    Private,
+    Class,
+    Derivable,
+    Derives,
+    Enumeration,
+    Associate,
+    Attribute,
+    Operation,
+    StateMachine,
+    Transition,
+    Initial,
+    Final,
+    Entry,
+    Exit,
+    LParen,
+    RParen,
+    LBrace,
+    RBrace,
+    LBracket,
+    RBracket,
+    Dot,
+    Comma,
+    Semicolon,
+    Colon,
+    ColonColon,
+    Minus,
+    MinusLt,
+    Equal,
+    Id,
+    Int,
+};
+
+struct Token {
+    TokenType type;
+    Pos pos;
+    uint64_t id;
+};
+
+class Lexer {
+public:
+    Lexer(std::ifstream& file, std::vector<Error>& errors)
+        : file(&file)
+        , current_char(file.get())
+        , errors(&errors)
+    { populate_keyword_map(); }
+
+    auto next() -> Token;
+
+private:
+    inline auto step()
+    {
+        if (this->current_char == EOF) {
+            this->eof_reached = true;
+            return;
+        }
+        if (this->current_char == '\n') {
+            this->line += 1;
+            this->col = 1;
+        } else {
+            this->col += 1;
+        }
+        this->current_char = this->file->get();
+        this->index += 1;
+    }
+
+    inline auto single_token(TokenType type, Pos pos) -> Token
+    {
+        step();
+        return token(type, pos);
+    }
+    inline auto error_token(Pos pos, std::string message) const -> Token
+    {
+        this->errors->push_back(Error { pos, message });
+        return token(TokenType::Error, pos);
+    }
+    inline auto token(TokenType type, Pos pos) const -> Token
+    {
+        return Token { type, pos, 0 };
+    }
+    inline auto pos() const -> Pos
+    {
+        return Pos {
+            .index = this->index,
+            .line = this->line,
+            .col = this->col,
+        };
+    }
+
+    inline auto done() const -> bool { return this->current_char == EOF; }
+    inline auto current() const -> char
+    {
+        return static_cast<char>(this->current_char);
+    }
+
+    auto populate_keyword_map() -> void;
+
+    bool eof_reached = false;
+
+    size_t index = 0;
+    int64_t line = 1;
+    int64_t col = 1;
+
+    std::ifstream* file;
+    int current_char;
+
+    std::unordered_map<std::string, TokenType> keyword_map {};
+
+    std::vector<std::string> symbol_values {};
+    std::vector<int64_t> int_values {};
+
+    std::vector<Error>* errors;
+};
+
+}