parrot/lexer.py

279 lines
11 KiB
Python
Raw Permalink Normal View History

2023-04-06 03:17:57 +01:00
from tokens import TokenIterator, Token, TokenType
from typing import Optional
from position import Position, Span
class Lexer(TokenIterator):
    """Hand-written lexer that turns source text into tokens, one per call.

    Whitespace, ``//`` line comments and (nestable) ``/* ... */`` block
    comments are skipped.  Problems in the input never raise; they are
    reported as ``Malformed*`` / ``InvalidChar`` tokens instead.

    Every token carries a ``Span(start, end)`` where ``end`` is the position
    of the token's *last* character (inclusive), not one past it.

    Attributes:
        text: the complete source text being lexed.
        index: offset into ``text`` of the next unread character.
        line: 1-based line number of the character at ``index``.
        col: 1-based column number of the character at ``index``.
    """

    _WHITESPACE = frozenset(" \t\r\n")
    _DIGITS = frozenset("0123456789")
    # Explicit ASCII sets: ``str.lower()`` also maps some non-ASCII characters
    # (e.g. U+212A KELVIN SIGN) onto ASCII letters, which must not be accepted.
    _ID_START = frozenset(
        "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ_"
    )
    _ID_CONTINUE = _ID_START | _DIGITS

    # Characters that always form a token on their own.
    _SINGLE_CHAR_TOKENS = {
        "(": TokenType.LParen,
        ")": TokenType.RParen,
        "{": TokenType.LBrace,
        "}": TokenType.RBrace,
        "[": TokenType.LBracket,
        "]": TokenType.RBracket,
        ",": TokenType.Comma,
        ";": TokenType.Semicolon,
        "&": TokenType.Ampersand,
    }

    # Operators whose only two-character form is "<op>=":
    # char -> (token for "<op>", token for "<op>=").
    _EQUAL_SUFFIX_TOKENS = {
        "+": (TokenType.Plus, TokenType.PlusEqual),
        "*": (TokenType.Asterisk, TokenType.AsteriskEqual),
        "%": (TokenType.Percent, TokenType.PercentEqual),
        "!": (TokenType.Exclamation, TokenType.ExclamationEqual),
        "<": (TokenType.LT, TokenType.LTEqual),
        ">": (TokenType.GT, TokenType.GTEqual),
    }

    # Reserved words (and the lone underscore); anything else is an ``Id``.
    _KEYWORDS = {
        "_": TokenType.Underscore,
        "false": TokenType.KwFalse,
        "true": TokenType.KwTrue,
        "not": TokenType.KwNot,
        "in": TokenType.KwIn,
        "and": TokenType.KwAnd,
        "or": TokenType.KwOr,
        "xor": TokenType.KwXor,
        "let": TokenType.KwLet,
        "mut": TokenType.KwMut,
        "if": TokenType.KwIf,
        "else": TokenType.KwElse,
        "while": TokenType.KwWhile,
        "for": TokenType.KwFor,
        "loop": TokenType.KwLoop,
        "break": TokenType.KwBreak,
        "continue": TokenType.KwContinue,
        "fn": TokenType.KwFn,
        "return": TokenType.KwReturn,
        "match": TokenType.KwMatch,
    }

    def __init__(self, text: str) -> None:
        """Create a lexer positioned at the first character of ``text``."""
        self.text = text
        self.index = 0
        self.line = 1
        self.col = 1

    def next(self) -> Token:
        """Return the next token, or an ``Eof`` token once input is exhausted.

        Whitespace and comments are skipped in a loop rather than by
        recursion, so arbitrarily long runs of comments cannot exhaust the
        interpreter's recursion limit.
        """
        while not self.done():
            char = self.current()
            if char in self._WHITESPACE:
                self.step()
                continue
            if char == "/":
                token = self._lex_slash()
                if token is None:  # a complete comment was skipped
                    continue
                return token
            return self._lex_token(char)
        return self.token(TokenType.Eof, self.pos(), self.pos())

    def _lex_token(self, char: str) -> Token:
        """Lex the token starting at ``char`` (not whitespace and not "/")."""
        if char in self._SINGLE_CHAR_TOKENS:
            return self.single(self._SINGLE_CHAR_TOKENS[char])
        if char in self._EQUAL_SUFFIX_TOKENS:
            plain, with_equal = self._EQUAL_SUFFIX_TOKENS[char]
            return self.single_or_double(plain, "=", with_equal)
        if char == ".":
            return self._lex_dots()
        if char == ":":
            return self._lex_colons()
        if char == "-":
            return self._lex_minus()
        if char == "=":
            return self._lex_equal()
        if char == "'":
            return self._lex_char()
        if char == "\"":
            return self._lex_string()
        if char == "0":
            # A leading zero is always a complete Int on its own, so "012"
            # lexes as Int("0") followed by Int("12").
            return self.single(TokenType.Int)
        if char in self._DIGITS:
            return self._lex_int()
        if char in self._ID_START:
            return self._lex_word()
        return self.single(TokenType.InvalidChar)

    def _take(self) -> Position:
        """Consume the current character and return the position it was at."""
        position = self.pos()
        self.step()
        return position

    def _lex_dots(self) -> Token:
        """Lex ".", "..", "..." or "..="."""
        start = self._take()
        if not self.current_is("."):
            return self.token(TokenType.Dot, start, start)
        end = self._take()
        if self.current_is("."):
            end = self._take()
            return self.token(TokenType.DotDotDot, start, end)
        if self.current_is("="):
            end = self._take()
            return self.token(TokenType.DotDotEqual, start, end)
        return self.token(TokenType.DotDot, start, end)

    def _lex_colons(self) -> Token:
        """Lex ":", "::" or "::<"."""
        start = self._take()
        if not self.current_is(":"):
            # Previously reported as TokenType.Comma, which made ":" and ","
            # indistinguishable downstream.
            # NOTE(review): assumes TokenType.Colon is declared next to
            # ColonColon in the tokens module - confirm.
            return self.token(TokenType.Colon, start, start)
        end = self._take()
        if self.current_is("<"):
            end = self._take()
            return self.token(TokenType.ColonColonLT, start, end)
        return self.token(TokenType.ColonColon, start, end)

    def _lex_minus(self) -> Token:
        """Lex "-", "-=" or "->"."""
        start = self._take()
        if self.current_is("="):
            end = self._take()
            return self.token(TokenType.MinusEqual, start, end)
        if self.current_is(">"):
            end = self._take()
            # The enum member for "->" is named MinusLT; name kept as-is
            # because TokenType is defined outside this file.
            return self.token(TokenType.MinusLT, start, end)
        return self.token(TokenType.Minus, start, start)

    def _lex_equal(self) -> Token:
        """Lex "=", "==" or "=>"."""
        start = self._take()
        if self.current_is("="):
            end = self._take()
            return self.token(TokenType.EqualEqual, start, end)
        if self.current_is(">"):
            end = self._take()
            # The enum member for "=>" is named EqualLT; name kept as-is
            # because TokenType is defined outside this file.
            return self.token(TokenType.EqualLT, start, end)
        return self.token(TokenType.Equal, start, start)

    def _lex_slash(self) -> Optional[Token]:
        """Lex "/", "/=", or skip a comment.

        Returns:
            The token that was lexed, or ``None`` when a complete comment was
            consumed and the caller should keep scanning.
        """
        start = self._take()
        if self.current_is("="):
            end = self._take()
            # Previously reported as AsteriskEqual (copy-paste from "*=").
            # NOTE(review): assumes TokenType.SlashEqual is declared next to
            # the other compound-assignment tokens - confirm.
            return self.token(TokenType.SlashEqual, start, end)
        if self.current_is("/"):
            # Line comment: runs up to, but not including, the newline.
            while not self.done() and self.current() != "\n":
                self.step()
            return None
        if self.current_is("*"):
            return self._skip_block_comment(start)
        return self.token(TokenType.Slash, start, start)

    def _skip_block_comment(self, start: Position) -> Optional[Token]:
        """Skip a block comment; the lexer must be on the "*" of its "/*".

        Block comments nest.  Returns ``None`` when the comment is properly
        closed, or a ``MalformedComment`` token spanning from ``start`` to the
        last character of the input when it is not.
        """
        end = self._take()  # the "*" of the opening "/*"
        depth = 1
        last_char: Optional[str] = None
        while not self.done():
            char = self.current()
            end = self._take()
            if last_char == "/" and char == "*":
                depth += 1
                # Forget the "*": a character may belong to one delimiter
                # only, otherwise "/*/" would both open and close a level.
                last_char = None
            elif last_char == "*" and char == "/":
                depth -= 1
                if depth == 0:
                    return None
                last_char = None
            else:
                last_char = char
        return self.token(TokenType.MalformedComment, start, end)

    def _lex_char(self) -> Token:
        """Lex a character literal such as 'a' or an escaped one like '\\n'.

        Yields ``MalformedChar`` when the closing quote is missing, including
        when the input ends right after the opening quote.
        """
        start = self._take()  # opening quote
        end = start
        if not self.done():
            first = self.current()
            end = self._take()
            if first == "\\" and not self.done():
                # The character after a backslash is part of the literal,
                # whatever it is (including a quote).
                end = self._take()
        if not self.current_is("'"):
            return self.token(TokenType.MalformedChar, start, end)
        end = self._take()
        return self.token(TokenType.Char, start, end)

    def _lex_string(self) -> Token:
        """Lex a double-quoted string literal, honouring backslash escapes.

        Yields ``MalformedString`` when the input ends before the closing
        quote.
        """
        start = self._take()  # opening quote
        end = start
        while not self.done() and self.current() != "\"":
            first = self.current()
            end = self._take()
            if first == "\\" and not self.done():
                # Skip the escaped character so an escaped quote does not
                # terminate the literal.
                end = self._take()
        if not self.current_is("\""):
            return self.token(TokenType.MalformedString, start, end)
        end = self._take()
        return self.token(TokenType.String, start, end)

    def _lex_int(self) -> Token:
        """Lex a decimal integer literal that starts with a non-zero digit."""
        start = self._take()
        end = start
        while not self.done() and self.current() in self._DIGITS:
            end = self._take()
        return self.token(TokenType.Int, start, end)

    def _lex_word(self) -> Token:
        """Lex an identifier, a keyword or the lone underscore."""
        start = self._take()
        end = start
        while not self.done() and self.current() in self._ID_CONTINUE:
            end = self._take()
        word = self.text[start.index:self.index]
        return self.token(self._KEYWORDS.get(word, TokenType.Id), start, end)

    def single(self, token_type: TokenType) -> Token:
        """Consume one character and return it as a ``token_type`` token."""
        start = self._take()
        return self.token(token_type, start, start)

    def single_or_double(self, type1: TokenType, char2: str, type2: TokenType) -> Token:
        """Consume one character, plus ``char2`` if it follows immediately.

        Returns a ``type2`` token when ``char2`` was consumed as well,
        otherwise a one-character ``type1`` token.
        """
        start = self._take()
        if self.current_is(char2):
            end = self._take()
            return self.token(type2, start, end)
        return self.token(type1, start, start)

    def token(self, token_type: TokenType, start: Position, end: Position) -> Token:
        """Build a token that begins at ``start`` and ends at ``end``.

        The token length is measured from ``start`` to the current index, so
        this must be called right after the token's last character has been
        consumed.
        """
        return Token(token_type, start.index, self.index - start.index, Span(start, end))

    def pos(self) -> Position:
        """Return the position (index, line, column) of the current character."""
        return Position(self.index, self.line, self.col)

    def step(self) -> None:
        """Advance past the current character, keeping line/column in sync.

        The line counter is bumped when a newline is stepped *over*, so a
        newline belongs to the line it terminates and the first character of
        the following line is at column 1.  Does nothing at end of input, so
        ``index`` never runs past ``len(text)``.
        """
        if self.done():
            return
        if self.current() == "\n":
            self.line += 1
            self.col = 1
        else:
            self.col += 1
        self.index += 1

    def current_is(self, value: str) -> bool:
        """Return True if input remains and the current character is ``value``."""
        return not self.done() and self.current() == value

    def done(self) -> bool:
        """Return True once every character of the text has been consumed."""
        return self.index >= len(self.text)

    def current(self) -> str:
        """Return the current character; raises IndexError at end of input."""
        return self.text[self.index]