279 lines
11 KiB
Python
279 lines
11 KiB
Python
from tokens import TokenIterator, Token, TokenType
|
|
from typing import Optional
|
|
from position import Position, Span
|
|
|
|
|
|
class Lexer(TokenIterator):
    """Hand-written lexer that turns source text into ``Token`` objects.

    Call :meth:`next` repeatedly.  Each call skips whitespace and comments and
    returns the next token; once the input is exhausted every call returns a
    ``TokenType.Eof`` token.  Lexical problems are reported as tokens
    (``InvalidChar``, ``MalformedChar``, ``MalformedString``,
    ``MalformedComment``) rather than raised as exceptions.

    Attributes:
        text: The complete source text being lexed.
        index: Offset into ``text`` of the next unread character.
        line: 1-based line number of the character at ``index``.
        col: 1-based column number of the character at ``index``.
    """

    # Character classes.  Explicit ASCII sets are used instead of
    # ``str.lower()`` because lowercasing also maps a few non-ASCII characters
    # (e.g. the Kelvin sign U+212A) onto ASCII letters.
    _WHITESPACE = frozenset(" \t\r\n")
    _DIGITS = frozenset("0123456789")
    _ID_START = frozenset(
        "abcdefghijklmnopqrstuvwxyz"
        "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
        "_"
    )
    _ID_CONTINUE = _ID_START | _DIGITS

    # Characters that always form a complete token on their own.
    _SINGLE_CHAR_TOKENS = {
        "(": TokenType.LParen,
        ")": TokenType.RParen,
        "{": TokenType.LBrace,
        "}": TokenType.RBrace,
        "[": TokenType.LBracket,
        "]": TokenType.RBracket,
        ",": TokenType.Comma,
        ";": TokenType.Semicolon,
        "&": TokenType.Ampersand,
        # A leading "0" is always emitted as a one-character Int token.
        "0": TokenType.Int,
    }

    # Operators that may be followed by "=": char -> (plain, with "=").
    _ASSIGNABLE_TOKENS = {
        "+": (TokenType.Plus, TokenType.PlusEqual),
        "*": (TokenType.Asterisk, TokenType.AsteriskEqual),
        "%": (TokenType.Percent, TokenType.PercentEqual),
        "!": (TokenType.Exclamation, TokenType.ExclamationEqual),
        "<": (TokenType.LT, TokenType.LTEqual),
        ">": (TokenType.GT, TokenType.GTEqual),
    }

    # Second characters accepted after "-" and "=".  The enum member names
    # (MinusLT for "->", EqualLT for "=>") are used exactly as the tokens
    # module spells them.
    _MINUS_SUFFIXES = {
        "=": TokenType.MinusEqual,
        ">": TokenType.MinusLT,
    }
    _EQUAL_SUFFIXES = {
        "=": TokenType.EqualEqual,
        ">": TokenType.EqualLT,
    }

    # Reserved words (matched case-sensitively); anything else is an Id.
    _KEYWORDS = {
        "_": TokenType.Underscore,
        "false": TokenType.KwFalse,
        "true": TokenType.KwTrue,
        "not": TokenType.KwNot,
        "in": TokenType.KwIn,
        "and": TokenType.KwAnd,
        "or": TokenType.KwOr,
        "xor": TokenType.KwXor,
        "let": TokenType.KwLet,
        "mut": TokenType.KwMut,
        "if": TokenType.KwIf,
        "else": TokenType.KwElse,
        "while": TokenType.KwWhile,
        "for": TokenType.KwFor,
        "loop": TokenType.KwLoop,
        "break": TokenType.KwBreak,
        "continue": TokenType.KwContinue,
        "fn": TokenType.KwFn,
        "return": TokenType.KwReturn,
        "match": TokenType.KwMatch,
    }

    def __init__(self, text: str) -> None:
        """Start lexing ``text`` from its first character (line 1, col 1)."""
        self.text = text
        self.index = 0
        self.line = 1
        self.col = 1

    def next(self) -> Token:
        """Return the next token, skipping whitespace and comments.

        Returns:
            The next token of the input, or a ``TokenType.Eof`` token when no
            input remains.
        """
        # Trivia is skipped with a loop rather than by recursing into next(),
        # so a long run of comments cannot exhaust the recursion limit.
        while True:
            if self.done():
                return self.token(TokenType.Eof, self.pos(), self.pos())
            char = self.current()
            if char in self._WHITESPACE:
                self._skip_whitespace()
            elif char == "/":
                token = self._lex_slash()
                if token is not None:
                    return token
            else:
                return self._lex_token(char)

    def _lex_token(self, char: str) -> Token:
        """Lex the token starting at ``char`` (not whitespace and not "/")."""
        if char in self._SINGLE_CHAR_TOKENS:
            return self.single(self._SINGLE_CHAR_TOKENS[char])
        if char in self._ASSIGNABLE_TOKENS:
            plain, with_equal = self._ASSIGNABLE_TOKENS[char]
            return self.single_or_double(plain, "=", with_equal)
        if char == ".":
            return self._lex_dot()
        if char == ":":
            return self._lex_colon()
        if char == "-":
            return self._single_or_suffixed(TokenType.Minus, self._MINUS_SUFFIXES)
        if char == "=":
            return self._single_or_suffixed(TokenType.Equal, self._EQUAL_SUFFIXES)
        if char == "'":
            return self._lex_char()
        if char == "\"":
            return self._lex_string()
        # "0" was handled by the single-character table, so this is 1-9.
        if char in self._DIGITS:
            return self._lex_int()
        if char in self._ID_START:
            return self._lex_word()
        start = self.pos()
        self.step()
        return self.token(TokenType.InvalidChar, start, start)

    def _skip_whitespace(self) -> None:
        """Consume a run of spaces, tabs, carriage returns and newlines."""
        while not self.done() and self.current() in self._WHITESPACE:
            self.step()

    def _single_or_suffixed(self, single_type: TokenType, suffixes: dict) -> Token:
        """Lex a one-character token or one of several two-character forms.

        Args:
            single_type: Type emitted when no known suffix follows.
            suffixes: Maps an accepted second character to the token type of
                the resulting two-character token.
        """
        start = self.pos()
        self.step()
        if not self.done() and self.current() in suffixes:
            double_type = suffixes[self.current()]
            end = self.pos()
            self.step()
            return self.token(double_type, start, end)
        return self.token(single_type, start, start)

    def _lex_dot(self) -> Token:
        """Lex ".", "..", "..." or "..="."""
        start = self.pos()
        self.step()
        if not self.current_is("."):
            return self.token(TokenType.Dot, start, start)
        end = self.pos()
        self.step()
        if self.current_is("."):
            end = self.pos()
            self.step()
            return self.token(TokenType.DotDotDot, start, end)
        if self.current_is("="):
            end = self.pos()
            self.step()
            return self.token(TokenType.DotDotEqual, start, end)
        return self.token(TokenType.DotDot, start, end)

    def _lex_colon(self) -> Token:
        """Lex ":", "::" or "::<"."""
        start = self.pos()
        self.step()
        if not self.current_is(":"):
            # Previously emitted as Comma, which made ":" indistinguishable
            # from ",".  NOTE(review): assumes TokenType defines ``Colon``;
            # confirm against the tokens module.
            return self.token(TokenType.Colon, start, start)
        end = self.pos()
        self.step()
        if self.current_is("<"):
            end = self.pos()
            self.step()
            return self.token(TokenType.ColonColonLT, start, end)
        return self.token(TokenType.ColonColon, start, end)

    def _lex_slash(self) -> Optional[Token]:
        """Lex "/", "/=", or skip a comment that starts with "/".

        Returns:
            A token, or ``None`` when a complete comment was skipped and the
            caller should continue with whatever follows it.
        """
        start = self.pos()
        self.step()
        if self.current_is("="):
            end = self.pos()
            self.step()
            # Previously emitted as AsteriskEqual.  NOTE(review): assumes
            # TokenType defines ``SlashEqual``; confirm against the tokens
            # module.
            return self.token(TokenType.SlashEqual, start, end)
        if self.current_is("/"):
            # Line comment: runs up to, but not including, the newline.
            self.step()
            while not self.done() and self.current() != "\n":
                self.step()
            return None
        if self.current_is("*"):
            return self._skip_block_comment(start)
        return self.token(TokenType.Slash, start, start)

    def _skip_block_comment(self, start: Position) -> Optional[Token]:
        """Skip a (possibly nested) block comment whose "/" was consumed.

        Args:
            start: Position of the comment's opening "/".

        Returns:
            ``None`` when the comment is properly closed, otherwise a
            ``MalformedComment`` token spanning to the end of the input.
        """
        end = self.pos()
        self.step()  # consume the "*" of the opening delimiter
        depth = 1
        # Starts as None so the opener's own "*" cannot pair with a
        # following "/" (i.e. "/*/" does not close the comment).
        previous: Optional[str] = None
        while not self.done():
            char = self.current()
            opens = previous == "/" and char == "*"
            closes = previous == "*" and char == "/"
            if opens:
                depth += 1
            elif closes:
                depth -= 1
                if depth == 0:
                    self.step()
                    return None
            # A character that completed a delimiter must not also start
            # the next one, otherwise "/*/" would count as open + close.
            previous = None if (opens or closes) else char
            end = self.pos()
            self.step()
        return self.token(TokenType.MalformedComment, start, end)

    def _lex_char(self) -> Token:
        """Lex a character literal: one character or one backslash escape."""
        start = self.pos()
        self.step()
        if self.done():
            # Input ends right after the opening quote; report it instead of
            # indexing past the end of the text.
            return self.token(TokenType.MalformedChar, start, start)
        end = self.pos()
        first = self.current()
        self.step()
        if not self.done() and first == "\\":
            # Escape sequence: the escaped character belongs to the literal.
            end = self.pos()
            self.step()
        if not self.current_is("'"):
            return self.token(TokenType.MalformedChar, start, end)
        end = self.pos()
        self.step()
        return self.token(TokenType.Char, start, end)

    def _lex_string(self) -> Token:
        """Lex a double-quoted string literal with backslash escapes."""
        start = self.pos()
        end = self.pos()
        self.step()
        while not self.done() and self.current() != "\"":
            end = self.pos()
            first = self.current()
            self.step()
            if not self.done() and first == "\\":
                # Skip the escaped character so an escaped quote does not
                # terminate the literal.
                end = self.pos()
                self.step()
        if not self.current_is("\""):
            return self.token(TokenType.MalformedString, start, end)
        end = self.pos()
        self.step()
        return self.token(TokenType.String, start, end)

    def _lex_int(self) -> Token:
        """Lex a decimal integer that starts with a non-zero digit."""
        start = self.pos()
        end = start
        self.step()
        while not self.done() and self.current() in self._DIGITS:
            end = self.pos()
            self.step()
        return self.token(TokenType.Int, start, end)

    def _lex_word(self) -> Token:
        """Lex an identifier, keyword or the lone "_" token.

        Identifiers may start with an ASCII letter of either case or "_";
        previously an uppercase first letter was rejected as ``InvalidChar``
        even though uppercase letters were accepted everywhere else.
        """
        start = self.pos()
        end = start
        self.step()
        while not self.done() and self.current() in self._ID_CONTINUE:
            end = self.pos()
            self.step()
        value = self.text[start.index:self.index]
        return self.token(self._KEYWORDS.get(value, TokenType.Id), start, end)

    def single(self, token_type: TokenType) -> Token:
        """Consume one character and return it as a ``token_type`` token."""
        start = self.pos()
        self.step()
        return self.token(token_type, start, start)

    def single_or_double(self, type1: TokenType, char2: str, type2: TokenType) -> Token:
        """Lex a one-character token or its two-character extension.

        Args:
            type1: Type emitted when the next character is not ``char2``.
            char2: Character that extends the token to two characters.
            type2: Type emitted when ``char2`` follows.
        """
        start = self.pos()
        self.step()
        if self.current_is(char2):
            end = self.pos()
            self.step()
            return self.token(type2, start, end)
        return self.token(type1, start, start)

    def token(self, token_type: TokenType, start: Position, end: Position) -> Token:
        """Build a token that begins at ``start`` and ends at the cursor.

        The length is measured from ``start.index`` to the current index, so
        this must be called after the token's last character was consumed.
        ``end`` is only used for the token's span.
        """
        return Token(token_type, start.index, self.index - start.index, Span(start, end))

    def pos(self) -> Position:
        """Return the current cursor location (index, line, col)."""
        return Position(self.index, self.line, self.col)

    def step(self) -> None:
        """Advance the cursor by one character, updating line and column.

        The line/column update is driven by the character being stepped
        over, so a newline ends its own line and the character after it is
        reported at column 1 of the next line.
        """
        if not self.done():
            if self.current() == "\n":
                self.line += 1
                self.col = 1
            else:
                self.col += 1
        self.index += 1

    def current_is(self, value: str) -> bool:
        """Return True if input remains and the current character is ``value``."""
        return not self.done() and self.current() == value

    def done(self) -> bool:
        """Return True once the cursor has moved past the last character."""
        return self.index >= len(self.text)

    def current(self) -> str:
        """Return the character under the cursor.

        Raises:
            IndexError: If the cursor is past the end of the text; check
                :meth:`done` first.
        """
        return self.text[self.index]