"""Hand-written lexer that turns source text into a stream of tokens."""

from typing import Optional

from tokens import TokenIterator, Token, TokenType
from position import Position, Span

# Characters skipped between tokens.
_WHITESPACE = " \t\r\n"

# Characters accepted as the first / subsequent characters of a word.  The
# continuation test is applied to the lower-cased character, so upper-case
# letters are accepted after the first character.
# NOTE(review): an upper-case letter cannot *start* a word (it lexes as
# InvalidChar); confirm that this asymmetry is intended.
_WORD_START = "abcdefghijklmnopqrstuvwxyz_"
_WORD_CONTINUE = "1234567890abcdefghijklmnopqrstuvwxyz_"

# Characters that always form a one-character token.
_SINGLE_CHAR_TOKENS = {
    "(": TokenType.LParen,
    ")": TokenType.RParen,
    "{": TokenType.LBrace,
    "}": TokenType.RBrace,
    "[": TokenType.LBracket,
    "]": TokenType.RBracket,
    ",": TokenType.Comma,
    ";": TokenType.Semicolon,
    "&": TokenType.Ampersand,
}

# Characters that form one token alone and another when followed by "=".
_OPTIONAL_EQUAL_TOKENS = {
    "+": (TokenType.Plus, TokenType.PlusEqual),
    "*": (TokenType.Asterisk, TokenType.AsteriskEqual),
    "%": (TokenType.Percent, TokenType.PercentEqual),
    "!": (TokenType.Exclamation, TokenType.ExclamationEqual),
    "<": (TokenType.LT, TokenType.LTEqual),
    ">": (TokenType.GT, TokenType.GTEqual),
}

# Words with a dedicated token type; every other word is an identifier.
_KEYWORDS = {
    "_": TokenType.Underscore,
    "false": TokenType.KwFalse,
    "true": TokenType.KwTrue,
    "not": TokenType.KwNot,
    "in": TokenType.KwIn,
    "and": TokenType.KwAnd,
    "or": TokenType.KwOr,
    "xor": TokenType.KwXor,
    "let": TokenType.KwLet,
    "mut": TokenType.KwMut,
    "if": TokenType.KwIf,
    "else": TokenType.KwElse,
    "while": TokenType.KwWhile,
    "for": TokenType.KwFor,
    "loop": TokenType.KwLoop,
    "break": TokenType.KwBreak,
    "continue": TokenType.KwContinue,
    "fn": TokenType.KwFn,
    "return": TokenType.KwReturn,
    "match": TokenType.KwMatch,
}


class Lexer(TokenIterator):
    """Produce tokens from ``text`` one at a time via :meth:`next`.

    The lexer tracks a character index together with a 1-based line and
    column, and attaches the positions of a token's first and last
    characters to every token it returns.
    """

    def __init__(self, text: str) -> None:
        """Start lexing ``text`` at index 0, line 1, column 1."""
        self.text = text
        self.index = 0
        self.line = 1
        self.col = 1

    def next(self) -> Token:
        """Return the next token, skipping whitespace and comments.

        At the end of the input an ``Eof`` token is returned (on every
        subsequent call as well).  Unrecognised characters yield
        ``InvalidChar``; unterminated literals and block comments yield the
        corresponding ``Malformed*`` token.
        """
        # Trivia is skipped in a loop rather than by recursing, so a long
        # run of comments cannot exhaust the interpreter's recursion limit.
        while True:
            if self.done():
                here = self.pos()
                return self.token(TokenType.Eof, here, here)
            char = self.current()
            if char in _WHITESPACE:
                self.step()
            elif char == "/":
                token = self._lex_slash()
                if token is not None:
                    return token
                # Otherwise a comment was skipped: keep scanning.
            else:
                return self._lex_token(char)

    def _lex_token(self, char: str) -> Token:
        """Lex the token starting at ``char`` (not whitespace, not "/")."""
        if char in _SINGLE_CHAR_TOKENS:
            return self.single(_SINGLE_CHAR_TOKENS[char])
        if char in _OPTIONAL_EQUAL_TOKENS:
            alone, with_equal = _OPTIONAL_EQUAL_TOKENS[char]
            return self.single_or_double(alone, "=", with_equal)
        if char == ".":
            return self._lex_dot()
        if char == ":":
            return self._lex_colon()
        if char == "-":
            return self._lex_minus()
        if char == "=":
            return self._lex_equal()
        if char == "'":
            return self._lex_char()
        if char == "\"":
            return self._lex_string()
        if char == "0":
            # A leading zero is always an Int token on its own.
            return self.single(TokenType.Int)
        if char in "123456789":
            return self._lex_int()
        if char in _WORD_START:
            return self._lex_word()
        start = self.pos()
        self.step()
        return self.token(TokenType.InvalidChar, start, start)

    def _lex_dot(self) -> Token:
        """Lex ".", "..", "..." or "..="."""
        start = self.pos()
        self.step()
        if not self.current_is("."):
            return self.token(TokenType.Dot, start, start)
        end = self.pos()
        self.step()
        if self.current_is("."):
            end = self.pos()
            self.step()
            return self.token(TokenType.DotDotDot, start, end)
        if self.current_is("="):
            end = self.pos()
            self.step()
            return self.token(TokenType.DotDotEqual, start, end)
        return self.token(TokenType.DotDot, start, end)

    def _lex_colon(self) -> Token:
        """Lex ":", "::" or "::<"."""
        start = self.pos()
        self.step()
        if not self.current_is(":"):
            # The original returned TokenType.Comma here (copy-paste slip).
            # NOTE(review): assumes TokenType declares Colon - confirm.
            return self.token(TokenType.Colon, start, start)
        end = self.pos()
        self.step()
        if self.current_is("<"):
            end = self.pos()
            self.step()
            return self.token(TokenType.ColonColonLT, start, end)
        return self.token(TokenType.ColonColon, start, end)

    def _lex_minus(self) -> Token:
        """Lex "-", "-=" or "->" (the latter is named ``MinusLT``)."""
        start = self.pos()
        self.step()
        if self.current_is("="):
            end = self.pos()
            self.step()
            return self.token(TokenType.MinusEqual, start, end)
        if self.current_is(">"):
            end = self.pos()
            self.step()
            return self.token(TokenType.MinusLT, start, end)
        return self.token(TokenType.Minus, start, start)

    def _lex_equal(self) -> Token:
        """Lex "=", "==" or "=>" (the latter is named ``EqualLT``)."""
        start = self.pos()
        self.step()
        if self.current_is("="):
            end = self.pos()
            self.step()
            return self.token(TokenType.EqualEqual, start, end)
        if self.current_is(">"):
            end = self.pos()
            self.step()
            return self.token(TokenType.EqualLT, start, end)
        return self.token(TokenType.Equal, start, start)

    def _lex_slash(self) -> Optional[Token]:
        """Lex "/" or "/=", or skip a comment.

        Returns ``None`` when a complete comment was skipped, so the caller
        should continue with the next token.
        """
        start = self.pos()
        self.step()
        if self.current_is("="):
            end = self.pos()
            self.step()
            # The original returned TokenType.AsteriskEqual here.
            # NOTE(review): assumes TokenType declares SlashEqual - confirm.
            return self.token(TokenType.SlashEqual, start, end)
        if self.current_is("/"):
            # Line comment: runs up to, but not including, the newline.
            while not self.done() and self.current() != "\n":
                self.step()
            return None
        if self.current_is("*"):
            return self._skip_block_comment(start)
        return self.token(TokenType.Slash, start, start)

    def _skip_block_comment(self, start: Position) -> Optional[Token]:
        """Skip a nestable block comment whose "/" was at ``start``.

        The current character must be the "*" of the opening delimiter.
        Returns ``None`` when the comment is properly closed and a
        ``MalformedComment`` token when the input ends first.
        """
        end = self.pos()
        self.step()
        depth = 1
        last_char: Optional[str] = None
        while not self.done():
            char = self.current()
            if last_char == "/" and char == "*":
                depth += 1
                # Forget the pair so that one character is never counted as
                # part of two delimiters (e.g. "/*/" is an opener only).
                last_char = None
            elif last_char == "*" and char == "/":
                depth -= 1
                if depth == 0:
                    self.step()
                    return None
                last_char = None
            else:
                last_char = char
            end = self.pos()
            self.step()
        return self.token(TokenType.MalformedComment, start, end)

    def _lex_char(self) -> Token:
        """Lex a character literal: a quote, one character or one
        backslash escape, and a closing quote."""
        start = self.pos()
        self.step()
        if self.done():
            # A lone quote at the end of the input: there is no character
            # to read, so report it instead of indexing past the end.
            return self.token(TokenType.MalformedChar, start, start)
        end = self.pos()
        first = self.current()
        self.step()
        if not self.done() and first == "\\":
            # Consume the escaped character as part of the literal.
            end = self.pos()
            self.step()
        if not self.current_is("'"):
            return self.token(TokenType.MalformedChar, start, end)
        end = self.pos()
        self.step()
        return self.token(TokenType.Char, start, end)

    def _lex_string(self) -> Token:
        """Lex a double-quoted string literal with backslash escapes."""
        start = self.pos()
        end = self.pos()
        self.step()
        while not self.done() and self.current() != "\"":
            end = self.pos()
            first = self.current()
            self.step()
            if not self.done() and first == "\\":
                # The escaped character never terminates the string.
                end = self.pos()
                self.step()
        if not self.current_is("\""):
            return self.token(TokenType.MalformedString, start, end)
        end = self.pos()
        self.step()
        return self.token(TokenType.String, start, end)

    def _lex_int(self) -> Token:
        """Lex an integer literal that starts with a non-zero digit."""
        start = self.pos()
        end = self.pos()
        self.step()
        while not self.done() and self.current() in "1234567890":
            end = self.pos()
            self.step()
        return self.token(TokenType.Int, start, end)

    def _lex_word(self) -> Token:
        """Lex a keyword, the "_" token, or an identifier."""
        start = self.pos()
        end = self.pos()
        self.step()
        while (not self.done()
               and self.current().lower() in _WORD_CONTINUE):
            end = self.pos()
            self.step()
        value = self.text[start.index:self.index]
        return self.token(_KEYWORDS.get(value, TokenType.Id), start, end)

    def single(self, token_type: TokenType) -> Token:
        """Consume one character and return it as a ``token_type`` token."""
        start = self.pos()
        self.step()
        return self.token(token_type, start, start)

    def single_or_double(self, type1: TokenType, char2: str,
                         type2: TokenType) -> Token:
        """Consume one character, plus ``char2`` if it follows directly.

        Returns a ``type2`` token when ``char2`` was consumed and a
        ``type1`` token otherwise.
        """
        start = self.pos()
        self.step()
        if not self.done() and self.current() == char2:
            end = self.pos()
            self.step()
            return self.token(type2, start, end)
        else:
            return self.token(type1, start, start)

    def token(self, token_type: TokenType, start: Position,
              end: Position) -> Token:
        """Build a token that starts at ``start`` and stops at the current
        index; ``end`` is the position of its last character."""
        return Token(token_type, start.index, self.index - start.index,
                     Span(start, end))

    def pos(self) -> Position:
        """Return the current index, line and column as a ``Position``."""
        return Position(self.index, self.line, self.col)

    def step(self) -> None:
        """Consume the current character and update line and column.

        The line counter advances when the character being consumed is a
        newline, so the newline itself belongs to the line it ends and the
        character after it is at column 1.  Does nothing at the end of the
        input.
        """
        if self.done():
            return
        consumed = self.current()
        self.index += 1
        if consumed == "\n":
            self.line += 1
            self.col = 1
        else:
            self.col += 1

    def current_is(self, value: str) -> bool:
        """Return True if input remains and the current character equals
        ``value``."""
        return not self.done() and self.current() == value

    def done(self) -> bool:
        """Return True once every character has been consumed."""
        return self.index >= len(self.text)

    def current(self) -> str:
        """Return the current character; raises IndexError when done."""
        return self.text[self.index]