slige-mirror/compiler/parse/lexer.ts
2025-02-04 15:06:19 +01:00

340 lines
8.6 KiB
TypeScript

import { Ctx, File } from "../ctx.ts";
import { Pos, Span } from "../diagnostics.ts";
import { ControlFlow, range } from "../util.ts";
import { Token, TokenIter } from "./token.ts";
/**
 * Hand-written lexer that turns the text of one source file into `Token`s.
 *
 * Whitespace and comments are emitted as tokens of their own (`whitespace`,
 * `comment`); illegal characters are reported through `Ctx.report` and
 * skipped. Spans are inclusive: `span.end` is the position of the last
 * character that belongs to the token, and `length` is derived from it.
 */
export class Lexer implements TokenIter {
    /** Characters with a special meaning after `\` in char and string literals. */
    private static readonly escapes: ReadonlyMap<string, string> = new Map([
        ["n", "\n"],
        ["t", "\t"],
        ["0", "\0"],
    ]);

    /**
     * Sticky pattern matching any static token. Built on first use because
     * `staticTokenRes` is declared after this class in the module.
     */
    private static staticTokenPattern: RegExp | null = null;

    private idx = 0;
    private line = 1;
    private col = 1;
    private text: string;

    /**
     * @param ctx Compiler context; supplies the file text, interns
     *     identifiers and receives diagnostics.
     * @param file The file whose text is lexed.
     */
    public constructor(
        private ctx: Ctx,
        private file: File,
    ) {
        this.text = ctx.fileInfo(file).text;
    }

    /**
     * Produces the next token.
     *
     * @returns The next token, or `null` once the input is exhausted.
     */
    next(): Token | null {
        // Loop rather than recurse, so that a long run of illegal characters
        // cannot exhaust the call stack.
        while (!this.done()) {
            const token = this.lexOne();
            if (token !== null) {
                return token;
            }
        }
        return null;
    }

    /**
     * Lexes one token at the current position. Returns `null` when the
     * current character is illegal; it has then been reported and skipped.
     */
    private lexOne(): Token | null {
        const whitespace = this.lexWithTail(
            (span) => this.token("whitespace", span),
            /[ \t\r\n]/,
        );
        if (whitespace.break) {
            return whitespace.val;
        }

        // Keywords use their own spelling as token type; anything else is an
        // interned identifier.
        const word = this.lexWithTail(
            (span, val) =>
                keywords.has(val) ? this.token(val, span) : this.token("ident", span, {
                    type: "ident",
                    identId: this.ctx.internIdent(val),
                    identText: val,
                }),
            /[a-zA-Z_]/,
            /[a-zA-Z0-9_]/,
        );
        if (word.break) {
            return word.val;
        }

        // Non-zero integers; a leading `0` is handled by `lexZero`.
        const int = this.lexWithTail(
            (span, val) =>
                this.token("int", span, {
                    type: "int",
                    intValue: parseInt(val, 10),
                }),
            /[1-9]/,
            /[0-9]/,
        );
        if (int.break) {
            return int.val;
        }

        const begin = this.pos();
        if (this.test("0")) {
            return this.lexZero(begin);
        }
        if (this.test("'")) {
            return this.lexChar(begin);
        }
        if (this.test('"')) {
            return this.lexString(begin);
        }
        if (this.test("/")) {
            return this.lexSlash(begin);
        }
        return this.lexStaticToken(begin);
    }

    /** Lexes a lone `0`; a `0` directly followed by another digit is an error. */
    private lexZero(begin: Pos): Token {
        const end = this.eat();
        if (this.test(/[0-9]/)) {
            this.report("invalid number", begin);
            return this.token("error", { begin, end });
        }
        return this.token("int", { begin, end }, {
            type: "int",
            intValue: 0,
        });
    }

    /**
     * Lexes a character literal such as `'a'` or `'\n'` into an `int` token
     * holding the UTF-16 code unit of the character.
     */
    private lexChar(begin: Pos): Token {
        let end = this.eat(); // opening quote
        const escaped = this.test("\\");
        if (escaped) {
            end = this.eat();
        }
        if (this.done()) {
            this.report("malformed character literal", begin);
            return this.token("error", { begin, end });
        }
        const ch = this.current();
        const value = escaped ? (Lexer.escapes.get(ch) ?? ch) : ch;
        end = this.eat();
        if (!this.test("'")) {
            this.report("malformed character literal", begin);
            return this.token("error", { begin, end });
        }
        end = this.eat(); // closing quote
        return this.token("int", { begin, end }, {
            type: "int",
            intValue: value.charCodeAt(0),
        });
    }

    /**
     * Lexes a double-quoted string literal. Unknown escapes stand for the
     * escaped character itself.
     */
    private lexString(begin: Pos): Token {
        let end = this.eat(); // opening quote
        let value = "";
        while (!this.done() && !this.test('"')) {
            if (this.test("\\")) {
                end = this.eat();
                if (this.done()) {
                    break;
                }
                const ch = this.current();
                value += Lexer.escapes.get(ch) ?? ch;
            } else {
                value += this.current();
            }
            end = this.eat();
        }
        if (!this.test('"')) {
            this.report("unclosed/malformed string", begin);
            return this.token("error", { begin, end });
        }
        end = this.eat(); // closing quote
        return this.token("str", { begin, end }, {
            type: "str",
            stringValue: value,
        });
    }

    /**
     * Lexes everything that starts with `/`: a line comment (up to, but not
     * including, the newline), a nestable block comment, or the `/` token.
     */
    private lexSlash(begin: Pos): Token {
        let end = this.eat(); // leading "/"
        if (this.test("/")) {
            while (!this.done() && !this.test("\n")) {
                end = this.eat();
            }
            return this.token("comment", { begin, end });
        }
        if (!this.test("*")) {
            return this.token("/", { begin, end });
        }
        end = this.eat(); // "*" of the opening "/*"
        let depth = 1;
        // Previous character, cleared after it took part in a delimiter so
        // that e.g. "*/*" is not read as both a closer and an opener.
        let last: string | undefined = undefined;
        while (!this.done() && depth > 0) {
            const ch = this.current();
            if (last === "*" && ch === "/") {
                depth -= 1;
                last = undefined;
            } else if (last === "/" && ch === "*") {
                depth += 1;
                last = undefined;
            } else {
                last = ch;
            }
            end = this.eat();
        }
        if (depth !== 0) {
            this.report("unclosed/malformed multiline comment", begin);
        }
        return this.token("comment", { begin, end });
    }

    /**
     * Lexes the longest static token at the current position. If none
     * matches, the current character is reported as illegal and skipped, and
     * `null` is returned.
     */
    private lexStaticToken(begin: Pos): Token | null {
        if (Lexer.staticTokenPattern === null) {
            Lexer.staticTokenPattern = new RegExp(
                `(?:${staticTokenRes.join("|")})`,
                "y",
            );
        }
        const pattern = Lexer.staticTokenPattern;
        // The sticky flag anchors the match at `lastIndex`, which avoids
        // copying the remainder of the text for every token.
        pattern.lastIndex = this.idx;
        const match = pattern.exec(this.text);
        if (match === null) {
            this.report(`illegal character '${this.current()}'`, begin);
            this.step();
            return null;
        }
        let end = begin;
        for (const _ of range(match[0].length)) {
            end = this.eat();
        }
        return this.token(match[0], { begin, end });
    }

    /**
     * Lexes one character matching `startPat` followed by any number of
     * characters matching `tailPat`, and hands the span and text to `builder`.
     *
     * @returns `Continue` without consuming anything if the current character
     *     does not match `startPat`, otherwise `Break` with the built value.
     */
    private lexWithTail<R>(
        builder: (span: Span, val: string) => R,
        startPat: RegExp,
        tailPat = startPat,
    ): ControlFlow<R> {
        const begin = this.pos();
        if (!this.test(startPat)) {
            return ControlFlow.Continue(undefined);
        }
        let val = this.current();
        let end = this.eat();
        while (this.test(tailPat)) {
            val += this.current();
            end = this.eat();
        }
        return ControlFlow.Break(builder({ begin, end }, val));
    }

    /** Whether the whole text has been consumed. */
    private done(): boolean {
        return this.idx >= this.text.length;
    }

    /** The character at the current position; only meaningful while not `done()`. */
    private current(): string {
        return this.text[this.idx];
    }

    /** Advances by one character, keeping `line`/`col` in sync. No-op at the end. */
    private step() {
        if (this.done()) {
            return;
        }
        if (this.current() === "\n") {
            this.line += 1;
            this.col = 1;
        } else {
            this.col += 1;
        }
        this.idx += 1;
    }

    /** Consumes the current character and returns the position it was at. */
    private eat(): Pos {
        const at = this.pos();
        this.step();
        return at;
    }

    /** Snapshot of the current position. */
    private pos(): Pos {
        return {
            idx: this.idx,
            line: this.line,
            col: this.col,
        };
    }

    /** Builds a token; `length` follows from the inclusive span. */
    private token(type: string, span: Span, token?: Partial<Token>): Token {
        const length = span.end.idx - span.begin.idx + 1;
        return { type, span, length, ...token };
    }

    /**
     * Tests the current position against `pattern` without consuming.
     * Strings and ordinary patterns are compared with the current character
     * only; a pattern whose source starts with `^` is matched against the
     * remaining text. Always `false` at the end of input.
     */
    private test(pattern: RegExp | string): boolean {
        if (this.done()) {
            return false;
        }
        if (typeof pattern === "string") {
            return this.current() === pattern;
        } else if (pattern.source.startsWith("^")) {
            return pattern.test(this.text.slice(this.idx));
        } else {
            return pattern.test(this.current());
        }
    }

    /** Reports a lexing error at `pos` through the compiler context. */
    private report(msg: string, pos: Pos) {
        this.ctx.report({
            severity: "error",
            origin: "parser",
            file: this.file,
            msg,
            pos,
        });
    }
}
/**
 * Reserved words. When the lexer reads a word contained in this set it emits
 * a token whose type is the word itself instead of an `ident` token.
 */
const keywords = new Set([
    "false", "true", "null",
    "int", "bool", "str",
    "return", "break", "continue",
    "let", "mut", "fn",
    "loop", "if", "else",
    "struct", "enum",
    "or", "and", "not",
    "while", "for", "in",
    "mod", "pub", "use",
    "type_alias",
]);
/**
 * Spellings of the fixed punctuation and operator tokens. Each spelling is
 * listed exactly once; the order here is irrelevant because the spellings are
 * sorted longest-first before they are turned into a pattern.
 *
 * `/` is absent because the lexer handles it together with comments, and `0`
 * is lexed by a dedicated branch in `Lexer.next` before static tokens are
 * tried.
 */
const staticTokens = [
    "=",
    "==",
    "<",
    "<=",
    ">",
    ">=",
    "-",
    "->",
    "!",
    "!=",
    "+",
    "+=",
    "-=",
    ":",
    "::",
    "::<",
    "(",
    ")",
    "{",
    "}",
    "[",
    "]",
    ".",
    ",",
    ";",
    "#",
    "&",
    "0",
] as const;
/**
 * Regex source for every static token, longest spelling first so that an
 * alternation built from these prefers e.g. `::<` over `::` over `:`.
 *
 * Only regex metacharacters are escaped. Blindly prefixing every character
 * with a backslash would turn `0` into `\0`, which matches the NUL character
 * rather than the digit.
 */
const staticTokenRes = staticTokens
    .toSorted((a, b) => b.length - a.length)
    .map((tok) => tok.replace(/[.*+?^${}()|[\]\\]/g, "\\$&"));