# compiler/compiler/lexer.py
from __future__ import annotations
import collections.abc
import enum
import re
from dataclasses import dataclass, field
from typing import cast
from .logger import Logger
from .source import SourceLocation, Location
from .typechecking import typecheck
from .utils import implies
logger = Logger(__name__)
@typecheck
@dataclass
class Token:
    """A single lexical token: its kind, source location, and matched text.

    Equality and hashing consider only ``kind`` — ``loc`` and ``value`` are
    excluded via ``field(compare=False, hash=False)`` — so two tokens of the
    same kind compare equal regardless of where they were lexed.
    """

    kind: Tokens
    # Where the token was matched; None only for tokens built without a
    # location (the dataclass default). Annotation fixed: the original said
    # `SourceLocation` while defaulting to None.
    loc: SourceLocation | None = field(compare=False, hash=False, default=None)
    # Exact substring matched by the token's regex; None for EOF.
    value: str | None = field(compare=False, hash=False, default=None)

    def __repr__(self) -> str:
        return f"{self.kind.name}({self.value!r})"
class Tokens(enum.Enum):
    """Token kinds; each member's value is the compiled regex that matches it.

    NOTE(review): declaration order is significant — the lexer scans members
    in this order and, on equal-length matches, a later member replaces an
    earlier one (unless blocked by the keyword-priority check), so keyword
    members must stay declared before Identifier.
    """
    Integer = re.compile(r"[0-9]+")
    # Digits are required before the dot but optional after it ("1." matches).
    Float = re.compile(r"[0-9]+\.[0-9]*")
    Op_Plus = re.compile(r"\+")
    Op_Minus = re.compile(r"-")
    Op_Multiply = re.compile(r"\*")
    Op_Divide = re.compile(r"/")
    Parens_Left = re.compile(r"\(")
    Parens_Right = re.compile(r"\)")
    Brace_Left = re.compile(r"\{")
    Brace_Right = re.compile(r"}")
    # \b word boundaries keep e.g. "letter" from matching as KwLet.
    KwLet = re.compile(r"\blet\b")
    Identifier = re.compile(r"[a-zA-Z_][a-zA-Z_0-9]*")
    Equal = re.compile(r"=")
    Colon = re.compile(r":")
    Semicolon = re.compile(r";")
    # NOTE(review): re.MULTILINE only affects ^/$, which this pattern lacks;
    # the flag appears to be a no-op here — confirm before removing.
    Newline = re.compile(r"\n", flags=re.MULTILINE)
    # \Z matches only at the very end of the input.
    EOF = re.compile(r"\Z")
    Blank = re.compile(r"[ \t]+")
    # Catch-all sentinel; skipped during matching and used by the lexer to
    # report an unrecognized character.
    Unknown = re.compile(r".*", flags=re.DOTALL)

    def __bool__(self):
        # All members are explicitly truthy.
        # NOTE(review): enum members are truthy by default, so this looks
        # redundant — presumably kept for explicitness; confirm.
        return True

    def is_keyword(self) -> bool:
        # Keyword kinds get priority over equal-length Identifier matches.
        return self in [Tokens.KwLet]
class Lexer(collections.abc.Sequence):
    """Lazy, caching tokenizer over a source string.

    Tokens are lexed on demand via ``_next_token`` and cached in
    ``self.tokens``; the stream is terminated by exactly one EOF token, at
    which point ``self.length`` is set. Implements the Sequence protocol
    (indexing and ``len``) on top of the cache.
    """

    def __init__(self, data: str):
        self.data = data
        # Tokens produced so far, in order.
        self.tokens: list[Token] = []
        # Total number of tokens; None until EOF has been emitted.
        self.length: int | None = None
        # Offset into `data` where the next match attempt starts.
        self.begin: int = 0
        # NOTE(review): `end` is never read or updated in this file —
        # confirm it is needed by external callers before removing.
        self.end: int = 0
        # Column and line of the lexing cursor, used to build locations.
        self.character: int = 0
        self.line: int = 0

    def __getitem__(self, index: int) -> Token:
        """Return the token at ``index``, lexing only as far as required.

        Bug fix: the original loop condition called ``len(self)``, which
        forced the *entire* input to be tokenized on the first access.
        """
        if index < 0:
            # Negative indices need the final length, so lex to EOF
            # (matches the original's behavior for negative indices).
            index += len(self)
        while len(self.tokens) <= index and self.length is None:
            self._next_token()
        return self.tokens[index]

    def __next__(self) -> Token:
        """Return the next token from the stream.

        Bug fix: raise ``StopIteration`` once the stream is exhausted, as
        the iterator protocol requires, instead of leaking ``IndexError``.
        """
        try:
            return self._next_token()
        except IndexError:
            raise StopIteration from None

    def __len__(self) -> int:
        """Total number of tokens (forces tokenization to completion)."""
        # _next_token sets self.length when it emits the EOF token.
        while self.length is None:
            self._next_token()
        assert self.length is not None
        return self.length

    def _next_token(self) -> Token:
        """Lex the next token, append it to the cache, and return it.

        Longest match wins; on ties the member declared later in ``Tokens``
        wins, except that a non-keyword may never displace a keyword match
        of the same length. Raises IndexError if called again after EOF.
        """
        if self.begin < len(self.data):
            # Sentinel: survives only if no pattern matched, in which case
            # the error branch below reports an unknown token.
            best_result: Token = Token(
                Tokens.Unknown,
                loc=SourceLocation(
                    Location(line=self.line, character=self.character),
                    source=self.data),
                value="",
            )
            for token_kind in Tokens:
                if token_kind is Tokens.Unknown:
                    continue
                regex = cast(re.Pattern, token_kind.value)
                match = regex.match(self.data, self.begin)
                if match is None:
                    continue
                logger.trace(f"Got match: {match}")
                result = match.group(0)
                # Keep the current best unless this match is at least as
                # long (later members win ties).
                if best_result.kind is not Tokens.Unknown and len(result) < len(best_result.value):
                    continue
                # A keyword match may only be displaced by another keyword.
                if not implies(best_result.kind.is_keyword(), token_kind.is_keyword()):
                    logger.trace(
                        f"Best match is a keyword ({best_result}) and current match ({token_kind}) is not, skipping")
                    continue
                loc = SourceLocation(
                    begin=Location(line=self.line, character=self.character),
                    end=Location(line=self.line, character=self.character + len(result))
                )
                best_result = Token(token_kind, value=result, loc=loc)
                logger.trace(f"New best match: {best_result}")
            self.begin += len(best_result.value)
            self.character += len(best_result.value)
            if best_result.kind is Tokens.Unknown:
                source_hint = best_result.loc.show_in_source()
                logger.error(f"{best_result.loc}: Unknown token '{best_result.loc.source_substring}'\n{source_hint}")
                # Abort compilation. SystemExit is raised directly (rather
                # than calling the site-module exit()) so this works even
                # when Python runs without the `site` module.
                raise SystemExit(1)
            elif best_result.kind is Tokens.Newline:
                self.line += 1
                self.character = 0
                # The newline token ends at the start of the following line.
                best_result.loc.end = Location(line=self.line, character=0)
            logger.debug(f"Added token {best_result}")
            self.tokens.append(best_result)
            return best_result
        elif self.begin == len(self.data):
            # Emit exactly one EOF token and freeze the length.
            # NOTE(review): EOF's location uses character=0 rather than
            # self.character — confirm this is intentional.
            eof_token = Token(Tokens.EOF, value=None, loc=SourceLocation(
                Location(line=self.line, character=0), source=self.data
            ))
            self.tokens.append(eof_token)
            self.length = len(self.tokens)
            # Bug fix: advance past the end so a further call reaches the
            # IndexError branch below. The original left `begin` unchanged,
            # so repeated calls appended duplicate EOF tokens forever and
            # the final `else` branch was unreachable.
            self.begin += 1
            return eof_token
        else:
            raise IndexError("EOF already reached")