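"""A small regex-driven lexer.

Tokenization is lazy: tokens are produced on demand, the longest match wins,
and keywords take priority over identifiers on equal-length matches.
"""
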
from __future__ import annotations

import collections.abc
import enum
import re
from dataclasses import dataclass, field
from typing import cast

from .logger import Logger
from .source import SourceLocation, Location
from .typechecking import typecheck
from .utils import implies

logger = Logger(__name__)


@typecheck
@dataclass
class Token:
    kind: Tokens
    # loc and value are positional/textual metadata; they are excluded from
    # equality and hashing, so tokens compare by kind alone.
    loc: SourceLocation | None = field(compare=False, hash=False, default=None)
    value: str | None = field(compare=False, hash=False, default=None)

    def __repr__(self):
        return f"{self.kind.name}({self.value!r})"


class Tokens(enum.Enum):
    # Definition order matters: the lexer scans members in this order, and on
    # equal-length matches the later member wins unless that would replace a
    # keyword match with a non-keyword one.
    Integer = re.compile(r"[0-9]+")
    Float = re.compile(r"[0-9]+\.[0-9]*")
    Op_Plus = re.compile(r"\+")
    Op_Minus = re.compile(r"-")
    Op_Multiply = re.compile(r"\*")
    Op_Divide = re.compile(r"/")
    Parens_Left = re.compile(r"\(")
    Parens_Right = re.compile(r"\)")
    Brace_Left = re.compile(r"\{")
    Brace_Right = re.compile(r"\}")

    KwLet = re.compile(r"\blet\b")

    Identifier = re.compile(r"[a-zA-Z_][a-zA-Z_0-9]*")

    Equal = re.compile(r"=")
    Colon = re.compile(r":")
    Semicolon = re.compile(r";")

    Newline = re.compile(r"\n")
    EOF = re.compile(r"\Z")
    Blank = re.compile(r"[ \t]+")
    # Fallback kind for unrecognized input; the lexer never runs this pattern.
    Unknown = re.compile(r".*", flags=re.DOTALL)

    def __bool__(self):
        return True

    def is_keyword(self) -> bool:
        return self in (Tokens.KwLet,)


class Lexer(collections.abc.Sequence):
    """A lazy token sequence: tokens are lexed on demand as they are indexed."""

    def __init__(self, data: str):
        self.data = data
        self.tokens: list[Token] = []
        # Total token count, known only once EOF has been emitted.
        self.length: int | None = None
        self.begin: int = 0
        self.end: int = 0
        self.character: int = 0
        self.line: int = 0

    def __getitem__(self, index: int) -> Token:
        # Lex just far enough for the requested index to exist.
        while len(self.tokens) <= index and self.length is None:
            self._next_token()
        return self.tokens[index]

    def __next__(self):
        return self._next_token()

    def __len__(self) -> int:
        # The length is only known after the whole input has been lexed.
        while self.length is None:
            self._next_token()

        assert self.length is not None

        return self.length

    def _next_token(self) -> Token:
        if self.begin < len(self.data):
            # Seed with an Unknown token so there is always a fallback result.
            best_result: Token = Token(
                Tokens.Unknown,
                loc=SourceLocation(
                    Location(line=self.line, character=self.character),
                    source=self.data),
                value=""
            )
            token_kind: Tokens
            for token_kind in list(Tokens):
                if token_kind == Tokens.Unknown:
                    continue
                regex = cast(re.Pattern, token_kind.value)
                match = regex.match(self.data, self.begin)
                if match is not None:
                    logger.trace(f"Got match: {match}")
                    result = match.group(0)
                    # The longest match wins; ties go to the later enum member,
                    # except that a keyword match is never replaced by a
                    # non-keyword one (so KwLet beats Identifier for "let").
                    if best_result.kind == Tokens.Unknown or len(result) >= len(best_result.value):
                        if not implies(best_result.kind.is_keyword(), token_kind.is_keyword()):
                            logger.trace(
                                f"Best match is a keyword ({best_result}) and current match ({token_kind}) is not, skipping")
                            continue
                        # Pass source as well so the location can be rendered
                        # in later diagnostics, matching the constructions above.
                        loc = SourceLocation(
                            begin=Location(line=self.line, character=self.character),
                            end=Location(line=self.line, character=self.character + len(result)),
                            source=self.data
                        )
                        best_result = Token(token_kind, value=result, loc=loc)
                        logger.trace(f"New best match: {best_result}")

            self.begin += len(best_result.value)
            self.character += len(best_result.value)
            if best_result.kind == Tokens.Unknown:
                source_hint = best_result.loc.show_in_source()
                logger.error(f"{best_result.loc}: Unknown token '{best_result.loc.source_substring}'\n{source_hint}")
                exit(1)
            elif best_result.kind == Tokens.Newline:
                self.line += 1
                self.character = 0
                best_result.loc.end = Location(line=self.line, character=0)

            logger.debug(f"Added token {best_result}")

            self.tokens.append(best_result)
            return best_result
        elif self.begin == len(self.data):
            eof_token = Token(Tokens.EOF, value=None, loc=SourceLocation(
                Location(line=self.line, character=0), source=self.data
            ))
            self.tokens.append(eof_token)
            self.length = len(self.tokens)
            # Step past the end so a further call falls into the error branch
            # below instead of emitting a second EOF token.
            self.begin += 1
            return eof_token
        else:
            raise IndexError("EOF already reached")
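

# A minimal usage sketch (hypothetical input; not part of the lexer itself).
# Iteration goes through the Sequence protocol, which calls __getitem__ with
# increasing indices until it raises IndexError past the EOF token.
if __name__ == "__main__":
    lexer = Lexer("let x = 3 + 4;\n")
    for token in lexer:
        print(token)
    # Prints KwLet('let'), Blank(' '), Identifier('x'), Blank(' '),
    # Equal('='), ..., Semicolon(';'), Newline('\n'), EOF(None).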