# compiler/compiler/lexer.py
from __future__ import annotations
import collections.abc
import enum
import re
import typing
from dataclasses import dataclass, field
from typing import cast
from .logger import Logger
from .source import SourceLocation, Location
from .typechecking import typecheck
from .utils import implies
logger = Logger(__name__)
@typecheck
@dataclass
class Token:
    """A single lexical token: its category, source location, and matched text.

    ``loc`` and ``value`` are excluded from comparison and hashing
    (``compare=False, hash=False``), so two tokens are equal when their
    ``kind`` matches, regardless of where or what text they came from.
    """

    kind: Tokens
    # Fix: both optional fields default to None, so both annotations must
    # admit None (previously only `value` did; `loc` claimed a bare
    # SourceLocation, which misleads the @typecheck decorator and readers).
    loc: SourceLocation | None = field(compare=False, hash=False, default=None)
    value: str | None = field(compare=False, hash=False, default=None)

    def __repr__(self):
        # e.g. Identifier('foo') — kind name plus the raw matched text.
        return f"{self.kind.name}({repr(self.value)})"
class Tokens(enum.Enum):
    """Token categories; each member's value is the compiled regex matching it.

    Declaration order is significant: the lexer tries members in this order
    and keeps the longest match, with ties going to the later member (so
    ``Float`` can win over an equal-length ``Integer`` prefix) and
    ``Unknown`` serving as a last-resort catch-all.
    """

    Integer = re.compile(r"[0-9]+")
    Float = re.compile(r"[0-9]+\.[0-9]*")
    Op_Plus = re.compile(r"\+")
    Op_Minus = re.compile(r"-")
    Op_Multiply = re.compile(r"\*")
    Op_Divide = re.compile(r"/")
    Parens_Left = re.compile(r"\(")
    Parens_Right = re.compile(r"\)")
    Brace_Left = re.compile(r"\{")
    Brace_Right = re.compile(r"}")
    KwLet = re.compile(r"\blet\b")
    Identifier = re.compile(r"\b[a-zA-Z_][a-zA-Z_0-9]*\b")
    Equal = re.compile(r"=")
    Colon = re.compile(r":")
    Semicolon = re.compile(r";")
    Comma = re.compile(r",")
    Comment = re.compile(r"//.*")
    Newline = re.compile(r"\n", flags=re.MULTILINE)
    BEGIN = re.compile(r"\A")
    EOF = re.compile(r"\Z")
    Blank = re.compile(r"[ \t]+")
    Unknown = re.compile(r".*", flags=re.DOTALL)

    def __bool__(self):
        """Every member is truthy, unconditionally."""
        return True

    def is_keyword(self) -> bool:
        """Return True for reserved-word tokens (currently just ``let``)."""
        return self is Tokens.KwLet
class Lexer(collections.abc.Iterator):
    """Pull-based tokenizer over a text stream.

    Input is read one line at a time (a fresh line is fetched after every
    BEGIN/Newline token) and iteration yields ``Token`` objects.  An optional
    ``token_filter`` suppresses tokens for which it returns False.
    """

    def __init__(self, input_stream: typing.TextIO, token_filter: typing.Callable[[Token], bool] | None = None):
        self.input = input_stream
        # All source text buffered so far; grows by one line per BEGIN/Newline.
        self.data: str = ""
        # Every token emitted so far, including BEGIN and the trailing EOF.
        self.tokens: list[Token] = []
        # Set to len(self.tokens) once EOF is emitted; None until then.
        self.length: int | None = None
        # Absolute offset into self.data where the next match attempt starts.
        self.begin: int = 0
        # NOTE(review): `end` is initialized but never read/written elsewhere
        # in this class — possibly vestigial.
        self.end: int = 0
        # 0-based column within the current line, and 0-based line number.
        self.character: int = 0
        self.line: int = 0
        self.token_filter = token_filter

    def __next__(self):
        return self._filtered_next_token()

    def _filtered_next_token(self) -> Token:
        """Return the next token accepted by ``token_filter`` (any token if none)."""
        tok = self._next_token()
        if self.token_filter is not None:
            # Keep pulling raw tokens until one passes the filter.
            while not self.token_filter(tok):
                tok = self._next_token()
        logger.debug(f"Returning token: {tok}")
        return tok

    def _next_token(self) -> Token:
        """Produce the next raw (unfiltered) token.

        Protocol:
          * The very first call emits a synthetic BEGIN token.
          * After a BEGIN/Newline token, one more line is read from the input.
          * When the scan position reaches the end of the buffered data, a
            synthetic EOF token is emitted.
          * Otherwise every ``Tokens`` regex is tried at the current position;
            the longest match wins, later enum members win length ties, and a
            keyword match is never displaced by a non-keyword of equal length.

        Calls ``exit(1)`` if only ``Tokens.Unknown`` matches at the position.
        """
        # NOTE(review): declared but never assigned or used below.
        actual_result: Token
        if len(self.tokens) == 0:
            # First call: synthesize BEGIN so the line-reading branch below
            # pulls the first input line on the following call.
            tok = Token(Tokens.BEGIN,
                        loc=SourceLocation(
                            Location(line=0, character=0),
                            source=self.data
                        ),
                        value=None)
            self.tokens.append(tok)
            return tok
        if self.tokens[-1].kind in [Tokens.BEGIN, Tokens.Newline]:
            # Lazily buffer one more line after each line boundary.
            self.data += self.input.readline()
        if self.begin == len(self.data):
            # Buffer exhausted and readline() produced nothing more: EOF.
            eof_token = Token(Tokens.EOF, value=None, loc=SourceLocation(
                Location(line=self.line, character=0),
            ))
            self.tokens += [eof_token]
            self.length = len(self.tokens)
            # NOTE(review): calls after EOF keep appending/returning fresh EOF
            # tokens (self.begin stays == len(self.data)); the IndexError at
            # the bottom only fires if self.begin somehow overshoots the
            # buffer — confirm whether repeated EOFs are intended.
            return eof_token
        elif self.begin < len(self.data):
            # Seed with the catch-all Unknown; any real match replaces it.
            best_result: Token = Token(Tokens.Unknown,
                                       loc=SourceLocation(
                                           Location(line=self.line, character=self.character),
                                           source=self.data),
                                       value=""
                                       )
            token_kind: Tokens
            for token_kind in list(Tokens):
                if token_kind == Tokens.Unknown:
                    continue
                regex = cast(re.Pattern, token_kind.value)
                # Anchored match starting exactly at the current offset.
                match = regex.match(self.data, self.begin)
                if match is not None:
                    logger.trace(f"Got match: {match}")
                    result = match.group(0)
                    # Longest match wins; '>=' lets later enum members override
                    # equal-length earlier ones (e.g. Float over Integer).
                    if best_result.kind == Tokens.Unknown or len(result) >= len(best_result.value):
                        # implies(a, b) == (not a) or b: never replace a keyword
                        # match with an equal-length non-keyword (KwLet vs
                        # Identifier both match "let").
                        if not implies(best_result.kind.is_keyword(), token_kind.is_keyword()):
                            logger.trace(
                                f"Best match is a keyword ({best_result}) and current match ({token_kind}) is not, skipping")
                            continue
                        loc = SourceLocation(
                            begin=Location(line=self.line, character=self.character),
                            end=Location(line=self.line, character=self.character + len(result))
                        )
                        best_result = Token(token_kind, value=result, loc=loc)
                        logger.trace(f"New best match: {best_result}")
            # Advance the scan position past the accepted lexeme (zero-width
            # if nothing but Unknown matched).
            self.begin += len(best_result.value)
            self.character += len(best_result.value)
            if best_result.kind == Tokens.Unknown:
                # Nothing matched here: report with source context and abort.
                source_hint = best_result.loc.show_in_source()
                logger.error(f"{best_result.loc}: Unknown token '{best_result.loc.source_substring}'\n{source_hint}")
                exit(1)
            elif best_result.kind == Tokens.Newline:
                # Bump the line counter and reset the column; the newline
                # token's span ends at column 0 of the next line.
                self.line += 1
                self.character = 0
                best_result.loc.end = Location(line=self.line, character=0)
            logger.debug(f"Added token {best_result}")
            self.tokens += [best_result]
            return best_result
        else:
            raise IndexError("EOF already reached")