compiler/compiler/tokenizer.py

98 lines
3.3 KiB
Python

from __future__ import annotations
import enum
import re
from dataclasses import dataclass, field
from beartype import beartype
from beartype.typing import Optional, List
from .logger import Logger
from .source import SourceLocation, Location
logger = Logger(__name__)
@beartype
@dataclass
class Token:
    """A single lexical token: its kind, its source span, and the matched text."""

    # Which Tokens enum member this token is; the only field that
    # participates in equality and hashing.
    kind: Tokens
    # Source span of the token.  Annotation widened to Optional to match
    # the None default; excluded from comparison/hash.
    loc: Optional[SourceLocation] = field(compare=False, hash=False, default=None)
    # Exact substring matched from the input (None for the EOF token);
    # excluded from comparison/hash.
    value: Optional[str] = field(compare=False, hash=False, default=None)

    def __repr__(self):
        # Compact form, e.g. Integer('42') — omits loc for readability.
        return f"{self.kind.name}({repr(self.value)})"
class Tokens(enum.Enum):
    """Token kinds, each valued by the compiled regex that recognizes it.

    The tokenizer tries every member at the current offset and keeps the
    longest match; on equal lengths the later member wins (the tokenizer
    compares with ``>=``).  Declaration order therefore only breaks ties —
    e.g. Float beats Integer on "1.5" purely by match length.
    """

    Integer = re.compile(r"[0-9]+")
    # Requires a dot; fractional digits are optional ("1." matches).
    Float = re.compile(r"[0-9]+\.[0-9]*")
    Op_Plus = re.compile(r"\+")
    Op_Minus = re.compile(r"-")
    Op_Multiply = re.compile(r"\*")
    Op_Divide = re.compile(r"/")
    Parens_Left = re.compile(r"\(")
    Parens_Right = re.compile(r"\)")
    Brace_Left = re.compile(r"\{")
    Brace_Right = re.compile(r"}")
    Identifier = re.compile(r"[a-zA-Z_][a-zA-Z_0-9]*")
    Equal = re.compile(r"=")
    Semicolon = re.compile(r";")
    # NOTE(review): MULTILINE has no effect on a literal "\n" pattern.
    Newline = re.compile(r"\n", flags=re.MULTILINE)
    # \Z matches only at end-of-input.
    EOF = re.compile(r"\Z")
    Blank = re.compile(r"[ \t]+")
    # Error sentinel: skipped during matching, used by the tokenizer to
    # report positions where no other pattern matched.
    Unknown = re.compile(r".*", flags=re.DOTALL)

    def __bool__(self):
        # Force every member to be truthy regardless of its value.
        return True
class Tokenizer:
    """Greedy longest-match tokenizer over the ``Tokens`` regex table."""

    def __init__(self):
        # Stateless; each tokenize() call carries its own scan state.
        pass

    def tokenize(self, data: str) -> List[Token]:
        """Split *data* into a list of Tokens, terminated by an EOF token.

        Scans left to right.  At each offset every pattern in ``Tokens``
        (except the ``Unknown`` sentinel) is matched anchored at that
        offset and the longest match is kept; on equal lengths the later
        enum member wins.  Newlines advance the tracked line/character
        position.  Blank runs are emitted as tokens, not discarded.

        Raises SystemExit(1) after logging an error if no pattern
        matches at some offset.
        """
        results: List[Token] = []
        line = 0
        character = 0
        begin = 0
        while begin < len(data):
            # Sentinel result: if no pattern matches, this survives the
            # inner loop and triggers the error path below.
            best_result: Token = Token(
                Tokens.Unknown,
                loc=SourceLocation(Location(line=line, character=character), source=data),
                value="",
            )
            for token_kind in Tokens:  # iterate the enum directly; no list() copy
                if token_kind is Tokens.Unknown:
                    continue
                regex: re.Pattern = token_kind.value
                match = regex.match(data, begin)
                if match is None:
                    continue
                logger.trace(f"Got match: {match}")
                result = match.group(0)
                # ">=" deliberately lets later enum members win length ties.
                if best_result.kind is Tokens.Unknown or len(result) >= len(best_result.value):
                    loc = SourceLocation(
                        begin=Location(line=line, character=character),
                        end=Location(line=line, character=character + len(result)),
                    )
                    best_result = Token(token_kind, value=result, loc=loc)
                    logger.trace(f"New best match: {best_result}")
            begin += len(best_result.value)
            character += len(best_result.value)
            if best_result.kind is Tokens.Unknown:
                source_hint = best_result.loc.show_in_source()
                logger.error(f"{best_result.loc}: Unknown token '{best_result.loc.source_substring}'\n{source_hint}")
                # raise SystemExit directly instead of the site-injected
                # exit() builtin, which is absent under `python -S`.
                raise SystemExit(1)
            elif best_result.kind is Tokens.Newline:
                line += 1
                character = 0
                # Clamp the newline token's end to the start of the next line.
                best_result.loc.end = Location(line=line, character=0)
            logger.debug(f"Added token {best_result}")
            results.append(best_result)
        # NOTE(review): EOF location uses character=0 even when the final
        # line is non-empty — confirm this is intended.
        results.append(Token(Tokens.EOF, value=None, loc=SourceLocation(
            Location(line=line, character=0), source=data
        )))
        return results