from __future__ import annotations
|
|
|
|
import enum
|
|
import re
|
|
from dataclasses import dataclass, field
|
|
|
|
from beartype import beartype
|
|
from beartype.typing import Optional, List
|
|
|
|
from .logger import Logger
|
|
from .source import SourceLocation, Location
|
|
|
|
logger = Logger(__name__)
|
|
|
|
|
|
@beartype
@dataclass
class Token:
    """A single lexical token produced by :class:`Tokenizer`.

    Only ``kind`` participates in equality and hashing; ``loc`` and
    ``value`` are diagnostic payload and are excluded via
    ``compare=False, hash=False``.
    """

    # The token's category; each Tokens member carries its recognizer regex.
    kind: Tokens
    # Source span of the token.  Annotated Optional because the field
    # defaults to None -- the previous bare ``SourceLocation`` annotation
    # contradicted that default (and @beartype enforces annotations).
    loc: Optional[SourceLocation] = field(compare=False, hash=False, default=None)
    # The exact matched text; None for synthetic tokens such as EOF.
    value: Optional[str] = field(compare=False, hash=False, default=None)

    def __repr__(self):
        """Render as ``KindName('matched text')`` for readable logs."""
        return f"{self.kind.name}({repr(self.value)})"
|
|
|
|
|
|
class Tokens(enum.Enum):
    """Token categories; each member's value is its compiled recognizer regex.

    Declaration order is significant: the tokenizer scans members in this
    order and its ``>=`` length comparison lets a later member win a
    same-length tie (e.g. ``Float`` is listed after ``Integer`` so a
    longer float match takes precedence over its integer prefix).
    ``Unknown`` is a catch-all the tokenizer skips during matching and
    uses only as the "nothing matched" fallback.
    """

    Integer = re.compile(r"[0-9]+")
    # Requires a dot after the integer part; the fractional digits may be empty ("1.").
    Float = re.compile(r"[0-9]+\.[0-9]*")
    Op_Plus = re.compile(r"\+")
    Op_Minus = re.compile(r"-")
    Op_Multiply = re.compile(r"\*")
    Op_Divide = re.compile(r"/")
    Parens_Left = re.compile(r"\(")
    Parens_Right = re.compile(r"\)")
    Brace_Left = re.compile(r"\{")
    Brace_Right = re.compile(r"}")
    Identifier = re.compile(r"[a-zA-Z_][a-zA-Z_0-9]*")
    Equal = re.compile(r"=")
    Semicolon = re.compile(r";")
    # NOTE(review): re.MULTILINE has no effect here -- the pattern contains
    # no ^ or $ anchors; the flag could be dropped without behavior change.
    Newline = re.compile(r"\n", flags=re.MULTILINE)
    # \Z matches only at the very end of the input string.
    EOF = re.compile(r"\Z")
    # Horizontal whitespace only; newlines are tokenized separately above.
    Blank = re.compile(r"[ \t]+")
    # Fallback category; DOTALL so it can swallow anything, including newlines.
    Unknown = re.compile(r".*", flags=re.DOTALL)

    def __bool__(self):
        # NOTE(review): enum members are truthy by default, so this appears
        # redundant; kept as an explicit guarantee that every kind -- even
        # Unknown/EOF -- tests true.
        return True
|
|
|
|
|
|
class Tokenizer:
    """Greedy longest-match tokenizer driven by the ``Tokens`` regex table."""

    def __init__(self):
        pass

    def tokenize(self, data: str) -> List[Token]:
        """Split *data* into a list of :class:`Token`, terminated by an EOF token.

        Tracks (line, character) positions as it scans; ``Newline`` tokens
        reset the character column.  On input that no known pattern matches,
        logs an error with a source hint and exits the process.

        :param data: the complete source text to tokenize.
        :returns: every matched token (including Blank/Newline) plus a
            trailing ``Tokens.EOF`` token.
        :raises SystemExit: when an unknown token is encountered.
        """
        results: List[Token] = []
        line = 0
        character = 0
        begin = 0
        # Loop-invariant: the candidate kinds never change, so build the
        # list once.  Unknown is excluded -- it is only the fallback value.
        candidates = [kind for kind in Tokens if kind is not Tokens.Unknown]
        while begin < len(data):
            # Start each round from the Unknown fallback; any real match
            # replaces it.  The fallback carries source=data so its location
            # can render a source hint in the error path below.
            best_result: Token = Token(
                Tokens.Unknown,
                loc=SourceLocation(Location(line=line, character=character), source=data),
                value="",
            )
            for token_kind in candidates:
                regex: re.Pattern = token_kind.value
                match = regex.match(data, begin)
                if match is None:
                    continue
                logger.trace(f"Got match: {match}")
                result = match.group(0)
                # ``>=`` lets a later enum member win a same-length tie,
                # so declaration order in Tokens is a priority order.
                if best_result.kind == Tokens.Unknown or len(result) >= len(best_result.value):
                    # NOTE(review): unlike the Unknown fallback above, this
                    # location is built without source=data -- confirm that
                    # show_in_source() is never needed for matched tokens.
                    loc = SourceLocation(
                        begin=Location(line=line, character=character),
                        end=Location(line=line, character=character + len(result)),
                    )
                    best_result = Token(token_kind, value=result, loc=loc)
                    logger.trace(f"New best match: {best_result}")

            begin += len(best_result.value)
            character += len(best_result.value)
            if best_result.kind == Tokens.Unknown:
                source_hint = best_result.loc.show_in_source()
                logger.error(f"{best_result.loc}: Unknown token '{best_result.loc.source_substring}'\n{source_hint}")
                # The bare ``exit`` builtin is injected by the ``site`` module
                # and not guaranteed to exist (e.g. under ``python -S``);
                # raising SystemExit directly is exactly what sys.exit(1) does.
                raise SystemExit(1)
            elif best_result.kind == Tokens.Newline:
                line += 1
                character = 0
                # Clamp the newline token's end to the start of the next line.
                best_result.loc.end = Location(line=line, character=0)

            logger.debug(f"Added token {best_result}")
            results.append(best_result)

        # Always terminate the stream with a synthetic EOF token.
        results.append(Token(Tokens.EOF, value=None, loc=SourceLocation(
            Location(line=line, character=0), source=data
        )))
        return results
|