lexer: rename tokenizer to lexer, and convert to sequence

Antoine Viallon 2023-05-14 23:22:38 +02:00
parent 36b1fad7fe
commit bac49d20d7
Signed by: aviallon
GPG key ID: D126B13AB555E16F
5 changed files with 58 additions and 33 deletions
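
In short, the eager Tokenizer.tokenize() pass is replaced by a Lexer that subclasses collections.abc.Sequence and lexes on demand, caching tokens as they are requested. A rough sketch of the new call site (the names are taken from the diff below; the snippet itself is illustrative and not part of the commit):

# Before this commit, main() tokenized eagerly:
#     tokenizer = Tokenizer()
#     tokens = tokenizer.tokenize(data)
#
# After this commit (assumes Lexer and Tokens are imported from the package
# as in the diff below):
tokens = Lexer(data)        # construction just stores the source text
first = tokens[0]           # indexing drives _next_token() and caches tokens
total = len(tokens)         # the length is only known once EOF has been reached
kept = [t for t in tokens if t.kind not in (Tokens.Blank, Tokens.Newline)]
# Iteration works because collections.abc.Sequence derives __iter__ from
# __getitem__ and __len__, which is what main()'s comprehension relies on.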


@@ -8,7 +8,7 @@ from . import semantic
 from .errors import CompilationError
 from .logger import rootLogger, LogLevel
 from .parser import Parser
-from .tokenizer import Tokenizer, Tokens
+from .lexer import Lexer, Tokens
 def main():
@@ -32,8 +32,7 @@ def main():
     data = sys.stdin.read().strip()
     print("Source:\n", data)
-    tokenizer = Tokenizer()
-    tokens = tokenizer.tokenize(data)
+    tokens = Lexer(data)
     tokens = [token for token in tokens if token.kind not in [Tokens.Blank, Tokens.Newline]]
     if rootLogger.level <= LogLevel.Debug:


@@ -2,7 +2,7 @@ from __future__ import annotations
 import inspect
-from . import source, tokenizer
+from . import source, lexer
 class CompilationError(Exception):
@@ -12,9 +12,9 @@ class CompilationError(Exception):
 class UnexpectedTokenError(CompilationError):
-    def __init__(self, got: tokenizer.Token, wanted: tokenizer.Tokens | str):
+    def __init__(self, got: lexer.Token, wanted: lexer.Tokens | str):
         message = wanted
-        if type(wanted) == tokenizer.Tokens:
+        if type(wanted) == lexer.Tokens:
             message = str(wanted)
         super().__init__(got.loc, f"Unexpected token '{got}', wanted {message}")


@@ -1,5 +1,6 @@
 from __future__ import annotations
+import collections.abc
 import enum
 import re
 from dataclasses import dataclass, field
@@ -47,52 +48,76 @@ class Tokens(enum.Enum):
         return True
-class Tokenizer:
-    def __init__(self):
-        pass
+class Lexer(collections.abc.Sequence):
+    def __init__(self, data: str):
+        self.data = data
+        self.tokens = []
+        self.length: int | None = None
+        self.begin: int = 0
+        self.end: int = 0
+        self.character: int = 0
+        self.line: int = 0
-    def tokenize(self, data: str) -> List[Token]:
-        results: List[Token] = []
-        line = 0
-        character = 0
-        begin = 0
-        while begin < len(data):
+    def __getitem__(self, index: int) -> Token:
+        while len(self) <= index + 1 and self.length is None:
+            self._next_token()
+        return self.tokens[index]
+    def __next__(self):
+        return self._next_token()
+    def __len__(self) -> int:
+        while self.length is None:
+            self._next_token()
+        assert self.length is not None
+        return self.length
+    def _next_token(self) -> Token:
+        actual_result: Token
+        if self.begin < len(self.data):
             best_result: Token = Token(Tokens.Unknown,
-                loc=SourceLocation(Location(line=line, character=character), source=data),
+                loc=SourceLocation(Location(line=self.line, character=self.character), source=self.data),
                 value=""
             )
             for token_kind in list(Tokens):
                 if token_kind == Tokens.Unknown:
                     continue
                 regex: re.Pattern = token_kind.value
-                match = regex.match(data, begin)
+                match = regex.match(self.data, self.begin)
                 if match is not None:
                     logger.trace(f"Got match: {match}")
                     result = match.group(0)
                     if best_result.kind == Tokens.Unknown or len(result) >= len(best_result.value):
                         loc = SourceLocation(
-                            begin=Location(line=line, character=character),
-                            end=Location(line=line, character=character + len(result))
+                            begin=Location(line=self.line, character=self.character),
+                            end=Location(line=self.line, character=self.character + len(result))
                         )
                         best_result = Token(token_kind, value=result, loc=loc)
                         logger.trace(f"New best match: {best_result}")
-            begin += len(best_result.value)
-            character += len(best_result.value)
+            self.begin += len(best_result.value)
+            self.character += len(best_result.value)
             if best_result.kind == Tokens.Unknown:
                 source_hint = best_result.loc.show_in_source()
                 logger.error(f"{best_result.loc}: Unknown token '{best_result.loc.source_substring}'\n{source_hint}")
                 exit(1)
             elif best_result.kind == Tokens.Newline:
-                line += 1
-                character = 0
-                best_result.loc.end = Location(line=line, character=0)
+                self.line += 1
+                self.character = 0
+                best_result.loc.end = Location(line=self.line, character=0)
             logger.debug(f"Added token {best_result}")
-            results += [best_result]
-        results += [Token(Tokens.EOF, value=None, loc=SourceLocation(
-            Location(line=line, character=0), source=data
-        ))]
-        return results
+            self.tokens += [best_result]
+            return best_result
+        elif self.begin == len(self.data):
+            eof_token = Token(Tokens.EOF, value=None, loc=SourceLocation(
+                Location(line=self.line, character=0), source=self.data
+            ))
+            self.tokens += [eof_token]
+            self.length = len(self.tokens)
+            return eof_token
+        else:
+            raise IndexError("EOF already reached")
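
For reference, here is a minimal, self-contained sketch of the lazy-sequence pattern the new Lexer follows, written against the standard library only. The names (LazyWords, _next_item) are invented for illustration and are not part of the project. One deliberate difference in this sketch: the index check compares against len(self.items) rather than calling len(self), so indexing does not force the rest of the input to be scanned.

from __future__ import annotations

import collections.abc
import re


class LazyWords(collections.abc.Sequence):
    """Splits data into word tokens, but only as far as callers actually look."""

    _word = re.compile(r"\S+")

    def __init__(self, data: str):
        self.data = data
        self.items: list[str] = []
        self.length: int | None = None   # unknown until the end of input is reached
        self.pos = 0

    def _next_item(self) -> None:
        match = self._word.search(self.data, self.pos)
        if match is None:
            self.length = len(self.items)  # end of input: length is now known
            return
        self.pos = match.end()
        self.items.append(match.group(0))

    def __getitem__(self, index: int) -> str:
        # Pull items until the requested index exists or the input is exhausted.
        while len(self.items) <= index and self.length is None:
            self._next_item()
        return self.items[index]          # raises IndexError past the end, as a Sequence should

    def __len__(self) -> int:
        while self.length is None:        # forces scanning the rest of the input
            self._next_item()
        return self.length


words = LazyWords("rename tokenizer to lexer")
print(words[0])      # "rename" -- only the first word has been scanned so far
print(len(words))    # 4 -- computing the length consumed the remaining input
print(list(words))   # Sequence also provides iteration, "in", reversed(), ...

Subclassing collections.abc.Sequence means __iter__, __contains__, __reversed__, index() and count() all come for free once __getitem__ and __len__ are defined.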


@@ -6,7 +6,7 @@ from typing import Any, Iterable
 from beartype import beartype
-from . import ir, semantic, tokenizer
+from . import ir, semantic, lexer
 from .errors import SemanticAnalysisError, OverrideMandatoryError
 from .logger import Logger
 from .source import SourceLocation
@@ -116,7 +116,7 @@ class PseudoNode(Literal):
     Only used for better diagnostics
     """
-    def __init__(self, token: tokenizer.Token):
+    def __init__(self, token: lexer.Token):
         super().__init__(token.loc, token.value)
         self.token = token


@@ -6,11 +6,12 @@ from .errors import CompilationError, UnexpectedTokenError
 from .logger import Logger, Tracer, LogLevel
 from .nodes import Float, Sum, Value, Product, Node, Division, Sub, Integer, Expression, Identifier, Assignment, \
     Variable, Statement, PseudoNode, Block
-from .tokenizer import Tokens, Token
+from .lexer import Tokens, Token
 logger = Logger(__name__)
 tracer = Tracer(logger, level=LogLevel.Debug)
 class Parser:
     def __init__(self, tokens: List[Token]):
         self.tokens = tokens
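
The Parser still annotates its tokens parameter as List[Token] and keeps receiving the plain list that main() builds with its comprehension, so nothing in this commit requires the parser to change. A hypothetical follow-up (not part of this commit) would be to widen the annotation so that any sequence of tokens, including a Lexer, is accepted directly; sketched below with a stand-in Token so the snippet is self-contained:

from __future__ import annotations

from collections.abc import Sequence
from dataclasses import dataclass


@dataclass
class Token:                     # stand-in for the project's Token; illustrative only
    kind: str
    value: str


class Parser:                    # signature sketch only, not the project's Parser
    def __init__(self, tokens: Sequence[Token]):
        self.tokens = tokens


Parser([Token("Integer", "1"), Token("Plus", "+")])   # a plain list still satisfies Sequence[Token]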