lexer: rename tokenizer to lexer, and convert to sequence

This commit is contained in:
Antoine Viallon 2023-05-14 23:22:38 +02:00
parent 36b1fad7fe
commit bac49d20d7
Signed by: aviallon
GPG key ID: D126B13AB555E16F
5 changed files with 58 additions and 33 deletions
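
In short, the eager two-step call Tokenizer() followed by tokenize(data) is replaced by a Lexer object that subclasses collections.abc.Sequence and builds tokens on demand. A minimal sketch of the new call pattern, taken from the updated main() shown below (the sample input string is made up):

    from .lexer import Lexer, Tokens

    data = "x = 1 + 2"    # hypothetical input program
    tokens = Lexer(data)  # no separate tokenize() step; tokens are produced lazily
    # Indexing and iteration go through Sequence.__getitem__, so filtering works as before:
    tokens = [token for token in tokens if token.kind not in [Tokens.Blank, Tokens.Newline]]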

View file

@@ -8,7 +8,7 @@ from . import semantic
 from .errors import CompilationError
 from .logger import rootLogger, LogLevel
 from .parser import Parser
-from .tokenizer import Tokenizer, Tokens
+from .lexer import Lexer, Tokens
 def main():
@@ -32,8 +32,7 @@ def main():
     data = sys.stdin.read().strip()
     print("Source:\n", data)
-    tokenizer = Tokenizer()
-    tokens = tokenizer.tokenize(data)
+    tokens = Lexer(data)
     tokens = [token for token in tokens if token.kind not in [Tokens.Blank, Tokens.Newline]]
     if rootLogger.level <= LogLevel.Debug:

View file

@@ -2,7 +2,7 @@ from __future__ import annotations
 import inspect
-from . import source, tokenizer
+from . import source, lexer
 class CompilationError(Exception):
@@ -12,9 +12,9 @@ class CompilationError(Exception):
 class UnexpectedTokenError(CompilationError):
-    def __init__(self, got: tokenizer.Token, wanted: tokenizer.Tokens | str):
+    def __init__(self, got: lexer.Token, wanted: lexer.Tokens | str):
         message = wanted
-        if type(wanted) == tokenizer.Tokens:
+        if type(wanted) == lexer.Tokens:
             message = str(wanted)
         super().__init__(got.loc, f"Unexpected token '{got}', wanted {message}")

View file

@@ -1,5 +1,6 @@
 from __future__ import annotations
+import collections.abc
 import enum
 import re
 from dataclasses import dataclass, field
@@ -47,52 +48,76 @@ class Tokens(enum.Enum):
         return True
-class Tokenizer:
-    def __init__(self):
-        pass
-    def tokenize(self, data: str) -> List[Token]:
-        results: List[Token] = []
-        line = 0
-        character = 0
-        begin = 0
-        while begin < len(data):
+class Lexer(collections.abc.Sequence):
+    def __init__(self, data: str):
+        self.data = data
+        self.tokens = []
+        self.length: int | None = None
+        self.begin: int = 0
+        self.end: int = 0
+        self.character: int = 0
+        self.line: int = 0
+    def __getitem__(self, index: int) -> Token:
+        while len(self) <= index + 1 and self.length is None:
+            self._next_token()
+        return self.tokens[index]
+    def __next__(self):
+        return self._next_token()
+    def __len__(self) -> int:
+        while self.length is None:
+            self._next_token()
+        assert self.length is not None
+        return self.length
+    def _next_token(self) -> Token:
+        actual_result: Token
+        if self.begin < len(self.data):
             best_result: Token = Token(Tokens.Unknown,
-                loc=SourceLocation(Location(line=line, character=character), source=data),
+                loc=SourceLocation(Location(line=self.line, character=self.character), source=self.data),
                 value=""
             )
             for token_kind in list(Tokens):
                 if token_kind == Tokens.Unknown:
                     continue
                 regex: re.Pattern = token_kind.value
-                match = regex.match(data, begin)
+                match = regex.match(self.data, self.begin)
                 if match is not None:
                     logger.trace(f"Got match: {match}")
                     result = match.group(0)
                     if best_result.kind == Tokens.Unknown or len(result) >= len(best_result.value):
                         loc = SourceLocation(
-                            begin=Location(line=line, character=character),
-                            end=Location(line=line, character=character + len(result))
+                            begin=Location(line=self.line, character=self.character),
+                            end=Location(line=self.line, character=self.character + len(result))
                         )
                         best_result = Token(token_kind, value=result, loc=loc)
                         logger.trace(f"New best match: {best_result}")
-            begin += len(best_result.value)
-            character += len(best_result.value)
+            self.begin += len(best_result.value)
+            self.character += len(best_result.value)
             if best_result.kind == Tokens.Unknown:
                 source_hint = best_result.loc.show_in_source()
                 logger.error(f"{best_result.loc}: Unknown token '{best_result.loc.source_substring}'\n{source_hint}")
                 exit(1)
             elif best_result.kind == Tokens.Newline:
-                line += 1
-                character = 0
-                best_result.loc.end = Location(line=line, character=0)
+                self.line += 1
+                self.character = 0
+                best_result.loc.end = Location(line=self.line, character=0)
             logger.debug(f"Added token {best_result}")
-            results += [best_result]
-        results += [Token(Tokens.EOF, value=None, loc=SourceLocation(
-            Location(line=line, character=0), source=data
-        ))]
-        return results
+            self.tokens += [best_result]
+            return best_result
+        elif self.begin == len(self.data):
+            eof_token = Token(Tokens.EOF, value=None, loc=SourceLocation(
+                Location(line=self.line, character=0), source=self.data
+            ))
+            self.tokens += [eof_token]
+            self.length = len(self.tokens)
+            return eof_token
+        else:
+            raise IndexError("EOF already reached")
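
Design note: collections.abc.Sequence only requires __getitem__ and __len__; given those two methods it supplies __iter__, __contains__, __reversed__, index() and count() as mixins, which is what lets the Lexer above be iterated and filtered like the old token list, with tokens built by _next_token() instead of an up-front tokenize() pass. A standalone sketch of that lazy-Sequence pattern, using illustrative names that are not from this repository:

    import collections.abc

    class LazyWords(collections.abc.Sequence):
        """Splits text into words only on first access, mirroring how
        Lexer materialises tokens via _next_token()."""

        def __init__(self, text: str):
            self._text = text
            self._words = None  # filled in lazily

        def _materialise(self) -> None:
            if self._words is None:
                self._words = self._text.split()

        def __getitem__(self, index: int) -> str:
            self._materialise()
            return self._words[index]  # IndexError past the end stops iteration

        def __len__(self) -> int:
            self._materialise()
            return len(self._words)

    words = LazyWords("rename tokenizer to lexer")
    assert words[0] == "rename"  # first access triggers the split
    assert "lexer" in words      # __contains__ comes from the Sequence mixin
    assert len(words) == 4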

View file

@@ -6,7 +6,7 @@ from typing import Any, Iterable
 from beartype import beartype
-from . import ir, semantic, tokenizer
+from . import ir, semantic, lexer
 from .errors import SemanticAnalysisError, OverrideMandatoryError
 from .logger import Logger
 from .source import SourceLocation
@@ -116,7 +116,7 @@ class PseudoNode(Literal):
     Only used for better diagnostics
     """
-    def __init__(self, token: tokenizer.Token):
+    def __init__(self, token: lexer.Token):
         super().__init__(token.loc, token.value)
         self.token = token

View file

@@ -6,11 +6,12 @@ from .errors import CompilationError, UnexpectedTokenError
 from .logger import Logger, Tracer, LogLevel
 from .nodes import Float, Sum, Value, Product, Node, Division, Sub, Integer, Expression, Identifier, Assignment, \
     Variable, Statement, PseudoNode, Block
-from .tokenizer import Tokens, Token
+from .lexer import Tokens, Token
 logger = Logger(__name__)
 tracer = Tracer(logger, level=LogLevel.Debug)
 class Parser:
     def __init__(self, tokens: List[Token]):
         self.tokens = tokens