lexer: rename tokenizer to lexer, and convert to sequence
parent 36b1fad7fe
commit bac49d20d7

5 changed files with 58 additions and 33 deletions
@@ -8,7 +8,7 @@ from . import semantic
 from .errors import CompilationError
 from .logger import rootLogger, LogLevel
 from .parser import Parser
-from .tokenizer import Tokenizer, Tokens
+from .lexer import Lexer, Tokens


 def main():
@@ -32,8 +32,7 @@ def main():
     data = sys.stdin.read().strip()

     print("Source:\n", data)
-    tokenizer = Tokenizer()
-    tokens = tokenizer.tokenize(data)
+    tokens = Lexer(data)

     tokens = [token for token in tokens if token.kind not in [Tokens.Blank, Tokens.Newline]]
     if rootLogger.level <= LogLevel.Debug:
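For readers skimming the call-site change, a hedged before/after sketch (it mirrors the hunk above; the surrounding `main()` scaffolding is omitted and `data` is assumed to already hold the source text):

```python
# Before: eager, two-step API
# tokenizer = Tokenizer()
# tokens = tokenizer.tokenize(data)   # full token list built up front

# After: a single object implementing the sequence protocol
tokens = Lexer(data)   # constructing the Lexer does not consume any input yet
# Iterating the Lexer goes through __getitem__, which drives _next_token() internally.
tokens = [t for t in tokens if t.kind not in [Tokens.Blank, Tokens.Newline]]
```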
@@ -2,7 +2,7 @@ from __future__ import annotations

 import inspect

-from . import source, tokenizer
+from . import source, lexer


 class CompilationError(Exception):
@@ -12,9 +12,9 @@ class CompilationError(Exception):


 class UnexpectedTokenError(CompilationError):
-    def __init__(self, got: tokenizer.Token, wanted: tokenizer.Tokens | str):
+    def __init__(self, got: lexer.Token, wanted: lexer.Tokens | str):
         message = wanted
-        if type(wanted) == tokenizer.Tokens:
+        if type(wanted) == lexer.Tokens:
             message = str(wanted)
         super().__init__(got.loc, f"Unexpected token '{got}', wanted {message}")

@@ -1,5 +1,6 @@
 from __future__ import annotations

+import collections.abc
 import enum
 import re
 from dataclasses import dataclass, field
@@ -47,52 +48,76 @@ class Tokens(enum.Enum):
         return True


-class Tokenizer:
-    def __init__(self):
-        pass
+class Lexer(collections.abc.Sequence):
+    def __init__(self, data: str):
+        self.data = data
+        self.tokens = []
+        self.length: int | None = None
+        self.begin: int = 0
+        self.end: int = 0
+        self.character: int = 0
+        self.line: int = 0

-    def tokenize(self, data: str) -> List[Token]:
-        results: List[Token] = []
-        line = 0
-        character = 0
-        begin = 0
-        while begin < len(data):
+    def __getitem__(self, index: int) -> Token:
+        while len(self) <= index + 1 and self.length is None:
+            self._next_token()
+        return self.tokens[index]
+
+    def __next__(self):
+        return self._next_token()
+
+    def __len__(self) -> int:
+        while self.length is None:
+            self._next_token()
+
+        assert self.length is not None
+
+        return self.length
+
+    def _next_token(self) -> Token:
+        actual_result: Token
+        if self.begin < len(self.data):
             best_result: Token = Token(Tokens.Unknown,
-                loc=SourceLocation(Location(line=line, character=character), source=data),
+                loc=SourceLocation(Location(line=self.line, character=self.character), source=self.data),
                 value=""
             )
             for token_kind in list(Tokens):
                 if token_kind == Tokens.Unknown:
                     continue
                 regex: re.Pattern = token_kind.value
-                match = regex.match(data, begin)
+                match = regex.match(self.data, self.begin)
                 if match is not None:
                     logger.trace(f"Got match: {match}")
                     result = match.group(0)
                     if best_result.kind == Tokens.Unknown or len(result) >= len(best_result.value):
                         loc = SourceLocation(
-                            begin=Location(line=line, character=character),
-                            end=Location(line=line, character=character + len(result))
+                            begin=Location(line=self.line, character=self.character),
+                            end=Location(line=self.line, character=self.character + len(result))
                         )
                         best_result = Token(token_kind, value=result, loc=loc)
                         logger.trace(f"New best match: {best_result}")

-            begin += len(best_result.value)
-            character += len(best_result.value)
+            self.begin += len(best_result.value)
+            self.character += len(best_result.value)
             if best_result.kind == Tokens.Unknown:
                 source_hint = best_result.loc.show_in_source()
                 logger.error(f"{best_result.loc}: Unknown token '{best_result.loc.source_substring}'\n{source_hint}")
                 exit(1)
             elif best_result.kind == Tokens.Newline:
-                line += 1
-                character = 0
-                best_result.loc.end = Location(line=line, character=0)
+                self.line += 1
+                self.character = 0
+                best_result.loc.end = Location(line=self.line, character=0)

             logger.debug(f"Added token {best_result}")

-            results += [best_result]
-
-        results += [Token(Tokens.EOF, value=None, loc=SourceLocation(
-            Location(line=line, character=0), source=data
-        ))]
-        return results
+            self.tokens += [best_result]
+            return best_result
+        elif self.begin == len(self.data):
+            eof_token = Token(Tokens.EOF, value=None, loc=SourceLocation(
+                Location(line=self.line, character=0), source=self.data
+            ))
+            self.tokens += [eof_token]
+            self.length = len(self.tokens)
+            return eof_token
+        else:
+            raise IndexError("EOF already reached")
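A short usage sketch of the sequence behaviour introduced above, based only on the methods shown in this hunk (the sample input string is an assumption, and the snippet presumes `Lexer` and `Tokens` are imported from the package's `lexer` module):

```python
lexer = Lexer("x = 1\n")        # hypothetical input; nothing is lexed at construction

# __len__ keeps calling _next_token() until the EOF token has been emitted and
# self.length is set, so asking for the length lexes the remaining input.
total = len(lexer)

# __getitem__ then serves tokens from the cached self.tokens list.
first = lexer[0]
last = lexer[total - 1]
assert last.kind == Tokens.EOF  # the final token is always EOF

# collections.abc.Sequence supplies __iter__, __contains__, index() and count()
# for free on top of __getitem__ and __len__.
for token in lexer:
    print(token.kind, token.value)
```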
@@ -6,7 +6,7 @@ from typing import Any, Iterable

 from beartype import beartype

-from . import ir, semantic, tokenizer
+from . import ir, semantic, lexer
 from .errors import SemanticAnalysisError, OverrideMandatoryError
 from .logger import Logger
 from .source import SourceLocation
@@ -116,7 +116,7 @@ class PseudoNode(Literal):
     Only used for better diagnostics
     """

-    def __init__(self, token: tokenizer.Token):
+    def __init__(self, token: lexer.Token):
         super().__init__(token.loc, token.value)
         self.token = token

@@ -6,11 +6,12 @@ from .errors import CompilationError, UnexpectedTokenError
 from .logger import Logger, Tracer, LogLevel
 from .nodes import Float, Sum, Value, Product, Node, Division, Sub, Integer, Expression, Identifier, Assignment, \
     Variable, Statement, PseudoNode, Block
-from .tokenizer import Tokens, Token
+from .lexer import Tokens, Token

 logger = Logger(__name__)
 tracer = Tracer(logger, level=LogLevel.Debug)


 class Parser:
     def __init__(self, tokens: List[Token]):
         self.tokens = tokens
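Since `Parser.__init__` still expects a `List[Token]`, the end-to-end pipeline after this commit can be sketched as follows (hedged: the input string is invented, and no `Parser` methods beyond the constructor shown above are assumed):

```python
# assuming: from .lexer import Lexer, Tokens
#           from .parser import Parser
source = "a = 1 + 2\n"   # hypothetical input

# The Lexer is iterated via the Sequence protocol; the comprehension
# materialises the filtered token list that Parser receives.
tokens = [t for t in Lexer(source) if t.kind not in [Tokens.Blank, Tokens.Newline]]
parser = Parser(tokens)
```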