lexer: rename tokenizer to lexer, and convert to sequence
This commit is contained in:
parent
36b1fad7fe
commit
bac49d20d7
5 changed files with 58 additions and 33 deletions
|
|
@@ -8,7 +8,7 @@ from . import semantic
|
||||||
from .errors import CompilationError
|
from .errors import CompilationError
|
||||||
from .logger import rootLogger, LogLevel
|
from .logger import rootLogger, LogLevel
|
||||||
from .parser import Parser
|
from .parser import Parser
|
||||||
from .tokenizer import Tokenizer, Tokens
|
from .lexer import Lexer, Tokens
|
||||||
|
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
|
|
@@ -32,8 +32,7 @@ def main():
|
||||||
data = sys.stdin.read().strip()
|
data = sys.stdin.read().strip()
|
||||||
|
|
||||||
print("Source:\n", data)
|
print("Source:\n", data)
|
||||||
tokenizer = Tokenizer()
|
tokens = Lexer(data)
|
||||||
tokens = tokenizer.tokenize(data)
|
|
||||||
|
|
||||||
tokens = [token for token in tokens if token.kind not in [Tokens.Blank, Tokens.Newline]]
|
tokens = [token for token in tokens if token.kind not in [Tokens.Blank, Tokens.Newline]]
|
||||||
if rootLogger.level <= LogLevel.Debug:
|
if rootLogger.level <= LogLevel.Debug:
|
||||||
|
|
|
||||||
|
|
@@ -2,7 +2,7 @@ from __future__ import annotations
|
||||||
|
|
||||||
import inspect
|
import inspect
|
||||||
|
|
||||||
from . import source, tokenizer
|
from . import source, lexer
|
||||||
|
|
||||||
|
|
||||||
class CompilationError(Exception):
|
class CompilationError(Exception):
|
||||||
|
|
@@ -12,9 +12,9 @@ class CompilationError(Exception):
|
||||||
|
|
||||||
|
|
||||||
class UnexpectedTokenError(CompilationError):
|
class UnexpectedTokenError(CompilationError):
|
||||||
def __init__(self, got: tokenizer.Token, wanted: tokenizer.Tokens | str):
|
def __init__(self, got: lexer.Token, wanted: lexer.Tokens | str):
|
||||||
message = wanted
|
message = wanted
|
||||||
if type(wanted) == tokenizer.Tokens:
|
if type(wanted) == lexer.Tokens:
|
||||||
message = str(wanted)
|
message = str(wanted)
|
||||||
super().__init__(got.loc, f"Unexpected token '{got}', wanted {message}")
|
super().__init__(got.loc, f"Unexpected token '{got}', wanted {message}")
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@@ -1,5 +1,6 @@
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import collections.abc
|
||||||
import enum
|
import enum
|
||||||
import re
|
import re
|
||||||
from dataclasses import dataclass, field
|
from dataclasses import dataclass, field
|
||||||
|
|
@@ -47,52 +48,76 @@ class Tokens(enum.Enum):
|
||||||
return True
|
return True
|
||||||
|
|
||||||
|
|
||||||
class Tokenizer:
|
class Lexer(collections.abc.Sequence):
|
||||||
def __init__(self):
|
def __init__(self, data: str):
|
||||||
pass
|
self.data = data
|
||||||
|
self.tokens = []
|
||||||
|
self.length: int | None = None
|
||||||
|
self.begin: int = 0
|
||||||
|
self.end: int = 0
|
||||||
|
self.character: int = 0
|
||||||
|
self.line: int = 0
|
||||||
|
|
||||||
def tokenize(self, data: str) -> List[Token]:
|
def __getitem__(self, index: int) -> Token:
|
||||||
results: List[Token] = []
|
while len(self) <= index + 1 and self.length is None:
|
||||||
line = 0
|
self._next_token()
|
||||||
character = 0
|
return self.tokens[index]
|
||||||
begin = 0
|
|
||||||
while begin < len(data):
|
def __next__(self):
|
||||||
|
return self._next_token()
|
||||||
|
|
||||||
|
def __len__(self) -> int:
|
||||||
|
while self.length is None:
|
||||||
|
self._next_token()
|
||||||
|
|
||||||
|
assert self.length is not None
|
||||||
|
|
||||||
|
return self.length
|
||||||
|
|
||||||
|
def _next_token(self) -> Token:
|
||||||
|
actual_result: Token
|
||||||
|
if self.begin < len(self.data):
|
||||||
best_result: Token = Token(Tokens.Unknown,
|
best_result: Token = Token(Tokens.Unknown,
|
||||||
loc=SourceLocation(Location(line=line, character=character), source=data),
|
loc=SourceLocation(Location(line=self.line, character=self.character), source=self.data),
|
||||||
value=""
|
value=""
|
||||||
)
|
)
|
||||||
for token_kind in list(Tokens):
|
for token_kind in list(Tokens):
|
||||||
if token_kind == Tokens.Unknown:
|
if token_kind == Tokens.Unknown:
|
||||||
continue
|
continue
|
||||||
regex: re.Pattern = token_kind.value
|
regex: re.Pattern = token_kind.value
|
||||||
match = regex.match(data, begin)
|
match = regex.match(self.data, self.begin)
|
||||||
if match is not None:
|
if match is not None:
|
||||||
logger.trace(f"Got match: {match}")
|
logger.trace(f"Got match: {match}")
|
||||||
result = match.group(0)
|
result = match.group(0)
|
||||||
if best_result.kind == Tokens.Unknown or len(result) >= len(best_result.value):
|
if best_result.kind == Tokens.Unknown or len(result) >= len(best_result.value):
|
||||||
loc = SourceLocation(
|
loc = SourceLocation(
|
||||||
begin=Location(line=line, character=character),
|
begin=Location(line=self.line, character=self.character),
|
||||||
end=Location(line=line, character=character + len(result))
|
end=Location(line=self.line, character=self.character + len(result))
|
||||||
)
|
)
|
||||||
best_result = Token(token_kind, value=result, loc=loc)
|
best_result = Token(token_kind, value=result, loc=loc)
|
||||||
logger.trace(f"New best match: {best_result}")
|
logger.trace(f"New best match: {best_result}")
|
||||||
|
|
||||||
begin += len(best_result.value)
|
self.begin += len(best_result.value)
|
||||||
character += len(best_result.value)
|
self.character += len(best_result.value)
|
||||||
if best_result.kind == Tokens.Unknown:
|
if best_result.kind == Tokens.Unknown:
|
||||||
source_hint = best_result.loc.show_in_source()
|
source_hint = best_result.loc.show_in_source()
|
||||||
logger.error(f"{best_result.loc}: Unknown token '{best_result.loc.source_substring}'\n{source_hint}")
|
logger.error(f"{best_result.loc}: Unknown token '{best_result.loc.source_substring}'\n{source_hint}")
|
||||||
exit(1)
|
exit(1)
|
||||||
elif best_result.kind == Tokens.Newline:
|
elif best_result.kind == Tokens.Newline:
|
||||||
line += 1
|
self.line += 1
|
||||||
character = 0
|
self.character = 0
|
||||||
best_result.loc.end = Location(line=line, character=0)
|
best_result.loc.end = Location(line=self.line, character=0)
|
||||||
|
|
||||||
logger.debug(f"Added token {best_result}")
|
logger.debug(f"Added token {best_result}")
|
||||||
|
|
||||||
results += [best_result]
|
self.tokens += [best_result]
|
||||||
|
return best_result
|
||||||
results += [Token(Tokens.EOF, value=None, loc=SourceLocation(
|
elif self.begin == len(self.data):
|
||||||
Location(line=line, character=0), source=data
|
eof_token = Token(Tokens.EOF, value=None, loc=SourceLocation(
|
||||||
))]
|
Location(line=self.line, character=0), source=self.data
|
||||||
return results
|
))
|
||||||
|
self.tokens += [eof_token]
|
||||||
|
self.length = len(self.tokens)
|
||||||
|
return eof_token
|
||||||
|
else:
|
||||||
|
raise IndexError("EOF already reached")
|
||||||
|
|
@@ -6,7 +6,7 @@ from typing import Any, Iterable
|
||||||
|
|
||||||
from beartype import beartype
|
from beartype import beartype
|
||||||
|
|
||||||
from . import ir, semantic, tokenizer
|
from . import ir, semantic, lexer
|
||||||
from .errors import SemanticAnalysisError, OverrideMandatoryError
|
from .errors import SemanticAnalysisError, OverrideMandatoryError
|
||||||
from .logger import Logger
|
from .logger import Logger
|
||||||
from .source import SourceLocation
|
from .source import SourceLocation
|
||||||
|
|
@@ -116,7 +116,7 @@ class PseudoNode(Literal):
|
||||||
Only used for better diagnostics
|
Only used for better diagnostics
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, token: tokenizer.Token):
|
def __init__(self, token: lexer.Token):
|
||||||
super().__init__(token.loc, token.value)
|
super().__init__(token.loc, token.value)
|
||||||
self.token = token
|
self.token = token
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@@ -6,11 +6,12 @@ from .errors import CompilationError, UnexpectedTokenError
|
||||||
from .logger import Logger, Tracer, LogLevel
|
from .logger import Logger, Tracer, LogLevel
|
||||||
from .nodes import Float, Sum, Value, Product, Node, Division, Sub, Integer, Expression, Identifier, Assignment, \
|
from .nodes import Float, Sum, Value, Product, Node, Division, Sub, Integer, Expression, Identifier, Assignment, \
|
||||||
Variable, Statement, PseudoNode, Block
|
Variable, Statement, PseudoNode, Block
|
||||||
from .tokenizer import Tokens, Token
|
from .lexer import Tokens, Token
|
||||||
|
|
||||||
logger = Logger(__name__)
|
logger = Logger(__name__)
|
||||||
tracer = Tracer(logger, level=LogLevel.Debug)
|
tracer = Tracer(logger, level=LogLevel.Debug)
|
||||||
|
|
||||||
|
|
||||||
class Parser:
|
class Parser:
|
||||||
def __init__(self, tokens: List[Token]):
|
def __init__(self, tokens: List[Token]):
|
||||||
self.tokens = tokens
|
self.tokens = tokens
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue