From bac49d20d76c6433fbb85aac35a5d8ecb451ee96 Mon Sep 17 00:00:00 2001
From: Antoine Viallon
Date: Sun, 14 May 2023 23:22:38 +0200
Subject: [PATCH] lexer: rename tokenizer to lexer, and convert to sequence

---
 compiler/__main__.py                |  5 +-
 compiler/errors.py                  |  6 +--
 compiler/{tokenizer.py => lexer.py} | 73 +++++++++++++++++++----------
 compiler/nodes.py                   |  4 +-
 compiler/parser.py                  |  3 +-
 5 files changed, 58 insertions(+), 33 deletions(-)
 rename compiler/{tokenizer.py => lexer.py} (57%)

diff --git a/compiler/__main__.py b/compiler/__main__.py
index 4049dc0..bf52687 100644
--- a/compiler/__main__.py
+++ b/compiler/__main__.py
@@ -8,7 +8,7 @@ from . import semantic
 from .errors import CompilationError
 from .logger import rootLogger, LogLevel
 from .parser import Parser
-from .tokenizer import Tokenizer, Tokens
+from .lexer import Lexer, Tokens
 
 
 def main():
@@ -32,8 +32,7 @@ def main():
     data = sys.stdin.read().strip()
     print("Source:\n", data)
 
-    tokenizer = Tokenizer()
-    tokens = tokenizer.tokenize(data)
+    tokens = Lexer(data)
     tokens = [token for token in tokens if token.kind not in [Tokens.Blank, Tokens.Newline]]
 
     if rootLogger.level <= LogLevel.Debug:
diff --git a/compiler/errors.py b/compiler/errors.py
index 577ad07..a98a2da 100644
--- a/compiler/errors.py
+++ b/compiler/errors.py
@@ -2,7 +2,7 @@ from __future__ import annotations
 
 import inspect
 
-from . import source, tokenizer
+from . import source, lexer
 
 
 class CompilationError(Exception):
@@ -12,9 +12,9 @@ class CompilationError(Exception):
 
 
 class UnexpectedTokenError(CompilationError):
-    def __init__(self, got: tokenizer.Token, wanted: tokenizer.Tokens | str):
+    def __init__(self, got: lexer.Token, wanted: lexer.Tokens | str):
         message = wanted
-        if type(wanted) == tokenizer.Tokens:
+        if type(wanted) == lexer.Tokens:
             message = str(wanted)
         super().__init__(got.loc, f"Unexpected token '{got}', wanted {message}")
diff --git a/compiler/tokenizer.py b/compiler/lexer.py
similarity index 57%
rename from compiler/tokenizer.py
rename to compiler/lexer.py
index 1c0727c..87fa1d3 100644
--- a/compiler/tokenizer.py
+++ b/compiler/lexer.py
@@ -1,5 +1,6 @@
 from __future__ import annotations
 
+import collections.abc
 import enum
 import re
 from dataclasses import dataclass, field
@@ -47,52 +48,76 @@ class Tokens(enum.Enum):
         return True
 
 
-class Tokenizer:
-    def __init__(self):
-        pass
+class Lexer(collections.abc.Sequence):
+    def __init__(self, data: str):
+        self.data = data
+        self.tokens: list[Token] = []
+        self.length: int | None = None
+        self.begin: int = 0
+        self.end: int = 0
+        self.character: int = 0
+        self.line: int = 0
 
-    def tokenize(self, data: str) -> List[Token]:
-        results: List[Token] = []
-        line = 0
-        character = 0
-        begin = 0
-        while begin < len(data):
+    def __getitem__(self, index: int) -> Token:
+        while len(self.tokens) <= index and self.length is None:  # lex only as far as needed
+            self._next_token()
+        return self.tokens[index]
+
+    def __next__(self):
+        return self._next_token()
+
+    def __len__(self) -> int:
+        while self.length is None:
+            self._next_token()
+
+        assert self.length is not None
+
+        return self.length
+
+    def _next_token(self) -> Token:
+        # Lex a single token starting at self.begin
+        if self.begin < len(self.data):
             best_result: Token = Token(Tokens.Unknown,
-                loc=SourceLocation(Location(line=line, character=character), source=data),
+                loc=SourceLocation(Location(line=self.line, character=self.character), source=self.data),
                 value=""
             )
             for token_kind in list(Tokens):
                 if token_kind == Tokens.Unknown:
                     continue
                 regex: re.Pattern = token_kind.value
-                match = regex.match(data, begin)
+                match = regex.match(self.data, self.begin)
                 if match is not None:
                     logger.trace(f"Got match: {match}")
                     result = match.group(0)
                     if best_result.kind == Tokens.Unknown or len(result) >= len(best_result.value):
                         loc = SourceLocation(
-                            begin=Location(line=line, character=character),
-                            end=Location(line=line, character=character + len(result))
+                            begin=Location(line=self.line, character=self.character),
+                            end=Location(line=self.line, character=self.character + len(result))
                         )
                         best_result = Token(token_kind, value=result, loc=loc)
                         logger.trace(f"New best match: {best_result}")
-            begin += len(best_result.value)
-            character += len(best_result.value)
+            self.begin += len(best_result.value)
+            self.character += len(best_result.value)
             if best_result.kind == Tokens.Unknown:
                 source_hint = best_result.loc.show_in_source()
                 logger.error(f"{best_result.loc}: Unknown token '{best_result.loc.source_substring}'\n{source_hint}")
                 exit(1)
             elif best_result.kind == Tokens.Newline:
-                line += 1
-                character = 0
-                best_result.loc.end = Location(line=line, character=0)
+                self.line += 1
+                self.character = 0
+                best_result.loc.end = Location(line=self.line, character=0)
             logger.debug(f"Added token {best_result}")
-            results += [best_result]
-
-        results += [Token(Tokens.EOF, value=None, loc=SourceLocation(
-            Location(line=line, character=0), source=data
-        ))]
-        return results
+            self.tokens += [best_result]
+            return best_result
+        elif self.length is None:  # EOF token not yet emitted
+            eof_token = Token(Tokens.EOF, value=None, loc=SourceLocation(
+                Location(line=self.line, character=0), source=self.data
+            ))
+            self.tokens += [eof_token]
+            self.length = len(self.tokens)
+            return eof_token
+        else:
+            raise IndexError("EOF already reached")
diff --git a/compiler/nodes.py b/compiler/nodes.py
index ddf2aac..090cecf 100644
--- a/compiler/nodes.py
+++ b/compiler/nodes.py
@@ -6,7 +6,7 @@ from typing import Any, Iterable
 
 from beartype import beartype
 
-from . import ir, semantic, tokenizer
+from . import ir, semantic, lexer
 from .errors import SemanticAnalysisError, OverrideMandatoryError
 from .logger import Logger
 from .source import SourceLocation
@@ -116,7 +116,7 @@ class PseudoNode(Literal):
     Only used for better diagnostics
     """
 
-    def __init__(self, token: tokenizer.Token):
+    def __init__(self, token: lexer.Token):
         super().__init__(token.loc, token.value)
         self.token = token
 
diff --git a/compiler/parser.py b/compiler/parser.py
index 411ea27..ccafa09 100644
--- a/compiler/parser.py
+++ b/compiler/parser.py
@@ -6,11 +6,12 @@ from .errors import CompilationError, UnexpectedTokenError
 from .logger import Logger, Tracer, LogLevel
 from .nodes import Float, Sum, Value, Product, Node, Division, Sub, Integer, Expression, Identifier, Assignment, \
     Variable, Statement, PseudoNode, Block
-from .tokenizer import Tokens, Token
+from .lexer import Tokens, Token
 
 logger = Logger(__name__)
 tracer = Tracer(logger, level=LogLevel.Debug)
 
+
 class Parser:
     def __init__(self, tokens: List[Token]):
         self.tokens = tokens
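
Reviewer note, not part of the patch: below is a minimal, self-contained
sketch of the lazy-Sequence pattern the new Lexer follows. A toy whitespace
splitter stands in for the real token regexes, and the names (LazyLexer,
_next_token, "<EOF>") are illustrative only.

    import collections.abc

    class LazyLexer(collections.abc.Sequence):
        """Toy stand-in for compiler.lexer.Lexer: produces one
        whitespace-separated word per _next_token() call."""

        def __init__(self, data: str):
            self.words = data.split()          # stand-in for the regex scan
            self.tokens: list[str] = []
            self.length: int | None = None     # unknown until EOF is reached

        def _next_token(self) -> str:
            if len(self.tokens) < len(self.words):
                self.tokens.append(self.words[len(self.tokens)])
            elif self.length is None:
                self.tokens.append("<EOF>")    # emit EOF exactly once
                self.length = len(self.tokens)
            else:
                raise IndexError("EOF already reached")
            return self.tokens[-1]

        def __getitem__(self, index: int) -> str:
            # Lex only as far as the requested index; calling len(self)
            # here would force a full pass and defeat the laziness.
            while len(self.tokens) <= index and self.length is None:
                self._next_token()
            return self.tokens[index]          # may raise IndexError

        def __len__(self) -> int:
            while self.length is None:         # first len() lexes to EOF
                self._next_token()
            return self.length

    lexer = LazyLexer("let x = 1")
    print(lexer[0])    # 'let' -- only one token lexed so far
    print(len(lexer))  # 5 -- forces the rest, '<EOF>' included
    print(list(lexer)) # iteration comes free from the Sequence mixin

The design point: the token count is unknown until EOF, so __getitem__ must
test len(self.tokens) rather than len(self); __len__ is the one operation
that deliberately lexes the whole input. Iteration and membership tests are
supplied by the collections.abc.Sequence mixins on top of these two methods.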