From fc9b6b30c688b3fb72aa7792ac3302822ae5c822 Mon Sep 17 00:00:00 2001
From: Antoine Viallon
Date: Mon, 8 May 2023 00:50:42 +0200
Subject: [PATCH] parser: fully rewrite parsing

Use a simpler and more direct recursive descent method.
---
 compiler/__main__.py  |  74 +----------------------
 compiler/nodes.py     |  34 +++++++-
 compiler/parser.py    | 106 +++++++++++++++++++++++++
 compiler/rules.py     | 180 ------------------------------------------
 compiler/tokenizer.py |   9 ++-
 5 files changed, 151 insertions(+), 252 deletions(-)
 create mode 100644 compiler/parser.py
 delete mode 100644 compiler/rules.py

diff --git a/compiler/__main__.py b/compiler/__main__.py
index ec8675a..7b8dac9 100644
--- a/compiler/__main__.py
+++ b/compiler/__main__.py
@@ -1,73 +1,10 @@
 from __future__ import annotations
 
-from .nodes import Node
-from .rules import Rule, And, Or, Terminal, EvalResult
-from .tokenizer import Tokenizer, Tokens, Token
+from .tokenizer import Tokenizer, Tokens
+from .parser import Parser
 
 data = "2 * (32.9 + 1)"
 
-grammar = r"""
-number = r"[0-9]+(\.[0-9]*)?"
-operator = "+" | "-" | "*" | "/"
-value = "(" expression ")" | number | expression
-expression = value operator value
-root = expression
-"""
-
-number = Rule.make_rule("number",
-                        Terminal(token_type=Tokens.Number)
-                        )
-operator = Rule.make_rule("operator", Or(
-    Terminal(Tokens.Op_Plus), Terminal(Tokens.Op_Minus),
-    Terminal(Tokens.Op_Multiply), Terminal(Tokens.Op_Divide)
-))
-
-value = Rule.make_rule("value", Or(
-    And(
-        Terminal(Tokens.Parens_Left), operator, Terminal(Tokens.Parens_Right)
-    ),
-    number,
-    "expression"
-))
-
-expression = Rule.make_rule("expression", And(
-    number,
-    operator,
-    number
-))
-
-root = expression
-root.prepare()
-
-
-def print_results(result: EvalResult) -> str:
-    prefix = "\t" * print_results._depth
-    message = ""
-    if type(result) != EvalResult:
-        message = f"{prefix}{str(result)}\n"
-        return message
-
-    print_results._depth += 1
-
-    if result.name is not None:
-        message += f"{prefix}{result.name}\n"
-
-    if type(result.result) == list:
-        message += prefix + "{\n"
-        mylist = []
-        for r in result.result:
-            mylist += [print_results(r)]
-        message += " ".join(mylist)
-        message += prefix + "}\n"
-    else:
-        message += print_results(result.result) + "\n"
-    print_results._depth -= 1
-    return message
-
-
-print_results._depth = 0
-
-
 def main():
     tokenizer = Tokenizer()
     tokens = tokenizer.tokenize("2 + 3")
@@ -75,11 +12,8 @@ def main():
     tokens = [token for token in tokens if token.kind != Tokens.Blank]
     print(tokens)
 
-    result = root.evaluate(tokens)
-    print(result)
-    print(print_results(result))
-    if result.errors is not None:
-        raise result.errors
+    parser = Parser(tokens)
+    parser.root()
 
 
 if __name__ == "__main__":
diff --git a/compiler/nodes.py b/compiler/nodes.py
index 35d00a0..ef70583 100644
--- a/compiler/nodes.py
+++ b/compiler/nodes.py
@@ -1,4 +1,36 @@
 from __future__ import annotations
 
+
 class Node:
-    pass
\ No newline at end of file
+    pass
+
+
+class Operator(Node):
+    op: str
+
+
+class Sum(Node):
+    def __init__(self, *values: Expression):
+        self.values = values
+
+    def __repr__(self):
+        return f"{self.__class__.__name__}({', '.join(repr(v) for v in self.values)})"
+
+
+class Product(Node):
+    def __init__(self, *values: Expression):
+        self.values = values
+
+    def __repr__(self):
+        return f"{self.__class__.__name__}({', '.join(repr(v) for v in self.values)})"
+
+
+class Number(Node):
+    def __init__(self, value: int):
+        self.value = value
+
+    def __repr__(self):
+        return f"{self.__class__.__name__}({self.value})"
+
+
+Expression = Sum | Product | Number
diff --git a/compiler/parser.py b/compiler/parser.py
new file mode 100644
index 0000000..1cf732a
--- /dev/null
+++ b/compiler/parser.py
@@ -0,0 +1,106 @@
+from __future__ import annotations
+
+from beartype.typing import List, Literal
+
+from .logger import make_logger
+from .nodes import Number, Sum, Expression, Product
+from .source import SourceLocation
+from .tokenizer import Tokens, Token
+
+logger = make_logger(__name__)
+
+
+class ParsingError(Exception):
+    def __init__(self, location: SourceLocation, message: str = "Unknown error"):
+        super().__init__(f"{message} at {str(location)}")
+        self.location = location
+
+
+class Parser:
+    def __init__(self, tokens: List[Token]):
+        self.tokens = tokens
+        self.pos = 0
+
+    @property
+    def token(self) -> Token:
+        if self.pos >= len(self.tokens):
+            return Token(kind=Tokens.EOF)
+        return self.tokens[self.pos]
+
+    @property
+    def prev_token(self) -> Token:
+        return self.tokens[self.pos - 1]
+
+    def next_symbol(self):
+        self.pos += 1
+        logger.debug("%s", f"Advancing to token {self.pos} {self.token}")
+
+    def accept(self, *token_types: Tokens) -> Token | Literal[False]:
+        tok = self.token
+        if self.token.kind in token_types:
+            self.next_symbol()
+            return tok
+        return False
+
+    def peek(self, token_type: Tokens) -> Token | Literal[False]:
+        tok = self.token
+        if self.token.kind == token_type:
+            return tok
+        return False
+
+    def expect(self, token_type: Tokens) -> Token:
+        r = self.accept(token_type)
+        logger.debug("%s", f"Expecting {token_type}, got {r}")
+        if r is False:
+            raise ParsingError(self.token.loc, f"Unexpected token '{self.token}', wanted {token_type}")
+        return r
+
+    def factor(self) -> Expression:
+        if self.accept(Tokens.Parens_Left):
+            v = self.expression()
+            self.expect(Tokens.Parens_Right)
+            return v
+        elif tok := self.accept(Tokens.Number):
+            logger.debug("%s", f"Found number {self.prev_token}")
+            return Number(value=int(tok.value))
+        else:
+            raise ParsingError(self.token.loc, f"Unexpected token '{self.token}', wanted parenthesized expression or "
+                                               f"number")
+
+    def term(self) -> Expression:
+        operations = []
+        operand = self.factor()
+        operations += [operand]
+
+        while self.accept(Tokens.Op_Multiply, Tokens.Op_Divide):
+            operand = self.factor()
+            operations += [operand]
+
+        if len(operations) == 1:
+            return operations[0]
+
+        logger.debug("%s", f"Product of the following factors: {operations}")
+        return Product(*operations)
+
+    def summation(self) -> Expression:
+        operations = []
+        operand = self.term()
+        operations += [operand]
+
+        while self.accept(Tokens.Op_Plus, Tokens.Op_Minus):
+            operand = self.term()
+            operations += [operand]
+
+        if len(operations) == 1:
+            return operations[0]
+
+        logger.debug("%s", f"Sum of the following terms: {operations}")
+        return Sum(*operations)
+
+    def expression(self) -> Expression:
+        summation = self.summation()
+        return summation
+
+    def root(self):
+        self.expression()
+        self.expect(Tokens.EOF)
diff --git a/compiler/rules.py b/compiler/rules.py
deleted file mode 100644
index 883b232..0000000
--- a/compiler/rules.py
+++ /dev/null
@@ -1,180 +0,0 @@
-from __future__ import annotations
-
-from dataclasses import dataclass
-
-from beartype.typing import Union, Dict, Optional, List, Any, Tuple
-import abc
-
-from .nodes import Node
-from .source import SourceLocation
-from .tokenizer import Token, Tokens
-from .logger import logger
-
-class ParsingError(Exception):
-    def __init__(self, location: SourceLocation, message: str = "Unknown error"):
-        super().__init__(f"{message} at {str(location)}")
-        self.location = location
-
-@dataclass
-class EvalResult:
-    result: EvalResult | List[EvalResult] | Token | None = None
-    errors: Optional[ParsingError] = None
-    name: Optional[str] = None
-
-    def __str__(self):
-        #if isinstance(self.result, EvalResult):
-        #    return self.result
-
-        name = self.name if self.name is not None else self.__class__.__name__
-        value = str(self.result)
-        if type(self.result) == list:
-            value = ", ".join(str(r) for r in self.result)
-        return f"{name}({value})"
-
-    def __repr__(self):
-        return super(EvalResult, self).__repr__()
-
-RuleLike = Union[str, 'Rule']
-
-class Rule(abc.ABC):
-    _named_rules: Dict[str, Rule] = dict()
-
-    def __init__(self, *sub_rules: RuleLike):
-        self._prepared: bool = False
-        self._rules: List[RuleLike] = list(sub_rules)
-        self.rules: List[Rule] = []
-        self.name: Optional[str] = None
-        self.node: Optional[Node] = None
-
-    @staticmethod
-    def make_rule(name: str, rule: Rule, node: Optional[Node] = None) -> Rule:
-        rule.name = name
-        rule.node = node
-        Rule._named_rules[name] = rule
-        return rule
-
-    @abc.abstractmethod
-    def evaluate(self, tokens: List[Token], *, depth: int = 0, parent: Optional[Rule] = None) -> EvalResult:
-        raise NotImplementedError()
-
-    def prepare(self):
-        if self._prepared:
-            return
-
-        for key, rule in enumerate(self._rules):
-            if type(rule) == str:
-                self._rules[key] = Rule._named_rules[rule]
-
-        self.rules = self._rules
-
-        self._prepared = True
-        for rule in self._rules:
-            if not rule._prepared:
-                rule.prepare()
-
-    def __repr__(self):
-        return "{}:{}({})".format(self.__class__.__name__,
-                                  self.name if self.name is not None else "",
-                                  ", ".join(rule.__class__.__name__ for rule in self.rules))
-
-
-class Terminal(Rule):
-    def __init__(self, token_type: Tokens):
-        super().__init__()
-        self.token_type = token_type
-
-    def evaluate(self, tokens: List[Token], *, depth: int = 0, parent: Optional[Rule] = None) -> EvalResult:
-        assert len(tokens) > 0
-        result = EvalResult(name=self.name)
-        logger.debug("%s", f"{depth}: Terminal: Evaluating terminal token with tokens: '{tokens}'")
-        if len(tokens) != 1:
-            result.errors = ParsingError(tokens[0].loc, message=f"Terminal rule must have exactly one token")
-            return result
-
-        if tokens[0].kind != self.token_type:
-            result.errors = ParsingError(
-                tokens[0].loc,
-                message=f"Unexpected token. Wanted {self.token_type.name}, got {tokens[0].kind.name}"
-            )
-            return result
-
-        result.result = tokens[0]
-
-        logger.debug(f"{depth}: Terminal: Found terminal node: {result}")
-
-        return result
-
-    def __repr__(self):
-        return "{}({})".format(self.__class__.__name__, self.token_type)
-
-
-class Or(Rule):
-    def evaluate(self, tokens: List[Token], *, depth: int = 0, parent: Optional[Rule] = None) -> EvalResult:
-        result = EvalResult(errors=ParsingError(location=tokens[0].loc), name=self.name)
-        rule: Rule
-        for i, rule in enumerate(self.rules):
-            logger.debug(f"{depth}: Or: Rule {i + 1}/{len(self.rules)} : trying rule: {rule}")
-            result = rule.evaluate(tokens, depth=depth + 1)
-            if result.errors is None:
-                logger.debug(f"{depth}: Or: Rule {i + 1}/{len(self.rules)} '{rule}' matched, result: {result}")
-                break
-
-        logger.debug(f"{depth}: Or: Finished with errors: {result.errors}")
-
-        result.name = self.name
-
-        return result
-
-
-class And(Rule):
-    def evaluate(self, tokens: List[Token], *, depth: int = 0, parent: Optional[Rule] = None) -> EvalResult:
-        result = EvalResult(errors=ParsingError(tokens[0].loc), name=self.name)
-        result.result = []
-        begin = 0
-        end = 0
-        for i, rule in enumerate(self.rules):
-            logger.debug("%s", f"{depth}: And: Trying rule '{rule}'")
-
-            if end == len(tokens):
-                logger.error("%s", f"{depth}: And: Oops, reached the end of the tokens")
-
-            best_r: Optional[EvalResult] = None
-            while end < len(tokens):
-                tokens_ = tokens[begin:end + 1]
-                r = rule.evaluate(tokens_, depth=depth + 1)
-
-                # No previous match, but we found one
-                if best_r is None and r.errors is None:
-                    logger.debug(f"{depth}: And: Rule {i + 1}/{len(self.rules)} '{rule}' matched, result: {r}")
-                    best_r = r
-
-                # No result at all. That's an error.
-                elif best_r is None and r.errors is not None:
-                    result.errors = r.errors
-                    return result
-
-                # We had a result, but we still matched with more tokens
-                elif best_r is not None and r.errors is None:
-                    logger.debug(f"{depth}: And: Rule {i + 1}/{len(self.rules)} '{rule}' **improved**, result: {r}")
-                    best_r = r
-
-                # We already have a match, and we can't improve it. Finish this rule.
-                elif best_r is not None and r.errors is not None:
-                    logger.debug(
-                        f"{depth}: And: Rule {i + 1}/{len(self.rules)} '{rule}' has FINAL match, result: {best_r}")
-                    result.result += [best_r]
-                    # Matching rule ended at 'end - 1', meaning next rule will begin at end
-                    begin = end
-                    break
-
-                end += 1
-
-            else:
-                logger.debug(
-                    f"{depth}: And: Rule {i + 1}/{len(self.rules)} '{rule}' has FINAL match, finishing rule, result: {best_r}")
-                result.result += [best_r]
-                result.errors = None
-
-        if end != len(tokens):
-            logger.debug(f"{depth}: And: Didn't consume all tokens")
-        return result
diff --git a/compiler/tokenizer.py b/compiler/tokenizer.py
index 504b9c4..6145765 100644
--- a/compiler/tokenizer.py
+++ b/compiler/tokenizer.py
@@ -1,7 +1,7 @@
 from __future__ import annotations
 from dataclasses import dataclass, field
 from beartype import beartype
-from beartype.typing import Optional
+from beartype.typing import Optional, List
 import enum
 import re
 
@@ -31,8 +31,12 @@ class Tokens(enum.Enum):
     Parens_Left = re.compile(r"\(")
     Parens_Right = re.compile(r"\)")
     Blank = re.compile(r"\s+")
+    EOF = re.compile(r"\Z")
     Unknown = re.compile(r".*")
 
+    def __bool__(self):
+        return True
+
 
 class Tokenizer:
     def __init__(self):
@@ -67,4 +71,7 @@ class Tokenizer:
             results += [best_result]
             begin += len(best_result.value)
 
+        results += [Token(Tokens.EOF, value=None, loc=SourceLocation(
+            Location(line=0, character=len(data)), source=data
+        ))]
         return results
\ No newline at end of file
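-- 
A minimal usage sketch of the new parser, for reviewers (not part of the
patch). It assumes the `compiler` package from this series is importable and
that the interpreter is Python 3.10+ (the `Sum | Product | Number` union in
nodes.py requires it):

    from compiler.tokenizer import Tokenizer, Tokens
    from compiler.parser import Parser

    tokens = Tokenizer().tokenize("2 * (3 + 1)")
    # The parser never sees Blank tokens; filter them out first, as main() does.
    tokens = [t for t in tokens if t.kind != Tokens.Blank]

    parser = Parser(tokens)
    tree = parser.expression()  # factor/term/summation recursion gives '*' precedence over '+'
    parser.expect(Tokens.EOF)   # the same end-of-input check that root() performs
    print(tree)                 # Product(Number(2), Sum(Number(3), Number(1)))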