parser: fully rewrite parsing

Use a simpler and more direct recursive descent method.
This commit is contained in:
Antoine Viallon 2023-05-08 00:50:42 +02:00
parent 14813a8bdf
commit fc9b6b30c6
Signed by: aviallon
GPG key ID: D126B13AB555E16F
5 changed files with 151 additions and 252 deletions

View file

@ -1,73 +1,10 @@
from __future__ import annotations
from .nodes import Node
from .rules import Rule, And, Or, Terminal, EvalResult
from .tokenizer import Tokenizer, Tokens, Token
from .tokenizer import Tokenizer, Tokens
from .parser import Parser
# Sample input for the demo run.
data = "2 * (32.9 + 1)"

# Human-readable sketch of the toy arithmetic grammar (informational only;
# the executable rules are constructed programmatically further down).
grammar = r"""
number = r"[0-9]+(\.[0-9]*)?"
operator = "+" | "-" | "*" | "/"
value = "(" expression ")" | number | expression
expression = value operator value
root = expression
"""
# Build the executable grammar out of rule combinators.

# A number is a single Number token.
number = Rule.make_rule("number", Terminal(token_type=Tokens.Number))

# Any one of the four arithmetic operator tokens.
operator = Rule.make_rule(
    "operator",
    Or(
        Terminal(Tokens.Op_Plus),
        Terminal(Tokens.Op_Minus),
        Terminal(Tokens.Op_Multiply),
        Terminal(Tokens.Op_Divide),
    ),
)

# A value is a parenthesised group, a bare number, or an expression.
# The string "expression" is a forward reference resolved by prepare().
# NOTE(review): the parenthesised branch wraps `operator`, while the grammar
# comment above says "(" expression ")" — looks like a mismatch; confirm.
value = Rule.make_rule(
    "value",
    Or(
        And(Terminal(Tokens.Parens_Left), operator, Terminal(Tokens.Parens_Right)),
        number,
        "expression",
    ),
)

# An expression is: number operator number.
expression = Rule.make_rule("expression", And(number, operator, number))

# Entry point; prepare() resolves string rule references recursively.
root = expression
root.prepare()
def print_results(result: EvalResult, depth: int = 0) -> str:
    """Render an EvalResult tree as an indented, multi-line string.

    Replaces the previous ``print_results._depth`` function attribute (shared
    mutable state) with an explicit ``depth`` parameter; existing callers that
    pass only ``result`` are unaffected.

    :param result: an EvalResult, or any leaf value to stringify as-is
    :param depth: current nesting level (number of leading tabs)
    :return: the rendered text, newline-terminated
    """
    prefix = "\t" * depth
    # Leaf values (tokens, plain objects, None) are stringified on one line.
    if not isinstance(result, EvalResult):
        return f"{prefix}{result}\n"
    message = ""
    if result.name is not None:
        message += f"{prefix}{result.name}\n"
    if isinstance(result.result, list):
        # A list of child results renders as a braced, indented block.
        message += prefix + "{\n"
        message += " ".join(print_results(r, depth + 1) for r in result.result)
        message += prefix + "}\n"
    else:
        message += print_results(result.result, depth + 1) + "\n"
    return message


# Kept for backward compatibility with any code poking the old counter.
print_results._depth = 0
def main():
tokenizer = Tokenizer()
tokens = tokenizer.tokenize("2 + 3")
@ -75,11 +12,8 @@ def main():
tokens = [token for token in tokens if token.kind != Tokens.Blank]
print(tokens)
result = root.evaluate(tokens)
print(result)
print(print_results(result))
if result.errors is not None:
raise result.errors
parser = Parser(tokens)
parser.root()
if __name__ == "__main__":

View file

@ -1,4 +1,36 @@
from __future__ import annotations
class Node:
pass
pass
class Operator(Node):
op: str
class Sum(Node):
def __init__(self, *values: Expression):
self.values = values
def __repr__(self):
return f"{self.__class__.__name__}({', '.join(repr(v) for v in self.values)})"
class Product(Node):
def __init__(self, *values: Expression):
self.values = values
def __repr__(self):
return f"{self.__class__.__name__}({', '.join(repr(v) for v in self.values)})"
class Number(Node):
def __init__(self, value: int):
self.value = value
def __repr__(self):
return f"{self.__class__.__name__}({self.value})"
Expression = Sum | Product | Number

106
compiler/parser.py Normal file
View file

@ -0,0 +1,106 @@
from __future__ import annotations
from beartype.typing import List
from .logger import make_logger
from .nodes import Number, Sum, Expression, Product
from .source import SourceLocation
from .tokenizer import Tokens, Token
logger = make_logger(__name__)
class ParsingError(Exception):
    """Raised when the token stream does not match the grammar.

    The exception message embeds the source location; the raw location is
    also kept on ``self.location`` for programmatic access.
    """

    def __init__(self, location: SourceLocation, message: str = "Unknown error"):
        self.location = location
        super().__init__(f"{message} at {str(location)}")
class Parser:
    """Recursive-descent parser over a list of tokens.

    One method per grammar production:
        root       := expression EOF
        expression := summation
        summation  := term (("+" | "-") term)*
        term       := factor (("*" | "/") factor)*
        factor     := "(" expression ")" | NUMBER
    """

    def __init__(self, tokens: List[Token]):
        self.tokens = tokens
        # Index of the current, not-yet-consumed token.
        self.pos = 0

    @property
    def token(self) -> Token:
        """The current token; a synthetic EOF token once input is exhausted."""
        if self.pos >= len(self.tokens):
            # NOTE(review): constructed without value/loc — confirm the Token
            # dataclass declares defaults for those fields.
            return Token(kind=Tokens.EOF)
        return self.tokens[self.pos]

    @property
    def prev_token(self) -> Token:
        """The most recently consumed token."""
        return self.tokens[self.pos - 1]

    def next_symbol(self):
        """Advance past the current token."""
        self.pos += 1
        logger.debug("%s", f"Advancing to token {self.pos} {self.token}")

    def accept(self, *token_types: Tokens) -> False | Token:
        """Consume and return the current token if its kind is one of
        *token_types*; otherwise leave the position unchanged and return
        False.

        NOTE(review): ``False | Token`` is not a valid type expression
        (False is a value, not a type).  Tokens.__bool__ returns True, so a
        returned token is always truthy and safe in boolean contexts.
        """
        tok = self.token
        if self.token.kind in token_types:
            self.next_symbol()
            return tok
        return False

    def peek(self, token_type) -> False | Token:
        """Like accept(), but never consumes the token."""
        tok = self.token
        if self.token.kind == token_type:
            return tok
        return False

    def expect(self, token_type: Tokens) -> Token:
        """Consume one token of *token_type*, or raise ParsingError."""
        r = self.accept(token_type)
        logger.debug("%s", f"Expecting {token_type}, got {r}")
        if r is False:
            raise ParsingError(self.token.loc, f"Unexpected token '{self.token}', wanted {token_type}")
        return r

    def factor(self) -> Expression:
        """factor := "(" expression ")" | NUMBER"""
        if self.accept(Tokens.Parens_Left):
            v = self.expression()
            self.expect(Tokens.Parens_Right)
            return v
        elif tok := self.accept(Tokens.Number):
            logger.debug("%s", f"Found number {self.prev_token}")
            # NOTE(review): int() raises ValueError on decimals like "32.9",
            # which the sample grammar's number pattern accepts — confirm the
            # Number token cannot contain '.'.
            return Number(value=int(tok.value))
        else:
            raise ParsingError(self.token.loc, f"Unexpected token '{self.token}', wanted parenthesized expression or "
                                               f"number")

    def term(self) -> Expression:
        """term := factor (("*" | "/") factor)*

        NOTE(review): the accepted operator token is discarded, so the
        resulting Product cannot distinguish "*" from "/".
        """
        operations = []
        operand = self.factor()
        operations += [operand]
        while operator := self.accept(Tokens.Op_Multiply, Tokens.Op_Divide):
            operand = self.factor()
            operations += [operand]
        if len(operations) == 1:
            # A lone factor needs no Product wrapper.
            return operations[0]
        logger.debug("%s", f"Product of the following terms: {operations}")
        return Product(*operations)

    def summation(self) -> Sum:
        """summation := term (("+" | "-") term)*

        NOTE(review): returns a plain term (not a Sum) when no operator
        follows, despite the ``-> Sum`` annotation; the "-" operator is
        likewise discarded (see term()).
        """
        operations = []
        operand = self.term()
        operations += [operand]
        while operator := self.accept(Tokens.Op_Plus, Tokens.Op_Minus):
            operand = self.term()
            operations += [operand]
        if len(operations) == 1:
            return operations[0]
        logger.debug("%s", f"Sum of the following terms: {operations}")
        return Sum(*operations)

    def expression(self) -> Expression:
        """expression := summation"""
        summation = self.summation()
        return summation

    def root(self):
        """Parse the whole stream: one expression followed by EOF.

        The parsed tree is discarded; only success or a raised
        ParsingError is observable to the caller.
        """
        self.expression()
        self.expect(Tokens.EOF)

View file

@ -1,180 +0,0 @@
from __future__ import annotations
from dataclasses import dataclass
from beartype.typing import Union, Dict, Optional, List, Any, Tuple
import abc
from .nodes import Node
from .source import SourceLocation
from .tokenizer import Token, Tokens
from .logger import logger
class ParsingError(Exception):
    """Parsing failure carrying the source location where it occurred."""

    def __init__(self, location: SourceLocation, message: str = "Unknown error"):
        # Embed the location in the message; keep the raw location around
        # for programmatic access.
        super().__init__(f"{message} at {str(location)}")
        self.location = location
@dataclass
class EvalResult:
    """Outcome of evaluating a Rule against a token list.

    result: the matched token, a nested result, or a list of child results.
    errors: the ParsingError describing a failed match, or None on success.
    name:   the name of the rule that produced this result, if any.
    """

    result: EvalResult | List[EvalResult] | Token | None = None
    errors: Optional[ParsingError] = None
    name: Optional[str] = None

    def __str__(self):
        # Anonymous results fall back to the class name.
        name = self.name if self.name is not None else self.__class__.__name__
        # was: type(self.result) == list — isinstance is the idiomatic check
        if isinstance(self.result, list):
            value = ", ".join(str(r) for r in self.result)
        else:
            value = str(self.result)
        return f"{name}({value})"

    def __repr__(self):
        # Deliberately keep object's default repr (dataclass would otherwise
        # generate a field-listing one).
        return super().__repr__()
# A sub-rule reference: either a Rule instance or the name of a rule that
# will be registered later (a forward reference).
RuleLike = Union[str, 'Rule']


class Rule(abc.ABC):
    """Abstract grammar rule (combinator).

    Rules named via make_rule() go into a class-level registry so that other
    rules may reference them by name before they exist; prepare() resolves
    those string references recursively.
    """

    # Global registry of named rules, shared by all Rule subclasses.
    _named_rules: Dict[str, Rule] = dict()

    def __init__(self, *sub_rules: RuleLike):
        self._prepared: bool = False
        # Raw sub-rules: Rule instances or unresolved rule-name strings.
        self._rules: List[RuleLike] = list(sub_rules)
        # Resolved sub-rules; populated by prepare().
        self.rules: List[Rule] = []
        self.name: Optional[str] = None
        self.node: Optional[Node] = None

    @staticmethod
    def make_rule(name: str, rule: Rule, node: Optional[Node] = None) -> Rule:
        """Name a rule, attach an optional AST node, register it, return it."""
        rule.name = name
        rule.node = node
        Rule._named_rules[name] = rule
        return rule

    @abc.abstractmethod
    def evaluate(self, tokens: List[Token], *, depth: int = 0, parent: Optional[Rule] = None) -> EvalResult:
        """Try to match *tokens*; match failures are reported via
        EvalResult.errors rather than raised."""
        raise NotImplementedError()

    def prepare(self):
        """Resolve string rule references, recursively, exactly once."""
        if self._prepared:
            return
        for key, rule in enumerate(self._rules):
            # was: type(rule) == str — isinstance is the idiomatic check
            if isinstance(rule, str):
                self._rules[key] = Rule._named_rules[rule]
        self.rules = self._rules
        # Mark prepared before recursing so cyclic grammars terminate.
        self._prepared = True
        for rule in self._rules:
            if not rule._prepared:
                rule.prepare()

    def __repr__(self):
        return "{}:{}({})".format(self.__class__.__name__,
                                  self.name if self.name is not None else "",
                                  ", ".join(rule.__class__.__name__ for rule in self.rules))
class Terminal(Rule):
    """Leaf rule: matches exactly one token of a fixed kind."""

    def __init__(self, token_type: Tokens):
        super().__init__()
        self.token_type = token_type

    def evaluate(self, tokens: List[Token], *, depth: int = 0, parent: Optional[Rule] = None) -> EvalResult:
        assert len(tokens) > 0
        result = EvalResult(name=self.name)
        logger.debug("%s", f"{depth}: Terminal: Evaluating terminal token with tokens: '{tokens}'")
        if len(tokens) != 1:
            # A terminal consumes exactly one token.
            result.errors = ParsingError(tokens[0].loc, message=f"Terminal rule must have exactly one token")
        elif tokens[0].kind != self.token_type:
            # Right arity, wrong token kind.
            result.errors = ParsingError(
                tokens[0].loc,
                message=f"Unexpected token. Wanted {self.token_type.name}, got {tokens[0].kind.name}"
            )
        else:
            result.result = tokens[0]
            logger.debug(f"{depth}: Terminal: Found terminal node: {result}")
        return result

    def __repr__(self):
        return f"{self.__class__.__name__}({self.token_type})"
class Or(Rule):
    """Alternation: the first sub-rule that matches wins."""

    def evaluate(self, tokens: List[Token], *, depth: int = 0, parent: Optional[Rule] = None) -> EvalResult:
        # Start from a failing result so an empty rule list reports an error.
        result = EvalResult(errors=ParsingError(location=tokens[0].loc), name=self.name)
        rule: Rule
        total = len(self.rules)
        for index, rule in enumerate(self.rules):
            logger.debug(f"{depth}: Or: Rule {index + 1}/{total} : trying rule: {rule}")
            result = rule.evaluate(tokens, depth=depth + 1)
            if result.errors is None:
                logger.debug(f"{depth}: Or: Rule {index + 1}/{total} '{rule}' matched, result: {result}")
                break
        logger.debug(f"{depth}: Or: Finished with errors: {result.errors}")
        # The winning sub-result is re-labelled with this rule's name.
        result.name = self.name
        return result
class And(Rule):
    # Sequence rule: every sub-rule must match consecutive, non-overlapping
    # slices of the token list.  Matching is greedy: each sub-rule is offered
    # a one-token slice first, which is then grown one token at a time until
    # the sub-rule stops matching.
    def evaluate(self, tokens: List[Token], *, depth: int = 0, parent: Optional[Rule] = None) -> EvalResult:
        # Start pessimistic: errors set, only cleared after all sub-rules ran.
        result = EvalResult(errors=ParsingError(tokens[0].loc), name=self.name)
        result.result = []
        # begin/end delimit the slice currently offered to the sub-rule.
        begin = 0
        end = 0
        for i, rule in enumerate(self.rules):
            logger.debug("%s", f"{depth}: And: Trying rule '{rule}'")
            if end == len(tokens):
                # NOTE(review): only logged, not treated as a failure — the
                # while below is skipped and best_r (None) is still appended.
                logger.error("%s", f"{depth}: And: Oops, reached the end of the tokens")
            best_r: Optional[EvalResult] = None
            while end < len(tokens):
                tokens_ = tokens[begin:end + 1]
                r = rule.evaluate(tokens_, depth=depth + 1)
                # No previous match, but we found one
                if best_r is None and r.errors is None:
                    logger.debug(f"{depth}: And: Rule {i + 1}/{len(self.rules)} '{rule}' matched, result: {r}")
                    best_r = r
                # No result at all. That's an error.
                elif best_r is None and r.errors is not None:
                    # NOTE(review): bails out as soon as the initial one-token
                    # slice fails, so a sub-rule that needs more than one token
                    # to match is never retried with a longer slice.
                    result.errors = r.errors
                    return result
                # We had a result, but we still matched with more tokens
                elif best_r is not None and r.errors is None:
                    logger.debug(f"{depth}: And: Rule {i + 1}/{len(self.rules)} '{rule}' **improved**, result: {r}")
                    best_r = r
                # We already have a match, and we can't improve it. Finish this rule.
                elif best_r is not None and r.errors is not None:
                    logger.debug(
                        f"{depth}: And: Rule {i + 1}/{len(self.rules)} '{rule}' has FINAL match, result: {best_r}")
                    result.result += [best_r]
                    # Matching rule ended at 'end - 1', meaning next rule will begin at end
                    begin = end
                    break
                end += 1
            else:
                # while ran out of tokens without breaking: keep the last best
                # match (possibly None) for this sub-rule.
                logger.debug(
                    f"{depth}: And: Rule {i + 1}/{len(self.rules)} '{rule}' has FINAL match, finishing rule, result: {best_r}")
                result.result += [best_r]
        # All sub-rules processed: clear the pessimistic error.
        result.errors = None
        if end != len(tokens):
            logger.debug(f"{depth}: And: Didn't consume all tokens")
        return result

View file

@ -1,7 +1,7 @@
from __future__ import annotations
from dataclasses import dataclass, field
from beartype import beartype
from beartype.typing import Optional
from beartype.typing import Optional, List
import enum
import re
@ -31,8 +31,12 @@ class Tokens(enum.Enum):
Parens_Left = re.compile(r"\(")
Parens_Right = re.compile(r"\)")
Blank = re.compile(r"\s+")
EOF = re.compile(r"\Z")
Unknown = re.compile(r".*")
def __bool__(self):
return True
class Tokenizer:
def __init__(self):
@ -67,4 +71,7 @@ class Tokenizer:
results += [best_result]
begin += len(best_result.value)
results += [Token(Tokens.EOF, value=None, loc=SourceLocation(
Location(line=0, character=len(data)), source=data
))]
return results