parser: fully rewrite parsing
Use a simpler and more direct recursive descent method.
This commit is contained in:
parent
14813a8bdf
commit
fc9b6b30c6
5 changed files with 151 additions and 252 deletions
|
|
@ -1,73 +1,10 @@
|
|||
from __future__ import annotations
|
||||
|
||||
from .nodes import Node
|
||||
from .rules import Rule, And, Or, Terminal, EvalResult
|
||||
from .tokenizer import Tokenizer, Tokens, Token
|
||||
from .tokenizer import Tokenizer, Tokens
|
||||
from .parser import Parser
|
||||
|
||||
data = "2 * (32.9 + 1)"
|
||||
|
||||
grammar = r"""
|
||||
number = r"[0-9]+(\.[0-9]*)?"
|
||||
operator = "+" | "-" | "*" | "/"
|
||||
value = "(" expression ")" | number | expression
|
||||
expression = value operator value
|
||||
root = expression
|
||||
"""
|
||||
|
||||
number = Rule.make_rule("number",
|
||||
Terminal(token_type=Tokens.Number)
|
||||
)
|
||||
operator = Rule.make_rule("operator", Or(
|
||||
Terminal(Tokens.Op_Plus), Terminal(Tokens.Op_Minus),
|
||||
Terminal(Tokens.Op_Multiply), Terminal(Tokens.Op_Divide)
|
||||
))
|
||||
|
||||
value = Rule.make_rule("value", Or(
|
||||
And(
|
||||
Terminal(Tokens.Parens_Left), operator, Terminal(Tokens.Parens_Right)
|
||||
),
|
||||
number,
|
||||
"expression"
|
||||
))
|
||||
|
||||
expression = Rule.make_rule("expression", And(
|
||||
number,
|
||||
operator,
|
||||
number
|
||||
))
|
||||
|
||||
root = expression
|
||||
root.prepare()
|
||||
|
||||
|
||||
def print_results(result: EvalResult) -> str:
|
||||
prefix = "\t" * print_results._depth
|
||||
message = ""
|
||||
if type(result) != EvalResult:
|
||||
message = f"{prefix}{str(result)}\n"
|
||||
return message
|
||||
|
||||
print_results._depth += 1
|
||||
|
||||
if result.name is not None:
|
||||
message += f"{prefix}{result.name}\n"
|
||||
|
||||
if type(result.result) == list:
|
||||
message += prefix + "{\n"
|
||||
mylist = []
|
||||
for r in result.result:
|
||||
mylist += [print_results(r)]
|
||||
message += " ".join(mylist)
|
||||
message += prefix + "}\n"
|
||||
else:
|
||||
message += print_results(result.result) + "\n"
|
||||
print_results._depth -= 1
|
||||
return message
|
||||
|
||||
|
||||
print_results._depth = 0
|
||||
|
||||
|
||||
def main():
|
||||
tokenizer = Tokenizer()
|
||||
tokens = tokenizer.tokenize("2 + 3")
|
||||
|
|
@ -75,11 +12,8 @@ def main():
|
|||
tokens = [token for token in tokens if token.kind != Tokens.Blank]
|
||||
print(tokens)
|
||||
|
||||
result = root.evaluate(tokens)
|
||||
print(result)
|
||||
print(print_results(result))
|
||||
if result.errors is not None:
|
||||
raise result.errors
|
||||
parser = Parser(tokens)
|
||||
parser.root()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
|
|
|||
|
|
@ -1,4 +1,36 @@
|
|||
from __future__ import annotations
|
||||
|
||||
|
||||
class Node:
    """Abstract base class for all AST node types."""
|
||||
|
||||
|
||||
class Operator(Node):
    """AST node for an arithmetic operator symbol."""

    # The operator's textual form, e.g. "+" or "*".
    op: str
|
||||
|
||||
|
||||
class Sum(Node):
    """N-ary addition node over one or more sub-expressions."""

    def __init__(self, *values: Expression):
        self.values = values

    def __repr__(self):
        inner = ", ".join(repr(v) for v in self.values)
        return f"{self.__class__.__name__}({inner})"
|
||||
|
||||
|
||||
class Product(Node):
    """N-ary multiplication node over one or more sub-expressions."""

    def __init__(self, *values: Expression):
        self.values = values

    def __repr__(self):
        return "{}({})".format(self.__class__.__name__,
                               ", ".join(map(repr, self.values)))
|
||||
|
||||
|
||||
class Number(Node):
    """Leaf node holding a numeric literal value."""

    def __init__(self, value: int):
        self.value = value

    def __repr__(self):
        return f"{type(self).__name__}({self.value})"
|
||||
|
||||
|
||||
Expression = Sum | Product | Number
|
||||
|
|
|
|||
106
compiler/parser.py
Normal file
106
compiler/parser.py
Normal file
|
|
@ -0,0 +1,106 @@
|
|||
from __future__ import annotations
|
||||
|
||||
from beartype.typing import List
|
||||
|
||||
from .logger import make_logger
|
||||
from .nodes import Number, Sum, Expression, Product
|
||||
from .source import SourceLocation
|
||||
from .tokenizer import Tokens, Token
|
||||
|
||||
logger = make_logger(__name__)
|
||||
|
||||
|
||||
class ParsingError(Exception):
    """Raised when the parser encounters a token it cannot handle.

    The source location of the offending token is kept on the
    instance so callers can report where the parse failed.
    """

    def __init__(self, location: SourceLocation, message: str = "Unknown error"):
        self.location = location
        super().__init__(f"{message} at {location}")
|
||||
|
||||
|
||||
class Parser:
    """Recursive-descent parser for arithmetic expressions.

    Grammar, lowest precedence first:

        root       := expression EOF
        expression := summation
        summation  := term   (("+" | "-") term)*
        term       := factor (("*" | "/") factor)*
        factor     := "(" expression ")" | number
    """

    def __init__(self, tokens: List[Token]):
        self.tokens = tokens
        self.pos = 0

    @property
    def token(self) -> Token:
        """Current token, or a synthetic EOF token once input is exhausted."""
        if self.pos >= len(self.tokens):
            return Token(kind=Tokens.EOF)
        return self.tokens[self.pos]

    @property
    def prev_token(self) -> Token:
        """The most recently consumed token."""
        return self.tokens[self.pos - 1]

    def next_symbol(self):
        """Advance the cursor by one token."""
        self.pos += 1
        # Lazy %-style args: the message is only rendered if DEBUG is enabled.
        logger.debug("Advancing to token %d %s", self.pos, self.token)

    def accept(self, *token_types: Tokens) -> False | Token:
        """Consume and return the current token if its kind is one of
        *token_types*; otherwise leave the cursor unchanged and return False.

        Returns False rather than None so callers can distinguish
        "no match" unambiguously (Tokens.__bool__ is forced True for
        the same reason).
        """
        tok = self.token
        if tok.kind in token_types:
            self.next_symbol()
            return tok
        return False

    def peek(self, token_type: Tokens) -> False | Token:
        """Return the current token if it matches *token_type*, without
        consuming it; False otherwise."""
        tok = self.token
        if tok.kind == token_type:
            return tok
        return False

    def expect(self, token_type: Tokens) -> Token:
        """Consume one token of *token_type* or raise ParsingError."""
        r = self.accept(token_type)
        logger.debug("Expecting %s, got %s", token_type, r)
        if r is False:
            raise ParsingError(self.token.loc,
                               f"Unexpected token '{self.token}', wanted {token_type}")
        return r

    def factor(self) -> Expression:
        """factor := "(" expression ")" | number"""
        if self.accept(Tokens.Parens_Left):
            v = self.expression()
            self.expect(Tokens.Parens_Right)
            return v
        elif tok := self.accept(Tokens.Number):
            logger.debug("Found number %s", self.prev_token)
            # NOTE(review): int() drops any fractional part the tokenizer's
            # number pattern may have matched (e.g. "32.9") -- confirm
            # whether floats should be supported here.
            return Number(value=int(tok.value))
        else:
            raise ParsingError(self.token.loc,
                               f"Unexpected token '{self.token}', wanted parenthesized expression or "
                               f"number")

    def term(self) -> Expression:
        """term := factor (("*" | "/") factor)*

        Returns the single factor unchanged when no operator follows.
        """
        operations = [self.factor()]

        # NOTE(review): the matched operator token is discarded, so a
        # division is stored as a plain Product of its operands -- the
        # "*" vs "/" distinction is lost (Product has no operator slot).
        while operator := self.accept(Tokens.Op_Multiply, Tokens.Op_Divide):
            operations.append(self.factor())

        if len(operations) == 1:
            return operations[0]

        logger.debug("Product of the following terms: %s", operations)
        return Product(*operations)

    def summation(self) -> Expression:
        """summation := term (("+" | "-") term)*

        Returns the single term unchanged when no operator follows
        (hence the return type is Expression, not Sum).
        """
        operations = [self.term()]

        # NOTE(review): as in term(), "+" vs "-" is discarded, so a
        # subtraction is stored as a plain Sum of its operands.
        while operator := self.accept(Tokens.Op_Plus, Tokens.Op_Minus):
            operations.append(self.term())

        if len(operations) == 1:
            return operations[0]

        logger.debug("Sum of the following terms: %s", operations)
        return Sum(*operations)

    def expression(self) -> Expression:
        """expression := summation"""
        return self.summation()

    def root(self):
        """Parse a complete input: one expression followed by EOF."""
        self.expression()
        self.expect(Tokens.EOF)
|
||||
|
|
@ -1,180 +0,0 @@
|
|||
from __future__ import annotations
|
||||
|
||||
from dataclasses import dataclass
|
||||
|
||||
from beartype.typing import Union, Dict, Optional, List, Any, Tuple
|
||||
import abc
|
||||
|
||||
from .nodes import Node
|
||||
from .source import SourceLocation
|
||||
from .tokenizer import Token, Tokens
|
||||
from .logger import logger
|
||||
|
||||
class ParsingError(Exception):
|
||||
def __init__(self, location: SourceLocation, message: str = "Unknown error"):
|
||||
super().__init__(f"{message} at {str(location)}")
|
||||
self.location = location
|
||||
|
||||
@dataclass
|
||||
class EvalResult:
|
||||
result: EvalResult | List[EvalResult] | Token | None = None
|
||||
errors: Optional[ParsingError] = None
|
||||
name: Optional[str] = None
|
||||
|
||||
def __str__(self):
|
||||
#if isinstance(self.result, EvalResult):
|
||||
# return self.result
|
||||
|
||||
name = self.name if self.name is not None else self.__class__.__name__
|
||||
value = str(self.result)
|
||||
if type(self.result) == list:
|
||||
value = ", ".join(str(r) for r in self.result)
|
||||
return f"{name}({value})"
|
||||
|
||||
def __repr__(self):
|
||||
return super(EvalResult, self).__repr__()
|
||||
|
||||
RuleLike = Union[str, 'Rule']
|
||||
|
||||
class Rule(abc.ABC):
|
||||
_named_rules: Dict[str, Rule] = dict()
|
||||
|
||||
def __init__(self, *sub_rules: RuleLike):
|
||||
self._prepared: bool = False
|
||||
self._rules: List[RuleLike] = list(sub_rules)
|
||||
self.rules: List[Rule] = []
|
||||
self.name: Optional[str] = None
|
||||
self.node: Optional[Node] = None
|
||||
|
||||
@staticmethod
|
||||
def make_rule(name: str, rule: Rule, node: Optional[Node] = None) -> Rule:
|
||||
rule.name = name
|
||||
rule.node = node
|
||||
Rule._named_rules[name] = rule
|
||||
return rule
|
||||
|
||||
@abc.abstractmethod
|
||||
def evaluate(self, tokens: List[Token], *, depth: int = 0, parent: Optional[Rule] = None) -> EvalResult:
|
||||
raise NotImplementedError()
|
||||
|
||||
def prepare(self):
|
||||
if self._prepared:
|
||||
return
|
||||
|
||||
for key, rule in enumerate(self._rules):
|
||||
if type(rule) == str:
|
||||
self._rules[key] = Rule._named_rules[rule]
|
||||
|
||||
self.rules = self._rules
|
||||
|
||||
self._prepared = True
|
||||
for rule in self._rules:
|
||||
if not rule._prepared:
|
||||
rule.prepare()
|
||||
|
||||
def __repr__(self):
|
||||
return "{}:{}({})".format(self.__class__.__name__,
|
||||
self.name if self.name is not None else "",
|
||||
", ".join(rule.__class__.__name__ for rule in self.rules))
|
||||
|
||||
|
||||
class Terminal(Rule):
|
||||
def __init__(self, token_type: Tokens):
|
||||
super().__init__()
|
||||
self.token_type = token_type
|
||||
|
||||
def evaluate(self, tokens: List[Token], *, depth: int = 0, parent: Optional[Rule] = None) -> EvalResult:
|
||||
assert len(tokens) > 0
|
||||
result = EvalResult(name=self.name)
|
||||
logger.debug("%s", f"{depth}: Terminal: Evaluating terminal token with tokens: '{tokens}'")
|
||||
if len(tokens) != 1:
|
||||
result.errors = ParsingError(tokens[0].loc, message=f"Terminal rule must have exactly one token")
|
||||
return result
|
||||
|
||||
if tokens[0].kind != self.token_type:
|
||||
result.errors = ParsingError(
|
||||
tokens[0].loc,
|
||||
message=f"Unexpected token. Wanted {self.token_type.name}, got {tokens[0].kind.name}"
|
||||
)
|
||||
return result
|
||||
|
||||
result.result = tokens[0]
|
||||
|
||||
logger.debug(f"{depth}: Terminal: Found terminal node: {result}")
|
||||
|
||||
return result
|
||||
|
||||
def __repr__(self):
|
||||
return "{}({})".format(self.__class__.__name__, self.token_type)
|
||||
|
||||
|
||||
class Or(Rule):
|
||||
def evaluate(self, tokens: List[Token], *, depth: int = 0, parent: Optional[Rule] = None) -> EvalResult:
|
||||
result = EvalResult(errors=ParsingError(location=tokens[0].loc), name=self.name)
|
||||
rule: Rule
|
||||
for i, rule in enumerate(self.rules):
|
||||
logger.debug(f"{depth}: Or: Rule {i + 1}/{len(self.rules)} : trying rule: {rule}")
|
||||
result = rule.evaluate(tokens, depth=depth + 1)
|
||||
if result.errors is None:
|
||||
logger.debug(f"{depth}: Or: Rule {i + 1}/{len(self.rules)} '{rule}' matched, result: {result}")
|
||||
break
|
||||
|
||||
logger.debug(f"{depth}: Or: Finished with errors: {result.errors}")
|
||||
|
||||
result.name = self.name
|
||||
|
||||
return result
|
||||
|
||||
|
||||
class And(Rule):
|
||||
def evaluate(self, tokens: List[Token], *, depth: int = 0, parent: Optional[Rule] = None) -> EvalResult:
|
||||
result = EvalResult(errors=ParsingError(tokens[0].loc), name=self.name)
|
||||
result.result = []
|
||||
begin = 0
|
||||
end = 0
|
||||
for i, rule in enumerate(self.rules):
|
||||
logger.debug("%s", f"{depth}: And: Trying rule '{rule}'")
|
||||
|
||||
if end == len(tokens):
|
||||
logger.error("%s", f"{depth}: And: Oops, reached the end of the tokens")
|
||||
|
||||
best_r: Optional[EvalResult] = None
|
||||
while end < len(tokens):
|
||||
tokens_ = tokens[begin:end + 1]
|
||||
r = rule.evaluate(tokens_, depth=depth + 1)
|
||||
|
||||
# No previous match, but we found one
|
||||
if best_r is None and r.errors is None:
|
||||
logger.debug(f"{depth}: And: Rule {i + 1}/{len(self.rules)} '{rule}' matched, result: {r}")
|
||||
best_r = r
|
||||
|
||||
# No result at all. That's an error.
|
||||
elif best_r is None and r.errors is not None:
|
||||
result.errors = r.errors
|
||||
return result
|
||||
|
||||
# We had a result, but we still matched with more tokens
|
||||
elif best_r is not None and r.errors is None:
|
||||
logger.debug(f"{depth}: And: Rule {i + 1}/{len(self.rules)} '{rule}' **improved**, result: {r}")
|
||||
best_r = r
|
||||
|
||||
# We already have a match, and we can't improve it. Finish this rule.
|
||||
elif best_r is not None and r.errors is not None:
|
||||
logger.debug(
|
||||
f"{depth}: And: Rule {i + 1}/{len(self.rules)} '{rule}' has FINAL match, result: {best_r}")
|
||||
result.result += [best_r]
|
||||
# Matching rule ended at 'end - 1', meaning next rule will begin at end
|
||||
begin = end
|
||||
break
|
||||
|
||||
end += 1
|
||||
|
||||
else:
|
||||
logger.debug(
|
||||
f"{depth}: And: Rule {i + 1}/{len(self.rules)} '{rule}' has FINAL match, finishing rule, result: {best_r}")
|
||||
result.result += [best_r]
|
||||
result.errors = None
|
||||
|
||||
if end != len(tokens):
|
||||
logger.debug(f"{depth}: And: Didn't consume all tokens")
|
||||
return result
|
||||
|
|
@ -1,7 +1,7 @@
|
|||
from __future__ import annotations
|
||||
from dataclasses import dataclass, field
|
||||
from beartype import beartype
|
||||
from beartype.typing import Optional
|
||||
from beartype.typing import Optional, List
|
||||
|
||||
import enum
|
||||
import re
|
||||
|
|
@ -31,8 +31,12 @@ class Tokens(enum.Enum):
|
|||
Parens_Left = re.compile(r"\(")
|
||||
Parens_Right = re.compile(r"\)")
|
||||
Blank = re.compile(r"\s+")
|
||||
EOF = re.compile(r"\Z")
|
||||
Unknown = re.compile(r".*")
|
||||
|
||||
def __bool__(self):
|
||||
return True
|
||||
|
||||
|
||||
class Tokenizer:
|
||||
def __init__(self):
|
||||
|
|
@ -67,4 +71,7 @@ class Tokenizer:
|
|||
|
||||
results += [best_result]
|
||||
begin += len(best_result.value)
|
||||
results += [Token(Tokens.EOF, value=None, loc=SourceLocation(
|
||||
Location(line=0, character=len(data)), source=data
|
||||
))]
|
||||
return results
|
||||
Loading…
Add table
Add a link
Reference in a new issue