parser: fully rewrite parsing
Use a simpler and more direct recursive descent method.
This commit is contained in:
parent
14813a8bdf
commit
fc9b6b30c6
5 changed files with 151 additions and 252 deletions
|
|
@ -1,73 +1,10 @@
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
from .nodes import Node
|
from .tokenizer import Tokenizer, Tokens
|
||||||
from .rules import Rule, And, Or, Terminal, EvalResult
|
from .parser import Parser
|
||||||
from .tokenizer import Tokenizer, Tokens, Token
|
|
||||||
|
|
||||||
data = "2 * (32.9 + 1)"
|
data = "2 * (32.9 + 1)"
|
||||||
|
|
||||||
# Textual sketch of the grammar. Nothing in this view reads this string;
# the rules are built by hand right after it.
grammar = r"""
|
|
||||||
number = r"[0-9]+(\.[0-9]*)?"
|
|
||||||
operator = "+" | "-" | "*" | "/"
|
|
||||||
value = "(" expression ")" | number | expression
|
|
||||||
expression = value operator value
|
|
||||||
root = expression
|
|
||||||
"""
|
|
||||||
|
|
||||||
# "number" rule: a single terminal matching a Number token.
number = Rule.make_rule("number",
|
|
||||||
Terminal(token_type=Tokens.Number)
|
|
||||||
)
|
|
||||||
# "operator" rule: any one of the four arithmetic operator tokens.
operator = Rule.make_rule("operator", Or(
|
|
||||||
Terminal(Tokens.Op_Plus), Terminal(Tokens.Op_Minus),
|
|
||||||
Terminal(Tokens.Op_Multiply), Terminal(Tokens.Op_Divide)
|
|
||||||
))
|
|
||||||
|
|
||||||
# "value" rule: a parenthesized group, a number, or the rule registered under
# the name "expression" (passed as a string because it is registered later).
# NOTE(review): the grammar text above puts an expression between the
# parentheses, but the rule built here puts `operator` there - confirm intent.
value = Rule.make_rule("value", Or(
|
|
||||||
And(
|
|
||||||
Terminal(Tokens.Parens_Left), operator, Terminal(Tokens.Parens_Right)
|
|
||||||
),
|
|
||||||
number,
|
|
||||||
"expression"
|
|
||||||
))
|
|
||||||
|
|
||||||
# "expression" rule: number, operator, number in sequence.
# NOTE(review): the grammar text says `value operator value`; this rule uses
# `number` on both sides instead - confirm intent.
expression = Rule.make_rule("expression", And(
|
|
||||||
number,
|
|
||||||
operator,
|
|
||||||
number
|
|
||||||
))
|
|
||||||
|
|
||||||
# The root rule is the expression rule itself. prepare() must run after all
# make_rule() calls above so that rule names given as strings can be looked up.
root = expression
|
|
||||||
root.prepare()
|
|
||||||
|
|
||||||
|
|
||||||
def print_results(result: EvalResult) -> str:
    """Render an evaluation result as tab-indented, multi-line text.

    Anything that is not exactly an ``EvalResult`` is printed as a single
    indented line. An ``EvalResult`` prints its name (when it has one) and
    then its payload one level deeper: a list payload is wrapped in braces,
    any other payload is rendered recursively.

    The current nesting depth is kept on the function attribute
    ``print_results._depth``.
    """
    indent = print_results._depth * "\t"

    # Leaf value: print it as-is at the current depth.
    if type(result) is not EvalResult:
        return f"{indent}{str(result)}\n"

    print_results._depth += 1
    pieces = []

    if result.name is not None:
        pieces.append(f"{indent}{result.name}\n")

    payload = result.result
    if type(payload) is list:
        rendered = [print_results(child) for child in payload]
        pieces.append(indent + "{\n")
        pieces.append(" ".join(rendered))
        pieces.append(indent + "}\n")
    else:
        pieces.append(print_results(payload) + "\n")

    print_results._depth -= 1
    return "".join(pieces)


# Nesting depth used by print_results for indentation; starts at top level.
print_results._depth = 0
|
|
||||||
|
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
tokenizer = Tokenizer()
|
tokenizer = Tokenizer()
|
||||||
tokens = tokenizer.tokenize("2 + 3")
|
tokens = tokenizer.tokenize("2 + 3")
|
||||||
|
|
@ -75,11 +12,8 @@ def main():
|
||||||
tokens = [token for token in tokens if token.kind != Tokens.Blank]
|
tokens = [token for token in tokens if token.kind != Tokens.Blank]
|
||||||
print(tokens)
|
print(tokens)
|
||||||
|
|
||||||
result = root.evaluate(tokens)
|
parser = Parser(tokens)
|
||||||
print(result)
|
parser.root()
|
||||||
print(print_results(result))
|
|
||||||
if result.errors is not None:
|
|
||||||
raise result.errors
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
|
|
||||||
|
|
@ -1,4 +1,36 @@
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
|
|
||||||
class Node:
|
class Node:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
class Operator(Node):
|
||||||
|
op: str
|
||||||
|
|
||||||
|
|
||||||
|
class Sum(Node):
|
||||||
|
def __init__(self, *values: Expression):
|
||||||
|
self.values = values
|
||||||
|
|
||||||
|
def __repr__(self):
|
||||||
|
return f"{self.__class__.__name__}({', '.join(repr(v) for v in self.values)})"
|
||||||
|
|
||||||
|
|
||||||
|
class Product(Node):
|
||||||
|
def __init__(self, *values: Expression):
|
||||||
|
self.values = values
|
||||||
|
|
||||||
|
def __repr__(self):
|
||||||
|
return f"{self.__class__.__name__}({', '.join(repr(v) for v in self.values)})"
|
||||||
|
|
||||||
|
|
||||||
|
class Number(Node):
|
||||||
|
def __init__(self, value: int):
|
||||||
|
self.value = value
|
||||||
|
|
||||||
|
def __repr__(self):
|
||||||
|
return f"{self.__class__.__name__}({self.value})"
|
||||||
|
|
||||||
|
|
||||||
|
Expression = Sum | Product | Number
|
||||||
|
|
|
||||||
106
compiler/parser.py
Normal file
106
compiler/parser.py
Normal file
|
|
@ -0,0 +1,106 @@
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from beartype.typing import List
|
||||||
|
|
||||||
|
from .logger import make_logger
|
||||||
|
from .nodes import Number, Sum, Expression, Product
|
||||||
|
from .source import SourceLocation
|
||||||
|
from .tokenizer import Tokens, Token
|
||||||
|
|
||||||
|
logger = make_logger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
class ParsingError(Exception):
    """Error raised when the token stream cannot be parsed.

    The exception text is the message followed by the location; the location
    object itself is kept on the ``location`` attribute.
    """

    def __init__(self, location: SourceLocation, message: str = "Unknown error"):
        self.location = location
        description = f"{message} at {str(location)}"
        super().__init__(description)
|
||||||
|
|
||||||
|
|
||||||
|
class Parser:
    """Recursive descent parser turning a token list into an expression tree.

    Grammar implemented by the methods below::

        root       = expression EOF
        expression = summation
        summation  = term (("+" | "-") term)*
        term       = factor (("*" | "/") factor)*
        factor     = "(" expression ")" | number
    """

    def __init__(self, tokens: List[Token]):
        """Store the tokens to parse and start at the first one."""
        self.tokens = tokens
        self.pos = 0

    @property
    def token(self) -> Token:
        """Current token, or a freshly built EOF token once past the end."""
        if self.pos >= len(self.tokens):
            return Token(kind=Tokens.EOF)
        return self.tokens[self.pos]

    @property
    def prev_token(self) -> Token:
        """Token immediately before the current position.

        Raises:
            IndexError: if no token has been consumed yet. Without this
                guard, index -1 would silently return the *last* token.
        """
        if self.pos < 1:
            raise IndexError("no token has been consumed yet")
        return self.tokens[self.pos - 1]

    def next_symbol(self):
        """Advance to the next token."""
        self.pos += 1
        logger.debug("Advancing to token %s %s", self.pos, self.token)

    def accept(self, *token_types: Tokens) -> Token | bool:
        """Consume the current token if its kind is one of ``token_types``.

        Returns:
            The consumed token, or ``False`` (position unchanged) otherwise.
        """
        tok = self.token
        if tok.kind in token_types:
            self.next_symbol()
            return tok
        return False

    def peek(self, token_type: Tokens) -> Token | bool:
        """Return the current token if it is of ``token_type``, else ``False``.

        Never advances the position.
        """
        tok = self.token
        if tok.kind == token_type:
            return tok
        return False

    def expect(self, token_type: Tokens) -> Token:
        """Consume a token of ``token_type`` or fail.

        Raises:
            ParsingError: if the current token has a different kind.
        """
        r = self.accept(token_type)
        logger.debug("Expecting %s, got %s", token_type, r)
        if r is False:
            raise ParsingError(self.token.loc, f"Unexpected token '{self.token}', wanted {token_type}")
        return r

    @staticmethod
    def _number_value(text: str) -> int | float:
        """Convert number text to ``int`` when possible, otherwise ``float``."""
        try:
            return int(text)
        except ValueError:
            # int() rejects decimal text such as "32.9"; fall back to float.
            return float(text)

    def factor(self) -> Expression:
        """Parse a parenthesized expression or a number literal.

        Raises:
            ParsingError: if the current token starts neither of them, or if
                the closing parenthesis is missing.
        """
        if self.accept(Tokens.Parens_Left):
            inner = self.expression()
            self.expect(Tokens.Parens_Right)
            return inner

        if tok := self.accept(Tokens.Number):
            logger.debug("Found number %s", tok)
            return Number(value=self._number_value(tok.value))

        raise ParsingError(self.token.loc, f"Unexpected token '{self.token}', wanted parenthesized expression or "
                                           f"number")

    def term(self) -> Expression:
        """Parse factors joined by ``*`` or ``/``.

        Returns the single factor unchanged when there is no operator,
        otherwise a ``Product`` of all factors.
        """
        operations = [self.factor()]

        # NOTE(review): the accepted operator token is not recorded, so
        # division currently yields the same Product node as multiplication.
        while self.accept(Tokens.Op_Multiply, Tokens.Op_Divide):
            operations.append(self.factor())

        if len(operations) == 1:
            return operations[0]

        logger.debug("Product of the following terms: %s", operations)
        return Product(*operations)

    def summation(self) -> Expression:
        """Parse terms joined by ``+`` or ``-``.

        Returns the single term unchanged when there is no operator,
        otherwise a ``Sum`` of all terms.
        """
        operations = [self.term()]

        # NOTE(review): the accepted operator token is not recorded, so
        # subtraction currently yields the same Sum node as addition.
        while self.accept(Tokens.Op_Plus, Tokens.Op_Minus):
            operations.append(self.term())

        if len(operations) == 1:
            return operations[0]

        logger.debug("Sum of the following terms: %s", operations)
        return Sum(*operations)

    def expression(self) -> Expression:
        """Parse one expression (currently just a summation)."""
        return self.summation()

    def root(self) -> Expression:
        """Parse a whole input: one expression followed by end of input.

        Returns:
            The parsed expression tree (previously it was dropped and
            ``None`` was returned).

        Raises:
            ParsingError: if tokens remain after the expression.
        """
        tree = self.expression()
        self.expect(Tokens.EOF)
        return tree
|
||||||
|
|
@ -1,180 +0,0 @@
|
||||||
from __future__ import annotations
|
|
||||||
|
|
||||||
from dataclasses import dataclass
|
|
||||||
|
|
||||||
from beartype.typing import Union, Dict, Optional, List, Any, Tuple
|
|
||||||
import abc
|
|
||||||
|
|
||||||
from .nodes import Node
|
|
||||||
from .source import SourceLocation
|
|
||||||
from .tokenizer import Token, Tokens
|
|
||||||
from .logger import logger
|
|
||||||
|
|
||||||
class ParsingError(Exception):
    """Parsing failure tied to a position in the source.

    ``str()`` of the exception reads "<message> at <location>"; the location
    passed in is also available as ``self.location``.
    """

    def __init__(self, location: SourceLocation, message: str = "Unknown error"):
        Exception.__init__(self, f"{message} at {str(location)}")
        self.location = location
|
|
||||||
|
|
||||||
@dataclass
class EvalResult:
    """Outcome of evaluating a rule.

    ``str()`` renders the result as ``name(value)``, falling back to the
    class name when ``name`` is unset; a list payload is rendered as its
    comma-separated items.
    """

    # Payload of the evaluation: a nested result, a list of results, a token,
    # or None when nothing has been stored.
    result: EvalResult | List[EvalResult] | Token | None = None
    # Error recorded for this evaluation; None means no error.
    errors: Optional[ParsingError] = None
    # Optional label used when printing.
    name: Optional[str] = None

    def __str__(self):
        label = self.name if self.name is not None else self.__class__.__name__
        # isinstance (rather than an exact type comparison) so that list
        # subclasses are rendered item by item as well.
        if isinstance(self.result, list):
            value = ", ".join(str(r) for r in self.result)
        else:
            value = str(self.result)
        return f"{label}({value})"

    def __repr__(self):
        # Defined explicitly so @dataclass does not generate a field-based
        # repr; the default object repr is used instead.
        return super().__repr__()
|
|
||||||
|
|
||||||
RuleLike = Union[str, 'Rule']
|
|
||||||
|
|
||||||
class Rule(abc.ABC):
    """Abstract base class of the grammar rules.

    A rule holds sub-rules, given either as ``Rule`` objects or as the name
    of a rule registered with ``make_rule``. Names are replaced by the
    registered objects when ``prepare`` runs.
    """

    # Registry of every rule registered through make_rule(), keyed by name.
    # It is a class attribute, so it is shared by all rules.
    _named_rules: Dict[str, Rule] = dict()

    def __init__(self, *sub_rules: RuleLike):
        self._prepared: bool = False
        # Sub-rules as given; may still contain names until prepare() runs.
        self._rules: List[RuleLike] = list(sub_rules)
        # Resolved sub-rules; empty until prepare() runs.
        self.rules: List[Rule] = []
        self.name: Optional[str] = None
        self.node: Optional[Node] = None

    @staticmethod
    def make_rule(name: str, rule: Rule, node: Optional[Node] = None) -> Rule:
        """Name ``rule``, attach ``node`` to it and register it under ``name``.

        Returns:
            The same ``rule`` object, for convenient assignment.
        """
        rule.name = name
        rule.node = node
        Rule._named_rules[name] = rule
        return rule

    @abc.abstractmethod
    def evaluate(self, tokens: List[Token], *, depth: int = 0, parent: Optional[Rule] = None) -> EvalResult:
        """Evaluate this rule against ``tokens``; implemented by subclasses."""
        raise NotImplementedError()

    def prepare(self):
        """Resolve sub-rules given by name, then prepare every sub-rule.

        Calling it again on an already prepared rule does nothing.

        Raises:
            KeyError: if a sub-rule name was never registered.
        """
        if self._prepared:
            return

        for key, rule in enumerate(self._rules):
            # isinstance (rather than an exact type comparison) so that str
            # subclasses are treated as names too.
            if isinstance(rule, str):
                try:
                    self._rules[key] = Rule._named_rules[rule]
                except KeyError:
                    raise KeyError(f"Unknown rule name '{rule}' referenced by rule '{self.name}'") from None

        self.rules = self._rules

        # Mark as prepared before recursing so rules that refer to each other
        # (directly or indirectly) do not recurse forever.
        self._prepared = True
        for rule in self._rules:
            if not rule._prepared:
                rule.prepare()

    def __repr__(self):
        return "{}:{}({})".format(self.__class__.__name__,
                                  self.name if self.name is not None else "",
                                  ", ".join(rule.__class__.__name__ for rule in self.rules))
|
|
||||||
|
|
||||||
|
|
||||||
class Terminal(Rule):
    """Rule that matches exactly one token of a given kind."""

    def __init__(self, token_type: Tokens):
        super().__init__()
        self.token_type = token_type

    def evaluate(self, tokens: List[Token], *, depth: int = 0, parent: Optional[Rule] = None) -> EvalResult:
        """Match ``tokens`` against this terminal.

        The returned result carries the token on success. It carries a
        ``ParsingError`` in ``errors`` when more than one token was supplied
        or when the single token has the wrong kind.
        """
        assert len(tokens) > 0
        outcome = EvalResult(name=self.name)
        logger.debug("%s", f"{depth}: Terminal: Evaluating terminal token with tokens: '{tokens}'")

        head = tokens[0]
        if len(tokens) != 1:
            outcome.errors = ParsingError(head.loc, message=f"Terminal rule must have exactly one token")
        elif head.kind != self.token_type:
            outcome.errors = ParsingError(
                head.loc,
                message=f"Unexpected token. Wanted {self.token_type.name}, got {head.kind.name}"
            )
        else:
            outcome.result = head
            logger.debug(f"{depth}: Terminal: Found terminal node: {outcome}")

        return outcome

    def __repr__(self):
        return "{}({})".format(self.__class__.__name__, self.token_type)
|
|
||||||
|
|
||||||
|
|
||||||
# Alternation rule: hands the same token list to each sub-rule in turn and
# stops at the first one whose result carries no errors.
class Or(Rule):
|
|
||||||
def evaluate(self, tokens: List[Token], *, depth: int = 0, parent: Optional[Rule] = None) -> EvalResult:
|
|
||||||
# Default outcome: an "Unknown error" located at the first token. It is
# replaced by the result of each sub-rule that gets evaluated.
result = EvalResult(errors=ParsingError(location=tokens[0].loc), name=self.name)
|
|
||||||
rule: Rule
|
|
||||||
for i, rule in enumerate(self.rules):
|
|
||||||
logger.debug(f"{depth}: Or: Rule {i + 1}/{len(self.rules)} : trying rule: {rule}")
|
|
||||||
result = rule.evaluate(tokens, depth=depth + 1)
|
|
||||||
if result.errors is None:
|
|
||||||
logger.debug(f"{depth}: Or: Rule {i + 1}/{len(self.rules)} '{rule}' matched, result: {result}")
|
|
||||||
break
|
|
||||||
|
|
||||||
# NOTE(review): indentation is lost in this view, so it cannot be told whether
# this message is logged once after the loop or on every failed iteration.
logger.debug(f"{depth}: Or: Finished with errors: {result.errors}")
|
|
||||||
|
|
||||||
# The returned result is relabelled with this rule's own name, whichever
# sub-rule produced it.
result.name = self.name
|
|
||||||
|
|
||||||
return result
|
|
||||||
|
|
||||||
|
|
||||||
# Sequence rule: tries to match its sub-rules one after another on consecutive
# slices of the token list, collecting one result per sub-rule.
# NOTE(review): indentation is lost in this view; the nesting described in the
# comments below is the apparent one and should be confirmed against the file.
class And(Rule):
|
|
||||||
def evaluate(self, tokens: List[Token], *, depth: int = 0, parent: Optional[Rule] = None) -> EvalResult:
|
|
||||||
# Starts out as a failure ("Unknown error" at the first token) with an empty
# list that receives the per-sub-rule results.
result = EvalResult(errors=ParsingError(tokens[0].loc), name=self.name)
|
|
||||||
result.result = []
|
|
||||||
# tokens[begin:end + 1] is the slice currently offered to a sub-rule.
begin = 0
|
|
||||||
end = 0
|
|
||||||
for i, rule in enumerate(self.rules):
|
|
||||||
logger.debug("%s", f"{depth}: And: Trying rule '{rule}'")
|
|
||||||
|
|
||||||
# Only logs the condition; evaluation carries on afterwards.
if end == len(tokens):
|
|
||||||
logger.error("%s", f"{depth}: And: Oops, reached the end of the tokens")
|
|
||||||
|
|
||||||
# Most recent error-free result for the current sub-rule, if any.
best_r: Optional[EvalResult] = None
|
|
||||||
# Grow the slice one token at a time and re-evaluate the sub-rule on it.
while end < len(tokens):
|
|
||||||
tokens_ = tokens[begin:end + 1]
|
|
||||||
r = rule.evaluate(tokens_, depth=depth + 1)
|
|
||||||
|
|
||||||
# No previous match, but we found one
|
|
||||||
if best_r is None and r.errors is None:
|
|
||||||
logger.debug(f"{depth}: And: Rule {i + 1}/{len(self.rules)} '{rule}' matched, result: {r}")
|
|
||||||
best_r = r
|
|
||||||
|
|
||||||
# No result at all. That's an error.
|
|
||||||
elif best_r is None and r.errors is not None:
|
|
||||||
# The sub-rule's error becomes this rule's error and evaluation stops.
result.errors = r.errors
|
|
||||||
return result
|
|
||||||
|
|
||||||
# We had a result, but we still matched with more tokens
|
|
||||||
elif best_r is not None and r.errors is None:
|
|
||||||
logger.debug(f"{depth}: And: Rule {i + 1}/{len(self.rules)} '{rule}' **improved**, result: {r}")
|
|
||||||
best_r = r
|
|
||||||
|
|
||||||
# We already have a match, and we can't improve it. Finish this rule.
|
|
||||||
elif best_r is not None and r.errors is not None:
|
|
||||||
logger.debug(
|
|
||||||
f"{depth}: And: Rule {i + 1}/{len(self.rules)} '{rule}' has FINAL match, result: {best_r}")
|
|
||||||
result.result += [best_r]
|
|
||||||
# Matching rule ended at 'end - 1', meaning next rule will begin at end
|
|
||||||
begin = end
|
|
||||||
break
|
|
||||||
|
|
||||||
end += 1
|
|
||||||
|
|
||||||
# NOTE(review): presumably the `else` of the `while` above, i.e. taken when the
# tokens run out without a `break` - confirm. best_r is appended as-is here.
else:
|
|
||||||
logger.debug(
|
|
||||||
f"{depth}: And: Rule {i + 1}/{len(self.rules)} '{rule}' has FINAL match, finishing rule, result: {best_r}")
|
|
||||||
result.result += [best_r]
|
|
||||||
# NOTE(review): whether this reset belongs to the `else` branch or runs after
# the `for` loop cannot be determined from this view.
result.errors = None
|
|
||||||
|
|
||||||
# Leftover tokens are only reported in the debug log.
if end != len(tokens):
|
|
||||||
logger.debug(f"{depth}: And: Didn't consume all tokens")
|
|
||||||
return result
|
|
||||||
|
|
@ -1,7 +1,7 @@
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
from dataclasses import dataclass, field
|
from dataclasses import dataclass, field
|
||||||
from beartype import beartype
|
from beartype import beartype
|
||||||
from beartype.typing import Optional
|
from beartype.typing import Optional, List
|
||||||
|
|
||||||
import enum
|
import enum
|
||||||
import re
|
import re
|
||||||
|
|
@ -31,8 +31,12 @@ class Tokens(enum.Enum):
|
||||||
Parens_Left = re.compile(r"\(")
|
Parens_Left = re.compile(r"\(")
|
||||||
Parens_Right = re.compile(r"\)")
|
Parens_Right = re.compile(r"\)")
|
||||||
Blank = re.compile(r"\s+")
|
Blank = re.compile(r"\s+")
|
||||||
|
EOF = re.compile(r"\Z")
|
||||||
Unknown = re.compile(r".*")
|
Unknown = re.compile(r".*")
|
||||||
|
|
||||||
|
# Truth value of a member: always True, independent of the member's value.
def __bool__(self):
|
||||||
|
return True
|
||||||
|
|
||||||
|
|
||||||
class Tokenizer:
|
class Tokenizer:
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
|
|
@ -67,4 +71,7 @@ class Tokenizer:
|
||||||
|
|
||||||
results += [best_result]
|
results += [best_result]
|
||||||
begin += len(best_result.value)
|
begin += len(best_result.value)
|
||||||
|
results += [Token(Tokens.EOF, value=None, loc=SourceLocation(
|
||||||
|
Location(line=0, character=len(data)), source=data
|
||||||
|
))]
|
||||||
return results
|
return results
|
||||||
Loading…
Add table
Add a link
Reference in a new issue