meta: initial commit
commit 3d15b6dd63
7 changed files with 435 additions and 0 deletions

compiler/__init__.py (Normal file, 0 lines)

compiler/__main__.py (Normal file, 86 lines)
@@ -0,0 +1,86 @@
from __future__ import annotations

from .nodes import Node
from .rules import Rule, And, Or, Terminal, EvalResult
from .tokenizer import Tokenizer, Tokens, Token

data = "2 * (32.9 + 1)"

grammar = r"""
number = r"[0-9]+(\.[0-9]*)?"
operator = "+" | "-" | "*" | "/"
value = "(" expression ")" | number | expression
expression = value operator value
root = expression
"""
number = Rule.make_rule("number",
    Terminal(token_type=Tokens.Number)
)
operator = Rule.make_rule("operator", Or(
    Terminal(Tokens.Op_Plus), Terminal(Tokens.Op_Minus),
    Terminal(Tokens.Op_Multiply), Terminal(Tokens.Op_Divide)
))

value = Rule.make_rule("value", Or(
    And(
        Terminal(Tokens.Parens_Left), "expression", Terminal(Tokens.Parens_Right)
    ),
    number,
    "expression"
))

expression = Rule.make_rule("expression", And(
    number,
    operator,
    number
))

root = expression
root.prepare()


def print_results(result: EvalResult) -> str:
    # Pretty-print an EvalResult tree, indenting by recursion depth, which is
    # tracked as an attribute on the function object itself.
    prefix = "\t" * print_results._depth
    message = ""
    if type(result) != EvalResult:
        message = f"{prefix}{str(result)}\n"
        return message

    print_results._depth += 1

    if result.name is not None:
        message += f"{prefix}{result.name}\n"

    if type(result.result) == list:
        message += prefix + "{"
        mylist = []
        for r in result.result:
            mylist += [print_results(r)]
        message += " ".join(mylist)
        message += prefix + "}"
    else:
        message += print_results(result.result) + "\n"
    print_results._depth -= 1
    return message


print_results._depth = 0


def main():
    tokenizer = Tokenizer()
    tokens = tokenizer.tokenize("2 + 3")

    # The tokenizer also emits whitespace; drop Blank tokens before parsing.
    tokens = [token for token in tokens if token.kind != Tokens.Blank]
    print(tokens)

    result = root.evaluate(tokens)
    print(result)
    print(print_results(result))
    if result.errors is not None:
        raise result.errors


if __name__ == "__main__":
    main()
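
Since the entry point lives in compiler/__main__.py, the package would
presumably be run as a module:

    python -m compiler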

compiler/logger.py (Normal file, 28 lines)
@@ -0,0 +1,28 @@
import logging
import enum


class LogLevel(enum.IntEnum):
    Critical = logging.CRITICAL
    Error = logging.ERROR
    Warning = logging.WARNING
    Info = logging.INFO
    Debug = logging.DEBUG


def make_logger(name: str, level: LogLevel = LogLevel.Debug) -> logging.Logger:
    _logger = logging.getLogger(name)
    _logger.setLevel(level)
    # create console handler and set level to debug
    ch = logging.StreamHandler()
    ch.setLevel(level)
    # create formatter
    formatter = logging.Formatter('%(name)s - %(levelname)s - %(message)s')
    ch.setFormatter(formatter)
    # add ch to logger
    _logger.addHandler(ch)

    return _logger


logger = make_logger("compiler")
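
With this formatter, records emitted through the module-level logger render
along the lines of:

    compiler - DEBUG - <message>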

compiler/nodes.py (Normal file, 4 lines)
@@ -0,0 +1,4 @@
from __future__ import annotations

class Node:
    pass

compiler/rules.py (Normal file, 183 lines)
@@ -0,0 +1,183 @@
from __future__ import annotations

from dataclasses import dataclass

from beartype.typing import Union, Dict, Optional, List
import abc

from .nodes import Node
from .source import SourceLocation
from .tokenizer import Token, Tokens
from .logger import logger


class ParsingError(Exception):
    def __init__(self, location: SourceLocation, message: str = "Unknown error"):
        super().__init__(f"{message} at {str(location)}")
        self.location = location


@dataclass
class EvalResult:
    result: EvalResult | List[EvalResult] | Token | None = None
    errors: Optional[ParsingError] = None
    name: Optional[str] = None

    def __str__(self):
        name = self.name if self.name is not None else self.__class__.__name__
        value = str(self.result)
        if type(self.result) == list:
            value = ", ".join(str(r) for r in self.result)
        return f"{name}({value})"

    def __repr__(self):
        return super(EvalResult, self).__repr__()


RuleLike = Union[str, 'Rule']


class Rule(abc.ABC):
    _named_rules: Dict[str, Rule] = dict()
    _depth: int = 0

    def __init__(self, *sub_rules: RuleLike):
        self._prepared: bool = False
        self._rules: List[RuleLike] = list(sub_rules)
        self.rules: List[Rule] = []
        self.name: Optional[str] = None
        self.node: Optional[Node] = None

    @staticmethod
    def make_rule(name: str, rule: Rule, node: Optional[Node] = None) -> Rule:
        rule.name = name
        rule.node = node
        Rule._named_rules[name] = rule
        return rule

    @abc.abstractmethod
    def evaluate(self, tokens: List[Token]) -> EvalResult:
        raise NotImplementedError()

    def prepare(self):
        if self._prepared:
            return

        # Replace string forward references with the registered Rule objects.
        for key, rule in enumerate(self._rules):
            if type(rule) == str:
                self._rules[key] = Rule._named_rules[rule]

        self.rules = self._rules

        self._prepared = True
        for rule in self._rules:
            if not rule._prepared:
                rule.prepare()

    def __repr__(self):
        return "{}:{}({})".format(self.__class__.__name__,
                                  self.name if self.name is not None else "",
                                  ", ".join(rule.__class__.__name__ for rule in self.rules))
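

# Three concrete rules follow. Terminal matches exactly one token of a given
# kind; Or returns the first alternative that evaluates without errors; And
# matches its sub-rules in sequence over a growing token window.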
class Terminal(Rule):
    def __init__(self, token_type: Tokens):
        super().__init__()
        self.token_type = token_type

    def evaluate(self, tokens: List[Token]) -> EvalResult:
        assert len(tokens) > 0
        result = EvalResult(name=self.name)
        logger.debug(f"{Rule._depth}: Terminal: Evaluating terminal token with tokens: '{tokens}'")
        if len(tokens) != 1:
            result.errors = ParsingError(tokens[0].loc, message="Terminal rule must have exactly one token")
            return result

        if tokens[0].kind != self.token_type:
            result.errors = ParsingError(
                tokens[0].loc,
                message=f"Unexpected token. Wanted {self.token_type.name}, got {tokens[0].kind.name}"
            )
            return result

        result.result = tokens[0]

        logger.debug(f"{Rule._depth}: Terminal: Found terminal node: {result}")

        return result

    def __repr__(self):
        return "{}({})".format(self.__class__.__name__, self.token_type)


class Or(Rule):
    def evaluate(self, tokens: List[Token]) -> EvalResult:
        result = EvalResult(errors=ParsingError(location=tokens[0].loc), name=self.name)
        rule: Rule
        for rule in self.rules:
            logger.debug(f"{Rule._depth}: Or: trying rule: {rule}")
            Rule._depth += 1
            result = rule.evaluate(tokens)
            Rule._depth -= 1
            if result.errors is None:
                logger.debug(f"{Rule._depth}: Or: Rule '{rule}' matched, result: {result}")
                break

        logger.debug(f"{Rule._depth}: Or: Finished with errors: {result.errors}")

        result.name = self.name

        return result
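

# And grows its token window one token at a time and keeps the longest
# prefix each sub-rule accepts (greedy, maximal-munch matching); the next
# sub-rule then resumes where the previous one stopped.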
class And(Rule):
    def evaluate(self, tokens: List[Token]) -> EvalResult:
        result = EvalResult(errors=ParsingError(tokens[0].loc), name=self.name)
        result.result = []
        begin = 0
        end = 0
        for i, rule in enumerate(self.rules):
            logger.debug("%s", f"{Rule._depth}: And: Trying rule '{rule}'")

            if end == len(tokens):
                logger.error("%s", f"{Rule._depth}: And: Oops, reached the end of the tokens")

            best_r: Optional[EvalResult] = None
            while end < len(tokens):
                tokens_ = tokens[begin:end + 1]
                Rule._depth += 1
                r = rule.evaluate(tokens_)
                Rule._depth -= 1

                # No previous match, but we found one
                if best_r is None and r.errors is None:
                    logger.debug(f"{Rule._depth}: And: Rule {i + 1}/{len(self.rules)} '{rule}' matched, result: {r}")
                    best_r = r

                # No result at all. That's an error.
                elif best_r is None and r.errors is not None:
                    result.errors = r.errors
                    return result

                # We had a result, but we still matched with more tokens
                elif best_r is not None and r.errors is None:
                    logger.debug(f"{Rule._depth}: And: Rule {i + 1}/{len(self.rules)} '{rule}' **improved**, result: {r}")
                    best_r = r

                # We already have a match, and we can't improve it. Finish this rule.
                elif best_r is not None and r.errors is not None:
                    logger.debug(f"{Rule._depth}: And: Rule {i + 1}/{len(self.rules)} '{rule}' has FINAL match, result: {best_r}")
                    result.result += [best_r]
                    # Matching rule ended at 'end - 1', meaning next rule will begin at end
                    begin = end
                    break

                end += 1

            # while-else: the window ran off the end of the token list while the
            # current rule was still matching; keep the best match found so far.
            else:
                logger.debug(
                    f"{Rule._depth}: And: Rule {i + 1}/{len(self.rules)} '{rule}' has FINAL match, finishing rule, result: {best_r}")
                result.result += [best_r]
                result.errors = None

        if end != len(tokens):
            logger.debug(f"{Rule._depth}: And: Didn't consume all tokens")
        return result

compiler/source.py (Normal file, 64 lines)
@@ -0,0 +1,64 @@
from __future__ import annotations

from beartype import beartype
from beartype.typing import Optional
from dataclasses import dataclass


@beartype
@dataclass
class Location:
    line: int
    character: int
    file: str = "<none>"

    def __str__(self) -> str:
        return f"{self.file}:{self.line}:{self.character}"


@beartype
class SourceLocation:
    def __init__(self, begin: Location, end: Optional[Location] = None, source: Optional[str] = None):
        self.begin = begin
        self.end = end
        if self.end is None:
            self.end = self.begin

        self.source = source

        assert (self.begin.line, self.begin.character) <= (self.end.line, self.end.character)
        assert self.begin.file == self.end.file

    def __str__(self):
        if self.begin == self.end:
            return str(self.begin)
        return f"{str(self.begin)} - {str(self.end)}"

    @property
    def source_substring(self) -> str:
        # The text covered by this span (end character inclusive).
        source = self.source.splitlines(keepends=False)
        source_lines = source[self.begin.line:self.end.line + 1]
        if len(source_lines) == 1:
            source_lines[0] = source_lines[0][self.begin.character:self.end.character + 1]
        else:
            source_lines[0] = source_lines[0][self.begin.character:]
            source_lines[-1] = source_lines[-1][:self.end.character]
        return "\n".join(source_lines)

    def show_in_source(self) -> str:
        # Render the source line with a caret marker underneath the span.
        source = self.source.splitlines(keepends=False)
        source_line = source[self.begin.line]
        result = [source_line]
        if self.begin.line != self.end.line:
            return "\n".join(result)

        line = " " * self.begin.character
        line += "^" + "-" * max(0, (self.end.character - self.begin.character - 1))
        line += " " * (len(source_line) - len(line))

        result += [line]

        return "\n".join(result)
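
A quick sketch of how these spans render, with hypothetical values (this
snippet is not part of the commit):

    loc = SourceLocation(Location(line=0, character=4), source="2 * (32.9 + 1)")
    print(loc.show_in_source())
    # 2 * (32.9 + 1)
    #     ^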

compiler/tokenizer.py (Normal file, 70 lines)
@@ -0,0 +1,70 @@
from __future__ import annotations
from dataclasses import dataclass, field
from beartype import beartype
from beartype.typing import List, Optional

import enum
import re

from .logger import logger
from .source import SourceLocation, Location


@beartype
@dataclass
class Token:
    kind: Tokens
    loc: Optional[SourceLocation] = field(compare=False, hash=False, default=None)
    value: Optional[str] = field(compare=False, hash=False, default=None)

    def __repr__(self):
        if self.value is None:
            return super().__repr__()
        return f"{self.kind.name}({repr(self.value)})"


class Tokens(enum.Enum):
    Number = re.compile(r"[0-9]+(\.?[0-9]*)")
    Op_Plus = re.compile(r"\+")
    Op_Minus = re.compile(r"-")
    Op_Multiply = re.compile(r"\*")
    Op_Divide = re.compile(r"/")
    Parens_Left = re.compile(r"\(")
    Parens_Right = re.compile(r"\)")
    Blank = re.compile(r"\s+")
    Unknown = re.compile(r".*")


class Tokenizer:
    def __init__(self):
        pass

    def tokenize(self, data: str) -> List[Token]:
        results: List[Token] = []
        begin = 0
        while begin < len(data):
            # Fall back to Unknown unless some pattern matches at 'begin'.
            best_result: Token = Token(Tokens.Unknown,
                loc=SourceLocation(Location(line=0, character=begin), source=data)
            )
            for token_kind in Tokens:
                if token_kind == Tokens.Unknown:
                    continue
                match = token_kind.value.match(data, begin)
                if match is not None:
                    logger.debug(f"Got match: {match}")
                    result = match.group(0)
                    # Keep the longest match across all token kinds.
                    if best_result.kind == Tokens.Unknown or len(result) >= len(best_result.value):
                        loc = SourceLocation(
                            begin=Location(line=0, character=begin),
                            end=Location(line=0, character=begin + len(result))
                        )
                        best_result = Token(token_kind, value=result, loc=loc)
                        logger.debug(f"New best match: {best_result}")

            if best_result.kind == Tokens.Unknown:
                source_hint = best_result.loc.show_in_source()
                logger.error(f"{best_result.loc}: Unknown token '{best_result.loc.source_substring}'\n{source_hint}")
                exit(1)

            results += [best_result]
            begin += len(best_result.value)
        return results
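
For the sample input in compiler/__main__.py, tokenize would presumably yield
(whitespace tokens included, given Token.__repr__ above):

    >>> Tokenizer().tokenize("2 + 3")
    [Number('2'), Blank(' '), Op_Plus('+'), Blank(' '), Number('3')]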