meta: initial commit

Antoine Viallon 2023-04-20 21:52:13 +02:00
commit 3d15b6dd63
Signed by: aviallon
GPG key ID: D126B13AB555E16F
7 changed files with 435 additions and 0 deletions

0  compiler/__init__.py  Normal file

86  compiler/__main__.py  Normal file

@@ -0,0 +1,86 @@
from __future__ import annotations

from .nodes import Node
from .rules import Rule, And, Or, Terminal, EvalResult
from .tokenizer import Tokenizer, Tokens, Token

data = "2 * (32.9 + 1)"  # target input; main() currently parses the simpler "2 + 3"

# Reference grammar; the Rule objects below implement it by hand.
grammar = r"""
number = r"[0-9]+(\.[0-9]*)?"
operator = "+" | "-" | "*" | "/"
value = "(" expression ")" | number | expression
expression = value operator value
root = expression
"""

number = Rule.make_rule("number",
    Terminal(token_type=Tokens.Number)
)

operator = Rule.make_rule("operator", Or(
    Terminal(Tokens.Op_Plus), Terminal(Tokens.Op_Minus),
    Terminal(Tokens.Op_Multiply), Terminal(Tokens.Op_Divide)
))

value = Rule.make_rule("value", Or(
    And(
        # "(" expression ")", as in the grammar above.
        Terminal(Tokens.Parens_Left), "expression", Terminal(Tokens.Parens_Right)
    ),
    number,
    "expression"
))

expression = Rule.make_rule("expression", And(
    number,
    operator,
    number
))

root = expression
root.prepare()


def print_results(result: EvalResult) -> str:
    prefix = "\t" * print_results._depth
    message = ""
    if not isinstance(result, EvalResult):
        # Leaf value (e.g. a Token): print it as-is.
        message = f"{prefix}{result}\n"
        return message
    print_results._depth += 1
    if result.name is not None:
        message += f"{prefix}{result.name}\n"
    if isinstance(result.result, list):
        message += prefix + "{"
        mylist = []
        for r in result.result:
            mylist += [print_results(r)]
        message += " ".join(mylist)
        message += prefix + "}"
    else:
        message += print_results(result.result) + "\n"
    print_results._depth -= 1
    return message


print_results._depth = 0


def main():
    tokenizer = Tokenizer()
    tokens = tokenizer.tokenize("2 + 3")
    # Whitespace is not significant for this grammar: drop blank tokens.
    tokens = [token for token in tokens if token.kind != Tokens.Blank]
    print(tokens)
    result = root.evaluate(tokens)
    print(result)
    print(print_results(result))
    if result.errors is not None:
        raise result.errors


if __name__ == "__main__":
    main()
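Note: since the entry point is compiler/__main__.py, the demo should be runnable from the repository root as "python -m compiler". For the hard-coded input "2 + 3", the first line printed would presumably be the filtered token list, [Number('2'), Op_Plus('+'), Number('3')], given Token.__repr__ in tokenizer.py below.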

28  compiler/logger.py  Normal file

@@ -0,0 +1,28 @@
import logging
import enum


class LogLevel(enum.IntEnum):
    Critical = logging.CRITICAL
    Error = logging.ERROR
    Warning = logging.WARNING
    Info = logging.INFO
    Debug = logging.DEBUG


def make_logger(name: str, level: LogLevel = LogLevel.Debug) -> logging.Logger:
    _logger = logging.getLogger(name)
    _logger.setLevel(level)
    # create console handler and set level to debug
    ch = logging.StreamHandler()
    ch.setLevel(level)
    # create formatter
    formatter = logging.Formatter('%(name)s - %(levelname)s - %(message)s')
    ch.setFormatter(formatter)
    # add ch to logger
    _logger.addHandler(ch)
    return _logger


logger = make_logger("compiler")
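For reference, a minimal usage sketch (illustrative, not part of this commit; the logger name "demo" is arbitrary):

from compiler.logger import make_logger, LogLevel

log = make_logger("demo", level=LogLevel.Info)
log.info("parser ready")   # printed: demo - INFO - parser ready
log.debug("details")       # suppressed: below the Info level

One caveat: logging.getLogger returns the same logger for a given name, so calling make_logger twice with the same name attaches a second StreamHandler and duplicates every message.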

4  compiler/nodes.py  Normal file

@@ -0,0 +1,4 @@
from __future__ import annotations


class Node:
    pass

183  compiler/rules.py  Normal file

@@ -0,0 +1,183 @@
from __future__ import annotations

from dataclasses import dataclass
from beartype.typing import Union, Dict, Optional, List
import abc

from .nodes import Node
from .source import SourceLocation
from .tokenizer import Token, Tokens
from .logger import logger


class ParsingError(Exception):
    def __init__(self, location: SourceLocation, message: str = "Unknown error"):
        super().__init__(f"{message} at {str(location)}")
        self.location = location


@dataclass
class EvalResult:
    result: EvalResult | List[EvalResult] | Token | None = None
    errors: Optional[ParsingError] = None
    name: Optional[str] = None

    def __str__(self):
        #if isinstance(self.result, EvalResult):
        #    return self.result
        name = self.name if self.name is not None else self.__class__.__name__
        value = str(self.result)
        if isinstance(self.result, list):
            value = ", ".join(str(r) for r in self.result)
        return f"{name}({value})"

    def __repr__(self):
        return super(EvalResult, self).__repr__()


RuleLike = Union[str, 'Rule']


class Rule(abc.ABC):
    _named_rules: Dict[str, Rule] = dict()
    _depth: int = 0

    def __init__(self, *sub_rules: RuleLike):
        self._prepared: bool = False
        self._rules: List[RuleLike] = list(sub_rules)
        self.rules: List[Rule] = []
        self.name: Optional[str] = None
        self.node: Optional[Node] = None

    @staticmethod
    def make_rule(name: str, rule: Rule, node: Optional[Node] = None) -> Rule:
        rule.name = name
        rule.node = node
        Rule._named_rules[name] = rule
        return rule

    @abc.abstractmethod
    def evaluate(self, tokens: List[Token]) -> EvalResult:
        raise NotImplementedError()

    def prepare(self):
        """Resolve string references to their named Rule objects, recursively."""
        if self._prepared:
            return
        for key, rule in enumerate(self._rules):
            if isinstance(rule, str):
                self._rules[key] = Rule._named_rules[rule]
        self.rules = self._rules
        self._prepared = True
        for rule in self._rules:
            if not rule._prepared:
                rule.prepare()

    def __repr__(self):
        return "{}:{}({})".format(self.__class__.__name__,
                                  self.name if self.name is not None else "",
                                  ", ".join(rule.__class__.__name__ for rule in self.rules))


class Terminal(Rule):
    def __init__(self, token_type: Tokens):
        super().__init__()
        self.token_type = token_type

    def evaluate(self, tokens: List[Token]) -> EvalResult:
        assert len(tokens) > 0
        result = EvalResult(name=self.name)
        logger.debug(f"{Rule._depth}: Terminal: Evaluating terminal token with tokens: '{tokens}'")
        if len(tokens) != 1:
            result.errors = ParsingError(tokens[0].loc, message="Terminal rule must have exactly one token")
            return result
        if tokens[0].kind != self.token_type:
            result.errors = ParsingError(
                tokens[0].loc,
                message=f"Unexpected token. Wanted {self.token_type.name}, got {tokens[0].kind.name}"
            )
            return result
        result.result = tokens[0]
        logger.debug(f"{Rule._depth}: Terminal: Found terminal node: {result}")
        return result

    def __repr__(self):
        return "{}({})".format(self.__class__.__name__, self.token_type)


class Or(Rule):
    def evaluate(self, tokens: List[Token]) -> EvalResult:
        result = EvalResult(errors=ParsingError(location=tokens[0].loc), name=self.name)
        rule: Rule
        for rule in self.rules:
            logger.debug(f"{Rule._depth}: Or: trying rule: {rule}")
            Rule._depth += 1
            result = rule.evaluate(tokens)
            Rule._depth -= 1
            if result.errors is None:
                logger.debug(f"{Rule._depth}: Or: Rule '{rule}' matched, result: {result}")
                break
        logger.debug(f"{Rule._depth}: Or: Finished with errors: {result.errors}")
        result.name = self.name
        return result


class And(Rule):
    def evaluate(self, tokens: List[Token]) -> EvalResult:
        result = EvalResult(errors=ParsingError(tokens[0].loc), name=self.name)
        result.result = []
        begin = 0
        end = 0
        for i, rule in enumerate(self.rules):
            logger.debug("%s", f"{Rule._depth}: And: Trying rule '{rule}'")
            if end == len(tokens):
                logger.error("%s", f"{Rule._depth}: And: Oops, reached the end of the tokens")
            best_r: Optional[EvalResult] = None
            # Grow the token window until the rule stops matching (greedy).
            while end < len(tokens):
                tokens_ = tokens[begin:end + 1]
                Rule._depth += 1
                r = rule.evaluate(tokens_)
                Rule._depth -= 1
                # No previous match, but we found one
                if best_r is None and r.errors is None:
                    logger.debug(f"{Rule._depth}: And: Rule {i + 1}/{len(self.rules)} '{rule}' matched, result: {r}")
                    best_r = r
                # No result at all. That's an error.
                elif best_r is None and r.errors is not None:
                    result.errors = r.errors
                    return result
                # We had a result, but we still matched with more tokens
                elif best_r is not None and r.errors is None:
                    logger.debug(f"{Rule._depth}: And: Rule {i + 1}/{len(self.rules)} '{rule}' **improved**, result: {r}")
                    best_r = r
                # We already have a match, and we can't improve it. Finish this rule.
                elif best_r is not None and r.errors is not None:
                    logger.debug(f"{Rule._depth}: And: Rule {i + 1}/{len(self.rules)} '{rule}' has FINAL match, result: {best_r}")
                    result.result += [best_r]
                    # Matching rule ended at 'end - 1', meaning next rule will begin at 'end'
                    begin = end
                    break
                end += 1
            else:
                # Token stream ran out before this rule stopped matching; keep the best result so far (may be None).
                logger.debug(
                    f"{Rule._depth}: And: Rule {i + 1}/{len(self.rules)} '{rule}' has FINAL match, finishing rule, result: {best_r}")
                result.result += [best_r]
        result.errors = None
        if end != len(tokens):
            logger.debug(f"{Rule._depth}: And: Didn't consume all tokens")
        return result
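To illustrate how the combinators compose, a hand-evaluated sketch (illustrative, not part of the commit; the name sum_rule and the input "1 + 2" are made up):

from compiler.rules import Rule, And, Terminal
from compiler.tokenizer import Tokenizer, Tokens

sum_rule = Rule.make_rule("sum", And(
    Terminal(Tokens.Number), Terminal(Tokens.Op_Plus), Terminal(Tokens.Number)
))
sum_rule.prepare()

tokens = [t for t in Tokenizer().tokenize("1 + 2") if t.kind != Tokens.Blank]
res = sum_rule.evaluate(tokens)
print(res.errors)  # None on a successful parse
print(res)         # sum(...) wrapping one EvalResult per sub-rule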

64  compiler/source.py  Normal file

@@ -0,0 +1,64 @@
from __future__ import annotations

from beartype import beartype
from beartype.typing import Optional
from dataclasses import dataclass


@beartype
@dataclass
class Location:
    line: int
    character: int
    file: str = "<none>"

    def __str__(self) -> str:
        return f"{self.file}:{self.line}:{self.character}"


@beartype
class SourceLocation:
    def __init__(self, begin: Location, end: Optional[Location] = None, source: Optional[str] = None):
        self.begin = begin
        self.end = end
        if self.end is None:
            self.end = self.begin
        self.source = source
        assert (self.begin.line, self.begin.character) <= (self.end.line, self.end.character)
        assert self.begin.file == self.end.file

    def __str__(self):
        if self.begin == self.end:
            return str(self.begin)
        return f"{str(self.begin)} - {str(self.end)}"

    @property
    def source_substring(self) -> str:
        """Return the slice of the source text covered by this location."""
        source = self.source.splitlines(keepends=False)
        source_lines = source[self.begin.line:self.end.line + 1]
        if len(source_lines) == 1:
            source_lines[0] = source_lines[0][self.begin.character:self.end.character + 1]
        else:
            source_lines[0] = source_lines[0][self.begin.character:]
            source_lines[-1] = source_lines[-1][:self.end.character]
        return "\n".join(source_lines)

    def show_in_source(self) -> str:
        """Return the offending source line with a caret marker underneath."""
        source = self.source.splitlines(keepends=False)
        source_line = source[self.begin.line]
        result = [source_line]
        if self.begin.line != self.end.line:
            return "\n".join(result)
        line = " " * self.begin.character
        line += "^" + "-" * max(0, (self.end.character - self.begin.character - 1))
        line += " " * (len(source_line) - len(line))
        result += [line]
        return "\n".join(result)

70  compiler/tokenizer.py  Normal file

@@ -0,0 +1,70 @@
from __future__ import annotations

from dataclasses import dataclass, field
from beartype import beartype
from beartype.typing import List, Optional
import enum
import re

from .logger import logger
from .source import SourceLocation, Location


@beartype
@dataclass
class Token:
    kind: Tokens
    loc: Optional[SourceLocation] = field(compare=False, hash=False, default=None)
    value: Optional[str] = field(compare=False, hash=False, default=None)

    def __repr__(self):
        if self.value is None:
            return super().__repr__()
        return f"{self.kind.name}({repr(self.value)})"


class Tokens(enum.Enum):
    Number = re.compile(r"[0-9]+(\.?[0-9]*)")
    Op_Plus = re.compile(r"\+")
    Op_Minus = re.compile(r"-")
    Op_Multiply = re.compile(r"\*")
    Op_Divide = re.compile(r"/")
    Parens_Left = re.compile(r"\(")
    Parens_Right = re.compile(r"\)")
    Blank = re.compile(r"\s+")
    Unknown = re.compile(r".*")


class Tokenizer:
    def __init__(self):
        pass

    def tokenize(self, data: str) -> List[Token]:
        results: List[Token] = []
        begin = 0
        while begin < len(data):
            best_result: Token = Token(Tokens.Unknown,
                                       loc=SourceLocation(Location(line=0, character=begin), source=data)
                                       )
            for token_kind in Tokens:
                if token_kind == Tokens.Unknown:
                    continue
                match = token_kind.value.match(data, begin)
                if match is not None:
                    logger.debug(f"Got match: {match}")
                    result = match.group(0)
                    # Keep the longest match (later token kinds win ties).
                    if best_result.kind == Tokens.Unknown or len(result) >= len(best_result.value):
                        loc = SourceLocation(
                            begin=Location(line=0, character=begin),
                            end=Location(line=0, character=begin + len(result))
                        )
                        best_result = Token(token_kind, value=result, loc=loc)
                        logger.debug(f"New best match: {best_result}")
            if best_result.kind == Tokens.Unknown:
                source_hint = best_result.loc.show_in_source()
                logger.error(f"{best_result.loc}: Unknown token '{best_result.loc.source_substring}'\n{source_hint}")
                exit(1)
            results += [best_result]
            begin += len(best_result.value)
        return results
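And a sketch of the tokenizer on a slightly larger input (illustrative, not part of the commit; blank tokens are kept unless the caller filters them, as __main__.py does):

from compiler.tokenizer import Tokenizer

tokens = Tokenizer().tokenize("2 * (3.5)")
print(tokens)
# [Number('2'), Blank(' '), Op_Multiply('*'), Blank(' '),
#  Parens_Left('('), Number('3.5'), Parens_Right(')')]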