From 3d15b6dd63c44465194e31ebd1e5809a861adfa8 Mon Sep 17 00:00:00 2001
From: Antoine Viallon
Date: Thu, 20 Apr 2023 21:52:13 +0200
Subject: [PATCH] meta: initial commit

---
 compiler/__init__.py  |   0
 compiler/__main__.py  |  86 ++++++++++++++++++++
 compiler/logger.py    |  28 +++++++
 compiler/nodes.py     |   4 +
 compiler/rules.py     | 183 ++++++++++++++++++++++++++++++++++++++++++
 compiler/source.py    |  64 +++++++++++++++
 compiler/tokenizer.py |  70 ++++++++++++++++
 7 files changed, 435 insertions(+)
 create mode 100644 compiler/__init__.py
 create mode 100644 compiler/__main__.py
 create mode 100644 compiler/logger.py
 create mode 100644 compiler/nodes.py
 create mode 100644 compiler/rules.py
 create mode 100644 compiler/source.py
 create mode 100644 compiler/tokenizer.py

diff --git a/compiler/__init__.py b/compiler/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/compiler/__main__.py b/compiler/__main__.py
new file mode 100644
index 0000000..0736b3b
--- /dev/null
+++ b/compiler/__main__.py
@@ -0,0 +1,86 @@
+from __future__ import annotations
+
+from .nodes import Node
+from .rules import Rule, And, Or, Terminal, EvalResult
+from .tokenizer import Tokenizer, Tokens, Token
+
+data = "2 * (32.9 + 1)"
+
+grammar = r"""
+number = r"[0-9]+(\.[0-9]*)?"
+operator = "+" | "-" | "*" | "/"
+value = "(" expression ")" | number | expression
+expression = value operator value
+root = expression
+"""
+
+number = Rule.make_rule("number",
+                        Terminal(token_type=Tokens.Number)
+                        )
+operator = Rule.make_rule("operator", Or(
+    Terminal(Tokens.Op_Plus), Terminal(Tokens.Op_Minus),
+    Terminal(Tokens.Op_Multiply), Terminal(Tokens.Op_Divide)
+))
+
+value = Rule.make_rule("value", Or(
+    And(
+        Terminal(Tokens.Parens_Left), "expression", Terminal(Tokens.Parens_Right)
+    ),
+    number,
+    "expression"
+))
+
+expression = Rule.make_rule("expression", And(
+    number,
+    operator,
+    number
+))
+
+root = expression
+root.prepare()
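+
+# Illustrative note: for the input "2 + 3" the tokenizer is expected to yield
+# Number('2'), Op_Plus('+'), Number('3') once Blank tokens are filtered out,
+# and the "expression" rule (number operator number) should match all three
+# tokens, giving an EvalResult tree rooted at "expression".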
+
+
+def print_results(result: EvalResult) -> str:
+    prefix = "\t" * print_results._depth
+    message = ""
+    if type(result) != EvalResult:
+        message = f"{prefix}{str(result)}\n"
+        return message
+
+    print_results._depth += 1
+
+    if result.name is not None:
+        message += f"{prefix}{result.name}\n"
+
+    if type(result.result) == list:
+        message += prefix + "{"
+        mylist = []
+        for r in result.result:
+            mylist += [print_results(r)]
+        message += " ".join(mylist)
+        message += prefix + "}"
+    else:
+        message += print_results(result.result) + "\n"
+    print_results._depth -= 1
+    return message
+
+
+print_results._depth = 0
+
+
+def main():
+    tokenizer = Tokenizer()
+    tokens = tokenizer.tokenize("2 + 3")
+
+    tokens = [token for token in tokens if token.kind != Tokens.Blank]
+    print(tokens)
+
+    result = root.evaluate(tokens)
+    print(result)
+    print(print_results(result))
+    if result.errors is not None:
+        raise result.errors
+
+
+if __name__ == "__main__":
+    main()
diff --git a/compiler/logger.py b/compiler/logger.py
new file mode 100644
index 0000000..09ab1d8
--- /dev/null
+++ b/compiler/logger.py
@@ -0,0 +1,28 @@
+import logging
+import enum
+
+
+class LogLevel(enum.IntEnum):
+    Critical = logging.CRITICAL
+    Error = logging.ERROR
+    Warning = logging.WARNING
+    Info = logging.INFO
+    Debug = logging.DEBUG
+
+
+def make_logger(name: str, level: LogLevel = LogLevel.Debug) -> logging.Logger:
+    _logger = logging.getLogger(name)
+    _logger.setLevel(level)
+    # create console handler and set level to debug
+    ch = logging.StreamHandler()
+    ch.setLevel(level)
+    # create formatter
+    formatter = logging.Formatter('%(name)s - %(levelname)s - %(message)s')
+    ch.setFormatter(formatter)
+    # add ch to logger
+    _logger.addHandler(ch)
+
+    return _logger
+
+
+logger = make_logger("compiler")
diff --git a/compiler/nodes.py b/compiler/nodes.py
new file mode 100644
index 0000000..35d00a0
--- /dev/null
+++ b/compiler/nodes.py
@@ -0,0 +1,4 @@
+from __future__ import annotations
+
+class Node:
+    pass
\ No newline at end of file
diff --git a/compiler/rules.py b/compiler/rules.py
new file mode 100644
index 0000000..1e7145d
--- /dev/null
+++ b/compiler/rules.py
@@ -0,0 +1,183 @@
+from __future__ import annotations
+
+from dataclasses import dataclass
+
+from beartype.typing import Union, Dict, Optional, List, Any, Tuple
+import abc
+
+from .nodes import Node
+from .source import SourceLocation
+from .tokenizer import Token, Tokens
+from .logger import logger
+
+
+class ParsingError(Exception):
+    def __init__(self, location: SourceLocation, message: str = "Unknown error"):
+        super().__init__(f"{message} at {str(location)}")
+        self.location = location
+
+
+@dataclass
+class EvalResult:
+    result: EvalResult | List[EvalResult] | Token | None = None
+    errors: Optional[ParsingError] = None
+    name: Optional[str] = None
+
+    def __str__(self):
+        #if isinstance(self.result, EvalResult):
+        #    return self.result
+
+        name = self.name if self.name is not None else self.__class__.__name__
+        value = str(self.result)
+        if type(self.result) == list:
+            value = ", ".join(str(r) for r in self.result)
+        return f"{name}({value})"
+
+    def __repr__(self):
+        return super(EvalResult, self).__repr__()
+
+
+RuleLike = Union[str, 'Rule']
+
+
+class Rule(abc.ABC):
+    _named_rules: Dict[str, Rule] = dict()
+    _depth: int = 0
+
+    def __init__(self, *sub_rules: RuleLike):
+        self._prepared: bool = False
+        self._rules: List[RuleLike] = list(sub_rules)
+        self.rules: List[Rule] = []
+        self.name: Optional[str] = None
+        self.node: Optional[Node] = None
+
+    @staticmethod
+    def make_rule(name: str, rule: Rule, node: Optional[Node] = None) -> Rule:
+        rule.name = name
+        rule.node = node
+        Rule._named_rules[name] = rule
+        return rule
+
+    @abc.abstractmethod
+    def evaluate(self, tokens: List[Token]) -> EvalResult:
+        raise NotImplementedError()
+
+    def prepare(self):
+        if self._prepared:
+            return
+
+        for key, rule in enumerate(self._rules):
+            if type(rule) == str:
+                self._rules[key] = Rule._named_rules[rule]
+
+        self.rules = self._rules
+
+        self._prepared = True
+        for rule in self._rules:
+            if not rule._prepared:
+                rule.prepare()
+
+    def __repr__(self):
+        return "{}:{}({})".format(self.__class__.__name__,
+                                  self.name if self.name is not None else "",
+                                  ", ".join(rule.__class__.__name__ for rule in self.rules))
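+
+# Sub-rules may be given as plain strings (see RuleLike); prepare() replaces
+# each string with the Rule registered under that name by make_rule(), which
+# is how a rule such as "expression" can be referenced before the
+# corresponding Rule object exists.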
+
+
+class Terminal(Rule):
+    def __init__(self, token_type: Tokens):
+        super().__init__()
+        self.token_type = token_type
+
+    def evaluate(self, tokens: List[Token]) -> EvalResult:
+        assert len(tokens) > 0
+        result = EvalResult(name=self.name)
+        logger.debug(f"{Rule._depth}: Terminal: Evaluating terminal token with tokens: '{tokens}'")
+        if len(tokens) != 1:
+            result.errors = ParsingError(tokens[0].loc, message="Terminal rule must have exactly one token")
+            return result
+
+        if tokens[0].kind != self.token_type:
+            result.errors = ParsingError(
+                tokens[0].loc,
+                message=f"Unexpected token. Wanted {self.token_type.name}, got {tokens[0].kind.name}"
+            )
+            return result
+
+        result.result = tokens[0]
+
+        logger.debug(f"{Rule._depth}: Terminal: Found terminal node: {result}")
+
+        return result
+
+    def __repr__(self):
+        return "{}({})".format(self.__class__.__name__, self.token_type)
+
+
+class Or(Rule):
+    def evaluate(self, tokens: List[Token]) -> EvalResult:
+        result = EvalResult(errors=ParsingError(location=tokens[0].loc), name=self.name)
+        rule: Rule
+        for rule in self.rules:
+            logger.debug(f"{Rule._depth}: Or: trying rule: {rule}")
+            Rule._depth += 1
+            result = rule.evaluate(tokens)
+            Rule._depth -= 1
+            if result.errors is None:
+                logger.debug(f"{Rule._depth}: Or: Rule '{rule}' matched, result: {result}")
+                break
+
+        logger.debug(f"{Rule._depth}: Or: Finished with errors: {result.errors}")
+
+        result.name = self.name
+
+        return result
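+
+# Rough behavioural note: Or returns the first alternative that evaluates
+# without errors, while And grows each sub-rule's token window one token at a
+# time and keeps the longest successful match before handing the remaining
+# tokens to the next sub-rule.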
+
+
+class And(Rule):
+    def evaluate(self, tokens: List[Token]) -> EvalResult:
+        result = EvalResult(errors=ParsingError(tokens[0].loc), name=self.name)
+        result.result = []
+        begin = 0
+        end = 0
+        for i, rule in enumerate(self.rules):
+            logger.debug("%s", f"{Rule._depth}: And: Trying rule '{rule}'")
+
+            if end == len(tokens):
+                logger.error("%s", f"{Rule._depth}: And: Oops, reached the end of the tokens")
+
+            best_r: Optional[EvalResult] = None
+            while end < len(tokens):
+                tokens_ = tokens[begin:end+1]
+                Rule._depth += 1
+                r = rule.evaluate(tokens_)
+                Rule._depth -= 1
+
+                # No previous match, but we found one
+                if best_r is None and r.errors is None:
+                    logger.debug(f"{Rule._depth}: And: Rule {i + 1}/{len(self.rules)} '{rule}' matched, result: {r}")
+                    best_r = r
+
+                # No result at all. That's an error.
+                elif best_r is None and r.errors is not None:
+                    result.errors = r.errors
+                    return result
+
+                # We had a result, but we still matched with more tokens
+                elif best_r is not None and r.errors is None:
+                    logger.debug(f"{Rule._depth}: And: Rule {i + 1}/{len(self.rules)} '{rule}' **improved**, result: {r}")
+                    best_r = r
+
+                # We already have a match, and we can't improve it. Finish this rule.
+                elif best_r is not None and r.errors is not None:
+                    logger.debug(f"{Rule._depth}: And: Rule {i + 1}/{len(self.rules)} '{rule}' has FINAL match, result: {best_r}")
+                    result.result += [best_r]
+                    # Matching rule ended at 'end - 1', meaning next rule will begin at end
+                    begin = end
+                    break
+
+                end += 1
+
+            else:
+                logger.debug(
+                    f"{Rule._depth}: And: Rule {i + 1}/{len(self.rules)} '{rule}' has FINAL match, finishing rule, result: {best_r}")
+                result.result += [best_r]
+                result.errors = None
+
+        if end != len(tokens):
+            logger.debug(f"{Rule._depth}: And: Didn't consume all tokens")
+        return result
diff --git a/compiler/source.py b/compiler/source.py
new file mode 100644
index 0000000..6026807
--- /dev/null
+++ b/compiler/source.py
@@ -0,0 +1,64 @@
+from __future__ import annotations
+
+from beartype import beartype
+from beartype.typing import Optional
+from dataclasses import dataclass
+
+
+@beartype
+@dataclass
+class Location:
+    line: int
+    character: int
+    file: str = ""
+
+    def __str__(self) -> str:
+        return f"{self.file}:{self.line}:{self.character}"
+
+
+@beartype
+class SourceLocation:
+    def __init__(self, begin: Location, end: Optional[Location] = None, source: Optional[str] = None):
+        self.begin = begin
+        self.end = end
+        if self.end is None:
+            self.end = self.begin
+
+        self.source = source
+
+        assert (self.begin.line, self.begin.character) <= (self.end.line, self.end.character)
+        assert self.begin.file == self.end.file
+
+    def __str__(self):
+        if self.begin == self.end:
+            return str(self.begin)
+        return f"{str(self.begin)} - {str(self.end)}"
+
+    @property
+    def source_substring(self) -> str:
+        source = self.source.splitlines(keepends=False)
+        source_lines = source[self.begin.line:self.end.line + 1]
+        if len(source_lines) == 1:
+            source_lines[0] = source_lines[0][self.begin.character:self.end.character + 1]
+        else:
+            source_lines[0] = source_lines[0][self.begin.character:]
+            source_lines[-1] = source_lines[-1][:self.end.character]
+        return "\n".join(source_lines)
+
+    def show_in_source(self) -> str:
+        source = self.source.splitlines(keepends=False)
+        source_line = source[self.begin.line]
+        result = [source_line]
+        if self.begin.line != self.end.line:
+            return "\n".join(result)
+
+        line = " " * self.begin.character
+        line += "^" + "-" * max(0, (self.end.character - self.begin.character - 1))
+        line += " " * (len(source_line) - len(line))
+
+        result += [line]
+
+        return "\n".join(result)
diff --git a/compiler/tokenizer.py b/compiler/tokenizer.py
new file mode 100644
index 0000000..504b9c4
--- /dev/null
+++ b/compiler/tokenizer.py
@@ -0,0 +1,70 @@
+from __future__ import annotations
+from dataclasses import dataclass, field
+from beartype import beartype
+from beartype.typing import List, Optional
+
+import enum
+import re
+
+from .logger import logger
+from .source import SourceLocation, Location
+
+
+@beartype
+@dataclass
+class Token:
+    kind: Tokens
+    loc: SourceLocation = field(compare=False, hash=False, default=None)
+    value: Optional[str] = field(compare=False, hash=False, default=None)
+
+    def __repr__(self):
+        if self.value is None:
+            return super().__repr__()
+        return f"{self.kind.name}({repr(self.value)})"
+
+
+class Tokens(enum.Enum):
+    Number = re.compile(r"[0-9]+(\.?[0-9]*)")
+    Op_Plus = re.compile(r"\+")
+    Op_Minus = re.compile(r"-")
+    Op_Multiply = re.compile(r"\*")
+    Op_Divide = re.compile(r"/")
+    Parens_Left = re.compile(r"\(")
+    Parens_Right = re.compile(r"\)")
+    Blank = re.compile(r"\s+")
+    Unknown = re.compile(r".*")
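+
+# The tokenizer keeps the longest match found at the current position (ties go
+# to the later Tokens member), so e.g. "32.9" becomes a single Number token
+# rather than being split at the dot; Unknown is only a fallback for input
+# that nothing else matches.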
+
+
+class Tokenizer:
+    def __init__(self):
+        pass
+
+    def tokenize(self, data: str) -> List[Token]:
+        results: List[Token] = []
+        begin = 0
+        while begin < len(data):
+            best_result: Token = Token(Tokens.Unknown,
+                                       loc=SourceLocation(Location(line=0, character=begin), source=data)
+                                       )
+            for token_kind in Tokens:
+                if token_kind == Tokens.Unknown:
+                    continue
+                match = token_kind.value.match(data, begin)
+                if match is not None:
+                    logger.debug(f"Got match: {match}")
+                    result = match.group(0)
+                    if best_result.kind == Tokens.Unknown or len(result) >= len(best_result.value):
+                        loc = SourceLocation(
+                            begin=Location(line=0, character=begin),
+                            end=Location(line=0, character=begin + len(result))
+                        )
+                        best_result = Token(token_kind, value=result, loc=loc)
+                        logger.debug(f"New best match: {best_result}")
+
+            if best_result.kind == Tokens.Unknown:
+                source_hint = best_result.loc.show_in_source()
+                logger.error(f"{best_result.loc}: Unknown token '{best_result.loc.source_substring}'\n{source_hint}")
+                exit(1)
+
+            results += [best_result]
+            begin += len(best_result.value)
+        return results