diff --git a/compiler/__main__.py b/compiler/__main__.py
index 7b8dac9..cb1da0d 100644
--- a/compiler/__main__.py
+++ b/compiler/__main__.py
@@ -9,7 +9,7 @@ def main():
     tokenizer = Tokenizer()
     tokens = tokenizer.tokenize("2 + 3")
-    tokens = [token for token in tokens if token.kind != Tokens.Blank]
+    tokens = [token for token in tokens if token.kind not in [Tokens.Blank, Tokens.Newline]]
 
     print(tokens)
     parser = Parser(tokens)
 
diff --git a/compiler/tokenizer.py b/compiler/tokenizer.py
index 5ee0cb2..f65cab4 100644
--- a/compiler/tokenizer.py
+++ b/compiler/tokenizer.py
@@ -1,5 +1,9 @@
 from __future__ import annotations
+
+import enum
+import re
 from dataclasses import dataclass, field
+
 from beartype import beartype
 from beartype.typing import Optional, List
 
@@ -21,16 +25,18 @@ class Token:
 
 
 class Tokens(enum.Enum):
-    Number = re.compile(r"[0-9]+(\.?[0-9]*)")
+    Integer = re.compile(r"[0-9]+")
+    Float = re.compile(r"[0-9]+\.[0-9]*")
     Op_Plus = re.compile(r"\+")
     Op_Minus = re.compile(r"-")
     Op_Multiply = re.compile(r"\*")
     Op_Divide = re.compile(r"/")
     Parens_Left = re.compile(r"\(")
     Parens_Right = re.compile(r"\)")
-    Blank = re.compile(r"\s+")
+    Newline = re.compile(r"\n", flags=re.MULTILINE)
     EOF = re.compile(r"\Z")
-    Unknown = re.compile(r".*")
+    Blank = re.compile(r"[ \t]+")
+    Unknown = re.compile(r".*", flags=re.DOTALL)
 
     def __bool__(self):
         return True
@@ -42,34 +48,46 @@ class Tokenizer:
     def tokenize(self, data: str) -> List[Token]:
         results: List[Token] = []
+        line = 0
+        character = 0
         begin = 0
         while begin < len(data):
             best_result: Token = Token(Tokens.Unknown,
-                loc=SourceLocation(Location(line=0, character=begin), source=data)
+                loc=SourceLocation(Location(line=line, character=character), source=data),
+                value=""
             )
 
-            for token_kind in Tokens:
+            for token_kind in list(Tokens):
                 if token_kind == Tokens.Unknown:
                     continue
-                match = token_kind.value.match(data, begin)
+                regex: re.Pattern = token_kind.value
+                match = regex.match(data, begin)
                 if match is not None:
-                    logger.debug(f"Got match: {match}")
+                    logger.trace(f"Got match: {match}")
                     result = match.group(0)
                     if best_result.kind == Tokens.Unknown or len(result) >= len(best_result.value):
                         loc = SourceLocation(
-                            begin=Location(line=0, character=begin),
-                            end=Location(line=0, character=begin + len(result))
+                            begin=Location(line=line, character=character),
+                            end=Location(line=line, character=character + len(result))
                         )
                         best_result = Token(token_kind, value=result, loc=loc)
-                        logger.debug(f"New best match: {best_result}")
+                        logger.trace(f"New best match: {best_result}")
+
+            begin += len(best_result.value)
+            character += len(best_result.value)
 
             if best_result.kind == Tokens.Unknown:
                 source_hint = best_result.loc.show_in_source()
                 logger.error(f"{best_result.loc}: Unknown token '{best_result.loc.source_substring}'\n{source_hint}")
                 exit(1)
+            elif best_result.kind == Tokens.Newline:
+                line += 1
+                character = 0
+                best_result.loc.end = Location(line=line, character=0)
+
+            logger.debug(f"Added token {best_result}")
             results += [best_result]
-            begin += len(best_result.value)
 
         results += [Token(Tokens.EOF, value=None, loc=SourceLocation(
-            Location(line=0, character=len(data)), source=data
+            Location(line=line, character=0), source=data
         ))]
-        return results
\ No newline at end of file
+        return results