tokenizer: match newlines and improve location information

Use the generated newline tokens to know at what line a token is located.
This commit is contained in:
Antoine Viallon 2023-05-08 17:34:21 +02:00
parent 86457c6972
commit 272bed25b9
Signed by: aviallon
GPG key ID: D126B13AB555E16F
2 changed files with 32 additions and 14 deletions

View file

@@ -9,7 +9,7 @@ def main():
tokenizer = Tokenizer()
tokens = tokenizer.tokenize("2 + 3")
-tokens = [token for token in tokens if token.kind != Tokens.Blank]
+tokens = [token for token in tokens if token.kind not in [Tokens.Blank, Tokens.Newline]]
print(tokens)
parser = Parser(tokens)

View file

@@ -1,5 +1,9 @@
from __future__ import annotations
import enum
import re
from dataclasses import dataclass, field
from beartype import beartype
from beartype.typing import Optional, List
@@ -21,16 +25,18 @@ class Token:
class Tokens(enum.Enum):
Number = re.compile(r"[0-9]+(\.?[0-9]*)")
Integer = re.compile(r"[0-9]+")
Float = re.compile(r"[0-9]+\.[0-9]*")
Op_Plus = re.compile(r"\+")
Op_Minus = re.compile(r"-")
Op_Multiply = re.compile(r"\*")
Op_Divide = re.compile(r"/")
Parens_Left = re.compile(r"\(")
Parens_Right = re.compile(r"\)")
-Blank = re.compile(r"\s+")
+Newline = re.compile(r"\n", flags=re.MULTILINE)
EOF = re.compile(r"\Z")
-Unknown = re.compile(r".*")
+Blank = re.compile(r"[ \t]+")
+Unknown = re.compile(r".*", flags=re.DOTALL)
def __bool__(self):
return True
@@ -42,34 +48,46 @@ class Tokenizer:
def tokenize(self, data: str) -> List[Token]:
results: List[Token] = []
+line = 0
+character = 0
begin = 0
while begin < len(data):
best_result: Token = Token(Tokens.Unknown,
-loc=SourceLocation(Location(line=0, character=begin), source=data)
+loc=SourceLocation(Location(line=line, character=character), source=data),
+value=""
)
-for token_kind in Tokens:
+for token_kind in list(Tokens):
if token_kind == Tokens.Unknown:
continue
-match = token_kind.value.match(data, begin)
+regex: re.Pattern = token_kind.value
+match = regex.match(data, begin)
if match is not None:
-logger.debug(f"Got match: {match}")
+logger.trace(f"Got match: {match}")
result = match.group(0)
if best_result.kind == Tokens.Unknown or len(result) >= len(best_result.value):
loc = SourceLocation(
-begin=Location(line=0, character=begin),
-end=Location(line=0, character=begin + len(result))
+begin=Location(line=line, character=character),
+end=Location(line=line, character=character + len(result))
)
best_result = Token(token_kind, value=result, loc=loc)
-logger.debug(f"New best match: {best_result}")
+logger.trace(f"New best match: {best_result}")
+begin += len(best_result.value)
+character += len(best_result.value)
if best_result.kind == Tokens.Unknown:
source_hint = best_result.loc.show_in_source()
logger.error(f"{best_result.loc}: Unknown token '{best_result.loc.source_substring}'\n{source_hint}")
exit(1)
+elif best_result.kind == Tokens.Newline:
+line += 1
+character = 0
+best_result.loc.end = Location(line=line, character=0)
logger.debug(f"Added token {best_result}")
results += [best_result]
-begin += len(best_result.value)
results += [Token(Tokens.EOF, value=None, loc=SourceLocation(
-Location(line=0, character=len(data)), source=data
+Location(line=line, character=0), source=data
))]
return results
return results