# Patch for compiler/lexer.py: adds `let`-statement tokens and makes the lexer
# prefer keyword matches over other matches.
#
# NOTE(review): the hunk headers below declare multi-line hunks (e.g.
# "-4,10 +4,12"), yet the whole patch sits on one physical line. The original
# line breaks appear to have been collapsed into spaces, so the patch
# presumably needs its newlines restored before it can be applied -- confirm
# against the original commit.
#
# Hunk 1 (imports): adds `from typing import cast` and
# `from .utils import implies`.
#
# Hunk 2 (class Tokens): adds the members KwLet (r"\blet\b"),
# Identifier (r"[a-zA-Z_][a-zA-Z_0-9]*"), Equal (r"=") and Colon (r":").
# The `+` in front of `Newline` may mark either an added blank line or the
# Newline member itself; the collapsed text does not say which.
#
# Hunk 3 (class Tokens): adds `is_keyword()`, which returns True only when the
# member is Tokens.KwLet.
#
# Hunk 4 (Lexer matching loop):
#   * declares `token_kind: Tokens` ahead of the loop;
#   * replaces the annotated assignment `regex: re.Pattern = token_kind.value`
#     with `regex = cast(re.Pattern, token_kind.value)`;
#   * inside the existing "no best match yet, or candidate at least as long"
#     branch, adds a guard that logs a trace message and skips the candidate
#     when `implies(best_result.kind.is_keyword(), token_kind.is_keyword())`
#     is false. Assuming `implies(a, b)` is logical implication
#     (`not a or b`) -- TODO confirm in .utils -- this skips a candidate
#     exactly when the current best match is a keyword and the candidate is
#     not, so an equally long Identifier match does not replace KwLet.
diff --git a/compiler/lexer.py b/compiler/lexer.py index 55c67cc..f8060c6 100644 --- a/compiler/lexer.py +++ b/compiler/lexer.py @@ -4,10 +4,12 @@ import collections.abc import enum import re from dataclasses import dataclass, field +from typing import cast from .logger import Logger from .source import SourceLocation, Location from .typechecking import typecheck +from .utils import implies logger = Logger(__name__) @@ -34,9 +36,15 @@ class Tokens(enum.Enum): Parens_Right = re.compile(r"\)") Brace_Left = re.compile(r"\{") Brace_Right = re.compile(r"}") + + KwLet = re.compile(r"\blet\b") + Identifier = re.compile(r"[a-zA-Z_][a-zA-Z_0-9]*") + Equal = re.compile(r"=") + Colon = re.compile(r":") Semicolon = re.compile(r";") + Newline = re.compile(r"\n", flags=re.MULTILINE) EOF = re.compile(r"\Z") Blank = re.compile(r"[ \t]+") @@ -45,6 +53,9 @@ class Tokens(enum.Enum): def __bool__(self): return True + def is_keyword(self) -> bool: + return self in [Tokens.KwLet] + class Lexer(collections.abc.Sequence): def __init__(self, data: str): @@ -81,15 +92,20 @@ class Lexer(collections.abc.Sequence): source=self.data), value="" ) + token_kind: Tokens for token_kind in list(Tokens): if token_kind == Tokens.Unknown: continue - regex: re.Pattern = token_kind.value + regex = cast(re.Pattern, token_kind.value) match = regex.match(self.data, self.begin) if match is not None: logger.trace(f"Got match: {match}") result = match.group(0) if best_result.kind == Tokens.Unknown or len(result) >= len(best_result.value): + if not implies(best_result.kind.is_keyword(), token_kind.is_keyword()): + logger.trace( + f"Best match is a keyword ({best_result}) and current match ({token_kind}) is not, skipping") + continue loc = SourceLocation( begin=Location(line=self.line, character=self.character), end=Location(line=self.line, character=self.character + len(result))