tokenizer: match newlines and improve location information

Use the generated newline tokens to know at what line a token is located.
This commit is contained in:
Antoine Viallon 2023-05-08 17:34:21 +02:00
parent 86457c6972
commit 272bed25b9
Signed by: aviallon
GPG key ID: D126B13AB555E16F
2 changed files with 32 additions and 14 deletions

View file

@@ -9,7 +9,7 @@ def main():
tokenizer = Tokenizer()
tokens = tokenizer.tokenize("2 + 3")
-tokens = [token for token in tokens if token.kind != Tokens.Blank]
+tokens = [token for token in tokens if token.kind not in [Tokens.Blank, Tokens.Newline]]
print(tokens)
parser = Parser(tokens)

View file

@@ -1,5 +1,9 @@
from __future__ import annotations
import enum
import re
from dataclasses import dataclass, field
from beartype import beartype
from beartype.typing import Optional, List
@@ -21,16 +25,18 @@ class Token:
class Tokens(enum.Enum):
Number = re.compile(r"[0-9]+(\.?[0-9]*)")
Integer = re.compile(r"[0-9]+")
Float = re.compile(r"[0-9]+\.[0-9]*")
Op_Plus = re.compile(r"\+")
Op_Minus = re.compile(r"-")
Op_Multiply = re.compile(r"\*")
Op_Divide = re.compile(r"/")
Parens_Left = re.compile(r"\(")
Parens_Right = re.compile(r"\)")
-Blank = re.compile(r"\s+")
+Newline = re.compile(r"\n", flags=re.MULTILINE)
EOF = re.compile(r"\Z")
-Unknown = re.compile(r".*")
+Blank = re.compile(r"[ \t]+")
+Unknown = re.compile(r".*", flags=re.DOTALL)
def __bool__(self):
return True
@@ -42,34 +48,46 @@ class Tokenizer:
def tokenize(self, data: str) -> List[Token]:
results: List[Token] = []
+line = 0
+character = 0
begin = 0
while begin < len(data):
best_result: Token = Token(Tokens.Unknown,
-loc=SourceLocation(Location(line=0, character=begin), source=data)
+loc=SourceLocation(Location(line=line, character=character), source=data),
+value=""
)
-for token_kind in Tokens:
+for token_kind in list(Tokens):
if token_kind == Tokens.Unknown:
continue
-match = token_kind.value.match(data, begin)
+regex: re.Pattern = token_kind.value
+match = regex.match(data, begin)
if match is not None:
-logger.debug(f"Got match: {match}")
+logger.trace(f"Got match: {match}")
result = match.group(0)
if best_result.kind == Tokens.Unknown or len(result) >= len(best_result.value):
loc = SourceLocation(
-begin=Location(line=0, character=begin),
-end=Location(line=0, character=begin + len(result))
+begin=Location(line=line, character=character),
+end=Location(line=line, character=character + len(result))
)
best_result = Token(token_kind, value=result, loc=loc)
-logger.debug(f"New best match: {best_result}")
+logger.trace(f"New best match: {best_result}")
+begin += len(best_result.value)
+character += len(best_result.value)
if best_result.kind == Tokens.Unknown:
source_hint = best_result.loc.show_in_source()
logger.error(f"{best_result.loc}: Unknown token '{best_result.loc.source_substring}'\n{source_hint}")
exit(1)
+elif best_result.kind == Tokens.Newline:
+line += 1
+character = 0
+best_result.loc.end = Location(line=line, character=0)
logger.debug(f"Added token {best_result}")
results += [best_result]
-begin += len(best_result.value)
results += [Token(Tokens.EOF, value=None, loc=SourceLocation(
-Location(line=0, character=len(data)), source=data
+Location(line=line, character=0), source=data
))]
return results
return results