tokenizer: match newlines and improve location information

Use the generated newline tokens to know at what line a token is located.
This commit is contained in:
Antoine Viallon 2023-05-08 17:34:21 +02:00
parent 86457c6972
commit 272bed25b9
Signed by: aviallon
GPG key ID: D126B13AB555E16F
2 changed files with 32 additions and 14 deletions

View file

@ -9,7 +9,7 @@ def main():
  tokenizer = Tokenizer()
  tokens = tokenizer.tokenize("2 + 3")
- tokens = [token for token in tokens if token.kind != Tokens.Blank]
+ tokens = [token for token in tokens if token.kind not in [Tokens.Blank, Tokens.Newline]]
  print(tokens)
  parser = Parser(tokens)

View file

@ -1,5 +1,9 @@
  from __future__ import annotations
+ import enum
+ import re
  from dataclasses import dataclass, field
  from beartype import beartype
  from beartype.typing import Optional, List
@ -21,16 +25,18 @@ class Token:
class Tokens(enum.Enum):
    """Token kinds, each valued by the compiled regex that recognises it.

    The tokenizer tries every kind at the current position and keeps the
    longest match; because it compares with ``>=``, a later-declared kind
    wins ties on equal match length.
    """
    Integer = re.compile(r"[0-9]+")
    Float = re.compile(r"[0-9]+\.[0-9]*")
    Op_Plus = re.compile(r"\+")
    Op_Minus = re.compile(r"-")
    Op_Multiply = re.compile(r"\*")
    Op_Divide = re.compile(r"/")
    Parens_Left = re.compile(r"\(")
    Parens_Right = re.compile(r"\)")
    # Newlines are a distinct kind so the tokenizer can track line numbers.
    Newline = re.compile(r"\n", flags=re.MULTILINE)
    EOF = re.compile(r"\Z")
    # Blank deliberately excludes "\n" so it never swallows a Newline.
    Blank = re.compile(r"[ \t]+")
    # Catch-all sentinel kind; the tokenizer skips it during matching and
    # only reports it when nothing else matched. DOTALL so it also covers
    # positions sitting on a bare newline.
    Unknown = re.compile(r".*", flags=re.DOTALL)

    def __bool__(self):
        # Explicitly truthy (enum members are truthy by default anyway).
        return True
@ -42,34 +48,46 @@ class Tokenizer:
def tokenize(self, data: str) -> List[Token]:
    """Split ``data`` into a list of Tokens with line/character locations.

    Greedy scan: at every position each token kind's regex is tried and
    the longest match wins (``>=`` lets a later-declared kind win ties).
    Newline tokens advance the line counter and reset the column, which is
    how every subsequent token gets an accurate location.

    Exits the process with status 1 when no kind matches (Unknown token).
    """
    results: List[Token] = []
    line = 0       # 0-based line number of the current position
    character = 0  # 0-based column within the current line
    begin = 0      # absolute offset into `data`
    while begin < len(data):
        # Sentinel: Unknown with an empty value; any real match replaces it.
        best_result: Token = Token(Tokens.Unknown,
            loc=SourceLocation(Location(line=line, character=character), source=data),
            value=""
        )
        for token_kind in list(Tokens):
            if token_kind == Tokens.Unknown:
                continue  # Unknown is the fallback, never matched directly
            regex: re.Pattern = token_kind.value
            match = regex.match(data, begin)
            if match is not None:
                logger.trace(f"Got match: {match}")
                result = match.group(0)
                # ">=" means a later-declared kind wins equal-length ties.
                if best_result.kind == Tokens.Unknown or len(result) >= len(best_result.value):
                    loc = SourceLocation(
                        begin=Location(line=line, character=character),
                        end=Location(line=line, character=character + len(result))
                    )
                    best_result = Token(token_kind, value=result, loc=loc)
                    logger.trace(f"New best match: {best_result}")
        # Advance past whatever matched (zero when nothing did, in which
        # case the Unknown branch below terminates the process).
        begin += len(best_result.value)
        character += len(best_result.value)
        if best_result.kind == Tokens.Unknown:
            source_hint = best_result.loc.show_in_source()
            logger.error(f"{best_result.loc}: Unknown token '{best_result.loc.source_substring}'\n{source_hint}")
            exit(1)
        elif best_result.kind == Tokens.Newline:
            # Newline: bump the line counter, reset the column, and pin the
            # token's end location to the start of the next line.
            line += 1
            character = 0
            best_result.loc.end = Location(line=line, character=0)
        logger.debug(f"Added token {best_result}")
        results += [best_result]
    # Trailing EOF token.
    # NOTE(review): its location uses character=0 rather than the final
    # column / len(data) — confirm this is intentional.
    results += [Token(Tokens.EOF, value=None, loc=SourceLocation(
        Location(line=line, character=0), source=data
    ))]
    return results