tokenizer: match newlines and improve location information
Use the generated newline tokens to know at what line a token is located.
This commit is contained in:
parent
86457c6972
commit
272bed25b9
2 changed files with 32 additions and 14 deletions
|
|
@@ -9,7 +9,7 @@ def main():
|
|||
tokenizer = Tokenizer()
|
||||
tokens = tokenizer.tokenize("2 + 3")
|
||||
|
||||
tokens = [token for token in tokens if token.kind != Tokens.Blank]
|
||||
tokens = [token for token in tokens if token.kind not in [Tokens.Blank, Tokens.Newline]]
|
||||
print(tokens)
|
||||
|
||||
parser = Parser(tokens)
|
||||
|
|
|
|||
|
|
@@ -1,5 +1,9 @@
|
|||
from __future__ import annotations
|
||||
|
||||
import enum
|
||||
import re
|
||||
from dataclasses import dataclass, field
|
||||
|
||||
from beartype import beartype
|
||||
from beartype.typing import Optional, List
|
||||
|
||||
|
|
@@ -21,16 +25,18 @@ class Token:
|
|||
|
||||
|
||||
class Tokens(enum.Enum):
|
||||
Number = re.compile(r"[0-9]+(\.?[0-9]*)")
|
||||
Integer = re.compile(r"[0-9]+")
|
||||
Float = re.compile(r"[0-9]+\.[0-9]*")
|
||||
Op_Plus = re.compile(r"\+")
|
||||
Op_Minus = re.compile(r"-")
|
||||
Op_Multiply = re.compile(r"\*")
|
||||
Op_Divide = re.compile(r"/")
|
||||
Parens_Left = re.compile(r"\(")
|
||||
Parens_Right = re.compile(r"\)")
|
||||
Blank = re.compile(r"\s+")
|
||||
Newline = re.compile(r"\n", flags=re.MULTILINE)
|
||||
EOF = re.compile(r"\Z")
|
||||
Unknown = re.compile(r".*")
|
||||
Blank = re.compile(r"[ \t]+")
|
||||
Unknown = re.compile(r".*", flags=re.DOTALL)
|
||||
|
||||
def __bool__(self):
|
||||
return True
|
||||
|
|
@@ -42,34 +48,46 @@ class Tokenizer:
|
|||
|
||||
def tokenize(self, data: str) -> List[Token]:
|
||||
results: List[Token] = []
|
||||
line = 0
|
||||
character = 0
|
||||
begin = 0
|
||||
while begin < len(data):
|
||||
best_result: Token = Token(Tokens.Unknown,
|
||||
loc=SourceLocation(Location(line=0, character=begin), source=data)
|
||||
loc=SourceLocation(Location(line=line, character=character), source=data),
|
||||
value=""
|
||||
)
|
||||
for token_kind in Tokens:
|
||||
for token_kind in list(Tokens):
|
||||
if token_kind == Tokens.Unknown:
|
||||
continue
|
||||
match = token_kind.value.match(data, begin)
|
||||
regex: re.Pattern = token_kind.value
|
||||
match = regex.match(data, begin)
|
||||
if match is not None:
|
||||
logger.debug(f"Got match: {match}")
|
||||
logger.trace(f"Got match: {match}")
|
||||
result = match.group(0)
|
||||
if best_result.kind == Tokens.Unknown or len(result) >= len(best_result.value):
|
||||
loc = SourceLocation(
|
||||
begin=Location(line=0, character=begin),
|
||||
end=Location(line=0, character=begin + len(result))
|
||||
begin=Location(line=line, character=character),
|
||||
end=Location(line=line, character=character + len(result))
|
||||
)
|
||||
best_result = Token(token_kind, value=result, loc=loc)
|
||||
logger.debug(f"New best match: {best_result}")
|
||||
logger.trace(f"New best match: {best_result}")
|
||||
|
||||
begin += len(best_result.value)
|
||||
character += len(best_result.value)
|
||||
if best_result.kind == Tokens.Unknown:
|
||||
source_hint = best_result.loc.show_in_source()
|
||||
logger.error(f"{best_result.loc}: Unknown token '{best_result.loc.source_substring}'\n{source_hint}")
|
||||
exit(1)
|
||||
elif best_result.kind == Tokens.Newline:
|
||||
line += 1
|
||||
character = 0
|
||||
best_result.loc.end = Location(line=line, character=0)
|
||||
|
||||
logger.debug(f"Added token {best_result}")
|
||||
|
||||
results += [best_result]
|
||||
begin += len(best_result.value)
|
||||
|
||||
results += [Token(Tokens.EOF, value=None, loc=SourceLocation(
|
||||
Location(line=0, character=len(data)), source=data
|
||||
Location(line=line, character=0), source=data
|
||||
))]
|
||||
return results
|
||||
return results
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue