tokenizer: match newlines and improve location information
Use the generated newline tokens to track which line each token is located on.
This commit is contained in:
parent
86457c6972
commit
272bed25b9
2 changed files with 32 additions and 14 deletions
|
|
@@ -9,7 +9,7 @@ def main():
|
||||||
tokenizer = Tokenizer()
|
tokenizer = Tokenizer()
|
||||||
tokens = tokenizer.tokenize("2 + 3")
|
tokens = tokenizer.tokenize("2 + 3")
|
||||||
|
|
||||||
tokens = [token for token in tokens if token.kind != Tokens.Blank]
|
tokens = [token for token in tokens if token.kind not in [Tokens.Blank, Tokens.Newline]]
|
||||||
print(tokens)
|
print(tokens)
|
||||||
|
|
||||||
parser = Parser(tokens)
|
parser = Parser(tokens)
|
||||||
|
|
|
||||||
|
|
@@ -1,5 +1,9 @@
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import enum
|
||||||
|
import re
|
||||||
from dataclasses import dataclass, field
|
from dataclasses import dataclass, field
|
||||||
|
|
||||||
from beartype import beartype
|
from beartype import beartype
|
||||||
from beartype.typing import Optional, List
|
from beartype.typing import Optional, List
|
||||||
|
|
||||||
|
|
@@ -21,16 +25,18 @@ class Token:
|
||||||
|
|
||||||
|
|
||||||
class Tokens(enum.Enum):
|
class Tokens(enum.Enum):
|
||||||
Number = re.compile(r"[0-9]+(\.?[0-9]*)")
|
Integer = re.compile(r"[0-9]+")
|
||||||
|
Float = re.compile(r"[0-9]+\.[0-9]*")
|
||||||
Op_Plus = re.compile(r"\+")
|
Op_Plus = re.compile(r"\+")
|
||||||
Op_Minus = re.compile(r"-")
|
Op_Minus = re.compile(r"-")
|
||||||
Op_Multiply = re.compile(r"\*")
|
Op_Multiply = re.compile(r"\*")
|
||||||
Op_Divide = re.compile(r"/")
|
Op_Divide = re.compile(r"/")
|
||||||
Parens_Left = re.compile(r"\(")
|
Parens_Left = re.compile(r"\(")
|
||||||
Parens_Right = re.compile(r"\)")
|
Parens_Right = re.compile(r"\)")
|
||||||
Blank = re.compile(r"\s+")
|
Newline = re.compile(r"\n", flags=re.MULTILINE)
|
||||||
EOF = re.compile(r"\Z")
|
EOF = re.compile(r"\Z")
|
||||||
Unknown = re.compile(r".*")
|
Blank = re.compile(r"[ \t]+")
|
||||||
|
Unknown = re.compile(r".*", flags=re.DOTALL)
|
||||||
|
|
||||||
def __bool__(self):
|
def __bool__(self):
|
||||||
return True
|
return True
|
||||||
|
|
@@ -42,34 +48,46 @@ class Tokenizer:
|
||||||
|
|
||||||
def tokenize(self, data: str) -> List[Token]:
|
def tokenize(self, data: str) -> List[Token]:
|
||||||
results: List[Token] = []
|
results: List[Token] = []
|
||||||
|
line = 0
|
||||||
|
character = 0
|
||||||
begin = 0
|
begin = 0
|
||||||
while begin < len(data):
|
while begin < len(data):
|
||||||
best_result: Token = Token(Tokens.Unknown,
|
best_result: Token = Token(Tokens.Unknown,
|
||||||
loc=SourceLocation(Location(line=0, character=begin), source=data)
|
loc=SourceLocation(Location(line=line, character=character), source=data),
|
||||||
|
value=""
|
||||||
)
|
)
|
||||||
for token_kind in Tokens:
|
for token_kind in list(Tokens):
|
||||||
if token_kind == Tokens.Unknown:
|
if token_kind == Tokens.Unknown:
|
||||||
continue
|
continue
|
||||||
match = token_kind.value.match(data, begin)
|
regex: re.Pattern = token_kind.value
|
||||||
|
match = regex.match(data, begin)
|
||||||
if match is not None:
|
if match is not None:
|
||||||
logger.debug(f"Got match: {match}")
|
logger.trace(f"Got match: {match}")
|
||||||
result = match.group(0)
|
result = match.group(0)
|
||||||
if best_result.kind == Tokens.Unknown or len(result) >= len(best_result.value):
|
if best_result.kind == Tokens.Unknown or len(result) >= len(best_result.value):
|
||||||
loc = SourceLocation(
|
loc = SourceLocation(
|
||||||
begin=Location(line=0, character=begin),
|
begin=Location(line=line, character=character),
|
||||||
end=Location(line=0, character=begin + len(result))
|
end=Location(line=line, character=character + len(result))
|
||||||
)
|
)
|
||||||
best_result = Token(token_kind, value=result, loc=loc)
|
best_result = Token(token_kind, value=result, loc=loc)
|
||||||
logger.debug(f"New best match: {best_result}")
|
logger.trace(f"New best match: {best_result}")
|
||||||
|
|
||||||
|
begin += len(best_result.value)
|
||||||
|
character += len(best_result.value)
|
||||||
if best_result.kind == Tokens.Unknown:
|
if best_result.kind == Tokens.Unknown:
|
||||||
source_hint = best_result.loc.show_in_source()
|
source_hint = best_result.loc.show_in_source()
|
||||||
logger.error(f"{best_result.loc}: Unknown token '{best_result.loc.source_substring}'\n{source_hint}")
|
logger.error(f"{best_result.loc}: Unknown token '{best_result.loc.source_substring}'\n{source_hint}")
|
||||||
exit(1)
|
exit(1)
|
||||||
|
elif best_result.kind == Tokens.Newline:
|
||||||
|
line += 1
|
||||||
|
character = 0
|
||||||
|
best_result.loc.end = Location(line=line, character=0)
|
||||||
|
|
||||||
|
logger.debug(f"Added token {best_result}")
|
||||||
|
|
||||||
results += [best_result]
|
results += [best_result]
|
||||||
begin += len(best_result.value)
|
|
||||||
results += [Token(Tokens.EOF, value=None, loc=SourceLocation(
|
results += [Token(Tokens.EOF, value=None, loc=SourceLocation(
|
||||||
Location(line=0, character=len(data)), source=data
|
Location(line=line, character=0), source=data
|
||||||
))]
|
))]
|
||||||
return results
|
return results
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue