The Tokenizer
The Tokenizer
The tokenizer (also called a lexer) is the first stage of the parsing pipeline. It takes a raw input string and produces a sequence of Token objects, each carrying a type and a value. The tokenizer does not validate grammar — it only identifies what kind of thing each piece of text is.
Token Representation
# tokenizer.py
from dataclasses import dataclass
@dataclass
class Token:
"""A single token extracted from input text."""
type: TokenType
value: str # the raw text that produced this token
position: int # character offset in the original input
The position field enables precise error messages: “Unexpected character at position 12.”
The Tokenizer Class
The tokenizer is a single-pass state machine. It reads the input string character by character, building tokens as it goes:
class Tokenizer:
"""Breaks an input string into a sequence of Tokens."""
KEYWORDS: dict[str, TokenType] = {
"INSERT": TokenType.KEYWORD_INSERT,
"SELECT": TokenType.KEYWORD_SELECT,
}
def __init__(self, text: str) -> None:
self.text = text
self.pos = 0
self.tokens: list[Token] = []
def tokenize(self) -> list[Token]:
"""Process the entire input and return a list of Tokens."""
while self.pos < len(self.text):
ch = self.text[self.pos]
# Skip whitespace
if ch.isspace():
self.pos += 1
continue
# Meta command: starts with '.'
if ch == '.':
self._read_meta_command()
continue
# Number: starts with a digit or negative sign
if ch.isdigit() or (ch == '-' and self._peek_digit()):
self._read_integer()
continue
# Word: keyword or string identifier
if ch.isalpha() or ch == '_':
self._read_word()
continue
raise TokenError(
f"Unexpected character '{ch}' at position {self.pos}"
)
self.tokens.append(Token(TokenType.EOF, "", self.pos))
return self.tokens
def _read_meta_command(self) -> None:
"""Read a meta-command starting with '.'."""
start = self.pos
self.pos += 1 # skip the '.'
while self.pos < len(self.text) and not self.text[self.pos].isspace():
self.pos += 1
value = self.text[start:self.pos]
self.tokens.append(Token(TokenType.META_COMMAND, value, start))
def _read_integer(self) -> None:
"""Read an integer literal (possibly negative)."""
start = self.pos
if self.text[self.pos] == '-':
self.pos += 1
while self.pos < len(self.text) and self.text[self.pos].isdigit():
self.pos += 1
value = self.text[start:self.pos]
if value == '-':
raise TokenError(f"Expected digit after '-' at position {start}")
self.tokens.append(Token(TokenType.INTEGER, value, start))
def _read_word(self) -> None:
"""Read a keyword or string identifier."""
start = self.pos
while self.pos < len(self.text) and (
self.text[self.pos].isalnum() or self.text[self.pos] == '_'
):
self.pos += 1
value = self.text[start:self.pos]
upper = value.upper()
if upper in self.KEYWORDS:
token_type = self.KEYWORDS[upper]
else:
token_type = TokenType.STRING
self.tokens.append(Token(token_type, value, start))
def _peek_digit(self) -> bool:
"""Check if the character after current position is a digit."""
next_pos = self.pos + 1
return next_pos < len(self.text) and self.text[next_pos].isdigit()
State Machine Walkthrough
The tokenizer processes input one character at a time. Here’s how it handles INSERT 1 alice:
| Position | Character | Action | Token produced |
|---|---|---|---|
| 0 | I | Start word | — |
| 1-5 | NSERT | Continue word | — |
| 6 | (end of word) | Lookup: “INSERT” is keyword | KEYWORD_INSERT("INSERT", 0) |
| 6 | | Skip whitespace | — |
| 7 | 1 | Start integer | — |
| 8 | (end of number) | Emit integer | INTEGER("1", 7) |
| 8 | | Skip whitespace | — |
| 9 | a | Start word | — |
| 10-13 | lice | Continue word | — |
| 14 | (end of input) | “alice” not a keyword | STRING("alice", 9) |
| 14 | — | Append EOF | EOF("", 14) |
Result: [KEYWORD_INSERT, INTEGER("1"), STRING("alice"), EOF]
Testing the Tokenizer
def test_tokenizer():
"""Verify tokenization of all supported input types."""
# INSERT statement
tokens = Tokenizer("INSERT 42 bob").tokenize()
assert tokens[0].type == TokenType.KEYWORD_INSERT
assert tokens[1].type == TokenType.INTEGER
assert tokens[1].value == "42"
assert tokens[2].type == TokenType.STRING
assert tokens[2].value == "bob"
assert tokens[3].type == TokenType.EOF
# SELECT statement
tokens = Tokenizer("SELECT").tokenize()
assert tokens[0].type == TokenType.KEYWORD_SELECT
assert tokens[1].type == TokenType.EOF
# Meta command
tokens = Tokenizer(".exit").tokenize()
assert tokens[0].type == TokenType.META_COMMAND
assert tokens[0].value == ".exit"
# Case insensitivity for keywords
tokens = Tokenizer("select").tokenize()
assert tokens[0].type == TokenType.KEYWORD_SELECT
# Negative integer
tokens = Tokenizer("INSERT -5 test_user").tokenize()
assert tokens[1].type == TokenType.INTEGER
assert tokens[1].value == "-5"
# Error: unexpected character
try:
Tokenizer("INSERT @invalid").tokenize()
assert False, "Should have raised TokenError"
except TokenError as e:
assert "position 7" in str(e)
# Whitespace handling
tokens = Tokenizer(" INSERT 1 alice ").tokenize()
assert len(tokens) == 4 # INSERT, 1, alice, EOF
assert tokens[0].type == TokenType.KEYWORD_INSERT
print("All tokenizer tests passed.")
test_tokenizer()
The Parser
With tokens in hand, the parser validates their sequence and produces a Statement object:
class Parser:
"""Parses a token list into a Statement (AST node)."""
def __init__(self, tokens: list[Token]) -> None:
self.tokens = tokens
self.pos = 0
def parse(self) -> InsertStatement | SelectStatement | MetaCommand:
"""Parse the token stream into a single statement."""
token = self._current()
if token.type == TokenType.META_COMMAND:
return MetaCommand(command=token.value)
if token.type == TokenType.KEYWORD_INSERT:
return self._parse_insert()
if token.type == TokenType.KEYWORD_SELECT:
return SelectStatement()
raise ParseError(
f"Expected INSERT, SELECT, or meta-command, "
f"got {token.type.name} at position {token.position}"
)
def _parse_insert(self) -> InsertStatement:
"""Parse: INSERT <id:int> <username:str>"""
self._advance() # consume INSERT
# Expect integer id
id_token = self._expect(TokenType.INTEGER)
row_id = int(id_token.value)
if row_id < 0:
raise ParseError(
f"Row id must be non-negative, got {row_id} "
f"at position {id_token.position}"
)
# Expect string username
username_token = self._expect(TokenType.STRING)
return InsertStatement(id=row_id, username=username_token.value)
def _current(self) -> Token:
if self.pos >= len(self.tokens):
raise ParseError("Unexpected end of input")
return self.tokens[self.pos]
def _advance(self) -> Token:
token = self._current()
self.pos += 1
return token
def _expect(self, expected_type: TokenType) -> Token:
token = self._advance()
if token.type != expected_type:
raise ParseError(
f"Expected {expected_type.name}, got {token.type.name} "
f"('{token.value}') at position {token.position}"
)
return token
Recursive descent. This parser is a recursive descent parser with one level of recursion. Each grammar rule (_parse_insert) is a method that consumes tokens left to right. The _expect helper enforces the expected token sequence. If any expectation fails, a ParseError with position information is raised.
Testing the Parser
def test_parser():
"""Verify parsing of valid and invalid inputs."""
# Valid INSERT
tokens = Tokenizer("INSERT 1 alice").tokenize()
stmt = Parser(tokens).parse()
assert isinstance(stmt, InsertStatement)
assert stmt.id == 1
assert stmt.username == "alice"
# Valid SELECT
tokens = Tokenizer("SELECT").tokenize()
stmt = Parser(tokens).parse()
assert isinstance(stmt, SelectStatement)
# Meta command
tokens = Tokenizer(".btree").tokenize()
stmt = Parser(tokens).parse()
assert isinstance(stmt, MetaCommand)
assert stmt.command == ".btree"
# Error: INSERT without id
try:
tokens = Tokenizer("INSERT alice").tokenize()
Parser(tokens).parse()
assert False, "Should have raised ParseError"
except ParseError as e:
assert "Expected INTEGER" in str(e)
# Error: negative id
try:
tokens = Tokenizer("INSERT -1 alice").tokenize()
Parser(tokens).parse()
assert False, "Should have raised ParseError"
except ParseError as e:
assert "non-negative" in str(e)
# Error: unknown command
try:
tokens = Tokenizer("DELETE 1").tokenize()
Parser(tokens).parse()
assert False, "Should have raised ParseError"
except ParseError:
pass
print("All parser tests passed.")
test_parser()
The tokenizer and parser transform text into structured Statement objects. The next section connects these statements to the B-Tree engine — the final integration step.