The Compiler
The Compiler
The compiler is the bridge between the parser’s output (Statement objects) and the Virtual Machine’s execution methods. It maps each statement type to the corresponding B-Tree operation. This is also where we assemble the complete REPL and run integration tests against the full system.
Statement Execution
# compiler.py
from row import Row
from constants import PAGE_SIZE
MAX_USERNAME_LENGTH: Final[int] = 32 # matches Row.USERNAME_SIZE
class Compiler:
"""Maps parsed statements to Virtual Machine operations."""
def __init__(self, vm: VirtualMachine) -> None:
self.vm = vm
def execute(
self, statement: InsertStatement | SelectStatement | MetaCommand
) -> str:
"""Execute a parsed statement and return a user-facing result string."""
if isinstance(statement, MetaCommand):
return self._execute_meta(statement)
elif isinstance(statement, InsertStatement):
return self._execute_insert(statement)
elif isinstance(statement, SelectStatement):
return self._execute_select(statement)
else:
return f"Unknown statement type: {type(statement).__name__}"
def _execute_insert(self, stmt: InsertStatement) -> str:
"""Execute an INSERT statement."""
# Validate username length
if len(stmt.username) > MAX_USERNAME_LENGTH:
return f"Error: username too long ({len(stmt.username)} > {MAX_USERNAME_LENGTH})"
row = Row(id=stmt.id, username=stmt.username)
result = self.vm.execute_insert(stmt.id, row)
if result == ExecuteResult.SUCCESS:
return "Executed."
elif result == ExecuteResult.DUPLICATE_KEY:
return f"Error: duplicate key {stmt.id}."
elif result == ExecuteResult.TABLE_FULL:
return "Error: table full."
else:
return f"Error: {result.name}"
def _execute_select(self, stmt: SelectStatement) -> str:
"""Execute a SELECT statement, returning all rows."""
rows = self.vm.execute_select()
if not rows:
return "(empty table)"
lines = [f"({r.id}, {r.username})" for r in rows]
return "\n".join(lines)
def _execute_meta(self, cmd: MetaCommand) -> str:
"""Execute a meta-command."""
if cmd.command == ".exit":
return "__EXIT__" # Signal to REPL to quit
elif cmd.command == ".btree":
import io
buf = io.StringIO()
print_tree(self.vm.pager, self.vm.root_page_num, file=buf)
return buf.getvalue().rstrip()
elif cmd.command == ".constants":
lines = [
f"PAGE_SIZE: {PAGE_SIZE}",
f"LEAF_NODE_MAX_CELLS: {LEAF_NODE_MAX_CELLS}",
f"INTERNAL_NODE_MAX_KEYS: {INTERNAL_NODE_MAX_KEYS}",
f"LEAF_NODE_CELL_SIZE: {LEAF_NODE_CELL_SIZE}",
f"INTERNAL_NODE_CELL_SIZE: {INTERNAL_NODE_CELL_SIZE}",
]
return "\n".join(lines)
else:
return f"Unrecognized meta-command: {cmd.command}"
The compiler is deliberately thin. It validates constraints that belong to the interface layer (username length), delegates storage operations to the VM, and formats results for display. No storage logic leaks into this layer.
The Complete REPL
import sys
def run_repl(db_path: str) -> None:
"""Run the interactive Read-Eval-Print Loop."""
pager = Pager(db_path)
# Initialize root page if database is new
if pager.num_pages == 0:
root = pager.get_page(0)
initialize_leaf_node(root, is_root=True)
pager.num_pages = 1
vm = VirtualMachine(pager, root_page_num=0)
compiler = Compiler(vm)
print(f"pydb — Python Database Internals")
print(f"Database: {db_path}")
print(f"Type '.exit' to quit.\n")
while True:
try:
line = input("db > ").strip()
except (EOFError, KeyboardInterrupt):
print()
break
if not line:
continue
try:
# Pipeline: tokenize → parse → compile → execute
tokens = Tokenizer(line).tokenize()
statement = Parser(tokens).parse()
result = compiler.execute(statement)
if result == "__EXIT__":
break
print(result)
except TokenError as e:
print(f"Token error: {e}")
except ParseError as e:
print(f"Parse error: {e}")
# Clean shutdown
pager.commit()
pager.close()
print("Goodbye.")
if __name__ == "__main__":
db_path = sys.argv[1] if len(sys.argv) > 1 else "pydb.db"
run_repl(db_path)
REPL session example:
$ python repl.py test.db
pydb — Python Database Internals
Database: test.db
Type '.exit' to quit.
db > INSERT 1 alice
Executed.
db > INSERT 2 bob
Executed.
db > INSERT 3 charlie
Executed.
db > SELECT
(1, alice)
(2, bob)
(3, charlie)
db > INSERT 1 duplicate
Error: duplicate key 1.
db > .btree
leaf (page 0): 3 cells keys [1..3]
db > .constants
PAGE_SIZE: 4096
LEAF_NODE_MAX_CELLS: 102
INTERNAL_NODE_MAX_KEYS: 510
LEAF_NODE_CELL_SIZE: 40
INTERNAL_NODE_CELL_SIZE: 8
db > .exit
Goodbye.
Integration Tests
The final test suite exercises the complete pipeline — from text input through tokenization, parsing, compilation, B-Tree operations, and WAL durability:
def test_full_integration():
"""End-to-end test: SQL text → storage → retrieval → recovery."""
import os
db_path = "test_integration.db"
for f in [db_path, db_path + ".wal"]:
if os.path.exists(f):
os.remove(f)
def execute_command(compiler: Compiler, text: str) -> str:
tokens = Tokenizer(text).tokenize()
statement = Parser(tokens).parse()
return compiler.execute(statement)
# --- Phase 1: Insert and verify ---
pager = Pager(db_path)
root = pager.get_page(0)
initialize_leaf_node(root, is_root=True)
pager.num_pages = 1
vm = VirtualMachine(pager, root_page_num=0)
compiler = Compiler(vm)
# Insert 200 rows (will trigger splits)
for i in range(200):
result = execute_command(compiler, f"INSERT {i} user_{i:03d}")
assert result == "Executed.", f"Insert {i} failed: {result}"
# Verify all rows via SELECT
result = execute_command(compiler, "SELECT")
lines = result.strip().split("\n")
assert len(lines) == 200, f"Expected 200 rows, got {len(lines)}"
# Verify sorted order
ids = [int(line.split(",")[0].strip("(")) for line in lines]
assert ids == list(range(200)), "Rows not in sorted order"
# Duplicate rejection
result = execute_command(compiler, "INSERT 0 duplicate")
assert "duplicate key" in result.lower()
# Username too long
long_name = "x" * 33
result = execute_command(compiler, f"INSERT 999 {long_name}")
assert "too long" in result.lower()
# Commit and close
pager.commit()
pager.close()
# --- Phase 2: Reopen and verify persistence ---
pager2 = Pager(db_path)
vm2 = VirtualMachine(pager2, root_page_num=0)
compiler2 = Compiler(vm2)
result = execute_command(compiler2, "SELECT")
lines = result.strip().split("\n")
assert len(lines) == 200, f"After reopen: expected 200, got {len(lines)}"
# Verify B-Tree structure
depth = tree_depth(pager2, 0)
assert depth == 2, f"Expected depth 2 for 200 rows, got {depth}"
pager2.close()
# Cleanup
for f in [db_path, db_path + ".wal"]:
if os.path.exists(f):
os.remove(f)
print("Full integration test passed.")
test_full_integration()
Error Handling Summary
| Error | Detected by | User sees |
|---|---|---|
Unrecognized character (@, #) | Tokenizer | Token error: Unexpected character '@' at position N |
Missing argument (INSERT alice) | Parser | Parse error: Expected INTEGER, got STRING at position N |
Negative ID (INSERT -1 alice) | Parser | Parse error: Row id must be non-negative |
Unknown command (DELETE) | Parser | Parse error: Expected INSERT, SELECT, or meta-command |
| Duplicate key | Compiler / VM | Error: duplicate key 42. |
| Username too long | Compiler | Error: username too long (33 > 32) |
| Table full | Compiler / VM | Error: table full. |
Unknown meta-command (.drop) | Compiler | Unrecognized meta-command: .drop |
Every error is caught and reported with context. The REPL never crashes from user input.
The Complete System
We have built, from scratch, a persistent database engine with the following architecture:
┌─────────────────────────────────────────────────┐
│ REPL / CLI │ ← CH7
│ Tokenizer → Parser → Compiler │
├─────────────────────────────────────────────────┤
│ Virtual Machine │ ← CH3
│ execute_insert() execute_select() │
├─────────────────────────────────────────────────┤
│ B-Tree │ ← CH4, CH5
│ Leaf Nodes (102 cells) + Internal Nodes (510) │
│ Binary search, splitting, tree traversal │
├─────────────────────────────────────────────────┤
│ Pager │ ← CH2
│ Page cache + disk I/O │
├─────────────────────────────────────────────────┤
│ Write-Ahead Log │ ← CH6
│ Crash recovery, durability │
├─────────────────────────────────────────────────┤
│ Row Serialization │ ← CH1
│ struct.pack / struct.unpack │
├─────────────────────────────────────────────────┤
│ Operating System │
│ os.open, os.read, os.write, os.fsync │
└─────────────────────────────────────────────────┘
Total line count: approximately 600-800 lines of Python, depending on how you count tests and comments. Every line is deliberate. There is no framework, no ORM, no dependency beyond the Python standard library. The entire engine fits in a single afternoon of focused reading.
What We Did Not Build
| Feature | Why omitted | What it would require |
|---|---|---|
WHERE clauses | Requires expression evaluation | Predicate pushdown, filter operators |
DELETE / UPDATE | Requires cell removal, in-place update | Tombstones, free list management |
| Variable-length rows | Our ROW_SIZE is fixed at 36 bytes | Overflow pages, slotted page layout |
| Multi-table support | One table is enough to teach fundamentals | Catalog pages, schema storage |
| Concurrency | Single-process, single-thread | Page-level locking, MVCC |
| Indexes | Primary key index only | Secondary index B-Trees |
| Query optimization | Only two query types | Cost-based optimizer, statistics |
Each omitted feature is a project in itself. The value of this book is not in building a production database — it is in understanding the principles that every production database is built upon. Page-oriented storage, B-Tree indexing, write-ahead logging, and query parsing are the four pillars. You now understand all four from the bytes up.