# chicken_babies_site/app/db.py

"""SQLAlchemy engine factory, SQLite PRAGMA hookup, and migration runner.

Responsibilities in this module:

1. **Engine construction** — :func:`build_engine` produces a
   ``sqlalchemy.Engine`` from the application's ``DATABASE_URL``,
   thread-safe for uvicorn's worker pool.
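2. **Per-connection PRAGMAs** — a ``@event.listens_for(engine,
   "connect")`` hook, registered on each engine that
   :func:`build_engine` returns, sets ``journal_mode = WAL`` and
   ``foreign_keys = ON`` on *every* new SQLite connection, not just
   the first. ``foreign_keys`` is a per-connection setting, so doing
   this once at startup would silently leave FKs disabled on every
   other pooled connection. (WAL mode persists in the database file,
   so re-issuing it on each connection is a cheap no-op.)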
3. **Migration runner** — :func:`run_migrations` applies every
   ``.sql`` file under :mod:`app.models.migrations` in lexicographic
   order, tracking applied files in a ``schema_migrations`` table.
   Migrations are trusted developer-authored SQL loaded via
   :meth:`sqlite3.Connection.executescript`; they never touch user
   input.

No Python code in this module interpolates values into a SQL
statement. Parameterized queries go through ``sqlalchemy.text`` with
``:name`` bind parameters.
"""

from __future__ import annotations

import os
from datetime import datetime, timezone
from pathlib import Path
from typing import Final

import structlog
from sqlalchemy import Engine, create_engine, event, text

# Directory containing the ``NNN_description.sql`` migration files. Kept
# as a module-level constant so tests can reason about it without
# importing the runner internals. It is resolved relative to this file,
# so it does not depend on the process's working directory.
_MIGRATIONS_DIR: Final[Path] = Path(__file__).resolve().parent / "models" / "migrations"


# Module-level structlog logger named after this module; used by
# :func:`run_migrations` to report which migrations were applied.
_log = structlog.get_logger(__name__)


def build_engine(database_url: str) -> Engine:
    """Create the SQLAlchemy :class:`Engine` backing the app's SQLite database.

    Parameters
    ----------
    database_url:
        A SQLAlchemy URL. In production this is
        ``sqlite:///data/app.db``; tests pass a tmp-path file URL.

    Returns
    -------
    Engine
        The new engine, with the SQLite PRAGMA connect-listener already
        installed via :func:`_install_sqlite_pragmas`.

    Notes
    -----
    - The parent directory of a file-backed ``sqlite:///`` URL is
      created up front, because SQLite will not create missing
      directories itself. Empty paths and ``:memory:`` are skipped.
    - ``check_same_thread=False`` is needed because uvicorn serves
      requests from a pool of worker threads; SQLAlchemy's connection
      pool and our explicit transactions keep that safe.
    - ``future=True`` requests SQLAlchemy 2.x semantics. It is
      redundant on 2.0+ but kept for explicitness.
    """
    sqlite_prefix = "sqlite:///"
    if database_url.startswith(sqlite_prefix):
        location = database_url[len(sqlite_prefix):]
        is_file_backed = bool(location) and location != ":memory:"
        if is_file_backed:
            # A relative location is interpreted against the current
            # working directory, which is the repo root under uvicorn's
            # defaults and the WORKDIR under Docker.
            directory = Path(location).parent
            # A bare filename has "." as its parent; nothing to create.
            if directory != Path("."):
                os.makedirs(directory, exist_ok=True)

    sqlite_engine = create_engine(
        database_url,
        connect_args={"check_same_thread": False},
        future=True,
    )
    _install_sqlite_pragmas(sqlite_engine)
    return sqlite_engine


def _install_sqlite_pragmas(engine: Engine) -> None:
    """Register a ``connect`` listener on *engine* that applies our PRAGMAs.

    The listener runs for every new DB-API connection the engine
    opens. That matters for ``foreign_keys``, which SQLite tracks per
    connection and leaves OFF by default: setting it only once at
    startup would leave later pooled connections without foreign-key
    enforcement. ``journal_mode = WAL`` is issued alongside it; WAL
    persists in the database file, so repeating it is a cheap no-op.
    """

    def _apply_pragmas(dbapi_connection, connection_record) -> None:  # type: ignore[no-untyped-def]
        """Issue the PRAGMA statements on a freshly opened connection.

        The statements go straight through a DB-API cursor instead of
        SQLAlchemy ``text`` constructs: PRAGMAs are not valid
        parameterized SQL, and these are fixed developer-authored
        literals with no external input.
        """
        pragma_cursor = dbapi_connection.cursor()
        try:
            for statement in (
                # WAL lets readers proceed without blocking the single
                # writer, which suits our read-heavy workload.
                "PRAGMA journal_mode = WAL",
                # Without this, SQLite does not enforce referential
                # integrity on this connection.
                "PRAGMA foreign_keys = ON",
            ):
                pragma_cursor.execute(statement)
        finally:
            pragma_cursor.close()

    event.listen(engine, "connect", _apply_pragmas)


def run_migrations(engine: Engine) -> list[str]:
    """Apply any un-applied SQL files from :mod:`app.models.migrations`.

    Behavior:

    - Creates a ``schema_migrations`` tracker table if missing.
    - Lists ``.sql`` files in :data:`_MIGRATIONS_DIR` in sorted order.
    - For each file whose stem is not yet in ``schema_migrations``,
      runs its content via :meth:`sqlite3.Connection.executescript`
      (necessary because a migration file may contain multiple
      statements). The script is prefixed with ``BEGIN IMMEDIATE`` so
      that the whole file *and* its tracker row commit as one
      transaction. Already-applied files are skipped.
    - If a file fails, its transaction is rolled back, nothing is
      recorded for it, and the exception propagates. Files applied
      earlier in the same call stay committed and recorded.

    Requirements on migration files
    -------------------------------
    Because the runner owns the transaction, a migration file must not
    contain its own ``BEGIN`` / ``COMMIT`` / ``ROLLBACK``; a nested
    ``BEGIN`` makes SQLite raise and the file is rolled back. Note also
    that SQLite treats ``PRAGMA foreign_keys`` as a no-op inside a
    transaction, so a migration cannot toggle FK enforcement.

    Returns
    -------
    list[str]
        The ordered list of versions applied on *this* call. Empty
        when the DB is already up to date, useful for logs and tests.

    Raises
    ------
    Exception
        Whatever the DB-API driver or SQLAlchemy raises while applying
        a file is re-raised after the rollback.

    Security note
    -------------
    Migration SQL is trusted input from the repository; it does not
    mix with user-origin data and therefore does not need bind
    parameters. User data still flows exclusively through
    parameterized queries elsewhere (see ``docs/security.md`` CWE-89).
    """
    migration_files = sorted(_MIGRATIONS_DIR.glob("*.sql"))
    applied_now: list[str] = []

    # One connection for the whole run lets us mix executescript (DDL)
    # with ordinary parameterized bookkeeping. We commit per file so a
    # failure partway through leaves earlier files recorded.
    with engine.connect() as conn:
        # Ensure the tracker table exists. Can't use schema_migrations
        # itself to gate this since it may not exist yet.
        conn.execute(
            text(
                "CREATE TABLE IF NOT EXISTS schema_migrations ("
                "  version TEXT PRIMARY KEY,"
                "  applied_at TEXT NOT NULL"
                ")"
            )
        )
        conn.commit()

        # Pull the set of already-applied versions once.
        already_applied = {
            row[0]
            for row in conn.execute(
                text("SELECT version FROM schema_migrations")
            ).fetchall()
        }

        for path in migration_files:
            version = path.stem
            if version in already_applied:
                continue

            sql_text = path.read_text(encoding="utf-8")

            # executescript is only exposed on the DB-API connection,
            # so we reach through SQLAlchemy to the raw connection.
            # Trust boundary: the file is checked into git, never
            # user-supplied, so there is no injection vector.
            raw = conn.connection
            try:
                # executescript performs no implicit transaction
                # control (and COMMITs any pending transaction first),
                # so the BEGIN has to be part of the script itself.
                # The prefix is a fixed literal, not interpolated data.
                raw.executescript("BEGIN IMMEDIATE;\n" + sql_text)

                # Still inside the transaction opened above, so the
                # tracker row commits atomically with the migration.
                conn.execute(
                    text(
                        "INSERT INTO schema_migrations (version, applied_at) "
                        "VALUES (:v, :t)"
                    ),
                    {
                        "v": version,
                        "t": datetime.now(timezone.utc).isoformat(),
                    },
                )
                conn.commit()
            except BaseException:
                # Roll back on the DB-API connection directly:
                # SQLAlchemy did not open this transaction, so its own
                # rollback() may not reach the driver. Undo the
                # half-applied file, then let the caller see the error.
                raw.rollback()
                raise

            applied_now.append(version)
            _log.info("migration_applied", version=version)

    if not applied_now:
        _log.info("migrations_up_to_date")
    return applied_now