feat: phase 2 content model + cache — SQLite schema, markdown, TTL

Stand up the full SQLite content layer: all 7 tables from the authoritative schema with WAL + foreign-keys enforced per-connection, entity dataclasses plus row mappers, hand-rolled versioned migrations tracked in schema_migrations, and an idempotent Python seed (system user + welcome post + About page). Add a Markdown->HTML service using markdown-it-py with a strict bleach allowlist (tables intentionally omitted on both sides). Add a typed in-process TTLCache[K,V] and wire it into real DB-backed PostService and PageService, both exposing invalidate_all() for Phase 4 admin writes. Rewire / and /about to read from the DB; homepage renders the seeded welcome post, About renders page.title + sanitized body_html_cached. Update the Phase 1 route tests accordingly. Mark Phase 2 complete in docs/ROADMAP.md.
2026-04-21 15:40:35 -05:00
parent 28168f57b6
commit 0306f71763
21 changed files with 2055 additions and 108 deletions
--- a/app/services/cache.py
+++ b/app/services/cache.py
@@ -0,0 +1,88 @@
+"""In-process, generic TTL cache.
+
+Small, typed, and deliberately boring. Used by :mod:`app.services.posts`
+and :mod:`app.services.pages` to sit in front of the hottest queries
+(published-posts list, page-by-slug); a 60 s default TTL keeps the
+site's three-digit daily requests out of the SQLite query path without
+any cross-process coordination.
+
+Not thread-safe in the strict sense — Python's GIL makes the dict
+operations atomic at CPython bytecode granularity, and worst case a
+concurrent writer causes a benign duplicate DB read. That is
+acceptable at this scale; if the site ever grows teeth we can revisit.
+"""
+
+from __future__ import annotations
+
+import time
+from typing import Generic, Hashable, Optional, TypeVar
+
+# TypeVar bound to ``Hashable`` so callers cannot accidentally key by a
+# mutable collection (which would later look up with a different hash
+# after mutation and silently miss the cache).
+K = TypeVar("K", bound=Hashable)
+V = TypeVar("V")
+
+
+class TTLCache(Generic[K, V]):
+    """Tiny TTL-based dict-style cache.
+
+    Entries expire ``ttl_seconds`` after insertion. Expired entries
+    are dropped lazily on access — there is no background sweep, and
+    the cache is not bounded in size. For our workload (at most a
+    few dozen keys per instance) this is fine.
+
+    Three operations are public:
+
+    - :meth:`get` returns the cached value, or ``default`` (``None``
+      unless overridden) when the key is absent or expired.
+    - :meth:`set` stores a value with an expiry.
+    - :meth:`invalidate_all` clears every entry; used by admin-write
+      paths in Phase 4.
+
+    ``None`` is a legal cached value. A caller that stores ``None``
+    must pass its own sentinel as ``default`` to :meth:`get`;
+    otherwise a cached ``None`` is indistinguishable from a miss.
+    """
+
+    def __init__(self, ttl_seconds: float = 60.0) -> None:
+        """Construct an empty cache.
+
+        Parameters
+        ----------
+        ttl_seconds:
+            Time-to-live for every entry, in seconds. 60 s matches the
+            "Caching Strategy" section of ``docs/ROADMAP.md``.
+
+        Raises
+        ------
+        ValueError
+            If ``ttl_seconds`` is zero or negative.
+        """
+        if ttl_seconds <= 0:
+            # Defensive: a zero/negative TTL would mean every write
+            # instantly expires, which almost always indicates a bug.
+            raise ValueError("ttl_seconds must be positive")
+        self._ttl: float = float(ttl_seconds)
+        # Stored as (expiry_monotonic_ts, value). Using
+        # ``time.monotonic`` avoids issues if the wall clock jumps.
+        self._store: dict[K, tuple[float, V]] = {}
+
+    def get(self, key: K, default: Optional[V] = None) -> Optional[V]:
+        """Return the cached value for ``key``, or ``default`` on a miss.
+
+        A miss means the key was never stored or its entry has
+        expired. An expired entry is deleted as a side effect of the
+        lookup; entries whose keys are never looked up again stay in
+        the store until :meth:`invalidate_all` is called.
+
+        Parameters
+        ----------
+        key:
+            Cache key to look up.
+        default:
+            Value returned on a miss. Defaults to ``None``, which
+            keeps the original single-argument behaviour. Pass a
+            private sentinel when ``None`` may itself be cached.
+        """
+        entry = self._store.get(key)
+        if entry is None:
+            # Entries are always 2-tuples, so ``None`` here can only
+            # mean "key not present", never "cached None".
+            return default
+        expiry, value = entry
+        if time.monotonic() >= expiry:
+            # Expired — drop lazily and report miss.
+            self._store.pop(key, None)
+            return default
+        return value
+
+    def set(self, key: K, value: V) -> None:
+        """Store ``value`` under ``key`` with the configured TTL.
+
+        Overwrites any existing entry and restarts its expiry clock.
+        """
+        self._store[key] = (time.monotonic() + self._ttl, value)
+
+    def invalidate_all(self) -> None:
+        """Drop every cached entry.
+
+        Called by the Phase 4 admin write path so readers see the new
+        content on the very next request, not up to one TTL later.
+        """
+        self._store.clear()
--- a/app/services/markdown.py
+++ b/app/services/markdown.py
@@ -0,0 +1,125 @@
+"""Markdown rendering with a strict sanitization allowlist.
+
+CWE-79 mitigation: user-authored Markdown is first rendered to HTML by
+``markdown-it-py`` (commonmark profile; tables are deliberately not
+enabled), then the resulting HTML is filtered by ``bleach`` against an
+explicit tag / attribute / protocol allowlist. ``bleach`` is the
+security boundary: anything not on the list is stripped — never
+escaped — so the stored ``body_html_cached`` is always safe to render
+inside an ``autoescape=False`` Jinja block.
+
+The pipeline runs both on admin writes (Phase 4) and at seed time
+(Phase 2).
+"""
+
+from __future__ import annotations
+
+from typing import Final
+
+import bleach
+from markdown_it import MarkdownIt
+
+# --- Sanitization allowlist ------------------------------------------------
+# Kept at module scope as ``Final`` constants so tests can assert
+# against them. The tag and protocol sets are frozensets; the attribute
+# map is a plain dict of lists and must not be mutated by callers. Do
+# not widen without a security review; in particular:
+#
+# - No ``style`` or ``class`` attributes (CSS injection / theme attack
+#   surface for future admin UIs).
+# - No ``script``, ``iframe``, ``object``, ``embed``, ``form``, etc.
+# - No ``data:`` / ``javascript:`` protocols.
+_ALLOWED_TAGS: Final[frozenset[str]] = frozenset(
+    {
+        "p",
+        "br",
+        "strong",
+        "em",
+        "a",
+        "ul",
+        "ol",
+        "li",
+        "h1",
+        "h2",
+        "h3",
+        "h4",
+        "blockquote",
+        "code",
+        "pre",
+        "img",
+        "hr",
+    }
+)
+
+_ALLOWED_ATTRS: Final[dict[str, list[str]]] = {
+    "a": ["href", "title", "rel"],
+    "img": ["src", "alt", "title", "width", "height"],
+}
+
+_ALLOWED_PROTOCOLS: Final[frozenset[str]] = frozenset(
+    {"http", "https", "mailto"}
+)
+
+
+class MarkdownService:
+    """Render Markdown to HTML, then sanitize against the allowlist.
+
+    One ``MarkdownIt`` instance per service instance — creating these
+    is cheap but non-trivial, so we reuse. The service holds no state
+    other than that parser configuration.
+    """
+
+    def __init__(self) -> None:
+        """Configure the Markdown parser.
+
+        - ``commonmark`` preset with the ``html`` option switched
+          off. The preset itself turns raw HTML on (CommonMark
+          requires it), so we override it explicitly: raw HTML in
+          the source is then rendered as escaped text, which is the
+          safe failure mode. ``bleach`` remains the actual security
+          boundary; this is defence in depth.
+        - Tables are intentionally not enabled: the bleach allowlist
+          does not include ``<table>``, so enabling the plugin would
+          just produce content stripped of its tags. If we ever want
+          tables, both sides (parser + allowlist) need widening
+          together.
+        """
+        self._md: MarkdownIt = MarkdownIt(
+            "commonmark", options_update={"html": False}
+        )
+
+    def render(self, md: str) -> str:
+        """Render ``md`` to sanitized HTML.
+
+        Parameters
+        ----------
+        md:
+            Markdown source, typically from an admin edit form or a
+            seed file. Treated as untrusted.
+
+        Returns
+        -------
+        str
+            HTML safe to render with Jinja autoescape disabled. The
+            output contains only tags / attributes / protocols from
+            the module-level allowlists; anything else is stripped
+            (``strip=True``) rather than escaped.
+        """
+        raw_html = self._md.render(md)
+        # ``strip=True`` removes disallowed tags entirely (drops the
+        # tag but keeps text content). This is a deliberate choice
+        # over ``strip=False``, which would escape disallowed tags
+        # into literal text — ugly for users.
+        return bleach.clean(
+            raw_html,
+            tags=_ALLOWED_TAGS,
+            attributes=_ALLOWED_ATTRS,
+            protocols=_ALLOWED_PROTOCOLS,
+            strip=True,
+        )
+
+
+def render_markdown_safe(md: str) -> str:
+    """Render ``md`` to sanitized HTML using a one-shot service.
+
+    A fresh :class:`MarkdownService` is built on every call, which is
+    fine for infrequent callers such as tests and the seed script.
+    Code on a hot path should hold on to a single service instance
+    and call its ``render`` method instead.
+    """
+    service = MarkdownService()
+    return service.render(md)
--- a/app/services/pages.py
+++ b/app/services/pages.py
@@ -0,0 +1,101 @@
+"""Static-page read service (About, etc.).
+
+Wraps the ``pages`` table with a 60 s TTL cache keyed by slug. Admin
+writes in Phase 4 invalidate via :meth:`PageService.invalidate_all`.
+
+Public contract:
+
+- :meth:`PageService.get_by_slug` returns a :class:`Page` or ``None``.
+- :meth:`PageService.invalidate_all` clears the TTL cache.
+- :func:`get_page_service` pulls the app-scoped instance off the
+  FastAPI app state; tests can override via
+  ``app.dependency_overrides``.
+"""
+
+from __future__ import annotations
+
+from typing import Optional
+
+from fastapi import Request
+from sqlalchemy import Engine, text
+
+from app.models.entities import Page
+from app.models.mappers import row_to_page
+from app.services.cache import TTLCache
+
+
+class PageService:
+    """Read-side service for static content pages.
+
+    Parameters
+    ----------
+    engine:
+        Shared SQLAlchemy engine. Stored by reference; the service
+        never opens its own engine.
+    ttl_seconds:
+        Cache TTL in seconds. Default 60 s per the ROADMAP caching
+        strategy.
+    """
+
+    def __init__(self, engine: Engine, ttl_seconds: float = 60.0) -> None:
+        self._engine: Engine = engine
+        # Each cached value is a 1-tuple ``(page_or_none,)``. The
+        # wrapper is what makes negative caching work: the cache
+        # reports a miss as ``None``, so a bare ``None`` could never
+        # be told apart from "not cached". Caching the "no such
+        # page" result is intentional — it prevents a pathological
+        # hot-404 workload from hammering SQLite.
+        self._cache: TTLCache[str, tuple[Optional[Page]]] = TTLCache(
+            ttl_seconds
+        )
+
+    def get_by_slug(self, slug: str) -> Optional[Page]:
+        """Return the page with ``slug`` or ``None`` if absent.
+
+        Hot path:
+        1. TTL-cache lookup keyed by slug.
+        2. On miss: one parameterized SELECT; row mapped through
+           :func:`app.models.mappers.row_to_page`.
+        3. Result (including ``None``) cached for the configured TTL.
+
+        SQL uses a ``:bind`` parameter (see CWE-89 in
+        ``docs/security.md``); no string interpolation of user
+        input.
+
+        NOTE(review): the query does not filter on ``published``;
+        presumably callers check that flag — confirm.
+        """
+        entry = self._cache.get(slug)
+        if entry is not None:
+            # Cache hit. The wrapped value may itself be ``None``,
+            # meaning "we already looked and the page does not exist".
+            return entry[0]
+
+        with self._engine.connect() as conn:
+            row = conn.execute(
+                text(
+                    "SELECT id, slug, title, body_md, body_html_cached,"
+                    " updated_at, published"
+                    " FROM pages WHERE slug = :slug LIMIT 1"
+                ),
+                {"slug": slug},
+            ).mappings().first()
+
+        page = row_to_page(row) if row is not None else None
+        self._cache.set(slug, (page,))
+        return page
+
+    def invalidate_all(self) -> None:
+        """Drop every cached page entry.
+
+        Called from Phase 4 admin write paths after a page edit or
+        publish-toggle; until those paths exist nothing calls it,
+        and calling it on an empty cache is harmless.
+        """
+        self._cache.invalidate_all()
+
+
+def get_page_service(request: Request) -> PageService:
+    """FastAPI dependency returning the app-scoped :class:`PageService`.
+
+    :func:`app.main.create_app` builds the service once and parks it
+    on ``app.state.page_service``; this function only reads it back.
+    Tests swap it out through
+    ``app.dependency_overrides[get_page_service]``.
+    """
+    app_state = request.app.state
+    return app_state.page_service
--- a/app/services/posts.py
+++ b/app/services/posts.py
@@ -1,61 +1,174 @@
-"""Blog post service layer.
+"""Blog post read service.

-Phase 1 ships a stub: :meth:`PostService.list_published` returns an empty
-list so the home page renders cleanly without a database. Phase 2 will
-replace the stub with a real SQLite-backed implementation. The public
-method signature and return type (`list[PostSummary]`) are frozen now so
-route and template code written in Phase 1 won't need to change when the
-DB arrives.
+Phase 2 replaces the Phase 1 empty-list stub with a real SQLite-backed
+implementation. The public method signature on
+:meth:`PostService.list_published` is unchanged — routes and templates
+written in Phase 1 continue to work.
+
+Public contract:
+
+- :meth:`PostService.list_published` returns ``list[PostSummary]``.
+- :meth:`PostService.invalidate_all` clears the TTL cache (Phase 4).
+- :func:`get_post_service` pulls the app-scoped instance off the
+  FastAPI app state.
 """

 from __future__ import annotations

+import re
+from typing import Optional
+
+from fastapi import Request
+from sqlalchemy import Engine, text
+
+from app.models.entities import PostStatus
 from app.models.posts import PostSummary
+from app.models.mappers import _parse_datetime
+from app.services.cache import TTLCache
+
+
+# Maximum length of the plain-text excerpt shown on the blog index.
+# Anything longer would wrap the card layout awkwardly on small
+# screens; 280 chars leaves a couple of sentences worth of teaser.
+_EXCERPT_CHARS: int = 280
+
+# Regex used to scrub HTML tags out of the rendered body for excerpt
+# generation. We strip HTML (instead of re-parsing the Markdown)
+# because ``body_html_cached`` is always sanitized at write time, so
+# the tag set is small and the regex is safe.
+_TAG_RE: re.Pattern[str] = re.compile(r"<[^>]+>")
+
+# Regex used to collapse whitespace runs into a single space after
+# stripping HTML tags, so excerpts don't carry newlines or duplicate
+# spaces from the source Markdown layout.
+_WS_RE: re.Pattern[str] = re.compile(r"\s+")
+
+
+def _build_excerpt(body_md: str, body_html_cached: str) -> str:
+    """Build a short plaintext teaser from the cached HTML.
+
+    Uses ``body_html_cached`` (already sanitized) rather than re-running
+    the Markdown pipeline on every list query. Tags are removed and
+    HTML entities are decoded, so the result is genuine plain text:
+    without the decode step an ``&amp;`` or ``&quot;`` in the body
+    would be escaped a second time by Jinja's autoescape and shown
+    literally to the reader.
+
+    If the cached HTML is empty we fall back to the raw Markdown so
+    the excerpt isn't blank. Only angle-bracket tags are removed in
+    that case; Markdown syntax characters are left as written.
+
+    Parameters
+    ----------
+    body_md:
+        Raw Markdown source; used only as the fallback.
+    body_html_cached:
+        Sanitized HTML rendered from ``body_md`` at write time.
+
+    Returns
+    -------
+    str
+        Whitespace-collapsed text of at most ``_EXCERPT_CHARS``
+        characters plus a trailing ellipsis when truncated.
+    """
+    # Function-scope import keeps this helper self-contained.
+    import html
+
+    if body_html_cached:
+        # Strip tags first, then decode entities. The other order
+        # would turn an escaped ``&lt;b&gt;`` into ``<b>`` and the
+        # tag regex would then eat text the author meant to show.
+        text_only = html.unescape(_TAG_RE.sub(" ", body_html_cached))
+    else:
+        # ``or ""`` tolerates a NULL column instead of raising.
+        text_only = _TAG_RE.sub(" ", body_md or "")
+    collapsed = _WS_RE.sub(" ", text_only).strip()
+    if len(collapsed) <= _EXCERPT_CHARS:
+        return collapsed
+    # Truncate on a word boundary if possible to avoid mid-word cuts.
+    truncated = collapsed[:_EXCERPT_CHARS]
+    last_space = truncated.rfind(" ")
+    # Only back up to the space when it sits in the second half of
+    # the window; otherwise one very long word would shrink the
+    # excerpt to almost nothing.
+    if last_space > _EXCERPT_CHARS // 2:
+        truncated = truncated[:last_space]
+    return truncated.rstrip() + "\u2026"  # ellipsis


 class PostService:
    """Read-side service for published blog posts.

-    The service is intentionally stateless in Phase 1. Phase 2 will give
-    it a SQLite connection (or connection factory) via constructor
-    injection; callers obtain an instance through :func:`get_post_service`
-    so the swap is transparent to the routes that depend on it.
+    Parameters
+    ----------
+    engine:
+        Shared SQLAlchemy engine.
+    ttl_seconds:
+        Cache TTL in seconds; default 60 s matches the ROADMAP.
    """

+    def __init__(self, engine: Engine, ttl_seconds: float = 60.0) -> None:
+        self._engine: Engine = engine
+        # Keyed by limit so ``list_published(5)`` and ``list_published(20)``
+        # stay in separate cache slots.
+        self._cache: TTLCache[int, list[PostSummary]] = TTLCache(ttl_seconds)
+
    def list_published(self, limit: int = 20) -> list[PostSummary]:
-        """Return up to ``limit`` published posts, most recent first.
+        """Return up to ``limit`` published posts, newest first.

        Parameters
        ----------
        limit:
-            Maximum number of summaries to return. Kept in the signature
-            now (even though the stub ignores it) so Phase 2's real
-            implementation is a drop-in replacement.
+            Maximum rows to return. Clamped to ``[1, 100]`` to keep
+            pathological callers from dumping the full table.

        Returns
        -------
        list[PostSummary]
-            Currently always an empty list. The template treats an empty
-            list as the "no posts yet" state.
+            Immutable summary records; an empty list when the site
+            has no published posts (the template renders an
+            appropriate empty state).
+
+        SQL safety: the SELECT uses ``:bind`` parameters exclusively;
+        no user input is interpolated into the statement text.
        """
-        # Phase 1 stub: no DB, no posts. Phase 2 will issue a parameterized
-        # SELECT against the `posts` table filtered by status='published'
-        # and ordered by published_at DESC.
-        return []
+        # Defensive clamp; the public template only passes 20 but
+        # future callers could pass arbitrary values.
+        safe_limit = max(1, min(int(limit), 100))
+
+        cached = self._cache.get(safe_limit)
+        if cached is not None:
+            return cached
+
+        with self._engine.connect() as conn:
+            rows = (
+                conn.execute(
+                    text(
+                        "SELECT slug, title, published_at, body_md,"
+                        " body_html_cached"
+                        " FROM posts"
+                        " WHERE status = :status"
+                        " ORDER BY published_at DESC"
+                        " LIMIT :limit"
+                    ),
+                    {
+                        "status": PostStatus.PUBLISHED.value,
+                        "limit": safe_limit,
+                    },
+                )
+                .mappings()
+                .all()
+            )
+
+        summaries: list[PostSummary] = []
+        for row in rows:
+            published_at_str: Optional[str] = row["published_at"]
+            # A row with status='published' should never have NULL
+            # published_at; if it does, skip it rather than crash the
+            # homepage. Phase 4's admin flow enforces this invariant
+            # at write time.
+            if published_at_str is None:
+                continue
+            summaries.append(
+                PostSummary(
+                    slug=row["slug"],
+                    title=row["title"],
+                    published_at=_parse_datetime(published_at_str),
+                    excerpt=_build_excerpt(
+                        row["body_md"], row["body_html_cached"]
+                    ),
+                )
+            )
+
+        self._cache.set(safe_limit, summaries)
+        return summaries
+
+    def invalidate_all(self) -> None:
+        """Drop every cached post-list entry.
+
+        Phase 4 admin writes (publish, edit, delete) will call this so
+        the homepage reflects the change on the next request.
+        """
+        self._cache.invalidate_all()


-# Module-level singleton. The service is stateless in Phase 1, so one
-# instance is safe to share across requests. Phase 2 may relocate this
-# behind a factory if per-request scoping becomes useful.
-_post_service: PostService = PostService()
+def get_post_service(request: Request) -> PostService:
+    """FastAPI dependency: pull the app-scoped :class:`PostService`.

-
-def get_post_service() -> PostService:
-    """Return the shared :class:`PostService` for FastAPI dependency injection.
-
-    Keeping this as a module-level function (rather than instantiating a
-    fresh service on every request) means FastAPI's ``Depends`` wiring
-    pays no construction cost on the hot path, and tests can override the
-    dependency via ``app.dependency_overrides[get_post_service]``.
+    Instantiated once in :func:`app.main.create_app` and stored on
+    ``app.state.post_service``. Tests override via
+    ``app.dependency_overrides[get_post_service]``.
    """
-    return _post_service
+    return request.app.state.post_service