feat: phase 2 content model + cache — SQLite schema, markdown, TTL

Stand up the full SQLite content layer: all 7 tables from the authoritative
schema with WAL + foreign-keys enforced per-connection, entity dataclasses
plus row mappers, hand-rolled versioned migrations tracked in
schema_migrations, and an idempotent Python seed (system user + welcome
post + About page).

Add a Markdown->HTML service using markdown-it-py with a strict bleach
allowlist (tables intentionally omitted on both sides). Add a typed
in-process TTLCache[K,V] and wire it into real DB-backed PostService and
PageService, both exposing invalidate_all() for Phase 4 admin writes.

Rewire / and /about to read from the DB; homepage renders the seeded
welcome post, About renders page.title + sanitized body_html_cached.
Update the Phase 1 route tests accordingly.

Mark Phase 2 complete in docs/ROADMAP.md.
This commit is contained in:
2026-04-21 15:40:35 -05:00
parent 28168f57b6
commit 0306f71763
21 changed files with 2055 additions and 108 deletions

88
app/services/cache.py Normal file
View File

@@ -0,0 +1,88 @@
"""In-process, generic TTL cache.
Small, typed, and deliberately boring. Used by :mod:`app.services.posts`
and :mod:`app.services.pages` to sit in front of the hottest queries
(published-posts list, page-by-slug); a 60 s default TTL keeps the
site's three-digit daily requests out of the SQLite query path without
any cross-process coordination.
Not thread-safe in the strict sense — Python's GIL makes the dict
operations atomic at CPython bytecode granularity, and worst case a
concurrent writer causes a benign duplicate DB read. That is
acceptable at this scale; if the site ever grows teeth we can revisit.
"""
from __future__ import annotations
import time
from typing import Generic, Hashable, Optional, TypeVar
# TypeVar bound to ``Hashable`` so callers cannot accidentally key by a
# mutable collection (which would later look up with a different hash
# after mutation and silently miss the cache).
K = TypeVar("K", bound=Hashable)
V = TypeVar("V")


class TTLCache(Generic[K, V]):
    """Tiny TTL-based dict-style cache.

    Entries expire ``ttl_seconds`` after insertion.  Expired entries
    are dropped lazily on access — there is no background sweep, and
    the cache is not bounded in size.  For our workload (at most a
    few dozen keys per instance) this is fine.

    Three operations are public:

    - :meth:`get` returns the cached value, or ``default`` (``None``
      unless overridden) when the key is absent or expired.
    - :meth:`set` stores a value with an expiry.
    - :meth:`invalidate_all` clears every entry; used by admin-write
      paths in Phase 4.
    """

    def __init__(self, ttl_seconds: float = 60.0) -> None:
        """Construct an empty cache.

        Parameters
        ----------
        ttl_seconds:
            Time-to-live for every entry, in seconds.  60 s matches the
            "Caching Strategy" section of ``docs/ROADMAP.md``.

        Raises
        ------
        ValueError
            If ``ttl_seconds`` is zero or negative.
        """
        if ttl_seconds <= 0:
            # Defensive: a zero/negative TTL would mean every write
            # instantly expires, which almost always indicates a bug.
            raise ValueError("ttl_seconds must be positive")
        self._ttl: float = float(ttl_seconds)
        # Stored as (expiry_monotonic_ts, value).  Using
        # ``time.monotonic`` avoids issues if the wall clock jumps.
        self._store: dict[K, tuple[float, V]] = {}

    def get(self, key: K, default: Optional[V] = None) -> Optional[V]:
        """Return the cached value for ``key``, or ``default`` on a miss.

        A miss means the key was never stored or its entry has expired.
        Expired entries are deleted as a side effect of the lookup so
        a re-requested stale key does not linger in the store.

        Parameters
        ----------
        key:
            Cache key to look up.
        default:
            Value returned on a miss.  Defaults to ``None``, which keeps
            the one-argument call backward compatible.  Callers that
            legitimately store ``None`` can pass a private sentinel here
            to tell "cached ``None``" apart from "not cached".
        """
        entry = self._store.get(key)
        if entry is None:
            # Stored entries are always 2-tuples, so ``None`` here can
            # only mean the key is absent.
            return default
        expiry, value = entry
        if time.monotonic() >= expiry:
            # Expired — drop lazily and report a miss.
            self._store.pop(key, None)
            return default
        return value

    def set(self, key: K, value: V) -> None:
        """Store ``value`` under ``key`` with the configured TTL."""
        self._store[key] = (time.monotonic() + self._ttl, value)

    def invalidate_all(self) -> None:
        """Drop every cached entry.

        Called by the Phase 4 admin write path so readers see the new
        content on the very next request, not up to one TTL later.
        """
        self._store.clear()

125
app/services/markdown.py Normal file
View File

@@ -0,0 +1,125 @@
"""Markdown rendering with a strict sanitization allowlist.
CWE-79 mitigation: user-authored Markdown is first rendered to HTML by
``markdown-it-py`` (commonmark profile, tables not enabled, no raw-HTML
pass through), then the resulting HTML is filtered by ``bleach`` against
an explicit tag / attribute / protocol allowlist. Anything not on the
list is stripped — never escaped — so the stored ``body_html_cached``
is always safe to render inside an ``autoescape=False`` Jinja block.
The pipeline runs both on admin writes (Phase 4) and at seed time
(Phase 2).
"""
from __future__ import annotations
from typing import Final
import bleach
from markdown_it import MarkdownIt
# --- Sanitization allowlist ------------------------------------------------
# Kept at module scope so tests can assert against them. The tag and
# protocol allowlists are frozensets, so downstream callers cannot
# mutate them by accident. ``_ALLOWED_ATTRS`` is a plain ``dict``:
# ``Final`` only stops rebinding under a type checker, so treat the
# mapping as read-only by convention.
# Do not widen without a security review; in particular:
#
# - No ``style`` or ``class`` attributes (CSS injection / theme attack
# surface for future admin UIs).
# - No ``script``, ``iframe``, ``object``, ``embed``, ``form``, etc.
# - No ``data:`` / ``javascript:`` protocols.
# Tags that may survive sanitization; table tags are absent on purpose.
_ALLOWED_TAGS: Final[frozenset[str]] = frozenset(
{
"p",
"br",
"strong",
"em",
"a",
"ul",
"ol",
"li",
"h1",
"h2",
"h3",
"h4",
"blockquote",
"code",
"pre",
"img",
"hr",
}
)
# Per-tag attribute allowlist; tags not listed here keep no attributes.
_ALLOWED_ATTRS: Final[dict[str, list[str]]] = {
"a": ["href", "title", "rel"],
"img": ["src", "alt", "title", "width", "height"],
}
# URL schemes permitted in the sanitized output.
_ALLOWED_PROTOCOLS: Final[frozenset[str]] = frozenset(
{"http", "https", "mailto"}
)
class MarkdownService:
    """Render Markdown to HTML, then sanitize against the allowlist.

    One ``MarkdownIt`` instance per service instance — creating these
    is cheap but non-trivial, so we reuse.  The service holds no state
    beyond that parser configuration.
    """

    def __init__(self) -> None:
        """Configure the Markdown parser.

        - ``commonmark`` preset with raw HTML explicitly disabled.  The
          preset itself turns the ``html`` option ON (the CommonMark
          spec requires raw-HTML support), so the override below is
          what makes raw HTML in the source render as escaped text —
          the safe failure mode.  bleach still runs afterwards as the
          second layer of defence.
        - Tables are intentionally not enabled: the bleach allowlist
          does not include ``<table>``, so enabling the plugin would
          just produce content stripped of its tags.  If we ever want
          tables, both sides (parser + allowlist) need widening
          together.
        """
        # ``options_update`` is applied on top of the preset's options.
        self._md: MarkdownIt = MarkdownIt("commonmark", {"html": False})

    def render(self, md: str) -> str:
        """Render ``md`` to sanitized HTML.

        Parameters
        ----------
        md:
            Markdown source, typically from an admin edit form or a
            seed file.  Treated as untrusted.

        Returns
        -------
        str
            HTML containing only tags / attributes / protocols from
            the module-level allowlists; anything else the parser
            emits is stripped (``strip=True``) rather than escaped.
        """
        raw_html = self._md.render(md)
        # ``strip=True`` removes disallowed tags entirely (drops the
        # tag but keeps text content).  This is a deliberate choice
        # over ``strip=False``, which would escape disallowed tags
        # into literal text — ugly for users.
        return bleach.clean(
            raw_html,
            tags=_ALLOWED_TAGS,
            attributes=_ALLOWED_ATTRS,
            protocols=_ALLOWED_PROTOCOLS,
            strip=True,
        )
def render_markdown_safe(md: str) -> str:
    """Render ``md`` once through a freshly built :class:`MarkdownService`.

    A new service is constructed on every call, which is fine for
    occasional callers such as tests and the seed script.  Code on a
    hot path should build one :class:`MarkdownService` and reuse it.
    """
    one_shot_service = MarkdownService()
    return one_shot_service.render(md)

101
app/services/pages.py Normal file
View File

@@ -0,0 +1,101 @@
"""Static-page read service (About, etc.).
Wraps the ``pages`` table with a 60 s TTL cache keyed by slug. Admin
writes in Phase 4 invalidate via :meth:`PageService.invalidate_all`.
Public contract:
- :meth:`PageService.get_by_slug` returns a :class:`Page` or ``None``.
- :meth:`PageService.invalidate_all` clears the TTL cache.
- :func:`get_page_service` pulls the app-scoped instance off the
FastAPI app state; tests can override via
``app.dependency_overrides``.
"""
from __future__ import annotations
from typing import Optional
from fastapi import Request
from sqlalchemy import Engine, text
from app.models.entities import Page
from app.models.mappers import row_to_page
from app.services.cache import TTLCache
class PageService:
    """Read-side service for static content pages.

    Parameters
    ----------
    engine:
        Shared SQLAlchemy engine.  Stored by reference; the service
        never opens its own engine.
    ttl_seconds:
        Cache TTL in seconds.  Default 60 s per the ROADMAP caching
        strategy.
    """

    def __init__(self, engine: Engine, ttl_seconds: float = 60.0) -> None:
        self._engine: Engine = engine
        # Each cache value is a 1-tuple boxing the lookup result.  The
        # cache reports a miss as ``None``, so boxing is what lets a
        # cached "no such page" (``(None,)``) be told apart from "not
        # cached".  Caching the negative result is intentional — it
        # prevents a pathological hot-404 workload from hammering
        # SQLite.
        # NOTE(review): the cache is unbounded, so a stream of distinct
        # unknown slugs grows it until entries are invalidated.
        self._cache: TTLCache[str, tuple[Optional[Page]]] = TTLCache(
            ttl_seconds
        )

    def get_by_slug(self, slug: str) -> Optional[Page]:
        """Return the page with ``slug`` or ``None`` if absent.

        Hot path:

        1. TTL-cache lookup keyed by slug.  A hit is returned as-is,
           including a cached ``None`` for a slug known to be absent.
        2. On miss: one parameterized SELECT; row mapped through
           :func:`app.models.mappers.row_to_page`.
        3. Result (including ``None``) cached for the configured TTL.

        SQL uses a ``:bind`` parameter (see CWE-89 in
        ``docs/security.md``); no string interpolation of user
        input.
        """
        boxed = self._cache.get(slug)
        if boxed is not None:
            # Cache hit; the boxed value may itself be ``None``.
            return boxed[0]

        with self._engine.connect() as conn:
            row = (
                conn.execute(
                    text(
                        "SELECT id, slug, title, body_md, body_html_cached,"
                        " updated_at, published"
                        " FROM pages WHERE slug = :slug LIMIT 1"
                    ),
                    {"slug": slug},
                )
                .mappings()
                .first()
            )

        page = row_to_page(row) if row is not None else None
        self._cache.set(slug, (page,))
        return page

    def invalidate_all(self) -> None:
        """Drop every cached page entry, positive and negative.

        Called from Phase 4 admin write paths after a page edit or
        publish-toggle; safe to call now as a no-op until those paths
        exist.
        """
        self._cache.invalidate_all()
def get_page_service(request: Request) -> PageService:
    """FastAPI dependency returning the app-scoped :class:`PageService`.

    :func:`app.main.create_app` builds the service once and parks it
    on ``app.state.page_service``; this function only reads it back
    from the request's application.  Tests swap it out through
    ``app.dependency_overrides[get_page_service]``.
    """
    app_state = request.app.state
    return app_state.page_service

View File

@@ -1,61 +1,174 @@
"""Blog post service layer.
"""Blog post read service.
Phase 1 shipped a stub: :meth:`PostService.list_published` returned an
empty list so the home page rendered cleanly without a database.
Phase 2 replaces that stub with a real SQLite-backed implementation.
The public method signature and return type (``list[PostSummary]``) on
:meth:`PostService.list_published` are unchanged — routes and templates
written in Phase 1 continue to work.
Public contract:
- :meth:`PostService.list_published` returns ``list[PostSummary]``.
- :meth:`PostService.invalidate_all` clears the TTL cache (Phase 4).
- :func:`get_post_service` pulls the app-scoped instance off the
FastAPI app state.
"""
from __future__ import annotations
import re
from typing import Optional
from fastapi import Request
from sqlalchemy import Engine, text
from app.models.entities import PostStatus
from app.models.posts import PostSummary
from app.models.mappers import _parse_datetime
from app.services.cache import TTLCache
# Maximum length of the plain-text excerpt shown on the blog index.
# Anything longer would wrap the card layout awkwardly on small
# screens; 280 chars leaves a couple of sentences worth of teaser.
_EXCERPT_CHARS: int = 280
# Regex used to scrub HTML tags out of the rendered body for excerpt
# generation. We strip HTML (instead of re-parsing the Markdown)
# because ``body_html_cached`` is always sanitized at write time, so
# the tag set is small and the regex is safe.
_TAG_RE: re.Pattern[str] = re.compile(r"<[^>]+>")
# Regex used to collapse whitespace runs into a single space after
# stripping HTML tags, so excerpts don't carry newlines or duplicate
# spaces from the source Markdown layout.
_WS_RE: re.Pattern[str] = re.compile(r"\s+")
def _build_excerpt(body_md: str, body_html_cached: str) -> str:
"""Build a short plaintext teaser from the cached HTML.
Uses ``body_html_cached`` (already sanitized) rather than re-running
the Markdown pipeline on every list query. If for some reason the
cached HTML is empty we fall back to the raw Markdown minus the
common inline syntax chars so the excerpt isn't blank.
"""
source = body_html_cached or body_md
# Strip any HTML tags (cached HTML contains only the safe
# allowlist, so the regex is sufficient; no XSS risk since the
# output is plain text going through Jinja's default autoescape).
text_only = _TAG_RE.sub(" ", source)
collapsed = _WS_RE.sub(" ", text_only).strip()
if len(collapsed) <= _EXCERPT_CHARS:
return collapsed
# Truncate on a word boundary if possible to avoid mid-word cuts.
truncated = collapsed[:_EXCERPT_CHARS]
last_space = truncated.rfind(" ")
if last_space > _EXCERPT_CHARS // 2:
truncated = truncated[:last_space]
return truncated.rstrip() + "\u2026" # ellipsis
class PostService:
    """Read-side service for published blog posts.

    Parameters
    ----------
    engine:
        Shared SQLAlchemy engine.
    ttl_seconds:
        Cache TTL in seconds; default 60 s matches the ROADMAP.
    """

    def __init__(self, engine: Engine, ttl_seconds: float = 60.0) -> None:
        self._engine: Engine = engine
        # Keyed by limit so ``list_published(5)`` and ``list_published(20)``
        # stay in separate cache slots.
        self._cache: TTLCache[int, list[PostSummary]] = TTLCache(ttl_seconds)

    def list_published(self, limit: int = 20) -> list[PostSummary]:
        """Return up to ``limit`` published posts, newest first.

        Parameters
        ----------
        limit:
            Maximum rows to return.  Clamped to ``[1, 100]`` to keep
            pathological callers from dumping the full table.

        Returns
        -------
        list[PostSummary]
            Summary records; an empty list when the site has no
            published posts (the template renders an appropriate
            empty state).  The same list object is handed to every
            caller until the cache entry expires, so callers must not
            mutate it.

        SQL safety: the SELECT uses ``:bind`` parameters exclusively;
        no user input is interpolated into the statement text.
        """
        # Defensive clamp; the public template only passes 20 but
        # future callers could pass arbitrary values.
        safe_limit = max(1, min(int(limit), 100))

        cached = self._cache.get(safe_limit)
        if cached is not None:
            # An empty list is a valid cached result and is returned too.
            return cached

        with self._engine.connect() as conn:
            rows = (
                conn.execute(
                    text(
                        "SELECT slug, title, published_at, body_md,"
                        " body_html_cached"
                        " FROM posts"
                        " WHERE status = :status"
                        " ORDER BY published_at DESC"
                        " LIMIT :limit"
                    ),
                    {
                        "status": PostStatus.PUBLISHED.value,
                        "limit": safe_limit,
                    },
                )
                .mappings()
                .all()
            )

        summaries: list[PostSummary] = []
        for row in rows:
            published_at_str: Optional[str] = row["published_at"]
            # A row with status='published' should never have NULL
            # published_at; if it does, skip it rather than crash the
            # homepage.  Phase 4's admin flow enforces this invariant
            # at write time.
            if published_at_str is None:
                continue
            summaries.append(
                PostSummary(
                    slug=row["slug"],
                    title=row["title"],
                    published_at=_parse_datetime(published_at_str),
                    excerpt=_build_excerpt(
                        row["body_md"], row["body_html_cached"]
                    ),
                )
            )

        self._cache.set(safe_limit, summaries)
        return summaries

    def invalidate_all(self) -> None:
        """Drop every cached post-list entry.

        Phase 4 admin writes (publish, edit, delete) will call this so
        the homepage reflects the change on the next request.
        """
        self._cache.invalidate_all()
def get_post_service(request: Request) -> PostService:
    """FastAPI dependency: pull the app-scoped :class:`PostService`.

    Instantiated once in :func:`app.main.create_app` and stored on
    ``app.state.post_service``.  Tests override via
    ``app.dependency_overrides[get_post_service]``.

    There is deliberately no module-level singleton any more:
    :class:`PostService` now requires an ``engine`` argument, so it
    cannot be constructed at import time.
    """
    return request.app.state.post_service