feat: phase 2 content model + cache — SQLite schema, markdown, TTL
Stand up the full SQLite content layer: all 7 tables from the authoritative schema with WAL + foreign-keys enforced per-connection, entity dataclasses plus row mappers, hand-rolled versioned migrations tracked in schema_migrations, and an idempotent Python seed (system user + welcome post + About page). Add a Markdown->HTML service using markdown-it-py with a strict bleach allowlist (tables intentionally omitted on both sides). Add a typed in-process TTLCache[K,V] and wire it into real DB-backed PostService and PageService, both exposing invalidate_all() for Phase 4 admin writes. Rewire / and /about to read from the DB; homepage renders the seeded welcome post, About renders page.title + sanitized body_html_cached. Update the Phase 1 route tests accordingly. Mark Phase 2 complete in docs/ROADMAP.md.
This commit is contained in:
88
app/services/cache.py
Normal file
88
app/services/cache.py
Normal file
@@ -0,0 +1,88 @@
|
||||
"""In-process, generic TTL cache.
|
||||
|
||||
Small, typed, and deliberately boring. Used by :mod:`app.services.posts`
|
||||
and :mod:`app.services.pages` to sit in front of the hottest queries
|
||||
(published-posts list, page-by-slug); a 60 s default TTL keeps the
|
||||
site's three-digit daily requests out of the SQLite query path without
|
||||
any cross-process coordination.
|
||||
|
||||
Not thread-safe in the strict sense — Python's GIL makes the dict
|
||||
operations atomic at CPython bytecode granularity, and worst case a
|
||||
concurrent writer causes a benign duplicate DB read. That is
|
||||
acceptable at this scale; if the site ever grows teeth we can revisit.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import time
|
||||
from typing import Generic, Hashable, Optional, TypeVar
|
||||
|
||||
# TypeVar bound to ``Hashable`` so callers cannot accidentally key by a
# mutable collection (which would later look up with a different hash
# after mutation and silently miss the cache).
K = TypeVar("K", bound=Hashable)
V = TypeVar("V")


class TTLCache(Generic[K, V]):
    """Tiny TTL-based dict-style cache.

    Entries expire ``ttl_seconds`` after insertion. An expired entry is
    dropped when it is looked up, and :meth:`set` additionally sweeps
    every expired entry at most once per TTL interval, so keys that are
    written once and never read again (e.g. one-off 404 slugs) do not
    accumulate for the lifetime of the process. There is no background
    thread and no hard bound on the number of live entries.

    Four operations are public:

    - :meth:`get` returns the cached value or ``None``.
    - :meth:`set` stores a value with an expiry.
    - :meth:`purge_expired` drops every expired entry immediately.
    - :meth:`invalidate_all` clears every entry; used by admin-write
      paths in Phase 4.

    Note that :meth:`get` reports a miss as ``None``, so a stored ``None``
    is indistinguishable from "absent". Callers that need negative
    caching should wrap the value (for example in a 1-tuple).
    """

    def __init__(self, ttl_seconds: float = 60.0) -> None:
        """Construct an empty cache.

        Parameters
        ----------
        ttl_seconds:
            Time-to-live for every entry, in seconds. 60 s matches the
            "Caching Strategy" section of ``docs/ROADMAP.md``.

        Raises
        ------
        ValueError
            If ``ttl_seconds`` is zero or negative.
        """
        if ttl_seconds <= 0:
            # Defensive: a zero/negative TTL would mean every write
            # instantly expires, which almost always indicates a bug.
            raise ValueError("ttl_seconds must be positive")
        self._ttl: float = float(ttl_seconds)
        # Stored as (expiry_monotonic_ts, value). Using
        # ``time.monotonic`` avoids issues if the wall clock jumps.
        self._store: dict[K, tuple[float, V]] = {}
        # Earliest monotonic time at which ``set`` runs the next sweep.
        self._next_sweep: float = time.monotonic() + self._ttl

    def get(self, key: K) -> Optional[V]:
        """Return the cached value for ``key`` or ``None`` if absent/expired.

        An expired entry is deleted as a side effect of the lookup.
        """
        entry = self._store.get(key)
        if entry is None:
            return None
        expiry, value = entry
        if time.monotonic() >= expiry:
            # Expired: drop lazily and report a miss.
            self._store.pop(key, None)
            return None
        return value

    def set(self, key: K, value: V) -> None:
        """Store ``value`` under ``key`` with the configured TTL.

        At most once per TTL interval this also sweeps expired entries,
        which keeps the store from growing with keys nobody reads again.
        """
        now = time.monotonic()
        if now >= self._next_sweep:
            self._sweep(now)
        self._store[key] = (now + self._ttl, value)

    def purge_expired(self) -> int:
        """Drop every expired entry now and return how many were removed."""
        return self._sweep(time.monotonic())

    def _sweep(self, now: float) -> int:
        """Remove entries whose expiry is at or before ``now``."""
        # Snapshot first so the dict is never resized while being iterated.
        snapshot = list(self._store.items())
        stale = [key for key, (expiry, _value) in snapshot if now >= expiry]
        for key in stale:
            self._store.pop(key, None)
        self._next_sweep = now + self._ttl
        return len(stale)

    def invalidate_all(self) -> None:
        """Drop every cached entry.

        Called by the Phase 4 admin write path so readers see the new
        content on the very next request, not up to one TTL later.
        """
        self._store.clear()
|
||||
125
app/services/markdown.py
Normal file
125
app/services/markdown.py
Normal file
@@ -0,0 +1,125 @@
|
||||
"""Markdown rendering with a strict sanitization allowlist.
|
||||
|
||||
CWE-79 mitigation: user-authored Markdown is first rendered to HTML by
|
||||
``markdown-it-py`` (commonmark profile, tables not enabled, no raw-HTML pass
|
||||
through), then the resulting HTML is filtered by ``bleach`` against an
|
||||
explicit tag / attribute / protocol allowlist. Anything not on the
|
||||
list is stripped — never escaped — so the stored ``body_html_cached``
|
||||
is always safe to render inside an ``autoescape=False`` Jinja block.
|
||||
|
||||
The pipeline runs both on admin writes (Phase 4) and at seed time
|
||||
(Phase 2).
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import Final
|
||||
|
||||
import bleach
|
||||
from markdown_it import MarkdownIt
|
||||
|
||||
# --- Sanitization allowlist ------------------------------------------------
|
||||
# Kept at module scope so tests can assert against them. The tag and
# protocol allowlists are frozensets; the attribute allowlist is a plain
# ``dict`` of lists, so treat it as read-only and never mutate it in
# place. Do not widen without a security review; in particular:
|
||||
#
|
||||
# - No ``style`` or ``class`` attributes (CSS injection / theme attack
|
||||
# surface for future admin UIs).
|
||||
# - No ``script``, ``iframe``, ``object``, ``embed``, ``form``, etc.
|
||||
# - No ``data:`` / ``javascript:`` protocols.
|
||||
_ALLOWED_TAGS: Final[frozenset[str]] = frozenset(
|
||||
{
|
||||
"p",
|
||||
"br",
|
||||
"strong",
|
||||
"em",
|
||||
"a",
|
||||
"ul",
|
||||
"ol",
|
||||
"li",
|
||||
"h1",
|
||||
"h2",
|
||||
"h3",
|
||||
"h4",
|
||||
"blockquote",
|
||||
"code",
|
||||
"pre",
|
||||
"img",
|
||||
"hr",
|
||||
}
|
||||
)
|
||||
|
||||
_ALLOWED_ATTRS: Final[dict[str, list[str]]] = {
|
||||
"a": ["href", "title", "rel"],
|
||||
"img": ["src", "alt", "title", "width", "height"],
|
||||
}
|
||||
|
||||
_ALLOWED_PROTOCOLS: Final[frozenset[str]] = frozenset(
|
||||
{"http", "https", "mailto"}
|
||||
)
|
||||
|
||||
|
||||
class MarkdownService:
    """Render Markdown to HTML, then sanitize against the allowlist.

    One ``MarkdownIt`` instance is created per service instance and
    reused by every :meth:`render` call. The service holds no other
    state; ``render`` does not write to the service object.
    """

    def __init__(self) -> None:
        """Configure the Markdown parser.

        - ``commonmark`` preset with the ``html`` option forced off.
          The preset itself turns ``html`` on (CommonMark allows raw
          HTML), so without the override raw ``<tags>`` in the source
          would reach the sanitizer verbatim. With it off, raw HTML is
          rendered as escaped text, which is the safe failure mode, and
          bleach remains a second line of defence rather than the only
          one.
        - Tables are intentionally not enabled: the bleach allowlist
          does not include ``<table>``, so enabling the plugin would
          just produce content stripped of its tags. If we ever want
          tables, both sides (parser + allowlist) need widening
          together.
        """
        self._md: MarkdownIt = MarkdownIt("commonmark", {"html": False})

    def render(self, md: str) -> str:
        """Render ``md`` to sanitized HTML.

        Parameters
        ----------
        md:
            Markdown source, typically from an admin edit form or a
            seed file. Treated as untrusted.

        Returns
        -------
        str
            HTML containing only tags / attributes / protocols from the
            module-level allowlists; anything else is stripped
            (``strip=True``) rather than escaped.
        """
        raw_html = self._md.render(md)
        # ``strip=True`` removes disallowed tags entirely (drops the
        # tag but keeps text content). This is a deliberate choice
        # over ``strip=False``, which would escape disallowed tags
        # into literal text.
        return bleach.clean(
            raw_html,
            tags=_ALLOWED_TAGS,
            attributes=_ALLOWED_ATTRS,
            protocols=_ALLOWED_PROTOCOLS,
            strip=True,
        )
|
||||
|
||||
|
||||
def render_markdown_safe(md: str) -> str:
    """Render ``md`` to sanitized HTML with a one-shot service.

    A fresh :class:`MarkdownService` is built for every call, which is
    fine for occasional callers such as tests or the seed script. Code
    on a hot path should build one service and keep it around instead.
    """
    one_shot = MarkdownService()
    return one_shot.render(md)
|
||||
101
app/services/pages.py
Normal file
101
app/services/pages.py
Normal file
@@ -0,0 +1,101 @@
|
||||
"""Static-page read service (About, etc.).
|
||||
|
||||
Wraps the ``pages`` table with a 60 s TTL cache keyed by slug. Admin
|
||||
writes in Phase 4 invalidate via :meth:`PageService.invalidate_all`.
|
||||
|
||||
Public contract:
|
||||
|
||||
- :meth:`PageService.get_by_slug` returns a :class:`Page` or ``None``.
|
||||
- :meth:`PageService.invalidate_all` clears the TTL cache.
|
||||
- :func:`get_page_service` pulls the app-scoped instance off the
|
||||
FastAPI app state; tests can override via
|
||||
``app.dependency_overrides``.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import Optional
|
||||
|
||||
from fastapi import Request
|
||||
from sqlalchemy import Engine, text
|
||||
|
||||
from app.models.entities import Page
|
||||
from app.models.mappers import row_to_page
|
||||
from app.services.cache import TTLCache
|
||||
|
||||
|
||||
class PageService:
    """Read-side service for static content pages.

    Parameters
    ----------
    engine:
        Shared SQLAlchemy engine. Stored by reference; the service
        never opens its own engine.
    ttl_seconds:
        Cache TTL in seconds. Default 60 s per the ROADMAP caching
        strategy.
    """

    def __init__(self, engine: Engine, ttl_seconds: float = 60.0) -> None:
        self._engine: Engine = engine
        # Each cached value is a 1-tuple ``(page_or_none,)``. The cache
        # reports a miss as ``None``, so wrapping is what lets a stored
        # "this slug does not exist" result be told apart from "not
        # cached". Negative caching is intentional: it keeps a hot-404
        # workload from hammering SQLite.
        self._cache: TTLCache[str, tuple[Optional[Page]]] = TTLCache(ttl_seconds)

    def get_by_slug(self, slug: str) -> Optional[Page]:
        """Return the page with ``slug`` or ``None`` if absent.

        Hot path:

        1. TTL-cache lookup keyed by slug; a hit is returned as-is,
           including a cached ``None`` for a slug known to be missing.
        2. On miss: one parameterized SELECT; row mapped through
           :func:`app.models.mappers.row_to_page`.
        3. Result (including ``None``) cached for the configured TTL.

        SQL uses a ``:bind`` parameter (see CWE-89 in
        ``docs/security.md``); no string interpolation of user
        input.
        """
        hit = self._cache.get(slug)
        if hit is not None:
            # Unwrap: hit[0] is either a Page or a cached negative result.
            return hit[0]

        with self._engine.connect() as conn:
            row = conn.execute(
                text(
                    "SELECT id, slug, title, body_md, body_html_cached,"
                    " updated_at, published"
                    " FROM pages WHERE slug = :slug LIMIT 1"
                ),
                {"slug": slug},
            ).mappings().first()

        page = row_to_page(row) if row is not None else None
        self._cache.set(slug, (page,))
        return page

    def invalidate_all(self) -> None:
        """Drop every cached page entry, positive and negative.

        Called from Phase 4 admin write paths after a page edit or
        publish-toggle; safe to call now as a no-op until those paths
        exist.
        """
        self._cache.invalidate_all()
|
||||
|
||||
|
||||
def get_page_service(request: Request) -> PageService:
    """FastAPI dependency returning the app-scoped :class:`PageService`.

    :func:`app.main.create_app` builds the service once and parks it on
    ``app.state.page_service``; this function just hands that instance
    back. Tests swap it out through
    ``app.dependency_overrides[get_page_service]``.
    """
    app_state = request.app.state
    return app_state.page_service
|
||||
@@ -1,61 +1,174 @@
|
||||
"""Blog post service layer.
|
||||
"""Blog post read service.
|
||||
|
||||
Phase 1 ships a stub: :meth:`PostService.list_published` returns an empty
|
||||
list so the home page renders cleanly without a database. Phase 2 will
|
||||
replace the stub with a real SQLite-backed implementation. The public
|
||||
method signature and return type (`list[PostSummary]`) are frozen now so
|
||||
route and template code written in Phase 1 won't need to change when the
|
||||
DB arrives.
|
||||
Phase 2 replaces the Phase 1 empty-list stub with a real SQLite-backed
|
||||
implementation. The public method signature on
|
||||
:meth:`PostService.list_published` is unchanged — routes and templates
|
||||
written in Phase 1 continue to work.
|
||||
|
||||
Public contract:
|
||||
|
||||
- :meth:`PostService.list_published` returns ``list[PostSummary]``.
|
||||
- :meth:`PostService.invalidate_all` clears the TTL cache (Phase 4).
|
||||
- :func:`get_post_service` pulls the app-scoped instance off the
|
||||
FastAPI app state.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
from typing import Optional
|
||||
|
||||
from fastapi import Request
|
||||
from sqlalchemy import Engine, text
|
||||
|
||||
from app.models.entities import PostStatus
|
||||
from app.models.posts import PostSummary
|
||||
from app.models.mappers import _parse_datetime
|
||||
from app.services.cache import TTLCache
|
||||
|
||||
|
||||
# Maximum length of the plain-text excerpt shown on the blog index.
|
||||
# Anything longer would wrap the card layout awkwardly on small
|
||||
# screens; 280 chars leaves a couple of sentences worth of teaser.
|
||||
_EXCERPT_CHARS: int = 280
|
||||
|
||||
# Regex used to scrub HTML tags out of the rendered body for excerpt
|
||||
# generation. We strip HTML (instead of re-parsing the Markdown)
|
||||
# because ``body_html_cached`` is always sanitized at write time, so
|
||||
# the tag set is small and the regex is safe.
|
||||
_TAG_RE: re.Pattern[str] = re.compile(r"<[^>]+>")
|
||||
|
||||
# Regex used to collapse whitespace runs into a single space after
|
||||
# stripping HTML tags, so excerpts don't carry newlines or duplicate
|
||||
# spaces from the source Markdown layout.
|
||||
_WS_RE: re.Pattern[str] = re.compile(r"\s+")
|
||||
|
||||
|
||||
def _build_excerpt(body_md: str, body_html_cached: str) -> str:
|
||||
"""Build a short plaintext teaser from the cached HTML.
|
||||
|
||||
Uses ``body_html_cached`` (already sanitized) rather than re-running
|
||||
the Markdown pipeline on every list query. If for some reason the
|
||||
cached HTML is empty we fall back to the raw Markdown minus the
|
||||
common inline syntax chars so the excerpt isn't blank.
|
||||
"""
|
||||
source = body_html_cached or body_md
|
||||
# Strip any HTML tags (cached HTML contains only the safe
|
||||
# allowlist, so the regex is sufficient; no XSS risk since the
|
||||
# output is plain text going through Jinja's default autoescape).
|
||||
text_only = _TAG_RE.sub(" ", source)
|
||||
collapsed = _WS_RE.sub(" ", text_only).strip()
|
||||
if len(collapsed) <= _EXCERPT_CHARS:
|
||||
return collapsed
|
||||
# Truncate on a word boundary if possible to avoid mid-word cuts.
|
||||
truncated = collapsed[:_EXCERPT_CHARS]
|
||||
last_space = truncated.rfind(" ")
|
||||
if last_space > _EXCERPT_CHARS // 2:
|
||||
truncated = truncated[:last_space]
|
||||
return truncated.rstrip() + "\u2026" # ellipsis
|
||||
|
||||
|
||||
class PostService:
    """Read-side service for published blog posts.

    Parameters
    ----------
    engine:
        Shared SQLAlchemy engine.
    ttl_seconds:
        Cache TTL in seconds; default 60 s matches the ROADMAP.
    """

    def __init__(self, engine: Engine, ttl_seconds: float = 60.0) -> None:
        self._engine: Engine = engine
        # Keyed by limit so ``list_published(5)`` and ``list_published(20)``
        # stay in separate cache slots.
        self._cache: TTLCache[int, list[PostSummary]] = TTLCache(ttl_seconds)

    def list_published(self, limit: int = 20) -> list[PostSummary]:
        """Return up to ``limit`` published posts, newest first.

        Parameters
        ----------
        limit:
            Maximum rows to return. Clamped to ``[1, 100]`` to keep
            pathological callers from dumping the full table.

        Returns
        -------
        list[PostSummary]
            Summary records; an empty list when the site has no
            published posts (the template renders an appropriate empty
            state). The list is a fresh copy on every call, so callers
            may mutate it without affecting the cached entry.

        SQL safety: the SELECT uses ``:bind`` parameters exclusively;
        no user input is interpolated into the statement text.
        """
        # Defensive clamp; the public template only passes 20 but
        # future callers could pass arbitrary values.
        safe_limit = max(1, min(int(limit), 100))

        cached = self._cache.get(safe_limit)
        if cached is not None:
            # An empty list is a valid cached result; only ``None``
            # means "not cached". Copy so the shared entry stays intact.
            return list(cached)

        with self._engine.connect() as conn:
            rows = (
                conn.execute(
                    text(
                        "SELECT slug, title, published_at, body_md,"
                        " body_html_cached"
                        " FROM posts"
                        " WHERE status = :status"
                        " ORDER BY published_at DESC"
                        " LIMIT :limit"
                    ),
                    {
                        "status": PostStatus.PUBLISHED.value,
                        "limit": safe_limit,
                    },
                )
                .mappings()
                .all()
            )

        summaries: list[PostSummary] = []
        for row in rows:
            published_at_str: Optional[str] = row["published_at"]
            # A row with status='published' should never have NULL
            # published_at; if it does, skip it rather than crash the
            # homepage. Phase 4's admin flow enforces this invariant
            # at write time.
            if published_at_str is None:
                continue
            summaries.append(
                PostSummary(
                    slug=row["slug"],
                    title=row["title"],
                    published_at=_parse_datetime(published_at_str),
                    excerpt=_build_excerpt(
                        row["body_md"], row["body_html_cached"]
                    ),
                )
            )

        self._cache.set(safe_limit, summaries)
        return list(summaries)

    def invalidate_all(self) -> None:
        """Drop every cached post-list entry.

        Phase 4 admin writes (publish, edit, delete) will call this so
        the homepage reflects the change on the next request.
        """
        self._cache.invalidate_all()
|
||||
|
||||
|
||||
def get_post_service(request: Request) -> PostService:
    """FastAPI dependency: pull the app-scoped :class:`PostService`.

    The service needs an engine, so it can no longer be built as an
    argument-less module-level singleton. It is instantiated once in
    :func:`app.main.create_app` and stored on
    ``app.state.post_service``. Tests override via
    ``app.dependency_overrides[get_post_service]``.
    """
    return request.app.state.post_service
|
||||
|
||||
Reference in New Issue
Block a user