Files
chicken_babies_site/app/services/markdown.py
Phillip Tarrant 0306f71763 feat: phase 2 content model + cache — SQLite schema, markdown, TTL
Stand up the full SQLite content layer: all 7 tables from the authoritative
schema with WAL + foreign-keys enforced per-connection, entity dataclasses
plus row mappers, hand-rolled versioned migrations tracked in
schema_migrations, and an idempotent Python seed (system user + welcome
post + About page).

Add a Markdown->HTML service using markdown-it-py with a strict bleach
allowlist (tables intentionally omitted on both sides). Add a typed
in-process TTLCache[K,V] and wire it into real DB-backed PostService and
PageService, both exposing invalidate_all() for Phase 4 admin writes.

Rewire / and /about to read from the DB; homepage renders the seeded
welcome post, About renders page.title + sanitized body_html_cached.
Update the Phase 1 route tests accordingly.

Mark Phase 2 complete in docs/ROADMAP.md.
2026-04-21 15:40:35 -05:00

126 lines
4.0 KiB
Python

"""Markdown rendering with a strict sanitization allowlist.
CWE-79 mitigation: user-authored Markdown is first rendered to HTML by
``markdown-it-py`` (commonmark profile, tables not enabled, no raw-HTML pass
through), then the resulting HTML is filtered by ``bleach`` against an
explicit tag / attribute / protocol allowlist. Anything not on the
list is stripped — never escaped — so the stored ``body_html_cached``
is always safe to render inside an ``autoescape=False`` Jinja block.
The pipeline runs both on admin writes (Phase 4) and at seed time
(Phase 2).
"""
from __future__ import annotations
from typing import Final
import bleach
from markdown_it import MarkdownIt
# --- Sanitization allowlist ------------------------------------------------
# These live at module scope so tests can assert against the exact
# values. The tag and protocol sets are frozensets and therefore
# immutable; the attribute map is a plain dict of lists that is handed
# to bleach as-is, so treat it as read-only. Do not widen any of these
# without a security review; in particular:
#
# - No ``style`` or ``class`` attributes (CSS injection / theme attack
#   surface for future admin UIs).
# - No ``script``, ``iframe``, ``object``, ``embed``, ``form``, etc.
# - No ``data:`` / ``javascript:`` protocols.
_ALLOWED_TAGS: Final[frozenset[str]] = frozenset(
    (
        # Paragraph-level structure.
        "p",
        "br",
        "hr",
        "blockquote",
        # Headings (h5 / h6 are deliberately left out).
        "h1",
        "h2",
        "h3",
        "h4",
        # Lists.
        "ul",
        "ol",
        "li",
        # Inline emphasis and code.
        "strong",
        "em",
        "code",
        "pre",
        # Links and images; their attributes are restricted below.
        "a",
        "img",
    )
)

# Per-tag attribute allowlist. Tags that do not appear here keep no
# attributes at all.
_ALLOWED_ATTRS: Final[dict[str, list[str]]] = {
    "a": [
        "href",
        "title",
        "rel",
    ],
    "img": [
        "src",
        "alt",
        "title",
        "width",
        "height",
    ],
}

# URL schemes permitted in link / image targets.
_ALLOWED_PROTOCOLS: Final[frozenset[str]] = frozenset(
    ("http", "https", "mailto")
)
class MarkdownService:
    """Render Markdown to HTML, then sanitize against the allowlist.

    One ``MarkdownIt`` instance is built per service instance and reused
    by every :meth:`render` call. The parser configuration is the only
    state the service holds.
    """

    def __init__(self) -> None:
        """Configure the Markdown parser.

        - ``commonmark`` preset with the ``html`` option explicitly
          switched off. The preset on its own turns raw HTML support
          *on* (the CommonMark spec requires it), which would let
          allowlisted tags written as raw HTML reach the output and
          leave bleach as the only line of defence. With ``html``
          off, the ``html_inline`` / ``html_block`` rules do not
          fire and raw HTML in the source is rendered as escaped
          text, which is the safe failure mode.
        - Tables are intentionally not enabled: the bleach allowlist
          does not include ``<table>``, so enabling the rule would
          just produce content stripped of its tags. If we ever want
          tables, both sides (parser + allowlist) need widening
          together.
        """
        self._md: MarkdownIt = MarkdownIt("commonmark", {"html": False})

    def render(self, md: str) -> str:
        """Render ``md`` to sanitized HTML.

        Parameters
        ----------
        md:
            Markdown source, typically from an admin edit form or a
            seed file. Treated as untrusted.

        Returns
        -------
        str
            HTML safe to render with Jinja autoescape disabled. The
            output contains only tags / attributes / protocols from
            the module-level allowlists. Raw HTML typed into the
            source appears as escaped literal text; any disallowed
            tag the parser itself emits is stripped (``strip=True``)
            rather than escaped.
        """
        raw_html = self._md.render(md)
        # ``strip=True`` removes disallowed tags entirely (drops the
        # tag but keeps text content). This is a deliberate choice
        # over ``strip=False``, which would escape disallowed tags
        # into literal text — ugly for users.
        return bleach.clean(
            raw_html,
            tags=_ALLOWED_TAGS,
            attributes=_ALLOWED_ATTRS,
            protocols=_ALLOWED_PROTOCOLS,
            strip=True,
        )
def render_markdown_safe(md: str) -> str:
    """Render ``md`` to sanitized HTML with a one-shot service.

    A fresh :class:`MarkdownService` is built on every call and thrown
    away afterwards. That is acceptable for infrequent callers such as
    tests or seeding; code on a hot path should build one service and
    keep reusing it.
    """
    one_shot_service = MarkdownService()
    sanitized_html = one_shot_service.render(md)
    return sanitized_html