"""Markdown rendering with a strict sanitization allowlist. CWE-79 mitigation: user-authored Markdown is first rendered to HTML by ``markdown-it-py`` (commonmark profile + tables only, no raw-HTML pass through), then the resulting HTML is filtered by ``bleach`` against an explicit tag / attribute / protocol allowlist. Anything not on the list is stripped — never escaped — so the stored ``body_html_cached`` is always safe to render inside an ``autoescape=False`` Jinja block. The pipeline runs both on admin writes (Phase 4) and at seed time (Phase 2). """ from __future__ import annotations from typing import Final import bleach from markdown_it import MarkdownIt # --- Sanitization allowlist ------------------------------------------------ # Kept at module scope as frozenset / mappingproxy-esque constants so # tests can assert against them and downstream callers cannot mutate by # accident. Do not widen without a security review; in particular: # # - No ``style`` or ``class`` attributes (CSS injection / theme attack # surface for future admin UIs). # - No ``script``, ``iframe``, ``object``, ``embed``, ``form``, etc. # - No ``data:`` / ``javascript:`` protocols. _ALLOWED_TAGS: Final[frozenset[str]] = frozenset( { "p", "br", "strong", "em", "a", "ul", "ol", "li", "h1", "h2", "h3", "h4", "blockquote", "code", "pre", "img", "hr", } ) _ALLOWED_ATTRS: Final[dict[str, list[str]]] = { "a": ["href", "title", "rel"], "img": ["src", "alt", "title", "width", "height"], } _ALLOWED_PROTOCOLS: Final[frozenset[str]] = frozenset( {"http", "https", "mailto"} ) class MarkdownService: """Render Markdown to HTML, then sanitize against the allowlist. One ``MarkdownIt`` instance per service instance — creating these is cheap but non-trivial, so we reuse. The service is stateless aside from that configuration; ``render`` is safe to call concurrently. """ def __init__(self) -> None: """Configure the Markdown parser. - ``commonmark`` preset: conservative, no raw HTML pass through by default. We explicitly do NOT call ``.enable("html_inline")`` or ``.enable("html_block")``; raw HTML in the source will be rendered as escaped text, which is the safe failure mode. - Tables are intentionally not enabled: the bleach allowlist does not include ``