# chicken_babies_site/app/services/markdown.py

"""Markdown rendering with a strict sanitization allowlist.

CWE-79 mitigation: user-authored Markdown is first rendered to HTML by
``markdown-it-py`` (``commonmark`` preset; the tables plugin is not
enabled), then the resulting HTML is filtered by ``bleach`` against an
explicit tag / attribute / protocol allowlist. The ``bleach`` pass is
the security boundary: tags not on the list are stripped (their text
content is kept) rather than escaped, so the stored
``body_html_cached`` is safe to render inside an ``autoescape=False``
Jinja block.

The pipeline runs both on admin writes (Phase 4) and at seed time
(Phase 2).
"""

from __future__ import annotations

from typing import Final

import bleach
from markdown_it import MarkdownIt

# --- Sanitization allowlist ------------------------------------------------
# Module-level constants so tests can assert against them. The tag and
# protocol collections are frozensets and therefore immutable;
# ``_ALLOWED_ATTRS`` is a plain ``dict`` of lists, so treat it as
# read-only. Do not widen any of these without a security review; in
# particular:
#
# - No ``style`` or ``class`` attributes (CSS injection / theme attack
#   surface for future admin UIs).
# - No ``script``, ``iframe``, ``object``, ``embed``, ``form``, etc.
# - No ``data:`` / ``javascript:`` protocols.
_ALLOWED_TAGS: Final[frozenset[str]] = frozenset(
    (
        # Block-level structure.
        "p",
        "blockquote",
        "pre",
        "hr",
        "br",
        # Headings (h5 / h6 are deliberately absent).
        "h1",
        "h2",
        "h3",
        "h4",
        # Lists.
        "ul",
        "ol",
        "li",
        # Inline formatting, links and images.
        "strong",
        "em",
        "code",
        "a",
        "img",
    )
)

# Per-tag attribute allowlist; tags not listed here keep no attributes.
_ALLOWED_ATTRS: Final[dict[str, list[str]]] = dict(
    a=["href", "title", "rel"],
    img=["src", "alt", "title", "width", "height"],
)

# URL schemes permitted in link / image targets.
_ALLOWED_PROTOCOLS: Final[frozenset[str]] = frozenset(
    ("http", "https", "mailto")
)


class MarkdownService:
    """Render Markdown to HTML, then sanitize against the allowlist.

    One ``MarkdownIt`` instance per service instance — creating these
    is cheap but non-trivial, so we reuse. The service holds no state
    besides that parser configuration, and ``render`` only reads it.
    NOTE(review): concurrent use additionally relies on
    ``MarkdownIt.render`` itself being re-entrant — confirm against the
    pinned ``markdown-it-py`` version.
    """

    def __init__(self) -> None:
        """Configure the Markdown parser.

        - ``commonmark`` preset with raw HTML explicitly turned off.
          In ``markdown-it-py`` the ``commonmark`` preset ships with
          ``html=True`` (its ``html_block`` / ``html_inline`` rules are
          active), so without the ``{"html": False}`` override raw
          HTML in the source would be handed to ``bleach`` verbatim
          and ``bleach`` would be the only line of defence. With the
          override, raw HTML is rendered as escaped text, which is the
          safe failure mode.
        - Tables are intentionally not enabled: the bleach allowlist
          does not include ``<table>``, so enabling the plugin would
          just produce content stripped of its tags. If we ever want
          tables, both sides (parser + allowlist) need widening
          together.
        """
        # Second positional argument is ``options_update``; it overrides
        # the preset's ``html=True`` so the parser never emits
        # author-supplied markup.
        self._md: MarkdownIt = MarkdownIt("commonmark", {"html": False})

    def render(self, md: str) -> str:
        """Render ``md`` to sanitized HTML.

        Parameters
        ----------
        md:
            Markdown source, typically from an admin edit form or a
            seed file. Treated as untrusted.

        Returns
        -------
        str
            HTML safe to render with Jinja autoescape disabled. The
            output contains only tags / attributes / protocols from
            the module-level allowlists; anything else is stripped
            (``strip=True``) rather than escaped. Raw HTML typed into
            the Markdown source has already been escaped by the parser
            and therefore appears as literal text.
        """
        raw_html = self._md.render(md)
        # ``strip=True`` removes disallowed tags entirely (drops the
        # tag but keeps text content). This is a deliberate choice
        # over ``strip=False``, which would escape disallowed tags
        # into literal text — ugly for users.
        return bleach.clean(
            raw_html,
            tags=_ALLOWED_TAGS,
            attributes=_ALLOWED_ATTRS,
            protocols=_ALLOWED_PROTOCOLS,
            strip=True,
        )


def render_markdown_safe(md: str) -> str:
    """Render ``md`` to sanitized HTML without managing a service object.

    A fresh :class:`MarkdownService` is built on every call and
    discarded afterwards, which is acceptable for infrequent callers
    (tests, seed). Hot paths should construct one instance and reuse
    it instead.
    """
    one_shot_service = MarkdownService()
    sanitized_html = one_shot_service.render(md)
    return sanitized_html