Stand up the full SQLite content layer: all 7 tables from the authoritative schema with WAL + foreign-keys enforced per-connection, entity dataclasses plus row mappers, hand-rolled versioned migrations tracked in schema_migrations, and an idempotent Python seed (system user + welcome post + About page). Add a Markdown->HTML service using markdown-it-py with a strict bleach allowlist (tables intentionally omitted on both sides). Add a typed in-process TTLCache[K,V] and wire it into real DB-backed PostService and PageService, both exposing invalidate_all() for Phase 4 admin writes. Rewire / and /about to read from the DB; homepage renders the seeded welcome post, About renders page.title + sanitized body_html_cached. Update the Phase 1 route tests accordingly. Mark Phase 2 complete in docs/ROADMAP.md.
126 lines
4.0 KiB
Python
126 lines
4.0 KiB
Python
"""Markdown rendering with a strict sanitization allowlist.
|
|
|
|
CWE-79 mitigation: user-authored Markdown is first rendered to HTML by
|
|
``markdown-it-py`` (commonmark profile, tables not enabled, no raw-HTML pass
|
|
through), then the resulting HTML is filtered by ``bleach`` against an
|
|
explicit tag / attribute / protocol allowlist. Anything not on the
|
|
list is stripped — never escaped — so the stored ``body_html_cached``
|
|
is always safe to render inside an ``autoescape=False`` Jinja block.
|
|
|
|
The pipeline runs both on admin writes (Phase 4) and at seed time
|
|
(Phase 2).
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
from typing import Final
|
|
|
|
import bleach
|
|
from markdown_it import MarkdownIt
|
|
|
|
# --- Sanitization allowlist ------------------------------------------------
# Module-level constants describing everything the sanitizer may keep.
# They live at module scope so tests can assert against them.  The tag
# and protocol sets are frozensets and therefore cannot be mutated by
# accident; ``_ALLOWED_ATTRS`` is a plain ``dict`` of lists (the shape
# handed to ``bleach.clean``), so treat it as read-only by convention.
#
# Do not widen any of these without a security review.  In particular:
#
#   - No ``style`` or ``class`` attributes (CSS injection / theme attack
#     surface for future admin UIs).
#   - No ``script``, ``iframe``, ``object``, ``embed``, ``form``, etc.
#   - No ``data:`` / ``javascript:`` protocols.
_ALLOWED_TAGS: Final[frozenset[str]] = frozenset(
    (
        # Block-level structure
        "p",
        "br",
        "hr",
        "blockquote",
        "pre",
        # Headings
        "h1",
        "h2",
        "h3",
        "h4",
        # Lists
        "ul",
        "ol",
        "li",
        # Inline formatting
        "strong",
        "em",
        "code",
        # Links and media
        "a",
        "img",
    )
)

# Per-tag attribute allowlist; tags not listed here keep no attributes.
_ALLOWED_ATTRS: Final[dict[str, list[str]]] = dict(
    a=["href", "title", "rel"],
    img=["src", "alt", "title", "width", "height"],
)

# URL schemes permitted in link / image targets.
_ALLOWED_PROTOCOLS: Final[frozenset[str]] = frozenset(("http", "https", "mailto"))
|
|
|
|
|
|
class MarkdownService:
    """Render Markdown to HTML, then sanitize against the allowlist.

    A single ``MarkdownIt`` parser is built in ``__init__`` and reused by
    every ``render`` call on this instance. The service keeps no other
    state.

    NOTE(review): sharing one instance across threads relies on
    ``MarkdownIt.render`` being re-entrant — confirm before doing so.
    """

    def __init__(self) -> None:
        """Configure the Markdown parser.

        - ``commonmark`` preset with the ``html`` option explicitly
          turned **off**. The preset ships with ``html`` enabled, which
          would let raw HTML in the source pass through the parser
          untouched and leave ``bleach`` as the only barrier. With
          ``html=False`` the ``html_inline`` / ``html_block`` rules do
          not fire, so raw HTML in the source is rendered as escaped
          text, which is the safe failure mode.
        - Tables are intentionally not enabled: the bleach allowlist
          does not include ``<table>``, so enabling the rule would
          just produce content stripped of its tags. If tables are ever
          wanted, both sides (parser + allowlist) need widening
          together.
        """
        # The second positional argument is an options override applied
        # on top of the preset's defaults.
        self._md: MarkdownIt = MarkdownIt("commonmark", {"html": False})

    def render(self, md: str) -> str:
        """Render ``md`` to sanitized HTML.

        Parameters
        ----------
        md:
            Markdown source, typically from an admin edit form or a
            seed file. Treated as untrusted.

        Returns
        -------
        str
            HTML intended to be rendered with Jinja autoescape
            disabled. The output contains only tags / attributes /
            protocols from the module-level allowlists; any other tag
            produced by the parser is stripped (``strip=True``) rather
            than escaped. Raw HTML typed into the Markdown source is
            already escaped by the parser and therefore shows up as
            literal text.
        """
        rendered = self._md.render(md)
        # ``strip=True`` removes disallowed tags entirely (drops the
        # tag but keeps its text content). This is a deliberate choice
        # over ``strip=False``, which would escape disallowed tags
        # into literal text — ugly for users.
        return bleach.clean(
            rendered,
            tags=_ALLOWED_TAGS,
            attributes=_ALLOWED_ATTRS,
            protocols=_ALLOWED_PROTOCOLS,
            strip=True,
        )
|
|
|
|
|
|
def render_markdown_safe(md: str) -> str:
    """Render ``md`` to sanitized HTML using a one-shot service.

    A fresh :class:`MarkdownService` is built on every call and
    discarded afterwards. That is acceptable for occasional callers
    such as tests or the seed script; code on a hot path should create
    one service instance and keep reusing it.
    """
    one_shot_service = MarkdownService()
    return one_shot_service.render(md)
|