# chicken_babies_site/tests/test_markdown.py

"""Tests for the Markdown → sanitized HTML pipeline.

We care about three things:

1. Safe inline markup (``**bold**``, ``*italic*``, links, lists) round-trips
   into the expected HTML tags.
2. Dangerous constructs (``<script>``, ``<iframe>``, ``<style>``, inline
   ``onclick`` handlers, ``javascript:`` URLs) are stripped — not
   escaped — from the output.
3. Tables do NOT render: the parser has no ``table`` plugin and the
   sanitizer allowlist has no table tags, so no ``<table>`` markup
   reaches the output.

These are spot checks, not a full fuzz of bleach. The full allowlist
is already enforced in ``app.services.markdown``.
"""

from __future__ import annotations

import pytest

from app.services.markdown import MarkdownService, render_markdown_safe


@pytest.fixture
def md() -> MarkdownService:
    """Provide a newly constructed :class:`MarkdownService`.

    No ``scope`` is given, so pytest's default function scope applies
    and every test receives its own instance.
    """
    service = MarkdownService()
    return service


def test_basic_markdown_renders_paragraphs_and_emphasis(md: MarkdownService) -> None:
    """Plain text, ``**bold**`` and ``*italic*`` render to their HTML tags."""
    rendered = md.render("Hello **world** and *friends*.")
    # Each fragment must appear somewhere in the rendered output.
    expected_fragments = (
        "<p>",
        "<strong>world</strong>",
        "<em>friends</em>",
    )
    for fragment in expected_fragments:
        assert fragment in rendered


def test_script_tags_are_stripped(md: MarkdownService) -> None:
    """Neither the opening nor the closing ``<script>`` tag is emitted."""
    rendered = md.render("Hello<script>alert('xss')</script>world")
    # Only the tag markup is checked: the script's inner text is allowed
    # to remain in the output as inert text, since it cannot execute
    # without the surrounding tag.
    for forbidden in ("<script", "</script>"):
        assert forbidden not in rendered


def test_iframe_and_style_tags_are_stripped(md: MarkdownService) -> None:
    """``<iframe>`` and ``<style>`` tags vanish while plain text is kept."""
    # Three blank-line-separated chunks: two disallowed tags and one
    # harmless word that must survive.
    source = (
        "<iframe src='evil'></iframe>\n\n"
        "<style>body{}</style>\n\n"
        "safe"
    )
    rendered = md.render(source)
    assert "<iframe" not in rendered
    assert "<style" not in rendered
    assert "safe" in rendered


def test_javascript_urls_are_stripped_from_links(md: MarkdownService) -> None:
    """A raw anchor with a ``javascript:`` href must not leak that URL.

    The input is a literal HTML ``<a>`` element rather than
    ``[text](url)`` Markdown. Per the original author's note, the
    commonmark parser declines to build an anchor for the unknown
    ``javascript:`` protocol, so only raw HTML gives the sanitizer's
    allowlist an actual anchor to filter. The check is simply that the
    ``javascript:`` scheme is absent from the sanitized output.
    """
    raw_anchor = '<a href="javascript:alert(1)">click</a>'
    rendered = md.render(raw_anchor)
    assert "javascript:" not in rendered


def test_allowed_link_and_image_attributes_survive(md: MarkdownService) -> None:
    """``href``/``title`` on links and ``alt``/``src`` on images are kept."""
    link = '[hello](https://example.com "Example")'
    image = '![alt text](https://example.com/a.png "Caption")'
    # A blank line separates the link and the image in the source.
    rendered = md.render(link + "\n\n" + image)
    surviving_attributes = (
        'href="https://example.com"',
        'title="Example"',
        'alt="alt text"',
        'src="https://example.com/a.png"',
    )
    for attribute in surviving_attributes:
        assert attribute in rendered


def test_inline_event_handler_attribute_is_stripped(md: MarkdownService) -> None:
    """An ``onclick`` attribute on a raw anchor is absent from the output."""
    anchor_with_handler = '<a href="/x" onclick="alert(1)">x</a>'
    rendered = md.render(anchor_with_handler)
    assert "onclick" not in rendered


def test_table_tags_are_stripped(md: MarkdownService) -> None:
    """A pipe-table source yields no ``<table>`` tag in the output.

    This pins an intentional policy: the Markdown parser uses the
    commonmark preset WITHOUT a table plugin, and the bleach allowlist
    contains no table tags. Widening only one of the two would be a
    policy mismatch. Should a later phase add table support, flip this
    assertion together with the matching allowlist change.
    """
    pipe_table = (
        "| a | b |\n"
        "|---|---|\n"
        "| 1 | 2 |\n"
    )
    assert "<table" not in md.render(pipe_table)


def test_module_level_helper_matches_class(md: MarkdownService) -> None:
    """The module-level helper and ``MarkdownService.render`` agree."""
    sample = "Hello **there**."
    via_helper = render_markdown_safe(sample)
    via_service = md.render(sample)
    assert via_helper == via_service