"""Image upload pipeline: validate → re-encode → store → record. Every admin image upload passes through this service. The contract is strict on purpose — the site serves user-editable HTML (via the sanitizer) plus the bytes that flow through here, so anything we miss becomes XSS / RCE surface area. Steps in :meth:`MediaService.save_upload`: 1. **Size cap** — reject anything over 8 MB at the bytes level (before decoding). We read the full buffer so we can hash and re-encode it; streaming would complicate Pillow's decode path and upload volumes are tiny. 2. **Magic-byte check** — :mod:`python-magic` inspects the first 2048 bytes and yields a MIME type. Anything not in our allowlist (``image/jpeg``, ``image/png``, ``image/webp``) is rejected. Notably, ``image/gif`` is NOT allowed — animated GIFs have a long history of ambiguous / abuse-friendly encodings. 3. **Pillow decode** — open via :func:`PIL.Image.open` on a :class:`io.BytesIO` wrapper. Call ``.verify()`` on a dedicated copy (it consumes the stream), then re-open for the actual encode path. Reject anything larger than 10000 px per side as a defense against decompression bombs. 4. **Re-encode to JPEG** — always JPEG. Strip metadata by reopening into a clean :class:`PIL.Image.Image`; flatten alpha on a white background so transparent PNG / WebP images don't render as black. 5. **Store** — write to ``///.jpg`` where the random component is :func:`secrets.token_urlsafe(16)`. The client-supplied filename is kept only in the DB row's ``original_filename`` for display; it is NEVER used to build a filesystem path. 6. **DB row** — insert a :class:`Media` row. Return the loaded dataclass. """ from __future__ import annotations import io import secrets from datetime import datetime, timezone from pathlib import Path from typing import Final, Optional import structlog from PIL import Image, UnidentifiedImageError from sqlalchemy import Engine, text from app.models.entities import Media from app.models.mappers import row_to_media from app.services.audit import AuditService _log = structlog.get_logger(__name__) # Upper bound on the raw upload bytes. 8 MB matches the project # security constraint; larger images are almost certainly a mistake # for a brochure-site blog. MAX_UPLOAD_BYTES: Final[int] = 8 * 1024 * 1024 # Maximum decoded dimension — reject any image wider or taller than # this as a lightweight defense against decompression bombs. MAX_PIXEL_DIMENSION: Final[int] = 10_000 # MIME types accepted from the magic-byte sniff. We always re-encode # to JPEG regardless of input. _ACCEPTED_MIME: Final[frozenset[str]] = frozenset( {"image/jpeg", "image/png", "image/webp"} ) # Output quality for Pillow's JPEG encoder. 85 is a widely-used # sweet spot for photograph-like content. _JPEG_QUALITY: Final[int] = 85 class MediaRejectedError(Exception): """Raised when an upload fails any validation step. The message is user-facing (shown in the admin editor) — keep it generic and free of implementation detail. """ class MediaService: """Validate and store admin-uploaded images. Parameters ---------- engine: Shared SQLAlchemy engine. media_root: Filesystem directory under which uploads live (the ``//`` partition is appended). Relative paths are resolved against the process cwd, matching how the FastAPI StaticFiles mount is configured. public_prefix: URL prefix where the media root is mounted for public serving. Defaults to ``/media`` so the Markdown that the admin inserts after a drag-drop upload uses a path the public site can actually reach. audit: :class:`AuditService` for the ``media_uploaded`` event. """ def __init__( self, engine: Engine, media_root: str, audit: AuditService, *, public_prefix: str = "/media", ) -> None: self._engine: Engine = engine self._media_root: Path = Path(media_root) # Normalize to no trailing slash — we always join with "//..." self._public_prefix: str = "/" + public_prefix.strip("/") self._audit: AuditService = audit # ------------------------------------------------------------------ # save_upload # ------------------------------------------------------------------ def save_upload( self, *, original_filename: str, data: bytes, uploaded_by: int, alt_text: str = "", ) -> Media: """Validate + re-encode + persist a new media upload. Parameters ---------- original_filename: The filename the client submitted. Stored in the DB row for display only; NEVER used to build a filesystem path. data: Raw request body. Must be at most :data:`MAX_UPLOAD_BYTES`. uploaded_by: :class:`User` id of the authenticated admin performing the upload. alt_text: Optional alt text. Empty is allowed — admin can set it later by hand-editing the Markdown. Returns ------- Media Fully-populated :class:`Media` dataclass. Raises ------ MediaRejectedError When any validation step fails (size, MIME, decode). """ # 1. Size cap — cheap, do first. if len(data) == 0: raise MediaRejectedError("Empty upload.") if len(data) > MAX_UPLOAD_BYTES: raise MediaRejectedError( "Upload exceeds the 8 MB limit." ) # 2. Magic-byte sniff. sniffed_mime = _sniff_mime(data) if sniffed_mime not in _ACCEPTED_MIME: raise MediaRejectedError( f"Unsupported image type ({sniffed_mime})." ) # 3. Pillow verify on a fresh BytesIO (verify consumes the # stream). If this raises we swallow and translate to a generic # rejection so we never echo the Pillow error string back to # the admin UI. try: Image.open(io.BytesIO(data)).verify() except (UnidentifiedImageError, Exception): # noqa: BLE001 raise MediaRejectedError("Image could not be decoded.") # 4. Re-open for the actual encode. try: image = Image.open(io.BytesIO(data)) # Load here so we catch truncated / corrupt payloads that # verify() misses. Without load() the decode is lazy. image.load() except (UnidentifiedImageError, Exception): # noqa: BLE001 raise MediaRejectedError("Image could not be decoded.") width, height = image.size if width <= 0 or height <= 0: raise MediaRejectedError("Image has zero dimension.") if width > MAX_PIXEL_DIMENSION or height > MAX_PIXEL_DIMENSION: raise MediaRejectedError( "Image dimensions exceed the maximum allowed." ) # Flatten transparency onto a white background when present. # Pillow uses "RGBA", "LA", and "P" (palette, possibly with # transparency) as modes that carry alpha-like semantics. We # always convert to "RGB" before encoding as JPEG. if image.mode in ("RGBA", "LA") or ( image.mode == "P" and "transparency" in image.info ): # Convert through RGBA so alpha-compositing is well-defined, # then flatten onto a white RGB background. rgba = image.convert("RGBA") background = Image.new("RGB", rgba.size, (255, 255, 255)) background.paste(rgba, mask=rgba.split()[-1]) image_out = background elif image.mode != "RGB": image_out = image.convert("RGB") else: image_out = image # 5. Randomize the storage name and partition by month. now = datetime.now(timezone.utc) partition = f"{now:%Y}/{now:%m}" random_name = f"{secrets.token_urlsafe(16)}.jpg" target_dir = self._media_root / partition target_dir.mkdir(parents=True, exist_ok=True) target_path = target_dir / random_name # Re-encode to JPEG with the metadata stripped (a fresh # re-save removes any EXIF / color profile the source had). image_out.save( target_path, format="JPEG", quality=_JPEG_QUALITY, optimize=True, ) final_bytes = target_path.stat().st_size stored_path = str(target_path) # 6. DB row. now_iso = now.isoformat() with self._engine.begin() as conn: result = conn.execute( text( "INSERT INTO media" " (filename, original_filename, content_type," " size_bytes, stored_path, alt_text, uploaded_by," " uploaded_at)" " VALUES (:filename, :original_filename, :content_type," " :size_bytes, :stored_path, :alt_text, :uploaded_by," " :uploaded_at)" ), { "filename": random_name, "original_filename": original_filename or random_name, "content_type": "image/jpeg", "size_bytes": int(final_bytes), "stored_path": stored_path, "alt_text": alt_text or "", "uploaded_by": int(uploaded_by), "uploaded_at": now_iso, }, ) new_id = int(result.lastrowid) # type: ignore[arg-type] row = conn.execute( text( "SELECT id, filename, original_filename, content_type," " size_bytes, stored_path, alt_text, uploaded_by," " uploaded_at" " FROM media WHERE id = :id" ), {"id": new_id}, ).mappings().first() if row is None: # pragma: no cover — just inserted raise RuntimeError("failed to reload just-inserted media row") media = row_to_media(row) self._audit.record( "media_uploaded", user_id=uploaded_by, detail={ "media_id": media.id, "filename": media.filename, "size_bytes": media.size_bytes, "original_mime": sniffed_mime, }, ) return media # ------------------------------------------------------------------ # URL helpers # ------------------------------------------------------------------ def public_url(self, media: Media) -> str: """Return the URL the public site uses to fetch ``media``. Built from the configured ``public_prefix`` + the partition under ``media_root``. A stored path outside the media root (should never happen — we always write under it) falls back to the partition-less prefix to avoid leaking filesystem paths. """ try: rel = Path(media.stored_path).resolve().relative_to( self._media_root.resolve() ) except (ValueError, OSError): return f"{self._public_prefix}/{media.filename}" return f"{self._public_prefix}/{rel.as_posix()}" def _sniff_mime(data: bytes) -> str: """Return the MIME type of ``data`` according to python-magic. Wrapped so tests that monkeypatch can reach a single seam, and so the import of :mod:`magic` stays local (the module has a filesystem dependency on libmagic that should not block app import). """ # Import is module-level normally; keep here to avoid any import # order weirdness if libmagic is missing in exotic environments. import magic # First 2 KB is well beyond what any image header uses, and # streaming beyond that buys nothing for MIME sniffing. head = data[:2048] return magic.from_buffer(head, mime=True)