# SneakyScope/app/rules/function_rules.py

"""
app/rules/function_rules.py

Class-based adapters + function-based rules for SneakyScope.

Design:
- FactAdapter: converts text snippets into structured 'facts' dicts by category.
- FunctionRuleAdapter: wraps a rule function (expects dict facts) so it can be
  used directly by the RuleEngine even when the engine is given strings.

Each rule returns (matched: bool, reason: Optional[str]).
If matched is True, 'reason' should explain why.

Note:
- Form rules work today with text snippets, thanks to FunctionRuleAdapter+FactAdapter.
- Script rules expect per-script dict facts (src/base_hostname/etc.). They are
  registered now and will fully activate when you evaluate per-script contexts.
"""

from __future__ import annotations

from typing import Any, Dict, Optional
from urllib.parse import urlparse

# Form 'action' values treated as "no real submission target": the empty
# string, a bare '#', and the javascript:void(0) placeholders. Membership is
# an exact string match, so callers strip the action before testing it.
_NOOP_ACTIONS = {"", "#", "javascript:void(0)", "javascript:void(0);"}

# ---------------------------------------------------------------------------
# Adapters
# ---------------------------------------------------------------------------

class FactAdapter:
    """
    Converts raw text/html snippets into structured 'facts' suitable for
    function-based rules.

    Dict input is passed through as the *same object*; the only modification
    is that a 'category' key is added in place when the dict has none.
    String input is turned into a new facts dict according to the category.

    You can expand the per-category parsers over time as needed.
    """

    def __init__(self, logger: Optional[Any] = None) -> None:
        """
        Args:
            logger: Optional logger-like object. Only its ``warning(msg)``
                method is used; when None, warnings are silently skipped.
        """
        self.logger = logger

    def adapt(self, text_or_facts: Any, category: str = "") -> Dict[str, Any]:
        """
        Adapt text_or_facts (str or dict) into a facts dict.

        Args:
            text_or_facts: Either raw string snippet or an already-structured dict.
            category: 'form' | 'script' | 'text' | ... (used to choose parser)

        Returns:
            - dict input: the same dict, with 'category' filled in if missing.
            - str input, category 'form': the parsed form facts.
            - str input, any other category: {"category": category, "raw": text}.
              A warning is logged unless the category is 'script' or 'text'.
            - any other input type: {"category": category, "raw": value},
              after logging a warning.
        """
        # Already structured — pass through (mutated in place, not copied).
        if isinstance(text_or_facts, dict):
            # setdefault only writes when 'category' is absent, so an existing
            # category always wins over the one passed in.
            text_or_facts.setdefault("category", category or "")
            return text_or_facts

        # String snippets are parsed by category
        if isinstance(text_or_facts, str):
            if category == "form":
                return self._adapt_form_snippet(text_or_facts)
            # Script snippets are not parsed into facts for now: script rules
            # expect per-script dicts (src/base_hostname/etc.), so 'script'
            # and 'text' both get the minimal raw-snippet facts. Any other
            # category gets the same shape, plus a warning.
            if category not in ("script", "text") and self.logger:
                self.logger.warning(f"[FactAdapter] Unknown category '{category}', returning raw snippet.")
            return {"category": category, "raw": text_or_facts}

        # Fallback for unrecognized input types
        if self.logger:
            self.logger.warning(f"[FactAdapter] Unsupported input type: {type(text_or_facts)!r}")
        return {"category": category, "raw": text_or_facts}

    # ---- Per-category parsers ----

    def _adapt_form_snippet(self, snippet: str) -> Dict[str, Any]:
        """
        Parse the simple form snippet format used by browser.py today, e.g.:

            action=https://example.com/post
            method=post
            inputs=
              - name=email type=text
              - name=password type=password

        Only extracts fields needed by current function rules: 'action' and
        'method'. A line counts only if, once stripped, it starts with exactly
        'action=' or 'method='; if a key appears more than once the last
        occurrence wins. All other lines (including the inputs list) are
        ignored but remain available under 'raw'.

        Returns:
            Facts dict with 'category', 'raw', optional 'action'/'method',
            and empty-string defaults for 'base_url' and 'base_hostname'.
        """
        facts: Dict[str, Any] = {"category": "form", "raw": snippet}

        for raw_line in snippet.splitlines():
            # Split on the first '=' only, so values may themselves contain '='.
            key, sep, value = raw_line.strip().partition("=")
            if sep and key in ("action", "method"):
                facts[key] = value.strip()

        # Normalize context keys expected by form rules
        facts.setdefault("base_url", "")       # filled by caller later if desired
        facts.setdefault("base_hostname", "")  # filled by caller later if desired
        return facts


class FunctionRuleAdapter:
    """
    Makes a facts-based rule function callable with whatever the engine passes.

    Calling the wrapper hands its argument (raw string or facts dict) to the
    adapter together with this wrapper's category, then invokes the rule
    function with the facts the adapter produced and returns its result.

    Usage:
        wrapped = FunctionRuleAdapter(fn=form_action_missing, category="form", adapter=FactAdapter(app.logger))
        matched, reason = wrapped("action=https://...")  # engine-friendly
    """

    def __init__(self, fn, category: str = "", adapter: Optional[FactAdapter] = None) -> None:
        # Without a (truthy) adapter, fall back to a logger-less FactAdapter.
        if not adapter:
            adapter = FactAdapter()
        self.adapter = adapter
        self.category = category
        self.fn = fn

    def __call__(self, text_or_facts: Any):
        """Adapt the input into facts and return whatever the rule returns."""
        return self.fn(self.adapter.adapt(text_or_facts, category=self.category))


# ---------------------------------------------------------------------------
# Function-based rules (dict 'facts' expected)
# ---------------------------------------------------------------------------

# ---------------- Script rules ----------------

def script_src_uses_data_or_blob(facts: Dict[str, Any]):
    """
    Flags <script> tags with src='data:' or 'blob:'.

    Args:
        facts: Per-script facts; only the 'src' key is read.

    Returns:
        (True, reason) when src is a string whose scheme is data or blob,
        otherwise (False, None). A missing, empty, or non-string src never
        matches.
    """
    src = facts.get("src") or ""
    if not isinstance(src, str):
        return False, None

    # URL schemes are case-insensitive and URL attributes may carry
    # surrounding whitespace, so normalize before comparing; otherwise
    # 'DATA:...' or ' blob:...' would slip past the rule.
    scheme, colon, _rest = src.strip().lower().partition(":")
    if colon and scheme in ("data", "blob"):
        return True, f"Script src uses {scheme}: URL"
    return False, None


def script_src_has_dangerous_extension(facts: Dict[str, Any]):
    """
    Flags <script> tags with dangerous file extensions (e.g., .vbs, .hta).

    The check is a case-insensitive suffix test on the raw 'src' string, so a
    query string or fragment after the extension prevents a match.

    Args:
        facts: Per-script facts; only the 'src' key is read.

    Returns:
        (True, reason naming the extension) on a match, otherwise
        (False, None). A missing, empty, or non-string src never matches.
    """
    src = facts.get("src") or ""
    if not isinstance(src, str):
        return False, None

    lowered = src.lower()
    for ext in (".vbs", ".hta"):
        if lowered.endswith(ext):
            return True, f"External script has dangerous extension ({ext})"
    return False, None


def script_third_party_host(facts: Dict[str, Any]):
    """
    Flags scripts loaded from a different hostname than the page.

    Args:
        facts: Per-script facts; reads 'base_hostname' (the page's host) and
            'src_hostname' (the script's host).

    Returns:
        (True, reason) when both hostnames are present and differ, otherwise
        (False, None). The reason reports 'src_hostname' as it was supplied.
    """
    base_host = facts.get("base_hostname") or ""
    src_host = facts.get("src_hostname") or ""
    if not (base_host and src_host):
        # Without both hosts there is nothing to compare.
        return False, None

    # Hostnames are case-insensitive; compare normalized forms so that
    # 'Example.com' vs 'example.com' is not reported as third-party. This
    # matches the normalization the form host rule applies.
    same_host = str(base_host).strip().lower() == str(src_host).strip().lower()
    if not same_host:
        return True, f"Third-party script host: {src_host}"
    return False, None


# ---------------- Form rules ----------------

def form_action_missing(facts: Dict[str, Any]):
    """
    Flags <form> elements with no meaningful action attribute.

    The 'action' fact is stripped of surrounding whitespace and matched
    exactly against the module's no-op action set; a missing or falsy action
    is treated as the empty string.

    Returns:
        (True, reason) for a missing/no-op action, otherwise (False, None).
    """
    raw_action = facts.get("action") or ""
    is_noop = raw_action.strip() in _NOOP_ACTIONS
    if not is_noop:
        return False, None
    return True, "Form has no action attribute (or uses a no-op action)"


def form_http_on_https_page(facts: Dict[str, Any]):
    """
    Flags forms submitting over HTTP while the page was loaded over HTTPS.

    Reads 'base_url' (the page URL) and 'action' from facts. Only an action
    with an explicit http scheme on a page with an https scheme is flagged;
    relative and schemeless ('//host/...') actions carry no scheme of their
    own and are left alone.

    Returns:
        (True, reason including the parsed action URL) on a match,
        otherwise (False, None). URL parsing errors yield (False, None).
    """
    page_url = (facts.get("base_url") or "").strip()
    target = (facts.get("action") or "").strip()

    try:
        page_scheme = (urlparse(page_url).scheme or "").lower()
        target_parts = urlparse(target)
        target_scheme = (target_parts.scheme or "").lower()
    except Exception:
        # Best effort: an unparseable URL is never flagged.
        return False, None

    insecure_downgrade = page_scheme == "https" and target_scheme == "http"
    if not insecure_downgrade:
        return False, None
    return True, f"Submits over insecure HTTP (action={target_parts.geturl()})"


def form_submits_to_different_host(facts: Dict[str, Any]):
    """
    Flags <form> actions that submit to a different hostname than the page.

    Reads 'base_hostname' and 'action' from facts. Both hosts are compared
    in lowercase. Empty and no-op actions are skipped, as are actions that
    do not name a host (relative URLs) and pages whose hostname is unknown.

    Returns:
        (True, reason naming both hosts) on a mismatch, otherwise
        (False, None). URL parsing errors yield (False, None).
    """
    page_host = (facts.get("base_hostname") or "").strip().lower()
    target = (facts.get("action") or "").strip()

    # Nothing to compare for a missing or placeholder action.
    if not target or target in _NOOP_ACTIONS:
        return False, None

    try:
        target_host = (urlparse(target).hostname or "").lower()
    except Exception:
        return False, None

    # A host is only available for absolute URLs or schemeless //host/path.
    both_known = bool(target_host) and bool(page_host)
    if both_known and target_host != page_host:
        return True, f"Submits to a different host ({target_host} vs {page_host})"
    return False, None