SneakyScope/app/utils/browser.py
Phillip Tarrant 55cd81aec0 feat(text): add text analysis pipeline & surface results in UI
- engine: add analyse_text() to extract visible page text and evaluate
  category="text" rules; collect matched phrases and expose as
  `content_snippet` (deduped, length-capped via settings.ui.snippet_preview_len).
- engine: remove unused code
- browser: remove duplicate enrichment call
- engine: improve regex compilation: honor per-rule flags (string or list)
  and default to IGNORECASE when category == "text".
- engine: add dispatch logging "[engine] applying categories: …" gated by
  settings.app.print_rule_dispatch.
- ui(templates): add `templates/partials/result_text.html` mirroring the forms
  table; renders page-level records and their matched rules.
- ui(controller): wire `analyse_text()` into the scan path and expose
  `payload["suspicious_text"]`.
- rules(text): add `identity_verification_prompt`, `gated_document_access`,
  `email_collection_prompt`; broaden `credential_reset` (see the rule sketch
  below the result shape).

fix: text indicators were not displayed due to missing analyzer and mismatched result shape.

Result shape:
  suspicious_text: [
    {
      "type": "page",
      "content_snippet": "...matched phrases…",
      "rules": [
        {"name": "...", "description": "...", "severity": "medium", "tags": ["..."]}
      ]
    }
  ]
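
Example text rule (illustrative only): the field names below are inferred from
the attributes browser.py reads off each rule object (name, description,
category, rule_type, pattern, severity, tags) plus the per-rule flags this
commit adds; the actual YAML schema may differ.

  - name: identity_verification_prompt
    category: text
    rule_type: regex
    pattern: "verify your (identity|account)"
    flags: ["IGNORECASE"]   # text rules default to IGNORECASE when flags are omitted
    severity: medium
    tags: ["..."]
    description: "..."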
2025-08-22 17:18:50 -05:00


"""
app/browser.py
Singleton, lazily-loaded page fetcher + analysis orchestrator for SneakyScope.
Responsibilities:
- Fetch a URL (HTML, redirects, etc.)
- Run the Suspicious Rules Engine (PASS/FAIL for all rules)
- Write artifacts (screenshot.png, source.txt, results.json) into /data/<run_uuid>/
- Return a single 'result' dict suitable for UI and future API
Design notes:
- Detection logic (regex/heuristics) lives in the rules engine (YAML/function rules).
- This module keeps "plumbing" only (fetch, extract, persist).
- Minimal non-detection heuristics remain here (e.g., skip benign script MIME types).
Assumptions:
- Flask app context is active (uses current_app for logger and RULE_ENGINE).
- SANDBOX_STORAGE is configured (default: /data).
- enrich_url(url) returns enrichment dict.
"""
from __future__ import annotations
import json
import uuid
from pathlib import Path
from typing import Any, Dict, List, Optional
from urllib.parse import urlparse
from bs4 import BeautifulSoup
import re
from flask import current_app
from playwright.async_api import async_playwright, TimeoutError as PWTimeoutError
from app.utils.io_helpers import safe_write
from app.utils.enrichment import enrich_url
from app.utils.settings import get_settings
from app.logging_setup import get_app_logger
# Load settings once for constants / defaults
settings = get_settings()
logger = get_app_logger()


class Browser:
    """
    Orchestrates page fetching and analysis. Meant to be accessed via the
    lazily-loaded singleton factory `get_browser()`.
    """

    def __init__(self) -> None:
        storage_dir = Path("/data")
        self.storage_dir: Path = storage_dir

    # -----------------------------------------------------------------------
    # Engine access helpers
    # -----------------------------------------------------------------------
    @staticmethod
    def _get_rule_engine():
        """
        Retrieve the rules engine instance from the application state.

        Returns:
            RuleEngine or None: The engine if available, or None if not configured.
        """
        from app.state import get_rules_engine
        return get_rules_engine()

    @staticmethod
    def _summarize_results(results: List[Dict[str, Any]]) -> Dict[str, int]:
        """
        Summarize a list of engine rule result dicts (result = "PASS"|"FAIL").

        Returns:
            {'fail_count': int, 'total_rules': int}
        """
        summary = {"fail_count": 0, "total_rules": 0}
        for item in results:
            summary["total_rules"] += 1
            if str(item.get("result", "")).upper() == "FAIL":
                summary["fail_count"] += 1
        return summary

    def build_rule_checks_overview(self, full_html_text: str) -> List[Dict[str, Any]]:
        """
        Build a top-level overview for the results page: run each category across
        the entire HTML and group results by category.

        Returns:
            [
                {"category": "script", "results": [ ...engine dicts... ], "summary": {...}},
                {"category": "form", "results": [ ... ], "summary": {...}},
                {"category": "text", "results": [ ... ], "summary": {...}},
            ]
        """
        overview: List[Dict[str, Any]] = []
        engine = self._get_rule_engine()
        for cat in ("script", "form", "text"):
            block = {"category": cat, "results": [], "summary": {"fail_count": 0, "total_rules": 0}}
            if engine is not None:
                try:
                    results = engine.run_all(full_html_text, category=cat)
                    block["results"] = results
                    block["summary"] = self._summarize_results(results)
                except Exception as exc:
                    block["results"] = [{
                        "name": "engine_error",
                        "description": "Rule engine failed during overview evaluation",
                        "category": cat,
                        "result": "PASS",
                        "reason": f"{exc}",
                        "severity": None,
                        "tags": None,
                    }]
                    block["summary"] = {"fail_count": 0, "total_rules": 1}
            overview.append(block)
        return overview
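
    # Illustrative only (not used in this module): a caller can collapse the
    # overview into a single page-level "anything failed?" verdict, e.g.:
    #
    #     overview = browser.build_rule_checks_overview(html)
    #     any_fail = any(block["summary"]["fail_count"] > 0 for block in overview)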

    # -----------------------------------------------------------------------
    # Form & Script analysis (plumbing only; detection is in the rules engine)
    # -----------------------------------------------------------------------
    def analyze_forms(self, html: str, base_url: str = "") -> List[Dict[str, Any]]:
        """
        Collect form artifacts and evaluate per-form matches via the rules engine.
        Only include rows that matched at least one rule.

        Returns list of dicts with keys (per matched form):
            - type: "form"
            - action, method, inputs
            - content_snippet: str
            - rules: List[{name, description, severity?, tags?}]
        """
        soup = BeautifulSoup(html, "lxml")
        results: List[Dict[str, Any]] = []
        engine = self._get_rule_engine()
        base_hostname = urlparse(base_url).hostname or ""

        # Match how analyze_scripts picks its preview length (keep parity).
        try:
            preview_len = getattr(settings.ui, "snippet_preview_len", 200)
        except Exception:
            preview_len = 200

        for form in soup.find_all("form"):
            try:
                action = (form.get("action") or "").strip()
                method = (form.get("method") or "get").strip().lower()
                inputs: List[Dict[str, Any]] = []
                for inp in form.find_all("input"):
                    inputs.append({
                        "name": inp.get("name"),
                        "type": (inp.get("type") or "text").strip().lower(),
                    })

                # Use the actual form markup for regex rules.
                form_markup = str(form)
                # UI-friendly snippet.
                content_snippet = form_markup[:preview_len]

                matches: List[Dict[str, Any]] = []
                if engine is not None:
                    for r in getattr(engine, "rules", []):
                        if getattr(r, "category", None) != "form":
                            continue
                        rtype = getattr(r, "rule_type", None)
                        try:
                            ok = False
                            reason = ""
                            if rtype == "regex":
                                # Run against the raw form HTML.
                                ok, reason = r.run(form_markup)
                            elif rtype == "function":
                                # Structured facts for function-style rules.
                                facts = {
                                    "category": "form",
                                    "base_url": base_url,
                                    "base_hostname": base_hostname,
                                    "action": action,
                                    "action_hostname": urlparse(action).hostname or "",
                                    "method": method,
                                    "inputs": inputs,
                                    "markup": form_markup,
                                }
                                ok, reason = r.run(facts)
                            else:
                                continue
                            if ok:
                                matches.append({
                                    "name": getattr(r, "name", "unknown_rule"),
                                    "description": (reason or "") or getattr(r, "description", ""),
                                    "severity": getattr(r, "severity", None),
                                    "tags": getattr(r, "tags", None),
                                })
                        except Exception as rule_exc:
                            # Be defensive: a bad rule shouldn't break the form pass.
                            try:
                                logger.debug("Form rule error", extra={"rule": getattr(r, "name", "?"), "error": str(rule_exc)})
                            except Exception:
                                pass
                            continue

                if matches:
                    results.append({
                        "type": "form",
                        "action": action,
                        "method": method,
                        "inputs": inputs,
                        "content_snippet": content_snippet,
                        "rules": matches,
                    })
            except Exception as exc:
                # Keep analysis resilient.
                try:
                    logger.error("Form analysis error", extra={"error": str(exc)})
                except Exception:
                    pass
                results.append({
                    "type": "form",
                    "heuristics": [f"Form analysis error: {exc}"],
                })
        return results

    def analyze_scripts(self, html: str, base_url: str = "") -> List[Dict[str, Any]]:
        """
        Collect script artifacts and evaluate per-script matches via the rules engine.
        Only include rows that matched at least one rule.
        """
        soup = BeautifulSoup(html, "lxml")
        results: List[Dict[str, Any]] = []
        benign_types = {"application/ld+json", "application/json"}
        engine = self._get_rule_engine()
        base_hostname = urlparse(base_url).hostname or ""

        for script in soup.find_all("script"):
            try:
                src = (script.get("src") or "").strip()
                s_type_attr = (script.get("type") or "").strip().lower()
                inline_text = script.get_text(strip=True) or ""

                if s_type_attr in benign_types:
                    continue

                record: Dict[str, Any] = {}
                if src:
                    record["type"] = "external"
                    record["src"] = src
                elif inline_text:
                    # Respect the UI snippet config.
                    preview_len = getattr(settings.ui, "snippet_preview_len", 200)
                    record["type"] = "inline"
                    record["content_snippet"] = (inline_text[:preview_len]).replace("\n", " ")
                else:
                    record["type"] = "unknown"

                matches: List[Dict[str, Any]] = []
                if engine is not None:
                    if inline_text:
                        for r in engine.rules:
                            if getattr(r, "category", None) == "script" and getattr(r, "rule_type", None) == "regex":
                                ok, reason = r.run(inline_text)
                                if ok:
                                    matches.append({
                                        "name": getattr(r, "name", "unknown_rule"),
                                        "description": getattr(r, "description", "") or (reason or ""),
                                        "severity": getattr(r, "severity", None),
                                        "tags": getattr(r, "tags", None),
                                    })
                    if src:
                        facts = {
                            "src": src,
                            "base_url": base_url,
                            "base_hostname": base_hostname,
                            "src_hostname": urlparse(src).hostname or "",
                            "category": "script",
                        }
                        for r in engine.rules:
                            if getattr(r, "category", None) == "script" and getattr(r, "rule_type", None) == "function":
                                ok, reason = r.run(facts)
                                if ok:
                                    matches.append({
                                        "name": getattr(r, "name", "unknown_rule"),
                                        "description": (reason or "") or getattr(r, "description", ""),
                                        "severity": getattr(r, "severity", None),
                                        "tags": getattr(r, "tags", None),
                                    })

                if matches:
                    record["rules"] = matches
                    results.append(record)
            except Exception as exc:
                results.append({
                    "type": "unknown",
                    "heuristics": [f"Script analysis error: {exc}"],
                })
        return results

    def analyze_text(self, html: str) -> List[Dict[str, Any]]:
        """
        Extract visible page text and evaluate text rules.
        Only include rows that matched at least one rule.

        Returns a list with 0..1 records shaped like:
            {
                "type": "page",
                "content_snippet": "<matched words/phrases joined>",
                "rules": [
                    {"name": "...", "description": "...", "severity": "...", "tags": [...]},
                    ...
                ],
            }
        """
        results: List[Dict[str, Any]] = []

        # Short-circuit on missing html.
        if not html:
            return results

        # Extract visible text (strip scripts/styles).
        try:
            soup = BeautifulSoup(html, "lxml")
            for tag in soup(["script", "style", "noscript", "template"]):
                tag.decompose()
            # Basic hidden-element cleanup (best-effort).
            for el in soup.select('[hidden], [aria-hidden="true"]'):
                el.decompose()
            text = soup.get_text(separator=" ", strip=True)
            if not text:
                return results
            # Normalize whitespace so regexes behave consistently.
            text = re.sub(r"\s+", " ", text).strip()
        except Exception as exc:
            # Keep consistency with the other analyzers.
            results.append({
                "type": "page",
                "heuristics": [f"Text extraction error: {exc}"],
            })
            return results

        engine = self._get_rule_engine()
        if engine is None:
            return results

        matches_for_record: List[Dict[str, Any]] = []
        matched_phrases: List[str] = []  # order-preserving
        seen_phrases = set()

        # How many characters to show for the preview snippet.
        preview_len = getattr(settings.ui, "snippet_preview_len", 200)

        try:
            # 1) Regex rules over the full page text.
            for r in engine.rules:
                if getattr(r, "category", None) != "text":
                    continue
                rtype = getattr(r, "rule_type", None)
                if rtype == "regex":
                    ok, _reason = r.run(text)
                    if not ok:
                        continue
                    # Try to pull matched words/phrases.
                    compiled = getattr(r, "_compiled_regex", None)
                    if compiled is None and getattr(r, "pattern", None):
                        try:
                            compiled = re.compile(r.pattern, re.IGNORECASE)
                        except re.error:
                            compiled = None
                    # Collect a few (deduped) matched phrases.
                    if compiled is not None:
                        per_rule_count = 0  # limit per rule to avoid flooding
                        for m in compiled.finditer(text):
                            phrase = m.group(0).strip()
                            if phrase and phrase not in seen_phrases:
                                matched_phrases.append(phrase)
                                seen_phrases.add(phrase)
                                per_rule_count += 1
                            if per_rule_count >= 5:  # cap per rule
                                break
                    matches_for_record.append({
                        "name": getattr(r, "name", "unknown_rule"),
                        "description": getattr(r, "description", "") or "",
                        "severity": getattr(r, "severity", None),
                        "tags": getattr(r, "tags", None),
                    })
                elif rtype == "function":
                    # Optional: function-style rules can inspect the full text.
                    facts = {"text": text, "category": "text"}
                    ok, reason = r.run(facts)
                    if ok:
                        matches_for_record.append({
                            "name": getattr(r, "name", "unknown_rule"),
                            "description": (reason or "") or getattr(r, "description", ""),
                            "severity": getattr(r, "severity", None),
                            "tags": getattr(r, "tags", None),
                        })

            if matches_for_record:
                # Build the snippet from the matched words/phrases.
                joined = ", ".join(matched_phrases) if matched_phrases else ""
                if len(joined) > preview_len:
                    joined = joined[:preview_len] + "…"
                record: Dict[str, Any] = {
                    "type": "page",
                    "content_snippet": joined or None,
                    "rules": matches_for_record,
                }
                results.append(record)
        except Exception as exc:
            results.append({
                "type": "page",
                "heuristics": [f"Text analysis error: {exc}"],
            })
        return results

    # -----------------------------------------------------------------------
    # Fetcher / Orchestrator
    # -----------------------------------------------------------------------
    async def fetch_page_artifacts(self, url: str, fetch_ssl_enabled: bool = False) -> Dict[str, Any]:
        """
        Fetch page artifacts and save them in a UUID-based directory under this Browser's storage_dir.

        Writes:
            - /data/<uuid>/screenshot.png
            - /data/<uuid>/source.html
            - /data/<uuid>/results.json (single source of truth for routes)

        Returns:
            result dict with keys used by templates (and future API).
        """
        run_uuid = str(uuid.uuid4())
        run_dir = self.storage_dir / run_uuid
        run_dir.mkdir(parents=True, exist_ok=True)

        screenshot_path = run_dir / "screenshot.png"
        source_path = run_dir / "source.html"
        results_path = run_dir / "results.json"

        redirects: List[Dict[str, Any]] = []
        downloads: List[Dict[str, Any]] = []
        scripts_seen: List[str] = []

        async with async_playwright() as pw:
            browser = await pw.chromium.launch(
                headless=True,
                args=["--no-sandbox", "--disable-dev-shm-usage", "--disable-blink-features=AutomationControlled"],
            )
            context = await browser.new_context(
                viewport={"width": 1920, "height": 1080},
                user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36",
                java_script_enabled=True,
                locale="en-US",
            )
            page = await context.new_page()

            # Event handlers (plumbing)
            def _on_response(resp):
                try:
                    if 300 <= resp.status <= 399:
                        redirects.append({"status": resp.status, "url": resp.url})
                except Exception:
                    pass

            def _on_download(d):
                try:
                    downloads.append({"url": d.url, "suggested_filename": d.suggested_filename})
                except Exception:
                    pass

            def _on_request(r):
                try:
                    if r.url.endswith((".js", ".vbs", ".hta")):
                        scripts_seen.append(r.url)
                except Exception:
                    pass

            page.on("response", _on_response)
            page.on("download", _on_download)
            page.on("request", _on_request)

            try:
                await page.goto(url, wait_until="networkidle", timeout=60000)
                final_url = page.url
                await page.screenshot(path=str(screenshot_path), full_page=True)
                html = await page.content()
                safe_write(source_path, html)
            except PWTimeoutError:
                final_url = page.url
                safe_write(source_path, "Page did not fully load (timeout)")
                await page.screenshot(path=str(screenshot_path), full_page=True)

            await context.close()
            await browser.close()

        # Read back saved source
        html_content = source_path.read_text(encoding="utf-8")

        # Forms analysis
        forms_info = self.analyze_forms(html_content, final_url)
        # Script artifacts
        suspicious_scripts = self.analyze_scripts(html_content, base_url=final_url)
        # Suspicious page text
        flagged_text = self.analyze_text(html_content)
        # Enrichment
        enrichment = enrich_url(url, fetch_ssl_enabled)

        # Global PASS/FAIL table per category (entire document)
        rule_checks_overview = self.build_rule_checks_overview(html_content)
        try:
            for blk in rule_checks_overview:
                current_app.logger.debug(f"[rules] {blk['category']}: {blk['summary']}")
        except Exception:
            pass

        # Assemble the single result dict
        result: Dict[str, Any] = {
            "uuid": run_uuid,
            "submitted_url": url,
            "final_url": final_url,
            "redirects": redirects,
            "downloads": downloads,
            "scripts": scripts_seen,
            "forms": forms_info,
            "suspicious_scripts": suspicious_scripts,
            "suspicious_text": flagged_text,
            "rule_checks": rule_checks_overview,
            "enrichment": enrichment,
        }

        # Persist as the single source of truth for routes
        safe_write(results_path, json.dumps(result, indent=2, ensure_ascii=False))
        try:
            logger.info(f"Saved results.json for run {run_uuid}")
        except Exception:
            pass
        return result


# ---------------------------------------------------------------------------
# Lazy-loaded singleton factory
# ---------------------------------------------------------------------------
# Prefer importing the project-wide singleton decorator.
try:
    from app.utils.settings import singleton_loader  # if the project already exports it
except Exception:
    # Local fallback if the import is not available.
    from typing import Callable, TypeVar
    import functools

    T = TypeVar("T")

    def singleton_loader(func: Callable[..., T]) -> Callable[..., T]:
        """Ensure the function only runs once, returning the cached value."""
        cache: dict[str, T] = {}

        @functools.wraps(func)
        def wrapper(*args, **kwargs) -> T:
            if func.__name__ not in cache:
                cache[func.__name__] = func(*args, **kwargs)
            return cache[func.__name__]
        return wrapper


@singleton_loader
def get_browser() -> Browser:
    """
    Lazily construct and cache a singleton Browser instance.

    Returns:
        Browser: The singleton instance.
    """
    return Browser()
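

# ---------------------------------------------------------------------------
# Usage sketch (illustrative only, not part of this module's API): one way a
# caller might drive the orchestrator. Assumes an active Flask app context and
# an async-capable caller; the `scan_url` name below is hypothetical.
# ---------------------------------------------------------------------------
#
#   from app.utils.browser import get_browser
#
#   async def scan_url(url: str) -> dict:
#       browser = get_browser()
#       result = await browser.fetch_page_artifacts(url, fetch_ssl_enabled=False)
#       # Templates read result["suspicious_text"], result["forms"],
#       # result["suspicious_scripts"], and result["rule_checks"].
#       return result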