feat: on-demand external script analysis + code viewer; refactor form analysis to rule engine

- API: add `POST /api/analyze_script` (app/blueprints/api.py; example call below)
  - Fetch one external script to artifacts, run rules, return findings + snippet
  - Uses new ExternalScriptFetcher (results_path aware) and job UUID
  - Returns: { ok, final_url, status_code, bytes, truncated, sha256, artifact_path, findings[], snippet, snippet_len }
  - TODO: document in openapi/openapi.yaml
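
As an illustration of the endpoint above, a minimal client call; the request fields (`run_uuid`, `script_url`, `index`) are assumptions, only the response keys are documented here:

```python
# Hedged sketch of calling POST /api/analyze_script; the request field names
# (run_uuid, script_url, index) are assumed, not confirmed by this commit.
import requests

resp = requests.post(
    "http://localhost:5000/api/analyze_script",
    json={
        "run_uuid": "1b2e3c4d-0000-0000-0000-000000000000",  # illustrative job UUID
        "script_url": "https://cdn.example.com/app.js",
        "index": 0,
    },
    timeout=15,
)
data = resp.json()
if data.get("ok"):
    print(data["final_url"], data["status_code"], data["bytes"], data["sha256"])
    for finding in data["findings"]:
        print("rule match:", finding)
else:
    print("analysis failed:", data)
```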

- Fetcher: update `app/utils/external_fetch.py`
  - Constructed with `results_path` (UUID dir); writes to `<results_path>/scripts/fetched/<index>.js`
  - Loads settings via `get_settings()`, logs via std logging

- UI (results.html):
  - Move “Analyze external script” action into **Content Snippet** column for external rows
  - Clicking replaces button with `<details>` snippet, shows rule matches, and adds “open in viewer” link
  - Robust fetch handler (checks JSON, shows errors); builds viewer URL from absolute artifact path

- Viewer (route sketch below):
  - New route: `GET /view/artifact/<run_uuid>/<path:filename>` (app/blueprints/ui.py)
  - New template: Monaco-based read-only code viewer (viewer.html)
  - Removes SRI from the loader tag to avoid integrity blocking; loads the file via `raw_url` and detects language by extension
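
A minimal sketch of what the viewer route can look like (this is not the shipped app/blueprints/ui.py; `DATA_ROOT` and the `/raw/...` URL shape are assumptions):

```python
# Hypothetical viewer route; a path-traversal guard is included because the
# route accepts <path:filename>. DATA_ROOT and raw_url are assumed names.
from pathlib import Path
from flask import Blueprint, abort, render_template

ui = Blueprint("ui", __name__)
DATA_ROOT = Path("/data")  # assumed artifact root

@ui.route("/view/artifact/<run_uuid>/<path:filename>")
def view_artifact(run_uuid: str, filename: str):
    run_dir = (DATA_ROOT / run_uuid).resolve()
    target = (run_dir / filename).resolve()
    if run_dir not in target.parents:  # reject "../" escapes
        abort(403)
    if not target.is_file():
        abort(404)
    # viewer.html fetches the file body itself via raw_url (Monaco, read-only)
    raw_url = f"/raw/{run_uuid}/{filename}"  # assumed raw-file endpoint
    return render_template("viewer.html", raw_url=raw_url, filename=filename)
```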

- Forms:
  - Refactor `analyze_forms` to mirror scripts analysis:
    - Uses rule engine (`category == "form"`) across regex/function rules
    - Emits rows only when matches exist
    - Includes `content_snippet`, `action`, `method`, `inputs`, `rules`
  - Replace legacy plumbing (`flagged`, `flag_reasons`, `status`) in output
  - Normalize form function rules to a canonical `(bool, Optional[str])` return (see the sketch after this list):
    - `form_action_missing`
    - `form_http_on_https_page`
    - `form_submits_to_different_host`
    - Add minor hardening (lowercasing hosts, no-op actions, clearer reasons)
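
For reference, a sketch of the canonical rule shape, assuming the `facts` dict built in `analyze_forms`; this mirrors, rather than reproduces, the shipped `form_submits_to_different_host`:

```python
# Sketch only: demonstrates the canonical (bool, Optional[str]) contract
# against the facts dict analyze_forms builds; not the shipped rule verbatim.
from typing import Any, Dict, Optional, Tuple
from urllib.parse import urlparse

def form_submits_to_different_host(facts: Dict[str, Any]) -> Tuple[bool, Optional[str]]:
    action = (facts.get("action") or "").strip()
    if not action or action.startswith(("/", "#")):
        return (False, None)  # relative or no-op actions stay on the page's host
    action_host = (urlparse(action).hostname or "").lower()
    base_host = (facts.get("base_hostname") or "").lower()
    if action_host and base_host and action_host != base_host:
        return (True, f"Form submits to {action_host}, but page is on {base_host}")
    return (False, None)
```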

- CSS: add `.forms-table` to mirror `.scripts-table` (5 columns)
  - Fixed table layout, widths per column, chip/snippet styling, responsive tweaks

- Misc:
  - Fix “working outside app context” error by not referencing `current_app` at import time (storage logic stays inside routes; see the sketch below)
  - Add “View Source” link to open page source in viewer
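
The app-context fix follows this pattern (blueprint name and config key are illustrative, not the app's actual ones):

```python
# Sketch of the pattern: resolve current_app inside request handlers, never
# at import time. The RESULTS_ROOT config key is illustrative.
from flask import Blueprint, current_app, jsonify

api = Blueprint("api", __name__)

# This would raise "Working outside of application context", because it
# runs at import time before any app exists:
#   RESULTS_ROOT = current_app.config["RESULTS_ROOT"]

@api.route("/api/analyze_script", methods=["POST"])
def analyze_script():
    # Safe: current_app is a proxy resolved per-request, inside an app context.
    results_root = current_app.config.get("RESULTS_ROOT", "/data")
    return jsonify({"ok": True, "results_root": results_root})
```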

Refs:
- Roadmap: mark “Source code viewer” done; keep TODO to add `/api/analyze_script` to OpenAPI
Commit 3a24b392f2 (parent 05cf23ad67)
Date: 2025-08-21 15:32:24 -05:00
15 changed files with 1192 additions and 218 deletions


@@ -33,7 +33,7 @@ from flask import current_app
 from playwright.async_api import async_playwright, TimeoutError as PWTimeoutError
 from app.utils.io_helpers import safe_write
-from app.enrichment import enrich_url
+from app.utils.enrichment import enrich_url
 from app.utils.settings import get_settings

 # Load settings once for constants / defaults
@@ -202,85 +202,111 @@ class Browser:
     # -----------------------------------------------------------------------
     # Form & Script analysis (plumbing only; detection is in the rules engine)
     # -----------------------------------------------------------------------
-    def analyze_forms(self, html: str, base_url: str) -> List[Dict[str, Any]]:
+    def analyze_forms(self, html: str, base_url: str = "") -> List[Dict[str, Any]]:
         """
-        Parse forms from the page HTML and apply rule-based checks (engine), keeping
-        only simple plumbing heuristics here (no security logic).
-        Returns list of dicts with keys:
-        - action, method, inputs
-        - flagged (bool), flag_reasons (list[str]), status (str)
-        - rule_checks: {'checks': [...], 'summary': {...}} (per-form snippet evaluation)
+        Collect form artifacts and evaluate per-form matches via the rules engine.
+        Only include rows that matched at least one rule.
+        Returns list of dicts with keys (per matched form):
+        - type: "form"
+        - action, method, inputs
+        - content_snippet: str
+        - rules: List[{name, description, severity?, tags?}]
         """
         soup = BeautifulSoup(html, "lxml")
-        forms_info: List[Dict[str, Any]] = []
-        page_hostname = urlparse(base_url).hostname
+        results: List[Dict[str, Any]] = []
+        engine = self._get_rule_engine()
+        base_hostname = urlparse(base_url).hostname or ""
+        # Match how scripts picks preview len
+        try:
+            preview_len = getattr(settings.ui, "snippet_preview_len", 200)  # keep parity with scripts
+        except Exception:
+            preview_len = 200
         for form in soup.find_all("form"):
-            action = form.get("action")
-            method = form.get("method", "get").lower()
-            inputs: List[Dict[str, Any]] = []
-            for inp in form.find_all("input"):
-                input_name = inp.get("name")
-                input_type = inp.get("type", "text")
-                inputs.append({"name": input_name, "type": input_type})
-            flagged_reasons: List[str] = []
-            if not action or str(action).strip() == "":
-                flagged_reasons.append("No action specified")
-            else:
-                try:
-                    action_host = urlparse(action).hostname
-                    if not str(action).startswith("/") and action_host != page_hostname:
-                        flagged_reasons.append("Submits to a different host")
-                except Exception:
-                    pass
-                try:
-                    if urlparse(action).scheme == "http" and urlparse(base_url).scheme == "https":
-                        flagged_reasons.append("Submits over insecure HTTP")
-                except Exception:
-                    pass
-            for hidden in form.find_all("input", type="hidden"):
-                name_value = hidden.get("name") or ""
-                if "password" in name_value.lower():
-                    flagged_reasons.append("Hidden password field")
-            flagged = bool(flagged_reasons)
-            # Serialize a simple form snippet for rule category='form'
-            snippet_lines = []
-            snippet_lines.append(f"base_url={base_url}")
-            snippet_lines.append(f"base_hostname={page_hostname}")
-            snippet_lines.append(f"action={action}")
-            snippet_lines.append(f"method={method}")
-            snippet_lines.append("inputs=")
-            i = 0
-            n = len(inputs)
-            while i < n:
-                item = inputs[i]
-                snippet_lines.append(f" - name={item.get('name')} type={item.get('type')}")
-                i = i + 1
-            form_snippet = "\n".join(snippet_lines)
-            # Per-form rule checks (PASS/FAIL list via engine)
-            rule_checks = self.run_rule_checks(form_snippet, category="form")
-            forms_info.append({
-                "action": action,
-                "method": method,
-                "inputs": inputs,
-                "flagged": flagged,
-                "flag_reasons": flagged_reasons,
-                "status": "flagged" if flagged else "possibly safe",
-                "rule_checks": rule_checks
-            })
-        return forms_info
+            try:
+                action = (form.get("action") or "").strip()
+                method = (form.get("method") or "get").strip().lower()
+                inputs: List[Dict[str, Any]] = []
+                for inp in form.find_all("input"):
+                    inputs.append({
+                        "name": inp.get("name"),
+                        "type": (inp.get("type") or "text").strip().lower(),
+                    })
+                # Use the actual form markup for regex rules
+                form_markup = str(form)
+                # UI-friendly snippet
+                content_snippet = form_markup[:preview_len]
+                matches: List[Dict[str, Any]] = []
+                if engine is not None:
+                    for r in getattr(engine, "rules", []):
+                        if getattr(r, "category", None) != "form":
+                            continue
+                        rtype = getattr(r, "rule_type", None)
+                        try:
+                            ok = False
+                            reason = ""
+                            if rtype == "regex":
+                                # Run against the raw form HTML
+                                ok, reason = r.run(form_markup)
+                            elif rtype == "function":
+                                # Structured facts for function-style rules
+                                facts = {
+                                    "category": "form",
+                                    "base_url": base_url,
+                                    "base_hostname": base_hostname,
+                                    "action": action,
+                                    "action_hostname": urlparse(action).hostname or "",
+                                    "method": method,
+                                    "inputs": inputs,
+                                    "markup": form_markup,
+                                }
+                                ok, reason = r.run(facts)
+                            else:
+                                continue
+                            if ok:
+                                matches.append({
+                                    "name": getattr(r, "name", "unknown_rule"),
+                                    "description": (reason or "") or getattr(r, "description", ""),
+                                    "severity": getattr(r, "severity", None),
+                                    "tags": getattr(r, "tags", None),
+                                })
+                        except Exception as rule_exc:
+                            # Be defensive—bad rule shouldn't break the form pass
+                            try:
+                                self.logger.debug("Form rule error", extra={"rule": getattr(r, "name", "?"), "error": str(rule_exc)})
+                            except Exception:
+                                pass
+                            continue
+                if matches:
+                    results.append({
+                        "type": "form",
+                        "action": action,
+                        "method": method,
+                        "inputs": inputs,
+                        "content_snippet": content_snippet,
+                        "rules": matches,
+                    })
+            except Exception as exc:
+                # Keep analysis resilient
+                try:
+                    self.logger.error("Form analysis error", extra={"error": str(exc)})
+                except Exception:
+                    pass
+                results.append({
+                    "type": "form",
+                    "heuristics": [f"Form analysis error: {exc}"],
+                })
+        return results

     def analyze_scripts(self, html: str, base_url: str = "") -> List[Dict[str, Any]]:
         """
@@ -370,7 +396,7 @@ class Browser:
         Writes:
           - /data/<uuid>/screenshot.png
-          - /data/<uuid>/source.txt
+          - /data/<uuid>/source.html
          - /data/<uuid>/results.json (single source of truth for routes)
         Returns:
@@ -381,7 +407,7 @@ class Browser:
         run_dir.mkdir(parents=True, exist_ok=True)
         screenshot_path = run_dir / "screenshot.png"
-        source_path = run_dir / "source.txt"
+        source_path = run_dir / "source.html"
         results_path = run_dir / "results.json"
         redirects: List[Dict[str, Any]] = []


@@ -9,8 +9,8 @@ from ipaddress import ip_address
 import socket

 # Local imports
-from .utils.cache_db import get_cache
-from .utils.settings import get_settings
+from app.utils.cache_db import get_cache
+from app.utils.settings import get_settings

 # Configure logging
 logging.basicConfig(level=logging.INFO, format="[%(levelname)s] %(message)s")
@@ -39,9 +39,6 @@ def enrich_url(url: str) -> dict:
     # --- GeoIP ---
     result["geoip"] = enrich_geoip(hostname)
     # --- BEC Words ---
     result["bec_words"] = [w for w in BEC_WORDS if w.lower() in url.lower()]
     return result


@@ -0,0 +1,338 @@
# sneakyscope/app/utils/external_fetch.py
import hashlib
import os
import logging
from dataclasses import dataclass
from typing import Optional, Tuple, List
from urllib.parse import urljoin, urlparse

import requests

from app.utils.settings import get_settings

settings = get_settings()

_ALLOWED_SCHEMES = {"http", "https"}


@dataclass
class FetchResult:
    """
    Outcome for a single external script fetch.
    """
    ok: bool
    reason: str
    source_url: str
    final_url: str
    status_code: Optional[int]
    content_type: Optional[str]
    bytes_fetched: int
    truncated: bool
    sha256_hex: Optional[str]
    saved_path: Optional[str]


class ExternalScriptFetcher:
    """
    Minimal, safe-by-default fetcher for external JS files.

    Notes / assumptions:
    - All artifacts for this run live under the UUID-backed `results_path` you pass in.
    - Saves bytes to: <results_path>/<index>.js
    - Manual redirects up to `max_redirects`.
    - Streaming with a hard byte cap derived from `max_total_mb`.
    - Never raises network exceptions to callers; failures are encoded in FetchResult.
    - Settings are read from get_settings().external_fetch with sane defaults.
    """

    def __init__(self, results_path: str, session: Optional[requests.Session] = None):
        """
        Args:
            results_path: Absolute path to the run's UUID directory (e.g., /data/<run_uuid>).
            session: Optional requests.Session to reuse connections; a new one is created if not provided.
        """
        # Derived value: MiB -> bytes
        self.max_total_bytes: int = settings.external_fetch.max_total_mb * 1024 * 1024
        # Logger
        self.logger = logging.getLogger(__file__)
        # Where to write artifacts for this job/run (UUID directory)
        self.results_path = results_path
        # HTTP session with a predictable UA
        self.session = session or requests.Session()
        self.session.headers.update({"User-Agent": "SneakyScope/1.0"})

    # -------------------------
    # Internal helper methods
    # -------------------------
    def _timeout(self) -> Tuple[float, float]:
        """
        Compute (connect_timeout, read_timeout) in seconds from max_time_ms.
        Keeps a conservative split so either phase gets a fair chance.
        """
        total = max(0.1, settings.external_fetch.max_time_ms / 1000.0)
        connect = min(1.5, total * 0.5)  # cap connect timeout
        read = max(0.5, total * 0.5)  # floor read timeout
        return (connect, read)

    def _scheme_allowed(self, url: str) -> bool:
        """
        Return True if URL uses an allowed scheme (http/https).
        """
        scheme = (urlparse(url).scheme or "").lower()
        return scheme in _ALLOWED_SCHEMES

    def _artifact_path(self, index: int) -> str:
        """
        Build an output path like:
            <results_path>/<index>.js
        Ensures the directory exists.
        """
        base_dir = os.path.join(self.results_path)
        # Make sure parent directories exist (idempotent)
        os.makedirs(base_dir, exist_ok=True)
        filename = f"{index}.js"
        return os.path.join(base_dir, filename)

    # -------------------------
    # Public API
    # -------------------------
    def fetch_one(self, script_url: str, index: int) -> FetchResult:
        """
        Fetch exactly one external script with manual redirect handling and a hard per-file byte cap.

        Args:
            script_url: The script URL to retrieve.
            index: Numeric index used solely for naming the artifact file (<index>.js).

        Returns:
            FetchResult with status, metadata, and saved path (if successful).
        """
        # Feature gate: allow callers to rely on a consistent failure when globally disabled.
        if not settings.external_fetch.enabled:
            return FetchResult(
                ok=False,
                reason="Feature disabled",
                source_url=script_url,
                final_url=script_url,
                status_code=None,
                content_type=None,
                bytes_fetched=0,
                truncated=False,
                sha256_hex=None,
                saved_path=None,
            )

        # Scheme guard: refuse anything not http/https in this v1.
        if not self._scheme_allowed(script_url):
            return FetchResult(
                ok=False,
                reason="Scheme not allowed",
                source_url=script_url,
                final_url=script_url,
                status_code=None,
                content_type=None,
                bytes_fetched=0,
                truncated=False,
                sha256_hex=None,
                saved_path=None,
            )

        current_url = script_url
        status_code: Optional[int] = None
        content_type: Optional[str] = None
        redirects_followed = 0

        # Manual redirect loop so we can enforce max_redirects precisely.
        while True:
            try:
                resp = self.session.get(
                    current_url,
                    stream=True,
                    allow_redirects=False,
                    timeout=self._timeout(),
                )
            except requests.exceptions.Timeout:
                return FetchResult(
                    ok=False,
                    reason="Timeout",
                    source_url=script_url,
                    final_url=current_url,
                    status_code=status_code,
                    content_type=content_type,
                    bytes_fetched=0,
                    truncated=False,
                    sha256_hex=None,
                    saved_path=None,
                )
            except requests.exceptions.RequestException as e:
                return FetchResult(
                    ok=False,
                    reason=f"Network error: {e.__class__.__name__}",
                    source_url=script_url,
                    final_url=current_url,
                    status_code=status_code,
                    content_type=content_type,
                    bytes_fetched=0,
                    truncated=False,
                    sha256_hex=None,
                    saved_path=None,
                )

            status_code = resp.status_code
            content_type = resp.headers.get("Content-Type")

            # Handle redirects explicitly (3xx with Location)
            if status_code in (301, 302, 303, 307, 308) and "Location" in resp.headers:
                if redirects_followed >= settings.external_fetch.max_redirects:
                    return FetchResult(
                        ok=False,
                        reason="Max redirects exceeded",
                        source_url=script_url,
                        final_url=current_url,
                        status_code=status_code,
                        content_type=content_type,
                        bytes_fetched=0,
                        truncated=False,
                        sha256_hex=None,
                        saved_path=None,
                    )
                next_url = urljoin(current_url, resp.headers["Location"])
                if not self._scheme_allowed(next_url):
                    return FetchResult(
                        ok=False,
                        reason="Redirect to disallowed scheme",
                        source_url=script_url,
                        final_url=next_url,
                        status_code=status_code,
                        content_type=content_type,
                        bytes_fetched=0,
                        truncated=False,
                        sha256_hex=None,
                        saved_path=None,
                    )
                current_url = next_url
                redirects_followed += 1
                # Loop to follow next hop
                continue

            # Not a redirect: stream response body with a hard byte cap.
            cap = self.max_total_bytes
            total = 0
            truncated = False
            chunks: List[bytes] = []
            try:
                for chunk in resp.iter_content(chunk_size=8192):
                    if not chunk:
                        # Skip keep-alive chunks
                        continue
                    new_total = total + len(chunk)
                    if new_total > cap:
                        # Only keep what fits and stop
                        remaining = cap - total
                        if remaining > 0:
                            chunks.append(chunk[:remaining])
                            total += remaining
                        truncated = True
                        break
                    chunks.append(chunk)
                    total = new_total
            except requests.exceptions.Timeout:
                return FetchResult(
                    ok=False,
                    reason="Timeout while reading",
                    source_url=script_url,
                    final_url=current_url,
                    status_code=status_code,
                    content_type=content_type,
                    bytes_fetched=total,
                    truncated=truncated,
                    sha256_hex=None,
                    saved_path=None,
                )
            except requests.exceptions.RequestException as e:
                return FetchResult(
                    ok=False,
                    reason=f"Network error while reading: {e.__class__.__name__}",
                    source_url=script_url,
                    final_url=current_url,
                    status_code=status_code,
                    content_type=content_type,
                    bytes_fetched=total,
                    truncated=truncated,
                    sha256_hex=None,
                    saved_path=None,
                )

            data = b"".join(chunks)
            if not data:
                return FetchResult(
                    ok=False,
                    reason="Empty response",
                    source_url=script_url,
                    final_url=current_url,
                    status_code=status_code,
                    content_type=content_type,
                    bytes_fetched=0,
                    truncated=False,
                    sha256_hex=None,
                    saved_path=None,
                )

            # Persist to <results_path>/<index>.js
            out_path = self._artifact_path(index)
            try:
                with open(out_path, "wb") as f:
                    f.write(data)
            except OSError as e:
                return FetchResult(
                    ok=False,
                    reason=f"Write error: {e.__class__.__name__}",
                    source_url=script_url,
                    final_url=current_url,
                    status_code=status_code,
                    content_type=content_type,
                    bytes_fetched=total,
                    truncated=truncated,
                    sha256_hex=None,
                    saved_path=None,
                )

            sha256_hex = hashlib.sha256(data).hexdigest()

            # Structured log line for visibility/metrics
            try:
                self.logger.info(
                    "External script fetched",
                    extra={
                        "source_url": script_url,
                        "final_url": current_url,
                        "status": status_code,
                        "bytes": total,
                        "truncated": truncated,
                        "sha256": sha256_hex,
                        "saved_path": out_path,
                    },
                )
            except Exception:
                # Logging should never break the pipeline
                pass

            return FetchResult(
                ok=True,
                reason="OK",
                source_url=script_url,
                final_url=current_url,
                status_code=status_code,
                content_type=content_type,
                bytes_fetched=total,
                truncated=truncated,
                sha256_hex=sha256_hex,
                saved_path=out_path,
            )
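
Assuming the layout from the commit notes (`<run_dir>/scripts/fetched`), a short usage sketch of the class above:

```python
# Illustrative driver; the run directory here is made up.
from app.utils.external_fetch import ExternalScriptFetcher

fetcher = ExternalScriptFetcher(results_path="/data/1b2e3c4d/scripts/fetched")
result = fetcher.fetch_one("https://cdn.example.com/app.js", index=0)
if result.ok:
    print(f"saved {result.bytes_fetched} bytes to {result.saved_path}")
    print(f"sha256={result.sha256_hex} truncated={result.truncated}")
else:
    # Failures never raise; they come back encoded in FetchResult.
    print(f"fetch failed: {result.reason} (status={result.status_code})")
```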


@@ -39,6 +39,14 @@ BASE_DIR = Path(__file__).resolve().parent.parent
 DEFAULT_SETTINGS_FILE = BASE_DIR / "config" / "settings.yaml"

 # ---------- CONFIG DATA CLASSES ----------
+@dataclass
+class External_FetchConfig:
+    enabled: bool = True
+    max_total_mb: int = 5
+    max_time_ms: int = 3000
+    max_redirects: int = 3
+    concurrency: int = 3
+
 @dataclass
 class UIConfig:
     snippet_preview_len: int = 160
@@ -61,6 +69,7 @@ class AppConfig:
 class Settings:
     cache: Cache_Config = field(default_factory=Cache_Config)
     ui: UIConfig = field(default_factory=UIConfig)
+    external_fetch: External_FetchConfig = field(default_factory=External_FetchConfig)
     app: AppConfig = field(default_factory=AppConfig)

     @classmethod
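
A quick read of the new block (assuming `get_settings()` wires `External_FetchConfig` as shown above):

```python
# Assumes get_settings() populates the external_fetch field shown above.
from app.utils.settings import get_settings

s = get_settings()
print(s.external_fetch.enabled)       # True by default
print(s.external_fetch.max_total_mb)  # 5 -> the fetcher's 5 MiB per-file cap
print(s.external_fetch.max_time_ms)   # 3000 -> split into (connect, read) timeouts
```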