feat: on-demand external script analysis + code viewer; refactor form analysis to rule engine

- API: add `POST /api/analyze_script` (app/blueprints/api.py)
  - Fetch one external script to artifacts, run rules, return findings + snippet
  - Uses new ExternalScriptFetcher (results_path aware) and job UUID
  - Returns: { ok, final_url, status_code, bytes, truncated, sha256, artifact_path, findings[], snippet, snippet_len }
  - TODO: document in openapi/openapi.yaml
- Fetcher: update `app/utils/external_fetch.py`
  - Constructed with `results_path` (UUID dir); writes to `<results_path>/scripts/fetched/<index>.js`
  - Loads settings via `get_settings()`, logs via std logging
- UI (results.html):
  - Move “Analyze external script” action into **Content Snippet** column for external rows
  - Clicking replaces button with `<details>` snippet, shows rule matches, and adds “open in viewer” link
  - Robust fetch handler (checks JSON, shows errors); builds viewer URL from absolute artifact path
- Viewer:
  - New route: `GET /view/artifact/<run_uuid>/<path:filename>` (app/blueprints/ui.py)
  - New template: Monaco-based read-only code viewer (viewer.html)
  - Removes SRI on loader to avoid integrity block; loads file via `raw_url` and detects language by extension
- Forms:
  - Refactor `analyze_forms` to mirror scripts analysis:
    - Uses rule engine (`category == "form"`) across regex/function rules
    - Emits rows only when matches exist
    - Includes `content_snippet`, `action`, `method`, `inputs`, `rules`
  - Replace legacy plumbing (`flagged`, `flag_reasons`, `status`) in output
  - Normalize form function rules to canonical returns `(bool, Optional[str])`:
    - `form_action_missing`
    - `form_http_on_https_page`
    - `form_submits_to_different_host`
  - Add minor hardening (lowercasing hosts, no-op actions, clearer reasons)
- CSS: add `.forms-table` to mirror `.scripts-table` (5 columns)
  - Fixed table layout, widths per column, chip/snippet styling, responsive tweaks
- Misc:
  - Fix “working outside app context” issue by avoiding `current_app` at import time (left storage logic inside routes)
  - Add “View Source” link to open page source in viewer

Refs:
- Roadmap: mark “Source code viewer” done; keep TODO to add `/api/analyze_script` to OpenAPI
2025-08-21 15:32:24 -05:00
parent 05cf23ad67
commit 3a24b392f2
15 changed files with 1192 additions and 218 deletions
--- a/app/utils/browser.py
+++ b/app/utils/browser.py
@@ -33,7 +33,7 @@ from flask import current_app
 from playwright.async_api import async_playwright, TimeoutError as PWTimeoutError

 from app.utils.io_helpers import safe_write
-from app.enrichment import enrich_url
+from app.utils.enrichment import enrich_url
 from app.utils.settings import get_settings

 # Load settings once for constants / defaults
@@ -202,85 +202,111 @@ class Browser:
    # -----------------------------------------------------------------------
    # Form & Script analysis (plumbing only; detection is in the rules engine)
    # -----------------------------------------------------------------------
-    def analyze_forms(self, html: str, base_url: str) -> List[Dict[str, Any]]:
+    def analyze_forms(self, html: str, base_url: str = "") -> List[Dict[str, Any]]:
        """
-        Parse forms from the page HTML and apply rule-based checks (engine), keeping
-        only simple plumbing heuristics here (no security logic).
+        Collect form artifacts and evaluate per-form matches via the rules engine.
+        Only include rows that matched at least one rule.

-        Returns list of dicts with keys:
-          - action, method, inputs
-          - flagged (bool), flag_reasons (list[str]), status (str)
-          - rule_checks: {'checks': [...], 'summary': {...}} (per-form snippet evaluation)
+        Returns list of dicts with keys (per matched form):
+        - type: "form"
+        - action, method, inputs
+        - content_snippet: str
+        - rules: List[{name, description, severity?, tags?}]
        """
        soup = BeautifulSoup(html, "lxml")
-        forms_info: List[Dict[str, Any]] = []
-        page_hostname = urlparse(base_url).hostname
+        results: List[Dict[str, Any]] = []
+
+        engine = self._get_rule_engine()
+        base_hostname = urlparse(base_url).hostname or ""
+        # Match how scripts picks preview len
+        try:
+            preview_len = getattr(settings.ui, "snippet_preview_len", 200)  # keep parity with scripts
+        except Exception:
+            preview_len = 200

        for form in soup.find_all("form"):
-            action = form.get("action")
-            method = form.get("method", "get").lower()
+            try:
+                action = (form.get("action") or "").strip()
+                method = (form.get("method") or "get").strip().lower()

-            inputs: List[Dict[str, Any]] = []
-            for inp in form.find_all("input"):
-                input_name = inp.get("name")
-                input_type = inp.get("type", "text")
-                inputs.append({"name": input_name, "type": input_type})
+                inputs: List[Dict[str, Any]] = []
+                for inp in form.find_all("input"):
+                    inputs.append({
+                        "name": inp.get("name"),
+                        "type": (inp.get("type") or "text").strip().lower(),
+                    })

-            flagged_reasons: List[str] = []
+                # Use the actual form markup for regex rules
+                form_markup = str(form)
+                # UI-friendly snippet
+                content_snippet = form_markup[:preview_len]

-            if not action or str(action).strip() == "":
-                flagged_reasons.append("No action specified")
-            else:
+                matches: List[Dict[str, Any]] = []
+                if engine is not None:
+                    for r in getattr(engine, "rules", []):
+                        if getattr(r, "category", None) != "form":
+                            continue
+                        rtype = getattr(r, "rule_type", None)
+
+                        try:
+                            ok = False
+                            reason = ""
+                            if rtype == "regex":
+                                # Run against the raw form HTML
+                                ok, reason = r.run(form_markup)
+                            elif rtype == "function":
+                                # Structured facts for function-style rules
+                                facts = {
+                                    "category": "form",
+                                    "base_url": base_url,
+                                    "base_hostname": base_hostname,
+                                    "action": action,
+                                    "action_hostname": urlparse(action).hostname or "",
+                                    "method": method,
+                                    "inputs": inputs,
+                                    "markup": form_markup,
+                                }
+                                ok, reason = r.run(facts)
+                            else:
+                                continue
+
+                            if ok:
+                                matches.append({
+                                    "name": getattr(r, "name", "unknown_rule"),
+                                    "description": (reason or "") or getattr(r, "description", ""),
+                                    "severity": getattr(r, "severity", None),
+                                    "tags": getattr(r, "tags", None),
+                                })
+                        except Exception as rule_exc:
+                            # Be defensive—bad rule shouldn't break the form pass
+                            try:
+                                self.logger.debug("Form rule error", extra={"rule": getattr(r, "name", "?"), "error": str(rule_exc)})
+                            except Exception:
+                                pass
+                            continue
+
+                if matches:
+                    results.append({
+                        "type": "form",
+                        "action": action,
+                        "method": method,
+                        "inputs": inputs,
+                        "content_snippet": content_snippet,
+                        "rules": matches,
+                    })
+
+            except Exception as exc:
+                # Keep analysis resilient
                try:
-                    action_host = urlparse(action).hostname
-                    if not str(action).startswith("/") and action_host != page_hostname:
-                        flagged_reasons.append("Submits to a different host")
+                    self.logger.error("Form analysis error", extra={"error": str(exc)})
                except Exception:
                    pass
+                results.append({
+                    "type": "form",
+                    "heuristics": [f"Form analysis error: {exc}"],
+                })

-            try:
-                if urlparse(action).scheme == "http" and urlparse(base_url).scheme == "https":
-                    flagged_reasons.append("Submits over insecure HTTP")
-            except Exception:
-                pass
-
-            for hidden in form.find_all("input", type="hidden"):
-                name_value = hidden.get("name") or ""
-                if "password" in name_value.lower():
-                    flagged_reasons.append("Hidden password field")
-
-            flagged = bool(flagged_reasons)
-
-            # Serialize a simple form snippet for rule category='form'
-            snippet_lines = []
-            snippet_lines.append(f"base_url={base_url}")
-            snippet_lines.append(f"base_hostname={page_hostname}")
-            snippet_lines.append(f"action={action}")
-            snippet_lines.append(f"method={method}")
-            snippet_lines.append("inputs=")
-
-            i = 0
-            n = len(inputs)
-            while i < n:
-                item = inputs[i]
-                snippet_lines.append(f"  - name={item.get('name')} type={item.get('type')}")
-                i = i + 1
-            form_snippet = "\n".join(snippet_lines)
-
-            # Per-form rule checks (PASS/FAIL list via engine)
-            rule_checks = self.run_rule_checks(form_snippet, category="form")
-
-            forms_info.append({
-                "action": action,
-                "method": method,
-                "inputs": inputs,
-                "flagged": flagged,
-                "flag_reasons": flagged_reasons,
-                "status": "flagged" if flagged else "possibly safe",
-                "rule_checks": rule_checks
-            })
-
-        return forms_info
+        return results

    def analyze_scripts(self, html: str, base_url: str = "") -> List[Dict[str, Any]]:
        """
@@ -370,7 +396,7 @@ class Browser:

        Writes:
          - /data/<uuid>/screenshot.png
-          - /data/<uuid>/source.txt
+          - /data/<uuid>/source.html
          - /data/<uuid>/results.json  (single source of truth for routes)

        Returns:
@@ -381,7 +407,7 @@ class Browser:
        run_dir.mkdir(parents=True, exist_ok=True)

        screenshot_path = run_dir / "screenshot.png"
-        source_path = run_dir / "source.txt"
+        source_path = run_dir / "source.html"
        results_path = run_dir / "results.json"

        redirects: List[Dict[str, Any]] = []