feat: on-demand external script analysis + code viewer; refactor form analysis to rule engine
- API: add `POST /api/analyze_script` (app/blueprints/api.py)
- Fetch one external script to artifacts, run rules, return findings + snippet
- Uses new ExternalScriptFetcher (results_path aware) and job UUID
- Returns: { ok, final_url, status_code, bytes, truncated, sha256, artifact_path, findings[], snippet, snippet_len }
- TODO: document in openapi/openapi.yaml
- Fetcher: update `app/utils/external_fetch.py`
- Constructed with `results_path` (UUID dir); writes to `<results_path>/scripts/fetched/<index>.js`
- Loads settings via `get_settings()`, logs via std logging
- UI (results.html):
- Move “Analyze external script” action into **Content Snippet** column for external rows
- Clicking replaces button with `<details>` snippet, shows rule matches, and adds “open in viewer” link
- Robust fetch handler (checks JSON, shows errors); builds viewer URL from absolute artifact path
- Viewer:
- New route: `GET /view/artifact/<run_uuid>/<path:filename>` (app/blueprints/ui.py)
- New template: Monaco-based read-only code viewer (viewer.html)
- Removes the Subresource Integrity (SRI) attribute on the loader so the browser's integrity check does not block it; loads the file via `raw_url` and detects the language by file extension
- Forms:
- Refactor `analyze_forms` to mirror scripts analysis:
- Uses rule engine (`category == "form"`) across regex/function rules
- Emits rows only when matches exist
- Includes `content_snippet`, `action`, `method`, `inputs`, `rules`
- Replace legacy plumbing (`flagged`, `flag_reasons`, `status`) in output
- Normalize form function rules to return the canonical `(bool, Optional[str])` tuple:
- `form_action_missing`
- `form_http_on_https_page`
- `form_submits_to_different_host`
- Add minor hardening (lowercasing hosts, no-op actions, clearer reasons)
- CSS: add `.forms-table` to mirror `.scripts-table` (5 columns)
- Fixed table layout, widths per column, chip/snippet styling, responsive tweaks
- Misc:
- Fix the “working outside app context” error by not accessing `current_app` at import time (storage logic stays inside the routes)
- Add “View Source” link to open page source in viewer
Refs:
- Roadmap: mark “Source code viewer” done; keep TODO to add `/api/analyze_script` to OpenAPI
This commit is contained in:
@@ -33,7 +33,7 @@ from flask import current_app
|
||||
from playwright.async_api import async_playwright, TimeoutError as PWTimeoutError
|
||||
|
||||
from app.utils.io_helpers import safe_write
|
||||
from app.enrichment import enrich_url
|
||||
from app.utils.enrichment import enrich_url
|
||||
from app.utils.settings import get_settings
|
||||
|
||||
# Load settings once for constants / defaults
|
||||
@@ -202,85 +202,111 @@ class Browser:
|
||||
# -----------------------------------------------------------------------
|
||||
# Form & Script analysis (plumbing only; detection is in the rules engine)
|
||||
# -----------------------------------------------------------------------
|
||||
def analyze_forms(self, html: str, base_url: str) -> List[Dict[str, Any]]:
|
||||
def analyze_forms(self, html: str, base_url: str = "") -> List[Dict[str, Any]]:
|
||||
"""
|
||||
Parse forms from the page HTML and apply rule-based checks (engine), keeping
|
||||
only simple plumbing heuristics here (no security logic).
|
||||
Collect form artifacts and evaluate per-form matches via the rules engine.
|
||||
Only include rows that matched at least one rule.
|
||||
|
||||
Returns list of dicts with keys:
|
||||
- action, method, inputs
|
||||
- flagged (bool), flag_reasons (list[str]), status (str)
|
||||
- rule_checks: {'checks': [...], 'summary': {...}} (per-form snippet evaluation)
|
||||
Returns list of dicts with keys (per matched form):
|
||||
- type: "form"
|
||||
- action, method, inputs
|
||||
- content_snippet: str
|
||||
- rules: List[{name, description, severity?, tags?}]
|
||||
"""
|
||||
soup = BeautifulSoup(html, "lxml")
|
||||
forms_info: List[Dict[str, Any]] = []
|
||||
page_hostname = urlparse(base_url).hostname
|
||||
results: List[Dict[str, Any]] = []
|
||||
|
||||
engine = self._get_rule_engine()
|
||||
base_hostname = urlparse(base_url).hostname or ""
|
||||
# Match how scripts picks preview len
|
||||
try:
|
||||
preview_len = getattr(settings.ui, "snippet_preview_len", 200) # keep parity with scripts
|
||||
except Exception:
|
||||
preview_len = 200
|
||||
|
||||
for form in soup.find_all("form"):
|
||||
action = form.get("action")
|
||||
method = form.get("method", "get").lower()
|
||||
try:
|
||||
action = (form.get("action") or "").strip()
|
||||
method = (form.get("method") or "get").strip().lower()
|
||||
|
||||
inputs: List[Dict[str, Any]] = []
|
||||
for inp in form.find_all("input"):
|
||||
input_name = inp.get("name")
|
||||
input_type = inp.get("type", "text")
|
||||
inputs.append({"name": input_name, "type": input_type})
|
||||
inputs: List[Dict[str, Any]] = []
|
||||
for inp in form.find_all("input"):
|
||||
inputs.append({
|
||||
"name": inp.get("name"),
|
||||
"type": (inp.get("type") or "text").strip().lower(),
|
||||
})
|
||||
|
||||
flagged_reasons: List[str] = []
|
||||
# Use the actual form markup for regex rules
|
||||
form_markup = str(form)
|
||||
# UI-friendly snippet
|
||||
content_snippet = form_markup[:preview_len]
|
||||
|
||||
if not action or str(action).strip() == "":
|
||||
flagged_reasons.append("No action specified")
|
||||
else:
|
||||
matches: List[Dict[str, Any]] = []
|
||||
if engine is not None:
|
||||
for r in getattr(engine, "rules", []):
|
||||
if getattr(r, "category", None) != "form":
|
||||
continue
|
||||
rtype = getattr(r, "rule_type", None)
|
||||
|
||||
try:
|
||||
ok = False
|
||||
reason = ""
|
||||
if rtype == "regex":
|
||||
# Run against the raw form HTML
|
||||
ok, reason = r.run(form_markup)
|
||||
elif rtype == "function":
|
||||
# Structured facts for function-style rules
|
||||
facts = {
|
||||
"category": "form",
|
||||
"base_url": base_url,
|
||||
"base_hostname": base_hostname,
|
||||
"action": action,
|
||||
"action_hostname": urlparse(action).hostname or "",
|
||||
"method": method,
|
||||
"inputs": inputs,
|
||||
"markup": form_markup,
|
||||
}
|
||||
ok, reason = r.run(facts)
|
||||
else:
|
||||
continue
|
||||
|
||||
if ok:
|
||||
matches.append({
|
||||
"name": getattr(r, "name", "unknown_rule"),
|
||||
"description": (reason or "") or getattr(r, "description", ""),
|
||||
"severity": getattr(r, "severity", None),
|
||||
"tags": getattr(r, "tags", None),
|
||||
})
|
||||
except Exception as rule_exc:
|
||||
# Be defensive—bad rule shouldn't break the form pass
|
||||
try:
|
||||
self.logger.debug("Form rule error", extra={"rule": getattr(r, "name", "?"), "error": str(rule_exc)})
|
||||
except Exception:
|
||||
pass
|
||||
continue
|
||||
|
||||
if matches:
|
||||
results.append({
|
||||
"type": "form",
|
||||
"action": action,
|
||||
"method": method,
|
||||
"inputs": inputs,
|
||||
"content_snippet": content_snippet,
|
||||
"rules": matches,
|
||||
})
|
||||
|
||||
except Exception as exc:
|
||||
# Keep analysis resilient
|
||||
try:
|
||||
action_host = urlparse(action).hostname
|
||||
if not str(action).startswith("/") and action_host != page_hostname:
|
||||
flagged_reasons.append("Submits to a different host")
|
||||
self.logger.error("Form analysis error", extra={"error": str(exc)})
|
||||
except Exception:
|
||||
pass
|
||||
results.append({
|
||||
"type": "form",
|
||||
"heuristics": [f"Form analysis error: {exc}"],
|
||||
})
|
||||
|
||||
try:
|
||||
if urlparse(action).scheme == "http" and urlparse(base_url).scheme == "https":
|
||||
flagged_reasons.append("Submits over insecure HTTP")
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
for hidden in form.find_all("input", type="hidden"):
|
||||
name_value = hidden.get("name") or ""
|
||||
if "password" in name_value.lower():
|
||||
flagged_reasons.append("Hidden password field")
|
||||
|
||||
flagged = bool(flagged_reasons)
|
||||
|
||||
# Serialize a simple form snippet for rule category='form'
|
||||
snippet_lines = []
|
||||
snippet_lines.append(f"base_url={base_url}")
|
||||
snippet_lines.append(f"base_hostname={page_hostname}")
|
||||
snippet_lines.append(f"action={action}")
|
||||
snippet_lines.append(f"method={method}")
|
||||
snippet_lines.append("inputs=")
|
||||
|
||||
i = 0
|
||||
n = len(inputs)
|
||||
while i < n:
|
||||
item = inputs[i]
|
||||
snippet_lines.append(f" - name={item.get('name')} type={item.get('type')}")
|
||||
i = i + 1
|
||||
form_snippet = "\n".join(snippet_lines)
|
||||
|
||||
# Per-form rule checks (PASS/FAIL list via engine)
|
||||
rule_checks = self.run_rule_checks(form_snippet, category="form")
|
||||
|
||||
forms_info.append({
|
||||
"action": action,
|
||||
"method": method,
|
||||
"inputs": inputs,
|
||||
"flagged": flagged,
|
||||
"flag_reasons": flagged_reasons,
|
||||
"status": "flagged" if flagged else "possibly safe",
|
||||
"rule_checks": rule_checks
|
||||
})
|
||||
|
||||
return forms_info
|
||||
return results
|
||||
|
||||
def analyze_scripts(self, html: str, base_url: str = "") -> List[Dict[str, Any]]:
|
||||
"""
|
||||
@@ -370,7 +396,7 @@ class Browser:
|
||||
|
||||
Writes:
|
||||
- /data/<uuid>/screenshot.png
|
||||
- /data/<uuid>/source.txt
|
||||
- /data/<uuid>/source.html
|
||||
- /data/<uuid>/results.json (single source of truth for routes)
|
||||
|
||||
Returns:
|
||||
@@ -381,7 +407,7 @@ class Browser:
|
||||
run_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
screenshot_path = run_dir / "screenshot.png"
|
||||
source_path = run_dir / "source.txt"
|
||||
source_path = run_dir / "source.html"
|
||||
results_path = run_dir / "results.json"
|
||||
|
||||
redirects: List[Dict[str, Any]] = []
|
||||
|
||||
Reference in New Issue
Block a user