feat: on-demand external script analysis + code viewer; refactor form analysis to rule engine

- API: add `POST /api/analyze_script` (app/blueprints/api.py; example call below)
  - Fetch one external script to artifacts, run rules, return findings + snippet
  - Uses new ExternalScriptFetcher (results_path aware) and job UUID
  - Returns: { ok, final_url, status_code, bytes, truncated, sha256, artifact_path, findings[], snippet, snippet_len }
  - TODO: document in openapi/openapi.yaml
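
As an illustration of the endpoint above, a minimal client call; the request fields (`run_uuid`, `script_url`, `index`) are assumptions, only the response keys are documented here:

```python
# Hedged sketch of calling POST /api/analyze_script; the request field names
# (run_uuid, script_url, index) are assumed, not confirmed by this commit.
import requests

resp = requests.post(
    "http://localhost:5000/api/analyze_script",
    json={
        "run_uuid": "1b2e3c4d-0000-0000-0000-000000000000",  # illustrative job UUID
        "script_url": "https://cdn.example.com/app.js",
        "index": 0,
    },
    timeout=15,
)
data = resp.json()
if data.get("ok"):
    print(data["final_url"], data["status_code"], data["bytes"], data["sha256"])
    for finding in data["findings"]:
        print("rule match:", finding)
else:
    print("analysis failed:", data)
```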

- Fetcher: update `app/utils/external_fetch.py`
  - Constructed with `results_path` (UUID dir); writes to `<results_path>/scripts/fetched/<index>.js`
  - Loads settings via `get_settings()`, logs via std logging

- UI (results.html):
  - Move “Analyze external script” action into **Content Snippet** column for external rows
  - Clicking replaces button with `<details>` snippet, shows rule matches, and adds “open in viewer” link
  - Robust fetch handler (checks JSON, shows errors); builds viewer URL from absolute artifact path

- Viewer (route sketch below):
  - New route: `GET /view/artifact/<run_uuid>/<path:filename>` (app/blueprints/ui.py)
  - New template: Monaco-based read-only code viewer (viewer.html)
  - Removes SRI from the loader tag to avoid integrity blocking; loads the file via `raw_url` and detects language by extension
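
A minimal sketch of what the viewer route can look like (this is not the shipped app/blueprints/ui.py; `DATA_ROOT` and the `/raw/...` URL shape are assumptions):

```python
# Hypothetical viewer route; a path-traversal guard is included because the
# route accepts <path:filename>. DATA_ROOT and raw_url are assumed names.
from pathlib import Path
from flask import Blueprint, abort, render_template

ui = Blueprint("ui", __name__)
DATA_ROOT = Path("/data")  # assumed artifact root

@ui.route("/view/artifact/<run_uuid>/<path:filename>")
def view_artifact(run_uuid: str, filename: str):
    run_dir = (DATA_ROOT / run_uuid).resolve()
    target = (run_dir / filename).resolve()
    if run_dir not in target.parents:  # reject "../" escapes
        abort(403)
    if not target.is_file():
        abort(404)
    # viewer.html fetches the file body itself via raw_url (Monaco, read-only)
    raw_url = f"/raw/{run_uuid}/{filename}"  # assumed raw-file endpoint
    return render_template("viewer.html", raw_url=raw_url, filename=filename)
```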

- Forms:
  - Refactor `analyze_forms` to mirror scripts analysis:
    - Uses rule engine (`category == "form"`) across regex/function rules
    - Emits rows only when matches exist
    - Includes `content_snippet`, `action`, `method`, `inputs`, `rules`
  - Replace legacy plumbing (`flagged`, `flag_reasons`, `status`) in output
  - Normalize form function rules to a canonical `(bool, Optional[str])` return (see the sketch after this list):
    - `form_action_missing`
    - `form_http_on_https_page`
    - `form_submits_to_different_host`
    - Add minor hardening (lowercasing hosts, no-op actions, clearer reasons)
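
For reference, a sketch of the canonical rule shape, assuming the `facts` dict built in `analyze_forms`; this mirrors, rather than reproduces, the shipped `form_submits_to_different_host`:

```python
# Sketch only: demonstrates the canonical (bool, Optional[str]) contract
# against the facts dict analyze_forms builds; not the shipped rule verbatim.
from typing import Any, Dict, Optional, Tuple
from urllib.parse import urlparse

def form_submits_to_different_host(facts: Dict[str, Any]) -> Tuple[bool, Optional[str]]:
    action = (facts.get("action") or "").strip()
    if not action or action.startswith(("/", "#")):
        return (False, None)  # relative or no-op actions stay on the page's host
    action_host = (urlparse(action).hostname or "").lower()
    base_host = (facts.get("base_hostname") or "").lower()
    if action_host and base_host and action_host != base_host:
        return (True, f"Form submits to {action_host}, but page is on {base_host}")
    return (False, None)
```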

- CSS: add `.forms-table` to mirror `.scripts-table` (5 columns)
  - Fixed table layout, widths per column, chip/snippet styling, responsive tweaks

- Misc:
  - Fix “working outside app context” error by not referencing `current_app` at import time (storage logic stays inside routes; see the sketch below)
  - Add “View Source” link to open page source in viewer
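
The app-context fix follows this pattern (blueprint name and config key are illustrative, not the app's actual ones):

```python
# Sketch of the pattern: resolve current_app inside request handlers, never
# at import time. The RESULTS_ROOT config key is illustrative.
from flask import Blueprint, current_app, jsonify

api = Blueprint("api", __name__)

# This would raise "Working outside of application context", because it
# runs at import time before any app exists:
#   RESULTS_ROOT = current_app.config["RESULTS_ROOT"]

@api.route("/api/analyze_script", methods=["POST"])
def analyze_script():
    # Safe: current_app is a proxy resolved per-request, inside an app context.
    results_root = current_app.config.get("RESULTS_ROOT", "/data")
    return jsonify({"ok": True, "results_root": results_root})
```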

Refs:
- Roadmap: mark “Source code viewer” done; keep TODO to add `/api/analyze_script` to OpenAPI
Commit 3a24b392f2 (parent 05cf23ad67)
Date: 2025-08-21 15:32:24 -05:00
15 changed files with 1192 additions and 218 deletions


@@ -33,7 +33,7 @@ from flask import current_app
 from playwright.async_api import async_playwright, TimeoutError as PWTimeoutError
 from app.utils.io_helpers import safe_write
-from app.enrichment import enrich_url
+from app.utils.enrichment import enrich_url
 from app.utils.settings import get_settings

 # Load settings once for constants / defaults
@@ -202,85 +202,111 @@ class Browser:
     # -----------------------------------------------------------------------
     # Form & Script analysis (plumbing only; detection is in the rules engine)
     # -----------------------------------------------------------------------
-    def analyze_forms(self, html: str, base_url: str) -> List[Dict[str, Any]]:
+    def analyze_forms(self, html: str, base_url: str = "") -> List[Dict[str, Any]]:
         """
-        Parse forms from the page HTML and apply rule-based checks (engine), keeping
-        only simple plumbing heuristics here (no security logic).
-        Returns list of dicts with keys:
-        - action, method, inputs
-        - flagged (bool), flag_reasons (list[str]), status (str)
-        - rule_checks: {'checks': [...], 'summary': {...}} (per-form snippet evaluation)
+        Collect form artifacts and evaluate per-form matches via the rules engine.
+        Only include rows that matched at least one rule.
+        Returns list of dicts with keys (per matched form):
+        - type: "form"
+        - action, method, inputs
+        - content_snippet: str
+        - rules: List[{name, description, severity?, tags?}]
         """
         soup = BeautifulSoup(html, "lxml")
-        forms_info: List[Dict[str, Any]] = []
-        page_hostname = urlparse(base_url).hostname
+        results: List[Dict[str, Any]] = []
+        engine = self._get_rule_engine()
+        base_hostname = urlparse(base_url).hostname or ""
+        # Match how scripts picks preview len
+        try:
+            preview_len = getattr(settings.ui, "snippet_preview_len", 200)  # keep parity with scripts
+        except Exception:
+            preview_len = 200
         for form in soup.find_all("form"):
-            action = form.get("action")
-            method = form.get("method", "get").lower()
-            inputs: List[Dict[str, Any]] = []
-            for inp in form.find_all("input"):
-                input_name = inp.get("name")
-                input_type = inp.get("type", "text")
-                inputs.append({"name": input_name, "type": input_type})
-            flagged_reasons: List[str] = []
-            if not action or str(action).strip() == "":
-                flagged_reasons.append("No action specified")
-            else:
-                try:
-                    action_host = urlparse(action).hostname
-                    if not str(action).startswith("/") and action_host != page_hostname:
-                        flagged_reasons.append("Submits to a different host")
-                except Exception:
-                    pass
-                try:
-                    if urlparse(action).scheme == "http" and urlparse(base_url).scheme == "https":
-                        flagged_reasons.append("Submits over insecure HTTP")
-                except Exception:
-                    pass
-            for hidden in form.find_all("input", type="hidden"):
-                name_value = hidden.get("name") or ""
-                if "password" in name_value.lower():
-                    flagged_reasons.append("Hidden password field")
-            flagged = bool(flagged_reasons)
-            # Serialize a simple form snippet for rule category='form'
-            snippet_lines = []
-            snippet_lines.append(f"base_url={base_url}")
-            snippet_lines.append(f"base_hostname={page_hostname}")
-            snippet_lines.append(f"action={action}")
-            snippet_lines.append(f"method={method}")
-            snippet_lines.append("inputs=")
-            i = 0
-            n = len(inputs)
-            while i < n:
-                item = inputs[i]
-                snippet_lines.append(f" - name={item.get('name')} type={item.get('type')}")
-                i = i + 1
-            form_snippet = "\n".join(snippet_lines)
-            # Per-form rule checks (PASS/FAIL list via engine)
-            rule_checks = self.run_rule_checks(form_snippet, category="form")
-            forms_info.append({
-                "action": action,
-                "method": method,
-                "inputs": inputs,
-                "flagged": flagged,
-                "flag_reasons": flagged_reasons,
-                "status": "flagged" if flagged else "possibly safe",
-                "rule_checks": rule_checks
-            })
-        return forms_info
+            try:
+                action = (form.get("action") or "").strip()
+                method = (form.get("method") or "get").strip().lower()
+                inputs: List[Dict[str, Any]] = []
+                for inp in form.find_all("input"):
+                    inputs.append({
+                        "name": inp.get("name"),
+                        "type": (inp.get("type") or "text").strip().lower(),
+                    })
+                # Use the actual form markup for regex rules
+                form_markup = str(form)
+                # UI-friendly snippet
+                content_snippet = form_markup[:preview_len]
+                matches: List[Dict[str, Any]] = []
+                if engine is not None:
+                    for r in getattr(engine, "rules", []):
+                        if getattr(r, "category", None) != "form":
+                            continue
+                        rtype = getattr(r, "rule_type", None)
+                        try:
+                            ok = False
+                            reason = ""
+                            if rtype == "regex":
+                                # Run against the raw form HTML
+                                ok, reason = r.run(form_markup)
+                            elif rtype == "function":
+                                # Structured facts for function-style rules
+                                facts = {
+                                    "category": "form",
+                                    "base_url": base_url,
+                                    "base_hostname": base_hostname,
+                                    "action": action,
+                                    "action_hostname": urlparse(action).hostname or "",
+                                    "method": method,
+                                    "inputs": inputs,
+                                    "markup": form_markup,
+                                }
+                                ok, reason = r.run(facts)
+                            else:
+                                continue
+                            if ok:
+                                matches.append({
+                                    "name": getattr(r, "name", "unknown_rule"),
+                                    "description": (reason or "") or getattr(r, "description", ""),
+                                    "severity": getattr(r, "severity", None),
+                                    "tags": getattr(r, "tags", None),
+                                })
+                        except Exception as rule_exc:
+                            # Be defensive—bad rule shouldn't break the form pass
+                            try:
+                                self.logger.debug("Form rule error", extra={"rule": getattr(r, "name", "?"), "error": str(rule_exc)})
+                            except Exception:
+                                pass
+                            continue
+                if matches:
+                    results.append({
+                        "type": "form",
+                        "action": action,
+                        "method": method,
+                        "inputs": inputs,
+                        "content_snippet": content_snippet,
+                        "rules": matches,
+                    })
+            except Exception as exc:
+                # Keep analysis resilient
+                try:
+                    self.logger.error("Form analysis error", extra={"error": str(exc)})
+                except Exception:
+                    pass
+                results.append({
+                    "type": "form",
+                    "heuristics": [f"Form analysis error: {exc}"],
+                })
+        return results

     def analyze_scripts(self, html: str, base_url: str = "") -> List[Dict[str, Any]]:
         """
@@ -370,7 +396,7 @@ class Browser:
         Writes:
           - /data/<uuid>/screenshot.png
-          - /data/<uuid>/source.txt
+          - /data/<uuid>/source.html
          - /data/<uuid>/results.json (single source of truth for routes)
         Returns:
@@ -381,7 +407,7 @@ class Browser:
         run_dir.mkdir(parents=True, exist_ok=True)
         screenshot_path = run_dir / "screenshot.png"
-        source_path = run_dir / "source.txt"
+        source_path = run_dir / "source.html"
         results_path = run_dir / "results.json"
         redirects: List[Dict[str, Any]] = []


@@ -9,8 +9,8 @@ from ipaddress import ip_address
 import socket

 # Local imports
-from .utils.cache_db import get_cache
-from .utils.settings import get_settings
+from app.utils.cache_db import get_cache
+from app.utils.settings import get_settings

 # Configure logging
 logging.basicConfig(level=logging.INFO, format="[%(levelname)s] %(message)s")
@@ -39,9 +39,6 @@ def enrich_url(url: str) -> dict:
     # --- GeoIP ---
     result["geoip"] = enrich_geoip(hostname)
     # --- BEC Words ---
     result["bec_words"] = [w for w in BEC_WORDS if w.lower() in url.lower()]
     return result


@@ -0,0 +1,338 @@
# sneakyscope/app/utils/external_fetch.py
import hashlib
import os
import logging
from dataclasses import dataclass
from typing import Optional, Tuple, List
from urllib.parse import urljoin, urlparse

import requests

from app.utils.settings import get_settings

settings = get_settings()

_ALLOWED_SCHEMES = {"http", "https"}


@dataclass
class FetchResult:
    """
    Outcome for a single external script fetch.
    """
    ok: bool
    reason: str
    source_url: str
    final_url: str
    status_code: Optional[int]
    content_type: Optional[str]
    bytes_fetched: int
    truncated: bool
    sha256_hex: Optional[str]
    saved_path: Optional[str]


class ExternalScriptFetcher:
    """
    Minimal, safe-by-default fetcher for external JS files.

    Notes / assumptions:
    - All artifacts for this run live under the UUID-backed `results_path` you pass in.
    - Saves bytes to: <results_path>/<index>.js
    - Manual redirects up to `max_redirects`.
    - Streaming with a hard byte cap derived from `max_total_mb`.
    - Never raises network exceptions to callers; failures are encoded in FetchResult.
    - Settings are read from get_settings().external_fetch with sane defaults.
    """

    def __init__(self, results_path: str, session: Optional[requests.Session] = None):
        """
        Args:
            results_path: Absolute path to the run's UUID directory (e.g., /data/<run_uuid>).
            session: Optional requests.Session to reuse connections; a new one is created if not provided.
        """
        # Derived value: MiB -> bytes
        self.max_total_bytes: int = settings.external_fetch.max_total_mb * 1024 * 1024
        # Logger
        self.logger = logging.getLogger(__file__)
        # Where to write artifacts for this job/run (UUID directory)
        self.results_path = results_path
        # HTTP session with a predictable UA
        self.session = session or requests.Session()
        self.session.headers.update({"User-Agent": "SneakyScope/1.0"})

    # -------------------------
    # Internal helper methods
    # -------------------------
    def _timeout(self) -> Tuple[float, float]:
        """
        Compute (connect_timeout, read_timeout) in seconds from max_time_ms.
        Keeps a conservative split so either phase gets a fair chance.
        """
        total = max(0.1, settings.external_fetch.max_time_ms / 1000.0)
        connect = min(1.5, total * 0.5)  # cap connect timeout
        read = max(0.5, total * 0.5)  # floor read timeout
        return (connect, read)

    def _scheme_allowed(self, url: str) -> bool:
        """
        Return True if URL uses an allowed scheme (http/https).
        """
        scheme = (urlparse(url).scheme or "").lower()
        return scheme in _ALLOWED_SCHEMES

    def _artifact_path(self, index: int) -> str:
        """
        Build an output path like:
            <results_path>/<index>.js
        Ensures the directory exists.
        """
        base_dir = os.path.join(self.results_path)
        # Make sure parent directories exist (idempotent)
        os.makedirs(base_dir, exist_ok=True)
        filename = f"{index}.js"
        return os.path.join(base_dir, filename)

    # -------------------------
    # Public API
    # -------------------------
    def fetch_one(self, script_url: str, index: int) -> FetchResult:
        """
        Fetch exactly one external script with manual redirect handling and a hard per-file byte cap.

        Args:
            script_url: The script URL to retrieve.
            index: Numeric index used solely for naming the artifact file (<index>.js).

        Returns:
            FetchResult with status, metadata, and saved path (if successful).
        """
        # Feature gate: allow callers to rely on a consistent failure when globally disabled.
        if not settings.external_fetch.enabled:
            return FetchResult(
                ok=False,
                reason="Feature disabled",
                source_url=script_url,
                final_url=script_url,
                status_code=None,
                content_type=None,
                bytes_fetched=0,
                truncated=False,
                sha256_hex=None,
                saved_path=None,
            )

        # Scheme guard: refuse anything not http/https in this v1.
        if not self._scheme_allowed(script_url):
            return FetchResult(
                ok=False,
                reason="Scheme not allowed",
                source_url=script_url,
                final_url=script_url,
                status_code=None,
                content_type=None,
                bytes_fetched=0,
                truncated=False,
                sha256_hex=None,
                saved_path=None,
            )

        current_url = script_url
        status_code: Optional[int] = None
        content_type: Optional[str] = None
        redirects_followed = 0

        # Manual redirect loop so we can enforce max_redirects precisely.
        while True:
            try:
                resp = self.session.get(
                    current_url,
                    stream=True,
                    allow_redirects=False,
                    timeout=self._timeout(),
                )
            except requests.exceptions.Timeout:
                return FetchResult(
                    ok=False,
                    reason="Timeout",
                    source_url=script_url,
                    final_url=current_url,
                    status_code=status_code,
                    content_type=content_type,
                    bytes_fetched=0,
                    truncated=False,
                    sha256_hex=None,
                    saved_path=None,
                )
            except requests.exceptions.RequestException as e:
                return FetchResult(
                    ok=False,
                    reason=f"Network error: {e.__class__.__name__}",
                    source_url=script_url,
                    final_url=current_url,
                    status_code=status_code,
                    content_type=content_type,
                    bytes_fetched=0,
                    truncated=False,
                    sha256_hex=None,
                    saved_path=None,
                )

            status_code = resp.status_code
            content_type = resp.headers.get("Content-Type")

            # Handle redirects explicitly (3xx with Location)
            if status_code in (301, 302, 303, 307, 308) and "Location" in resp.headers:
                if redirects_followed >= settings.external_fetch.max_redirects:
                    return FetchResult(
                        ok=False,
                        reason="Max redirects exceeded",
                        source_url=script_url,
                        final_url=current_url,
                        status_code=status_code,
                        content_type=content_type,
                        bytes_fetched=0,
                        truncated=False,
                        sha256_hex=None,
                        saved_path=None,
                    )
                next_url = urljoin(current_url, resp.headers["Location"])
                if not self._scheme_allowed(next_url):
                    return FetchResult(
                        ok=False,
                        reason="Redirect to disallowed scheme",
                        source_url=script_url,
                        final_url=next_url,
                        status_code=status_code,
                        content_type=content_type,
                        bytes_fetched=0,
                        truncated=False,
                        sha256_hex=None,
                        saved_path=None,
                    )
                current_url = next_url
                redirects_followed += 1
                # Loop to follow next hop
                continue

            # Not a redirect: stream response body with a hard byte cap.
            cap = self.max_total_bytes
            total = 0
            truncated = False
            chunks: List[bytes] = []
            try:
                for chunk in resp.iter_content(chunk_size=8192):
                    if not chunk:
                        # Skip keep-alive chunks
                        continue
                    new_total = total + len(chunk)
                    if new_total > cap:
                        # Only keep what fits and stop
                        remaining = cap - total
                        if remaining > 0:
                            chunks.append(chunk[:remaining])
                            total += remaining
                        truncated = True
                        break
                    chunks.append(chunk)
                    total = new_total
            except requests.exceptions.Timeout:
                return FetchResult(
                    ok=False,
                    reason="Timeout while reading",
                    source_url=script_url,
                    final_url=current_url,
                    status_code=status_code,
                    content_type=content_type,
                    bytes_fetched=total,
                    truncated=truncated,
                    sha256_hex=None,
                    saved_path=None,
                )
            except requests.exceptions.RequestException as e:
                return FetchResult(
                    ok=False,
                    reason=f"Network error while reading: {e.__class__.__name__}",
                    source_url=script_url,
                    final_url=current_url,
                    status_code=status_code,
                    content_type=content_type,
                    bytes_fetched=total,
                    truncated=truncated,
                    sha256_hex=None,
                    saved_path=None,
                )

            data = b"".join(chunks)
            if not data:
                return FetchResult(
                    ok=False,
                    reason="Empty response",
                    source_url=script_url,
                    final_url=current_url,
                    status_code=status_code,
                    content_type=content_type,
                    bytes_fetched=0,
                    truncated=False,
                    sha256_hex=None,
                    saved_path=None,
                )

            # Persist to <results_path>/<index>.js
            out_path = self._artifact_path(index)
            try:
                with open(out_path, "wb") as f:
                    f.write(data)
            except OSError as e:
                return FetchResult(
                    ok=False,
                    reason=f"Write error: {e.__class__.__name__}",
                    source_url=script_url,
                    final_url=current_url,
                    status_code=status_code,
                    content_type=content_type,
                    bytes_fetched=total,
                    truncated=truncated,
                    sha256_hex=None,
                    saved_path=None,
                )

            sha256_hex = hashlib.sha256(data).hexdigest()

            # Structured log line for visibility/metrics
            try:
                self.logger.info(
                    "External script fetched",
                    extra={
                        "source_url": script_url,
                        "final_url": current_url,
                        "status": status_code,
                        "bytes": total,
                        "truncated": truncated,
                        "sha256": sha256_hex,
                        "saved_path": out_path,
                    },
                )
            except Exception:
                # Logging should never break the pipeline
                pass

            return FetchResult(
                ok=True,
                reason="OK",
                source_url=script_url,
                final_url=current_url,
                status_code=status_code,
                content_type=content_type,
                bytes_fetched=total,
                truncated=truncated,
                sha256_hex=sha256_hex,
                saved_path=out_path,
            )
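
Assuming the layout from the commit notes (`<run_dir>/scripts/fetched`), a short usage sketch of the class above:

```python
# Illustrative driver; the run directory here is made up.
from app.utils.external_fetch import ExternalScriptFetcher

fetcher = ExternalScriptFetcher(results_path="/data/1b2e3c4d/scripts/fetched")
result = fetcher.fetch_one("https://cdn.example.com/app.js", index=0)
if result.ok:
    print(f"saved {result.bytes_fetched} bytes to {result.saved_path}")
    print(f"sha256={result.sha256_hex} truncated={result.truncated}")
else:
    # Failures never raise; they come back encoded in FetchResult.
    print(f"fetch failed: {result.reason} (status={result.status_code})")
```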


@@ -39,6 +39,14 @@ BASE_DIR = Path(__file__).resolve().parent.parent
 DEFAULT_SETTINGS_FILE = BASE_DIR / "config" / "settings.yaml"

 # ---------- CONFIG DATA CLASSES ----------
+@dataclass
+class External_FetchConfig:
+    enabled: bool = True
+    max_total_mb: int = 5
+    max_time_ms: int = 3000
+    max_redirects: int = 3
+    concurrency: int = 3
+
 @dataclass
 class UIConfig:
     snippet_preview_len: int = 160
@@ -61,6 +69,7 @@ class AppConfig:
 class Settings:
     cache: Cache_Config = field(default_factory=Cache_Config)
     ui: UIConfig = field(default_factory=UIConfig)
+    external_fetch: External_FetchConfig = field(default_factory=External_FetchConfig)
     app: AppConfig = field(default_factory=AppConfig)

     @classmethod
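
A quick read of the new block (assuming `get_settings()` wires `External_FetchConfig` as shown above):

```python
# Assumes get_settings() populates the external_fetch field shown above.
from app.utils.settings import get_settings

s = get_settings()
print(s.external_fetch.enabled)       # True by default
print(s.external_fetch.max_total_mb)  # 5 -> the fetcher's 5 MiB per-file cap
print(s.external_fetch.max_time_ms)   # 3000 -> split into (connect, read) timeouts
```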