feat: on-demand external script analysis + code viewer; refactor form analysis to rule engine
- API: add `POST /api/analyze_script` (app/blueprints/api.py)
- Fetch one external script to artifacts, run rules, return findings + snippet
- Uses new ExternalScriptFetcher (results_path aware) and job UUID
- Returns: { ok, final_url, status_code, bytes, truncated, sha256, artifact_path, findings[], snippet, snippet_len }
- TODO: document in openapi/openapi.yaml
- Fetcher: update `app/utils/external_fetch.py`
- Constructed with `results_path` (UUID dir); writes to `<results_path>/scripts/fetched/<index>.js`
- Loads settings via `get_settings()`, logs via std logging
- UI (results.html):
- Move “Analyze external script” action into **Content Snippet** column for external rows
- Clicking replaces button with `<details>` snippet, shows rule matches, and adds “open in viewer” link
- Robust fetch handler (checks JSON, shows errors); builds viewer URL from absolute artifact path
- Viewer:
- New route: `GET /view/artifact/<run_uuid>/<path:filename>` (app/blueprints/ui.py)
- New template: Monaco-based read-only code viewer (viewer.html)
- Removes SRI on loader to avoid integrity block; loads file via `raw_url` and detects language by extension
- Forms:
- Refactor `analyze_forms` to mirror scripts analysis:
- Uses rule engine (`category == "form"`) across regex/function rules
- Emits rows only when matches exist
- Includes `content_snippet`, `action`, `method`, `inputs`, `rules`
- Replace legacy plumbing (`flagged`, `flag_reasons`, `status`) in output
- Normalize form function rules to canonical returns `(bool, Optional[str])`:
- `form_action_missing`
- `form_http_on_https_page`
- `form_submits_to_different_host`
- Add minor hardening (lowercasing hosts, no-op actions, clearer reasons)
- CSS: add `.forms-table` to mirror `.scripts-table` (5 columns)
- Fixed table layout, widths per column, chip/snippet styling, responsive tweaks
- Misc:
- Fix “working outside app context” issue by avoiding `current_app` at import time (left storage logic inside routes)
- Add “View Source” link to open page source in viewer
Refs:
- Roadmap: mark “Source code viewer” done; keep TODO to add `/api/analyze_script` to OpenAPI
This commit is contained in:
338
app/utils/external_fetcher.py
Normal file
338
app/utils/external_fetcher.py
Normal file
@@ -0,0 +1,338 @@
|
||||
# sneakyscope/app/utils/external_fetcher.py
|
||||
import hashlib
|
||||
import os
|
||||
import logging
|
||||
from dataclasses import dataclass
|
||||
from typing import Optional, Tuple, List
|
||||
from urllib.parse import urljoin, urlparse
|
||||
|
||||
import requests
|
||||
|
||||
from app.utils.settings import get_settings
|
||||
|
||||
# Application settings, resolved once when this module is imported.
settings = get_settings()

# URL schemes the fetcher will request; any other scheme is refused.
_ALLOWED_SCHEMES = {"http", "https"}
|
||||
|
||||
|
||||
@dataclass
class FetchResult:
    """
    Result record describing one attempt to fetch an external script.

    Both successful and failed attempts are reported through this type;
    `ok` tells them apart and `reason` carries a short explanation.
    """

    # --- outcome ---
    ok: bool                       # True when the fetch succeeded
    reason: str                    # short human-readable status / failure reason

    # --- where the request went ---
    source_url: str                # URL originally requested
    final_url: str                 # URL of the last hop that was attempted

    # --- response metadata ---
    status_code: Optional[int]     # HTTP status of the last response, if any
    content_type: Optional[str]    # Content-Type header of the last response, if any

    # --- body accounting ---
    bytes_fetched: int             # number of body bytes kept
    truncated: bool                # True when the body was cut at the byte cap

    # --- persisted artifact ---
    sha256_hex: Optional[str]      # hex SHA-256 of the kept bytes, if computed
    saved_path: Optional[str]      # filesystem path of the artifact, if written
|
||||
|
||||
|
||||
class ExternalScriptFetcher:
    """
    Minimal, safe-by-default fetcher for external JS files.

    Notes / assumptions:
      - All artifacts for this run live under the UUID-backed `results_path` you pass in.
      - Saves bytes to: <results_path>/<index>.js
      - Manual redirects up to `max_redirects`.
      - Streaming with a hard byte cap derived from `max_total_mb`.
      - Never raises network exceptions to callers; failures are encoded in FetchResult.
      - Settings are read from the module-level `settings.external_fetch` object
        (`enabled`, `max_total_mb`, `max_time_ms`, `max_redirects`).
    """

    def __init__(self, results_path: str, session: Optional[requests.Session] = None):
        """
        Args:
            results_path: Absolute path to the run's UUID directory (e.g., /data/<run_uuid>).
            session: Optional requests.Session to reuse connections; a new one is created if not provided.
                     Note that its User-Agent header is overwritten either way.
        """
        # Derived value: MiB -> bytes. int() keeps the cap usable as a slice
        # bound even when max_total_mb is configured as a fractional number.
        self.max_total_bytes: int = int(settings.external_fetch.max_total_mb * 1024 * 1024)

        # Logger named after the module (not the file path) so it fits the
        # normal dotted logger hierarchy.
        self.logger = logging.getLogger(__name__)

        # Where to write artifacts for this job/run (UUID directory)
        self.results_path = results_path

        # HTTP session with a predictable UA
        self.session = session or requests.Session()
        self.session.headers.update({"User-Agent": "SneakyScope/1.0"})

    # -------------------------
    # Internal helper methods
    # -------------------------

    def _timeout(self) -> Tuple[float, float]:
        """
        Compute (connect_timeout, read_timeout) in seconds from max_time_ms.

        The budget is split in half; the connect half is capped at 1.5s and
        the read half is floored at 0.5s.
        """
        total = max(0.1, settings.external_fetch.max_time_ms / 1000.0)
        connect = min(1.5, total * 0.5)  # cap connect timeout
        read = max(0.5, total * 0.5)     # floor read timeout
        return (connect, read)

    def _scheme_allowed(self, url: str) -> bool:
        """
        Return True if URL uses an allowed scheme (http/https).

        A URL that cannot be parsed at all is treated as not allowed.
        """
        try:
            scheme = (urlparse(url).scheme or "").lower()
        except ValueError:
            # urlparse rejects some malformed inputs (e.g. a broken IPv6 literal).
            return False
        return scheme in _ALLOWED_SCHEMES

    def _artifact_path(self, index: int) -> str:
        """
        Build an output path like:
            <results_path>/<index>.js

        Ensures the directory exists.
        """
        base_dir = os.fspath(self.results_path)
        # Make sure parent directories exist (idempotent)
        os.makedirs(base_dir, exist_ok=True)
        return os.path.join(base_dir, f"{index}.js")

    @staticmethod
    def _failure(
        reason: str,
        source_url: str,
        final_url: str,
        status_code: Optional[int] = None,
        content_type: Optional[str] = None,
        bytes_fetched: int = 0,
        truncated: bool = False,
    ) -> FetchResult:
        """
        Build a failed FetchResult (ok=False, no hash, no saved path).
        """
        return FetchResult(
            ok=False,
            reason=reason,
            source_url=source_url,
            final_url=final_url,
            status_code=status_code,
            content_type=content_type,
            bytes_fetched=bytes_fetched,
            truncated=truncated,
            sha256_hex=None,
            saved_path=None,
        )

    # -------------------------
    # Public API
    # -------------------------

    def fetch_one(self, script_url: str, index: int) -> FetchResult:
        """
        Fetch exactly one external script with manual redirect handling and a hard per-file byte cap.

        Args:
            script_url: The script URL to retrieve.
            index: Numeric index used solely for naming the artifact file (<index>.js).

        Returns:
            FetchResult with status, metadata, and saved path (if successful).
            Network, timeout and write failures are reported as ok=False results
            rather than raised.
        """
        # Feature gate: allow callers to rely on a consistent failure when globally disabled.
        if not settings.external_fetch.enabled:
            return self._failure("Feature disabled", script_url, script_url)

        # Scheme guard: refuse anything not http/https in this v1.
        if not self._scheme_allowed(script_url):
            return self._failure("Scheme not allowed", script_url, script_url)

        current_url = script_url
        status_code: Optional[int] = None
        content_type: Optional[str] = None
        redirects_followed = 0

        # Manual redirect loop so we can enforce max_redirects precisely.
        while True:
            try:
                resp = self.session.get(
                    current_url,
                    stream=True,
                    allow_redirects=False,
                    timeout=self._timeout(),
                )
            except requests.exceptions.Timeout:
                return self._failure("Timeout", script_url, current_url, status_code, content_type)
            except requests.exceptions.RequestException as e:
                return self._failure(
                    f"Network error: {e.__class__.__name__}",
                    script_url,
                    current_url,
                    status_code,
                    content_type,
                )

            status_code = resp.status_code
            content_type = resp.headers.get("Content-Type")

            # Handle redirects explicitly (3xx with Location)
            if status_code in (301, 302, 303, 307, 308) and "Location" in resp.headers:
                location = resp.headers["Location"]
                # The redirect body is never read. With stream=True the connection
                # is only released once the response is consumed or closed.
                resp.close()

                if redirects_followed >= settings.external_fetch.max_redirects:
                    return self._failure(
                        "Max redirects exceeded", script_url, current_url, status_code, content_type
                    )

                try:
                    next_url = urljoin(current_url, location)
                except ValueError:
                    # Malformed Location header; report instead of raising.
                    return self._failure(
                        "Invalid redirect location", script_url, current_url, status_code, content_type
                    )

                if not self._scheme_allowed(next_url):
                    return self._failure(
                        "Redirect to disallowed scheme", script_url, next_url, status_code, content_type
                    )

                current_url = next_url
                redirects_followed += 1
                # Loop to follow next hop
                continue

            # Not a redirect: leave the loop and read the body.
            break

        # NOTE(review): any non-redirect status (including 4xx/5xx) is read, saved
        # and reported as ok=True; callers that care must inspect status_code.

        # Stream response body with a hard byte cap.
        cap = self.max_total_bytes
        total = 0
        truncated = False
        chunks: List[bytes] = []

        try:
            for chunk in resp.iter_content(chunk_size=8192):
                if not chunk:
                    # Skip keep-alive chunks
                    continue
                remaining = cap - total
                if len(chunk) > remaining:
                    # Only keep what fits and stop
                    if remaining > 0:
                        chunks.append(chunk[:remaining])
                        total += remaining
                    truncated = True
                    break
                chunks.append(chunk)
                total += len(chunk)
        except requests.exceptions.Timeout:
            return self._failure(
                "Timeout while reading",
                script_url,
                current_url,
                status_code,
                content_type,
                bytes_fetched=total,
                truncated=truncated,
            )
        except requests.exceptions.RequestException as e:
            return self._failure(
                f"Network error while reading: {e.__class__.__name__}",
                script_url,
                current_url,
                status_code,
                content_type,
                bytes_fetched=total,
                truncated=truncated,
            )
        finally:
            # Always release the connection: on truncation or error the body
            # has not been fully consumed, so it would otherwise stay open.
            resp.close()

        data = b"".join(chunks)
        if not data:
            return self._failure("Empty response", script_url, current_url, status_code, content_type)

        # Persist to <results_path>/<index>.js
        try:
            out_path = self._artifact_path(index)
            with open(out_path, "wb") as f:
                f.write(data)
        except OSError as e:
            return self._failure(
                f"Write error: {e.__class__.__name__}",
                script_url,
                current_url,
                status_code,
                content_type,
                bytes_fetched=total,
                truncated=truncated,
            )

        sha256_hex = hashlib.sha256(data).hexdigest()

        # Structured log line for visibility/metrics
        try:
            self.logger.info(
                "External script fetched",
                extra={
                    "source_url": script_url,
                    "final_url": current_url,
                    "status": status_code,
                    "bytes": total,
                    "truncated": truncated,
                    "sha256": sha256_hex,
                    "saved_path": out_path,
                },
            )
        except Exception:
            # Deliberate best-effort: logging should never break the pipeline
            pass

        return FetchResult(
            ok=True,
            reason="OK",
            source_url=script_url,
            final_url=current_url,
            status_code=status_code,
            content_type=content_type,
            bytes_fetched=total,
            truncated=truncated,
            sha256_hex=sha256_hex,
            saved_path=out_path,
        )
|
||||
Reference in New Issue
Block a user