Files
SneakyScope/app/utils/browser.py
Phillip Tarrant 3a24b392f2 feat: on-demand external script analysis + code viewer; refactor form analysis to rule engine
- API: add `POST /api/analyze_script` (app/blueprints/api.py)
  - Fetch one external script to artifacts, run rules, return findings + snippet
  - Uses new ExternalScriptFetcher (results_path aware) and job UUID
  - Returns: { ok, final_url, status_code, bytes, truncated, sha256, artifact_path, findings[], snippet, snippet_len }
  - TODO: document in openapi/openapi.yaml

- Fetcher: update `app/utils/external_fetch.py`
  - Constructed with `results_path` (UUID dir); writes to `<results_path>/scripts/fetched/<index>.js`
  - Loads settings via `get_settings()`, logs via std logging

- UI (results.html):
  - Move “Analyze external script” action into **Content Snippet** column for external rows
  - Clicking replaces button with `<details>` snippet, shows rule matches, and adds “open in viewer” link
  - Robust fetch handler (checks JSON, shows errors); builds viewer URL from absolute artifact path

- Viewer:
  - New route: `GET /view/artifact/<run_uuid>/<path:filename>` (app/blueprints/ui.py)
  - New template: Monaco-based read-only code viewer (viewer.html)
  - Removes SRI on loader to avoid integrity block; loads file via `raw_url` and detects language by extension

- Forms:
  - Refactor `analyze_forms` to mirror scripts analysis:
    - Uses rule engine (`category == "form"`) across regex/function rules
    - Emits rows only when matches exist
    - Includes `content_snippet`, `action`, `method`, `inputs`, `rules`
  - Replace legacy plumbing (`flagged`, `flag_reasons`, `status`) in output
  - Normalize form function rules to canonical returns `(bool, Optional[str])`:
    - `form_action_missing`
    - `form_http_on_https_page`
    - `form_submits_to_different_host`
    - Add minor hardening (lowercasing hosts, no-op actions, clearer reasons)

- CSS: add `.forms-table` to mirror `.scripts-table` (5 columns)
  - Fixed table layout, widths per column, chip/snippet styling, responsive tweaks

- Misc:
  - Fix “working outside app context” issue by avoiding `current_app` at import time (left storage logic inside routes)
  - Add “View Source” link to open page source in viewer

Refs:
- Roadmap: mark “Source code viewer” done; keep TODO to add `/api/analyze_script` to OpenAPI
2025-08-21 15:32:24 -05:00

549 lines
21 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
app/utils/browser.py
Singleton, lazily-loaded page fetcher + analysis orchestrator for SneakyScope.
Responsibilities:
- Fetch a URL (HTML, redirects, etc.)
- Run the Suspicious Rules Engine (PASS/FAIL for all rules)
- Write artifacts (screenshot.png, source.html, results.json) into /data/<run_uuid>/
- Return a single 'result' dict suitable for UI and future API
Design notes:
- Detection logic (regex/heuristics) lives in the rules engine (YAML/function rules).
- This module keeps "plumbing" only (fetch, extract, persist).
- Minimal non-detection heuristics remain here (e.g., skip benign script MIME types).
Assumptions:
- Flask app context is active (uses current_app for logger and RULE_ENGINE).
- SANDBOX_STORAGE is configured (default: /data).
- enrich_url(url) returns enrichment dict.
"""
from __future__ import annotations
import json
import uuid
from pathlib import Path
from typing import Any, Dict, List, Optional
from urllib.parse import urlparse
from bs4 import BeautifulSoup
from flask import current_app
from playwright.async_api import async_playwright, TimeoutError as PWTimeoutError
from app.utils.io_helpers import safe_write
from app.utils.enrichment import enrich_url
from app.utils.settings import get_settings
# Load settings once, at import time, so every Browser shares one settings object.
# It is read below for the default artifact directory (settings.sandbox.storage)
# and for the UI snippet preview length (settings.ui.snippet_preview_len).
settings = get_settings()
class Browser:
    """
    Orchestrates page fetching and analysis. Meant to be accessed via the
    lazily-loaded singleton factory `get_browser()`.

    Detection logic lives in the rules engine; this class only fetches,
    extracts, and persists artifacts. Logging and rule-engine access go through
    the Flask `current_app`, and both degrade silently when no application
    context is active.
    """

    # URL path suffixes that mark a network request as a script download.
    _SCRIPT_EXTENSIONS = (".js", ".vbs", ".hta")

    def __init__(self, storage_dir: Optional[Path] = None) -> None:
        """
        Args:
            storage_dir: Base directory for run artifacts. Defaults to
                settings.sandbox.storage if not provided, and to /data when
                that setting cannot be read.
        """
        if storage_dir is None:
            try:
                # Prefer the storage path configured in the settings model.
                storage_dir = Path(settings.sandbox.storage)
            except Exception:
                storage_dir = Path("/data")
        self.storage_dir: Path = storage_dir

    # -----------------------------------------------------------------------
    # Engine access / shared helpers
    # -----------------------------------------------------------------------
    @staticmethod
    def _get_rule_engine():
        """
        Retrieve the rules engine instance from the Flask application config.

        Returns:
            The object stored under config["RULE_ENGINE"], or None when it is
            not configured or no application context is available.
        """
        try:
            return current_app.config.get("RULE_ENGINE")
        except Exception:
            return None

    @staticmethod
    def _log(level: str, message: str, *args: Any) -> None:
        """
        Best-effort logging through the Flask app logger.

        Logging must never break an analysis pass, so any failure (for example
        no active application context) is deliberately ignored.

        Args:
            level: Logger method name ("debug", "info", "error", ...).
            message: %-style format string.
            *args: Lazy format arguments for `message`.
        """
        try:
            getattr(current_app.logger, level)(message, *args)
        except Exception:
            pass

    @staticmethod
    def _snippet_preview_len() -> int:
        """
        Return the configured UI snippet preview length.

        Falls back to 200 when the setting (or the whole `ui` section) is
        unavailable, so forms and scripts share one guarded lookup.
        """
        try:
            return getattr(settings.ui, "snippet_preview_len", 200)
        except Exception:
            return 200

    @staticmethod
    def _is_script_url(url: str) -> bool:
        """
        Tell whether a request URL points at a script file (.js/.vbs/.hta).

        The raw URL suffix is checked first (the historical behaviour); the
        parsed path is checked as well so that query strings, fragments and
        upper-case extensions (e.g. ``app.js?v=3``) are not missed.
        """
        if url.endswith(Browser._SCRIPT_EXTENSIONS):
            return True
        try:
            path = urlparse(url).path
        except ValueError:
            # Malformed URL: the raw suffix test above is all we can do.
            return False
        return path.lower().endswith(Browser._SCRIPT_EXTENSIONS)

    @staticmethod
    def _engine_error_check(category: str, description: str, exc: Exception) -> Dict[str, Any]:
        """
        Build the synthetic check row emitted when the rule engine raises.

        It is reported as PASS on purpose so the UI keeps its table shape; the
        exception text is carried in `reason`.
        """
        return {
            "name": "engine_error",
            "description": description,
            "category": category,
            "result": "PASS",
            "reason": f"{exc}",
            "severity": None,
            "tags": None,
        }

    @staticmethod
    def _match_record(rule: Any, description: str) -> Dict[str, Any]:
        """Build the UI row describing one matched rule."""
        return {
            "name": getattr(rule, "name", "unknown_rule"),
            "description": description,
            "severity": getattr(rule, "severity", None),
            "tags": getattr(rule, "tags", None),
        }

    @staticmethod
    def _summarize_results(results: List[Dict[str, Any]]) -> Dict[str, int]:
        """
        Summarize a list of engine rule result dicts (result = "PASS"|"FAIL").

        The comparison is case-insensitive; a missing `result` counts as a
        non-failure.

        Returns:
            {'fail_count': int, 'total_rules': int}
        """
        fail_count = sum(
            1 for item in results if str(item.get("result", "")).upper() == "FAIL"
        )
        return {"fail_count": fail_count, "total_rules": len(results)}

    def run_rule_checks(self, text: str, category: str) -> Dict[str, Any]:
        """
        Run all rules for a given category against provided text, returning a
        table-friendly model.

        Args:
            text: Text to analyze (HTML, snippet, etc.)
            category: One of 'form', 'script', 'text' (or any category your rules use)

        Returns:
            {
              "checks": [
                { "name": str, "description": str, "category": str,
                  "result": "PASS"|"FAIL", "reason": Optional[str],
                  "severity": Optional[str], "tags": Optional[List[str]] }, ...
              ],
              "summary": { "fail_count": int, "total_rules": int }
            }
            With no engine configured, `checks` is empty and the summary is zeroed.
        """
        out: Dict[str, Any] = {"checks": [], "summary": {"fail_count": 0, "total_rules": 0}}
        engine = self._get_rule_engine()
        if engine is None:
            return out

        try:
            # Append row by row so rows normalized before a failure are kept.
            for item in engine.run_all(text, category=category):
                out["checks"].append({
                    "name": item.get("name"),
                    "description": item.get("description"),
                    "category": item.get("category"),
                    "result": item.get("result"),  # "PASS" | "FAIL"
                    "reason": item.get("reason"),  # present on FAIL by engine design
                    "severity": item.get("severity"),
                    "tags": item.get("tags"),
                })
            out["summary"] = self._summarize_results(out["checks"])
        except Exception as exc:
            # Preserve shape; record the error as a synthetic PASS (so UI doesn't break)
            out["checks"].append(
                self._engine_error_check(category, "Rule engine failed during evaluation", exc)
            )
            out["summary"] = {"fail_count": 0, "total_rules": 1}
        return out

    def build_rule_checks_overview(self, full_html_text: str) -> List[Dict[str, Any]]:
        """
        Build a top-level overview for the results page: runs each category
        across the entire HTML and groups results by category.

        Returns:
            [
              {"category": "script", "results": [ ...engine dicts... ], "summary": {...}},
              {"category": "form",   "results": [ ... ], "summary": {...}},
              {"category": "text",   "results": [ ... ], "summary": {...}},
            ]
            A category whose evaluation raises gets a single synthetic
            "engine_error" row instead of engine results.
        """
        overview: List[Dict[str, Any]] = []
        engine = self._get_rule_engine()

        for cat in ("script", "form", "text"):
            block: Dict[str, Any] = {
                "category": cat,
                "results": [],
                "summary": {"fail_count": 0, "total_rules": 0},
            }
            if engine is not None:
                try:
                    results = engine.run_all(full_html_text, category=cat)
                    block["results"] = results
                    block["summary"] = self._summarize_results(results)
                except Exception as exc:
                    block["results"] = [
                        self._engine_error_check(
                            cat, "Rule engine failed during overview evaluation", exc
                        )
                    ]
                    block["summary"] = {"fail_count": 0, "total_rules": 1}
            overview.append(block)
        return overview

    # -----------------------------------------------------------------------
    # Form & Script analysis (plumbing only; detection is in the rules engine)
    # -----------------------------------------------------------------------
    def analyze_forms(self, html: str, base_url: str = "") -> List[Dict[str, Any]]:
        """
        Collect form artifacts and evaluate per-form matches via the rules engine.
        Only include rows that matched at least one rule.

        Args:
            html: Page markup to parse.
            base_url: URL of the page; its hostname is handed to function rules.

        Returns list of dicts with keys (per matched form):
            - type: "form"
            - action, method, inputs
            - content_snippet: str
            - rules: List[{name, description, severity?, tags?}]
        A form whose processing raises yields {"type": "form", "heuristics": [...]}.
        """
        soup = BeautifulSoup(html, "lxml")
        results: List[Dict[str, Any]] = []
        engine = self._get_rule_engine()
        base_hostname = urlparse(base_url).hostname or ""
        preview_len = self._snippet_preview_len()  # keep parity with scripts

        for form in soup.find_all("form"):
            try:
                action = (form.get("action") or "").strip()
                method = (form.get("method") or "get").strip().lower()
                inputs: List[Dict[str, Any]] = [
                    {
                        "name": inp.get("name"),
                        "type": (inp.get("type") or "text").strip().lower(),
                    }
                    for inp in form.find_all("input")
                ]

                # Regex rules run against the actual form markup.
                form_markup = str(form)
                # UI-friendly snippet
                content_snippet = form_markup[:preview_len]

                matches: List[Dict[str, Any]] = []
                rules = getattr(engine, "rules", []) if engine is not None else []
                for r in rules:
                    if getattr(r, "category", None) != "form":
                        continue
                    rtype = getattr(r, "rule_type", None)
                    try:
                        if rtype == "regex":
                            ok, reason = r.run(form_markup)
                        elif rtype == "function":
                            # Structured facts for function-style rules
                            facts = {
                                "category": "form",
                                "base_url": base_url,
                                "base_hostname": base_hostname,
                                "action": action,
                                "action_hostname": urlparse(action).hostname or "",
                                "method": method,
                                "inputs": inputs,
                                "markup": form_markup,
                            }
                            ok, reason = r.run(facts)
                        else:
                            continue
                        if ok:
                            matches.append(
                                self._match_record(
                                    r, (reason or "") or getattr(r, "description", "")
                                )
                            )
                    except Exception as rule_exc:
                        # Be defensive: a bad rule shouldn't break the form pass.
                        self._log(
                            "debug",
                            "Form rule error (rule=%s): %s",
                            getattr(r, "name", "?"),
                            rule_exc,
                        )
                        continue

                if matches:
                    results.append({
                        "type": "form",
                        "action": action,
                        "method": method,
                        "inputs": inputs,
                        "content_snippet": content_snippet,
                        "rules": matches,
                    })
            except Exception as exc:
                # Keep analysis resilient
                self._log("error", "Form analysis error: %s", exc)
                results.append({
                    "type": "form",
                    "heuristics": [f"Form analysis error: {exc}"],
                })
        return results

    def analyze_scripts(self, html: str, base_url: str = "") -> List[Dict[str, Any]]:
        """
        Collect script artifacts and evaluate per-script matches via the rules engine.
        Only include rows that matched at least one rule.

        Regex rules run against inline script text; function rules run against
        a facts dict built from the `src` attribute. Scripts typed as JSON /
        JSON-LD are skipped. A script whose processing raises yields
        {"type": "unknown", "heuristics": [...]}.

        Args:
            html: Page markup to parse.
            base_url: URL of the page; its hostname is handed to function rules.
        """
        soup = BeautifulSoup(html, "lxml")
        results: List[Dict[str, Any]] = []
        benign_types = {"application/ld+json", "application/json"}
        engine = self._get_rule_engine()
        base_hostname = urlparse(base_url).hostname or ""
        # Guarded lookup: a missing `ui` settings section must not turn every
        # inline script into an error row.
        preview_len = self._snippet_preview_len()

        for script in soup.find_all("script"):
            try:
                src = (script.get("src") or "").strip()
                s_type_attr = (script.get("type") or "").strip().lower()
                inline_text = script.get_text(strip=True) or ""

                if s_type_attr in benign_types:
                    continue

                record: Dict[str, Any] = {}
                if src:
                    record["type"] = "external"
                    record["src"] = src
                elif inline_text:
                    record["type"] = "inline"
                    record["content_snippet"] = (inline_text[:preview_len]).replace("\n", " ")
                else:
                    record["type"] = "unknown"

                matches: List[Dict[str, Any]] = []
                if engine is not None:
                    if inline_text:
                        for r in engine.rules:
                            if (
                                getattr(r, "category", None) == "script"
                                and getattr(r, "rule_type", None) == "regex"
                            ):
                                ok, reason = r.run(inline_text)
                                if ok:
                                    matches.append(
                                        self._match_record(
                                            r, getattr(r, "description", "") or (reason or "")
                                        )
                                    )
                    if src:
                        facts = {
                            "src": src,
                            "base_url": base_url,
                            "base_hostname": base_hostname,
                            "src_hostname": urlparse(src).hostname or "",
                            "category": "script",
                        }
                        for r in engine.rules:
                            if (
                                getattr(r, "category", None) == "script"
                                and getattr(r, "rule_type", None) == "function"
                            ):
                                ok, reason = r.run(facts)
                                if ok:
                                    matches.append(
                                        self._match_record(
                                            r, (reason or "") or getattr(r, "description", "")
                                        )
                                    )

                if matches:
                    record["rules"] = matches
                    results.append(record)
            except Exception as exc:
                results.append({
                    "type": "unknown",
                    "heuristics": [f"Script analysis error: {exc}"],
                })
        return results

    # -----------------------------------------------------------------------
    # Fetcher / Orchestrator
    # -----------------------------------------------------------------------
    async def fetch_page_artifacts(self, url: str) -> Dict[str, Any]:
        """
        Fetch page artifacts and save them in a UUID-based directory under this
        Browser's storage_dir.

        Writes:
            - <storage_dir>/<uuid>/screenshot.png
            - <storage_dir>/<uuid>/source.html
            - <storage_dir>/<uuid>/results.json (single source of truth for routes)

        On a navigation timeout a placeholder source file is written and the
        analysis continues; any other navigation error propagates to the caller
        after the browser has been closed.

        Returns:
            result dict with keys used by templates (and future API).
        """
        run_uuid = str(uuid.uuid4())
        run_dir = self.storage_dir / run_uuid
        run_dir.mkdir(parents=True, exist_ok=True)

        screenshot_path = run_dir / "screenshot.png"
        source_path = run_dir / "source.html"
        results_path = run_dir / "results.json"

        redirects: List[Dict[str, Any]] = []
        downloads: List[Dict[str, Any]] = []
        scripts_seen: List[str] = []

        # Event handlers (plumbing). Each swallows its own errors so a bad
        # event cannot abort the page load.
        def _on_response(resp):
            try:
                if 300 <= resp.status <= 399:
                    redirects.append({"status": resp.status, "url": resp.url})
            except Exception:
                pass

        def _on_download(d):
            try:
                downloads.append({"url": d.url, "suggested_filename": d.suggested_filename})
            except Exception:
                pass

        def _on_request(r):
            try:
                if self._is_script_url(r.url):
                    scripts_seen.append(r.url)
            except Exception:
                pass

        async with async_playwright() as pw:
            browser = await pw.chromium.launch(
                headless=True,
                args=["--no-sandbox", "--disable-dev-shm-usage", "--disable-blink-features=AutomationControlled"]
            )
            context = None
            try:
                context = await browser.new_context(
                    viewport={"width": 1920, "height": 1080},
                    user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36",
                    java_script_enabled=True,
                    locale="en-US"
                )
                page = await context.new_page()

                page.on("response", _on_response)
                page.on("download", _on_download)
                page.on("request", _on_request)

                try:
                    await page.goto(url, wait_until="networkidle", timeout=60000)
                    final_url = page.url
                    await page.screenshot(path=str(screenshot_path), full_page=True)
                    html = await page.content()
                    safe_write(source_path, html)
                except PWTimeoutError:
                    final_url = page.url
                    safe_write(source_path, "Page did not fully load (timeout)")
                    await page.screenshot(path=str(screenshot_path), full_page=True)
            finally:
                # Always release the context and browser, even when navigation
                # fails with something other than a timeout.
                try:
                    if context is not None:
                        await context.close()
                finally:
                    await browser.close()

        # Read back saved source
        html_content = source_path.read_text(encoding="utf-8")

        # Forms analysis (per-form rule checks)
        forms_info = self.analyze_forms(html_content, final_url)

        # Scripts artifacts (no detection here)
        suspicious_scripts = self.analyze_scripts(html_content, base_url=final_url)

        # Enrichment
        enrichment = enrich_url(url)

        # Global PASS/FAIL table per category (entire document)
        rule_checks_overview = self.build_rule_checks_overview(html_content)
        for blk in rule_checks_overview:
            self._log("debug", "[rules] %s: %s", blk["category"], blk["summary"])

        # Assemble single result dict
        result: Dict[str, Any] = {
            "uuid": run_uuid,
            "submitted_url": url,
            "final_url": final_url,
            "redirects": redirects,
            "downloads": downloads,
            "scripts": scripts_seen,
            "forms": forms_info,
            "suspicious_scripts": suspicious_scripts,
            "rule_checks": rule_checks_overview,  # table-ready for UI
            "enrichment": enrichment
        }

        # Persist as the single source of truth for routes
        safe_write(results_path, json.dumps(result, indent=2, ensure_ascii=False))
        self._log("info", "[browser] Saved results.json for run %s", run_uuid)
        return result
# ---------------------------------------------------------------------------
# Lazy-loaded singleton factory
# ---------------------------------------------------------------------------
# Prefer importing your project-wide singleton decorator.
try:
    # Reuse the project-wide decorator when the settings module exports it.
    from app.utils.settings import singleton_loader
except Exception:
    # Otherwise define an equivalent decorator locally.
    import functools
    from typing import Callable, TypeVar

    T = TypeVar("T")

    def singleton_loader(func: Callable[..., T]) -> Callable[..., T]:
        """
        Decorate `func` so that it executes at most once.

        The first call's return value is stored and handed back on every later
        call; arguments passed to later calls are ignored.
        """
        cache: dict[str, T] = {}

        @functools.wraps(func)
        def wrapper(*args, **kwargs) -> T:
            key = func.__name__
            if key in cache:
                return cache[key]
            value = func(*args, **kwargs)
            cache[key] = value
            return value

        return wrapper
@singleton_loader
def get_browser(storage_dir: Optional[Path] = None) -> Browser:
    """
    Build the shared Browser on first use and hand it back afterwards.

    Args:
        storage_dir: Optional override for the artifact base directory; passed
            straight to the Browser constructor.

    Returns:
        Browser: The instance produced by the decorated factory.
    """
    instance = Browser(storage_dir=storage_dir)
    return instance