""" app/browser.py Singleton, lazily-loaded page fetcher + analysis orchestrator for SneakyScope. Responsibilities: - Fetch a URL (HTML, redirects, etc.) - Run the Suspicious Rules Engine (PASS/FAIL for all rules) - Write artifacts (screenshot.png, source.txt, results.json) into /data// - Return a single 'result' dict suitable for UI and future API Design notes: - Detection logic (regex/heuristics) lives in the rules engine (YAML/function rules). - This module keeps "plumbing" only (fetch, extract, persist). - Minimal non-detection heuristics remain here (e.g., skip benign script MIME types). Assumptions: - Flask app context is active (uses current_app for logger and RULE_ENGINE). - SANDBOX_STORAGE is configured (default: /data). - enrich_url(url) returns enrichment dict. """ from __future__ import annotations import json import uuid from pathlib import Path from typing import Any, Dict, List, Optional from urllib.parse import urlparse from bs4 import BeautifulSoup from flask import current_app from playwright.async_api import async_playwright, TimeoutError as PWTimeoutError from app.utils.io_helpers import safe_write from app.enrichment import enrich_url from app.utils.settings import get_settings # Load settings once for constants / defaults settings = get_settings() class Browser: """ Orchestrates page fetching and analysis. Meant to be accessed via the lazily-loaded singleton factory `get_browser()`. """ def __init__(self, storage_dir: Optional[Path] = None) -> None: """ Args: storage_dir: Base directory for run artifacts. Defaults to settings.sandbox.storage (typically /data) if not provided. """ if storage_dir is None: try: # Prefer your settings model’s configured storage path storage_dir = Path(settings.sandbox.storage) except Exception: storage_dir = Path("/data") self.storage_dir: Path = storage_dir # ----------------------------------------------------------------------- # Engine access helpers # ----------------------------------------------------------------------- @staticmethod def _get_rule_engine(): """ Retrieve the rules engine instance from the Flask application config. Returns: RuleEngine or None: The engine if available, or None if not configured. """ try: return current_app.config.get("RULE_ENGINE") except Exception: return None @staticmethod def _summarize_results(results: List[Dict[str, Any]]) -> Dict[str, int]: """ Summarize a list of engine rule result dicts (result = "PASS"|"FAIL"). Returns: {'fail_count': int, 'total_rules': int} """ summary = {"fail_count": 0, "total_rules": 0} index = 0 total = len(results) while index < total: item = results[index] summary["total_rules"] = summary["total_rules"] + 1 if str(item.get("result", "")).upper() == "FAIL": summary["fail_count"] = summary["fail_count"] + 1 index = index + 1 return summary def run_rule_checks(self, text: str, category: str) -> Dict[str, Any]: """ Run all rules for a given category against provided text, returning a table-friendly model. Args: text: Text to analyze (HTML, snippet, etc.) category: One of 'form', 'script', 'text' (or any category your rules use) Returns: { "checks": [ { "name": str, "description": str, "category": str, "result": "PASS"|"FAIL", "reason": Optional[str], "severity": Optional[str], "tags": Optional[List[str]] }, ... ], "summary": { "fail_count": int, "total_rules": int } } """ out: Dict[str, Any] = {"checks": [], "summary": {"fail_count": 0, "total_rules": 0}} engine = self._get_rule_engine() if engine is None: return out try: engine_results = engine.run_all(text, category=category) # list of dicts index = 0 total = len(engine_results) while index < total: item = engine_results[index] normalized = { "name": item.get("name"), "description": item.get("description"), "category": item.get("category"), "result": item.get("result"), # "PASS" | "FAIL" "reason": item.get("reason"), # present on FAIL by engine design "severity": item.get("severity"), "tags": item.get("tags"), } out["checks"].append(normalized) index = index + 1 out["summary"] = self._summarize_results(out["checks"]) except Exception as exc: # Preserve shape; record the error as a synthetic PASS (so UI doesn't break) out["checks"].append({ "name": "engine_error", "description": "Rule engine failed during evaluation", "category": category, "result": "PASS", "reason": f"{exc}", "severity": None, "tags": None }) out["summary"] = {"fail_count": 0, "total_rules": 1} return out def build_rule_checks_overview(self, full_html_text: str) -> List[Dict[str, Any]]: """ Build a top-level overview for the results page: runs each category across the entire HTML and groups results by category. Returns: [ {"category": "script", "results": [ ...engine dicts... ], "summary": {...}}, {"category": "form", "results": [ ... ], "summary": {...}}, {"category": "text", "results": [ ... ], "summary": {...}}, ] """ overview: List[Dict[str, Any]] = [] engine = self._get_rule_engine() categories = ["script", "form", "text"] index = 0 total = len(categories) while index < total: cat = categories[index] block = {"category": cat, "results": [], "summary": {"fail_count": 0, "total_rules": 0}} if engine is not None: try: results = engine.run_all(full_html_text, category=cat) block["results"] = results block["summary"] = self._summarize_results(results) except Exception as exc: block["results"] = [{ "name": "engine_error", "description": "Rule engine failed during overview evaluation", "category": cat, "result": "PASS", "reason": f"{exc}", "severity": None, "tags": None }] block["summary"] = {"fail_count": 0, "total_rules": 1} overview.append(block) index = index + 1 return overview # ----------------------------------------------------------------------- # Form & Script analysis (plumbing only; detection is in the rules engine) # ----------------------------------------------------------------------- def analyze_forms(self, html: str, base_url: str) -> List[Dict[str, Any]]: """ Parse forms from the page HTML and apply rule-based checks (engine), keeping only simple plumbing heuristics here (no security logic). Returns list of dicts with keys: - action, method, inputs - flagged (bool), flag_reasons (list[str]), status (str) - rule_checks: {'checks': [...], 'summary': {...}} (per-form snippet evaluation) """ soup = BeautifulSoup(html, "lxml") forms_info: List[Dict[str, Any]] = [] page_hostname = urlparse(base_url).hostname for form in soup.find_all("form"): action = form.get("action") method = form.get("method", "get").lower() inputs: List[Dict[str, Any]] = [] for inp in form.find_all("input"): input_name = inp.get("name") input_type = inp.get("type", "text") inputs.append({"name": input_name, "type": input_type}) flagged_reasons: List[str] = [] if not action or str(action).strip() == "": flagged_reasons.append("No action specified") else: try: action_host = urlparse(action).hostname if not str(action).startswith("/") and action_host != page_hostname: flagged_reasons.append("Submits to a different host") except Exception: pass try: if urlparse(action).scheme == "http" and urlparse(base_url).scheme == "https": flagged_reasons.append("Submits over insecure HTTP") except Exception: pass for hidden in form.find_all("input", type="hidden"): name_value = hidden.get("name") or "" if "password" in name_value.lower(): flagged_reasons.append("Hidden password field") flagged = bool(flagged_reasons) # Serialize a simple form snippet for rule category='form' snippet_lines = [] snippet_lines.append(f"base_url={base_url}") snippet_lines.append(f"base_hostname={page_hostname}") snippet_lines.append(f"action={action}") snippet_lines.append(f"method={method}") snippet_lines.append("inputs=") i = 0 n = len(inputs) while i < n: item = inputs[i] snippet_lines.append(f" - name={item.get('name')} type={item.get('type')}") i = i + 1 form_snippet = "\n".join(snippet_lines) # Per-form rule checks (PASS/FAIL list via engine) rule_checks = self.run_rule_checks(form_snippet, category="form") forms_info.append({ "action": action, "method": method, "inputs": inputs, "flagged": flagged, "flag_reasons": flagged_reasons, "status": "flagged" if flagged else "possibly safe", "rule_checks": rule_checks }) return forms_info def analyze_scripts(self, html: str, base_url: str = "") -> List[Dict[str, Any]]: """ Collect script artifacts and evaluate per-script matches via the rules engine. Only include rows that matched at least one rule. """ soup = BeautifulSoup(html, "lxml") results: List[Dict[str, Any]] = [] benign_types = {"application/ld+json", "application/json"} engine = self._get_rule_engine() base_hostname = urlparse(base_url).hostname or "" for script in soup.find_all("script"): try: src = (script.get("src") or "").strip() s_type_attr = (script.get("type") or "").strip().lower() inline_text = script.get_text(strip=True) or "" if s_type_attr in benign_types: continue record: Dict[str, Any] = {} if src: record["type"] = "external" record["src"] = src elif inline_text: # respect your UI snippet config preview_len = getattr(settings.ui, "snippet_preview_len", 200) record["type"] = "inline" record["content_snippet"] = (inline_text[:preview_len]).replace("\n", " ") else: record["type"] = "unknown" matches: List[Dict[str, Any]] = [] if engine is not None: if inline_text: for r in engine.rules: if getattr(r, "category", None) == "script" and getattr(r, "rule_type", None) == "regex": ok, reason = r.run(inline_text) if ok: matches.append({ "name": getattr(r, "name", "unknown_rule"), "description": getattr(r, "description", "") or (reason or ""), "severity": getattr(r, "severity", None), "tags": getattr(r, "tags", None), }) if src: facts = { "src": src, "base_url": base_url, "base_hostname": base_hostname, "src_hostname": urlparse(src).hostname or "", "category": "script", } for r in engine.rules: if getattr(r, "category", None) == "script" and getattr(r, "rule_type", None) == "function": ok, reason = r.run(facts) if ok: matches.append({ "name": getattr(r, "name", "unknown_rule"), "description": (reason or "") or getattr(r, "description", ""), "severity": getattr(r, "severity", None), "tags": getattr(r, "tags", None), }) if matches: record["rules"] = matches results.append(record) except Exception as exc: results.append({ "type": "unknown", "heuristics": [f"Script analysis error: {exc}"] }) return results # ----------------------------------------------------------------------- # Fetcher / Orchestrator # ----------------------------------------------------------------------- async def fetch_page_artifacts(self, url: str) -> Dict[str, Any]: """ Fetch page artifacts and save them in a UUID-based directory for this Browser's storage_dir. Writes: - /data//screenshot.png - /data//source.txt - /data//results.json (single source of truth for routes) Returns: result dict with keys used by templates (and future API). """ run_uuid = str(uuid.uuid4()) run_dir = self.storage_dir / run_uuid run_dir.mkdir(parents=True, exist_ok=True) screenshot_path = run_dir / "screenshot.png" source_path = run_dir / "source.txt" results_path = run_dir / "results.json" redirects: List[Dict[str, Any]] = [] downloads: List[Dict[str, Any]] = [] scripts_seen: List[str] = [] async with async_playwright() as pw: browser = await pw.chromium.launch( headless=True, args=["--no-sandbox", "--disable-dev-shm-usage", "--disable-blink-features=AutomationControlled"] ) context = await browser.new_context( viewport={"width": 1920, "height": 1080}, user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36", java_script_enabled=True, locale="en-US" ) page = await context.new_page() # Event handlers (plumbing) def _on_response(resp): try: if 300 <= resp.status <= 399: redirects.append({"status": resp.status, "url": resp.url}) except Exception: pass def _on_download(d): try: downloads.append({"url": d.url, "suggested_filename": d.suggested_filename}) except Exception: pass def _on_request(r): try: if r.url.endswith((".js", ".vbs", ".hta")): scripts_seen.append(r.url) except Exception: pass page.on("response", _on_response) page.on("download", _on_download) page.on("request", _on_request) try: await page.goto(url, wait_until="networkidle", timeout=60000) final_url = page.url await page.screenshot(path=str(screenshot_path), full_page=True) html = await page.content() safe_write(source_path, html) except PWTimeoutError: final_url = page.url safe_write(source_path, "Page did not fully load (timeout)") await page.screenshot(path=str(screenshot_path), full_page=True) await context.close() await browser.close() # Read back saved source html_content = source_path.read_text(encoding="utf-8") # Forms analysis (per-form rule checks) forms_info = self.analyze_forms(html_content, final_url) # Scripts artifacts (no detection here) suspicious_scripts = self.analyze_scripts(html_content, base_url=final_url) # Enrichment enrichment = enrich_url(url) # Global PASS/FAIL table per category (entire document) rule_checks_overview = self.build_rule_checks_overview(html_content) try: for blk in rule_checks_overview: current_app.logger.debug(f"[rules] {blk['category']}: {blk['summary']}") except Exception: pass # Assemble single result dict result: Dict[str, Any] = { "uuid": run_uuid, "submitted_url": url, "final_url": final_url, "redirects": redirects, "downloads": downloads, "scripts": scripts_seen, "forms": forms_info, "suspicious_scripts": suspicious_scripts, "rule_checks": rule_checks_overview, # table-ready for UI "enrichment": enrichment } # Persist as the single source of truth for routes safe_write(results_path, json.dumps(result, indent=2, ensure_ascii=False)) try: current_app.logger.info(f"[browser] Saved results.json for run {run_uuid}") except Exception: pass return result # --------------------------------------------------------------------------- # Lazy-loaded singleton factory # --------------------------------------------------------------------------- # Prefer importing your project-wide singleton decorator. try: from app.utils.settings import singleton_loader # if we already export it except Exception: # Local fallback if import is not available. from typing import Callable, TypeVar import functools T = TypeVar("T") def singleton_loader(func: Callable[..., T]) -> Callable[..., T]: """Ensure the function only runs once, returning the cached value.""" cache: dict[str, T] = {} @functools.wraps(func) def wrapper(*args, **kwargs) -> T: if func.__name__ not in cache: cache[func.__name__] = func(*args, **kwargs) return cache[func.__name__] return wrapper @singleton_loader def get_browser(storage_dir: Optional[Path] = None) -> Browser: """ Lazily construct and cache a singleton Browser instance. Args: storage_dir: Optional override for artifact base directory. Returns: Browser: The singleton instance. """ return Browser(storage_dir=storage_dir)