""" app/browser.py Page fetcher + analysis orchestrator for SneakyScope. - Fetches a URL (HTML, redirects, etc.) - Runs the Suspicious Rules Engine (PASS/FAIL for all rules) - Writes artifacts (screenshot.png, source.txt, results.json) into /data// - Returns a single 'result' dict suitable for UI and future API Design notes: - Detection logic (regex/heuristics) lives in the rules engine (YAML/function rules). - This module keeps "plumbing" only (fetch, extract, persist). - Minimal non-detection heuristics remain here (e.g., skip benign script MIME types). Assumptions: - Flask app context is active (uses current_app for logger and RULE_ENGINE). - SANDBOX_STORAGE is configured (default: /data). - enrich_url(url) returns enrichment dict. """ import json import uuid import re from pathlib import Path from typing import Any, Dict, List, Optional, Tuple from urllib.parse import urlparse from bs4 import BeautifulSoup from flask import current_app from playwright.async_api import async_playwright, TimeoutError as PWTimeoutError from app.utils.io_helpers import safe_write from .enrichment import enrich_url from .utils.settings import get_settings settings = get_settings() # --------------------------------------------------------------------------- # Engine access helpers # --------------------------------------------------------------------------- def get_rule_engine(): """ Retrieve the rules engine instance from the Flask application config. Returns: RuleEngine or None: The engine if available, or None if not configured. """ try: return current_app.config.get("RULE_ENGINE") except Exception: return None def _summarize_results(results: List[Dict[str, Any]]) -> Dict[str, int]: """ Summarize a list of engine rule result dicts (result = "PASS"|"FAIL"). Returns: {'fail_count': int, 'total_rules': int} """ summary = {"fail_count": 0, "total_rules": 0} index = 0 total = len(results) while index < total: item = results[index] summary["total_rules"] = summary["total_rules"] + 1 if str(item.get("result", "")).upper() == "FAIL": summary["fail_count"] = summary["fail_count"] + 1 index = index + 1 return summary def run_rule_checks(text: str, category: str) -> Dict[str, Any]: """ Run all rules for a given category against provided text, returning a table-friendly model. Args: text: Text to analyze (HTML, snippet, etc.) category: One of 'form', 'script', 'text' (or any category your rules use) Returns: { "checks": [ { "name": str, "description": str, "category": str, "result": "PASS"|"FAIL", "reason": Optional[str], "severity": Optional[str], "tags": Optional[List[str]] }, ... 
], "summary": { "fail_count": int, "total_rules": int } } """ out: Dict[str, Any] = {"checks": [], "summary": {"fail_count": 0, "total_rules": 0}} engine = get_rule_engine() if engine is None: return out try: engine_results = engine.run_all(text, category=category) # list of dicts # Normalize explicitly index = 0 total = len(engine_results) while index < total: item = engine_results[index] normalized = { "name": item.get("name"), "description": item.get("description"), "category": item.get("category"), "result": item.get("result"), # "PASS" | "FAIL" "reason": item.get("reason"), # present on FAIL by engine design "severity": item.get("severity"), "tags": item.get("tags"), } out["checks"].append(normalized) index = index + 1 out["summary"] = _summarize_results(out["checks"]) except Exception as exc: # Preserve shape; record the error as a synthetic PASS (so UI doesn't break) out["checks"].append({ "name": "engine_error", "description": "Rule engine failed during evaluation", "category": category, "result": "PASS", "reason": f"{exc}", "severity": None, "tags": None }) out["summary"] = {"fail_count": 0, "total_rules": 1} return out def build_rule_checks_overview(full_html_text: str) -> List[Dict[str, Any]]: """ Build a top-level overview for the results page: runs each category across the entire HTML and groups results by category. Returns: [ {"category": "script", "results": [ ...engine dicts... ], "summary": {...}}, {"category": "form", "results": [ ... ], "summary": {...}}, {"category": "text", "results": [ ... ], "summary": {...}}, ] """ overview: List[Dict[str, Any]] = [] engine = get_rule_engine() categories = ["script", "form", "text"] index = 0 total = len(categories) while index < total: cat = categories[index] block = {"category": cat, "results": [], "summary": {"fail_count": 0, "total_rules": 0}} if engine is not None: try: results = engine.run_all(full_html_text, category=cat) block["results"] = results block["summary"] = _summarize_results(results) except Exception as exc: block["results"] = [{ "name": "engine_error", "description": "Rule engine failed during overview evaluation", "category": cat, "result": "PASS", "reason": f"{exc}", "severity": None, "tags": None }] block["summary"] = {"fail_count": 0, "total_rules": 1} overview.append(block) index = index + 1 return overview # --------------------------------------------------------------------------- # Form & Script analysis (plumbing only; detection is in the rules engine) # --------------------------------------------------------------------------- def analyze_forms(html: str, base_url: str) -> List[Dict[str, Any]]: """ Parse forms from the page HTML and apply rule-based checks (engine), keeping only simple plumbing heuristics here (no security logic). Returns list of dicts with keys: - action, method, inputs - flagged (bool), flag_reasons (list[str]), status (str) - rule_checks: {'checks': [...], 'summary': {...}} (per-form snippet evaluation) Note: The 'flagged' value is now purely a legacy visual hint based on simple heuristics; the authoritative PASS/FAIL details are in rule_checks. As you migrate heuristics into function rules, this 'flagged' may be removed entirely. 
""" soup = BeautifulSoup(html, "lxml") forms_info: List[Dict[str, Any]] = [] page_hostname = urlparse(base_url).hostname for form in soup.find_all("form"): action = form.get("action") method = form.get("method", "get").lower() # Build explicit inputs list inputs: List[Dict[str, Any]] = [] for inp in form.find_all("input"): input_name = inp.get("name") input_type = inp.get("type", "text") inputs.append({"name": input_name, "type": input_type}) # Minimal legacy flags (kept for UI continuity; detection lives in engine) flagged_reasons: List[str] = [] if not action or str(action).strip() == "": flagged_reasons.append("No action specified") else: try: action_host = urlparse(action).hostname if not str(action).startswith("/") and action_host != page_hostname: flagged_reasons.append("Submits to a different host") except Exception: pass try: if urlparse(action).scheme == "http" and urlparse(base_url).scheme == "https": flagged_reasons.append("Submits over insecure HTTP") except Exception: pass for hidden in form.find_all("input", type="hidden"): name_value = hidden.get("name") or "" if "password" in name_value.lower(): flagged_reasons.append("Hidden password field") flagged = bool(flagged_reasons) # Serialize a simple form snippet for rule category='form' snippet_lines = [] snippet_lines.append(f"base_url={base_url}") snippet_lines.append(f"base_hostname={page_hostname}") snippet_lines.append(f"action={action}") snippet_lines.append(f"method={method}") snippet_lines.append("inputs=") i = 0 n = len(inputs) while i < n: item = inputs[i] snippet_lines.append(f" - name={item.get('name')} type={item.get('type')}") i = i + 1 form_snippet = "\n".join(snippet_lines) # Per-form rule checks (PASS/FAIL list via engine) rule_checks = run_rule_checks(form_snippet, category="form") forms_info.append({ "action": action, "method": method, "inputs": inputs, "flagged": flagged, "flag_reasons": flagged_reasons, "status": "flagged" if flagged else "possibly safe", "rule_checks": rule_checks }) return forms_info def analyze_scripts(html: str, base_url: str = "") -> List[Dict[str, Any]]: """ Collect script artifacts and evaluate per-script matches via the rules engine. Only include rows that matched at least one rule. Inline scripts are checked against regex rules using their text; external scripts are checked against function rules using a small 'facts' dict (src/hosts). Returns list of dicts like: { "type": "external" | "inline" | "unknown", "src": "...", # for external "content_snippet": "...", # for inline "rules": [ { "name": "...", "description": "..." }, ... 
] } """ soup = BeautifulSoup(html, "lxml") results: List[Dict[str, Any]] = [] # Benign MIME types we ignore entirely (non-detection plumbing) benign_types = {"application/ld+json", "application/json"} engine = get_rule_engine() base_hostname = urlparse(base_url).hostname or "" for script in soup.find_all("script"): try: src = (script.get("src") or "").strip() s_type_attr = (script.get("type") or "").strip().lower() inline_text = script.get_text(strip=True) or "" # Skip benign structured data outright (noise control) if s_type_attr in benign_types: continue record: Dict[str, Any] = {} if src: record["type"] = "external" record["src"] = src elif inline_text: record["type"] = "inline" record["content_snippet"] = (inline_text[:settings.ui.snippet_preview_len]).replace("\n", " ") else: record["type"] = "unknown" # --- Per-script evaluation: gather matches from engine rules matches: List[Dict[str, str]] = [] if engine is not None: # Inline content → run regex script rules against the text if inline_text: for r in engine.rules: if getattr(r, "category", None) == "script" and getattr(r, "rule_type", None) == "regex": ok, reason = r.run(inline_text) if ok: matches.append({ "name": getattr(r, "name", "unknown_rule"), "description": getattr(r, "description", "") or (reason or ""), "severity": getattr(r, "severity", None), "tags": getattr(r, "tags", None), }) # External src → run function script rules with facts if src: facts = { "src": src, "base_url": base_url, "base_hostname": base_hostname, "src_hostname": urlparse(src).hostname or "", "category": "script", } for r in engine.rules: if getattr(r, "category", None) == "script" and getattr(r, "rule_type", None) == "function": ok, reason = r.run(facts) if ok: matches.append({ "name": getattr(r, "name", "unknown_rule"), "description": (reason or "") or getattr(r, "description", ""), "severity": getattr(r, "severity", None), "tags": getattr(r, "tags", None), }) # Only keep rows that matched at least one rule if matches: record["rules"] = matches results.append(record) except Exception as exc: results.append({ "type": "unknown", "heuristics": [f"Script analysis error: {exc}"] }) return results # --------------------------------------------------------------------------- # Fetcher / Orchestrator # --------------------------------------------------------------------------- async def fetch_page_artifacts(url: str, storage_dir: Path) -> Dict[str, Any]: """ Fetch page artifacts and save them in a UUID-based directory. Writes: - /data//screenshot.png - /data//source.txt - /data//results.json (single source of truth for routes) Returns: result dict with keys used by templates (and future API). 
""" run_uuid = str(uuid.uuid4()) run_dir = storage_dir / run_uuid run_dir.mkdir(parents=True, exist_ok=True) screenshot_path = run_dir / "screenshot.png" source_path = run_dir / "source.txt" results_path = run_dir / "results.json" redirects: List[Dict[str, Any]] = [] downloads: List[Dict[str, Any]] = [] scripts_seen: List[str] = [] async with async_playwright() as pw: browser = await pw.chromium.launch( headless=True, args=["--no-sandbox", "--disable-dev-shm-usage", "--disable-blink-features=AutomationControlled"] ) context = await browser.new_context( viewport={"width": 1920, "height": 1080}, user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36", java_script_enabled=True, locale="en-US" ) page = await context.new_page() # Event handlers (plumbing) def _on_response(resp): try: if 300 <= resp.status <= 399: redirects.append({"status": resp.status, "url": resp.url}) except Exception: pass def _on_download(d): try: downloads.append({"url": d.url, "suggested_filename": d.suggested_filename}) except Exception: pass def _on_request(r): try: if r.url.endswith((".js", ".vbs", ".hta")): scripts_seen.append(r.url) except Exception: pass page.on("response", _on_response) page.on("download", _on_download) page.on("request", _on_request) try: await page.goto(url, wait_until="networkidle", timeout=60000) final_url = page.url await page.screenshot(path=str(screenshot_path), full_page=True) html = await page.content() safe_write(source_path, html) except PWTimeoutError: final_url = page.url safe_write(source_path, "Page did not fully load (timeout)") await page.screenshot(path=str(screenshot_path), full_page=True) await context.close() await browser.close() # Read back saved source html_content = source_path.read_text(encoding="utf-8") # Forms analysis (per-form rule checks) forms_info = analyze_forms(html_content, final_url) # Scripts artifacts (no detection here) suspicious_scripts = analyze_scripts(html_content, base_url=final_url) # Enrichment enrichment = enrich_url(url) # Global PASS/FAIL table per category (entire document) rule_checks_overview = build_rule_checks_overview(html_content) for blk in rule_checks_overview: current_app.logger.debug(f"[rules] {blk['category']}: {blk['summary']}") # Assemble single result dict result: Dict[str, Any] = { "uuid": run_uuid, "submitted_url": url, "final_url": final_url, "redirects": redirects, "downloads": downloads, "scripts": scripts_seen, "forms": forms_info, "suspicious_scripts": suspicious_scripts, "rule_checks": rule_checks_overview, # table-ready for UI "enrichment": enrichment } # Persist as the single source of truth for routes safe_write(results_path, json.dumps(result, indent=2, ensure_ascii=False)) try: current_app.logger.info(f"[browser] Saved results.json for run {run_uuid}") except Exception: pass return result def load_results(storage_dir: Path, run_uuid: str) -> Optional[Dict[str, Any]]: """ Load a prior run's results.json from /data//. Returns: dict or None """ run_dir = storage_dir / run_uuid results_path = run_dir / "results.json" if not results_path.exists(): return None try: text = results_path.read_text(encoding="utf-8") data = json.loads(text) return data except Exception: return None