first commit

2025-08-20 21:22:28 +00:00
commit 70d29f9f95
26 changed files with 2558 additions and 0 deletions

10
.env.example Normal file

@@ -0,0 +1,10 @@
# Flask Configuration
FLASK_ENV=production
SECRET_KEY=changeme_super_long_random_secret
PYTHONUNBUFFERED=1
# Playwright (browser automation)
PLAYWRIGHT_BROWSERS_PATH=/ms-playwright
# Sandbox Storage
SANDBOX_STORAGE=/data

2
.gitignore vendored Normal file

@@ -0,0 +1,2 @@
.env
/data/

34
Dockerfile Normal file

@@ -0,0 +1,34 @@
# Use the official Playwright image with browsers preinstalled
FROM mcr.microsoft.com/playwright/python:v1.45.0-jammy
# Use the non-root pwuser that ships with the base image (switch to root temporarily for setup)
USER root
# System deps (whois, dig, etc. — handy for later stages)
RUN apt-get update \
&& apt-get install -y --no-install-recommends \
whois dnsutils iputils-ping ca-certificates \
&& rm -rf /var/lib/apt/lists/*
WORKDIR /app
# Copy requirements first to leverage Docker layer caching
COPY requirements.txt ./
RUN pip install --no-cache-dir -r requirements.txt
# Copy application code (the nested path is needed because the repo's app/ package must live at /app/app inside the image)
COPY app/ /app/app/
COPY entrypoint.sh ./entrypoint.sh
RUN chmod +x /app/entrypoint.sh
# Create data dir for screenshots/artifacts
RUN mkdir -p /data && chown -R pwuser:pwuser /data /app
USER pwuser
# Expose port
EXPOSE 8000
# Start server
ENTRYPOINT ["/app/entrypoint.sh"]

92
Readme.md Normal file

@@ -0,0 +1,92 @@
# URL Sandbox
A lightweight web-based sandbox for analyzing websites and domains.
It performs WHOIS lookups, GeoIP enrichment, script/form inspection, and provides analyst-friendly output.
---
## 🚀 Features
- **Domain & IP Enrichment**
- WHOIS lookups with fallback to raw text when fields are missing
- Explicit handling of privacy-protected WHOIS records (`N/A` or `Possible Privacy`)
- GeoIP (City, Region, Country, Latitude/Longitude)
- ASN, ISP, and network details
- **Flagged Content Analysis**
- Suspicious script detection
- Suspicious form detection
- Nested bullet-style reporting for clarity
- **Improved UX**
- Automatic addition of `http://`, `https://`, and `www.` if only a domain is provided
- Modal spinner to indicate background analysis (`Analyzing website…`)
- **Resilient GeoLite2 Database Management**
- Downloads the MaxMind GeoLite2-City database on first startup
- Checks file age and only re-downloads if older than **14 days** (configurable via an environment variable); a rough sketch of this check is shown below
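The age check can be pictured as a small helper like the one below. This is an illustrative sketch only: the database path and the `GEOIP_MAX_AGE_DAYS` variable name are assumptions, not values defined by this project.

```python
# Sketch of the "re-download if older than 14 days" check (path and env var name are illustrative).
import os
import time
from pathlib import Path

DB_PATH = Path("/data/GeoLite2-City.mmdb")                  # assumed download location
MAX_AGE_DAYS = int(os.getenv("GEOIP_MAX_AGE_DAYS", "14"))   # assumed variable name

def needs_refresh(path: Path = DB_PATH, max_age_days: int = MAX_AGE_DAYS) -> bool:
    """Return True when the GeoLite2 database is missing or older than max_age_days."""
    if not path.exists():
        return True
    age_seconds = time.time() - path.stat().st_mtime
    return age_seconds > max_age_days * 86400
```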
---
## ⚙️ Setup Instructions
### 1. Clone the Repository
```bash
git clone https://github.com/yourusername/url-sandbox.git
cd url-sandbox
```
### 2. Create a MaxMind Account & License Key
1. Go to [MaxMind GeoLite2](https://dev.maxmind.com/geoip/geolite2-free-geolocation-data)
2. Sign up for a free account
3. Navigate to **Account > Manage License Keys**
4. Generate a new license key
### 3. Configure Environment Variables
All environment variables are loaded from a `.env` file.
1. Copy the sample file:
```bash
cp .env.example .env
```
2. Edit `.env` and set your values (see [`.env.example`](./.env.example) for available options).
Make sure to add your **MaxMind License Key** under `MAXMIND_LICENSE_KEY`.
### 4. Run with Docker Compose
```bash
docker-compose up --build
```
This will:
- Build the app
- Download the GeoLite2 database if not present or too old
- Start the web interface
---
## 📝 Example Output
**WHOIS Info**
- Registrar: MarkMonitor, Inc.
- Organization: Possible Privacy
- Creation: 1997-09-15
- Expiration: 2028-09-14
**GeoIP Info**
- IP: 172.66.159.20
- City: N/A
- Region: N/A
- Country: United States
- Coordinates: (37.751, -97.822)
- ASN: 13335
- ISP: Cloudflare, Inc.
---
## 📌 Roadmap
See [Next Steps Checklist](docs/roadmap.md) for planned features:
- Improved UI templates
- Artifact cleanup
- Proxy support (optional)
---

82
app/__init__.py Normal file

@@ -0,0 +1,82 @@
"""
app/__init__.py
Application factory and startup hooks for SneakyScope.
Responsibilities:
- Create the Flask app.
- Load settings (YAML -> dataclasses) with safe defaults.
- Initialize and load the Suspicious Rules Engine from YAML.
- Register blueprints (routes).
- Configure core paths (e.g., SANDBOX_STORAGE).
"""
import os
import logging
from pathlib import Path
from flask import Flask
# Local imports
from .utils.settings import get_settings
from .utils import io_helpers # if you need logging/setup later
from .utils import cache_db # available for future injections
from .utils.rules_engine import RuleEngine, load_rules_from_yaml # rules engine
from . import routes # blueprint
def create_app() -> Flask:
"""
Create and configure the Flask application instance.
Returns:
Flask: The configured Flask app.
"""
# Basic app object
app = Flask(__name__, template_folder="templates", static_folder="static")
# Load settings (safe fallback to defaults if file missing)
settings = get_settings()
# Secret key loaded from env
app.secret_key = os.getenv("SECRET_KEY")
# Configure storage directory (bind-mount is still handled by sandbox.sh)
sandbox_storage_default = Path("/data")
app.config["SANDBOX_STORAGE"] = str(sandbox_storage_default)
# Initialize Suspicious Rules Engine at startup
# Determine rules file path relative to this package
base_dir = Path(__file__).resolve().parent
rules_path = base_dir / "config" / "suspicious_rules.yaml"
# Create an engine instance (even if file missing, we still want an engine)
engine = RuleEngine()
# Try to load from YAML if present; log clearly if not
if rules_path.exists():
try:
loaded_rules = load_rules_from_yaml(rules_path)
# Add rules one-by-one (explicit)
for rule in loaded_rules:
engine.add_rule(rule)
app.logger.info(f"[+] Loaded {len(loaded_rules)} suspicious rules from {rules_path}")
except Exception as e:
app.logger.warning(f"[!] Failed loading rules from {rules_path}: {e}")
else:
app.logger.warning(f"[!] Rules file not found at {rules_path}. Engine will start with zero rules.")
# Store engine on app config so it is accessible via current_app
app.config["RULE_ENGINE"] = engine
# Make app name/version available for templates here if you want it globally
app.config["APP_NAME"] = settings.app.name
app.config["APP_VERSION"] = f"v{settings.app.version_major}.{settings.app.version_minor}"
# Register blueprints
app.register_blueprint(routes.bp)
# Example log line so we know we booted cleanly
app.logger.info(f"SneakyScope started: {app.config['APP_NAME']} {app.config['APP_VERSION']}")
app.logger.info(f"SANDBOX_STORAGE: {app.config['SANDBOX_STORAGE']}")
return app
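For local debugging outside Gunicorn, the factory can be driven directly; the snippet below is a minimal sketch and is not part of this commit (production serves `app.wsgi:app` through the entrypoint script).

```python
# Hypothetical local-debug runner (sketch only); production uses gunicorn with app.wsgi:app.
from app import create_app

if __name__ == "__main__":
    dev_app = create_app()
    dev_app.run(host="127.0.0.1", port=8000, debug=True)
```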

400
app/browser.py Normal file

@@ -0,0 +1,400 @@
import re
import uuid
import json
from pathlib import Path
from bs4 import BeautifulSoup
from datetime import datetime
from urllib.parse import urlparse
from typing import Dict, Any, Optional
from playwright.async_api import async_playwright, TimeoutError as PWTimeoutError
from flask import current_app # access the rule engine from app config
from app.utils.io_helpers import safe_write
from .enrichment import enrich_url
def get_rule_engine():
"""
Retrieve the rules engine instance from the Flask application config.
Returns:
RuleEngine or None: The engine if available, or None if not configured.
"""
try:
# current_app is only available during an active request context
engine = current_app.config.get("RULE_ENGINE")
return engine
except Exception:
# If called outside a Flask request context, fail gracefully
return None
def run_rule_checks(text, category):
"""
Run all rules for a given category against the provided text.
Args:
text (str): The content to test (e.g., form snippet, inline JS).
category (str): The rule category to run (e.g., 'form' or 'script').
Returns:
dict: {
"checks": [ { "rule": str, "category": str, "matched": bool, "reason": Optional[str] }, ... ],
"summary": { "matched_count": int, "total_rules": int }
}
"""
result = {
"checks": [],
"summary": {
"matched_count": 0,
"total_rules": 0
}
}
engine = get_rule_engine()
if engine is None:
# No engine configured; return empty but well-formed structure
return result
try:
# Run engine rules for the specified category
check_results = engine.run_all(text, category=category)
# Normalize results into the expected structure
total = 0
matched = 0
for item in check_results:
# item is expected to contain: rule, category, matched, reason (optional)
total = total + 1
if bool(item.get("matched")):
matched = matched + 1
normalized = {
"rule": item.get("rule"),
"category": item.get("category"),
"matched": bool(item.get("matched")),
"reason": item.get("reason")
}
result["checks"].append(normalized)
result["summary"]["matched_count"] = matched
result["summary"]["total_rules"] = total
except Exception as e:
# If anything goes wrong, keep structure and add a fake failure note
result["checks"].append({
"rule": "engine_error",
"category": category,
"matched": False,
"reason": f"Rule engine error: {e}"
})
result["summary"]["matched_count"] = 0
result["summary"]["total_rules"] = 0
return result
def analyze_forms(html: str, base_url: str):
"""
Parse forms from the page HTML and apply heuristic flags and rule-based checks.
Args:
html (str): The full page HTML.
base_url (str): The final URL of the page (used for hostname comparisons).
Returns:
list[dict]: A list of form analysis dictionaries, each including:
- action, method, inputs
- flagged (bool), flag_reasons (list[str]), status (str)
- rule_checks: dict with "checks" (list) and "summary" (dict)
"""
soup = BeautifulSoup(html, "lxml")
forms_info = []
page_hostname = urlparse(base_url).hostname
for form in soup.find_all("form"):
action = form.get("action")
method = form.get("method", "get").lower()
# Build explicit inputs list
inputs = []
for inp in form.find_all("input"):
input_name = inp.get("name")
input_type = inp.get("type", "text")
inputs.append({
"name": input_name,
"type": input_type
})
flagged_reasons = []
# No action specified
if not action or str(action).strip() == "":
flagged_reasons.append("No action specified")
# External host
else:
try:
action_host = urlparse(action).hostname
# Only flag absolute actions that point at a different host (relative actions stay on-site)
if action_host and action_host != page_hostname:
flagged_reasons.append("Submits to a different host")
except Exception:
# If hostname parsing fails, skip this condition quietly
pass
# HTTP form on HTTPS page
try:
if urlparse(action).scheme == "http" and urlparse(base_url).scheme == "https":
flagged_reasons.append("Submits over insecure HTTP")
except Exception:
# If scheme parsing fails, ignore
pass
# Hidden password / suspicious hidden inputs
for hidden in form.find_all("input", type="hidden"):
name_value = hidden.get("name") or ""
if "password" in name_value.lower():
flagged_reasons.append("Hidden password field")
flagged = bool(flagged_reasons)
# Serialize a simple form snippet for the rules engine to analyze (category='form')
snippet_lines = []
snippet_lines.append(f"action={action}")
snippet_lines.append(f"method={method}")
snippet_lines.append("inputs=")
for item in inputs:
snippet_lines.append(f" - name={item.get('name')} type={item.get('type')}")
form_snippet = "\n".join(snippet_lines)
rule_checks = run_rule_checks(form_snippet, category="form")
forms_info.append({
"action": action,
"method": method,
"inputs": inputs,
"flagged": flagged,
"flag_reasons": flagged_reasons,
"status": "flagged" if flagged else "possibly safe",
"rule_checks": rule_checks
})
return forms_info
def analyze_scripts(html: str, base_url: str = "", engine=None) -> list[dict]:
"""
Analyze <script> elements using the RuleEngine (if provided) and
lightweight built-in heuristics. Only append a record when at least
one rule or heuristic matches, and always set a sensible 'type'.
Returns list of dicts like:
{
"type": "external" | "inline" | "unknown",
"src": "...", # for external
"content_snippet": "...", # for inline
"rules": [ { "name": "...", "description": "..." }, ... ],
"heuristics": [ "reason1", "reason2", ... ]
}
"""
soup = BeautifulSoup(html, "lxml")
results: list[dict] = []
# Benign MIME types we ignore entirely
benign_types = {"application/ld+json", "application/json"}
# Suspicious file extensions for external scripts
dangerous_ext = (".vbs", ".hta")
# Inline red flags
risky_inline_patterns = [
(re.compile(r"\beval\s*\(", re.IGNORECASE), "Uses eval()"),
(re.compile(r"\bnew\s+Function\s*\(", re.IGNORECASE), "Uses Function constructor"),
(re.compile(r"\bdocument\.write\s*\(", re.IGNORECASE), "Uses document.write()"),
(re.compile(r"\bActiveXObject\s*\(", re.IGNORECASE), "Uses ActiveXObject (IE-only)"),
(re.compile(r"\batob\s*\(", re.IGNORECASE), "Uses atob() (possible obfuscation)"),
(re.compile(r"\bunescape\s*\(", re.IGNORECASE), "Uses unescape() (legacy/obfuscation)"),
(re.compile(r"\bset(?:Timeout|Interval)\s*\(\s*['\"`].+['\"`]\s*,", re.IGNORECASE),
"String passed to setTimeout/setInterval"),
(re.compile(r"[\"']?0x[0-9a-fA-F]{16,}[\"']?", re.IGNORECASE),
"Contains long hex-like constants (possible obfuscation)"),
]
base_host = urlparse(base_url).hostname or ""
for script in soup.find_all("script"):
try:
src = (script.get("src") or "").strip()
s_type_attr = (script.get("type") or "").strip().lower()
# IMPORTANT: .string is often None; get_text() is reliable
inline_text = script.get_text(strip=True) or ""
# Skip benign structured data outright
if s_type_attr in benign_types:
continue
# ---- Build facts for the rules engine
facts = {
"script_type_attr": s_type_attr or None,
"has_src": bool(src),
"src": src or None,
"attrs": dict(script.attrs),
"inline_len": len(inline_text),
"inline_preview": inline_text[:200].replace("\n", " ") if inline_text else None,
"base_url": base_url or None,
"base_hostname": base_host or None,
"src_hostname": urlparse(src).hostname if src else None,
}
# ---- Evaluate rules engine (using name/description)
engine_matches: list[dict] = []
if engine is not None:
try:
if hasattr(engine, "evaluate_script"):
matches = engine.evaluate_script(facts)
elif hasattr(engine, "evaluate"):
matches = engine.evaluate(facts)
else:
matches = []
if isinstance(matches, list):
for m in matches:
if isinstance(m, dict) and "name" in m:
engine_matches.append({
"name": m["name"],
"description": m.get("description", "")
})
elif isinstance(m, str):
engine_matches.append({"name": m, "description": ""})
except Exception as e:
engine_matches.append({"name": "Rules Engine Error", "description": str(e)})
# ---- Built-in heuristics
heuristics: list[str] = []
if src:
# Unusual URL schemes for script sources
if src.startswith(("data:", "blob:")):
heuristics.append("Script src uses data:/blob: URL")
# Dangerous extensions
for ext in dangerous_ext:
if src.lower().endswith(ext):
heuristics.append(f"External script with dangerous extension ({ext.lstrip('.')})")
break
# Third-party host hint
src_host = facts.get("src_hostname") or ""
if base_host and src_host and src_host != base_host:
heuristics.append(f"Third-party host: {src_host}")
else:
if inline_text:
for pat, why in risky_inline_patterns:
if pat.search(inline_text):
heuristics.append(why)
# ---- Only append when something matched; always set type
if engine_matches or heuristics:
record: dict = {}
if src:
record["type"] = "external"
record["src"] = src
elif inline_text:
record["type"] = "inline"
record["content_snippet"] = facts.get("inline_preview")
else:
record["type"] = "unknown"
if engine_matches:
record["rules"] = engine_matches
if heuristics:
record["heuristics"] = heuristics
results.append(record)
except Exception as e:
# Never let a single broken <script> kill the whole analysis
results.append({
"type": "unknown",
"heuristics": [f"Script analysis error: {e}"]
})
return results
async def fetch_page_artifacts(url: str, storage_dir: Path, engine=None) -> Dict[str, Any]:
"""
Fetch page artifacts and save them in a UUID-based directory.
Args:
url (str): URL to analyze.
storage_dir (Path): Base /data path.
engine: Optional rules engine instance (from app.config["RULE_ENGINE"]).
"""
run_uuid = str(uuid.uuid4())
run_dir = storage_dir / run_uuid
run_dir.mkdir(parents=True, exist_ok=True)
screenshot_path = run_dir / "screenshot.png"
source_path = run_dir / "source.txt"
results_path = run_dir / "results.json"
redirects = []
downloads = []
scripts = []
async with async_playwright() as pw:
browser = await pw.chromium.launch(
headless=True,
args=["--no-sandbox", "--disable-dev-shm-usage", "--disable-blink-features=AutomationControlled"]
)
context = await browser.new_context(
viewport={"width": 1920, "height": 1080},
user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36",
java_script_enabled=True,
locale="en-US"
)
page = await context.new_page()
# Event handlers
page.on("response", lambda resp: redirects.append({"status": resp.status, "url": resp.url}) if 300 <= resp.status <= 399 else None)
page.on("download", lambda d: downloads.append({"url": d.url, "suggested_filename": d.suggested_filename}))
page.on("request", lambda r: scripts.append(r.url) if r.url.endswith((".js", ".vbs", ".hta")) else None)
try:
await page.goto(url, wait_until="networkidle", timeout=60000)
final_url = page.url
await page.screenshot(path=str(screenshot_path), full_page=True)
html = await page.content()
safe_write(source_path, html)
except PWTimeoutError:
final_url = page.url
safe_write(source_path, "Page did not fully load (timeout)")
await page.screenshot(path=str(screenshot_path), full_page=True)
await context.close()
await browser.close()
html_content = source_path.read_text(encoding="utf-8")
forms_info = analyze_forms(html_content, final_url)
suspicious_scripts = analyze_scripts(html_content, base_url=final_url, engine=engine)
enrichment = enrich_url(url)
result = {
"uuid": run_uuid,
"submitted_url": url,
"final_url": final_url,
"redirects": redirects,
"downloads": downloads,
"scripts": scripts,
"forms": forms_info,
"suspicious_scripts": suspicious_scripts,
"enrichment": enrichment
}
safe_write(results_path, json.dumps(result, indent=2))
return result
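As a quick illustration of the script heuristics above, `analyze_scripts()` can be exercised on a small HTML snippet without a browser or rule engine; the markup and host names below are purely illustrative.

```python
# Sketch: exercising analyze_scripts() standalone (no Flask context or rule engine required).
from app.browser import analyze_scripts

sample_html = """
<script>eval(atob('YWxlcnQoMSk='));</script>
<script src="https://cdn.example.net/lib.js"></script>
"""
for finding in analyze_scripts(sample_html, base_url="https://example.com"):
    print(finding["type"], finding.get("src"), finding.get("heuristics"))
```

The inline script should surface the eval()/atob() heuristics, while the external script is reported as a third-party host.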

5
app/config/bec_words.yaml Normal file

@@ -0,0 +1,5 @@
words:
- "reset password"
- "open document"
- "view document"
- "verify account"

9
app/config/settings.yaml Normal file

@@ -0,0 +1,9 @@
app:
name: SneakyScope
version_major: 0
version_minor: 1
cache:
recent_runs_count: 10
whois_cache_days: 7
geoip_cache_days: 7

80
app/config/suspicious_rules.yaml Normal file

@@ -0,0 +1,80 @@
# config/suspicious_rules.yaml
# Baseline suspicious rules for SneakyScope
# Organized by category: script, form, text
# Extend these with more specific rules as needed
# --- Script Rules ---
- name: eval_usage
description: "Use of eval() in script"
category: script
type: regex
pattern: "\\beval\\("
- name: document_write
description: "Use of document.write (often abused in malicious injections)"
category: script
type: regex
pattern: "document\\.write\\("
- name: inline_event_handler
description: "Inline event handler detected (onload, onclick, etc.)"
category: script
type: regex
pattern: "on(load|click|error|mouseover|keydown)\\s*="
- name: obfuscated_encoding
description: "Suspicious use of atob() or btoa() (base64 encoding/decoding)"
category: script
type: regex
pattern: "\\b(atob|btoa)\\("
- name: suspicious_iframe
description: "Iframe usage in script (possible phishing/malvertising)"
category: script
type: regex
pattern: "<iframe[^>]*>"
# --- Form Rules ---
- name: suspicious_form_action
description: "Form action with external URL (potential credential exfiltration)"
category: form
type: regex
pattern: "<form[^>]*action=['\"]http"
- name: hidden_inputs
description: "Form with hidden inputs (possible credential harvesting)"
category: form
type: regex
pattern: "<input[^>]*type=['\"]hidden"
- name: password_field
description: "Form requesting password field"
category: form
type: regex
pattern: "<input[^>]*type=['\"]password"
# --- Text Rules (Social Engineering / BEC) ---
- name: urgent_request
description: "Language suggesting urgency (common in phishing/BEC)"
category: text
type: regex
pattern: "(urgent|immediately|asap|action required)"
- name: account_suspension
description: "Threat of account suspension/closure"
category: text
type: regex
pattern: "(account.*suspend|account.*close|verify.*account)"
- name: financial_request
description: "Request for gift cards, wire transfer, or money"
category: text
type: regex
pattern: "(gift card|wire transfer|bank account|bitcoin|payment required)"
- name: credential_reset
description: "Password reset or credential reset wording"
category: text
type: regex
pattern: "(reset password|update credentials|login to verify)"

137
app/enrichment.py Normal file
View File

@@ -0,0 +1,137 @@
import logging
from pathlib import Path
from urllib.parse import urlparse
import requests
import yaml
import whois
from datetime import datetime
from ipaddress import ip_address
import socket
# Local imports
from .utils.cache_db import get_cache
from .utils.settings import get_settings
# Configure logging
logging.basicConfig(level=logging.INFO, format="[%(levelname)s] %(message)s")
# Init cache
cache = get_cache("/data/cache.db")
settings = get_settings()
# Load BEC words
BEC_WORDS_FILE = Path(__file__).parent / "config" / "bec_words.yaml"  # app/config/, alongside settings.yaml and suspicious_rules.yaml
if BEC_WORDS_FILE.exists():
with open(BEC_WORDS_FILE, "r", encoding="utf-8") as f:
BEC_WORDS = yaml.safe_load(f).get("words", [])
else:
BEC_WORDS = []
# Cache TTLs are expressed in minutes (CacheDB expects minutes)
MINUTES_PER_DAY = 24 * 60
GEOIP_DEFAULT_TTL = settings.cache.geoip_cache_days * MINUTES_PER_DAY
WHOIS_DEFAULT_TTL = settings.cache.whois_cache_days * MINUTES_PER_DAY
def enrich_url(url: str) -> dict:
"""Perform WHOIS, GeoIP, and BEC word enrichment."""
result = {}
# Extract hostname
parsed = urlparse(url)
hostname = parsed.hostname or url # fallback if parsing fails
# --- WHOIS ---
result.update(enrich_whois(hostname))
# --- GeoIP ---
result["geoip"] = enrich_geoip(hostname)
# --- BEC Words ---
result["bec_words"] = [w for w in BEC_WORDS if w.lower() in url.lower()]
return result
def enrich_whois(hostname: str) -> dict:
"""Fetch WHOIS info using python-whois with safe type handling."""
cache_key = f"whois:{hostname}"
cached = cache.read(cache_key)
if cached:
logging.info(f"[CACHE HIT] for WHOIS: {hostname}")
return cached
logging.info(f"[CACHE MISS] for WHOIS: {hostname}")
result = {}
try:
w = whois.whois(hostname)
def format_dt(val):
if isinstance(val, list):
return ", ".join([v.strftime("%Y-%m-%d %H:%M:%S") if isinstance(v, datetime) else str(v) for v in val])
elif isinstance(val, datetime):
return val.strftime("%Y-%m-%d %H:%M:%S")
elif val is None:
return "Possible Privacy"
else:
return str(val)
result["whois"] = {
"registrar": format_dt(getattr(w, "registrar", None)),
"creation_date": format_dt(getattr(w, "creation_date", None)),
"expiration_date": format_dt(getattr(w, "expiration_date", None)),
"owner": format_dt(getattr(w, "org", None))
}
except Exception as e:
logging.warning(f"WHOIS lookup failed for {hostname}: {e}")
try:
# fallback raw whois text
import subprocess
raw_output = subprocess.check_output(["whois", hostname], encoding="utf-8", errors="ignore")
result["whois"] = {}
result["raw_whois"] = raw_output
except Exception as raw_e:
logging.error(f"Raw WHOIS also failed: {raw_e}")
result["whois"] = {}
result["raw_whois"] = "N/A"
cache.create(cache_key, result, WHOIS_DEFAULT_TTL)
return result
def enrich_geoip(hostname: str) -> dict:
"""Resolve hostname to IPs and fetch info from ip-api.com."""
geo_info = {}
ips = extract_ips_from_url(hostname)
for ip in ips:
ip_str = str(ip)
cache_key = f"geoip:{ip_str}"
cached = cache.read(cache_key)
if cached:
logging.info(f"[CACHE HIT] for GEOIP: {ip}")
geo_info[ip_str] = cached
continue
logging.info(f"[CACHE MISS] for GEOIP: {ip}")
try:
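# Note: 'fields' is ip-api.com's numeric field mask, a compact way of selecting which JSON fields the API returns (see ip-api's field builder for exactly what this value encodes).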
resp = requests.get(f"http://ip-api.com/json/{ip_str}?fields=24313855", timeout=5)
if resp.status_code == 200:
geo_info[ip_str] = resp.json()
else:
geo_info[ip_str] = {"error": f"HTTP {resp.status_code}"}
except Exception as e:
geo_info[ip_str] = {"error": str(e)}
cache.create(cache_key, geo_info[ip_str], GEOIP_DEFAULT_TTL)
return geo_info
def extract_ips_from_url(hostname: str):
"""Resolve hostname to IPs."""
try:
info = socket.getaddrinfo(hostname, None)
return list({ip_address(x[4][0]) for x in info})
except Exception:
return []

125
app/routes.py Normal file

@@ -0,0 +1,125 @@
import os
import json
import asyncio
from pathlib import Path
from datetime import datetime
from flask import Blueprint, render_template, request, redirect, url_for, flash, current_app, send_file, abort
from .browser import fetch_page_artifacts
from .enrichment import enrich_url
from .utils.settings import get_settings
from .utils.io_helpers import get_recent_results
bp = Blueprint("main", __name__)
settings = get_settings()
app_name = settings.app.name
app_version = f"v{settings.app.version_major}.{settings.app.version_minor}"
# --- context processor ---
@bp.context_processor
def inject_app_info():
"""Inject app name and version into all templates."""
return {
"app_name": app_name,
"app_version": app_version
}
@bp.route("/", methods=["GET"])
def index():
"""
Render the landing page with optional 'recent_results' list.
The number of recent runs is controlled via settings.cache.recent_runs_count (int).
Falls back to 10 if not present or invalid.
"""
# Resolve SANDBOX_STORAGE from app config
storage = Path(current_app.config["SANDBOX_STORAGE"]).resolve()
# Pull recent count from settings with a safe fallback
try:
# settings is already initialized at module import in your file
recent_count = int(getattr(settings.cache, "recent_runs_count", 10))
if recent_count < 0:
recent_count = 0
except Exception:
recent_count = 10
# Build the recent list (non-fatal if storage is empty or unreadable)
recent_results = get_recent_results(storage, recent_count, current_app.logger)
# Pass to template; your index.html will hide the card if list is empty
return render_template("index.html", recent_results=recent_results)
@bp.route("/analyze", methods=["POST"])
def analyze():
url = request.form.get("url", "").strip()
current_app.logger.info(f"[*] Analyzing {url}")
if not url:
flash("Please enter a URL.", "error")
return redirect(url_for("main.index"))
storage = Path(current_app.config["SANDBOX_STORAGE"]).resolve()
storage.mkdir(parents=True, exist_ok=True)
try:
engine = current_app.config.get("RULE_ENGINE")
result = asyncio.run(fetch_page_artifacts(url, storage, engine=engine))
# result = asyncio.run(fetch_page_artifacts(url, storage))
current_app.logger.info(f"[+] Analysis done for {url}")
except Exception as e:
flash(f"Analysis failed: {e}", "error")
current_app.logger.error(f"Analysis failed for {url}: {e}")
return redirect(url_for("main.index"))
# Add enrichment safely
try:
enrichment = enrich_url(url)
result["enrichment"] = enrichment
current_app.logger.info(f"[+] Enrichment added for {url}")
except Exception as e:
result["enrichment"] = {}
current_app.logger.warning(f"[!] Enrichment failed for {url}: {e}")
# Redirect to permalink page for this run
return redirect(url_for("main.view_result", run_uuid=result["uuid"]))
@bp.route("/results/<run_uuid>", methods=["GET"])
def view_result(run_uuid: str):
storage = Path(current_app.config["SANDBOX_STORAGE"]).resolve()
run_dir = storage / run_uuid
results_path = run_dir / "results.json"
if not results_path.exists():
current_app.logger.error(f"Results not found for UUID: {run_uuid}")
abort(404)
with open(results_path, "r", encoding="utf-8") as f:
result = json.load(f)
# Pass the UUID to the template for artifact links
result["uuid"] = run_uuid
return render_template("result.html", **result)
@bp.route("/artifacts/<run_uuid>/<filename>", methods=["GET"])
def artifacts(run_uuid: str, filename: str):
storage = Path(current_app.config["SANDBOX_STORAGE"]).resolve()
run_dir = storage / run_uuid
full_path = (run_dir / filename).resolve()
# Prevent directory traversal (resolve first so '..' segments cannot escape run_dir)
try:
full_path.relative_to(run_dir.resolve())
except ValueError:
current_app.logger.warning(f"Directory traversal attempt: {filename}")
abort(404)
if not full_path.exists():
current_app.logger.error(f"Artifact not found: {filename} for UUID {run_uuid}")
abort(404)
return send_file(full_path)

288
app/static/style.css Normal file

@@ -0,0 +1,288 @@
:root {
font-family: system-ui, -apple-system, Segoe UI, Roboto, Helvetica, Arial, sans-serif;
}
body {
margin: 0;
background: #0b0f14;
color: #e6edf3;
}
header, footer {
padding: 1rem 1.25rem;
background: #0f1720;
border-bottom: 1px solid #1f2a36;
}
/* ===== main: now full-width (no 960px cap) ===== */
main {
padding: 1.5rem 2rem; /* a bit more horizontal breathing room */
max-width: 100%; /* remove fixed cap */
width: 100%;
margin: 0; /* no auto centering since we're full-width */
box-sizing: border-box;
}
.card {
background: #111826;
padding: 1rem;
border: 1px solid #1f2a36;
border-radius: 12px;
margin-bottom: 1rem;
}
label {
display: block;
margin-bottom: 0.5rem;
}
input[type=url] {
width: 100%;
padding: 0.7rem;
border-radius: 8px;
border: 1px solid #243041;
background: #0b1220;
color: #e6edf3;
}
button, .button {
display: inline-block;
margin-top: 0.75rem;
padding: 0.6rem 1rem;
border-radius: 8px;
border: 1px solid #243041;
background: #1a2535;
color: #e6edf3;
text-decoration: none;
}
.flash {
list-style: none;
padding: 0.5rem 1rem;
}
.flash .error {
color: #ff6b6b;
}
.grid {
display: grid;
grid-template-columns: 150px 1fr;
gap: 0.5rem 1rem;
}
img {
max-width: 100%;
height: auto;
border-radius: 8px;
border: 1px solid #243041;
}
pre.code {
white-space: pre-wrap;
word-break: break-all;
background: #0b1220;
padding: 0.75rem;
border-radius: 8px;
border: 1px solid #243041;
}
/* Links */
a {
color: #7dd3fc; /* Soft cyan for dark background */
text-decoration: underline;
}
a:hover {
color: #38bdf8; /* Slightly brighter on hover */
}
/* Accordion / details summary */
details summary {
cursor: pointer;
padding: 0.5rem;
font-weight: bold;
border-radius: 8px;
background: #111826;
border: 1px solid #1f2a36;
margin-bottom: 0.5rem;
transition: background 0.3s ease;
}
details[open] summary {
background: #1a2535; /* Slightly lighter when expanded */
}
details > ul, details > table {
padding-left: 1rem;
margin: 0.5rem 0;
}
/* Highlight flagged forms */
details.flagged summary {
border-left: 4px solid #ff6b6b; /* Red accent for flagged forms */
}
/* Smooth collapse/expand */
details ul, details p {
transition: all 0.3s ease;
}
/* Enrichment / GeoIP / Forms / Redirects Tables */
.enrichment-table {
width: 100%;
border-collapse: collapse;
margin-bottom: 1rem;
}
.enrichment-table th,
.enrichment-table td {
border: 1px solid #243041;
padding: 0.5rem;
vertical-align: top;
}
.enrichment-table th {
background: #111826;
text-align: left;
}
.enrichment-table td {
width: auto; /* browser resizes naturally */
word-break: break-word;
}
/* Scripts Table Special Handling */
.scripts-table pre.code {
margin: 0;
padding: 0.25rem;
font-size: 0.9rem;
}
/* Hover effects for table rows */
.enrichment-table tbody tr:hover {
background: #1f2a36;
}
/* Card table headings */
.enrichment-table thead th {
border-bottom: 2px solid #243041;
}
/* Ensure nested tables don't overflow */
.card table {
table-layout: auto;
word-break: break-word;
}
/* ============================
Results Table (3+ columns)
- Visual style matches .enrichment-table
- Adds better wrapping for long strings (URL/UUID)
- Right-aligns timestamps for scannability
============================ */
.results-table {
width: 100%;
border-collapse: collapse;
background: #111826; /* match card background */
border: 1px solid #1f2a36; /* subtle border like cards */
border-radius: 12px; /* rounded corners */
overflow: hidden; /* clip the rounded corners */
table-layout: auto; /* allow natural column sizing */
}
/* Header styling */
.results-table thead th {
padding: 0.6rem 0.75rem;
background: #0f1720; /* match header tone */
border-bottom: 1px solid #1f2a36;
text-align: left;
font-weight: 600;
white-space: nowrap; /* keep short headers on one line */
}
/* Body cells */
.results-table tbody td {
padding: 0.6rem 0.75rem;
border-top: 1px solid #1f2a36;
vertical-align: top;
text-align: left;
}
/* Zebra rows for readability (optional) */
.results-table tbody tr:nth-child(odd) {
background: #0d1522; /* slight contrast row */
}
/* Links inside table should inherit your global link colors */
.results-table a {
text-decoration: underline;
}
/* ---- Column-specific tweaks ---- */
/* URL column: allow wrapping of long URLs without blowing the layout */
.results-table td.url,
.results-table td.url a {
word-wrap: break-word; /* legacy support */
overflow-wrap: anywhere; /* modern wrapping for long URLs */
word-break: break-word;
}
/* UUID column: force wrap to avoid overflow */
.results-table td.uuid {
font-family: ui-monospace, SFMono-Regular, Menlo, Monaco, Consolas, "Liberation Mono", "Courier New", monospace;
word-break: break-all; /* split at any point to keep table narrow */
max-width: 28ch; /* reasonable width to avoid stretching */
}
/* Timestamp column: align right and keep on a single line */
.results-table td.timestamp {
text-align: right;
white-space: nowrap; /* keep ISO timestamps on one line */
}
/* Optional: make the newest (first) row stand out subtly */
.results-table tbody tr:first-child {
box-shadow: inset 0 0 0 1px #243041;
}
/* Optional: small, subtle buttons in table cells (e.g., copy UUID) */
.results-table .copy-btn {
margin-left: 0.4rem;
padding: 0.2rem 0.45rem;
border-radius: 6px;
border: 1px solid #243041;
background: #1a2535;
color: #e6edf3;
cursor: pointer;
line-height: 1;
font-size: 0.9rem;
}
.results-table .copy-btn:hover {
filter: brightness(1.1);
}
/* ===== Responsive niceties for very small screens ===== */
@media (max-width: 768px) {
main {
padding: 1rem; /* a tad tighter on mobile */
}
.enrichment-table,
.results-table {
display: block;
overflow-x: auto; /* allow horizontal scroll if needed */
white-space: nowrap;
}
}
.scripts-table td ul {
margin: 0.25rem 0 0.25rem 1rem;
padding-left: 1rem;
}
.scripts-table td small {
opacity: 0.85;
}

33
app/templates/base.html Normal file

@@ -0,0 +1,33 @@
<!doctype html>
<html lang="en">
<head>
<meta charset="utf-8" />
<meta name="viewport" content="width=device-width, initial-scale=1" />
<title>{{ app_name }} {{ app_version }}</title>
<link rel="stylesheet" href="https://unpkg.com/sanitize.css" />
<link rel="stylesheet" href="{{ url_for('static', filename='style.css') }}" />
</head>
<body>
<header>
<h1>{{ app_name }} {{ app_version }}</h1>
</header>
{% with messages = get_flashed_messages(with_categories=true) %}
{% if messages %}
<ul class="flash">
{% for category, message in messages %}
<li class="{{ category }}">{{ message }}</li>
{% endfor %}
</ul>
{% endif %}
{% endwith %}
<main>
{% block content %}{% endblock %}
</main>
<footer>
<small>{{ app_name }} - A self-hosted URL analysis sandbox - {{ app_version }}</small>
</footer>
</body>
</html>

149
app/templates/index.html Normal file

@@ -0,0 +1,149 @@
{% extends 'base.html' %}
{% block content %}
<!-- Analysis Form -->
<form id="analyze-form" method="post" action="{{ url_for('main.analyze') }}" class="card">
<h2>Analyze a URL</h2>
<label for="url">Enter a URL to analyze</label>
<input id="url" name="url" type="url" placeholder="https://example.com" required />
<button type="submit">Analyze</button>
</form>
<!-- Recent Results (optional; shown only if recent_results provided) -->
{% if recent_results %}
<div class="card" id="recent-results">
<h2>Recent Results</h2>
<table class="results-table">
<thead>
<tr>
<th>Timestamp</th>
<th>URL</th>
<th>UUID</th>
</tr>
</thead>
<tbody>
{% for r in recent_results %}
<tr>
<td class="timestamp">
{% if r.timestamp %}
{{ r.timestamp }}
{% else %}
N/A
{% endif %}
</td>
<td class="url">
<a href="{{ url_for('main.view_result', run_uuid=r.uuid) }}">
{{ r.final_url or r.submitted_url }}
</a>
</td>
<td class="uuid">
<code id="uuid-{{ loop.index }}">{{ r.uuid }}</code>
<button
type="button"
class="copy-btn"
data-target="uuid-{{ loop.index }}">
📋
</button>
</td>
</tr>
{% endfor %}
</tbody>
</table>
</div>
{% endif %}
<!-- Spinner Modal -->
<div id="spinner-modal" style="
display:none;
opacity:0;
position:fixed;
top:0;
left:0;
width:100%;
height:100%;
background:rgba(0,0,0,0.7);
color:#fff;
font-size:1.5rem;
text-align:center;
padding-top:20%;
z-index:9999;
transition: opacity 0.3s ease;
">
<div>
<div class="loader" style="
border: 8px solid #f3f3f3;
border-top: 8px solid #1a2535;
border-radius: 50%;
width: 60px;
height: 60px;
animation: spin 1s linear infinite;
margin: 0 auto 1rem auto;
"></div>
Analyzing website…
</div>
</div>
<style>
@keyframes spin {
0% { transform: rotate(0deg); }
100% { transform: rotate(360deg); }
}
</style>
<script>
const form = document.getElementById('analyze-form');
const modal = document.getElementById('spinner-modal');
function showModal() {
modal.style.display = 'block';
requestAnimationFrame(() => {
modal.style.opacity = '1';
});
}
function hideModal() {
modal.style.opacity = '0';
modal.addEventListener('transitionend', () => {
modal.style.display = 'none';
}, { once: true });
}
// Hide spinner on initial load / back navigation
window.addEventListener('pageshow', () => {
modal.style.opacity = '0';
modal.style.display = 'none';
});
form.addEventListener('submit', (e) => {
showModal();
// Prevent double submission
form.querySelector('button').disabled = true;
// Allow browser to render the modal before submitting
requestAnimationFrame(() => form.submit());
e.preventDefault();
});
</script>
<script>
document.addEventListener('DOMContentLoaded', () => {
const buttons = document.querySelectorAll('.copy-btn');
buttons.forEach(btn => {
btn.addEventListener('click', () => {
const targetId = btn.getAttribute('data-target');
const uuidText = document.getElementById(targetId).innerText;
navigator.clipboard.writeText(uuidText).then(() => {
// Give quick feedback
btn.textContent = '✅';
setTimeout(() => { btn.textContent = '📋'; }, 1500);
}).catch(err => {
console.error('Failed to copy UUID:', err);
});
});
});
});
</script>
{% endblock %}

268
app/templates/result.html Normal file

@@ -0,0 +1,268 @@
{% extends "base.html" %}
{% block content %}
<!-- Top Jump List -->
<div class="card" id="top-jump-list">
<h2>Jump to Section</h2>
<ul>
<li><a href="/">Analyse Another Page</a></li>
<li><a href="#url-overview">URL Overview</a></li>
<li><a href="#enrichment">Enrichment</a></li>
<li><a href="#redirects">Redirects</a></li>
<li><a href="#forms">Forms</a></li>
<li><a href="#scripts">Suspicious Scripts</a></li>
<li><a href="#screenshot">Screenshot</a></li>
<li><a href="#source">Source</a></li>
</ul>
</div>
<!-- URL Overview -->
<div class="card" id="url-overview">
<h2>URL Overview</h2>
<p><strong>Submitted URL:</strong> {{ submitted_url }}</p>
<p><strong>Final URL:</strong> <a href="{{ final_url }}" target="_blank">{{ final_url }}</a></p>
<p><strong>Permalink:</strong>
<a href="{{ url_for('main.view_result', run_uuid=uuid, _external=True) }}">
{{ request.host_url }}results/{{ uuid }}
</a>
</p>
<p><a href="#top-jump-list">Back to top</a></p>
</div>
<!-- Enrichment -->
<div class="card" id="enrichment">
<h2>Enrichment</h2>
<!-- WHOIS -->
{% if enrichment.whois %}
<h3>WHOIS</h3>
<table class="enrichment-table">
<thead>
<tr>
<th>Field</th>
<th>Value</th>
</tr>
</thead>
<tbody>
{% for k, v in enrichment.whois.items() %}
<tr>
<td>{{ k.replace('_', ' ').title() }}</td>
<td>{{ v }}</td>
</tr>
{% endfor %}
</tbody>
</table>
{% endif %}
{% if enrichment.raw_whois %}
<h3>Raw WHOIS</h3>
<pre class="code">{{ enrichment.raw_whois }}</pre>
{% endif %}
<!-- GeoIP / IP-API -->
{% if enrichment.geoip %}
<h3>GeoIP</h3>
{% for ip, info in enrichment.geoip.items() %}
<details class="card" style="padding:0.5rem; margin-bottom:0.5rem;">
<summary>{{ ip }}</summary>
<table class="enrichment-table">
<tbody>
{% for key, val in info.items() %}
<tr>
<td>{{ key.replace('_', ' ').title() }}</td>
<td>{{ val }}</td>
</tr>
{% endfor %}
</tbody>
</table>
</details>
{% endfor %}
{% endif %}
<!-- BEC Words -->
{% if enrichment.bec_words %}
<h3>BEC Words Detected</h3>
<table class="enrichment-table">
<thead>
<tr><th>Word</th></tr>
</thead>
<tbody>
{% for word in enrichment.bec_words %}
<tr><td>{{ word }}</td></tr>
{% endfor %}
</tbody>
</table>
{% endif %}
{% if not enrichment.whois and not enrichment.raw_whois and not enrichment.geoip and not enrichment.bec_words %}
<p>No enrichment data available.</p>
{% endif %}
<p><a href="#top-jump-list">Back to top</a></p>
</div>
<!-- Redirects -->
<div class="card" id="redirects">
<h2>Redirects</h2>
{% if redirects %}
<table class="enrichment-table">
<thead>
<tr>
<th>Status</th>
<th>URL</th>
</tr>
</thead>
<tbody>
{% for r in redirects %}
<tr>
<td>{{ r.status }}</td>
<td><a href="{{ r.url }}" target="_blank">{{ r.url }}</a></td>
</tr>
{% endfor %}
</tbody>
</table>
{% else %}
<p>No redirects detected.</p>
{% endif %}
<p><a href="#top-jump-list">Back to top</a></p>
</div>
<!-- Forms -->
<div class="card" id="forms">
<h2>Forms</h2>
{% if forms %}
{% for form in forms %}
<details class="card {% if form.flagged %}flagged{% endif %}" style="padding:0.5rem; margin-bottom:0.5rem;">
<summary>{{ form.status }} — Action: {{ form.action }} ({{ form.method | upper }})</summary>
<table class="enrichment-table">
<thead>
<tr>
<th>Input Name</th>
<th>Type</th>
</tr>
</thead>
<tbody>
{% for inp in form.inputs %}
<tr>
<td>{{ inp.name }}</td>
<td>{{ inp.type }}</td>
</tr>
{% endfor %}
</tbody>
</table>
{% if form.flagged %}
<p><strong>Flag Reasons:</strong></p>
<ul>
{% for reason in form.flag_reasons %}
<li>{{ reason }}</li>
{% endfor %}
</ul>
{% endif %}
</details>
{% endfor %}
{% else %}
<p>No forms detected.</p>
{% endif %}
<p><a href="#top-jump-list">Back to top</a></p>
</div>
<!-- Suspicious Scripts -->
<div class="card" id="scripts">
<h2>Suspicious Scripts</h2>
{% if suspicious_scripts %}
<table class="enrichment-table scripts-table">
<thead>
<tr>
<th>Type</th>
<th>Source URL</th>
<th>Content Snippet</th>
<th>Matches (Rules & Heuristics)</th>
</tr>
</thead>
<tbody>
{% for s in suspicious_scripts %}
<tr>
<!-- Type -->
<td>{{ s.type or 'unknown' }}</td>
<!-- Source URL -->
<td>
{% if s.src %}
<a href="{{ s.src }}" target="_blank">{{ s.src }}</a>
{% else %}
N/A
{% endif %}
</td>
<!-- Inline content snippet (collapsible) -->
<td>
{% if s.content_snippet %}
<details>
<summary>View snippet</summary>
<pre class="code">{{ s.content_snippet }}</pre>
</details>
{% else %}
N/A
{% endif %}
</td>
<!-- Rules & Heuristics -->
<td>
{% set has_rules = s.rules and s.rules|length > 0 %}
{% set has_heur = s.heuristics and s.heuristics|length > 0 %}
{% if has_rules %}
<strong>Rules</strong>
<ul>
{% for r in s.rules %}
<li title="{{ r.description or '' }}">
{{ r.name }}
{% if r.description %}
<small>— {{ r.description }}</small>
{% endif %}
</li>
{% endfor %}
</ul>
{% endif %}
{% if has_heur %}
<strong>Heuristics</strong>
<ul>
{% for h in s.heuristics %}
<li>{{ h }}</li>
{% endfor %}
</ul>
{% endif %}
{% if not has_rules and not has_heur %}
N/A
{% endif %}
</td>
</tr>
{% endfor %}
</tbody>
</table>
{% else %}
<p>No suspicious scripts detected.</p>
{% endif %}
<p><a href="#top-jump-list">Back to top</a></p>
</div>
<!-- Screenshot -->
<div class="card" id="screenshot">
<h2>Screenshot</h2>
<img src="{{ url_for('main.artifacts', run_uuid=uuid, filename='screenshot.png') }}" alt="Screenshot">
<p><a href="#top-jump-list">Back to top</a></p>
</div>
<!-- Source -->
<div class="card" id="source">
<h2>Source</h2>
<p><a href="{{ url_for('main.artifacts', run_uuid=uuid, filename='source.txt') }}" target="_blank">View Source</a></p>
<p><a href="#top-jump-list">Back to top</a></p>
</div>
{% endblock %}

128
app/utils/cache_db.py Normal file

@@ -0,0 +1,128 @@
import json
import time
import sqlite3
import threading
import functools
from pathlib import Path
from typing import Any, Optional
# ---------- SINGLETON DECORATOR ----------
T = Any
def singleton_loader(func):
"""Ensure only one cache instance exists."""
cache: dict[str, T] = {}
lock = threading.Lock()
@functools.wraps(func)
def wrapper(*args, **kwargs) -> T:
with lock:
if func.__name__ not in cache:
cache[func.__name__] = func(*args, **kwargs)
return cache[func.__name__]
return wrapper
# ---------- CACHE CLASS ----------
class CacheDB:
"""SQLite-backed cache with expiration in minutes, CRUD, auto-cleanup, singleton support."""
TABLE_NAME = "cache"
def __init__(self, db_path: str | Path = "cache.db", default_expiration_minutes: int = 1440):
"""
:param default_expiration_minutes: default expiration in minutes (default 24 hours)
"""
self.db_path = Path(db_path)
self.default_expiration = default_expiration_minutes * 60 # convert minutes -> seconds
self.conn = sqlite3.connect(self.db_path, check_same_thread=False)
self.conn.row_factory = sqlite3.Row
self._lock = threading.Lock()
self._create_table()
def _create_table(self):
"""Create the cache table if it doesn't exist."""
with self._lock:
self.conn.execute(f"""
CREATE TABLE IF NOT EXISTS {self.TABLE_NAME} (
key TEXT PRIMARY KEY,
value TEXT,
expires_at INTEGER
)
""")
self.conn.commit()
def _cleanup_expired(self):
"""Delete expired rows."""
now = int(time.time())
with self._lock:
self.conn.execute(
f"DELETE FROM {self.TABLE_NAME} WHERE expires_at IS NOT NULL AND expires_at < ?", (now,)
)
self.conn.commit()
# ---------- CRUD ----------
def create(self, key: str, value: Any, expires_in_minutes: Optional[int] = None):
"""Insert or update a cache entry. expires_in_minutes overrides default expiration."""
self._cleanup_expired()
if expires_in_minutes is None:
expires_in_seconds = self.default_expiration
else:
expires_in_seconds = expires_in_minutes * 60
expires_at = int(time.time()) + expires_in_seconds
value_json = json.dumps(value)
with self._lock:
self.conn.execute(
f"INSERT OR REPLACE INTO {self.TABLE_NAME} (key, value, expires_at) VALUES (?, ?, ?)",
(key, value_json, expires_at)
)
self.conn.commit()
def read(self, key: str) -> Optional[Any]:
"""Read a cache entry. Auto-cleans expired items."""
self._cleanup_expired()
with self._lock:
row = self.conn.execute(
f"SELECT * FROM {self.TABLE_NAME} WHERE key = ?", (key,)
).fetchone()
if not row:
return None
return json.loads(row["value"])
def update(self, key: str, value: Any, expires_in_minutes: Optional[int] = None):
"""Update a cache entry. Optional expiration in minutes."""
if expires_in_minutes is None:
expires_in_seconds = self.default_expiration
else:
expires_in_seconds = expires_in_minutes * 60
expires_at = int(time.time()) + expires_in_seconds
value_json = json.dumps(value)
with self._lock:
self.conn.execute(
f"UPDATE {self.TABLE_NAME} SET value = ?, expires_at = ? WHERE key = ?",
(value_json, expires_at, key)
)
self.conn.commit()
def delete(self, key: str):
with self._lock:
self.conn.execute(f"DELETE FROM {self.TABLE_NAME} WHERE key = ?", (key,))
self.conn.commit()
def clear(self):
"""Delete all rows from the cache table."""
with self._lock:
self.conn.execute(f"DELETE FROM {self.TABLE_NAME}")
self.conn.commit()
def close(self):
self.conn.close()
# ---------- SINGLETON INSTANCE ----------
@singleton_loader
def get_cache(db_path: str = "cache.db", default_expiration_minutes: int = 1440) -> CacheDB:
return CacheDB(db_path=db_path, default_expiration_minutes=default_expiration_minutes)
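A minimal usage sketch of the cache singleton follows; the database path and keys are illustrative.

```python
# Sketch: basic CRUD against the cache (db path and keys are illustrative).
from app.utils.cache_db import get_cache

cache = get_cache("/tmp/demo-cache.db", default_expiration_minutes=60)
cache.create("whois:example.com", {"registrar": "Example Registrar"}, expires_in_minutes=5)
print(cache.read("whois:example.com"))   # -> {'registrar': 'Example Registrar'}
cache.delete("whois:example.com")
```

Because `get_cache` is wrapped by `singleton_loader`, subsequent calls return the same instance regardless of the arguments passed.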

115
app/utils/io_helpers.py Normal file

@@ -0,0 +1,115 @@
import json
import logging
from pathlib import Path
from datetime import datetime
logging.basicConfig(level=logging.INFO, format="[%(levelname)s] %(message)s")
def safe_write(path: Path | str, content: str, mode="w", encoding="utf-8"):
"""Write content to a file safely with logging."""
path = Path(path)
try:
path.parent.mkdir(parents=True, exist_ok=True)
with open(path, mode, encoding=encoding) as f:
f.write(content)
logging.info(f"[+] Wrote file: {path}")
except Exception as e:
logging.error(f"[!] Failed writing {path}: {e}")
raise
def get_recent_results(storage_dir: Path, limit: int, logger) -> list[dict]:
"""
Scan the SANDBOX_STORAGE directory for run folders (UUIDs), read each
run's results.json, and return the most recent N entries by file mtime.
Args:
storage_dir (Path): Base path where UUID run directories live.
limit (int): Maximum number of recent items to return.
logger: Flask or stdlib logger to record non-fatal issues.
Returns:
list[dict]: Each item includes:
{
"uuid": str,
"submitted_url": str | None,
"final_url": str | None,
"timestamp": str (ISO 8601),
}
Returns an empty list if no runs are found or on error.
"""
items = []
try:
# Ensure the storage dir exists
storage_dir.mkdir(parents=True, exist_ok=True)
# Iterate directories directly under storage_dir
for entry in storage_dir.iterdir():
try:
if not entry.is_dir():
# Skip non-directories
continue
# Expect results.json inside each UUID directory
results_path = entry / "results.json"
if not results_path.exists():
# Skip folders without results.json
continue
# Read file metadata (mtime) for sorting and display
stat_info = results_path.stat()
mtime_epoch = stat_info.st_mtime
mtime_iso = datetime.fromtimestamp(mtime_epoch).isoformat(timespec="seconds")
# Parse a small subset of the JSON for display
submitted_url = None
final_url = None
run_uuid = entry.name
try:
with open(results_path, "r", encoding="utf-8") as f:
data = json.load(f)
if isinstance(data, dict):
submitted_url = data.get("submitted_url")
final_url = data.get("final_url")
except Exception as read_err:
# If JSON is malformed or unreadable, log and continue
if logger:
logger.warning(f"[recent] Failed reading {results_path}: {read_err}")
item = {
"uuid": run_uuid,
"submitted_url": submitted_url,
"final_url": final_url,
"timestamp": mtime_iso
}
items.append((mtime_epoch, item))
except Exception as inner_err:
# Keep going; a single bad folder should not break the list
if logger:
logger.warning(f"[recent] Skipping {entry}: {inner_err}")
# Sort by mtime desc
try:
items.sort(key=lambda t: t[0], reverse=True)
except Exception as sort_err:
if logger:
logger.warning(f"[recent] Sort failed: {sort_err}")
# Trim to limit without list comprehensions
trimmed = []
count = 0
for tup in items:
if count >= limit:
break
trimmed.append(tup[1])
count = count + 1
return trimmed
except Exception as outer_err:
if logger:
logger.error(f"[recent] Unexpected error while scanning {storage_dir}: {outer_err}")
return []

132
app/utils/rules_engine.py Normal file

@@ -0,0 +1,132 @@
"""
rules_engine.py
A flexible rule-based engine for detecting suspicious patterns in scripts, forms,
or other web artifacts inside SneakyScope.
Each rule is defined as:
- name: str # Rule identifier
- description: str # Human-readable reason for analysts
- category: str # e.g., 'script', 'form', 'text', 'generic'
- type: str # 'regex' or 'function'
- pattern: str # Regex pattern (if type=regex)
- function: callable # Python function returning (bool, str) (if type=function)
The framework returns a list of results, with pass/fail and reasoning.
"""
import re
from pathlib import Path
from typing import Callable, Dict, List, Tuple, Union
import yaml
class Rule:
"""Represents a single detection rule."""
def __init__(
self,
name: str,
description: str,
category: str,
rule_type: str = "regex",
pattern: str = None,
function: Callable = None,
):
self.name = name
self.description = description
self.category = category
self.rule_type = rule_type
self.pattern = pattern
self.function = function
def run(self, text: str) -> Tuple[bool, str]:
"""
Run the rule on given text.
Returns:
(matched: bool, reason: str)
"""
if self.rule_type == "regex" and self.pattern:
if re.search(self.pattern, text, re.IGNORECASE):
return True, f"Matched regex '{self.pattern}'{self.description}"
else:
return False, "No match"
elif self.rule_type == "function" and callable(self.function):
return self.function(text)
else:
return False, "Invalid rule configuration"
class RuleEngine:
"""Loads and executes rules against provided text."""
def __init__(self, rules: List[Rule] = None):
self.rules = rules or []
def add_rule(self, rule: Rule):
"""Add a new rule at runtime."""
self.rules.append(rule)
def run_all(self, text: str, category: str = None) -> List[Dict]:
"""
Run all rules against text.
Args:
text: str → the content to test
category: str → optional, only run rules in this category
Returns:
List of dicts with rule results.
"""
results = []
for rule in self.rules:
if category and rule.category != category:
continue
matched, reason = rule.run(text)
results.append(
{
"rule": rule.name,
"category": rule.category,
"matched": matched,
"reason": reason if matched else None,
}
)
return results
def load_rules_from_yaml(yaml_file: Union[str, Path]) -> List[Rule]:
"""
Load rules from a YAML file.
Example YAML format:
- name: suspicious_eval
description: "Use of eval() in script"
category: script
type: regex
pattern: "\\beval\\("
- name: password_reset
description: "Password reset wording"
category: text
type: regex
pattern: "reset password"
"""
rules = []
with open(yaml_file, "r", encoding="utf-8") as f:
data = yaml.safe_load(f)
for item in data:
rule = Rule(
name=item["name"],
description=item["description"],
category=item["category"],
rule_type=item.get("type", "regex"),
pattern=item.get("pattern"),
)
rules.append(rule)
return rules
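A minimal sketch of driving the engine with the bundled YAML (run from the repository root so the relative path resolves):

```python
# Sketch: load the bundled rules and run the 'script' category against a snippet.
from pathlib import Path
from app.utils.rules_engine import RuleEngine, load_rules_from_yaml

engine = RuleEngine()
for rule in load_rules_from_yaml(Path("app/config/suspicious_rules.yaml")):
    engine.add_rule(rule)

for result in engine.run_all("document.write('<iframe src=x>')", category="script"):
    if result["matched"]:
        print(result["rule"], "-", result["reason"])
```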

144
app/utils/settings.py Normal file

@@ -0,0 +1,144 @@
#
# Note: the default settings file path is hardcoded below, just after the imports.
#
# To add a new settings section, add the corresponding mapping to your YAML and
# define a matching dataclass in the config data classes area below (a hypothetical
# example is sketched at the end of this file).
#
# Example use from anywhere - this will always return the same singleton
# from settings import get_settings
# def main():
# settings = get_settings()
# print(settings.database.host) # Autocomplete works
# print(settings.logging.level)
# if __name__ == "__main__":
# main()
import functools
from pathlib import Path
from typing import Any, Callable, TypeVar
from dataclasses import dataclass, fields, is_dataclass, field, MISSING
import logging
import sys
logger = logging.getLogger(__file__)
try:
import yaml
except ModuleNotFoundError:
msg = (
"Required modules are not installed. "
"Cannot continue with module / application loading.\n"
"Install them with: pip install -r requirements.txt"
)
print(msg, file=sys.stderr)
logger.error(msg)
sys.exit(1)
BASE_DIR = Path(__file__).resolve().parent.parent
DEFAULT_SETTINGS_FILE = BASE_DIR / "config" / "settings.yaml"
# ---------- CONFIG DATA CLASSES ----------
@dataclass
class Cache_Config:
whois_cache_days: int = 7
geoip_cache_days: int = 7
recent_runs_count: int = 10
@dataclass
class AppConfig:
name: str = "MyApp"
version_major: int = 1
version_minor: int = 0
@dataclass
class Settings:
cache: Cache_Config = field(default_factory=Cache_Config)
app: AppConfig = field(default_factory=AppConfig)
@classmethod
def from_yaml(cls, path: str | Path) -> "Settings":
"""Load settings from a YAML file into a Settings object."""
try:
with open(path, "r", encoding="utf-8") as f:
raw: dict[str, Any] = yaml.safe_load(f) or {}
except FileNotFoundError:
logger.warning(f"Settings file {path} not found! Using default settings.")
raw = {}
init_kwargs = {}
for f_def in fields(cls):
yaml_value = raw.get(f_def.name, None)
# Determine default value from default_factory or default
if f_def.default_factory is not MISSING:
default_value = f_def.default_factory()
elif f_def.default is not MISSING:
default_value = f_def.default
else:
default_value = None
# Handle nested dataclasses
if is_dataclass(f_def.type):
if isinstance(yaml_value, dict):
# Merge YAML values with defaults
merged_data = {fld.name: getattr(default_value, fld.name) for fld in fields(f_def.type)}
merged_data.update(yaml_value)
init_kwargs[f_def.name] = f_def.type(**merged_data)
else:
init_kwargs[f_def.name] = default_value
else:
init_kwargs[f_def.name] = yaml_value if yaml_value is not None else default_value
return cls(**init_kwargs)
# ---------- SINGLETON DECORATOR ----------
T = TypeVar("T")
def singleton_loader(func: Callable[..., T]) -> Callable[..., T]:
"""Ensure the function only runs once, returning the cached value."""
cache: dict[str, T] = {}
@functools.wraps(func)
def wrapper(*args, **kwargs) -> T:
if func.__name__ not in cache:
cache[func.__name__] = func(*args, **kwargs)
return cache[func.__name__]
return wrapper
@singleton_loader
def get_settings(config_path: str | Path | None = None) -> Settings:
"""
Returns the singleton Settings instance.
Args:
config_path: Optional path to the YAML config file. If not provided,
defaults to 'config/settings.yaml' in the current working directory.
"""
if config_path is None:
config_path = DEFAULT_SETTINGS_FILE
else:
config_path = Path(config_path)
return Settings.from_yaml(config_path)
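# Note: because of @singleton_loader, only the first call actually reads the YAML
# file; later calls return the cached Settings and ignore config_path.
# Illustrative sketch:
#     settings = get_settings()          # loads config/settings.yaml
#     same = get_settings("other.yaml")  # path ignored, cached object returned
#     assert settings is same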

10
app/wsgi.py Normal file
View File

@@ -0,0 +1,10 @@
"""
app/wsgi.py
Gunicorn entrypoint for SneakyScope.
"""
from . import create_app
# Gunicorn will look for "app"
app = create_app()
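# Started in production via entrypoint.sh, e.g.:
#   gunicorn --bind 0.0.0.0:8000 --worker-class gthread "app.wsgi:app"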

13
docker-compose.yaml Normal file
View File

@@ -0,0 +1,13 @@
services:
web:
build: .
container_name: url-sandbox-web
ports:
- "8000:8000"
env_file:
- .env
volumes:
- ./data:/data
security_opt:
- no-new-privileges:true
restart: unless-stopped

71
docs/roadmap.md Normal file
View File

@@ -0,0 +1,71 @@
## Priority 1: Core Functionality / Stability
**Permissions / Storage Paths**
* ✅ `/data` and other mounted volume paths are set up by `sandbox.sh`
* ✅ Downloads, screenshots, and HTML artifacts are written correctly (`safe_write` in `io_helpers.py`)
---
## Priority 2: Data Accuracy / Enrichment
**WHOIS & GeoIP Enhancements**
* ✅ Implemented Python-based WHOIS parsing with fallback to raw WHOIS text
* ✅ Default `"Possible Privacy"` or `"N/A"` for missing WHOIS fields
* ✅ GeoIP + ASN + ISP info displayed per IP in **accordion tables**
* ✅ Cache WHOIS and GeoIP results to reduce repeated queries
**Suspicious Scripts & Forms**
* [ ] Expand flagged script and form output with reasons for analysts
* [ ] Show each check and whether it triggered a flag (pass/fail per check)
**Add Suspicious BEC words**
* ✅ Look for things like `"reset password"`
* ✅ Make configurable via a config file (yaml doc with rules)
---
## Priority 3: User Interface / UX
**Front Page / Input Handling**
* [ ] Automatically prepend `http://`, `https://`, and/or `www.` if a user only enters a domain
**Result Templates / Cards**
* [ ] Load the page source code in a code-editor view or code block so it's easier to read
* [ ] Update result cards with clear, analyst-friendly explanations
* [ ] Include flagged logic and reason lists for scripts and forms
* ✅ Display GeoIP results in accordion tables
---
## Priority 4: API Layer
**API Endpoints**
* [ ] Add `/screenshot` endpoint
* [ ] Add `/source` endpoint
* [ ] Add `/analyse` endpoint
**OpenAPI + Docs**
* [ ] Create initial `openapi/openapi.yaml` spec file
* [ ] Serve spec at `/api/openapi.yaml`
* [ ] Wire up Swagger UI or Redoc at `/docs` for interactive API exploration
---
## Priority 5: Optional / Cleanup
**Artifact Management**
* [ ] Save results for a run UUID as "results.json" so repeat views load from cache instead of rerunning all the rules
* [ ] Implement cleanup or retention policy for old artifacts
* [ ] Optional: Add periodic maintenance scripts for storage
**Extra Features**
* [ ] Placeholder for additional features (e.g., bulk URL analysis, alerting, integrations)

22
entrypoint.sh Normal file
View File

@@ -0,0 +1,22 @@
#!/usr/bin/env bash
set -euo pipefail
# Sanity check: make sure the Playwright package is importable
# (the base image already ships the browsers, so this should be a no-op)
python - <<'PY'
# no-op import: fails fast with a clear error if playwright is missing
import playwright  # noqa: F401
PY
# Run the app via gunicorn:
#   --graceful-timeout 300   ensures long page loads aren't killed prematurely
#   --threads 8              gives each worker more threads to work with
#   --worker-class gthread   lets each worker handle multiple threads, so blocking tasks like Playwright won't stall the whole worker
exec gunicorn \
--bind 0.0.0.0:8000 \
--workers 2 \
--threads 8 \
--worker-class gthread \
--timeout 300 \
--graceful-timeout 300 \
"app.wsgi:app"

94
openapi/openapi.yaml Normal file
View File

@@ -0,0 +1,94 @@
openapi: 3.0.3
info:
title: URL Sandbox API
version: 0.1.0
description: API for analyzing and extracting website artifacts.
servers:
  - url: http://localhost:8000/api
description: Local development
paths:
/screenshot:
post:
summary: Capture a screenshot of a website
requestBody:
required: true
content:
application/json:
schema:
type: object
required:
- url
properties:
url:
type: string
example: "http://example.com"
responses:
'200':
description: Screenshot image returned
content:
image/png: {}
'400':
description: Invalid request
/source:
post:
summary: Retrieve HTML source of a website
requestBody:
required: true
content:
application/json:
schema:
type: object
required:
- url
properties:
url:
type: string
example: "http://example.com"
responses:
'200':
description: Raw HTML source
content:
text/html:
schema:
type: string
'400':
description: Invalid request
/analyse:
post:
summary: Run full analysis on a website
requestBody:
required: true
content:
application/json:
schema:
type: object
required:
- url
properties:
url:
type: string
example: "http://example.com"
responses:
'200':
description: JSON with enrichment and analysis results
content:
application/json:
schema:
type: object
properties:
url:
type: string
whois:
type: object
geoip:
type: object
flags:
type: array
items:
type: string
'400':
description: Invalid request

14
requirements.txt Normal file
View File

@@ -0,0 +1,14 @@
Flask>=3.0.3
Jinja2>=3.1.4
Werkzeug>=3.0.3
itsdangerous>=2.2.0
click>=8.1.7
lxml>=5.3.0
playwright==1.45.0 # Playwright stack
beautifulsoup4>=4.12.3 # HTML parsing, etc.
gunicorn>=22.0.0 # Production server
python-whois # For WHOIS lookups
geoip2 # MaxMind GeoIP2/GeoLite2 database reader for IP geolocation
dnspython # For DNS lookups, including A/AAAA records
ipwhois # IP/ASN and network ownership lookups
PyYAML # YAML parsing for settings and rules config files

101
sandbox.sh Executable file
View File

@@ -0,0 +1,101 @@
#!/usr/bin/env bash
set -euo pipefail
# --- CONFIG ---
SANDBOX_STORAGE="${SANDBOX_STORAGE:-./data}"
APP_URL="${APP_URL:-http://localhost:8000}"
# --- FUNCTIONS ---
prepare_storage() {
echo "[*] Checking storage path: $SANDBOX_STORAGE"
if [ ! -d "$SANDBOX_STORAGE" ]; then
echo " -> Creating $SANDBOX_STORAGE on host"
sudo mkdir -p "$SANDBOX_STORAGE"
fi
echo " -> Setting ownership to Playwright user (pwuser / UID 1000)"
sudo chown -R 1000:1000 "$SANDBOX_STORAGE"
sudo chmod -R 755 "$SANDBOX_STORAGE"
echo "[+] Storage ready."
}
start_stack() {
prepare_storage
echo "[*] Building Docker image..."
docker compose build
if [[ "${1:-}" == "-d" ]]; then
echo "[*] Starting services in detached mode..."
docker compose up -d
else
echo "[*] Starting services (attached)..."
docker compose up
fi
}
stop_stack() {
echo "[*] Stopping services..."
docker compose down
}
clean_stack() {
echo "[*] Removing containers, networks, and volumes..."
docker compose down -v --remove-orphans
}
restart_stack() {
stop_stack
echo "[*] Restarting services..."
start_stack -d
}
logs_stack() {
echo "[*] Showing logs (Ctrl+C to exit)..."
docker compose logs -f
}
status_stack() {
echo "[*] Current service status:"
docker compose ps
}
healthcheck_stack() {
echo "[*] Running health check on $APP_URL ..."
if curl -fsS "$APP_URL" > /dev/null; then
echo "[+] Service is healthy and reachable."
else
echo "[!] Service is NOT reachable at $APP_URL"
exit 1
fi
}
# --- MAIN ---
case "${1:-}" in
start)
shift
start_stack "$@"
;;
stop)
stop_stack
;;
restart)
restart_stack
;;
clean)
clean_stack
;;
logs)
logs_stack
;;
status)
status_stack
;;
healthcheck)
healthcheck_stack
;;
*)
echo "Usage: $0 {start [-d for detached mode] | stop | restart | clean | logs | status | healthcheck}"
exit 1
;;
esac