diff --git a/app/__init__.py b/app/__init__.py index 87b30cb..eb45184 100644 --- a/app/__init__.py +++ b/app/__init__.py @@ -2,15 +2,19 @@ import os import logging from pathlib import Path from flask import Flask +from datetime import datetime # Local imports from app.utils.settings import get_settings from app.logging_setup import wire_logging_once, get_app_logger +from app.app_settings import AppSettings from app.blueprints.main import bp as main_bp # ui blueprint from app.blueprints.api import api_bp as api_bp # api blueprint from app.blueprints.roadmap import bp as roadmap_bp # roadmap + + def create_app() -> Flask: """ Create and configure the Flask application instance. @@ -34,17 +38,36 @@ def create_app() -> Flask: if not app.secret_key: app_logger.warning("[init] SECRET_KEY is not set; sessions may be insecure in production.") - # Configure storage directory (bind-mount is still handled by sandbox.sh) - sandbox_storage_default = Path("/data") - app.config["SANDBOX_STORAGE"] = str(sandbox_storage_default) + + # version + version = f"v{AppSettings.version_major}.{AppSettings.version_minor}" + + # allow branding for name if they don't match our name + branded_name = settings.branding.name + if branded_name == AppSettings.name: + public_name = AppSettings.name + footer = f"{AppSettings.copyright} {public_name} {version} - {AppSettings.tagline}" + else: + public_name = f"{branded_name}" + link = f'{AppSettings.name}' + footer = f"{AppSettings.copyright} {public_name} powered by {link} {version} - {AppSettings.tagline}" + + # web header / footer + header = f"{public_name}" # App metadata available to templates - app.config["APP_NAME"] = settings.app.name - app.config["APP_VERSION"] = f"v{settings.app.version_major}.{settings.app.version_minor}" + app.config["APP_NAME"] = public_name + app.config["APP_VERSION"] = version + app.config["WEB_HEADER"] = header + app.config["WEB_FOOTER"] = footer # roadmap file app.config["ROADMAP_FILE"] = str(Path(app.root_path) / 
"docs" / "roadmap.yaml") + # Configure storage directory (bind-mount is still handled by sandbox.sh) + sandbox_storage_default = Path("/data") + app.config["SANDBOX_STORAGE"] = str(sandbox_storage_default) + # Register blueprints app.register_blueprint(main_bp) diff --git a/app/app_settings.py b/app/app_settings.py new file mode 100644 index 0000000..bbc6fc6 --- /dev/null +++ b/app/app_settings.py @@ -0,0 +1,13 @@ +from dataclasses import dataclass +from datetime import datetime + +this_year = datetime.strftime(datetime.now(),"%Y") + +@dataclass +class AppSettings: + name: str = "SneakyScope" + tagline: str = "A selfhosted URL Sandbox" + url: str = "https://git.sneakygeek.net/ptarrant/SneakyScope" + copyright: str = f"© 2025 - {this_year}" + version_major: int = 1 + version_minor: int = 0 \ No newline at end of file diff --git a/app/blueprints/main.py b/app/blueprints/main.py index 383e41d..8c93255 100644 --- a/app/blueprints/main.py +++ b/app/blueprints/main.py @@ -1,6 +1,5 @@ # app/blueprints/ui.py -import os import json import asyncio from pathlib import Path @@ -9,18 +8,16 @@ from flask import Blueprint, render_template, request, redirect, url_for, flash, from app.utils.url_tools import get_url_normalizer from app.utils.browser import get_browser -from app.utils.enrichment import enrich_url from app.utils.settings import get_settings from app.utils.io_helpers import get_recent_results from app.logging_setup import get_app_logger + app_logger = get_app_logger() bp = Blueprint("main", __name__) settings = get_settings() -app_name = settings.app.name -app_version = f"v {settings.app.version_major}.{settings.app.version_minor}" # --- data cleaner for tls to ensure data is standardized @@ -68,9 +65,9 @@ def normalize_ssl_tls_for_view(ssl_tls): def inject_app_info(): """Inject app name and version into all templates.""" return { - "app_name": app_name, - "app_version": app_version, - "current_year": datetime.strftime(datetime.now(),"%Y") + "app_name": 
current_app.config.get("APP_NAME", "SneakyScope"), + "header": current_app.config.get("WEB_HEADER", "SneakyScope"), + "footer": current_app.config.get("WEB_FOOTER", "SneakyScope"), } @bp.route("/", methods=["GET"]) @@ -115,7 +112,7 @@ def analyze(): app_logger.warning("Empty or invalid URL input") return redirect(url_for("index")) - app_logger.info(f"[*] Analyzing URL{target}") + app_logger.info(f"[*] Analyzing URL {target}") app_logger.info(f"[*] SSL Checks set to {fetch_ssl_enabled}") if not target: diff --git a/app/config/settings.yaml b/app/config/settings.yaml index 5bccd29..9b0c79d 100644 --- a/app/config/settings.yaml +++ b/app/config/settings.yaml @@ -1,8 +1,8 @@ -app: - name: SneakyScope - version_major: 0 - version_minor: 1 +branding: + # you can brand your sandbox to anything you want + name: Redwire Sandbox +logging: # logs when rules are loaded log_rule_loads: False @@ -13,15 +13,34 @@ app: log_rule_debug: False cache: + # number of recent runs shown on front page recent_runs_count: 10 + + # how long to cache whois information whois_cache_days: 7 + + # how long to cache geoip information geoip_cache_days: 7 + # should we cache crt certificate pulls? 
+ crt_cache_enabled: True + + # how long to cache certificate information (only used when crt_cache_enabled is True) + crt_cache_days: 7 + external_script_fetch: + # enable ability to pull external scripts enabled: True + + # maximum total size, in MB, of external scripts to pull (when enabled) max_total_mb: 5 + + # maximum time, in milliseconds, to wait for an external script pull (when enabled) max_time_ms: 3000 + + # maximum number of redirects to follow when pulling external scripts (when enabled) max_redirects: 3 ui: + # how many characters to show in a snippet preview in the GUI snippet_preview_len: 300 diff --git a/app/rules/factory.py b/app/rules/factory.py index 688b2c0..d77ccf1 100644 --- a/app/rules/factory.py +++ b/app/rules/factory.py @@ -34,18 +34,33 @@ def build_rules_engine() -> RuleEngine: def add(rule: Rule): eng.add_rule(rule) - add(Rule("form_action_missing", "Form has no action attribute", "form", "function", - FunctionRuleAdapter(form_action_missing, category="form", adapter=adapter, rule_name="form_action_missing"))) - add(Rule("form_http_on_https_page", "Form submits via HTTP from HTTPS page", "form", "function", - FunctionRuleAdapter(form_http_on_https_page, category="form", adapter=adapter, rule_name="form_http_on_https_page"))) - add(Rule("form_submits_to_different_host", "Form submits to a different host", "form", "function", - FunctionRuleAdapter(form_submits_to_different_host, category="form", adapter=adapter, rule_name="form_submits_to_different_host"))) - add(Rule("script_src_uses_data_or_blob", "Script src uses data:/blob: URL", "script", "function", - FunctionRuleAdapter(script_src_uses_data_or_blob, category="script", adapter=adapter, rule_name="script_src_uses_data_or_blob"))) - add(Rule("script_src_has_dangerous_extension", "External script with dangerous extension", "script", "function", - FunctionRuleAdapter(script_src_has_dangerous_extension, category="script", adapter=adapter, rule_name="script_src_has_dangerous_extension"))) - add(Rule("script_third_party_host", "Script is from a third-party host", "script", "function", -
FunctionRuleAdapter(script_third_party_host, category="script", adapter=adapter, rule_name="script_third_party_host"))) + # Form no action + add(Rule( + name="form_action_missing", + description="Form has no action attribute", + category="form", + rule_type="function", + function=FunctionRuleAdapter(form_action_missing, category="form", adapter=adapter, rule_name="form_action_missing"), + )) + + # add(Rule( + # name="form_http_on_https_page", + # description="Form submits via HTTP from HTTPS page", + # category="form", + # rule_type="function", + # function=FunctionRuleAdapter(form_http_on_https_page, category="form", adapter=adapter, rule_name="form_http_on_https_page"), + # )) + + # add(Rule("form_http_on_https_page", "Form submits via HTTP from HTTPS page", "form", "function", + # FunctionRuleAdapter(form_http_on_https_page, category="form", adapter=adapter, rule_name="form_http_on_https_page"))) + # add(Rule("form_submits_to_different_host", "Form submits to a different host", "form", "function", + # FunctionRuleAdapter(form_submits_to_different_host, category="form", adapter=adapter, rule_name="form_submits_to_different_host"))) + # add(Rule("script_src_uses_data_or_blob", "Script src uses data:/blob: URL", "script", "function", + # FunctionRuleAdapter(script_src_uses_data_or_blob, category="script", adapter=adapter, rule_name="script_src_uses_data_or_blob"))) + # add(Rule("script_src_has_dangerous_extension", "External script with dangerous extension", "script", "function", + # FunctionRuleAdapter(script_src_has_dangerous_extension, category="script", adapter=adapter, rule_name="script_src_has_dangerous_extension"))) + # add(Rule("script_third_party_host", "Script is from a third-party host", "script", "function", + # FunctionRuleAdapter(script_third_party_host, category="script", adapter=adapter, rule_name="script_third_party_host"))) log.info("Registered %d total rules (YAML + function)", len(eng.rules)) return eng diff --git a/app/rules/rules_engine.py 
b/app/rules/rules_engine.py index 12b2d2d..90af932 100644 --- a/app/rules/rules_engine.py +++ b/app/rules/rules_engine.py @@ -143,6 +143,13 @@ class Rule: return False, "No match" if self.rule_type == "function": + if not callable(self.function): + logger.warning( + "[Rule] '%s' function is not callable (type=%s, value=%r)", + self.name, type(self.function).__name__, self.function + ) + return False, "Invalid rule configuration: function not callable" + if callable(self.function): try: matched, reason = self.function(text) @@ -255,7 +262,7 @@ class RuleEngine: ) return - if settings.app.log_rule_loads: + if settings.logconfig.log_rule_loads: logger.info( "[engine] add_rule: %s/%s replace=%s -> count=%d", rule.category, rule.name, bool(replace), len(self._rules) @@ -308,7 +315,7 @@ class RuleEngine: """ # --- dispatch visibility --- if set to true, we log applied categories - if getattr(settings.app, "log_rule_dispatch", False): + if getattr(settings.logconfig, "log_rule_dispatch", False): all_cats = [r.category for r in self._rules] cat_counts = Counter(all_cats) # Which categories are being applied this run? diff --git a/app/templates/base.html b/app/templates/base.html index eefba8f..b66e0c9 100644 --- a/app/templates/base.html +++ b/app/templates/base.html @@ -20,7 +20,7 @@
- SneakyScope + {{ header }} {# Desktop nav #} @@ -76,7 +76,7 @@ {# Footer #}
-

© {{ current_year }} SneakyScope {{ app_name }} {{ app_version }} - A selfhosted URL sandbox

+

{{ footer | safe }}

{# Flowbite JS (enables collapse) #} diff --git a/app/templates/index.html b/app/templates/index.html index ab6c9f7..a815218 100644 --- a/app/templates/index.html +++ b/app/templates/index.html @@ -110,7 +110,7 @@
-
Analyzing website…
+
Analyzing website…
If you are pulling certificates, this may take a long time
@@ -142,11 +142,7 @@ function hideSpinner() { } /** - * Initialize form submit handling: - * - shows overlay spinner - * - disables submit button - * - shows small spinner inside button - * - lets the browser continue with POST + * Initialize form submit handling. */ (function initAnalyzeForm() { const form = document.getElementById('analyze-form'); @@ -155,11 +151,16 @@ function hideSpinner() { const submitBtn = form.querySelector('button[type="submit"]'); const btnSpinner = document.getElementById('btn-spinner'); - // Hide spinner overlay if arriving from bfcache/back - window.addEventListener('pageshow', () => { - hideSpinner(); - if (submitBtn) submitBtn.disabled = false; - if (btnSpinner) btnSpinner.classList.add('hidden'); + // Only hide the overlay when returning via BFCache (back/forward) + window.addEventListener('pageshow', (e) => { + const nav = performance.getEntriesByType('navigation')[0]; + const isBFCache = e.persisted || nav?.type === 'back_forward'; + + if (isBFCache) { + hideSpinner(); + if (submitBtn) submitBtn.disabled = false; + if (btnSpinner) btnSpinner.classList.add('hidden'); + } }); form.addEventListener('submit', (e) => { diff --git a/app/templates/partials/result_enrichment.html b/app/templates/partials/result_enrichment.html index f399c23..3f6fc64 100644 --- a/app/templates/partials/result_enrichment.html +++ b/app/templates/partials/result_enrichment.html @@ -33,7 +33,12 @@

GeoIP

{% for ip, info in enrichment.geoip.items() %}
- {{ ip }} + + {{ ip }} - + {% if info.country %} {{ info.country }} {% endif %} - + {% if info.isp %} {{ info.isp }} {% endif %} + +
diff --git a/app/templates/partials/result_forms.html b/app/templates/partials/result_forms.html index d34f81f..f0e9543 100644 --- a/app/templates/partials/result_forms.html +++ b/app/templates/partials/result_forms.html @@ -1,6 +1,6 @@
-

Forms

+

Suspicious Form Hits

{% if forms and forms|length > 0 %}
diff --git a/app/templates/partials/result_text.html b/app/templates/partials/result_text.html index a1e54f4..f5fae21 100644 --- a/app/templates/partials/result_text.html +++ b/app/templates/partials/result_text.html @@ -1,6 +1,6 @@
-

Text

+

Suspicious Text

{% if suspicious_text and suspicious_text|length > 0 %}
diff --git a/app/templates/result.html b/app/templates/result.html index 7c22b18..fcc47f5 100644 --- a/app/templates/result.html +++ b/app/templates/result.html @@ -28,14 +28,23 @@

Submitted URL: {{ submitted_url }}

Final URL: - {{ final_url }} + {{ final_url }}

Permalink: - {{ request.host_url }}results/{{ uuid }} + Permalink for {{ uuid }}

+

+ Full Results File: + + Results File + +

+

Back to top

diff --git a/app/utils/enrichment.py b/app/utils/enrichment.py index 70b0f4d..ad57c67 100644 --- a/app/utils/enrichment.py +++ b/app/utils/enrichment.py @@ -30,6 +30,8 @@ days = 24 * 60 GEOIP_DEFAULT_TTL = settings.cache.geoip_cache_days * days WHOIS_DEFAULT_TTL = settings.cache.whois_cache_days * days +CRT_DEFAULT_TTL = settings.cache.crt_cache_days * days + logger = get_app_logger() @@ -137,6 +139,20 @@ def search_certs(domain, wildcard=True, expired=True, deduplicate=True): "not_before": "2018-02-08T15:47:39" } """ + + cache_key = f"crt_cert:{domain}" + + # log if caching is turned on or not + logger.info(f"CRT Cache is set to: {settings.cache.crt_cache_enabled}") + + if settings.cache.crt_cache_enabled: + cached = cache.read(cache_key) + if cached: + logger.info(f"[CACHE HIT] for CRT Cert: {domain}") + return cached + else: + logger.info(f"[CACHE MISS] for CRT Cert: {domain} - {cache_key}") + base_url = "https://crt.sh/?q={}&output=json" if not expired: base_url = base_url + "&exclude=expired" @@ -153,11 +169,21 @@ def search_certs(domain, wildcard=True, expired=True, deduplicate=True): try: content = req.content.decode('utf-8') data = json.loads(content) + # if caching + if settings.cache.crt_cache_enabled: + logger.info(f"Setting Cache for {cache_key}") + cache.create(cache_key, data, CRT_DEFAULT_TTL) return data except ValueError: # crt.sh fixed their JSON response. 
This shouldn't be necessary anymore # https://github.com/crtsh/certwatch_db/commit/f4f46ea37c23543c4cdf1a3c8867d68967641807 data = json.loads("[{}]".format(content.replace('}{', '},{'))) + + # if caching + if settings.cache.crt_cache_enabled: + logger.info(f"Setting Cache for {cache_key}") + cache.create(cache_key, data, CRT_DEFAULT_TTL) + return data except Exception as err: logger.error("Error retrieving cert information from CRT.sh.") @@ -200,6 +226,7 @@ def gather_crtsh_certs_for_target(target): hostname = parse_target_to_host(target) result["hostname"] = hostname + # return fake return if no hostname was able to be parsed if hostname is None: return result @@ -209,6 +236,7 @@ def gather_crtsh_certs_for_target(target): # Always query crt.sh for the specific hostname # (expired=False means we filter expired) + host_certs = search_certs(hostname, wildcard=False, expired=False) result["crtsh"]["host_certs"] = host_certs diff --git a/app/utils/settings.py b/app/utils/settings.py index 1f5acd7..7c05476 100644 --- a/app/utils/settings.py +++ b/app/utils/settings.py @@ -53,27 +53,34 @@ class UIConfig: @dataclass class Cache_Config: + recent_runs_count: int = 10 + whois_cache_days: int = 7 geoip_cache_days: int = 7 - recent_runs_count: int = 10 + crt_cache_enabled: bool = True + crt_cache_days: int = 7 + + @dataclass -class AppConfig: - name: str = "MyApp" - version_major: int = 1 - version_minor: int = 0 +class Logging_Config: log_rule_loads: bool = False log_rule_dispatch: bool = False log_rule_debug: bool = False +@dataclass +class BrandingConfig: + name: str = "MyApp" + @dataclass class Settings: cache: Cache_Config = field(default_factory=Cache_Config) ui: UIConfig = field(default_factory=UIConfig) external_fetch: External_FetchConfig = field(default_factory=External_FetchConfig) - app: AppConfig = field(default_factory=AppConfig) + branding: BrandingConfig = field(default_factory=BrandingConfig) + logconfig: Logging_Config = field(default_factory=Logging_Config) 
@classmethod def from_yaml(cls, path: str | Path) -> "Settings": diff --git a/app/wsgi.py b/app/wsgi.py index 35f38ba..f0a0733 100644 --- a/app/wsgi.py +++ b/app/wsgi.py @@ -9,7 +9,7 @@ from . import create_app # Gunicorn will look for "app" app = create_app() -from app.state import set_rules_engine, get_rules_engine +from app.state import set_rules_engine from app.logging_setup import get_app_logger from app.rules.factory import build_rules_engine diff --git a/docs/rule_processing_notes.md b/docs/rule_processing_notes.md new file mode 100644 index 0000000..d0081f0 --- /dev/null +++ b/docs/rule_processing_notes.md @@ -0,0 +1,24 @@ +Blueprint -> Analyse calls browser.fetch_page_artifacts + +Browser.py -> fetch_page_artifacts +grabs html_content + run analyse_forms on html_content + run analyse_scripts on html_content + enrich_url + build_rule_checks_overview + +builds the structured dict for return +dumps data to disk + + +text flag mapping for rules +# --- Helper: map human-friendly flags to re.* constants --- + FLAG_MAP = { + "i": _re.IGNORECASE, "ignorecase": _re.IGNORECASE, + "m": _re.MULTILINE, "multiline": _re.MULTILINE, + "s": _re.DOTALL, "dotall": _re.DOTALL, "singleline": _re.DOTALL, + "x": _re.VERBOSE, "verbose": _re.VERBOSE, + "a": _re.ASCII, "ascii": _re.ASCII, + "u": _re.UNICODE, "unicode": _re.UNICODE, + "l": _re.LOCALE, "locale": _re.LOCALE, + } \ No newline at end of file diff --git a/docs/rules-rebuild-remember.md b/docs/rules-rebuild-remember.md new file mode 100644 index 0000000..5349412 --- /dev/null +++ b/docs/rules-rebuild-remember.md @@ -0,0 +1,113 @@ +Got it — here’s a structured outline you can keep as a “memory primer” for when you want to resume this work (whether continuing the current engine or pivoting to a SaaS build). Think of it as a **checkpoint map** of what we’ve covered and what directions you flagged. + +--- + +# SneakyScope – Rules Engine Refactor & SaaS Considerations (Outline) + +## 1. 
Current State + +* Engine: `app/rules/rules_engine.py` — mixed YAML + code rules, `(bool, str)` return shape expected. +* Issue: function-based rules returning `(bool, dict)` → caused invalid type warnings. +* Stopgap: `FunctionRuleAdapter` converted returns. +* Desire: Remove YAML entirely; rules defined in code only. + +--- + +## 2. Agreed Direction + +* **Rule definition approach**: + + * Option B chosen → decorator-based registration. + * Every rule defined in `app/rules/` as Python functions. + * Rules register with metadata (`name`, `category`, `severity`, etc.). + +* **Return shape**: + + * Always return a **Result dict** (no adapter needed). + * Engine enforces schema and fills in defaults. + +* **Engine relocation**: + + * Move to `app/utils/rules_engine/`. + * Responsibilities: load, validate, freeze registry, run rules, aggregate results, log/report. + +--- + +## 3. Result Schema (concept) + +* **Per RuleResult** + + * Required: `ok: bool`, `message: str`. + * Identity: `name`, `category`, `severity`, `tags`, `rule_version`. + * Detail: `data: object|null`. + * Timing: `duration_ms`. + * Errors: structured `error` object if exceptions occur. + * Provenance: `source_module`, optional `policy` snapshot. + +* **Per AnalysisResult (run-level envelope)** + + * Input scope: target URL, category, content hash, facts profile. + * Provenance: run\_id, engine\_version, ruleset\_checksum, timestamp, duration. + * Results: array of RuleResults. + * Summary: counts by severity, match count, errors, first match, top severity. + * Artifacts: references (screenshot, DOM snapshot, etc.). + * Policy snapshot: optional central policy/overrides. + +--- + +## 4. Operational Standards + +* **Determinism**: identical inputs + ruleset\_checksum → identical results. +* **Message stability**: avoid wording churn; expand via `data`. +* **Size limits**: `message ≤ 256 chars`; `data ≤ 8–16 KB`. +* **Errors**: `ok=false` if error present; always emit `message`. 
+* **Severity**: rule sets default; policy may override. +* **Tags**: controlled vocabulary; additive. + +--- + +## 5. Migration Plan + +1. Create new `rules_engine` package in `app/utils/`. +2. Add decorator/registry for rules. +3. Port all rules from YAML → Python modules grouped by category. +4. Delete YAML loader + adapters. +5. Update call sites to build `facts` and call `engine.run(...)`. +6. Add CI tests: + + * Schema compliance. + * No duplicates. + * Ruleset checksum snapshot. +7. Integration tests with real fixtures. +8. Benchmark & harden (caps on input size, rule runtime). + +--- + +## 6. SaaS Expansion (future) + +* **Multi-tenancy**: separate org/user scopes for data and rule runs. +* **RBAC**: roles (admin, analyst, viewer). +* **Compliance**: logging, retention, export, audit trails. +* **Rules**: centrally maintained, not user-editable. +* **APIs**: authenticated endpoints, per-user quotas. +* **Observability**: per-tenant metrics, alerts. +* **Security**: sandboxing, strict module allowlists, compliance with SOC2/ISO. +* **Data controls**: PII redaction, encryption, retention policies. + +--- + +## 7. Future-Proofing Hooks + +* Versioning: ruleset checksum + per-rule versions. +* Extensibility: support `actions`, `links`, `evidence` in Result. +* Policy: central config for thresholds/overrides. +* Hot reload (optional, dev-only). +* Rule provenance tracking (source\_module, commit SHA). + +--- + +✅ This outline is enough to “re-hydrate” the context later — you won’t need to dig back into old logs to remember why `(bool, str)` didn’t fit, why YAML was removed, or what schema we were converging on. + +--- + +Do you want me to also save this in a **short “README-spec” style** (like `RESULTS.md`) so it can live in your repo as the contract doc for rules, or should I keep this as just your personal checkpoint outline?