feat(text): add text analysis pipeline & surface results in UI

- engine: add analyse_text() to extract visible page text and evaluate category="text" rules; collect matched phrases and expose as `content_snippet` (deduped, length-capped via settings.ui.snippet_preview_len). - engine: removed unused code - browser: removed double call for enrichment - engine: improve regex compilation — honor per-rule flags (string or list) and default IGNORECASE when category=="text". - engine: add dispatch logging "[engine] applying categories: …" gated by settings.app.print_rule_dispatch. - ui(templates): add `templates/partials/result_text.html` mirroring the forms table; renders page-level records and their matched rules. - ui(controller): wire `analyse_text()` into scan path and expose `payload["suspicious_text"]`. - rules(text): add `identity_verification_prompt`, `gated_document_access`, `email_collection_prompt`; broaden `credential_reset`. fix: text indicators were not displayed due to missing analyzer and mismatched result shape. Result shape: suspicious_text: [ { "type": "page", "content_snippet": "...matched phrases…", "rules": [ {"name": "...", "description": "...", "severity": "medium", "tags": ["..."]} ] } ]
2025-08-22 17:18:50 -05:00
parent af253c858c
commit 55cd81aec0
13 changed files with 422 additions and 115 deletions
--- a/app/blueprints/main.py
+++ b/app/blueprints/main.py
@@ -0,0 +1,200 @@
+# app/blueprints/main.py
+
+import os
+import json
+import asyncio
+from pathlib import Path
+from datetime import datetime
+from flask import Blueprint, render_template, request, redirect, url_for, flash, current_app, send_file, abort
+
+from app.utils.url_tools import get_url_normalizer
+from app.utils.browser import get_browser
+from app.utils.enrichment import enrich_url
+from app.utils.settings import get_settings
+from app.utils.io_helpers import get_recent_results
+from app.logging_setup import get_app_logger
+
+# Logger used by every view in this module.
+app_logger = get_app_logger()
+
+# Blueprint is named "main", so its endpoints are addressed as "main.<view>".
+bp = Blueprint("main", __name__)
+
+# Settings are read once at import time; the name and version string below
+# are handed to templates by the context processor.
+settings = get_settings()
+app_name = settings.app.name
+app_version = "v {}.{}".format(settings.app.version_major, settings.app.version_minor)
+
+
+# --- data cleaner for tls to ensure data is standardized
+def normalize_ssl_tls_for_view(ssl_tls):
+    """
+    Normalize/guard the ssl_tls structure for template rendering.
+    Adds missing keys so Jinja doesn't need defensive checks everywhere.
+
+    Args:
+        ssl_tls: The raw "ssl_tls" value loaded from a results file.
+            Expected to be a dict; anything else yields an error record.
+
+    Returns:
+        A new dict that always carries the keys "crtsh", "probe", "error",
+        "skipped" and "reason". Unless the record is marked as skipped,
+        "probe" is a dict with "results_by_version" (dict),
+        "weak_protocols", "weak_ciphers", "errors" (lists), "hostname" and
+        "port" (defaulting to None and 443). The input is never modified.
+    """
+    safe = {"crtsh": None, "probe": None, "error": None, "skipped": False, "reason": None}
+
+    if not isinstance(ssl_tls, dict):
+        safe["error"] = "ssl_tls is not a dict"
+        return safe
+
+    safe.update(ssl_tls)
+
+    if safe.get("skipped") is True:
+        return safe  # don't force probe/crtsh keys when skipped
+
+    # Work on a copy so the caller's nested probe dict is left untouched, and
+    # fall back to an empty dict when the stored value is missing or is not a
+    # mapping (item assignment on e.g. a string would raise TypeError).
+    raw_probe = safe.get("probe")
+    probe = dict(raw_probe) if isinstance(raw_probe, dict) else {}
+
+    # Container fields: replace anything absent or of the wrong type with an
+    # empty container of the expected type.
+    expected_containers = (
+        ("results_by_version", dict),
+        ("weak_protocols", list),
+        ("weak_ciphers", list),
+        ("errors", list),
+    )
+    for key, expected_type in expected_containers:
+        if not isinstance(probe.get(key), expected_type):
+            probe[key] = expected_type()
+
+    # Scalar fields: only fill in when the key is absent.
+    probe.setdefault("hostname", None)
+    probe.setdefault("port", 443)
+    safe["probe"] = probe
+
+    # "crtsh" is kept raw; the key is always present because `safe` starts
+    # with it, so no extra guard is needed here.
+    return safe
+
+
+# --- context processor ---
+@bp.context_processor
+def inject_app_info():
+    """Supply the app name, version string, and current year as template context."""
+    year_text = datetime.now().strftime("%Y")
+    return dict(app_name=app_name, app_version=app_version, current_year=year_text)
+
+@bp.route("/", methods=["GET"])
+def index():
+    """
+    Landing page, rendered together with a list of recent results.
+
+    How many recent runs are listed comes from settings.cache.recent_runs_count.
+    A negative value is clamped to 0; a missing or non-integer value falls
+    back to 10.
+    """
+    # Read the configured count; any failure while reading/converting it
+    # selects the default of 10.
+    try:
+        limit = max(0, int(getattr(settings.cache, "recent_runs_count", 10)))
+    except Exception:
+        limit = 10
+
+    # Storage root comes from the Flask app config.
+    storage_root = Path(current_app.config["SANDBOX_STORAGE"]).resolve()
+
+    # The helper receives the logger along with the storage root and the limit.
+    return render_template(
+        "index.html",
+        recent_results=get_recent_results(storage_root, limit, app_logger),
+    )
+
+@bp.route("/analyze", methods=["POST"])
+def analyze():
+    """
+    Handle the URL submission form and run an analysis.
+
+    Reads "url" and the optional "fetch_ssl" checkbox from the posted form,
+    normalizes the URL, runs the browser fetch, and redirects to the
+    permalink page of the finished run. On invalid input or a failed
+    analysis the user is sent back to the landing page with a flashed
+    error message.
+    """
+    url = request.form.get("url", "").strip()
+
+    # Checkbox comes as '1' when checked, or None when not present
+    fetch_ssl_enabled = request.form.get("fetch_ssl") == "1"
+
+    normalizer = get_url_normalizer()
+
+    try:
+        target = normalizer.normalize_for_analysis(url)
+    except ValueError:
+        app_logger.warning("Empty or invalid URL input")
+        flash("Please enter a valid URL.", "error")
+        # Endpoints of this blueprint are namespaced; a bare "index" cannot
+        # be resolved by url_for and would raise a BuildError.
+        return redirect(url_for("main.index"))
+
+    if not target:
+        flash("Please enter a URL.", "error")
+        return redirect(url_for("main.index"))
+
+    app_logger.info(f"[*] Analyzing URL {target}")
+    app_logger.info(f"[*] SSL Checks set to {fetch_ssl_enabled}")
+
+    # Make sure the storage root exists before the run starts.
+    storage = Path(current_app.config["SANDBOX_STORAGE"]).resolve()
+    storage.mkdir(parents=True, exist_ok=True)
+
+    try:
+        browser = get_browser()
+        # NOTE(review): the normalized target is passed on rather than the raw
+        # form value, which was previously normalized and then discarded;
+        # confirm fetch_page_artifacts expects the normalized form.
+        result = asyncio.run(browser.fetch_page_artifacts(target, fetch_ssl_enabled=fetch_ssl_enabled))
+        app_logger.info(f"[+] Analysis done for {target}")
+    except Exception as e:
+        # Top-level boundary for the request: report the failure to the user
+        # and the log instead of returning a bare 500.
+        flash(f"Analysis failed: {e}", "error")
+        app_logger.error(f"Analysis failed for {target}: {e}")
+        return redirect(url_for("main.index"))
+
+    # Redirect to permalink page for this run
+    return redirect(url_for("main.view_result", run_uuid=result["uuid"]))
+
+@bp.route("/results/<run_uuid>", methods=["GET"])
+def view_result(run_uuid: str):
+    """
+    View the analysis results for a given run UUID.
+    Loads results.json from SANDBOX_STORAGE/<uuid>,
+    normalizes structures for template safety, and renders the result page.
+
+    Responds with 404 when the run directory is not a direct child of the
+    storage root or when no results.json exists for the run.
+    """
+    # Resolve SANDBOX_STORAGE from app config
+    storage = Path(current_app.config["SANDBOX_STORAGE"]).resolve()
+
+    # Resolve the run directory so that values such as ".." are collapsed,
+    # then require it to sit directly under the storage root. Without this a
+    # crafted run_uuid could read results.json from outside of storage.
+    run_dir = (storage / run_uuid).resolve()
+    if run_dir.parent != storage:
+        app_logger.warning(f"Directory traversal attempt: {run_uuid}")
+        abort(404)
+
+    results_path = run_dir / "results.json"
+
+    # Ensure results exist
+    if not results_path.exists():
+        app_logger.error(f"Results not found for UUID: {run_uuid}")
+        abort(404)
+
+    # Load the results JSON
+    with open(results_path, "r", encoding="utf-8") as f:
+        result = json.load(f)
+
+    # Add UUID so template can build artifact links
+    result["uuid"] = run_uuid
+
+    # === Normalize SSL/TLS structure for safe rendering ===
+    if "ssl_tls" in result:
+        result["ssl_tls"] = normalize_ssl_tls_for_view(result["ssl_tls"])
+
+    # Every top-level key of the results file becomes a template variable.
+    return render_template("result.html", **result)
+
+
+@bp.route("/artifacts/<run_uuid>/<filename>", methods=["GET"])
+def artifacts(run_uuid: str, filename: str):
+    """
+    Serve one artifact file from SANDBOX_STORAGE/<run_uuid>/<filename>.
+
+    Responds with 404 when the requested path escapes the run directory or
+    does not point at an existing regular file.
+    """
+    # Resolve SANDBOX_STORAGE from app config
+    storage = Path(current_app.config["SANDBOX_STORAGE"]).resolve()
+
+    # Resolve both paths *before* checking containment. Path.relative_to()
+    # only compares path components, so an unresolved "<run_dir>/.." would
+    # pass a purely lexical check.
+    run_dir = (storage / run_uuid).resolve()
+    full_path = (run_dir / filename).resolve()
+
+    # Prevent directory traversal: the run directory must be a direct child
+    # of storage and the artifact must live strictly inside the run directory.
+    if run_dir.parent != storage or run_dir not in full_path.parents:
+        app_logger.warning(f"Directory traversal attempt: {filename}")
+        abort(404)
+
+    # Only regular files can be sent; a directory would make send_file fail.
+    if not full_path.is_file():
+        app_logger.error(f"Artifact not found: {filename} for UUID {run_uuid}")
+        abort(404)
+
+    return send_file(full_path)
+
+
+@bp.get("/view/artifact/<run_uuid>/<filename>")
+def view_artifact(run_uuid, filename):
+    """
+    Render the viewer page for one artifact of a run.
+
+    The template receives the artifact's filename, the URL of the
+    "api.get_artifact_raw" endpoint for that artifact, and a language hint.
+    """
+    return render_template(
+        "viewer.html",
+        filename=filename,
+        # Link to the endpoint registered as "api.get_artifact_raw".
+        raw_url=url_for("api.get_artifact_raw", run_uuid=run_uuid, filename=filename),
+        # No language is derived server-side; the template gets None.
+        language=None,
+    )
+