feat(text): add text analysis pipeline & surface results in UI

- engine: add analyze_text() to extract visible page text and evaluate
  category="text" rules; collect matched phrases and expose them as
  `content_snippet` (deduped, length-capped via settings.ui.snippet_preview_len).
- engine: remove the now-unused `run_rule_checks()` helper.
- browser: remove the duplicated enrichment call; `enrich_url()` now runs once
  per scan.
- engine: improve regex compilation: honor per-rule flags (string or list)
  and default to IGNORECASE when category == "text"; see the sketch after
  this list.
- engine: add dispatch logging "[engine] applying categories: …" gated by
  settings.app.print_rule_dispatch.
- ui(templates): add `templates/partials/result_text.html` mirroring the forms
  table; renders page-level records and their matched rules.
- ui(controller): wire `analyze_text()` into the scan path and expose
  `payload["suspicious_text"]`.
- rules(text): add `identity_verification_prompt`, `gated_document_access`,
  `email_collection_prompt`; broaden `credential_reset`.
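
How the flag handling might look; a minimal sketch assuming rule objects
expose `pattern`, `flags`, and `category` (the helper name and flag table are
illustrative, not the exact code in this commit):

    import re
    from typing import List, Optional, Union

    _FLAG_NAMES = {
        "IGNORECASE": re.IGNORECASE,
        "MULTILINE": re.MULTILINE,
        "DOTALL": re.DOTALL,
    }

    def compile_rule_pattern(pattern: str,
                             flags: Union[str, List[str], None],
                             category: str) -> Optional[re.Pattern]:
        # Accept a single flag name or a list of names; ignore unknown names.
        names = [flags] if isinstance(flags, str) else list(flags or [])
        value = 0
        for name in names:
            value |= _FLAG_NAMES.get(str(name).strip().upper(), 0)
        # Text rules match prose, so case-insensitive is the sensible default
        # when a rule declares no flags of its own.
        if category == "text" and not names:
            value |= re.IGNORECASE
        try:
            return re.compile(pattern, value)
        except re.error:
            return None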

fix: text indicators were not displayed due to missing analyzer and mismatched result shape.

Result shape:
  suspicious_text: [
    {
      "type": "page",
      "content_snippet": "...matched phrases…",
      "rules": [
        {"name": "...", "description": "...", "severity": "medium", "tags": ["..."]}
      ]
    }
  ]
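
A minimal sketch of how the partial's data can be flattened into table rows
(the `text_rows` helper is hypothetical; `result_text.html` renders the same
fields per matched rule):

    from typing import Any, Dict, List

    def text_rows(payload: Dict[str, Any]) -> List[Dict[str, Any]]:
        rows: List[Dict[str, Any]] = []
        for record in payload.get("suspicious_text", []):
            snippet = record.get("content_snippet") or ""
            for rule in record.get("rules", []):
                rows.append({
                    "type": record.get("type", "page"),
                    "snippet": snippet,
                    "rule": rule.get("name"),
                    "severity": rule.get("severity"),
                    "tags": ", ".join(rule.get("tags") or []),
                })
        return rows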
commit 55cd81aec0
parent af253c858c
2025-08-22 17:18:50 -05:00
13 changed files with 422 additions and 115 deletions


@@ -29,6 +29,7 @@ from typing import Any, Dict, List, Optional
from urllib.parse import urlparse
from bs4 import BeautifulSoup
import re
from flask import current_app
from playwright.async_api import async_playwright, TimeoutError as PWTimeoutError
@@ -85,64 +86,6 @@ class Browser:
            index = index + 1
        return summary
    def run_rule_checks(self, text: str, category: str) -> Dict[str, Any]:
        """
        Run all rules for a given category against provided text, returning a table-friendly model.

        Args:
            text: Text to analyze (HTML, snippet, etc.)
            category: One of 'form', 'script', 'text' (or any category your rules use)

        Returns:
            {
              "checks": [
                { "name": str, "description": str, "category": str,
                  "result": "PASS"|"FAIL", "reason": Optional[str],
                  "severity": Optional[str], "tags": Optional[List[str]] }, ...
              ],
              "summary": { "fail_count": int, "total_rules": int }
            }
        """
        out: Dict[str, Any] = {"checks": [], "summary": {"fail_count": 0, "total_rules": 0}}
        engine = self._get_rule_engine()
        if engine is None:
            return out
        try:
            engine_results = engine.run_all(text, category=category)  # list of dicts
            index = 0
            total = len(engine_results)
            while index < total:
                item = engine_results[index]
                normalized = {
                    "name": item.get("name"),
                    "description": item.get("description"),
                    "category": item.get("category"),
                    "result": item.get("result"),    # "PASS" | "FAIL"
                    "reason": item.get("reason"),    # present on FAIL by engine design
                    "severity": item.get("severity"),
                    "tags": item.get("tags"),
                }
                out["checks"].append(normalized)
                index = index + 1
            out["summary"] = self._summarize_results(out["checks"])
        except Exception as exc:
            # Preserve shape; record the error as a synthetic PASS (so UI doesn't break)
            out["checks"].append({
                "name": "engine_error",
                "description": "Rule engine failed during evaluation",
                "category": category,
                "result": "PASS",
                "reason": f"{exc}",
                "severity": None,
                "tags": None
            })
            out["summary"] = {"fail_count": 0, "total_rules": 1}
        return out
    def build_rule_checks_overview(self, full_html_text: str) -> List[Dict[str, Any]]:
        """
        Build a top-level overview for the results page: runs each category across
@@ -376,6 +319,135 @@ class Browser:
        return results
    def analyze_text(self, html: str) -> List[Dict[str, Any]]:
        """
        Extract visible page text and evaluate text rules.
        Only include rows that matched at least one rule.

        Returns a list with 0..1 records shaped like:
            {
              "type": "page",
              "content_snippet": "<matched words/phrases joined>",
              "rules": [
                {"name": "...", "description": "...", "severity": "...", "tags": [...]},
                ...
              ],
            }
        """
        results: List[Dict[str, Any]] = []

        # Short-circuit on missing html
        if not html:
            return results

        # Extract visible text (strip scripts/styles)
        try:
            soup = BeautifulSoup(html, "lxml")
            for tag in soup(["script", "style", "noscript", "template"]):
                tag.decompose()
            # Basic hidden cleanup (best-effort)
            for el in soup.select('[hidden], [aria-hidden="true"]'):
                el.decompose()
            text = soup.get_text(separator=" ", strip=True)
            if not text:
                return results
            # Normalize whitespace so regexes behave consistently
            text = re.sub(r"\s+", " ", text).strip()
        except Exception as exc:
            # Keep consistency with your other analyzers
            results.append({
                "type": "page",
                "heuristics": [f"Text extraction error: {exc}"]
            })
            return results

        engine = self._get_rule_engine()
        if engine is None:
            return results

        matches_for_record: List[Dict[str, Any]] = []
        matched_phrases: List[str] = []  # order-preserving
        seen_phrases = set()
        # How many characters to show for the preview snippet
        preview_len = getattr(settings.ui, "snippet_preview_len", 200)

        try:
            # 1) Regex rules over full page text
            for r in engine.rules:
                if getattr(r, "category", None) != "text":
                    continue
                rtype = getattr(r, "rule_type", None)
                if rtype == "regex":
                    ok, _reason = r.run(text)
                    if not ok:
                        continue
                    # Try to pull matched words/phrases
                    compiled = getattr(r, "_compiled_regex", None)
                    if compiled is None and getattr(r, "pattern", None):
                        try:
                            compiled = re.compile(r.pattern, re.IGNORECASE)
                        except re.error:
                            compiled = None
                    # Collect a few (deduped) matched phrases
                    if compiled is not None:
                        # limit per rule to avoid flooding
                        per_rule_count = 0
                        for m in compiled.finditer(text):
                            phrase = m.group(0).strip()
                            if phrase and phrase not in seen_phrases:
                                matched_phrases.append(phrase)
                                seen_phrases.add(phrase)
                                per_rule_count += 1
                            if per_rule_count >= 5:  # cap per rule
                                break
                    matches_for_record.append({
                        "name": getattr(r, "name", "unknown_rule"),
                        "description": getattr(r, "description", "") or "",
                        "severity": getattr(r, "severity", None),
                        "tags": getattr(r, "tags", None),
                    })
                elif rtype == "function":
                    # Optional: function-style rules can inspect the full text
                    facts = {"text": text, "category": "text"}
                    ok, reason = r.run(facts)
                    if ok:
                        matches_for_record.append({
                            "name": getattr(r, "name", "unknown_rule"),
                            "description": (reason or "") or getattr(r, "description", ""),
                            "severity": getattr(r, "severity", None),
                            "tags": getattr(r, "tags", None),
                        })
            if matches_for_record:
                # Build the snippet from matched words/phrases
                joined = ", ".join(matched_phrases) if matched_phrases else ""
                if len(joined) > preview_len:
                    joined = joined[:preview_len] + "…"
                record: Dict[str, Any] = {
                    "type": "page",
                    "content_snippet": joined or None,
                    "rules": matches_for_record,
                }
                results.append(record)
        except Exception as exc:
            results.append({
                "type": "page",
                "heuristics": [f"Text analysis error: {exc}"]
            })
        return results

    # -----------------------------------------------------------------------
    # Fetcher / Orchestrator
    # -----------------------------------------------------------------------
@@ -458,12 +530,15 @@ class Browser:
        # Read back saved source
        html_content = source_path.read_text(encoding="utf-8")

        # Forms analysis (per-form rule checks)
        # Forms analysis
        forms_info = self.analyze_forms(html_content, final_url)

        # Scripts artifacts (no detection here)
        # Scripts artifacts
        suspicious_scripts = self.analyze_scripts(html_content, base_url=final_url)

        # Suspicious text
        flagged_text = self.analyze_text(html_content)

        # Enrichment
        enrichment = enrich_url(url, fetch_ssl_enabled)
@@ -486,7 +561,8 @@ class Browser:
"scripts": scripts_seen,
"forms": forms_info,
"suspicious_scripts": suspicious_scripts,
"rule_checks": rule_checks_overview, # table-ready for UI
"suspicious_text":flagged_text,
"rule_checks": rule_checks_overview,
"enrichment": enrichment
}
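
A quick smoke test for the new analyzer (a sketch, not part of this commit; it
assumes `Browser()` can be constructed without a live Playwright session and
that some text rule matches the sample phrase):

    html = (
        "<html><body>"
        "<p>Please verify your identity to unlock the document.</p>"
        "<script>ignored()</script>"
        "</body></html>"
    )
    browser = Browser()
    for record in browser.analyze_text(html):
        print(record["type"], record.get("content_snippet"))
        for rule in record.get("rules", []):
            print(" -", rule["name"], rule.get("severity"))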