feat(text): add text analysis pipeline & surface results in UI

- engine: add analyze_text() to extract visible page text and evaluate
  category="text" rules; collect matched phrases and expose them as
  `content_snippet` (deduped, length-capped via settings.ui.snippet_preview_len).
- engine: remove the now-unused `run_rule_checks()` helper.
- browser: remove the duplicated enrichment call; `enrich_url()` now runs once
  per scan.
- engine: improve regex compilation: honor per-rule flags (string or list)
  and default to IGNORECASE when category == "text"; see the sketch after
  this list.
- engine: add dispatch logging "[engine] applying categories: …" gated by
  settings.app.print_rule_dispatch.
- ui(templates): add `templates/partials/result_text.html` mirroring the forms
  table; renders page-level records and their matched rules.
- ui(controller): wire `analyze_text()` into the scan path and expose
  `payload["suspicious_text"]`.
- rules(text): add `identity_verification_prompt`, `gated_document_access`,
  `email_collection_prompt`; broaden `credential_reset`.
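
How the flag handling might look; a minimal sketch assuming rule objects
expose `pattern`, `flags`, and `category` (the helper name and flag table are
illustrative, not the exact code in this commit):

    import re
    from typing import List, Optional, Union

    _FLAG_NAMES = {
        "IGNORECASE": re.IGNORECASE,
        "MULTILINE": re.MULTILINE,
        "DOTALL": re.DOTALL,
    }

    def compile_rule_pattern(pattern: str,
                             flags: Union[str, List[str], None],
                             category: str) -> Optional[re.Pattern]:
        # Accept a single flag name or a list of names; ignore unknown names.
        names = [flags] if isinstance(flags, str) else list(flags or [])
        value = 0
        for name in names:
            value |= _FLAG_NAMES.get(str(name).strip().upper(), 0)
        # Text rules match prose, so case-insensitive is the sensible default
        # when a rule declares no flags of its own.
        if category == "text" and not names:
            value |= re.IGNORECASE
        try:
            return re.compile(pattern, value)
        except re.error:
            return None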

fix: text indicators were not displayed due to missing analyzer and mismatched result shape.

Result shape:
  suspicious_text: [
    {
      "type": "page",
      "content_snippet": "...matched phrases…",
      "rules": [
        {"name": "...", "description": "...", "severity": "medium", "tags": ["..."]}
      ]
    }
  ]
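
A minimal sketch of how the partial's data can be flattened into table rows
(the `text_rows` helper is hypothetical; `result_text.html` renders the same
fields per matched rule):

    from typing import Any, Dict, List

    def text_rows(payload: Dict[str, Any]) -> List[Dict[str, Any]]:
        rows: List[Dict[str, Any]] = []
        for record in payload.get("suspicious_text", []):
            snippet = record.get("content_snippet") or ""
            for rule in record.get("rules", []):
                rows.append({
                    "type": record.get("type", "page"),
                    "snippet": snippet,
                    "rule": rule.get("name"),
                    "severity": rule.get("severity"),
                    "tags": ", ".join(rule.get("tags") or []),
                })
        return rows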
commit 55cd81aec0
parent af253c858c
2025-08-22 17:18:50 -05:00
13 changed files with 422 additions and 115 deletions


@@ -29,6 +29,7 @@ from typing import Any, Dict, List, Optional
from urllib.parse import urlparse
from bs4 import BeautifulSoup
import re
from flask import current_app
from playwright.async_api import async_playwright, TimeoutError as PWTimeoutError
@@ -85,64 +86,6 @@ class Browser:
            index = index + 1
        return summary
    def run_rule_checks(self, text: str, category: str) -> Dict[str, Any]:
        """
        Run all rules for a given category against provided text, returning a table-friendly model.

        Args:
            text: Text to analyze (HTML, snippet, etc.)
            category: One of 'form', 'script', 'text' (or any category your rules use)

        Returns:
            {
              "checks": [
                { "name": str, "description": str, "category": str,
                  "result": "PASS"|"FAIL", "reason": Optional[str],
                  "severity": Optional[str], "tags": Optional[List[str]] }, ...
              ],
              "summary": { "fail_count": int, "total_rules": int }
            }
        """
        out: Dict[str, Any] = {"checks": [], "summary": {"fail_count": 0, "total_rules": 0}}
        engine = self._get_rule_engine()
        if engine is None:
            return out
        try:
            engine_results = engine.run_all(text, category=category)  # list of dicts
            index = 0
            total = len(engine_results)
            while index < total:
                item = engine_results[index]
                normalized = {
                    "name": item.get("name"),
                    "description": item.get("description"),
                    "category": item.get("category"),
                    "result": item.get("result"),    # "PASS" | "FAIL"
                    "reason": item.get("reason"),    # present on FAIL by engine design
                    "severity": item.get("severity"),
                    "tags": item.get("tags"),
                }
                out["checks"].append(normalized)
                index = index + 1
            out["summary"] = self._summarize_results(out["checks"])
        except Exception as exc:
            # Preserve shape; record the error as a synthetic PASS (so UI doesn't break)
            out["checks"].append({
                "name": "engine_error",
                "description": "Rule engine failed during evaluation",
                "category": category,
                "result": "PASS",
                "reason": f"{exc}",
                "severity": None,
                "tags": None
            })
            out["summary"] = {"fail_count": 0, "total_rules": 1}
        return out
    def build_rule_checks_overview(self, full_html_text: str) -> List[Dict[str, Any]]:
        """
        Build a top-level overview for the results page: runs each category across
@@ -376,6 +319,135 @@ class Browser:
        return results
    def analyze_text(self, html: str) -> List[Dict[str, Any]]:
        """
        Extract visible page text and evaluate text rules.
        Only include rows that matched at least one rule.

        Returns a list with 0..1 records shaped like:
            {
              "type": "page",
              "content_snippet": "<matched words/phrases joined>",
              "rules": [
                {"name": "...", "description": "...", "severity": "...", "tags": [...]},
                ...
              ],
            }
        """
        results: List[Dict[str, Any]] = []

        # Short-circuit on missing html
        if not html:
            return results

        # Extract visible text (strip scripts/styles)
        try:
            soup = BeautifulSoup(html, "lxml")
            for tag in soup(["script", "style", "noscript", "template"]):
                tag.decompose()
            # Basic hidden cleanup (best-effort)
            for el in soup.select('[hidden], [aria-hidden="true"]'):
                el.decompose()
            text = soup.get_text(separator=" ", strip=True)
            if not text:
                return results
            # Normalize whitespace so regexes behave consistently
            text = re.sub(r"\s+", " ", text).strip()
        except Exception as exc:
            # Keep consistency with your other analyzers
            results.append({
                "type": "page",
                "heuristics": [f"Text extraction error: {exc}"]
            })
            return results

        engine = self._get_rule_engine()
        if engine is None:
            return results

        matches_for_record: List[Dict[str, Any]] = []
        matched_phrases: List[str] = []  # order-preserving
        seen_phrases = set()
        # How many characters to show for the preview snippet
        preview_len = getattr(settings.ui, "snippet_preview_len", 200)

        try:
            # 1) Regex rules over full page text
            for r in engine.rules:
                if getattr(r, "category", None) != "text":
                    continue
                rtype = getattr(r, "rule_type", None)
                if rtype == "regex":
                    ok, _reason = r.run(text)
                    if not ok:
                        continue
                    # Try to pull matched words/phrases
                    compiled = getattr(r, "_compiled_regex", None)
                    if compiled is None and getattr(r, "pattern", None):
                        try:
                            compiled = re.compile(r.pattern, re.IGNORECASE)
                        except re.error:
                            compiled = None
                    # Collect a few (deduped) matched phrases
                    if compiled is not None:
                        # limit per rule to avoid flooding
                        per_rule_count = 0
                        for m in compiled.finditer(text):
                            phrase = m.group(0).strip()
                            if phrase and phrase not in seen_phrases:
                                matched_phrases.append(phrase)
                                seen_phrases.add(phrase)
                                per_rule_count += 1
                            if per_rule_count >= 5:  # cap per rule
                                break
                    matches_for_record.append({
                        "name": getattr(r, "name", "unknown_rule"),
                        "description": getattr(r, "description", "") or "",
                        "severity": getattr(r, "severity", None),
                        "tags": getattr(r, "tags", None),
                    })
                elif rtype == "function":
                    # Optional: function-style rules can inspect the full text
                    facts = {"text": text, "category": "text"}
                    ok, reason = r.run(facts)
                    if ok:
                        matches_for_record.append({
                            "name": getattr(r, "name", "unknown_rule"),
                            "description": (reason or "") or getattr(r, "description", ""),
                            "severity": getattr(r, "severity", None),
                            "tags": getattr(r, "tags", None),
                        })
            if matches_for_record:
                # Build the snippet from matched words/phrases
                joined = ", ".join(matched_phrases) if matched_phrases else ""
                if len(joined) > preview_len:
                    joined = joined[:preview_len] + "…"
                record: Dict[str, Any] = {
                    "type": "page",
                    "content_snippet": joined or None,
                    "rules": matches_for_record,
                }
                results.append(record)
        except Exception as exc:
            results.append({
                "type": "page",
                "heuristics": [f"Text analysis error: {exc}"]
            })
        return results

    # -----------------------------------------------------------------------
    # Fetcher / Orchestrator
    # -----------------------------------------------------------------------
@@ -458,12 +530,15 @@ class Browser:
        # Read back saved source
        html_content = source_path.read_text(encoding="utf-8")

        # Forms analysis (per-form rule checks)
        # Forms analysis
        forms_info = self.analyze_forms(html_content, final_url)

        # Scripts artifacts (no detection here)
        # Scripts artifacts
        suspicious_scripts = self.analyze_scripts(html_content, base_url=final_url)

        # Suspicious text
        flagged_text = self.analyze_text(html_content)

        # Enrichment
        enrichment = enrich_url(url, fetch_ssl_enabled)
@@ -486,7 +561,8 @@ class Browser:
"scripts": scripts_seen,
"forms": forms_info,
"suspicious_scripts": suspicious_scripts,
"rule_checks": rule_checks_overview, # table-ready for UI
"suspicious_text":flagged_text,
"rule_checks": rule_checks_overview,
"enrichment": enrichment
}
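
A quick smoke test for the new analyzer (a sketch, not part of this commit; it
assumes `Browser()` can be constructed without a live Playwright session and
that some text rule matches the sample phrase):

    html = (
        "<html><body>"
        "<p>Please verify your identity to unlock the document.</p>"
        "<script>ignored()</script>"
        "</body></html>"
    )
    browser = Browser()
    for record in browser.analyze_text(html):
        print(record["type"], record.get("content_snippet"))
        for rule in record.get("rules", []):
            print(" -", rule["name"], rule.get("severity"))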