feat(text): add text analysis pipeline & surface results in UI
- browser: add analyze_text() to extract visible page text and evaluate
  category="text" rules; collect matched phrases and expose them as
  `content_snippet` (deduped, length-capped via settings.ui.snippet_preview_len).
- browser: remove the unused `run_rule_checks()` helper.
- browser: remove a duplicate enrichment call.
- engine: improve regex compilation: honor per-rule flags (given as a single name
  or a list of names) and default to IGNORECASE when category == "text"
  (see the sketch after this list).
- engine: add dispatch logging "[engine] applying categories: …" gated by
  settings.app.log_rule_dispatch.
- ui(templates): add `templates/partials/result_text.html` mirroring the forms
table; renders page-level records and their matched rules.
- ui(controller): wire `analyze_text()` into the scan path and expose
  `payload["suspicious_text"]` (a rough wiring sketch follows the result shape below).
- rules(text): add `identity_verification_prompt`, `gated_document_access`,
`email_collection_prompt`; broaden `credential_reset`.
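
The engine-side flag handling and dispatch logging are not part of this diff. As an
illustration only, assuming rules expose `pattern`, `flags`, and `category` attributes
(the same names Browser.analyze_text() reads via getattr), the flag resolution could
look roughly like this; the helper name and signature are placeholders, not the
engine's actual API:

    import re
    from typing import Iterable, Optional, Union

    _FLAG_NAMES = {
        "IGNORECASE": re.IGNORECASE,
        "MULTILINE": re.MULTILINE,
        "DOTALL": re.DOTALL,
        "VERBOSE": re.VERBOSE,
    }

    def compile_rule_pattern(pattern: str,
                             flags: Union[str, Iterable[str], None],
                             category: str) -> Optional[re.Pattern]:
        """Honor per-rule flags given as a single name or a list of names."""
        if isinstance(flags, str):
            flags = [flags]
        resolved = 0
        for name in flags or []:
            resolved |= _FLAG_NAMES.get(str(name).upper(), 0)  # unknown names are ignored
        if not resolved and category == "text":
            resolved = re.IGNORECASE  # default for text rules with no explicit flags
        try:
            return re.compile(pattern, resolved)
        except re.error:
            return None  # malformed pattern; the caller can skip the rule

The dispatch log mentioned above would then be a simple gated print, e.g.
`if settings.app.log_rule_dispatch: print(f"[engine] applying categories: {categories}")`.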
fix: text indicators were not displayed because the analyzer was missing and the result shape did not match what the UI expected.
Result shape:

  suspicious_text: [
    {
      "type": "page",
      "content_snippet": "…matched phrases…",
      "rules": [
        {"name": "...", "description": "...", "severity": "medium", "tags": ["..."]}
      ]
    }
  ]
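
A rough sketch of the controller-side wiring described above; the module path, route,
template name, and `fetch_page` entry point are assumptions for illustration only:

    from flask import Flask, render_template, request

    from app.browser import Browser  # module path is an assumption

    app = Flask(__name__)
    browser = Browser()

    @app.route("/scan", methods=["POST"])  # hypothetical route
    def scan():
        url = request.form["url"]
        payload = browser.fetch_page(url)  # placeholder for the orchestrator entry point
        return render_template(
            "results.html",  # placeholder page that includes templates/partials/result_text.html
            payload=payload,
            suspicious_text=payload.get("suspicious_text", []),
        )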
@@ -29,6 +29,7 @@ from typing import Any, Dict, List, Optional
from urllib.parse import urlparse

from bs4 import BeautifulSoup
import re
from flask import current_app
from playwright.async_api import async_playwright, TimeoutError as PWTimeoutError

@@ -85,64 +86,6 @@ class Browser:
            index = index + 1
        return summary

    def run_rule_checks(self, text: str, category: str) -> Dict[str, Any]:
        """
        Run all rules for a given category against provided text, returning a table-friendly model.

        Args:
            text: Text to analyze (HTML, snippet, etc.)
            category: One of 'form', 'script', 'text' (or any category your rules use)

        Returns:
            {
                "checks": [
                    { "name": str, "description": str, "category": str,
                      "result": "PASS"|"FAIL", "reason": Optional[str],
                      "severity": Optional[str], "tags": Optional[List[str]] }, ...
                ],
                "summary": { "fail_count": int, "total_rules": int }
            }
        """
        out: Dict[str, Any] = {"checks": [], "summary": {"fail_count": 0, "total_rules": 0}}
        engine = self._get_rule_engine()

        if engine is None:
            return out

        try:
            engine_results = engine.run_all(text, category=category)  # list of dicts
            index = 0
            total = len(engine_results)
            while index < total:
                item = engine_results[index]
                normalized = {
                    "name": item.get("name"),
                    "description": item.get("description"),
                    "category": item.get("category"),
                    "result": item.get("result"),  # "PASS" | "FAIL"
                    "reason": item.get("reason"),  # present on FAIL by engine design
                    "severity": item.get("severity"),
                    "tags": item.get("tags"),
                }
                out["checks"].append(normalized)
                index = index + 1

            out["summary"] = self._summarize_results(out["checks"])
        except Exception as exc:
            # Preserve shape; record the error as a synthetic PASS (so UI doesn't break)
            out["checks"].append({
                "name": "engine_error",
                "description": "Rule engine failed during evaluation",
                "category": category,
                "result": "PASS",
                "reason": f"{exc}",
                "severity": None,
                "tags": None
            })
            out["summary"] = {"fail_count": 0, "total_rules": 1}

        return out

    def build_rule_checks_overview(self, full_html_text: str) -> List[Dict[str, Any]]:
        """
        Build a top-level overview for the results page: runs each category across
@@ -376,6 +319,135 @@ class Browser:

        return results

    def analyze_text(self, html: str) -> List[Dict[str, Any]]:
        """
        Extract visible page text and evaluate text rules.
        Only include rows that matched at least one rule.

        Returns a list with 0..1 records shaped like:
            {
                "type": "page",
                "content_snippet": "<matched words/phrases joined>",
                "rules": [
                    {"name": "...", "description": "...", "severity": "...", "tags": [...]},
                    ...
                ],
            }
        """
        results: List[Dict[str, Any]] = []

        # Short-circuit on missing html
        if not html:
            return results

        # Extract visible text (strip scripts/styles)
        try:
            soup = BeautifulSoup(html, "lxml")
            for tag in soup(["script", "style", "noscript", "template"]):
                tag.decompose()
            # Basic hidden cleanup (best-effort)
            for el in soup.select('[hidden], [aria-hidden="true"]'):
                el.decompose()

            text = soup.get_text(separator=" ", strip=True)
            if not text:
                return results

            # Normalize whitespace so regexes behave consistently
            text = re.sub(r"\s+", " ", text).strip()

        except Exception as exc:
            # Keep consistency with the other analyzers
            results.append({
                "type": "page",
                "heuristics": [f"Text extraction error: {exc}"]
            })
            return results

        engine = self._get_rule_engine()
        if engine is None:
            return results

        matches_for_record: List[Dict[str, Any]] = []
        matched_phrases: List[str] = []  # order-preserving
        seen_phrases = set()

        # How many characters to show for the preview snippet
        preview_len = getattr(settings.ui, "snippet_preview_len", 200)

        try:
            # 1) Regex rules over full page text
            for r in engine.rules:
                if getattr(r, "category", None) != "text":
                    continue

                rtype = getattr(r, "rule_type", None)
                if rtype == "regex":
                    ok, _reason = r.run(text)
                    if not ok:
                        continue

                    # Try to pull matched words/phrases
                    compiled = getattr(r, "_compiled_regex", None)
                    if compiled is None and getattr(r, "pattern", None):
                        try:
                            compiled = re.compile(r.pattern, re.IGNORECASE)
                        except re.error:
                            compiled = None

                    # Collect a few (deduped) matched phrases
                    if compiled is not None:
                        # limit per rule to avoid flooding
                        per_rule_count = 0
                        for m in compiled.finditer(text):
                            phrase = m.group(0).strip()
                            if phrase and phrase not in seen_phrases:
                                matched_phrases.append(phrase)
                                seen_phrases.add(phrase)
                                per_rule_count += 1
                            if per_rule_count >= 5:  # cap per rule
                                break

                    matches_for_record.append({
                        "name": getattr(r, "name", "unknown_rule"),
                        "description": getattr(r, "description", "") or "",
                        "severity": getattr(r, "severity", None),
                        "tags": getattr(r, "tags", None),
                    })

                elif rtype == "function":
                    # Optional: function-style rules can inspect the full text
                    facts = {"text": text, "category": "text"}
                    ok, reason = r.run(facts)
                    if ok:
                        matches_for_record.append({
                            "name": getattr(r, "name", "unknown_rule"),
                            "description": (reason or "") or getattr(r, "description", ""),
                            "severity": getattr(r, "severity", None),
                            "tags": getattr(r, "tags", None),
                        })

            if matches_for_record:
                # Build the snippet from matched words/phrases
                joined = " … ".join(matched_phrases) if matched_phrases else ""
                if len(joined) > preview_len:
                    joined = joined[:preview_len] + "…"

                record: Dict[str, Any] = {
                    "type": "page",
                    "content_snippet": joined or None,
                    "rules": matches_for_record,
                }
                results.append(record)

        except Exception as exc:
            results.append({
                "type": "page",
                "heuristics": [f"Text analysis error: {exc}"]
            })

        return results

    # -----------------------------------------------------------------------
    # Fetcher / Orchestrator
    # -----------------------------------------------------------------------
@@ -458,12 +530,15 @@ class Browser:
        # Read back saved source
        html_content = source_path.read_text(encoding="utf-8")

        # Forms analysis (per-form rule checks)
        # Forms analysis
        forms_info = self.analyze_forms(html_content, final_url)

        # Scripts artifacts (no detection here)
        # Scripts artifacts
        suspicious_scripts = self.analyze_scripts(html_content, base_url=final_url)

        # Suspicious text
        flagged_text = self.analyze_text(html_content)

        # Enrichment
        enrichment = enrich_url(url, fetch_ssl_enabled)

@@ -486,7 +561,8 @@ class Browser:
            "scripts": scripts_seen,
            "forms": forms_info,
            "suspicious_scripts": suspicious_scripts,
            "rule_checks": rule_checks_overview,  # table-ready for UI
            "suspicious_text": flagged_text,
            "rule_checks": rule_checks_overview,
            "enrichment": enrichment
        }

@@ -63,7 +63,9 @@ class AppConfig:
    name: str = "MyApp"
    version_major: int = 1
    version_minor: int = 0
    print_rule_loads: bool = False
    log_rule_loads: bool = False
    log_rule_dispatch: bool = False
    log_rule_debug: bool = False


@dataclass