feat(text): add text analysis pipeline & surface results in UI

- browser: add analyze_text() to extract visible page text and evaluate
  category="text" rules; collect matched phrases and expose as
  `content_snippet` (deduped, length-capped via settings.ui.snippet_preview_len).
- engine: removed unused code
- browser: removed double call for enrichment
- engine: improve regex compilation — honor per-rule flags (string or list)
  and default IGNORECASE when category=="text".
- engine: add dispatch logging "[engine] applying categories: …" gated by
  settings.app.log_rule_dispatch.
- ui(templates): add `templates/partials/result_text.html` mirroring the forms
  table; renders page-level records and their matched rules.
- ui(controller): wire `analyze_text()` into scan path and expose
  `payload["suspicious_text"]`.
- rules(text): add `identity_verification_prompt`, `gated_document_access`,
  `email_collection_prompt`; broaden `credential_reset`.

fix: text indicators were not displayed due to missing analyzer and mismatched result shape.

Result shape:
  suspicious_text: [
    {
      "type": "page",
      "content_snippet": "...matched phrases…",
      "rules": [
        {"name": "...", "description": "...", "severity": "medium", "tags": ["..."]}
      ]
    }
  ]
This commit is contained in:
2025-08-22 17:18:50 -05:00
parent af253c858c
commit 55cd81aec0
13 changed files with 422 additions and 115 deletions

View File

@@ -4,10 +4,10 @@ from pathlib import Path
from flask import Flask from flask import Flask
# Local imports # Local imports
from .utils.settings import get_settings from app.utils.settings import get_settings
from .logging_setup import wire_logging_once, get_app_logger, get_engine_logger from app.logging_setup import wire_logging_once, get_app_logger
from app.blueprints.ui import bp as main_bp # ui blueprint from app.blueprints.main import bp as main_bp # ui blueprint
from app.blueprints.api import api_bp as api_bp # api blueprint from app.blueprints.api import api_bp as api_bp # api blueprint
from app.blueprints.roadmap import bp as roadmap_bp # roadmap from app.blueprints.roadmap import bp as roadmap_bp # roadmap

View File

@@ -134,15 +134,6 @@ def analyze():
app_logger.error(f"Analysis failed for {url}: {e}") app_logger.error(f"Analysis failed for {url}: {e}")
return redirect(url_for("main.index")) return redirect(url_for("main.index"))
# Add enrichment safely
try:
enrichment = enrich_url(url)
result["enrichment"] = enrichment
app_logger.info(f"[+] Enrichment added for {url}")
except Exception as e:
result["enrichment"] = {}
app_logger.warning(f"[!] Enrichment failed for {url}: {e}")
# Redirect to permalink page for this run # Redirect to permalink page for this run
return redirect(url_for("main.view_result", run_uuid=result["uuid"])) return redirect(url_for("main.view_result", run_uuid=result["uuid"]))

View File

@@ -2,7 +2,15 @@ app:
name: SneakyScope name: SneakyScope
version_major: 0 version_major: 0
version_minor: 1 version_minor: 1
print_rule_loads: True
# logs when rules are loaded
log_rule_loads: False
# logs each category of rule ran
log_rule_dispatch: False
# logs rule pass/fail per rule
log_rule_debug: False
cache: cache:
recent_runs_count: 10 recent_runs_count: 10

View File

@@ -96,39 +96,49 @@
severity: high severity: high
tags: [credentials, form] tags: [credentials, form]
# --- Text Rules (Social Engineering / BEC) --- # --- Text Rules (Social Engineering / BEC / Lures) ---
- name: urgent_request
description: "Language suggesting urgency (common in phishing/BEC)"
category: text
type: regex
pattern: '\b(urgent|immediately|asap|action\s*required|verify\s*now)\b'
severity: medium
tags: [bec, urgency]
- name: account_suspension - name: identity_verification_prompt
description: "Threat of account suspension/closure" description: "Prompts to verify identity/account/email, often gating access"
category: text category: text
type: regex type: regex
pattern: '\b(account\s*(suspend|closure|close)|verify\s*account)\b' # e.g., "verify your identity", "confirm your email", "validate account"
pattern: '\b(verify|confirm|validate)\s+(?:your\s+)?(identity|account|email)\b'
flags: [i]
severity: medium severity: medium
tags: [bec, scare-tactics] tags: [bec, verification, gating]
- name: financial_request - name: gated_document_access
description: "Request for gift cards, wire transfer, or money" description: "Language gating document access behind an action"
category: text category: text
type: regex type: regex
pattern: '\b(gift\s*card|wire\s*transfer|bank\s*account|bitcoin|crypto|payment\s*required)\b' # e.g., "access your secure document", "unlock document", "view document" + action verbs nearby
severity: high pattern: '(secure|confidential)\s+document|access\s+(?:the|your)?\s*document|unlock\s+document'
tags: [bec, financial] flags: [i]
severity: medium
tags: [lure, document]
- name: email_collection_prompt
  description: "Explicit prompt to enter/provide an email address to proceed"
  category: text
  type: regex
  # e.g., "enter your email address", "provide email", "use your email to continue"
  # The optional " address" suffix carries its own leading whitespace so that a
  # bare "email" followed by punctuation or end-of-text still matches.
  pattern: '\b(enter|provide|use)\s+(?:your\s+)?email(?:\s+address)?\b'
  flags: [i]
  severity: low
  tags: [data-collection, email]
- name: credential_reset - name: credential_reset
description: "Password reset or credential reset wording" description: "Password/credential reset or login-to-continue wording"
category: text category: text
type: regex type: regex
pattern: '\b(reset\s*password|update\s*credentials|log\s*in\s*to\s*verify|password\s*expiry)\b' # includes: reset password, update credentials, log in to (verify|view|access), password expiry/expiration
pattern: '\b(reset\s*password|update\s*credentials|log\s*in\s*to\s*(?:verify|view|access)|password\s*(?:expiry|expiration|expires))\b'
flags: [i]
severity: medium severity: medium
tags: [bec, credentials] tags: [bec, credentials]
- name: suspicious_iframe - name: suspicious_iframe
description: "Iframe tag present (possible phishing/malvertising/drive-by)" description: "Iframe tag present (possible phishing/malvertising/drive-by)"
category: text category: text

View File

@@ -3,7 +3,8 @@ rules_engine.py
""" """
import re import re
import logging import unicodedata
from collections import Counter
from dataclasses import dataclass, asdict, field from dataclasses import dataclass, asdict, field
from pathlib import Path from pathlib import Path
from typing import Callable, Dict, List, Optional, Tuple, Union from typing import Callable, Dict, List, Optional, Tuple, Union
@@ -11,6 +12,18 @@ from typing import Callable, Dict, List, Optional, Tuple, Union
from app.logging_setup import get_engine_logger from app.logging_setup import get_engine_logger
from app.utils.settings import get_settings from app.utils.settings import get_settings
import re as _re
# Maps the flag spellings accepted on a rule (single letters or long names,
# looked up lower-cased) to the corresponding `re` module flag constants.
#
# LOCALE ("l"/"locale") is deliberately not offered: rule patterns are str, and
# re.compile() raises ValueError when LOCALE is combined with a str pattern.
# Leaving it out means such a flag is reported as unknown instead of aborting
# regex compilation.
# NOTE(review): "a" and "u" are both accepted here but cannot be combined on one
# rule (re.compile() rejects ASCII together with UNICODE).
FLAG_MAP = {
    "i": _re.IGNORECASE, "ignorecase": _re.IGNORECASE,
    "m": _re.MULTILINE, "multiline": _re.MULTILINE,
    "s": _re.DOTALL, "dotall": _re.DOTALL, "singleline": _re.DOTALL,
    "x": _re.VERBOSE, "verbose": _re.VERBOSE,
    "a": _re.ASCII, "ascii": _re.ASCII,
    "u": _re.UNICODE, "unicode": _re.UNICODE,
}
settings = get_settings() settings = get_settings()
import yaml import yaml
@@ -49,21 +62,65 @@ class Rule:
""" """
Compile the regex pattern once for performance, if applicable. Compile the regex pattern once for performance, if applicable.
Behavior:
- Uses flags specified on the rule (list like ['i','m'] or a string like 'im').
- If the rule category is 'text' and no 'i' flag is set, defaults to IGNORECASE.
- Stores the compiled object on self._compiled_regex.
Returns: Returns:
bool: True if the regex is compiled and ready, False otherwise. bool: True if the regex is compiled and ready, False otherwise.
""" """
if getattr(self, "rule_type", None) != "regex" or not getattr(self, "pattern", None):
return False
re_flags = 0
# Collect flags from the rule, if any (supports "ims" or ["i","m","s"])
raw_flags = getattr(self, "flags", None)
if isinstance(raw_flags, str):
for ch in raw_flags:
mapped = FLAG_MAP.get(ch.lower())
if mapped is not None:
re_flags |= mapped
else:
logger.warning("[Rule] Unknown regex flag %r on rule '%s'", ch, getattr(self, "name", "?"))
elif isinstance(raw_flags, (list, tuple, set)):
for fl in raw_flags:
key = str(fl).lower()
mapped = FLAG_MAP.get(key)
if mapped is not None:
re_flags |= mapped
else:
logger.warning("[Rule] Unknown regex flag %r on rule '%s'", fl, getattr(self, "name", "?"))
# Default IGNORECASE for text rules if not explicitly provided
cat = (getattr(self, "category", "") or "").lower().strip()
if cat == "text" and not (re_flags & _re.IGNORECASE):
re_flags |= _re.IGNORECASE
if self.rule_type == "regex" and self.pattern:
try: try:
self._compiled_regex = re.compile(self.pattern, re.IGNORECASE) self._compiled_regex = _re.compile(self.pattern, re_flags)
logger.debug(f"[Rule] Compiled regex for '{self.name}'")
# Build a compact flag summary inline (e.g., 'ims' or '-' if none)
flag_parts = []
if re_flags & _re.IGNORECASE: flag_parts.append("i")
if re_flags & _re.MULTILINE: flag_parts.append("m")
if re_flags & _re.DOTALL: flag_parts.append("s")
if re_flags & _re.VERBOSE: flag_parts.append("x")
if re_flags & _re.ASCII: flag_parts.append("a")
if re_flags & _re.UNICODE: flag_parts.append("u")
if re_flags & _re.LOCALE: flag_parts.append("l")
flag_summary = "".join(flag_parts) if flag_parts else "-"
logger.info("[Rule] Compiled regex for '%s' (flags=%s)", getattr(self, "name", "?"), flag_summary)
return True return True
except re.error as rex:
except _re.error as rex:
self._compiled_regex = None self._compiled_regex = None
logger.warning(f"[Rule] Failed to compile regex for '{self.name}': {rex}") logger.warning("[Rule] Failed to compile regex for '%s': %s", getattr(self, "name", "?"), rex)
return False
return False return False
def run(self, text: str) -> Tuple[bool, str]: def run(self, text: str) -> Tuple[bool, str]:
""" """
Run the rule on the given text. Run the rule on the given text.
@@ -198,7 +255,7 @@ class RuleEngine:
) )
return return
if settings.app.print_rule_loads: if settings.app.log_rule_loads:
logger.info( logger.info(
"[engine] add_rule: %s/%s replace=%s -> count=%d", "[engine] add_rule: %s/%s replace=%s -> count=%d",
rule.category, rule.name, bool(replace), len(self._rules) rule.category, rule.name, bool(replace), len(self._rules)
@@ -230,6 +287,14 @@ class RuleEngine:
self.add_rule(rules[i], replace=replace) self.add_rule(rules[i], replace=replace)
i = i + 1 i = i + 1
def _normalize_for_text_rules(self, s: str) -> str:
if not s:
return ""
s = unicodedata.normalize("NFKC", s)
# collapse whitespace; keeps word boundaries sensible
s = _re.sub(r"\s+", " ", s).strip()
return s
def run_all(self, text: str, category: Optional[str] = None) -> List[Dict]: def run_all(self, text: str, category: Optional[str] = None) -> List[Dict]:
""" """
Run all rules against text. Run all rules against text.
@@ -241,6 +306,30 @@ class RuleEngine:
Returns: Returns:
List of dicts with PASS/FAIL per rule (JSON-serializable). List of dicts with PASS/FAIL per rule (JSON-serializable).
""" """
# --- dispatch visibility --- if set to true, we log applied categories
if getattr(settings.app, "log_rule_dispatch", False):
all_cats = [r.category for r in self._rules]
cat_counts = Counter(all_cats)
# Which categories are being applied this run?
if category is None:
selected_categories = sorted(cat_counts.keys())
else:
selected_categories = [category]
# How many rules match the selection?
selected_rule_count = sum(1 for r in self._rules if r.category in selected_categories)
try:
logger.info(
"[engine] applying categories: %s | selected_rules=%d | totals=%s",
",".join(selected_categories),
selected_rule_count,
dict(cat_counts),
)
except Exception:
pass
# --- end dispatch visibility ---
results: List[Dict] = [] results: List[Dict] = []
index = 0 index = 0
@@ -248,12 +337,20 @@ class RuleEngine:
while index < total: while index < total:
rule = self.rules[index] rule = self.rules[index]
# if we are running a text rule, let's normalize the text.
if category == "text":
text = self._normalize_for_text_rules(text)
if category is not None and rule.category != category: if category is not None and rule.category != category:
index = index + 1 index = index + 1
continue continue
matched, reason = rule.run(text) matched, reason = rule.run(text)
# very fine-grained trace per rule:
if getattr(settings.app, "log_rule_debug", False):
logger.info(f"[engine] eval: cat:{rule.category} - rule:{rule.name} - result: {matched} - reason:{reason}" )
result_str = "FAIL" if matched else "PASS" result_str = "FAIL" if matched else "PASS"
reason_to_include: Optional[str] reason_to_include: Optional[str]
if matched: if matched:

View File

@@ -54,5 +54,5 @@
<p class="text-sm text-gray-500">No enrichment data available.</p> <p class="text-sm text-gray-500">No enrichment data available.</p>
{% endif %} {% endif %}
<p class="mt-2"><a href="#top-jump-list" class="text-sm text-gray-400 hover:text-blue-400">Back to top</a></p> <p class="mt-2"><a href="#url-overview" class="text-sm text-gray-400 hover:text-blue-400">Back to top</a></p>
</section> </section>

View File

@@ -109,5 +109,5 @@
<p class="text-sm text-gray-500">No form issues detected.</p> <p class="text-sm text-gray-500">No form issues detected.</p>
{% endif %} {% endif %}
<p class="mt-2"><a href="#top-jump-list" class="text-sm text-gray-400 hover:text-blue-400">Back to top</a></p> <p class="mt-2"><a href="#url-overview" class="text-sm text-gray-400 hover:text-blue-400">Back to top</a></p>
</section> </section>

View File

@@ -116,5 +116,5 @@
<p class="text-sm text-gray-500">No suspicious scripts detected.</p> <p class="text-sm text-gray-500">No suspicious scripts detected.</p>
{% endif %} {% endif %}
<p class="mt-2"><a href="#top-jump-list" class="text-sm text-gray-400 hover:text-blue-400">Back to top</a></p> <p class="mt-2"><a href="#url-overview" class="text-sm text-gray-400 hover:text-blue-400">Back to top</a></p>
</section> </section>

View File

@@ -193,7 +193,7 @@
</details> </details>
{% endif %} {% endif %}
<p class="mt-2"><a href="#top-jump-list" class="text-sm text-gray-400 hover:text-blue-400">Back to top</a></p> <p class="mt-2"><a href="#url-overview" class="text-sm text-gray-400 hover:text-blue-400">Back to top</a></p>
</div> </div>
</section> </section>
{% endmacro %} {% endmacro %}

View File

@@ -0,0 +1,120 @@
<!-- /templates/partials/result_text.html -->
<section id="sus_text" class="card">
<h2 class="text-lg font-semibold mb-3">Suspicious Text</h2>
{% if suspicious_text and suspicious_text|length > 0 %}
<div class="overflow-x-auto">
<table class="w-full table-fixed text-sm"> <!-- matches forms table style -->
<colgroup>
<col class="w-[10%]"> <!-- Source -->
<col class="w-[10%]"> <!-- Indicators -->
<col class="w-[15%]"> <!-- Tags -->
<col class="w-[45%]"> <!-- Matches (Rules) -->
<col class="w-[25%]"> <!-- Text Snippet -->
</colgroup>
<thead class="text-gray-400 border-b border-gray-800">
<tr>
<th class="text-left py-2 pr-4 whitespace-normal break-words">Source</th>
<th class="text-left py-2 pr-4 whitespace-normal break-words">Indicators</th>
<th class="text-left py-2 pr-4 whitespace-normal break-words">Tags</th>
<th class="text-left py-2 pr-4 whitespace-normal break-words">Matches (Rules)</th>
<th class="text-left py-2 pr-4 whitespace-normal break-words">Text Snippet</th>
</tr>
</thead>
<tbody>
{% for rec in suspicious_text %}
<tr class="border-b border-gray-900 align-top">
<!-- Source -->
<td class="py-2 pr-4 break-words">
{{ (rec.type or 'page')|title }}
</td>
<!-- Indicators (count of rules matched) -->
<td class="py-2 pr-4 whitespace-nowrap">
{{ rec.rules|length if rec.rules else 0 }}
</td>
<!-- Tags (unique across rules) -->
<td class="py-2 pr-4 break-words">
{% set ns = namespace(tags=[]) %}
{% if rec.rules %}
{% for r in rec.rules %}
{% if r.tags %}
{% for t in r.tags %}
{% if t not in ns.tags %}
{% set ns.tags = ns.tags + [t] %}
{% endif %}
{% endfor %}
{% endif %}
{% endfor %}
{% endif %}
{% if ns.tags and ns.tags|length > 0 %}
<div class="flex flex-wrap gap-1">
{% for t in ns.tags %}
<span class="chip" title="Tag: {{ t }}">{{ t }}</span>
{% endfor %}
</div>
{% else %}
<span class="chip">None</span>
{% endif %}
</td>
<!-- Matches (Rules) -->
<td class="py-2 pr-4 break-words">
{% if rec.rules and rec.rules|length > 0 %}
<ul class="space-y-1">
{% for r in rec.rules %}
<li title="{{ r.description or '' }}">
{{ r.name }}
{% if r.severity %}
{% set sev = r.severity|lower %}
<span class="ml-2 rounded-full px-2 py-0.5 text-xs border
{% if sev == 'high' %} badge badge-danger
{% elif sev == 'medium' %} badge badge-warn
{% else %} badge badge-info {% endif %}">
{{ r.severity|title }}
</span>
{% endif %}
{% if r.tags %}
{% for t in r.tags %}
<span class="chip" title="Tag: {{ t }}">{{ t }}</span>
{% endfor %}
{% endif %}
{% if r.description %}
<small class="text-gray-400"> — {{ r.description }}</small>
{% endif %}
</li>
{% endfor %}
</ul>
{% else %}
<span class="text-gray-500">N/A</span>
{% endif %}
</td>
<!-- Text Snippet (matched phrases; let column width control it) -->
<td class="py-2 pr-4 align-top">
{% if rec.content_snippet %}
<details>
<summary class="cursor-pointer text-blue-300 hover:underline">
View snippet ({{ rec.content_snippet|length }} chars)
</summary>
<pre class="mt-1 bg-[#0b0f14] border border-gray-800 rounded-lg p-3
w-full max-w-full overflow-auto max-h-64
whitespace-pre-wrap break-words font-mono text-xs">{{ rec.content_snippet }}</pre>
</details>
{% else %}
<span class="text-gray-500">N/A</span>
{% endif %}
</td>
</tr>
{% endfor %}
</tbody>
</table>
</div>
{% else %}
<p class="text-sm text-gray-500">No text issues detected.</p>
{% endif %}
<p class="mt-2"><a href="#url-overview" class="text-sm text-gray-400 hover:text-blue-400">Back to top</a></p>
</section>

View File

@@ -15,6 +15,7 @@
<a href="#redirects" class="px-2 py-1 rounded border border-gray-700 hover:bg-gray-800">Redirects</a> <a href="#redirects" class="px-2 py-1 rounded border border-gray-700 hover:bg-gray-800">Redirects</a>
<a href="#forms" class="px-2 py-1 rounded border border-gray-700 hover:bg-gray-800">Forms</a> <a href="#forms" class="px-2 py-1 rounded border border-gray-700 hover:bg-gray-800">Forms</a>
<a href="#scripts" class="px-2 py-1 rounded border border-gray-700 hover:bg-gray-800">Suspicious Scripts</a> <a href="#scripts" class="px-2 py-1 rounded border border-gray-700 hover:bg-gray-800">Suspicious Scripts</a>
<a href="#sus_text" class="px-2 py-1 rounded border border-gray-700 hover:bg-gray-800">Suspicious Text</a>
<a href="#screenshot" class="px-2 py-1 rounded border border-gray-700 hover:bg-gray-800">Screenshot</a> <a href="#screenshot" class="px-2 py-1 rounded border border-gray-700 hover:bg-gray-800">Screenshot</a>
<a href="#source" class="px-2 py-1 rounded border border-gray-700 hover:bg-gray-800">Source</a> <a href="#source" class="px-2 py-1 rounded border border-gray-700 hover:bg-gray-800">Source</a>
</div> </div>
@@ -35,7 +36,7 @@
{{ request.host_url }}results/{{ uuid }} {{ request.host_url }}results/{{ uuid }}
</a> </a>
</p> </p>
<p><a href="#top-jump-list" class="text-sm text-gray-400 hover:text-blue-400">Back to top</a></p> <p><a href="#url-overview" class="text-sm text-gray-400 hover:text-blue-400">Back to top</a></p>
</div> </div>
</section> </section>
@@ -73,23 +74,25 @@
{% else %} {% else %}
<p class="text-sm text-gray-500">No redirects detected.</p> <p class="text-sm text-gray-500">No redirects detected.</p>
{% endif %} {% endif %}
<p class="mt-2"><a href="#top-jump-list" class="text-sm text-gray-400 hover:text-blue-400">Back to top</a></p> <p class="mt-2"><a href="#url-overview" class="text-sm text-gray-400 hover:text-blue-400">Back to top</a></p>
</section> </section>
<!-- Forms --> <!-- Forms -->
{% include "partials/result_forms.html" %} {% include "partials/result_forms.html" %}
<!-- Suspicious Scripts --> <!-- Suspicious Scripts -->
{% include "partials/result_scripts.html" %} {% include "partials/result_scripts.html" %}
<!-- Suspicious Text -->
{% include "partials/result_text.html" with context %}
<!-- Screenshot --> <!-- Screenshot -->
<section id="screenshot" class="bg-card border border-gray-800 rounded-xl p-4"> <section id="screenshot" class="bg-card border border-gray-800 rounded-xl p-4">
<h2 class="text-lg font-semibold mb-3">Screenshot</h2> <h2 class="text-lg font-semibold mb-3">Screenshot</h2>
<img src="{{ url_for('main.artifacts', run_uuid=uuid, filename='screenshot.png') }}" <img src="{{ url_for('main.artifacts', run_uuid=uuid, filename='screenshot.png') }}"
alt="Screenshot" alt="Screenshot"
class="w-full rounded-lg border border-gray-800"> class="w-full rounded-lg border border-gray-800">
<p class="mt-2"><a href="#top-jump-list" class="text-sm text-gray-400 hover:text-blue-400">Back to top</a></p> <p class="mt-2"><a href="#url-overview" class="text-sm text-gray-400 hover:text-blue-400">Back to top</a></p>
</section> </section>
<!-- Source --> <!-- Source -->
@@ -102,7 +105,7 @@
View Source View Source
</a> </a>
</p> </p>
<p class="mt-2"><a href="#top-jump-list" class="text-sm text-gray-400 hover:text-blue-400">Back to top</a></p> <p class="mt-2"><a href="#url-overview" class="text-sm text-gray-400 hover:text-blue-400">Back to top</a></p>
</section> </section>
</div> </div>

View File

@@ -29,6 +29,7 @@ from typing import Any, Dict, List, Optional
from urllib.parse import urlparse from urllib.parse import urlparse
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
import re
from flask import current_app from flask import current_app
from playwright.async_api import async_playwright, TimeoutError as PWTimeoutError from playwright.async_api import async_playwright, TimeoutError as PWTimeoutError
@@ -85,64 +86,6 @@ class Browser:
index = index + 1 index = index + 1
return summary return summary
def run_rule_checks(self, text: str, category: str) -> Dict[str, Any]:
    """
    Evaluate every rule of one category against `text` and return a
    table-friendly report.

    Args:
        text: Text to analyze (HTML, snippet, etc.)
        category: Rule category to run, e.g. 'form', 'script' or 'text'.

    Returns:
        {
            "checks": [
                { "name": str, "description": str, "category": str,
                  "result": "PASS"|"FAIL", "reason": Optional[str],
                  "severity": Optional[str], "tags": Optional[List[str]] }, ...
            ],
            "summary": { "fail_count": int, "total_rules": int }
        }
        When no rule engine is available the report is returned empty.
    """
    report: Dict[str, Any] = {"checks": [], "summary": {"fail_count": 0, "total_rules": 0}}

    rule_engine = self._get_rule_engine()
    if rule_engine is None:
        return report

    # Keys copied from each engine result into the table row, in display order.
    field_names = ("name", "description", "category", "result", "reason", "severity", "tags")

    try:
        raw_results = rule_engine.run_all(text, category=category)  # list of dicts
        for position in range(len(raw_results)):
            entry = raw_results[position]
            row = {key: entry.get(key) for key in field_names}
            report["checks"].append(row)

        report["summary"] = self._summarize_results(report["checks"])
    except Exception as exc:
        # Keep the report shape intact: the failure is recorded as a synthetic
        # PASS row so the UI still renders.
        report["checks"].append({
            "name": "engine_error",
            "description": "Rule engine failed during evaluation",
            "category": category,
            "result": "PASS",
            "reason": f"{exc}",
            "severity": None,
            "tags": None
        })
        report["summary"] = {"fail_count": 0, "total_rules": 1}

    return report
def build_rule_checks_overview(self, full_html_text: str) -> List[Dict[str, Any]]: def build_rule_checks_overview(self, full_html_text: str) -> List[Dict[str, Any]]:
""" """
Build a top-level overview for the results page: runs each category across Build a top-level overview for the results page: runs each category across
@@ -376,6 +319,135 @@ class Browser:
return results return results
def analyze_text(self, html: str) -> List[Dict[str, Any]]:
    """
    Extract visible page text and evaluate category="text" rules against it.

    Script/style/noscript/template elements and elements marked `hidden` or
    `aria-hidden="true"` are dropped before the text is extracted. Only rules
    whose rule_type is "regex" or "function" are evaluated; a record is
    produced only when at least one rule matched.

    Args:
        html: Page source. A falsy value yields an empty list.

    Returns:
        A list with 0..1 records shaped like:
            {
                "type": "page",
                "content_snippet": "<matched phrases joined by ' … '>" or None,
                "rules": [
                    {"name": "...", "description": "...", "severity": "...", "tags": [...]},
                    ...
                ],
            }
        If extraction or rule evaluation raises, the list instead holds
        {"type": "page", "heuristics": ["<error message>"]}.
    """
    results: List[Dict[str, Any]] = []

    # Short-circuit on missing html
    if not html:
        return results

    # Extract visible text (strip scripts/styles)
    try:
        soup = BeautifulSoup(html, "lxml")
        for tag in soup(["script", "style", "noscript", "template"]):
            tag.decompose()

        # Basic hidden cleanup (best-effort)
        for el in soup.select('[hidden], [aria-hidden="true"]'):
            el.decompose()

        text = soup.get_text(separator=" ", strip=True)
        if not text:
            return results

        # Normalize whitespace so regexes behave consistently
        text = re.sub(r"\s+", " ", text).strip()
    except Exception as exc:
        # Same error-record shape as the text-analysis failure path below
        results.append({
            "type": "page",
            "heuristics": [f"Text extraction error: {exc}"]
        })
        return results

    engine = self._get_rule_engine()
    if engine is None:
        return results

    matches_for_record: List[Dict[str, Any]] = []
    matched_phrases: List[str] = []  # order-preserving
    seen_phrases = set()

    # How many characters to show for the preview snippet
    preview_len = getattr(settings.ui, "snippet_preview_len", 200)

    try:
        for r in engine.rules:
            if getattr(r, "category", None) != "text":
                continue

            rtype = getattr(r, "rule_type", None)

            if rtype == "regex":
                ok, _reason = r.run(text)
                if not ok:
                    continue

                # Prefer the rule's own compiled regex; fall back to compiling
                # the raw pattern case-insensitively if none is cached.
                compiled = getattr(r, "_compiled_regex", None)
                if compiled is None and getattr(r, "pattern", None):
                    try:
                        compiled = re.compile(r.pattern, re.IGNORECASE)
                    except re.error:
                        compiled = None

                # Collect a few (deduped) matched phrases
                if compiled is not None:
                    # cap new phrases per rule to avoid flooding the snippet
                    per_rule_count = 0
                    for m in compiled.finditer(text):
                        phrase = m.group(0).strip()
                        if phrase and phrase not in seen_phrases:
                            matched_phrases.append(phrase)
                            seen_phrases.add(phrase)
                            per_rule_count += 1
                            if per_rule_count >= 5:
                                break

                matches_for_record.append({
                    "name": getattr(r, "name", "unknown_rule"),
                    "description": getattr(r, "description", "") or "",
                    "severity": getattr(r, "severity", None),
                    "tags": getattr(r, "tags", None),
                })

            elif rtype == "function":
                # Function-style rules receive a facts dict with the full text
                facts = {"text": text, "category": "text"}
                ok, reason = r.run(facts)
                if ok:
                    matches_for_record.append({
                        "name": getattr(r, "name", "unknown_rule"),
                        "description": (reason or "") or getattr(r, "description", ""),
                        "severity": getattr(r, "severity", None),
                        "tags": getattr(r, "tags", None),
                    })

        if matches_for_record:
            # Join phrases with a visible separator so distinct matches do not
            # run together, then hard-cap the snippet at preview_len characters.
            joined = " … ".join(matched_phrases)
            if len(joined) > preview_len:
                joined = joined[:preview_len].rstrip()

            record: Dict[str, Any] = {
                "type": "page",
                "content_snippet": joined or None,
                "rules": matches_for_record,
            }
            results.append(record)

    except Exception as exc:
        results.append({
            "type": "page",
            "heuristics": [f"Text analysis error: {exc}"]
        })

    return results
# ----------------------------------------------------------------------- # -----------------------------------------------------------------------
# Fetcher / Orchestrator # Fetcher / Orchestrator
# ----------------------------------------------------------------------- # -----------------------------------------------------------------------
@@ -458,12 +530,15 @@ class Browser:
# Read back saved source # Read back saved source
html_content = source_path.read_text(encoding="utf-8") html_content = source_path.read_text(encoding="utf-8")
# Forms analysis (per-form rule checks) # Forms analysis
forms_info = self.analyze_forms(html_content, final_url) forms_info = self.analyze_forms(html_content, final_url)
# Scripts artifacts (no detection here) # Scripts artifacts
suspicious_scripts = self.analyze_scripts(html_content, base_url=final_url) suspicious_scripts = self.analyze_scripts(html_content, base_url=final_url)
# suspicious text
flagged_text = self.analyze_text(html_content)
# Enrichment # Enrichment
enrichment = enrich_url(url, fetch_ssl_enabled) enrichment = enrich_url(url, fetch_ssl_enabled)
@@ -486,7 +561,8 @@ class Browser:
"scripts": scripts_seen, "scripts": scripts_seen,
"forms": forms_info, "forms": forms_info,
"suspicious_scripts": suspicious_scripts, "suspicious_scripts": suspicious_scripts,
"rule_checks": rule_checks_overview, # table-ready for UI "suspicious_text":flagged_text,
"rule_checks": rule_checks_overview,
"enrichment": enrichment "enrichment": enrichment
} }

View File

@@ -63,7 +63,9 @@ class AppConfig:
name: str = "MyApp" name: str = "MyApp"
version_major: int = 1 version_major: int = 1
version_minor: int = 0 version_minor: int = 0
print_rule_loads: bool = False log_rule_loads: bool = False
log_rule_dispatch: bool = False
log_rule_debug: bool = False
@dataclass @dataclass