import re import uuid import json from pathlib import Path from bs4 import BeautifulSoup from datetime import datetime from urllib.parse import urlparse from typing import Dict, Any, Optional from playwright.async_api import async_playwright, TimeoutError as PWTimeoutError from flask import current_app # access the rule engine from app config from app.utils.io_helpers import safe_write from .enrichment import enrich_url def get_rule_engine(): """ Retrieve the rules engine instance from the Flask application config. Returns: RuleEngine or None: The engine if available, or None if not configured. """ try: # current_app is only available during an active request context engine = current_app.config.get("RULE_ENGINE") return engine except Exception: # If called outside a Flask request context, fail gracefully return None def run_rule_checks(text, category): """ Run all rules for a given category against the provided text. Args: text (str): The content to test (e.g., form snippet, inline JS). category (str): The rule category to run (e.g., 'form' or 'script'). Returns: dict: { "checks": [ { "rule": str, "category": str, "matched": bool, "reason": Optional[str] }, ... ], "summary": { "matched_count": int, "total_rules": int } } """ result = { "checks": [], "summary": { "matched_count": 0, "total_rules": 0 } } engine = get_rule_engine() if engine is None: # No engine configured; return empty but well-formed structure return result try: # Run engine rules for the specified category check_results = engine.run_all(text, category=category) # Normalize results into the expected structure total = 0 matched = 0 for item in check_results: # item is expected to contain: rule, category, matched, reason (optional) total = total + 1 if bool(item.get("matched")): matched = matched + 1 normalized = { "rule": item.get("rule"), "category": item.get("category"), "matched": bool(item.get("matched")), "reason": item.get("reason") } result["checks"].append(normalized) result["summary"]["matched_count"] = matched result["summary"]["total_rules"] = total except Exception as e: # If anything goes wrong, keep structure and add a fake failure note result["checks"].append({ "rule": "engine_error", "category": category, "matched": False, "reason": f"Rule engine error: {e}" }) result["summary"]["matched_count"] = 0 result["summary"]["total_rules"] = 0 return result def analyze_forms(html: str, base_url: str): """ Parse forms from the page HTML and apply heuristic flags and rule-based checks. Args: html (str): The full page HTML. base_url (str): The final URL of the page (used for hostname comparisons). Returns: list[dict]: A list of form analysis dictionaries, each including: - action, method, inputs - flagged (bool), flag_reasons (list[str]), status (str) - rule_checks: dict with "checks" (list) and "summary" (dict) """ soup = BeautifulSoup(html, "lxml") forms_info = [] page_hostname = urlparse(base_url).hostname for form in soup.find_all("form"): action = form.get("action") method = form.get("method", "get").lower() # Build explicit inputs list inputs = [] for inp in form.find_all("input"): input_name = inp.get("name") input_type = inp.get("type", "text") inputs.append({ "name": input_name, "type": input_type }) flagged_reasons = [] # No action specified if not action or str(action).strip() == "": flagged_reasons.append("No action specified") # External host else: try: action_host = urlparse(action).hostname if not str(action).startswith("/") and action_host != page_hostname: flagged_reasons.append("Submits to a different host") except Exception: # If hostname parsing fails, skip this condition quietly pass # HTTP form on HTTPS page try: if urlparse(action).scheme == "http" and urlparse(base_url).scheme == "https": flagged_reasons.append("Submits over insecure HTTP") except Exception: # If scheme parsing fails, ignore pass # Hidden password / suspicious hidden inputs for hidden in form.find_all("input", type="hidden"): name_value = hidden.get("name") or "" if "password" in name_value.lower(): flagged_reasons.append("Hidden password field") flagged = bool(flagged_reasons) # Serialize a simple form snippet for the rules engine to analyze (category='form') snippet_lines = [] snippet_lines.append(f"action={action}") snippet_lines.append(f"method={method}") snippet_lines.append("inputs=") for item in inputs: snippet_lines.append(f" - name={item.get('name')} type={item.get('type')}") form_snippet = "\n".join(snippet_lines) rule_checks = run_rule_checks(form_snippet, category="form") forms_info.append({ "action": action, "method": method, "inputs": inputs, "flagged": flagged, "flag_reasons": flagged_reasons, "status": "flagged" if flagged else "possibly safe", "rule_checks": rule_checks }) return forms_info def analyze_scripts(html: str, base_url: str = "", engine=None) -> list[dict]: """ Analyze