feat: HTTPS auto-normalization; robust TLS intel UI; global rules state; clean logging; preload

- Add SSL/TLS intelligence pipeline:
  - crt.sh lookup with expired-filtering and root-domain wildcard resolution
  - live TLS version/cipher probe with weak/legacy flags and probe notes (see the sketch after this list)
- UI: card + matrix rendering, raw JSON toggle, and host/wildcard cert lists
- Front page: checkbox to optionally fetch certificate/CT data
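
A minimal sketch of the probe approach using only the stdlib `ssl` module. `TLSEnumerator` is the shipped implementation; this helper (name and behavior assumed, not the real code) just pins a handshake to one protocol version. Note that modern OpenSSL builds may refuse TLS < 1.2 locally regardless of what the server supports:

```python
import socket
import ssl

def probe_tls_version(host: str, version: ssl.TLSVersion, timeout: float = 5.0):
    """Handshake pinned to a single TLS version; returns the negotiated
    cipher name, or None if the server (or local OpenSSL) refuses it."""
    context = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
    context.check_hostname = False      # intel probe, not a trust decision
    context.verify_mode = ssl.CERT_NONE
    context.minimum_version = version   # pin both bounds to force one version
    context.maximum_version = version
    try:
        with socket.create_connection((host, 443), timeout=timeout) as raw:
            with context.wrap_socket(raw, server_hostname=host) as tls:
                return tls.cipher()[0]  # e.g. "TLS_AES_256_GCM_SHA384"
    except (ssl.SSLError, OSError):
        return None

# e.g., flag TLS 1.0/1.1 as legacy if they still negotiate
for v in (ssl.TLSVersion.TLSv1, ssl.TLSVersion.TLSv1_1,
          ssl.TLSVersion.TLSv1_2, ssl.TLSVersion.TLSv1_3):
    print(v.name, probe_tls_version("example.com", v))
```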

- Introduce `URLNormalizer` with punycode support and typo repair
  - Auto-prepend `https://` for bare domains (e.g., `google.com`)
  - Optional quick HTTPS reachability + `http://` fallback
- Provide singleton via function-cached `@singleton_loader`:
  - `get_url_normalizer()` reads defaults from Settings (if present)
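
A usage sketch of the singleton accessor; the `normalize` method name and its return values are assumptions about the API, not confirmed signatures:

```python
from app.utils.urltools import get_url_normalizer

normalizer = get_url_normalizer()         # function-cached singleton; defaults from Settings
url = normalizer.normalize("google.com")  # hypothetical method name
# bare domain -> "https://google.com"; with the reachability option enabled,
# an unreachable HTTPS endpoint would fall back to "http://google.com"
```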

- Standardize function-rule return shape to `(bool, dict|None)` across
  `form_*` and `script_*` rules; include structured payloads (`note`, hosts, ext, etc.)
- Harden `FunctionRuleAdapter`:
  - Coerce legacy returns `(bool)`, `(bool, str)` → normalized outputs (see the coercion sketch after this list)
  - Adapt non-dict inputs to facts (category-aware and via provided adapter)
  - Return `(True, dict)` on match, `(False, None)` on miss
  - Bind-time logging with file:line + function id for diagnostics
- `RuleEngine`:
  - Back rules by private `self._rules`; the `rules` property returns a copy
  - Idempotent `add_rule(replace=False)` with in-place replace and regex (re)compile
  - Fix AttributeError from property assignment during `__init__`
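
A sketch of the coercion the adapter performs on legacy returns; the helper name and exact payload keys are illustrative, not the shipped code:

```python
def coerce_rule_result(raw):
    """Normalize legacy function-rule returns to the (bool, dict|None) shape."""
    if isinstance(raw, tuple):
        matched = raw[0]
        payload = raw[1] if len(raw) > 1 else None
    else:
        matched, payload = raw, None        # legacy: bare bool
    if not matched:
        return False, None                  # miss
    if payload is None:
        return True, {}                     # legacy: bare True or (True,)
    if isinstance(payload, str):
        return True, {"note": payload}      # legacy: (bool, str)
    return True, payload                    # already a structured dict
```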

- Replace hidden singleton factory with explicit builder + global state:
  - `app/rules/factory.py::build_rules_engine()` builds and logs totals
  - `app/state.py` exposes `set_rules_engine()` / `get_rules_engine()` as the single source of truth (sketched below)
  - `app/wsgi.py` builds once at preload and publishes via `set_rules_engine()`
- Add lightweight debug hooks (`SS_DEBUG_RULES=1`) to trace engine id and rule counts
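
A sketch of the assumed shape of the global-state module and the preload publish; the typing and error message are illustrative:

```python
# app/state.py (assumed shape)
from typing import Optional
from app.utils.rules_engine import RuleEngine

_rules_engine: Optional[RuleEngine] = None

def set_rules_engine(engine: RuleEngine) -> None:
    global _rules_engine
    _rules_engine = engine

def get_rules_engine() -> RuleEngine:
    if _rules_engine is None:
        raise RuntimeError("rules engine not published; build it at preload")
    return _rules_engine

# app/wsgi.py (preload path)
from app.rules.factory import build_rules_engine
from app.state import set_rules_engine

set_rules_engine(build_rules_engine())  # build once pre-fork; workers inherit it
```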

- Unify logging wiring:
  - `wire_logging_once(app)` clears existing handlers and attaches a single handler chain (see the sketch after this list)
  - Create two named loggers: `sneakyscope.app` and `sneakyscope.engine`
  - Disable propagation to prevent dupes; include pid/logger name in format
- Remove stray/duplicate handlers and import-time logging
- Optional dedup filter for bursty repeats (kept off by default)
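
A minimal sketch of the single-chain wiring, assuming this rough shape for `wire_logging_once`:

```python
import logging
import sys

def wire_logging_once(app=None):
    """Clear inherited handlers, attach one stream handler per named logger,
    and stop propagation so each record is emitted exactly once."""
    fmt = logging.Formatter(
        "%(asctime)s [pid %(process)d] %(name)s %(levelname)s: %(message)s"
    )
    for name in ("sneakyscope.app", "sneakyscope.engine"):
        named = logging.getLogger(name)
        named.handlers.clear()          # drop stray/duplicate handlers
        handler = logging.StreamHandler(sys.stdout)
        handler.setFormatter(fmt)
        named.addHandler(handler)
        named.setLevel(logging.INFO)
        named.propagate = False         # don't bubble to root -> no dupes
```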

- Gunicorn: enable `--preload` in entrypoint to avoid thread races and double registration
- Document the foreground vs background log “double consumer” caveat (attach vs `compose logs`)

- Jinja: replace `{% return %}` with structured `if/elif/else` branches
- Add toggle button to show raw JSON for TLS/CT section

- Consumers should import the rules engine via:
  - `from app.state import get_rules_engine`
- Use `build_rules_engine()` **only** during preload/init to construct the instance,
  then publish with `set_rules_engine()`. Do not call old singleton factories.
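
A request-time consumer sketch; the function shape and `evaluate()` entry point are illustrative, not the shipped API:

```python
from app.state import get_rules_engine

def analyze(facts: dict):
    engine = get_rules_engine()      # the instance published at preload
    return engine.evaluate(facts)    # hypothetical evaluation entry point
```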

- New/changed modules (high level):
  - `app/utils/urltools.py` (+) — URLNormalizer + `get_url_normalizer()`
  - `app/rules/function_rules.py` (±) — normalized payload returns
  - `engine/function_rule_adapter.py` (±) — coercion, fact adaptation, bind logs
  - `app/utils/rules_engine.py` (±) — `_rules`, idempotent `add_rule`, fixes
  - `app/rules/factory.py` (±) — pure builder; totals logged post-registration
  - `app/state.py` (+) — process-global rules engine
  - `app/logging_setup.py` (±) — single chain, two named loggers
  - `app/wsgi.py` (±) — preload build + `set_rules_engine()`
  - `entrypoint.sh` (±) — add `--preload`
  - templates (±) — TLS card, raw toggle; front-page checkbox

Closes: flaky rule-type warnings, duplicate logs, and multi-worker race on rules init.
Commit 693f7d67b9 (parent f639ad0934), 2025-08-21 22:05:16 -05:00
22 changed files with 1476 additions and 256 deletions

@@ -1,19 +1,25 @@
import logging
from pathlib import Path
from urllib.parse import urlparse
import requests
import yaml
import json
import whois
from datetime import datetime
from ipaddress import ip_address
import socket

# Optional: high-accuracy root-domain detection if available
# (tldextract is in the requirements, but this guard is still useful)
try:
    import tldextract
    _HAS_TLDEXTRACT = True
except Exception:
    _HAS_TLDEXTRACT = False

# Local imports
from app.utils.cache_db import get_cache
from app.utils.settings import get_settings
from app.utils.tls_probe import TLSEnumerator

# Configure logger
from app.logging_setup import get_app_logger

# Init cache
cache = get_cache("/data/cache.db")
@@ -25,32 +31,244 @@ days = 24 * 60
GEOIP_DEFAULT_TTL = settings.cache.geoip_cache_days * days
WHOIS_DEFAULT_TTL = settings.cache.whois_cache_days * days

logger = get_app_logger()
def parse_target_to_host(target):
    """
    Convert a user-supplied string (URL or domain) into a hostname.

    Returns:
        str or None
    """
    if target is None:
        return None

    value = str(target).strip()
    if value == "":
        return None

    # urlparse needs a scheme to treat the first token as netloc
    parsed = urlparse(value if "://" in value else f"http://{value}")

    # If the input was something like "localhost:8080/path", netloc includes the port
    host = parsed.hostname
    if host is None:
        return None

    # Lowercase for consistency
    host = host.strip().lower()
    if host == "":
        return None

    return host
def get_root_domain(hostname):
    """
    Determine the registrable/root domain from a hostname.
    Prefers tldextract if available; otherwise falls back to a heuristic.

    Examples:
        sub.a.example.com -> example.com
        api.example.co.uk -> example.co.uk (PSL needed for correctness)
        portal.gov.uk     -> portal.gov.uk with the PSL (gov.uk is a public
                             suffix); the two-label heuristic returns gov.uk

    Returns:
        str (best-effort registrable domain)
    """
    if hostname is None:
        return None

    if _HAS_TLDEXTRACT:
        # tldextract returns subdomain, domain, suffix separately using PSL rules
        # e.g., sub="api", domain="example", suffix="co.uk"
        parts = tldextract.extract(hostname)
        # If suffix is empty (e.g., localhost), fall back
        if parts.suffix:
            return f"{parts.domain}.{parts.suffix}".lower()
        else:
            return hostname.lower()

    # Fallback heuristic: last two labels (not perfect for multi-part TLDs, but safe)
    labels = hostname.split(".")
    labels = [lbl for lbl in labels if lbl]  # drop empty labels (e.g., from a trailing dot)
    if len(labels) >= 2:
        last = labels[-1]
        second_last = labels[-2]
        candidate = f"{second_last}.{last}".lower()
        return candidate

    return hostname.lower()
def is_root_domain(hostname):
    """
    Is the provided hostname the same as its registrable/root domain?
    """
    if hostname is None:
        return False
    root = get_root_domain(hostname)
    if root is None:
        return False
    return hostname.lower() == root.lower()
def search_certs(domain, wildcard=True, expired=True, deduplicate=True):
    """
    Search crt.sh for the given domain.

    domain      -- Domain to search for
    wildcard    -- Whether or not to prepend a wildcard to the domain (default: True)
    expired     -- Whether or not to include expired certificates (default: True)
    deduplicate -- Whether or not to collapse duplicate (pre)certificate entries (default: True)

    Return a list of objects, like so:

    {
        "issuer_ca_id": 16418,
        "issuer_name": "C=US, O=Let's Encrypt, CN=Let's Encrypt Authority X3",
        "name_value": "hatch.uber.com",
        "min_cert_id": 325717795,
        "min_entry_timestamp": "2018-02-08T16:47:39.089",
        "not_before": "2018-02-08T15:47:39"
    }
    """
    base_url = "https://crt.sh/?q={}&output=json"
    if not expired:
        base_url = base_url + "&exclude=expired"
    if deduplicate:
        base_url = base_url + "&deduplicate=Y"
    if wildcard and "%" not in domain:
        domain = "%.{}".format(domain)
    url = base_url.format(domain)

    ua = 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:40.0) Gecko/20100101 Firefox/40.1'
    req = requests.get(url, headers={'User-Agent': ua})

    if req.ok:
        try:
            content = req.content.decode('utf-8')
            data = json.loads(content)
            return data
        except ValueError:
            # crt.sh fixed their JSON response; this repair shouldn't be necessary anymore:
            # https://github.com/crtsh/certwatch_db/commit/f4f46ea37c23543c4cdf1a3c8867d68967641807
            data = json.loads("[{}]".format(content.replace('}{', '},{')))
            return data
        except Exception:
            logger.error("Error retrieving cert information from crt.sh.")
    return None
def gather_crtsh_certs_for_target(target):
    """
    Given a URL or domain-like input, return crt.sh results for:
      - The exact hostname
      - If the hostname is a subdomain, also the wildcard for the root domain
        (e.g., *.example.com)

    We intentionally run this even if the scheme is HTTP (by design).
    Expired certs are excluded by default.

    Returns:
        dict:
        {
            "input": <original target>,
            "hostname": <parsed hostname>,
            "root_domain": <registrable>,
            "is_root_domain": <bool>,
            "crtsh": {
                "host_certs": [... or None],
                "wildcard_root_certs": [... or None]
            }
        }
    """
    result = {
        "input": target,
        "hostname": None,
        "root_domain": None,
        "is_root_domain": False,
        "crtsh": {
            "host_certs": None,
            "wildcard_root_certs": None
        }
    }

    try:
        hostname = parse_target_to_host(target)
        result["hostname"] = hostname
        if hostname is None:
            return result

        root = get_root_domain(hostname)
        result["root_domain"] = root
        result["is_root_domain"] = is_root_domain(hostname)

        # Always query crt.sh for the specific hostname
        # (expired=False means we filter expired)
        host_certs = search_certs(hostname, wildcard=False, expired=False)
        result["crtsh"]["host_certs"] = host_certs

        # If subdomain, also look up wildcard for the root domain: *.root
        if not result["is_root_domain"] and root:
            wildcard_certs = search_certs(root, wildcard=True, expired=False)
            result["crtsh"]["wildcard_root_certs"] = wildcard_certs
    except Exception as exc:
        logger.exception("crt.sh enrichment failed: %s", exc)

    return result
def enrich_url(url: str, fetch_ssl_enabled: bool = False) -> dict:
    """Perform WHOIS, GeoIP, and optional SSL/TLS enrichment."""
    enrichment = {}

    # Extract hostname
    parsed = urlparse(url)
    hostname = parsed.hostname or url  # fallback if parsing fails

    # --- WHOIS ---
    enrichment.update(enrich_whois(hostname))

    # --- GeoIP ---
    enrichment["geoip"] = enrich_geoip(hostname)

    # === SSL/TLS: crt.sh + live probe ===
    if fetch_ssl_enabled:
        try:
            # 1) Certificate Transparency via crt.sh
            crtsh_info = gather_crtsh_certs_for_target(url)

            # 2) Live TLS probe (versions + negotiated cipher per version)
            tls_enum = TLSEnumerator(timeout_seconds=5.0)
            probe_result = tls_enum.probe(url)

            enrichment["ssl_tls"] = {}
            enrichment["ssl_tls"]["crtsh"] = crtsh_info
            enrichment["ssl_tls"]["probe"] = probe_result.to_dict()
        except Exception as exc:
            logger.exception("SSL/TLS enrichment failed: %s", exc)
            enrichment["ssl_tls"] = {"error": "SSL/TLS enrichment failed"}
    else:
        # Include a small marker so the UI can show “skipped”
        enrichment["ssl_tls"] = {"skipped": True, "reason": "Disabled on submission"}

    return enrichment
def enrich_whois(hostname: str) -> dict:
    """Fetch WHOIS info using python-whois with safe type handling."""
    cache_key = f"whois:{hostname}"
    cached = cache.read(cache_key)
    if cached:
        logger.info(f"[CACHE HIT] for WHOIS: {hostname}")
        return cached
    logger.info(f"[CACHE MISS] for WHOIS: {hostname}")

    result = {}
    try:
        w = whois.whois(hostname)
@@ -73,7 +291,7 @@ def enrich_whois(hostname: str) -> dict:
        }
    except Exception as e:
        logger.warning(f"WHOIS lookup failed for {hostname}: {e}")
        try:
            # fallback raw whois text
            import subprocess
@@ -81,14 +299,13 @@ def enrich_whois(hostname: str) -> dict:
            result["whois"] = {}
            result["raw_whois"] = raw_output
        except Exception as raw_e:
            logger.error(f"Raw WHOIS also failed: {raw_e}")
            result["whois"] = {}
            result["raw_whois"] = "N/A"

    cache.create(cache_key, result, WHOIS_DEFAULT_TTL)
    return result
def enrich_geoip(hostname: str) -> dict:
    """Resolve hostname to IPs and fetch info from ip-api.com."""
    geo_info = {}
@@ -98,11 +315,11 @@ def enrich_geoip(hostname: str) -> dict:
        cache_key = f"geoip:{ip_str}"
        cached = cache.read(cache_key)
        if cached:
            logger.info(f"[CACHE HIT] for GEOIP: {ip}")
            geo_info[ip_str] = cached
            continue
        logger.info(f"[CACHE MISS] for GEOIP: {ip}")
        try:
            # fields=24313855 is ip-api's numeric bitmask selecting the returned fields
            resp = requests.get(f"http://ip-api.com/json/{ip_str}?fields=24313855", timeout=5)
            if resp.status_code == 200:
@@ -116,7 +333,6 @@ def enrich_geoip(hostname: str) -> dict:
    return geo_info
def extract_ips_from_url(hostname: str):
    """Resolve hostname to IPs."""
    try: