- Add SSL/TLS intelligence pipeline (usage sketch below):
- crt.sh lookup with expired-certificate filtering and root-domain wildcard resolution
- live TLS version/cipher probe with weak/legacy flags and probe notes
- UI: card + matrix rendering, raw JSON toggle, and host/wildcard cert lists
- Front page: checkbox to optionally fetch certificate/CT data
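For orientation, here is how the new pipeline is driven end to end. The import path is an assumption (the `enrich_url` function itself appears in the module below); the dict keys match what that module returns:

```python
# Hypothetical import path; enrich_url is defined in the module included below.
from app.utils.enrichment import enrich_url

# The front-page checkbox maps to fetch_ssl_enabled; when unset, the UI
# receives {"skipped": True, ...} under "ssl_tls" instead of probe data.
data = enrich_url("https://portal.example.com/login", fetch_ssl_enabled=True)
print(data["ssl_tls"]["crtsh"]["root_domain"])  # registrable domain, e.g. "example.com"
print(data["ssl_tls"]["probe"])                 # TLS version/cipher matrix from the live probe
```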
- Introduce `URLNormalizer` with punycode support and typo repair (sketch below)
- Auto-prepend `https://` for bare domains (e.g., `google.com`)
- Optional quick HTTPS reachability + `http://` fallback
- Provide singleton via function-cached `@singleton_loader`:
- `get_url_normalizer()` reads defaults from Settings (if present)
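A minimal sketch of the normalizer's core behavior. The real implementation lives in `app/utils/urltools.py`; `@singleton_loader` is project-specific, so `functools.lru_cache` stands in for it here, and typo repair plus the HTTPS reachability check are omitted:

```python
import functools
from urllib.parse import urlsplit, urlunsplit

class URLNormalizer:
    def __init__(self, default_scheme="https"):
        self.default_scheme = default_scheme

    def normalize(self, raw: str) -> str:
        value = raw.strip()
        # Auto-prepend a scheme for bare domains like "google.com"
        if "://" not in value:
            value = f"{self.default_scheme}://{value}"
        parts = urlsplit(value)
        # Punycode (IDNA) encoding for non-ASCII hostnames
        host = parts.hostname.encode("idna").decode("ascii") if parts.hostname else ""
        netloc = host if parts.port is None else f"{host}:{parts.port}"
        return urlunsplit((parts.scheme, netloc, parts.path, parts.query, parts.fragment))

@functools.lru_cache(maxsize=1)  # stand-in for the project's @singleton_loader
def get_url_normalizer() -> URLNormalizer:
    # The real loader reads defaults from Settings when available
    return URLNormalizer()
```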
- Standardize function-rule return shape to `(bool, dict|None)` across
`form_*` and `script_*` rules; include structured payloads (`note`, hosts, ext, etc.; example below)
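For illustration, a `form_*` rule under the new contract; the fact keys used here are hypothetical:

```python
def form_posts_to_external_host(facts: dict):
    """Function rule: flag forms that POST to a host other than the page's own."""
    # "host" and "form_action_hosts" are illustrative fact keys, not the real schema
    page_host = facts.get("host")
    external = [h for h in facts.get("form_action_hosts", []) if h != page_host]
    if external:
        # Match: (True, structured payload)
        return True, {"note": "form posts to external host", "hosts": external}
    # Miss: (False, None)
    return False, None
```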
- Harden `FunctionRuleAdapter` (coercion sketch below):
- Coerce legacy returns `(bool)`, `(bool, str)` → normalized outputs
- Adapt non-dict inputs to facts (category-aware and via provided adapter)
- Return `(True, dict)` on match, `(False, None)` on miss
- Bind-time logging with file:line + function id for diagnostics
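The coercion step, as a standalone sketch (the helper name is hypothetical, not the adapter's real method):

```python
def _coerce_rule_result(raw):
    """Normalize legacy rule returns to the (bool, dict|None) contract."""
    # Legacy shape 1: bare bool
    if isinstance(raw, bool):
        return (True, {}) if raw else (False, None)
    if isinstance(raw, tuple) and len(raw) == 2:
        matched, payload = raw
        if not matched:
            return False, None
        # Legacy shape 2: (bool, str) -> wrap the string as a note
        if isinstance(payload, str):
            return True, {"note": payload}
        if isinstance(payload, dict) or payload is None:
            return True, payload or {}
    raise TypeError(f"Unsupported rule return shape: {raw!r}")
```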
- `RuleEngine` (sketch below):
- Back rules by private `self._rules`; `rules` property returns copy
- Idempotent `add_rule(replace=False)` with in-place replace and regex (re)compile
- Fix AttributeError from property assignment during `__init__`
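Roughly the shape of the change (a simplified sketch: it assumes each rule object exposes a `name`, and omits the regex (re)compile step):

```python
class RuleEngine:
    def __init__(self):
        # Backing store is private; assigning through the `rules` property
        # here is what previously raised AttributeError
        self._rules = []

    @property
    def rules(self):
        # Return a copy so callers cannot mutate engine state behind its back
        return list(self._rules)

    def add_rule(self, rule, replace=False):
        for i, existing in enumerate(self._rules):
            if existing.name == rule.name:
                if replace:
                    self._rules[i] = rule  # in-place replace keeps ordering
                return  # idempotent: duplicate adds are no-ops
        self._rules.append(rule)
```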
- Replace hidden singleton factory with explicit builder + global state (sketch below):
- `app/rules/factory.py::build_rules_engine()` builds and logs totals
- `app/state.py` exposes `set_rules_engine()` / `get_rules_engine()` as the single source of truth
- `app/wsgi.py` builds once at preload and publishes via `set_rules_engine()`
- Add lightweight debug hooks (`SS_DEBUG_RULES=1`) to trace engine id and rule counts
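`app/state.py` amounts to a module-level slot; a minimal sketch (the error message is illustrative):

```python
# app/state.py (sketch)
_rules_engine = None

def set_rules_engine(engine) -> None:
    global _rules_engine
    _rules_engine = engine

def get_rules_engine():
    if _rules_engine is None:
        raise RuntimeError("Rules engine not initialized; build it at preload")
    return _rules_engine
```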
- Unify logging wiring (sketch below):
- `wire_logging_once(app)` clears and attaches a single handler chain
- Create two named loggers: `sneakyscope.app` and `sneakyscope.engine`
- Disable propagation to prevent dupes; include pid/logger name in format
- Remove stray/duplicate handlers and import-time logging
- Optional dedup filter for bursty repeats (kept off by default)
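A minimal sketch of `wire_logging_once` under stdlib `logging`; the exact format string and levels are assumptions:

```python
import logging
import sys

def wire_logging_once(app=None):
    fmt = logging.Formatter("%(asctime)s pid=%(process)d %(name)s %(levelname)s %(message)s")
    handler = logging.StreamHandler(sys.stdout)
    handler.setFormatter(fmt)
    for name in ("sneakyscope.app", "sneakyscope.engine"):
        lg = logging.getLogger(name)
        lg.handlers.clear()     # drop stray/duplicate handlers
        lg.addHandler(handler)  # single handler chain
        lg.propagate = False    # prevent duplicate lines via the root logger
        lg.setLevel(logging.INFO)
```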
- Gunicorn: enable `--preload` in entrypoint to avoid thread races and double registration
- Documented foreground vs background log “double consumer” caveat (attach vs `compose logs`)
- Jinja: replace `{% return %}` with structured `if/elif/else` branches
- Add toggle button to show raw JSON for TLS/CT section
- Consumers should import the rules engine via:
- `from app.state import get_rules_engine`
- Use `build_rules_engine()` **only** during preload/init to construct the instance,
then publish with `set_rules_engine()`. Do not call old singleton factories (combined example below).
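Putting both rules together, the intended call pattern is:

```python
# app/wsgi.py, at preload: build once, publish once
from app.rules.factory import build_rules_engine
from app.state import set_rules_engine
set_rules_engine(build_rules_engine())

# Everywhere else (request handlers, rule evaluation, etc.):
from app.state import get_rules_engine
engine = get_rules_engine()
```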
- New/changed modules (high level; + = new, ± = modified):
- `app/utils/urltools.py` (+) — URLNormalizer + `get_url_normalizer()`
- `app/rules/function_rules.py` (±) — normalized payload returns
- `engine/function_rule_adapter.py` (±) — coercion, fact adaptation, bind logs
- `app/utils/rules_engine.py` (±) — `_rules`, idempotent `add_rule`, fixes
- `app/rules/factory.py` (±) — pure builder; totals logged post-registration
- `app/state.py` (+) — process-global rules engine
- `app/logging_setup.py` (±) — single chain, two named loggers
- `app/wsgi.py` (±) — preload build + `set_rules_engine()`
- `entrypoint.sh` (±) — add `--preload`
- templates (±) — TLS card, raw toggle; front-page checkbox
Closes: flaky rule-type warnings, duplicate logs, and multi-worker race on rules init.
from urllib.parse import urlparse

import requests
import json
import whois
from datetime import datetime
from ipaddress import ip_address
import socket

# Optional: high-accuracy root-domain detection if available
# (tldextract is in the requirements, but guarding the import keeps this module usable without it)
try:
    import tldextract
    _HAS_TLDEXTRACT = True
except Exception:
    _HAS_TLDEXTRACT = False

# Local imports
from app.utils.cache_db import get_cache
from app.utils.settings import get_settings
from app.utils.tls_probe import TLSEnumerator

# Configure logger
from app.logging_setup import get_app_logger

# Init cache
cache = get_cache("/data/cache.db")
settings = get_settings()

# Minutes per day (24 hours * 60 minutes); the day-based settings below
# are converted to minute-granularity TTLs
MINUTES_PER_DAY = 24 * 60

GEOIP_DEFAULT_TTL = settings.cache.geoip_cache_days * MINUTES_PER_DAY
WHOIS_DEFAULT_TTL = settings.cache.whois_cache_days * MINUTES_PER_DAY

logger = get_app_logger()
def parse_target_to_host(target):
    """
    Convert a user-supplied string (URL or domain) into a hostname.

    Returns:
        str or None
    """
    if target is None:
        return None

    value = str(target).strip()
    if value == "":
        return None

    # urlparse needs a scheme to treat the first token as netloc
    parsed = urlparse(value if "://" in value else f"http://{value}")

    # If the input was something like "localhost:8080/path", netloc includes
    # the port; .hostname strips it
    host = parsed.hostname
    if host is None:
        return None

    # Lowercase for consistency
    host = host.strip().lower()
    if host == "":
        return None

    return host
def get_root_domain(hostname):
    """
    Determine the registrable/root domain from a hostname.
    Prefers tldextract if available; otherwise falls back to a heuristic.

    Examples:
        sub.a.example.com -> example.com
        portal.gov.uk     -> portal.gov.uk with the PSL (gov.uk is a public
                             suffix); the naive fallback below would return gov.uk
        api.example.co.uk -> example.co.uk (PSL needed for correctness)

    Returns:
        str (best-effort registrable domain)
    """
    if hostname is None:
        return None

    if _HAS_TLDEXTRACT:
        # tldextract splits subdomain, domain, and suffix using PSL rules,
        # e.g. subdomain="api", domain="example", suffix="co.uk"
        parts = tldextract.extract(hostname)
        # If suffix is empty (e.g., localhost), fall back to the hostname itself
        if parts.suffix:
            return f"{parts.domain}.{parts.suffix}".lower()
        else:
            return hostname.lower()

    # Fallback heuristic: last two labels.
    # Not correct for multi-part TLDs (e.g., co.uk), but a safe default.
    labels = hostname.split(".")
    labels = [lbl for lbl in labels if lbl]  # drop empty labels from stray dots

    if len(labels) >= 2:
        last = labels[-1]
        second_last = labels[-2]
        candidate = f"{second_last}.{last}".lower()
        return candidate

    return hostname.lower()
def is_root_domain(hostname):
    """
    Is the provided hostname the same as its registrable/root domain?
    """
    if hostname is None:
        return False

    root = get_root_domain(hostname)
    if root is None:
        return False

    return hostname.lower() == root.lower()
def search_certs(domain, wildcard=True, expired=True, deduplicate=True):
    """
    Search crt.sh for the given domain.

    domain      -- Domain to search for
    wildcard    -- Whether or not to prepend a wildcard to the domain
                   (default: True)
    expired     -- Whether or not to include expired certificates
                   (default: True)
    deduplicate -- Whether or not to collapse duplicate (pre)certificate entries
                   (default: True)

    Returns a list of objects, like so:

    {
        "issuer_ca_id": 16418,
        "issuer_name": "C=US, O=Let's Encrypt, CN=Let's Encrypt Authority X3",
        "name_value": "hatch.uber.com",
        "min_cert_id": 325717795,
        "min_entry_timestamp": "2018-02-08T16:47:39.089",
        "not_before": "2018-02-08T15:47:39"
    }

    Returns None on request or parse failure.
    """
    base_url = "https://crt.sh/?q={}&output=json"
    if not expired:
        base_url = base_url + "&exclude=expired"
    if deduplicate:
        base_url = base_url + "&deduplicate=Y"
    if wildcard and "%" not in domain:
        domain = "%.{}".format(domain)
    url = base_url.format(domain)

    ua = 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:40.0) Gecko/20100101 Firefox/40.1'
    req = requests.get(url, headers={'User-Agent': ua}, timeout=30)

    if req.ok:
        content = req.content.decode('utf-8', errors='replace')
        try:
            data = json.loads(content)
            return data
        except ValueError:
            # Older crt.sh builds returned concatenated JSON objects rather than
            # an array. Fixed upstream; kept as a defensive fallback:
            # https://github.com/crtsh/certwatch_db/commit/f4f46ea37c23543c4cdf1a3c8867d68967641807
            try:
                return json.loads("[{}]".format(content.replace('}{', '},{')))
            except Exception as err:
                logger.error("Error parsing cert information from crt.sh: %s", err)
                return None

    logger.error("crt.sh request failed with HTTP %s", req.status_code)
    return None
def gather_crtsh_certs_for_target(target):
    """
    Given a URL or domain-like input, return crt.sh results for:
    - The exact hostname
    - If the hostname is a subdomain, also the wildcard for the root domain
      (e.g., *.example.com)

    This intentionally runs even when the scheme is HTTP, by design.
    Expired certs are excluded by default.

    Returns:
        dict:
        {
            "input": <original target>,
            "hostname": <parsed hostname>,
            "root_domain": <registrable>,
            "is_root_domain": <bool>,
            "crtsh": {
                "host_certs": [... or None],
                "wildcard_root_certs": [... or None]
            }
        }
    """
    result = {
        "input": target,
        "hostname": None,
        "root_domain": None,
        "is_root_domain": False,
        "crtsh": {
            "host_certs": None,
            "wildcard_root_certs": None
        }
    }

    try:
        hostname = parse_target_to_host(target)
        result["hostname"] = hostname

        if hostname is None:
            return result

        root = get_root_domain(hostname)
        result["root_domain"] = root
        result["is_root_domain"] = is_root_domain(hostname)

        # Always query crt.sh for the specific hostname
        # (expired=False means expired certs are filtered out)
        host_certs = search_certs(hostname, wildcard=False, expired=False)
        result["crtsh"]["host_certs"] = host_certs

        # If a subdomain, also look up the wildcard for the root domain: *.root
        if not result["is_root_domain"] and root:
            wildcard_certs = search_certs(root, wildcard=True, expired=False)
            result["crtsh"]["wildcard_root_certs"] = wildcard_certs

    except Exception as exc:
        logger.exception("crt.sh enrichment failed: %s", exc)

    return result
def enrich_url(url: str, fetch_ssl_enabled: bool = False) -> dict:
    """Perform WHOIS, GeoIP, and (optionally) SSL/TLS enrichment for a URL."""
    enrichment = {}

    # Extract hostname
    parsed = urlparse(url)
    hostname = parsed.hostname or url  # fallback if parsing fails

    # --- WHOIS ---
    enrichment.update(enrich_whois(hostname))

    # --- GeoIP ---
    enrichment["geoip"] = enrich_geoip(hostname)

    # === SSL/TLS: crt.sh + live probe ===
    if fetch_ssl_enabled:
        try:
            # 1) Certificate Transparency lookup via crt.sh
            crtsh_info = gather_crtsh_certs_for_target(url)

            # 2) Live TLS probe (supported versions + negotiated cipher per version)
            tls_enum = TLSEnumerator(timeout_seconds=5.0)
            probe_result = tls_enum.probe(url)

            enrichment["ssl_tls"] = {}
            enrichment["ssl_tls"]["crtsh"] = crtsh_info
            enrichment["ssl_tls"]["probe"] = probe_result.to_dict()

        except Exception as exc:
            logger.exception("SSL/TLS enrichment failed: %s", exc)
            enrichment["ssl_tls"] = {"error": "SSL/TLS enrichment failed"}
    else:
        # Include a small marker so the UI can show "skipped"
        enrichment["ssl_tls"] = {"skipped": True, "reason": "Disabled on submission"}

    return enrichment
def enrich_whois(hostname: str) -> dict:
    """Fetch WHOIS info using python-whois with safe type handling."""
    cache_key = f"whois:{hostname}"
    cached = cache.read(cache_key)
    if cached:
        logger.info(f"[CACHE HIT] for WHOIS: {hostname}")
        return cached

    logger.info(f"[CACHE MISS] for WHOIS: {hostname}")
    result = {}
    try:
        w = whois.whois(hostname)

        def format_dt(val):
            # python-whois fields may be a datetime, a list, None, or a plain string
            if isinstance(val, list):
                return ", ".join([v.strftime("%Y-%m-%d %H:%M:%S") if isinstance(v, datetime) else str(v) for v in val])
            elif isinstance(val, datetime):
                return val.strftime("%Y-%m-%d %H:%M:%S")
            elif val is None:
                return "Possible Privacy"
            else:
                return str(val)

        result["whois"] = {
            "registrar": format_dt(getattr(w, "registrar", None)),
            "creation_date": format_dt(getattr(w, "creation_date", None)),
            "expiration_date": format_dt(getattr(w, "expiration_date", None)),
            "owner": format_dt(getattr(w, "org", None))
        }

    except Exception as e:
        logger.warning(f"WHOIS lookup failed for {hostname}: {e}")
        try:
            # Fallback: raw output from the system `whois` binary
            import subprocess
            raw_output = subprocess.check_output(["whois", hostname], encoding="utf-8", errors="ignore")
            result["whois"] = {}
            result["raw_whois"] = raw_output
        except Exception as raw_e:
            logger.error(f"Raw WHOIS also failed: {raw_e}")
            result["whois"] = {}
            result["raw_whois"] = "N/A"

    cache.create(cache_key, result, WHOIS_DEFAULT_TTL)
    return result
def enrich_geoip(hostname: str) -> dict:
    """Resolve hostname to IPs and fetch info from ip-api.com."""
    geo_info = {}
    ips = extract_ips_from_url(hostname)
    for ip in ips:
        ip_str = str(ip)
        cache_key = f"geoip:{ip_str}"
        cached = cache.read(cache_key)
        if cached:
            logger.info(f"[CACHE HIT] for GEOIP: {ip}")
            geo_info[ip_str] = cached
            continue

        logger.info(f"[CACHE MISS] for GEOIP: {ip}")
        try:
            # fields=24313855 is ip-api.com's numeric bitmask selecting the response fields
            resp = requests.get(f"http://ip-api.com/json/{ip_str}?fields=24313855", timeout=5)
            if resp.status_code == 200:
                geo_info[ip_str] = resp.json()
            else:
                geo_info[ip_str] = {"error": f"HTTP {resp.status_code}"}
        except Exception as e:
            geo_info[ip_str] = {"error": str(e)}

        # Cache whatever we got (including errors) to avoid hammering the API
        cache.create(cache_key, geo_info[ip_str], GEOIP_DEFAULT_TTL)

    return geo_info
def extract_ips_from_url(hostname: str):
    """Resolve hostname to a deduplicated list of IP addresses."""
    try:
        info = socket.getaddrinfo(hostname, None)
        # getaddrinfo tuples are (family, type, proto, canonname, sockaddr);
        # sockaddr[0] is the IP string for both IPv4 and IPv6
        return list({ip_address(x[4][0]) for x in info})
    except Exception:
        return []