"""URL enrichment helpers: WHOIS, GeoIP and BEC keyword matching.

Every lookup goes through the shared cache returned by ``get_cache`` so that
repeated enrichment of the same hostname/IP does not hit the network again.
"""
import logging
import socket
import subprocess
from datetime import datetime
from ipaddress import ip_address
from pathlib import Path
from urllib.parse import urlparse

import requests
import whois
import yaml

# Local imports
from .utils.cache_db import get_cache
from .utils.settings import get_settings

# Configure logging
logging.basicConfig(level=logging.INFO, format="[%(levelname)s] %(message)s")

# Init cache
cache = get_cache("/data/cache.db")
settings = get_settings()

# Load BEC words
BEC_WORDS_FILE = Path(__file__).parent.parent / "config" / "bec_words.yaml"


def _load_bec_words(path: Path) -> list:
    """Return the ``words`` list from the BEC YAML file at *path*.

    A missing file, an empty file, a non-mapping document, or a missing/null
    ``words`` key all yield an empty list instead of failing at import time.
    Null and empty entries are dropped: an empty string would otherwise match
    every URL.
    """
    if not path.exists():
        return []
    with open(path, "r", encoding="utf-8") as f:
        data = yaml.safe_load(f)
    if not isinstance(data, dict):
        return []
    words = data.get("words") or []
    if not isinstance(words, list):
        return []
    return [str(w) for w in words if w is not None and str(w) != ""]


BEC_WORDS = _load_bec_words(BEC_WORDS_FILE)

# Minutes per day (24 hours * 60 minutes). The TTLs below are therefore
# "configured days" expressed in minutes.
# NOTE(review): presumably cache.create() takes its TTL in minutes - confirm
# against utils.cache_db.
days = 24 * 60
GEOIP_DEFAULT_TTL = settings.cache.geoip_cache_days * days
WHOIS_DEFAULT_TTL = settings.cache.whois_cache_days * days

# Failed lookups are cached only briefly so that a transient outage or rate
# limit is not pinned in the cache for the full multi-day TTL.
NEGATIVE_CACHE_TTL = 60

# Upper bound (seconds) for the raw ``whois`` command fallback.
RAW_WHOIS_TIMEOUT = 30

_WHOIS_DATE_FORMAT = "%Y-%m-%d %H:%M:%S"


def _extract_hostname(url: str) -> str:
    """Return the hostname of *url*, or *url* itself if none can be found.

    ``urlparse`` only recognises a network location when it is introduced by
    ``//``, so scheme-less input such as ``example.com/path`` is retried with
    that prefix before giving up.
    """
    try:
        host = urlparse(url).hostname
    except ValueError:
        host = None
    if host:
        return host

    if "://" not in url:
        try:
            host = urlparse(f"//{url}").hostname
        except ValueError:
            host = None
        if host:
            return host

    return url  # fallback if parsing fails


def enrich_url(url: str) -> dict:
    """Perform WHOIS, GeoIP, and BEC word enrichment.

    Args:
        url: Full URL, or a bare ``host[/path]`` string.

    Returns:
        A dict with the keys produced by :func:`enrich_whois` (``whois`` and
        possibly ``raw_whois``), plus ``geoip`` (per-IP results from
        :func:`enrich_geoip`) and ``bec_words`` (configured words found in
        *url*, matched case-insensitively as substrings).
    """
    result = {}
    hostname = _extract_hostname(url)

    # --- WHOIS ---
    result.update(enrich_whois(hostname))

    # --- GeoIP ---
    result["geoip"] = enrich_geoip(hostname)

    # --- BEC Words ---
    lowered_url = url.lower()  # hoisted: invariant across all words
    result["bec_words"] = [w for w in BEC_WORDS if w.lower() in lowered_url]

    return result


def _format_whois_value(val) -> str:
    """Render a python-whois field (scalar, datetime, list or None) as text."""
    if val is None:
        return "Possible Privacy"
    if isinstance(val, datetime):
        return val.strftime(_WHOIS_DATE_FORMAT)
    if isinstance(val, list):
        return ", ".join(
            v.strftime(_WHOIS_DATE_FORMAT) if isinstance(v, datetime) else str(v)
            for v in val
        )
    return str(val)


def _raw_whois(hostname: str):
    """Run the system ``whois`` command; return its output or None on failure."""
    try:
        return subprocess.check_output(
            # "--" ends option parsing: hostname comes from an untrusted URL
            # and must never be interpreted as a command-line flag.
            ["whois", "--", hostname],
            encoding="utf-8",
            errors="ignore",
            timeout=RAW_WHOIS_TIMEOUT,
        )
    except (OSError, subprocess.SubprocessError, ValueError) as raw_e:
        logging.error("Raw WHOIS also failed: %s", raw_e)
        return None


def enrich_whois(hostname: str) -> dict:
    """Fetch WHOIS info using python-whois with safe type handling.

    Args:
        hostname: Host to look up.

    Returns:
        ``{"whois": {...}}`` with ``registrar``, ``creation_date``,
        ``expiration_date`` and ``owner`` strings on success. If python-whois
        fails, ``{"whois": {}, "raw_whois": <text>}`` where the text is the
        output of the system ``whois`` command, or ``"N/A"`` if that failed
        too. Results are cached; total failures only for a short time.
    """
    cache_key = f"whois:{hostname}"
    cached = cache.read(cache_key)
    if cached:
        logging.info("[CACHE HIT] for WHOIS: %s", hostname)
        return cached

    logging.info("[CACHE MISS] for WHOIS: %s", hostname)
    result = {}
    ttl = WHOIS_DEFAULT_TTL

    try:
        w = whois.whois(hostname)
        result["whois"] = {
            "registrar": _format_whois_value(getattr(w, "registrar", None)),
            "creation_date": _format_whois_value(getattr(w, "creation_date", None)),
            "expiration_date": _format_whois_value(getattr(w, "expiration_date", None)),
            "owner": _format_whois_value(getattr(w, "org", None)),
        }
    except Exception as e:  # python-whois raises a wide mix of error types
        logging.warning("WHOIS lookup failed for %s: %s", hostname, e)
        # fallback raw whois text
        raw_output = _raw_whois(hostname)
        result["whois"] = {}
        if raw_output is None:
            result["raw_whois"] = "N/A"
            ttl = NEGATIVE_CACHE_TTL  # do not pin a total failure for days
        else:
            result["raw_whois"] = raw_output

    cache.create(cache_key, result, ttl)
    return result


def enrich_geoip(hostname: str) -> dict:
    """Resolve hostname to IPs and fetch info from ip-api.com.

    Args:
        hostname: Host to resolve.

    Returns:
        A dict keyed by IP string. Each value is the decoded ip-api.com JSON
        response, or ``{"error": <message>}`` when the request failed. Empty
        if the hostname did not resolve. Error results are cached only for a
        short time so they are retried soon.
    """
    geo_info = {}

    for ip in extract_ips_from_url(hostname):
        ip_str = str(ip)
        cache_key = f"geoip:{ip_str}"
        cached = cache.read(cache_key)
        if cached:
            logging.info("[CACHE HIT] for GEOIP: %s", ip)
            geo_info[ip_str] = cached
            continue

        logging.info("[CACHE MISS] for GEOIP: %s", ip)
        ttl = NEGATIVE_CACHE_TTL
        try:
            resp = requests.get(
                f"http://ip-api.com/json/{ip_str}?fields=24313855", timeout=5
            )
            if resp.status_code == 200:
                geo_info[ip_str] = resp.json()
                ttl = GEOIP_DEFAULT_TTL
            else:
                geo_info[ip_str] = {"error": f"HTTP {resp.status_code}"}
        except (requests.RequestException, ValueError) as e:
            # ValueError covers a 200 response whose body is not valid JSON.
            geo_info[ip_str] = {"error": str(e)}

        cache.create(cache_key, geo_info[ip_str], ttl)

    return geo_info


def extract_ips_from_url(hostname: str) -> list:
    """Resolve hostname to IPs.

    Returns:
        De-duplicated ``ipaddress`` objects in the order the resolver returned
        them, or an empty list if resolution or parsing fails.
    """
    try:
        info = socket.getaddrinfo(hostname, None)
        # dict.fromkeys de-duplicates while keeping resolver order, unlike a
        # set, whose iteration order is arbitrary.
        return list(dict.fromkeys(ip_address(x[4][0]) for x in info))
    except (OSError, ValueError) as e:
        # OSError covers socket.gaierror; ValueError covers un-encodable
        # hostnames and unparsable addresses.
        logging.warning("Could not resolve %s: %s", hostname, e)
        return []