refactor of browser.py into object model
This commit is contained in:
129
app/utils/enrichment.py
Normal file
129
app/utils/enrichment.py
Normal file
@@ -0,0 +1,129 @@
|
||||
import logging
|
||||
from pathlib import Path
|
||||
from urllib.parse import urlparse
|
||||
import requests
|
||||
import yaml
|
||||
import whois
|
||||
from datetime import datetime
|
||||
from ipaddress import ip_address
|
||||
import socket
|
||||
|
||||
# Local imports
|
||||
from .utils.cache_db import get_cache
|
||||
from .utils.settings import get_settings
|
||||
|
||||
# Configure logging
# Runs at import time, so importing this module configures the root logger.
logging.basicConfig(level=logging.INFO, format="[%(levelname)s] %(message)s")

# Init cache
# Module-level cache handle and settings object shared by the functions below.
cache = get_cache("/data/cache.db")
settings = get_settings()

# Minutes in one day (24 hours * 60 minutes).
# NOTE: despite its name, this is a per-day multiplier, not a day count.
days = 24 * 60

# Cache TTLs: the configured number of days multiplied by `days`.
# NOTE(review): this yields minutes, which presumably matches the unit the
# cache's create() expects -- confirm against the cache implementation.
GEOIP_DEFAULT_TTL = settings.cache.geoip_cache_days * days
WHOIS_DEFAULT_TTL = settings.cache.whois_cache_days * days
|
||||
|
||||
def _hostname_from_url(url: str) -> str:
    """Return the host part of *url*, tolerating a missing scheme."""
    hostname = urlparse(url).hostname
    if hostname:
        return hostname

    # Without a scheme ("example.com/login", "example.com:8080"), urlparse
    # does not populate the network location, so hostname is None. Re-parse
    # the input as a "//host/path" reference to recover the host instead of
    # passing the whole string (path included) on as a hostname.
    if "://" not in url:
        try:
            hostname = urlparse(f"//{url}").hostname
        except ValueError:
            # e.g. unbalanced "[" is rejected as an invalid IPv6 netloc.
            hostname = None

    # Last resort: keep the original behaviour of using the raw input.
    return hostname or url


def enrich_url(url: str) -> dict:
    """Perform WHOIS, GeoIP, and BEC word enrichment.

    Args:
        url: The URL (or bare host) to enrich.

    Returns:
        A dict containing whatever keys ``enrich_whois`` returns, merged at
        the top level, plus ``"geoip"`` (the result of ``enrich_geoip``) and
        ``"bec_words"`` (the entries of ``BEC_WORDS`` that occur in *url*,
        compared case-insensitively).
    """
    result = {}

    # Extract hostname
    hostname = _hostname_from_url(url)

    # --- WHOIS ---
    result.update(enrich_whois(hostname))

    # --- GeoIP ---
    result["geoip"] = enrich_geoip(hostname)

    # --- BEC Words ---
    # NOTE(review): BEC_WORDS is not defined or imported in the visible part
    # of this module -- confirm it is provided, otherwise this raises
    # NameError at call time.
    url_lower = url.lower()  # lower-case once rather than once per word
    result["bec_words"] = [w for w in BEC_WORDS if w.lower() in url_lower]

    return result
|
||||
|
||||
|
||||
# Upper bound for the raw `whois` command-line fallback so a stalled lookup
# cannot block enrichment indefinitely.
_RAW_WHOIS_TIMEOUT_SECONDS = 30


def _format_whois_value(val) -> str:
    """Render a WHOIS field (scalar, datetime, list, or None) as a string."""
    if isinstance(val, list):
        return ", ".join(
            v.strftime("%Y-%m-%d %H:%M:%S") if isinstance(v, datetime) else str(v)
            for v in val
        )
    if isinstance(val, datetime):
        return val.strftime("%Y-%m-%d %H:%M:%S")
    if val is None:
        # A missing field is reported as likely privacy redaction.
        return "Possible Privacy"
    return str(val)


def enrich_whois(hostname: str) -> dict:
    """Fetch WHOIS info using python-whois with safe type handling.

    A cached result is returned when available. Otherwise the lookup goes
    through ``whois.whois``; if that raises, the ``whois`` command-line tool
    is tried and its raw text returned instead.

    Args:
        hostname: Host name to look up.

    Returns:
        ``{"whois": {...}}`` with ``registrar``, ``creation_date``,
        ``expiration_date`` and ``owner`` strings on success. When the
        library lookup fails, ``{"whois": {}, "raw_whois": <text>}``, where
        the text is the command-line output or ``"N/A"`` if that failed too.
    """
    cache_key = f"whois:{hostname}"
    cached = cache.read(cache_key)
    if cached:
        logging.info("[CACHE HIT] for WHOIS: %s", hostname)
        return cached

    logging.info("[CACHE MISS] for WHOIS: %s", hostname)
    result = {}
    try:
        w = whois.whois(hostname)
        result["whois"] = {
            "registrar": _format_whois_value(getattr(w, "registrar", None)),
            "creation_date": _format_whois_value(getattr(w, "creation_date", None)),
            "expiration_date": _format_whois_value(getattr(w, "expiration_date", None)),
            "owner": _format_whois_value(getattr(w, "org", None)),
        }
    except Exception as e:
        logging.warning("WHOIS lookup failed for %s: %s", hostname, e)
        try:
            # fallback raw whois text
            import subprocess

            # The hostname comes from a URL under analysis; never let it be
            # interpreted as a command-line option by the whois binary.
            if hostname.startswith("-"):
                raise ValueError(f"refusing option-like hostname: {hostname!r}")

            raw_output = subprocess.check_output(
                ["whois", hostname],
                encoding="utf-8",
                errors="ignore",
                timeout=_RAW_WHOIS_TIMEOUT_SECONDS,
            )
            result["whois"] = {}
            result["raw_whois"] = raw_output
        except Exception as raw_e:
            logging.error("Raw WHOIS also failed: %s", raw_e)
            result["whois"] = {}
            result["raw_whois"] = "N/A"

    cache.create(cache_key, result, WHOIS_DEFAULT_TTL)
    return result
|
||||
|
||||
|
||||
def enrich_geoip(hostname: str) -> dict:
    """Resolve hostname to IPs and fetch info from ip-api.com.

    Each resolved address is looked up in the cache first; on a miss the
    ip-api.com JSON endpoint is queried. Only successful responses are
    cached, so a transient failure (network error, non-200 status) is
    retried on the next call rather than persisting for the whole GeoIP TTL.

    Args:
        hostname: Host name (or IP literal) to resolve.

    Returns:
        A dict mapping each resolved IP (as a string) to either the decoded
        JSON response or ``{"error": <message>}`` when the lookup failed.
        Empty when the hostname does not resolve.
    """
    geo_info = {}
    for ip in extract_ips_from_url(hostname):
        ip_str = str(ip)
        cache_key = f"geoip:{ip_str}"

        cached = cache.read(cache_key)
        if cached:
            logging.info("[CACHE HIT] for GEOIP: %s", ip)
            geo_info[ip_str] = cached
            continue

        logging.info("[CACHE MISS] for GEOIP: %s", ip)
        succeeded = False
        try:
            # `fields` is a numeric selector understood by ip-api.com.
            resp = requests.get(
                f"http://ip-api.com/json/{ip_str}?fields=24313855", timeout=5
            )
            if resp.status_code == 200:
                lookup = resp.json()
                succeeded = True
            else:
                lookup = {"error": f"HTTP {resp.status_code}"}
        except Exception as e:
            # Best effort: report the failure for this IP and keep going.
            lookup = {"error": str(e)}

        geo_info[ip_str] = lookup
        if succeeded:
            # Do not cache error placeholders; they would mask the real
            # data until the entry expires.
            cache.create(cache_key, lookup, GEOIP_DEFAULT_TTL)

    return geo_info
|
||||
|
||||
|
||||
def extract_ips_from_url(hostname: str) -> list:
    """Resolve hostname to IPs.

    Args:
        hostname: Host name or IP literal to resolve.

    Returns:
        The distinct addresses returned by ``socket.getaddrinfo`` as
        ``ipaddress`` objects, in the order the resolver reported them.
        An empty list when resolution or address parsing fails.
    """
    try:
        info = socket.getaddrinfo(hostname, None)
        # Each entry is (family, type, proto, canonname, sockaddr); the
        # address string is the first element of sockaddr.
        addresses = [ip_address(entry[4][0]) for entry in info]
    except Exception:
        # Best effort: an unresolvable or malformed host yields no IPs.
        return []

    # getaddrinfo repeats each address once per socket type. Deduplicate
    # while keeping resolver order so the result is stable between runs
    # (iterating a set would not be).
    return list(dict.fromkeys(addresses))
|
||||
Reference in New Issue
Block a user