# app/utils/urltools.py
import re
from urllib.parse import urlparse, urlunparse

import idna
import requests

# Reuse existing decorator (import from wherever you defined it)
from app.utils.settings import singleton_loader

# An RFC 3986 scheme followed by "://", anchored at the start of the input
# (used with .match). A bare `"://" in text` test would also fire on URLs
# embedded in a query string, e.g. "example.com/?next=http://other".
_SCHEME_PREFIX = re.compile(r"[A-Za-z][A-Za-z0-9+.\-]*://")


class URLNormalizer:
    """
    Normalize user input into a fully-qualified URL for analysis.

    Behavior:
        - If no scheme is present, prepend https:// by default.
        - Optional quick HTTPS reachability check with fallback to http://.
        - Converts Unicode hostnames to punycode via IDNA.

    Notes:
        - Keep the first-constructed configuration stable via the singleton factory.
        - Avoids Flask/current_app/threading per your project style.
    """

    def __init__(self, prefer_https: bool = True, fallback_http: bool = False, connect_timeout: float = 2.0):
        """
        Store the normalization policy.

        Args:
            prefer_https: Use https:// when the input carries no scheme.
            fallback_http: With prefer_https, probe the https:// URL first and
                fall back to http:// when the probe fails.
            connect_timeout: Seconds handed to the probe request as its
                ``timeout`` value.
        """
        self.prefer_https = bool(prefer_https)
        self.fallback_http = bool(fallback_http)
        self.connect_timeout = float(connect_timeout)

    def normalize_for_analysis(self, raw_input: str) -> str:
        """
        Convert raw input (URL or domain) into a normalized URL string.

        Args:
            raw_input: A URL or bare domain as typed by the user.

        Returns:
            The URL with a scheme and an ASCII (punycode) hostname.

        Raises:
            ValueError: if input is empty, or if urllib rejects it while
                parsing (e.g. a non-numeric port or a malformed IPv6 literal).
        """
        if raw_input is None:
            raise ValueError("Empty input")
        text = str(raw_input).strip()
        if not text:
            raise ValueError("Empty input")

        # Repair common typos (missing colon)
        lowered = text.lower()
        if lowered.startswith("http//"):
            text = "http://" + text[6:]
        elif lowered.startswith("https//"):
            text = "https://" + text[7:]

        # Respect an existing scheme -- only when it really is a leading scheme.
        if _SCHEME_PREFIX.match(text):
            return self._recompose_with_punycode_host(urlparse(text))

        # No scheme -> build one. The probe runs only when both flags are set.
        scheme = "https" if self.prefer_https else "http"
        if self.prefer_https and self.fallback_http:
            if not self._quick_https_ok("https://" + text):
                scheme = "http"
        return self._recompose_with_punycode_host(urlparse(f"{scheme}://{text}"))

    def _recompose_with_punycode_host(self, parsed):
        """
        Recompose a parsed URL with hostname encoded to ASCII (punycode).

        Preserves userinfo, port, path, params, query, fragment.

        Args:
            parsed: Result of ``urllib.parse.urlparse``.

        Returns:
            The recomposed URL string. When the URL has no hostname it is
            returned via ``urlunparse`` without modification.

        Raises:
            ValueError: propagated from ``parsed.port`` when the port is not
                a valid integer in range.
        """
        host = parsed.hostname
        if host is None:
            return urlunparse(parsed)

        if ":" in host:
            # `hostname` strips the brackets from IP literals; a colon can only
            # appear in such a literal, so restore the brackets and skip IDNA.
            ascii_host = f"[{host}]"
        else:
            try:
                ascii_host = idna.encode(host).decode("ascii")
            except Exception:
                # Deliberate best-effort: a host IDNA rejects (underscores,
                # empty labels, ...) is passed through unchanged.
                ascii_host = host

        # rebuild netloc (auth + port)
        netloc = ascii_host
        port = parsed.port
        if port is not None:
            netloc = f"{netloc}:{port}"

        # Copy userinfo verbatim from the original netloc so that forms such
        # as "user:@host" (explicit empty password) survive the round trip.
        userinfo = parsed.netloc.rpartition("@")[0]
        if userinfo:
            netloc = f"{userinfo}@{netloc}"

        return urlunparse((
            parsed.scheme,
            netloc,
            parsed.path or "",
            parsed.params or "",
            parsed.query or "",
            parsed.fragment or "",
        ))

    def _quick_https_ok(self, https_url: str) -> bool:
        """
        Quick reachability check for https:// using a HEAD request.

        Redirects allowed; TLS verify disabled -- posture-only.

        Args:
            https_url: The https:// URL to probe.

        Returns:
            True when any HTTP response is received (status code is not
            inspected), False when the request raises.
        """
        # NOTE(review): this issues a request to a user-supplied host; confirm
        # callers only enable fallback_http where that is acceptable.
        try:
            resp = requests.head(
                https_url,
                allow_redirects=True,
                timeout=self.connect_timeout,
                verify=False,
            )
        except Exception:
            # Deliberate best-effort: any failure means "not reachable".
            return False
        # Release the underlying connection; only reachability matters here.
        resp.close()
        return True


# ---- Singleton factory using our decorator ----
@singleton_loader
def get_url_normalizer(
    prefer_https: bool = True,
    fallback_http: bool = False,
    connect_timeout: float = 2.0,
) -> URLNormalizer:
    """
    Return the singleton URLNormalizer instance.

    IMPORTANT: With this decorator, the FIRST call's arguments "win".
    Later calls return the cached instance and ignore new arguments.
    """
    return URLNormalizer(
        prefer_https=prefer_https,
        fallback_http=fallback_http,
        connect_timeout=connect_timeout,
    )