restructure of dirs, huge docs update

Committed 2025-11-17 16:29:14 -06:00
parent 456e052389
commit cd840cb8ca
87 changed files with 2827 additions and 1094 deletions

View File: web/services/__init__.py

@@ -0,0 +1,10 @@
"""
Services package for SneakyScanner web application.
This package contains business logic layer services that orchestrate
operations between API endpoints and database models.
"""
from web.services.scan_service import ScanService
__all__ = ['ScanService']

View File: web/services/config_service.py

@@ -0,0 +1,552 @@
"""
Config Service - Business logic for config file management
This service handles all operations related to scan configuration files,
including creation, validation, listing, and deletion.
"""
import ipaddress
import logging
import os
import re
import yaml
from datetime import datetime, timezone
from typing import Any, Dict, List, Optional, Tuple
from werkzeug.utils import secure_filename
logger = logging.getLogger(__name__)
class ConfigService:
"""Business logic for config management"""
def __init__(self, configs_dir: str = '/app/configs'):
"""
Initialize the config service.
Args:
configs_dir: Directory where config files are stored
"""
self.configs_dir = configs_dir
# Ensure configs directory exists
os.makedirs(self.configs_dir, exist_ok=True)
def list_configs(self) -> List[Dict[str, Any]]:
"""
List all config files with metadata.
Returns:
List of config metadata dictionaries:
[
{
"filename": "prod-scan.yaml",
"title": "Prod Scan",
"path": "/app/configs/prod-scan.yaml",
"created_at": "2025-11-15T10:30:00Z",
"size_bytes": 1234,
"used_by_schedules": ["Daily Scan", "Weekly Audit"]
}
]
"""
configs = []
# Get all YAML files in configs directory
if not os.path.exists(self.configs_dir):
return configs
for filename in os.listdir(self.configs_dir):
if not filename.endswith(('.yaml', '.yml')):
continue
filepath = os.path.join(self.configs_dir, filename)
if not os.path.isfile(filepath):
continue
try:
                # File metadata; st_mtime (last modification) serves as a proxy for creation time
                stat_info = os.stat(filepath)
                created_at = datetime.fromtimestamp(stat_info.st_mtime, tz=timezone.utc).isoformat().replace('+00:00', 'Z')
size_bytes = stat_info.st_size
                # Parse YAML to get title (fall back to the filename)
                title = filename
try:
with open(filepath, 'r') as f:
data = yaml.safe_load(f)
if isinstance(data, dict):
title = data.get('title', filename)
except Exception:
title = filename # Fallback to filename if parsing fails
# Get schedules using this config
used_by_schedules = self.get_schedules_using_config(filename)
configs.append({
'filename': filename,
'title': title,
'path': filepath,
'created_at': created_at,
'size_bytes': size_bytes,
'used_by_schedules': used_by_schedules
})
            except Exception:
# Skip files that can't be read
continue
# Sort by created_at (most recent first)
configs.sort(key=lambda x: x['created_at'], reverse=True)
return configs
def get_config(self, filename: str) -> Dict[str, Any]:
"""
Get config file content and parsed data.
Args:
filename: Config filename
Returns:
{
"filename": "prod-scan.yaml",
"content": "title: Prod Scan\n...",
"parsed": {"title": "Prod Scan", "sites": [...]}
}
Raises:
FileNotFoundError: If config doesn't exist
ValueError: If config content is invalid
"""
        # Reject path components to prevent directory traversal
        if os.path.basename(filename) != filename:
            raise ValueError(f"Invalid config filename: '{filename}'")
        filepath = os.path.join(self.configs_dir, filename)
if not os.path.exists(filepath):
raise FileNotFoundError(f"Config file '{filename}' not found")
# Read file content
with open(filepath, 'r') as f:
content = f.read()
# Parse YAML
try:
parsed = yaml.safe_load(content)
except yaml.YAMLError as e:
raise ValueError(f"Invalid YAML syntax: {str(e)}")
return {
'filename': filename,
'content': content,
'parsed': parsed
}
def create_from_yaml(self, filename: str, content: str) -> str:
"""
Create config from YAML content.
Args:
filename: Desired filename (will be sanitized)
content: YAML content string
Returns:
Final filename (sanitized)
Raises:
ValueError: If content invalid or filename conflict
"""
        # Sanitize filename (secure_filename() may return an empty string)
        filename = secure_filename(filename)
        if not filename:
            raise ValueError("Filename is empty after sanitization")
        # Ensure .yaml extension
        if not filename.endswith(('.yaml', '.yml')):
            filename += '.yaml'
filepath = os.path.join(self.configs_dir, filename)
# Check for conflicts
if os.path.exists(filepath):
raise ValueError(f"Config file '{filename}' already exists")
# Parse and validate YAML
try:
parsed = yaml.safe_load(content)
except yaml.YAMLError as e:
raise ValueError(f"Invalid YAML syntax: {str(e)}")
# Validate config structure
is_valid, error_msg = self.validate_config_content(parsed)
if not is_valid:
raise ValueError(f"Invalid config structure: {error_msg}")
# Write file
with open(filepath, 'w') as f:
f.write(content)
return filename
def create_from_cidr(
self,
title: str,
cidr: str,
site_name: Optional[str] = None,
ping_default: bool = False
) -> Tuple[str, str]:
"""
Create config from CIDR range.
Args:
title: Scan configuration title
cidr: CIDR range (e.g., "10.0.0.0/24")
site_name: Optional site name (defaults to "Site 1")
ping_default: Default ping expectation for all IPs
Returns:
Tuple of (final_filename, yaml_content)
Raises:
ValueError: If CIDR invalid or other validation errors
"""
# Validate and parse CIDR
try:
network = ipaddress.ip_network(cidr, strict=False)
except ValueError as e:
raise ValueError(f"Invalid CIDR range: {str(e)}")
# Check if network is too large (prevent expansion of huge ranges)
if network.num_addresses > 10000:
raise ValueError(f"CIDR range too large: {network.num_addresses} addresses. Maximum is 10,000.")
# Expand CIDR to list of IP addresses
ip_list = [str(ip) for ip in network.hosts()]
# If network has only 1 address (like /32 or /128), hosts() returns empty
# In that case, use the network address itself
if not ip_list:
ip_list = [str(network.network_address)]
# Build site name
if not site_name or not site_name.strip():
site_name = "Site 1"
# Build IP configurations
ips = []
for ip_address in ip_list:
ips.append({
'address': ip_address,
'expected': {
'ping': ping_default,
'tcp_ports': [],
'udp_ports': []
}
})
# Build YAML structure
config_data = {
'title': title.strip(),
'sites': [
{
'name': site_name.strip(),
'ips': ips
}
]
}
# Convert to YAML string
yaml_content = yaml.dump(config_data, sort_keys=False, default_flow_style=False)
# Generate filename from title
filename = self.generate_filename_from_title(title)
filepath = os.path.join(self.configs_dir, filename)
# Check for conflicts
if os.path.exists(filepath):
raise ValueError(f"Config file '{filename}' already exists")
# Write file
with open(filepath, 'w') as f:
f.write(yaml_content)
return filename, yaml_content
def update_config(self, filename: str, yaml_content: str) -> None:
"""
Update existing config file with new YAML content.
Args:
filename: Config filename to update
yaml_content: New YAML content string
Raises:
FileNotFoundError: If config doesn't exist
ValueError: If YAML content is invalid
"""
        # Reject path components to prevent directory traversal
        if os.path.basename(filename) != filename:
            raise ValueError(f"Invalid config filename: '{filename}'")
        filepath = os.path.join(self.configs_dir, filename)
# Check if file exists
if not os.path.exists(filepath):
raise FileNotFoundError(f"Config file '{filename}' not found")
# Parse and validate YAML
try:
parsed = yaml.safe_load(yaml_content)
except yaml.YAMLError as e:
raise ValueError(f"Invalid YAML syntax: {str(e)}")
# Validate config structure
is_valid, error_msg = self.validate_config_content(parsed)
if not is_valid:
raise ValueError(f"Invalid config structure: {error_msg}")
# Write updated content
with open(filepath, 'w') as f:
f.write(yaml_content)
def delete_config(self, filename: str) -> None:
"""
Delete config file and cascade delete any associated schedules.
When a config is deleted, all schedules using that config (both enabled
and disabled) are automatically deleted as well, since they would be
invalid without the config file.
Args:
filename: Config filename to delete
Raises:
FileNotFoundError: If config doesn't exist
"""
        # Reject path components to prevent directory traversal
        if os.path.basename(filename) != filename:
            raise ValueError(f"Invalid config filename: '{filename}'")
        filepath = os.path.join(self.configs_dir, filename)
if not os.path.exists(filepath):
raise FileNotFoundError(f"Config file '{filename}' not found")
# Delete any schedules using this config (both enabled and disabled)
try:
from web.services.schedule_service import ScheduleService
from flask import current_app
# Get database session from Flask app
db = current_app.db_session
# Get all schedules
schedule_service = ScheduleService(db)
result = schedule_service.list_schedules(page=1, per_page=10000)
schedules = result.get('schedules', [])
# Build full path for comparison
config_path = os.path.join(self.configs_dir, filename)
# Find and delete all schedules using this config (enabled or disabled)
deleted_schedules = []
for schedule in schedules:
schedule_config = schedule.get('config_file', '')
# Handle both absolute paths and just filenames
if schedule_config == filename or schedule_config == config_path:
schedule_id = schedule.get('id')
schedule_name = schedule.get('name', 'Unknown')
try:
schedule_service.delete_schedule(schedule_id)
deleted_schedules.append(schedule_name)
except Exception as e:
                            logger.warning(
                                f"Failed to delete schedule {schedule_id} ('{schedule_name}'): {e}"
                            )
if deleted_schedules:
                logger.info(
                    f"Cascade deleted {len(deleted_schedules)} schedule(s) associated with config '{filename}': {', '.join(deleted_schedules)}"
                )
except ImportError:
# If ScheduleService doesn't exist yet, skip schedule deletion
pass
except Exception as e:
# Log error but continue with config deletion
            logger.error(
                f"Error deleting schedules for config {filename}: {e}", exc_info=True
            )
# Delete file
os.remove(filepath)
def validate_config_content(self, content: Dict) -> Tuple[bool, str]:
"""
Validate parsed YAML config structure.
Args:
content: Parsed YAML config as dict
Returns:
Tuple of (is_valid, error_message)
"""
if not isinstance(content, dict):
return False, "Config must be a dictionary/object"
# Check required fields
if 'title' not in content:
return False, "Missing required field: 'title'"
if 'sites' not in content:
return False, "Missing required field: 'sites'"
# Validate title
if not isinstance(content['title'], str) or not content['title'].strip():
return False, "Field 'title' must be a non-empty string"
# Validate sites
sites = content['sites']
if not isinstance(sites, list):
return False, "Field 'sites' must be a list"
if len(sites) == 0:
return False, "Must have at least one site defined"
# Validate each site
for i, site in enumerate(sites):
if not isinstance(site, dict):
return False, f"Site {i+1} must be a dictionary/object"
if 'name' not in site:
return False, f"Site {i+1} missing required field: 'name'"
if 'ips' not in site:
return False, f"Site {i+1} missing required field: 'ips'"
if not isinstance(site['ips'], list):
return False, f"Site {i+1} field 'ips' must be a list"
if len(site['ips']) == 0:
return False, f"Site {i+1} must have at least one IP"
# Validate each IP
for j, ip_config in enumerate(site['ips']):
if not isinstance(ip_config, dict):
return False, f"Site {i+1} IP {j+1} must be a dictionary/object"
if 'address' not in ip_config:
return False, f"Site {i+1} IP {j+1} missing required field: 'address'"
if 'expected' not in ip_config:
return False, f"Site {i+1} IP {j+1} missing required field: 'expected'"
if not isinstance(ip_config['expected'], dict):
return False, f"Site {i+1} IP {j+1} field 'expected' must be a dictionary/object"
return True, ""
def get_schedules_using_config(self, filename: str) -> List[str]:
"""
        Get names of enabled schedules that use this config.
Args:
filename: Config filename
Returns:
List of schedule names (e.g., ["Daily Scan", "Weekly Audit"])
"""
# Import here to avoid circular dependency
try:
from web.services.schedule_service import ScheduleService
from flask import current_app
# Get database session from Flask app
db = current_app.db_session
# Get all schedules (use large per_page to get all)
schedule_service = ScheduleService(db)
result = schedule_service.list_schedules(page=1, per_page=10000)
# Extract schedules list from paginated result
schedules = result.get('schedules', [])
# Build full path for comparison
config_path = os.path.join(self.configs_dir, filename)
# Find schedules using this config (only enabled schedules)
using_schedules = []
for schedule in schedules:
schedule_config = schedule.get('config_file', '')
# Handle both absolute paths and just filenames
if schedule_config == filename or schedule_config == config_path:
# Only count enabled schedules
if schedule.get('enabled', False):
using_schedules.append(schedule.get('name', 'Unknown'))
return using_schedules
except ImportError:
# If ScheduleService doesn't exist yet, return empty list
return []
except Exception as e:
# If any error occurs, return empty list (safer than failing)
# Log the error for debugging
            logger.error(f"Error getting schedules using config {filename}: {e}", exc_info=True)
return []
def generate_filename_from_title(self, title: str) -> str:
"""
Generate safe filename from scan title.
Args:
title: Scan title string
Returns:
Safe filename (e.g., "Prod Scan 2025" -> "prod-scan-2025.yaml")
"""
# Convert to lowercase
filename = title.lower()
# Replace spaces with hyphens
filename = filename.replace(' ', '-')
# Remove special characters (keep only alphanumeric, hyphens, underscores)
filename = re.sub(r'[^a-z0-9\-_]', '', filename)
# Remove consecutive hyphens
filename = re.sub(r'-+', '-', filename)
# Remove leading/trailing hyphens
filename = filename.strip('-')
# Limit length (max 200 chars, reserve 5 for .yaml)
max_length = 195
if len(filename) > max_length:
filename = filename[:max_length]
# Ensure not empty
if not filename:
filename = 'config'
# Add .yaml extension
filename += '.yaml'
return filename
def get_config_path(self, filename: str) -> str:
"""
Get absolute path for a config file.
Args:
filename: Config filename
Returns:
Absolute path to config file
"""
return os.path.join(self.configs_dir, filename)
def config_exists(self, filename: str) -> bool:
"""
Check if a config file exists.
Args:
filename: Config filename
Returns:
True if file exists, False otherwise
"""
filepath = os.path.join(self.configs_dir, filename)
return os.path.exists(filepath) and os.path.isfile(filepath)
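
The CIDR expansion in create_from_cidr() is easy to exercise on its own. Below is a minimal standalone sketch of the same logic, assuming only the standard library and PyYAML; the service additionally derives a slug filename from the title, checks for conflicts, and writes the result into the configs directory.

# Standalone sketch of the expansion create_from_cidr() performs
# (requires PyYAML: pip install pyyaml).
import ipaddress
import yaml

network = ipaddress.ip_network("10.0.0.0/30", strict=False)

# hosts() excludes the network and broadcast addresses; for a /32 (or /128)
# it yields nothing, so fall back to the network address itself.
ip_list = [str(ip) for ip in network.hosts()] or [str(network.network_address)]

config_data = {
    'title': 'Prod Scan',
    'sites': [{
        'name': 'Site 1',
        'ips': [
            {'address': ip, 'expected': {'ping': False, 'tcp_ports': [], 'udp_ports': []}}
            for ip in ip_list
        ],
    }],
}

print(yaml.dump(config_data, sort_keys=False, default_flow_style=False))
# title: Prod Scan
# sites:
# - name: Site 1
#   ips:
#   - address: 10.0.0.1
#     expected:
#       ping: false
#       tcp_ports: []
#       udp_ports: []
#   - address: 10.0.0.2
#     ...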

File diff suppressed because it is too large

View File: web/services/schedule_service.py

@@ -0,0 +1,483 @@
"""
Schedule service for managing scheduled scan operations.
This service handles the business logic for creating, updating, and managing
scheduled scans with cron expressions.
"""
import logging
import os
from datetime import datetime
from typing import Any, Dict, List, Optional, Tuple
from croniter import croniter
from sqlalchemy.orm import Session
from web.models import Schedule, Scan
from web.utils.pagination import paginate
logger = logging.getLogger(__name__)
class ScheduleService:
"""
Service for managing scheduled scans.
Handles schedule lifecycle: creation, validation, updating,
and cron expression processing.
"""
def __init__(self, db_session: Session):
"""
Initialize schedule service.
Args:
db_session: SQLAlchemy database session
"""
self.db = db_session
def create_schedule(
self,
name: str,
config_file: str,
cron_expression: str,
enabled: bool = True
) -> int:
"""
Create a new schedule.
Args:
name: Human-readable schedule name
config_file: Path to YAML configuration file
cron_expression: Cron expression (e.g., '0 2 * * *')
enabled: Whether schedule is active
Returns:
Schedule ID of the created schedule
Raises:
ValueError: If cron expression is invalid or config file doesn't exist
"""
# Validate cron expression
is_valid, error_msg = self.validate_cron_expression(cron_expression)
if not is_valid:
raise ValueError(f"Invalid cron expression: {error_msg}")
# Validate config file exists
# If config_file is just a filename, prepend the configs directory
if not config_file.startswith('/'):
config_file_path = os.path.join('/app/configs', config_file)
else:
config_file_path = config_file
if not os.path.isfile(config_file_path):
raise ValueError(f"Config file not found: {config_file}")
# Calculate next run time
next_run = self.calculate_next_run(cron_expression) if enabled else None
# Create schedule record
schedule = Schedule(
name=name,
config_file=config_file,
cron_expression=cron_expression,
enabled=enabled,
last_run=None,
next_run=next_run,
created_at=datetime.utcnow(),
updated_at=datetime.utcnow()
)
self.db.add(schedule)
self.db.commit()
self.db.refresh(schedule)
logger.info(f"Schedule {schedule.id} created: '{name}' with cron '{cron_expression}'")
return schedule.id
def get_schedule(self, schedule_id: int) -> Dict[str, Any]:
"""
Get schedule details by ID.
Args:
schedule_id: Schedule ID
Returns:
Schedule dictionary with details and execution history
Raises:
ValueError: If schedule not found
"""
schedule = self.db.query(Schedule).filter(Schedule.id == schedule_id).first()
if not schedule:
raise ValueError(f"Schedule {schedule_id} not found")
# Convert to dict and include history
schedule_dict = self._schedule_to_dict(schedule)
schedule_dict['history'] = self.get_schedule_history(schedule_id, limit=10)
return schedule_dict
def list_schedules(
self,
page: int = 1,
per_page: int = 20,
enabled_filter: Optional[bool] = None
) -> Dict[str, Any]:
"""
List all schedules with pagination and filtering.
Args:
page: Page number (1-indexed)
per_page: Items per page
enabled_filter: Filter by enabled status (None = all)
Returns:
Dictionary with paginated schedules:
{
'schedules': [...],
'total': int,
'page': int,
'per_page': int,
'pages': int
}
"""
# Build query
query = self.db.query(Schedule)
# Apply filter
if enabled_filter is not None:
query = query.filter(Schedule.enabled == enabled_filter)
# Order by next_run (nulls last), then by name
query = query.order_by(Schedule.next_run.is_(None), Schedule.next_run, Schedule.name)
# Paginate
result = paginate(query, page=page, per_page=per_page)
# Convert schedules to dicts
schedules = [self._schedule_to_dict(s) for s in result.items]
return {
'schedules': schedules,
'total': result.total,
'page': result.page,
'per_page': result.per_page,
'pages': result.pages
}
def update_schedule(
self,
schedule_id: int,
**updates: Any
) -> Dict[str, Any]:
"""
Update schedule fields.
Args:
schedule_id: Schedule ID
**updates: Fields to update (name, config_file, cron_expression, enabled)
Returns:
Updated schedule dictionary
Raises:
ValueError: If schedule not found or invalid updates
"""
schedule = self.db.query(Schedule).filter(Schedule.id == schedule_id).first()
if not schedule:
raise ValueError(f"Schedule {schedule_id} not found")
# Validate cron expression if being updated
if 'cron_expression' in updates:
is_valid, error_msg = self.validate_cron_expression(updates['cron_expression'])
if not is_valid:
raise ValueError(f"Invalid cron expression: {error_msg}")
# Recalculate next_run
if schedule.enabled or updates.get('enabled', False):
updates['next_run'] = self.calculate_next_run(updates['cron_expression'])
# Validate config file if being updated
if 'config_file' in updates:
config_file = updates['config_file']
# If config_file is just a filename, prepend the configs directory
if not config_file.startswith('/'):
config_file_path = os.path.join('/app/configs', config_file)
else:
config_file_path = config_file
if not os.path.isfile(config_file_path):
raise ValueError(f"Config file not found: {updates['config_file']}")
# Handle enabled toggle
if 'enabled' in updates:
if updates['enabled'] and not schedule.enabled:
# Being enabled - calculate next_run
cron_expr = updates.get('cron_expression', schedule.cron_expression)
updates['next_run'] = self.calculate_next_run(cron_expr)
elif not updates['enabled'] and schedule.enabled:
# Being disabled - clear next_run
updates['next_run'] = None
# Update fields
for key, value in updates.items():
if hasattr(schedule, key):
setattr(schedule, key, value)
schedule.updated_at = datetime.utcnow()
self.db.commit()
self.db.refresh(schedule)
logger.info(f"Schedule {schedule_id} updated: {list(updates.keys())}")
return self._schedule_to_dict(schedule)
def delete_schedule(self, schedule_id: int) -> bool:
"""
Delete a schedule.
Note: Associated scans are NOT deleted (schedule_id becomes null).
Args:
schedule_id: Schedule ID
Returns:
True if deleted successfully
Raises:
ValueError: If schedule not found
"""
schedule = self.db.query(Schedule).filter(Schedule.id == schedule_id).first()
if not schedule:
raise ValueError(f"Schedule {schedule_id} not found")
schedule_name = schedule.name
self.db.delete(schedule)
self.db.commit()
logger.info(f"Schedule {schedule_id} ('{schedule_name}') deleted")
return True
def toggle_enabled(self, schedule_id: int, enabled: bool) -> Dict[str, Any]:
"""
Enable or disable a schedule.
Args:
schedule_id: Schedule ID
enabled: New enabled status
Returns:
Updated schedule dictionary
Raises:
ValueError: If schedule not found
"""
return self.update_schedule(schedule_id, enabled=enabled)
def update_run_times(
self,
schedule_id: int,
last_run: datetime,
next_run: datetime
) -> bool:
"""
Update last_run and next_run timestamps.
Called after each execution.
Args:
schedule_id: Schedule ID
last_run: Last execution time
next_run: Next scheduled execution time
Returns:
True if updated successfully
Raises:
ValueError: If schedule not found
"""
schedule = self.db.query(Schedule).filter(Schedule.id == schedule_id).first()
if not schedule:
raise ValueError(f"Schedule {schedule_id} not found")
schedule.last_run = last_run
schedule.next_run = next_run
schedule.updated_at = datetime.utcnow()
self.db.commit()
logger.debug(f"Schedule {schedule_id} run times updated: last={last_run}, next={next_run}")
return True
def validate_cron_expression(self, cron_expr: str) -> Tuple[bool, Optional[str]]:
"""
Validate a cron expression.
Args:
cron_expr: Cron expression to validate
Returns:
Tuple of (is_valid, error_message)
- (True, None) if valid
- (False, error_message) if invalid
"""
try:
# Try to create a croniter instance
base_time = datetime.utcnow()
cron = croniter(cron_expr, base_time)
# Try to get the next run time (validates the expression)
cron.get_next(datetime)
return (True, None)
except (ValueError, KeyError) as e:
return (False, str(e))
except Exception as e:
return (False, f"Unexpected error: {str(e)}")
def calculate_next_run(
self,
cron_expr: str,
from_time: Optional[datetime] = None
) -> datetime:
"""
Calculate next run time from cron expression.
Args:
cron_expr: Cron expression
from_time: Base time (defaults to now UTC)
Returns:
Next run datetime (UTC)
Raises:
ValueError: If cron expression is invalid
"""
if from_time is None:
from_time = datetime.utcnow()
try:
cron = croniter(cron_expr, from_time)
return cron.get_next(datetime)
except Exception as e:
raise ValueError(f"Invalid cron expression '{cron_expr}': {str(e)}")
def get_schedule_history(
self,
schedule_id: int,
limit: int = 10
) -> List[Dict[str, Any]]:
"""
Get recent scans triggered by this schedule.
Args:
schedule_id: Schedule ID
limit: Maximum number of scans to return
Returns:
List of scan dictionaries (recent first)
"""
scans = (
self.db.query(Scan)
.filter(Scan.schedule_id == schedule_id)
.order_by(Scan.timestamp.desc())
.limit(limit)
.all()
)
return [
{
'id': scan.id,
'timestamp': scan.timestamp.isoformat() if scan.timestamp else None,
'status': scan.status,
'title': scan.title,
'config_file': scan.config_file
}
for scan in scans
]
def _schedule_to_dict(self, schedule: Schedule) -> Dict[str, Any]:
"""
Convert Schedule model to dictionary.
Args:
schedule: Schedule model instance
Returns:
Dictionary representation
"""
return {
'id': schedule.id,
'name': schedule.name,
'config_file': schedule.config_file,
'cron_expression': schedule.cron_expression,
'enabled': schedule.enabled,
'last_run': schedule.last_run.isoformat() if schedule.last_run else None,
'next_run': schedule.next_run.isoformat() if schedule.next_run else None,
'next_run_relative': self._get_relative_time(schedule.next_run) if schedule.next_run else None,
'created_at': schedule.created_at.isoformat() if schedule.created_at else None,
'updated_at': schedule.updated_at.isoformat() if schedule.updated_at else None
}
def _get_relative_time(self, dt: Optional[datetime]) -> Optional[str]:
"""
Format datetime as relative time.
Args:
dt: Datetime to format (UTC)
Returns:
            Human-readable relative time (e.g., "in 2 hours", "3 days ago")
"""
if dt is None:
return None
now = datetime.utcnow()
diff = dt - now
# Future times
if diff.total_seconds() > 0:
seconds = int(diff.total_seconds())
if seconds < 60:
return "in less than a minute"
elif seconds < 3600:
minutes = seconds // 60
return f"in {minutes} minute{'s' if minutes != 1 else ''}"
elif seconds < 86400:
hours = seconds // 3600
return f"in {hours} hour{'s' if hours != 1 else ''}"
elif seconds < 604800:
days = seconds // 86400
return f"in {days} day{'s' if days != 1 else ''}"
else:
weeks = seconds // 604800
return f"in {weeks} week{'s' if weeks != 1 else ''}"
# Past times
else:
seconds = int(-diff.total_seconds())
if seconds < 60:
return "less than a minute ago"
elif seconds < 3600:
minutes = seconds // 60
return f"{minutes} minute{'s' if minutes != 1 else ''} ago"
elif seconds < 86400:
hours = seconds // 3600
return f"{hours} hour{'s' if hours != 1 else ''} ago"
elif seconds < 604800:
days = seconds // 86400
return f"{days} day{'s' if days != 1 else ''} ago"
else:
weeks = seconds // 604800
return f"{weeks} week{'s' if weeks != 1 else ''} ago"

View File: web/services/scheduler_service.py

@@ -0,0 +1,356 @@
"""
Scheduler service for managing background jobs and scheduled scans.
This service integrates APScheduler with Flask to enable background
scan execution and future scheduled scanning capabilities.
"""
import logging
from datetime import datetime
from typing import Optional
from apscheduler.schedulers.background import BackgroundScheduler
from apscheduler.executors.pool import ThreadPoolExecutor
from flask import Flask
from web.jobs.scan_job import execute_scan
logger = logging.getLogger(__name__)
class SchedulerService:
"""
Service for managing background job scheduling.
Uses APScheduler's BackgroundScheduler to run scans asynchronously
without blocking HTTP requests.
"""
def __init__(self):
"""Initialize scheduler service (scheduler not started yet)."""
self.scheduler: Optional[BackgroundScheduler] = None
self.db_url: Optional[str] = None
def init_scheduler(self, app: Flask):
"""
Initialize and start APScheduler with Flask app.
Args:
app: Flask application instance
Configuration:
- BackgroundScheduler: Runs in separate thread
- ThreadPoolExecutor: Allows concurrent scan execution
- Max workers: 3 (configurable via SCHEDULER_MAX_WORKERS)
"""
if self.scheduler:
logger.warning("Scheduler already initialized")
return
# Store database URL for passing to background jobs
self.db_url = app.config['SQLALCHEMY_DATABASE_URI']
# Configure executor for concurrent jobs
max_workers = app.config.get('SCHEDULER_MAX_WORKERS', 3)
executors = {
'default': ThreadPoolExecutor(max_workers=max_workers)
}
# Configure job defaults
job_defaults = {
'coalesce': True, # Combine multiple pending instances into one
'max_instances': app.config.get('SCHEDULER_MAX_INSTANCES', 3),
'misfire_grace_time': 60 # Allow 60 seconds for delayed starts
}
# Create scheduler with local system timezone
# This allows users to schedule jobs using their local time
# APScheduler will automatically use the system's local timezone
self.scheduler = BackgroundScheduler(
executors=executors,
job_defaults=job_defaults
# timezone defaults to local system timezone
)
# Start scheduler
self.scheduler.start()
logger.info(f"APScheduler started with {max_workers} max workers")
# Register shutdown handler
import atexit
atexit.register(lambda: self.shutdown())
def shutdown(self):
"""
Shutdown scheduler gracefully.
Waits for running jobs to complete before shutting down.
"""
if self.scheduler:
logger.info("Shutting down APScheduler...")
self.scheduler.shutdown(wait=True)
logger.info("APScheduler shutdown complete")
self.scheduler = None
def load_schedules_on_startup(self):
"""
Load all enabled schedules from database and register with APScheduler.
    Should be called after init_scheduler() to restore scheduled jobs
    that were active when the application last shut down.
Raises:
RuntimeError: If scheduler not initialized
"""
if not self.scheduler:
raise RuntimeError("Scheduler not initialized. Call init_scheduler() first.")
# Import here to avoid circular imports
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
from web.models import Schedule
try:
# Create database session
engine = create_engine(self.db_url)
Session = sessionmaker(bind=engine)
session = Session()
try:
# Query all enabled schedules
enabled_schedules = (
session.query(Schedule)
.filter(Schedule.enabled == True)
.all()
)
logger.info(f"Loading {len(enabled_schedules)} enabled schedules on startup")
# Register each schedule with APScheduler
for schedule in enabled_schedules:
try:
self.add_scheduled_scan(
schedule_id=schedule.id,
config_file=schedule.config_file,
cron_expression=schedule.cron_expression
)
logger.info(f"Loaded schedule {schedule.id}: '{schedule.name}'")
except Exception as e:
logger.error(
f"Failed to load schedule {schedule.id} ('{schedule.name}'): {str(e)}",
exc_info=True
)
logger.info("Schedule loading complete")
finally:
session.close()
except Exception as e:
logger.error(f"Error loading schedules on startup: {str(e)}", exc_info=True)
def queue_scan(self, scan_id: int, config_file: str) -> str:
"""
Queue a scan for immediate background execution.
Args:
scan_id: Database ID of the scan
config_file: Path to YAML configuration file
Returns:
Job ID from APScheduler
Raises:
RuntimeError: If scheduler not initialized
"""
if not self.scheduler:
raise RuntimeError("Scheduler not initialized. Call init_scheduler() first.")
# Add job to run immediately
job = self.scheduler.add_job(
func=execute_scan,
args=[scan_id, config_file, self.db_url],
id=f'scan_{scan_id}',
name=f'Scan {scan_id}',
replace_existing=True,
misfire_grace_time=300 # 5 minutes
)
logger.info(f"Queued scan {scan_id} for background execution (job_id={job.id})")
return job.id
def add_scheduled_scan(self, schedule_id: int, config_file: str,
cron_expression: str) -> str:
"""
Add a recurring scheduled scan.
Args:
schedule_id: Database ID of the schedule
config_file: Path to YAML configuration file
cron_expression: Cron expression (e.g., "0 2 * * *" for 2am daily)
Returns:
Job ID from APScheduler
Raises:
RuntimeError: If scheduler not initialized
ValueError: If cron expression is invalid
"""
if not self.scheduler:
raise RuntimeError("Scheduler not initialized. Call init_scheduler() first.")
from apscheduler.triggers.cron import CronTrigger
# Create cron trigger from expression using local timezone
# This allows users to specify times in their local timezone
try:
trigger = CronTrigger.from_crontab(cron_expression)
# timezone defaults to local system timezone
except (ValueError, KeyError) as e:
raise ValueError(f"Invalid cron expression '{cron_expression}': {str(e)}")
# Add cron job
job = self.scheduler.add_job(
func=self._trigger_scheduled_scan,
args=[schedule_id],
trigger=trigger,
id=f'schedule_{schedule_id}',
name=f'Schedule {schedule_id}',
replace_existing=True,
max_instances=1 # Only one instance per schedule
)
logger.info(f"Added scheduled scan {schedule_id} with cron '{cron_expression}' (job_id={job.id})")
return job.id
def remove_scheduled_scan(self, schedule_id: int):
"""
Remove a scheduled scan job.
Args:
schedule_id: Database ID of the schedule
Raises:
RuntimeError: If scheduler not initialized
"""
if not self.scheduler:
raise RuntimeError("Scheduler not initialized. Call init_scheduler() first.")
job_id = f'schedule_{schedule_id}'
try:
self.scheduler.remove_job(job_id)
logger.info(f"Removed scheduled scan job: {job_id}")
except Exception as e:
logger.warning(f"Failed to remove scheduled scan job {job_id}: {str(e)}")
def _trigger_scheduled_scan(self, schedule_id: int):
"""
Internal method to trigger a scan from a schedule.
Creates a new scan record and queues it for execution.
Args:
schedule_id: Database ID of the schedule
"""
logger.info(f"Scheduled scan triggered: schedule_id={schedule_id}")
# Import here to avoid circular imports
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
from web.services.schedule_service import ScheduleService
from web.services.scan_service import ScanService
try:
# Create database session
engine = create_engine(self.db_url)
Session = sessionmaker(bind=engine)
session = Session()
try:
# Get schedule details
schedule_service = ScheduleService(session)
                # get_schedule() raises ValueError if the schedule does not
                # exist; that case is logged by the outer except block.
                schedule = schedule_service.get_schedule(schedule_id)
if not schedule['enabled']:
logger.warning(f"Schedule {schedule_id} is disabled, skipping execution")
return
# Create and trigger scan
scan_service = ScanService(session)
scan_id = scan_service.trigger_scan(
config_file=schedule['config_file'],
triggered_by='scheduled',
schedule_id=schedule_id,
scheduler=None # Don't pass scheduler to avoid recursion
)
# Queue the scan for execution
self.queue_scan(scan_id, schedule['config_file'])
# Update schedule's last_run and next_run
                from croniter import croniter
                # Note: next_run is computed from naive UTC here, while the
                # APScheduler trigger fires in the local system timezone; the
                # stored value is used for display, not for firing.
                next_run = croniter(schedule['cron_expression'], datetime.utcnow()).get_next(datetime)
schedule_service.update_run_times(
schedule_id=schedule_id,
last_run=datetime.utcnow(),
next_run=next_run
)
logger.info(f"Scheduled scan completed: schedule_id={schedule_id}, scan_id={scan_id}")
finally:
session.close()
except Exception as e:
logger.error(f"Error triggering scheduled scan {schedule_id}: {str(e)}", exc_info=True)
def get_job_status(self, job_id: str) -> Optional[dict]:
"""
Get status of a scheduled job.
Args:
job_id: APScheduler job ID
Returns:
Dictionary with job information, or None if not found
"""
if not self.scheduler:
return None
job = self.scheduler.get_job(job_id)
if not job:
return None
return {
'id': job.id,
'name': job.name,
'next_run_time': job.next_run_time.isoformat() if job.next_run_time else None,
'trigger': str(job.trigger)
}
def list_jobs(self) -> list:
"""
List all scheduled jobs.
Returns:
List of job information dictionaries
"""
if not self.scheduler:
return []
jobs = self.scheduler.get_jobs()
return [
{
'id': job.id,
'name': job.name,
'next_run_time': job.next_run_time.isoformat() if job.next_run_time else None,
'trigger': str(job.trigger)
}
for job in jobs
]
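
For reference, the APScheduler wiring above reduces to a handful of calls. Below is a minimal runnable sketch (requires APScheduler: pip install apscheduler) with a stand-in job in place of web.jobs.scan_job.execute_scan.

# Standalone sketch of the scheduler setup used by SchedulerService.
import time
from apscheduler.schedulers.background import BackgroundScheduler
from apscheduler.executors.pool import ThreadPoolExecutor
from apscheduler.triggers.cron import CronTrigger

def fake_scan(scan_id: int) -> None:
    print(f"running scan {scan_id}")

scheduler = BackgroundScheduler(
    executors={'default': ThreadPoolExecutor(max_workers=3)},
    job_defaults={'coalesce': True, 'max_instances': 3, 'misfire_grace_time': 60},
)
scheduler.start()

# One-off job, as in queue_scan(): with no trigger, the job runs immediately.
scheduler.add_job(fake_scan, args=[1], id='scan_1', replace_existing=True)

# Recurring job, as in add_scheduled_scan(): cron trigger in local system time.
scheduler.add_job(
    fake_scan,
    trigger=CronTrigger.from_crontab('0 2 * * *'),
    args=[2],
    id='schedule_2',
    replace_existing=True,
    max_instances=1,
)

time.sleep(2)              # give the one-off job a moment to fire
scheduler.shutdown(wait=True)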