SneakyScan/app/web/services/scheduler_service.py
Phillip Tarrant 5e3a70f837 Fix schedule management and update documentation for database-backed configs
This commit addresses multiple issues with schedule management and updates
  documentation to reflect the transition from a YAML-based to a database-backed
  configuration system.

  **Documentation Updates:**
  - Update DEPLOYMENT.md to remove all references to YAML config files
  - Document that all configurations are now stored in SQLite database
  - Update API examples to use config IDs instead of YAML filenames
  - Remove configs directory from backup/restore procedures
  - Update volume management section to reflect database-only storage

  **Cron Expression Handling:**
  - Add comprehensive documentation for APScheduler cron format conversion
  - Document that from_crontab() accepts standard format (Sunday=0) and converts automatically
  - Add validate_cron_expression() helper method with detailed error messages (condensed sketch after this list)
  - Include helpful hints for day-of-week field errors in validation
  - Fix all deprecated datetime.utcnow() calls, replace with datetime.now(timezone.utc)
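
  As a condensed sketch of the validation approach above (the real
  validate_cron_expression() helper lives in the file below; check_cron() here is
  illustrative, and only CronTrigger.from_crontab() is actual APScheduler API):

  ```python
  from apscheduler.triggers.cron import CronTrigger

  def check_cron(expr: str) -> tuple[bool, str]:
      """Return (is_valid, message) for a standard 5-field crontab expression."""
      fields = expr.split()
      if len(fields) != 5:
          return False, f"Expected 5 fields (minute hour day month day_of_week), got {len(fields)}"
      try:
          # from_crontab() parses standard crontab syntax and raises on bad field values
          CronTrigger.from_crontab(expr)
      except (ValueError, KeyError) as exc:
          return False, f"Invalid cron expression: {exc}"
      return True, "Valid cron expression"

  print(check_cron("0 0 * * 0"))  # Sunday at midnight in standard crontab numbering
  print(check_cron("0 0 * *"))    # too few fields -> descriptive error message
  ```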

  **Timezone-Aware DateTime Fixes:**
  - Fix "can't subtract offset-naive and offset-aware datetimes" error
  - Add timezone awareness to croniter.get_next() return values
  - Make _get_relative_time() defensive to handle both naive and aware datetimes (see the sketch after this list)
  - Ensure all datetime comparisons use timezone-aware objects
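
  A minimal sketch of the defensive handling above, assuming naive values represent
  UTC; as_aware_utc() is a hypothetical stand-in for the repo's _get_relative_time()
  guard, not code from this project:

  ```python
  from datetime import datetime, timezone

  def as_aware_utc(dt: datetime) -> datetime:
      """Coerce a possibly-naive datetime to timezone-aware UTC (naive assumed to be UTC)."""
      return dt if dt.tzinfo is not None else dt.replace(tzinfo=timezone.utc)

  # Mixing naive and aware operands raises:
  #   TypeError: can't subtract offset-naive and offset-aware datetimes
  maybe_naive = datetime(2025, 11, 30)            # e.g. a next-run value that came back naive
  aware_now = datetime.now(timezone.utc)          # preferred over the deprecated datetime.utcnow()
  delta = as_aware_utc(maybe_naive) - aware_now   # safe: both operands are timezone-aware
  print(delta)
  ```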

  **Schedule Edit UI Fixes:**
  - Fix JavaScript error "Cannot set properties of null (setting 'value')"
  - Change reference from non-existent 'config-id' to correct 'config-file' element
  - Add config_name field to schedule API responses for better UX
  - Eagerly load Schedule.config relationship using joinedload() (illustrated after this list)
  - Fix AttributeError: use schedule.config.title instead of .name
  - Display config title and ID in schedule edit form
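
  A self-contained sketch of the joinedload() pattern referenced above; the
  Schedule/Config models are simplified stand-ins with column names inferred from
  this commit (SQLAlchemy 2.0 declarative style, not the project's actual models):

  ```python
  from sqlalchemy import ForeignKey, String, create_engine
  from sqlalchemy.orm import (DeclarativeBase, Mapped, Session, joinedload,
                              mapped_column, relationship)

  class Base(DeclarativeBase):
      pass

  class Config(Base):
      __tablename__ = "configs"
      id: Mapped[int] = mapped_column(primary_key=True)
      title: Mapped[str] = mapped_column(String(100))

  class Schedule(Base):
      __tablename__ = "schedules"
      id: Mapped[int] = mapped_column(primary_key=True)
      name: Mapped[str] = mapped_column(String(100))
      cron_expression: Mapped[str] = mapped_column(String(50))
      config_id: Mapped[int] = mapped_column(ForeignKey("configs.id"))
      config: Mapped["Config"] = relationship()

  engine = create_engine("sqlite://")
  Base.metadata.create_all(engine)

  with Session(engine) as session:
      cfg = Config(title="Weekly full scan")
      session.add_all([cfg, Schedule(name="Sunday midnight",
                                     cron_expression="0 0 * * 0", config=cfg)])
      session.commit()

      # joinedload() emits a single JOINed SELECT, so schedule.config is populated
      # up front instead of lazy-loading while the API response is serialized.
      schedule = (session.query(Schedule)
                  .options(joinedload(Schedule.config))
                  .first())
      print(schedule.config.title)  # display field is .title, not .name, per the fix above
  ```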

  **Technical Details:**
  - app/web/services/schedule_service.py: 6 datetime.utcnow() fixes, validation enhancements
  - app/web/services/scheduler_service.py: Documentation, validation, timezone fixes
  - app/web/templates/schedule_edit.html: JavaScript element reference fix
  - docs/DEPLOYMENT.md: Complete rewrite of config management sections

  Fixes scheduling for Sunday at midnight (cron: 0 0 * * 0); see the worked example below
  Fixes schedule edit page JavaScript errors
  Improves user experience with config title display
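
  Worked example for the Sunday-at-midnight case, using croniter as the service does
  when computing next_run (dates are illustrative):

  ```python
  from datetime import datetime, timezone
  from croniter import croniter

  # 2025-11-24 is a Monday; "0 0 * * 0" is Sunday at midnight in standard crontab numbering.
  base = datetime(2025, 11, 24, 12, 53, tzinfo=timezone.utc)
  next_run = croniter("0 0 * * 0", base).get_next(datetime)
  if next_run.tzinfo is None:        # the service applies the same guard before storing next_run
      next_run = next_run.replace(tzinfo=timezone.utc)
  print(next_run)                    # 2025-11-30 00:00:00+00:00, the following Sunday
  ```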
2025-11-24 12:53:06 -06:00

422 lines
15 KiB
Python

"""
Scheduler service for managing background jobs and scheduled scans.
This service integrates APScheduler with Flask to enable background
scan execution and future scheduled scanning capabilities.
"""
import logging
from datetime import datetime, timezone
from typing import Optional
from apscheduler.schedulers.background import BackgroundScheduler
from apscheduler.executors.pool import ThreadPoolExecutor
from flask import Flask
from web.jobs.scan_job import execute_scan
logger = logging.getLogger(__name__)
class SchedulerService:
"""
Service for managing background job scheduling.
Uses APScheduler's BackgroundScheduler to run scans asynchronously
without blocking HTTP requests.
"""
def __init__(self):
"""Initialize scheduler service (scheduler not started yet)."""
self.scheduler: Optional[BackgroundScheduler] = None
self.db_url: Optional[str] = None
def init_scheduler(self, app: Flask):
"""
Initialize and start APScheduler with Flask app.
Args:
app: Flask application instance
Configuration:
- BackgroundScheduler: Runs in separate thread
- ThreadPoolExecutor: Allows concurrent scan execution
- Max workers: 3 (configurable via SCHEDULER_MAX_WORKERS)
"""
if self.scheduler:
logger.warning("Scheduler already initialized")
return
# Store database URL for passing to background jobs
self.db_url = app.config['SQLALCHEMY_DATABASE_URI']
# Configure executor for concurrent jobs
max_workers = app.config.get('SCHEDULER_MAX_WORKERS', 3)
executors = {
'default': ThreadPoolExecutor(max_workers=max_workers)
}
# Configure job defaults
job_defaults = {
'coalesce': True, # Combine multiple pending instances into one
'max_instances': app.config.get('SCHEDULER_MAX_INSTANCES', 3),
'misfire_grace_time': 60 # Allow 60 seconds for delayed starts
}
# Create scheduler with local system timezone
# This allows users to schedule jobs using their local time
# APScheduler will automatically use the system's local timezone
self.scheduler = BackgroundScheduler(
executors=executors,
job_defaults=job_defaults
# timezone defaults to local system timezone
)
# Start scheduler
self.scheduler.start()
logger.info(f"APScheduler started with {max_workers} max workers")
# Register shutdown handler
import atexit
atexit.register(lambda: self.shutdown())
    def shutdown(self):
        """
        Shutdown scheduler gracefully.

        Waits for running jobs to complete before shutting down.
        """
        if self.scheduler:
            logger.info("Shutting down APScheduler...")
            self.scheduler.shutdown(wait=True)
            logger.info("APScheduler shutdown complete")
            self.scheduler = None

    def load_schedules_on_startup(self):
        """
        Load all enabled schedules from database and register with APScheduler.

        Should be called after init_scheduler() to restore scheduled jobs
        that were active when the application last shut down.

        Raises:
            RuntimeError: If scheduler not initialized
        """
        if not self.scheduler:
            raise RuntimeError("Scheduler not initialized. Call init_scheduler() first.")

        # Import here to avoid circular imports
        from sqlalchemy import create_engine
        from sqlalchemy.orm import sessionmaker
        from web.models import Schedule

        try:
            # Create database session
            engine = create_engine(self.db_url)
            Session = sessionmaker(bind=engine)
            session = Session()
            try:
                # Query all enabled schedules
                enabled_schedules = (
                    session.query(Schedule)
                    .filter(Schedule.enabled == True)
                    .all()
                )
                logger.info(f"Loading {len(enabled_schedules)} enabled schedules on startup")

                # Register each schedule with APScheduler
                for schedule in enabled_schedules:
                    try:
                        self.add_scheduled_scan(
                            schedule_id=schedule.id,
                            config_id=schedule.config_id,
                            cron_expression=schedule.cron_expression
                        )
                        logger.info(f"Loaded schedule {schedule.id}: '{schedule.name}'")
                    except Exception as e:
                        logger.error(
                            f"Failed to load schedule {schedule.id} ('{schedule.name}'): {str(e)}",
                            exc_info=True
                        )

                logger.info("Schedule loading complete")
            finally:
                session.close()
        except Exception as e:
            logger.error(f"Error loading schedules on startup: {str(e)}", exc_info=True)
    @staticmethod
    def validate_cron_expression(cron_expression: str) -> tuple[bool, str]:
        """
        Validate a cron expression and provide helpful feedback.

        Args:
            cron_expression: Cron expression to validate

        Returns:
            Tuple of (is_valid: bool, message: str)
            - If valid: (True, "Valid cron expression")
            - If invalid: (False, "Error message with details")

        Note:
            Standard crontab format: minute hour day month day_of_week
            Day of week: 0=Sunday, 1=Monday, ..., 6=Saturday (or 7=Sunday)
        """
        from apscheduler.triggers.cron import CronTrigger

        try:
            # Validate basic format (5 fields) before parsing so the friendlier
            # message is returned instead of the parser's own error
            fields = cron_expression.split()
            if len(fields) != 5:
                return False, f"Cron expression must have 5 fields (minute hour day month day_of_week), got {len(fields)}"

            # Try to parse the expression
            CronTrigger.from_crontab(cron_expression)
            return True, "Valid cron expression"
        except (ValueError, KeyError) as e:
            error_msg = str(e)

            # Provide helpful hints for common errors
            if "day_of_week" in error_msg.lower() or (len(cron_expression.split()) >= 5):
                # Check if day_of_week field might be using APScheduler format by mistake
                fields = cron_expression.split()
                if len(fields) == 5:
                    dow_field = fields[4]
                    if dow_field.isdigit() and int(dow_field) >= 0:
                        hint = "\nNote: Use standard crontab format where 0=Sunday, 1=Monday, ..., 6=Saturday"
                        return False, f"Invalid cron expression: {error_msg}{hint}"

            return False, f"Invalid cron expression: {error_msg}"
    def queue_scan(self, scan_id: int, config_id: int) -> str:
        """
        Queue a scan for immediate background execution.

        Args:
            scan_id: Database ID of the scan
            config_id: Database config ID

        Returns:
            Job ID from APScheduler

        Raises:
            RuntimeError: If scheduler not initialized
        """
        if not self.scheduler:
            raise RuntimeError("Scheduler not initialized. Call init_scheduler() first.")

        # Add job to run immediately
        job = self.scheduler.add_job(
            func=execute_scan,
            kwargs={'scan_id': scan_id, 'config_id': config_id, 'db_url': self.db_url},
            id=f'scan_{scan_id}',
            name=f'Scan {scan_id}',
            replace_existing=True,
            misfire_grace_time=300  # 5 minutes
        )

        logger.info(f"Queued scan {scan_id} for background execution (job_id={job.id})")
        return job.id
    def add_scheduled_scan(self, schedule_id: int, config_id: int,
                           cron_expression: str) -> str:
        """
        Add a recurring scheduled scan.

        Args:
            schedule_id: Database ID of the schedule
            config_id: Database config ID
            cron_expression: Cron expression (e.g., "0 2 * * *" for 2am daily)

                IMPORTANT: Use standard crontab format where:
                - Day of week: 0 = Sunday, 1 = Monday, ..., 6 = Saturday
                - APScheduler automatically converts to its internal format
                - from_crontab() handles the conversion properly

        Returns:
            Job ID from APScheduler

        Raises:
            RuntimeError: If scheduler not initialized
            ValueError: If cron expression is invalid

        Note:
            APScheduler internally uses Monday=0, but from_crontab() accepts
            standard crontab format (Sunday=0) and converts it automatically.
        """
        if not self.scheduler:
            raise RuntimeError("Scheduler not initialized. Call init_scheduler() first.")

        from apscheduler.triggers.cron import CronTrigger

        # Validate cron expression first to provide helpful error messages
        is_valid, message = self.validate_cron_expression(cron_expression)
        if not is_valid:
            raise ValueError(message)

        # Create cron trigger from expression using local timezone
        # from_crontab() parses standard crontab format (Sunday=0)
        # and converts to APScheduler's internal format (Monday=0) automatically
        try:
            trigger = CronTrigger.from_crontab(cron_expression)
            # timezone defaults to local system timezone
        except (ValueError, KeyError) as e:
            # This should not happen due to validation above, but catch anyway
            raise ValueError(f"Invalid cron expression '{cron_expression}': {str(e)}")

        # Add cron job
        job = self.scheduler.add_job(
            func=self._trigger_scheduled_scan,
            args=[schedule_id],
            trigger=trigger,
            id=f'schedule_{schedule_id}',
            name=f'Schedule {schedule_id}',
            replace_existing=True,
            max_instances=1  # Only one instance per schedule
        )

        logger.info(f"Added scheduled scan {schedule_id} with cron '{cron_expression}' (job_id={job.id})")
        return job.id
    def remove_scheduled_scan(self, schedule_id: int):
        """
        Remove a scheduled scan job.

        Args:
            schedule_id: Database ID of the schedule

        Raises:
            RuntimeError: If scheduler not initialized
        """
        if not self.scheduler:
            raise RuntimeError("Scheduler not initialized. Call init_scheduler() first.")

        job_id = f'schedule_{schedule_id}'
        try:
            self.scheduler.remove_job(job_id)
            logger.info(f"Removed scheduled scan job: {job_id}")
        except Exception as e:
            logger.warning(f"Failed to remove scheduled scan job {job_id}: {str(e)}")
    def _trigger_scheduled_scan(self, schedule_id: int):
        """
        Internal method to trigger a scan from a schedule.

        Creates a new scan record and queues it for execution.

        Args:
            schedule_id: Database ID of the schedule
        """
        logger.info(f"Scheduled scan triggered: schedule_id={schedule_id}")

        # Import here to avoid circular imports
        from sqlalchemy import create_engine
        from sqlalchemy.orm import sessionmaker
        from web.services.schedule_service import ScheduleService
        from web.services.scan_service import ScanService

        try:
            # Create database session
            engine = create_engine(self.db_url)
            Session = sessionmaker(bind=engine)
            session = Session()
            try:
                # Get schedule details
                schedule_service = ScheduleService(session)
                schedule = schedule_service.get_schedule(schedule_id)
                if not schedule:
                    logger.error(f"Schedule {schedule_id} not found")
                    return
                if not schedule['enabled']:
                    logger.warning(f"Schedule {schedule_id} is disabled, skipping execution")
                    return

                # Create and trigger scan
                scan_service = ScanService(session)
                scan_id = scan_service.trigger_scan(
                    config_id=schedule['config_id'],
                    triggered_by='scheduled',
                    schedule_id=schedule_id,
                    scheduler=None  # Don't pass scheduler to avoid recursion
                )

                # Queue the scan for execution
                self.queue_scan(scan_id, schedule['config_id'])

                # Update schedule's last_run and next_run
                from croniter import croniter
                now_utc = datetime.now(timezone.utc)
                next_run = croniter(schedule['cron_expression'], now_utc).get_next(datetime)
                # croniter may return a naive datetime; add UTC timezone if missing
                if next_run.tzinfo is None:
                    next_run = next_run.replace(tzinfo=timezone.utc)

                schedule_service.update_run_times(
                    schedule_id=schedule_id,
                    last_run=now_utc,
                    next_run=next_run
                )

                logger.info(f"Scheduled scan completed: schedule_id={schedule_id}, scan_id={scan_id}")
            finally:
                session.close()
        except Exception as e:
            logger.error(f"Error triggering scheduled scan {schedule_id}: {str(e)}", exc_info=True)
    def get_job_status(self, job_id: str) -> Optional[dict]:
        """
        Get status of a scheduled job.

        Args:
            job_id: APScheduler job ID

        Returns:
            Dictionary with job information, or None if not found
        """
        if not self.scheduler:
            return None

        job = self.scheduler.get_job(job_id)
        if not job:
            return None

        return {
            'id': job.id,
            'name': job.name,
            'next_run_time': job.next_run_time.isoformat() if job.next_run_time else None,
            'trigger': str(job.trigger)
        }

    def list_jobs(self) -> list:
        """
        List all scheduled jobs.

        Returns:
            List of job information dictionaries
        """
        if not self.scheduler:
            return []

        jobs = self.scheduler.get_jobs()
        return [
            {
                'id': job.id,
                'name': job.name,
                'next_run_time': job.next_run_time.isoformat() if job.next_run_time else None,
                'trigger': str(job.trigger)
            }
            for job in jobs
        ]