This commit addresses multiple issues with schedule management and updates documentation to reflect the transition from a YAML-based to a database-backed configuration system.

**Documentation Updates:**
- Update DEPLOYMENT.md to remove all references to YAML config files
- Document that all configurations are now stored in SQLite database
- Update API examples to use config IDs instead of YAML filenames
- Remove configs directory from backup/restore procedures
- Update volume management section to reflect database-only storage

**Cron Expression Handling:**
- Add comprehensive documentation for APScheduler cron format conversion
- Document that from_crontab() accepts standard format (Sunday=0) and converts automatically
- Add validate_cron_expression() helper method with detailed error messages
- Include helpful hints for day-of-week field errors in validation
- Fix all deprecated datetime.utcnow() calls, replace with datetime.now(timezone.utc)

**Timezone-Aware DateTime Fixes:**
- Fix "can't subtract offset-naive and offset-aware datetimes" error
- Add timezone awareness to croniter.get_next() return values
- Make _get_relative_time() defensive to handle both naive and aware datetimes
- Ensure all datetime comparisons use timezone-aware objects

**Schedule Edit UI Fixes:**
- Fix JavaScript error "Cannot set properties of null (setting 'value')"
- Change reference from non-existent 'config-id' to correct 'config-file' element
- Add config_name field to schedule API responses for better UX
- Eagerly load Schedule.config relationship using joinedload()
- Fix AttributeError: use schedule.config.title instead of .name
- Display config title and ID in schedule edit form

**Technical Details:**
- app/web/services/schedule_service.py: 6 datetime.utcnow() fixes, validation enhancements
- app/web/services/scheduler_service.py: Documentation, validation, timezone fixes
- app/web/templates/schedule_edit.html: JavaScript element reference fix
- docs/DEPLOYMENT.md: Complete rewrite of config management sections

Fixes scheduling for Sunday at midnight (cron: 0 0 * * 0)
Fixes schedule edit page JavaScript errors
Improves user experience with config title display
422 lines
15 KiB
Python
422 lines
15 KiB
Python
"""
Scheduler service for managing background jobs and scheduled scans.

This service integrates APScheduler with Flask to enable background
scan execution and future scheduled scanning capabilities.
"""
|
|
|
|
import logging
|
|
from datetime import datetime, timezone
|
|
from typing import Optional
|
|
|
|
from apscheduler.schedulers.background import BackgroundScheduler
|
|
from apscheduler.executors.pool import ThreadPoolExecutor
|
|
from flask import Flask
|
|
|
|
from web.jobs.scan_job import execute_scan
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
class SchedulerService:
    """
    Service for managing background job scheduling.

    Uses APScheduler's BackgroundScheduler to run scans asynchronously
    without blocking HTTP requests.

    Job ID conventions used by this class:
        - ``scan_<scan_id>``: one-off scan queued by queue_scan()
        - ``schedule_<schedule_id>``: recurring job added by add_scheduled_scan()
    """

    # Weekday names indexed by standard crontab weekday number (0 = Sunday).
    _CRONTAB_WEEKDAY_NAMES = ('sun', 'mon', 'tue', 'wed', 'thu', 'fri', 'sat')

    def __init__(self):
        """Initialize scheduler service (scheduler not started yet)."""
        self.scheduler: Optional[BackgroundScheduler] = None
        self.db_url: Optional[str] = None

    def init_scheduler(self, app: Flask):
        """
        Initialize and start APScheduler with Flask app.

        Calling this again while a scheduler is already set only logs a
        warning and returns.

        Args:
            app: Flask application instance

        Configuration:
            - BackgroundScheduler: Runs in separate thread
            - ThreadPoolExecutor: Allows concurrent scan execution
            - Max workers: 3 (configurable via SCHEDULER_MAX_WORKERS)
            - Max instances per job: 3 (configurable via SCHEDULER_MAX_INSTANCES)
        """
        if self.scheduler:
            logger.warning("Scheduler already initialized")
            return

        # Store database URL for passing to background jobs
        self.db_url = app.config['SQLALCHEMY_DATABASE_URI']

        # Configure executor for concurrent jobs
        max_workers = app.config.get('SCHEDULER_MAX_WORKERS', 3)
        executors = {
            'default': ThreadPoolExecutor(max_workers=max_workers)
        }

        # Configure job defaults
        job_defaults = {
            'coalesce': True,  # Combine multiple pending instances into one
            'max_instances': app.config.get('SCHEDULER_MAX_INSTANCES', 3),
            'misfire_grace_time': 60  # Allow 60 seconds for delayed starts
        }

        # Create scheduler with local system timezone.
        # This allows users to schedule jobs using their local time;
        # no timezone is passed, so APScheduler falls back to its default.
        scheduler = BackgroundScheduler(
            executors=executors,
            job_defaults=job_defaults
        )

        # Start first and publish afterwards: if start() raises, the service
        # stays uninitialized instead of holding a scheduler that never ran.
        scheduler.start()
        self.scheduler = scheduler
        logger.info(f"APScheduler started with {max_workers} max workers")

        # Register shutdown handler (shutdown() is a no-op once the
        # scheduler reference has been cleared).
        import atexit
        atexit.register(self.shutdown)

    def shutdown(self):
        """
        Shutdown scheduler gracefully.

        Waits for running jobs to complete before shutting down, then
        clears the scheduler reference. Does nothing if no scheduler is set.
        """
        if self.scheduler:
            logger.info("Shutting down APScheduler...")
            self.scheduler.shutdown(wait=True)
            logger.info("APScheduler shutdown complete")
            self.scheduler = None

    def load_schedules_on_startup(self):
        """
        Load all enabled schedules from database and register with APScheduler.

        Should be called after init_scheduler() to restore scheduled jobs
        that were active when the application last shutdown.

        A schedule that fails to register is logged and skipped so the
        remaining schedules still load. Database errors are logged, not raised.

        Raises:
            RuntimeError: If scheduler not initialized
        """
        if not self.scheduler:
            raise RuntimeError("Scheduler not initialized. Call init_scheduler() first.")

        # Import here to avoid circular imports
        from sqlalchemy import create_engine
        from sqlalchemy.orm import sessionmaker
        from web.models import Schedule

        try:
            # A dedicated engine is created for this call; dispose it when
            # done so its connection pool does not outlive the call.
            engine = create_engine(self.db_url)
            try:
                session = sessionmaker(bind=engine)()
                try:
                    # Query all enabled schedules
                    enabled_schedules = (
                        session.query(Schedule)
                        .filter(Schedule.enabled == True)  # noqa: E712 (SQL expression)
                        .all()
                    )

                    logger.info(f"Loading {len(enabled_schedules)} enabled schedules on startup")

                    # Register each schedule with APScheduler
                    for schedule in enabled_schedules:
                        try:
                            self.add_scheduled_scan(
                                schedule_id=schedule.id,
                                config_id=schedule.config_id,
                                cron_expression=schedule.cron_expression
                            )
                            logger.info(f"Loaded schedule {schedule.id}: '{schedule.name}'")
                        except Exception as e:
                            logger.error(
                                f"Failed to load schedule {schedule.id} ('{schedule.name}'): {str(e)}",
                                exc_info=True
                            )

                    logger.info("Schedule loading complete")
                finally:
                    session.close()
            finally:
                engine.dispose()

        except Exception as e:
            logger.error(f"Error loading schedules on startup: {str(e)}", exc_info=True)

    @classmethod
    def _to_apscheduler_day_of_week(cls, dow_field: str) -> str:
        """
        Rewrite numeric crontab weekdays (0/7 = Sunday) as weekday names.

        Names mean the same day under both the crontab convention
        (0 = Sunday) and APScheduler's convention (0 = Monday), so the
        result cannot be misread. Numeric ranges and steps are expanded
        into an explicit list (e.g. "0-2" -> "sun,mon,tue") because a
        Sunday-first range is not a valid Monday-first range.

        Tokens that are already names, a bare "*", or anything this helper
        does not recognise are passed through unchanged so that APScheduler
        performs the final validation.
        """
        def is_number(text: str) -> bool:
            return text.isascii() and text.isdigit()

        converted = []
        for token in dow_field.split(','):
            base, has_step, step_text = token.partition('/')
            if has_step and not (is_number(step_text) and 0 < int(step_text) <= 6):
                converted.append(token)
                continue
            step = int(step_text) if has_step else 1

            if base == '*':
                if not has_step:
                    converted.append(token)
                    continue
                first, last = 0, 6
            else:
                first_text, has_range, last_text = base.partition('-')
                if not is_number(first_text) or (has_range and not is_number(last_text)):
                    converted.append(token)  # weekday names or malformed input
                    continue
                first = int(first_text)
                if has_range:
                    last = int(last_text)
                elif has_step:
                    last = 6  # "N/S" is treated as "N-6/S"
                else:
                    last = first
                if first > last or last > 7:
                    converted.append(token)  # out of range: let APScheduler reject it
                    continue

            converted.extend(
                cls._CRONTAB_WEEKDAY_NAMES[day % 7]  # 7 wraps to Sunday
                for day in range(first, last + 1, step)
            )

        # Drop duplicates (e.g. "0,7") while keeping the original order.
        return ','.join(dict.fromkeys(converted))

    @classmethod
    def _to_apscheduler_crontab(cls, cron_expression: str) -> str:
        """
        Return cron_expression with its day-of-week field made unambiguous.

        Expressions that do not have exactly 5 fields are returned unchanged.
        """
        fields = cron_expression.split()
        if len(fields) != 5:
            return cron_expression
        fields[4] = cls._to_apscheduler_day_of_week(fields[4])
        return ' '.join(fields)

    @staticmethod
    def validate_cron_expression(cron_expression: str) -> tuple[bool, str]:
        """
        Validate a cron expression and provide helpful feedback.

        Args:
            cron_expression: Cron expression to validate

        Returns:
            Tuple of (is_valid: bool, message: str)
            - If valid: (True, "Valid cron expression")
            - If invalid: (False, "Error message with details")

        Note:
            Standard crontab format: minute hour day month day_of_week
            Day of week: 0=Sunday, 1=Monday, ..., 6=Saturday (or 7=Sunday)
        """
        if not isinstance(cron_expression, str) or not cron_expression.strip():
            return False, "Cron expression must be a non-empty string"

        # Check the field count first so the caller gets the descriptive message.
        fields = cron_expression.split()
        if len(fields) != 5:
            return False, f"Cron expression must have 5 fields (minute hour day month day_of_week), got {len(fields)}"

        from apscheduler.triggers.cron import CronTrigger

        try:
            # Parse exactly what add_scheduled_scan() will hand to APScheduler.
            CronTrigger.from_crontab(
                SchedulerService._to_apscheduler_crontab(cron_expression)
            )
        except (ValueError, KeyError) as e:
            error_msg = str(e)

            # Provide a hint when the day_of_week field is a plain number
            if fields[4].isdigit():
                hint = "\nNote: Use standard crontab format where 0=Sunday, 1=Monday, ..., 6=Saturday"
                return False, f"Invalid cron expression: {error_msg}{hint}"

            return False, f"Invalid cron expression: {error_msg}"

        return True, "Valid cron expression"

    def queue_scan(self, scan_id: int, config_id: int) -> str:
        """
        Queue a scan for immediate background execution.

        The job ID is ``scan_<scan_id>``; queuing the same scan again
        replaces the existing job.

        Args:
            scan_id: Database ID of the scan
            config_id: Database config ID

        Returns:
            Job ID from APScheduler

        Raises:
            RuntimeError: If scheduler not initialized
        """
        if not self.scheduler:
            raise RuntimeError("Scheduler not initialized. Call init_scheduler() first.")

        # Add job to run immediately (no trigger given)
        job = self.scheduler.add_job(
            func=execute_scan,
            kwargs={'scan_id': scan_id, 'config_id': config_id, 'db_url': self.db_url},
            id=f'scan_{scan_id}',
            name=f'Scan {scan_id}',
            replace_existing=True,
            misfire_grace_time=300  # 5 minutes
        )

        logger.info(f"Queued scan {scan_id} for background execution (job_id={job.id})")
        return job.id

    def add_scheduled_scan(self, schedule_id: int, config_id: int,
                           cron_expression: str) -> str:
        """
        Add a recurring scheduled scan.

        The job ID is ``schedule_<schedule_id>``; adding the same schedule
        again replaces the existing job.

        Args:
            schedule_id: Database ID of the schedule
            config_id: Database config ID. Not used when registering the job;
                the config is read from the schedule each time it fires.
            cron_expression: Cron expression (e.g., "0 2 * * *" for 2am daily)
                IMPORTANT: Use standard crontab format where:
                - Day of week: 0 = Sunday, 1 = Monday, ..., 6 = Saturday
                  (7 is also accepted as Sunday)

        Returns:
            Job ID from APScheduler

        Raises:
            RuntimeError: If scheduler not initialized
            ValueError: If cron expression is invalid

        Note:
            APScheduler internally uses Monday=0. Numeric weekdays are
            therefore rewritten to weekday names before the trigger is
            built, instead of relying on from_crontab() to translate them.
        """
        if not self.scheduler:
            raise RuntimeError("Scheduler not initialized. Call init_scheduler() first.")

        from apscheduler.triggers.cron import CronTrigger

        # Validate cron expression first to provide helpful error messages
        is_valid, message = self.validate_cron_expression(cron_expression)
        if not is_valid:
            raise ValueError(message)

        # Create cron trigger from the expression; no timezone is passed,
        # so the scheduler's timezone applies.
        try:
            trigger = CronTrigger.from_crontab(
                self._to_apscheduler_crontab(cron_expression)
            )
        except (ValueError, KeyError) as e:
            # This should not happen due to validation above, but catch anyway
            raise ValueError(f"Invalid cron expression '{cron_expression}': {str(e)}") from e

        # Add cron job
        job = self.scheduler.add_job(
            func=self._trigger_scheduled_scan,
            args=[schedule_id],
            trigger=trigger,
            id=f'schedule_{schedule_id}',
            name=f'Schedule {schedule_id}',
            replace_existing=True,
            max_instances=1  # Only one instance per schedule
        )

        logger.info(f"Added scheduled scan {schedule_id} with cron '{cron_expression}' (job_id={job.id})")
        return job.id

    def remove_scheduled_scan(self, schedule_id: int):
        """
        Remove a scheduled scan job.

        Best effort: a failure to remove the job (for example because it
        does not exist) is logged as a warning and not raised.

        Args:
            schedule_id: Database ID of the schedule

        Raises:
            RuntimeError: If scheduler not initialized
        """
        if not self.scheduler:
            raise RuntimeError("Scheduler not initialized. Call init_scheduler() first.")

        job_id = f'schedule_{schedule_id}'

        try:
            self.scheduler.remove_job(job_id)
            logger.info(f"Removed scheduled scan job: {job_id}")
        except Exception as e:
            logger.warning(f"Failed to remove scheduled scan job {job_id}: {str(e)}")

    def _trigger_scheduled_scan(self, schedule_id: int):
        """
        Internal method to trigger a scan from a schedule.

        Creates a new scan record, queues it for execution and records the
        schedule's last/next run times. Missing or disabled schedules are
        logged and skipped. All errors are logged, never raised, because
        this runs as a scheduler job.

        Args:
            schedule_id: Database ID of the schedule
        """
        logger.info(f"Scheduled scan triggered: schedule_id={schedule_id}")

        # Import here to avoid circular imports
        from sqlalchemy import create_engine
        from sqlalchemy.orm import sessionmaker
        from web.services.schedule_service import ScheduleService
        from web.services.scan_service import ScanService

        try:
            # A dedicated engine is created for every run; dispose it when
            # done so repeated triggers do not accumulate connection pools.
            engine = create_engine(self.db_url)
            try:
                session = sessionmaker(bind=engine)()
                try:
                    # Get schedule details
                    schedule_service = ScheduleService(session)
                    schedule = schedule_service.get_schedule(schedule_id)

                    if not schedule:
                        logger.error(f"Schedule {schedule_id} not found")
                        return

                    if not schedule['enabled']:
                        logger.warning(f"Schedule {schedule_id} is disabled, skipping execution")
                        return

                    # Create and trigger scan
                    scan_service = ScanService(session)
                    scan_id = scan_service.trigger_scan(
                        config_id=schedule['config_id'],
                        triggered_by='scheduled',
                        schedule_id=schedule_id,
                        scheduler=None  # Don't pass scheduler to avoid recursion
                    )

                    # Queue the scan for execution
                    self.queue_scan(scan_id, schedule['config_id'])

                    # Update schedule's last_run and next_run
                    from croniter import croniter
                    now_utc = datetime.now(timezone.utc)
                    # NOTE(review): next_run is derived from a UTC base time,
                    # while the scheduler itself fires in its own (local)
                    # timezone - confirm the stored value is what the UI expects.
                    next_run = croniter(schedule['cron_expression'], now_utc).get_next(datetime)

                    # Make sure the stored value is timezone-aware
                    if next_run.tzinfo is None:
                        next_run = next_run.replace(tzinfo=timezone.utc)

                    schedule_service.update_run_times(
                        schedule_id=schedule_id,
                        last_run=now_utc,
                        next_run=next_run
                    )

                    logger.info(f"Scheduled scan completed: schedule_id={schedule_id}, scan_id={scan_id}")
                finally:
                    session.close()
            finally:
                engine.dispose()

        except Exception as e:
            logger.error(f"Error triggering scheduled scan {schedule_id}: {str(e)}", exc_info=True)

    def get_job_status(self, job_id: str) -> Optional[dict]:
        """
        Get status of a scheduled job.

        Args:
            job_id: APScheduler job ID

        Returns:
            Dictionary with keys 'id', 'name', 'next_run_time' (ISO 8601
            string, or None when the job has no next run time) and
            'trigger'; None if the scheduler is not initialized or the
            job is not found.
        """
        if not self.scheduler:
            return None

        job = self.scheduler.get_job(job_id)
        if not job:
            return None

        return {
            'id': job.id,
            'name': job.name,
            'next_run_time': job.next_run_time.isoformat() if job.next_run_time else None,
            'trigger': str(job.trigger)
        }

    def list_jobs(self) -> list:
        """
        List all scheduled jobs.

        Returns:
            List of job information dictionaries (same keys as
            get_job_status()); empty if the scheduler is not initialized.
        """
        if not self.scheduler:
            return []

        return [
            {
                'id': job.id,
                'name': job.name,
                'next_run_time': job.next_run_time.isoformat() if job.next_run_time else None,
                'trigger': str(job.trigger)
            }
            for job in self.scheduler.get_jobs()
        ]