happyDomain/internal/app/testscheduler.go
Pierre-Olivier Mercier 11a65fa2ac Wire option merging for scheduled test executions
Fetch user/domain/service-level plugin options via GetTestPluginOptions
and merge them with schedule-specific options (schedule opts take priority)
before running a test. Previously tests only used schedule.Options.

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
2026-02-11 20:38:25 +07:00

727 lines
20 KiB
Go

// This file is part of the happyDomain (R) project.
// Copyright (c) 2020-2026 happyDomain
// Authors: Pierre-Olivier Mercier, et al.
//
// This program is offered under a commercial and under the AGPL license.
// For commercial licensing, contact us at <contact@happydomain.org>.
//
// For AGPL licensing:
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU Affero General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Affero General Public License for more details.
//
// You should have received a copy of the GNU Affero General Public License
// along with this program. If not, see <https://www.gnu.org/licenses/>.
package app
import (
"context"
"fmt"
"log"
"runtime"
"sync"
"time"
"git.happydns.org/happyDomain/internal/storage"
"git.happydns.org/happyDomain/internal/usecase/testresult"
"git.happydns.org/happyDomain/model"
)
// Timing and retry parameters of the background test scheduler.
const (
	// SchedulerCheckInterval is the period between two scans for due tests.
	SchedulerCheckInterval = time.Minute
	// SchedulerCleanupInterval is the period between two purges of old executions.
	SchedulerCleanupInterval = 24 * time.Hour
	// SchedulerDiscoveryInterval is the period between two auto-discovery
	// passes looking for new targets.
	SchedulerDiscoveryInterval = time.Hour
	// TestExecutionTimeout is the maximum duration of a single test.
	TestExecutionTimeout = 5 * time.Minute
	// MaxRetries is the maximum number of retry attempts for failed tests.
	MaxRetries = 3
)
// Priority levels for the test execution queue.
//
// The numeric order matters: priorityQueue.Push keeps items sorted by
// ascending priority, so a lower value is popped before a higher one.
const (
PriorityOnDemand = iota // On-demand tests (highest priority)
PriorityOverdue // Overdue scheduled tests
PriorityScheduled // Regular scheduled tests
)
// testScheduler runs test plugins in the background. Its main loop (Run)
// periodically queues due schedules, and a pool of workers pops items from
// the priority queue and executes them.
type testScheduler struct {
	// Dependencies, set once by newTestScheduler.
	cfg             *happydns.Options
	store           storage.Storage
	pluginUsecase   happydns.TestPluginUsecase
	resultUsecase   *testresult.TestResultUsecase
	scheduleUsecase *testresult.TestScheduleUsecase

	// queue holds pending executions, ordered by priority.
	queue *priorityQueue
	// workers is the fixed pool of goroutines consuming the queue.
	workers []*worker
	// runNowChan carries schedules that Run queues as on-demand tests.
	runNowChan chan *happydns.TestSchedule
	// stop tells the main loop in Run to return.
	stop chan bool
	// wg counts the running workers; Close waits on it.
	wg sync.WaitGroup

	// mu guards the fields below.
	mu sync.RWMutex
	// activeExecutions maps an execution ID (as string) to the test a worker
	// is currently running.
	activeExecutions map[string]*activeExecution
	// runtimeEnabled is the runtime switch toggled by SetEnabled; when false
	// the periodic schedule check and discovery are skipped.
	runtimeEnabled bool
	// running is true while Run's main loop is alive.
	running bool
}
// activeExecution is the entry kept in testScheduler.activeExecutions while a
// worker is running a test.
type activeExecution struct {
	// startTime is when the worker registered the execution as active.
	startTime time.Time
	// execution is the execution record being processed.
	execution *happydns.TestExecution
	// cancel cancels the context created for this execution.
	cancel context.CancelFunc
}
// queueItem is one pending test execution waiting in the priorityQueue.
type queueItem struct {
	// priority is one of the Priority* constants; lower values are served first.
	priority int
	// retries is the retry counter of this item; every producer visible in
	// this file enqueues with 0.
	retries int
	// queuedAt is the time at which the item was put in the queue.
	queuedAt time.Time
	// schedule describes what to run (plugin, target, owner, options).
	schedule *happydns.TestSchedule
	// execution is the execution record created for this run.
	execution *happydns.TestExecution
}
// priorityQueue is a mutex-protected queue of test executions, served in
// priority order.
type priorityQueue struct {
	// mu guards items.
	mu sync.Mutex
	// items is kept ordered by Push: ascending priority, FIFO among equals.
	items []*queueItem
}
// newPriorityQueue returns an empty, ready-to-use priority queue.
func newPriorityQueue() *priorityQueue {
	q := new(priorityQueue)
	q.items = make([]*queueItem, 0)
	return q
}
// Push inserts item into the queue.
//
// Items are ordered by ascending priority (a lower number is served first).
// An item is placed after every queued item of the same priority, so items of
// equal priority leave the queue in FIFO order.
func (q *priorityQueue) Push(item *queueItem) {
	q.mu.Lock()
	defer q.mu.Unlock()

	// Walk back from the tail past every item the new one must overtake.
	pos := len(q.items)
	for pos > 0 && item.priority < q.items[pos-1].priority {
		pos--
	}

	// Open a slot at pos and drop the item in.
	q.items = append(q.items, nil)
	copy(q.items[pos+1:], q.items[pos:])
	q.items[pos] = item
}
// Pop removes and returns the item at the head of the queue, i.e. the oldest
// item of the highest priority. It returns nil when the queue is empty.
func (q *priorityQueue) Pop() *queueItem {
	q.mu.Lock()
	defer q.mu.Unlock()

	if len(q.items) == 0 {
		return nil
	}

	item := q.items[0]
	// Clear the slot before re-slicing: the backing array would otherwise keep
	// a reference to the popped item (and its schedule/execution) until the
	// array is reallocated.
	q.items[0] = nil
	q.items = q.items[1:]
	return item
}
// Len reports how many items are currently waiting in the queue.
func (q *priorityQueue) Len() int {
	q.mu.Lock()
	n := len(q.items)
	q.mu.Unlock()
	return n
}
// worker is one goroutine of the scheduler's pool; it pops items from the
// scheduler queue and executes them.
type worker struct {
	// scheduler is the owning scheduler, giving access to the queue and usecases.
	scheduler *testScheduler
	// id identifies the worker in log messages.
	id int
	// stop makes run return once a value is received on it.
	stop chan bool
}
// disabledScheduler is a no-op implementation used when the scheduler is
// disabled. It carries no state: its methods only report the disabled status
// or return an error.
type disabledScheduler struct{}
// TriggerOnDemandTest always fails: no test can be run while the scheduler is
// disabled in configuration. All arguments are ignored and the returned
// identifier is empty.
func (d *disabledScheduler) TriggerOnDemandTest(pluginName string, targetType happydns.TestScopeType, targetId happydns.Identifier, userId happydns.Identifier, options happydns.PluginOptions) (happydns.Identifier, error) {
	err := fmt.Errorf("test scheduler is disabled in configuration")
	return happydns.Identifier{}, err
}
// GetSchedulerStatus reports a scheduler that is neither enabled (by
// configuration or at runtime) nor running.
func (d *disabledScheduler) GetSchedulerStatus() happydns.SchedulerStatus {
	var status happydns.SchedulerStatus
	status.ConfigEnabled = false
	status.RuntimeEnabled = false
	status.Running = false
	return status
}
// SetEnabled always fails, whatever the requested state: a scheduler disabled
// in configuration cannot be toggled at runtime.
func (d *disabledScheduler) SetEnabled(enabled bool) error {
	err := fmt.Errorf("scheduler is disabled in configuration, cannot enable at runtime")
	return err
}
// RescheduleUpcomingTests always fails with a zero count: there is nothing to
// reschedule while the scheduler is disabled in configuration.
func (d *disabledScheduler) RescheduleUpcomingTests() (int, error) {
	const rescheduled = 0
	return rescheduled, fmt.Errorf("test scheduler is disabled in configuration")
}
// newTestScheduler builds a scheduler together with its worker pool. Nothing
// is started here: goroutines are only launched by Run.
//
// The pool size is cfg.TestWorkers, or the number of CPUs when that setting is
// zero or negative. The scheduler starts with its runtime switch enabled.
func newTestScheduler(
	cfg *happydns.Options,
	store storage.Storage,
	pluginUsecase happydns.TestPluginUsecase,
) *testScheduler {
	workerCount := cfg.TestWorkers
	if workerCount <= 0 {
		workerCount = runtime.NumCPU()
	}

	s := &testScheduler{
		cfg:              cfg,
		store:            store,
		pluginUsecase:    pluginUsecase,
		resultUsecase:    testresult.NewTestResultUsecase(store, cfg),
		scheduleUsecase:  testresult.NewTestScheduleUsecase(store, cfg),
		stop:             make(chan bool),
		runNowChan:       make(chan *happydns.TestSchedule, 100),
		queue:            newPriorityQueue(),
		activeExecutions: make(map[string]*activeExecution),
		workers:          make([]*worker, 0, workerCount),
		runtimeEnabled:   true,
	}

	// Populate the pool; each worker gets its own stop channel.
	for id := 0; id < workerCount; id++ {
		s.workers = append(s.workers, &worker{
			id:        id,
			scheduler: s,
			stop:      make(chan bool),
		})
	}

	return s
}
// Close stops the scheduler: it ends the main loop, cancels the tests that
// are currently executing, stops every worker and waits for them to return.
//
// When Run is not running (never started, or disabled by configuration),
// nothing receives on the stop channels, so Close returns without signalling
// instead of blocking forever on an unbuffered send.
func (s *testScheduler) Close() {
	log.Println("Stopping test scheduler...")

	s.mu.RLock()
	running := s.running
	s.mu.RUnlock()

	if running {
		// Stop the main loop; the send completes once Run is back in its select.
		s.stop <- true

		// Stop all workers, cancelling whatever they are executing so that
		// shutdown does not have to wait for a full test timeout.
		for _, w := range s.workers {
			s.stopWorker(w)
		}
	}

	// Wait for all workers to finish
	s.wg.Wait()

	log.Println("Test scheduler stopped")
}

// stopWorker delivers the stop signal to w. While the worker is busy, active
// executions are cancelled again at a short interval: a worker may have picked
// up a new item between an earlier cancellation and the moment it checks its
// stop channel.
func (s *testScheduler) stopWorker(w *worker) {
	retry := time.NewTicker(100 * time.Millisecond)
	defer retry.Stop()

	for {
		s.cancelActiveExecutions()

		select {
		case w.stop <- true:
			return
		case <-retry.C:
			// Worker still busy: cancel again and retry.
		}
	}
}

// cancelActiveExecutions cancels the context of every execution currently
// registered as active. Cancelling an already-cancelled context is a no-op.
func (s *testScheduler) cancelActiveExecutions() {
	s.mu.RLock()
	defer s.mu.RUnlock()

	for _, exec := range s.activeExecutions {
		exec.cancel()
	}
}
// Run is the scheduler main loop. It blocks until a value is received on the
// stop channel, and returns immediately when the scheduler is disabled by
// configuration.
//
// Before looping it reschedules overdue tests, starts the workers, runs a
// first discovery pass and a first schedule check.
func (s *testScheduler) Run() {
	if s.cfg.DisableScheduler {
		log.Println("Test scheduler disabled by configuration")
		return
	}

	setRunning := func(state bool) {
		s.mu.Lock()
		s.running = state
		s.mu.Unlock()
	}
	setRunning(true)
	defer setRunning(false)

	log.Printf("Starting test scheduler with %d workers...\n", len(s.workers))

	// Tests missed during a server suspend or shutdown are spread into the
	// near future before any worker starts, instead of all firing at once.
	n, err := s.scheduleUsecase.RescheduleOverdueTests()
	switch {
	case err != nil:
		log.Printf("Warning: failed to reschedule overdue tests: %v\n", err)
	case n > 0:
		log.Printf("Rescheduled %d overdue test(s) into the near future\n", n)
	}

	// Launch the worker pool.
	for i := range s.workers {
		s.wg.Add(1)
		go s.workers[i].run(&s.wg)
	}

	// Periodic triggers of the main loop.
	dueTick := time.NewTicker(SchedulerCheckInterval)
	defer dueTick.Stop()
	purgeTick := time.NewTicker(SchedulerCleanupInterval)
	defer purgeTick.Stop()
	discoverTick := time.NewTicker(SchedulerDiscoveryInterval)
	defer discoverTick.Stop()

	// First pass without waiting for the tickers: create default schedules
	// for all existing targets, then queue whatever is already due.
	s.discoverAndEnsureSchedules()
	s.checkSchedules()

	for {
		select {
		case <-s.stop:
			return
		case requested := <-s.runNowChan:
			s.queueOnDemandTest(requested)
		case <-dueTick.C:
			s.checkSchedules()
		case <-discoverTick.C:
			s.discoverAndEnsureSchedules()
		case <-purgeTick.C:
			s.cleanup()
		}
	}
}
// checkSchedules lists the schedules that are due, creates a pending
// execution record for each of them and pushes it on the queue. It does
// nothing while the scheduler is disabled at runtime.
//
// A schedule's next run is only updated once its execution has finished, so a
// schedule that is still queued or executing would be listed as due again on
// every tick. Such schedules are skipped to avoid piling up duplicate
// executions.
func (s *testScheduler) checkSchedules() {
	s.mu.RLock()
	enabled := s.runtimeEnabled
	s.mu.RUnlock()
	if !enabled {
		return
	}

	dueSchedules, err := s.scheduleUsecase.ListDueSchedules()
	if err != nil {
		log.Printf("Error listing due schedules: %v\n", err)
		return
	}

	inFlight := s.inFlightScheduleIDs()

	now := time.Now()
	for _, schedule := range dueSchedules {
		scheduleID := schedule.Id.String()
		if _, busy := inFlight[scheduleID]; busy {
			// Already queued or running: do not queue it a second time.
			continue
		}

		// Determine priority based on how overdue the test is: more than one
		// full interval late counts as overdue.
		priority := PriorityScheduled
		if schedule.NextRun.Add(schedule.Interval).Before(now) {
			priority = PriorityOverdue
		}

		// Create execution record
		execution := &happydns.TestExecution{
			ScheduleId: &schedule.Id,
			PluginName: schedule.PluginName,
			OwnerId:    schedule.OwnerId,
			TargetType: schedule.TargetType,
			TargetId:   schedule.TargetId,
			Status:     happydns.TestExecutionPending,
			StartedAt:  time.Now(),
			Options:    schedule.Options,
		}
		if err := s.resultUsecase.CreateTestExecution(execution); err != nil {
			log.Printf("Error creating execution for schedule %s: %v\n", schedule.Id, err)
			continue
		}

		// Queue the test
		s.queue.Push(&queueItem{
			schedule:  schedule,
			execution: execution,
			priority:  priority,
			queuedAt:  now,
			retries:   0,
		})
		inFlight[scheduleID] = struct{}{}
	}

	// Mark scheduler run
	if err := s.store.TestSchedulerRun(); err != nil {
		log.Printf("Error marking scheduler run: %v\n", err)
	}
}

// inFlightScheduleIDs returns the set of schedule IDs (as strings) that have
// an execution waiting in the queue or registered as active. On-demand
// executions carry no schedule ID and are ignored.
//
// The snapshot is best-effort: an item that a worker has popped but not yet
// registered as active is not seen.
func (s *testScheduler) inFlightScheduleIDs() map[string]struct{} {
	ids := make(map[string]struct{})

	s.queue.mu.Lock()
	for _, item := range s.queue.items {
		if item.execution != nil && item.execution.ScheduleId != nil {
			ids[item.execution.ScheduleId.String()] = struct{}{}
		}
	}
	s.queue.mu.Unlock()

	s.mu.RLock()
	for _, active := range s.activeExecutions {
		if active.execution != nil && active.execution.ScheduleId != nil {
			ids[active.execution.ScheduleId.String()] = struct{}{}
		}
	}
	s.mu.RUnlock()

	return ids
}
// discoverAndEnsureSchedules creates default (enabled) schedules for all
// (plugin, domain) pairs that don't yet have a schedule record.
// This implements the opt-out model: tests run automatically unless a schedule
// with Enabled=false has been explicitly saved.
//
// It does nothing while the scheduler is disabled at runtime. Failures are
// logged and never abort the whole pass: a domain whose schedules cannot be
// listed is skipped, and a schedule that cannot be created does not prevent
// the remaining ones from being created.
func (s *testScheduler) discoverAndEnsureSchedules() {
	s.mu.RLock()
	enabled := s.runtimeEnabled
	s.mu.RUnlock()
	if !enabled {
		return
	}

	plugins, err := s.pluginUsecase.ListTestPlugins()
	if err != nil {
		log.Printf("Error listing test plugins for discovery: %v\n", err)
		return
	}

	// Filter domain-level plugins
	var domainPlugins []happydns.TestPlugin
	for _, p := range plugins {
		if p.Version().AvailableOn.ApplyToDomain {
			domainPlugins = append(domainPlugins, p)
		}
	}

	// Service-level plugin discovery is deferred: services live inside zones
	// and enumeration would require iterating all zones across all domains.
	// Services get auto-scheduled on their first explicit interaction instead.
	if len(domainPlugins) == 0 {
		return
	}

	iter, err := s.store.ListAllDomains()
	if err != nil {
		log.Printf("Error listing domains for schedule discovery: %v\n", err)
		return
	}
	defer iter.Close()

	for iter.Next() {
		domain := iter.Item()
		if domain == nil {
			continue
		}

		// The existing schedules of a domain do not depend on the plugin being
		// examined: fetch them once per domain rather than once per plugin.
		schedules, err := s.scheduleUsecase.ListSchedulesByTarget(happydns.TestScopeDomain, domain.Id)
		if err != nil {
			log.Printf("Error listing schedules for domain %s during discovery: %v\n", domain.Id, err)
			continue
		}

		scheduled := make(map[string]struct{}, len(schedules))
		for _, sched := range schedules {
			scheduled[sched.PluginName] = struct{}{}
		}

		for _, plugin := range domainPlugins {
			pluginName := plugin.Version().Name
			if _, ok := scheduled[pluginName]; ok {
				continue
			}

			if err := s.scheduleUsecase.CreateSchedule(&happydns.TestSchedule{
				PluginName: pluginName,
				OwnerId:    domain.Owner,
				TargetType: happydns.TestScopeDomain,
				TargetId:   domain.Id,
				Enabled:    true,
			}); err != nil {
				log.Printf("Error auto-creating schedule for domain %s / plugin %s: %v\n",
					domain.Id, pluginName, err)
			}
		}
	}
}
// queueOnDemandTest records a pending execution for schedule and pushes it on
// the queue with on-demand priority. The execution carries no schedule ID.
// When the execution record cannot be created, the error is logged and
// nothing is queued.
func (s *testScheduler) queueOnDemandTest(schedule *happydns.TestSchedule) {
	// ScheduleId is left nil: an on-demand run is not tied to a schedule.
	execution := new(happydns.TestExecution)
	execution.PluginName = schedule.PluginName
	execution.OwnerId = schedule.OwnerId
	execution.TargetType = schedule.TargetType
	execution.TargetId = schedule.TargetId
	execution.Status = happydns.TestExecutionPending
	execution.StartedAt = time.Now()
	execution.Options = schedule.Options

	err := s.resultUsecase.CreateTestExecution(execution)
	if err != nil {
		log.Printf("Error creating on-demand execution: %v\n", err)
		return
	}

	request := &queueItem{
		schedule:  schedule,
		execution: execution,
		priority:  PriorityOnDemand,
		queuedAt:  time.Now(),
		retries:   0,
	}
	s.queue.Push(request)
}
// TriggerOnDemandTest queues a test of pluginName against the given target
// with on-demand priority.
//
// It returns the identifier of the execution record created for the run, or
// an empty identifier and the error when that record cannot be created. The
// test itself runs later, once a worker picks the item from the queue.
func (s *testScheduler) TriggerOnDemandTest(pluginName string, targetType happydns.TestScopeType, targetId happydns.Identifier, ownerId happydns.Identifier, options happydns.PluginOptions) (happydns.Identifier, error) {
	// Execution record; no schedule ID since the run is not tied to a schedule.
	execution := &happydns.TestExecution{
		ScheduleId: nil,
		PluginName: pluginName,
		OwnerId:    ownerId,
		TargetType: targetType,
		TargetId:   targetId,
		Status:     happydns.TestExecutionPending,
		StartedAt:  time.Now(),
		Options:    options,
	}
	err := s.resultUsecase.CreateTestExecution(execution)
	if err != nil {
		return happydns.Identifier{}, err
	}

	// Throw-away schedule describing the run to the worker; it is never stored.
	adHoc := &happydns.TestSchedule{
		PluginName: pluginName,
		OwnerId:    ownerId,
		TargetType: targetType,
		TargetId:   targetId,
		Interval:   0, // On-demand, no interval
		Enabled:    true,
		Options:    options,
	}

	request := &queueItem{
		schedule:  adHoc,
		execution: execution,
		priority:  PriorityOnDemand,
		queuedAt:  time.Now(),
		retries:   0,
	}
	s.queue.Push(request)

	return execution.Id, nil
}
// GetSchedulerStatus returns a snapshot of the current scheduler state:
// configuration and runtime switches, whether the main loop is running, pool
// and queue sizes, the number of executions in progress and the next upcoming
// schedules.
//
// A failure to list the upcoming schedules is logged and does not fail the
// call; the rest of the status is still returned.
func (s *testScheduler) GetSchedulerStatus() happydns.SchedulerStatus {
	// Maximum number of upcoming schedules included in the status.
	const upcomingLimit = 20

	s.mu.RLock()
	activeCount := len(s.activeExecutions)
	running := s.running
	runtimeEnabled := s.runtimeEnabled
	s.mu.RUnlock()

	nextSchedules, err := s.scheduleUsecase.ListUpcomingSchedules(upcomingLimit)
	if err != nil {
		log.Printf("Error listing upcoming schedules for scheduler status: %v\n", err)
	}

	return happydns.SchedulerStatus{
		ConfigEnabled:  !s.cfg.DisableScheduler,
		RuntimeEnabled: runtimeEnabled,
		Running:        running,
		WorkerCount:    len(s.workers),
		QueueSize:      s.queue.Len(),
		ActiveCount:    activeCount,
		NextSchedules:  nextSchedules,
	}
}
// SetEnabled flips the runtime switch of the scheduler. While it is off, the
// periodic schedule check and the target discovery return without doing
// anything. It always returns nil.
func (s *testScheduler) SetEnabled(enabled bool) error {
	s.mu.Lock()
	defer s.mu.Unlock()

	s.runtimeEnabled = enabled
	return nil
}
// RescheduleUpcomingTests randomizes the next run time of all enabled
// schedules within their respective intervals. The work is entirely delegated
// to the schedule usecase, whose count and error are returned unchanged.
func (s *testScheduler) RescheduleUpcomingTests() (int, error) {
	count, err := s.scheduleUsecase.RescheduleUpcomingTests()
	return count, err
}
// cleanup purges old data: completed/failed execution records older than
// seven days, then test results past the configured retention period. Each
// step is independent; a failure is logged and the other step still runs.
func (s *testScheduler) cleanup() {
	// Age after which completed/failed execution records are deleted.
	const executionRetention = 7 * 24 * time.Hour

	log.Println("Running scheduler cleanup...")

	err := s.resultUsecase.DeleteCompletedExecutions(executionRetention)
	if err != nil {
		log.Printf("Error cleaning up old executions: %v\n", err)
	}

	err = s.resultUsecase.CleanupOldResults()
	if err != nil {
		log.Printf("Error cleaning up old test results: %v\n", err)
	}

	log.Println("Scheduler cleanup complete")
}
// run is the worker loop: it pops items from the scheduler queue and executes
// them one at a time until a value is received on the worker's stop channel.
// wg.Done is called when the loop returns.
//
// When the queue is empty the worker waits for a short poll interval. That
// wait also listens on the stop channel, so an idle worker stops immediately
// instead of finishing a blind sleep first.
func (w *worker) run(wg *sync.WaitGroup) {
	defer wg.Done()

	// How long an idle worker waits before polling the queue again.
	const idlePoll = 1 * time.Second

	log.Printf("Worker %d started\n", w.id)

	for {
		// A pending stop request takes precedence over picking up new work.
		select {
		case <-w.stop:
			log.Printf("Worker %d stopped\n", w.id)
			return
		default:
		}

		// Try to get work from queue
		if item := w.scheduler.queue.Pop(); item != nil {
			w.executeTest(item)
			continue
		}

		// No work: wait briefly, but stay responsive to the stop signal.
		select {
		case <-w.stop:
			log.Printf("Worker %d stopped\n", w.id)
			return
		case <-time.After(idlePoll):
		}
	}
}
// executeTest runs the test plugin described by item and stores the outcome.
//
// Steps: mark the execution as running, register it as active (so that it can
// be cancelled), resolve the plugin, merge the stored plugin options with the
// schedule options, run the plugin under TestExecutionTimeout, save a test
// result and complete the execution.
//
// A plugin error, a plugin panic, a timeout or a cancellation all produce a
// KO test result rather than a failed execution. The execution itself is
// marked as failed only when it cannot be updated, when the plugin is not
// found or when the result cannot be saved.
//
// For scheduled runs, the schedule is updated after the run whatever the
// outcome.
func (w *worker) executeTest(item *queueItem) {
	ctx, cancel := context.WithTimeout(context.Background(), TestExecutionTimeout)
	defer cancel()

	execution := item.execution
	schedule := item.schedule
	results := w.scheduler.resultUsecase

	// fail marks the execution as failed; an error doing so is logged, as
	// there is nothing else left to fall back on.
	fail := func(reason string) {
		if err := results.FailTestExecution(execution.Id, reason); err != nil {
			log.Printf("Worker %d: Error marking execution %s as failed: %v\n", w.id, execution.Id, err)
		}
	}

	// Always update schedule NextRun after execution, whether it succeeds or fails.
	// This prevents the schedule from being re-queued on the next tick if the test fails.
	if item.execution.ScheduleId != nil {
		defer func() {
			if err := w.scheduler.scheduleUsecase.UpdateScheduleAfterRun(*item.execution.ScheduleId); err != nil {
				log.Printf("Worker %d: Error updating schedule after run: %v\n", w.id, err)
			}
		}()
	}

	// Mark execution as running
	execution.Status = happydns.TestExecutionRunning
	if err := results.UpdateTestExecution(execution); err != nil {
		log.Printf("Worker %d: Error updating execution status: %v\n", w.id, err)
		fail(err.Error())
		return
	}

	// Track active execution, so that its context can be cancelled from
	// outside this goroutine.
	executionKey := execution.Id.String()
	w.scheduler.mu.Lock()
	w.scheduler.activeExecutions[executionKey] = &activeExecution{
		execution: execution,
		cancel:    cancel,
		startTime: time.Now(),
	}
	w.scheduler.mu.Unlock()

	defer func() {
		w.scheduler.mu.Lock()
		delete(w.scheduler.activeExecutions, executionKey)
		w.scheduler.mu.Unlock()
	}()

	// Get the plugin
	plugin, err := w.scheduler.pluginUsecase.GetTestPlugin(schedule.PluginName)
	if err != nil {
		errMsg := fmt.Sprintf("plugin not found: %s - %v", schedule.PluginName, err)
		log.Printf("Worker %d: %s\n", w.id, errMsg)
		fail(errMsg)
		return
	}

	// Merge options: global defaults < user opts < domain/service opts < schedule opts
	var domainId, serviceId *happydns.Identifier
	switch schedule.TargetType {
	case happydns.TestScopeDomain:
		domainId = &schedule.TargetId
	case happydns.TestScopeService:
		serviceId = &schedule.TargetId
	}

	// Failing to fetch the stored options is not fatal: the test then runs
	// with the schedule options only.
	baseOptions, err := w.scheduler.pluginUsecase.GetTestPluginOptions(schedule.PluginName, &schedule.OwnerId, domainId, serviceId)
	if err != nil {
		log.Printf("Worker %d: warning, could not fetch plugin options for %s: %v\n", w.id, schedule.PluginName, err)
	}

	mergedOptions := schedule.Options
	if baseOptions != nil {
		mergedOptions = w.scheduler.scheduleUsecase.MergePluginOptions(nil, nil, *baseOptions, schedule.Options)
	}

	// Prepare metadata
	meta := map[string]string{
		"target_type": schedule.TargetType.String(),
		"target_id":   schedule.TargetId.String(),
	}

	// Run the test in its own goroutine so that the wait below can be bounded.
	// Both channels are buffered: the goroutine can always deliver its outcome
	// and exit, even when nobody is waiting any more. RunTest receives no
	// context, so the plugin call itself keeps running after a timeout or a
	// cancellation; only the wait is abandoned.
	startTime := time.Now()
	resultChan := make(chan *happydns.PluginResult, 1)
	errorChan := make(chan error, 1)

	go func() {
		defer func() {
			if r := recover(); r != nil {
				errorChan <- fmt.Errorf("plugin panicked: %v", r)
			}
		}()

		result, err := plugin.RunTest(mergedOptions, meta)
		if err != nil {
			errorChan <- err
		} else {
			resultChan <- result
		}
	}()

	// Wait for result, timeout or cancellation
	var pluginResult *happydns.PluginResult
	var testErr error

	select {
	case pluginResult = <-resultChan:
		// Test completed successfully
	case testErr = <-errorChan:
		// Test returned an error (or panicked)
	case <-ctx.Done():
		// Only an expired deadline is a timeout; otherwise the context was
		// cancelled through the cancel function registered above.
		if ctx.Err() == context.DeadlineExceeded {
			testErr = fmt.Errorf("test execution timeout after %v", TestExecutionTimeout)
		} else {
			testErr = fmt.Errorf("test execution cancelled")
		}
	}

	duration := time.Since(startTime)

	// Store the result
	// NOTE(review): Options records the schedule options, not the merged
	// options actually passed to the plugin — confirm this is intended.
	result := &happydns.TestResult{
		PluginName:    schedule.PluginName,
		TestType:      schedule.TargetType,
		TargetId:      schedule.TargetId,
		OwnerId:       schedule.OwnerId,
		ExecutedAt:    time.Now(),
		ScheduledTest: item.execution.ScheduleId != nil,
		Options:       schedule.Options,
		Duration:      duration,
	}

	switch {
	case testErr != nil:
		result.Status = happydns.PluginResultStatusKO
		result.StatusLine = "Test execution failed"
		result.Error = testErr.Error()
	case pluginResult != nil:
		result.Status = pluginResult.Status
		result.StatusLine = pluginResult.StatusLine
		result.Report = pluginResult.Report
	default:
		// The plugin returned neither a result nor an error.
		result.Status = happydns.PluginResultStatusKO
		result.StatusLine = "Unknown error"
		result.Error = "No result or error returned from plugin"
	}

	// Save the result
	if err := results.CreateTestResult(result); err != nil {
		log.Printf("Worker %d: Error saving test result: %v\n", w.id, err)
		fail(err.Error())
		return
	}

	// Complete the execution
	if err := results.CompleteTestExecution(execution.Id, result.Id); err != nil {
		log.Printf("Worker %d: Error completing execution: %v\n", w.id, err)
		return
	}

	log.Printf("Worker %d: Completed test %s for target %s (status: %d, duration: %v)\n",
		w.id, schedule.PluginName, schedule.TargetId, result.Status, duration)
}