happyDomain/internal/app/testscheduler.go

// This file is part of the happyDomain (R) project.
// Copyright (c) 2020-2026 happyDomain
// Authors: Pierre-Olivier Mercier, et al.
//
// This program is offered under a commercial and under the AGPL license.
// For commercial licensing, contact us at <contact@happydomain.org>.
//
// For AGPL licensing:
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU Affero General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// GNU Affero General Public License for more details.
//
// You should have received a copy of the GNU Affero General Public License
// along with this program.  If not, see <https://www.gnu.org/licenses/>.

package app

import (
	"context"
	"fmt"
	"log"
	"runtime"
	"sync"
	"time"

	"git.happydns.org/happyDomain/internal/storage"
	"git.happydns.org/happyDomain/internal/usecase/testresult"
	"git.happydns.org/happyDomain/model"
)

// Timing and retry settings used by the test scheduler.
//
// NOTE(review): MaxRetries is declared here but nothing in this file reads it
// (queue items are always created with retries set to 0) — confirm whether
// retry handling is implemented elsewhere or still pending.
const (
	SchedulerCheckInterval     = 1 * time.Minute // How often to check for due tests
	SchedulerCleanupInterval   = 24 * time.Hour  // How often to clean up old executions
	SchedulerDiscoveryInterval = 1 * time.Hour   // How often to auto-discover new targets
	TestExecutionTimeout       = 5 * time.Minute // Max time for a single test
	MaxRetries                 = 3               // Max retry attempts for failed tests
)

// Priority levels for the test execution queue. A lower value means a higher
// priority: priorityQueue.Push keeps items ordered by ascending priority
// value, so on-demand tests are served before overdue and regular ones.
const (
	PriorityOnDemand  = iota // On-demand tests (highest priority)
	PriorityOverdue          // Overdue scheduled tests
	PriorityScheduled        // Regular scheduled tests
)

// testScheduler manages background test execution.
//
// Run drives a main loop that periodically queues due schedules, ensures
// default schedules exist and cleans up old records, while a pool of workers
// pops items from queue and executes them.
//
// Fields:
//   - cfg, store, pluginUsecase, resultUsecase, scheduleUsecase: configuration
//     and collaborators received or built by newTestScheduler.
//   - stop: signalled by Close to make the main loop in Run return.
//   - runNowChan: schedules received here by Run are queued as on-demand tests.
//   - queue: pending executions, ordered by priority.
//   - activeExecutions: executions currently being run by a worker, keyed by
//     the string form of the execution ID.
//   - workers: the worker pool started by Run.
//   - mu: guards activeExecutions, runtimeEnabled and running.
//   - wg: tracks the worker goroutines so Close can wait for them.
//   - runtimeEnabled: when false, checkSchedules and
//     discoverAndEnsureSchedules return without doing anything.
//   - running: true from the moment Run passes its configuration check until
//     Run returns.
type testScheduler struct {
	cfg              *happydns.Options
	store            storage.Storage
	pluginUsecase    happydns.TestPluginUsecase
	resultUsecase    *testresult.TestResultUsecase
	scheduleUsecase  *testresult.TestScheduleUsecase
	stop             chan bool
	runNowChan       chan *happydns.TestSchedule
	queue            *priorityQueue
	activeExecutions map[string]*activeExecution
	workers          []*worker
	mu               sync.RWMutex
	wg               sync.WaitGroup
	runtimeEnabled   bool
	running          bool
}

// activeExecution tracks a test execution that a worker is currently running.
// execution is the record being processed, cancel cancels the context the
// worker waits on while the plugin runs (Close calls it during shutdown), and
// startTime is when the worker registered the execution.
type activeExecution struct {
	execution *happydns.TestExecution
	cancel    context.CancelFunc
	startTime time.Time
}

// queueItem represents a test execution request waiting in the queue.
// schedule describes what to run (for on-demand requests it is a temporary
// schedule built in memory), execution is the execution record already created
// for this request, priority is one of the Priority* constants and queuedAt is
// the time the item was enqueued. retries is always initialised to 0 in this
// file and is not read here.
type queueItem struct {
	schedule  *happydns.TestSchedule
	execution *happydns.TestExecution
	priority  int
	queuedAt  time.Time
	retries   int
}

// priorityQueue manages the test execution queue with priority levels.
// items is kept ordered by ascending priority value (Push maintains that
// order), so the first element is always the next one Pop returns; mu guards
// every access to items.
type priorityQueue struct {
	items []*queueItem
	mu    sync.Mutex
}

// newPriorityQueue returns an empty, ready-to-use priority queue whose item
// slice is allocated (non-nil) but holds no elements.
func newPriorityQueue() *priorityQueue {
	q := new(priorityQueue)
	q.items = []*queueItem{}
	return q
}

// Push inserts item into the queue, keeping the queue ordered by ascending
// priority value (a lower number means a higher priority). Items sharing the
// same priority stay in arrival order, because the new item only moves ahead
// of entries whose priority value is strictly greater.
func (q *priorityQueue) Push(item *queueItem) {
	q.mu.Lock()
	defer q.mu.Unlock()

	// Grow the slice by one slot, then shift lower-priority entries one
	// position to the right until the insertion point is reached.
	q.items = append(q.items, item)
	pos := len(q.items) - 1
	for pos > 0 && item.priority < q.items[pos-1].priority {
		q.items[pos] = q.items[pos-1]
		pos--
	}
	q.items[pos] = item
}

// Pop removes and returns the highest priority item (the head of the queue).
// It returns nil when the queue is empty.
func (q *priorityQueue) Pop() *queueItem {
	q.mu.Lock()
	defer q.mu.Unlock()

	if len(q.items) == 0 {
		return nil
	}

	item := q.items[0]
	// Clear the slot before re-slicing: the backing array outlives the
	// re-slice, and a pointer left behind would keep the popped item (with
	// its schedule and execution) reachable long after it has been processed.
	q.items[0] = nil
	q.items = q.items[1:]
	return item
}

// Len reports how many items are currently waiting in the queue. The count is
// read while holding the queue mutex.
func (q *priorityQueue) Len() int {
	q.mu.Lock()
	count := len(q.items)
	q.mu.Unlock()
	return count
}

// worker processes tests from the queue. Its run loop pops items from
// scheduler.queue and executes them until a stop signal arrives on the stop
// channel; id is used in this file only to tag log messages.
type worker struct {
	id        int
	scheduler *testScheduler
	stop      chan bool
}

// disabledScheduler is a no-op implementation used when the scheduler is
// disabled. It holds no state: its methods either return an error or report a
// status with every flag set to false.
type disabledScheduler struct{}

// TriggerOnDemandTest always fails: no test can be triggered while the
// scheduler is disabled. All arguments are ignored and an empty identifier is
// returned alongside the error.
func (d *disabledScheduler) TriggerOnDemandTest(pluginName string, targetType happydns.TestScopeType, targetId happydns.Identifier, userId happydns.Identifier, options happydns.PluginOptions) (happydns.Identifier, error) {
	err := fmt.Errorf("test scheduler is disabled in configuration")
	return happydns.Identifier{}, err
}

// GetSchedulerStatus reports the scheduler as disabled by configuration,
// disabled at runtime and not running. Every other status field keeps its
// zero value.
func (d *disabledScheduler) GetSchedulerStatus() happydns.SchedulerStatus {
	var status happydns.SchedulerStatus
	status.ConfigEnabled = false
	status.RuntimeEnabled = false
	status.Running = false
	return status
}

// SetEnabled always fails, whatever the requested state: a scheduler disabled
// in configuration cannot be toggled at runtime.
func (d *disabledScheduler) SetEnabled(enabled bool) error {
	err := fmt.Errorf("scheduler is disabled in configuration, cannot enable at runtime")
	return err
}

// RescheduleUpcomingTests always fails while the scheduler is disabled; the
// returned count is 0.
func (d *disabledScheduler) RescheduleUpcomingTests() (int, error) {
	const rescheduled = 0
	return rescheduled, fmt.Errorf("test scheduler is disabled in configuration")
}

// newTestScheduler builds a test scheduler together with its worker pool.
//
// The pool size comes from cfg.TestWorkers; when that value is zero or
// negative, one worker per CPU (runtime.NumCPU) is created instead. The
// scheduler starts with runtime scheduling enabled, but nothing runs until Run
// is called.
func newTestScheduler(
	cfg *happydns.Options,
	store storage.Storage,
	pluginUsecase happydns.TestPluginUsecase,
) *testScheduler {
	workerCount := cfg.TestWorkers
	if workerCount <= 0 {
		workerCount = runtime.NumCPU()
	}

	s := &testScheduler{
		cfg:              cfg,
		store:            store,
		pluginUsecase:    pluginUsecase,
		resultUsecase:    testresult.NewTestResultUsecase(store, cfg),
		scheduleUsecase:  testresult.NewTestScheduleUsecase(store, cfg),
		stop:             make(chan bool),
		runNowChan:       make(chan *happydns.TestSchedule, 100),
		queue:            newPriorityQueue(),
		activeExecutions: make(map[string]*activeExecution),
		workers:          make([]*worker, 0, workerCount),
		runtimeEnabled:   true,
	}

	// Each worker keeps a back-reference to the scheduler and owns its own
	// stop channel.
	for id := 0; id < workerCount; id++ {
		s.workers = append(s.workers, &worker{
			id:        id,
			scheduler: s,
			stop:      make(chan bool),
		})
	}

	return s
}

// Close stops the scheduler: it stops the main loop, tells every worker to
// stop, cancels the executions that are currently running and waits for the
// workers to finish.
//
// Shutdown order matters:
//   - The main loop is only signalled when Run is actually running, so Close
//     does not block forever on a scheduler that was never started (or whose
//     Run returned early).
//   - Worker stop channels are closed rather than sent to. Closing never
//     blocks, reaches all workers at once, and a worker busy with a test
//     notices it as soon as it returns to its loop.
//   - Active executions are cancelled before waiting, so shutdown does not
//     have to sit through tests that may run for up to TestExecutionTimeout.
//
// Calling Close again after it has returned is harmless. A worker that has
// popped an item but not yet registered it as active when Close runs is not
// cancelled; that single test finishes on its own before the worker exits.
func (s *testScheduler) Close() {
	log.Println("Stopping test scheduler...")

	// Stop the main loop. The send is a rendezvous with Run's select, so
	// once it completes the loop is no longer queueing new work.
	s.mu.RLock()
	running := s.running
	s.mu.RUnlock()
	if running {
		s.stop <- true
	}

	// closeIfOpen closes ch unless a previous Close already did. Nothing
	// sends on worker stop channels, so a successful receive means "closed".
	closeIfOpen := func(ch chan bool) {
		select {
		case <-ch:
		default:
			close(ch)
		}
	}

	s.mu.Lock()
	// Stop all workers
	for _, w := range s.workers {
		closeIfOpen(w.stop)
	}
	// Cancel all active executions
	for _, exec := range s.activeExecutions {
		exec.cancel()
	}
	s.mu.Unlock()

	// Wait for all workers to finish
	s.wg.Wait()

	log.Println("Test scheduler stopped")
}

// Run starts the workers and then blocks in the scheduler main loop until a
// stop signal is received. It returns immediately when the scheduler is
// disabled by configuration.
//
// Before entering the loop it reschedules overdue tests, ensures default
// schedules exist and performs a first check for due tests. The loop then
// reacts to three tickers (due-test check, cleanup, discovery), to schedules
// arriving on runNowChan and to the stop channel.
func (s *testScheduler) Run() {
	if s.cfg.DisableScheduler {
		log.Println("Test scheduler disabled by configuration")
		return
	}

	setRunning := func(state bool) {
		s.mu.Lock()
		s.running = state
		s.mu.Unlock()
	}
	setRunning(true)
	defer setRunning(false)

	log.Printf("Starting test scheduler with %d workers...\n", len(s.workers))

	// Tests missed during a server suspend or shutdown are spread into the
	// near future before any worker starts, so they do not all fire at once.
	rescheduled, err := s.scheduleUsecase.RescheduleOverdueTests()
	switch {
	case err != nil:
		log.Printf("Warning: failed to reschedule overdue tests: %v\n", err)
	case rescheduled > 0:
		log.Printf("Rescheduled %d overdue test(s) into the near future\n", rescheduled)
	}

	// Launch the worker pool.
	for i := range s.workers {
		s.wg.Add(1)
		go s.workers[i].run(&s.wg)
	}

	// Tickers driving the periodic work of the main loop.
	checkTicker := time.NewTicker(SchedulerCheckInterval)
	cleanupTicker := time.NewTicker(SchedulerCleanupInterval)
	discoveryTicker := time.NewTicker(SchedulerDiscoveryInterval)
	defer func() {
		discoveryTicker.Stop()
		cleanupTicker.Stop()
		checkTicker.Stop()
	}()

	// First pass: create default schedules for all existing targets, then
	// queue whatever is already due.
	s.discoverAndEnsureSchedules()
	s.checkSchedules()

	for {
		select {
		case <-s.stop:
			return

		case <-checkTicker.C:
			s.checkSchedules()

		case <-cleanupTicker.C:
			s.cleanup()

		case <-discoveryTicker.C:
			s.discoverAndEnsureSchedules()

		case requested := <-s.runNowChan:
			s.queueOnDemandTest(requested)
		}
	}
}

// checkSchedules looks up the schedules that are due, creates a pending
// execution record for each of them and pushes it onto the queue. It does
// nothing while the scheduler is disabled at runtime. After the pass it records
// the scheduler run in the store.
//
// A schedule whose next run is more than one full interval in the past is
// queued as PriorityOverdue, the others as PriorityScheduled.
//
// NOTE(review): a schedule's next run is only updated by the worker once the
// test has executed (see executeTest), so a schedule that is still queued or
// running at the next tick may be reported as due again — confirm that
// ListDueSchedules guards against this.
func (s *testScheduler) checkSchedules() {
	s.mu.RLock()
	active := s.runtimeEnabled
	s.mu.RUnlock()
	if !active {
		return
	}

	due, err := s.scheduleUsecase.ListDueSchedules()
	if err != nil {
		log.Printf("Error listing due schedules: %v\n", err)
		return
	}

	now := time.Now()
	for _, sched := range due {
		// Persist the execution record first; a schedule whose record cannot
		// be created is skipped for this pass.
		pending := &happydns.TestExecution{
			ScheduleId: &sched.Id,
			PluginName: sched.PluginName,
			OwnerId:    sched.OwnerId,
			TargetType: sched.TargetType,
			TargetId:   sched.TargetId,
			Status:     happydns.TestExecutionPending,
			StartedAt:  time.Now(),
			Options:    sched.Options,
		}
		if err := s.resultUsecase.CreateTestExecution(pending); err != nil {
			log.Printf("Error creating execution for schedule %s: %v\n", sched.Id, err)
			continue
		}

		entry := &queueItem{
			schedule:  sched,
			execution: pending,
			priority:  PriorityScheduled,
			queuedAt:  now,
		}
		// Promote tests that are late by more than one interval.
		if sched.NextRun.Add(sched.Interval).Before(now) {
			entry.priority = PriorityOverdue
		}
		s.queue.Push(entry)
	}

	// Record that a scheduling pass took place.
	if err := s.store.TestSchedulerRun(); err != nil {
		log.Printf("Error marking scheduler run: %v\n", err)
	}
}

// discoverAndEnsureSchedules creates default (enabled) schedules for all
// (plugin, target) pairs that don't yet have an explicit schedule record.
// This implements the opt-out model: tests run automatically unless a schedule
// with Enabled=false has been explicitly saved.
//
// Only domain-level plugins are handled. The existing schedules of a domain
// are listed once per domain and reused for every plugin, instead of being
// queried again for each plugin. It does nothing while the scheduler is
// disabled at runtime. Failures are logged and the affected domain or plugin
// is skipped; nothing is returned to the caller.
func (s *testScheduler) discoverAndEnsureSchedules() {
	s.mu.RLock()
	enabled := s.runtimeEnabled
	s.mu.RUnlock()
	if !enabled {
		return
	}

	plugins, err := s.pluginUsecase.ListTestPlugins()
	if err != nil {
		log.Printf("Error listing test plugins for discovery: %v\n", err)
		return
	}

	// Keep only the names of plugins that apply to domains.
	var pluginNames []string
	for _, p := range plugins {
		version := p.Version()
		if version.AvailableOn.ApplyToDomain {
			pluginNames = append(pluginNames, version.Name)
		}
	}
	if len(pluginNames) == 0 {
		return
	}

	iter, err := s.store.ListAllDomains()
	if err != nil {
		log.Printf("Error listing domains for schedule discovery: %v\n", err)
		return
	}
	defer iter.Close()

	for iter.Next() {
		domain := iter.Item()
		if domain == nil {
			continue
		}

		schedules, err := s.scheduleUsecase.ListSchedulesByTarget(happydns.TestScopeDomain, domain.Id)
		if err != nil {
			log.Printf("Error listing schedules for domain %s: %v\n", domain.Id, err)
			continue
		}

		// Plugin names that already have a schedule for this domain,
		// whether it is enabled or not.
		scheduled := make(map[string]struct{}, len(schedules))
		for _, sched := range schedules {
			scheduled[sched.PluginName] = struct{}{}
		}

		for _, pluginName := range pluginNames {
			if _, ok := scheduled[pluginName]; ok {
				continue
			}

			if err := s.scheduleUsecase.CreateSchedule(&happydns.TestSchedule{
				PluginName: pluginName,
				OwnerId:    domain.Owner,
				TargetType: happydns.TestScopeDomain,
				TargetId:   domain.Id,
				Enabled:    true,
			}); err != nil {
				log.Printf("Error auto-creating schedule for domain %s / plugin %s: %v\n",
					domain.Id, pluginName, err)
				continue
			}
			// Remember the new schedule so a plugin name listed twice does
			// not get a second schedule for the same domain.
			scheduled[pluginName] = struct{}{}
		}
	}

	// Service-level plugin discovery is deferred: services live inside zones
	// and enumeration would require iterating all zones across all domains.
	// Services get auto-scheduled on their first explicit interaction instead.
}

// queueOnDemandTest creates a pending execution record for schedule and pushes
// it onto the queue with PriorityOnDemand. The execution carries no schedule
// ID, which marks it as on-demand. If the record cannot be created the error
// is logged and nothing is queued.
func (s *testScheduler) queueOnDemandTest(schedule *happydns.TestSchedule) {
	pending := new(happydns.TestExecution)
	pending.PluginName = schedule.PluginName
	pending.OwnerId = schedule.OwnerId
	pending.TargetType = schedule.TargetType
	pending.TargetId = schedule.TargetId
	pending.Status = happydns.TestExecutionPending
	pending.StartedAt = time.Now()
	pending.Options = schedule.Options

	err := s.resultUsecase.CreateTestExecution(pending)
	if err != nil {
		log.Printf("Error creating on-demand execution: %v\n", err)
		return
	}

	entry := &queueItem{
		schedule:  schedule,
		execution: pending,
		priority:  PriorityOnDemand,
		queuedAt:  time.Now(),
	}
	s.queue.Push(entry)
}

// TriggerOnDemandTest requests an immediate run of the given plugin against the
// given target. It creates a pending execution record, queues it with
// PriorityOnDemand and returns the ID of that execution.
//
// If the execution record cannot be created, that error is returned together
// with an empty identifier and nothing is queued.
func (s *testScheduler) TriggerOnDemandTest(pluginName string, targetType happydns.TestScopeType, targetId happydns.Identifier, ownerId happydns.Identifier, options happydns.PluginOptions) (happydns.Identifier, error) {
	// On-demand executions are not tied to a stored schedule, hence no
	// schedule ID on the record.
	pending := &happydns.TestExecution{
		PluginName: pluginName,
		OwnerId:    ownerId,
		TargetType: targetType,
		TargetId:   targetId,
		Status:     happydns.TestExecutionPending,
		StartedAt:  time.Now(),
		Options:    options,
	}
	err := s.resultUsecase.CreateTestExecution(pending)
	if err != nil {
		return happydns.Identifier{}, err
	}

	// The worker reads what to run from a schedule, so wrap the request in a
	// temporary one (no interval) that is never saved.
	entry := &queueItem{
		schedule: &happydns.TestSchedule{
			PluginName: pluginName,
			OwnerId:    ownerId,
			TargetType: targetType,
			TargetId:   targetId,
			Enabled:    true,
			Options:    options,
		},
		execution: pending,
		priority:  PriorityOnDemand,
		queuedAt:  time.Now(),
	}
	s.queue.Push(entry)

	return pending.Id, nil
}

// GetSchedulerStatus returns a snapshot of the current scheduler state:
// configuration and runtime flags, whether the main loop is running, the
// worker count, the queue length, the number of active executions and up to 20
// upcoming schedules.
//
// Listing the upcoming schedules is best effort: a failure is logged and the
// status is still returned with whatever list the usecase handed back.
func (s *testScheduler) GetSchedulerStatus() happydns.SchedulerStatus {
	s.mu.RLock()
	activeCount := len(s.activeExecutions)
	running := s.running
	runtimeEnabled := s.runtimeEnabled
	s.mu.RUnlock()

	nextSchedules, err := s.scheduleUsecase.ListUpcomingSchedules(20)
	if err != nil {
		// Do not fail the whole status report, but do not hide the problem.
		log.Printf("Error listing upcoming schedules for scheduler status: %v\n", err)
	}

	return happydns.SchedulerStatus{
		ConfigEnabled:  !s.cfg.DisableScheduler,
		RuntimeEnabled: runtimeEnabled,
		Running:        running,
		WorkerCount:    len(s.workers),
		QueueSize:      s.queue.Len(),
		ActiveCount:    activeCount,
		NextSchedules:  nextSchedules,
	}
}

// SetEnabled switches scheduling on or off at runtime by updating the
// runtimeEnabled flag under the scheduler mutex. It always returns nil.
func (s *testScheduler) SetEnabled(enabled bool) error {
	s.mu.Lock()
	defer s.mu.Unlock()

	s.runtimeEnabled = enabled
	return nil
}

// RescheduleUpcomingTests delegates to the schedule usecase's
// RescheduleUpcomingTests and returns its count and error unchanged.
func (s *testScheduler) RescheduleUpcomingTests() (int, error) {
	rescheduled, err := s.scheduleUsecase.RescheduleUpcomingTests()
	return rescheduled, err
}

// cleanup asks the result usecase to delete completed execution records older
// than seven days and then to clean up old test results. The two steps are
// independent: a failure in one is logged and does not prevent the other from
// running.
func (s *testScheduler) cleanup() {
	log.Println("Running scheduler cleanup...")

	// Age passed to DeleteCompletedExecutions: seven days.
	const executionRetention = 7 * 24 * time.Hour

	err := s.resultUsecase.DeleteCompletedExecutions(executionRetention)
	if err != nil {
		log.Printf("Error cleaning up old executions: %v\n", err)
	}

	// Second, independent step: old test results.
	err = s.resultUsecase.CleanupOldResults()
	if err != nil {
		log.Printf("Error cleaning up old test results: %v\n", err)
	}

	log.Println("Scheduler cleanup complete")
}

// run is the worker loop: it pops items from the scheduler queue and executes
// them one at a time until a stop signal arrives on w.stop. wg.Done is called
// when the loop exits.
//
// When the queue is empty the worker waits up to one second before polling
// again. That wait also listens on w.stop, so an idle worker reacts to a stop
// request immediately instead of finishing an uninterruptible sleep first.
func (w *worker) run(wg *sync.WaitGroup) {
	defer wg.Done()

	log.Printf("Worker %d started\n", w.id)

	for {
		// Check for a stop request before taking more work.
		select {
		case <-w.stop:
			log.Printf("Worker %d stopped\n", w.id)
			return
		default:
		}

		// Try to get work from queue
		item := w.scheduler.queue.Pop()
		if item == nil {
			// No work: wait briefly, but stay responsive to stop.
			select {
			case <-w.stop:
				log.Printf("Worker %d stopped\n", w.id)
				return
			case <-time.After(1 * time.Second):
			}
			continue
		}

		// Execute the test
		w.executeTest(item)
	}
}

// executeTest runs the test plugin described by item and stores the result.
//
// The steps are:
//  1. mark the execution as running and register it as active so that Close
//     can cancel it;
//  2. look up the plugin and merge its stored options with the schedule's;
//  3. run the plugin in its own goroutine and wait for its result, its error,
//     a panic, the TestExecutionTimeout deadline or a cancellation;
//  4. save a test result (status KO when the run failed, timed out or was
//     cancelled) and complete the execution.
//
// For scheduled tests the schedule's next run is updated when the function
// returns, whatever the outcome. The plugin call itself takes no context, so a
// timeout or cancellation only stops the wait: the plugin goroutine keeps
// running until RunTest returns, and its late result is dropped into a
// buffered channel.
func (w *worker) executeTest(item *queueItem) {
	ctx, cancel := context.WithTimeout(context.Background(), TestExecutionTimeout)
	defer cancel()

	execution := item.execution
	schedule := item.schedule

	// Always update schedule NextRun after execution, whether it succeeds or fails.
	// This prevents the schedule from being re-queued on the next tick if the test fails.
	if execution.ScheduleId != nil {
		defer func() {
			if err := w.scheduler.scheduleUsecase.UpdateScheduleAfterRun(*execution.ScheduleId); err != nil {
				log.Printf("Worker %d: Error updating schedule after run: %v\n", w.id, err)
			}
		}()
	}

	// Mark execution as running
	execution.Status = happydns.TestExecutionRunning
	if err := w.scheduler.resultUsecase.UpdateTestExecution(execution); err != nil {
		log.Printf("Worker %d: Error updating execution status: %v\n", w.id, err)
		// Best effort: the update failure is already logged above.
		_ = w.scheduler.resultUsecase.FailTestExecution(execution.Id, err.Error())
		return
	}

	// Track active execution
	w.scheduler.mu.Lock()
	w.scheduler.activeExecutions[execution.Id.String()] = &activeExecution{
		execution: execution,
		cancel:    cancel,
		startTime: time.Now(),
	}
	w.scheduler.mu.Unlock()

	defer func() {
		w.scheduler.mu.Lock()
		delete(w.scheduler.activeExecutions, execution.Id.String())
		w.scheduler.mu.Unlock()
	}()

	// Get the plugin
	plugin, err := w.scheduler.pluginUsecase.GetTestPlugin(schedule.PluginName)
	if err != nil {
		errMsg := fmt.Sprintf("plugin not found: %s - %v", schedule.PluginName, err)
		log.Printf("Worker %d: %s\n", w.id, errMsg)
		// Best effort: the lookup failure is already logged above.
		_ = w.scheduler.resultUsecase.FailTestExecution(execution.Id, errMsg)
		return
	}

	// Merge options: global defaults < user opts < domain/service opts < schedule opts
	var domainId, serviceId *happydns.Identifier
	switch schedule.TargetType {
	case happydns.TestScopeDomain:
		domainId = &schedule.TargetId
	case happydns.TestScopeService:
		serviceId = &schedule.TargetId
	}
	baseOptions, err := w.scheduler.pluginUsecase.GetTestPluginOptions(schedule.PluginName, &schedule.OwnerId, domainId, serviceId)
	if err != nil {
		// Not fatal: the test still runs with whatever options are available.
		log.Printf("Worker %d: warning, could not fetch plugin options for %s: %v\n", w.id, schedule.PluginName, err)
	}
	var mergedOptions happydns.PluginOptions
	if baseOptions != nil {
		mergedOptions = w.scheduler.scheduleUsecase.MergePluginOptions(nil, nil, *baseOptions, schedule.Options)
	} else {
		mergedOptions = schedule.Options
	}

	// Prepare metadata
	meta := make(map[string]string)
	meta["target_type"] = schedule.TargetType.String()
	meta["target_id"] = schedule.TargetId.String()

	// Run the test. Both channels are buffered so the plugin goroutine can
	// always deliver its outcome and exit, even if nobody is waiting anymore.
	startTime := time.Now()
	resultChan := make(chan *happydns.PluginResult, 1)
	errorChan := make(chan error, 1)

	go func() {
		defer func() {
			// A panicking plugin must not take the whole process down.
			if r := recover(); r != nil {
				errorChan <- fmt.Errorf("plugin panicked: %v", r)
			}
		}()
		result, err := plugin.RunTest(mergedOptions, meta)
		if err != nil {
			errorChan <- err
		} else {
			resultChan <- result
		}
	}()

	// Wait for result, timeout or cancellation
	var pluginResult *happydns.PluginResult
	var testErr error

	select {
	case pluginResult = <-resultChan:
		// Test completed successfully
	case testErr = <-errorChan:
		// Test returned an error
	case <-ctx.Done():
		// The deferred cancel has not run yet at this point, so a Canceled
		// error can only come from an external cancellation (scheduler
		// shutdown); anything else is the deadline expiring.
		if ctx.Err() == context.Canceled {
			testErr = fmt.Errorf("test execution cancelled")
		} else {
			testErr = fmt.Errorf("test execution timeout after %v", TestExecutionTimeout)
		}
	}

	duration := time.Since(startTime)

	// Store the result
	result := &happydns.TestResult{
		PluginName:    schedule.PluginName,
		TestType:      schedule.TargetType,
		TargetId:      schedule.TargetId,
		OwnerId:       schedule.OwnerId,
		ExecutedAt:    time.Now(),
		ScheduledTest: execution.ScheduleId != nil,
		Options:       schedule.Options,
		Duration:      duration,
	}

	if testErr != nil {
		result.Status = happydns.PluginResultStatusKO
		result.StatusLine = "Test execution failed"
		result.Error = testErr.Error()
	} else if pluginResult != nil {
		result.Status = pluginResult.Status
		result.StatusLine = pluginResult.StatusLine
		result.Report = pluginResult.Report
	} else {
		// The plugin returned neither a result nor an error.
		result.Status = happydns.PluginResultStatusKO
		result.StatusLine = "Unknown error"
		result.Error = "No result or error returned from plugin"
	}

	// Save the result
	if err := w.scheduler.resultUsecase.CreateTestResult(result); err != nil {
		log.Printf("Worker %d: Error saving test result: %v\n", w.id, err)
		// Best effort: the save failure is already logged above.
		_ = w.scheduler.resultUsecase.FailTestExecution(execution.Id, err.Error())
		return
	}

	// Complete the execution
	if err := w.scheduler.resultUsecase.CompleteTestExecution(execution.Id, result.Id); err != nil {
		log.Printf("Worker %d: Error completing execution: %v\n", w.id, err)
		return
	}

	log.Printf("Worker %d: Completed test %s for target %s (status: %d, duration: %v)\n",
		w.id, schedule.PluginName, schedule.TargetId, result.Status, duration)
}