Instrument check scheduler with Prometheus metrics

Track the queue depth (updated on every enqueue and pop), the number of
active workers, the check execution duration per checker, and the number of
check results per checker and status.
This commit is contained in:
nemunaire 2026-03-02 00:12:28 +07:00
commit 94d9d03b9d

View file

@ -30,6 +30,7 @@ import (
"sync" "sync"
"time" "time"
"git.happydns.org/happyDomain/internal/metrics"
"git.happydns.org/happyDomain/internal/storage" "git.happydns.org/happyDomain/internal/storage"
"git.happydns.org/happyDomain/model" "git.happydns.org/happyDomain/model"
) )
@ -192,6 +193,7 @@ func newCheckScheduler(
// enqueue pushes an item to the priority queue and wakes one idle worker. // enqueue pushes an item to the priority queue and wakes one idle worker.
func (s *checkScheduler) enqueue(item *queueItem) { func (s *checkScheduler) enqueue(item *queueItem) {
s.queue.Push(item) s.queue.Push(item)
metrics.SchedulerQueueDepth.Set(float64(s.queue.Len()))
select { select {
case s.workAvail <- struct{}{}: case s.workAvail <- struct{}{}:
default: default:
@ -470,6 +472,7 @@ func (w *worker) run(wg *sync.WaitGroup) {
for { for {
// Drain: try to grab work before blocking. // Drain: try to grab work before blocking.
if item := w.scheduler.queue.Pop(); item != nil { if item := w.scheduler.queue.Pop(); item != nil {
metrics.SchedulerQueueDepth.Set(float64(w.scheduler.queue.Len()))
w.executeCheck(item) w.executeCheck(item)
continue continue
} }
@ -493,6 +496,13 @@ func (w *worker) executeCheck(item *queueItem) {
execution := item.execution execution := item.execution
schedule := item.schedule schedule := item.schedule
metrics.SchedulerActiveWorkers.Inc()
checkStart := time.Now()
defer func() {
metrics.SchedulerActiveWorkers.Dec()
metrics.SchedulerCheckDuration.WithLabelValues(schedule.CheckerName).Observe(time.Since(checkStart).Seconds())
}()
// Always update schedule NextRun after execution, whether it succeeds or fails. // Always update schedule NextRun after execution, whether it succeeds or fails.
// This prevents the schedule from being re-queued on the next tick if the test fails. // This prevents the schedule from being re-queued on the next tick if the test fails.
if execution.ScheduleId != nil { if execution.ScheduleId != nil {
@ -594,6 +604,9 @@ func (w *worker) executeCheck(item *queueItem) {
} }
} }
// Record check status metric
metrics.SchedulerChecksTotal.WithLabelValues(schedule.CheckerName, result.Status.String()).Inc()
// Save the result // Save the result
if err := w.scheduler.resultUsecase.CreateCheckResult(result); err != nil { if err := w.scheduler.resultUsecase.CreateCheckResult(result); err != nil {
log.Printf("Worker %d: Error saving test result: %v\n", w.id, err) log.Printf("Worker %d: Error saving test result: %v\n", w.id, err)