server: expose runtime metrics on /health for scheduler routing

Adds HealthResponse carrying inflight count, total requests, 1/5/15-min
EWMA load averages, uptime, and NumCPU so a scheduler can pick the least
busy worker. A background sampler updates the load averages every 5s,
stopped by a new idempotent Close method. Work endpoints (/collect,
/evaluate, /report) are wrapped with a trackWork middleware; /health
and /definition are excluded so polling traffic does not pollute the
signal.
This commit is contained in:
nemunaire 2026-04-16 16:17:43 +07:00
commit 6b96ee8c2f
3 changed files with 300 additions and 11 deletions

View file

@ -20,14 +20,42 @@ import (
"fmt"
"io"
"log"
"math"
"net/http"
"runtime"
"strings"
"sync"
"sync/atomic"
"time"
)
// maxRequestBodySize is the maximum allowed size for incoming request bodies:
// 1 << 20 bytes, i.e. 1 MiB.
const maxRequestBodySize = 1 << 20
// loadSampleInterval is the period of the background sampler that refreshes
// the exponentially weighted moving averages reported in
// HealthResponse.LoadAvg. Five seconds is the cadence Unix-like kernels
// traditionally use for their own load averages.
const loadSampleInterval = 5 * time.Second

// ewmaAlpha returns the per-tick smoothing factor for an EWMA whose time
// constant is window when it is advanced once every loadSampleInterval:
// alpha = 1 - exp(-interval/window).
func ewmaAlpha(window time.Duration) float64 {
	return 1 - math.Exp(-float64(loadSampleInterval)/float64(window))
}

// Smoothing factors for the 1, 5, and 15-minute load averages. With these
// factors a constant InFlight of N drives every average towards N, and the
// window length determines how quickly each one gets there.
var (
	loadAlpha1  = ewmaAlpha(1 * time.Minute)
	loadAlpha5  = ewmaAlpha(5 * time.Minute)
	loadAlpha15 = ewmaAlpha(15 * time.Minute)
)

// updateLoadAvg returns the three EWMAs (1, 5, 15 minutes, in that order)
// advanced by one sampler tick towards sample, the current InFlight value.
// It has no side effects: prev is received by value and a new array is
// returned, which keeps the sampler logic easy to test on its own.
func updateLoadAvg(prev [3]float64, sample float64) [3]float64 {
	alphas := [3]float64{loadAlpha1, loadAlpha5, loadAlpha15}
	next := prev
	for i, alpha := range alphas {
		next[i] += alpha * (sample - prev[i])
	}
	return next
}
// Server is a generic HTTP server for external checkers.
// It always exposes /health and /collect. If the provider implements
// CheckerDefinitionProvider, it also exposes /definition and /evaluate.
@ -41,29 +69,65 @@ type Server struct {
provider ObservationProvider
definition *CheckerDefinition
mux *http.ServeMux
// startTime is captured in NewServer and used to compute uptime.
startTime time.Time
// inFlight counts work requests (/collect, /evaluate, /report) currently
// being processed. /health and /definition are not tracked.
inFlight atomic.Int64
// totalRequests is the cumulative number of work requests served.
totalRequests atomic.Uint64
// loadBits stores the 1, 5, 15-minute EWMAs of inFlight as float64 bit
// patterns (math.Float64bits) so reads and writes are tear-free and
// lock-free across the sampler goroutine and the /health handler.
loadBits [3]atomic.Uint64
// cancelSampler stops the background load-average sampler.
cancelSampler context.CancelFunc
// samplerDone is closed when the sampler goroutine returns.
samplerDone chan struct{}
// closeOnce guarantees Close is idempotent.
closeOnce sync.Once
}
// NewServer creates a new checker HTTP server backed by the given provider.
// GET /health and POST /collect are always registered; the remaining
// endpoints depend on the optional interfaces the provider implements:
//
//   - CheckerDefinitionProvider adds GET /definition and POST /evaluate.
//   - CheckerHTMLReporter or CheckerMetricsReporter adds POST /report.
//
// The work endpoints (/collect, /evaluate, /report) are wrapped with
// trackWork so they feed the in-flight and total-request counters; /health
// and /definition are registered bare. Every pattern is registered exactly
// once, because http.ServeMux panics on a conflicting registration.
//
// NewServer also starts a background goroutine that samples the in-flight
// request count every loadSampleInterval to compute the load averages
// reported on /health. Call Close to stop it.
func NewServer(provider ObservationProvider) *Server {
	ctx, cancel := context.WithCancel(context.Background())
	s := &Server{
		provider:      provider,
		mux:           http.NewServeMux(),
		startTime:     time.Now(),
		cancelSampler: cancel,
		samplerDone:   make(chan struct{}),
	}

	s.mux.HandleFunc("GET /health", s.handleHealth)
	s.mux.Handle("POST /collect", s.trackWork(http.HandlerFunc(s.handleCollect)))

	if dp, ok := provider.(CheckerDefinitionProvider); ok {
		s.definition = dp.Definition()
		s.definition.BuildRulesInfo()
		s.mux.HandleFunc("GET /definition", s.handleDefinition)
		s.mux.Handle("POST /evaluate", s.trackWork(http.HandlerFunc(s.handleEvaluate)))
	}

	// Either reporter interface is enough to expose /report; the handler is
	// the same in both cases, so register it once.
	_, isHTMLReporter := provider.(CheckerHTMLReporter)
	_, isMetricsReporter := provider.(CheckerMetricsReporter)
	if isHTMLReporter || isMetricsReporter {
		s.mux.Handle("POST /report", s.trackWork(http.HandlerFunc(s.handleReport)))
	}

	// Start the sampler last, once the server is fully wired.
	go s.runSampler(ctx)
	return s
}
@ -74,11 +138,61 @@ func (s *Server) Handler() http.Handler {
}
// ListenAndServe starts the HTTP server on the given address and blocks
// until the listener fails, returning the resulting error. Every request is
// passed through requestLogger before reaching the mux.
//
// The server bounds the time a client may take to send its request headers,
// so a peer that opens a connection and never completes a request cannot hold
// it open indefinitely. No body-read or write timeout is set, so long-running
// work requests are not cut short.
//
// ListenAndServe does not stop the background load-average sampler on return;
// call Close to stop it. This is not required for process-scoped usage but is
// recommended for tests and embedded lifecycles.
func (s *Server) ListenAndServe(addr string) error {
	// readHeaderTimeout is the maximum time allowed to read request headers.
	const readHeaderTimeout = 10 * time.Second

	log.Printf("checker listening on %s", addr)
	srv := &http.Server{
		Addr:              addr,
		Handler:           requestLogger(s.mux),
		ReadHeaderTimeout: readHeaderTimeout,
	}
	return srv.ListenAndServe()
}
// Close shuts down the background load-average sampler: it cancels the
// sampler's context and then blocks until the goroutine has signalled its
// exit through samplerDone. The work runs under closeOnce, so repeated calls
// are harmless no-ops. Close always returns nil, and it does not touch any
// underlying http.Server — callers own that lifecycle.
func (s *Server) Close() error {
	stopAndWait := func() {
		s.cancelSampler()
		<-s.samplerDone
	}
	s.closeOnce.Do(stopAndWait)
	return nil
}
// trackWork returns a handler that runs next while accounting for it in the
// server's counters: inFlight is incremented for the duration of the call and
// totalRequests is bumped once per request. It is meant for the "work"
// endpoints (/collect, /evaluate, /report) only, so that /health polling
// traffic does not pollute the load signal.
func (s *Server) trackWork(next http.Handler) http.Handler {
	tracked := func(w http.ResponseWriter, r *http.Request) {
		s.inFlight.Add(1)
		s.totalRequests.Add(1)
		// Deferred so the gauge is released even if next panics.
		defer s.inFlight.Add(-1)

		next.ServeHTTP(w, r)
	}
	return http.HandlerFunc(tracked)
}
// runSampler is the body of the background sampler goroutine. On every
// loadSampleInterval tick it reads the stored EWMAs, advances them with the
// current inFlight count via updateLoadAvg, and stores the result back. It
// returns when ctx is canceled and closes s.samplerDone on the way out so
// that waiters can observe the exit.
func (s *Server) runSampler(ctx context.Context) {
	defer close(s.samplerDone)

	tick := time.NewTicker(loadSampleInterval)
	defer tick.Stop()

	// advance performs one sampling step. The averages are kept as float64
	// bit patterns in atomic words, hence the Float64frombits/Float64bits
	// round trip.
	advance := func() {
		var avgs [3]float64
		for i := range avgs {
			avgs[i] = math.Float64frombits(s.loadBits[i].Load())
		}
		avgs = updateLoadAvg(avgs, float64(s.inFlight.Load()))
		for i, v := range avgs {
			s.loadBits[i].Store(math.Float64bits(v))
		}
	}

	for {
		select {
		case <-tick.C:
			advance()
		case <-ctx.Done():
			return
		}
	}
}
type statusRecorder struct {
http.ResponseWriter
status int
@ -99,8 +213,18 @@ func requestLogger(next http.Handler) http.Handler {
}
// handleHealth serves GET /health with a single HealthResponse document
// describing the server's current load: uptime in seconds since startTime,
// the CPU count, the in-flight and cumulative work-request counters, and the
// 1/5/15-minute load averages maintained by the background sampler.
//
// The response is written exactly once, through writeJSON; emitting a second
// JSON document on the same response would make the body unparseable for
// clients.
func (s *Server) handleHealth(w http.ResponseWriter, r *http.Request) {
	// The averages are stored as float64 bit patterns in atomic words; decode
	// each one back into a float64 for the response.
	var load [3]float64
	for i := range load {
		load[i] = math.Float64frombits(s.loadBits[i].Load())
	}

	writeJSON(w, http.StatusOK, HealthResponse{
		Status:        "ok",
		Uptime:        time.Since(s.startTime).Seconds(),
		NumCPU:        runtime.NumCPU(),
		InFlight:      s.inFlight.Load(),
		TotalRequests: s.totalRequests.Load(),
		LoadAvg:       load,
	})
}
func (s *Server) handleDefinition(w http.ResponseWriter, r *http.Request) {