checker-sdk-go/checker/server.go
Pierre-Olivier Mercier 7567271536 checker: cross-checker observation composition via ReportContext
Add the plumbing that lets a checker receive (at evaluation, report
rendering, and metrics extraction) observations produced by other
checkers on DiscoveryEntry records it originally published.

Surface changes:

  - RelatedObservation struct: one downstream observation, tagged with
    the producing CheckerID and the Ref matching the DiscoveryEntry
    it covers.

  - ObservationGetter gains GetRelated(ctx, key), so rules can opt in
    to cross-checker composition. mapObservationGetter (remote
    /evaluate path) returns empty; the host owns lineage resolution.

  - ReportContext interface: Data() + Related(key). Reporters consume
    it instead of a raw json.RawMessage, which collapses the former
    legacy/Ctx duplicate and gives one uniform signature:

        GetHTMLReport(ctx ReportContext) (string, error)
        ExtractMetrics(ctx ReportContext, t time.Time) ([]CheckMetric, error)

  - NewReportContext(data, related) and StaticReportContext(data) build
    fixed-payload contexts for entry points without an ObservationContext.

  - ExternalReportRequest gains a Related map so the host can ship
    pre-composed lineage to a remote checker over /report. The SDK's
    /report handler threads it through to the reporter via
    NewReportContext, closing the wire gap that previously forced
    remote reports to a StaticReportContext with no related data.

Tests cover the Related map round-trip end-to-end via a peeking provider.
2026-04-22 16:50:59 +07:00

377 lines
12 KiB
Go

// Copyright 2020-2026 The happyDomain Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package checker
import (
"context"
"encoding/json"
"fmt"
"io"
"log"
"math"
"net/http"
"runtime"
"strings"
"sync"
"sync/atomic"
"time"
)
// maxRequestBodySize caps how many bytes of an incoming request body the
// handlers will read: 1 MiB.
const maxRequestBodySize = 1024 * 1024
// loadSampleInterval is the period of the background sampler that refreshes
// the exponentially weighted moving averages exposed through
// HealthResponse.LoadAvg. 5 seconds matches the Unix kernel's loadavg cadence.
const loadSampleInterval time.Duration = 5 * time.Second

// Smoothing factors for the 1, 5, and 15-minute EWMA windows when sampled
// every loadSampleInterval.
var (
	loadAlpha1  = ewmaAlpha(1 * time.Minute)
	loadAlpha5  = ewmaAlpha(5 * time.Minute)
	loadAlpha15 = ewmaAlpha(15 * time.Minute)
)

// ewmaAlpha returns 1 - exp(-interval/window) for the given window, the
// smoothing factor for which a constant InFlight of N makes the average
// converge to N.
func ewmaAlpha(window time.Duration) float64 {
	return 1 - math.Exp(-float64(loadSampleInterval)/float64(window))
}
// updateLoadAvg returns the three EWMAs (1, 5, 15-minute) moved one tick
// towards sample, the current InFlight reading. It touches no shared state,
// which keeps the sampler easy to test.
func updateLoadAvg(prev [3]float64, sample float64) [3]float64 {
	alphas := [3]float64{loadAlpha1, loadAlpha5, loadAlpha15}

	next := prev
	for i, alpha := range alphas {
		next[i] = prev[i] + alpha*(sample-prev[i])
	}
	return next
}
// Server is a generic HTTP server for external checkers.
// It always exposes /health and /collect. If the provider implements
// CheckerDefinitionProvider, it also exposes /definition and /evaluate.
// If the provider implements CheckerHTMLReporter or CheckerMetricsReporter,
// it also exposes /report.
//
// Security: Server does not perform any authentication or authorization.
// It is intended to be run behind a reverse proxy or in a trusted network
// where access control is handled externally (e.g. by the happyDomain server).
type Server struct {
// provider is the checker implementation the handlers delegate to; the
// optional endpoints are enabled by type-asserting it in NewServer.
provider ObservationProvider
// definition is set by NewServer only when the provider implements
// CheckerDefinitionProvider; it stays nil otherwise.
definition *CheckerDefinition
// mux holds the routes registered by NewServer.
mux *http.ServeMux
// startTime is captured in NewServer and used to compute uptime.
startTime time.Time
// inFlight counts work requests (/collect, /evaluate, /report) currently
// being processed. /health and /definition are not tracked.
inFlight atomic.Int64
// totalRequests is the cumulative number of work requests served.
totalRequests atomic.Uint64
// loadBits stores the 1, 5, 15-minute EWMAs of inFlight as float64 bit
// patterns (math.Float64bits) so reads and writes are tear-free and
// lock-free across the sampler goroutine and the /health handler.
loadBits [3]atomic.Uint64
// cancelSampler stops the background load-average sampler.
cancelSampler context.CancelFunc
// samplerDone is closed when the sampler goroutine returns.
samplerDone chan struct{}
// closeOnce guarantees Close is idempotent.
closeOnce sync.Once
}
// NewServer builds a checker HTTP server around provider.
//
// /health and /collect are always registered. /definition and /evaluate are
// added when provider implements CheckerDefinitionProvider, and /report when
// it implements CheckerHTMLReporter or CheckerMetricsReporter.
//
// NewServer also launches the background goroutine that samples the
// in-flight request count every loadSampleInterval to feed the load averages
// reported on /health. Call Close to stop it.
func NewServer(provider ObservationProvider) *Server {
	samplerCtx, stopSampler := context.WithCancel(context.Background())

	srv := &Server{
		provider:      provider,
		startTime:     time.Now(),
		cancelSampler: stopSampler,
		samplerDone:   make(chan struct{}),
	}
	srv.mux = http.NewServeMux()

	// work wraps a handler with the in-flight/total accounting applied to
	// every work endpoint.
	work := func(h http.HandlerFunc) http.Handler { return srv.trackWork(h) }

	srv.mux.HandleFunc("GET /health", srv.handleHealth)
	srv.mux.Handle("POST /collect", work(srv.handleCollect))

	if definer, ok := provider.(CheckerDefinitionProvider); ok {
		srv.definition = definer.Definition()
		srv.definition.BuildRulesInfo()
		srv.mux.HandleFunc("GET /definition", srv.handleDefinition)
		srv.mux.Handle("POST /evaluate", work(srv.handleEvaluate))
	}

	// A single /report route serves both HTML and metrics reporters; the
	// handler picks the flavor from the Accept header.
	_, rendersHTML := provider.(CheckerHTMLReporter)
	_, exportsMetrics := provider.(CheckerMetricsReporter)
	if rendersHTML || exportsMetrics {
		srv.mux.Handle("POST /report", work(srv.handleReport))
	}

	go srv.runSampler(samplerCtx)
	return srv
}
// Handler exposes the server's routes, wrapped in the request logger, as an
// http.Handler so callers can mount it in their own http.Server or stack
// extra middleware on top.
func (s *Server) Handler() http.Handler {
	logged := requestLogger(s.mux)
	return logged
}
// ListenAndServe starts the HTTP server on the given address and blocks until
// the underlying listener fails; the returned error is always non-nil.
//
// The time a client may take to send its request headers is bounded so that
// stalled connections cannot be held open indefinitely. Body reads and
// handler execution are deliberately left unbounded so slow work requests are
// not cut short.
//
// ListenAndServe does not stop the background load-average sampler on return;
// call Close to stop it. This is not required for process-scoped usage but is
// recommended for tests and embedded lifecycles.
func (s *Server) ListenAndServe(addr string) error {
	// Only the header phase is limited: http.ListenAndServe would run with
	// no timeouts at all, letting a client that never finishes its headers
	// pin a connection forever.
	const readHeaderTimeout = 10 * time.Second

	srv := &http.Server{
		Addr:              addr,
		Handler:           s.Handler(),
		ReadHeaderTimeout: readHeaderTimeout,
	}

	log.Printf("checker listening on %s", addr)
	return srv.ListenAndServe()
}
// Close stops the background load-average sampler and waits for its goroutine
// to exit. Only the first call does any work; later calls return immediately.
// Close does not shut down any underlying http.Server — callers own that
// lifecycle. The returned error is always nil.
func (s *Server) Close() error {
	stop := func() {
		s.cancelSampler()
		// Block until runSampler has closed samplerDone on its way out.
		<-s.samplerDone
	}
	s.closeOnce.Do(stop)
	return nil
}
// trackWork decorates next with request accounting: the in-flight gauge is
// raised for the duration of the call and the total counter is bumped once.
// Only the "work" endpoints (/collect, /evaluate, /report) are wrapped, so
// /health polling traffic does not pollute the load signal.
func (s *Server) trackWork(next http.Handler) http.Handler {
	tracked := func(w http.ResponseWriter, r *http.Request) {
		s.inFlight.Add(1)
		s.totalRequests.Add(1)
		// Deferred so the gauge drops even if the handler panics.
		defer func() { s.inFlight.Add(-1) }()

		next.ServeHTTP(w, r)
	}
	return http.HandlerFunc(tracked)
}
// runSampler refreshes the load-average EWMAs once per loadSampleInterval
// and returns when ctx is canceled, closing s.samplerDone on the way out.
func (s *Server) runSampler(ctx context.Context) {
	defer close(s.samplerDone)

	tick := time.NewTicker(loadSampleInterval)
	defer tick.Stop()

	// sampleOnce folds the current in-flight count into the stored EWMAs.
	sampleOnce := func() {
		var current [3]float64
		for i := range current {
			current[i] = math.Float64frombits(s.loadBits[i].Load())
		}

		updated := updateLoadAvg(current, float64(s.inFlight.Load()))
		for i, avg := range updated {
			s.loadBits[i].Store(math.Float64bits(avg))
		}
	}

	for {
		select {
		case <-tick.C:
			sampleOnce()
		case <-ctx.Done():
			return
		}
	}
}
// statusRecorder wraps an http.ResponseWriter and remembers the status code
// of the response so requestLogger can report it.
//
// It records the status that is actually sent: the first final status wins,
// and a body write without a prior WriteHeader counts as an implicit 200.
// Later WriteHeader calls are still forwarded to the wrapped writer but no
// longer change the recorded value.
type statusRecorder struct {
	http.ResponseWriter
	// status is the recorded status code. requestLogger seeds it with
	// http.StatusOK for handlers that never write anything.
	status int
	// wroteHeader reports whether a final (non-informational) status has
	// been committed, explicitly or through the first Write.
	wroteHeader bool
}

// WriteHeader records code unless a final status was already committed, then
// forwards the call to the wrapped ResponseWriter.
func (r *statusRecorder) WriteHeader(code int) {
	if !r.wroteHeader {
		r.status = code
		// 1xx responses other than 101 are informational and are followed
		// by the real status, so they must not lock the recorded value.
		informational := code >= 100 && code <= 199 && code != http.StatusSwitchingProtocols
		r.wroteHeader = !informational
	}
	r.ResponseWriter.WriteHeader(code)
}

// Write commits an implicit 200 when no final status has been written yet,
// then forwards the bytes to the wrapped ResponseWriter.
func (r *statusRecorder) Write(p []byte) (int, error) {
	if !r.wroteHeader {
		r.status = http.StatusOK
		r.wroteHeader = true
	}
	return r.ResponseWriter.Write(p)
}
// requestLogger wraps next so that every request is logged after it completes
// with its method, URL path, response status, and elapsed time.
func requestLogger(next http.Handler) http.Handler {
	logged := func(w http.ResponseWriter, r *http.Request) {
		began := time.Now()
		// Default to 200 for handlers that never call WriteHeader.
		recorder := statusRecorder{ResponseWriter: w, status: http.StatusOK}

		next.ServeHTTP(&recorder, r)

		log.Printf("%s %s %d %s", r.Method, r.URL.Path, recorder.status, time.Since(began))
	}
	return http.HandlerFunc(logged)
}
// handleHealth serves GET /health: a JSON HealthResponse with a fixed "ok"
// status, uptime in seconds, CPU count, the current in-flight and total
// work-request counters, and the three load averages.
func (s *Server) handleHealth(w http.ResponseWriter, _ *http.Request) {
	// Decode the EWMAs from their atomically stored bit patterns.
	var averages [3]float64
	for i := range s.loadBits {
		averages[i] = math.Float64frombits(s.loadBits[i].Load())
	}

	health := HealthResponse{
		Status:        "ok",
		Uptime:        time.Since(s.startTime).Seconds(),
		NumCPU:        runtime.NumCPU(),
		InFlight:      s.inFlight.Load(),
		TotalRequests: s.totalRequests.Load(),
		LoadAvg:       averages,
	}
	writeJSON(w, http.StatusOK, health)
}
// handleDefinition serves GET /definition by writing the checker definition
// captured in NewServer as JSON.
func (s *Server) handleDefinition(w http.ResponseWriter, _ *http.Request) {
	def := s.definition
	writeJSON(w, http.StatusOK, def)
}
// handleCollect serves POST /collect. It decodes an ExternalCollectRequest
// (reading at most maxRequestBodySize bytes), runs the provider's Collect
// with the request options, and answers with the marshaled result. When the
// provider is also a DiscoveryPublisher, the discovery entries derived from
// the collected value are attached to the response.
//
// A malformed body yields 400; a Collect or marshal failure yields 500. A
// DiscoverEntries failure is only logged and the response is sent without
// entries.
func (s *Server) handleCollect(w http.ResponseWriter, r *http.Request) {
	// fail sends an error-only response with the given status.
	fail := func(status int, msg string) {
		writeJSON(w, status, ExternalCollectResponse{Error: msg})
	}

	var req ExternalCollectRequest
	body := io.LimitReader(r.Body, maxRequestBodySize)
	if err := json.NewDecoder(body).Decode(&req); err != nil {
		fail(http.StatusBadRequest, fmt.Sprintf("invalid request body: %v", err))
		return
	}

	collected, err := s.provider.Collect(r.Context(), req.Options)
	if err != nil {
		fail(http.StatusInternalServerError, err.Error())
		return
	}

	encoded, err := json.Marshal(collected)
	if err != nil {
		fail(http.StatusInternalServerError, fmt.Sprintf("failed to marshal result: %v", err))
		return
	}
	resp := ExternalCollectResponse{Data: json.RawMessage(encoded)}

	// Discovery works on the native Go value that was just marshaled, so
	// the payload never has to be parsed back.
	if publisher, ok := s.provider.(DiscoveryPublisher); ok {
		entries, discErr := publisher.DiscoverEntries(collected)
		if discErr == nil {
			resp.Entries = entries
		} else {
			log.Printf("DiscoverEntries failed: %v", discErr)
		}
	}

	writeJSON(w, http.StatusOK, resp)
}
// handleEvaluate serves POST /evaluate. It decodes an ExternalEvaluateRequest
// (reading at most maxRequestBodySize bytes), evaluates every rule of the
// checker definition against the observations shipped in the request, and
// returns the resulting states in rule order.
//
// A rule is skipped only when EnabledRules lists it explicitly as false;
// rules missing from the map are evaluated. A state that comes back without
// a Code gets the rule's name as its Code.
func (s *Server) handleEvaluate(w http.ResponseWriter, r *http.Request) {
	var req ExternalEvaluateRequest
	body := io.LimitReader(r.Body, maxRequestBodySize)
	if err := json.NewDecoder(body).Decode(&req); err != nil {
		writeJSON(w, http.StatusBadRequest, ExternalEvaluateResponse{
			Error: fmt.Sprintf("invalid request body: %v", err),
		})
		return
	}

	getter := &mapObservationGetter{data: req.Observations}
	hasFilter := len(req.EnabledRules) > 0

	var results []CheckState
	for _, rule := range s.definition.Rules {
		if hasFilter {
			enabled, listed := req.EnabledRules[rule.Name()]
			if listed && !enabled {
				continue
			}
		}

		result := rule.Evaluate(r.Context(), getter, req.Options)
		if result.Code == "" {
			result.Code = rule.Name()
		}
		results = append(results, result)
	}

	writeJSON(w, http.StatusOK, ExternalEvaluateResponse{States: results})
}
// handleReport serves POST /report. It decodes an ExternalReportRequest
// (reading at most maxRequestBodySize bytes) and builds the report context
// from its Data and Related fields.
//
// When the Accept header contains "text/html" the provider's HTML report is
// returned; otherwise the provider's metrics are returned as JSON. If the
// provider lacks the reporter for the requested flavor the answer is 501.
// A malformed body yields 400 and a reporter failure yields 500.
func (s *Server) handleReport(w http.ResponseWriter, r *http.Request) {
	var req ExternalReportRequest
	body := io.LimitReader(r.Body, maxRequestBodySize)
	if err := json.NewDecoder(body).Decode(&req); err != nil {
		writeJSON(w, http.StatusBadRequest, map[string]string{
			"error": fmt.Sprintf("invalid request body: %v", err),
		})
		return
	}

	if wantsHTML := strings.Contains(r.Header.Get("Accept"), "text/html"); !wantsHTML {
		// Default flavor: JSON metrics.
		exporter, ok := s.provider.(CheckerMetricsReporter)
		if !ok {
			http.Error(w, "this checker does not support metrics reports", http.StatusNotImplemented)
			return
		}

		metrics, err := exporter.ExtractMetrics(NewReportContext(req.Data, req.Related), time.Now())
		if err != nil {
			writeJSON(w, http.StatusInternalServerError, map[string]string{
				"error": fmt.Sprintf("failed to extract metrics: %v", err),
			})
			return
		}

		writeJSON(w, http.StatusOK, metrics)
		return
	}

	renderer, ok := s.provider.(CheckerHTMLReporter)
	if !ok {
		http.Error(w, "this checker does not support HTML reports", http.StatusNotImplemented)
		return
	}

	page, err := renderer.GetHTMLReport(NewReportContext(req.Data, req.Related))
	if err != nil {
		http.Error(w, fmt.Sprintf("failed to generate HTML report: %v", err), http.StatusInternalServerError)
		return
	}

	w.Header().Set("Content-Type", "text/html; charset=utf-8")
	// The write result is intentionally not inspected, as before.
	_, _ = w.Write([]byte(page))
}
// mapObservationGetter implements ObservationGetter backed by a static map.
// handleEvaluate builds one per request from the observations carried in the
// request body.
type mapObservationGetter struct {
// data holds the raw JSON payload of each available observation, by key.
data map[ObservationKey]json.RawMessage
}
// Get decodes the observation stored under key into dest. It returns an
// error when the key is absent from the map, or the json.Unmarshal error when
// the stored payload does not fit dest. The context is not used.
func (g *mapObservationGetter) Get(_ context.Context, key ObservationKey, dest any) error {
	if payload, found := g.data[key]; found {
		return json.Unmarshal(payload, dest)
	}
	return fmt.Errorf("observation %q not available", key)
}
// GetRelated reports no related observations and no error, whatever the key.
// On the remote /evaluate path the host does not (currently) ship
// cross-checker related data in ExternalEvaluateRequest, so there is nothing
// to look up. Consumers that need related observations must run evaluation
// locally with a host-side ObservationContext that resolves lineage.
func (g *mapObservationGetter) GetRelated(_ context.Context, _ ObservationKey) ([]RelatedObservation, error) {
	var none []RelatedObservation
	return none, nil
}
// writeJSON sends v as a JSON response with the given status code and an
// "application/json" content type. The body is the JSON encoding of v
// followed by a newline.
//
// v is encoded before anything is written. If it cannot be encoded (for
// example a float field holding NaN), the failure is logged and the client
// receives a 500 with a JSON {"error": ...} body instead of the requested
// status with an empty body. A failure to write the body is logged; nothing
// more can be done once the header is out.
func writeJSON(w http.ResponseWriter, status int, v any) {
	body, err := json.Marshal(v)
	if err != nil {
		log.Printf("writeJSON: encoding %T response: %v", v, err)
		status = http.StatusInternalServerError
		// Marshaling a map of plain strings cannot fail, so the error is
		// deliberately ignored.
		body, _ = json.Marshal(map[string]string{
			"error": "failed to encode response: " + err.Error(),
		})
	}

	w.Header().Set("Content-Type", "application/json")
	w.WriteHeader(status)
	// The trailing newline keeps the wire format of json.Encoder.Encode.
	if _, err := w.Write(append(body, '\n')); err != nil {
		log.Printf("writeJSON: writing response: %v", err)
	}
}