Add the plumbing that lets a checker receive (at evaluation, report
rendering, and metrics extraction) observations produced by other
checkers on DiscoveryEntry records it originally published.
Surface changes:
- RelatedObservation struct: one downstream observation, tagged with
the producing CheckerID and the Ref matching the DiscoveryEntry
it covers.
- ObservationGetter gains GetRelated(ctx, key), so rules can opt in
to cross-checker composition. mapObservationGetter (remote
/evaluate path) returns empty; the host owns lineage resolution.
- ReportContext interface: Data() + Related(key). Reporters consume
it instead of a raw json.RawMessage, which collapses the former
legacy/Ctx duplicate and gives one uniform signature:
GetHTMLReport(ctx ReportContext) (string, error)
ExtractMetrics(ctx ReportContext, t time.Time) ([]CheckMetric, error)
- NewReportContext(data, related) and StaticReportContext(data) build
fixed-payload contexts for entry points without an ObservationContext.
- ExternalReportRequest gains a Related map so the host can ship
pre-composed lineage to a remote checker over /report. The SDK's
/report handler threads it through to the reporter via
NewReportContext, closing the wire gap that previously forced
remote reports to a StaticReportContext with no related data.
Tests cover the Related map round-trip end-to-end via a peeking provider.
377 lines
12 KiB
Go
377 lines
12 KiB
Go
// Copyright 2020-2026 The happyDomain Authors
|
|
//
|
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
// you may not use this file except in compliance with the License.
|
|
// You may obtain a copy of the License at
|
|
//
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
//
|
|
// Unless required by applicable law or agreed to in writing, software
|
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
// See the License for the specific language governing permissions and
|
|
// limitations under the License.
|
|
|
|
package checker
|
|
|
|
import (
|
|
"context"
|
|
"encoding/json"
|
|
"fmt"
|
|
"io"
|
|
"log"
|
|
"math"
|
|
"net/http"
|
|
"runtime"
|
|
"strings"
|
|
"sync"
|
|
"sync/atomic"
|
|
"time"
|
|
)
|
|
|
|
const (
	// maxRequestBodySize bounds how many bytes of an incoming request body
	// the handlers are willing to read (1 MB).
	maxRequestBodySize = 1 << 20

	// loadSampleInterval is the period of the background sampler that feeds
	// the exponentially weighted moving averages exposed through
	// HealthResponse.LoadAvg. 5 seconds matches the Unix kernel's loadavg
	// cadence.
	loadSampleInterval = 5 * time.Second
)

// Smoothing factors of the 1, 5 and 15-minute EWMAs, for samples taken every
// loadSampleInterval. Each one is 1 - exp(-interval/window), which makes the
// average converge to N when InFlight stays at a constant N.
var (
	loadAlpha1  = 1 - math.Exp(-float64(loadSampleInterval)/float64(time.Minute))
	loadAlpha5  = 1 - math.Exp(-float64(loadSampleInterval)/float64(5*time.Minute))
	loadAlpha15 = 1 - math.Exp(-float64(loadSampleInterval)/float64(15*time.Minute))
)

// updateLoadAvg returns the three EWMAs (1, 5, 15 minutes, in that order)
// after folding one InFlight sample into prev. It touches no shared state,
// which keeps the sampler trivially testable.
func updateLoadAvg(prev [3]float64, sample float64) [3]float64 {
	alphas := [3]float64{loadAlpha1, loadAlpha5, loadAlpha15}

	var next [3]float64
	for i, alpha := range alphas {
		next[i] = prev[i] + alpha*(sample-prev[i])
	}
	return next
}
|
|
|
|
// Server is a generic HTTP server for external checkers.
|
|
// It always exposes /health and /collect. If the provider implements
|
|
// CheckerDefinitionProvider, it also exposes /definition and /evaluate.
|
|
// If the provider implements CheckerHTMLReporter or CheckerMetricsReporter,
|
|
// it also exposes /report.
|
|
//
|
|
// Security: Server does not perform any authentication or authorization.
|
|
// It is intended to be run behind a reverse proxy or in a trusted network
|
|
// where access control is handled externally (e.g. by the happyDomain server).
|
|
type Server struct {
|
|
provider ObservationProvider
|
|
definition *CheckerDefinition
|
|
mux *http.ServeMux
|
|
|
|
// startTime is captured in NewServer and used to compute uptime.
|
|
startTime time.Time
|
|
|
|
// inFlight counts work requests (/collect, /evaluate, /report) currently
|
|
// being processed. /health and /definition are not tracked.
|
|
inFlight atomic.Int64
|
|
|
|
// totalRequests is the cumulative number of work requests served.
|
|
totalRequests atomic.Uint64
|
|
|
|
// loadBits stores the 1, 5, 15-minute EWMAs of inFlight as float64 bit
|
|
// patterns (math.Float64bits) so reads and writes are tear-free and
|
|
// lock-free across the sampler goroutine and the /health handler.
|
|
loadBits [3]atomic.Uint64
|
|
|
|
// cancelSampler stops the background load-average sampler.
|
|
cancelSampler context.CancelFunc
|
|
|
|
// samplerDone is closed when the sampler goroutine returns.
|
|
samplerDone chan struct{}
|
|
|
|
// closeOnce guarantees Close is idempotent.
|
|
closeOnce sync.Once
|
|
}
|
|
|
|
// NewServer creates a new checker HTTP server backed by the given provider.
|
|
// Additional endpoints are registered based on optional interfaces the provider implements.
|
|
//
|
|
// NewServer also starts a background goroutine that samples the in-flight
|
|
// request count every loadSampleInterval to compute the load averages
|
|
// reported on /health. Call Close to stop it.
|
|
func NewServer(provider ObservationProvider) *Server {
|
|
ctx, cancel := context.WithCancel(context.Background())
|
|
s := &Server{
|
|
provider: provider,
|
|
startTime: time.Now(),
|
|
cancelSampler: cancel,
|
|
samplerDone: make(chan struct{}),
|
|
}
|
|
s.mux = http.NewServeMux()
|
|
s.mux.HandleFunc("GET /health", s.handleHealth)
|
|
s.mux.Handle("POST /collect", s.trackWork(http.HandlerFunc(s.handleCollect)))
|
|
|
|
if dp, ok := provider.(CheckerDefinitionProvider); ok {
|
|
s.definition = dp.Definition()
|
|
s.definition.BuildRulesInfo()
|
|
s.mux.HandleFunc("GET /definition", s.handleDefinition)
|
|
s.mux.Handle("POST /evaluate", s.trackWork(http.HandlerFunc(s.handleEvaluate)))
|
|
}
|
|
|
|
if _, ok := provider.(CheckerHTMLReporter); ok {
|
|
s.mux.Handle("POST /report", s.trackWork(http.HandlerFunc(s.handleReport)))
|
|
} else if _, ok := provider.(CheckerMetricsReporter); ok {
|
|
s.mux.Handle("POST /report", s.trackWork(http.HandlerFunc(s.handleReport)))
|
|
}
|
|
|
|
go s.runSampler(ctx)
|
|
|
|
return s
|
|
}
|
|
|
|
// Handler returns the http.Handler for this server, allowing callers
|
|
// to embed it in a custom server or add middleware.
|
|
func (s *Server) Handler() http.Handler {
|
|
return requestLogger(s.mux)
|
|
}
|
|
|
|
// ListenAndServe starts the HTTP server on the given address.
|
|
//
|
|
// ListenAndServe does not stop the background load-average sampler on return;
|
|
// call Close to stop it. This is not required for process-scoped usage but is
|
|
// recommended for tests and embedded lifecycles.
|
|
func (s *Server) ListenAndServe(addr string) error {
|
|
log.Printf("checker listening on %s", addr)
|
|
return http.ListenAndServe(addr, requestLogger(s.mux))
|
|
}
|
|
|
|
// Close stops the background load-average sampler goroutine. It is safe to
|
|
// call multiple times; subsequent calls are no-ops. Close does not shut down
|
|
// any underlying http.Server — callers own that lifecycle.
|
|
func (s *Server) Close() error {
|
|
s.closeOnce.Do(func() {
|
|
s.cancelSampler()
|
|
<-s.samplerDone
|
|
})
|
|
return nil
|
|
}
|
|
|
|
// trackWork wraps a handler with in-flight and total-request accounting.
|
|
// It is applied only to "work" endpoints (/collect, /evaluate, /report) so
|
|
// that /health polling traffic does not pollute the load signal.
|
|
func (s *Server) trackWork(next http.Handler) http.Handler {
|
|
return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
|
s.inFlight.Add(1)
|
|
s.totalRequests.Add(1)
|
|
defer s.inFlight.Add(-1)
|
|
next.ServeHTTP(w, r)
|
|
})
|
|
}
|
|
|
|
// runSampler updates the load-average EWMAs every loadSampleInterval until
|
|
// ctx is canceled. It closes s.samplerDone on exit.
|
|
func (s *Server) runSampler(ctx context.Context) {
|
|
defer close(s.samplerDone)
|
|
ticker := time.NewTicker(loadSampleInterval)
|
|
defer ticker.Stop()
|
|
for {
|
|
select {
|
|
case <-ctx.Done():
|
|
return
|
|
case <-ticker.C:
|
|
var prev [3]float64
|
|
for i := range prev {
|
|
prev[i] = math.Float64frombits(s.loadBits[i].Load())
|
|
}
|
|
next := updateLoadAvg(prev, float64(s.inFlight.Load()))
|
|
for i := range next {
|
|
s.loadBits[i].Store(math.Float64bits(next[i]))
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// statusRecorder wraps an http.ResponseWriter and remembers the status code
// of the response so that it can be logged once the handler returns.
//
// status should be initialized to the code that is implied when the handler
// never calls WriteHeader (requestLogger uses http.StatusOK).
type statusRecorder struct {
	http.ResponseWriter
	status int

	// wroteHeader is true once the final status has been committed, either
	// by an explicit WriteHeader or implicitly by the first Write. Later
	// WriteHeader calls are ignored by net/http, so they must not change
	// the recorded status either.
	wroteHeader bool
}

// WriteHeader records code as the response status and forwards the call to
// the wrapped writer. Only the first final status is recorded: repeated calls
// are still forwarded (so the underlying writer can report them) but do not
// overwrite what was actually sent. Informational 1xx codes other than
// 101 Switching Protocols precede the final status and are not recorded.
func (r *statusRecorder) WriteHeader(code int) {
	informational := code >= 100 && code <= 199 && code != http.StatusSwitchingProtocols
	if !r.wroteHeader && !informational {
		r.status = code
		r.wroteHeader = true
	}
	r.ResponseWriter.WriteHeader(code)
}

// Write forwards b to the wrapped writer. A Write that happens before any
// WriteHeader commits the implicit status, so the recorded status is locked
// at that point (defaulting to 200 if the recorder was not initialized).
func (r *statusRecorder) Write(b []byte) (int, error) {
	if !r.wroteHeader {
		if r.status == 0 {
			r.status = http.StatusOK
		}
		r.wroteHeader = true
	}
	return r.ResponseWriter.Write(b)
}

// Unwrap returns the wrapped writer so that http.ResponseController can reach
// optional capabilities (flushing, deadlines, hijacking) through the recorder.
func (r *statusRecorder) Unwrap() http.ResponseWriter {
	return r.ResponseWriter
}
|
|
|
|
func requestLogger(next http.Handler) http.Handler {
|
|
return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
|
start := time.Now()
|
|
rec := &statusRecorder{ResponseWriter: w, status: http.StatusOK}
|
|
next.ServeHTTP(rec, r)
|
|
log.Printf("%s %s %d %s", r.Method, r.URL.Path, rec.status, time.Since(start))
|
|
})
|
|
}
|
|
|
|
func (s *Server) handleHealth(w http.ResponseWriter, r *http.Request) {
|
|
var load [3]float64
|
|
for i := range load {
|
|
load[i] = math.Float64frombits(s.loadBits[i].Load())
|
|
}
|
|
writeJSON(w, http.StatusOK, HealthResponse{
|
|
Status: "ok",
|
|
Uptime: time.Since(s.startTime).Seconds(),
|
|
NumCPU: runtime.NumCPU(),
|
|
InFlight: s.inFlight.Load(),
|
|
TotalRequests: s.totalRequests.Load(),
|
|
LoadAvg: load,
|
|
})
|
|
}
|
|
|
|
func (s *Server) handleDefinition(w http.ResponseWriter, r *http.Request) {
|
|
writeJSON(w, http.StatusOK, s.definition)
|
|
}
|
|
|
|
func (s *Server) handleCollect(w http.ResponseWriter, r *http.Request) {
|
|
var req ExternalCollectRequest
|
|
if err := json.NewDecoder(io.LimitReader(r.Body, maxRequestBodySize)).Decode(&req); err != nil {
|
|
writeJSON(w, http.StatusBadRequest, ExternalCollectResponse{
|
|
Error: fmt.Sprintf("invalid request body: %v", err),
|
|
})
|
|
return
|
|
}
|
|
|
|
data, err := s.provider.Collect(r.Context(), req.Options)
|
|
if err != nil {
|
|
writeJSON(w, http.StatusInternalServerError, ExternalCollectResponse{
|
|
Error: err.Error(),
|
|
})
|
|
return
|
|
}
|
|
|
|
raw, err := json.Marshal(data)
|
|
if err != nil {
|
|
writeJSON(w, http.StatusInternalServerError, ExternalCollectResponse{
|
|
Error: fmt.Sprintf("failed to marshal result: %v", err),
|
|
})
|
|
return
|
|
}
|
|
|
|
resp := ExternalCollectResponse{Data: json.RawMessage(raw)}
|
|
|
|
// Harvest discovery entries from the native Go value, before it goes
|
|
// out of scope. No re-parse; DiscoverEntries operates on the same
|
|
// object that was just marshaled above.
|
|
if dp, ok := s.provider.(DiscoveryPublisher); ok {
|
|
entries, derr := dp.DiscoverEntries(data)
|
|
if derr != nil {
|
|
log.Printf("DiscoverEntries failed: %v", derr)
|
|
} else {
|
|
resp.Entries = entries
|
|
}
|
|
}
|
|
|
|
writeJSON(w, http.StatusOK, resp)
|
|
}
|
|
|
|
func (s *Server) handleEvaluate(w http.ResponseWriter, r *http.Request) {
|
|
var req ExternalEvaluateRequest
|
|
if err := json.NewDecoder(io.LimitReader(r.Body, maxRequestBodySize)).Decode(&req); err != nil {
|
|
writeJSON(w, http.StatusBadRequest, ExternalEvaluateResponse{
|
|
Error: fmt.Sprintf("invalid request body: %v", err),
|
|
})
|
|
return
|
|
}
|
|
|
|
obs := &mapObservationGetter{data: req.Observations}
|
|
|
|
var states []CheckState
|
|
for _, rule := range s.definition.Rules {
|
|
if len(req.EnabledRules) > 0 {
|
|
if enabled, ok := req.EnabledRules[rule.Name()]; ok && !enabled {
|
|
continue
|
|
}
|
|
}
|
|
state := rule.Evaluate(r.Context(), obs, req.Options)
|
|
if state.Code == "" {
|
|
state.Code = rule.Name()
|
|
}
|
|
states = append(states, state)
|
|
}
|
|
|
|
writeJSON(w, http.StatusOK, ExternalEvaluateResponse{States: states})
|
|
}
|
|
|
|
func (s *Server) handleReport(w http.ResponseWriter, r *http.Request) {
|
|
var req ExternalReportRequest
|
|
if err := json.NewDecoder(io.LimitReader(r.Body, maxRequestBodySize)).Decode(&req); err != nil {
|
|
writeJSON(w, http.StatusBadRequest, map[string]string{
|
|
"error": fmt.Sprintf("invalid request body: %v", err),
|
|
})
|
|
return
|
|
}
|
|
|
|
accept := r.Header.Get("Accept")
|
|
|
|
if strings.Contains(accept, "text/html") {
|
|
reporter, ok := s.provider.(CheckerHTMLReporter)
|
|
if !ok {
|
|
http.Error(w, "this checker does not support HTML reports", http.StatusNotImplemented)
|
|
return
|
|
}
|
|
|
|
html, err := reporter.GetHTMLReport(NewReportContext(req.Data, req.Related))
|
|
if err != nil {
|
|
http.Error(w, fmt.Sprintf("failed to generate HTML report: %v", err), http.StatusInternalServerError)
|
|
return
|
|
}
|
|
|
|
w.Header().Set("Content-Type", "text/html; charset=utf-8")
|
|
w.Write([]byte(html))
|
|
return
|
|
}
|
|
|
|
// Default: JSON metrics.
|
|
reporter, ok := s.provider.(CheckerMetricsReporter)
|
|
if !ok {
|
|
http.Error(w, "this checker does not support metrics reports", http.StatusNotImplemented)
|
|
return
|
|
}
|
|
|
|
metrics, err := reporter.ExtractMetrics(NewReportContext(req.Data, req.Related), time.Now())
|
|
if err != nil {
|
|
writeJSON(w, http.StatusInternalServerError, map[string]string{
|
|
"error": fmt.Sprintf("failed to extract metrics: %v", err),
|
|
})
|
|
return
|
|
}
|
|
|
|
writeJSON(w, http.StatusOK, metrics)
|
|
}
|
|
|
|
// mapObservationGetter implements ObservationGetter backed by a static map.
|
|
type mapObservationGetter struct {
|
|
data map[ObservationKey]json.RawMessage
|
|
}
|
|
|
|
func (g *mapObservationGetter) Get(ctx context.Context, key ObservationKey, dest any) error {
|
|
raw, ok := g.data[key]
|
|
if !ok {
|
|
return fmt.Errorf("observation %q not available", key)
|
|
}
|
|
return json.Unmarshal(raw, dest)
|
|
}
|
|
|
|
// GetRelated always returns nil in the remote /evaluate path: the host that
|
|
// invokes /evaluate does not (currently) carry cross-checker related data in
|
|
// ExternalEvaluateRequest. Consumers that need related observations must run
|
|
// evaluation locally with a host-side ObservationContext that resolves
|
|
// lineage.
|
|
func (g *mapObservationGetter) GetRelated(ctx context.Context, key ObservationKey) ([]RelatedObservation, error) {
|
|
return nil, nil
|
|
}
|
|
|
|
// writeJSON sends v as a JSON response with the given status code and a
// Content-Type of application/json. The body is the JSON encoding of v
// followed by a newline.
//
// v is encoded before anything is written, so an encoding failure can still
// be reported: in that case the failure is logged and the client receives a
// 500 with a generic JSON error body instead of the requested status with an
// empty or truncated payload. A failure to write the body (for example
// because the client went away) is logged; there is nothing else to do at
// that point since the status line has already been sent.
func writeJSON(w http.ResponseWriter, status int, v any) {
	body, err := json.Marshal(v)
	if err != nil {
		log.Printf("writeJSON: encoding %T response: %v", v, err)
		status = http.StatusInternalServerError
		body = []byte(`{"error":"failed to encode response"}`)
	}

	w.Header().Set("Content-Type", "application/json")
	w.WriteHeader(status)

	// The trailing newline matches what json.Encoder.Encode emits.
	if _, err := w.Write(append(body, '\n')); err != nil {
		log.Printf("writeJSON: writing response: %v", err)
	}
}
|