From fa5198f78c6bff1bc90d0df2c8a5f3ecdc31a386 Mon Sep 17 00:00:00 2001 From: Pierre-Olivier Mercier Date: Thu, 16 Apr 2026 00:47:17 +0700 Subject: [PATCH 01/15] Revert "checker: reorder Status with negatives for good, JSON as string" This reverts commit 6be3578c334ec16fbef1def31f4bd7262edda261. --- checker/types.go | 18 +++++--------- checker/types_test.go | 57 +++++++++---------------------------------- 2 files changed, 18 insertions(+), 57 deletions(-) diff --git a/checker/types.go b/checker/types.go index bd048a5..9adced0 100644 --- a/checker/types.go +++ b/checker/types.go @@ -146,21 +146,15 @@ type CheckerOptionsDocumentation struct { } // Status represents the result status of a check evaluation. -// -// Numeric ordering is severity ordering: lower = better, higher = worse. -// StatusUnknown is intentionally the zero value, so an uninitialized -// CheckState reads as "no signal yet" rather than as a healthy OK. -// "Good" statuses are negative so that aggregators can simply take the -// max() of a set of statuses to compute the worst one. type Status int const ( - StatusOK Status = -2 - StatusInfo Status = -1 - StatusUnknown Status = 0 // zero value: not initialized / no signal yet - StatusWarn Status = 1 - StatusCrit Status = 2 - StatusError Status = 3 + StatusUnknown Status = iota + StatusOK + StatusInfo + StatusWarn + StatusCrit + StatusError ) // String returns the human-readable name of the status. 
diff --git a/checker/types_test.go b/checker/types_test.go index b2ee5db..74b57d0 100644 --- a/checker/types_test.go +++ b/checker/types_test.go @@ -24,12 +24,12 @@ func TestStatus_MarshalJSON(t *testing.T) { status Status want string }{ - {StatusOK, `"OK"`}, - {StatusInfo, `"INFO"`}, - {StatusUnknown, `"UNKNOWN"`}, - {StatusWarn, `"WARN"`}, - {StatusCrit, `"CRIT"`}, - {StatusError, `"ERROR"`}, + {StatusUnknown, `0`}, + {StatusOK, `1`}, + {StatusInfo, `2`}, + {StatusWarn, `3`}, + {StatusCrit, `4`}, + {StatusError, `5`}, } for _, tt := range tests { got, err := json.Marshal(tt.status) @@ -43,42 +43,17 @@ func TestStatus_MarshalJSON(t *testing.T) { } } -func TestStatus_UnmarshalJSON_String(t *testing.T) { +func TestStatus_UnmarshalJSON(t *testing.T) { tests := []struct { input string want Status }{ - {`"OK"`, StatusOK}, - {`"INFO"`, StatusInfo}, - {`"UNKNOWN"`, StatusUnknown}, - {`""`, StatusUnknown}, - {`"WARN"`, StatusWarn}, - {`"CRIT"`, StatusCrit}, - {`"ERROR"`, StatusError}, - } - for _, tt := range tests { - var got Status - if err := json.Unmarshal([]byte(tt.input), &got); err != nil { - t.Errorf("Unmarshal(%s) error: %v", tt.input, err) - continue - } - if got != tt.want { - t.Errorf("Unmarshal(%s) = %v, want %v", tt.input, got, tt.want) - } - } -} - -func TestStatus_UnmarshalJSON_LegacyInt(t *testing.T) { - tests := []struct { - input string - want Status - }{ - {`-2`, StatusOK}, - {`-1`, StatusInfo}, {`0`, StatusUnknown}, - {`1`, StatusWarn}, - {`2`, StatusCrit}, - {`3`, StatusError}, + {`1`, StatusOK}, + {`2`, StatusInfo}, + {`3`, StatusWarn}, + {`4`, StatusCrit}, + {`5`, StatusError}, } for _, tt := range tests { var got Status @@ -92,14 +67,6 @@ func TestStatus_UnmarshalJSON_LegacyInt(t *testing.T) { } } -func TestStatus_UnmarshalJSON_UnknownString(t *testing.T) { - var s Status - err := json.Unmarshal([]byte(`"BOGUS"`), &s) - if err == nil { - t.Error("Unmarshal(\"BOGUS\") should return error, got nil") - } -} - func TestStatus_RoundTrip(t *testing.T) { 
for _, s := range []Status{StatusOK, StatusInfo, StatusUnknown, StatusWarn, StatusCrit, StatusError} { data, err := json.Marshal(s) From 6b96ee8c2f7928ffded3fdcf4468b666d1132486 Mon Sep 17 00:00:00 2001 From: Pierre-Olivier Mercier Date: Thu, 16 Apr 2026 16:17:43 +0700 Subject: [PATCH 02/15] server: expose runtime metrics on /health for scheduler routing Adds HealthResponse carrying inflight count, total requests, 1/5/15-min EWMA load averages, uptime, and NumCPU so a scheduler can pick the least busy worker. A background sampler updates the load averages every 5s, stopped by a new idempotent Close method. Work endpoints (/collect, /evaluate, /report) are wrapped with a trackWork middleware; /health and /definition are excluded so polling traffic does not pollute the signal. --- checker/server.go | 138 +++++++++++++++++++++++++++++++++++++-- checker/server_test.go | 143 +++++++++++++++++++++++++++++++++++++++-- checker/types.go | 30 +++++++++ 3 files changed, 300 insertions(+), 11 deletions(-) diff --git a/checker/server.go b/checker/server.go index c036b07..d9b7f61 100644 --- a/checker/server.go +++ b/checker/server.go @@ -20,14 +20,42 @@ import ( "fmt" "io" "log" + "math" "net/http" + "runtime" "strings" + "sync" + "sync/atomic" "time" ) // maxRequestBodySize is the maximum allowed size for incoming request bodies (1 MB). const maxRequestBodySize = 1 << 20 +// loadSampleInterval is how often the background sampler updates the +// exponentially weighted moving averages reported in HealthResponse.LoadAvg. +// 5 seconds matches the Unix kernel's loadavg cadence. +const loadSampleInterval = 5 * time.Second + +// EWMA smoothing factors for 1, 5, and 15-minute windows sampled every +// loadSampleInterval. Derived as 1 - exp(-interval/window) so that the +// steady-state response to a constant InFlight of N converges to N. 
+var ( + loadAlpha1 = 1 - math.Exp(-float64(loadSampleInterval)/float64(1*time.Minute)) + loadAlpha5 = 1 - math.Exp(-float64(loadSampleInterval)/float64(5*time.Minute)) + loadAlpha15 = 1 - math.Exp(-float64(loadSampleInterval)/float64(15*time.Minute)) +) + +// updateLoadAvg advances the three EWMAs by one tick given the current +// InFlight sample. It is a pure function to keep the sampler trivially testable. +func updateLoadAvg(prev [3]float64, sample float64) [3]float64 { + return [3]float64{ + prev[0] + loadAlpha1*(sample-prev[0]), + prev[1] + loadAlpha5*(sample-prev[1]), + prev[2] + loadAlpha15*(sample-prev[2]), + } +} + // Server is a generic HTTP server for external checkers. // It always exposes /health and /collect. If the provider implements // CheckerDefinitionProvider, it also exposes /definition and /evaluate. @@ -41,29 +69,65 @@ type Server struct { provider ObservationProvider definition *CheckerDefinition mux *http.ServeMux + + // startTime is captured in NewServer and used to compute uptime. + startTime time.Time + + // inFlight counts work requests (/collect, /evaluate, /report) currently + // being processed. /health and /definition are not tracked. + inFlight atomic.Int64 + + // totalRequests is the cumulative number of work requests served. + totalRequests atomic.Uint64 + + // loadBits stores the 1, 5, 15-minute EWMAs of inFlight as float64 bit + // patterns (math.Float64bits) so reads and writes are tear-free and + // lock-free across the sampler goroutine and the /health handler. + loadBits [3]atomic.Uint64 + + // cancelSampler stops the background load-average sampler. + cancelSampler context.CancelFunc + + // samplerDone is closed when the sampler goroutine returns. + samplerDone chan struct{} + + // closeOnce guarantees Close is idempotent. + closeOnce sync.Once } // NewServer creates a new checker HTTP server backed by the given provider. // Additional endpoints are registered based on optional interfaces the provider implements. 
+// +// NewServer also starts a background goroutine that samples the in-flight +// request count every loadSampleInterval to compute the load averages +// reported on /health. Call Close to stop it. func NewServer(provider ObservationProvider) *Server { - s := &Server{provider: provider} + ctx, cancel := context.WithCancel(context.Background()) + s := &Server{ + provider: provider, + startTime: time.Now(), + cancelSampler: cancel, + samplerDone: make(chan struct{}), + } s.mux = http.NewServeMux() s.mux.HandleFunc("GET /health", s.handleHealth) - s.mux.HandleFunc("POST /collect", s.handleCollect) + s.mux.Handle("POST /collect", s.trackWork(http.HandlerFunc(s.handleCollect))) if dp, ok := provider.(CheckerDefinitionProvider); ok { s.definition = dp.Definition() s.definition.BuildRulesInfo() s.mux.HandleFunc("GET /definition", s.handleDefinition) - s.mux.HandleFunc("POST /evaluate", s.handleEvaluate) + s.mux.Handle("POST /evaluate", s.trackWork(http.HandlerFunc(s.handleEvaluate))) } if _, ok := provider.(CheckerHTMLReporter); ok { - s.mux.HandleFunc("POST /report", s.handleReport) + s.mux.Handle("POST /report", s.trackWork(http.HandlerFunc(s.handleReport))) } else if _, ok := provider.(CheckerMetricsReporter); ok { - s.mux.HandleFunc("POST /report", s.handleReport) + s.mux.Handle("POST /report", s.trackWork(http.HandlerFunc(s.handleReport))) } + go s.runSampler(ctx) + return s } @@ -74,11 +138,61 @@ func (s *Server) Handler() http.Handler { } // ListenAndServe starts the HTTP server on the given address. +// +// ListenAndServe does not stop the background load-average sampler on return; +// call Close to stop it. This is not required for process-scoped usage but is +// recommended for tests and embedded lifecycles. func (s *Server) ListenAndServe(addr string) error { log.Printf("checker listening on %s", addr) return http.ListenAndServe(addr, requestLogger(s.mux)) } +// Close stops the background load-average sampler goroutine. 
It is safe to +// call multiple times; subsequent calls are no-ops. Close does not shut down +// any underlying http.Server — callers own that lifecycle. +func (s *Server) Close() error { + s.closeOnce.Do(func() { + s.cancelSampler() + <-s.samplerDone + }) + return nil +} + +// trackWork wraps a handler with in-flight and total-request accounting. +// It is applied only to "work" endpoints (/collect, /evaluate, /report) so +// that /health polling traffic does not pollute the load signal. +func (s *Server) trackWork(next http.Handler) http.Handler { + return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + s.inFlight.Add(1) + s.totalRequests.Add(1) + defer s.inFlight.Add(-1) + next.ServeHTTP(w, r) + }) +} + +// runSampler updates the load-average EWMAs every loadSampleInterval until +// ctx is canceled. It closes s.samplerDone on exit. +func (s *Server) runSampler(ctx context.Context) { + defer close(s.samplerDone) + ticker := time.NewTicker(loadSampleInterval) + defer ticker.Stop() + for { + select { + case <-ctx.Done(): + return + case <-ticker.C: + var prev [3]float64 + for i := range prev { + prev[i] = math.Float64frombits(s.loadBits[i].Load()) + } + next := updateLoadAvg(prev, float64(s.inFlight.Load())) + for i := range next { + s.loadBits[i].Store(math.Float64bits(next[i])) + } + } + } +} + type statusRecorder struct { http.ResponseWriter status int @@ -99,8 +213,18 @@ func requestLogger(next http.Handler) http.Handler { } func (s *Server) handleHealth(w http.ResponseWriter, r *http.Request) { - w.Header().Set("Content-Type", "application/json") - json.NewEncoder(w).Encode(map[string]string{"status": "ok"}) + var load [3]float64 + for i := range load { + load[i] = math.Float64frombits(s.loadBits[i].Load()) + } + writeJSON(w, http.StatusOK, HealthResponse{ + Status: "ok", + Uptime: time.Since(s.startTime).Seconds(), + NumCPU: runtime.NumCPU(), + InFlight: s.inFlight.Load(), + TotalRequests: s.totalRequests.Load(), + LoadAvg: load, + }) } func 
(s *Server) handleDefinition(w http.ResponseWriter, r *http.Request) { diff --git a/checker/server_test.go b/checker/server_test.go index 3bc36bd..a20ee61 100644 --- a/checker/server_test.go +++ b/checker/server_test.go @@ -21,6 +21,7 @@ import ( "errors" "net/http" "net/http/httptest" + "sync" "testing" "time" ) @@ -96,14 +97,148 @@ func doRequest(handler http.Handler, method, path string, body any, headers map[ func TestServer_Health(t *testing.T) { p := &testProvider{key: "test", definition: &CheckerDefinition{ID: "test", Rules: []CheckRule{}}} srv := newTestServer(p) + defer srv.Close() rec := doRequest(srv.Handler(), "GET", "/health", nil, nil) if rec.Code != http.StatusOK { t.Fatalf("GET /health = %d, want %d", rec.Code, http.StatusOK) } - var resp map[string]string - json.NewDecoder(rec.Body).Decode(&resp) - if resp["status"] != "ok" { - t.Errorf("GET /health status = %q, want \"ok\"", resp["status"]) + var resp HealthResponse + if err := json.NewDecoder(rec.Body).Decode(&resp); err != nil { + t.Fatalf("decode /health: %v", err) + } + if resp.Status != "ok" { + t.Errorf("GET /health status = %q, want \"ok\"", resp.Status) + } + if resp.NumCPU <= 0 { + t.Errorf("NumCPU = %d, want > 0", resp.NumCPU) + } + if resp.Uptime < 0 { + t.Errorf("Uptime = %v, want >= 0", resp.Uptime) + } + if resp.InFlight != 0 { + t.Errorf("InFlight = %d on fresh server, want 0", resp.InFlight) + } + if resp.TotalRequests != 0 { + t.Errorf("TotalRequests = %d on fresh server, want 0", resp.TotalRequests) + } + if resp.LoadAvg != [3]float64{0, 0, 0} { + t.Errorf("LoadAvg = %v on fresh server, want all zero", resp.LoadAvg) + } +} + +func TestServer_Health_TracksInFlight(t *testing.T) { + release := make(chan struct{}) + var collectEntered sync.WaitGroup + p := &testProvider{ + key: "test", + definition: &CheckerDefinition{ID: "test", Rules: []CheckRule{}}, + collectFn: func(ctx context.Context, opts CheckerOptions) (any, error) { + collectEntered.Done() + <-release + return 
map[string]string{"ok": "1"}, nil + }, + } + srv := newTestServer(p) + defer srv.Close() + handler := srv.Handler() + + const n = 3 + collectEntered.Add(n) + var clientsDone sync.WaitGroup + clientsDone.Add(n) + for i := 0; i < n; i++ { + go func() { + defer clientsDone.Done() + doRequest(handler, "POST", "/collect", ExternalCollectRequest{Key: "test"}, nil) + }() + } + + // Wait for all n handlers to be inside collectFn (== all n in-flight). + collectEntered.Wait() + + // Record /health mid-flight. Also hammer it to verify /health polls + // do not inflate InFlight or TotalRequests. + var mid HealthResponse + for i := 0; i < 5; i++ { + rec := doRequest(handler, "GET", "/health", nil, nil) + if rec.Code != http.StatusOK { + t.Fatalf("GET /health = %d, want %d", rec.Code, http.StatusOK) + } + if err := json.NewDecoder(rec.Body).Decode(&mid); err != nil { + t.Fatalf("decode /health: %v", err) + } + } + if mid.InFlight != n { + t.Errorf("mid-flight InFlight = %d, want %d", mid.InFlight, n) + } + if mid.TotalRequests != n { + t.Errorf("mid-flight TotalRequests = %d, want %d (health polls must not count)", mid.TotalRequests, n) + } + + // Release all work and wait for clients to return. 
+ close(release) + clientsDone.Wait() + + rec := doRequest(handler, "GET", "/health", nil, nil) + var after HealthResponse + if err := json.NewDecoder(rec.Body).Decode(&after); err != nil { + t.Fatalf("decode /health: %v", err) + } + if after.InFlight != 0 { + t.Errorf("post-flight InFlight = %d, want 0", after.InFlight) + } + if after.TotalRequests != n { + t.Errorf("post-flight TotalRequests = %d, want %d", after.TotalRequests, n) + } + if after.Uptime < mid.Uptime { + t.Errorf("Uptime went backwards: mid=%v after=%v", mid.Uptime, after.Uptime) + } +} + +func TestUpdateLoadAvg(t *testing.T) { + load := [3]float64{0, 0, 0} + for i := 0; i < 20; i++ { + load = updateLoadAvg(load, 5) + } + if !(load[0] > load[1] && load[1] > load[2]) { + t.Errorf("expected load[0] > load[1] > load[2], got %v", load) + } + for i, v := range load { + if v <= 0 { + t.Errorf("load[%d] = %v, want > 0", i, v) + } + if v >= 5 { + t.Errorf("load[%d] = %v, want < 5 (not yet converged)", i, v) + } + } + + // Constant sample of zero from a non-zero state must decay toward zero. 
+ decaying := load + for i := 0; i < 50; i++ { + decaying = updateLoadAvg(decaying, 0) + } + for i := range decaying { + if decaying[i] >= load[i] { + t.Errorf("decaying[%d] = %v, want < %v", i, decaying[i], load[i]) + } + } +} + +func TestServer_Close_Idempotent(t *testing.T) { + p := &testProvider{key: "test", definition: &CheckerDefinition{ID: "test", Rules: []CheckRule{}}} + srv := newTestServer(p) + done := make(chan error, 2) + go func() { done <- srv.Close() }() + go func() { done <- srv.Close() }() + for i := 0; i < 2; i++ { + select { + case err := <-done: + if err != nil { + t.Errorf("Close() returned %v, want nil", err) + } + case <-time.After(2 * time.Second): + t.Fatal("Close() deadlocked") + } } } diff --git a/checker/types.go b/checker/types.go index 9adced0..08f7431 100644 --- a/checker/types.go +++ b/checker/types.go @@ -336,3 +336,33 @@ type ExternalReportRequest struct { Key ObservationKey `json:"key"` Data json.RawMessage `json:"data"` } + +// HealthResponse is returned by GET /health on a remote checker endpoint. +// It carries lightweight runtime signals so a scheduler can pick the least +// busy worker among a set of equivalent checker instances. +// +// LoadAvg mirrors /proc/loadavg semantics: it is the 1, 5, 15-minute +// exponentially weighted moving average of the InFlight request count, +// sampled every 5 seconds. Divide by NumCPU to estimate saturation. +type HealthResponse struct { + // Status is a coarse liveness indicator. Currently always "ok"; + // "degraded" is reserved for future use. + Status string `json:"status"` + + // Uptime is the number of (fractional) seconds since the server started. + Uptime float64 `json:"uptime_seconds"` + + // NumCPU is the value of runtime.NumCPU() on this worker. + NumCPU int `json:"num_cpu"` + + // InFlight is the number of work requests (/collect, /evaluate, /report) + // currently being processed. /health and /definition are not counted. 
+ InFlight int64 `json:"inflight"` + + // TotalRequests is the cumulative number of work requests served since + // the server started. /health and /definition are not counted. + TotalRequests uint64 `json:"total_requests"` + + // LoadAvg holds the 1, 5, 15-minute EWMAs of InFlight. + LoadAvg [3]float64 `json:"loadavg"` +} From 087032f6ccffeef715f005c1d51e61e1e757ffc0 Mon Sep 17 00:00:00 2001 From: Pierre-Olivier Mercier Date: Sun, 19 Apr 2026 10:27:17 +0700 Subject: [PATCH 03/15] checker: add DiscoveryPublisher interface for cross-checker discovery Introduce a DiscoveryEntry struct and an optional DiscoveryPublisher interface that providers can co-implement to declare things worth probing by other checkers (TLS endpoints, HTTP probes, ACME challenges, DNSSEC keys, ...) without having to re-parse raw observations. DiscoveryEntry carries an opaque Payload: the SDK does not interpret it. Producers and consumers agree on the Payload schema through a separate contract (e.g. a small shared Go package imported by both) identified by the free-form Type string. This keeps the SDK free of protocol-specific concepts; new entry families can appear without touching it. The /collect HTTP handler type-asserts the provider against DiscoveryPublisher immediately after Collect and forwards the resulting entries in ExternalCollectResponse.Entries. --- checker/server.go | 18 +++++++++++++--- checker/types.go | 54 +++++++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 67 insertions(+), 5 deletions(-) diff --git a/checker/server.go b/checker/server.go index d9b7f61..db7f88c 100644 --- a/checker/server.go +++ b/checker/server.go @@ -256,9 +256,21 @@ func (s *Server) handleCollect(w http.ResponseWriter, r *http.Request) { return } - writeJSON(w, http.StatusOK, ExternalCollectResponse{ - Data: json.RawMessage(raw), - }) + resp := ExternalCollectResponse{Data: json.RawMessage(raw)} + + // Harvest discovery entries from the native Go value, before it goes + // out of scope. 
No re-parse; DiscoverEntries operates on the same + // object that was just marshaled above. + if dp, ok := s.provider.(DiscoveryPublisher); ok { + entries, derr := dp.DiscoverEntries(data) + if derr != nil { + log.Printf("DiscoverEntries failed: %v", derr) + } else { + resp.Entries = entries + } + } + + writeJSON(w, http.StatusOK, resp) } func (s *Server) handleEvaluate(w http.ResponseWriter, r *http.Request) { diff --git a/checker/types.go b/checker/types.go index 08f7431..2c26a63 100644 --- a/checker/types.go +++ b/checker/types.go @@ -41,6 +41,11 @@ const ( AutoFillZone = "zone" AutoFillServiceType = "service_type" AutoFillService = "service" + + // AutoFillDiscoveryEntries receives DiscoveryEntry records published by + // other checkers on the same target. The host does not pre-filter by + // Type; consumers pick the contracts they understand and ignore the rest. + AutoFillDiscoveryEntries = "discovery_entries" ) // CheckTarget identifies the resource a check applies to. Identifiers are @@ -314,8 +319,53 @@ type ExternalCollectRequest struct { // ExternalCollectResponse is returned by POST /collect on a remote checker endpoint. type ExternalCollectResponse struct { - Data json.RawMessage `json:"data,omitempty"` - Error string `json:"error,omitempty"` + Data json.RawMessage `json:"data,omitempty"` + Entries []DiscoveryEntry `json:"entries,omitempty"` + Error string `json:"error,omitempty"` +} + +// DiscoveryEntry is a single "thing worth probing" declared by a checker as a +// by-product of its collection, intended to be consumed by other checkers +// without having to re-parse raw observations. +// +// The SDK treats Payload as an opaque byte string: producer and consumer +// checkers agree on a schema through a separate contract (typically a small +// shared Go package imported by both). This keeps the SDK free of +// protocol-specific concepts; new entry families (TLS endpoint, HTTP probe, +// ACME challenge, DNSSEC key, …) can appear without touching it. 
+// +// Entries are ingested by happyDomain into a separate index. Each new +// collection from the same source atomically replaces the set of entries +// previously published for the same (producer, target) pair. +type DiscoveryEntry struct { + // Type names the contract Payload follows, e.g. "tls.endpoint" or + // "http.probe". Producers and consumers match on this string; the SDK + // does not interpret it. Stick to a reverse-DNS-ish convention so that + // independent contracts do not collide. + Type string `json:"type"` + + // Ref is a stable per-entry identifier chosen by the producer. The host + // uses it to dedupe entries across repeated collections and to link + // related observations back to this entry (RelatedObservation.Ref). Two + // producers may reuse the same Ref space; the host namespaces them by + // (producer, target). + Ref string `json:"ref"` + + // Payload is the entry-specific data, in the format defined by the + // contract named in Type. Opaque to the SDK. + Payload json.RawMessage `json:"payload"` +} + +// DiscoveryPublisher is an optional interface an ObservationProvider can +// co-implement to declare DiscoveryEntry records derived from the value it +// just collected. +// +// The host invokes DiscoverEntries immediately after Collect, passing the +// native Go value returned by Collect (no JSON round-trip). Implementations +// should therefore type-assert data to their concrete collection type and +// marshal each contract payload themselves. +type DiscoveryPublisher interface { + DiscoverEntries(data any) ([]DiscoveryEntry, error) } // ExternalEvaluateRequest is sent to POST /evaluate on a remote checker endpoint. 
From 7567271536de6045f0f23870bcf0b3be80c0e707 Mon Sep 17 00:00:00 2001 From: Pierre-Olivier Mercier Date: Sun, 19 Apr 2026 23:35:42 +0700 Subject: [PATCH 04/15] checker: cross-checker observation composition via ReportContext Add the plumbing that lets a checker receive (at evaluation, report rendering, and metrics extraction) observations produced by other checkers on DiscoveryEntry records it originally published. Surface changes: - RelatedObservation struct: one downstream observation, tagged with the producing CheckerID and the Ref matching the DiscoveryEntry it covers. - ObservationGetter gains GetRelated(ctx, key), so rules can opt in to cross-checker composition. mapObservationGetter (remote /evaluate path) returns empty; the host owns lineage resolution. - ReportContext interface: Data() + Related(key). Reporters consume it instead of a raw json.RawMessage, which collapses the former legacy/Ctx duplicate and gives one uniform signature: GetHTMLReport(ctx ReportContext) (string, error) ExtractMetrics(ctx ReportContext, t time.Time) ([]CheckMetric, error) - NewReportContext(data, related) and StaticReportContext(data) build fixed-payload contexts for entry points without an ObservationContext. - ExternalReportRequest gains a Related map so the host can ship pre-composed lineage to a remote checker over /report. The SDK's /report handler threads it through to the reporter via NewReportContext, closing the wire gap that previously forced remote reports to a StaticReportContext with no related data. Tests cover the Related map round-trip end-to-end via a peeking provider. 
--- checker/server.go | 13 ++++- checker/server_test.go | 65 +++++++++++++++++++++++-- checker/types.go | 106 ++++++++++++++++++++++++++++++++++++++--- 3 files changed, 172 insertions(+), 12 deletions(-) diff --git a/checker/server.go b/checker/server.go index db7f88c..8b2eb31 100644 --- a/checker/server.go +++ b/checker/server.go @@ -319,7 +319,7 @@ func (s *Server) handleReport(w http.ResponseWriter, r *http.Request) { return } - html, err := reporter.GetHTMLReport(req.Data) + html, err := reporter.GetHTMLReport(NewReportContext(req.Data, req.Related)) if err != nil { http.Error(w, fmt.Sprintf("failed to generate HTML report: %v", err), http.StatusInternalServerError) return @@ -337,7 +337,7 @@ func (s *Server) handleReport(w http.ResponseWriter, r *http.Request) { return } - metrics, err := reporter.ExtractMetrics(req.Data, time.Now()) + metrics, err := reporter.ExtractMetrics(NewReportContext(req.Data, req.Related), time.Now()) if err != nil { writeJSON(w, http.StatusInternalServerError, map[string]string{ "error": fmt.Sprintf("failed to extract metrics: %v", err), @@ -361,6 +361,15 @@ func (g *mapObservationGetter) Get(ctx context.Context, key ObservationKey, dest return json.Unmarshal(raw, dest) } +// GetRelated always returns nil in the remote /evaluate path: the host that +// invokes /evaluate does not (currently) carry cross-checker related data in +// ExternalEvaluateRequest. Consumers that need related observations must run +// evaluation locally with a host-side ObservationContext that resolves +// lineage. 
+func (g *mapObservationGetter) GetRelated(ctx context.Context, key ObservationKey) ([]RelatedObservation, error) { + return nil, nil +} + func writeJSON(w http.ResponseWriter, status int, v any) { w.Header().Set("Content-Type", "application/json") w.WriteHeader(status) diff --git a/checker/server_test.go b/checker/server_test.go index a20ee61..6a228b6 100644 --- a/checker/server_test.go +++ b/checker/server_test.go @@ -44,15 +44,15 @@ func (p *testProvider) Collect(ctx context.Context, opts CheckerOptions) (any, e return map[string]string{"result": "ok"}, nil } func (p *testProvider) Definition() *CheckerDefinition { return p.definition } -func (p *testProvider) GetHTMLReport(raw json.RawMessage) (string, error) { +func (p *testProvider) GetHTMLReport(ctx ReportContext) (string, error) { if p.htmlFn != nil { - return p.htmlFn(raw) + return p.htmlFn(ctx.Data()) } return "

report

", nil } -func (p *testProvider) ExtractMetrics(raw json.RawMessage, t time.Time) ([]CheckMetric, error) { +func (p *testProvider) ExtractMetrics(ctx ReportContext, t time.Time) ([]CheckMetric, error) { if p.metricsFn != nil { - return p.metricsFn(raw, t) + return p.metricsFn(ctx.Data(), t) } return []CheckMetric{{Name: "m1", Value: 1.0, Timestamp: t}}, nil } @@ -428,6 +428,63 @@ func TestServer_Report_Metrics(t *testing.T) { } } +// TestServer_Report_Related verifies the remote /report path wires +// ExternalReportRequest.Related through to the provider's ReportContext, +// the fix for the "remote checkers can't see related observations" gap. +func TestServer_Report_Related(t *testing.T) { + var gotRelated []RelatedObservation + p := &testProvider{ + key: "test", + definition: &CheckerDefinition{ID: "test-checker", Rules: []CheckRule{}}, + } + // Replace htmlFn with one that peeks at a related key. We can't do that + // directly through testProvider's htmlFn (which only sees raw), so + // bind to GetHTMLReport via an inline wrapper: use a per-test provider + // that captures the ReportContext before delegating to the template. 
+ srv := NewServer(&relatedPeekingProvider{ + base: p, + target: &gotRelated, + }) + defer srv.Close() + + req := ExternalReportRequest{ + Key: "test", + Data: json.RawMessage(`{}`), + Related: map[ObservationKey][]RelatedObservation{ + "tls_probes": { + {CheckerID: "tls", Key: "tls_probes", Data: json.RawMessage(`{"ok":true}`), Ref: "ep-1"}, + }, + }, + } + rec := doRequest(srv.Handler(), "POST", "/report", req, map[string]string{"Accept": "text/html"}) + if rec.Code != http.StatusOK { + t.Fatalf("POST /report = %d, want 200", rec.Code) + } + if len(gotRelated) != 1 { + t.Fatalf("provider saw %d related observations, want 1", len(gotRelated)) + } + if gotRelated[0].CheckerID != "tls" || string(gotRelated[0].Data) != `{"ok":true}` { + t.Errorf("related mismatch: got %+v", gotRelated[0]) + } +} + +// relatedPeekingProvider forwards to a base testProvider but copies the +// Related("tls_probes") slice observed at GetHTMLReport time into target. +type relatedPeekingProvider struct { + base *testProvider + target *[]RelatedObservation +} + +func (p *relatedPeekingProvider) Key() ObservationKey { return p.base.Key() } +func (p *relatedPeekingProvider) Collect(ctx context.Context, opts CheckerOptions) (any, error) { + return p.base.Collect(ctx, opts) +} +func (p *relatedPeekingProvider) Definition() *CheckerDefinition { return p.base.definition } +func (p *relatedPeekingProvider) GetHTMLReport(ctx ReportContext) (string, error) { + *p.target = ctx.Related("tls_probes") + return "

ok

", nil +} + func TestServer_Report_BadBody(t *testing.T) { p := &testProvider{ key: "test", diff --git a/checker/types.go b/checker/types.go index 2c26a63..9099a2f 100644 --- a/checker/types.go +++ b/checker/types.go @@ -238,8 +238,45 @@ type CheckRuleWithOptions interface { // ObservationGetter provides access to observation data (used by CheckRule). // Get unmarshals observation data into dest (like json.Unmarshal). +// +// GetRelated returns observations produced by other checkers on DiscoveryEntry +// records originally published by the current target. It is the core of +// cross-checker composition: a checker that published some entries via its +// DiscoveryPublisher can, during rule evaluation, fetch the latest +// observations that cover those entries and fold them into its own states. +// +// GetRelated returns an empty slice (not an error) when there is nothing +// to relate (no entries originally published, no downstream observation +// yet, no downstream checker registered for the entry type, …). Callers +// handle that as "no related data", typically skipping optional sections. type ObservationGetter interface { Get(ctx context.Context, key ObservationKey, dest any) error + GetRelated(ctx context.Context, key ObservationKey) ([]RelatedObservation, error) +} + +// RelatedObservation is a single observation, produced by some other checker, +// that covers a DiscoveryEntry originally published by the current target. +// +// Data carries the raw JSON payload; consumers parse it according to the +// producer's schema, which they are expected to know via external agreement +// (typically a shared contract package imported by both producer and +// consumer). +type RelatedObservation struct { + // CheckerID identifies the producer of this observation. + CheckerID string `json:"checkerId"` + + // Key is the observation key the producer filled. + Key ObservationKey `json:"key"` + + // Data is the raw JSON payload as persisted by the producer. 
+ Data json.RawMessage `json:"data"` + + // CollectedAt is when the producer ran its Collect. + CollectedAt time.Time `json:"collectedAt"` + + // Ref matches DiscoveryEntry.Ref of the entry this observation covers. + // Opaque to the SDK; meaningful within the producer/consumer contract. + Ref string `json:"ref"` } // CheckAggregator combines multiple CheckStates into a single result. @@ -247,20 +284,69 @@ type CheckAggregator interface { Aggregate(states []CheckState) CheckState } +// ReportContext carries both the primary observation payload and any +// observations produced by other checkers that cover the same discovery +// entries. Hosts build a ReportContext and hand it to reporter methods. +// +// The method set is deliberately tiny: a single primary payload (Data) and +// a query for related observations by key (Related). Hosts return nil from +// Related when there is nothing to relate; reporters must tolerate that. +type ReportContext interface { + Data() json.RawMessage + Related(key ObservationKey) []RelatedObservation +} + +// NewReportContext returns a ReportContext backed by a primary payload and +// a pre-resolved map of related observations by key. The SDK's /report HTTP +// handler uses this to wrap ExternalReportRequest contents; hosts and tests +// can use it whenever they already have the related observations in memory. +// +// Passing a nil or empty related map is fine; Related(key) will then return +// nil, just like StaticReportContext. +func NewReportContext(data json.RawMessage, related map[ObservationKey][]RelatedObservation) ReportContext { + return fixedReportContext{data: data, related: related} +} + +// StaticReportContext is a shorthand for NewReportContext(data, nil): a +// ReportContext with a primary payload and no related observations. +// Intended for tests and ad-hoc callers that have no lineage to supply. 
+func StaticReportContext(data json.RawMessage) ReportContext { + return fixedReportContext{data: data} +} + +type fixedReportContext struct { + data json.RawMessage + related map[ObservationKey][]RelatedObservation +} + +func (f fixedReportContext) Data() json.RawMessage { return f.data } +func (f fixedReportContext) Related(key ObservationKey) []RelatedObservation { + if f.related == nil { + return nil + } + return f.related[key] +} + // CheckerHTMLReporter is an optional interface that observation providers can // implement to render their stored data as a full HTML document (for iframe embedding). // Detect support with a type assertion: _, ok := provider.(CheckerHTMLReporter) +// +// The ReportContext carries the primary observation payload plus any +// downstream observations produced on DiscoveryEntry records this checker +// published. Implementations that do not need related observations can +// simply consume ctx.Data(). type CheckerHTMLReporter interface { - // GetHTMLReport generates an HTML document from the JSON-encoded observation data. - GetHTMLReport(raw json.RawMessage) (string, error) + GetHTMLReport(ctx ReportContext) (string, error) } // CheckerMetricsReporter is an optional interface that observation providers can // implement to extract time-series metrics from their stored data. // Detect support with a type assertion: _, ok := provider.(CheckerMetricsReporter) +// +// As with CheckerHTMLReporter, the ReportContext exposes related +// observations for cross-checker composition. type CheckerMetricsReporter interface { - // ExtractMetrics returns metrics from JSON-encoded observation data. 
- ExtractMetrics(raw json.RawMessage, collectedAt time.Time) ([]CheckMetric, error) + ExtractMetrics(ctx ReportContext, collectedAt time.Time) ([]CheckMetric, error) } // CheckerDefinitionProvider is an optional interface that observation providers can @@ -382,9 +468,17 @@ type ExternalEvaluateResponse struct { } // ExternalReportRequest is sent to POST /report on a remote checker endpoint. +// +// Related carries observations produced by other checkers on DiscoveryEntry +// records originally published by the target of this report, that is, the +// cross-checker lineage that ObservationGetter.GetRelated would expose in +// the in-process path. The host composes it before making the HTTP request; +// when absent, the remote checker receives a context that reports no +// related observations (equivalent to StaticReportContext). type ExternalReportRequest struct { - Key ObservationKey `json:"key"` - Data json.RawMessage `json:"data"` + Key ObservationKey `json:"key"` + Data json.RawMessage `json:"data"` + Related map[ObservationKey][]RelatedObservation `json:"related,omitempty"` } // HealthResponse is returned by GET /health on a remote checker endpoint. From d847c71a509dde358a8772c5ba15e756399734d3 Mon Sep 17 00:00:00 2001 From: Pierre-Olivier Mercier Date: Thu, 23 Apr 2026 10:06:48 +0700 Subject: [PATCH 05/15] checker: let CheckRule.Evaluate return per-subject CheckStates MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Rules that iterate over multiple elements (certificates, CAA records, nameservers, …) previously had to squash per-element results into a single concatenated message. Evaluate now returns []CheckState and CheckState carries an opaque Subject, so each element gets its own structured state. The server injects a StatusUnknown placeholder when a rule returns nothing, to avoid silently dropping the rule. 
--- checker/server.go | 16 ++++++++++++---- checker/server_test.go | 4 ++-- checker/types.go | 19 ++++++++++++++++--- 3 files changed, 30 insertions(+), 9 deletions(-) diff --git a/checker/server.go b/checker/server.go index 8b2eb31..444c118 100644 --- a/checker/server.go +++ b/checker/server.go @@ -291,11 +291,19 @@ func (s *Server) handleEvaluate(w http.ResponseWriter, r *http.Request) { continue } } - state := rule.Evaluate(r.Context(), obs, req.Options) - if state.Code == "" { - state.Code = rule.Name() + ruleStates := rule.Evaluate(r.Context(), obs, req.Options) + if len(ruleStates) == 0 { + ruleStates = []CheckState{{ + Status: StatusUnknown, + Message: fmt.Sprintf("rule %q returned no state", rule.Name()), + }} + } + for _, state := range ruleStates { + if state.Code == "" { + state.Code = rule.Name() + } + states = append(states, state) } - states = append(states, state) } writeJSON(w, http.StatusOK, ExternalEvaluateResponse{States: states}) diff --git a/checker/server_test.go b/checker/server_test.go index 6a228b6..23ac446 100644 --- a/checker/server_test.go +++ b/checker/server_test.go @@ -65,8 +65,8 @@ type dummyRule struct { func (r *dummyRule) Name() string { return r.name } func (r *dummyRule) Description() string { return r.desc } -func (r *dummyRule) Evaluate(ctx context.Context, obs ObservationGetter, opts CheckerOptions) CheckState { - return CheckState{Status: StatusOK, Message: r.name + " passed"} +func (r *dummyRule) Evaluate(ctx context.Context, obs ObservationGetter, opts CheckerOptions) []CheckState { + return []CheckState{{Status: StatusOK, Message: r.name + " passed"}} } // --- helpers --- diff --git a/checker/types.go b/checker/types.go index 9099a2f..2600987 100644 --- a/checker/types.go +++ b/checker/types.go @@ -182,11 +182,15 @@ func (s Status) String() string { } } -// CheckState is the result of evaluating a single rule. +// CheckState is the result of evaluating a single rule on a single subject. 
+// Subject is opaque to the SDK: producers and consumers agree on its shape +// (a hostname, a record key, a serial, …). Leave Subject empty for rules +// that produce a single, global result. type CheckState struct { Status Status `json:"status"` Message string `json:"message"` Code string `json:"code,omitempty"` + Subject string `json:"subject,omitempty"` Meta map[string]any `json:"meta,omitempty"` } @@ -222,11 +226,20 @@ type CheckRuleInfo struct { Options *CheckerOptionsDocumentation `json:"options,omitempty"` } -// CheckRule evaluates observations and produces a CheckState. +// CheckRule evaluates observations and produces one or more CheckStates. +// +// Evaluate returns a slice so a rule iterating over multiple elements can +// emit one state per subject (each carrying CheckState.Subject) without +// squashing them into a single concatenated message. +// +// Evaluate must not return a nil or empty slice: callers expect at least +// one state per rule. When a rule finds nothing to evaluate, return a +// single CheckState with an appropriate status (typically StatusInfo or +// StatusOK) describing that fact. type CheckRule interface { Name() string Description() string - Evaluate(ctx context.Context, obs ObservationGetter, opts CheckerOptions) CheckState + Evaluate(ctx context.Context, obs ObservationGetter, opts CheckerOptions) []CheckState } // CheckRuleWithOptions is an optional interface that rules can implement From 0c6a886e8272c07db9c79703ba0213b13a6f2ca9 Mon Sep 17 00:00:00 2001 From: Pierre-Olivier Mercier Date: Thu, 23 Apr 2026 10:12:55 +0700 Subject: [PATCH 06/15] server: expose Handle/HandleFunc for custom checker routes Lets plugins register auxiliary endpoints (debug pages, webhooks, UI assets) on the SDK mux, with TrackWork as an opt-in for the /health load signal. 
--- README.md | 26 ++++++++++++++++++++++++++ checker/server.go | 35 ++++++++++++++++++++++++----------- 2 files changed, 50 insertions(+), 11 deletions(-) diff --git a/README.md b/README.md index ed5f3f6..7f47eb8 100644 --- a/README.md +++ b/README.md @@ -30,6 +30,32 @@ go get git.happydns.org/checker-sdk-go/checker See [checker-dummy](https://git.happydns.org/checker-dummy) for a fully working, documented template. +## Extending the server + +`checker.Server` exposes the standard SDK routes (`/health`, `/collect`, +and, depending on the provider's optional interfaces, `/definition`, +`/evaluate`, `/report`). Plugins that need to serve auxiliary endpoints +(debug pages, webhooks, custom UI assets, …) can register them on the +same mux: + +```go +srv := checker.NewServer(provider) + +srv.HandleFunc("GET /debug/state", func(w http.ResponseWriter, r *http.Request) { + // … +}) + +// Opt a custom route into the in-flight / load-average signal +// reported on /health: +srv.Handle("POST /webhook", srv.TrackWork(myWebhookHandler)) + +log.Fatal(srv.ListenAndServe(":8080")) +``` + +Patterns that collide with built-in routes panic at registration — +pick non-overlapping paths. Custom handlers are not wrapped by the +load-tracking middleware unless you opt in via `TrackWork`. + ## License Apache License 2.0. See [LICENSE](LICENSE) and [NOTICE](NOTICE). 
diff --git a/checker/server.go b/checker/server.go index 444c118..6163497 100644 --- a/checker/server.go +++ b/checker/server.go @@ -111,19 +111,21 @@ func NewServer(provider ObservationProvider) *Server { } s.mux = http.NewServeMux() s.mux.HandleFunc("GET /health", s.handleHealth) - s.mux.Handle("POST /collect", s.trackWork(http.HandlerFunc(s.handleCollect))) + s.mux.Handle("POST /collect", s.TrackWork(http.HandlerFunc(s.handleCollect))) if dp, ok := provider.(CheckerDefinitionProvider); ok { - s.definition = dp.Definition() - s.definition.BuildRulesInfo() - s.mux.HandleFunc("GET /definition", s.handleDefinition) - s.mux.Handle("POST /evaluate", s.trackWork(http.HandlerFunc(s.handleEvaluate))) + if def := dp.Definition(); def != nil { + s.definition = def + s.definition.BuildRulesInfo() + s.mux.HandleFunc("GET /definition", s.handleDefinition) + s.mux.Handle("POST /evaluate", s.TrackWork(http.HandlerFunc(s.handleEvaluate))) + } } if _, ok := provider.(CheckerHTMLReporter); ok { - s.mux.Handle("POST /report", s.trackWork(http.HandlerFunc(s.handleReport))) + s.mux.Handle("POST /report", s.TrackWork(http.HandlerFunc(s.handleReport))) } else if _, ok := provider.(CheckerMetricsReporter); ok { - s.mux.Handle("POST /report", s.trackWork(http.HandlerFunc(s.handleReport))) + s.mux.Handle("POST /report", s.TrackWork(http.HandlerFunc(s.handleReport))) } go s.runSampler(ctx) @@ -137,6 +139,18 @@ func (s *Server) Handler() http.Handler { return requestLogger(s.mux) } +// Handle registers an auxiliary handler on the server's mux. Must be called +// before ListenAndServe or Handler(). Custom handlers are not tracked by +// TrackWork; wrap them explicitly if you want them counted in /health load. +func (s *Server) Handle(pattern string, handler http.Handler) { + s.mux.Handle(pattern, handler) +} + +// HandleFunc is the http.HandlerFunc-flavoured counterpart of Handle. 
+func (s *Server) HandleFunc(pattern string, handler func(http.ResponseWriter, *http.Request)) { + s.mux.HandleFunc(pattern, handler) +} + // ListenAndServe starts the HTTP server on the given address. // // ListenAndServe does not stop the background load-average sampler on return; @@ -158,10 +172,9 @@ func (s *Server) Close() error { return nil } -// trackWork wraps a handler with in-flight and total-request accounting. -// It is applied only to "work" endpoints (/collect, /evaluate, /report) so -// that /health polling traffic does not pollute the load signal. -func (s *Server) trackWork(next http.Handler) http.Handler { +// TrackWork wraps a handler with in-flight and total-request accounting, +// opting custom routes into the load signal reported on /health. +func (s *Server) TrackWork(next http.Handler) http.Handler { return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { s.inFlight.Add(1) s.totalRequests.Add(1) From 199c7dea3fc3d305bdcf523ca6a9438cb12206fb Mon Sep 17 00:00:00 2001 From: Pierre-Olivier Mercier Date: Thu, 23 Apr 2026 11:20:29 +0700 Subject: [PATCH 07/15] checker: add /check route for standalone human-facing web UI Providers that implement the new CheckerInteractive interface (RenderForm + ParseForm) get a built-in HTML form on GET /check and a consolidated result page on POST /check that runs the standard Collect -> Evaluate -> GetHTMLReport / ExtractMetrics pipeline. This lets a checker be used directly from a browser outside of happyDomain, with the checker itself resolving what the host would normally auto-fill (typically via its own DNS queries). Also guards NewServer against a nil Definition() so providers that advertise CheckerDefinitionProvider without a ready definition no longer panic at registration. 
--- README.md | 21 +++ checker/interactive.go | 347 ++++++++++++++++++++++++++++++++++++ checker/interactive_test.go | 245 +++++++++++++++++++++++++ checker/server.go | 49 +++-- 4 files changed, 644 insertions(+), 18 deletions(-) create mode 100644 checker/interactive.go create mode 100644 checker/interactive_test.go diff --git a/README.md b/README.md index 7f47eb8..e224fdf 100644 --- a/README.md +++ b/README.md @@ -56,6 +56,27 @@ Patterns that collide with built-in routes panic at registration — pick non-overlapping paths. Custom handlers are not wrapped by the load-tracking middleware unless you opt in via `TrackWork`. +## Standalone human UI (`/check`) + +Providers that implement `CheckerInteractive` get a built-in human-facing +web form on `/check`, usable outside of happyDomain: + +```go +type CheckerInteractive interface { + RenderForm() []CheckerOptionField + ParseForm(r *http.Request) (CheckerOptions, error) +} +``` + +- `GET /check` renders a form derived from `RenderForm()`. +- `POST /check` calls `ParseForm` to obtain `CheckerOptions`, runs the + standard `Collect` → `Evaluate` → `GetHTMLReport` / `ExtractMetrics` + pipeline, and returns a consolidated HTML page. + +`ParseForm` is where the checker replaces what happyDomain would normally +auto-fill (zone records, service payload, …) — typically by issuing its +own DNS queries from the human-supplied inputs. + ## License Apache License 2.0. See [LICENSE](LICENSE) and [NOTICE](NOTICE). diff --git a/checker/interactive.go b/checker/interactive.go new file mode 100644 index 0000000..b3473bc --- /dev/null +++ b/checker/interactive.go @@ -0,0 +1,347 @@ +// Copyright 2020-2026 The happyDomain Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package checker + +import ( + "bytes" + "encoding/json" + "fmt" + "html/template" + "log" + "net/http" + "time" +) + +// CheckerInteractive is an optional interface that observation providers +// can implement to expose a human-facing web form usable standalone, +// outside of a happyDomain host. Detect support with a type assertion: +// _, ok := provider.(CheckerInteractive). +// +// When the provider implements it, Server binds GET and POST on /check. +// GET renders an HTML form built from RenderForm(). POST calls ParseForm +// to obtain the CheckerOptions, then runs the standard pipeline +// (Collect, Evaluate, GetHTMLReport, ExtractMetrics) and renders a +// consolidated result page. +// +// Unlike /evaluate, which relies on happyDomain to fill AutoFill-backed +// options from execution context, a CheckerInteractive implementation is +// responsible for resolving whatever it needs from the human inputs +// (typically via direct DNS queries) before Collect runs. +type CheckerInteractive interface { + // RenderForm returns the fields the human must fill in to bootstrap + // a check. Typically a minimal set (domain name, nameserver to + // query, …) that ParseForm expands into the full CheckerOptions + // that Collect expects. + RenderForm() []CheckerOptionField + + // ParseForm reads the submitted form and returns the CheckerOptions + // ready to feed Collect. It is the checker's responsibility to do + // whatever lookups or resolutions are needed to populate fields + // that would normally be auto-filled by happyDomain. 
Returning an + // error causes the SDK to re-render the form with the error + // displayed. + ParseForm(r *http.Request) (CheckerOptions, error) +} + +// checkResult holds everything the result page needs to render. +type checkResult struct { + Title string + States []CheckState + Metrics []CheckMetric + ReportHTML string + CollectErr string + ReportErr string + MetricsErr string +} + +type checkFormPage struct { + Title string + Fields []CheckerOptionField + Error string +} + +func (s *Server) handleCheckForm(w http.ResponseWriter, r *http.Request) { + s.renderCheckForm(w, s.interactive.RenderForm(), "") +} + +func (s *Server) handleCheckSubmit(w http.ResponseWriter, r *http.Request) { + if err := r.ParseForm(); err != nil { + s.renderCheckForm(w, s.interactive.RenderForm(), fmt.Sprintf("invalid form: %v", err)) + return + } + + opts, err := s.interactive.ParseForm(r) + if err != nil { + s.renderCheckForm(w, s.interactive.RenderForm(), err.Error()) + return + } + + result := &checkResult{Title: s.checkPageTitle()} + + data, err := s.provider.Collect(r.Context(), opts) + if err != nil { + result.CollectErr = err.Error() + s.renderCheckResult(w, result) + return + } + + raw, err := json.Marshal(data) + if err != nil { + result.CollectErr = fmt.Sprintf("failed to marshal collected data: %v", err) + s.renderCheckResult(w, result) + return + } + + if s.definition != nil { + obs := &mapObservationGetter{data: map[ObservationKey]json.RawMessage{ + s.provider.Key(): raw, + }} + result.States = s.evaluateRules(r.Context(), obs, opts, nil) + } + + ctx := NewReportContext(raw, nil) + + if reporter, ok := s.provider.(CheckerHTMLReporter); ok { + html, rerr := reporter.GetHTMLReport(ctx) + if rerr != nil { + result.ReportErr = rerr.Error() + } else { + result.ReportHTML = html + } + } + + if reporter, ok := s.provider.(CheckerMetricsReporter); ok { + metrics, merr := reporter.ExtractMetrics(ctx, time.Now()) + if merr != nil { + result.MetricsErr = merr.Error() + } else { + 
result.Metrics = metrics + } + } + + s.renderCheckResult(w, result) +} + +func (s *Server) checkPageTitle() string { + if s.definition != nil && s.definition.Name != "" { + return s.definition.Name + } + return "Checker" +} + +func renderHTML(w http.ResponseWriter, status int, tpl *template.Template, data any) { + var buf bytes.Buffer + if err := tpl.Execute(&buf, data); err != nil { + log.Printf("render %s: %v", tpl.Name(), err) + http.Error(w, "failed to render page", http.StatusInternalServerError) + return + } + w.Header().Set("Content-Type", "text/html; charset=utf-8") + w.WriteHeader(status) + w.Write(buf.Bytes()) +} + +func (s *Server) renderCheckForm(w http.ResponseWriter, fields []CheckerOptionField, errMsg string) { + status := http.StatusOK + if errMsg != "" { + status = http.StatusBadRequest + } + renderHTML(w, status, checkFormTemplate, checkFormPage{ + Title: s.checkPageTitle(), + Fields: fields, + Error: errMsg, + }) +} + +func (s *Server) renderCheckResult(w http.ResponseWriter, result *checkResult) { + renderHTML(w, http.StatusOK, checkResultTemplate, result) +} + +func statusClass(s Status) string { + switch s { + case StatusOK: + return "ok" + case StatusInfo: + return "info" + case StatusWarn: + return "warn" + case StatusCrit: + return "crit" + case StatusError: + return "error" + default: + return "unknown" + } +} + +// defaultString avoids printing the literal "<nil>" for unset defaults. 
+func defaultString(v any) string { + if v == nil { + return "" + } + switch t := v.(type) { + case string: + return t + case bool: + if t { + return "true" + } + return "" + default: + return fmt.Sprintf("%v", v) + } +} + +func defaultBool(v any) bool { + b, _ := v.(bool) + return b +} + +var templateFuncs = template.FuncMap{ + "statusClass": statusClass, + "statusString": Status.String, + "defaultString": defaultString, + "defaultBool": defaultBool, +} + +const baseCSS = ` +body { font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, sans-serif; max-width: 960px; margin: 2rem auto; padding: 0 1rem; color: #222; } +h1, h2 { border-bottom: 1px solid #eee; padding-bottom: 0.3rem; } +form { display: grid; gap: 1rem; } +label { display: block; font-weight: 600; margin-bottom: 0.25rem; } +.required::after { content: " *"; color: #c00; } +.desc { font-weight: normal; color: #666; font-size: 0.9rem; display: block; margin-top: 0.1rem; } +input[type=text], input[type=password], input[type=number], select, textarea { + width: 100%; padding: 0.5rem; border: 1px solid #bbb; border-radius: 4px; box-sizing: border-box; font: inherit; +} +textarea { min-height: 6rem; } +button { padding: 0.6rem 1.2rem; background: #0b63c5; color: #fff; border: 0; border-radius: 4px; font: inherit; cursor: pointer; } +button:hover { background: #084c98; } +.err { background: #fee; border: 1px solid #fbb; color: #900; padding: 0.6rem 0.8rem; border-radius: 4px; margin: 1rem 0; } +table { border-collapse: collapse; width: 100%; margin: 0.5rem 0 1.5rem; } +th, td { text-align: left; padding: 0.5rem 0.6rem; border-bottom: 1px solid #eee; vertical-align: top; } +th { background: #f7f7f7; } +.badge { display: inline-block; padding: 0.15rem 0.5rem; border-radius: 3px; font-size: 0.8rem; font-weight: 600; color: #fff; } +.badge.ok { background: #2a9d3c; } +.badge.info { background: #3277cc; } +.badge.warn { background: #d08a00; } +.badge.crit { background: #c0392b; } +.badge.error { 
background: #7a1f1f; } +.badge.unknown { background: #777; } +iframe.report { width: 100%; min-height: 480px; border: 1px solid #ccc; border-radius: 4px; } +.actions { margin-top: 1.5rem; } +.actions a { color: #0b63c5; text-decoration: none; } +.actions a:hover { text-decoration: underline; } +` + +var checkFormTemplate = template.Must(template.New("form").Funcs(templateFuncs).Parse(` + + + +{{.Title}} – Check + + + +

{{.Title}}

+{{if .Error}}
{{.Error}}
{{end}} +
+{{range .Fields}}{{if not .Hide}} +
+ + {{if .Choices}} + + {{else if eq .Type "bool"}} + + {{else if .Textarea}} + + {{else if eq .Type "number"}} + + {{else if eq .Type "uint"}} + + {{else if .Secret}} + + {{else}} + + {{end}} +
+{{end}}{{end}} +
+
+ +`)) + +var checkResultTemplate = template.Must(template.New("result").Funcs(templateFuncs).Parse(` + + + +{{.Title}} – Result + + + +

{{.Title}}

+ +{{if .CollectErr}}
Collect failed: {{.CollectErr}}
{{end}} + +{{if .States}} +

Check states

+ + + + {{range .States}} + + + + + + + {{end}} + +
StatusCodeSubjectMessage
{{statusString .Status}}{{.Code}}{{.Subject}}{{.Message}}
+{{end}} + +{{if .Metrics}} +

Metrics

+ + + + {{range .Metrics}} + + + + + + + {{end}} + +
NameValueUnitLabels
{{.Name}}{{.Value}}{{.Unit}}{{range $k, $v := .Labels}}{{$k}}={{$v}} {{end}}
+{{end}} + +{{if .MetricsErr}}
Metrics error: {{.MetricsErr}}
{{end}} + +{{if .ReportHTML}} +

Report

+ +{{end}} + +{{if .ReportErr}}
Report error: {{.ReportErr}}
{{end}} + + + +`)) diff --git a/checker/interactive_test.go b/checker/interactive_test.go new file mode 100644 index 0000000..0a48512 --- /dev/null +++ b/checker/interactive_test.go @@ -0,0 +1,245 @@ +// Copyright 2020-2026 The happyDomain Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package checker + +import ( + "context" + "errors" + "net/http" + "net/http/httptest" + "net/url" + "strings" + "testing" +) + +// interactiveProvider embeds testProvider and adds CheckerInteractive. +type interactiveProvider struct { + *testProvider + fields []CheckerOptionField + parseFn func(r *http.Request) (CheckerOptions, error) + parseErr error +} + +func (p *interactiveProvider) RenderForm() []CheckerOptionField { + return p.fields +} + +func (p *interactiveProvider) ParseForm(r *http.Request) (CheckerOptions, error) { + if p.parseErr != nil { + return nil, p.parseErr + } + if p.parseFn != nil { + return p.parseFn(r) + } + return CheckerOptions{"domain": r.FormValue("domain")}, nil +} + +func postForm(handler http.Handler, path string, values url.Values) *httptest.ResponseRecorder { + req := httptest.NewRequest("POST", path, strings.NewReader(values.Encode())) + req.Header.Set("Content-Type", "application/x-www-form-urlencoded") + rec := httptest.NewRecorder() + handler.ServeHTTP(rec, req) + return rec +} + +// minimalProvider implements only ObservationProvider. 
+type minimalProvider struct{ key ObservationKey } + +func (m *minimalProvider) Key() ObservationKey { return m.key } +func (m *minimalProvider) Collect(ctx context.Context, opts CheckerOptions) (any, error) { + return nil, nil +} + +func TestCheck_NotRegistered_WhenProviderLacksInterface(t *testing.T) { + p := &minimalProvider{key: "test"} + srv := NewServer(p) + defer srv.Close() + + rec := doRequest(srv.Handler(), "GET", "/check", nil, nil) + if rec.Code != http.StatusNotFound { + t.Fatalf("GET /check without CheckerInteractive = %d, want 404", rec.Code) + } +} + +func TestCheck_Form_Renders(t *testing.T) { + p := &interactiveProvider{ + testProvider: &testProvider{key: "test"}, + fields: []CheckerOptionField{ + {Id: "domain", Type: "string", Label: "Domain name", Required: true, Placeholder: "example.com"}, + {Id: "verbose", Type: "bool", Label: "Verbose", Default: true}, + {Id: "flavor", Type: "string", Choices: []string{"a", "b"}, Default: "b"}, + {Id: "hidden", Type: "string", Hide: true}, + }, + } + srv := NewServer(p) + defer srv.Close() + + rec := doRequest(srv.Handler(), "GET", "/check", nil, nil) + if rec.Code != http.StatusOK { + t.Fatalf("GET /check = %d, want 200", rec.Code) + } + body := rec.Body.String() + for _, want := range []string{ + `name="domain"`, + `placeholder="example.com"`, + `Domain name`, + `type="checkbox"`, + `name="verbose"`, + ` checked`, + `