From 680a7735f0242fda47da6392c40e7d1aa9b824c6 Mon Sep 17 00:00:00 2001 From: Pierre-Olivier Mercier Date: Thu, 18 Jun 2026 09:30:01 +0900 Subject: [PATCH 1/3] checker: report chain transport errors as Unknown, not Warn A transport-level query failure (connection refused, timeout, network unreachable) means the alias state could not be observed, not that the alias is misconfigured. Mapping it to Warn made the check flap whenever a flaky auth server alternated between refusing connections (Warn) and answering SERVFAIL (Crit). Report TermQueryErr as Unknown so only definitive DNS evidence drives Warn/Crit. --- checker/rules_chain.go | 10 +++++++--- checker/rules_test.go | 2 +- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/checker/rules_chain.go b/checker/rules_chain.go index e3da350..59ce506 100644 --- a/checker/rules_chain.go +++ b/checker/rules_chain.go @@ -90,11 +90,15 @@ func (chainQueryErrorRule) Evaluate(ctx context.Context, obs sdk.ObservationGett if data.ChainTerminated.Reason != TermQueryErr { return okState(data.Owner, "all chain queries succeeded") } + // A transport failure (connection refused, timeout, network unreachable) + // means we could not observe the alias, not that the alias is broken. Report + // it as Unknown so an intermittent reachability glitch does not flap the + // check into Warn/Crit; only definitive DNS evidence drives those statuses. return []sdk.CheckState{withHint(sdk.CheckState{ - Status: sdk.StatusWarn, + Status: sdk.StatusUnknown, Subject: data.ChainTerminated.Subject, - Message: fmt.Sprintf("CNAME query for %s failed: %s", data.ChainTerminated.Subject, data.ChainTerminated.Detail), - }, "Check authoritative-server reachability and firewall rules; the alias is unusable while queries fail.")} + Message: fmt.Sprintf("CNAME query for %s could not be completed: %s", data.ChainTerminated.Subject, data.ChainTerminated.Detail), + }, "Check authoritative-server reachability and firewall rules; the alias state could not be determined while queries fail.")} } type chainRcodeRule struct{} diff --git a/checker/rules_test.go b/checker/rules_test.go index 1d2d92f..c289dbb 100644 --- a/checker/rules_test.go +++ b/checker/rules_test.go @@ -125,7 +125,7 @@ func TestChainQueryErrorRule(t *testing.T) { t.Run("query err", func(t *testing.T) { d := apexKnownData() d.ChainTerminated = ChainTermination{Reason: TermQueryErr, Subject: "broken.example.com.", Detail: "timeout"} - assertSingle(t, run(chainQueryErrorRule{}, d, nil), sdk.StatusWarn) + assertSingle(t, run(chainQueryErrorRule{}, d, nil), sdk.StatusUnknown) }) } From af0dceca6c30c3daf7a3a3d3a9a6b1778ec680db Mon Sep 17 00:00:00 2001 From: Pierre-Olivier Mercier Date: Thu, 18 Jun 2026 09:30:56 +0900 Subject: [PATCH 2/3] checker: fail over to other auth servers on SERVFAIL/REFUSED queryAtAuth already failed over on transport errors but treated any DNS response as final, so a SERVFAIL from the first auth server terminated the chain as Crit even when a sibling server would answer NOERROR. This made the check flap against a flaky server. Treat SERVFAIL/REFUSED as transient and try the remaining servers, returning a definitive answer when any server gives one and only falling back to the transient response (or the last transport error) when every server fails. --- checker/dns.go | 26 ++++++++++++- checker/dns_test.go | 91 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 115 insertions(+), 2 deletions(-) create mode 100644 checker/dns_test.go diff --git a/checker/dns.go b/checker/dns.go index 7270da1..876fde1 100644 --- a/checker/dns.go +++ b/checker/dns.go @@ -128,24 +128,46 @@ func resolveZoneNSAddrs(ctx context.Context, zone string) ([]string, error) { return out, nil } -// queryAtAuth tries each server in order and returns the first usable answer. -// dnssec=true sets the DO bit; only the DNSSEC probes need it. +// queryAtAuth tries each server in order and returns the first definitive +// answer. Transport errors and transient failures (SERVFAIL/REFUSED) make it +// fail over to the next server so a single flaky auth server cannot decide the +// verdict; a definitive response (NOERROR/NXDOMAIN/...) is returned at once. +// If every server fails it returns the last transient response when there was +// one (so callers can still inspect the rcode), otherwise the last transport +// error. dnssec=true sets the DO bit; only the DNSSEC probes need it. func queryAtAuth(ctx context.Context, proto string, servers []string, q dns.Question, dnssec bool) (*dns.Msg, string, error) { var lastErr error + var transientMsg *dns.Msg + var transientServer string for _, s := range servers { r, err := dnsExchange(ctx, proto, s, q, false, dnssec) if err != nil { lastErr = err continue } + if isTransientRcode(r.Rcode) { + transientMsg, transientServer = r, s + continue + } return r, s, nil } + if transientMsg != nil { + return transientMsg, transientServer, nil + } if lastErr == nil { lastErr = fmt.Errorf("no servers provided") } return nil, "", lastErr } +// isTransientRcode reports whether an rcode is worth retrying against another +// auth server rather than treating as the zone's final answer. SERVFAIL and +// REFUSED are typically per-server faults (backend down, server not yet loaded +// the zone), unlike NXDOMAIN which is an authoritative negative answer. +func isTransientRcode(rcode int) bool { + return rcode == dns.RcodeServerFailure || rcode == dns.RcodeRefused +} + func rcodeText(r int) string { if s, ok := dns.RcodeToString[r]; ok { return s diff --git a/checker/dns_test.go b/checker/dns_test.go new file mode 100644 index 0000000..35fce22 --- /dev/null +++ b/checker/dns_test.go @@ -0,0 +1,91 @@ +package checker + +import ( + "context" + "net" + "testing" + + "github.com/miekg/dns" +) + +func TestIsTransientRcode(t *testing.T) { + transient := []int{dns.RcodeServerFailure, dns.RcodeRefused} + for _, rc := range transient { + if !isTransientRcode(rc) { + t.Errorf("rcode %s should be transient", rcodeText(rc)) + } + } + final := []int{dns.RcodeSuccess, dns.RcodeNameError, dns.RcodeNotImplemented} + for _, rc := range final { + if isTransientRcode(rc) { + t.Errorf("rcode %s should not be transient", rcodeText(rc)) + } + } +} + +// startTestServer spins up a UDP DNS server that answers every query with the +// given handler, returning its address and a shutdown func. +func startTestServer(t *testing.T, handler dns.HandlerFunc) (string, func()) { + t.Helper() + mux := dns.NewServeMux() + mux.HandleFunc(".", handler) + pc, err := net.ListenPacket("udp", "127.0.0.1:0") + if err != nil { + t.Fatalf("listen: %v", err) + } + srv := &dns.Server{PacketConn: pc, Handler: mux} + go srv.ActivateAndServe() + return pc.LocalAddr().String(), func() { srv.Shutdown() } +} + +func answerWith(rcode int, withAnswer bool) dns.HandlerFunc { + return func(w dns.ResponseWriter, r *dns.Msg) { + m := new(dns.Msg) + m.SetReply(r) + m.Rcode = rcode + if withAnswer && len(r.Question) > 0 { + rr, _ := dns.NewRR(r.Question[0].Name + " 300 IN CNAME target.example.") + if rr != nil { + m.Answer = append(m.Answer, rr) + } + } + w.WriteMsg(m) + } +} + +func TestQueryAtAuthFailsOverTransientRcode(t *testing.T) { + q := dns.Question{Name: "example.com.", Qtype: dns.TypeCNAME, Qclass: dns.ClassINET} + + t.Run("prefers definitive answer over SERVFAIL", func(t *testing.T) { + bad, stopBad := startTestServer(t, answerWith(dns.RcodeServerFailure, false)) + defer stopBad() + good, stopGood := startTestServer(t, answerWith(dns.RcodeSuccess, true)) + defer stopGood() + + r, server, err := queryAtAuth(context.Background(), "", []string{bad, good}, q, false) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if r.Rcode != dns.RcodeSuccess { + t.Fatalf("got rcode %s, want NOERROR", rcodeText(r.Rcode)) + } + if server != good { + t.Fatalf("answered by %s, want the healthy server %s", server, good) + } + }) + + t.Run("returns transient response when every server fails", func(t *testing.T) { + s1, stop1 := startTestServer(t, answerWith(dns.RcodeServerFailure, false)) + defer stop1() + s2, stop2 := startTestServer(t, answerWith(dns.RcodeRefused, false)) + defer stop2() + + r, _, err := queryAtAuth(context.Background(), "", []string{s1, s2}, q, false) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if !isTransientRcode(r.Rcode) { + t.Fatalf("got rcode %s, want a transient rcode preserved", rcodeText(r.Rcode)) + } + }) +} From da6def100c95c2ff0b839aaee295f38d0c405816 Mon Sep 17 00:00:00 2001 From: Pierre-Olivier Mercier Date: Thu, 18 Jun 2026 10:05:51 +0900 Subject: [PATCH 3/3] checker: report transient apex-lookup failures as Unknown, not Crit MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit apexLookupRule mapped every findApex failure to Crit, including transport and resolver faults like "lookup nemunai.re on 127.0.0.11:53: server misbehaving" — a flaky recursive resolver, not a broken delegation. That made the check flap into Crit whenever the resolver hiccuped, the same class of false negative the chain path already fixed. Mark apex-lookup failures that stem from a transport/resolver fault (resolveZoneNSAddrs net errors, recursiveExchange transport errors, and SERVFAIL/REFUSED seen during the SOA walk) as transient via a typed error, surface it as ApexLookupTransient, and have apexLookupRule report Unknown for those. Definitive failures (NXDOMAIN-only walk, no resolvable NS) still drive Crit. --- checker/collect.go | 21 +++++++++++++++------ checker/dns.go | 32 ++++++++++++++++++++++++++++++-- checker/dns_test.go | 18 ++++++++++++++++++ checker/rules_apex.go | 13 +++++++++++-- checker/rules_chain.go | 21 ++++++++++++++------- checker/rules_test.go | 26 ++++++++++++++++++++++++-- checker/types.go | 15 +++++++++++---- 7 files changed, 123 insertions(+), 23 deletions(-) diff --git a/checker/collect.go b/checker/collect.go index c3f2a15..f631ace 100644 --- a/checker/collect.go +++ b/checker/collect.go @@ -26,6 +26,7 @@ func (p *aliasProvider) Collect(ctx context.Context, opts sdk.CheckerOptions) (a apex, servers, err := findApex(ctx, owner, resolver) if err != nil { data.ApexLookupError = err.Error() + data.ApexLookupTransient = isTransientApexError(err) return data, nil } data.Apex = apex @@ -122,10 +123,13 @@ func (c *chainCtx) walk(ctx context.Context, name string) { q := dns.Question{Name: current, Qtype: dns.TypeCNAME, Qclass: dns.ClassINET} r, server, err := c.queryFor(ctx, currentServers, q) if err != nil { + // A query that never produced a response is a transport/resolver + // fault: we could not observe the alias, so report it as transient. c.data.ChainTerminated = ChainTermination{ - Reason: TermQueryErr, - Subject: current, - Detail: err.Error(), + Reason: TermQueryErr, + Subject: current, + Detail: err.Error(), + Transient: true, } c.data.FinalTarget = current return @@ -189,10 +193,15 @@ func (c *chainCtx) walk(ctx context.Context, name string) { // answered by the parent's auth set. zone, ns, zerr := c.reanchor(ctx, target) if zerr != nil { + // Re-anchoring fails either because the target genuinely has no + // locatable apex (definitive: the alias points into the void) or + // because a resolver/transport fault prevented observing it. Only the + // latter is transient; classify so the rule does not mask a real break. c.data.ChainTerminated = ChainTermination{ - Reason: TermQueryErr, - Subject: target, - Detail: fmt.Sprintf("re-anchor for %s failed: %v", target, zerr), + Reason: TermQueryErr, + Subject: target, + Detail: fmt.Sprintf("re-anchor for %s failed: %v", target, zerr), + Transient: isTransientApexError(zerr), } c.data.FinalTarget = target return diff --git a/checker/dns.go b/checker/dns.go index 876fde1..282c651 100644 --- a/checker/dns.go +++ b/checker/dns.go @@ -2,6 +2,7 @@ package checker import ( "context" + "errors" "fmt" "net" "strings" @@ -61,14 +62,22 @@ func hostPort(host, port string) string { func findApex(ctx context.Context, fqdn, resolver string) (apex string, servers []string, err error) { labels := dns.SplitDomainName(fqdn) + // transientSeen records whether any candidate failed for a transport or + // SERVFAIL/REFUSED reason, so a fall-through "could not locate apex" caused by + // a flaky recursive resolver is reported as transient rather than definitive. + transientSeen := false for i := range labels { candidate := dns.Fqdn(strings.Join(labels[i:], ".")) q := dns.Question{Name: candidate, Qtype: dns.TypeSOA, Qclass: dns.ClassINET} r, rerr := recursiveExchange(ctx, resolver, q) if rerr != nil { + transientSeen = true continue } if r.Rcode != dns.RcodeSuccess { + if isTransientRcode(r.Rcode) { + transientSeen = true + } continue } hasSOA := false @@ -87,14 +96,20 @@ func findApex(ctx context.Context, fqdn, resolver string) (apex string, servers apex = candidate servers, err = resolveZoneNSAddrs(ctx, apex) if err != nil { - return "", nil, err + // A resolver fault (e.g. "server misbehaving") means we could not + // observe the apex's NS, not that the delegation is broken. + return "", nil, transientApexError{err} } if len(servers) == 0 { return "", nil, fmt.Errorf("apex %s has no resolvable NS", apex) } return apex, servers, nil } - return "", nil, fmt.Errorf("could not locate apex of %s", fqdn) + err = fmt.Errorf("could not locate apex of %s", fqdn) + if transientSeen { + return "", nil, transientApexError{err} + } + return "", nil, err } func resolveZoneNSAddrs(ctx context.Context, zone string) ([]string, error) { @@ -160,6 +175,19 @@ func queryAtAuth(ctx context.Context, proto string, servers []string, q dns.Ques return nil, "", lastErr } +// transientApexError marks an apex-lookup failure that stems from a transport or +// resolver fault rather than definitive DNS evidence, so apexLookupRule can +// report it as Unknown instead of flapping the check into Crit. +type transientApexError struct{ err error } + +func (e transientApexError) Error() string { return e.err.Error() } +func (e transientApexError) Unwrap() error { return e.err } + +func isTransientApexError(err error) bool { + var t transientApexError + return errors.As(err, &t) +} + // isTransientRcode reports whether an rcode is worth retrying against another // auth server rather than treating as the zone's final answer. SERVFAIL and // REFUSED are typically per-server faults (backend down, server not yet loaded diff --git a/checker/dns_test.go b/checker/dns_test.go index 35fce22..317be19 100644 --- a/checker/dns_test.go +++ b/checker/dns_test.go @@ -2,6 +2,8 @@ package checker import ( "context" + "errors" + "fmt" "net" "testing" @@ -23,6 +25,22 @@ func TestIsTransientRcode(t *testing.T) { } } +func TestIsTransientApexError(t *testing.T) { + wrapped := transientApexError{errors.New("server misbehaving")} + if !isTransientApexError(wrapped) { + t.Errorf("transientApexError should be classified as transient") + } + if !isTransientApexError(fmt.Errorf("wrapped: %w", wrapped)) { + t.Errorf("error wrapping a transientApexError should be transient") + } + if isTransientApexError(errors.New("could not locate apex of example.com.")) { + t.Errorf("plain error should not be classified as transient") + } + if isTransientApexError(nil) { + t.Errorf("nil error should not be classified as transient") + } +} + // startTestServer spins up a UDP DNS server that answers every query with the // given handler, returning its address and a shutdown func. func startTestServer(t *testing.T, handler dns.HandlerFunc) (string, func()) { diff --git a/checker/rules_apex.go b/checker/rules_apex.go index 3ff7319..4cb1918 100644 --- a/checker/rules_apex.go +++ b/checker/rules_apex.go @@ -22,11 +22,20 @@ func (apexLookupRule) Evaluate(ctx context.Context, obs sdk.ObservationGetter, _ if data.Apex != "" { return okState(data.Apex, fmt.Sprintf("apex %s located", data.Apex)) } + // A transport/resolver fault means the apex could not be observed, not that the + // delegation is broken. Report it as Unknown so an intermittent recursive-resolver + // glitch does not flap the check into Crit; only definitive evidence drives Crit. + status := sdk.StatusCrit + hint := "Check that the parent delegation exists and that the zone is published." + if data.ApexLookupTransient { + status = sdk.StatusUnknown + hint = "The zone apex could not be observed due to a resolver/transport fault; retry and check recursive-resolver reachability." + } return []sdk.CheckState{withHint(sdk.CheckState{ - Status: sdk.StatusCrit, + Status: status, Subject: data.Owner, Message: fmt.Sprintf("could not locate zone apex: %s", data.ApexLookupError), - }, "Check that the parent delegation exists and that the zone is published.")} + }, hint)} } type cnameAtApexRule struct{} diff --git a/checker/rules_chain.go b/checker/rules_chain.go index 59ce506..4f7cba0 100644 --- a/checker/rules_chain.go +++ b/checker/rules_chain.go @@ -90,15 +90,22 @@ func (chainQueryErrorRule) Evaluate(ctx context.Context, obs sdk.ObservationGett if data.ChainTerminated.Reason != TermQueryErr { return okState(data.Owner, "all chain queries succeeded") } - // A transport failure (connection refused, timeout, network unreachable) - // means we could not observe the alias, not that the alias is broken. Report - // it as Unknown so an intermittent reachability glitch does not flap the - // check into Warn/Crit; only definitive DNS evidence drives those statuses. + // A transport failure (connection refused, timeout, network unreachable) means + // we could not observe the alias, not that it is broken: report it as Unknown so + // an intermittent reachability glitch does not flap the check into Warn/Crit. A + // non-transient failure (e.g. the target has no locatable apex) is definitive + // evidence the alias cannot be followed: report it as Warn. + status, verb := sdk.StatusWarn, "failed" + hint := "Check that the alias target exists and is delegated; the alias is unusable while the query fails." + if data.ChainTerminated.Transient { + status, verb = sdk.StatusUnknown, "could not be completed" + hint = "Check authoritative-server reachability and firewall rules; the alias state could not be determined while queries fail." + } return []sdk.CheckState{withHint(sdk.CheckState{ - Status: sdk.StatusUnknown, + Status: status, Subject: data.ChainTerminated.Subject, - Message: fmt.Sprintf("CNAME query for %s could not be completed: %s", data.ChainTerminated.Subject, data.ChainTerminated.Detail), - }, "Check authoritative-server reachability and firewall rules; the alias state could not be determined while queries fail.")} + Message: fmt.Sprintf("CNAME query for %s %s: %s", data.ChainTerminated.Subject, verb, data.ChainTerminated.Detail), + }, hint)} } type chainRcodeRule struct{} diff --git a/checker/rules_test.go b/checker/rules_test.go index c289dbb..afd087b 100644 --- a/checker/rules_test.go +++ b/checker/rules_test.go @@ -81,6 +81,19 @@ func TestApexLookupRule(t *testing.T) { t.Fatalf("want hint, got none") } }) + t.Run("transient", func(t *testing.T) { + // A resolver fault (e.g. "server misbehaving") could not observe the apex, + // so it must be Unknown rather than Crit to avoid flapping the check. + data := &AliasData{ + Owner: "nemunai.re.", + ApexLookupError: "lookup nemunai.re on 127.0.0.11:53: server misbehaving", + ApexLookupTransient: true, + } + s := assertSingle(t, run(apexLookupRule{}, data, nil), sdk.StatusUnknown) + if s.Meta[hintKey] == nil { + t.Fatalf("want hint, got none") + } + }) } func TestChainLoopRule(t *testing.T) { @@ -122,11 +135,20 @@ func TestChainQueryErrorRule(t *testing.T) { d.ChainTerminated.Reason = TermOK assertSingle(t, run(chainQueryErrorRule{}, d, nil), sdk.StatusOK) }) - t.Run("query err", func(t *testing.T) { + t.Run("transient query err", func(t *testing.T) { + // A transport fault (timeout) could not observe the alias, so it must be + // Unknown rather than Warn to avoid flapping the check. d := apexKnownData() - d.ChainTerminated = ChainTermination{Reason: TermQueryErr, Subject: "broken.example.com.", Detail: "timeout"} + d.ChainTerminated = ChainTermination{Reason: TermQueryErr, Subject: "broken.example.com.", Detail: "timeout", Transient: true} assertSingle(t, run(chainQueryErrorRule{}, d, nil), sdk.StatusUnknown) }) + t.Run("definitive query err", func(t *testing.T) { + // A non-transient failure (target has no locatable apex) is definitive + // evidence the alias cannot be followed: Warn, not Unknown. + d := apexKnownData() + d.ChainTerminated = ChainTermination{Reason: TermQueryErr, Subject: "target.example.", Detail: "re-anchor for target.example. failed: could not locate apex of target.example."} + assertSingle(t, run(chainQueryErrorRule{}, d, nil), sdk.StatusWarn) + }) } func TestChainRcodeRule(t *testing.T) { diff --git a/checker/types.go b/checker/types.go index ecfa074..44a2356 100644 --- a/checker/types.go +++ b/checker/types.go @@ -47,16 +47,23 @@ type ChainTermination struct { Subject string `json:"subject,omitempty"` Detail string `json:"detail,omitempty"` Rcode string `json:"rcode,omitempty"` // only with TermRcode + // Transient is meaningful with TermQueryErr: true when the query could not be + // completed because of a transport/resolver fault (could not observe), false + // when it stems from definitive evidence such as a target with no locatable apex. + Transient bool `json:"transient,omitempty"` } // AliasData carries raw facts only; judgement is delegated to the rules. type AliasData struct { Owner string `json:"owner"` - // Apex is empty iff the apex lookup failed; ApexLookupError explains why. - Apex string `json:"apex,omitempty"` - ApexLookupError string `json:"apex_lookup_error,omitempty"` - AuthServers []string `json:"auth_servers,omitempty"` + // Apex is empty iff the apex lookup failed; ApexLookupError explains why and + // ApexLookupTransient is true when the failure was a transport/resolver fault + // (could not observe) rather than definitive evidence the apex is missing. + Apex string `json:"apex,omitempty"` + ApexLookupError string `json:"apex_lookup_error,omitempty"` + ApexLookupTransient bool `json:"apex_lookup_transient,omitempty"` + AuthServers []string `json:"auth_servers,omitempty"` Chain []ChainHop `json:"chain,omitempty"` ChainTerminated ChainTermination `json:"chain_terminated"`