From da6def100c95c2ff0b839aaee295f38d0c405816 Mon Sep 17 00:00:00 2001 From: Pierre-Olivier Mercier Date: Thu, 18 Jun 2026 10:05:51 +0900 Subject: [PATCH] checker: report transient apex-lookup failures as Unknown, not Crit MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit apexLookupRule mapped every findApex failure to Crit, including transport and resolver faults like "lookup nemunai.re on 127.0.0.11:53: server misbehaving" — a flaky recursive resolver, not a broken delegation. That made the check flap into Crit whenever the resolver hiccuped, the same class of false negative the chain path already fixed. Mark apex-lookup failures that stem from a transport/resolver fault (resolveZoneNSAddrs net errors, recursiveExchange transport errors, and SERVFAIL/REFUSED seen during the SOA walk) as transient via a typed error, surface it as ApexLookupTransient, and have apexLookupRule report Unknown for those. Definitive failures (NXDOMAIN-only walk, no resolvable NS) still drive Crit. --- checker/collect.go | 21 +++++++++++++++------ checker/dns.go | 32 ++++++++++++++++++++++++++++++-- checker/dns_test.go | 18 ++++++++++++++++++ checker/rules_apex.go | 13 +++++++++++-- checker/rules_chain.go | 21 ++++++++++++++------- checker/rules_test.go | 26 ++++++++++++++++++++++++-- checker/types.go | 15 +++++++++++---- 7 files changed, 123 insertions(+), 23 deletions(-) diff --git a/checker/collect.go b/checker/collect.go index c3f2a15..f631ace 100644 --- a/checker/collect.go +++ b/checker/collect.go @@ -26,6 +26,7 @@ func (p *aliasProvider) Collect(ctx context.Context, opts sdk.CheckerOptions) (a apex, servers, err := findApex(ctx, owner, resolver) if err != nil { data.ApexLookupError = err.Error() + data.ApexLookupTransient = isTransientApexError(err) return data, nil } data.Apex = apex @@ -122,10 +123,13 @@ func (c *chainCtx) walk(ctx context.Context, name string) { q := dns.Question{Name: current, Qtype: dns.TypeCNAME, Qclass: dns.ClassINET} r, server, err := c.queryFor(ctx, currentServers, q) if err != nil { + // A query that never produced a response is a transport/resolver + // fault: we could not observe the alias, so report it as transient. c.data.ChainTerminated = ChainTermination{ - Reason: TermQueryErr, - Subject: current, - Detail: err.Error(), + Reason: TermQueryErr, + Subject: current, + Detail: err.Error(), + Transient: true, } c.data.FinalTarget = current return @@ -189,10 +193,15 @@ func (c *chainCtx) walk(ctx context.Context, name string) { // answered by the parent's auth set. zone, ns, zerr := c.reanchor(ctx, target) if zerr != nil { + // Re-anchoring fails either because the target genuinely has no + // locatable apex (definitive: the alias points into the void) or + // because a resolver/transport fault prevented observing it. Only the + // latter is transient; classify so the rule does not mask a real break. c.data.ChainTerminated = ChainTermination{ - Reason: TermQueryErr, - Subject: target, - Detail: fmt.Sprintf("re-anchor for %s failed: %v", target, zerr), + Reason: TermQueryErr, + Subject: target, + Detail: fmt.Sprintf("re-anchor for %s failed: %v", target, zerr), + Transient: isTransientApexError(zerr), } c.data.FinalTarget = target return diff --git a/checker/dns.go b/checker/dns.go index 876fde1..282c651 100644 --- a/checker/dns.go +++ b/checker/dns.go @@ -2,6 +2,7 @@ package checker import ( "context" + "errors" "fmt" "net" "strings" @@ -61,14 +62,22 @@ func hostPort(host, port string) string { func findApex(ctx context.Context, fqdn, resolver string) (apex string, servers []string, err error) { labels := dns.SplitDomainName(fqdn) + // transientSeen records whether any candidate failed for a transport or + // SERVFAIL/REFUSED reason, so a fall-through "could not locate apex" caused by + // a flaky recursive resolver is reported as transient rather than definitive. + transientSeen := false for i := range labels { candidate := dns.Fqdn(strings.Join(labels[i:], ".")) q := dns.Question{Name: candidate, Qtype: dns.TypeSOA, Qclass: dns.ClassINET} r, rerr := recursiveExchange(ctx, resolver, q) if rerr != nil { + transientSeen = true continue } if r.Rcode != dns.RcodeSuccess { + if isTransientRcode(r.Rcode) { + transientSeen = true + } continue } hasSOA := false @@ -87,14 +96,20 @@ func findApex(ctx context.Context, fqdn, resolver string) (apex string, servers apex = candidate servers, err = resolveZoneNSAddrs(ctx, apex) if err != nil { - return "", nil, err + // A resolver fault (e.g. "server misbehaving") means we could not + // observe the apex's NS, not that the delegation is broken. + return "", nil, transientApexError{err} } if len(servers) == 0 { return "", nil, fmt.Errorf("apex %s has no resolvable NS", apex) } return apex, servers, nil } - return "", nil, fmt.Errorf("could not locate apex of %s", fqdn) + err = fmt.Errorf("could not locate apex of %s", fqdn) + if transientSeen { + return "", nil, transientApexError{err} + } + return "", nil, err } func resolveZoneNSAddrs(ctx context.Context, zone string) ([]string, error) { @@ -160,6 +175,19 @@ func queryAtAuth(ctx context.Context, proto string, servers []string, q dns.Ques return nil, "", lastErr } +// transientApexError marks an apex-lookup failure that stems from a transport or +// resolver fault rather than definitive DNS evidence, so apexLookupRule can +// report it as Unknown instead of flapping the check into Crit. +type transientApexError struct{ err error } + +func (e transientApexError) Error() string { return e.err.Error() } +func (e transientApexError) Unwrap() error { return e.err } + +func isTransientApexError(err error) bool { + var t transientApexError + return errors.As(err, &t) +} + // isTransientRcode reports whether an rcode is worth retrying against another // auth server rather than treating as the zone's final answer. SERVFAIL and // REFUSED are typically per-server faults (backend down, server not yet loaded diff --git a/checker/dns_test.go b/checker/dns_test.go index 35fce22..317be19 100644 --- a/checker/dns_test.go +++ b/checker/dns_test.go @@ -2,6 +2,8 @@ package checker import ( "context" + "errors" + "fmt" "net" "testing" @@ -23,6 +25,22 @@ func TestIsTransientRcode(t *testing.T) { } } +func TestIsTransientApexError(t *testing.T) { + wrapped := transientApexError{errors.New("server misbehaving")} + if !isTransientApexError(wrapped) { + t.Errorf("transientApexError should be classified as transient") + } + if !isTransientApexError(fmt.Errorf("wrapped: %w", wrapped)) { + t.Errorf("error wrapping a transientApexError should be transient") + } + if isTransientApexError(errors.New("could not locate apex of example.com.")) { + t.Errorf("plain error should not be classified as transient") + } + if isTransientApexError(nil) { + t.Errorf("nil error should not be classified as transient") + } +} + // startTestServer spins up a UDP DNS server that answers every query with the // given handler, returning its address and a shutdown func. func startTestServer(t *testing.T, handler dns.HandlerFunc) (string, func()) { diff --git a/checker/rules_apex.go b/checker/rules_apex.go index 3ff7319..4cb1918 100644 --- a/checker/rules_apex.go +++ b/checker/rules_apex.go @@ -22,11 +22,20 @@ func (apexLookupRule) Evaluate(ctx context.Context, obs sdk.ObservationGetter, _ if data.Apex != "" { return okState(data.Apex, fmt.Sprintf("apex %s located", data.Apex)) } + // A transport/resolver fault means the apex could not be observed, not that the + // delegation is broken. Report it as Unknown so an intermittent recursive-resolver + // glitch does not flap the check into Crit; only definitive evidence drives Crit. + status := sdk.StatusCrit + hint := "Check that the parent delegation exists and that the zone is published." + if data.ApexLookupTransient { + status = sdk.StatusUnknown + hint = "The zone apex could not be observed due to a resolver/transport fault; retry and check recursive-resolver reachability." + } return []sdk.CheckState{withHint(sdk.CheckState{ - Status: sdk.StatusCrit, + Status: status, Subject: data.Owner, Message: fmt.Sprintf("could not locate zone apex: %s", data.ApexLookupError), - }, "Check that the parent delegation exists and that the zone is published.")} + }, hint)} } type cnameAtApexRule struct{} diff --git a/checker/rules_chain.go b/checker/rules_chain.go index 59ce506..4f7cba0 100644 --- a/checker/rules_chain.go +++ b/checker/rules_chain.go @@ -90,15 +90,22 @@ func (chainQueryErrorRule) Evaluate(ctx context.Context, obs sdk.ObservationGett if data.ChainTerminated.Reason != TermQueryErr { return okState(data.Owner, "all chain queries succeeded") } - // A transport failure (connection refused, timeout, network unreachable) - // means we could not observe the alias, not that the alias is broken. Report - // it as Unknown so an intermittent reachability glitch does not flap the - // check into Warn/Crit; only definitive DNS evidence drives those statuses. + // A transport failure (connection refused, timeout, network unreachable) means + // we could not observe the alias, not that it is broken: report it as Unknown so + // an intermittent reachability glitch does not flap the check into Warn/Crit. A + // non-transient failure (e.g. the target has no locatable apex) is definitive + // evidence the alias cannot be followed: report it as Warn. + status, verb := sdk.StatusWarn, "failed" + hint := "Check that the alias target exists and is delegated; the alias is unusable while the query fails." + if data.ChainTerminated.Transient { + status, verb = sdk.StatusUnknown, "could not be completed" + hint = "Check authoritative-server reachability and firewall rules; the alias state could not be determined while queries fail." + } return []sdk.CheckState{withHint(sdk.CheckState{ - Status: sdk.StatusUnknown, + Status: status, Subject: data.ChainTerminated.Subject, - Message: fmt.Sprintf("CNAME query for %s could not be completed: %s", data.ChainTerminated.Subject, data.ChainTerminated.Detail), - }, "Check authoritative-server reachability and firewall rules; the alias state could not be determined while queries fail.")} + Message: fmt.Sprintf("CNAME query for %s %s: %s", data.ChainTerminated.Subject, verb, data.ChainTerminated.Detail), + }, hint)} } type chainRcodeRule struct{} diff --git a/checker/rules_test.go b/checker/rules_test.go index c289dbb..afd087b 100644 --- a/checker/rules_test.go +++ b/checker/rules_test.go @@ -81,6 +81,19 @@ func TestApexLookupRule(t *testing.T) { t.Fatalf("want hint, got none") } }) + t.Run("transient", func(t *testing.T) { + // A resolver fault (e.g. "server misbehaving") could not observe the apex, + // so it must be Unknown rather than Crit to avoid flapping the check. + data := &AliasData{ + Owner: "nemunai.re.", + ApexLookupError: "lookup nemunai.re on 127.0.0.11:53: server misbehaving", + ApexLookupTransient: true, + } + s := assertSingle(t, run(apexLookupRule{}, data, nil), sdk.StatusUnknown) + if s.Meta[hintKey] == nil { + t.Fatalf("want hint, got none") + } + }) } func TestChainLoopRule(t *testing.T) { @@ -122,11 +135,20 @@ func TestChainQueryErrorRule(t *testing.T) { d.ChainTerminated.Reason = TermOK assertSingle(t, run(chainQueryErrorRule{}, d, nil), sdk.StatusOK) }) - t.Run("query err", func(t *testing.T) { + t.Run("transient query err", func(t *testing.T) { + // A transport fault (timeout) could not observe the alias, so it must be + // Unknown rather than Warn to avoid flapping the check. d := apexKnownData() - d.ChainTerminated = ChainTermination{Reason: TermQueryErr, Subject: "broken.example.com.", Detail: "timeout"} + d.ChainTerminated = ChainTermination{Reason: TermQueryErr, Subject: "broken.example.com.", Detail: "timeout", Transient: true} assertSingle(t, run(chainQueryErrorRule{}, d, nil), sdk.StatusUnknown) }) + t.Run("definitive query err", func(t *testing.T) { + // A non-transient failure (target has no locatable apex) is definitive + // evidence the alias cannot be followed: Warn, not Unknown. + d := apexKnownData() + d.ChainTerminated = ChainTermination{Reason: TermQueryErr, Subject: "target.example.", Detail: "re-anchor for target.example. failed: could not locate apex of target.example."} + assertSingle(t, run(chainQueryErrorRule{}, d, nil), sdk.StatusWarn) + }) } func TestChainRcodeRule(t *testing.T) { diff --git a/checker/types.go b/checker/types.go index ecfa074..44a2356 100644 --- a/checker/types.go +++ b/checker/types.go @@ -47,16 +47,23 @@ type ChainTermination struct { Subject string `json:"subject,omitempty"` Detail string `json:"detail,omitempty"` Rcode string `json:"rcode,omitempty"` // only with TermRcode + // Transient is meaningful with TermQueryErr: true when the query could not be + // completed because of a transport/resolver fault (could not observe), false + // when it stems from definitive evidence such as a target with no locatable apex. + Transient bool `json:"transient,omitempty"` } // AliasData carries raw facts only; judgement is delegated to the rules. type AliasData struct { Owner string `json:"owner"` - // Apex is empty iff the apex lookup failed; ApexLookupError explains why. - Apex string `json:"apex,omitempty"` - ApexLookupError string `json:"apex_lookup_error,omitempty"` - AuthServers []string `json:"auth_servers,omitempty"` + // Apex is empty iff the apex lookup failed; ApexLookupError explains why and + // ApexLookupTransient is true when the failure was a transport/resolver fault + // (could not observe) rather than definitive evidence the apex is missing. + Apex string `json:"apex,omitempty"` + ApexLookupError string `json:"apex_lookup_error,omitempty"` + ApexLookupTransient bool `json:"apex_lookup_transient,omitempty"` + AuthServers []string `json:"auth_servers,omitempty"` Chain []ChainHop `json:"chain,omitempty"` ChainTerminated ChainTermination `json:"chain_terminated"`