diff --git a/checker/collect.go b/checker/collect.go index c3f2a15..f631ace 100644 --- a/checker/collect.go +++ b/checker/collect.go @@ -26,6 +26,7 @@ func (p *aliasProvider) Collect(ctx context.Context, opts sdk.CheckerOptions) (a apex, servers, err := findApex(ctx, owner, resolver) if err != nil { data.ApexLookupError = err.Error() + data.ApexLookupTransient = isTransientApexError(err) return data, nil } data.Apex = apex @@ -122,10 +123,13 @@ func (c *chainCtx) walk(ctx context.Context, name string) { q := dns.Question{Name: current, Qtype: dns.TypeCNAME, Qclass: dns.ClassINET} r, server, err := c.queryFor(ctx, currentServers, q) if err != nil { + // A query that never produced a response is a transport/resolver + // fault: we could not observe the alias, so report it as transient. c.data.ChainTerminated = ChainTermination{ - Reason: TermQueryErr, - Subject: current, - Detail: err.Error(), + Reason: TermQueryErr, + Subject: current, + Detail: err.Error(), + Transient: true, } c.data.FinalTarget = current return @@ -189,10 +193,15 @@ func (c *chainCtx) walk(ctx context.Context, name string) { // answered by the parent's auth set. zone, ns, zerr := c.reanchor(ctx, target) if zerr != nil { + // Re-anchoring fails either because the target genuinely has no + // locatable apex (definitive: the alias points into the void) or + // because a resolver/transport fault prevented observing it. Only the + // latter is transient; classify so the rule does not mask a real break. c.data.ChainTerminated = ChainTermination{ - Reason: TermQueryErr, - Subject: target, - Detail: fmt.Sprintf("re-anchor for %s failed: %v", target, zerr), + Reason: TermQueryErr, + Subject: target, + Detail: fmt.Sprintf("re-anchor for %s failed: %v", target, zerr), + Transient: isTransientApexError(zerr), } c.data.FinalTarget = target return diff --git a/checker/dns.go b/checker/dns.go index 7270da1..282c651 100644 --- a/checker/dns.go +++ b/checker/dns.go @@ -2,6 +2,7 @@ package checker import ( "context" + "errors" "fmt" "net" "strings" @@ -61,14 +62,22 @@ func hostPort(host, port string) string { func findApex(ctx context.Context, fqdn, resolver string) (apex string, servers []string, err error) { labels := dns.SplitDomainName(fqdn) + // transientSeen records whether any candidate failed for a transport or + // SERVFAIL/REFUSED reason, so a fall-through "could not locate apex" caused by + // a flaky recursive resolver is reported as transient rather than definitive. + transientSeen := false for i := range labels { candidate := dns.Fqdn(strings.Join(labels[i:], ".")) q := dns.Question{Name: candidate, Qtype: dns.TypeSOA, Qclass: dns.ClassINET} r, rerr := recursiveExchange(ctx, resolver, q) if rerr != nil { + transientSeen = true continue } if r.Rcode != dns.RcodeSuccess { + if isTransientRcode(r.Rcode) { + transientSeen = true + } continue } hasSOA := false @@ -87,14 +96,20 @@ func findApex(ctx context.Context, fqdn, resolver string) (apex string, servers apex = candidate servers, err = resolveZoneNSAddrs(ctx, apex) if err != nil { - return "", nil, err + // A resolver fault (e.g. "server misbehaving") means we could not + // observe the apex's NS, not that the delegation is broken. + return "", nil, transientApexError{err} } if len(servers) == 0 { return "", nil, fmt.Errorf("apex %s has no resolvable NS", apex) } return apex, servers, nil } - return "", nil, fmt.Errorf("could not locate apex of %s", fqdn) + err = fmt.Errorf("could not locate apex of %s", fqdn) + if transientSeen { + return "", nil, transientApexError{err} + } + return "", nil, err } func resolveZoneNSAddrs(ctx context.Context, zone string) ([]string, error) { @@ -128,24 +143,59 @@ func resolveZoneNSAddrs(ctx context.Context, zone string) ([]string, error) { return out, nil } -// queryAtAuth tries each server in order and returns the first usable answer. -// dnssec=true sets the DO bit; only the DNSSEC probes need it. +// queryAtAuth tries each server in order and returns the first definitive +// answer. Transport errors and transient failures (SERVFAIL/REFUSED) make it +// fail over to the next server so a single flaky auth server cannot decide the +// verdict; a definitive response (NOERROR/NXDOMAIN/...) is returned at once. +// If every server fails it returns the last transient response when there was +// one (so callers can still inspect the rcode), otherwise the last transport +// error. dnssec=true sets the DO bit; only the DNSSEC probes need it. func queryAtAuth(ctx context.Context, proto string, servers []string, q dns.Question, dnssec bool) (*dns.Msg, string, error) { var lastErr error + var transientMsg *dns.Msg + var transientServer string for _, s := range servers { r, err := dnsExchange(ctx, proto, s, q, false, dnssec) if err != nil { lastErr = err continue } + if isTransientRcode(r.Rcode) { + transientMsg, transientServer = r, s + continue + } return r, s, nil } + if transientMsg != nil { + return transientMsg, transientServer, nil + } if lastErr == nil { lastErr = fmt.Errorf("no servers provided") } return nil, "", lastErr } +// transientApexError marks an apex-lookup failure that stems from a transport or +// resolver fault rather than definitive DNS evidence, so apexLookupRule can +// report it as Unknown instead of flapping the check into Crit. +type transientApexError struct{ err error } + +func (e transientApexError) Error() string { return e.err.Error() } +func (e transientApexError) Unwrap() error { return e.err } + +func isTransientApexError(err error) bool { + var t transientApexError + return errors.As(err, &t) +} + +// isTransientRcode reports whether an rcode is worth retrying against another +// auth server rather than treating as the zone's final answer. SERVFAIL and +// REFUSED are typically per-server faults (backend down, server not yet loaded +// the zone), unlike NXDOMAIN which is an authoritative negative answer. +func isTransientRcode(rcode int) bool { + return rcode == dns.RcodeServerFailure || rcode == dns.RcodeRefused +} + func rcodeText(r int) string { if s, ok := dns.RcodeToString[r]; ok { return s diff --git a/checker/dns_test.go b/checker/dns_test.go new file mode 100644 index 0000000..317be19 --- /dev/null +++ b/checker/dns_test.go @@ -0,0 +1,109 @@ +package checker + +import ( + "context" + "errors" + "fmt" + "net" + "testing" + + "github.com/miekg/dns" +) + +func TestIsTransientRcode(t *testing.T) { + transient := []int{dns.RcodeServerFailure, dns.RcodeRefused} + for _, rc := range transient { + if !isTransientRcode(rc) { + t.Errorf("rcode %s should be transient", rcodeText(rc)) + } + } + final := []int{dns.RcodeSuccess, dns.RcodeNameError, dns.RcodeNotImplemented} + for _, rc := range final { + if isTransientRcode(rc) { + t.Errorf("rcode %s should not be transient", rcodeText(rc)) + } + } +} + +func TestIsTransientApexError(t *testing.T) { + wrapped := transientApexError{errors.New("server misbehaving")} + if !isTransientApexError(wrapped) { + t.Errorf("transientApexError should be classified as transient") + } + if !isTransientApexError(fmt.Errorf("wrapped: %w", wrapped)) { + t.Errorf("error wrapping a transientApexError should be transient") + } + if isTransientApexError(errors.New("could not locate apex of example.com.")) { + t.Errorf("plain error should not be classified as transient") + } + if isTransientApexError(nil) { + t.Errorf("nil error should not be classified as transient") + } +} + +// startTestServer spins up a UDP DNS server that answers every query with the +// given handler, returning its address and a shutdown func. +func startTestServer(t *testing.T, handler dns.HandlerFunc) (string, func()) { + t.Helper() + mux := dns.NewServeMux() + mux.HandleFunc(".", handler) + pc, err := net.ListenPacket("udp", "127.0.0.1:0") + if err != nil { + t.Fatalf("listen: %v", err) + } + srv := &dns.Server{PacketConn: pc, Handler: mux} + go srv.ActivateAndServe() + return pc.LocalAddr().String(), func() { srv.Shutdown() } +} + +func answerWith(rcode int, withAnswer bool) dns.HandlerFunc { + return func(w dns.ResponseWriter, r *dns.Msg) { + m := new(dns.Msg) + m.SetReply(r) + m.Rcode = rcode + if withAnswer && len(r.Question) > 0 { + rr, _ := dns.NewRR(r.Question[0].Name + " 300 IN CNAME target.example.") + if rr != nil { + m.Answer = append(m.Answer, rr) + } + } + w.WriteMsg(m) + } +} + +func TestQueryAtAuthFailsOverTransientRcode(t *testing.T) { + q := dns.Question{Name: "example.com.", Qtype: dns.TypeCNAME, Qclass: dns.ClassINET} + + t.Run("prefers definitive answer over SERVFAIL", func(t *testing.T) { + bad, stopBad := startTestServer(t, answerWith(dns.RcodeServerFailure, false)) + defer stopBad() + good, stopGood := startTestServer(t, answerWith(dns.RcodeSuccess, true)) + defer stopGood() + + r, server, err := queryAtAuth(context.Background(), "", []string{bad, good}, q, false) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if r.Rcode != dns.RcodeSuccess { + t.Fatalf("got rcode %s, want NOERROR", rcodeText(r.Rcode)) + } + if server != good { + t.Fatalf("answered by %s, want the healthy server %s", server, good) + } + }) + + t.Run("returns transient response when every server fails", func(t *testing.T) { + s1, stop1 := startTestServer(t, answerWith(dns.RcodeServerFailure, false)) + defer stop1() + s2, stop2 := startTestServer(t, answerWith(dns.RcodeRefused, false)) + defer stop2() + + r, _, err := queryAtAuth(context.Background(), "", []string{s1, s2}, q, false) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if !isTransientRcode(r.Rcode) { + t.Fatalf("got rcode %s, want a transient rcode preserved", rcodeText(r.Rcode)) + } + }) +} diff --git a/checker/rules_apex.go b/checker/rules_apex.go index 3ff7319..4cb1918 100644 --- a/checker/rules_apex.go +++ b/checker/rules_apex.go @@ -22,11 +22,20 @@ func (apexLookupRule) Evaluate(ctx context.Context, obs sdk.ObservationGetter, _ if data.Apex != "" { return okState(data.Apex, fmt.Sprintf("apex %s located", data.Apex)) } + // A transport/resolver fault means the apex could not be observed, not that the + // delegation is broken. Report it as Unknown so an intermittent recursive-resolver + // glitch does not flap the check into Crit; only definitive evidence drives Crit. + status := sdk.StatusCrit + hint := "Check that the parent delegation exists and that the zone is published." + if data.ApexLookupTransient { + status = sdk.StatusUnknown + hint = "The zone apex could not be observed due to a resolver/transport fault; retry and check recursive-resolver reachability." + } return []sdk.CheckState{withHint(sdk.CheckState{ - Status: sdk.StatusCrit, + Status: status, Subject: data.Owner, Message: fmt.Sprintf("could not locate zone apex: %s", data.ApexLookupError), - }, "Check that the parent delegation exists and that the zone is published.")} + }, hint)} } type cnameAtApexRule struct{} diff --git a/checker/rules_chain.go b/checker/rules_chain.go index e3da350..4f7cba0 100644 --- a/checker/rules_chain.go +++ b/checker/rules_chain.go @@ -90,11 +90,22 @@ func (chainQueryErrorRule) Evaluate(ctx context.Context, obs sdk.ObservationGett if data.ChainTerminated.Reason != TermQueryErr { return okState(data.Owner, "all chain queries succeeded") } + // A transport failure (connection refused, timeout, network unreachable) means + // we could not observe the alias, not that it is broken: report it as Unknown so + // an intermittent reachability glitch does not flap the check into Warn/Crit. A + // non-transient failure (e.g. the target has no locatable apex) is definitive + // evidence the alias cannot be followed: report it as Warn. + status, verb := sdk.StatusWarn, "failed" + hint := "Check that the alias target exists and is delegated; the alias is unusable while the query fails." + if data.ChainTerminated.Transient { + status, verb = sdk.StatusUnknown, "could not be completed" + hint = "Check authoritative-server reachability and firewall rules; the alias state could not be determined while queries fail." + } return []sdk.CheckState{withHint(sdk.CheckState{ - Status: sdk.StatusWarn, + Status: status, Subject: data.ChainTerminated.Subject, - Message: fmt.Sprintf("CNAME query for %s failed: %s", data.ChainTerminated.Subject, data.ChainTerminated.Detail), - }, "Check authoritative-server reachability and firewall rules; the alias is unusable while queries fail.")} + Message: fmt.Sprintf("CNAME query for %s %s: %s", data.ChainTerminated.Subject, verb, data.ChainTerminated.Detail), + }, hint)} } type chainRcodeRule struct{} diff --git a/checker/rules_test.go b/checker/rules_test.go index 1d2d92f..afd087b 100644 --- a/checker/rules_test.go +++ b/checker/rules_test.go @@ -81,6 +81,19 @@ func TestApexLookupRule(t *testing.T) { t.Fatalf("want hint, got none") } }) + t.Run("transient", func(t *testing.T) { + // A resolver fault (e.g. "server misbehaving") could not observe the apex, + // so it must be Unknown rather than Crit to avoid flapping the check. + data := &AliasData{ + Owner: "nemunai.re.", + ApexLookupError: "lookup nemunai.re on 127.0.0.11:53: server misbehaving", + ApexLookupTransient: true, + } + s := assertSingle(t, run(apexLookupRule{}, data, nil), sdk.StatusUnknown) + if s.Meta[hintKey] == nil { + t.Fatalf("want hint, got none") + } + }) } func TestChainLoopRule(t *testing.T) { @@ -122,9 +135,18 @@ func TestChainQueryErrorRule(t *testing.T) { d.ChainTerminated.Reason = TermOK assertSingle(t, run(chainQueryErrorRule{}, d, nil), sdk.StatusOK) }) - t.Run("query err", func(t *testing.T) { + t.Run("transient query err", func(t *testing.T) { + // A transport fault (timeout) could not observe the alias, so it must be + // Unknown rather than Warn to avoid flapping the check. d := apexKnownData() - d.ChainTerminated = ChainTermination{Reason: TermQueryErr, Subject: "broken.example.com.", Detail: "timeout"} + d.ChainTerminated = ChainTermination{Reason: TermQueryErr, Subject: "broken.example.com.", Detail: "timeout", Transient: true} + assertSingle(t, run(chainQueryErrorRule{}, d, nil), sdk.StatusUnknown) + }) + t.Run("definitive query err", func(t *testing.T) { + // A non-transient failure (target has no locatable apex) is definitive + // evidence the alias cannot be followed: Warn, not Unknown. + d := apexKnownData() + d.ChainTerminated = ChainTermination{Reason: TermQueryErr, Subject: "target.example.", Detail: "re-anchor for target.example. failed: could not locate apex of target.example."} assertSingle(t, run(chainQueryErrorRule{}, d, nil), sdk.StatusWarn) }) } diff --git a/checker/types.go b/checker/types.go index ecfa074..44a2356 100644 --- a/checker/types.go +++ b/checker/types.go @@ -47,16 +47,23 @@ type ChainTermination struct { Subject string `json:"subject,omitempty"` Detail string `json:"detail,omitempty"` Rcode string `json:"rcode,omitempty"` // only with TermRcode + // Transient is meaningful with TermQueryErr: true when the query could not be + // completed because of a transport/resolver fault (could not observe), false + // when it stems from definitive evidence such as a target with no locatable apex. + Transient bool `json:"transient,omitempty"` } // AliasData carries raw facts only; judgement is delegated to the rules. type AliasData struct { Owner string `json:"owner"` - // Apex is empty iff the apex lookup failed; ApexLookupError explains why. - Apex string `json:"apex,omitempty"` - ApexLookupError string `json:"apex_lookup_error,omitempty"` - AuthServers []string `json:"auth_servers,omitempty"` + // Apex is empty iff the apex lookup failed; ApexLookupError explains why and + // ApexLookupTransient is true when the failure was a transport/resolver fault + // (could not observe) rather than definitive evidence the apex is missing. + Apex string `json:"apex,omitempty"` + ApexLookupError string `json:"apex_lookup_error,omitempty"` + ApexLookupTransient bool `json:"apex_lookup_transient,omitempty"` + AuthServers []string `json:"auth_servers,omitempty"` Chain []ChainHop `json:"chain,omitempty"` ChainTerminated ChainTermination `json:"chain_terminated"`