diff --git a/checker/collect.go b/checker/collect.go index c3f2a15..580f900 100644 --- a/checker/collect.go +++ b/checker/collect.go @@ -26,6 +26,7 @@ func (p *aliasProvider) Collect(ctx context.Context, opts sdk.CheckerOptions) (a apex, servers, err := findApex(ctx, owner, resolver) if err != nil { data.ApexLookupError = err.Error() + data.ApexLookupTransient = isTransientApexError(err) return data, nil } data.Apex = apex @@ -122,10 +123,13 @@ func (c *chainCtx) walk(ctx context.Context, name string) { q := dns.Question{Name: current, Qtype: dns.TypeCNAME, Qclass: dns.ClassINET} r, server, err := c.queryFor(ctx, currentServers, q) if err != nil { + // A query that never produced a response is a transport/resolver + // fault: we could not observe the alias, so report it as transient. c.data.ChainTerminated = ChainTermination{ - Reason: TermQueryErr, - Subject: current, - Detail: err.Error(), + Reason: TermQueryErr, + Subject: current, + Detail: err.Error(), + Transient: true, } c.data.FinalTarget = current return @@ -133,11 +137,15 @@ func (c *chainCtx) walk(ctx context.Context, name string) { if r.Rcode != dns.RcodeSuccess { rcode := rcodeText(r.Rcode) + // A SERVFAIL/REFUSED from every auth server means we could not observe + // the record, not that the zone published a negative answer; mark it + // transient so the rule reports Unknown instead of Crit. c.data.ChainTerminated = ChainTermination{ - Reason: TermRcode, - Subject: current, - Rcode: rcode, - Detail: fmt.Sprintf("server answered %s for %s", rcode, current), + Reason: TermRcode, + Subject: current, + Rcode: rcode, + Detail: fmt.Sprintf("server answered %s for %s", rcode, current), + Transient: isTransientRcode(r.Rcode), } c.data.FinalTarget = current return @@ -189,10 +197,15 @@ func (c *chainCtx) walk(ctx context.Context, name string) { // answered by the parent's auth set. zone, ns, zerr := c.reanchor(ctx, target) if zerr != nil { + // Re-anchoring fails either because the target genuinely has no + // locatable apex (definitive: the alias points into the void) or + // because a resolver/transport fault prevented observing it. Only the + // latter is transient; classify so the rule does not mask a real break. c.data.ChainTerminated = ChainTermination{ - Reason: TermQueryErr, - Subject: target, - Detail: fmt.Sprintf("re-anchor for %s failed: %v", target, zerr), + Reason: TermQueryErr, + Subject: target, + Detail: fmt.Sprintf("re-anchor for %s failed: %v", target, zerr), + Transient: isTransientApexError(zerr), } c.data.FinalTarget = target return @@ -271,8 +284,9 @@ func extractCNAME(r *dns.Msg, owner string) (target string, fromDNAME bool, ttl func (c *chainCtx) resolveFinal(ctx context.Context, name string, servers []string) { type result struct { - addrs []string - rcode string + addrs []string + rcode string + transient bool } query := func(qtype uint16) result { @@ -292,6 +306,7 @@ func (c *chainCtx) resolveFinal(ctx context.Context, name string, servers []stri var res result if r.Rcode != dns.RcodeSuccess { res.rcode = rcodeText(r.Rcode) + res.transient = isTransientRcode(r.Rcode) } for _, rr := range r.Answer { switch v := rr.(type) { @@ -323,8 +338,10 @@ func (c *chainCtx) resolveFinal(ctx context.Context, name string, servers []stri switch { case aRes.rcode != "": c.data.FinalRcode = aRes.rcode + c.data.FinalRcodeTransient = aRes.transient case aaaaRes.rcode != "": c.data.FinalRcode = aaaaRes.rcode + c.data.FinalRcodeTransient = aaaaRes.transient } } diff --git a/checker/dns.go b/checker/dns.go index 7270da1..282c651 100644 --- a/checker/dns.go +++ b/checker/dns.go @@ -2,6 +2,7 @@ package checker import ( "context" + "errors" "fmt" "net" "strings" @@ -61,14 +62,22 @@ func hostPort(host, port string) string { func findApex(ctx context.Context, fqdn, resolver string) (apex string, servers []string, err error) { labels := dns.SplitDomainName(fqdn) + // transientSeen records whether any candidate failed for a transport or + // SERVFAIL/REFUSED reason, so a fall-through "could not locate apex" caused by + // a flaky recursive resolver is reported as transient rather than definitive. + transientSeen := false for i := range labels { candidate := dns.Fqdn(strings.Join(labels[i:], ".")) q := dns.Question{Name: candidate, Qtype: dns.TypeSOA, Qclass: dns.ClassINET} r, rerr := recursiveExchange(ctx, resolver, q) if rerr != nil { + transientSeen = true continue } if r.Rcode != dns.RcodeSuccess { + if isTransientRcode(r.Rcode) { + transientSeen = true + } continue } hasSOA := false @@ -87,14 +96,20 @@ func findApex(ctx context.Context, fqdn, resolver string) (apex string, servers apex = candidate servers, err = resolveZoneNSAddrs(ctx, apex) if err != nil { - return "", nil, err + // A resolver fault (e.g. "server misbehaving") means we could not + // observe the apex's NS, not that the delegation is broken. + return "", nil, transientApexError{err} } if len(servers) == 0 { return "", nil, fmt.Errorf("apex %s has no resolvable NS", apex) } return apex, servers, nil } - return "", nil, fmt.Errorf("could not locate apex of %s", fqdn) + err = fmt.Errorf("could not locate apex of %s", fqdn) + if transientSeen { + return "", nil, transientApexError{err} + } + return "", nil, err } func resolveZoneNSAddrs(ctx context.Context, zone string) ([]string, error) { @@ -128,24 +143,59 @@ func resolveZoneNSAddrs(ctx context.Context, zone string) ([]string, error) { return out, nil } -// queryAtAuth tries each server in order and returns the first usable answer. -// dnssec=true sets the DO bit; only the DNSSEC probes need it. +// queryAtAuth tries each server in order and returns the first definitive +// answer. Transport errors and transient failures (SERVFAIL/REFUSED) make it +// fail over to the next server so a single flaky auth server cannot decide the +// verdict; a definitive response (NOERROR/NXDOMAIN/...) is returned at once. +// If every server fails it returns the last transient response when there was +// one (so callers can still inspect the rcode), otherwise the last transport +// error. dnssec=true sets the DO bit; only the DNSSEC probes need it. func queryAtAuth(ctx context.Context, proto string, servers []string, q dns.Question, dnssec bool) (*dns.Msg, string, error) { var lastErr error + var transientMsg *dns.Msg + var transientServer string for _, s := range servers { r, err := dnsExchange(ctx, proto, s, q, false, dnssec) if err != nil { lastErr = err continue } + if isTransientRcode(r.Rcode) { + transientMsg, transientServer = r, s + continue + } return r, s, nil } + if transientMsg != nil { + return transientMsg, transientServer, nil + } if lastErr == nil { lastErr = fmt.Errorf("no servers provided") } return nil, "", lastErr } +// transientApexError marks an apex-lookup failure that stems from a transport or +// resolver fault rather than definitive DNS evidence, so apexLookupRule can +// report it as Unknown instead of flapping the check into Crit. +type transientApexError struct{ err error } + +func (e transientApexError) Error() string { return e.err.Error() } +func (e transientApexError) Unwrap() error { return e.err } + +func isTransientApexError(err error) bool { + var t transientApexError + return errors.As(err, &t) +} + +// isTransientRcode reports whether an rcode is worth retrying against another +// auth server rather than treating as the zone's final answer. SERVFAIL and +// REFUSED are typically per-server faults (backend down, server not yet loaded +// the zone), unlike NXDOMAIN which is an authoritative negative answer. +func isTransientRcode(rcode int) bool { + return rcode == dns.RcodeServerFailure || rcode == dns.RcodeRefused +} + func rcodeText(r int) string { if s, ok := dns.RcodeToString[r]; ok { return s diff --git a/checker/dns_test.go b/checker/dns_test.go new file mode 100644 index 0000000..317be19 --- /dev/null +++ b/checker/dns_test.go @@ -0,0 +1,109 @@ +package checker + +import ( + "context" + "errors" + "fmt" + "net" + "testing" + + "github.com/miekg/dns" +) + +func TestIsTransientRcode(t *testing.T) { + transient := []int{dns.RcodeServerFailure, dns.RcodeRefused} + for _, rc := range transient { + if !isTransientRcode(rc) { + t.Errorf("rcode %s should be transient", rcodeText(rc)) + } + } + final := []int{dns.RcodeSuccess, dns.RcodeNameError, dns.RcodeNotImplemented} + for _, rc := range final { + if isTransientRcode(rc) { + t.Errorf("rcode %s should not be transient", rcodeText(rc)) + } + } +} + +func TestIsTransientApexError(t *testing.T) { + wrapped := transientApexError{errors.New("server misbehaving")} + if !isTransientApexError(wrapped) { + t.Errorf("transientApexError should be classified as transient") + } + if !isTransientApexError(fmt.Errorf("wrapped: %w", wrapped)) { + t.Errorf("error wrapping a transientApexError should be transient") + } + if isTransientApexError(errors.New("could not locate apex of example.com.")) { + t.Errorf("plain error should not be classified as transient") + } + if isTransientApexError(nil) { + t.Errorf("nil error should not be classified as transient") + } +} + +// startTestServer spins up a UDP DNS server that answers every query with the +// given handler, returning its address and a shutdown func. +func startTestServer(t *testing.T, handler dns.HandlerFunc) (string, func()) { + t.Helper() + mux := dns.NewServeMux() + mux.HandleFunc(".", handler) + pc, err := net.ListenPacket("udp", "127.0.0.1:0") + if err != nil { + t.Fatalf("listen: %v", err) + } + srv := &dns.Server{PacketConn: pc, Handler: mux} + go srv.ActivateAndServe() + return pc.LocalAddr().String(), func() { srv.Shutdown() } +} + +func answerWith(rcode int, withAnswer bool) dns.HandlerFunc { + return func(w dns.ResponseWriter, r *dns.Msg) { + m := new(dns.Msg) + m.SetReply(r) + m.Rcode = rcode + if withAnswer && len(r.Question) > 0 { + rr, _ := dns.NewRR(r.Question[0].Name + " 300 IN CNAME target.example.") + if rr != nil { + m.Answer = append(m.Answer, rr) + } + } + w.WriteMsg(m) + } +} + +func TestQueryAtAuthFailsOverTransientRcode(t *testing.T) { + q := dns.Question{Name: "example.com.", Qtype: dns.TypeCNAME, Qclass: dns.ClassINET} + + t.Run("prefers definitive answer over SERVFAIL", func(t *testing.T) { + bad, stopBad := startTestServer(t, answerWith(dns.RcodeServerFailure, false)) + defer stopBad() + good, stopGood := startTestServer(t, answerWith(dns.RcodeSuccess, true)) + defer stopGood() + + r, server, err := queryAtAuth(context.Background(), "", []string{bad, good}, q, false) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if r.Rcode != dns.RcodeSuccess { + t.Fatalf("got rcode %s, want NOERROR", rcodeText(r.Rcode)) + } + if server != good { + t.Fatalf("answered by %s, want the healthy server %s", server, good) + } + }) + + t.Run("returns transient response when every server fails", func(t *testing.T) { + s1, stop1 := startTestServer(t, answerWith(dns.RcodeServerFailure, false)) + defer stop1() + s2, stop2 := startTestServer(t, answerWith(dns.RcodeRefused, false)) + defer stop2() + + r, _, err := queryAtAuth(context.Background(), "", []string{s1, s2}, q, false) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if !isTransientRcode(r.Rcode) { + t.Fatalf("got rcode %s, want a transient rcode preserved", rcodeText(r.Rcode)) + } + }) +} diff --git a/checker/rules_apex.go b/checker/rules_apex.go index 3ff7319..4cb1918 100644 --- a/checker/rules_apex.go +++ b/checker/rules_apex.go @@ -22,11 +22,20 @@ func (apexLookupRule) Evaluate(ctx context.Context, obs sdk.ObservationGetter, _ if data.Apex != "" { return okState(data.Apex, fmt.Sprintf("apex %s located", data.Apex)) } + // A transport/resolver fault means the apex could not be observed, not that the + // delegation is broken. Report it as Unknown so an intermittent recursive-resolver + // glitch does not flap the check into Crit; only definitive evidence drives Crit. + status := sdk.StatusCrit + hint := "Check that the parent delegation exists and that the zone is published." + if data.ApexLookupTransient { + status = sdk.StatusUnknown + hint = "The zone apex could not be observed due to a resolver/transport fault; retry and check recursive-resolver reachability." + } return []sdk.CheckState{withHint(sdk.CheckState{ - Status: sdk.StatusCrit, + Status: status, Subject: data.Owner, Message: fmt.Sprintf("could not locate zone apex: %s", data.ApexLookupError), - }, "Check that the parent delegation exists and that the zone is published.")} + }, hint)} } type cnameAtApexRule struct{} diff --git a/checker/rules_chain.go b/checker/rules_chain.go index e3da350..1cd31aa 100644 --- a/checker/rules_chain.go +++ b/checker/rules_chain.go @@ -90,11 +90,22 @@ func (chainQueryErrorRule) Evaluate(ctx context.Context, obs sdk.ObservationGett if data.ChainTerminated.Reason != TermQueryErr { return okState(data.Owner, "all chain queries succeeded") } + // A transport failure (connection refused, timeout, network unreachable) means + // we could not observe the alias, not that it is broken: report it as Unknown so + // an intermittent reachability glitch does not flap the check into Warn/Crit. A + // non-transient failure (e.g. the target has no locatable apex) is definitive + // evidence the alias cannot be followed: report it as Warn. + status, verb := sdk.StatusWarn, "failed" + hint := "Check that the alias target exists and is delegated; the alias is unusable while the query fails." + if data.ChainTerminated.Transient { + status, verb = sdk.StatusUnknown, "could not be completed" + hint = "Check authoritative-server reachability and firewall rules; the alias state could not be determined while queries fail." + } return []sdk.CheckState{withHint(sdk.CheckState{ - Status: sdk.StatusWarn, + Status: status, Subject: data.ChainTerminated.Subject, - Message: fmt.Sprintf("CNAME query for %s failed: %s", data.ChainTerminated.Subject, data.ChainTerminated.Detail), - }, "Check authoritative-server reachability and firewall rules; the alias is unusable while queries fail.")} + Message: fmt.Sprintf("CNAME query for %s %s: %s", data.ChainTerminated.Subject, verb, data.ChainTerminated.Detail), + }, hint)} } type chainRcodeRule struct{} @@ -114,18 +125,36 @@ func (chainRcodeRule) Evaluate(ctx context.Context, obs sdk.ObservationGetter, _ } var out []sdk.CheckState if data.ChainTerminated.Reason == TermRcode { + // A transient rcode (SERVFAIL/REFUSED from every auth server) means we could + // not observe the record, not that the zone published a negative answer: + // report it as Unknown so a flaky server does not flap the check into Crit. + // A definitive NXDOMAIN mid-chain is a real break and stays Crit. + status := sdk.StatusCrit + hint := "Ensure the zone publishes the expected record; NXDOMAIN mid-chain breaks the alias." + if data.ChainTerminated.Transient { + status = sdk.StatusUnknown + hint = "Check authoritative-server reachability; SERVFAIL/REFUSED from every server leaves the alias state undetermined." + } out = append(out, withHint(sdk.CheckState{ - Status: sdk.StatusCrit, + Status: status, Subject: data.ChainTerminated.Subject, Message: fmt.Sprintf("server answered %s mid-chain", data.ChainTerminated.Rcode), - }, "Ensure the zone publishes the expected record; NXDOMAIN/SERVFAIL mid-chain breaks the alias.")) + }, hint)) } if data.FinalRcode != "" && data.FinalRcode != "NOERROR" { + // Same distinction for the final A/AAAA lookup: a SERVFAIL/REFUSED could not + // be observed (Unknown), a definitive rcode is a real publication gap (Warn). + status := sdk.StatusWarn + hint := "Check the upstream zone's A/AAAA publication." + if data.FinalRcodeTransient { + status = sdk.StatusUnknown + hint = "Check the upstream auth servers' reachability; the final A/AAAA state could not be determined." + } out = append(out, withHint(sdk.CheckState{ - Status: sdk.StatusWarn, + Status: status, Subject: data.FinalTarget, Message: fmt.Sprintf("final A lookup for %s returned %s", data.FinalTarget, data.FinalRcode), - }, "Check the upstream zone's A/AAAA publication.")) + }, hint)) } if len(out) == 0 { return okState(data.Owner, "all chain and final lookups returned NOERROR") diff --git a/checker/rules_test.go b/checker/rules_test.go index 1d2d92f..c1ba05f 100644 --- a/checker/rules_test.go +++ b/checker/rules_test.go @@ -81,6 +81,19 @@ func TestApexLookupRule(t *testing.T) { t.Fatalf("want hint, got none") } }) + t.Run("transient", func(t *testing.T) { + // A resolver fault (e.g. "server misbehaving") could not observe the apex, + // so it must be Unknown rather than Crit to avoid flapping the check. + data := &AliasData{ + Owner: "nemunai.re.", + ApexLookupError: "lookup nemunai.re on 127.0.0.11:53: server misbehaving", + ApexLookupTransient: true, + } + s := assertSingle(t, run(apexLookupRule{}, data, nil), sdk.StatusUnknown) + if s.Meta[hintKey] == nil { + t.Fatalf("want hint, got none") + } + }) } func TestChainLoopRule(t *testing.T) { @@ -122,9 +135,18 @@ func TestChainQueryErrorRule(t *testing.T) { d.ChainTerminated.Reason = TermOK assertSingle(t, run(chainQueryErrorRule{}, d, nil), sdk.StatusOK) }) - t.Run("query err", func(t *testing.T) { + t.Run("transient query err", func(t *testing.T) { + // A transport fault (timeout) could not observe the alias, so it must be + // Unknown rather than Warn to avoid flapping the check. d := apexKnownData() - d.ChainTerminated = ChainTermination{Reason: TermQueryErr, Subject: "broken.example.com.", Detail: "timeout"} + d.ChainTerminated = ChainTermination{Reason: TermQueryErr, Subject: "broken.example.com.", Detail: "timeout", Transient: true} + assertSingle(t, run(chainQueryErrorRule{}, d, nil), sdk.StatusUnknown) + }) + t.Run("definitive query err", func(t *testing.T) { + // A non-transient failure (target has no locatable apex) is definitive + // evidence the alias cannot be followed: Warn, not Unknown. + d := apexKnownData() + d.ChainTerminated = ChainTermination{Reason: TermQueryErr, Subject: "target.example.", Detail: "re-anchor for target.example. failed: could not locate apex of target.example."} assertSingle(t, run(chainQueryErrorRule{}, d, nil), sdk.StatusWarn) }) } @@ -140,14 +162,32 @@ func TestChainRcodeRule(t *testing.T) { d.ChainTerminated = ChainTermination{Reason: TermRcode, Subject: "gone.example.com.", Rcode: "NXDOMAIN"} assertSingle(t, run(chainRcodeRule{}, d, nil), sdk.StatusCrit) }) - t.Run("final rcode", func(t *testing.T) { + t.Run("mid-chain transient SERVFAIL", func(t *testing.T) { + // SERVFAIL from every auth server could not be observed: Unknown, not Crit. + d := apexKnownData() + d.ChainTerminated = ChainTermination{Reason: TermRcode, Subject: "flaky.example.com.", Rcode: "SERVFAIL", Transient: true} + assertSingle(t, run(chainRcodeRule{}, d, nil), sdk.StatusUnknown) + }) + t.Run("final definitive rcode", func(t *testing.T) { + d := apexKnownData() + d.ChainTerminated.Reason = TermOK + d.FinalTarget = "target.example." + d.FinalRcode = "NXDOMAIN" + states := run(chainRcodeRule{}, d, nil) + if len(states) != 1 || states[0].Status != sdk.StatusWarn { + t.Fatalf("want single WARN, got %+v", states) + } + }) + t.Run("final transient rcode", func(t *testing.T) { + // SERVFAIL on the final lookup could not be observed: Unknown, not Warn. d := apexKnownData() d.ChainTerminated.Reason = TermOK d.FinalTarget = "target.example." d.FinalRcode = "SERVFAIL" + d.FinalRcodeTransient = true states := run(chainRcodeRule{}, d, nil) - if len(states) != 1 || states[0].Status != sdk.StatusWarn { - t.Fatalf("want single WARN, got %+v", states) + if len(states) != 1 || states[0].Status != sdk.StatusUnknown { + t.Fatalf("want single UNKNOWN, got %+v", states) } }) } diff --git a/checker/types.go b/checker/types.go index ecfa074..ba9d064 100644 --- a/checker/types.go +++ b/checker/types.go @@ -47,16 +47,24 @@ type ChainTermination struct { Subject string `json:"subject,omitempty"` Detail string `json:"detail,omitempty"` Rcode string `json:"rcode,omitempty"` // only with TermRcode + // Transient is meaningful with TermQueryErr and TermRcode: true when the failure + // could not be observed as a definitive answer (a transport/resolver fault, or a + // SERVFAIL/REFUSED from every auth server), false when it stems from definitive + // evidence such as a target with no locatable apex or an authoritative NXDOMAIN. + Transient bool `json:"transient,omitempty"` } // AliasData carries raw facts only; judgement is delegated to the rules. type AliasData struct { Owner string `json:"owner"` - // Apex is empty iff the apex lookup failed; ApexLookupError explains why. - Apex string `json:"apex,omitempty"` - ApexLookupError string `json:"apex_lookup_error,omitempty"` - AuthServers []string `json:"auth_servers,omitempty"` + // Apex is empty iff the apex lookup failed; ApexLookupError explains why and + // ApexLookupTransient is true when the failure was a transport/resolver fault + // (could not observe) rather than definitive evidence the apex is missing. + Apex string `json:"apex,omitempty"` + ApexLookupError string `json:"apex_lookup_error,omitempty"` + ApexLookupTransient bool `json:"apex_lookup_transient,omitempty"` + AuthServers []string `json:"auth_servers,omitempty"` Chain []ChainHop `json:"chain,omitempty"` ChainTerminated ChainTermination `json:"chain_terminated"` @@ -66,7 +74,11 @@ type AliasData struct { FinalTarget string `json:"final_target,omitempty"` FinalA []string `json:"final_a,omitempty"` FinalAAAA []string `json:"final_aaaa,omitempty"` - FinalRcode string `json:"final_rcode,omitempty"` + // FinalRcode is the non-NOERROR rcode of the final A/AAAA lookup, if any; + // FinalRcodeTransient is true when it was a SERVFAIL/REFUSED (could not observe) + // rather than a definitive negative answer. + FinalRcode string `json:"final_rcode,omitempty"` + FinalRcodeTransient bool `json:"final_rcode_transient,omitempty"` // Coexisting is populated only when Owner has a CNAME. Coexisting []CoexistingRRset `json:"coexisting,omitempty"`