diff --git a/checker/collect.go b/checker/collect.go index 580f900..c3f2a15 100644 --- a/checker/collect.go +++ b/checker/collect.go @@ -26,7 +26,6 @@ func (p *aliasProvider) Collect(ctx context.Context, opts sdk.CheckerOptions) (a apex, servers, err := findApex(ctx, owner, resolver) if err != nil { data.ApexLookupError = err.Error() - data.ApexLookupTransient = isTransientApexError(err) return data, nil } data.Apex = apex @@ -123,13 +122,10 @@ func (c *chainCtx) walk(ctx context.Context, name string) { q := dns.Question{Name: current, Qtype: dns.TypeCNAME, Qclass: dns.ClassINET} r, server, err := c.queryFor(ctx, currentServers, q) if err != nil { - // A query that never produced a response is a transport/resolver - // fault: we could not observe the alias, so report it as transient. c.data.ChainTerminated = ChainTermination{ - Reason: TermQueryErr, - Subject: current, - Detail: err.Error(), - Transient: true, + Reason: TermQueryErr, + Subject: current, + Detail: err.Error(), } c.data.FinalTarget = current return @@ -137,15 +133,11 @@ func (c *chainCtx) walk(ctx context.Context, name string) { if r.Rcode != dns.RcodeSuccess { rcode := rcodeText(r.Rcode) - // A SERVFAIL/REFUSED from every auth server means we could not observe - // the record, not that the zone published a negative answer; mark it - // transient so the rule reports Unknown instead of Crit. c.data.ChainTerminated = ChainTermination{ - Reason: TermRcode, - Subject: current, - Rcode: rcode, - Detail: fmt.Sprintf("server answered %s for %s", rcode, current), - Transient: isTransientRcode(r.Rcode), + Reason: TermRcode, + Subject: current, + Rcode: rcode, + Detail: fmt.Sprintf("server answered %s for %s", rcode, current), } c.data.FinalTarget = current return @@ -197,15 +189,10 @@ func (c *chainCtx) walk(ctx context.Context, name string) { // answered by the parent's auth set. zone, ns, zerr := c.reanchor(ctx, target) if zerr != nil { - // Re-anchoring fails either because the target genuinely has no - // locatable apex (definitive: the alias points into the void) or - // because a resolver/transport fault prevented observing it. Only the - // latter is transient; classify so the rule does not mask a real break. c.data.ChainTerminated = ChainTermination{ - Reason: TermQueryErr, - Subject: target, - Detail: fmt.Sprintf("re-anchor for %s failed: %v", target, zerr), - Transient: isTransientApexError(zerr), + Reason: TermQueryErr, + Subject: target, + Detail: fmt.Sprintf("re-anchor for %s failed: %v", target, zerr), } c.data.FinalTarget = target return @@ -284,9 +271,8 @@ func extractCNAME(r *dns.Msg, owner string) (target string, fromDNAME bool, ttl func (c *chainCtx) resolveFinal(ctx context.Context, name string, servers []string) { type result struct { - addrs []string - rcode string - transient bool + addrs []string + rcode string } query := func(qtype uint16) result { @@ -306,7 +292,6 @@ func (c *chainCtx) resolveFinal(ctx context.Context, name string, servers []stri var res result if r.Rcode != dns.RcodeSuccess { res.rcode = rcodeText(r.Rcode) - res.transient = isTransientRcode(r.Rcode) } for _, rr := range r.Answer { switch v := rr.(type) { @@ -338,10 +323,8 @@ func (c *chainCtx) resolveFinal(ctx context.Context, name string, servers []stri switch { case aRes.rcode != "": c.data.FinalRcode = aRes.rcode - c.data.FinalRcodeTransient = aRes.transient case aaaaRes.rcode != "": c.data.FinalRcode = aaaaRes.rcode - c.data.FinalRcodeTransient = aaaaRes.transient } } diff --git a/checker/dns.go b/checker/dns.go index 282c651..7270da1 100644 --- a/checker/dns.go +++ b/checker/dns.go @@ -2,7 +2,6 @@ package checker import ( "context" - "errors" "fmt" "net" "strings" @@ -62,22 +61,14 @@ func hostPort(host, port string) string { func findApex(ctx context.Context, fqdn, resolver string) (apex string, servers []string, err error) { labels := dns.SplitDomainName(fqdn) - // transientSeen records whether any candidate failed for a transport or - // SERVFAIL/REFUSED reason, so a fall-through "could not locate apex" caused by - // a flaky recursive resolver is reported as transient rather than definitive. - transientSeen := false for i := range labels { candidate := dns.Fqdn(strings.Join(labels[i:], ".")) q := dns.Question{Name: candidate, Qtype: dns.TypeSOA, Qclass: dns.ClassINET} r, rerr := recursiveExchange(ctx, resolver, q) if rerr != nil { - transientSeen = true continue } if r.Rcode != dns.RcodeSuccess { - if isTransientRcode(r.Rcode) { - transientSeen = true - } continue } hasSOA := false @@ -96,20 +87,14 @@ func findApex(ctx context.Context, fqdn, resolver string) (apex string, servers apex = candidate servers, err = resolveZoneNSAddrs(ctx, apex) if err != nil { - // A resolver fault (e.g. "server misbehaving") means we could not - // observe the apex's NS, not that the delegation is broken. - return "", nil, transientApexError{err} + return "", nil, err } if len(servers) == 0 { return "", nil, fmt.Errorf("apex %s has no resolvable NS", apex) } return apex, servers, nil } - err = fmt.Errorf("could not locate apex of %s", fqdn) - if transientSeen { - return "", nil, transientApexError{err} - } - return "", nil, err + return "", nil, fmt.Errorf("could not locate apex of %s", fqdn) } func resolveZoneNSAddrs(ctx context.Context, zone string) ([]string, error) { @@ -143,59 +128,24 @@ func resolveZoneNSAddrs(ctx context.Context, zone string) ([]string, error) { return out, nil } -// queryAtAuth tries each server in order and returns the first definitive -// answer. Transport errors and transient failures (SERVFAIL/REFUSED) make it -// fail over to the next server so a single flaky auth server cannot decide the -// verdict; a definitive response (NOERROR/NXDOMAIN/...) is returned at once. -// If every server fails it returns the last transient response when there was -// one (so callers can still inspect the rcode), otherwise the last transport -// error. dnssec=true sets the DO bit; only the DNSSEC probes need it. +// queryAtAuth tries each server in order and returns the first usable answer. +// dnssec=true sets the DO bit; only the DNSSEC probes need it. func queryAtAuth(ctx context.Context, proto string, servers []string, q dns.Question, dnssec bool) (*dns.Msg, string, error) { var lastErr error - var transientMsg *dns.Msg - var transientServer string for _, s := range servers { r, err := dnsExchange(ctx, proto, s, q, false, dnssec) if err != nil { lastErr = err continue } - if isTransientRcode(r.Rcode) { - transientMsg, transientServer = r, s - continue - } return r, s, nil } - if transientMsg != nil { - return transientMsg, transientServer, nil - } if lastErr == nil { lastErr = fmt.Errorf("no servers provided") } return nil, "", lastErr } -// transientApexError marks an apex-lookup failure that stems from a transport or -// resolver fault rather than definitive DNS evidence, so apexLookupRule can -// report it as Unknown instead of flapping the check into Crit. -type transientApexError struct{ err error } - -func (e transientApexError) Error() string { return e.err.Error() } -func (e transientApexError) Unwrap() error { return e.err } - -func isTransientApexError(err error) bool { - var t transientApexError - return errors.As(err, &t) -} - -// isTransientRcode reports whether an rcode is worth retrying against another -// auth server rather than treating as the zone's final answer. SERVFAIL and -// REFUSED are typically per-server faults (backend down, server not yet loaded -// the zone), unlike NXDOMAIN which is an authoritative negative answer. -func isTransientRcode(rcode int) bool { - return rcode == dns.RcodeServerFailure || rcode == dns.RcodeRefused -} - func rcodeText(r int) string { if s, ok := dns.RcodeToString[r]; ok { return s diff --git a/checker/dns_test.go b/checker/dns_test.go deleted file mode 100644 index 317be19..0000000 --- a/checker/dns_test.go +++ /dev/null @@ -1,109 +0,0 @@ -package checker - -import ( - "context" - "errors" - "fmt" - "net" - "testing" - - "github.com/miekg/dns" -) - -func TestIsTransientRcode(t *testing.T) { - transient := []int{dns.RcodeServerFailure, dns.RcodeRefused} - for _, rc := range transient { - if !isTransientRcode(rc) { - t.Errorf("rcode %s should be transient", rcodeText(rc)) - } - } - final := []int{dns.RcodeSuccess, dns.RcodeNameError, dns.RcodeNotImplemented} - for _, rc := range final { - if isTransientRcode(rc) { - t.Errorf("rcode %s should not be transient", rcodeText(rc)) - } - } -} - -func TestIsTransientApexError(t *testing.T) { - wrapped := transientApexError{errors.New("server misbehaving")} - if !isTransientApexError(wrapped) { - t.Errorf("transientApexError should be classified as transient") - } - if !isTransientApexError(fmt.Errorf("wrapped: %w", wrapped)) { - t.Errorf("error wrapping a transientApexError should be transient") - } - if isTransientApexError(errors.New("could not locate apex of example.com.")) { - t.Errorf("plain error should not be classified as transient") - } - if isTransientApexError(nil) { - t.Errorf("nil error should not be classified as transient") - } -} - -// startTestServer spins up a UDP DNS server that answers every query with the -// given handler, returning its address and a shutdown func. -func startTestServer(t *testing.T, handler dns.HandlerFunc) (string, func()) { - t.Helper() - mux := dns.NewServeMux() - mux.HandleFunc(".", handler) - pc, err := net.ListenPacket("udp", "127.0.0.1:0") - if err != nil { - t.Fatalf("listen: %v", err) - } - srv := &dns.Server{PacketConn: pc, Handler: mux} - go srv.ActivateAndServe() - return pc.LocalAddr().String(), func() { srv.Shutdown() } -} - -func answerWith(rcode int, withAnswer bool) dns.HandlerFunc { - return func(w dns.ResponseWriter, r *dns.Msg) { - m := new(dns.Msg) - m.SetReply(r) - m.Rcode = rcode - if withAnswer && len(r.Question) > 0 { - rr, _ := dns.NewRR(r.Question[0].Name + " 300 IN CNAME target.example.") - if rr != nil { - m.Answer = append(m.Answer, rr) - } - } - w.WriteMsg(m) - } -} - -func TestQueryAtAuthFailsOverTransientRcode(t *testing.T) { - q := dns.Question{Name: "example.com.", Qtype: dns.TypeCNAME, Qclass: dns.ClassINET} - - t.Run("prefers definitive answer over SERVFAIL", func(t *testing.T) { - bad, stopBad := startTestServer(t, answerWith(dns.RcodeServerFailure, false)) - defer stopBad() - good, stopGood := startTestServer(t, answerWith(dns.RcodeSuccess, true)) - defer stopGood() - - r, server, err := queryAtAuth(context.Background(), "", []string{bad, good}, q, false) - if err != nil { - t.Fatalf("unexpected error: %v", err) - } - if r.Rcode != dns.RcodeSuccess { - t.Fatalf("got rcode %s, want NOERROR", rcodeText(r.Rcode)) - } - if server != good { - t.Fatalf("answered by %s, want the healthy server %s", server, good) - } - }) - - t.Run("returns transient response when every server fails", func(t *testing.T) { - s1, stop1 := startTestServer(t, answerWith(dns.RcodeServerFailure, false)) - defer stop1() - s2, stop2 := startTestServer(t, answerWith(dns.RcodeRefused, false)) - defer stop2() - - r, _, err := queryAtAuth(context.Background(), "", []string{s1, s2}, q, false) - if err != nil { - t.Fatalf("unexpected error: %v", err) - } - if !isTransientRcode(r.Rcode) { - t.Fatalf("got rcode %s, want a transient rcode preserved", rcodeText(r.Rcode)) - } - }) -} diff --git a/checker/rules_apex.go b/checker/rules_apex.go index 4cb1918..3ff7319 100644 --- a/checker/rules_apex.go +++ b/checker/rules_apex.go @@ -22,20 +22,11 @@ func (apexLookupRule) Evaluate(ctx context.Context, obs sdk.ObservationGetter, _ if data.Apex != "" { return okState(data.Apex, fmt.Sprintf("apex %s located", data.Apex)) } - // A transport/resolver fault means the apex could not be observed, not that the - // delegation is broken. Report it as Unknown so an intermittent recursive-resolver - // glitch does not flap the check into Crit; only definitive evidence drives Crit. - status := sdk.StatusCrit - hint := "Check that the parent delegation exists and that the zone is published." - if data.ApexLookupTransient { - status = sdk.StatusUnknown - hint = "The zone apex could not be observed due to a resolver/transport fault; retry and check recursive-resolver reachability." - } return []sdk.CheckState{withHint(sdk.CheckState{ - Status: status, + Status: sdk.StatusCrit, Subject: data.Owner, Message: fmt.Sprintf("could not locate zone apex: %s", data.ApexLookupError), - }, hint)} + }, "Check that the parent delegation exists and that the zone is published.")} } type cnameAtApexRule struct{} diff --git a/checker/rules_chain.go b/checker/rules_chain.go index 1cd31aa..e3da350 100644 --- a/checker/rules_chain.go +++ b/checker/rules_chain.go @@ -90,22 +90,11 @@ func (chainQueryErrorRule) Evaluate(ctx context.Context, obs sdk.ObservationGett if data.ChainTerminated.Reason != TermQueryErr { return okState(data.Owner, "all chain queries succeeded") } - // A transport failure (connection refused, timeout, network unreachable) means - // we could not observe the alias, not that it is broken: report it as Unknown so - // an intermittent reachability glitch does not flap the check into Warn/Crit. A - // non-transient failure (e.g. the target has no locatable apex) is definitive - // evidence the alias cannot be followed: report it as Warn. - status, verb := sdk.StatusWarn, "failed" - hint := "Check that the alias target exists and is delegated; the alias is unusable while the query fails." - if data.ChainTerminated.Transient { - status, verb = sdk.StatusUnknown, "could not be completed" - hint = "Check authoritative-server reachability and firewall rules; the alias state could not be determined while queries fail." - } return []sdk.CheckState{withHint(sdk.CheckState{ - Status: status, + Status: sdk.StatusWarn, Subject: data.ChainTerminated.Subject, - Message: fmt.Sprintf("CNAME query for %s %s: %s", data.ChainTerminated.Subject, verb, data.ChainTerminated.Detail), - }, hint)} + Message: fmt.Sprintf("CNAME query for %s failed: %s", data.ChainTerminated.Subject, data.ChainTerminated.Detail), + }, "Check authoritative-server reachability and firewall rules; the alias is unusable while queries fail.")} } type chainRcodeRule struct{} @@ -125,36 +114,18 @@ func (chainRcodeRule) Evaluate(ctx context.Context, obs sdk.ObservationGetter, _ } var out []sdk.CheckState if data.ChainTerminated.Reason == TermRcode { - // A transient rcode (SERVFAIL/REFUSED from every auth server) means we could - // not observe the record, not that the zone published a negative answer: - // report it as Unknown so a flaky server does not flap the check into Crit. - // A definitive NXDOMAIN mid-chain is a real break and stays Crit. - status := sdk.StatusCrit - hint := "Ensure the zone publishes the expected record; NXDOMAIN mid-chain breaks the alias." - if data.ChainTerminated.Transient { - status = sdk.StatusUnknown - hint = "Check authoritative-server reachability; SERVFAIL/REFUSED from every server leaves the alias state undetermined." - } out = append(out, withHint(sdk.CheckState{ - Status: status, + Status: sdk.StatusCrit, Subject: data.ChainTerminated.Subject, Message: fmt.Sprintf("server answered %s mid-chain", data.ChainTerminated.Rcode), - }, hint)) + }, "Ensure the zone publishes the expected record; NXDOMAIN/SERVFAIL mid-chain breaks the alias.")) } if data.FinalRcode != "" && data.FinalRcode != "NOERROR" { - // Same distinction for the final A/AAAA lookup: a SERVFAIL/REFUSED could not - // be observed (Unknown), a definitive rcode is a real publication gap (Warn). - status := sdk.StatusWarn - hint := "Check the upstream zone's A/AAAA publication." - if data.FinalRcodeTransient { - status = sdk.StatusUnknown - hint = "Check the upstream auth servers' reachability; the final A/AAAA state could not be determined." - } out = append(out, withHint(sdk.CheckState{ - Status: status, + Status: sdk.StatusWarn, Subject: data.FinalTarget, Message: fmt.Sprintf("final A lookup for %s returned %s", data.FinalTarget, data.FinalRcode), - }, hint)) + }, "Check the upstream zone's A/AAAA publication.")) } if len(out) == 0 { return okState(data.Owner, "all chain and final lookups returned NOERROR") diff --git a/checker/rules_test.go b/checker/rules_test.go index c1ba05f..1d2d92f 100644 --- a/checker/rules_test.go +++ b/checker/rules_test.go @@ -81,19 +81,6 @@ func TestApexLookupRule(t *testing.T) { t.Fatalf("want hint, got none") } }) - t.Run("transient", func(t *testing.T) { - // A resolver fault (e.g. "server misbehaving") could not observe the apex, - // so it must be Unknown rather than Crit to avoid flapping the check. - data := &AliasData{ - Owner: "nemunai.re.", - ApexLookupError: "lookup nemunai.re on 127.0.0.11:53: server misbehaving", - ApexLookupTransient: true, - } - s := assertSingle(t, run(apexLookupRule{}, data, nil), sdk.StatusUnknown) - if s.Meta[hintKey] == nil { - t.Fatalf("want hint, got none") - } - }) } func TestChainLoopRule(t *testing.T) { @@ -135,18 +122,9 @@ func TestChainQueryErrorRule(t *testing.T) { d.ChainTerminated.Reason = TermOK assertSingle(t, run(chainQueryErrorRule{}, d, nil), sdk.StatusOK) }) - t.Run("transient query err", func(t *testing.T) { - // A transport fault (timeout) could not observe the alias, so it must be - // Unknown rather than Warn to avoid flapping the check. + t.Run("query err", func(t *testing.T) { d := apexKnownData() - d.ChainTerminated = ChainTermination{Reason: TermQueryErr, Subject: "broken.example.com.", Detail: "timeout", Transient: true} - assertSingle(t, run(chainQueryErrorRule{}, d, nil), sdk.StatusUnknown) - }) - t.Run("definitive query err", func(t *testing.T) { - // A non-transient failure (target has no locatable apex) is definitive - // evidence the alias cannot be followed: Warn, not Unknown. - d := apexKnownData() - d.ChainTerminated = ChainTermination{Reason: TermQueryErr, Subject: "target.example.", Detail: "re-anchor for target.example. failed: could not locate apex of target.example."} + d.ChainTerminated = ChainTermination{Reason: TermQueryErr, Subject: "broken.example.com.", Detail: "timeout"} assertSingle(t, run(chainQueryErrorRule{}, d, nil), sdk.StatusWarn) }) } @@ -162,32 +140,14 @@ func TestChainRcodeRule(t *testing.T) { d.ChainTerminated = ChainTermination{Reason: TermRcode, Subject: "gone.example.com.", Rcode: "NXDOMAIN"} assertSingle(t, run(chainRcodeRule{}, d, nil), sdk.StatusCrit) }) - t.Run("mid-chain transient SERVFAIL", func(t *testing.T) { - // SERVFAIL from every auth server could not be observed: Unknown, not Crit. - d := apexKnownData() - d.ChainTerminated = ChainTermination{Reason: TermRcode, Subject: "flaky.example.com.", Rcode: "SERVFAIL", Transient: true} - assertSingle(t, run(chainRcodeRule{}, d, nil), sdk.StatusUnknown) - }) - t.Run("final definitive rcode", func(t *testing.T) { - d := apexKnownData() - d.ChainTerminated.Reason = TermOK - d.FinalTarget = "target.example." - d.FinalRcode = "NXDOMAIN" - states := run(chainRcodeRule{}, d, nil) - if len(states) != 1 || states[0].Status != sdk.StatusWarn { - t.Fatalf("want single WARN, got %+v", states) - } - }) - t.Run("final transient rcode", func(t *testing.T) { - // SERVFAIL on the final lookup could not be observed: Unknown, not Warn. + t.Run("final rcode", func(t *testing.T) { d := apexKnownData() d.ChainTerminated.Reason = TermOK d.FinalTarget = "target.example." d.FinalRcode = "SERVFAIL" - d.FinalRcodeTransient = true states := run(chainRcodeRule{}, d, nil) - if len(states) != 1 || states[0].Status != sdk.StatusUnknown { - t.Fatalf("want single UNKNOWN, got %+v", states) + if len(states) != 1 || states[0].Status != sdk.StatusWarn { + t.Fatalf("want single WARN, got %+v", states) } }) } diff --git a/checker/types.go b/checker/types.go index ba9d064..ecfa074 100644 --- a/checker/types.go +++ b/checker/types.go @@ -47,24 +47,16 @@ type ChainTermination struct { Subject string `json:"subject,omitempty"` Detail string `json:"detail,omitempty"` Rcode string `json:"rcode,omitempty"` // only with TermRcode - // Transient is meaningful with TermQueryErr and TermRcode: true when the failure - // could not be observed as a definitive answer (a transport/resolver fault, or a - // SERVFAIL/REFUSED from every auth server), false when it stems from definitive - // evidence such as a target with no locatable apex or an authoritative NXDOMAIN. - Transient bool `json:"transient,omitempty"` } // AliasData carries raw facts only; judgement is delegated to the rules. type AliasData struct { Owner string `json:"owner"` - // Apex is empty iff the apex lookup failed; ApexLookupError explains why and - // ApexLookupTransient is true when the failure was a transport/resolver fault - // (could not observe) rather than definitive evidence the apex is missing. - Apex string `json:"apex,omitempty"` - ApexLookupError string `json:"apex_lookup_error,omitempty"` - ApexLookupTransient bool `json:"apex_lookup_transient,omitempty"` - AuthServers []string `json:"auth_servers,omitempty"` + // Apex is empty iff the apex lookup failed; ApexLookupError explains why. + Apex string `json:"apex,omitempty"` + ApexLookupError string `json:"apex_lookup_error,omitempty"` + AuthServers []string `json:"auth_servers,omitempty"` Chain []ChainHop `json:"chain,omitempty"` ChainTerminated ChainTermination `json:"chain_terminated"` @@ -74,11 +66,7 @@ type AliasData struct { FinalTarget string `json:"final_target,omitempty"` FinalA []string `json:"final_a,omitempty"` FinalAAAA []string `json:"final_aaaa,omitempty"` - // FinalRcode is the non-NOERROR rcode of the final A/AAAA lookup, if any; - // FinalRcodeTransient is true when it was a SERVFAIL/REFUSED (could not observe) - // rather than a definitive negative answer. - FinalRcode string `json:"final_rcode,omitempty"` - FinalRcodeTransient bool `json:"final_rcode_transient,omitempty"` + FinalRcode string `json:"final_rcode,omitempty"` // Coexisting is populated only when Owner has a CNAME. Coexisting []CoexistingRRset `json:"coexisting,omitempty"`