From 65687ce375fb8baafe962adfbad89fdd0d06b031 Mon Sep 17 00:00:00 2001 From: Pierre-Olivier Mercier Date: Thu, 18 Jun 2026 11:22:00 +0900 Subject: [PATCH] checker: report transient mid-chain and final rcodes as Unknown, not Crit/Warn SERVFAIL/REFUSED from every auth server means the record could not be observed, not that the zone published a negative answer. Mark such rcodes transient on TermRcode terminations and final A/AAAA lookups so chainRcodeRule reports Unknown instead of flapping the check into Crit/Warn; definitive NXDOMAIN answers still drive Crit (mid-chain) and Warn (final). --- checker/collect.go | 20 ++++++++++++++------ checker/rules_chain.go | 26 ++++++++++++++++++++++---- checker/rules_test.go | 24 +++++++++++++++++++++--- checker/types.go | 13 +++++++++---- 4 files changed, 66 insertions(+), 17 deletions(-) diff --git a/checker/collect.go b/checker/collect.go index f631ace..580f900 100644 --- a/checker/collect.go +++ b/checker/collect.go @@ -137,11 +137,15 @@ func (c *chainCtx) walk(ctx context.Context, name string) { if r.Rcode != dns.RcodeSuccess { rcode := rcodeText(r.Rcode) + // A SERVFAIL/REFUSED from every auth server means we could not observe + // the record, not that the zone published a negative answer; mark it + // transient so the rule reports Unknown instead of Crit. 
c.data.ChainTerminated = ChainTermination{ - Reason: TermRcode, - Subject: current, - Rcode: rcode, - Detail: fmt.Sprintf("server answered %s for %s", rcode, current), + Reason: TermRcode, + Subject: current, + Rcode: rcode, + Detail: fmt.Sprintf("server answered %s for %s", rcode, current), + Transient: isTransientRcode(r.Rcode), } c.data.FinalTarget = current return @@ -280,8 +284,9 @@ func extractCNAME(r *dns.Msg, owner string) (target string, fromDNAME bool, ttl func (c *chainCtx) resolveFinal(ctx context.Context, name string, servers []string) { type result struct { - addrs []string - rcode string + addrs []string + rcode string + transient bool } query := func(qtype uint16) result { @@ -301,6 +306,7 @@ func (c *chainCtx) resolveFinal(ctx context.Context, name string, servers []stri var res result if r.Rcode != dns.RcodeSuccess { res.rcode = rcodeText(r.Rcode) + res.transient = isTransientRcode(r.Rcode) } for _, rr := range r.Answer { switch v := rr.(type) { @@ -332,8 +338,10 @@ func (c *chainCtx) resolveFinal(ctx context.Context, name string, servers []stri switch { case aRes.rcode != "": c.data.FinalRcode = aRes.rcode + c.data.FinalRcodeTransient = aRes.transient case aaaaRes.rcode != "": c.data.FinalRcode = aaaaRes.rcode + c.data.FinalRcodeTransient = aaaaRes.transient } } diff --git a/checker/rules_chain.go b/checker/rules_chain.go index 4f7cba0..1cd31aa 100644 --- a/checker/rules_chain.go +++ b/checker/rules_chain.go @@ -125,18 +125,36 @@ func (chainRcodeRule) Evaluate(ctx context.Context, obs sdk.ObservationGetter, _ } var out []sdk.CheckState if data.ChainTerminated.Reason == TermRcode { + // A transient rcode (SERVFAIL/REFUSED from every auth server) means we could + // not observe the record, not that the zone published a negative answer: + // report it as Unknown so a flaky server does not flap the check into Crit. + // A definitive NXDOMAIN mid-chain is a real break and stays Crit. 
+ status := sdk.StatusCrit + hint := "Ensure the zone publishes the expected record; NXDOMAIN mid-chain breaks the alias." + if data.ChainTerminated.Transient { + status = sdk.StatusUnknown + hint = "Check authoritative-server reachability; SERVFAIL/REFUSED from every server leaves the alias state undetermined." + } out = append(out, withHint(sdk.CheckState{ - Status: sdk.StatusCrit, + Status: status, Subject: data.ChainTerminated.Subject, Message: fmt.Sprintf("server answered %s mid-chain", data.ChainTerminated.Rcode), - }, "Ensure the zone publishes the expected record; NXDOMAIN/SERVFAIL mid-chain breaks the alias.")) + }, hint)) } if data.FinalRcode != "" && data.FinalRcode != "NOERROR" { + // Same distinction for the final A/AAAA lookup: a SERVFAIL/REFUSED means the + // record could not be observed (Unknown); a definitive rcode is a real publication gap (Warn). + status := sdk.StatusWarn + hint := "Check the upstream zone's A/AAAA publication." + if data.FinalRcodeTransient { + status = sdk.StatusUnknown + hint = "Check the upstream auth servers' reachability; the final A/AAAA state could not be determined." 
+ } out = append(out, withHint(sdk.CheckState{ - Status: sdk.StatusWarn, + Status: status, Subject: data.FinalTarget, Message: fmt.Sprintf("final A lookup for %s returned %s", data.FinalTarget, data.FinalRcode), - }, "Check the upstream zone's A/AAAA publication.")) + }, hint)) } if len(out) == 0 { return okState(data.Owner, "all chain and final lookups returned NOERROR") diff --git a/checker/rules_test.go b/checker/rules_test.go index afd087b..c1ba05f 100644 --- a/checker/rules_test.go +++ b/checker/rules_test.go @@ -162,14 +162,32 @@ func TestChainRcodeRule(t *testing.T) { d.ChainTerminated = ChainTermination{Reason: TermRcode, Subject: "gone.example.com.", Rcode: "NXDOMAIN"} assertSingle(t, run(chainRcodeRule{}, d, nil), sdk.StatusCrit) }) - t.Run("final rcode", func(t *testing.T) { + t.Run("mid-chain transient SERVFAIL", func(t *testing.T) { + // SERVFAIL from every auth server could not be observed: Unknown, not Crit. + d := apexKnownData() + d.ChainTerminated = ChainTermination{Reason: TermRcode, Subject: "flaky.example.com.", Rcode: "SERVFAIL", Transient: true} + assertSingle(t, run(chainRcodeRule{}, d, nil), sdk.StatusUnknown) + }) + t.Run("final definitive rcode", func(t *testing.T) { + d := apexKnownData() + d.ChainTerminated.Reason = TermOK + d.FinalTarget = "target.example." + d.FinalRcode = "NXDOMAIN" + states := run(chainRcodeRule{}, d, nil) + if len(states) != 1 || states[0].Status != sdk.StatusWarn { + t.Fatalf("want single WARN, got %+v", states) + } + }) + t.Run("final transient rcode", func(t *testing.T) { + // SERVFAIL on the final lookup could not be observed: Unknown, not Warn. d := apexKnownData() d.ChainTerminated.Reason = TermOK d.FinalTarget = "target.example." 
d.FinalRcode = "SERVFAIL" + d.FinalRcodeTransient = true states := run(chainRcodeRule{}, d, nil) - if len(states) != 1 || states[0].Status != sdk.StatusWarn { - t.Fatalf("want single WARN, got %+v", states) + if len(states) != 1 || states[0].Status != sdk.StatusUnknown { + t.Fatalf("want single UNKNOWN, got %+v", states) } }) } diff --git a/checker/types.go b/checker/types.go index 44a2356..ba9d064 100644 --- a/checker/types.go +++ b/checker/types.go @@ -47,9 +47,10 @@ type ChainTermination struct { Subject string `json:"subject,omitempty"` Detail string `json:"detail,omitempty"` Rcode string `json:"rcode,omitempty"` // only with TermRcode - // Transient is meaningful with TermQueryErr: true when the query could not be - // completed because of a transport/resolver fault (could not observe), false - // when it stems from definitive evidence such as a target with no locatable apex. + // Transient is meaningful with TermQueryErr and TermRcode: true when the failure + // could not be observed as a definitive answer (a transport/resolver fault, or a + // SERVFAIL/REFUSED from every auth server), false when it stems from definitive + // evidence such as a target with no locatable apex or an authoritative NXDOMAIN. Transient bool `json:"transient,omitempty"` } @@ -73,7 +74,11 @@ type AliasData struct { FinalTarget string `json:"final_target,omitempty"` FinalA []string `json:"final_a,omitempty"` FinalAAAA []string `json:"final_aaaa,omitempty"` - FinalRcode string `json:"final_rcode,omitempty"` + // FinalRcode is the non-NOERROR rcode of the final A/AAAA lookup, if any; + // FinalRcodeTransient is true when it was a SERVFAIL/REFUSED (could not observe) + // rather than a definitive negative answer. + FinalRcode string `json:"final_rcode,omitempty"` + FinalRcodeTransient bool `json:"final_rcode_transient,omitempty"` // Coexisting is populated only when Owner has a CNAME. Coexisting []CoexistingRRset `json:"coexisting,omitempty"`