Compare commits

...

1 commit

Author SHA1 Message Date
65687ce375 checker: report transient mid-chain and final rcodes as Unknown, not Crit/Warn
All checks were successful
continuous-integration/drone/tag Build is passing
continuous-integration/drone/push Build is passing
SERVFAIL/REFUSED from every auth server means the record could not be
observed, not that the zone published a negative answer. Mark such rcodes
transient on TermRcode terminations and final A/AAAA lookups so chainRcodeRule
reports Unknown instead of flapping the check into Crit/Warn; definitive
NXDOMAIN answers still drive Crit (mid-chain) and Warn (final).
2026-06-18 11:22:08 +09:00
4 changed files with 65 additions and 16 deletions

View file

@ -137,11 +137,15 @@ func (c *chainCtx) walk(ctx context.Context, name string) {
if r.Rcode != dns.RcodeSuccess { if r.Rcode != dns.RcodeSuccess {
rcode := rcodeText(r.Rcode) rcode := rcodeText(r.Rcode)
// A SERVFAIL/REFUSED from every auth server means we could not observe
// the record, not that the zone published a negative answer; mark it
// transient so the rule reports Unknown instead of Crit.
c.data.ChainTerminated = ChainTermination{ c.data.ChainTerminated = ChainTermination{
Reason: TermRcode, Reason: TermRcode,
Subject: current, Subject: current,
Rcode: rcode, Rcode: rcode,
Detail: fmt.Sprintf("server answered %s for %s", rcode, current), Detail: fmt.Sprintf("server answered %s for %s", rcode, current),
Transient: isTransientRcode(r.Rcode),
} }
c.data.FinalTarget = current c.data.FinalTarget = current
return return
@ -280,8 +284,9 @@ func extractCNAME(r *dns.Msg, owner string) (target string, fromDNAME bool, ttl
func (c *chainCtx) resolveFinal(ctx context.Context, name string, servers []string) { func (c *chainCtx) resolveFinal(ctx context.Context, name string, servers []string) {
type result struct { type result struct {
addrs []string addrs []string
rcode string rcode string
transient bool
} }
query := func(qtype uint16) result { query := func(qtype uint16) result {
@ -301,6 +306,7 @@ func (c *chainCtx) resolveFinal(ctx context.Context, name string, servers []stri
var res result var res result
if r.Rcode != dns.RcodeSuccess { if r.Rcode != dns.RcodeSuccess {
res.rcode = rcodeText(r.Rcode) res.rcode = rcodeText(r.Rcode)
res.transient = isTransientRcode(r.Rcode)
} }
for _, rr := range r.Answer { for _, rr := range r.Answer {
switch v := rr.(type) { switch v := rr.(type) {
@ -332,8 +338,10 @@ func (c *chainCtx) resolveFinal(ctx context.Context, name string, servers []stri
switch { switch {
case aRes.rcode != "": case aRes.rcode != "":
c.data.FinalRcode = aRes.rcode c.data.FinalRcode = aRes.rcode
c.data.FinalRcodeTransient = aRes.transient
case aaaaRes.rcode != "": case aaaaRes.rcode != "":
c.data.FinalRcode = aaaaRes.rcode c.data.FinalRcode = aaaaRes.rcode
c.data.FinalRcodeTransient = aaaaRes.transient
} }
} }

View file

@ -125,18 +125,36 @@ func (chainRcodeRule) Evaluate(ctx context.Context, obs sdk.ObservationGetter, _
} }
var out []sdk.CheckState var out []sdk.CheckState
if data.ChainTerminated.Reason == TermRcode { if data.ChainTerminated.Reason == TermRcode {
// A transient rcode (SERVFAIL/REFUSED from every auth server) means we could
// not observe the record, not that the zone published a negative answer:
// report it as Unknown so a flaky server does not flap the check into Crit.
// A definitive NXDOMAIN mid-chain is a real break and stays Crit.
status := sdk.StatusCrit
hint := "Ensure the zone publishes the expected record; NXDOMAIN mid-chain breaks the alias."
if data.ChainTerminated.Transient {
status = sdk.StatusUnknown
hint = "Check authoritative-server reachability; SERVFAIL/REFUSED from every server leaves the alias state undetermined."
}
out = append(out, withHint(sdk.CheckState{ out = append(out, withHint(sdk.CheckState{
Status: sdk.StatusCrit, Status: status,
Subject: data.ChainTerminated.Subject, Subject: data.ChainTerminated.Subject,
Message: fmt.Sprintf("server answered %s mid-chain", data.ChainTerminated.Rcode), Message: fmt.Sprintf("server answered %s mid-chain", data.ChainTerminated.Rcode),
}, "Ensure the zone publishes the expected record; NXDOMAIN/SERVFAIL mid-chain breaks the alias.")) }, hint))
} }
if data.FinalRcode != "" && data.FinalRcode != "NOERROR" { if data.FinalRcode != "" && data.FinalRcode != "NOERROR" {
// Same distinction for the final A/AAAA lookup: a SERVFAIL/REFUSED could not
// be observed (Unknown), a definitive rcode is a real publication gap (Warn).
status := sdk.StatusWarn
hint := "Check the upstream zone's A/AAAA publication."
if data.FinalRcodeTransient {
status = sdk.StatusUnknown
hint = "Check the upstream auth servers' reachability; the final A/AAAA state could not be determined."
}
out = append(out, withHint(sdk.CheckState{ out = append(out, withHint(sdk.CheckState{
Status: sdk.StatusWarn, Status: status,
Subject: data.FinalTarget, Subject: data.FinalTarget,
Message: fmt.Sprintf("final A lookup for %s returned %s", data.FinalTarget, data.FinalRcode), Message: fmt.Sprintf("final A lookup for %s returned %s", data.FinalTarget, data.FinalRcode),
}, "Check the upstream zone's A/AAAA publication.")) }, hint))
} }
if len(out) == 0 { if len(out) == 0 {
return okState(data.Owner, "all chain and final lookups returned NOERROR") return okState(data.Owner, "all chain and final lookups returned NOERROR")

View file

@ -162,14 +162,32 @@ func TestChainRcodeRule(t *testing.T) {
d.ChainTerminated = ChainTermination{Reason: TermRcode, Subject: "gone.example.com.", Rcode: "NXDOMAIN"} d.ChainTerminated = ChainTermination{Reason: TermRcode, Subject: "gone.example.com.", Rcode: "NXDOMAIN"}
assertSingle(t, run(chainRcodeRule{}, d, nil), sdk.StatusCrit) assertSingle(t, run(chainRcodeRule{}, d, nil), sdk.StatusCrit)
}) })
t.Run("final rcode", func(t *testing.T) { t.Run("mid-chain transient SERVFAIL", func(t *testing.T) {
// SERVFAIL from every auth server could not be observed: Unknown, not Crit.
d := apexKnownData()
d.ChainTerminated = ChainTermination{Reason: TermRcode, Subject: "flaky.example.com.", Rcode: "SERVFAIL", Transient: true}
assertSingle(t, run(chainRcodeRule{}, d, nil), sdk.StatusUnknown)
})
t.Run("final definitive rcode", func(t *testing.T) {
d := apexKnownData()
d.ChainTerminated.Reason = TermOK
d.FinalTarget = "target.example."
d.FinalRcode = "NXDOMAIN"
states := run(chainRcodeRule{}, d, nil)
if len(states) != 1 || states[0].Status != sdk.StatusWarn {
t.Fatalf("want single WARN, got %+v", states)
}
})
t.Run("final transient rcode", func(t *testing.T) {
// SERVFAIL on the final lookup could not be observed: Unknown, not Warn.
d := apexKnownData() d := apexKnownData()
d.ChainTerminated.Reason = TermOK d.ChainTerminated.Reason = TermOK
d.FinalTarget = "target.example." d.FinalTarget = "target.example."
d.FinalRcode = "SERVFAIL" d.FinalRcode = "SERVFAIL"
d.FinalRcodeTransient = true
states := run(chainRcodeRule{}, d, nil) states := run(chainRcodeRule{}, d, nil)
if len(states) != 1 || states[0].Status != sdk.StatusWarn { if len(states) != 1 || states[0].Status != sdk.StatusUnknown {
t.Fatalf("want single WARN, got %+v", states) t.Fatalf("want single UNKNOWN, got %+v", states)
} }
}) })
} }

View file

@ -47,9 +47,10 @@ type ChainTermination struct {
Subject string `json:"subject,omitempty"` Subject string `json:"subject,omitempty"`
Detail string `json:"detail,omitempty"` Detail string `json:"detail,omitempty"`
Rcode string `json:"rcode,omitempty"` // only with TermRcode Rcode string `json:"rcode,omitempty"` // only with TermRcode
// Transient is meaningful with TermQueryErr: true when the query could not be // Transient is meaningful with TermQueryErr and TermRcode: true when the failure
// completed because of a transport/resolver fault (could not observe), false // could not be observed as a definitive answer (a transport/resolver fault, or a
// when it stems from definitive evidence such as a target with no locatable apex. // SERVFAIL/REFUSED from every auth server), false when it stems from definitive
// evidence such as a target with no locatable apex or an authoritative NXDOMAIN.
Transient bool `json:"transient,omitempty"` Transient bool `json:"transient,omitempty"`
} }
@ -73,7 +74,11 @@ type AliasData struct {
FinalTarget string `json:"final_target,omitempty"` FinalTarget string `json:"final_target,omitempty"`
FinalA []string `json:"final_a,omitempty"` FinalA []string `json:"final_a,omitempty"`
FinalAAAA []string `json:"final_aaaa,omitempty"` FinalAAAA []string `json:"final_aaaa,omitempty"`
FinalRcode string `json:"final_rcode,omitempty"` // FinalRcode is the non-NOERROR rcode of the final A/AAAA lookup, if any;
// FinalRcodeTransient is true when it was a SERVFAIL/REFUSED (could not observe)
// rather than a definitive negative answer.
FinalRcode string `json:"final_rcode,omitempty"`
FinalRcodeTransient bool `json:"final_rcode_transient,omitempty"`
// Coexisting is populated only when Owner has a CNAME. // Coexisting is populated only when Owner has a CNAME.
Coexisting []CoexistingRRset `json:"coexisting,omitempty"` Coexisting []CoexistingRRset `json:"coexisting,omitempty"`