checker: report transient mid-chain and final rcodes as Unknown, not Crit/Warn
A SERVFAIL/REFUSED from every authoritative server means the record could not be observed, not that the zone published a negative answer. Mark such rcodes as transient on TermRcode terminations and on the final A/AAAA lookups, so that chainRcodeRule reports Unknown instead of flapping the check into Crit/Warn. Definitive NXDOMAIN answers still drive Crit (mid-chain) and Warn (final).
This commit is contained in:
parent
da6def100c
commit
65687ce375
4 changed files with 65 additions and 16 deletions
|
|
@ -137,11 +137,15 @@ func (c *chainCtx) walk(ctx context.Context, name string) {
|
|||
|
||||
if r.Rcode != dns.RcodeSuccess {
|
||||
rcode := rcodeText(r.Rcode)
|
||||
// A SERVFAIL/REFUSED from every auth server means we could not observe
|
||||
// the record, not that the zone published a negative answer; mark it
|
||||
// transient so the rule reports Unknown instead of Crit.
|
||||
c.data.ChainTerminated = ChainTermination{
|
||||
Reason: TermRcode,
|
||||
Subject: current,
|
||||
Rcode: rcode,
|
||||
Detail: fmt.Sprintf("server answered %s for %s", rcode, current),
|
||||
Transient: isTransientRcode(r.Rcode),
|
||||
}
|
||||
c.data.FinalTarget = current
|
||||
return
|
||||
|
|
@ -282,6 +286,7 @@ func (c *chainCtx) resolveFinal(ctx context.Context, name string, servers []stri
|
|||
type result struct {
|
||||
addrs []string
|
||||
rcode string
|
||||
transient bool
|
||||
}
|
||||
|
||||
query := func(qtype uint16) result {
|
||||
|
|
@ -301,6 +306,7 @@ func (c *chainCtx) resolveFinal(ctx context.Context, name string, servers []stri
|
|||
var res result
|
||||
if r.Rcode != dns.RcodeSuccess {
|
||||
res.rcode = rcodeText(r.Rcode)
|
||||
res.transient = isTransientRcode(r.Rcode)
|
||||
}
|
||||
for _, rr := range r.Answer {
|
||||
switch v := rr.(type) {
|
||||
|
|
@ -332,8 +338,10 @@ func (c *chainCtx) resolveFinal(ctx context.Context, name string, servers []stri
|
|||
switch {
|
||||
case aRes.rcode != "":
|
||||
c.data.FinalRcode = aRes.rcode
|
||||
c.data.FinalRcodeTransient = aRes.transient
|
||||
case aaaaRes.rcode != "":
|
||||
c.data.FinalRcode = aaaaRes.rcode
|
||||
c.data.FinalRcodeTransient = aaaaRes.transient
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -125,18 +125,36 @@ func (chainRcodeRule) Evaluate(ctx context.Context, obs sdk.ObservationGetter, _
|
|||
}
|
||||
var out []sdk.CheckState
|
||||
if data.ChainTerminated.Reason == TermRcode {
|
||||
// A transient rcode (SERVFAIL/REFUSED from every auth server) means we could
|
||||
// not observe the record, not that the zone published a negative answer:
|
||||
// report it as Unknown so a flaky server does not flap the check into Crit.
|
||||
// A definitive NXDOMAIN mid-chain is a real break and stays Crit.
|
||||
status := sdk.StatusCrit
|
||||
hint := "Ensure the zone publishes the expected record; NXDOMAIN mid-chain breaks the alias."
|
||||
if data.ChainTerminated.Transient {
|
||||
status = sdk.StatusUnknown
|
||||
hint = "Check authoritative-server reachability; SERVFAIL/REFUSED from every server leaves the alias state undetermined."
|
||||
}
|
||||
out = append(out, withHint(sdk.CheckState{
|
||||
Status: sdk.StatusCrit,
|
||||
Status: status,
|
||||
Subject: data.ChainTerminated.Subject,
|
||||
Message: fmt.Sprintf("server answered %s mid-chain", data.ChainTerminated.Rcode),
|
||||
}, "Ensure the zone publishes the expected record; NXDOMAIN/SERVFAIL mid-chain breaks the alias."))
|
||||
}, hint))
|
||||
}
|
||||
if data.FinalRcode != "" && data.FinalRcode != "NOERROR" {
|
||||
// Same distinction for the final A/AAAA lookup: a SERVFAIL/REFUSED could not
|
||||
// be observed (Unknown), a definitive rcode is a real publication gap (Warn).
|
||||
status := sdk.StatusWarn
|
||||
hint := "Check the upstream zone's A/AAAA publication."
|
||||
if data.FinalRcodeTransient {
|
||||
status = sdk.StatusUnknown
|
||||
hint = "Check the upstream auth servers' reachability; the final A/AAAA state could not be determined."
|
||||
}
|
||||
out = append(out, withHint(sdk.CheckState{
|
||||
Status: sdk.StatusWarn,
|
||||
Status: status,
|
||||
Subject: data.FinalTarget,
|
||||
Message: fmt.Sprintf("final A lookup for %s returned %s", data.FinalTarget, data.FinalRcode),
|
||||
}, "Check the upstream zone's A/AAAA publication."))
|
||||
}, hint))
|
||||
}
|
||||
if len(out) == 0 {
|
||||
return okState(data.Owner, "all chain and final lookups returned NOERROR")
|
||||
|
|
|
|||
|
|
@ -162,14 +162,32 @@ func TestChainRcodeRule(t *testing.T) {
|
|||
d.ChainTerminated = ChainTermination{Reason: TermRcode, Subject: "gone.example.com.", Rcode: "NXDOMAIN"}
|
||||
assertSingle(t, run(chainRcodeRule{}, d, nil), sdk.StatusCrit)
|
||||
})
|
||||
t.Run("final rcode", func(t *testing.T) {
|
||||
t.Run("mid-chain transient SERVFAIL", func(t *testing.T) {
|
||||
// SERVFAIL from every auth server could not be observed: Unknown, not Crit.
|
||||
d := apexKnownData()
|
||||
d.ChainTerminated = ChainTermination{Reason: TermRcode, Subject: "flaky.example.com.", Rcode: "SERVFAIL", Transient: true}
|
||||
assertSingle(t, run(chainRcodeRule{}, d, nil), sdk.StatusUnknown)
|
||||
})
|
||||
t.Run("final definitive rcode", func(t *testing.T) {
|
||||
d := apexKnownData()
|
||||
d.ChainTerminated.Reason = TermOK
|
||||
d.FinalTarget = "target.example."
|
||||
d.FinalRcode = "NXDOMAIN"
|
||||
states := run(chainRcodeRule{}, d, nil)
|
||||
if len(states) != 1 || states[0].Status != sdk.StatusWarn {
|
||||
t.Fatalf("want single WARN, got %+v", states)
|
||||
}
|
||||
})
|
||||
t.Run("final transient rcode", func(t *testing.T) {
|
||||
// SERVFAIL on the final lookup could not be observed: Unknown, not Warn.
|
||||
d := apexKnownData()
|
||||
d.ChainTerminated.Reason = TermOK
|
||||
d.FinalTarget = "target.example."
|
||||
d.FinalRcode = "SERVFAIL"
|
||||
d.FinalRcodeTransient = true
|
||||
states := run(chainRcodeRule{}, d, nil)
|
||||
if len(states) != 1 || states[0].Status != sdk.StatusWarn {
|
||||
t.Fatalf("want single WARN, got %+v", states)
|
||||
if len(states) != 1 || states[0].Status != sdk.StatusUnknown {
|
||||
t.Fatalf("want single UNKNOWN, got %+v", states)
|
||||
}
|
||||
})
|
||||
}
|
||||
|
|
|
|||
|
|
@ -47,9 +47,10 @@ type ChainTermination struct {
|
|||
Subject string `json:"subject,omitempty"`
|
||||
Detail string `json:"detail,omitempty"`
|
||||
Rcode string `json:"rcode,omitempty"` // only with TermRcode
|
||||
// Transient is meaningful with TermQueryErr: true when the query could not be
|
||||
// completed because of a transport/resolver fault (could not observe), false
|
||||
// when it stems from definitive evidence such as a target with no locatable apex.
|
||||
// Transient is meaningful with TermQueryErr and TermRcode: true when the failure
|
||||
// means the record could not be observed (a transport/resolver fault, or a
|
||||
// SERVFAIL/REFUSED from every auth server), false when it stems from definitive
|
||||
// evidence such as a target with no locatable apex or an authoritative NXDOMAIN.
|
||||
Transient bool `json:"transient,omitempty"`
|
||||
}
|
||||
|
||||
|
|
@ -73,7 +74,11 @@ type AliasData struct {
|
|||
FinalTarget string `json:"final_target,omitempty"`
|
||||
FinalA []string `json:"final_a,omitempty"`
|
||||
FinalAAAA []string `json:"final_aaaa,omitempty"`
|
||||
// FinalRcode is the non-NOERROR rcode of the final A/AAAA lookup, if any;
|
||||
// FinalRcodeTransient is true when it was a SERVFAIL/REFUSED (could not observe)
|
||||
// rather than a definitive negative answer.
|
||||
FinalRcode string `json:"final_rcode,omitempty"`
|
||||
FinalRcodeTransient bool `json:"final_rcode_transient,omitempty"`
|
||||
|
||||
// Coexisting is populated only when Owner has a CNAME.
|
||||
Coexisting []CoexistingRRset `json:"coexisting,omitempty"`
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue