checker: report transient mid-chain and final rcodes as Unknown, not Crit/Warn
All checks were successful
continuous-integration/drone/tag Build is passing
continuous-integration/drone/push Build is passing

SERVFAIL/REFUSED from every auth server means the record could not be
observed, not that the zone published a negative answer. Mark such rcodes
transient on TermRcode terminations and final A/AAAA lookups so chainRcodeRule
reports Unknown instead of flapping the check into Crit/Warn; definitive
NXDOMAIN answers still drive Crit (mid-chain) and Warn (final).
This commit is contained in:
nemunaire 2026-06-18 11:22:00 +09:00
commit 65687ce375
4 changed files with 65 additions and 16 deletions

View file

@ -125,18 +125,36 @@ func (chainRcodeRule) Evaluate(ctx context.Context, obs sdk.ObservationGetter, _
}
var out []sdk.CheckState
if data.ChainTerminated.Reason == TermRcode {
// A transient rcode (SERVFAIL/REFUSED from every auth server) means we could
// not observe the record, not that the zone published a negative answer:
// report it as Unknown so a flaky server does not flap the check into Crit.
// A definitive NXDOMAIN mid-chain is a real break and stays Crit.
status := sdk.StatusCrit
hint := "Ensure the zone publishes the expected record; NXDOMAIN mid-chain breaks the alias."
if data.ChainTerminated.Transient {
status = sdk.StatusUnknown
hint = "Check authoritative-server reachability; SERVFAIL/REFUSED from every server leaves the alias state undetermined."
}
out = append(out, withHint(sdk.CheckState{
Status: sdk.StatusCrit,
Status: status,
Subject: data.ChainTerminated.Subject,
Message: fmt.Sprintf("server answered %s mid-chain", data.ChainTerminated.Rcode),
}, "Ensure the zone publishes the expected record; NXDOMAIN/SERVFAIL mid-chain breaks the alias."))
}, hint))
}
if data.FinalRcode != "" && data.FinalRcode != "NOERROR" {
// Same distinction for the final A/AAAA lookup: on SERVFAIL/REFUSED the record
// could not be observed (Unknown); a definitive rcode is a real publication gap (Warn).
status := sdk.StatusWarn
hint := "Check the upstream zone's A/AAAA publication."
if data.FinalRcodeTransient {
status = sdk.StatusUnknown
hint = "Check the upstream auth servers' reachability; the final A/AAAA state could not be determined."
}
out = append(out, withHint(sdk.CheckState{
Status: sdk.StatusWarn,
Status: status,
Subject: data.FinalTarget,
Message: fmt.Sprintf("final A lookup for %s returned %s", data.FinalTarget, data.FinalRcode),
}, "Check the upstream zone's A/AAAA publication."))
}, hint))
}
if len(out) == 0 {
return okState(data.Owner, "all chain and final lookups returned NOERROR")