checker: fail over to other auth servers on SERVFAIL/REFUSED

queryAtAuth already failed over on transport errors but treated any DNS
response as final, so a SERVFAIL from the first auth server terminated the
chain as Crit even when a sibling server would answer NOERROR. This made
the check flap against a flaky server. Treat SERVFAIL/REFUSED as transient
and try the remaining servers, returning a definitive answer when any
server gives one and only falling back to the transient response (or the
last transport error) when every server fails.
This commit is contained in:
nemunaire 2026-06-18 09:30:56 +09:00
commit af0dceca6c
2 changed files with 115 additions and 2 deletions

View file

@ -128,24 +128,46 @@ func resolveZoneNSAddrs(ctx context.Context, zone string) ([]string, error) {
return out, nil
}
// queryAtAuth asks each address in servers, in order, the question q over
// proto and returns the first definitive response together with the server
// that produced it.
//
// Two kinds of outcome cause failover to the next server instead of ending
// the walk:
//   - a transport error from dnsExchange, and
//   - a response whose rcode isTransientRcode reports as transient
//     (SERVFAIL/REFUSED).
//
// This keeps a single flaky auth server from deciding the verdict. Any other
// response (NOERROR, NXDOMAIN, ...) is returned immediately with a nil error.
//
// When no server gives a definitive answer the result is, in order of
// preference:
//  1. the most recent transient response and its server, with a nil error, so
//     callers can still inspect the rcode;
//  2. the most recent transport error;
//  3. a "no servers provided" error when servers was empty.
//
// dnssec=true sets the DO bit; only the DNSSEC probes need it.
func queryAtAuth(ctx context.Context, proto string, servers []string, q dns.Question, dnssec bool) (*dns.Msg, string, error) {
	var (
		transportErr error    // last error returned by dnsExchange
		fallback     *dns.Msg // last SERVFAIL/REFUSED response seen
		fallbackFrom string   // server that sent fallback
	)

	for i := range servers {
		addr := servers[i]
		resp, err := dnsExchange(ctx, proto, addr, q, false, dnssec)
		switch {
		case err != nil:
			// Could not talk to this server at all; remember why and move on.
			transportErr = err
		case isTransientRcode(resp.Rcode):
			// Likely a per-server fault: keep it as a fallback answer, but
			// give the remaining servers a chance to answer definitively.
			fallback, fallbackFrom = resp, addr
		default:
			return resp, addr, nil
		}
	}

	// Nobody answered definitively. A transient response is preferred over a
	// transport error, regardless of which of the two happened last.
	switch {
	case fallback != nil:
		return fallback, fallbackFrom, nil
	case transportErr != nil:
		return nil, "", transportErr
	default:
		return nil, "", fmt.Errorf("no servers provided")
	}
}
// isTransientRcode reports whether rcode should trigger a retry against
// another auth server instead of being accepted as the zone's final answer.
// Only SERVFAIL and REFUSED qualify: they are typically per-server faults
// (backend down, zone not loaded yet). Every other rcode, including NXDOMAIN
// (an authoritative negative answer), is treated as definitive.
func isTransientRcode(rcode int) bool {
	switch rcode {
	case dns.RcodeServerFailure, dns.RcodeRefused:
		return true
	default:
		return false
	}
}
func rcodeText(r int) string {
if s, ok := dns.RcodeToString[r]; ok {
return s