checker: report transient apex-lookup failures as Unknown, not Crit
apexLookupRule mapped every findApex failure to Crit, including transport and resolver faults like "lookup nemunai.re on 127.0.0.11:53: server misbehaving", which indicate a flaky recursive resolver, not a broken delegation. That made the check flap into Crit whenever the resolver hiccuped: the same class of false alarm that was already fixed on the chain path. Mark apex-lookup failures that stem from a transport/resolver fault (resolveZoneNSAddrs net errors, recursiveExchange transport errors, and SERVFAIL/REFUSED seen during the SOA walk) as transient via a typed error, surface that as ApexLookupTransient, and have apexLookupRule report Unknown for those. Definitive failures (NXDOMAIN-only walk, no resolvable NS) still drive Crit.
This commit is contained in:
parent
af0dceca6c
commit
da6def100c
7 changed files with 123 additions and 23 deletions
|
|
@ -2,6 +2,7 @@ package checker
|
|||
|
||||
import (
|
||||
"context"
|
||||
"errors"
|
||||
"fmt"
|
||||
"net"
|
||||
"strings"
|
||||
|
|
@ -61,14 +62,22 @@ func hostPort(host, port string) string {
|
|||
|
||||
func findApex(ctx context.Context, fqdn, resolver string) (apex string, servers []string, err error) {
|
||||
labels := dns.SplitDomainName(fqdn)
|
||||
// transientSeen records whether any candidate failed for a transport or
|
||||
// SERVFAIL/REFUSED reason, so a fall-through "could not locate apex" caused by
|
||||
// a flaky recursive resolver is reported as transient rather than definitive.
|
||||
transientSeen := false
|
||||
for i := range labels {
|
||||
candidate := dns.Fqdn(strings.Join(labels[i:], "."))
|
||||
q := dns.Question{Name: candidate, Qtype: dns.TypeSOA, Qclass: dns.ClassINET}
|
||||
r, rerr := recursiveExchange(ctx, resolver, q)
|
||||
if rerr != nil {
|
||||
transientSeen = true
|
||||
continue
|
||||
}
|
||||
if r.Rcode != dns.RcodeSuccess {
|
||||
if isTransientRcode(r.Rcode) {
|
||||
transientSeen = true
|
||||
}
|
||||
continue
|
||||
}
|
||||
hasSOA := false
|
||||
|
|
@ -87,14 +96,20 @@ func findApex(ctx context.Context, fqdn, resolver string) (apex string, servers
|
|||
apex = candidate
|
||||
servers, err = resolveZoneNSAddrs(ctx, apex)
|
||||
if err != nil {
|
||||
return "", nil, err
|
||||
// A resolver fault (e.g. "server misbehaving") means we could not
|
||||
// observe the apex's NS, not that the delegation is broken.
|
||||
return "", nil, transientApexError{err}
|
||||
}
|
||||
if len(servers) == 0 {
|
||||
return "", nil, fmt.Errorf("apex %s has no resolvable NS", apex)
|
||||
}
|
||||
return apex, servers, nil
|
||||
}
|
||||
return "", nil, fmt.Errorf("could not locate apex of %s", fqdn)
|
||||
err = fmt.Errorf("could not locate apex of %s", fqdn)
|
||||
if transientSeen {
|
||||
return "", nil, transientApexError{err}
|
||||
}
|
||||
return "", nil, err
|
||||
}
|
||||
|
||||
func resolveZoneNSAddrs(ctx context.Context, zone string) ([]string, error) {
|
||||
|
|
@ -160,6 +175,19 @@ func queryAtAuth(ctx context.Context, proto string, servers []string, q dns.Ques
|
|||
return nil, "", lastErr
|
||||
}
|
||||
|
||||
// transientApexError marks an apex-lookup failure that stems from a transport or
|
||||
// resolver fault rather than definitive DNS evidence, so apexLookupRule can
|
||||
// report it as Unknown instead of flapping the check into Crit.
|
||||
type transientApexError struct {
	// err is the underlying lookup failure being tagged as transient.
	err error
}
|
||||
|
||||
// Error returns the wrapped error's message unchanged, so tagging a failure as
// transient does not alter the text reported for it.
func (e transientApexError) Error() string {
	return e.err.Error()
}
|
||||
// Unwrap exposes the wrapped error so errors.Is and errors.As can inspect the
// underlying cause through the transient marker.
func (e transientApexError) Unwrap() error {
	return e.err
}
|
||||
|
||||
// isTransientApexError reports whether err is, or wraps, a transientApexError.
func isTransientApexError(err error) bool {
|
||||
var t transientApexError
|
||||
// errors.As searches err's wrap chain for a transientApexError value.
return errors.As(err, &t)
|
||||
}
|
||||
|
||||
// isTransientRcode reports whether an rcode is worth retrying against another
|
||||
// auth server rather than treating as the zone's final answer. SERVFAIL and
|
||||
// REFUSED are typically per-server faults (backend down, server not yet loaded
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue