checker: report transient apex-lookup failures as Unknown, not Crit
All checks were successful
continuous-integration/drone/push Build is passing
continuous-integration/drone/tag Build is passing

apexLookupRule mapped every findApex failure to Crit, including transport
and resolver faults like "lookup nemunai.re on 127.0.0.11:53: server
misbehaving" — a flaky recursive resolver, not a broken delegation. That
made the check flap into Crit whenever the resolver hiccuped, the same
class of false alarm the chain path already fixed.

Mark apex-lookup failures that stem from a transport/resolver fault
(resolveZoneNSAddrs net errors, recursiveExchange transport errors, and
SERVFAIL/REFUSED seen during the SOA walk) as transient via a typed
error, surface it as ApexLookupTransient, and have apexLookupRule report
Unknown for those. Definitive failures (NXDOMAIN-only walk, no resolvable
NS) still drive Crit.
This commit is contained in:
nemunaire 2026-06-18 10:05:51 +09:00
commit da6def100c
7 changed files with 123 additions and 23 deletions

View file

@ -2,6 +2,7 @@ package checker
import (
"context"
"errors"
"fmt"
"net"
"strings"
@ -61,14 +62,22 @@ func hostPort(host, port string) string {
func findApex(ctx context.Context, fqdn, resolver string) (apex string, servers []string, err error) {
labels := dns.SplitDomainName(fqdn)
// transientSeen records whether any candidate failed for a transport or
// SERVFAIL/REFUSED reason, so a fall-through "could not locate apex" caused by
// a flaky recursive resolver is reported as transient rather than definitive.
transientSeen := false
for i := range labels {
candidate := dns.Fqdn(strings.Join(labels[i:], "."))
q := dns.Question{Name: candidate, Qtype: dns.TypeSOA, Qclass: dns.ClassINET}
r, rerr := recursiveExchange(ctx, resolver, q)
if rerr != nil {
transientSeen = true
continue
}
if r.Rcode != dns.RcodeSuccess {
if isTransientRcode(r.Rcode) {
transientSeen = true
}
continue
}
hasSOA := false
@ -87,14 +96,20 @@ func findApex(ctx context.Context, fqdn, resolver string) (apex string, servers
apex = candidate
servers, err = resolveZoneNSAddrs(ctx, apex)
if err != nil {
return "", nil, err
// A resolver fault (e.g. "server misbehaving") means we could not
// observe the apex's NS, not that the delegation is broken.
return "", nil, transientApexError{err}
}
if len(servers) == 0 {
return "", nil, fmt.Errorf("apex %s has no resolvable NS", apex)
}
return apex, servers, nil
}
return "", nil, fmt.Errorf("could not locate apex of %s", fqdn)
err = fmt.Errorf("could not locate apex of %s", fqdn)
if transientSeen {
return "", nil, transientApexError{err}
}
return "", nil, err
}
func resolveZoneNSAddrs(ctx context.Context, zone string) ([]string, error) {
@ -160,6 +175,19 @@ func queryAtAuth(ctx context.Context, proto string, servers []string, q dns.Ques
return nil, "", lastErr
}
// transientApexError wraps an apex-lookup failure caused by a transport or
// resolver fault, as opposed to definitive DNS evidence. It lets
// apexLookupRule report the failure as Unknown rather than flapping the check
// into Crit.
type transientApexError struct {
	err error
}

// Error returns the wrapped error's message unchanged.
func (t transientApexError) Error() string {
	return t.err.Error()
}

// Unwrap exposes the wrapped error so errors.Is and errors.As can see through
// the transient marker.
func (t transientApexError) Unwrap() error {
	return t.err
}

// isTransientApexError reports whether err, or any error in its chain, is a
// transientApexError.
func isTransientApexError(err error) bool {
	return errors.As(err, new(transientApexError))
}
// isTransientRcode reports whether an rcode is worth retrying against another
// auth server rather than treating as the zone's final answer. SERVFAIL and
// REFUSED are typically per-server faults (backend down, server not yet loaded