checker-alias/checker/dns.go
Pierre-Olivier Mercier da6def100c
All checks were successful
continuous-integration/drone/push Build is passing
continuous-integration/drone/tag Build is passing
checker: report transient apex-lookup failures as Unknown, not Crit
apexLookupRule mapped every findApex failure to Crit, including transport
and resolver faults like "lookup nemunai.re on 127.0.0.11:53: server
misbehaving" — a flaky recursive resolver, not a broken delegation. That
made the check flap into Crit whenever the resolver hiccuped, the same
class of false negative the chain path already fixed.

Mark apex-lookup failures that stem from a transport/resolver fault
(resolveZoneNSAddrs net errors, recursiveExchange transport errors, and
SERVFAIL/REFUSED seen during the SOA walk) as transient via a typed
error, surface it as ApexLookupTransient, and have apexLookupRule report
Unknown for those. Definitive failures (NXDOMAIN-only walk, no resolvable
NS) still drive Crit.
2026-06-18 10:29:30 +09:00

208 lines
6.2 KiB
Go

package checker
import (
"context"
"errors"
"fmt"
"net"
"strings"
"sync"
"time"
"github.com/miekg/dns"
)
// dnsTimeout is the timeout given to the DNS client for a single exchange.
// dnsExchange shortens it when the context deadline is closer than that.
const dnsTimeout = 5 * time.Second
// dnsExchange sends a single query for q to server using the proto transport
// (passed through as dns.Client.Net) and returns the response.
//
// rd sets the Recursion Desired bit. dnssec=true requests DNSSEC RRs (DO bit);
// pass false for plain chain walks to keep responses small.
//
// The exchange is bounded by dnsTimeout and by ctx. An already cancelled or
// expired context fails immediately with ctx.Err() rather than spending a full
// dnsTimeout on a query nobody is waiting for, and ctx is handed to the client
// so its deadline also bounds the dial and the exchange.
//
// It returns an error on transport failure or when the client yields a nil
// response; the rcode of a received response is left for the caller to judge.
func dnsExchange(ctx context.Context, proto, server string, q dns.Question, rd, dnssec bool) (*dns.Msg, error) {
	// Without this guard an expired deadline (d <= 0 below) would leave the
	// client at the full dnsTimeout.
	if err := ctx.Err(); err != nil {
		return nil, err
	}

	client := dns.Client{Net: proto, Timeout: dnsTimeout}
	if deadline, ok := ctx.Deadline(); ok {
		if d := time.Until(deadline); d > 0 && d < client.Timeout {
			client.Timeout = d
		}
	}

	m := new(dns.Msg)
	m.Id = dns.Id()
	m.Question = []dns.Question{q}
	m.RecursionDesired = rd
	m.SetEdns0(4096, dnssec)

	r, _, err := client.ExchangeContext(ctx, m, server)
	if err != nil {
		return nil, err
	}
	if r == nil {
		return nil, fmt.Errorf("nil response from %s", server)
	}
	return r, nil
}
// recursiveExchange asks a recursive resolver for q: it is dnsExchange with
// the default transport, recursion desired, and no DNSSEC records requested.
func recursiveExchange(ctx context.Context, server string, q dns.Question) (*dns.Msg, error) {
	const (
		defaultProto     = ""
		recursionDesired = true
		wantDNSSEC       = false
	)
	return dnsExchange(ctx, defaultProto, server, q, recursionDesired, wantDNSSEC)
}
// systemResolver returns the "host:port" of the first name server listed in
// /etc/resolv.conf. When the file cannot be read or lists no server (as in
// scratch containers, where it is absent) it falls back to 1.1.1.1:53. That
// fallback leaks queries to Cloudflare; operators that care should mount a
// resolv.conf.
func systemResolver() string {
	if cfg, err := dns.ClientConfigFromFile("/etc/resolv.conf"); err == nil && len(cfg.Servers) > 0 {
		return net.JoinHostPort(cfg.Servers[0], cfg.Port)
	}
	return net.JoinHostPort("1.1.1.1", "53")
}
// hostPort joins host and port into a dialable "host:port" string after
// dropping one trailing root dot from host. IPv6 literals are bracketed by
// net.JoinHostPort.
func hostPort(host, port string) string {
	bare := strings.TrimSuffix(host, ".")
	return net.JoinHostPort(bare, port)
}
// findApex locates the zone apex enclosing fqdn and the addresses of that
// zone's name servers.
//
// It walks fqdn from the full name towards its last label, asking resolver
// (used as the server of a recursive query) for the SOA of each candidate.
// The first candidate that answers NOERROR with a SOA it owns itself is the
// apex; its NS addresses are then obtained through resolveZoneNSAddrs.
//
// Errors fall in two classes:
//   - a transientApexError when the failure stems from a transport or resolver
//     fault: a query error or SERVFAIL/REFUSED seen during the walk, a
//     cancelled or expired ctx, or a non-definitive NS lookup error;
//   - a plain error for definitive evidence: no candidate owns a SOA and no
//     fault was seen, the NS lookup answered "not found", or no NS resolved
//     to an address.
func findApex(ctx context.Context, fqdn, resolver string) (apex string, servers []string, err error) {
	labels := dns.SplitDomainName(fqdn)
	// transientSeen records whether any candidate failed for a transport or
	// SERVFAIL/REFUSED reason, so a fall-through "could not locate apex" caused
	// by a flaky recursive resolver is reported as transient, not definitive.
	transientSeen := false
	for i := range labels {
		// Once ctx is done every further query is bound to fail; stop the walk
		// instead of issuing one doomed query per remaining label.
		if ctx.Err() != nil {
			transientSeen = true
			break
		}

		candidate := dns.Fqdn(strings.Join(labels[i:], "."))
		q := dns.Question{Name: candidate, Qtype: dns.TypeSOA, Qclass: dns.ClassINET}
		r, rerr := recursiveExchange(ctx, resolver, q)
		if rerr != nil {
			transientSeen = true
			continue
		}
		if r.Rcode != dns.RcodeSuccess {
			if isTransientRcode(r.Rcode) {
				transientSeen = true
			}
			continue
		}

		hasSOA := false
		for _, rr := range r.Answer {
			// Only accept a SOA whose owner is the candidate itself: when the
			// candidate is a CNAME, the resolver returns the target zone's SOA,
			// which is not evidence that the candidate is an apex.
			if soa, ok := rr.(*dns.SOA); ok && lowerFQDN(soa.Header().Name) == lowerFQDN(candidate) {
				hasSOA = true
				break
			}
		}
		if !hasSOA {
			continue
		}

		addrs, nsErr := resolveZoneNSAddrs(ctx, candidate)
		if nsErr != nil {
			// A "not found" answer is definitive evidence that the apex has no
			// NS, so it must not be downgraded to transient.
			var dnsErr *net.DNSError
			if errors.As(nsErr, &dnsErr) && dnsErr.IsNotFound {
				return "", nil, fmt.Errorf("apex %s has no resolvable NS: %w", candidate, nsErr)
			}
			// Any other resolver fault (e.g. "server misbehaving") means we
			// could not observe the apex's NS, not that the delegation is
			// broken.
			return "", nil, transientApexError{nsErr}
		}
		if len(addrs) == 0 {
			return "", nil, fmt.Errorf("apex %s has no resolvable NS", candidate)
		}
		return candidate, addrs, nil
	}

	notFound := fmt.Errorf("could not locate apex of %s", fqdn)
	if transientSeen {
		return "", nil, transientApexError{notFound}
	}
	return "", nil, notFound
}
// resolveZoneNSAddrs looks up the NS set of zone with a zero-value
// net.Resolver and resolves every name server host concurrently. It returns
// the addresses as "addr:53" strings, grouped in the order the NS records were
// returned.
//
// A failed NS lookup is returned as-is. A name server whose host lookup fails
// is skipped as long as some other server yields an address. When no address
// could be collected at all and at least one host lookup failed for a reason
// other than "not found", that failure is returned (wrapped) so the caller can
// tell a flaky resolver apart from a zone whose NS genuinely do not resolve;
// in the latter case the result is (nil, nil).
func resolveZoneNSAddrs(ctx context.Context, zone string) ([]string, error) {
	var resolver net.Resolver
	nss, err := resolver.LookupNS(ctx, strings.TrimSuffix(zone, "."))
	if err != nil {
		return nil, err
	}

	// One slot per NS keeps the goroutines from sharing any element, so no
	// lock is needed and the output order is deterministic.
	results := make([][]string, len(nss))
	lookupErrs := make([]error, len(nss))
	var wg sync.WaitGroup
	wg.Add(len(nss))
	for i, ns := range nss {
		// Pass the loop values explicitly so the goroutine is correct under
		// both pre- and post-Go 1.22 loop variable semantics.
		go func(slot int, host string) {
			defer wg.Done()
			addrs, err := resolver.LookupHost(ctx, strings.TrimSuffix(host, "."))
			if err != nil {
				lookupErrs[slot] = err
				return
			}
			r := make([]string, len(addrs))
			for j, a := range addrs {
				r[j] = hostPort(a, "53")
			}
			results[slot] = r
		}(i, ns.Host)
	}
	wg.Wait()

	var out []string
	for _, r := range results {
		out = append(out, r...)
	}
	if len(out) > 0 {
		return out, nil
	}

	// Nothing resolved: surface a non-definitive failure instead of letting it
	// masquerade as "this zone has no resolvable NS".
	for _, lerr := range lookupErrs {
		if lerr == nil {
			continue
		}
		var dnsErr *net.DNSError
		if errors.As(lerr, &dnsErr) && dnsErr.IsNotFound {
			continue
		}
		return nil, fmt.Errorf("resolving name servers of %s: %w", zone, lerr)
	}
	return nil, nil
}
// queryAtAuth sends q (recursion not desired) to each of servers in order and
// returns the first definitive response together with the server that gave it.
//
// A transport error or a transient rcode (SERVFAIL/REFUSED) moves on to the
// next server, so a single flaky auth server cannot decide the verdict; any
// other rcode (NOERROR, NXDOMAIN, ...) is returned at once. When every server
// fails, the last transient response is returned if there was one (so callers
// can still inspect the rcode), otherwise the last transport error. An empty
// server list yields a "no servers provided" error.
//
// dnssec=true sets the DO bit; only the DNSSEC probes need it.
func queryAtAuth(ctx context.Context, proto string, servers []string, q dns.Question, dnssec bool) (*dns.Msg, string, error) {
	var (
		transportErr error
		softMsg      *dns.Msg
		softServer   string
	)

	for idx := range servers {
		addr := servers[idx]
		resp, err := dnsExchange(ctx, proto, addr, q, false, dnssec)
		switch {
		case err != nil:
			transportErr = err
		case isTransientRcode(resp.Rcode):
			softMsg, softServer = resp, addr
		default:
			return resp, addr, nil
		}
	}

	switch {
	case softMsg != nil:
		return softMsg, softServer, nil
	case transportErr != nil:
		return nil, "", transportErr
	default:
		return nil, "", fmt.Errorf("no servers provided")
	}
}
// transientApexError wraps an apex-lookup failure caused by a transport or
// resolver fault rather than by definitive DNS evidence. The marker lets
// apexLookupRule report such failures as Unknown instead of flapping the check
// into Crit.
type transientApexError struct {
	err error
}

// Error returns the wrapped error's message unchanged.
func (t transientApexError) Error() string {
	return t.err.Error()
}

// Unwrap exposes the wrapped error to errors.Is and errors.As.
func (t transientApexError) Unwrap() error {
	return t.err
}
// isTransientApexError reports whether err, or any error it wraps, is a
// transientApexError.
func isTransientApexError(err error) bool {
	target := new(transientApexError)
	return errors.As(err, target)
}
// isTransientRcode reports whether rcode is SERVFAIL or REFUSED. Those are
// typically per-server faults (backend down, server has not yet loaded the
// zone) worth retrying against another auth server, unlike NXDOMAIN, which is
// an authoritative negative answer.
func isTransientRcode(rcode int) bool {
	switch rcode {
	case dns.RcodeServerFailure, dns.RcodeRefused:
		return true
	default:
		return false
	}
}
// rcodeText returns the name registered for rcode r in dns.RcodeToString, or
// "RCODE(<n>)" when the value has no entry there.
func rcodeText(r int) string {
	name, known := dns.RcodeToString[r]
	if !known {
		return fmt.Sprintf("RCODE(%d)", r)
	}
	return name
}
// lowerFQDN normalises name for comparison: it is passed through dns.Fqdn and
// then lower-cased.
func lowerFQDN(name string) string {
	absolute := dns.Fqdn(name)
	return strings.ToLower(absolute)
}