queryAtAuth already failed over on transport errors but treated any DNS response as final, so a SERVFAIL from the first auth server terminated the chain as Crit even when a sibling server would answer NOERROR. This made the check flap against a flaky server. Treat SERVFAIL/REFUSED as transient and try the remaining servers, returning a definitive answer when any server gives one and only falling back to the transient response (or the last transport error) when every server fails.
180 lines
5.1 KiB
Go
180 lines
5.1 KiB
Go
package checker
|
|
|
|
import (
|
|
"context"
|
|
"fmt"
|
|
"net"
|
|
"strings"
|
|
"sync"
|
|
"time"
|
|
|
|
"github.com/miekg/dns"
|
|
)
|
|
|
|
// dnsTimeout is the default timeout assigned to the DNS client for a single
// exchange in dnsExchange; a shorter context deadline replaces it there.
const dnsTimeout = 5 * time.Second
|
|
|
|
// dnsExchange sends a single query. dnssec=true requests DNSSEC RRs (DO bit);
|
|
// pass false for plain chain walks to keep responses small.
|
|
func dnsExchange(ctx context.Context, proto, server string, q dns.Question, rd, dnssec bool) (*dns.Msg, error) {
|
|
client := dns.Client{Net: proto, Timeout: dnsTimeout}
|
|
|
|
m := new(dns.Msg)
|
|
m.Id = dns.Id()
|
|
m.Question = []dns.Question{q}
|
|
m.RecursionDesired = rd
|
|
m.SetEdns0(4096, dnssec)
|
|
|
|
if deadline, ok := ctx.Deadline(); ok {
|
|
if d := time.Until(deadline); d > 0 && d < client.Timeout {
|
|
client.Timeout = d
|
|
}
|
|
}
|
|
|
|
r, _, err := client.Exchange(m, server)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
if r == nil {
|
|
return nil, fmt.Errorf("nil response from %s", server)
|
|
}
|
|
return r, nil
|
|
}
|
|
|
|
func recursiveExchange(ctx context.Context, server string, q dns.Question) (*dns.Msg, error) {
|
|
return dnsExchange(ctx, "", server, q, true, false)
|
|
}
|
|
|
|
// systemResolver reads /etc/resolv.conf, falling back to 1.1.1.1 in scratch
|
|
// containers where the file is absent. The fallback leaks queries to
|
|
// Cloudflare; operators that care should mount a resolv.conf.
|
|
func systemResolver() string {
|
|
cfg, err := dns.ClientConfigFromFile("/etc/resolv.conf")
|
|
if err != nil || len(cfg.Servers) == 0 {
|
|
return net.JoinHostPort("1.1.1.1", "53")
|
|
}
|
|
return net.JoinHostPort(cfg.Servers[0], cfg.Port)
|
|
}
|
|
|
|
// hostPort joins host and port into a dialable "host:port" address. A single
// trailing dot on host (the FQDN root label) is removed first; IPv6 literals
// are bracketed by net.JoinHostPort.
func hostPort(host, port string) string {
	bare := strings.TrimSuffix(host, ".")
	return net.JoinHostPort(bare, port)
}
|
|
|
|
func findApex(ctx context.Context, fqdn, resolver string) (apex string, servers []string, err error) {
|
|
labels := dns.SplitDomainName(fqdn)
|
|
for i := range labels {
|
|
candidate := dns.Fqdn(strings.Join(labels[i:], "."))
|
|
q := dns.Question{Name: candidate, Qtype: dns.TypeSOA, Qclass: dns.ClassINET}
|
|
r, rerr := recursiveExchange(ctx, resolver, q)
|
|
if rerr != nil {
|
|
continue
|
|
}
|
|
if r.Rcode != dns.RcodeSuccess {
|
|
continue
|
|
}
|
|
hasSOA := false
|
|
for _, rr := range r.Answer {
|
|
// Only accept a SOA whose owner is the candidate itself: when the
|
|
// candidate is a CNAME, the resolver returns the target zone's SOA,
|
|
// which is not evidence that the candidate is an apex.
|
|
if soa, ok := rr.(*dns.SOA); ok && lowerFQDN(soa.Header().Name) == lowerFQDN(candidate) {
|
|
hasSOA = true
|
|
break
|
|
}
|
|
}
|
|
if !hasSOA {
|
|
continue
|
|
}
|
|
apex = candidate
|
|
servers, err = resolveZoneNSAddrs(ctx, apex)
|
|
if err != nil {
|
|
return "", nil, err
|
|
}
|
|
if len(servers) == 0 {
|
|
return "", nil, fmt.Errorf("apex %s has no resolvable NS", apex)
|
|
}
|
|
return apex, servers, nil
|
|
}
|
|
return "", nil, fmt.Errorf("could not locate apex of %s", fqdn)
|
|
}
|
|
|
|
func resolveZoneNSAddrs(ctx context.Context, zone string) ([]string, error) {
|
|
var resolver net.Resolver
|
|
nss, err := resolver.LookupNS(ctx, strings.TrimSuffix(zone, "."))
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
results := make([][]string, len(nss))
|
|
var wg sync.WaitGroup
|
|
wg.Add(len(nss))
|
|
for i, ns := range nss {
|
|
go func() {
|
|
defer wg.Done()
|
|
addrs, err := resolver.LookupHost(ctx, strings.TrimSuffix(ns.Host, "."))
|
|
if err != nil || len(addrs) == 0 {
|
|
return
|
|
}
|
|
r := make([]string, len(addrs))
|
|
for j, a := range addrs {
|
|
r[j] = hostPort(a, "53")
|
|
}
|
|
results[i] = r
|
|
}()
|
|
}
|
|
wg.Wait()
|
|
var out []string
|
|
for _, r := range results {
|
|
out = append(out, r...)
|
|
}
|
|
return out, nil
|
|
}
|
|
|
|
// queryAtAuth tries each server in order and returns the first definitive
|
|
// answer. Transport errors and transient failures (SERVFAIL/REFUSED) make it
|
|
// fail over to the next server so a single flaky auth server cannot decide the
|
|
// verdict; a definitive response (NOERROR/NXDOMAIN/...) is returned at once.
|
|
// If every server fails it returns the last transient response when there was
|
|
// one (so callers can still inspect the rcode), otherwise the last transport
|
|
// error. dnssec=true sets the DO bit; only the DNSSEC probes need it.
|
|
func queryAtAuth(ctx context.Context, proto string, servers []string, q dns.Question, dnssec bool) (*dns.Msg, string, error) {
|
|
var lastErr error
|
|
var transientMsg *dns.Msg
|
|
var transientServer string
|
|
for _, s := range servers {
|
|
r, err := dnsExchange(ctx, proto, s, q, false, dnssec)
|
|
if err != nil {
|
|
lastErr = err
|
|
continue
|
|
}
|
|
if isTransientRcode(r.Rcode) {
|
|
transientMsg, transientServer = r, s
|
|
continue
|
|
}
|
|
return r, s, nil
|
|
}
|
|
if transientMsg != nil {
|
|
return transientMsg, transientServer, nil
|
|
}
|
|
if lastErr == nil {
|
|
lastErr = fmt.Errorf("no servers provided")
|
|
}
|
|
return nil, "", lastErr
|
|
}
|
|
|
|
// isTransientRcode reports whether an rcode is worth retrying against another
|
|
// auth server rather than treating as the zone's final answer. SERVFAIL and
|
|
// REFUSED are typically per-server faults (backend down, server not yet loaded
|
|
// the zone), unlike NXDOMAIN which is an authoritative negative answer.
|
|
func isTransientRcode(rcode int) bool {
|
|
return rcode == dns.RcodeServerFailure || rcode == dns.RcodeRefused
|
|
}
|
|
|
|
func rcodeText(r int) string {
|
|
if s, ok := dns.RcodeToString[r]; ok {
|
|
return s
|
|
}
|
|
return fmt.Sprintf("RCODE(%d)", r)
|
|
}
|
|
|
|
func lowerFQDN(name string) string {
|
|
return strings.ToLower(dns.Fqdn(name))
|
|
}
|