Compare commits

..

3 commits

Author SHA1 Message Date
da6def100c checker: report transient apex-lookup failures as Unknown, not Crit
All checks were successful
continuous-integration/drone/push Build is passing
continuous-integration/drone/tag Build is passing
apexLookupRule mapped every findApex failure to Crit, including transport
and resolver faults like "lookup nemunai.re on 127.0.0.11:53: server
misbehaving" — a flaky recursive resolver, not a broken delegation. That
made the check flap into Crit whenever the resolver hiccuped, the same
class of false negative the chain path already fixed.

Mark apex-lookup failures that stem from a transport/resolver fault
(resolveZoneNSAddrs net errors, recursiveExchange transport errors, and
SERVFAIL/REFUSED seen during the SOA walk) as transient via a typed
error, surface it as ApexLookupTransient, and have apexLookupRule report
Unknown for those. Definitive failures (NXDOMAIN-only walk, no resolvable
NS) still drive Crit.
2026-06-18 10:29:30 +09:00
af0dceca6c checker: fail over to other auth servers on SERVFAIL/REFUSED
queryAtAuth already failed over on transport errors but treated any DNS
response as final, so a SERVFAIL from the first auth server terminated the
chain as Crit even when a sibling server would answer NOERROR. This made
the check flap against a flaky server. Treat SERVFAIL/REFUSED as transient
and try the remaining servers, returning a definitive answer when any
server gives one and only falling back to the transient response (or the
last transport error) when every server fails.
2026-06-18 09:47:28 +09:00
680a7735f0 checker: report chain transport errors as Unknown, not Warn
A transport-level query failure (connection refused, timeout, network
unreachable) means the alias state could not be observed, not that the
alias is misconfigured. Mapping it to Warn made the check flap whenever a
flaky auth server alternated between refusing connections (Warn) and
answering SERVFAIL (Crit). Report TermQueryErr as Unknown so only
definitive DNS evidence drives Warn/Crit.
2026-06-18 09:31:37 +09:00
7 changed files with 238 additions and 21 deletions

View file

@ -26,6 +26,7 @@ func (p *aliasProvider) Collect(ctx context.Context, opts sdk.CheckerOptions) (a
apex, servers, err := findApex(ctx, owner, resolver)
if err != nil {
data.ApexLookupError = err.Error()
data.ApexLookupTransient = isTransientApexError(err)
return data, nil
}
data.Apex = apex
@ -122,10 +123,13 @@ func (c *chainCtx) walk(ctx context.Context, name string) {
q := dns.Question{Name: current, Qtype: dns.TypeCNAME, Qclass: dns.ClassINET}
r, server, err := c.queryFor(ctx, currentServers, q)
if err != nil {
// A query that never produced a response is a transport/resolver
// fault: we could not observe the alias, so report it as transient.
c.data.ChainTerminated = ChainTermination{
Reason: TermQueryErr,
Subject: current,
Detail: err.Error(),
Reason: TermQueryErr,
Subject: current,
Detail: err.Error(),
Transient: true,
}
c.data.FinalTarget = current
return
@ -189,10 +193,15 @@ func (c *chainCtx) walk(ctx context.Context, name string) {
// answered by the parent's auth set.
zone, ns, zerr := c.reanchor(ctx, target)
if zerr != nil {
// Re-anchoring fails either because the target genuinely has no
// locatable apex (definitive: the alias points into the void) or
// because a resolver/transport fault prevented observing it. Only the
// latter is transient; classify so the rule does not mask a real break.
c.data.ChainTerminated = ChainTermination{
Reason: TermQueryErr,
Subject: target,
Detail: fmt.Sprintf("re-anchor for %s failed: %v", target, zerr),
Reason: TermQueryErr,
Subject: target,
Detail: fmt.Sprintf("re-anchor for %s failed: %v", target, zerr),
Transient: isTransientApexError(zerr),
}
c.data.FinalTarget = target
return

View file

@ -2,6 +2,7 @@ package checker
import (
"context"
"errors"
"fmt"
"net"
"strings"
@ -61,14 +62,22 @@ func hostPort(host, port string) string {
func findApex(ctx context.Context, fqdn, resolver string) (apex string, servers []string, err error) {
labels := dns.SplitDomainName(fqdn)
// transientSeen records whether any candidate failed for a transport or
// SERVFAIL/REFUSED reason, so a fall-through "could not locate apex" caused by
// a flaky recursive resolver is reported as transient rather than definitive.
transientSeen := false
for i := range labels {
candidate := dns.Fqdn(strings.Join(labels[i:], "."))
q := dns.Question{Name: candidate, Qtype: dns.TypeSOA, Qclass: dns.ClassINET}
r, rerr := recursiveExchange(ctx, resolver, q)
if rerr != nil {
transientSeen = true
continue
}
if r.Rcode != dns.RcodeSuccess {
if isTransientRcode(r.Rcode) {
transientSeen = true
}
continue
}
hasSOA := false
@ -87,14 +96,20 @@ func findApex(ctx context.Context, fqdn, resolver string) (apex string, servers
apex = candidate
servers, err = resolveZoneNSAddrs(ctx, apex)
if err != nil {
return "", nil, err
// A resolver fault (e.g. "server misbehaving") means we could not
// observe the apex's NS, not that the delegation is broken.
return "", nil, transientApexError{err}
}
if len(servers) == 0 {
return "", nil, fmt.Errorf("apex %s has no resolvable NS", apex)
}
return apex, servers, nil
}
return "", nil, fmt.Errorf("could not locate apex of %s", fqdn)
err = fmt.Errorf("could not locate apex of %s", fqdn)
if transientSeen {
return "", nil, transientApexError{err}
}
return "", nil, err
}
func resolveZoneNSAddrs(ctx context.Context, zone string) ([]string, error) {
@ -128,24 +143,59 @@ func resolveZoneNSAddrs(ctx context.Context, zone string) ([]string, error) {
return out, nil
}
// queryAtAuth tries each server in order and returns the first usable answer.
// dnssec=true sets the DO bit; only the DNSSEC probes need it.
// queryAtAuth tries each server in order and returns the first definitive
// answer. Transport errors and transient failures (SERVFAIL/REFUSED) make it
// fail over to the next server so a single flaky auth server cannot decide the
// verdict; a definitive response (NOERROR/NXDOMAIN/...) is returned at once.
// If every server fails it returns the last transient response when there was
// one (so callers can still inspect the rcode), otherwise the last transport
// error. dnssec=true sets the DO bit; only the DNSSEC probes need it.
func queryAtAuth(ctx context.Context, proto string, servers []string, q dns.Question, dnssec bool) (*dns.Msg, string, error) {
var lastErr error
var transientMsg *dns.Msg
var transientServer string
for _, s := range servers {
r, err := dnsExchange(ctx, proto, s, q, false, dnssec)
if err != nil {
lastErr = err
continue
}
if isTransientRcode(r.Rcode) {
transientMsg, transientServer = r, s
continue
}
return r, s, nil
}
if transientMsg != nil {
return transientMsg, transientServer, nil
}
if lastErr == nil {
lastErr = fmt.Errorf("no servers provided")
}
return nil, "", lastErr
}
// transientApexError marks an apex-lookup failure that stems from a transport or
// resolver fault rather than definitive DNS evidence, so apexLookupRule can
// report it as Unknown instead of flapping the check into Crit.
type transientApexError struct{ err error }
func (e transientApexError) Error() string { return e.err.Error() }
func (e transientApexError) Unwrap() error { return e.err }
func isTransientApexError(err error) bool {
var t transientApexError
return errors.As(err, &t)
}
// isTransientRcode reports whether an rcode is worth retrying against another
// auth server rather than treating as the zone's final answer. SERVFAIL and
// REFUSED are typically per-server faults (backend down, server not yet loaded
// the zone), unlike NXDOMAIN which is an authoritative negative answer.
func isTransientRcode(rcode int) bool {
return rcode == dns.RcodeServerFailure || rcode == dns.RcodeRefused
}
func rcodeText(r int) string {
if s, ok := dns.RcodeToString[r]; ok {
return s

109
checker/dns_test.go Normal file
View file

@ -0,0 +1,109 @@
package checker
import (
"context"
"errors"
"fmt"
"net"
"testing"
"github.com/miekg/dns"
)
func TestIsTransientRcode(t *testing.T) {
transient := []int{dns.RcodeServerFailure, dns.RcodeRefused}
for _, rc := range transient {
if !isTransientRcode(rc) {
t.Errorf("rcode %s should be transient", rcodeText(rc))
}
}
final := []int{dns.RcodeSuccess, dns.RcodeNameError, dns.RcodeNotImplemented}
for _, rc := range final {
if isTransientRcode(rc) {
t.Errorf("rcode %s should not be transient", rcodeText(rc))
}
}
}
func TestIsTransientApexError(t *testing.T) {
wrapped := transientApexError{errors.New("server misbehaving")}
if !isTransientApexError(wrapped) {
t.Errorf("transientApexError should be classified as transient")
}
if !isTransientApexError(fmt.Errorf("wrapped: %w", wrapped)) {
t.Errorf("error wrapping a transientApexError should be transient")
}
if isTransientApexError(errors.New("could not locate apex of example.com.")) {
t.Errorf("plain error should not be classified as transient")
}
if isTransientApexError(nil) {
t.Errorf("nil error should not be classified as transient")
}
}
// startTestServer spins up a UDP DNS server that answers every query with the
// given handler, returning its address and a shutdown func.
func startTestServer(t *testing.T, handler dns.HandlerFunc) (string, func()) {
t.Helper()
mux := dns.NewServeMux()
mux.HandleFunc(".", handler)
pc, err := net.ListenPacket("udp", "127.0.0.1:0")
if err != nil {
t.Fatalf("listen: %v", err)
}
srv := &dns.Server{PacketConn: pc, Handler: mux}
go srv.ActivateAndServe()
return pc.LocalAddr().String(), func() { srv.Shutdown() }
}
func answerWith(rcode int, withAnswer bool) dns.HandlerFunc {
return func(w dns.ResponseWriter, r *dns.Msg) {
m := new(dns.Msg)
m.SetReply(r)
m.Rcode = rcode
if withAnswer && len(r.Question) > 0 {
rr, _ := dns.NewRR(r.Question[0].Name + " 300 IN CNAME target.example.")
if rr != nil {
m.Answer = append(m.Answer, rr)
}
}
w.WriteMsg(m)
}
}
func TestQueryAtAuthFailsOverTransientRcode(t *testing.T) {
q := dns.Question{Name: "example.com.", Qtype: dns.TypeCNAME, Qclass: dns.ClassINET}
t.Run("prefers definitive answer over SERVFAIL", func(t *testing.T) {
bad, stopBad := startTestServer(t, answerWith(dns.RcodeServerFailure, false))
defer stopBad()
good, stopGood := startTestServer(t, answerWith(dns.RcodeSuccess, true))
defer stopGood()
r, server, err := queryAtAuth(context.Background(), "", []string{bad, good}, q, false)
if err != nil {
t.Fatalf("unexpected error: %v", err)
}
if r.Rcode != dns.RcodeSuccess {
t.Fatalf("got rcode %s, want NOERROR", rcodeText(r.Rcode))
}
if server != good {
t.Fatalf("answered by %s, want the healthy server %s", server, good)
}
})
t.Run("returns transient response when every server fails", func(t *testing.T) {
s1, stop1 := startTestServer(t, answerWith(dns.RcodeServerFailure, false))
defer stop1()
s2, stop2 := startTestServer(t, answerWith(dns.RcodeRefused, false))
defer stop2()
r, _, err := queryAtAuth(context.Background(), "", []string{s1, s2}, q, false)
if err != nil {
t.Fatalf("unexpected error: %v", err)
}
if !isTransientRcode(r.Rcode) {
t.Fatalf("got rcode %s, want a transient rcode preserved", rcodeText(r.Rcode))
}
})
}

View file

@ -22,11 +22,20 @@ func (apexLookupRule) Evaluate(ctx context.Context, obs sdk.ObservationGetter, _
if data.Apex != "" {
return okState(data.Apex, fmt.Sprintf("apex %s located", data.Apex))
}
// A transport/resolver fault means the apex could not be observed, not that the
// delegation is broken. Report it as Unknown so an intermittent recursive-resolver
// glitch does not flap the check into Crit; only definitive evidence drives Crit.
status := sdk.StatusCrit
hint := "Check that the parent delegation exists and that the zone is published."
if data.ApexLookupTransient {
status = sdk.StatusUnknown
hint = "The zone apex could not be observed due to a resolver/transport fault; retry and check recursive-resolver reachability."
}
return []sdk.CheckState{withHint(sdk.CheckState{
Status: sdk.StatusCrit,
Status: status,
Subject: data.Owner,
Message: fmt.Sprintf("could not locate zone apex: %s", data.ApexLookupError),
}, "Check that the parent delegation exists and that the zone is published.")}
}, hint)}
}
type cnameAtApexRule struct{}

View file

@ -90,11 +90,22 @@ func (chainQueryErrorRule) Evaluate(ctx context.Context, obs sdk.ObservationGett
if data.ChainTerminated.Reason != TermQueryErr {
return okState(data.Owner, "all chain queries succeeded")
}
// A transport failure (connection refused, timeout, network unreachable) means
// we could not observe the alias, not that it is broken: report it as Unknown so
// an intermittent reachability glitch does not flap the check into Warn/Crit. A
// non-transient failure (e.g. the target has no locatable apex) is definitive
// evidence the alias cannot be followed: report it as Warn.
status, verb := sdk.StatusWarn, "failed"
hint := "Check that the alias target exists and is delegated; the alias is unusable while the query fails."
if data.ChainTerminated.Transient {
status, verb = sdk.StatusUnknown, "could not be completed"
hint = "Check authoritative-server reachability and firewall rules; the alias state could not be determined while queries fail."
}
return []sdk.CheckState{withHint(sdk.CheckState{
Status: sdk.StatusWarn,
Status: status,
Subject: data.ChainTerminated.Subject,
Message: fmt.Sprintf("CNAME query for %s failed: %s", data.ChainTerminated.Subject, data.ChainTerminated.Detail),
}, "Check authoritative-server reachability and firewall rules; the alias is unusable while queries fail.")}
Message: fmt.Sprintf("CNAME query for %s %s: %s", data.ChainTerminated.Subject, verb, data.ChainTerminated.Detail),
}, hint)}
}
type chainRcodeRule struct{}

View file

@ -81,6 +81,19 @@ func TestApexLookupRule(t *testing.T) {
t.Fatalf("want hint, got none")
}
})
t.Run("transient", func(t *testing.T) {
// A resolver fault (e.g. "server misbehaving") could not observe the apex,
// so it must be Unknown rather than Crit to avoid flapping the check.
data := &AliasData{
Owner: "nemunai.re.",
ApexLookupError: "lookup nemunai.re on 127.0.0.11:53: server misbehaving",
ApexLookupTransient: true,
}
s := assertSingle(t, run(apexLookupRule{}, data, nil), sdk.StatusUnknown)
if s.Meta[hintKey] == nil {
t.Fatalf("want hint, got none")
}
})
}
func TestChainLoopRule(t *testing.T) {
@ -122,9 +135,18 @@ func TestChainQueryErrorRule(t *testing.T) {
d.ChainTerminated.Reason = TermOK
assertSingle(t, run(chainQueryErrorRule{}, d, nil), sdk.StatusOK)
})
t.Run("query err", func(t *testing.T) {
t.Run("transient query err", func(t *testing.T) {
// A transport fault (timeout) could not observe the alias, so it must be
// Unknown rather than Warn to avoid flapping the check.
d := apexKnownData()
d.ChainTerminated = ChainTermination{Reason: TermQueryErr, Subject: "broken.example.com.", Detail: "timeout"}
d.ChainTerminated = ChainTermination{Reason: TermQueryErr, Subject: "broken.example.com.", Detail: "timeout", Transient: true}
assertSingle(t, run(chainQueryErrorRule{}, d, nil), sdk.StatusUnknown)
})
t.Run("definitive query err", func(t *testing.T) {
// A non-transient failure (target has no locatable apex) is definitive
// evidence the alias cannot be followed: Warn, not Unknown.
d := apexKnownData()
d.ChainTerminated = ChainTermination{Reason: TermQueryErr, Subject: "target.example.", Detail: "re-anchor for target.example. failed: could not locate apex of target.example."}
assertSingle(t, run(chainQueryErrorRule{}, d, nil), sdk.StatusWarn)
})
}

View file

@ -47,16 +47,23 @@ type ChainTermination struct {
Subject string `json:"subject,omitempty"`
Detail string `json:"detail,omitempty"`
Rcode string `json:"rcode,omitempty"` // only with TermRcode
// Transient is meaningful with TermQueryErr: true when the query could not be
// completed because of a transport/resolver fault (could not observe), false
// when it stems from definitive evidence such as a target with no locatable apex.
Transient bool `json:"transient,omitempty"`
}
// AliasData carries raw facts only; judgement is delegated to the rules.
type AliasData struct {
Owner string `json:"owner"`
// Apex is empty iff the apex lookup failed; ApexLookupError explains why.
Apex string `json:"apex,omitempty"`
ApexLookupError string `json:"apex_lookup_error,omitempty"`
AuthServers []string `json:"auth_servers,omitempty"`
// Apex is empty iff the apex lookup failed; ApexLookupError explains why and
// ApexLookupTransient is true when the failure was a transport/resolver fault
// (could not observe) rather than definitive evidence the apex is missing.
Apex string `json:"apex,omitempty"`
ApexLookupError string `json:"apex_lookup_error,omitempty"`
ApexLookupTransient bool `json:"apex_lookup_transient,omitempty"`
AuthServers []string `json:"auth_servers,omitempty"`
Chain []ChainHop `json:"chain,omitempty"`
ChainTerminated ChainTermination `json:"chain_terminated"`