Compare commits

..

No commits in common. "da6def100c95c2ff0b839aaee295f38d0c405816" and "0becf6bc8c79b63d03809816db7f24364d61a9f9" have entirely different histories.

7 changed files with 21 additions and 238 deletions

View file

@ -26,7 +26,6 @@ func (p *aliasProvider) Collect(ctx context.Context, opts sdk.CheckerOptions) (a
apex, servers, err := findApex(ctx, owner, resolver)
if err != nil {
data.ApexLookupError = err.Error()
data.ApexLookupTransient = isTransientApexError(err)
return data, nil
}
data.Apex = apex
@ -123,13 +122,10 @@ func (c *chainCtx) walk(ctx context.Context, name string) {
q := dns.Question{Name: current, Qtype: dns.TypeCNAME, Qclass: dns.ClassINET}
r, server, err := c.queryFor(ctx, currentServers, q)
if err != nil {
// A query that never produced a response is a transport/resolver
// fault: we could not observe the alias, so report it as transient.
c.data.ChainTerminated = ChainTermination{
Reason: TermQueryErr,
Subject: current,
Detail: err.Error(),
Transient: true,
Reason: TermQueryErr,
Subject: current,
Detail: err.Error(),
}
c.data.FinalTarget = current
return
@ -193,15 +189,10 @@ func (c *chainCtx) walk(ctx context.Context, name string) {
// answered by the parent's auth set.
zone, ns, zerr := c.reanchor(ctx, target)
if zerr != nil {
// Re-anchoring fails either because the target genuinely has no
// locatable apex (definitive: the alias points into the void) or
// because a resolver/transport fault prevented observing it. Only the
// latter is transient; classify so the rule does not mask a real break.
c.data.ChainTerminated = ChainTermination{
Reason: TermQueryErr,
Subject: target,
Detail: fmt.Sprintf("re-anchor for %s failed: %v", target, zerr),
Transient: isTransientApexError(zerr),
Reason: TermQueryErr,
Subject: target,
Detail: fmt.Sprintf("re-anchor for %s failed: %v", target, zerr),
}
c.data.FinalTarget = target
return

View file

@ -2,7 +2,6 @@ package checker
import (
"context"
"errors"
"fmt"
"net"
"strings"
@ -62,22 +61,14 @@ func hostPort(host, port string) string {
func findApex(ctx context.Context, fqdn, resolver string) (apex string, servers []string, err error) {
labels := dns.SplitDomainName(fqdn)
// transientSeen records whether any candidate failed for a transport or
// SERVFAIL/REFUSED reason, so a fall-through "could not locate apex" caused by
// a flaky recursive resolver is reported as transient rather than definitive.
transientSeen := false
for i := range labels {
candidate := dns.Fqdn(strings.Join(labels[i:], "."))
q := dns.Question{Name: candidate, Qtype: dns.TypeSOA, Qclass: dns.ClassINET}
r, rerr := recursiveExchange(ctx, resolver, q)
if rerr != nil {
transientSeen = true
continue
}
if r.Rcode != dns.RcodeSuccess {
if isTransientRcode(r.Rcode) {
transientSeen = true
}
continue
}
hasSOA := false
@ -96,20 +87,14 @@ func findApex(ctx context.Context, fqdn, resolver string) (apex string, servers
apex = candidate
servers, err = resolveZoneNSAddrs(ctx, apex)
if err != nil {
// A resolver fault (e.g. "server misbehaving") means we could not
// observe the apex's NS, not that the delegation is broken.
return "", nil, transientApexError{err}
return "", nil, err
}
if len(servers) == 0 {
return "", nil, fmt.Errorf("apex %s has no resolvable NS", apex)
}
return apex, servers, nil
}
err = fmt.Errorf("could not locate apex of %s", fqdn)
if transientSeen {
return "", nil, transientApexError{err}
}
return "", nil, err
return "", nil, fmt.Errorf("could not locate apex of %s", fqdn)
}
func resolveZoneNSAddrs(ctx context.Context, zone string) ([]string, error) {
@ -143,59 +128,24 @@ func resolveZoneNSAddrs(ctx context.Context, zone string) ([]string, error) {
return out, nil
}
// queryAtAuth tries each server in order and returns the first definitive
// answer. Transport errors and transient failures (SERVFAIL/REFUSED) make it
// fail over to the next server so a single flaky auth server cannot decide the
// verdict; a definitive response (NOERROR/NXDOMAIN/...) is returned at once.
// If every server fails it returns the last transient response when there was
// one (so callers can still inspect the rcode), otherwise the last transport
// error. dnssec=true sets the DO bit; only the DNSSEC probes need it.
// queryAtAuth tries each server in order and returns the first usable answer.
// dnssec=true sets the DO bit; only the DNSSEC probes need it.
func queryAtAuth(ctx context.Context, proto string, servers []string, q dns.Question, dnssec bool) (*dns.Msg, string, error) {
var lastErr error
var transientMsg *dns.Msg
var transientServer string
for _, s := range servers {
r, err := dnsExchange(ctx, proto, s, q, false, dnssec)
if err != nil {
lastErr = err
continue
}
if isTransientRcode(r.Rcode) {
transientMsg, transientServer = r, s
continue
}
return r, s, nil
}
if transientMsg != nil {
return transientMsg, transientServer, nil
}
if lastErr == nil {
lastErr = fmt.Errorf("no servers provided")
}
return nil, "", lastErr
}
// transientApexError marks an apex-lookup failure that stems from a transport or
// resolver fault rather than definitive DNS evidence, so apexLookupRule can
// report it as Unknown instead of flapping the check into Crit.
type transientApexError struct{ err error }
func (e transientApexError) Error() string { return e.err.Error() }
func (e transientApexError) Unwrap() error { return e.err }
func isTransientApexError(err error) bool {
var t transientApexError
return errors.As(err, &t)
}
// isTransientRcode reports whether an rcode is worth retrying against another
// auth server rather than treating as the zone's final answer. SERVFAIL and
// REFUSED are typically per-server faults (backend down, server not yet loaded
// the zone), unlike NXDOMAIN which is an authoritative negative answer.
func isTransientRcode(rcode int) bool {
return rcode == dns.RcodeServerFailure || rcode == dns.RcodeRefused
}
func rcodeText(r int) string {
if s, ok := dns.RcodeToString[r]; ok {
return s

View file

@ -1,109 +0,0 @@
package checker
import (
"context"
"errors"
"fmt"
"net"
"testing"
"github.com/miekg/dns"
)
func TestIsTransientRcode(t *testing.T) {
transient := []int{dns.RcodeServerFailure, dns.RcodeRefused}
for _, rc := range transient {
if !isTransientRcode(rc) {
t.Errorf("rcode %s should be transient", rcodeText(rc))
}
}
final := []int{dns.RcodeSuccess, dns.RcodeNameError, dns.RcodeNotImplemented}
for _, rc := range final {
if isTransientRcode(rc) {
t.Errorf("rcode %s should not be transient", rcodeText(rc))
}
}
}
func TestIsTransientApexError(t *testing.T) {
wrapped := transientApexError{errors.New("server misbehaving")}
if !isTransientApexError(wrapped) {
t.Errorf("transientApexError should be classified as transient")
}
if !isTransientApexError(fmt.Errorf("wrapped: %w", wrapped)) {
t.Errorf("error wrapping a transientApexError should be transient")
}
if isTransientApexError(errors.New("could not locate apex of example.com.")) {
t.Errorf("plain error should not be classified as transient")
}
if isTransientApexError(nil) {
t.Errorf("nil error should not be classified as transient")
}
}
// startTestServer spins up a UDP DNS server that answers every query with the
// given handler, returning its address and a shutdown func.
func startTestServer(t *testing.T, handler dns.HandlerFunc) (string, func()) {
t.Helper()
mux := dns.NewServeMux()
mux.HandleFunc(".", handler)
pc, err := net.ListenPacket("udp", "127.0.0.1:0")
if err != nil {
t.Fatalf("listen: %v", err)
}
srv := &dns.Server{PacketConn: pc, Handler: mux}
go srv.ActivateAndServe()
return pc.LocalAddr().String(), func() { srv.Shutdown() }
}
func answerWith(rcode int, withAnswer bool) dns.HandlerFunc {
return func(w dns.ResponseWriter, r *dns.Msg) {
m := new(dns.Msg)
m.SetReply(r)
m.Rcode = rcode
if withAnswer && len(r.Question) > 0 {
rr, _ := dns.NewRR(r.Question[0].Name + " 300 IN CNAME target.example.")
if rr != nil {
m.Answer = append(m.Answer, rr)
}
}
w.WriteMsg(m)
}
}
func TestQueryAtAuthFailsOverTransientRcode(t *testing.T) {
q := dns.Question{Name: "example.com.", Qtype: dns.TypeCNAME, Qclass: dns.ClassINET}
t.Run("prefers definitive answer over SERVFAIL", func(t *testing.T) {
bad, stopBad := startTestServer(t, answerWith(dns.RcodeServerFailure, false))
defer stopBad()
good, stopGood := startTestServer(t, answerWith(dns.RcodeSuccess, true))
defer stopGood()
r, server, err := queryAtAuth(context.Background(), "", []string{bad, good}, q, false)
if err != nil {
t.Fatalf("unexpected error: %v", err)
}
if r.Rcode != dns.RcodeSuccess {
t.Fatalf("got rcode %s, want NOERROR", rcodeText(r.Rcode))
}
if server != good {
t.Fatalf("answered by %s, want the healthy server %s", server, good)
}
})
t.Run("returns transient response when every server fails", func(t *testing.T) {
s1, stop1 := startTestServer(t, answerWith(dns.RcodeServerFailure, false))
defer stop1()
s2, stop2 := startTestServer(t, answerWith(dns.RcodeRefused, false))
defer stop2()
r, _, err := queryAtAuth(context.Background(), "", []string{s1, s2}, q, false)
if err != nil {
t.Fatalf("unexpected error: %v", err)
}
if !isTransientRcode(r.Rcode) {
t.Fatalf("got rcode %s, want a transient rcode preserved", rcodeText(r.Rcode))
}
})
}

View file

@ -22,20 +22,11 @@ func (apexLookupRule) Evaluate(ctx context.Context, obs sdk.ObservationGetter, _
if data.Apex != "" {
return okState(data.Apex, fmt.Sprintf("apex %s located", data.Apex))
}
// A transport/resolver fault means the apex could not be observed, not that the
// delegation is broken. Report it as Unknown so an intermittent recursive-resolver
// glitch does not flap the check into Crit; only definitive evidence drives Crit.
status := sdk.StatusCrit
hint := "Check that the parent delegation exists and that the zone is published."
if data.ApexLookupTransient {
status = sdk.StatusUnknown
hint = "The zone apex could not be observed due to a resolver/transport fault; retry and check recursive-resolver reachability."
}
return []sdk.CheckState{withHint(sdk.CheckState{
Status: status,
Status: sdk.StatusCrit,
Subject: data.Owner,
Message: fmt.Sprintf("could not locate zone apex: %s", data.ApexLookupError),
}, hint)}
}, "Check that the parent delegation exists and that the zone is published.")}
}
type cnameAtApexRule struct{}

View file

@ -90,22 +90,11 @@ func (chainQueryErrorRule) Evaluate(ctx context.Context, obs sdk.ObservationGett
if data.ChainTerminated.Reason != TermQueryErr {
return okState(data.Owner, "all chain queries succeeded")
}
// A transport failure (connection refused, timeout, network unreachable) means
// we could not observe the alias, not that it is broken: report it as Unknown so
// an intermittent reachability glitch does not flap the check into Warn/Crit. A
// non-transient failure (e.g. the target has no locatable apex) is definitive
// evidence the alias cannot be followed: report it as Warn.
status, verb := sdk.StatusWarn, "failed"
hint := "Check that the alias target exists and is delegated; the alias is unusable while the query fails."
if data.ChainTerminated.Transient {
status, verb = sdk.StatusUnknown, "could not be completed"
hint = "Check authoritative-server reachability and firewall rules; the alias state could not be determined while queries fail."
}
return []sdk.CheckState{withHint(sdk.CheckState{
Status: status,
Status: sdk.StatusWarn,
Subject: data.ChainTerminated.Subject,
Message: fmt.Sprintf("CNAME query for %s %s: %s", data.ChainTerminated.Subject, verb, data.ChainTerminated.Detail),
}, hint)}
Message: fmt.Sprintf("CNAME query for %s failed: %s", data.ChainTerminated.Subject, data.ChainTerminated.Detail),
}, "Check authoritative-server reachability and firewall rules; the alias is unusable while queries fail.")}
}
type chainRcodeRule struct{}

View file

@ -81,19 +81,6 @@ func TestApexLookupRule(t *testing.T) {
t.Fatalf("want hint, got none")
}
})
t.Run("transient", func(t *testing.T) {
// A resolver fault (e.g. "server misbehaving") could not observe the apex,
// so it must be Unknown rather than Crit to avoid flapping the check.
data := &AliasData{
Owner: "nemunai.re.",
ApexLookupError: "lookup nemunai.re on 127.0.0.11:53: server misbehaving",
ApexLookupTransient: true,
}
s := assertSingle(t, run(apexLookupRule{}, data, nil), sdk.StatusUnknown)
if s.Meta[hintKey] == nil {
t.Fatalf("want hint, got none")
}
})
}
func TestChainLoopRule(t *testing.T) {
@ -135,18 +122,9 @@ func TestChainQueryErrorRule(t *testing.T) {
d.ChainTerminated.Reason = TermOK
assertSingle(t, run(chainQueryErrorRule{}, d, nil), sdk.StatusOK)
})
t.Run("transient query err", func(t *testing.T) {
// A transport fault (timeout) could not observe the alias, so it must be
// Unknown rather than Warn to avoid flapping the check.
t.Run("query err", func(t *testing.T) {
d := apexKnownData()
d.ChainTerminated = ChainTermination{Reason: TermQueryErr, Subject: "broken.example.com.", Detail: "timeout", Transient: true}
assertSingle(t, run(chainQueryErrorRule{}, d, nil), sdk.StatusUnknown)
})
t.Run("definitive query err", func(t *testing.T) {
// A non-transient failure (target has no locatable apex) is definitive
// evidence the alias cannot be followed: Warn, not Unknown.
d := apexKnownData()
d.ChainTerminated = ChainTermination{Reason: TermQueryErr, Subject: "target.example.", Detail: "re-anchor for target.example. failed: could not locate apex of target.example."}
d.ChainTerminated = ChainTermination{Reason: TermQueryErr, Subject: "broken.example.com.", Detail: "timeout"}
assertSingle(t, run(chainQueryErrorRule{}, d, nil), sdk.StatusWarn)
})
}

View file

@ -47,23 +47,16 @@ type ChainTermination struct {
Subject string `json:"subject,omitempty"`
Detail string `json:"detail,omitempty"`
Rcode string `json:"rcode,omitempty"` // only with TermRcode
// Transient is meaningful with TermQueryErr: true when the query could not be
// completed because of a transport/resolver fault (could not observe), false
// when it stems from definitive evidence such as a target with no locatable apex.
Transient bool `json:"transient,omitempty"`
}
// AliasData carries raw facts only; judgement is delegated to the rules.
type AliasData struct {
Owner string `json:"owner"`
// Apex is empty iff the apex lookup failed; ApexLookupError explains why and
// ApexLookupTransient is true when the failure was a transport/resolver fault
// (could not observe) rather than definitive evidence the apex is missing.
Apex string `json:"apex,omitempty"`
ApexLookupError string `json:"apex_lookup_error,omitempty"`
ApexLookupTransient bool `json:"apex_lookup_transient,omitempty"`
AuthServers []string `json:"auth_servers,omitempty"`
// Apex is empty iff the apex lookup failed; ApexLookupError explains why.
Apex string `json:"apex,omitempty"`
ApexLookupError string `json:"apex_lookup_error,omitempty"`
AuthServers []string `json:"auth_servers,omitempty"`
Chain []ChainHop `json:"chain,omitempty"`
ChainTerminated ChainTermination `json:"chain_terminated"`