Compare commits
4 commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 65687ce375 | |||
| da6def100c | |||
| af0dceca6c | |||
| 680a7735f0 |
7 changed files with 300 additions and 34 deletions
|
|
@ -26,6 +26,7 @@ func (p *aliasProvider) Collect(ctx context.Context, opts sdk.CheckerOptions) (a
|
||||||
apex, servers, err := findApex(ctx, owner, resolver)
|
apex, servers, err := findApex(ctx, owner, resolver)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
data.ApexLookupError = err.Error()
|
data.ApexLookupError = err.Error()
|
||||||
|
data.ApexLookupTransient = isTransientApexError(err)
|
||||||
return data, nil
|
return data, nil
|
||||||
}
|
}
|
||||||
data.Apex = apex
|
data.Apex = apex
|
||||||
|
|
@ -122,10 +123,13 @@ func (c *chainCtx) walk(ctx context.Context, name string) {
|
||||||
q := dns.Question{Name: current, Qtype: dns.TypeCNAME, Qclass: dns.ClassINET}
|
q := dns.Question{Name: current, Qtype: dns.TypeCNAME, Qclass: dns.ClassINET}
|
||||||
r, server, err := c.queryFor(ctx, currentServers, q)
|
r, server, err := c.queryFor(ctx, currentServers, q)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
|
// A query that never produced a response is a transport/resolver
|
||||||
|
// fault: we could not observe the alias, so report it as transient.
|
||||||
c.data.ChainTerminated = ChainTermination{
|
c.data.ChainTerminated = ChainTermination{
|
||||||
Reason: TermQueryErr,
|
Reason: TermQueryErr,
|
||||||
Subject: current,
|
Subject: current,
|
||||||
Detail: err.Error(),
|
Detail: err.Error(),
|
||||||
|
Transient: true,
|
||||||
}
|
}
|
||||||
c.data.FinalTarget = current
|
c.data.FinalTarget = current
|
||||||
return
|
return
|
||||||
|
|
@ -133,11 +137,15 @@ func (c *chainCtx) walk(ctx context.Context, name string) {
|
||||||
|
|
||||||
if r.Rcode != dns.RcodeSuccess {
|
if r.Rcode != dns.RcodeSuccess {
|
||||||
rcode := rcodeText(r.Rcode)
|
rcode := rcodeText(r.Rcode)
|
||||||
|
// A SERVFAIL/REFUSED from every auth server means we could not observe
|
||||||
|
// the record, not that the zone published a negative answer; mark it
|
||||||
|
// transient so the rule reports Unknown instead of Crit.
|
||||||
c.data.ChainTerminated = ChainTermination{
|
c.data.ChainTerminated = ChainTermination{
|
||||||
Reason: TermRcode,
|
Reason: TermRcode,
|
||||||
Subject: current,
|
Subject: current,
|
||||||
Rcode: rcode,
|
Rcode: rcode,
|
||||||
Detail: fmt.Sprintf("server answered %s for %s", rcode, current),
|
Detail: fmt.Sprintf("server answered %s for %s", rcode, current),
|
||||||
|
Transient: isTransientRcode(r.Rcode),
|
||||||
}
|
}
|
||||||
c.data.FinalTarget = current
|
c.data.FinalTarget = current
|
||||||
return
|
return
|
||||||
|
|
@ -189,10 +197,15 @@ func (c *chainCtx) walk(ctx context.Context, name string) {
|
||||||
// answered by the parent's auth set.
|
// answered by the parent's auth set.
|
||||||
zone, ns, zerr := c.reanchor(ctx, target)
|
zone, ns, zerr := c.reanchor(ctx, target)
|
||||||
if zerr != nil {
|
if zerr != nil {
|
||||||
|
// Re-anchoring fails either because the target genuinely has no
|
||||||
|
// locatable apex (definitive: the alias points into the void) or
|
||||||
|
// because a resolver/transport fault prevented observing it. Only the
|
||||||
|
// latter is transient; classify so the rule does not mask a real break.
|
||||||
c.data.ChainTerminated = ChainTermination{
|
c.data.ChainTerminated = ChainTermination{
|
||||||
Reason: TermQueryErr,
|
Reason: TermQueryErr,
|
||||||
Subject: target,
|
Subject: target,
|
||||||
Detail: fmt.Sprintf("re-anchor for %s failed: %v", target, zerr),
|
Detail: fmt.Sprintf("re-anchor for %s failed: %v", target, zerr),
|
||||||
|
Transient: isTransientApexError(zerr),
|
||||||
}
|
}
|
||||||
c.data.FinalTarget = target
|
c.data.FinalTarget = target
|
||||||
return
|
return
|
||||||
|
|
@ -273,6 +286,7 @@ func (c *chainCtx) resolveFinal(ctx context.Context, name string, servers []stri
|
||||||
type result struct {
|
type result struct {
|
||||||
addrs []string
|
addrs []string
|
||||||
rcode string
|
rcode string
|
||||||
|
transient bool
|
||||||
}
|
}
|
||||||
|
|
||||||
query := func(qtype uint16) result {
|
query := func(qtype uint16) result {
|
||||||
|
|
@ -292,6 +306,7 @@ func (c *chainCtx) resolveFinal(ctx context.Context, name string, servers []stri
|
||||||
var res result
|
var res result
|
||||||
if r.Rcode != dns.RcodeSuccess {
|
if r.Rcode != dns.RcodeSuccess {
|
||||||
res.rcode = rcodeText(r.Rcode)
|
res.rcode = rcodeText(r.Rcode)
|
||||||
|
res.transient = isTransientRcode(r.Rcode)
|
||||||
}
|
}
|
||||||
for _, rr := range r.Answer {
|
for _, rr := range r.Answer {
|
||||||
switch v := rr.(type) {
|
switch v := rr.(type) {
|
||||||
|
|
@ -323,8 +338,10 @@ func (c *chainCtx) resolveFinal(ctx context.Context, name string, servers []stri
|
||||||
switch {
|
switch {
|
||||||
case aRes.rcode != "":
|
case aRes.rcode != "":
|
||||||
c.data.FinalRcode = aRes.rcode
|
c.data.FinalRcode = aRes.rcode
|
||||||
|
c.data.FinalRcodeTransient = aRes.transient
|
||||||
case aaaaRes.rcode != "":
|
case aaaaRes.rcode != "":
|
||||||
c.data.FinalRcode = aaaaRes.rcode
|
c.data.FinalRcode = aaaaRes.rcode
|
||||||
|
c.data.FinalRcodeTransient = aaaaRes.transient
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -2,6 +2,7 @@ package checker
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"context"
|
"context"
|
||||||
|
"errors"
|
||||||
"fmt"
|
"fmt"
|
||||||
"net"
|
"net"
|
||||||
"strings"
|
"strings"
|
||||||
|
|
@ -61,14 +62,22 @@ func hostPort(host, port string) string {
|
||||||
|
|
||||||
func findApex(ctx context.Context, fqdn, resolver string) (apex string, servers []string, err error) {
|
func findApex(ctx context.Context, fqdn, resolver string) (apex string, servers []string, err error) {
|
||||||
labels := dns.SplitDomainName(fqdn)
|
labels := dns.SplitDomainName(fqdn)
|
||||||
|
// transientSeen records whether any candidate failed for a transport or
|
||||||
|
// SERVFAIL/REFUSED reason, so a fall-through "could not locate apex" caused by
|
||||||
|
// a flaky recursive resolver is reported as transient rather than definitive.
|
||||||
|
transientSeen := false
|
||||||
for i := range labels {
|
for i := range labels {
|
||||||
candidate := dns.Fqdn(strings.Join(labels[i:], "."))
|
candidate := dns.Fqdn(strings.Join(labels[i:], "."))
|
||||||
q := dns.Question{Name: candidate, Qtype: dns.TypeSOA, Qclass: dns.ClassINET}
|
q := dns.Question{Name: candidate, Qtype: dns.TypeSOA, Qclass: dns.ClassINET}
|
||||||
r, rerr := recursiveExchange(ctx, resolver, q)
|
r, rerr := recursiveExchange(ctx, resolver, q)
|
||||||
if rerr != nil {
|
if rerr != nil {
|
||||||
|
transientSeen = true
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
if r.Rcode != dns.RcodeSuccess {
|
if r.Rcode != dns.RcodeSuccess {
|
||||||
|
if isTransientRcode(r.Rcode) {
|
||||||
|
transientSeen = true
|
||||||
|
}
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
hasSOA := false
|
hasSOA := false
|
||||||
|
|
@ -87,14 +96,20 @@ func findApex(ctx context.Context, fqdn, resolver string) (apex string, servers
|
||||||
apex = candidate
|
apex = candidate
|
||||||
servers, err = resolveZoneNSAddrs(ctx, apex)
|
servers, err = resolveZoneNSAddrs(ctx, apex)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return "", nil, err
|
// A resolver fault (e.g. "server misbehaving") means we could not
|
||||||
|
// observe the apex's NS, not that the delegation is broken.
|
||||||
|
return "", nil, transientApexError{err}
|
||||||
}
|
}
|
||||||
if len(servers) == 0 {
|
if len(servers) == 0 {
|
||||||
return "", nil, fmt.Errorf("apex %s has no resolvable NS", apex)
|
return "", nil, fmt.Errorf("apex %s has no resolvable NS", apex)
|
||||||
}
|
}
|
||||||
return apex, servers, nil
|
return apex, servers, nil
|
||||||
}
|
}
|
||||||
return "", nil, fmt.Errorf("could not locate apex of %s", fqdn)
|
err = fmt.Errorf("could not locate apex of %s", fqdn)
|
||||||
|
if transientSeen {
|
||||||
|
return "", nil, transientApexError{err}
|
||||||
|
}
|
||||||
|
return "", nil, err
|
||||||
}
|
}
|
||||||
|
|
||||||
func resolveZoneNSAddrs(ctx context.Context, zone string) ([]string, error) {
|
func resolveZoneNSAddrs(ctx context.Context, zone string) ([]string, error) {
|
||||||
|
|
@ -128,24 +143,59 @@ func resolveZoneNSAddrs(ctx context.Context, zone string) ([]string, error) {
|
||||||
return out, nil
|
return out, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
// queryAtAuth tries each server in order and returns the first usable answer.
|
// queryAtAuth tries each server in order and returns the first definitive
|
||||||
// dnssec=true sets the DO bit; only the DNSSEC probes need it.
|
// answer. Transport errors and transient failures (SERVFAIL/REFUSED) make it
|
||||||
|
// fail over to the next server so a single flaky auth server cannot decide the
|
||||||
|
// verdict; a definitive response (NOERROR/NXDOMAIN/...) is returned at once.
|
||||||
|
// If every server fails it returns the last transient response when there was
|
||||||
|
// one (so callers can still inspect the rcode), otherwise the last transport
|
||||||
|
// error. dnssec=true sets the DO bit; only the DNSSEC probes need it.
|
||||||
func queryAtAuth(ctx context.Context, proto string, servers []string, q dns.Question, dnssec bool) (*dns.Msg, string, error) {
|
func queryAtAuth(ctx context.Context, proto string, servers []string, q dns.Question, dnssec bool) (*dns.Msg, string, error) {
|
||||||
var lastErr error
|
var lastErr error
|
||||||
|
var transientMsg *dns.Msg
|
||||||
|
var transientServer string
|
||||||
for _, s := range servers {
|
for _, s := range servers {
|
||||||
r, err := dnsExchange(ctx, proto, s, q, false, dnssec)
|
r, err := dnsExchange(ctx, proto, s, q, false, dnssec)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
lastErr = err
|
lastErr = err
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
if isTransientRcode(r.Rcode) {
|
||||||
|
transientMsg, transientServer = r, s
|
||||||
|
continue
|
||||||
|
}
|
||||||
return r, s, nil
|
return r, s, nil
|
||||||
}
|
}
|
||||||
|
if transientMsg != nil {
|
||||||
|
return transientMsg, transientServer, nil
|
||||||
|
}
|
||||||
if lastErr == nil {
|
if lastErr == nil {
|
||||||
lastErr = fmt.Errorf("no servers provided")
|
lastErr = fmt.Errorf("no servers provided")
|
||||||
}
|
}
|
||||||
return nil, "", lastErr
|
return nil, "", lastErr
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// transientApexError marks an apex-lookup failure that stems from a transport or
|
||||||
|
// resolver fault rather than definitive DNS evidence, so apexLookupRule can
|
||||||
|
// report it as Unknown instead of flapping the check into Crit.
|
||||||
|
type transientApexError struct{ err error }
|
||||||
|
|
||||||
|
func (e transientApexError) Error() string { return e.err.Error() }
|
||||||
|
func (e transientApexError) Unwrap() error { return e.err }
|
||||||
|
|
||||||
|
func isTransientApexError(err error) bool {
|
||||||
|
var t transientApexError
|
||||||
|
return errors.As(err, &t)
|
||||||
|
}
|
||||||
|
|
||||||
|
// isTransientRcode reports whether an rcode is worth retrying against another
|
||||||
|
// auth server rather than treating as the zone's final answer. SERVFAIL and
|
||||||
|
// REFUSED are typically per-server faults (backend down, server not yet loaded
|
||||||
|
// the zone), unlike NXDOMAIN which is an authoritative negative answer.
|
||||||
|
func isTransientRcode(rcode int) bool {
|
||||||
|
return rcode == dns.RcodeServerFailure || rcode == dns.RcodeRefused
|
||||||
|
}
|
||||||
|
|
||||||
func rcodeText(r int) string {
|
func rcodeText(r int) string {
|
||||||
if s, ok := dns.RcodeToString[r]; ok {
|
if s, ok := dns.RcodeToString[r]; ok {
|
||||||
return s
|
return s
|
||||||
|
|
|
||||||
109
checker/dns_test.go
Normal file
109
checker/dns_test.go
Normal file
|
|
@ -0,0 +1,109 @@
|
||||||
|
package checker
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"errors"
|
||||||
|
"fmt"
|
||||||
|
"net"
|
||||||
|
"testing"
|
||||||
|
|
||||||
|
"github.com/miekg/dns"
|
||||||
|
)
|
||||||
|
|
||||||
|
func TestIsTransientRcode(t *testing.T) {
|
||||||
|
transient := []int{dns.RcodeServerFailure, dns.RcodeRefused}
|
||||||
|
for _, rc := range transient {
|
||||||
|
if !isTransientRcode(rc) {
|
||||||
|
t.Errorf("rcode %s should be transient", rcodeText(rc))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
final := []int{dns.RcodeSuccess, dns.RcodeNameError, dns.RcodeNotImplemented}
|
||||||
|
for _, rc := range final {
|
||||||
|
if isTransientRcode(rc) {
|
||||||
|
t.Errorf("rcode %s should not be transient", rcodeText(rc))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestIsTransientApexError(t *testing.T) {
|
||||||
|
wrapped := transientApexError{errors.New("server misbehaving")}
|
||||||
|
if !isTransientApexError(wrapped) {
|
||||||
|
t.Errorf("transientApexError should be classified as transient")
|
||||||
|
}
|
||||||
|
if !isTransientApexError(fmt.Errorf("wrapped: %w", wrapped)) {
|
||||||
|
t.Errorf("error wrapping a transientApexError should be transient")
|
||||||
|
}
|
||||||
|
if isTransientApexError(errors.New("could not locate apex of example.com.")) {
|
||||||
|
t.Errorf("plain error should not be classified as transient")
|
||||||
|
}
|
||||||
|
if isTransientApexError(nil) {
|
||||||
|
t.Errorf("nil error should not be classified as transient")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// startTestServer spins up a UDP DNS server that answers every query with the
|
||||||
|
// given handler, returning its address and a shutdown func.
|
||||||
|
func startTestServer(t *testing.T, handler dns.HandlerFunc) (string, func()) {
|
||||||
|
t.Helper()
|
||||||
|
mux := dns.NewServeMux()
|
||||||
|
mux.HandleFunc(".", handler)
|
||||||
|
pc, err := net.ListenPacket("udp", "127.0.0.1:0")
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("listen: %v", err)
|
||||||
|
}
|
||||||
|
srv := &dns.Server{PacketConn: pc, Handler: mux}
|
||||||
|
go srv.ActivateAndServe()
|
||||||
|
return pc.LocalAddr().String(), func() { srv.Shutdown() }
|
||||||
|
}
|
||||||
|
|
||||||
|
func answerWith(rcode int, withAnswer bool) dns.HandlerFunc {
|
||||||
|
return func(w dns.ResponseWriter, r *dns.Msg) {
|
||||||
|
m := new(dns.Msg)
|
||||||
|
m.SetReply(r)
|
||||||
|
m.Rcode = rcode
|
||||||
|
if withAnswer && len(r.Question) > 0 {
|
||||||
|
rr, _ := dns.NewRR(r.Question[0].Name + " 300 IN CNAME target.example.")
|
||||||
|
if rr != nil {
|
||||||
|
m.Answer = append(m.Answer, rr)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
w.WriteMsg(m)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestQueryAtAuthFailsOverTransientRcode(t *testing.T) {
|
||||||
|
q := dns.Question{Name: "example.com.", Qtype: dns.TypeCNAME, Qclass: dns.ClassINET}
|
||||||
|
|
||||||
|
t.Run("prefers definitive answer over SERVFAIL", func(t *testing.T) {
|
||||||
|
bad, stopBad := startTestServer(t, answerWith(dns.RcodeServerFailure, false))
|
||||||
|
defer stopBad()
|
||||||
|
good, stopGood := startTestServer(t, answerWith(dns.RcodeSuccess, true))
|
||||||
|
defer stopGood()
|
||||||
|
|
||||||
|
r, server, err := queryAtAuth(context.Background(), "", []string{bad, good}, q, false)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("unexpected error: %v", err)
|
||||||
|
}
|
||||||
|
if r.Rcode != dns.RcodeSuccess {
|
||||||
|
t.Fatalf("got rcode %s, want NOERROR", rcodeText(r.Rcode))
|
||||||
|
}
|
||||||
|
if server != good {
|
||||||
|
t.Fatalf("answered by %s, want the healthy server %s", server, good)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
|
||||||
|
t.Run("returns transient response when every server fails", func(t *testing.T) {
|
||||||
|
s1, stop1 := startTestServer(t, answerWith(dns.RcodeServerFailure, false))
|
||||||
|
defer stop1()
|
||||||
|
s2, stop2 := startTestServer(t, answerWith(dns.RcodeRefused, false))
|
||||||
|
defer stop2()
|
||||||
|
|
||||||
|
r, _, err := queryAtAuth(context.Background(), "", []string{s1, s2}, q, false)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("unexpected error: %v", err)
|
||||||
|
}
|
||||||
|
if !isTransientRcode(r.Rcode) {
|
||||||
|
t.Fatalf("got rcode %s, want a transient rcode preserved", rcodeText(r.Rcode))
|
||||||
|
}
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
@ -22,11 +22,20 @@ func (apexLookupRule) Evaluate(ctx context.Context, obs sdk.ObservationGetter, _
|
||||||
if data.Apex != "" {
|
if data.Apex != "" {
|
||||||
return okState(data.Apex, fmt.Sprintf("apex %s located", data.Apex))
|
return okState(data.Apex, fmt.Sprintf("apex %s located", data.Apex))
|
||||||
}
|
}
|
||||||
|
// A transport/resolver fault means the apex could not be observed, not that the
|
||||||
|
// delegation is broken. Report it as Unknown so an intermittent recursive-resolver
|
||||||
|
// glitch does not flap the check into Crit; only definitive evidence drives Crit.
|
||||||
|
status := sdk.StatusCrit
|
||||||
|
hint := "Check that the parent delegation exists and that the zone is published."
|
||||||
|
if data.ApexLookupTransient {
|
||||||
|
status = sdk.StatusUnknown
|
||||||
|
hint = "The zone apex could not be observed due to a resolver/transport fault; retry and check recursive-resolver reachability."
|
||||||
|
}
|
||||||
return []sdk.CheckState{withHint(sdk.CheckState{
|
return []sdk.CheckState{withHint(sdk.CheckState{
|
||||||
Status: sdk.StatusCrit,
|
Status: status,
|
||||||
Subject: data.Owner,
|
Subject: data.Owner,
|
||||||
Message: fmt.Sprintf("could not locate zone apex: %s", data.ApexLookupError),
|
Message: fmt.Sprintf("could not locate zone apex: %s", data.ApexLookupError),
|
||||||
}, "Check that the parent delegation exists and that the zone is published.")}
|
}, hint)}
|
||||||
}
|
}
|
||||||
|
|
||||||
type cnameAtApexRule struct{}
|
type cnameAtApexRule struct{}
|
||||||
|
|
|
||||||
|
|
@ -90,11 +90,22 @@ func (chainQueryErrorRule) Evaluate(ctx context.Context, obs sdk.ObservationGett
|
||||||
if data.ChainTerminated.Reason != TermQueryErr {
|
if data.ChainTerminated.Reason != TermQueryErr {
|
||||||
return okState(data.Owner, "all chain queries succeeded")
|
return okState(data.Owner, "all chain queries succeeded")
|
||||||
}
|
}
|
||||||
|
// A transport failure (connection refused, timeout, network unreachable) means
|
||||||
|
// we could not observe the alias, not that it is broken: report it as Unknown so
|
||||||
|
// an intermittent reachability glitch does not flap the check into Warn/Crit. A
|
||||||
|
// non-transient failure (e.g. the target has no locatable apex) is definitive
|
||||||
|
// evidence the alias cannot be followed: report it as Warn.
|
||||||
|
status, verb := sdk.StatusWarn, "failed"
|
||||||
|
hint := "Check that the alias target exists and is delegated; the alias is unusable while the query fails."
|
||||||
|
if data.ChainTerminated.Transient {
|
||||||
|
status, verb = sdk.StatusUnknown, "could not be completed"
|
||||||
|
hint = "Check authoritative-server reachability and firewall rules; the alias state could not be determined while queries fail."
|
||||||
|
}
|
||||||
return []sdk.CheckState{withHint(sdk.CheckState{
|
return []sdk.CheckState{withHint(sdk.CheckState{
|
||||||
Status: sdk.StatusWarn,
|
Status: status,
|
||||||
Subject: data.ChainTerminated.Subject,
|
Subject: data.ChainTerminated.Subject,
|
||||||
Message: fmt.Sprintf("CNAME query for %s failed: %s", data.ChainTerminated.Subject, data.ChainTerminated.Detail),
|
Message: fmt.Sprintf("CNAME query for %s %s: %s", data.ChainTerminated.Subject, verb, data.ChainTerminated.Detail),
|
||||||
}, "Check authoritative-server reachability and firewall rules; the alias is unusable while queries fail.")}
|
}, hint)}
|
||||||
}
|
}
|
||||||
|
|
||||||
type chainRcodeRule struct{}
|
type chainRcodeRule struct{}
|
||||||
|
|
@ -114,18 +125,36 @@ func (chainRcodeRule) Evaluate(ctx context.Context, obs sdk.ObservationGetter, _
|
||||||
}
|
}
|
||||||
var out []sdk.CheckState
|
var out []sdk.CheckState
|
||||||
if data.ChainTerminated.Reason == TermRcode {
|
if data.ChainTerminated.Reason == TermRcode {
|
||||||
|
// A transient rcode (SERVFAIL/REFUSED from every auth server) means we could
|
||||||
|
// not observe the record, not that the zone published a negative answer:
|
||||||
|
// report it as Unknown so a flaky server does not flap the check into Crit.
|
||||||
|
// A definitive NXDOMAIN mid-chain is a real break and stays Crit.
|
||||||
|
status := sdk.StatusCrit
|
||||||
|
hint := "Ensure the zone publishes the expected record; NXDOMAIN mid-chain breaks the alias."
|
||||||
|
if data.ChainTerminated.Transient {
|
||||||
|
status = sdk.StatusUnknown
|
||||||
|
hint = "Check authoritative-server reachability; SERVFAIL/REFUSED from every server leaves the alias state undetermined."
|
||||||
|
}
|
||||||
out = append(out, withHint(sdk.CheckState{
|
out = append(out, withHint(sdk.CheckState{
|
||||||
Status: sdk.StatusCrit,
|
Status: status,
|
||||||
Subject: data.ChainTerminated.Subject,
|
Subject: data.ChainTerminated.Subject,
|
||||||
Message: fmt.Sprintf("server answered %s mid-chain", data.ChainTerminated.Rcode),
|
Message: fmt.Sprintf("server answered %s mid-chain", data.ChainTerminated.Rcode),
|
||||||
}, "Ensure the zone publishes the expected record; NXDOMAIN/SERVFAIL mid-chain breaks the alias."))
|
}, hint))
|
||||||
}
|
}
|
||||||
if data.FinalRcode != "" && data.FinalRcode != "NOERROR" {
|
if data.FinalRcode != "" && data.FinalRcode != "NOERROR" {
|
||||||
|
// Same distinction for the final A/AAAA lookup: a SERVFAIL/REFUSED could not
|
||||||
|
// be observed (Unknown), a definitive rcode is a real publication gap (Warn).
|
||||||
|
status := sdk.StatusWarn
|
||||||
|
hint := "Check the upstream zone's A/AAAA publication."
|
||||||
|
if data.FinalRcodeTransient {
|
||||||
|
status = sdk.StatusUnknown
|
||||||
|
hint = "Check the upstream auth servers' reachability; the final A/AAAA state could not be determined."
|
||||||
|
}
|
||||||
out = append(out, withHint(sdk.CheckState{
|
out = append(out, withHint(sdk.CheckState{
|
||||||
Status: sdk.StatusWarn,
|
Status: status,
|
||||||
Subject: data.FinalTarget,
|
Subject: data.FinalTarget,
|
||||||
Message: fmt.Sprintf("final A lookup for %s returned %s", data.FinalTarget, data.FinalRcode),
|
Message: fmt.Sprintf("final A lookup for %s returned %s", data.FinalTarget, data.FinalRcode),
|
||||||
}, "Check the upstream zone's A/AAAA publication."))
|
}, hint))
|
||||||
}
|
}
|
||||||
if len(out) == 0 {
|
if len(out) == 0 {
|
||||||
return okState(data.Owner, "all chain and final lookups returned NOERROR")
|
return okState(data.Owner, "all chain and final lookups returned NOERROR")
|
||||||
|
|
|
||||||
|
|
@ -81,6 +81,19 @@ func TestApexLookupRule(t *testing.T) {
|
||||||
t.Fatalf("want hint, got none")
|
t.Fatalf("want hint, got none")
|
||||||
}
|
}
|
||||||
})
|
})
|
||||||
|
t.Run("transient", func(t *testing.T) {
|
||||||
|
// A resolver fault (e.g. "server misbehaving") could not observe the apex,
|
||||||
|
// so it must be Unknown rather than Crit to avoid flapping the check.
|
||||||
|
data := &AliasData{
|
||||||
|
Owner: "nemunai.re.",
|
||||||
|
ApexLookupError: "lookup nemunai.re on 127.0.0.11:53: server misbehaving",
|
||||||
|
ApexLookupTransient: true,
|
||||||
|
}
|
||||||
|
s := assertSingle(t, run(apexLookupRule{}, data, nil), sdk.StatusUnknown)
|
||||||
|
if s.Meta[hintKey] == nil {
|
||||||
|
t.Fatalf("want hint, got none")
|
||||||
|
}
|
||||||
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
func TestChainLoopRule(t *testing.T) {
|
func TestChainLoopRule(t *testing.T) {
|
||||||
|
|
@ -122,9 +135,18 @@ func TestChainQueryErrorRule(t *testing.T) {
|
||||||
d.ChainTerminated.Reason = TermOK
|
d.ChainTerminated.Reason = TermOK
|
||||||
assertSingle(t, run(chainQueryErrorRule{}, d, nil), sdk.StatusOK)
|
assertSingle(t, run(chainQueryErrorRule{}, d, nil), sdk.StatusOK)
|
||||||
})
|
})
|
||||||
t.Run("query err", func(t *testing.T) {
|
t.Run("transient query err", func(t *testing.T) {
|
||||||
|
// A transport fault (timeout) could not observe the alias, so it must be
|
||||||
|
// Unknown rather than Warn to avoid flapping the check.
|
||||||
d := apexKnownData()
|
d := apexKnownData()
|
||||||
d.ChainTerminated = ChainTermination{Reason: TermQueryErr, Subject: "broken.example.com.", Detail: "timeout"}
|
d.ChainTerminated = ChainTermination{Reason: TermQueryErr, Subject: "broken.example.com.", Detail: "timeout", Transient: true}
|
||||||
|
assertSingle(t, run(chainQueryErrorRule{}, d, nil), sdk.StatusUnknown)
|
||||||
|
})
|
||||||
|
t.Run("definitive query err", func(t *testing.T) {
|
||||||
|
// A non-transient failure (target has no locatable apex) is definitive
|
||||||
|
// evidence the alias cannot be followed: Warn, not Unknown.
|
||||||
|
d := apexKnownData()
|
||||||
|
d.ChainTerminated = ChainTermination{Reason: TermQueryErr, Subject: "target.example.", Detail: "re-anchor for target.example. failed: could not locate apex of target.example."}
|
||||||
assertSingle(t, run(chainQueryErrorRule{}, d, nil), sdk.StatusWarn)
|
assertSingle(t, run(chainQueryErrorRule{}, d, nil), sdk.StatusWarn)
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
@ -140,14 +162,32 @@ func TestChainRcodeRule(t *testing.T) {
|
||||||
d.ChainTerminated = ChainTermination{Reason: TermRcode, Subject: "gone.example.com.", Rcode: "NXDOMAIN"}
|
d.ChainTerminated = ChainTermination{Reason: TermRcode, Subject: "gone.example.com.", Rcode: "NXDOMAIN"}
|
||||||
assertSingle(t, run(chainRcodeRule{}, d, nil), sdk.StatusCrit)
|
assertSingle(t, run(chainRcodeRule{}, d, nil), sdk.StatusCrit)
|
||||||
})
|
})
|
||||||
t.Run("final rcode", func(t *testing.T) {
|
t.Run("mid-chain transient SERVFAIL", func(t *testing.T) {
|
||||||
|
// SERVFAIL from every auth server could not be observed: Unknown, not Crit.
|
||||||
|
d := apexKnownData()
|
||||||
|
d.ChainTerminated = ChainTermination{Reason: TermRcode, Subject: "flaky.example.com.", Rcode: "SERVFAIL", Transient: true}
|
||||||
|
assertSingle(t, run(chainRcodeRule{}, d, nil), sdk.StatusUnknown)
|
||||||
|
})
|
||||||
|
t.Run("final definitive rcode", func(t *testing.T) {
|
||||||
|
d := apexKnownData()
|
||||||
|
d.ChainTerminated.Reason = TermOK
|
||||||
|
d.FinalTarget = "target.example."
|
||||||
|
d.FinalRcode = "NXDOMAIN"
|
||||||
|
states := run(chainRcodeRule{}, d, nil)
|
||||||
|
if len(states) != 1 || states[0].Status != sdk.StatusWarn {
|
||||||
|
t.Fatalf("want single WARN, got %+v", states)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
t.Run("final transient rcode", func(t *testing.T) {
|
||||||
|
// SERVFAIL on the final lookup could not be observed: Unknown, not Warn.
|
||||||
d := apexKnownData()
|
d := apexKnownData()
|
||||||
d.ChainTerminated.Reason = TermOK
|
d.ChainTerminated.Reason = TermOK
|
||||||
d.FinalTarget = "target.example."
|
d.FinalTarget = "target.example."
|
||||||
d.FinalRcode = "SERVFAIL"
|
d.FinalRcode = "SERVFAIL"
|
||||||
|
d.FinalRcodeTransient = true
|
||||||
states := run(chainRcodeRule{}, d, nil)
|
states := run(chainRcodeRule{}, d, nil)
|
||||||
if len(states) != 1 || states[0].Status != sdk.StatusWarn {
|
if len(states) != 1 || states[0].Status != sdk.StatusUnknown {
|
||||||
t.Fatalf("want single WARN, got %+v", states)
|
t.Fatalf("want single UNKNOWN, got %+v", states)
|
||||||
}
|
}
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -47,15 +47,23 @@ type ChainTermination struct {
|
||||||
Subject string `json:"subject,omitempty"`
|
Subject string `json:"subject,omitempty"`
|
||||||
Detail string `json:"detail,omitempty"`
|
Detail string `json:"detail,omitempty"`
|
||||||
Rcode string `json:"rcode,omitempty"` // only with TermRcode
|
Rcode string `json:"rcode,omitempty"` // only with TermRcode
|
||||||
|
// Transient is meaningful with TermQueryErr and TermRcode: true when the failure
|
||||||
|
// could not be observed as a definitive answer (a transport/resolver fault, or a
|
||||||
|
// SERVFAIL/REFUSED from every auth server), false when it stems from definitive
|
||||||
|
// evidence such as a target with no locatable apex or an authoritative NXDOMAIN.
|
||||||
|
Transient bool `json:"transient,omitempty"`
|
||||||
}
|
}
|
||||||
|
|
||||||
// AliasData carries raw facts only; judgement is delegated to the rules.
|
// AliasData carries raw facts only; judgement is delegated to the rules.
|
||||||
type AliasData struct {
|
type AliasData struct {
|
||||||
Owner string `json:"owner"`
|
Owner string `json:"owner"`
|
||||||
|
|
||||||
// Apex is empty iff the apex lookup failed; ApexLookupError explains why.
|
// Apex is empty iff the apex lookup failed; ApexLookupError explains why and
|
||||||
|
// ApexLookupTransient is true when the failure was a transport/resolver fault
|
||||||
|
// (could not observe) rather than definitive evidence the apex is missing.
|
||||||
Apex string `json:"apex,omitempty"`
|
Apex string `json:"apex,omitempty"`
|
||||||
ApexLookupError string `json:"apex_lookup_error,omitempty"`
|
ApexLookupError string `json:"apex_lookup_error,omitempty"`
|
||||||
|
ApexLookupTransient bool `json:"apex_lookup_transient,omitempty"`
|
||||||
AuthServers []string `json:"auth_servers,omitempty"`
|
AuthServers []string `json:"auth_servers,omitempty"`
|
||||||
|
|
||||||
Chain []ChainHop `json:"chain,omitempty"`
|
Chain []ChainHop `json:"chain,omitempty"`
|
||||||
|
|
@ -66,7 +74,11 @@ type AliasData struct {
|
||||||
FinalTarget string `json:"final_target,omitempty"`
|
FinalTarget string `json:"final_target,omitempty"`
|
||||||
FinalA []string `json:"final_a,omitempty"`
|
FinalA []string `json:"final_a,omitempty"`
|
||||||
FinalAAAA []string `json:"final_aaaa,omitempty"`
|
FinalAAAA []string `json:"final_aaaa,omitempty"`
|
||||||
|
// FinalRcode is the non-NOERROR rcode of the final A/AAAA lookup, if any;
|
||||||
|
// FinalRcodeTransient is true when it was a SERVFAIL/REFUSED (could not observe)
|
||||||
|
// rather than a definitive negative answer.
|
||||||
FinalRcode string `json:"final_rcode,omitempty"`
|
FinalRcode string `json:"final_rcode,omitempty"`
|
||||||
|
FinalRcodeTransient bool `json:"final_rcode_transient,omitempty"`
|
||||||
|
|
||||||
// Coexisting is populated only when Owner has a CNAME.
|
// Coexisting is populated only when Owner has a CNAME.
|
||||||
Coexisting []CoexistingRRset `json:"coexisting,omitempty"`
|
Coexisting []CoexistingRRset `json:"coexisting,omitempty"`
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue