checker: report transient apex-lookup failures as Unknown, not Crit
apexLookupRule mapped every findApex failure to Crit, including transport and resolver faults like "lookup nemunai.re on 127.0.0.11:53: server misbehaving" — a flaky recursive resolver, not a broken delegation. That made the check flap into Crit whenever the resolver hiccuped, the same class of false negative the chain path already fixed. Mark apex-lookup failures that stem from a transport/resolver fault (resolveZoneNSAddrs net errors, recursiveExchange transport errors, and SERVFAIL/REFUSED seen during the SOA walk) as transient via a typed error, surface it as ApexLookupTransient, and have apexLookupRule report Unknown for those. Definitive failures (NXDOMAIN-only walk, no resolvable NS) still drive Crit.
This commit is contained in:
parent
af0dceca6c
commit
da6def100c
7 changed files with 123 additions and 23 deletions
|
|
@ -26,6 +26,7 @@ func (p *aliasProvider) Collect(ctx context.Context, opts sdk.CheckerOptions) (a
|
||||||
apex, servers, err := findApex(ctx, owner, resolver)
|
apex, servers, err := findApex(ctx, owner, resolver)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
data.ApexLookupError = err.Error()
|
data.ApexLookupError = err.Error()
|
||||||
|
data.ApexLookupTransient = isTransientApexError(err)
|
||||||
return data, nil
|
return data, nil
|
||||||
}
|
}
|
||||||
data.Apex = apex
|
data.Apex = apex
|
||||||
|
|
@ -122,10 +123,13 @@ func (c *chainCtx) walk(ctx context.Context, name string) {
|
||||||
q := dns.Question{Name: current, Qtype: dns.TypeCNAME, Qclass: dns.ClassINET}
|
q := dns.Question{Name: current, Qtype: dns.TypeCNAME, Qclass: dns.ClassINET}
|
||||||
r, server, err := c.queryFor(ctx, currentServers, q)
|
r, server, err := c.queryFor(ctx, currentServers, q)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
|
// A query that never produced a response is a transport/resolver
|
||||||
|
// fault: we could not observe the alias, so report it as transient.
|
||||||
c.data.ChainTerminated = ChainTermination{
|
c.data.ChainTerminated = ChainTermination{
|
||||||
Reason: TermQueryErr,
|
Reason: TermQueryErr,
|
||||||
Subject: current,
|
Subject: current,
|
||||||
Detail: err.Error(),
|
Detail: err.Error(),
|
||||||
|
Transient: true,
|
||||||
}
|
}
|
||||||
c.data.FinalTarget = current
|
c.data.FinalTarget = current
|
||||||
return
|
return
|
||||||
|
|
@ -189,10 +193,15 @@ func (c *chainCtx) walk(ctx context.Context, name string) {
|
||||||
// answered by the parent's auth set.
|
// answered by the parent's auth set.
|
||||||
zone, ns, zerr := c.reanchor(ctx, target)
|
zone, ns, zerr := c.reanchor(ctx, target)
|
||||||
if zerr != nil {
|
if zerr != nil {
|
||||||
|
// Re-anchoring fails either because the target genuinely has no
|
||||||
|
// locatable apex (definitive: the alias points into the void) or
|
||||||
|
// because a resolver/transport fault prevented observing it. Only the
|
||||||
|
// latter is transient; classify so the rule does not mask a real break.
|
||||||
c.data.ChainTerminated = ChainTermination{
|
c.data.ChainTerminated = ChainTermination{
|
||||||
Reason: TermQueryErr,
|
Reason: TermQueryErr,
|
||||||
Subject: target,
|
Subject: target,
|
||||||
Detail: fmt.Sprintf("re-anchor for %s failed: %v", target, zerr),
|
Detail: fmt.Sprintf("re-anchor for %s failed: %v", target, zerr),
|
||||||
|
Transient: isTransientApexError(zerr),
|
||||||
}
|
}
|
||||||
c.data.FinalTarget = target
|
c.data.FinalTarget = target
|
||||||
return
|
return
|
||||||
|
|
|
||||||
|
|
@ -2,6 +2,7 @@ package checker
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"context"
|
"context"
|
||||||
|
"errors"
|
||||||
"fmt"
|
"fmt"
|
||||||
"net"
|
"net"
|
||||||
"strings"
|
"strings"
|
||||||
|
|
@ -61,14 +62,22 @@ func hostPort(host, port string) string {
|
||||||
|
|
||||||
func findApex(ctx context.Context, fqdn, resolver string) (apex string, servers []string, err error) {
|
func findApex(ctx context.Context, fqdn, resolver string) (apex string, servers []string, err error) {
|
||||||
labels := dns.SplitDomainName(fqdn)
|
labels := dns.SplitDomainName(fqdn)
|
||||||
|
// transientSeen records whether any candidate failed for a transport or
|
||||||
|
// SERVFAIL/REFUSED reason, so a fall-through "could not locate apex" caused by
|
||||||
|
// a flaky recursive resolver is reported as transient rather than definitive.
|
||||||
|
transientSeen := false
|
||||||
for i := range labels {
|
for i := range labels {
|
||||||
candidate := dns.Fqdn(strings.Join(labels[i:], "."))
|
candidate := dns.Fqdn(strings.Join(labels[i:], "."))
|
||||||
q := dns.Question{Name: candidate, Qtype: dns.TypeSOA, Qclass: dns.ClassINET}
|
q := dns.Question{Name: candidate, Qtype: dns.TypeSOA, Qclass: dns.ClassINET}
|
||||||
r, rerr := recursiveExchange(ctx, resolver, q)
|
r, rerr := recursiveExchange(ctx, resolver, q)
|
||||||
if rerr != nil {
|
if rerr != nil {
|
||||||
|
transientSeen = true
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
if r.Rcode != dns.RcodeSuccess {
|
if r.Rcode != dns.RcodeSuccess {
|
||||||
|
if isTransientRcode(r.Rcode) {
|
||||||
|
transientSeen = true
|
||||||
|
}
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
hasSOA := false
|
hasSOA := false
|
||||||
|
|
@ -87,14 +96,20 @@ func findApex(ctx context.Context, fqdn, resolver string) (apex string, servers
|
||||||
apex = candidate
|
apex = candidate
|
||||||
servers, err = resolveZoneNSAddrs(ctx, apex)
|
servers, err = resolveZoneNSAddrs(ctx, apex)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return "", nil, err
|
// A resolver fault (e.g. "server misbehaving") means we could not
|
||||||
|
// observe the apex's NS, not that the delegation is broken.
|
||||||
|
return "", nil, transientApexError{err}
|
||||||
}
|
}
|
||||||
if len(servers) == 0 {
|
if len(servers) == 0 {
|
||||||
return "", nil, fmt.Errorf("apex %s has no resolvable NS", apex)
|
return "", nil, fmt.Errorf("apex %s has no resolvable NS", apex)
|
||||||
}
|
}
|
||||||
return apex, servers, nil
|
return apex, servers, nil
|
||||||
}
|
}
|
||||||
return "", nil, fmt.Errorf("could not locate apex of %s", fqdn)
|
err = fmt.Errorf("could not locate apex of %s", fqdn)
|
||||||
|
if transientSeen {
|
||||||
|
return "", nil, transientApexError{err}
|
||||||
|
}
|
||||||
|
return "", nil, err
|
||||||
}
|
}
|
||||||
|
|
||||||
func resolveZoneNSAddrs(ctx context.Context, zone string) ([]string, error) {
|
func resolveZoneNSAddrs(ctx context.Context, zone string) ([]string, error) {
|
||||||
|
|
@ -160,6 +175,19 @@ func queryAtAuth(ctx context.Context, proto string, servers []string, q dns.Ques
|
||||||
return nil, "", lastErr
|
return nil, "", lastErr
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// transientApexError marks an apex-lookup failure that stems from a transport or
|
||||||
|
// resolver fault rather than definitive DNS evidence, so apexLookupRule can
|
||||||
|
// report it as Unknown instead of flapping the check into Crit.
|
||||||
|
type transientApexError struct{ err error }
|
||||||
|
|
||||||
|
func (e transientApexError) Error() string { return e.err.Error() }
|
||||||
|
func (e transientApexError) Unwrap() error { return e.err }
|
||||||
|
|
||||||
|
func isTransientApexError(err error) bool {
|
||||||
|
var t transientApexError
|
||||||
|
return errors.As(err, &t)
|
||||||
|
}
|
||||||
|
|
||||||
// isTransientRcode reports whether an rcode is worth retrying against another
|
// isTransientRcode reports whether an rcode is worth retrying against another
|
||||||
// auth server rather than treating as the zone's final answer. SERVFAIL and
|
// auth server rather than treating as the zone's final answer. SERVFAIL and
|
||||||
// REFUSED are typically per-server faults (backend down, server not yet loaded
|
// REFUSED are typically per-server faults (backend down, server not yet loaded
|
||||||
|
|
|
||||||
|
|
@ -2,6 +2,8 @@ package checker
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"context"
|
"context"
|
||||||
|
"errors"
|
||||||
|
"fmt"
|
||||||
"net"
|
"net"
|
||||||
"testing"
|
"testing"
|
||||||
|
|
||||||
|
|
@ -23,6 +25,22 @@ func TestIsTransientRcode(t *testing.T) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestIsTransientApexError(t *testing.T) {
|
||||||
|
wrapped := transientApexError{errors.New("server misbehaving")}
|
||||||
|
if !isTransientApexError(wrapped) {
|
||||||
|
t.Errorf("transientApexError should be classified as transient")
|
||||||
|
}
|
||||||
|
if !isTransientApexError(fmt.Errorf("wrapped: %w", wrapped)) {
|
||||||
|
t.Errorf("error wrapping a transientApexError should be transient")
|
||||||
|
}
|
||||||
|
if isTransientApexError(errors.New("could not locate apex of example.com.")) {
|
||||||
|
t.Errorf("plain error should not be classified as transient")
|
||||||
|
}
|
||||||
|
if isTransientApexError(nil) {
|
||||||
|
t.Errorf("nil error should not be classified as transient")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// startTestServer spins up a UDP DNS server that answers every query with the
|
// startTestServer spins up a UDP DNS server that answers every query with the
|
||||||
// given handler, returning its address and a shutdown func.
|
// given handler, returning its address and a shutdown func.
|
||||||
func startTestServer(t *testing.T, handler dns.HandlerFunc) (string, func()) {
|
func startTestServer(t *testing.T, handler dns.HandlerFunc) (string, func()) {
|
||||||
|
|
|
||||||
|
|
@ -22,11 +22,20 @@ func (apexLookupRule) Evaluate(ctx context.Context, obs sdk.ObservationGetter, _
|
||||||
if data.Apex != "" {
|
if data.Apex != "" {
|
||||||
return okState(data.Apex, fmt.Sprintf("apex %s located", data.Apex))
|
return okState(data.Apex, fmt.Sprintf("apex %s located", data.Apex))
|
||||||
}
|
}
|
||||||
|
// A transport/resolver fault means the apex could not be observed, not that the
|
||||||
|
// delegation is broken. Report it as Unknown so an intermittent recursive-resolver
|
||||||
|
// glitch does not flap the check into Crit; only definitive evidence drives Crit.
|
||||||
|
status := sdk.StatusCrit
|
||||||
|
hint := "Check that the parent delegation exists and that the zone is published."
|
||||||
|
if data.ApexLookupTransient {
|
||||||
|
status = sdk.StatusUnknown
|
||||||
|
hint = "The zone apex could not be observed due to a resolver/transport fault; retry and check recursive-resolver reachability."
|
||||||
|
}
|
||||||
return []sdk.CheckState{withHint(sdk.CheckState{
|
return []sdk.CheckState{withHint(sdk.CheckState{
|
||||||
Status: sdk.StatusCrit,
|
Status: status,
|
||||||
Subject: data.Owner,
|
Subject: data.Owner,
|
||||||
Message: fmt.Sprintf("could not locate zone apex: %s", data.ApexLookupError),
|
Message: fmt.Sprintf("could not locate zone apex: %s", data.ApexLookupError),
|
||||||
}, "Check that the parent delegation exists and that the zone is published.")}
|
}, hint)}
|
||||||
}
|
}
|
||||||
|
|
||||||
type cnameAtApexRule struct{}
|
type cnameAtApexRule struct{}
|
||||||
|
|
|
||||||
|
|
@ -90,15 +90,22 @@ func (chainQueryErrorRule) Evaluate(ctx context.Context, obs sdk.ObservationGett
|
||||||
if data.ChainTerminated.Reason != TermQueryErr {
|
if data.ChainTerminated.Reason != TermQueryErr {
|
||||||
return okState(data.Owner, "all chain queries succeeded")
|
return okState(data.Owner, "all chain queries succeeded")
|
||||||
}
|
}
|
||||||
// A transport failure (connection refused, timeout, network unreachable)
|
// A transport failure (connection refused, timeout, network unreachable) means
|
||||||
// means we could not observe the alias, not that the alias is broken. Report
|
// we could not observe the alias, not that it is broken: report it as Unknown so
|
||||||
// it as Unknown so an intermittent reachability glitch does not flap the
|
// an intermittent reachability glitch does not flap the check into Warn/Crit. A
|
||||||
// check into Warn/Crit; only definitive DNS evidence drives those statuses.
|
// non-transient failure (e.g. the target has no locatable apex) is definitive
|
||||||
|
// evidence the alias cannot be followed: report it as Warn.
|
||||||
|
status, verb := sdk.StatusWarn, "failed"
|
||||||
|
hint := "Check that the alias target exists and is delegated; the alias is unusable while the query fails."
|
||||||
|
if data.ChainTerminated.Transient {
|
||||||
|
status, verb = sdk.StatusUnknown, "could not be completed"
|
||||||
|
hint = "Check authoritative-server reachability and firewall rules; the alias state could not be determined while queries fail."
|
||||||
|
}
|
||||||
return []sdk.CheckState{withHint(sdk.CheckState{
|
return []sdk.CheckState{withHint(sdk.CheckState{
|
||||||
Status: sdk.StatusUnknown,
|
Status: status,
|
||||||
Subject: data.ChainTerminated.Subject,
|
Subject: data.ChainTerminated.Subject,
|
||||||
Message: fmt.Sprintf("CNAME query for %s could not be completed: %s", data.ChainTerminated.Subject, data.ChainTerminated.Detail),
|
Message: fmt.Sprintf("CNAME query for %s %s: %s", data.ChainTerminated.Subject, verb, data.ChainTerminated.Detail),
|
||||||
}, "Check authoritative-server reachability and firewall rules; the alias state could not be determined while queries fail.")}
|
}, hint)}
|
||||||
}
|
}
|
||||||
|
|
||||||
type chainRcodeRule struct{}
|
type chainRcodeRule struct{}
|
||||||
|
|
|
||||||
|
|
@ -81,6 +81,19 @@ func TestApexLookupRule(t *testing.T) {
|
||||||
t.Fatalf("want hint, got none")
|
t.Fatalf("want hint, got none")
|
||||||
}
|
}
|
||||||
})
|
})
|
||||||
|
t.Run("transient", func(t *testing.T) {
|
||||||
|
// A resolver fault (e.g. "server misbehaving") could not observe the apex,
|
||||||
|
// so it must be Unknown rather than Crit to avoid flapping the check.
|
||||||
|
data := &AliasData{
|
||||||
|
Owner: "nemunai.re.",
|
||||||
|
ApexLookupError: "lookup nemunai.re on 127.0.0.11:53: server misbehaving",
|
||||||
|
ApexLookupTransient: true,
|
||||||
|
}
|
||||||
|
s := assertSingle(t, run(apexLookupRule{}, data, nil), sdk.StatusUnknown)
|
||||||
|
if s.Meta[hintKey] == nil {
|
||||||
|
t.Fatalf("want hint, got none")
|
||||||
|
}
|
||||||
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
func TestChainLoopRule(t *testing.T) {
|
func TestChainLoopRule(t *testing.T) {
|
||||||
|
|
@ -122,11 +135,20 @@ func TestChainQueryErrorRule(t *testing.T) {
|
||||||
d.ChainTerminated.Reason = TermOK
|
d.ChainTerminated.Reason = TermOK
|
||||||
assertSingle(t, run(chainQueryErrorRule{}, d, nil), sdk.StatusOK)
|
assertSingle(t, run(chainQueryErrorRule{}, d, nil), sdk.StatusOK)
|
||||||
})
|
})
|
||||||
t.Run("query err", func(t *testing.T) {
|
t.Run("transient query err", func(t *testing.T) {
|
||||||
|
// A transport fault (timeout) could not observe the alias, so it must be
|
||||||
|
// Unknown rather than Warn to avoid flapping the check.
|
||||||
d := apexKnownData()
|
d := apexKnownData()
|
||||||
d.ChainTerminated = ChainTermination{Reason: TermQueryErr, Subject: "broken.example.com.", Detail: "timeout"}
|
d.ChainTerminated = ChainTermination{Reason: TermQueryErr, Subject: "broken.example.com.", Detail: "timeout", Transient: true}
|
||||||
assertSingle(t, run(chainQueryErrorRule{}, d, nil), sdk.StatusUnknown)
|
assertSingle(t, run(chainQueryErrorRule{}, d, nil), sdk.StatusUnknown)
|
||||||
})
|
})
|
||||||
|
t.Run("definitive query err", func(t *testing.T) {
|
||||||
|
// A non-transient failure (target has no locatable apex) is definitive
|
||||||
|
// evidence the alias cannot be followed: Warn, not Unknown.
|
||||||
|
d := apexKnownData()
|
||||||
|
d.ChainTerminated = ChainTermination{Reason: TermQueryErr, Subject: "target.example.", Detail: "re-anchor for target.example. failed: could not locate apex of target.example."}
|
||||||
|
assertSingle(t, run(chainQueryErrorRule{}, d, nil), sdk.StatusWarn)
|
||||||
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
func TestChainRcodeRule(t *testing.T) {
|
func TestChainRcodeRule(t *testing.T) {
|
||||||
|
|
|
||||||
|
|
@ -47,16 +47,23 @@ type ChainTermination struct {
|
||||||
Subject string `json:"subject,omitempty"`
|
Subject string `json:"subject,omitempty"`
|
||||||
Detail string `json:"detail,omitempty"`
|
Detail string `json:"detail,omitempty"`
|
||||||
Rcode string `json:"rcode,omitempty"` // only with TermRcode
|
Rcode string `json:"rcode,omitempty"` // only with TermRcode
|
||||||
|
// Transient is meaningful with TermQueryErr: true when the query could not be
|
||||||
|
// completed because of a transport/resolver fault (could not observe), false
|
||||||
|
// when it stems from definitive evidence such as a target with no locatable apex.
|
||||||
|
Transient bool `json:"transient,omitempty"`
|
||||||
}
|
}
|
||||||
|
|
||||||
// AliasData carries raw facts only; judgement is delegated to the rules.
|
// AliasData carries raw facts only; judgement is delegated to the rules.
|
||||||
type AliasData struct {
|
type AliasData struct {
|
||||||
Owner string `json:"owner"`
|
Owner string `json:"owner"`
|
||||||
|
|
||||||
// Apex is empty iff the apex lookup failed; ApexLookupError explains why.
|
// Apex is empty iff the apex lookup failed; ApexLookupError explains why and
|
||||||
Apex string `json:"apex,omitempty"`
|
// ApexLookupTransient is true when the failure was a transport/resolver fault
|
||||||
ApexLookupError string `json:"apex_lookup_error,omitempty"`
|
// (could not observe) rather than definitive evidence the apex is missing.
|
||||||
AuthServers []string `json:"auth_servers,omitempty"`
|
Apex string `json:"apex,omitempty"`
|
||||||
|
ApexLookupError string `json:"apex_lookup_error,omitempty"`
|
||||||
|
ApexLookupTransient bool `json:"apex_lookup_transient,omitempty"`
|
||||||
|
AuthServers []string `json:"auth_servers,omitempty"`
|
||||||
|
|
||||||
Chain []ChainHop `json:"chain,omitempty"`
|
Chain []ChainHop `json:"chain,omitempty"`
|
||||||
ChainTerminated ChainTermination `json:"chain_terminated"`
|
ChainTerminated ChainTermination `json:"chain_terminated"`
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue