Compare commits

...

4 commits

Author SHA1 Message Date
65687ce375 checker: report transient mid-chain and final rcodes as Unknown, not Crit/Warn
All checks were successful
continuous-integration/drone/tag Build is passing
continuous-integration/drone/push Build is passing
SERVFAIL/REFUSED from every auth server means the record could not be
observed, not that the zone published a negative answer. Mark such rcodes
transient on TermRcode terminations and final A/AAAA lookups so chainRcodeRule
reports Unknown instead of flapping the check into Crit/Warn; definitive
NXDOMAIN answers still drive Crit (mid-chain) and Warn (final).
2026-06-18 11:22:08 +09:00
da6def100c checker: report transient apex-lookup failures as Unknown, not Crit
All checks were successful
continuous-integration/drone/push Build is passing
continuous-integration/drone/tag Build is passing
apexLookupRule mapped every findApex failure to Crit, including transport
and resolver faults like "lookup nemunai.re on 127.0.0.11:53: server
misbehaving" — a flaky recursive resolver, not a broken delegation. That
made the check flap into Crit whenever the resolver hiccuped: the same
class of false alarm that was already fixed on the chain path.

Mark apex-lookup failures that stem from a transport/resolver fault
(resolveZoneNSAddrs net errors, recursiveExchange transport errors, and
SERVFAIL/REFUSED seen during the SOA walk) as transient via a typed
error, surface it as ApexLookupTransient, and have apexLookupRule report
Unknown for those. Definitive failures (NXDOMAIN-only walk, no resolvable
NS) still drive Crit.
2026-06-18 10:29:30 +09:00
af0dceca6c checker: fail over to other auth servers on SERVFAIL/REFUSED
queryAtAuth already failed over on transport errors but treated any DNS
response as final, so a SERVFAIL from the first auth server terminated the
chain as Crit even when a sibling server would answer NOERROR. This made
the check flap against a flaky server. Treat SERVFAIL/REFUSED as transient
and try the remaining servers, returning a definitive answer when any
server gives one and only falling back to the transient response (or the
last transport error) when every server fails.
2026-06-18 09:47:28 +09:00
680a7735f0 checker: report chain transport errors as Unknown, not Warn
A transport-level query failure (connection refused, timeout, network
unreachable) means the alias state could not be observed, not that the
alias is misconfigured. Mapping it to Warn made the check flap whenever a
flaky auth server alternated between refusing connections (Warn) and
answering SERVFAIL (Crit). Report TermQueryErr as Unknown so only
definitive DNS evidence drives Warn/Crit.
2026-06-18 09:31:37 +09:00
7 changed files with 300 additions and 34 deletions

View file

@ -26,6 +26,7 @@ func (p *aliasProvider) Collect(ctx context.Context, opts sdk.CheckerOptions) (a
apex, servers, err := findApex(ctx, owner, resolver) apex, servers, err := findApex(ctx, owner, resolver)
if err != nil { if err != nil {
data.ApexLookupError = err.Error() data.ApexLookupError = err.Error()
data.ApexLookupTransient = isTransientApexError(err)
return data, nil return data, nil
} }
data.Apex = apex data.Apex = apex
@ -122,10 +123,13 @@ func (c *chainCtx) walk(ctx context.Context, name string) {
q := dns.Question{Name: current, Qtype: dns.TypeCNAME, Qclass: dns.ClassINET} q := dns.Question{Name: current, Qtype: dns.TypeCNAME, Qclass: dns.ClassINET}
r, server, err := c.queryFor(ctx, currentServers, q) r, server, err := c.queryFor(ctx, currentServers, q)
if err != nil { if err != nil {
// A query that never produced a response is a transport/resolver
// fault: we could not observe the alias, so report it as transient.
c.data.ChainTerminated = ChainTermination{ c.data.ChainTerminated = ChainTermination{
Reason: TermQueryErr, Reason: TermQueryErr,
Subject: current, Subject: current,
Detail: err.Error(), Detail: err.Error(),
Transient: true,
} }
c.data.FinalTarget = current c.data.FinalTarget = current
return return
@ -133,11 +137,15 @@ func (c *chainCtx) walk(ctx context.Context, name string) {
if r.Rcode != dns.RcodeSuccess { if r.Rcode != dns.RcodeSuccess {
rcode := rcodeText(r.Rcode) rcode := rcodeText(r.Rcode)
// A SERVFAIL/REFUSED from every auth server means we could not observe
// the record, not that the zone published a negative answer; mark it
// transient so the rule reports Unknown instead of Crit.
c.data.ChainTerminated = ChainTermination{ c.data.ChainTerminated = ChainTermination{
Reason: TermRcode, Reason: TermRcode,
Subject: current, Subject: current,
Rcode: rcode, Rcode: rcode,
Detail: fmt.Sprintf("server answered %s for %s", rcode, current), Detail: fmt.Sprintf("server answered %s for %s", rcode, current),
Transient: isTransientRcode(r.Rcode),
} }
c.data.FinalTarget = current c.data.FinalTarget = current
return return
@ -189,10 +197,15 @@ func (c *chainCtx) walk(ctx context.Context, name string) {
// answered by the parent's auth set. // answered by the parent's auth set.
zone, ns, zerr := c.reanchor(ctx, target) zone, ns, zerr := c.reanchor(ctx, target)
if zerr != nil { if zerr != nil {
// Re-anchoring fails either because the target genuinely has no
// locatable apex (definitive: the alias points into the void) or
// because a resolver/transport fault prevented observing it. Only the
// latter is transient; classify so the rule does not mask a real break.
c.data.ChainTerminated = ChainTermination{ c.data.ChainTerminated = ChainTermination{
Reason: TermQueryErr, Reason: TermQueryErr,
Subject: target, Subject: target,
Detail: fmt.Sprintf("re-anchor for %s failed: %v", target, zerr), Detail: fmt.Sprintf("re-anchor for %s failed: %v", target, zerr),
Transient: isTransientApexError(zerr),
} }
c.data.FinalTarget = target c.data.FinalTarget = target
return return
@ -271,8 +284,9 @@ func extractCNAME(r *dns.Msg, owner string) (target string, fromDNAME bool, ttl
func (c *chainCtx) resolveFinal(ctx context.Context, name string, servers []string) { func (c *chainCtx) resolveFinal(ctx context.Context, name string, servers []string) {
type result struct { type result struct {
addrs []string addrs []string
rcode string rcode string
transient bool
} }
query := func(qtype uint16) result { query := func(qtype uint16) result {
@ -292,6 +306,7 @@ func (c *chainCtx) resolveFinal(ctx context.Context, name string, servers []stri
var res result var res result
if r.Rcode != dns.RcodeSuccess { if r.Rcode != dns.RcodeSuccess {
res.rcode = rcodeText(r.Rcode) res.rcode = rcodeText(r.Rcode)
res.transient = isTransientRcode(r.Rcode)
} }
for _, rr := range r.Answer { for _, rr := range r.Answer {
switch v := rr.(type) { switch v := rr.(type) {
@ -323,8 +338,10 @@ func (c *chainCtx) resolveFinal(ctx context.Context, name string, servers []stri
switch { switch {
case aRes.rcode != "": case aRes.rcode != "":
c.data.FinalRcode = aRes.rcode c.data.FinalRcode = aRes.rcode
c.data.FinalRcodeTransient = aRes.transient
case aaaaRes.rcode != "": case aaaaRes.rcode != "":
c.data.FinalRcode = aaaaRes.rcode c.data.FinalRcode = aaaaRes.rcode
c.data.FinalRcodeTransient = aaaaRes.transient
} }
} }

View file

@ -2,6 +2,7 @@ package checker
import ( import (
"context" "context"
"errors"
"fmt" "fmt"
"net" "net"
"strings" "strings"
@ -61,14 +62,22 @@ func hostPort(host, port string) string {
func findApex(ctx context.Context, fqdn, resolver string) (apex string, servers []string, err error) { func findApex(ctx context.Context, fqdn, resolver string) (apex string, servers []string, err error) {
labels := dns.SplitDomainName(fqdn) labels := dns.SplitDomainName(fqdn)
// transientSeen records whether any candidate failed for a transport or
// SERVFAIL/REFUSED reason, so a fall-through "could not locate apex" caused by
// a flaky recursive resolver is reported as transient rather than definitive.
transientSeen := false
for i := range labels { for i := range labels {
candidate := dns.Fqdn(strings.Join(labels[i:], ".")) candidate := dns.Fqdn(strings.Join(labels[i:], "."))
q := dns.Question{Name: candidate, Qtype: dns.TypeSOA, Qclass: dns.ClassINET} q := dns.Question{Name: candidate, Qtype: dns.TypeSOA, Qclass: dns.ClassINET}
r, rerr := recursiveExchange(ctx, resolver, q) r, rerr := recursiveExchange(ctx, resolver, q)
if rerr != nil { if rerr != nil {
transientSeen = true
continue continue
} }
if r.Rcode != dns.RcodeSuccess { if r.Rcode != dns.RcodeSuccess {
if isTransientRcode(r.Rcode) {
transientSeen = true
}
continue continue
} }
hasSOA := false hasSOA := false
@ -87,14 +96,20 @@ func findApex(ctx context.Context, fqdn, resolver string) (apex string, servers
apex = candidate apex = candidate
servers, err = resolveZoneNSAddrs(ctx, apex) servers, err = resolveZoneNSAddrs(ctx, apex)
if err != nil { if err != nil {
return "", nil, err // A resolver fault (e.g. "server misbehaving") means we could not
// observe the apex's NS, not that the delegation is broken.
return "", nil, transientApexError{err}
} }
if len(servers) == 0 { if len(servers) == 0 {
return "", nil, fmt.Errorf("apex %s has no resolvable NS", apex) return "", nil, fmt.Errorf("apex %s has no resolvable NS", apex)
} }
return apex, servers, nil return apex, servers, nil
} }
return "", nil, fmt.Errorf("could not locate apex of %s", fqdn) err = fmt.Errorf("could not locate apex of %s", fqdn)
if transientSeen {
return "", nil, transientApexError{err}
}
return "", nil, err
} }
func resolveZoneNSAddrs(ctx context.Context, zone string) ([]string, error) { func resolveZoneNSAddrs(ctx context.Context, zone string) ([]string, error) {
@ -128,24 +143,59 @@ func resolveZoneNSAddrs(ctx context.Context, zone string) ([]string, error) {
return out, nil return out, nil
} }
// queryAtAuth tries each server in order and returns the first usable answer. // queryAtAuth tries each server in order and returns the first definitive
// dnssec=true sets the DO bit; only the DNSSEC probes need it. // answer. Transport errors and transient failures (SERVFAIL/REFUSED) make it
// fail over to the next server so a single flaky auth server cannot decide the
// verdict; a definitive response (NOERROR/NXDOMAIN/...) is returned at once.
// If every server fails it returns the last transient response when there was
// one (so callers can still inspect the rcode), otherwise the last transport
// error. dnssec=true sets the DO bit; only the DNSSEC probes need it.
func queryAtAuth(ctx context.Context, proto string, servers []string, q dns.Question, dnssec bool) (*dns.Msg, string, error) { func queryAtAuth(ctx context.Context, proto string, servers []string, q dns.Question, dnssec bool) (*dns.Msg, string, error) {
var lastErr error var lastErr error
var transientMsg *dns.Msg
var transientServer string
for _, s := range servers { for _, s := range servers {
r, err := dnsExchange(ctx, proto, s, q, false, dnssec) r, err := dnsExchange(ctx, proto, s, q, false, dnssec)
if err != nil { if err != nil {
lastErr = err lastErr = err
continue continue
} }
if isTransientRcode(r.Rcode) {
transientMsg, transientServer = r, s
continue
}
return r, s, nil return r, s, nil
} }
if transientMsg != nil {
return transientMsg, transientServer, nil
}
if lastErr == nil { if lastErr == nil {
lastErr = fmt.Errorf("no servers provided") lastErr = fmt.Errorf("no servers provided")
} }
return nil, "", lastErr return nil, "", lastErr
} }
// transientApexError tags an apex-lookup failure as the result of a transport
// or resolver fault instead of definitive DNS evidence, which lets
// apexLookupRule report Unknown rather than flapping the check into Crit.
type transientApexError struct{ err error }

// Error returns the wrapped error's message unchanged.
func (t transientApexError) Error() string {
	return t.err.Error()
}

// Unwrap exposes the wrapped error so errors.Is and errors.As can see through
// the marker.
func (t transientApexError) Unwrap() error {
	return t.err
}

// isTransientApexError reports whether err, or any error in its wrap chain, is
// a transientApexError. A nil err is never transient.
func isTransientApexError(err error) bool {
	return errors.As(err, new(transientApexError))
}
// isTransientRcode reports whether rcode should be retried against another
// auth server instead of being taken as the zone's final answer. SERVFAIL and
// REFUSED are typically per-server faults (backend down, server has not yet
// loaded the zone), whereas NXDOMAIN is an authoritative negative answer.
func isTransientRcode(rcode int) bool {
	switch rcode {
	case dns.RcodeServerFailure, dns.RcodeRefused:
		return true
	default:
		return false
	}
}
func rcodeText(r int) string { func rcodeText(r int) string {
if s, ok := dns.RcodeToString[r]; ok { if s, ok := dns.RcodeToString[r]; ok {
return s return s

109
checker/dns_test.go Normal file
View file

@ -0,0 +1,109 @@
package checker
import (
"context"
"errors"
"fmt"
"net"
"testing"
"github.com/miekg/dns"
)
// TestIsTransientRcode pins the classification of rcodes: SERVFAIL and REFUSED
// are transient, while NOERROR, NXDOMAIN and NOTIMP are not.
func TestIsTransientRcode(t *testing.T) {
	cases := []struct {
		rcode     int
		transient bool
	}{
		{dns.RcodeServerFailure, true},
		{dns.RcodeRefused, true},
		{dns.RcodeSuccess, false},
		{dns.RcodeNameError, false},
		{dns.RcodeNotImplemented, false},
	}
	for _, tc := range cases {
		got := isTransientRcode(tc.rcode)
		switch {
		case tc.transient && !got:
			t.Errorf("rcode %s should be transient", rcodeText(tc.rcode))
		case !tc.transient && got:
			t.Errorf("rcode %s should not be transient", rcodeText(tc.rcode))
		}
	}
}
// TestIsTransientApexError checks that the marker type is recognised both
// directly and through a %w wrap, and that plain and nil errors are not.
func TestIsTransientApexError(t *testing.T) {
	marked := transientApexError{errors.New("server misbehaving")}
	cases := []struct {
		err     error
		want    bool
		failure string
	}{
		{marked, true, "transientApexError should be classified as transient"},
		{fmt.Errorf("wrapped: %w", marked), true, "error wrapping a transientApexError should be transient"},
		{errors.New("could not locate apex of example.com."), false, "plain error should not be classified as transient"},
		{nil, false, "nil error should not be classified as transient"},
	}
	for _, tc := range cases {
		if isTransientApexError(tc.err) != tc.want {
			t.Error(tc.failure)
		}
	}
}
// startTestServer spins up a UDP DNS server on an ephemeral loopback port that
// answers every query with the given handler, returning its address and a
// shutdown func.
//
// The helper blocks until the server reports that it is serving (or fails the
// test if serving could not start). This keeps the returned shutdown func from
// racing with startup: calling Shutdown on a server that has not started yet
// fails and would leave the serving goroutine running. The shutdown func logs
// through t, so call it before the test finishes (e.g. with defer).
func startTestServer(t *testing.T, handler dns.HandlerFunc) (string, func()) {
	t.Helper()
	mux := dns.NewServeMux()
	mux.HandleFunc(".", handler)
	pc, err := net.ListenPacket("udp", "127.0.0.1:0")
	if err != nil {
		t.Fatalf("listen: %v", err)
	}

	started := make(chan struct{})
	// Buffered so the serving goroutine can always deliver its result and exit,
	// even when nobody is left to receive it after a normal shutdown.
	serveErr := make(chan error, 1)
	srv := &dns.Server{
		PacketConn:        pc,
		Handler:           mux,
		NotifyStartedFunc: func() { close(started) },
	}
	go func() { serveErr <- srv.ActivateAndServe() }()

	select {
	case <-started:
	case err := <-serveErr:
		// Serving ended before it ever started: release the socket and fail.
		pc.Close()
		t.Fatalf("serve: %v", err)
	}

	return pc.LocalAddr().String(), func() {
		if err := srv.Shutdown(); err != nil {
			t.Logf("shutdown: %v", err)
		}
	}
}
// answerWith builds a handler that replies to every query with the given
// rcode. When withAnswer is true and the query carries a question, the reply
// also contains a CNAME record owned by the queried name, with TTL 300,
// pointing at target.example.
func answerWith(rcode int, withAnswer bool) dns.HandlerFunc {
	return func(w dns.ResponseWriter, req *dns.Msg) {
		reply := new(dns.Msg)
		reply.SetReply(req)
		reply.Rcode = rcode
		if withAnswer && len(req.Question) > 0 {
			// A record that fails to parse is skipped; the parse error is not reported.
			if cname, _ := dns.NewRR(req.Question[0].Name + " 300 IN CNAME target.example."); cname != nil {
				reply.Answer = append(reply.Answer, cname)
			}
		}
		// The write error is not checked.
		_ = w.WriteMsg(reply)
	}
}
// TestQueryAtAuthFailsOverTransientRcode exercises queryAtAuth against local
// test servers: a SERVFAIL from the first server must not hide a definitive
// answer from the second, and when every server answers with a transient rcode
// that response (not an error) must be returned.
func TestQueryAtAuthFailsOverTransientRcode(t *testing.T) {
	question := dns.Question{Name: "example.com.", Qtype: dns.TypeCNAME, Qclass: dns.ClassINET}

	t.Run("prefers definitive answer over SERVFAIL", func(t *testing.T) {
		failing, stopFailing := startTestServer(t, answerWith(dns.RcodeServerFailure, false))
		defer stopFailing()
		healthy, stopHealthy := startTestServer(t, answerWith(dns.RcodeSuccess, true))
		defer stopHealthy()

		// The failing server is listed first so fail-over is actually needed.
		resp, from, err := queryAtAuth(context.Background(), "", []string{failing, healthy}, question, false)
		switch {
		case err != nil:
			t.Fatalf("unexpected error: %v", err)
		case resp.Rcode != dns.RcodeSuccess:
			t.Fatalf("got rcode %s, want NOERROR", rcodeText(resp.Rcode))
		case from != healthy:
			t.Fatalf("answered by %s, want the healthy server %s", from, healthy)
		}
	})

	t.Run("returns transient response when every server fails", func(t *testing.T) {
		servfail, stopServfail := startTestServer(t, answerWith(dns.RcodeServerFailure, false))
		defer stopServfail()
		refused, stopRefused := startTestServer(t, answerWith(dns.RcodeRefused, false))
		defer stopRefused()

		resp, _, err := queryAtAuth(context.Background(), "", []string{servfail, refused}, question, false)
		switch {
		case err != nil:
			t.Fatalf("unexpected error: %v", err)
		case !isTransientRcode(resp.Rcode):
			t.Fatalf("got rcode %s, want a transient rcode preserved", rcodeText(resp.Rcode))
		}
	})
}

View file

@ -22,11 +22,20 @@ func (apexLookupRule) Evaluate(ctx context.Context, obs sdk.ObservationGetter, _
if data.Apex != "" { if data.Apex != "" {
return okState(data.Apex, fmt.Sprintf("apex %s located", data.Apex)) return okState(data.Apex, fmt.Sprintf("apex %s located", data.Apex))
} }
// A transport/resolver fault means the apex could not be observed, not that the
// delegation is broken. Report it as Unknown so an intermittent recursive-resolver
// glitch does not flap the check into Crit; only definitive evidence drives Crit.
status := sdk.StatusCrit
hint := "Check that the parent delegation exists and that the zone is published."
if data.ApexLookupTransient {
status = sdk.StatusUnknown
hint = "The zone apex could not be observed due to a resolver/transport fault; retry and check recursive-resolver reachability."
}
return []sdk.CheckState{withHint(sdk.CheckState{ return []sdk.CheckState{withHint(sdk.CheckState{
Status: sdk.StatusCrit, Status: status,
Subject: data.Owner, Subject: data.Owner,
Message: fmt.Sprintf("could not locate zone apex: %s", data.ApexLookupError), Message: fmt.Sprintf("could not locate zone apex: %s", data.ApexLookupError),
}, "Check that the parent delegation exists and that the zone is published.")} }, hint)}
} }
type cnameAtApexRule struct{} type cnameAtApexRule struct{}

View file

@ -90,11 +90,22 @@ func (chainQueryErrorRule) Evaluate(ctx context.Context, obs sdk.ObservationGett
if data.ChainTerminated.Reason != TermQueryErr { if data.ChainTerminated.Reason != TermQueryErr {
return okState(data.Owner, "all chain queries succeeded") return okState(data.Owner, "all chain queries succeeded")
} }
// A transport failure (connection refused, timeout, network unreachable) means
// we could not observe the alias, not that it is broken: report it as Unknown so
// an intermittent reachability glitch does not flap the check into Warn/Crit. A
// non-transient failure (e.g. the target has no locatable apex) is definitive
// evidence the alias cannot be followed: report it as Warn.
status, verb := sdk.StatusWarn, "failed"
hint := "Check that the alias target exists and is delegated; the alias is unusable while the query fails."
if data.ChainTerminated.Transient {
status, verb = sdk.StatusUnknown, "could not be completed"
hint = "Check authoritative-server reachability and firewall rules; the alias state could not be determined while queries fail."
}
return []sdk.CheckState{withHint(sdk.CheckState{ return []sdk.CheckState{withHint(sdk.CheckState{
Status: sdk.StatusWarn, Status: status,
Subject: data.ChainTerminated.Subject, Subject: data.ChainTerminated.Subject,
Message: fmt.Sprintf("CNAME query for %s failed: %s", data.ChainTerminated.Subject, data.ChainTerminated.Detail), Message: fmt.Sprintf("CNAME query for %s %s: %s", data.ChainTerminated.Subject, verb, data.ChainTerminated.Detail),
}, "Check authoritative-server reachability and firewall rules; the alias is unusable while queries fail.")} }, hint)}
} }
type chainRcodeRule struct{} type chainRcodeRule struct{}
@ -114,18 +125,36 @@ func (chainRcodeRule) Evaluate(ctx context.Context, obs sdk.ObservationGetter, _
} }
var out []sdk.CheckState var out []sdk.CheckState
if data.ChainTerminated.Reason == TermRcode { if data.ChainTerminated.Reason == TermRcode {
// A transient rcode (SERVFAIL/REFUSED from every auth server) means we could
// not observe the record, not that the zone published a negative answer:
// report it as Unknown so a flaky server does not flap the check into Crit.
// A definitive NXDOMAIN mid-chain is a real break and stays Crit.
status := sdk.StatusCrit
hint := "Ensure the zone publishes the expected record; NXDOMAIN mid-chain breaks the alias."
if data.ChainTerminated.Transient {
status = sdk.StatusUnknown
hint = "Check authoritative-server reachability; SERVFAIL/REFUSED from every server leaves the alias state undetermined."
}
out = append(out, withHint(sdk.CheckState{ out = append(out, withHint(sdk.CheckState{
Status: sdk.StatusCrit, Status: status,
Subject: data.ChainTerminated.Subject, Subject: data.ChainTerminated.Subject,
Message: fmt.Sprintf("server answered %s mid-chain", data.ChainTerminated.Rcode), Message: fmt.Sprintf("server answered %s mid-chain", data.ChainTerminated.Rcode),
}, "Ensure the zone publishes the expected record; NXDOMAIN/SERVFAIL mid-chain breaks the alias.")) }, hint))
} }
if data.FinalRcode != "" && data.FinalRcode != "NOERROR" { if data.FinalRcode != "" && data.FinalRcode != "NOERROR" {
// Same distinction for the final A/AAAA lookup: a SERVFAIL/REFUSED could not
// be observed (Unknown), a definitive rcode is a real publication gap (Warn).
status := sdk.StatusWarn
hint := "Check the upstream zone's A/AAAA publication."
if data.FinalRcodeTransient {
status = sdk.StatusUnknown
hint = "Check the upstream auth servers' reachability; the final A/AAAA state could not be determined."
}
out = append(out, withHint(sdk.CheckState{ out = append(out, withHint(sdk.CheckState{
Status: sdk.StatusWarn, Status: status,
Subject: data.FinalTarget, Subject: data.FinalTarget,
Message: fmt.Sprintf("final A lookup for %s returned %s", data.FinalTarget, data.FinalRcode), Message: fmt.Sprintf("final A lookup for %s returned %s", data.FinalTarget, data.FinalRcode),
}, "Check the upstream zone's A/AAAA publication.")) }, hint))
} }
if len(out) == 0 { if len(out) == 0 {
return okState(data.Owner, "all chain and final lookups returned NOERROR") return okState(data.Owner, "all chain and final lookups returned NOERROR")

View file

@ -81,6 +81,19 @@ func TestApexLookupRule(t *testing.T) {
t.Fatalf("want hint, got none") t.Fatalf("want hint, got none")
} }
}) })
t.Run("transient", func(t *testing.T) {
// A resolver fault (e.g. "server misbehaving") could not observe the apex,
// so it must be Unknown rather than Crit to avoid flapping the check.
data := &AliasData{
Owner: "nemunai.re.",
ApexLookupError: "lookup nemunai.re on 127.0.0.11:53: server misbehaving",
ApexLookupTransient: true,
}
s := assertSingle(t, run(apexLookupRule{}, data, nil), sdk.StatusUnknown)
if s.Meta[hintKey] == nil {
t.Fatalf("want hint, got none")
}
})
} }
func TestChainLoopRule(t *testing.T) { func TestChainLoopRule(t *testing.T) {
@ -122,9 +135,18 @@ func TestChainQueryErrorRule(t *testing.T) {
d.ChainTerminated.Reason = TermOK d.ChainTerminated.Reason = TermOK
assertSingle(t, run(chainQueryErrorRule{}, d, nil), sdk.StatusOK) assertSingle(t, run(chainQueryErrorRule{}, d, nil), sdk.StatusOK)
}) })
t.Run("query err", func(t *testing.T) { t.Run("transient query err", func(t *testing.T) {
// A transport fault (timeout) could not observe the alias, so it must be
// Unknown rather than Warn to avoid flapping the check.
d := apexKnownData() d := apexKnownData()
d.ChainTerminated = ChainTermination{Reason: TermQueryErr, Subject: "broken.example.com.", Detail: "timeout"} d.ChainTerminated = ChainTermination{Reason: TermQueryErr, Subject: "broken.example.com.", Detail: "timeout", Transient: true}
assertSingle(t, run(chainQueryErrorRule{}, d, nil), sdk.StatusUnknown)
})
t.Run("definitive query err", func(t *testing.T) {
// A non-transient failure (target has no locatable apex) is definitive
// evidence the alias cannot be followed: Warn, not Unknown.
d := apexKnownData()
d.ChainTerminated = ChainTermination{Reason: TermQueryErr, Subject: "target.example.", Detail: "re-anchor for target.example. failed: could not locate apex of target.example."}
assertSingle(t, run(chainQueryErrorRule{}, d, nil), sdk.StatusWarn) assertSingle(t, run(chainQueryErrorRule{}, d, nil), sdk.StatusWarn)
}) })
} }
@ -140,14 +162,32 @@ func TestChainRcodeRule(t *testing.T) {
d.ChainTerminated = ChainTermination{Reason: TermRcode, Subject: "gone.example.com.", Rcode: "NXDOMAIN"} d.ChainTerminated = ChainTermination{Reason: TermRcode, Subject: "gone.example.com.", Rcode: "NXDOMAIN"}
assertSingle(t, run(chainRcodeRule{}, d, nil), sdk.StatusCrit) assertSingle(t, run(chainRcodeRule{}, d, nil), sdk.StatusCrit)
}) })
t.Run("final rcode", func(t *testing.T) { t.Run("mid-chain transient SERVFAIL", func(t *testing.T) {
// SERVFAIL from every auth server could not be observed: Unknown, not Crit.
d := apexKnownData()
d.ChainTerminated = ChainTermination{Reason: TermRcode, Subject: "flaky.example.com.", Rcode: "SERVFAIL", Transient: true}
assertSingle(t, run(chainRcodeRule{}, d, nil), sdk.StatusUnknown)
})
t.Run("final definitive rcode", func(t *testing.T) {
d := apexKnownData()
d.ChainTerminated.Reason = TermOK
d.FinalTarget = "target.example."
d.FinalRcode = "NXDOMAIN"
states := run(chainRcodeRule{}, d, nil)
if len(states) != 1 || states[0].Status != sdk.StatusWarn {
t.Fatalf("want single WARN, got %+v", states)
}
})
t.Run("final transient rcode", func(t *testing.T) {
// SERVFAIL on the final lookup could not be observed: Unknown, not Warn.
d := apexKnownData() d := apexKnownData()
d.ChainTerminated.Reason = TermOK d.ChainTerminated.Reason = TermOK
d.FinalTarget = "target.example." d.FinalTarget = "target.example."
d.FinalRcode = "SERVFAIL" d.FinalRcode = "SERVFAIL"
d.FinalRcodeTransient = true
states := run(chainRcodeRule{}, d, nil) states := run(chainRcodeRule{}, d, nil)
if len(states) != 1 || states[0].Status != sdk.StatusWarn { if len(states) != 1 || states[0].Status != sdk.StatusUnknown {
t.Fatalf("want single WARN, got %+v", states) t.Fatalf("want single UNKNOWN, got %+v", states)
} }
}) })
} }

View file

@ -47,16 +47,24 @@ type ChainTermination struct {
Subject string `json:"subject,omitempty"` Subject string `json:"subject,omitempty"`
Detail string `json:"detail,omitempty"` Detail string `json:"detail,omitempty"`
Rcode string `json:"rcode,omitempty"` // only with TermRcode Rcode string `json:"rcode,omitempty"` // only with TermRcode
// Transient is meaningful with TermQueryErr and TermRcode: true when the failure
// could not be observed as a definitive answer (a transport/resolver fault, or a
// SERVFAIL/REFUSED from every auth server), false when it stems from definitive
// evidence such as a target with no locatable apex or an authoritative NXDOMAIN.
Transient bool `json:"transient,omitempty"`
} }
// AliasData carries raw facts only; judgement is delegated to the rules. // AliasData carries raw facts only; judgement is delegated to the rules.
type AliasData struct { type AliasData struct {
Owner string `json:"owner"` Owner string `json:"owner"`
// Apex is empty iff the apex lookup failed; ApexLookupError explains why. // Apex is empty iff the apex lookup failed; ApexLookupError explains why and
Apex string `json:"apex,omitempty"` // ApexLookupTransient is true when the failure was a transport/resolver fault
ApexLookupError string `json:"apex_lookup_error,omitempty"` // (could not observe) rather than definitive evidence the apex is missing.
AuthServers []string `json:"auth_servers,omitempty"` Apex string `json:"apex,omitempty"`
ApexLookupError string `json:"apex_lookup_error,omitempty"`
ApexLookupTransient bool `json:"apex_lookup_transient,omitempty"`
AuthServers []string `json:"auth_servers,omitempty"`
Chain []ChainHop `json:"chain,omitempty"` Chain []ChainHop `json:"chain,omitempty"`
ChainTerminated ChainTermination `json:"chain_terminated"` ChainTerminated ChainTermination `json:"chain_terminated"`
@ -66,7 +74,11 @@ type AliasData struct {
FinalTarget string `json:"final_target,omitempty"` FinalTarget string `json:"final_target,omitempty"`
FinalA []string `json:"final_a,omitempty"` FinalA []string `json:"final_a,omitempty"`
FinalAAAA []string `json:"final_aaaa,omitempty"` FinalAAAA []string `json:"final_aaaa,omitempty"`
FinalRcode string `json:"final_rcode,omitempty"` // FinalRcode is the non-NOERROR rcode of the final A/AAAA lookup, if any;
// FinalRcodeTransient is true when it was a SERVFAIL/REFUSED (could not observe)
// rather than a definitive negative answer.
FinalRcode string `json:"final_rcode,omitempty"`
FinalRcodeTransient bool `json:"final_rcode_transient,omitempty"`
// Coexisting is populated only when Owner has a CNAME. // Coexisting is populated only when Owner has a CNAME.
Coexisting []CoexistingRRset `json:"coexisting,omitempty"` Coexisting []CoexistingRRset `json:"coexisting,omitempty"`