checker: report transient mid-chain and final rcodes as Unknown, not Crit/Warn
SERVFAIL/REFUSED from every auth server means the record could not be observed, not that the zone published a negative answer. Mark such rcodes transient on TermRcode terminations and final A/AAAA lookups so chainRcodeRule reports Unknown instead of flapping the check into Crit/Warn; definitive NXDOMAIN answers still drive Crit (mid-chain) and Warn (final).
This commit is contained in:
parent
da6def100c
commit
65687ce375
4 changed files with 65 additions and 16 deletions
|
|
@ -137,11 +137,15 @@ func (c *chainCtx) walk(ctx context.Context, name string) {
|
||||||
|
|
||||||
if r.Rcode != dns.RcodeSuccess {
|
if r.Rcode != dns.RcodeSuccess {
|
||||||
rcode := rcodeText(r.Rcode)
|
rcode := rcodeText(r.Rcode)
|
||||||
|
// A SERVFAIL/REFUSED from every auth server means we could not observe
|
||||||
|
// the record, not that the zone published a negative answer; mark it
|
||||||
|
// transient so the rule reports Unknown instead of Crit.
|
||||||
c.data.ChainTerminated = ChainTermination{
|
c.data.ChainTerminated = ChainTermination{
|
||||||
Reason: TermRcode,
|
Reason: TermRcode,
|
||||||
Subject: current,
|
Subject: current,
|
||||||
Rcode: rcode,
|
Rcode: rcode,
|
||||||
Detail: fmt.Sprintf("server answered %s for %s", rcode, current),
|
Detail: fmt.Sprintf("server answered %s for %s", rcode, current),
|
||||||
|
Transient: isTransientRcode(r.Rcode),
|
||||||
}
|
}
|
||||||
c.data.FinalTarget = current
|
c.data.FinalTarget = current
|
||||||
return
|
return
|
||||||
|
|
@ -282,6 +286,7 @@ func (c *chainCtx) resolveFinal(ctx context.Context, name string, servers []stri
|
||||||
type result struct {
|
type result struct {
|
||||||
addrs []string
|
addrs []string
|
||||||
rcode string
|
rcode string
|
||||||
|
transient bool
|
||||||
}
|
}
|
||||||
|
|
||||||
query := func(qtype uint16) result {
|
query := func(qtype uint16) result {
|
||||||
|
|
@ -301,6 +306,7 @@ func (c *chainCtx) resolveFinal(ctx context.Context, name string, servers []stri
|
||||||
var res result
|
var res result
|
||||||
if r.Rcode != dns.RcodeSuccess {
|
if r.Rcode != dns.RcodeSuccess {
|
||||||
res.rcode = rcodeText(r.Rcode)
|
res.rcode = rcodeText(r.Rcode)
|
||||||
|
res.transient = isTransientRcode(r.Rcode)
|
||||||
}
|
}
|
||||||
for _, rr := range r.Answer {
|
for _, rr := range r.Answer {
|
||||||
switch v := rr.(type) {
|
switch v := rr.(type) {
|
||||||
|
|
@ -332,8 +338,10 @@ func (c *chainCtx) resolveFinal(ctx context.Context, name string, servers []stri
|
||||||
switch {
|
switch {
|
||||||
case aRes.rcode != "":
|
case aRes.rcode != "":
|
||||||
c.data.FinalRcode = aRes.rcode
|
c.data.FinalRcode = aRes.rcode
|
||||||
|
c.data.FinalRcodeTransient = aRes.transient
|
||||||
case aaaaRes.rcode != "":
|
case aaaaRes.rcode != "":
|
||||||
c.data.FinalRcode = aaaaRes.rcode
|
c.data.FinalRcode = aaaaRes.rcode
|
||||||
|
c.data.FinalRcodeTransient = aaaaRes.transient
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -125,18 +125,36 @@ func (chainRcodeRule) Evaluate(ctx context.Context, obs sdk.ObservationGetter, _
|
||||||
}
|
}
|
||||||
var out []sdk.CheckState
|
var out []sdk.CheckState
|
||||||
if data.ChainTerminated.Reason == TermRcode {
|
if data.ChainTerminated.Reason == TermRcode {
|
||||||
|
// A transient rcode (SERVFAIL/REFUSED from every auth server) means we could
|
||||||
|
// not observe the record, not that the zone published a negative answer:
|
||||||
|
// report it as Unknown so a flaky server does not flap the check into Crit.
|
||||||
|
// A definitive NXDOMAIN mid-chain is a real break and stays Crit.
|
||||||
|
status := sdk.StatusCrit
|
||||||
|
hint := "Ensure the zone publishes the expected record; NXDOMAIN mid-chain breaks the alias."
|
||||||
|
if data.ChainTerminated.Transient {
|
||||||
|
status = sdk.StatusUnknown
|
||||||
|
hint = "Check authoritative-server reachability; SERVFAIL/REFUSED from every server leaves the alias state undetermined."
|
||||||
|
}
|
||||||
out = append(out, withHint(sdk.CheckState{
|
out = append(out, withHint(sdk.CheckState{
|
||||||
Status: sdk.StatusCrit,
|
Status: status,
|
||||||
Subject: data.ChainTerminated.Subject,
|
Subject: data.ChainTerminated.Subject,
|
||||||
Message: fmt.Sprintf("server answered %s mid-chain", data.ChainTerminated.Rcode),
|
Message: fmt.Sprintf("server answered %s mid-chain", data.ChainTerminated.Rcode),
|
||||||
}, "Ensure the zone publishes the expected record; NXDOMAIN/SERVFAIL mid-chain breaks the alias."))
|
}, hint))
|
||||||
}
|
}
|
||||||
if data.FinalRcode != "" && data.FinalRcode != "NOERROR" {
|
if data.FinalRcode != "" && data.FinalRcode != "NOERROR" {
|
||||||
|
// Same distinction for the final A/AAAA lookup: a SERVFAIL/REFUSED could not
|
||||||
|
// be observed (Unknown), a definitive rcode is a real publication gap (Warn).
|
||||||
|
status := sdk.StatusWarn
|
||||||
|
hint := "Check the upstream zone's A/AAAA publication."
|
||||||
|
if data.FinalRcodeTransient {
|
||||||
|
status = sdk.StatusUnknown
|
||||||
|
hint = "Check the upstream auth servers' reachability; the final A/AAAA state could not be determined."
|
||||||
|
}
|
||||||
out = append(out, withHint(sdk.CheckState{
|
out = append(out, withHint(sdk.CheckState{
|
||||||
Status: sdk.StatusWarn,
|
Status: status,
|
||||||
Subject: data.FinalTarget,
|
Subject: data.FinalTarget,
|
||||||
Message: fmt.Sprintf("final A lookup for %s returned %s", data.FinalTarget, data.FinalRcode),
|
Message: fmt.Sprintf("final A lookup for %s returned %s", data.FinalTarget, data.FinalRcode),
|
||||||
}, "Check the upstream zone's A/AAAA publication."))
|
}, hint))
|
||||||
}
|
}
|
||||||
if len(out) == 0 {
|
if len(out) == 0 {
|
||||||
return okState(data.Owner, "all chain and final lookups returned NOERROR")
|
return okState(data.Owner, "all chain and final lookups returned NOERROR")
|
||||||
|
|
|
||||||
|
|
@ -162,14 +162,32 @@ func TestChainRcodeRule(t *testing.T) {
|
||||||
d.ChainTerminated = ChainTermination{Reason: TermRcode, Subject: "gone.example.com.", Rcode: "NXDOMAIN"}
|
d.ChainTerminated = ChainTermination{Reason: TermRcode, Subject: "gone.example.com.", Rcode: "NXDOMAIN"}
|
||||||
assertSingle(t, run(chainRcodeRule{}, d, nil), sdk.StatusCrit)
|
assertSingle(t, run(chainRcodeRule{}, d, nil), sdk.StatusCrit)
|
||||||
})
|
})
|
||||||
t.Run("final rcode", func(t *testing.T) {
|
t.Run("mid-chain transient SERVFAIL", func(t *testing.T) {
|
||||||
|
// SERVFAIL from every auth server could not be observed: Unknown, not Crit.
|
||||||
|
d := apexKnownData()
|
||||||
|
d.ChainTerminated = ChainTermination{Reason: TermRcode, Subject: "flaky.example.com.", Rcode: "SERVFAIL", Transient: true}
|
||||||
|
assertSingle(t, run(chainRcodeRule{}, d, nil), sdk.StatusUnknown)
|
||||||
|
})
|
||||||
|
t.Run("final definitive rcode", func(t *testing.T) {
|
||||||
|
d := apexKnownData()
|
||||||
|
d.ChainTerminated.Reason = TermOK
|
||||||
|
d.FinalTarget = "target.example."
|
||||||
|
d.FinalRcode = "NXDOMAIN"
|
||||||
|
states := run(chainRcodeRule{}, d, nil)
|
||||||
|
if len(states) != 1 || states[0].Status != sdk.StatusWarn {
|
||||||
|
t.Fatalf("want single WARN, got %+v", states)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
t.Run("final transient rcode", func(t *testing.T) {
|
||||||
|
// SERVFAIL on the final lookup could not be observed: Unknown, not Warn.
|
||||||
d := apexKnownData()
|
d := apexKnownData()
|
||||||
d.ChainTerminated.Reason = TermOK
|
d.ChainTerminated.Reason = TermOK
|
||||||
d.FinalTarget = "target.example."
|
d.FinalTarget = "target.example."
|
||||||
d.FinalRcode = "SERVFAIL"
|
d.FinalRcode = "SERVFAIL"
|
||||||
|
d.FinalRcodeTransient = true
|
||||||
states := run(chainRcodeRule{}, d, nil)
|
states := run(chainRcodeRule{}, d, nil)
|
||||||
if len(states) != 1 || states[0].Status != sdk.StatusWarn {
|
if len(states) != 1 || states[0].Status != sdk.StatusUnknown {
|
||||||
t.Fatalf("want single WARN, got %+v", states)
|
t.Fatalf("want single UNKNOWN, got %+v", states)
|
||||||
}
|
}
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -47,9 +47,10 @@ type ChainTermination struct {
|
||||||
Subject string `json:"subject,omitempty"`
|
Subject string `json:"subject,omitempty"`
|
||||||
Detail string `json:"detail,omitempty"`
|
Detail string `json:"detail,omitempty"`
|
||||||
Rcode string `json:"rcode,omitempty"` // only with TermRcode
|
Rcode string `json:"rcode,omitempty"` // only with TermRcode
|
||||||
// Transient is meaningful with TermQueryErr: true when the query could not be
|
// Transient is meaningful with TermQueryErr and TermRcode: true when the failure
|
||||||
// completed because of a transport/resolver fault (could not observe), false
|
// could not be observed as a definitive answer (a transport/resolver fault, or a
|
||||||
// when it stems from definitive evidence such as a target with no locatable apex.
|
// SERVFAIL/REFUSED from every auth server), false when it stems from definitive
|
||||||
|
// evidence such as a target with no locatable apex or an authoritative NXDOMAIN.
|
||||||
Transient bool `json:"transient,omitempty"`
|
Transient bool `json:"transient,omitempty"`
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -73,7 +74,11 @@ type AliasData struct {
|
||||||
FinalTarget string `json:"final_target,omitempty"`
|
FinalTarget string `json:"final_target,omitempty"`
|
||||||
FinalA []string `json:"final_a,omitempty"`
|
FinalA []string `json:"final_a,omitempty"`
|
||||||
FinalAAAA []string `json:"final_aaaa,omitempty"`
|
FinalAAAA []string `json:"final_aaaa,omitempty"`
|
||||||
|
// FinalRcode is the non-NOERROR rcode of the final A/AAAA lookup, if any;
|
||||||
|
// FinalRcodeTransient is true when it was a SERVFAIL/REFUSED (could not observe)
|
||||||
|
// rather than a definitive negative answer.
|
||||||
FinalRcode string `json:"final_rcode,omitempty"`
|
FinalRcode string `json:"final_rcode,omitempty"`
|
||||||
|
FinalRcodeTransient bool `json:"final_rcode_transient,omitempty"`
|
||||||
|
|
||||||
// Coexisting is populated only when Owner has a CNAME.
|
// Coexisting is populated only when Owner has a CNAME.
|
||||||
Coexisting []CoexistingRRset `json:"coexisting,omitempty"`
|
Coexisting []CoexistingRRset `json:"coexisting,omitempty"`
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue