checker: report transient apex-lookup failures as Unknown, not Crit
apexLookupRule mapped every findApex failure to Crit, including transport and resolver faults such as "lookup nemunai.re on 127.0.0.11:53: server misbehaving", which indicates a flaky recursive resolver rather than a broken delegation. As a result, the check flapped into Crit whenever the resolver hiccuped — the same class of false alarm that was already fixed for the chain path. Apex-lookup failures that stem from a transport/resolver fault (resolveZoneNSAddrs net errors, recursiveExchange transport errors, and SERVFAIL/REFUSED seen during the SOA walk) are now marked as transient via a typed error and surfaced as ApexLookupTransient, and apexLookupRule reports Unknown for them. Definitive failures (NXDOMAIN-only walk, no resolvable NS) still drive Crit.
This commit is contained in:
parent
af0dceca6c
commit
da6def100c
7 changed files with 123 additions and 23 deletions
|
|
@ -81,6 +81,19 @@ func TestApexLookupRule(t *testing.T) {
|
|||
t.Fatalf("want hint, got none")
|
||||
}
|
||||
})
|
||||
t.Run("transient", func(t *testing.T) {
|
||||
// A resolver fault (e.g. "server misbehaving") could not observe the apex,
|
||||
// so it must be Unknown rather than Crit to avoid flapping the check.
|
||||
data := &AliasData{
|
||||
Owner: "nemunai.re.",
|
||||
ApexLookupError: "lookup nemunai.re on 127.0.0.11:53: server misbehaving",
|
||||
ApexLookupTransient: true,
|
||||
}
|
||||
s := assertSingle(t, run(apexLookupRule{}, data, nil), sdk.StatusUnknown)
|
||||
if s.Meta[hintKey] == nil {
|
||||
t.Fatalf("want hint, got none")
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
func TestChainLoopRule(t *testing.T) {
|
||||
|
|
@ -122,11 +135,20 @@ func TestChainQueryErrorRule(t *testing.T) {
|
|||
d.ChainTerminated.Reason = TermOK
|
||||
assertSingle(t, run(chainQueryErrorRule{}, d, nil), sdk.StatusOK)
|
||||
})
|
||||
t.Run("query err", func(t *testing.T) {
|
||||
t.Run("transient query err", func(t *testing.T) {
|
||||
// A transport fault (timeout) could not observe the alias, so it must be
|
||||
// Unknown rather than Warn to avoid flapping the check.
|
||||
d := apexKnownData()
|
||||
d.ChainTerminated = ChainTermination{Reason: TermQueryErr, Subject: "broken.example.com.", Detail: "timeout"}
|
||||
d.ChainTerminated = ChainTermination{Reason: TermQueryErr, Subject: "broken.example.com.", Detail: "timeout", Transient: true}
|
||||
assertSingle(t, run(chainQueryErrorRule{}, d, nil), sdk.StatusUnknown)
|
||||
})
|
||||
t.Run("definitive query err", func(t *testing.T) {
|
||||
// A non-transient failure (target has no locatable apex) is definitive
|
||||
// evidence the alias cannot be followed: Warn, not Unknown.
|
||||
d := apexKnownData()
|
||||
d.ChainTerminated = ChainTermination{Reason: TermQueryErr, Subject: "target.example.", Detail: "re-anchor for target.example. failed: could not locate apex of target.example."}
|
||||
assertSingle(t, run(chainQueryErrorRule{}, d, nil), sdk.StatusWarn)
|
||||
})
|
||||
}
|
||||
|
||||
func TestChainRcodeRule(t *testing.T) {
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue