checker: report transient apex-lookup failures as Unknown, not Crit
All checks were successful
continuous-integration/drone/push Build is passing
continuous-integration/drone/tag Build is passing

apexLookupRule mapped every findApex failure to Crit, including transport
and resolver faults like "lookup nemunai.re on 127.0.0.11:53: server
misbehaving" — a flaky recursive resolver, not a broken delegation. That
made the check flap into Crit whenever the resolver hiccuped — the same
class of false positive that was already fixed for the chain path.

Mark apex-lookup failures that stem from a transport/resolver fault
(resolveZoneNSAddrs net errors, recursiveExchange transport errors, and
SERVFAIL/REFUSED seen during the SOA walk) as transient via a typed
error, surface it as ApexLookupTransient, and have apexLookupRule report
Unknown for those. Definitive failures (NXDOMAIN-only walk, no resolvable
NS) still drive Crit.
This commit is contained in:
nemunaire 2026-06-18 10:05:51 +09:00
commit da6def100c
7 changed files with 123 additions and 23 deletions

View file

@@ -81,6 +81,19 @@ func TestApexLookupRule(t *testing.T) {
t.Fatalf("want hint, got none")
}
})
t.Run("transient", func(t *testing.T) {
// A resolver fault (e.g. "server misbehaving") could not observe the apex,
// so it must be Unknown rather than Crit to avoid flapping the check.
data := &AliasData{
Owner: "nemunai.re.",
ApexLookupError: "lookup nemunai.re on 127.0.0.11:53: server misbehaving",
ApexLookupTransient: true,
}
s := assertSingle(t, run(apexLookupRule{}, data, nil), sdk.StatusUnknown)
if s.Meta[hintKey] == nil {
t.Fatalf("want hint, got none")
}
})
}
func TestChainLoopRule(t *testing.T) {
@@ -122,11 +135,20 @@ func TestChainQueryErrorRule(t *testing.T) {
d.ChainTerminated.Reason = TermOK
assertSingle(t, run(chainQueryErrorRule{}, d, nil), sdk.StatusOK)
})
t.Run("query err", func(t *testing.T) {
t.Run("transient query err", func(t *testing.T) {
// A transport fault (timeout) could not observe the alias, so it must be
// Unknown rather than Warn to avoid flapping the check.
d := apexKnownData()
d.ChainTerminated = ChainTermination{Reason: TermQueryErr, Subject: "broken.example.com.", Detail: "timeout"}
d.ChainTerminated = ChainTermination{Reason: TermQueryErr, Subject: "broken.example.com.", Detail: "timeout", Transient: true}
assertSingle(t, run(chainQueryErrorRule{}, d, nil), sdk.StatusUnknown)
})
t.Run("definitive query err", func(t *testing.T) {
// A non-transient failure (target has no locatable apex) is definitive
// evidence the alias cannot be followed: Warn, not Unknown.
d := apexKnownData()
d.ChainTerminated = ChainTermination{Reason: TermQueryErr, Subject: "target.example.", Detail: "re-anchor for target.example. failed: could not locate apex of target.example."}
assertSingle(t, run(chainQueryErrorRule{}, d, nil), sdk.StatusWarn)
})
}
func TestChainRcodeRule(t *testing.T) {