checker: report transient apex-lookup failures as Unknown, not Crit
All checks were successful
continuous-integration/drone/push Build is passing
continuous-integration/drone/tag Build is passing

apexLookupRule mapped every findApex failure to Crit, including transport
and resolver faults like "lookup nemunai.re on 127.0.0.11:53: server
misbehaving" — a flaky recursive resolver, not a broken delegation. That
made the check flap into Crit whenever the resolver hiccuped, the same
class of false alarm (Crit raised although nothing is actually broken)
that was already fixed for the chain path.

Mark apex-lookup failures that stem from a transport/resolver fault
(resolveZoneNSAddrs net errors, recursiveExchange transport errors, and
SERVFAIL/REFUSED seen during the SOA walk) as transient via a typed
error, surface it as ApexLookupTransient, and have apexLookupRule report
Unknown for those. Definitive failures (NXDOMAIN-only walk, no resolvable
NS) still drive Crit.
This commit is contained in:
nemunaire 2026-06-18 10:05:51 +09:00
commit da6def100c
7 changed files with 123 additions and 23 deletions

View file

@ -26,6 +26,7 @@ func (p *aliasProvider) Collect(ctx context.Context, opts sdk.CheckerOptions) (a
apex, servers, err := findApex(ctx, owner, resolver)
if err != nil {
data.ApexLookupError = err.Error()
data.ApexLookupTransient = isTransientApexError(err)
return data, nil
}
data.Apex = apex
@ -122,10 +123,13 @@ func (c *chainCtx) walk(ctx context.Context, name string) {
q := dns.Question{Name: current, Qtype: dns.TypeCNAME, Qclass: dns.ClassINET}
r, server, err := c.queryFor(ctx, currentServers, q)
if err != nil {
// A query that never produced a response is a transport/resolver
// fault: we could not observe the alias, so report it as transient.
c.data.ChainTerminated = ChainTermination{
Reason: TermQueryErr,
Subject: current,
Detail: err.Error(),
Reason: TermQueryErr,
Subject: current,
Detail: err.Error(),
Transient: true,
}
c.data.FinalTarget = current
return
@ -189,10 +193,15 @@ func (c *chainCtx) walk(ctx context.Context, name string) {
// answered by the parent's auth set.
zone, ns, zerr := c.reanchor(ctx, target)
if zerr != nil {
// Re-anchoring fails either because the target genuinely has no
// locatable apex (definitive: the alias points into the void) or
// because a resolver/transport fault prevented observing it. Only the
// latter is transient; classify so the rule does not mask a real break.
c.data.ChainTerminated = ChainTermination{
Reason: TermQueryErr,
Subject: target,
Detail: fmt.Sprintf("re-anchor for %s failed: %v", target, zerr),
Reason: TermQueryErr,
Subject: target,
Detail: fmt.Sprintf("re-anchor for %s failed: %v", target, zerr),
Transient: isTransientApexError(zerr),
}
c.data.FinalTarget = target
return