checker: report transient apex-lookup failures as Unknown, not Crit
apexLookupRule mapped every findApex failure to Crit, including transport and resolver faults like "lookup nemunai.re on 127.0.0.11:53: server misbehaving" — a flaky recursive resolver, not a broken delegation. That made the check flap into Crit whenever the resolver hiccuped, the same class of false positive that was already fixed in the chain path. Mark apex-lookup failures that stem from a transport/resolver fault (resolveZoneNSAddrs net errors, recursiveExchange transport errors, and SERVFAIL/REFUSED seen during the SOA walk) as transient via a typed error, surface it as ApexLookupTransient, and have apexLookupRule report Unknown for those. Definitive failures (NXDOMAIN-only walk, no resolvable NS) still drive Crit.
This commit is contained in:
parent
af0dceca6c
commit
da6def100c
7 changed files with 123 additions and 23 deletions
|
|
@@ -26,6 +26,7 @@ func (p *aliasProvider) Collect(ctx context.Context, opts sdk.CheckerOptions) (a
|
|||
apex, servers, err := findApex(ctx, owner, resolver)
|
||||
if err != nil {
|
||||
data.ApexLookupError = err.Error()
|
||||
data.ApexLookupTransient = isTransientApexError(err)
|
||||
return data, nil
|
||||
}
|
||||
data.Apex = apex
|
||||
|
|
@@ -122,10 +123,13 @@ func (c *chainCtx) walk(ctx context.Context, name string) {
|
|||
q := dns.Question{Name: current, Qtype: dns.TypeCNAME, Qclass: dns.ClassINET}
|
||||
r, server, err := c.queryFor(ctx, currentServers, q)
|
||||
if err != nil {
|
||||
// A query that never produced a response is a transport/resolver
|
||||
// fault: we could not observe the alias, so report it as transient.
|
||||
c.data.ChainTerminated = ChainTermination{
|
||||
Reason: TermQueryErr,
|
||||
Subject: current,
|
||||
Detail: err.Error(),
|
||||
Reason: TermQueryErr,
|
||||
Subject: current,
|
||||
Detail: err.Error(),
|
||||
Transient: true,
|
||||
}
|
||||
c.data.FinalTarget = current
|
||||
return
|
||||
|
|
@@ -189,10 +193,15 @@ func (c *chainCtx) walk(ctx context.Context, name string) {
|
|||
// answered by the parent's auth set.
|
||||
zone, ns, zerr := c.reanchor(ctx, target)
|
||||
if zerr != nil {
|
||||
// Re-anchoring fails either because the target genuinely has no
|
||||
// locatable apex (definitive: the alias points into the void) or
|
||||
// because a resolver/transport fault prevented observing it. Only the
|
||||
// latter is transient; classify so the rule does not mask a real break.
|
||||
c.data.ChainTerminated = ChainTermination{
|
||||
Reason: TermQueryErr,
|
||||
Subject: target,
|
||||
Detail: fmt.Sprintf("re-anchor for %s failed: %v", target, zerr),
|
||||
Reason: TermQueryErr,
|
||||
Subject: target,
|
||||
Detail: fmt.Sprintf("re-anchor for %s failed: %v", target, zerr),
|
||||
Transient: isTransientApexError(zerr),
|
||||
}
|
||||
c.data.FinalTarget = target
|
||||
return
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue