checker: report transient apex-lookup failures as Unknown, not Crit
apexLookupRule mapped every findApex failure to Crit, including transport and resolver faults like "lookup nemunai.re on 127.0.0.11:53: server misbehaving" — a flaky recursive resolver, not a broken delegation. That made the check flap into Crit whenever the resolver hiccuped, the same class of false alarm the chain path already fixed. Mark apex-lookup failures that stem from a transport/resolver fault (resolveZoneNSAddrs net errors, recursiveExchange transport errors, and SERVFAIL/REFUSED seen during the SOA walk) as transient via a typed error, surface it as ApexLookupTransient, and have apexLookupRule report Unknown for those. Definitive failures (NXDOMAIN-only walk, no resolvable NS) still drive Crit.
This commit is contained in:
parent
af0dceca6c
commit
da6def100c
7 changed files with 123 additions and 23 deletions
|
|
@@ -22,11 +22,20 @@ func (apexLookupRule) Evaluate(ctx context.Context, obs sdk.ObservationGetter, _
|
|||
if data.Apex != "" {
|
||||
return okState(data.Apex, fmt.Sprintf("apex %s located", data.Apex))
|
||||
}
|
||||
// A transport/resolver fault means the apex could not be observed, not that the
|
||||
// delegation is broken. Report it as Unknown so an intermittent recursive-resolver
|
||||
// glitch does not flap the check into Crit; only definitive evidence drives Crit.
|
||||
status := sdk.StatusCrit
|
||||
hint := "Check that the parent delegation exists and that the zone is published."
|
||||
if data.ApexLookupTransient {
|
||||
status = sdk.StatusUnknown
|
||||
hint = "The zone apex could not be observed due to a resolver/transport fault; retry and check recursive-resolver reachability."
|
||||
}
|
||||
return []sdk.CheckState{withHint(sdk.CheckState{
|
||||
Status: sdk.StatusCrit,
|
||||
Status: status,
|
||||
Subject: data.Owner,
|
||||
Message: fmt.Sprintf("could not locate zone apex: %s", data.ApexLookupError),
|
||||
}, "Check that the parent delegation exists and that the zone is published.")}
|
||||
}, hint)}
|
||||
}
|
||||
|
||||
type cnameAtApexRule struct{}
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue