checker: report transient apex-lookup failures as Unknown, not Crit
apexLookupRule mapped every findApex failure to Crit, including transport and resolver faults like "lookup nemunai.re on 127.0.0.11:53: server misbehaving" — a flaky recursive resolver, not a broken delegation. That made the check flap into Crit whenever the resolver hiccuped, the same class of false positive the chain path already fixed. Mark apex-lookup failures that stem from a transport/resolver fault (resolveZoneNSAddrs net errors, recursiveExchange transport errors, and SERVFAIL/REFUSED seen during the SOA walk) as transient via a typed error, surface it as ApexLookupTransient, and have apexLookupRule report Unknown for those. Definitive failures (NXDOMAIN-only walk, no resolvable NS) still drive Crit.
This commit is contained in:
parent
af0dceca6c
commit
da6def100c
7 changed files with 123 additions and 23 deletions
|
|
@@ -90,15 +90,22 @@ func (chainQueryErrorRule) Evaluate(ctx context.Context, obs sdk.ObservationGett
|
|||
if data.ChainTerminated.Reason != TermQueryErr {
|
||||
return okState(data.Owner, "all chain queries succeeded")
|
||||
}
|
||||
// A transport failure (connection refused, timeout, network unreachable)
|
||||
// means we could not observe the alias, not that the alias is broken. Report
|
||||
// it as Unknown so an intermittent reachability glitch does not flap the
|
||||
// check into Warn/Crit; only definitive DNS evidence drives those statuses.
|
||||
// A transport failure (connection refused, timeout, network unreachable) means
|
||||
// we could not observe the alias, not that it is broken: report it as Unknown so
|
||||
// an intermittent reachability glitch does not flap the check into Warn/Crit. A
|
||||
// non-transient failure (e.g. the target has no locatable apex) is definitive
|
||||
// evidence the alias cannot be followed: report it as Warn.
|
||||
status, verb := sdk.StatusWarn, "failed"
|
||||
hint := "Check that the alias target exists and is delegated; the alias is unusable while the query fails."
|
||||
if data.ChainTerminated.Transient {
|
||||
status, verb = sdk.StatusUnknown, "could not be completed"
|
||||
hint = "Check authoritative-server reachability and firewall rules; the alias state could not be determined while queries fail."
|
||||
}
|
||||
return []sdk.CheckState{withHint(sdk.CheckState{
|
||||
Status: sdk.StatusUnknown,
|
||||
Status: status,
|
||||
Subject: data.ChainTerminated.Subject,
|
||||
Message: fmt.Sprintf("CNAME query for %s could not be completed: %s", data.ChainTerminated.Subject, data.ChainTerminated.Detail),
|
||||
}, "Check authoritative-server reachability and firewall rules; the alias state could not be determined while queries fail.")}
|
||||
Message: fmt.Sprintf("CNAME query for %s %s: %s", data.ChainTerminated.Subject, verb, data.ChainTerminated.Detail),
|
||||
}, hint)}
|
||||
}
|
||||
|
||||
type chainRcodeRule struct{}
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue