checker: report chain transport errors as Unknown, not Warn
A transport-level query failure (connection refused, timeout, network unreachable) means the alias state could not be observed, not that the alias is misconfigured. Mapping it to Warn made the check flap whenever a flaky authoritative server alternated between refusing connections (Warn) and answering SERVFAIL (Crit). Report TermQueryErr as Unknown so only definitive DNS evidence drives Warn/Crit.
This commit is contained in:
parent
0becf6bc8c
commit
680a7735f0
2 changed files with 8 additions and 4 deletions
|
|
@@ -90,11 +90,15 @@ func (chainQueryErrorRule) Evaluate(ctx context.Context, obs sdk.ObservationGett
|
|||
if data.ChainTerminated.Reason != TermQueryErr {
|
||||
return okState(data.Owner, "all chain queries succeeded")
|
||||
}
|
||||
// A transport failure (connection refused, timeout, network unreachable)
|
||||
// means we could not observe the alias, not that the alias is broken. Report
|
||||
// it as Unknown so an intermittent reachability glitch does not flap the
|
||||
// check into Warn/Crit; only definitive DNS evidence drives those statuses.
|
||||
return []sdk.CheckState{withHint(sdk.CheckState{
|
||||
Status: sdk.StatusWarn,
|
||||
Status: sdk.StatusUnknown,
|
||||
Subject: data.ChainTerminated.Subject,
|
||||
Message: fmt.Sprintf("CNAME query for %s failed: %s", data.ChainTerminated.Subject, data.ChainTerminated.Detail),
|
||||
}, "Check authoritative-server reachability and firewall rules; the alias is unusable while queries fail.")}
|
||||
Message: fmt.Sprintf("CNAME query for %s could not be completed: %s", data.ChainTerminated.Subject, data.ChainTerminated.Detail),
|
||||
}, "Check authoritative-server reachability and firewall rules; the alias state could not be determined while queries fail.")}
|
||||
}
|
||||
|
||||
type chainRcodeRule struct{}
|
||||
|
|
|
|||
|
|
@@ -125,7 +125,7 @@ func TestChainQueryErrorRule(t *testing.T) {
|
|||
t.Run("query err", func(t *testing.T) {
|
||||
d := apexKnownData()
|
||||
d.ChainTerminated = ChainTermination{Reason: TermQueryErr, Subject: "broken.example.com.", Detail: "timeout"}
|
||||
assertSingle(t, run(chainQueryErrorRule{}, d, nil), sdk.StatusWarn)
|
||||
assertSingle(t, run(chainQueryErrorRule{}, d, nil), sdk.StatusUnknown)
|
||||
})
|
||||
}
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue