From 680a7735f0242fda47da6392c40e7d1aa9b824c6 Mon Sep 17 00:00:00 2001 From: Pierre-Olivier Mercier Date: Thu, 18 Jun 2026 09:30:01 +0900 Subject: [PATCH] checker: report chain transport errors as Unknown, not Warn A transport-level query failure (connection refused, timeout, network unreachable) means the alias state could not be observed, not that the alias is misconfigured. Mapping it to Warn made the check flap whenever a flaky auth server alternated between refusing connections (Warn) and answering SERVFAIL (Crit). Report TermQueryErr as Unknown so only definitive DNS evidence drives Warn/Crit. --- checker/rules_chain.go | 10 +++++++--- checker/rules_test.go | 2 +- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/checker/rules_chain.go b/checker/rules_chain.go index e3da350..59ce506 100644 --- a/checker/rules_chain.go +++ b/checker/rules_chain.go @@ -90,11 +90,15 @@ func (chainQueryErrorRule) Evaluate(ctx context.Context, obs sdk.ObservationGett if data.ChainTerminated.Reason != TermQueryErr { return okState(data.Owner, "all chain queries succeeded") } + // A transport failure (connection refused, timeout, network unreachable) + // means we could not observe the alias, not that the alias is broken. Report + // it as Unknown so an intermittent reachability glitch does not flap the + // check into Warn/Crit; only definitive DNS evidence drives those statuses. return []sdk.CheckState{withHint(sdk.CheckState{ - Status: sdk.StatusWarn, + Status: sdk.StatusUnknown, Subject: data.ChainTerminated.Subject, - Message: fmt.Sprintf("CNAME query for %s failed: %s", data.ChainTerminated.Subject, data.ChainTerminated.Detail), - }, "Check authoritative-server reachability and firewall rules; the alias is unusable while queries fail.")} + Message: fmt.Sprintf("CNAME query for %s could not be completed: %s", data.ChainTerminated.Subject, data.ChainTerminated.Detail), + }, "Check authoritative-server reachability and firewall rules; the alias state could not be determined while queries fail.")} } type chainRcodeRule struct{} diff --git a/checker/rules_test.go b/checker/rules_test.go index 1d2d92f..c289dbb 100644 --- a/checker/rules_test.go +++ b/checker/rules_test.go @@ -125,7 +125,7 @@ func TestChainQueryErrorRule(t *testing.T) { t.Run("query err", func(t *testing.T) { d := apexKnownData() d.ChainTerminated = ChainTermination{Reason: TermQueryErr, Subject: "broken.example.com.", Detail: "timeout"} - assertSingle(t, run(chainQueryErrorRule{}, d, nil), sdk.StatusWarn) + assertSingle(t, run(chainQueryErrorRule{}, d, nil), sdk.StatusUnknown) }) }