checker-alias/checker/rules_chain.go
Pierre-Olivier Mercier 65687ce375
All checks were successful
continuous-integration/drone/tag Build is passing
continuous-integration/drone/push Build is passing
checker: report transient mid-chain and final rcodes as Unknown, not Crit/Warn
SERVFAIL/REFUSED from every auth server means the record could not be
observed, not that the zone published a negative answer. Mark such rcodes
transient on TermRcode terminations and final A/AAAA lookups so chainRcodeRule
reports Unknown instead of flapping the check into Crit/Warn; definitive
NXDOMAIN answers still drive Crit (mid-chain) and Warn (final).
2026-06-18 11:22:08 +09:00

280 lines
9.8 KiB
Go

package checker
import (
"context"
"fmt"
sdk "git.happydns.org/checker-sdk-go/checker"
)
// chainLoopRule is the rule that detects CNAME/DNAME cycles in the
// resolution chain.
type chainLoopRule struct{}

// Name returns the rule's name, "chain_loop".
func (chainLoopRule) Name() string {
	const name = "chain_loop"
	return name
}

// Description returns a one-sentence, human-readable summary of the rule.
func (chainLoopRule) Description() string {
	const summary = "Detects CNAME/DNAME cycles in the resolution chain."
	return summary
}
// Evaluate returns a critical state when the alias chain terminated with
// TermLoop, and an OK state for every other termination reason.
//
// Whatever loadAlias reports as an error state is returned unchanged, and
// the rule is skipped when apexKnown reports false for the loaded data.
func (chainLoopRule) Evaluate(ctx context.Context, obs sdk.ObservationGetter, _ sdk.CheckerOptions) []sdk.CheckState {
	alias, failure := loadAlias(ctx, obs)
	switch {
	case failure != nil:
		return failure
	case !apexKnown(alias):
		return skipped("apex lookup failed")
	}

	end := alias.ChainTerminated
	if end.Reason == TermLoop {
		loop := sdk.CheckState{
			Status:  sdk.StatusCrit,
			Subject: end.Subject,
			Message: fmt.Sprintf("chain loops back to %s", end.Subject),
		}
		return []sdk.CheckState{
			withHint(loop, "Break the loop by pointing the last CNAME at an A/AAAA-bearing name."),
		}
	}

	return okState(alias.Owner, "no loop in the alias chain")
}
// chainLengthRule is the rule that flags alias chains longer than the
// configured maximum number of hops.
type chainLengthRule struct{}

// Name returns the rule's name, "chain_length".
func (chainLengthRule) Name() string {
	const name = "chain_length"
	return name
}

// Description returns a one-sentence, human-readable summary of the rule.
func (chainLengthRule) Description() string {
	const summary = "Flags alias chains longer than the configured maximum (most resolvers give up around 8-16 hops)."
	return summary
}
// Options documents the single user option read by this rule,
// "maxChainLength", whose default is defaultMaxChainLength.
func (chainLengthRule) Options() sdk.CheckerOptionsDocumentation {
	maxLength := sdk.CheckerOptionDocumentation{
		Id:          "maxChainLength",
		Type:        "uint",
		Label:       "Maximum chain length",
		Description: "Above this number of hops the chain is reported as critical.",
		Default:     float64(defaultMaxChainLength),
	}

	var doc sdk.CheckerOptionsDocumentation
	doc.UserOpts = []sdk.CheckerOptionDocumentation{maxLength}
	return doc
}
// Evaluate returns a critical state when the alias chain terminated with
// TermTooLong; otherwise it returns an OK state quoting the number of hops
// and the limit read from the "maxChainLength" option.
//
// Whatever loadAlias reports as an error state is returned unchanged, and
// the rule is skipped when apexKnown reports false for the loaded data.
func (chainLengthRule) Evaluate(ctx context.Context, obs sdk.ObservationGetter, opts sdk.CheckerOptions) []sdk.CheckState {
	alias, failure := loadAlias(ctx, obs)
	if failure != nil {
		return failure
	}
	if known := apexKnown(alias); !known {
		return skipped("apex lookup failed")
	}

	limit := sdk.GetIntOption(opts, "maxChainLength", defaultMaxChainLength)

	if alias.ChainTerminated.Reason == TermTooLong {
		tooLong := sdk.CheckState{
			Status:  sdk.StatusCrit,
			Subject: alias.ChainTerminated.Subject,
			Message: fmt.Sprintf("chain exceeds %d hops; many resolvers will give up", limit),
		}
		return []sdk.CheckState{
			withHint(tooLong, "Flatten intermediate CNAMEs so that the chain is at most a few hops long."),
		}
	}

	summary := fmt.Sprintf("chain has %d hop(s), within limit of %d", len(alias.Chain), limit)
	return okState(alias.Owner, summary)
}
// chainQueryErrorRule is the rule that flags DNS query failures encountered
// while walking the alias chain.
type chainQueryErrorRule struct{}

// Name returns the rule's name, "chain_query_error".
func (chainQueryErrorRule) Name() string {
	const name = "chain_query_error"
	return name
}

// Description returns a one-sentence, human-readable summary of the rule.
func (chainQueryErrorRule) Description() string {
	const summary = "Flags DNS query failures encountered while walking the alias chain."
	return summary
}
// Evaluate reports on a chain that terminated with TermQueryErr. Any other
// termination reason yields an OK state.
//
// Whatever loadAlias reports as an error state is returned unchanged, and
// the rule is skipped when apexKnown reports false for the loaded data.
func (chainQueryErrorRule) Evaluate(ctx context.Context, obs sdk.ObservationGetter, _ sdk.CheckerOptions) []sdk.CheckState {
	alias, failure := loadAlias(ctx, obs)
	if failure != nil {
		return failure
	}
	if !apexKnown(alias) {
		return skipped("apex lookup failed")
	}

	end := alias.ChainTerminated
	if end.Reason != TermQueryErr {
		return okState(alias.Owner, "all chain queries succeeded")
	}

	// Start from the transient outcome: a transport failure (connection
	// refused, timeout, network unreachable) only means the alias could not be
	// observed, so it is reported as Unknown to keep an intermittent glitch
	// from flapping the check into Warn/Crit.
	status := sdk.StatusUnknown
	verb := "could not be completed"
	hint := "Check authoritative-server reachability and firewall rules; the alias state could not be determined while queries fail."
	if !end.Transient {
		// A non-transient failure (e.g. the target has no locatable apex) is
		// definitive evidence that the alias cannot be followed: Warn.
		status = sdk.StatusWarn
		verb = "failed"
		hint = "Check that the alias target exists and is delegated; the alias is unusable while the query fails."
	}

	finding := sdk.CheckState{
		Status:  status,
		Subject: end.Subject,
		Message: fmt.Sprintf("CNAME query for %s %s: %s", end.Subject, verb, end.Detail),
	}
	return []sdk.CheckState{withHint(finding, hint)}
}
// chainRcodeRule is the rule that flags NXDOMAIN/SERVFAIL/other rcodes seen
// mid-chain or on the final target lookup.
type chainRcodeRule struct{}

// Name returns the rule's name, "chain_rcode".
func (chainRcodeRule) Name() string {
	const name = "chain_rcode"
	return name
}

// Description returns a one-sentence, human-readable summary of the rule.
func (chainRcodeRule) Description() string {
	const summary = "Flags NXDOMAIN/SERVFAIL/other rcodes encountered mid-chain or on the final target lookup."
	return summary
}
// Evaluate emits up to two findings: one when the chain terminated with
// TermRcode, and one when the final lookup's rcode is neither empty nor
// "NOERROR". When neither applies a single OK state is returned.
//
// Whatever loadAlias reports as an error state is returned unchanged, and
// the rule is skipped when apexKnown reports false for the loaded data.
func (chainRcodeRule) Evaluate(ctx context.Context, obs sdk.ObservationGetter, _ sdk.CheckerOptions) []sdk.CheckState {
	alias, failure := loadAlias(ctx, obs)
	switch {
	case failure != nil:
		return failure
	case !apexKnown(alias):
		return skipped("apex lookup failed")
	}

	var findings []sdk.CheckState

	if end := alias.ChainTerminated; end.Reason == TermRcode {
		// A definitive NXDOMAIN mid-chain is a real break and is Crit. A
		// transient rcode (SERVFAIL/REFUSED from every auth server) means the
		// record could not be observed at all, so it is downgraded to Unknown
		// to keep a flaky server from flapping the check.
		severity, hint := sdk.StatusCrit, "Ensure the zone publishes the expected record; NXDOMAIN mid-chain breaks the alias."
		if end.Transient {
			severity, hint = sdk.StatusUnknown, "Check authoritative-server reachability; SERVFAIL/REFUSED from every server leaves the alias state undetermined."
		}
		midChain := sdk.CheckState{
			Status:  severity,
			Subject: end.Subject,
			Message: fmt.Sprintf("server answered %s mid-chain", end.Rcode),
		}
		findings = append(findings, withHint(midChain, hint))
	}

	if rcode := alias.FinalRcode; !(rcode == "" || rcode == "NOERROR") {
		// Same split for the final A/AAAA lookup: a definitive rcode is a real
		// publication gap (Warn), a transient one could not be observed
		// (Unknown).
		severity, hint := sdk.StatusWarn, "Check the upstream zone's A/AAAA publication."
		if alias.FinalRcodeTransient {
			severity, hint = sdk.StatusUnknown, "Check the upstream auth servers' reachability; the final A/AAAA state could not be determined."
		}
		final := sdk.CheckState{
			Status:  severity,
			Subject: alias.FinalTarget,
			Message: fmt.Sprintf("final A lookup for %s returned %s", alias.FinalTarget, rcode),
		}
		findings = append(findings, withHint(final, hint))
	}

	if len(findings) > 0 {
		return findings
	}
	return okState(alias.Owner, "all chain and final lookups returned NOERROR")
}
// hopTTLRule is the rule that flags chain hops whose TTL is below the
// configured minimum.
type hopTTLRule struct{}

// Name returns the rule's name, "hop_ttl".
func (hopTTLRule) Name() string {
	const name = "hop_ttl"
	return name
}

// Description returns a one-sentence, human-readable summary of the rule.
func (hopTTLRule) Description() string {
	const summary = "Flags chain hops whose TTL is below the configured minimum."
	return summary
}
// Options documents the single user option read by this rule,
// "minTargetTTL", whose default is defaultMinTargetTTL.
func (hopTTLRule) Options() sdk.CheckerOptionsDocumentation {
	minTTL := sdk.CheckerOptionDocumentation{
		Id:          "minTargetTTL",
		Type:        "uint",
		Label:       "Minimum TTL (seconds)",
		Description: "Hops with a TTL below this threshold are flagged as a warning.",
		Default:     float64(defaultMinTargetTTL),
	}

	var doc sdk.CheckerOptionsDocumentation
	doc.UserOpts = []sdk.CheckerOptionDocumentation{minTTL}
	return doc
}
// Evaluate emits one warning per chain hop whose TTL is below the
// "minTargetTTL" option, or a single OK state when no hop is below it.
//
// Whatever loadAlias reports as an error state is returned unchanged. The
// rule is skipped when apexKnown reports false for the loaded data or when
// the chain is empty. Hops of kind KindTarget and hops with a TTL of zero
// are never flagged.
func (hopTTLRule) Evaluate(ctx context.Context, obs sdk.ObservationGetter, opts sdk.CheckerOptions) []sdk.CheckState {
	data, errState := loadAlias(ctx, obs)
	if errState != nil {
		return errState
	}
	if !apexKnown(data) {
		return skipped("apex lookup failed")
	}
	if len(data.Chain) == 0 {
		return skipped("chain is empty")
	}

	// Clamp the configured value into the uint32 range before converting. A
	// bare uint32(...) conversion wraps: -1 would become 4294967295 and flag
	// every hop, and 1<<32 would become 0 and silently disable the rule.
	// NOTE(review): assumes GetIntOption returns a signed integer; confirm
	// against the SDK.
	const maxTTL = ^uint32(0)
	configured := sdk.GetIntOption(opts, "minTargetTTL", defaultMinTargetTTL)
	var minTTL uint32
	switch {
	case configured <= 0:
		// A non-positive minimum means "no minimum": keep the zero value.
	case uint64(configured) > uint64(maxTTL):
		minTTL = maxTTL
	default:
		minTTL = uint32(configured)
	}

	var out []sdk.CheckState
	for _, h := range data.Chain {
		if h.Kind == KindTarget || h.TTL == 0 {
			continue
		}
		if h.TTL < minTTL {
			out = append(out, withHint(sdk.CheckState{
				Status:  sdk.StatusWarn,
				Subject: h.Owner,
				Message: fmt.Sprintf("hop %s → %s has TTL %ds (< %d)", h.Owner, h.Target, h.TTL, minTTL),
			}, "Raise the CNAME TTL to improve cache efficiency (5-15 minutes is a common floor)."))
		}
	}
	if len(out) == 0 {
		return okState(data.Owner, fmt.Sprintf("all chain hops have TTL ≥ %ds", minTTL))
	}
	return out
}
// targetResolvableRule is the rule that verifies the final target of the
// alias chain exists in DNS.
type targetResolvableRule struct{}

// Name returns the rule's name, "target_resolvable".
func (targetResolvableRule) Name() string {
	const name = "target_resolvable"
	return name
}

// Description returns a one-sentence, human-readable summary of the rule.
func (targetResolvableRule) Description() string {
	const summary = "Verifies that the final target of the alias chain exists in DNS (returns NOERROR, not NXDOMAIN)."
	return summary
}
// Evaluate returns a critical state when the final lookup's rcode is
// "NXDOMAIN" and an OK state for any other rcode.
//
// Whatever loadAlias reports as an error state is returned unchanged. The
// rule is skipped when apexKnown reports false for the loaded data or when
// the chain did not terminate with TermOK.
//
// NOTE(review): every rcode other than "NXDOMAIN" (including an empty one or
// one flagged by FinalRcodeTransient) produces the OK "exists in DNS"
// message; confirm that is intended for inconclusive final lookups.
func (targetResolvableRule) Evaluate(ctx context.Context, obs sdk.ObservationGetter, _ sdk.CheckerOptions) []sdk.CheckState {
	alias, failure := loadAlias(ctx, obs)
	switch {
	case failure != nil:
		return failure
	case !apexKnown(alias):
		return skipped("apex lookup failed")
	case alias.ChainTerminated.Reason != TermOK:
		return skipped("chain did not terminate normally")
	}

	if alias.FinalRcode == "NXDOMAIN" {
		missing := sdk.CheckState{
			Status:  sdk.StatusCrit,
			Subject: alias.FinalTarget,
			Message: fmt.Sprintf("final target %s does not exist (NXDOMAIN)", alias.FinalTarget),
		}
		return []sdk.CheckState{
			withHint(missing, "The alias points at a name that does not exist; create the missing record or update the alias target."),
		}
	}

	return okState(alias.FinalTarget, fmt.Sprintf("target %s exists in DNS", alias.FinalTarget))
}
// multipleRecordsRule is the rule that flags owners carrying more than one
// CNAME/DNAME record.
type multipleRecordsRule struct{}

// Name returns the rule's name, "multiple_records".
func (multipleRecordsRule) Name() string {
	const name = "multiple_records"
	return name
}

// Description returns a one-sentence, human-readable summary of the rule.
func (multipleRecordsRule) Description() string {
	const summary = "Flags owners that carry more than one CNAME/DNAME record; only one is legal per owner."
	return summary
}
// Evaluate emits one critical state per owner that appears on more than one
// CNAME/DNAME hop of the chain, or a single OK state when no owner does.
//
// Findings are returned in the order their owners first appear in the chain,
// so repeated evaluations of the same data yield the same slice.
//
// Whatever loadAlias reports as an error state is returned unchanged, and
// the rule is skipped when apexKnown reports false for the loaded data.
func (multipleRecordsRule) Evaluate(ctx context.Context, obs sdk.ObservationGetter, _ sdk.CheckerOptions) []sdk.CheckState {
	data, errState := loadAlias(ctx, obs)
	if errState != nil {
		return errState
	}
	if !apexKnown(data) {
		return skipped("apex lookup failed")
	}

	// Count alias records per owner and remember the order in which owners
	// were first seen. Ranging over the map directly would emit findings in
	// Go's unspecified map iteration order, which differs from run to run.
	counts := make(map[string]int, len(data.Chain))
	var owners []string
	for _, h := range data.Chain {
		if h.Kind != KindCNAME && h.Kind != KindDNAME {
			continue
		}
		if counts[h.Owner] == 0 {
			owners = append(owners, h.Owner)
		}
		counts[h.Owner]++
	}

	var out []sdk.CheckState
	for _, owner := range owners {
		n := counts[owner]
		if n <= 1 {
			continue
		}
		out = append(out, withHint(sdk.CheckState{
			Status:  sdk.StatusCrit,
			Subject: owner,
			Message: fmt.Sprintf("%s carries %d CNAME/DNAME records in the chain", owner, n),
		}, "Keep a single CNAME per name; remove duplicates at the authoritative zone."))
	}
	if len(out) == 0 {
		return okState(data.Owner, "every chain owner carries a single CNAME/DNAME")
	}
	return out
}