A transport-level query failure (connection refused, timeout, network unreachable) means the alias state could not be observed, not that the alias is misconfigured. Mapping it to Warn made the check flap whenever a flaky auth server alternated between refusing connections (Warn) and answering SERVFAIL (Crit). Report TermQueryErr as Unknown so only definitive DNS evidence drives Warn/Crit.
255 lines
8.6 KiB
Go
255 lines
8.6 KiB
Go
package checker
|
|
|
|
import (
|
|
"context"
|
|
"fmt"
|
|
|
|
sdk "git.happydns.org/checker-sdk-go/checker"
|
|
)
|
|
|
|
// chainLoopRule is the "chain_loop" rule; its Evaluate method reports alias
// chains whose walk terminated because of a loop.
type chainLoopRule struct{}

// Name returns the identifier of the rule.
func (chainLoopRule) Name() string {
	const id = "chain_loop"
	return id
}

// Description returns a one-sentence, human-readable summary of the rule.
func (chainLoopRule) Description() string {
	const summary = "Detects CNAME/DNAME cycles in the resolution chain."
	return summary
}
|
|
|
|
func (chainLoopRule) Evaluate(ctx context.Context, obs sdk.ObservationGetter, _ sdk.CheckerOptions) []sdk.CheckState {
|
|
data, errState := loadAlias(ctx, obs)
|
|
if errState != nil {
|
|
return errState
|
|
}
|
|
if !apexKnown(data) {
|
|
return skipped("apex lookup failed")
|
|
}
|
|
if data.ChainTerminated.Reason != TermLoop {
|
|
return okState(data.Owner, "no loop in the alias chain")
|
|
}
|
|
return []sdk.CheckState{withHint(sdk.CheckState{
|
|
Status: sdk.StatusCrit,
|
|
Subject: data.ChainTerminated.Subject,
|
|
Message: fmt.Sprintf("chain loops back to %s", data.ChainTerminated.Subject),
|
|
}, "Break the loop by pointing the last CNAME at an A/AAAA-bearing name.")}
|
|
}
|
|
|
|
// chainLengthRule is the "chain_length" rule; its Evaluate method reports
// alias chains whose walk terminated because it grew too long.
type chainLengthRule struct{}

// Name returns the identifier of the rule.
func (chainLengthRule) Name() string {
	const id = "chain_length"
	return id
}

// Description returns a one-sentence, human-readable summary of the rule.
func (chainLengthRule) Description() string {
	const summary = "Flags alias chains longer than the configured maximum (most resolvers give up around 8-16 hops)."
	return summary
}
|
|
|
|
func (chainLengthRule) Options() sdk.CheckerOptionsDocumentation {
|
|
return sdk.CheckerOptionsDocumentation{
|
|
UserOpts: []sdk.CheckerOptionDocumentation{
|
|
{
|
|
Id: "maxChainLength",
|
|
Type: "uint",
|
|
Label: "Maximum chain length",
|
|
Description: "Above this number of hops the chain is reported as critical.",
|
|
Default: float64(defaultMaxChainLength),
|
|
},
|
|
},
|
|
}
|
|
}
|
|
|
|
func (chainLengthRule) Evaluate(ctx context.Context, obs sdk.ObservationGetter, opts sdk.CheckerOptions) []sdk.CheckState {
|
|
data, errState := loadAlias(ctx, obs)
|
|
if errState != nil {
|
|
return errState
|
|
}
|
|
if !apexKnown(data) {
|
|
return skipped("apex lookup failed")
|
|
}
|
|
maxLen := sdk.GetIntOption(opts, "maxChainLength", defaultMaxChainLength)
|
|
if data.ChainTerminated.Reason != TermTooLong {
|
|
return okState(data.Owner, fmt.Sprintf("chain has %d hop(s), within limit of %d", len(data.Chain), maxLen))
|
|
}
|
|
return []sdk.CheckState{withHint(sdk.CheckState{
|
|
Status: sdk.StatusCrit,
|
|
Subject: data.ChainTerminated.Subject,
|
|
Message: fmt.Sprintf("chain exceeds %d hops; many resolvers will give up", maxLen),
|
|
}, "Flatten intermediate CNAMEs so that the chain is at most a few hops long.")}
|
|
}
|
|
|
|
// chainQueryErrorRule is the "chain_query_error" rule; its Evaluate method
// reports alias chains whose walk terminated on a query error.
type chainQueryErrorRule struct{}

// Name returns the identifier of the rule.
func (chainQueryErrorRule) Name() string {
	const id = "chain_query_error"
	return id
}

// Description returns a one-sentence, human-readable summary of the rule.
func (chainQueryErrorRule) Description() string {
	const summary = "Flags DNS query failures encountered while walking the alias chain."
	return summary
}
|
|
|
|
func (chainQueryErrorRule) Evaluate(ctx context.Context, obs sdk.ObservationGetter, _ sdk.CheckerOptions) []sdk.CheckState {
|
|
data, errState := loadAlias(ctx, obs)
|
|
if errState != nil {
|
|
return errState
|
|
}
|
|
if !apexKnown(data) {
|
|
return skipped("apex lookup failed")
|
|
}
|
|
if data.ChainTerminated.Reason != TermQueryErr {
|
|
return okState(data.Owner, "all chain queries succeeded")
|
|
}
|
|
// A transport failure (connection refused, timeout, network unreachable)
|
|
// means we could not observe the alias, not that the alias is broken. Report
|
|
// it as Unknown so an intermittent reachability glitch does not flap the
|
|
// check into Warn/Crit; only definitive DNS evidence drives those statuses.
|
|
return []sdk.CheckState{withHint(sdk.CheckState{
|
|
Status: sdk.StatusUnknown,
|
|
Subject: data.ChainTerminated.Subject,
|
|
Message: fmt.Sprintf("CNAME query for %s could not be completed: %s", data.ChainTerminated.Subject, data.ChainTerminated.Detail),
|
|
}, "Check authoritative-server reachability and firewall rules; the alias state could not be determined while queries fail.")}
|
|
}
|
|
|
|
// chainRcodeRule is the "chain_rcode" rule; its Evaluate method reports
// non-NOERROR rcodes seen mid-chain or on the final target lookup.
type chainRcodeRule struct{}

// Name returns the identifier of the rule.
func (chainRcodeRule) Name() string {
	const id = "chain_rcode"
	return id
}

// Description returns a one-sentence, human-readable summary of the rule.
func (chainRcodeRule) Description() string {
	const summary = "Flags NXDOMAIN/SERVFAIL/other rcodes encountered mid-chain or on the final target lookup."
	return summary
}
|
|
|
|
func (chainRcodeRule) Evaluate(ctx context.Context, obs sdk.ObservationGetter, _ sdk.CheckerOptions) []sdk.CheckState {
|
|
data, errState := loadAlias(ctx, obs)
|
|
if errState != nil {
|
|
return errState
|
|
}
|
|
if !apexKnown(data) {
|
|
return skipped("apex lookup failed")
|
|
}
|
|
var out []sdk.CheckState
|
|
if data.ChainTerminated.Reason == TermRcode {
|
|
out = append(out, withHint(sdk.CheckState{
|
|
Status: sdk.StatusCrit,
|
|
Subject: data.ChainTerminated.Subject,
|
|
Message: fmt.Sprintf("server answered %s mid-chain", data.ChainTerminated.Rcode),
|
|
}, "Ensure the zone publishes the expected record; NXDOMAIN/SERVFAIL mid-chain breaks the alias."))
|
|
}
|
|
if data.FinalRcode != "" && data.FinalRcode != "NOERROR" {
|
|
out = append(out, withHint(sdk.CheckState{
|
|
Status: sdk.StatusWarn,
|
|
Subject: data.FinalTarget,
|
|
Message: fmt.Sprintf("final A lookup for %s returned %s", data.FinalTarget, data.FinalRcode),
|
|
}, "Check the upstream zone's A/AAAA publication."))
|
|
}
|
|
if len(out) == 0 {
|
|
return okState(data.Owner, "all chain and final lookups returned NOERROR")
|
|
}
|
|
return out
|
|
}
|
|
|
|
// hopTTLRule is the "hop_ttl" rule; its Evaluate method reports chain hops
// whose TTL is below the configured minimum.
type hopTTLRule struct{}

// Name returns the identifier of the rule.
func (hopTTLRule) Name() string {
	const id = "hop_ttl"
	return id
}

// Description returns a one-sentence, human-readable summary of the rule.
func (hopTTLRule) Description() string {
	const summary = "Flags chain hops whose TTL is below the configured minimum."
	return summary
}
|
|
|
|
func (hopTTLRule) Options() sdk.CheckerOptionsDocumentation {
|
|
return sdk.CheckerOptionsDocumentation{
|
|
UserOpts: []sdk.CheckerOptionDocumentation{
|
|
{
|
|
Id: "minTargetTTL",
|
|
Type: "uint",
|
|
Label: "Minimum TTL (seconds)",
|
|
Description: "Hops with a TTL below this threshold are flagged as a warning.",
|
|
Default: float64(defaultMinTargetTTL),
|
|
},
|
|
},
|
|
}
|
|
}
|
|
|
|
func (hopTTLRule) Evaluate(ctx context.Context, obs sdk.ObservationGetter, opts sdk.CheckerOptions) []sdk.CheckState {
|
|
data, errState := loadAlias(ctx, obs)
|
|
if errState != nil {
|
|
return errState
|
|
}
|
|
if !apexKnown(data) {
|
|
return skipped("apex lookup failed")
|
|
}
|
|
if len(data.Chain) == 0 {
|
|
return skipped("chain is empty")
|
|
}
|
|
minTTL := uint32(sdk.GetIntOption(opts, "minTargetTTL", defaultMinTargetTTL))
|
|
var out []sdk.CheckState
|
|
for _, h := range data.Chain {
|
|
if h.Kind == KindTarget || h.TTL == 0 {
|
|
continue
|
|
}
|
|
if h.TTL < minTTL {
|
|
out = append(out, withHint(sdk.CheckState{
|
|
Status: sdk.StatusWarn,
|
|
Subject: h.Owner,
|
|
Message: fmt.Sprintf("hop %s → %s has TTL %ds (< %d)", h.Owner, h.Target, h.TTL, minTTL),
|
|
}, "Raise the CNAME TTL to improve cache efficiency (5-15 minutes is a common floor)."))
|
|
}
|
|
}
|
|
if len(out) == 0 {
|
|
return okState(data.Owner, fmt.Sprintf("all chain hops have TTL ≥ %ds", minTTL))
|
|
}
|
|
return out
|
|
}
|
|
|
|
// targetResolvableRule is the "target_resolvable" rule; its Evaluate method
// reports a final alias target whose lookup returned NXDOMAIN.
type targetResolvableRule struct{}

// Name returns the identifier of the rule.
func (targetResolvableRule) Name() string {
	const id = "target_resolvable"
	return id
}

// Description returns a one-sentence, human-readable summary of the rule.
func (targetResolvableRule) Description() string {
	const summary = "Verifies that the final target of the alias chain exists in DNS (returns NOERROR, not NXDOMAIN)."
	return summary
}
|
|
|
|
func (targetResolvableRule) Evaluate(ctx context.Context, obs sdk.ObservationGetter, _ sdk.CheckerOptions) []sdk.CheckState {
|
|
data, errState := loadAlias(ctx, obs)
|
|
if errState != nil {
|
|
return errState
|
|
}
|
|
if !apexKnown(data) {
|
|
return skipped("apex lookup failed")
|
|
}
|
|
if data.ChainTerminated.Reason != TermOK {
|
|
return skipped("chain did not terminate normally")
|
|
}
|
|
if data.FinalRcode != "NXDOMAIN" {
|
|
return okState(data.FinalTarget, fmt.Sprintf("target %s exists in DNS", data.FinalTarget))
|
|
}
|
|
return []sdk.CheckState{withHint(sdk.CheckState{
|
|
Status: sdk.StatusCrit,
|
|
Subject: data.FinalTarget,
|
|
Message: fmt.Sprintf("final target %s does not exist (NXDOMAIN)", data.FinalTarget),
|
|
}, "The alias points at a name that does not exist; create the missing record or update the alias target.")}
|
|
}
|
|
|
|
// multipleRecordsRule is the "multiple_records" rule; its Evaluate method
// reports owners that appear on more than one CNAME/DNAME hop of the chain.
type multipleRecordsRule struct{}

// Name returns the identifier of the rule.
func (multipleRecordsRule) Name() string {
	const id = "multiple_records"
	return id
}

// Description returns a one-sentence, human-readable summary of the rule.
func (multipleRecordsRule) Description() string {
	const summary = "Flags owners that carry more than one CNAME/DNAME record; only one is legal per owner."
	return summary
}
|
|
|
|
func (multipleRecordsRule) Evaluate(ctx context.Context, obs sdk.ObservationGetter, _ sdk.CheckerOptions) []sdk.CheckState {
|
|
data, errState := loadAlias(ctx, obs)
|
|
if errState != nil {
|
|
return errState
|
|
}
|
|
if !apexKnown(data) {
|
|
return skipped("apex lookup failed")
|
|
}
|
|
seen := map[string]int{}
|
|
for _, h := range data.Chain {
|
|
if h.Kind == KindCNAME || h.Kind == KindDNAME {
|
|
seen[h.Owner]++
|
|
}
|
|
}
|
|
var out []sdk.CheckState
|
|
for owner, n := range seen {
|
|
if n > 1 {
|
|
out = append(out, withHint(sdk.CheckState{
|
|
Status: sdk.StatusCrit,
|
|
Subject: owner,
|
|
Message: fmt.Sprintf("%s carries %d CNAME/DNAME records in the chain", owner, n),
|
|
}, "Keep a single CNAME per name; remove duplicates at the authoritative zone."))
|
|
}
|
|
}
|
|
if len(out) == 0 {
|
|
return okState(data.Owner, "every chain owner carries a single CNAME/DNAME")
|
|
}
|
|
return out
|
|
}
|