From 4177fcdc7bb90ac2c28922162c4ce495fe2e1bb5 Mon Sep 17 00:00:00 2001 From: Pierre-Olivier Mercier Date: Sat, 25 Apr 2026 23:14:42 +0700 Subject: [PATCH] Split monolithic rule into per-test rules, collect gathers facts only --- README.md | 16 +-- checker/collect.go | 4 +- checker/definition.go | 4 +- checker/prober.go | 133 ++++++------------- checker/prober_test.go | 7 +- checker/rule.go | 175 +++++++++---------------- checker/rules_certificate.go | 233 ++++++++++++++++++++++++++++++++++ checker/rules_discovery.go | 34 +++++ checker/rules_handshake.go | 60 +++++++++ checker/rules_protocol.go | 105 +++++++++++++++ checker/rules_reachability.go | 48 +++++++ checker/rules_starttls.go | 108 ++++++++++++++++ checker/types.go | 92 +++++++++----- contract/contract.go | 2 +- 14 files changed, 760 insertions(+), 261 deletions(-) create mode 100644 checker/rules_certificate.go create mode 100644 checker/rules_discovery.go create mode 100644 checker/rules_handshake.go create mode 100644 checker/rules_protocol.go create mode 100644 checker/rules_reachability.go create mode 100644 checker/rules_starttls.go diff --git a/README.md b/README.md index f94547a..61d4e1b 100644 --- a/README.md +++ b/README.md @@ -119,7 +119,7 @@ Observation data written under `tls_probes`: } ``` -The map is keyed by `contract.Ref(ep)` — the same value the host exposes +The map is keyed by `contract.Ref(ep)`, the same value the host exposes on the lineage side so that a consumer knows which probe corresponds to which entry it originally published. @@ -129,14 +129,14 @@ existing downstream parsers. ## Issues reported -- `tcp_unreachable` — dial failed. -- `handshake_failed` — TLS handshake or STARTTLS upgrade failed. -- `starttls_not_offered` — server didn't advertise STARTTLS. Severity is +- `tcp_unreachable`, dial failed. +- `handshake_failed`, TLS handshake or STARTTLS upgrade failed. +- `starttls_not_offered`, server didn't advertise STARTTLS. 
Severity is `crit` when `TLSEndpoint.RequireSTARTTLS` is `true`, `warn` otherwise. -- `chain_invalid` — leaf does not chain to a system-trusted root. -- `hostname_mismatch` — cert SANs don't cover the SNI. -- `expired` / `expiring_soon` — cert expiry posture. -- `weak_tls_version` — negotiated TLS < 1.2. +- `chain_invalid`, leaf does not chain to a system-trusted root. +- `hostname_mismatch`, cert SANs don't cover the SNI. +- `expired` / `expiring_soon`, cert expiry posture. +- `weak_tls_version`, negotiated TLS < 1.2. ## Options diff --git a/checker/collect.go b/checker/collect.go index ff8f60b..8db0ddb 100644 --- a/checker/collect.go +++ b/checker/collect.go @@ -47,8 +47,8 @@ func (p *tlsProvider) Collect(ctx context.Context, opts sdk.CheckerOptions) (any defer wg.Done() defer func() { <-sem }() pr := probe(ctx, e.Endpoint, timeout) - log.Printf("checker-tls: %s %s:%d → tls=%s issues=%d elapsed=%dms err=%q", - pr.Type, pr.Host, pr.Port, pr.TLSVersion, len(pr.Issues), pr.ElapsedMS, pr.Error) + log.Printf("checker-tls: %s %s:%d → tls=%s handshake_ok=%t elapsed=%dms err=%q", + pr.Type, pr.Host, pr.Port, pr.TLSVersion, pr.TLSHandshakeOK, pr.ElapsedMS, pr.Error) mu.Lock() probes[e.Ref] = pr mu.Unlock() diff --git a/checker/definition.go b/checker/definition.go index c2fb6f5..c60fdd0 100644 --- a/checker/definition.go +++ b/checker/definition.go @@ -40,9 +40,7 @@ func Definition() *sdk.CheckerDefinition { }, }, }, - Rules: []sdk.CheckRule{ - Rule(), - }, + Rules: Rules(), Interval: &sdk.CheckIntervalSpec{ Min: 6 * time.Hour, Max: 7 * 24 * time.Hour, diff --git a/checker/prober.go b/checker/prober.go index b528814..74a05ef 100644 --- a/checker/prober.go +++ b/checker/prober.go @@ -58,8 +58,11 @@ func probeTypeString(ep contract.TLSEndpoint) string { // probe performs a TLS handshake (or STARTTLS upgrade + handshake) on the // given endpoint and returns a populated TLSProbe. 
It never returns an error: -// transport/handshake failures are recorded on the probe so the caller can -// still surface them in the report. +// transport/handshake failures are recorded on the probe as raw fields so +// rules can classify them. +// +// This function MUST NOT decide severity or pass/fail: it only gathers +// observation data. All judgement happens in CheckRules (see rules_*.go). func probe(ctx context.Context, ep contract.TLSEndpoint, timeout time.Duration) TLSProbe { start := time.Now() host := strings.TrimSuffix(ep.Host, ".") @@ -70,11 +73,13 @@ func probe(ctx context.Context, ep contract.TLSEndpoint, timeout time.Duration) } p := TLSProbe{ - Host: host, - Port: ep.Port, - Endpoint: addr, - Type: probeTypeString(ep), - SNI: sni, + Host: host, + Port: ep.Port, + Endpoint: addr, + Type: probeTypeString(ep), + SNI: sni, + RequireSTARTTLS: ep.RequireSTARTTLS, + STARTTLSDialect: ep.STARTTLS, } dialCtx, cancel := context.WithTimeout(ctx, timeout) @@ -83,13 +88,8 @@ func probe(ctx context.Context, ep contract.TLSEndpoint, timeout time.Duration) d := &net.Dialer{} conn, err := d.DialContext(dialCtx, "tcp", addr) if err != nil { + p.TCPError = err.Error() p.Error = "dial: " + err.Error() - p.Issues = append(p.Issues, Issue{ - Code: "tcp_unreachable", - Severity: SeverityCrit, - Message: fmt.Sprintf("Cannot open TCP connection to %s: %v", addr, err), - Fix: "Check DNS, firewall, and that the service listens on this port.", - }) p.ElapsedMS = time.Since(start).Milliseconds() return p } @@ -101,23 +101,28 @@ func probe(ctx context.Context, ep contract.TLSEndpoint, timeout time.Duration) tlsConn, err := handshake(conn, ep, sni) if err != nil { + p.HandshakeError = err.Error() p.Error = err.Error() - p.Issues = append(p.Issues, classifyHandshakeError(ep, err)) + if ep.STARTTLS != "" && isStartTLSUnsupported(err) { + p.STARTTLSNotOffered = true + } + if errors.Is(err, errUnsupportedStartTLSProto) { + p.STARTTLSUnsupportedProto = true + } p.ElapsedMS = 
time.Since(start).Milliseconds() return p } defer tlsConn.Close() + p.TLSHandshakeOK = true state := tlsConn.ConnectionState() + p.TLSVersionNum = state.Version p.TLSVersion = tls.VersionName(state.Version) p.CipherSuite = tls.CipherSuiteName(state.CipherSuite) + p.CipherSuiteID = state.CipherSuite if len(state.PeerCertificates) == 0 { - p.Issues = append(p.Issues, Issue{ - Code: "no_peer_cert", - Severity: SeverityCrit, - Message: "Server presented no certificate.", - }) + p.NoPeerCert = true p.ElapsedMS = time.Since(start).Milliseconds() return p } @@ -130,16 +135,16 @@ func probe(ctx context.Context, ep contract.TLSEndpoint, timeout time.Duration) p.IssuerAKI = strings.ToUpper(hex.EncodeToString(leaf.AuthorityKeyId)) } p.Subject = leaf.Subject.CommonName - p.DNSNames = append(p.DNSNames, leaf.DNSNames...) + p.DNSNames = leaf.DNSNames p.Chain = buildChain(state.PeerCertificates) hostnameMatch := leaf.VerifyHostname(sni) == nil p.HostnameMatch = &hostnameMatch // Chain verification against system roots, using intermediates presented - // by the server. We run this independently from Go's tls.Config - // verification so we can report a dedicated "chain invalid" issue rather - // than failing the whole handshake. + // by the server. Running it separately from tls.Config verification + // means we can record it as a raw observation rather than aborting the + // handshake, rules classify it afterwards. 
intermediates := x509.NewCertPool() for _, c := range state.PeerCertificates[1:] { intermediates.AddCert(c) @@ -152,48 +157,8 @@ func probe(ctx context.Context, ep contract.TLSEndpoint, timeout time.Duration) }) chainValid := verifyErr == nil p.ChainValid = &chainValid - if !chainValid { - msg := "Invalid certificate chain" - if verifyErr != nil { - msg = "Invalid certificate chain: " + verifyErr.Error() - } - p.Issues = append(p.Issues, Issue{ - Code: "chain_invalid", - Severity: SeverityCrit, - Message: msg, - Fix: "Serve the full intermediate chain and ensure the root is trusted.", - }) - } - if !hostnameMatch { - p.Issues = append(p.Issues, Issue{ - Code: "hostname_mismatch", - Severity: SeverityCrit, - Message: fmt.Sprintf("Certificate does not cover %q (SANs: %s)", sni, strings.Join(leaf.DNSNames, ", ")), - Fix: "Re-issue the certificate with a matching SAN.", - }) - } - if leaf.NotAfter.Before(now) { - p.Issues = append(p.Issues, Issue{ - Code: "expired", - Severity: SeverityCrit, - Message: "Certificate expired on " + leaf.NotAfter.Format(time.RFC3339), - Fix: "Renew the certificate.", - }) - } else if leaf.NotAfter.Sub(now) < 14*24*time.Hour { - p.Issues = append(p.Issues, Issue{ - Code: "expiring_soon", - Severity: SeverityWarn, - Message: "Certificate expires in less than 14 days (" + leaf.NotAfter.Format(time.RFC3339) + ")", - Fix: "Renew before expiry.", - }) - } - if state.Version < tls.VersionTLS12 { - p.Issues = append(p.Issues, Issue{ - Code: "weak_tls_version", - Severity: SeverityWarn, - Message: "Negotiated TLS version " + p.TLSVersion + " is below the recommended TLS 1.2.", - Fix: "Disable TLS 1.0/1.1 on the server.", - }) + if verifyErr != nil { + p.ChainVerifyErr = verifyErr.Error() } p.ElapsedMS = time.Since(start).Milliseconds() @@ -202,8 +167,8 @@ func probe(ctx context.Context, ep contract.TLSEndpoint, timeout time.Duration) // handshake performs STARTTLS upgrade (when ep.STARTTLS is non-empty) and // then a TLS handshake. 
InsecureSkipVerify is true on purpose: we verify -// the chain separately in probe so an invalid chain becomes a structured -// Issue rather than aborting the handshake. +// the chain separately in probe so an invalid chain becomes a raw +// observation rather than aborting the handshake. func handshake(conn net.Conn, ep contract.TLSEndpoint, sni string) (*tls.Conn, error) { cfg := &tls.Config{ ServerName: sni, @@ -220,7 +185,7 @@ func handshake(conn net.Conn, ep contract.TLSEndpoint, sni string) (*tls.Conn, e up, ok := starttlsUpgraders[ep.STARTTLS] if !ok { - return nil, fmt.Errorf("unsupported starttls protocol %q", ep.STARTTLS) + return nil, fmt.Errorf("%w: %q", errUnsupportedStartTLSProto, ep.STARTTLS) } if err := up(conn, sni); err != nil { return nil, fmt.Errorf("starttls-%s: %w", ep.STARTTLS, err) @@ -232,34 +197,10 @@ func handshake(conn net.Conn, ep contract.TLSEndpoint, sni string) (*tls.Conn, e return tlsConn, nil } -// classifyHandshakeError converts a dial/handshake error into a structured -// Issue, distinguishing "server doesn't offer STARTTLS" (which is opportunistic -// for some endpoints) from hard failures. 
-func classifyHandshakeError(ep contract.TLSEndpoint, err error) Issue { - msg := err.Error() - - if ep.STARTTLS != "" && isStartTLSUnsupported(err) { - sev := SeverityWarn - if ep.RequireSTARTTLS { - sev = SeverityCrit - } - return Issue{ - Code: "starttls_not_offered", - Severity: sev, - Message: fmt.Sprintf("Server on %s:%d does not advertise STARTTLS: %s", ep.Host, ep.Port, msg), - Fix: "Enable STARTTLS on the server or publish a direct-TLS endpoint.", - } - } - - return Issue{ - Code: "handshake_failed", - Severity: SeverityCrit, - Message: fmt.Sprintf("TLS handshake failed on %s:%d: %s", ep.Host, ep.Port, msg), - Fix: "Inspect the server's TLS configuration and certificate.", - } -} - -var errStartTLSNotOffered = errors.New("starttls not advertised by server") +var ( + errStartTLSNotOffered = errors.New("starttls not advertised by server") + errUnsupportedStartTLSProto = errors.New("unsupported starttls protocol") +) func isStartTLSUnsupported(err error) bool { return errors.Is(err, errStartTLSNotOffered) diff --git a/checker/prober_test.go b/checker/prober_test.go index 4023e58..ad7e288 100644 --- a/checker/prober_test.go +++ b/checker/prober_test.go @@ -60,11 +60,8 @@ func TestProbe_TCPUnreachable(t *testing.T) { Port: uint16(addr.Port), }, 1*time.Second) - if probe.Error == "" { - t.Errorf("expected an error for unreachable port") - } - if len(probe.Issues) == 0 || probe.Issues[0].Code != "tcp_unreachable" { - t.Errorf("expected tcp_unreachable issue, got %+v", probe.Issues) + if probe.TCPError == "" { + t.Errorf("expected a TCP error for unreachable port") } } diff --git a/checker/rule.go b/checker/rule.go index 4c3d105..bcb1858 100644 --- a/checker/rule.go +++ b/checker/rule.go @@ -8,140 +8,81 @@ import ( sdk "git.happydns.org/checker-sdk-go/checker" ) -// Rule returns the rule that aggregates per-endpoint TLS probe outcomes into -// a single status for this checker run. 
-func Rule() sdk.CheckRule { - return &tlsRule{} +// Rules returns the full list of CheckRules exposed by the TLS checker. +// Each rule covers a single concern (reachability, handshake, chain, hostname, +// expiry, TLS version, STARTTLS advertisement, cipher suite, …) so the UI can +// surface a passing-list rather than a single aggregated code. +func Rules() []sdk.CheckRule { + return []sdk.CheckRule{ + &endpointsDiscoveredRule{}, + &reachabilityRule{}, + &tlsHandshakeRule{}, + &starttlsAdvertisedRule{}, + &starttlsSupportedRule{}, + &peerCertificateRule{}, + &chainValidityRule{}, + &hostnameMatchRule{}, + &expiryRule{}, + &tlsVersionRule{}, + &cipherSuiteRule{}, + } } -type tlsRule struct{} - -func (r *tlsRule) Name() string { return "tls_posture" } - -func (r *tlsRule) Description() string { - return "Summarises TLS handshake, certificate validity, hostname match and expiry across all probed endpoints" -} - -func (r *tlsRule) ValidateOptions(opts sdk.CheckerOptions) error { - return nil -} - -func (r *tlsRule) Evaluate(ctx context.Context, obs sdk.ObservationGetter, opts sdk.CheckerOptions) []sdk.CheckState { +// loadData fetches the TLS observation. On error, returns a single error +// state the caller should emit. +func loadData(ctx context.Context, obs sdk.ObservationGetter) (*TLSData, *sdk.CheckState) { var data TLSData if err := obs.Get(ctx, ObservationKeyTLSProbes, &data); err != nil { - return []sdk.CheckState{{ + return nil, &sdk.CheckState{ Status: sdk.StatusError, - Message: fmt.Sprintf("Failed to read tls_probes: %v", err), - Code: "tls_observation_error", - }} - } - - // Steady state when no producer has published entries for this target - // yet (or when the last producer run cleared them). Report Unknown so - // we don't flap red during the eventual-consistency window between a - // fresh enrollment and the first producer cycle. 
- if len(data.Probes) == 0 { - return []sdk.CheckState{{ - Status: sdk.StatusUnknown, - Message: "No TLS endpoints have been discovered for this target yet", - Code: "tls_no_endpoints", - }} + Message: fmt.Sprintf("failed to load tls_probes observation: %v", err), + Code: "tls.observation_error", + } } + return &data, nil +} +// sortedRefs returns the probe refs in deterministic order. Rules iterate +// this sorted list so CheckState output is stable. +func sortedRefs(data *TLSData) []string { refs := make([]string, 0, len(data.Probes)) for ref := range data.Probes { refs = append(refs, ref) } sort.Strings(refs) - - out := make([]sdk.CheckState, 0, len(refs)) - for _, ref := range refs { - p := data.Probes[ref] - out = append(out, evaluateProbe(p)) - } - return out + return refs } -// evaluateProbe distills a single TLSProbe into a CheckState. Subject is the -// probed endpoint so the host can correlate states across runs and surface -// them per-target in the UI. Message describes the finding only -- the UI -// renders Subject separately. -func evaluateProbe(p TLSProbe) sdk.CheckState { - subject := fmt.Sprintf("%s://%s", p.Type, p.Endpoint) - meta := map[string]any{ - "type": p.Type, - "host": p.Host, - "port": p.Port, - "sni": p.SNI, - "issues": len(p.Issues), +// subjectOf formats the UI-facing subject for a single probe. +func subjectOf(p TLSProbe) string { + return fmt.Sprintf("%s://%s", p.Type, p.Endpoint) +} + +// metaOf returns a compact meta map to attach to a CheckState. 
+func metaOf(p TLSProbe) map[string]any { + m := map[string]any{ + "type": p.Type, + "host": p.Host, + "port": p.Port, + "sni": p.SNI, } if p.TLSVersion != "" { - meta["tls_version"] = p.TLSVersion - } - if !p.NotAfter.IsZero() { - meta["not_after"] = p.NotAfter - } - - worst, critMsg, warnMsg := summarize(p.Issues) - switch worst { - case SeverityCrit: - return sdk.CheckState{ - Status: sdk.StatusCrit, - Message: critMsg, - Code: "tls_critical", - Subject: subject, - Meta: meta, - } - case SeverityWarn: - return sdk.CheckState{ - Status: sdk.StatusWarn, - Message: warnMsg, - Code: "tls_warning", - Subject: subject, - Meta: meta, - } - default: - msg := "TLS endpoint OK" - if p.TLSVersion != "" { - msg = fmt.Sprintf("TLS endpoint OK (%s)", p.TLSVersion) - } - return sdk.CheckState{ - Status: sdk.StatusOK, - Message: msg, - Code: "tls_ok", - Subject: subject, - Meta: meta, - } + m["tls_version"] = p.TLSVersion } + return m } -// summarize walks the issues once and returns (worst severity, first -// critical message, first warning message). Picking the messages during the -// same pass avoids a second iteration in the caller. -func summarize(issues []Issue) (worst, firstCrit, firstWarn string) { - for _, is := range issues { - msg := is.Message - if msg == "" { - msg = is.Code - } - switch is.Severity { - case SeverityCrit: - worst = SeverityCrit - if firstCrit == "" { - firstCrit = msg - } - case SeverityWarn: - if worst == "" || worst == SeverityInfo { - worst = SeverityWarn - } - if firstWarn == "" { - firstWarn = msg - } - case SeverityInfo: - if worst == "" { - worst = SeverityInfo - } - } - } - return +// passState / infoState / unknownState helpers. 
+func passState(code, message string) sdk.CheckState { + return sdk.CheckState{Status: sdk.StatusOK, Code: code, Message: message} +} +func unknownState(code, message string) sdk.CheckState { + return sdk.CheckState{Status: sdk.StatusUnknown, Code: code, Message: message} +} + +// emptyCaseState returns a single state describing "no probes to evaluate". +// Rules call this when len(data.Probes) == 0 to avoid returning an empty +// slice (see CheckRule.Evaluate contract). +func emptyCaseState(code string) sdk.CheckState { + return unknownState(code, "No TLS endpoints have been discovered for this target yet.") } diff --git a/checker/rules_certificate.go b/checker/rules_certificate.go new file mode 100644 index 0000000..2fc2f4c --- /dev/null +++ b/checker/rules_certificate.go @@ -0,0 +1,233 @@ +package checker + +import ( + "context" + "fmt" + "strings" + "time" + + sdk "git.happydns.org/checker-sdk-go/checker" +) + +// peerCertificateRule flags successful handshakes in which the server sent +// no certificate. This is distinct from chain validity: if no cert was sent, +// hostname/chain/expiry cannot be evaluated. +type peerCertificateRule struct{} + +func (r *peerCertificateRule) Name() string { return "tls.peer_certificate_present" } +func (r *peerCertificateRule) Description() string { + return "Verifies the server presented a certificate during the TLS handshake." 
+} + +func (r *peerCertificateRule) Evaluate(ctx context.Context, obs sdk.ObservationGetter, _ sdk.CheckerOptions) []sdk.CheckState { + data, errSt := loadData(ctx, obs) + if errSt != nil { + return []sdk.CheckState{*errSt} + } + if len(data.Probes) == 0 { + return []sdk.CheckState{emptyCaseState("tls.peer_certificate_present.no_endpoints")} + } + + var out []sdk.CheckState + anyHandshake := false + for _, ref := range sortedRefs(data) { + p := data.Probes[ref] + if !p.TLSHandshakeOK { + continue + } + anyHandshake = true + if !p.NoPeerCert { + continue + } + out = append(out, sdk.CheckState{ + Status: sdk.StatusCrit, + Code: "tls.peer_certificate_present.missing", + Subject: subjectOf(p), + Message: fmt.Sprintf("Server on %s completed the handshake but presented no certificate.", p.Endpoint), + Meta: metaOf(p), + }) + } + if !anyHandshake { + return []sdk.CheckState{unknownState( + "tls.peer_certificate_present.skipped", + "No endpoint completed a TLS handshake.", + )} + } + if len(out) == 0 { + return []sdk.CheckState{passState( + "tls.peer_certificate_present.ok", + "Every endpoint presented a certificate.", + )} + } + return out +} + +// chainValidityRule flags invalid certificate chains. +type chainValidityRule struct{} + +func (r *chainValidityRule) Name() string { return "tls.chain_validity" } +func (r *chainValidityRule) Description() string { + return "Verifies the presented certificate chain validates against the system trust store." 
+} + +func (r *chainValidityRule) Evaluate(ctx context.Context, obs sdk.ObservationGetter, _ sdk.CheckerOptions) []sdk.CheckState { + data, errSt := loadData(ctx, obs) + if errSt != nil { + return []sdk.CheckState{*errSt} + } + if len(data.Probes) == 0 { + return []sdk.CheckState{emptyCaseState("tls.chain_validity.no_endpoints")} + } + + var out []sdk.CheckState + any := false + for _, ref := range sortedRefs(data) { + p := data.Probes[ref] + if p.ChainValid == nil { + continue + } + any = true + if *p.ChainValid { + continue + } + msg := "Invalid certificate chain" + if p.ChainVerifyErr != "" { + msg = "Invalid certificate chain: " + p.ChainVerifyErr + } + out = append(out, sdk.CheckState{ + Status: sdk.StatusCrit, + Code: "tls.chain_validity.invalid", + Subject: subjectOf(p), + Message: msg, + Meta: metaOf(p), + }) + } + if !any { + return []sdk.CheckState{unknownState( + "tls.chain_validity.skipped", + "No endpoint yielded a certificate chain to verify.", + )} + } + if len(out) == 0 { + return []sdk.CheckState{passState( + "tls.chain_validity.ok", + "Every presented chain validates against the system trust store.", + )} + } + return out +} + +// hostnameMatchRule flags endpoints whose leaf cert does not cover the SNI +// the probe used. +type hostnameMatchRule struct{} + +func (r *hostnameMatchRule) Name() string { return "tls.hostname_match" } +func (r *hostnameMatchRule) Description() string { + return "Verifies the leaf certificate covers the probed hostname (SNI)." 
+} + +func (r *hostnameMatchRule) Evaluate(ctx context.Context, obs sdk.ObservationGetter, _ sdk.CheckerOptions) []sdk.CheckState { + data, errSt := loadData(ctx, obs) + if errSt != nil { + return []sdk.CheckState{*errSt} + } + if len(data.Probes) == 0 { + return []sdk.CheckState{emptyCaseState("tls.hostname_match.no_endpoints")} + } + + var out []sdk.CheckState + any := false + for _, ref := range sortedRefs(data) { + p := data.Probes[ref] + if p.HostnameMatch == nil { + continue + } + any = true + if *p.HostnameMatch { + continue + } + out = append(out, sdk.CheckState{ + Status: sdk.StatusCrit, + Code: "tls.hostname_match.mismatch", + Subject: subjectOf(p), + Message: fmt.Sprintf("Certificate does not cover %q (SANs: %s)", p.SNI, strings.Join(p.DNSNames, ", ")), + Meta: metaOf(p), + }) + } + if !any { + return []sdk.CheckState{unknownState( + "tls.hostname_match.skipped", + "No endpoint yielded a certificate to hostname-match.", + )} + } + if len(out) == 0 { + return []sdk.CheckState{passState( + "tls.hostname_match.ok", + "Every certificate covers its probed SNI.", + )} + } + return out +} + +// expiryRule flags expired or near-expiry certificates. +type expiryRule struct{} + +func (r *expiryRule) Name() string { return "tls.expiry" } +func (r *expiryRule) Description() string { + return "Flags expired or soon-to-expire leaf certificates." 
+} + +func (r *expiryRule) Evaluate(ctx context.Context, obs sdk.ObservationGetter, _ sdk.CheckerOptions) []sdk.CheckState { + data, errSt := loadData(ctx, obs) + if errSt != nil { + return []sdk.CheckState{*errSt} + } + if len(data.Probes) == 0 { + return []sdk.CheckState{emptyCaseState("tls.expiry.no_endpoints")} + } + + now := time.Now() + var out []sdk.CheckState + any := false + for _, ref := range sortedRefs(data) { + p := data.Probes[ref] + if p.NotAfter.IsZero() { + continue + } + any = true + meta := metaOf(p) + meta["not_after"] = p.NotAfter + if p.NotAfter.Before(now) { + out = append(out, sdk.CheckState{ + Status: sdk.StatusCrit, + Code: "tls.expiry.expired", + Subject: subjectOf(p), + Message: "Certificate expired on " + p.NotAfter.Format(time.RFC3339), + Meta: meta, + }) + continue + } + if p.NotAfter.Sub(now) < ExpiringSoonThreshold { + out = append(out, sdk.CheckState{ + Status: sdk.StatusWarn, + Code: "tls.expiry.expiring_soon", + Subject: subjectOf(p), + Message: "Certificate expires in less than 14 days (" + p.NotAfter.Format(time.RFC3339) + ")", + Meta: meta, + }) + } + } + if !any { + return []sdk.CheckState{unknownState( + "tls.expiry.skipped", + "No endpoint yielded a certificate with an expiry to check.", + )} + } + if len(out) == 0 { + return []sdk.CheckState{passState( + "tls.expiry.ok", + "Every leaf certificate is valid for more than 14 days.", + )} + } + return out +} diff --git a/checker/rules_discovery.go b/checker/rules_discovery.go new file mode 100644 index 0000000..738b235 --- /dev/null +++ b/checker/rules_discovery.go @@ -0,0 +1,34 @@ +package checker + +import ( + "context" + + sdk "git.happydns.org/checker-sdk-go/checker" +) + +// endpointsDiscoveredRule surfaces the "no producer has published endpoints +// for this target yet" steady state. Kept as its own rule so it does not +// contaminate per-endpoint findings when discovery is in flight. 
+type endpointsDiscoveredRule struct{} + +func (r *endpointsDiscoveredRule) Name() string { return "tls.endpoints_discovered" } +func (r *endpointsDiscoveredRule) Description() string { + return "Verifies that at least one TLS endpoint has been discovered for this target." +} + +func (r *endpointsDiscoveredRule) Evaluate(ctx context.Context, obs sdk.ObservationGetter, _ sdk.CheckerOptions) []sdk.CheckState { + data, errSt := loadData(ctx, obs) + if errSt != nil { + return []sdk.CheckState{*errSt} + } + if len(data.Probes) == 0 { + return []sdk.CheckState{unknownState( + "tls.endpoints_discovered.none", + "No TLS endpoints have been discovered for this target yet.", + )} + } + return []sdk.CheckState{passState( + "tls.endpoints_discovered.ok", + "TLS endpoints were discovered for this target.", + )} +} diff --git a/checker/rules_handshake.go b/checker/rules_handshake.go new file mode 100644 index 0000000..2ee343c --- /dev/null +++ b/checker/rules_handshake.go @@ -0,0 +1,60 @@ +package checker + +import ( + "context" + "fmt" + + sdk "git.happydns.org/checker-sdk-go/checker" +) + +// tlsHandshakeRule flags reachable endpoints on which the TLS handshake +// failed. STARTTLS-specific shortfalls (server not advertising the upgrade) +// are surfaced by starttlsAdvertisedRule / starttlsSupportedRule instead, +// so this rule skips them. +type tlsHandshakeRule struct{} + +func (r *tlsHandshakeRule) Name() string { return "tls.handshake" } +func (r *tlsHandshakeRule) Description() string { + return "Verifies the TLS handshake completes on every reachable endpoint." 
+} + +func (r *tlsHandshakeRule) Evaluate(ctx context.Context, obs sdk.ObservationGetter, _ sdk.CheckerOptions) []sdk.CheckState { + data, errSt := loadData(ctx, obs) + if errSt != nil { + return []sdk.CheckState{*errSt} + } + if len(data.Probes) == 0 { + return []sdk.CheckState{emptyCaseState("tls.handshake.no_endpoints")} + } + + var out []sdk.CheckState + for _, ref := range sortedRefs(data) { + p := data.Probes[ref] + if p.TCPError != "" { + continue // reachability covers this. + } + if p.STARTTLSNotOffered || p.STARTTLSUnsupportedProto { + continue // starttls-specific rules cover these. + } + if p.TLSHandshakeOK { + continue + } + if p.HandshakeError == "" { + continue + } + out = append(out, sdk.CheckState{ + Status: sdk.StatusCrit, + Code: "tls.handshake.failed", + Subject: subjectOf(p), + Message: fmt.Sprintf("TLS handshake failed on %s: %s", p.Endpoint, p.HandshakeError), + Meta: metaOf(p), + }) + } + if len(out) == 0 { + return []sdk.CheckState{passState( + "tls.handshake.ok", + "TLS handshake succeeded on every reachable endpoint.", + )} + } + return out +} diff --git a/checker/rules_protocol.go b/checker/rules_protocol.go new file mode 100644 index 0000000..f9e24ad --- /dev/null +++ b/checker/rules_protocol.go @@ -0,0 +1,105 @@ +package checker + +import ( + "context" + "crypto/tls" + "fmt" + + sdk "git.happydns.org/checker-sdk-go/checker" +) + +// tlsVersionRule flags endpoints negotiating a protocol version below the +// recommended TLS 1.2 floor. +type tlsVersionRule struct{} + +func (r *tlsVersionRule) Name() string { return "tls.version" } +func (r *tlsVersionRule) Description() string { + return "Flags endpoints negotiating a TLS version below the recommended TLS 1.2." 
+} + +func (r *tlsVersionRule) Evaluate(ctx context.Context, obs sdk.ObservationGetter, _ sdk.CheckerOptions) []sdk.CheckState { + data, errSt := loadData(ctx, obs) + if errSt != nil { + return []sdk.CheckState{*errSt} + } + if len(data.Probes) == 0 { + return []sdk.CheckState{emptyCaseState("tls.version.no_endpoints")} + } + + var out []sdk.CheckState + any := false + for _, ref := range sortedRefs(data) { + p := data.Probes[ref] + if p.TLSVersionNum == 0 { + continue + } + any = true + if p.TLSVersionNum >= tls.VersionTLS12 { + continue + } + out = append(out, sdk.CheckState{ + Status: sdk.StatusWarn, + Code: "tls.version.weak", + Subject: subjectOf(p), + Message: fmt.Sprintf("Negotiated TLS version %s is below the recommended TLS 1.2.", p.TLSVersion), + Meta: metaOf(p), + }) + } + if !any { + return []sdk.CheckState{unknownState( + "tls.version.skipped", + "No endpoint completed a TLS handshake.", + )} + } + if len(out) == 0 { + return []sdk.CheckState{passState( + "tls.version.ok", + "Every endpoint negotiates TLS 1.2 or higher.", + )} + } + return out +} + +// cipherSuiteRule reports the negotiated cipher suite for visibility. +// It does not currently classify suites as weak/strong: go's crypto/tls +// refuses to negotiate the known-weak suites anyway. The rule exists so the +// UI can expose the suite in the passing-list rather than leaving it buried +// in the raw observation. +type cipherSuiteRule struct{} + +func (r *cipherSuiteRule) Name() string { return "tls.cipher_suite" } +func (r *cipherSuiteRule) Description() string { + return "Reports the cipher suite negotiated on each endpoint." 
+} + +func (r *cipherSuiteRule) Evaluate(ctx context.Context, obs sdk.ObservationGetter, _ sdk.CheckerOptions) []sdk.CheckState { + data, errSt := loadData(ctx, obs) + if errSt != nil { + return []sdk.CheckState{*errSt} + } + if len(data.Probes) == 0 { + return []sdk.CheckState{emptyCaseState("tls.cipher_suite.no_endpoints")} + } + + var out []sdk.CheckState + for _, ref := range sortedRefs(data) { + p := data.Probes[ref] + if p.CipherSuite == "" { + continue + } + out = append(out, sdk.CheckState{ + Status: sdk.StatusInfo, + Code: "tls.cipher_suite.negotiated", + Subject: subjectOf(p), + Message: fmt.Sprintf("Cipher suite %s negotiated.", p.CipherSuite), + Meta: metaOf(p), + }) + } + if len(out) == 0 { + return []sdk.CheckState{unknownState( + "tls.cipher_suite.skipped", + "No endpoint completed a TLS handshake.", + )} + } + return out +} diff --git a/checker/rules_reachability.go b/checker/rules_reachability.go new file mode 100644 index 0000000..6540059 --- /dev/null +++ b/checker/rules_reachability.go @@ -0,0 +1,48 @@ +package checker + +import ( + "context" + "fmt" + + sdk "git.happydns.org/checker-sdk-go/checker" +) + +// reachabilityRule flags endpoints that did not accept a TCP connection. +type reachabilityRule struct{} + +func (r *reachabilityRule) Name() string { return "tls.reachability" } +func (r *reachabilityRule) Description() string { + return "Verifies that every discovered TLS endpoint accepts a TCP connection." 
+} + +func (r *reachabilityRule) Evaluate(ctx context.Context, obs sdk.ObservationGetter, _ sdk.CheckerOptions) []sdk.CheckState { + data, errSt := loadData(ctx, obs) + if errSt != nil { + return []sdk.CheckState{*errSt} + } + if len(data.Probes) == 0 { + return []sdk.CheckState{emptyCaseState("tls.reachability.no_endpoints")} + } + + var out []sdk.CheckState + for _, ref := range sortedRefs(data) { + p := data.Probes[ref] + if p.TCPError == "" { + continue + } + out = append(out, sdk.CheckState{ + Status: sdk.StatusCrit, + Code: "tls.reachability.tcp_unreachable", + Subject: subjectOf(p), + Message: fmt.Sprintf("Cannot open TCP connection to %s: %s", p.Endpoint, p.TCPError), + Meta: metaOf(p), + }) + } + if len(out) == 0 { + return []sdk.CheckState{passState( + "tls.reachability.ok", + "All discovered endpoints accepted a TCP connection.", + )} + } + return out +} diff --git a/checker/rules_starttls.go b/checker/rules_starttls.go new file mode 100644 index 0000000..cea5cdc --- /dev/null +++ b/checker/rules_starttls.go @@ -0,0 +1,108 @@ +package checker + +import ( + "context" + "fmt" + + sdk "git.happydns.org/checker-sdk-go/checker" +) + +// starttlsAdvertisedRule flags STARTTLS endpoints whose server did not +// advertise the upgrade. Severity depends on RequireSTARTTLS: opportunistic +// STARTTLS degrades to a warning; mandatory STARTTLS is critical. +type starttlsAdvertisedRule struct{} + +func (r *starttlsAdvertisedRule) Name() string { return "tls.starttls_advertised" } +func (r *starttlsAdvertisedRule) Description() string { + return "Verifies that STARTTLS endpoints advertise the upgrade capability." 
+} + +func (r *starttlsAdvertisedRule) Evaluate(ctx context.Context, obs sdk.ObservationGetter, _ sdk.CheckerOptions) []sdk.CheckState { + data, errSt := loadData(ctx, obs) + if errSt != nil { + return []sdk.CheckState{*errSt} + } + if len(data.Probes) == 0 { + return []sdk.CheckState{emptyCaseState("tls.starttls_advertised.no_endpoints")} + } + + var out []sdk.CheckState + anySTARTTLS := false + for _, ref := range sortedRefs(data) { + p := data.Probes[ref] + if p.STARTTLSDialect == "" { + continue + } + anySTARTTLS = true + if !p.STARTTLSNotOffered { + continue + } + status := sdk.StatusWarn + if p.RequireSTARTTLS { + status = sdk.StatusCrit + } + out = append(out, sdk.CheckState{ + Status: status, + Code: "tls.starttls_advertised.missing", + Subject: subjectOf(p), + Message: fmt.Sprintf("Server on %s does not advertise STARTTLS.", p.Endpoint), + Meta: metaOf(p), + }) + } + if !anySTARTTLS { + return []sdk.CheckState{unknownState( + "tls.starttls_advertised.not_applicable", + "No STARTTLS endpoint in the discovered set.", + )} + } + if len(out) == 0 { + return []sdk.CheckState{passState( + "tls.starttls_advertised.ok", + "STARTTLS is advertised on every STARTTLS endpoint.", + )} + } + return out +} + +// starttlsSupportedRule flags endpoints whose STARTTLS dialect is not +// implemented by this checker. A misconfigured discovery entry (typo, new +// protocol) should be visible as its own concern rather than blending into +// generic handshake failures. +type starttlsSupportedRule struct{} + +func (r *starttlsSupportedRule) Name() string { return "tls.starttls_dialect_supported" } +func (r *starttlsSupportedRule) Description() string { + return "Verifies that discovered STARTTLS dialects are implemented by the checker." 
+} + +func (r *starttlsSupportedRule) Evaluate(ctx context.Context, obs sdk.ObservationGetter, _ sdk.CheckerOptions) []sdk.CheckState { + data, errSt := loadData(ctx, obs) + if errSt != nil { + return []sdk.CheckState{*errSt} + } + if len(data.Probes) == 0 { + return []sdk.CheckState{emptyCaseState("tls.starttls_dialect_supported.no_endpoints")} + } + + var out []sdk.CheckState + for _, ref := range sortedRefs(data) { + p := data.Probes[ref] + if !p.STARTTLSUnsupportedProto { + continue + } + out = append(out, sdk.CheckState{ + Status: sdk.StatusError, + Code: "tls.starttls_dialect_supported.unknown", + Subject: subjectOf(p), + Message: fmt.Sprintf("Unsupported STARTTLS dialect %q for %s.", p.STARTTLSDialect, p.Endpoint), + Meta: metaOf(p), + }) + } + if len(out) == 0 { + return []sdk.CheckState{passState( + "tls.starttls_dialect_supported.ok", + "Every STARTTLS dialect encountered is implemented.", + )} + } + return out +} diff --git a/checker/types.go b/checker/types.go index 3509d23..0dfd8b3 100644 --- a/checker/types.go +++ b/checker/types.go @@ -22,33 +22,67 @@ const ( MaxConcurrentProbes = 32 ) -// Severity values used in Issue.Severity (lowercase, ascii). -const ( - SeverityCrit = "crit" - SeverityWarn = "warn" - SeverityInfo = "info" -) - // TLSData is the full collected payload written under ObservationKeyTLSProbes. type TLSData struct { Probes map[string]TLSProbe `json:"probes"` CollectedAt time.Time `json:"collected_at"` } -// TLSProbe captures the outcome of probing a single endpoint. Field names -// mirror what consumers already parse (checker-xmpp's tlsProbeView). +// TLSProbe captures the outcome of probing a single endpoint. +// +// Only raw observation fields live here. Judgement (severity, pass/fail, +// human-facing messages) is derived from these fields by CheckRules. 
type TLSProbe struct { - Host string `json:"host"` - Port uint16 `json:"port"` - Endpoint string `json:"endpoint"` - Type string `json:"type"` - SNI string `json:"sni,omitempty"` - TLSVersion string `json:"tls_version,omitempty"` - CipherSuite string `json:"cipher_suite,omitempty"` - HostnameMatch *bool `json:"hostname_match,omitempty"` - ChainValid *bool `json:"chain_valid,omitempty"` - NotAfter time.Time `json:"not_after,omitempty"` - Issuer string `json:"issuer,omitempty"` + Host string `json:"host"` + Port uint16 `json:"port"` + Endpoint string `json:"endpoint"` + Type string `json:"type"` + SNI string `json:"sni,omitempty"` + + // RequireSTARTTLS is copied from the discovery entry so rules can tell + // whether a missing STARTTLS advertisement is a hard or soft failure. + RequireSTARTTLS bool `json:"require_starttls,omitempty"` + + // STARTTLSDialect mirrors contract.TLSEndpoint.STARTTLS verbatim. An + // empty value means direct TLS. + STARTTLSDialect string `json:"starttls_dialect,omitempty"` + + // Raw error strings. Exactly one of TCPError or HandshakeError is set + // when the probe failed before gathering handshake data. + TCPError string `json:"tcp_error,omitempty"` + HandshakeError string `json:"handshake_error,omitempty"` + + // STARTTLSNotOffered is true when HandshakeError was produced because + // the server did not advertise STARTTLS (errStartTLSNotOffered). + STARTTLSNotOffered bool `json:"starttls_not_offered,omitempty"` + + // STARTTLSUnsupportedProto is true when the STARTTLS dialect is not + // implemented by this checker. + STARTTLSUnsupportedProto bool `json:"starttls_unsupported_proto,omitempty"` + + // TLSHandshakeOK is true when a TLS handshake completed. It is + // independent from chain validity. + TLSHandshakeOK bool `json:"tls_handshake_ok,omitempty"` + + // TLSVersionNum is the numeric TLS version negotiated (uint16 from + // crypto/tls). Zero means no handshake occurred. 
Kept as an unsigned + // integer so rules can compare against tls.VersionTLS12 without + // re-parsing a string. + TLSVersionNum uint16 `json:"tls_version_num,omitempty"` + + TLSVersion string `json:"tls_version,omitempty"` + CipherSuite string `json:"cipher_suite,omitempty"` + CipherSuiteID uint16 `json:"cipher_suite_id,omitempty"` + + // NoPeerCert is true when the handshake succeeded but the server sent + // no certificate. + NoPeerCert bool `json:"no_peer_cert,omitempty"` + + HostnameMatch *bool `json:"hostname_match,omitempty"` + ChainValid *bool `json:"chain_valid,omitempty"` + ChainVerifyErr string `json:"chain_verify_err,omitempty"` + NotAfter time.Time `json:"not_after,omitempty"` + Issuer string `json:"issuer,omitempty"` // IssuerDN is the leaf's issuer as an RFC 2253 DN string, suitable for // matching the CCADB CAA Identifiers CSV "Subject" column when the AKI // lookup misses. @@ -65,8 +99,11 @@ type TLSProbe struct { // DANE consumers can match without re-handshaking or re-parsing. Chain []CertInfo `json:"chain,omitempty"` ElapsedMS int64 `json:"elapsed_ms,omitempty"` - Error string `json:"error,omitempty"` - Issues []Issue `json:"issues,omitempty"` + + // Error is a compatibility summary of whichever raw error applies. + // Left for any external consumer still inspecting it; rules should + // look at TCPError / HandshakeError instead. + Error string `json:"error,omitempty"` } // CertInfo describes one certificate in the presented chain together with @@ -101,10 +138,7 @@ type CertInfo struct { SPKIDERBase64 string `json:"spki_der_base64,omitempty"` } -// Issue is a single TLS finding surfaced to the consumer. -type Issue struct { - Code string `json:"code"` - Severity string `json:"severity"` - Message string `json:"message,omitempty"` - Fix string `json:"fix,omitempty"` -} +// Expiry thresholds shared by rules. 
+const ( + ExpiringSoonThreshold = 14 * 24 * time.Hour +) diff --git a/contract/contract.go b/contract/contract.go index bc038b0..52f1be1 100644 --- a/contract/contract.go +++ b/contract/contract.go @@ -123,7 +123,7 @@ type Entry struct { } // ParseEntries filters entries to those of Type and decodes each payload. -// Entries of other types are ignored silently — they belong to other +// Entries of other types are ignored silently; they belong to other // contracts. Entries of this type whose Payload fails to unmarshal are // skipped and returned as warnings so a single malformed payload cannot // starve the checker of the rest of its workload.