checker-srv/checker/collect.go
Pierre-Olivier Mercier 90f1b4943f Initial commit
Generic SRV records checker for happyDomain.

For each SRV record attached to an svcs.UnknownSRV service, the checker
resolves every target and probes reachability:

  - DNS resolution (A/AAAA), CNAME detection (RFC 2782 violation),
    null-target detection (RFC 2782 "service explicitly unavailable")
  - TCP connect to target:port for _tcp SRVs
  - UDP probe for _udp SRVs, using ICMP port-unreachable detection

The checker also publishes TLS endpoints (host, port, SNI) for every
SRV target hitting a well-known direct-TLS port (443, 465, 636, 853,
993, 995, 5061, 5223, …) via the EndpointDiscoverer SDK interface, so
a downstream TLS checker can pick them up.

The HTML report groups records as cards and surfaces the most common
failure scenarios (DNS failure, CNAME target, TCP unreachable,
null-target) at the top with remediation guidance.
2026-04-26 18:17:38 +07:00

228 lines
6.8 KiB
Go

// This file is part of the happyDomain (R) project.
// Copyright (c) 2020-2026 happyDomain
// Authors: Pierre-Olivier Mercier, et al.
//
// This program is offered under a commercial and under the AGPL license.
// For commercial licensing, contact us at <contact@happydomain.org>.
//
// For AGPL licensing:
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU Affero General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Affero General Public License for more details.
//
// You should have received a copy of the GNU Affero General Public License
// along with this program. If not, see <https://www.gnu.org/licenses/>.
package checker
import (
"context"
"encoding/json"
"fmt"
"net"
"strconv"
"strings"
"time"
sdk "git.happydns.org/checker-sdk-go/checker"
happydns "git.happydns.org/happyDomain/model"
)
// unknownSRVPayload mirrors just the parts of the JSON-encoded UnknownSRV
// service body that this checker reads. The records are decoded by hand
// (rather than through miekg/dns) to keep the checker light and its build
// surface minimal.
type unknownSRVPayload struct {
	// Records holds one entry per SRV record found under the "srv" key.
	Records []struct {
		// Hdr carries the resource-record header; only the owner name is kept.
		Hdr struct {
			Name string `json:"Name"`
		} `json:"Hdr"`

		// SRV RDATA fields.
		Priority uint16 `json:"Priority"`
		Weight   uint16 `json:"Weight"`
		Port     uint16 `json:"Port"`
		Target   string `json:"Target"`
	} `json:"srv"`
}
// Collect builds the SRV observation data for the service passed in opts.
//
// When opts carries no "service" option the call is delegated to
// p.collectFallback. Otherwise the service must be of type svcs.UnknownSRV
// and hold at least one SRV record; every record is normalised (trailing dots
// stripped), then resolved and probed in turn through resolveAndProbe.
//
// The "subdomain" and "domain" options are joined into the service domain
// used to parse record owners. "tcpTimeout" and "udpTimeout" are read through
// durationOpt with defaults of 3000 and 2000 and bound each probe.
//
// On success the returned value is a *SRVData. An error is returned for an
// unexpected service type, an undecodable payload or an empty record list.
func (p *srvProvider) Collect(ctx context.Context, opts sdk.CheckerOptions) (any, error) {
	svcMsg, found := sdk.GetOption[happydns.ServiceMessage](opts, "service")
	if !found {
		return p.collectFallback(ctx, opts)
	}
	if svcMsg.Type != "svcs.UnknownSRV" {
		return nil, fmt.Errorf("service type is %q, expected svcs.UnknownSRV", svcMsg.Type)
	}

	var decoded unknownSRVPayload
	err := json.Unmarshal(svcMsg.Service, &decoded)
	if err != nil {
		return nil, fmt.Errorf("failed to decode UnknownSRV: %w", err)
	}
	count := len(decoded.Records)
	if count == 0 {
		return nil, fmt.Errorf("service contains no SRV records")
	}

	// Join "<subdomain>.<domain>", tolerating either half being empty.
	subdomain, _ := sdk.GetOption[string](opts, "subdomain")
	domain, _ := sdk.GetOption[string](opts, "domain")
	serviceDomain := strings.TrimSuffix(subdomain, ".")
	if domain != "" {
		zone := strings.TrimSuffix(domain, ".")
		if serviceDomain == "" {
			serviceDomain = zone
		} else {
			serviceDomain = serviceDomain + "." + zone
		}
	}

	tcpTimeout := durationOpt(opts, "tcpTimeout", 3000)
	udpTimeout := durationOpt(opts, "udpTimeout", 2000)

	result := &SRVData{
		ServiceDomain: serviceDomain,
		Records:       make([]SRVRecord, 0, count),
	}
	for i := range decoded.Records {
		raw := &decoded.Records[i]
		ownerName := strings.TrimSuffix(raw.Hdr.Name, ".")
		service, protocol := parseOwner(ownerName, serviceDomain)

		entry := SRVRecord{
			Service:  service,
			Proto:    protocol,
			Owner:    ownerName,
			Target:   strings.TrimSuffix(raw.Target, "."),
			Port:     raw.Port,
			Priority: raw.Priority,
			Weight:   raw.Weight,
		}
		resolveAndProbe(ctx, &entry, tcpTimeout, udpTimeout)
		result.Records = append(result.Records, entry)
	}
	return result, nil
}
// resolveAndProbe fills rec with the outcome of resolving its target and
// probing every resolved address on the record's port.
//
// Steps, in order:
//   - an empty or "." target is flagged as a null target (RFC 2782: service
//     decidedly not available) and nothing else is attempted;
//   - a CNAME lookup flags targets whose canonical name differs from the
//     target itself (RFC 2782 requires the target to resolve to address
//     records directly); a failing CNAME lookup is ignored;
//   - an address lookup either records ResolveError and stops, or appends
//     every address to rec.Addresses;
//   - each address in rec.Addresses is probed with probeTCP or probeUDP
//     according to rec.Proto.
//
// Records whose owner did not match _svc._proto carry no trustworthy proto,
// so they are deliberately not probed rather than defaulted to TCP, which
// would report a misleading status.
func resolveAndProbe(ctx context.Context, rec *SRVRecord, tcpTimeout, udpTimeout time.Duration) {
	target := rec.Target
	switch target {
	case "", ".":
		rec.IsNullTarget = true
		return
	}

	canonical, cnameErr := net.DefaultResolver.LookupCNAME(ctx, target)
	if cnameErr == nil {
		canonical = strings.TrimSuffix(canonical, ".")
		if canonical != "" && !strings.EqualFold(canonical, target) {
			rec.IsCNAME = true
			rec.CNAMEChain = []string{target, canonical}
		}
	}

	resolved, lookupErr := net.DefaultResolver.LookupIPAddr(ctx, target)
	if lookupErr != nil {
		rec.ResolveError = lookupErr.Error()
		return
	}
	for i := range resolved {
		rec.Addresses = append(rec.Addresses, resolved[i].IP.String())
	}

	// The port is the same for every address of the record.
	port := strconv.Itoa(int(rec.Port))
	for _, ip := range rec.Addresses {
		endpoint := net.JoinHostPort(ip, port)
		if rec.Proto == protoTCP {
			rec.Probes = append(rec.Probes, probeTCP(ctx, endpoint, tcpTimeout))
		} else if rec.Proto == protoUDP {
			rec.Probes = append(rec.Probes, probeUDP(ctx, endpoint, udpTimeout))
		}
	}
}
// parseOwner extracts the service and protocol names from an SRV owner name
// of the form _service._proto[.serviceDomain].
//
// owner is expected without a trailing dot. A trailing "."+serviceDomain is
// stripped first, compared case-insensitively because DNS names are
// case-insensitive. The first two remaining labels must both start with an
// underscore; the underscore is removed from each.
//
// RFC 2782 declares both Service and Proto case-insensitive, so the two
// names are returned lowercased. Without that, an owner such as
// "_sip._TCP.example.com" would yield proto "TCP", which would not match the
// lowercase network names the probes hand to net.Dialer, and the record
// would silently go unprobed.
//
// Returns ("", "") when the owner does not match: callers must treat
// that as "unknown" and skip proto-specific probing rather than guessing.
func parseOwner(owner, serviceDomain string) (svc, proto string) {
	name := owner
	suffix := "." + serviceDomain
	if cut := len(name) - len(suffix); cut >= 0 && strings.EqualFold(name[cut:], suffix) {
		name = name[:cut]
	}

	// Only the two leading labels matter; anything after stays in labels[2].
	labels := strings.SplitN(name, ".", 3)
	if len(labels) < 2 || !strings.HasPrefix(labels[0], "_") || !strings.HasPrefix(labels[1], "_") {
		return "", ""
	}
	return strings.ToLower(labels[0][1:]), strings.ToLower(labels[1][1:])
}
// durationOpt reads a timeout expressed in milliseconds from opts[key] and
// returns it as a time.Duration clamped to the range [100ms, 60s].
//
// Only float64 and int values are recognised; a missing key or a value of
// any other type yields defMs (itself subject to the same clamping).
//
// float64 values are clamped before being converted to int: converting a
// float that does not fit in an int (or NaN) is implementation-dependent in
// Go, and could otherwise turn an oversized timeout into the minimum one.
// NaN keeps the default.
func durationOpt(opts sdk.CheckerOptions, key string, defMs int) time.Duration {
	const (
		minMs = 100
		maxMs = 60000
	)

	ms := defMs
	if v, ok := opts[key]; ok {
		switch n := v.(type) {
		case float64:
			switch {
			case n >= minMs && n <= maxMs:
				ms = int(n)
			case n < minMs:
				ms = minMs
			case n > maxMs:
				ms = maxMs
			}
			// NaN satisfies none of the comparisons above: ms stays defMs.
		case int:
			ms = n
		}
	}

	if ms < minMs {
		ms = minMs
	}
	if ms > maxMs {
		ms = maxMs
	}
	return time.Duration(ms) * time.Millisecond
}
// probeTCP attempts a single TCP connection to hostport, bounded by timeout
// (and by ctx), and reports the outcome.
//
// LatencyMs is the time from just before the dial until it returned, in
// milliseconds with microsecond resolution, and is filled in whether or not
// the dial succeeded. On failure Error holds the dial error text; on success
// the connection is closed immediately and Connected is set.
func probeTCP(ctx context.Context, hostport string, timeout time.Duration) ProbeResult {
	began := time.Now()

	dialCtx, stop := context.WithTimeout(ctx, timeout)
	defer stop()

	var dialer net.Dialer
	conn, dialErr := dialer.DialContext(dialCtx, protoTCP, hostport)

	result := ProbeResult{
		Address:   hostport,
		Proto:     protoTCP,
		LatencyMs: float64(time.Since(began).Microseconds()) / 1000.0,
	}
	if dialErr != nil {
		result.Error = dialErr.Error()
		return result
	}

	// Only reachability matters; the close error carries no information.
	_ = conn.Close()
	result.Connected = true
	return result
}
// probeUDP sends one zero byte to hostport over UDP and waits up to timeout
// for anything to come back.
//
// Outcomes:
//   - dial or write failure: Error is set, Connected stays false;
//   - a reply is read: Connected is set;
//   - the read times out: Connected is set and Error carries an explanatory
//     note, because a silent drop (firewall) cannot be told apart from a
//     service that simply does not answer this datagram;
//   - any other read error (for instance the "connection refused" raised
//     after an ICMP port-unreachable): Error is set, Connected stays false.
func probeUDP(ctx context.Context, hostport string, timeout time.Duration) ProbeResult {
	result := ProbeResult{Address: hostport, Proto: protoUDP}

	dialCtx, stop := context.WithTimeout(ctx, timeout)
	defer stop()

	var dialer net.Dialer
	conn, dialErr := dialer.DialContext(dialCtx, protoUDP, hostport)
	if dialErr != nil {
		result.Error = dialErr.Error()
		return result
	}
	defer conn.Close()

	// One deadline covers both the write and the read that follows.
	_ = conn.SetDeadline(time.Now().Add(timeout))
	if _, writeErr := conn.Write([]byte{0}); writeErr != nil {
		result.Error = writeErr.Error()
		return result
	}

	var reply [1]byte
	_, readErr := conn.Read(reply[:])
	if readErr == nil {
		result.Connected = true
		return result
	}

	if netErr, isNetErr := readErr.(net.Error); isNetErr && netErr.Timeout() {
		// No ICMP unreachable came back: the host probably accepts UDP, or
		// the packets are silently dropped. Either way, count it reachable.
		result.Connected = true
		result.Error = "no UDP response (host may still be reachable)"
		return result
	}

	result.Error = readErr.Error()
	return result
}