checker-sip/checker/collect.go

592 lines
17 KiB
Go

package checker
import (
"bytes"
"context"
"crypto/tls"
"errors"
"fmt"
"net"
"slices"
"strconv"
"strings"
"sync"
"time"
"github.com/miekg/dns"
sdk "git.happydns.org/checker-sdk-go/checker"
)
// Collect runs the full SIP probe against a domain.
//
// It reads the "domain", "timeout", "probeUDP", "probeTCP" and "probeTLS"
// options, performs a best-effort NAPTR lookup and one SRV lookup per
// enabled transport, resolves every SRV target and probes each resulting
// address. When no SRV record is found at all, it falls back to probing the
// bare domain on port 5060 (UDP/TCP) and 5061 (TLS).
//
// The returned value is a *SIPData holding the raw lookups, the endpoint
// probes (always ordered UDP, then TCP, then TLS), the coverage summary and
// the derived issues. The only error returned is for a missing domain; DNS
// and probe failures are recorded inside the returned data instead.
func (p *sipProvider) Collect(ctx context.Context, opts sdk.CheckerOptions) (any, error) {
	domain, _ := sdk.GetOption[string](opts, "domain")
	domain = strings.TrimSuffix(strings.TrimSpace(domain), ".")
	if domain == "" {
		return nil, fmt.Errorf("domain is required")
	}

	// Per-endpoint timeout in seconds; anything below one second is
	// replaced by 5.
	timeoutSecs := sdk.GetFloatOption(opts, "timeout", 5)
	if timeoutSecs < 1 {
		timeoutSecs = 5
	}
	perEndpoint := time.Duration(timeoutSecs * float64(time.Second))

	probeUDP := sdk.GetBoolOption(opts, "probeUDP", true)
	probeTCP := sdk.GetBoolOption(opts, "probeTCP", true)
	probeTLS := sdk.GetBoolOption(opts, "probeTLS", true)

	data := &SIPData{
		Domain: domain,
		RunAt:  time.Now().UTC().Format(time.RFC3339),
		SRV:    SRVLookup{Errors: map[string]string{}},
	}
	resolver := net.DefaultResolver

	// NAPTR lookup — best-effort, failures become an info issue.
	if naptr, err := lookupNAPTR(ctx, domain); err != nil {
		data.SRV.Errors["naptr"] = err.Error()
	} else {
		data.NAPTR = naptr
	}

	// SRV lookups (per transport). Errors are kept per-prefix; "not
	// found" is normalised to nil by lookupSRV.
	type srvSet struct {
		prefix string
		want   bool
		dst    *[]SRVRecord
	}
	sets := []srvSet{
		{"_sip._udp.", probeUDP, &data.SRV.UDP},
		{"_sip._tcp.", probeTCP, &data.SRV.TCP},
		{"_sips._tcp.", probeTLS, &data.SRV.SIPS},
	}
	for _, s := range sets {
		if !s.want {
			continue
		}
		recs, err := lookupSRV(ctx, resolver, s.prefix, domain)
		if err != nil {
			data.SRV.Errors[s.prefix] = err.Error()
			continue
		}
		*s.dst = recs
	}

	// Fallback when no SRV at all: synthesize a single target on each
	// enabled transport against the bare domain.
	total := len(data.SRV.UDP) + len(data.SRV.TCP) + len(data.SRV.SIPS)
	if total == 0 {
		data.SRV.FallbackProbed = true
		if probeUDP {
			data.SRV.UDP = []SRVRecord{{Target: domain, Port: 5060}}
		}
		if probeTCP {
			data.SRV.TCP = []SRVRecord{{Target: domain, Port: 5060}}
		}
		if probeTLS {
			data.SRV.SIPS = []SRVRecord{{Target: domain, Port: 5061}}
		}
	}

	type transportJob struct {
		records []SRVRecord
		prefix  string
		t       Transport
	}
	jobs := []transportJob{
		{data.SRV.UDP, "_sip._udp.", TransportUDP},
		{data.SRV.TCP, "_sip._tcp.", TransportTCP},
		{data.SRV.SIPS, "_sips._tcp.", TransportTLS},
	}

	// One goroutine per transport. Each goroutine writes only to its own
	// slot of results, so no lock is needed, and merging the slots in job
	// order keeps data.Endpoints (and the issues derived from it) stable
	// across runs regardless of which transport finishes first.
	results := make([][]EndpointProbe, len(jobs))
	var wg sync.WaitGroup
	for i, job := range jobs {
		wg.Add(1)
		go func(slot int, j transportJob) {
			defer wg.Done()
			// j.records shares its backing array with data.SRV.*, so the
			// resolved addresses are visible in the returned data too.
			resolveAllInto(ctx, resolver, j.records)
			results[slot] = probeSet(ctx, j.prefix, j.t, j.records, perEndpoint)
		}(i, job)
	}
	wg.Wait()
	for _, eps := range results {
		data.Endpoints = append(data.Endpoints, eps...)
	}

	computeCoverage(data)
	data.Issues = deriveIssues(data, probeUDP, probeTCP, probeTLS)
	return data, nil
}
// ─── DNS ──────────────────────────────────────────────────────────────
// lookupSRV queries the SRV records published at prefix+domain using r.
//
// A "not found" DNS error and the RFC 2782 null target (a single record
// with target "." or "" and port 0) are both reported as (nil, nil). Any
// other lookup error is returned unchanged. Targets are returned without
// their trailing dot.
func lookupSRV(ctx context.Context, r *net.Resolver, prefix, domain string) ([]SRVRecord, error) {
	// With empty service and proto, LookupSRV queries the name as given.
	_, answers, err := r.LookupSRV(ctx, "", "", prefix+dns.Fqdn(domain))
	if err != nil {
		var dnsErr *net.DNSError
		if errors.As(err, &dnsErr) && dnsErr.IsNotFound {
			return nil, nil
		}
		return nil, err
	}

	// RFC 2782 null-target: "service explicitly unavailable".
	if len(answers) == 1 {
		only := answers[0]
		if only.Port == 0 && (only.Target == "." || only.Target == "") {
			return nil, nil
		}
	}

	converted := make([]SRVRecord, len(answers))
	for i, srv := range answers {
		converted[i] = SRVRecord{
			Target:   strings.TrimSuffix(srv.Target, "."),
			Port:     srv.Port,
			Priority: srv.Priority,
			Weight:   srv.Weight,
		}
	}
	return converted, nil
}
// lookupNAPTR queries the NAPTR records of domain and returns those whose
// service field starts with "SIP+" or "SIPS+" (case-insensitive).
//
// Resolvers are taken from /etc/resolv.conf; if that file cannot be read or
// lists no server, 1.1.1.1 and 8.8.8.8 on port 53 are used. Servers are
// tried in order: a transport error or a non-success rcode moves on to the
// next server, NXDOMAIN returns (nil, nil) immediately, and the first
// successful answer is returned. If every server fails, the last error is
// returned.
//
// A truncated UDP answer is retried once over TCP against the same server;
// if that retry fails, the truncated answer is used as-is.
func lookupNAPTR(ctx context.Context, domain string) ([]NAPTRRecord, error) {
	cfg, err := dns.ClientConfigFromFile("/etc/resolv.conf")
	if err != nil || cfg == nil || len(cfg.Servers) == 0 {
		cfg = &dns.ClientConfig{Servers: []string{"1.1.1.1", "8.8.8.8"}, Port: "53"}
	}

	m := new(dns.Msg)
	m.SetQuestion(dns.Fqdn(domain), dns.TypeNAPTR)
	m.RecursionDesired = true

	const perQuery = 3 * time.Second
	udp := &dns.Client{Timeout: perQuery}
	tcp := &dns.Client{Net: "tcp", Timeout: perQuery}

	var lastErr error
	for _, srv := range cfg.Servers {
		addr := net.JoinHostPort(srv, cfg.Port)
		in, _, err := udp.ExchangeContext(ctx, m, addr)
		if err != nil {
			lastErr = err
			continue
		}
		// The client does not fall back to TCP on its own, and NAPTR sets
		// can exceed a UDP payload. Retry once; keep the partial answer if
		// the retry fails so the lookup stays best-effort.
		if in.Truncated {
			if full, _, tcpErr := tcp.ExchangeContext(ctx, m, addr); tcpErr == nil && full != nil {
				in = full
			}
		}
		if in.Rcode == dns.RcodeNameError {
			return nil, nil
		}
		if in.Rcode != dns.RcodeSuccess {
			lastErr = fmt.Errorf("rcode %s", dns.RcodeToString[in.Rcode])
			continue
		}

		var out []NAPTRRecord
		for _, rr := range in.Answer {
			n, ok := rr.(*dns.NAPTR)
			if !ok {
				continue
			}
			service := strings.ToUpper(n.Service)
			if !strings.HasPrefix(service, "SIP+") && !strings.HasPrefix(service, "SIPS+") {
				continue
			}
			out = append(out, NAPTRRecord{
				Service:     n.Service,
				Regexp:      n.Regexp,
				Replacement: strings.TrimSuffix(n.Replacement, "."),
				Flags:       n.Flags,
				Order:       n.Order,
				Preference:  n.Preference,
			})
		}
		return out, nil
	}
	return nil, lastErr
}
// resolveAllInto resolves the Target of every record with r and appends the
// resulting addresses to that record's IPv4 and IPv6 lists, in place.
// Records whose lookup fails are left untouched; the error is not reported.
func resolveAllInto(ctx context.Context, r *net.Resolver, records []SRVRecord) {
	for i := range records {
		rec := &records[i]
		found, err := r.LookupIPAddr(ctx, rec.Target)
		if err != nil {
			continue
		}
		for _, addr := range found {
			v4 := addr.IP.To4()
			if v4 == nil {
				rec.IPv6 = append(rec.IPv6, addr.IP.String())
				continue
			}
			rec.IPv4 = append(rec.IPv4, v4.String())
		}
	}
}
// ─── Probing ──────────────────────────────────────────────────────────
// probeSet probes every resolved address of every record over transport t
// and returns one EndpointProbe per address, in record order. A record with
// no resolved address yields a single placeholder probe whose Error is
// "no A/AAAA records for target". The probes run sequentially, each with
// the given timeout.
func probeSet(ctx context.Context, prefix string, t Transport, records []SRVRecord, timeout time.Duration) []EndpointProbe {
	var probes []EndpointProbe
	for _, rec := range records {
		candidates := allAddrs(rec)
		if len(candidates) > 0 {
			for _, addr := range candidates {
				probes = append(probes, probeEndpoint(ctx, t, prefix, rec, addr, timeout))
			}
			continue
		}
		// Nothing to connect to: record the unresolved target instead.
		probes = append(probes, EndpointProbe{
			Transport: t,
			SRVPrefix: prefix,
			Target:    rec.Target,
			Port:      rec.Port,
			Error:     "no A/AAAA records for target",
		})
	}
	return probes
}
// probeAddr is a single IP address to probe, tagged with its address family.
type probeAddr struct {
// ip is the textual IP address, without a port.
ip string
// isV6 is true when the address was taken from a record's IPv6 list.
isV6 bool
}
// allAddrs flattens the resolved addresses of r into one list, with all
// IPv4 entries first and all IPv6 entries after them.
func allAddrs(r SRVRecord) []probeAddr {
	v4Count := len(r.IPv4)
	addrs := make([]probeAddr, v4Count+len(r.IPv6))
	for i, ip := range r.IPv4 {
		addrs[i] = probeAddr{ip: ip}
	}
	for i, ip := range r.IPv6 {
		addrs[v4Count+i] = probeAddr{ip: ip, isV6: true}
	}
	return addrs
}
// probeEndpoint probes one address of an SRV record over transport t and
// returns the filled-in result. ElapsedMS covers the whole probe, from
// entry into this function until the transport-specific probe returns. An
// unrecognised transport performs no probe and returns only the identifying
// fields.
func probeEndpoint(ctx context.Context, t Transport, prefix string, rec SRVRecord, a probeAddr, timeout time.Duration) (ep EndpointProbe) {
	began := time.Now()

	ep.Transport = t
	ep.SRVPrefix = prefix
	ep.Target = rec.Target
	ep.Port = rec.Port
	ep.Address = net.JoinHostPort(a.ip, strconv.Itoa(int(rec.Port)))
	ep.IsIPv6 = a.isV6

	userAgent := "happyDomain-checker-sip/" + Version
	switch t {
	case TransportUDP:
		probeUDP(ctx, &ep, rec.Target, userAgent, timeout)
	case TransportTCP:
		probeTCP(ctx, &ep, rec.Target, userAgent, timeout)
	case TransportTLS:
		probeTLSConn(ctx, &ep, rec.Target, userAgent, timeout)
	}

	ep.ElapsedMS = time.Since(began).Milliseconds()
	return ep
}
// probeUDP sends one SIP OPTIONS request to ep.Address over UDP and records
// the outcome in ep. Dial, write, read and parse failures each set ep.Error
// with a stage-specific prefix; a dial failure also sets ep.ReachableErr.
// The same timeout bounds the dial and, as a single deadline, the whole
// write/read exchange.
//
// NOTE(review): ep.Reachable is set as soon as the dial succeeds. For UDP
// this likely only proves that a local socket could be created, not that a
// server is listening — confirm that consumers treat it accordingly.
func probeUDP(ctx context.Context, ep *EndpointProbe, target, ua string, timeout time.Duration) {
	dialer := net.Dialer{Timeout: timeout}
	sock, dialErr := dialer.DialContext(ctx, "udp", ep.Address)
	if dialErr != nil {
		reason := dialErr.Error()
		ep.ReachableErr = reason
		ep.Error = "udp dial: " + reason
		return
	}
	defer sock.Close()

	ep.Reachable = true
	_ = sock.SetDeadline(time.Now().Add(timeout))

	options := buildOptionsRequest(target, ep.Port, TransportUDP, localAddrFor(sock), ua)
	sentAt := time.Now()
	if _, writeErr := sock.Write([]byte(options)); writeErr != nil {
		ep.Error = "udp write: " + writeErr.Error()
		return
	}
	ep.OptionsSent = true

	// A single read: the reply is expected to fit in one datagram.
	datagram := make([]byte, 8192)
	size, readErr := sock.Read(datagram)
	if readErr != nil {
		ep.Error = "no udp response: " + readErr.Error()
		return
	}

	reply, parseErr := parseSIPResponse(bytes.NewReader(datagram[:size]))
	if parseErr != nil {
		ep.Error = "bad response: " + parseErr.Error()
		return
	}
	applyResponse(ep, reply, sentAt)
}
// probeTCP connects to ep.Address over TCP, sends one SIP OPTIONS request
// and records the outcome in ep. A dial failure sets both ep.ReachableErr
// and ep.Error; write and read/parse failures set ep.Error only. The same
// timeout bounds the dial and, as a single deadline, the whole write/read
// exchange.
func probeTCP(ctx context.Context, ep *EndpointProbe, target, ua string, timeout time.Duration) {
	dialer := net.Dialer{Timeout: timeout}
	stream, dialErr := dialer.DialContext(ctx, "tcp", ep.Address)
	if dialErr != nil {
		reason := dialErr.Error()
		ep.ReachableErr = reason
		ep.Error = "tcp dial: " + reason
		return
	}
	defer stream.Close()

	ep.Reachable = true
	_ = stream.SetDeadline(time.Now().Add(timeout))

	options := buildOptionsRequest(target, ep.Port, TransportTCP, localAddrFor(stream), ua)
	sentAt := time.Now()
	if _, writeErr := stream.Write([]byte(options)); writeErr != nil {
		ep.Error = "tcp write: " + writeErr.Error()
		return
	}
	ep.OptionsSent = true

	// The response is parsed straight off the connection.
	reply, readErr := parseSIPResponse(stream)
	if readErr != nil {
		ep.Error = "no tcp response: " + readErr.Error()
		return
	}
	applyResponse(ep, reply, sentAt)
}
// probeTLSConn connects to ep.Address over TCP, performs a TLS handshake
// (SNI set to target, certificate verification disabled), sends one SIP
// OPTIONS request and records the outcome in ep.
//
// A TCP dial failure sets ep.ReachableErr and ep.Error ("tcp dial: …"); a
// handshake failure sets ep.Error ("tls handshake: …") and leaves
// ep.Reachable false. After a successful handshake the negotiated TLS
// version and cipher suite are stored in ep.
//
// timeout is applied three times: to the dial, to the handshake, and as a
// fresh deadline to the OPTIONS write/read exchange.
func probeTLSConn(ctx context.Context, ep *EndpointProbe, target, ua string, timeout time.Duration) {
	d := net.Dialer{Timeout: timeout}
	raw, err := d.DialContext(ctx, "tcp", ep.Address)
	if err != nil {
		ep.ReachableErr = err.Error()
		ep.Error = "tcp dial: " + err.Error()
		return
	}

	// The dialer timeout only covers the TCP connect. Without a deadline
	// here, a peer that accepts the connection but never answers the
	// ClientHello would block the handshake until ctx ends, which may be
	// never if ctx carries no deadline.
	_ = raw.SetDeadline(time.Now().Add(timeout))

	// We deliberately skip cert verification — checker-tls is the
	// source of truth for TLS posture. We just want to reach SIP over
	// TLS.
	cfg := &tls.Config{
		InsecureSkipVerify: true, //nolint:gosec
		ServerName:         target,
	}
	conn := tls.Client(raw, cfg)
	if err := conn.HandshakeContext(ctx); err != nil {
		_ = raw.Close()
		ep.Error = "tls handshake: " + err.Error()
		return
	}
	defer conn.Close()

	ep.Reachable = true
	state := conn.ConnectionState()
	ep.TLSVersion = tls.VersionName(state.Version)
	ep.TLSCipher = tls.CipherSuiteName(state.CipherSuite)

	// Restart the clock so the OPTIONS exchange gets the full timeout
	// rather than whatever the handshake left over.
	_ = conn.SetDeadline(time.Now().Add(timeout))

	req := buildOptionsRequest(target, ep.Port, TransportTLS, localAddrFor(conn), ua)
	sent := time.Now()
	if _, err := conn.Write([]byte(req)); err != nil {
		ep.Error = "tls write: " + err.Error()
		return
	}
	ep.OptionsSent = true

	resp, err := parseSIPResponse(conn)
	if err != nil {
		ep.Error = "no tls response: " + err.Error()
		return
	}
	applyResponse(ep, resp, sent)
}
// applyResponse copies the fields of a parsed SIP response into ep: the
// numeric status code, a "<code> <phrase>" status string (phrase trimmed of
// surrounding whitespace), the Server, User-Agent, Allow and Contact values,
// and the round-trip time measured from sent until this call.
func applyResponse(ep *EndpointProbe, resp *sipResponse, sent time.Time) {
	phrase := strings.TrimSpace(resp.StatusPhrase)

	ep.OptionsRawCode = resp.StatusCode
	ep.OptionsStatus = fmt.Sprintf("%d %s", resp.StatusCode, phrase)
	ep.OptionsRTTMs = time.Since(sent).Milliseconds()

	// Header values are taken over verbatim.
	ep.ServerHeader, ep.UserAgent = resp.Server, resp.UserAgent
	ep.AllowMethods, ep.ContactURI = resp.Allow, resp.Contact
}
// ─── Coverage + issues ────────────────────────────────────────────────
// computeCoverage summarises data.Endpoints into data.Coverage. It records
// which address families have at least one reachable endpoint and which
// transports have at least one endpoint for which OK() is true, then sets
// AnyWorking when any transport is working. Family and transport flags are
// only ever switched on, never cleared.
func computeCoverage(data *SIPData) {
	cov := &data.Coverage
	for _, ep := range data.Endpoints {
		// Address-family coverage depends on reachability alone.
		switch {
		case !ep.Reachable:
			// Unreachable endpoints contribute to neither family.
		case ep.IsIPv6:
			cov.HasIPv6 = true
		default:
			cov.HasIPv4 = true
		}

		// Transport coverage requires a fully successful probe.
		if ep.OK() {
			switch ep.Transport {
			case TransportUDP:
				cov.WorkingUDP = true
			case TransportTCP:
				cov.WorkingTCP = true
			case TransportTLS:
				cov.WorkingTLS = true
			}
		}
	}
	cov.AnyWorking = cov.WorkingUDP || cov.WorkingTCP || cov.WorkingTLS
}
// deriveIssues turns the collected SIP data into a list of findings.
//
// wantUDP, wantTCP and wantTLS tell which transports were enabled for the
// run; a finding that asserts the absence of a record is only emitted when
// the corresponding lookup was actually requested. wantUDP is currently
// unused.
//
// Issues are emitted in a fixed order: SRV publication findings, DNS lookup
// errors (sorted by key), the fallback notice, one finding per endpoint in
// data.Endpoints order, then the global "all down" and IPv6 findings.
// data.Coverage must already be computed.
func deriveIssues(data *SIPData, wantUDP, wantTCP, wantTLS bool) []Issue {
	var out []Issue

	// NOTE(review): as populated by Collect in this file, the fallback
	// synthesises records for every enabled transport, so totalSRV is only
	// 0 here when every transport is disabled — confirm that is intended.
	totalSRV := len(data.SRV.UDP) + len(data.SRV.TCP) + len(data.SRV.SIPS)
	if totalSRV == 0 && data.SRV.FallbackProbed {
		out = append(out, Issue{
			Code:     CodeNoSRV,
			Severity: SeverityCrit,
			Message:  "No SIP SRV records published for " + data.Domain + ".",
			Fix:      "Publish `_sip._tcp." + data.Domain + ". SRV 10 10 5060 sip." + data.Domain + ".` (and `_sips._tcp` on 5061 for TLS).",
		})
	}

	// "Only UDP" — the most common real-world failure for modern trunks.
	// Only claimed when both TCP and TLS lookups were requested; otherwise
	// their lists are empty because nobody asked, not because nothing is
	// published.
	if wantTCP && wantTLS && len(data.SRV.UDP) > 0 && len(data.SRV.TCP) == 0 && len(data.SRV.SIPS) == 0 && !data.SRV.FallbackProbed {
		out = append(out, Issue{
			Code:     CodeOnlyUDP,
			Severity: SeverityWarn,
			Message:  "Only _sip._udp is published; modern SIP trunks (Twilio, OVH, Orange…) prefer TCP/TLS.",
			Fix:      "Also publish `_sip._tcp." + data.Domain + ".` and ideally `_sips._tcp." + data.Domain + ".`.",
		})
	}

	// No TLS SRV while UDP or TCP SRV records exist.
	if wantTLS && len(data.SRV.SIPS) == 0 && (len(data.SRV.UDP) > 0 || len(data.SRV.TCP) > 0) && !data.SRV.FallbackProbed {
		out = append(out, Issue{
			Code:     CodeNoTLS,
			Severity: SeverityInfo,
			Message:  "No _sips._tcp SRV record — SIP signalling runs in the clear.",
			Fix:      "Publish `_sips._tcp." + data.Domain + ".` on port 5061 and terminate TLS on the server.",
		})
	}

	// Per-prefix DNS errors. Keys are sorted because map iteration order is
	// random and would otherwise reshuffle the issue list between runs.
	prefixes := make([]string, 0, len(data.SRV.Errors))
	for prefix := range data.SRV.Errors {
		prefixes = append(prefixes, prefix)
	}
	slices.Sort(prefixes)
	for _, prefix := range prefixes {
		msg := data.SRV.Errors[prefix]
		if prefix == "naptr" {
			out = append(out, Issue{
				Code:     CodeNAPTRServfail,
				Severity: SeverityInfo,
				Message:  "NAPTR lookup for " + data.Domain + " failed: " + msg,
				Fix:      "This is optional. If you meant to expose a NAPTR, verify your authoritative resolver answers AUTH/NXDOMAIN cleanly.",
			})
			continue
		}
		out = append(out, Issue{
			Code:     CodeSRVServfail,
			Severity: SeverityWarn,
			Message:  "SRV lookup for `" + prefix + data.Domain + "` failed: " + msg,
			Fix:      "Check zone serial and authoritative NS for this name.",
		})
	}

	// Fallback-probed notice.
	if data.SRV.FallbackProbed {
		out = append(out, Issue{
			Code:     CodeFallbackProbed,
			Severity: SeverityInfo,
			Message:  "No SIP SRV records: probing fell back to " + data.Domain + ":5060 / :5061.",
			Fix:      "Publish the SRV records expected by SIP clients and trunks.",
		})
	}

	// Per-endpoint findings. The cases are ordered from "earliest failure
	// in the probe" to "successful but incomplete answer"; at most one
	// finding is emitted per endpoint.
	for _, ep := range data.Endpoints {
		switch {
		case !ep.Reachable && ep.ReachableErr == "" && ep.Error == "no A/AAAA records for target":
			// Placeholder probe for a target that resolved to no address.
			out = append(out, Issue{
				Code:     CodeSRVTargetUnresolved,
				Severity: SeverityCrit,
				Message:  "SRV target `" + ep.Target + "` has no A/AAAA.",
				Fix:      "Add A/AAAA records for `" + ep.Target + "` or change the SRV target.",
				Endpoint: ep.Target,
			})
		case !ep.Reachable:
			// Default to the TCP wording; UDP and TLS-handshake failures
			// override it below. A TLS endpoint whose TCP dial failed keeps
			// the TCP wording.
			port := strconv.Itoa(int(ep.Port))
			code := CodeTCPUnreachable
			msg := "TCP port " + port + " is closed or filtered on " + ep.Address + "."
			fix := "Verify the SIP server is running and the firewall/NAT forwards port " + port + "."
			switch ep.Transport {
			case TransportUDP:
				code = CodeUDPUnreachable
				msg = "UDP port " + port + " refused on " + ep.Address + "."
				fix = "Verify the SIP server listens on UDP " + port + " and that no stateless firewall drops the reply."
			case TransportTLS:
				if strings.HasPrefix(ep.Error, "tls handshake") {
					code = CodeTLSHandshake
					msg = "TLS handshake failed on " + ep.Address + ": " + strings.TrimPrefix(ep.Error, "tls handshake: ")
					fix = "Present a valid certificate (chain + SAN including `" + ep.Target + "`) and accept TLS 1.2+."
				}
			}
			out = append(out, Issue{
				Code:     code,
				Severity: SeverityCrit,
				Message:  msg,
				Fix:      fix,
				Endpoint: ep.Address,
			})
		case ep.Reachable && !ep.OptionsSent:
			out = append(out, Issue{
				Code:     CodeOptionsNoAnswer,
				Severity: SeverityCrit,
				Message:  ep.Address + " accepted the connection but the probe could not send an OPTIONS: " + ep.Error,
				Fix:      "Investigate the server's SIP listener.",
				Endpoint: ep.Address,
			})
		case ep.OptionsSent && ep.OptionsRawCode == 0:
			out = append(out, Issue{
				Code:     CodeOptionsNoAnswer,
				Severity: SeverityCrit,
				Message:  ep.Address + " is reachable but silent on SIP OPTIONS.",
				Fix:      "Enable unauthenticated OPTIONS (`handle_options = yes` in Kamailio, `allowguest = yes` in Asterisk/FreeSWITCH) or add the probe source to the ACL.",
				Endpoint: ep.Address,
			})
		case ep.OptionsRawCode >= 300:
			out = append(out, Issue{
				Code:     CodeOptionsNon2xx,
				Severity: SeverityWarn,
				Message:  ep.Address + " answered " + ep.OptionsStatus + " to OPTIONS.",
				Fix:      "Check SIP routing / ACL. Some stacks reject unauthenticated OPTIONS with 403/404.",
				Endpoint: ep.Address,
			})
		case ep.OK() && len(ep.AllowMethods) > 0 && !slices.Contains(ep.AllowMethods, "INVITE"):
			out = append(out, Issue{
				Code:     CodeOptionsNoInvite,
				Severity: SeverityWarn,
				Message:  ep.Address + " answered 2xx but does not advertise INVITE in Allow.",
				Fix:      "Verify the dialplan / endpoint is allowed to place calls.",
				Endpoint: ep.Address,
			})
		case ep.OK() && len(ep.AllowMethods) == 0:
			out = append(out, Issue{
				Code:     CodeOptionsNoAllow,
				Severity: SeverityInfo,
				Message:  ep.Address + " answered 2xx but did not advertise an Allow header.",
				Fix:      "Configure the SIP stack to include Allow (benign but helps callers discover capabilities).",
				Endpoint: ep.Address,
			})
		}
	}

	// Nothing reachable at all.
	if len(data.Endpoints) > 0 && !data.Coverage.AnyWorking {
		out = append(out, Issue{
			Code:     CodeAllDown,
			Severity: SeverityCrit,
			Message:  "No SIP endpoint answered OPTIONS on any transport.",
			Fix:      "Verify the SIP server is running and reachable on the published SRV ports.",
		})
	}

	// IPv6 coverage.
	if data.Coverage.HasIPv4 && !data.Coverage.HasIPv6 {
		out = append(out, Issue{
			Code:     CodeNoIPv6,
			Severity: SeverityInfo,
			Message:  "No IPv6 endpoint reachable.",
			Fix:      "Publish AAAA records for the SRV targets.",
		})
	}
	return out
}