checker-stun-turn/checker/collect.go
Pierre-Olivier Mercier 5826bb1f40 Initial commit
Adds a happyDomain checker that probes STUN/TURN servers end-to-end:
DNS/SRV discovery, UDP/TCP/TLS/DTLS dial, STUN binding + reflexive-addr
sanity, open-relay detection, authenticated TURN Allocate (long-term
creds or REST-API HMAC), public-relay check, CreatePermission + Send
round-trip through the relay, and optional ChannelBind.

Failing sub-tests carry a remediation string (`Fix`) that the HTML
report surfaces as a yellow headline callout and inline next to each
row. Mapping covers the most common coturn misconfigurations
(external-ip, relay-ip, lt-cred-mech, min-port/max-port, cert issues,
401 nonce drift, 441/442/486/508 allocation errors).

Implements sdk.EndpointDiscoverer (checker/discovery.go): every
stuns:/turns:/DTLS endpoint observed during Collect is published as a
DiscoveredEndpoint{Type: "tls"|"dtls"} so a downstream TLS checker can
verify certificates without re-parsing the observation.

Backed by pion/stun/v3 + pion/turn/v4 + pion/dtls/v3; SDK pinned to a
local replace until the EndpointDiscoverer interface ships in a tagged
release.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-23 19:41:16 +07:00

331 lines
11 KiB
Go

package checker
import (
"context"
"crypto/tls"
"fmt"
"strings"
"time"
sdk "git.happydns.org/checker-sdk-go/checker"
)
// probeConfig bundles the per-run settings that Collect derives from the
// checker options and hands unchanged to probeEndpoint for every endpoint.
type probeConfig struct {
	// mode is the requested probing mode. Collect defaults it to "auto";
	// probeEndpoint stops after the STUN checks when it equals "stun".
	mode string

	// Credential material forwarded to pickCredentials. A non-empty
	// sharedSecret takes precedence over the username/password pair.
	username     string
	password     string
	sharedSecret string
	realm        string

	// probePeer is the peer address given to runRelayEcho; Collect
	// defaults it to "1.1.1.1:53".
	probePeer string

	// testChannelBind, when true, makes probeEndpoint append the
	// informational turn_channel_bind sub-test.
	testChannelBind bool

	// timeout is passed to every dial and every STUN/TURN exchange.
	timeout time.Duration

	// warningRTT and criticalRTT are the STUN binding round-trip thresholds
	// above which the stun_binding sub-test is downgraded to warning or
	// critical respectively.
	warningRTT  time.Duration
	criticalRTT time.Duration
}
// Collect reads the checker options, discovers the endpoints to test and
// probes each of them in turn, returning a *StunTurnData observation.
//
// Options read from opts: "zone", "serverURI", "mode" (default "auto"),
// "username", "credential", "sharedSecret", "realm", "transports",
// "probePeer" (default "1.1.1.1:53"), "timeout" in seconds (default 5, also
// used when the value is not positive), "testChannelBind" (default false),
// "warningRTT" and "criticalRTT" in milliseconds (defaults 200 and 1000).
//
// The returned error is always nil: a discovery failure is reported through
// the GlobalError field of the returned data instead.
func (p *stunTurnProvider) Collect(ctx context.Context, opts sdk.CheckerOptions) (any, error) {
	// str yields the string stored under key, or "" when the key is missing
	// or holds a non-string value.
	str := func(key string) string {
		v, _ := opts[key].(string)
		return v
	}

	zone := str("zone")
	uri := str("serverURI")

	mode := str("mode")
	if mode == "" {
		mode = "auto"
	}

	probePeer := str("probePeer")
	if probePeer == "" {
		probePeer = "1.1.1.1:53"
	}

	timeoutSec := sdk.GetIntOption(opts, "timeout", 5)
	if timeoutSec <= 0 {
		timeoutSec = 5
	}

	var cfg probeConfig
	cfg.mode = mode
	cfg.username = str("username")
	cfg.password = str("credential")
	cfg.sharedSecret = str("sharedSecret")
	cfg.realm = str("realm")
	cfg.probePeer = probePeer
	cfg.testChannelBind = sdk.GetBoolOption(opts, "testChannelBind", false)
	cfg.timeout = time.Duration(timeoutSec) * time.Second
	cfg.warningRTT = time.Duration(sdk.GetIntOption(opts, "warningRTT", 200)) * time.Millisecond
	cfg.criticalRTT = time.Duration(sdk.GetIntOption(opts, "criticalRTT", 1000)) * time.Millisecond

	transports := parseTransports(str("transports"))

	// The timestamp is taken before discovery so it marks the start of the run.
	data := &StunTurnData{
		Zone:        zone,
		Mode:        mode,
		CollectedAt: time.Now().UTC(),
	}

	endpoints, err := discoverEndpoints(ctx, zone, uri, transports)
	if err != nil {
		data.GlobalError = err.Error()
		return data, nil
	}

	for _, ep := range endpoints {
		report := EndpointReport{Endpoint: ep}
		probeEndpoint(ctx, &report, cfg)
		data.Endpoints = append(data.Endpoints, report)
	}
	return data, nil
}
// probeEndpoint runs the sub-test sequence against r.Endpoint and appends one
// SubTest per step to r.SubTests. It returns nothing; every outcome, including
// failures, is recorded in the report.
//
// Steps, in order:
//  1. dial:<transport>, plus "tls"/"dtls" when the dialed connection carries
//     the corresponding handshake state;
//  2. stun_binding (with RTT thresholds from cfg) and stun_reflexive_public;
//  3. turn_open_relay_check, an Allocate without credentials;
//  4. turn_allocate_auth on a second connection, turn_relay_public,
//     turn_relay_echo and, when cfg.testChannelBind is set, turn_channel_bind.
//
// The sequence stops early when the dial or the STUN binding fails, when
// cfg.mode is "stun" or the endpoint is not a TURN endpoint, when no
// credentials are configured, and when the redial or the authenticated
// Allocate fails.
func probeEndpoint(ctx context.Context, r *EndpointReport, cfg probeConfig) {
	// record appends one sub-test result to the report.
	record := func(st SubTest) {
		r.SubTests = append(r.SubTests, st)
	}

	ep := r.Endpoint

	// --- Transport dial -----------------------------------------------------
	dialName := fmt.Sprintf("dial:%s", ep.Transport)
	dialStart := time.Now()
	dc, err := dial(ctx, ep, cfg.timeout)
	dialDur := time.Since(dialStart)
	if err != nil {
		record(SubTest{
			Name:       dialName,
			Status:     SubTestCrit,
			DurationMs: dialDur.Milliseconds(),
			Error:      err.Error(),
			Fix:        dialFix(ep, err),
		})
		return
	}
	defer dc.Close()

	record(SubTest{
		Name:       dialName,
		Status:     SubTestOK,
		DurationMs: dialDur.Milliseconds(),
		Detail:     fmt.Sprintf("connected to %s", dc.remoteAddr),
	})

	if dc.tlsState != nil {
		record(SubTest{
			Name:   "tls",
			Status: SubTestOK,
			Detail: fmt.Sprintf("%s, %s, peer cert CN=%s",
				tlsVersionString(dc.tlsState.Version),
				tls.CipherSuiteName(dc.tlsState.CipherSuite),
				peerCertCN(dc.tlsState),
			),
		})
	}
	if dc.dtlsState != nil {
		record(SubTest{
			Name:   "dtls",
			Status: SubTestOK,
			Detail: "DTLS handshake completed",
		})
	}

	// --- STUN binding -------------------------------------------------------
	bind := runSTUNBinding(dc, cfg.timeout)
	if bind.Err != nil {
		record(SubTest{
			Name:   "stun_binding",
			Status: SubTestCrit,
			Error:  bind.Err.Error(),
			Fix:    "Server did not answer the STUN Binding Request. Check that the STUN service is actually listening on this transport, and that no middlebox is filtering RFC 5389 traffic.",
		})
		return
	}

	// The binding itself succeeded; its status only reflects latency.
	rttStatus := SubTestOK
	rttFix := ""
	switch {
	case bind.RTT > cfg.criticalRTT:
		rttStatus = SubTestCrit
		rttFix = "Server is very slow to respond. Check server load, network path, and consider deploying closer to your users."
	case bind.RTT > cfg.warningRTT:
		rttStatus = SubTestWarn
		rttFix = "Latency is high enough to noticeably degrade interactive RTC. Consider a server geographically closer to your users."
	}
	record(SubTest{
		Name:       "stun_binding",
		Status:     rttStatus,
		DurationMs: bind.RTT.Milliseconds(),
		Detail:     fmt.Sprintf("reflexive address: %s", bind.ReflexiveAddr),
		Fix:        rttFix,
	})

	if bind.IsPrivateMapped {
		record(SubTest{
			Name:   "stun_reflexive_public",
			Status: SubTestCrit,
			Detail: fmt.Sprintf("server returned a private/loopback IP: %s", bind.ReflexiveAddr),
			Fix:    "Server appears to be behind NAT and unaware of its public IP. Set `external-ip=<public>` (coturn) or the equivalent on your TURN server.",
		})
	} else {
		record(SubTest{
			Name:   "stun_reflexive_public",
			Status: SubTestOK,
			Detail: fmt.Sprintf("public reflexive: %s", bind.ReflexiveAddr),
		})
	}

	// Mode short-circuits: STUN-only servers stop here.
	if cfg.mode == "stun" || !ep.IsTURN {
		return
	}

	// --- Open-relay check: Allocate without credentials ----------------------
	noAuth := runTURNAllocate(dc, nil, cfg.timeout)
	switch {
	case noAuth.RelayConn != nil:
		// Best-effort release of the allocation we were not supposed to
		// get; a close error would not change the verdict.
		_ = noAuth.RelayConn.Close()
		record(SubTest{
			Name:   "turn_open_relay_check",
			Status: SubTestCrit,
			Detail: "TURN allocation accepted without authentication",
			Fix:    "Enable long-term credentials (`lt-cred-mech` for coturn). Open relays are abused for spam and DDoS amplification.",
		})
	case noAuth.UnauthChallenge:
		record(SubTest{
			Name:   "turn_open_relay_check",
			Status: SubTestOK,
			Detail: "server correctly challenged the unauthenticated allocate (401)",
		})
	default:
		unexpected := SubTest{
			Name:   "turn_open_relay_check",
			Status: SubTestWarn,
			Detail: fmt.Sprintf("unexpected response (code=%d): %s", noAuth.AuthErrorCode, noAuth.AuthErrorReason),
			Fix:    "Server did not behave like a standard TURN. Verify it actually implements RFC 5766.",
		}
		// Surface the underlying failure (e.g. a timeout) so the row is
		// not reduced to an uninformative "code=0" with an empty reason.
		if noAuth.Err != nil {
			unexpected.Error = noAuth.Err.Error()
		}
		record(unexpected)
	}

	// --- Authenticated Allocate ---------------------------------------------
	creds := pickCredentials(cfg.username, cfg.password, cfg.sharedSecret, cfg.realm)
	if creds == nil {
		record(SubTest{
			Name:   "turn_allocate_auth",
			Status: SubTestSkipped,
			Detail: "no credentials provided",
		})
		return
	}

	// We need a fresh dialed conn; pion/turn binds the client to one PacketConn lifetime.
	dc2, err := dial(ctx, ep, cfg.timeout)
	if err != nil {
		record(SubTest{
			Name:   "turn_allocate_auth",
			Status: SubTestError,
			Error:  fmt.Sprintf("redial failed: %v", err),
		})
		return
	}
	defer dc2.Close()

	auth := runTURNAllocate(dc2, creds, cfg.timeout)
	if auth.Err != nil {
		record(SubTest{
			Name:       "turn_allocate_auth",
			Status:     SubTestCrit,
			DurationMs: auth.Duration.Milliseconds(),
			Error:      auth.Err.Error(),
			Detail:     fmt.Sprintf("STUN error code: %d", auth.AuthErrorCode),
			Fix:        allocateFix(auth.AuthErrorCode),
		})
		return
	}
	// Deferred after dc2.Close, so the relay is closed before its transport.
	defer auth.RelayConn.Close()

	record(SubTest{
		Name:       "turn_allocate_auth",
		Status:     SubTestOK,
		DurationMs: auth.Duration.Milliseconds(),
		Detail:     fmt.Sprintf("relay address: %s", auth.RelayAddr),
	})

	if auth.IsPrivateRelay {
		record(SubTest{
			Name:   "turn_relay_public",
			Status: SubTestCrit,
			Detail: fmt.Sprintf("relay address is private: %s", auth.RelayAddr),
			Fix:    "Set `relay-ip=<public>` (coturn). The relay range must be publicly reachable for clients to use TURN.",
		})
	} else {
		record(SubTest{
			Name:   "turn_relay_public",
			Status: SubTestOK,
			Detail: fmt.Sprintf("relay is public: %s", auth.RelayAddr),
		})
	}

	// --- Traffic through the relay ------------------------------------------
	if err := runRelayEcho(auth.RelayConn, cfg.probePeer, cfg.timeout); err != nil {
		record(SubTest{
			Name:   "turn_relay_echo",
			Status: SubTestWarn,
			Error:  err.Error(),
			Fix:    "Relay path could not carry traffic to the probe peer. Check the firewall/NAT around the server's relay range (`min-port`/`max-port`/`relay-ip` for coturn).",
		})
	} else {
		record(SubTest{
			Name:   "turn_relay_echo",
			Status: SubTestOK,
			Detail: fmt.Sprintf("CreatePermission + Send to %s succeeded", cfg.probePeer),
		})
	}

	if cfg.testChannelBind {
		// pion/turn handles ChannelBind transparently when the relay PacketConn
		// is used through a turn.Client; we just record that the option was on.
		record(SubTest{
			Name:   "turn_channel_bind",
			Status: SubTestInfo,
			Detail: "ChannelBind exercised implicitly by relay traffic",
		})
	}
}
// pickCredentials selects the TURN credentials to authenticate with.
//
// A non-empty sharedSecret wins: the result then comes from
// restAPICredentials, called with time.Hour as its last argument (presumably
// the credential lifetime; confirm against that helper). Otherwise a complete
// username/password pair is used as given. When neither is available the
// function returns nil, which callers treat as "no credentials provided".
func pickCredentials(username, password, sharedSecret, realm string) *turnCredentials {
	switch {
	case sharedSecret != "":
		return restAPICredentials(sharedSecret, username, realm, time.Hour)
	case username == "" || password == "":
		return nil
	default:
		return &turnCredentials{
			Username: username,
			Password: password,
			Realm:    realm,
		}
	}
}
// dialFix maps a dial error to a remediation hint for the report.
//
// Classification is done by case-insensitive substring matching on the error
// text, checked in this order: unresolved hostname, TLS/x509 failure,
// connection refused, timeout (with a UDP-specific message). Anything else
// gets a generic hint. err must be non-nil.
func dialFix(ep Endpoint, err error) string {
	lowered := strings.ToLower(err.Error())

	// mentions reports whether the error text contains any of the needles.
	mentions := func(needles ...string) bool {
		for _, needle := range needles {
			if strings.Contains(lowered, needle) {
				return true
			}
		}
		return false
	}

	if mentions("no such host") {
		return fmt.Sprintf("Hostname `%s` does not resolve. Add the matching A/AAAA record (or fix typos in the URI).", ep.Host)
	}
	if mentions("tls handshake", "x509") {
		return fmt.Sprintf("TLS handshake failed for `%s`. Reissue the certificate covering this hostname (e.g. via Let's Encrypt) and reload the server (coturn: `cert=` and `pkey=`).", ep.Host)
	}
	if mentions("connection refused") {
		return fmt.Sprintf("Nothing is listening on %s/%d. Start the server with the appropriate listening port (coturn: `listening-port=`/`tls-listening-port=`).", ep.Host, ep.Port)
	}
	if mentions("i/o timeout", "deadline") {
		if ep.Transport == TransportUDP {
			return "No reply on UDP. Open the UDP port inbound and verify your network does not block UDP egress."
		}
		return "Connection timed out. A firewall or NAT is likely blocking this port."
	}
	return "Could not establish a connection to the server."
}
// allocateFix maps the STUN/TURN error code of a failed Allocate to a
// remediation hint for the report.
//
// It covers the TURN-specific codes (403, 437, 441, 442, 486, 508) and the
// base STUN codes an Allocate can also be answered with (300, 400, 401, 438,
// 500). Any other value, including 0 when no error code was received, yields
// a generic hint.
func allocateFix(code int) string {
	switch code {
	case 300:
		return "Try Alternate. The server is redirecting clients to another TURN server (coturn: `alternate-server`/`tls-alternate-server`). Probe the alternate address directly, or remove the redirect if it is unintended."
	case 400:
		return "Bad Request. The server considered the Allocate request malformed. Confirm this endpoint is really a TURN service and check the server logs for the rejected attribute."
	case 401:
		return "Server kept rejecting the credentials. Check username/password (or the REST shared secret), and verify the server clock (NTP), as TURN nonces are time-sensitive."
	case 403:
		return "Server forbade the request. The user may not have allocation rights, or a peer-address filter is in effect."
	case 437:
		return "Allocation Mismatch. Wait a few seconds for the previous allocation to expire and retry, or restart the TURN server."
	case 438:
		return "Stale Nonce. The server expired the nonce before the authenticated request was processed. Check the nonce lifetime (coturn: `stale-nonce`) and the server clock, then retry."
	case 441:
		return "Wrong Credentials. Double-check username/password; for REST-API auth ensure the shared secret matches the server's `static-auth-secret`."
	case 442:
		return "Unsupported Transport Protocol. Try a different transport in the URI (`?transport=tcp`/`udp`) or enable it server-side."
	case 486:
		return "Allocation Quota Reached. Lower per-user concurrent allocations or raise `user-quota`."
	case 500:
		return "Server Error. The TURN server hit an internal or temporary failure; check its logs and retry later."
	case 508:
		return "Insufficient Capacity. Server is out of relay ports; raise `total-quota` or extend the `min-port`/`max-port` range."
	}
	return "TURN Allocate failed. Inspect the error and confirm the server speaks RFC 5766 on this transport."
}