// This file is part of the happyDomain (R) project. // Copyright (c) 2020-2026 happyDomain // Authors: Pierre-Olivier Mercier, et al. package checker import ( "bytes" "context" "crypto/tls" "fmt" "io" "log" "net" "net/http" "net/url" "os" "strings" "sync" "sync/atomic" "time" sdk "git.happydns.org/checker-sdk-go/checker" "golang.org/x/net/html" ) // verboseLogging is enabled via the CHECKER_HTTP_VERBOSE environment variable; // when off, per-probe logging is silenced to keep production logs clean. var verboseLogging = os.Getenv("CHECKER_HTTP_VERBOSE") != "" // Collect probes HTTP and HTTPS for every (IP, scheme) pair on the // abstract.Server. The HTTP body of the primary HTTPS probe is parsed for // SRI evaluation; secondary probes only retain headers/cookies/redirects. func (p *httpProvider) Collect(ctx context.Context, opts sdk.CheckerOptions) (any, error) { server, err := resolveServer(opts) if err != nil { return nil, err } timeoutMs := sdk.GetIntOption(opts, OptionProbeTimeoutMs, DefaultProbeTimeoutMs) if timeoutMs <= 0 { timeoutMs = DefaultProbeTimeoutMs } timeout := time.Duration(timeoutMs) * time.Millisecond maxRedirects := sdk.GetIntOption(opts, OptionMaxRedirects, DefaultMaxRedirects) if maxRedirects < 0 { maxRedirects = DefaultMaxRedirects } userAgent := DefaultUserAgent if v, ok := sdk.GetOption[string](opts, OptionUserAgent); ok && v != "" { userAgent = v } host, ips := addressesFromServer(server) // abstract.Server only pins one A and one AAAA. Resolve the host // to pick up any additional records the authoritative DNS exposes, // so multi-IP deployments aren't silently under-probed. Failures // are non-fatal; the pinned IPs remain. seen := make(map[string]struct{}, len(ips)+4) for _, ip := range ips { seen[ip] = struct{}{} } ips = append(ips, discoverIPs(ctx, host, seen)...) if len(ips) == 0 { return nil, fmt.Errorf("abstract.Server has no A/AAAA records") } data := &HTTPData{ Domain: host, CollectedAt: time.Now(), } type job struct { scheme string port uint16 ip string // parseHTML controls whether the HTML body is parsed and its // references kept on the probe. We only do this for the first // HTTPS probe to keep the payload bounded. parseHTML bool } var jobs []job htmlPicked := false for _, ip := range ips { jobs = append(jobs, job{scheme: "http", port: DefaultHTTPPort, ip: ip}) j := job{scheme: "https", port: DefaultHTTPSPort, ip: ip} if !htmlPicked { j.parseHTML = true htmlPicked = true } jobs = append(jobs, j) } var mu sync.Mutex var wg sync.WaitGroup sem := make(chan struct{}, MaxConcurrentProbes) for _, j := range jobs { wg.Add(1) sem <- struct{}{} go func(j job) { defer wg.Done() defer func() { <-sem }() probe := runProbe(ctx, host, j.ip, j.scheme, j.port, timeout, maxRedirects, userAgent, j.parseHTML) if verboseLogging { log.Printf("checker-http: %s ip=%s status=%d redirects=%d err=%q", j.scheme, j.ip, probe.StatusCode, len(probe.RedirectChain), probe.Error) } mu.Lock() data.Probes = append(data.Probes, probe) mu.Unlock() }(j) } wg.Wait() return data, nil } func runProbe(ctx context.Context, host, ip, scheme string, port uint16, timeout time.Duration, maxRedirects int, ua string, parseHTML bool) HTTPProbe { addr := net.JoinHostPort(ip, fmt.Sprintf("%d", port)) probe := HTTPProbe{ Scheme: scheme, Host: host, IP: ip, Port: port, Address: addr, IsIPv6: strings.Contains(ip, ":"), } dialer := &net.Dialer{Timeout: timeout} // tcpConnected is set the moment a dial succeeds, so we can // distinguish pure-TCP failures from later TLS/HTTP errors without // resorting to error-string matching. var tcpConnected atomic.Bool // Force every dial to the chosen IP, regardless of what hostname is // in the URL; that way we can attribute results to a specific A/AAAA // record and bypass local resolver oddities. transport := &http.Transport{ DialContext: func(ctx context.Context, network, _ string) (net.Conn, error) { conn, err := dialer.DialContext(ctx, network, addr) if err != nil { return nil, err } tcpConnected.Store(true) return conn, nil }, TLSClientConfig: &tls.Config{ ServerName: host, // Deep TLS posture is delegated to checker-tls. We still want // HTTPS errors (expired cert, bad chain, ...) to surface as // probe errors, so verification stays enabled. }, TLSHandshakeTimeout: timeout, ResponseHeaderTimeout: timeout, DisableKeepAlives: true, } defer transport.CloseIdleConnections() // Bound the whole probe (dial + TLS + headers + body across all // redirect hops) by a single per-probe deadline derived from ctx, so // a slow target can't pin a worker beyond the parent's lifetime and // outer cancellation propagates to in-flight I/O. probeBudget := timeout * time.Duration(maxRedirects+1) probeCtx, cancel := context.WithTimeout(ctx, probeBudget) defer cancel() var redirectChain []RedirectStep client := &http.Client{ Transport: transport, // No client-level Timeout: probeCtx already bounds the request, // and a separate http.Client.Timeout would race with it. CheckRedirect: func(req *http.Request, via []*http.Request) error { prev := via[len(via)-1] redirectChain = append(redirectChain, RedirectStep{ From: prev.URL.String(), To: req.URL.String(), Status: 0, // populated post-hoc below if available }) // The transport's DialContext is pinned to the original // (ip, port) and TLS ServerName is pinned to the original // host. Following a redirect that changes host, scheme, or // port would silently route the request to the wrong // backend. Stop and return the 3xx so the caller can see // the Location, but don't follow it on this probe. if !strings.EqualFold(req.URL.Host, host) || !strings.EqualFold(req.URL.Scheme, scheme) { return http.ErrUseLastResponse } if len(via) > maxRedirects { return fmt.Errorf("stopped after %d redirects", maxRedirects) } return nil }, } target := &url.URL{Scheme: scheme, Host: host, Path: "/"} req, err := http.NewRequestWithContext(probeCtx, http.MethodGet, target.String(), nil) if err != nil { probe.Error = err.Error() return probe } req.Header.Set("User-Agent", ua) req.Header.Set("Accept", "text/html,application/xhtml+xml;q=0.9,*/*;q=0.5") start := time.Now() resp, err := client.Do(req) probe.ElapsedMS = time.Since(start).Milliseconds() if err != nil { probe.Error = err.Error() // The dialer wrapper sets tcpConnected the moment a TCP // connection is established, so we can attribute the failure // to a post-TCP layer (TLS, HTTP, redirect policy) without // any error-string heuristics. probe.TCPConnected = tcpConnected.Load() probe.RedirectChain = redirectChain return probe } defer resp.Body.Close() probe.TCPConnected = true probe.StatusCode = resp.StatusCode if resp.Request != nil && resp.Request.URL != nil { probe.FinalURL = resp.Request.URL.String() } // Per RFC 7230 ยง3.2.2, repeated headers (other than Set-Cookie) are // semantically equivalent to a single header whose value is the // comma-joined list; folding here preserves directives like a second // CSP or HSTS header that would otherwise be dropped. Set-Cookie is // excluded from the map since cookies are surfaced via resp.Cookies(). probe.Headers = make(map[string]string, len(resp.Header)) for k, v := range resp.Header { if len(v) == 0 { continue } lk := strings.ToLower(k) if lk == "set-cookie" { continue } probe.Headers[lk] = strings.Join(v, ", ") } for _, c := range resp.Cookies() { probe.Cookies = append(probe.Cookies, CookieInfo{ Name: c.Name, Domain: c.Domain, Path: c.Path, Secure: c.Secure, HttpOnly: c.HttpOnly, SameSite: sameSiteString(c.SameSite), HasExpiry: !c.Expires.IsZero() || c.MaxAge > 0, }) } probe.RedirectChain = redirectChain // Read one extra byte to detect whether we hit the cap. Anything // beyond MaxBodyBytes is dropped, but the probe surfaces // BodyTruncated so callers know SRI/HTML rules saw a partial view. body, err := io.ReadAll(io.LimitReader(resp.Body, MaxBodyBytes+1)) if err == nil { if len(body) > MaxBodyBytes { body = body[:MaxBodyBytes] probe.BodyTruncated = true } probe.HTMLBytes = len(body) if parseHTML && isHTMLContent(probe.Headers["content-type"]) { probe.Resources = extractResources(body, host) } } return probe } func sameSiteString(s http.SameSite) string { switch s { case http.SameSiteLaxMode: return "Lax" case http.SameSiteStrictMode: return "Strict" case http.SameSiteNoneMode: return "None" default: return "" } } func isHTMLContent(ct string) bool { ct = strings.ToLower(ct) return strings.Contains(ct, "text/html") || strings.Contains(ct, "application/xhtml") } // extractResources walks the HTML body and collects