// checker-http/checker/collect.go
// This file is part of the happyDomain (R) project.
// Copyright (c) 2020-2026 happyDomain
// Authors: Pierre-Olivier Mercier, et al.
package checker
import (
"bytes"
"context"
"crypto/tls"
"fmt"
"io"
"log"
"net"
"net/http"
"net/url"
"os"
"strings"
"sync"
"sync/atomic"
"time"
sdk "git.happydns.org/checker-sdk-go/checker"
"golang.org/x/net/html"
)
// verboseLogging is enabled via the CHECKER_HTTP_VERBOSE environment variable;
// when off, per-probe logging is silenced to keep production logs clean.
// The environment is consulted once, at package initialization.
var verboseLogging = os.Getenv("CHECKER_HTTP_VERBOSE") != ""
// Collect probes HTTP and HTTPS for every (IP, scheme) pair on the
// abstract.Server. The HTTP body of the first HTTPS probe is parsed for
// SRI evaluation; every other probe only retains headers/cookies/redirects.
func (p *httpProvider) Collect(ctx context.Context, opts sdk.CheckerOptions) (any, error) {
	server, err := resolveServer(opts)
	if err != nil {
		return nil, err
	}
	probeTimeoutMs := sdk.GetIntOption(opts, OptionProbeTimeoutMs, DefaultProbeTimeoutMs)
	if probeTimeoutMs <= 0 {
		probeTimeoutMs = DefaultProbeTimeoutMs
	}
	probeTimeout := time.Duration(probeTimeoutMs) * time.Millisecond
	redirectCap := sdk.GetIntOption(opts, OptionMaxRedirects, DefaultMaxRedirects)
	if redirectCap < 0 {
		redirectCap = DefaultMaxRedirects
	}
	agent := DefaultUserAgent
	if custom, ok := sdk.GetOption[string](opts, OptionUserAgent); ok && custom != "" {
		agent = custom
	}
	host, ips := addressesFromServer(server)
	// abstract.Server only pins one A and one AAAA. Resolve the host
	// to pick up any additional records the authoritative DNS exposes,
	// so multi-IP deployments aren't silently under-probed. Failures
	// are non-fatal; the pinned IPs remain.
	known := make(map[string]struct{}, len(ips)+4)
	for _, ip := range ips {
		known[ip] = struct{}{}
	}
	ips = append(ips, discoverIPs(ctx, host, known)...)
	if len(ips) == 0 {
		return nil, fmt.Errorf("abstract.Server has no A/AAAA records")
	}
	result := &HTTPData{
		Domain:      host,
		CollectedAt: time.Now(),
	}
	type task struct {
		scheme string
		port   uint16
		ip     string
		// parseHTML controls whether the HTML body is parsed and its
		// references kept on the probe. Only the first HTTPS task sets
		// it, to keep the payload bounded.
		parseHTML bool
	}
	var tasks []task
	needHTML := true
	for _, ip := range ips {
		tasks = append(tasks, task{scheme: "http", port: DefaultHTTPPort, ip: ip})
		httpsTask := task{scheme: "https", port: DefaultHTTPSPort, ip: ip}
		if needHTML {
			httpsTask.parseHTML = true
			needHTML = false
		}
		tasks = append(tasks, httpsTask)
	}
	var (
		probesMu sync.Mutex
		wg       sync.WaitGroup
	)
	// Bounded worker pool: a slot must be acquired before the goroutine
	// is launched, so at most MaxConcurrentProbes probes are in flight.
	slots := make(chan struct{}, MaxConcurrentProbes)
	for _, t := range tasks {
		wg.Add(1)
		slots <- struct{}{}
		go func(t task) {
			defer wg.Done()
			defer func() { <-slots }()
			probe := runProbe(ctx, host, t.ip, t.scheme, t.port, probeTimeout, redirectCap, agent, t.parseHTML)
			if verboseLogging {
				log.Printf("checker-http: %s ip=%s status=%d redirects=%d err=%q",
					t.scheme, t.ip, probe.StatusCode, len(probe.RedirectChain), probe.Error)
			}
			probesMu.Lock()
			result.Probes = append(result.Probes, probe)
			probesMu.Unlock()
		}(t)
	}
	wg.Wait()
	return result, nil
}
// runProbe issues a single GET for scheme://host/ but forces every TCP
// dial to the given ip:port, so the result can be attributed to one
// specific A/AAAA record. All failures (dial, TLS, HTTP, redirect
// policy) are reported via probe.Error rather than a Go error, so a
// failed probe is still a usable data point.
//
// When parseHTML is true and the response is HTML, the body's
// script/link references are extracted for SRI evaluation; otherwise
// only headers, cookies, and the redirect chain are retained.
func runProbe(ctx context.Context, host, ip, scheme string, port uint16, timeout time.Duration, maxRedirects int, ua string, parseHTML bool) HTTPProbe {
	addr := net.JoinHostPort(ip, fmt.Sprintf("%d", port))
	probe := HTTPProbe{
		Scheme:  scheme,
		Host:    host,
		IP:      ip,
		Port:    port,
		Address: addr,
		IsIPv6:  strings.Contains(ip, ":"),
	}
	dialer := &net.Dialer{Timeout: timeout}
	// tcpConnected is set the moment a dial succeeds, so we can
	// distinguish pure-TCP failures from later TLS/HTTP errors without
	// resorting to error-string matching.
	var tcpConnected atomic.Bool
	// Force every dial to the chosen IP, regardless of what hostname is
	// in the URL; that way we can attribute results to a specific A/AAAA
	// record and bypass local resolver oddities.
	transport := &http.Transport{
		DialContext: func(ctx context.Context, network, _ string) (net.Conn, error) {
			conn, err := dialer.DialContext(ctx, network, addr)
			if err != nil {
				return nil, err
			}
			tcpConnected.Store(true)
			return conn, nil
		},
		TLSClientConfig: &tls.Config{
			ServerName: host,
			// Deep TLS posture is delegated to checker-tls. We still want
			// HTTPS errors (expired cert, bad chain, ...) to surface as
			// probe errors, so verification stays enabled.
		},
		TLSHandshakeTimeout:   timeout,
		ResponseHeaderTimeout: timeout,
		DisableKeepAlives:     true,
	}
	defer transport.CloseIdleConnections()
	// Bound the whole probe (dial + TLS + headers + body across all
	// redirect hops) by a single per-probe deadline derived from ctx, so
	// a slow target can't pin a worker beyond the parent's lifetime and
	// outer cancellation propagates to in-flight I/O.
	probeBudget := timeout * time.Duration(maxRedirects+1)
	probeCtx, cancel := context.WithTimeout(ctx, probeBudget)
	defer cancel()
	var redirectChain []RedirectStep
	client := &http.Client{
		Transport: transport,
		// No client-level Timeout: probeCtx already bounds the request,
		// and a separate http.Client.Timeout would race with it.
		CheckRedirect: func(req *http.Request, via []*http.Request) error {
			prev := via[len(via)-1]
			step := RedirectStep{
				From: prev.URL.String(),
				To:   req.URL.String(),
			}
			// req.Response is the 3xx response that caused this hop;
			// net/http documents it as set during client redirects, but
			// guard against nil anyway. (Previously Status was declared
			// "populated post-hoc" and never actually filled in.)
			if req.Response != nil {
				step.Status = req.Response.StatusCode
			}
			redirectChain = append(redirectChain, step)
			// The transport's DialContext is pinned to the original
			// (ip, port) and TLS ServerName is pinned to the original
			// host. Following a redirect that changes host, scheme, or
			// port would silently route the request to the wrong
			// backend. Stop and return the 3xx so the caller can see
			// the Location, but don't follow it on this probe.
			if !strings.EqualFold(req.URL.Host, host) ||
				!strings.EqualFold(req.URL.Scheme, scheme) {
				return http.ErrUseLastResponse
			}
			if len(via) > maxRedirects {
				return fmt.Errorf("stopped after %d redirects", maxRedirects)
			}
			return nil
		},
	}
	target := &url.URL{Scheme: scheme, Host: host, Path: "/"}
	req, err := http.NewRequestWithContext(probeCtx, http.MethodGet, target.String(), nil)
	if err != nil {
		probe.Error = err.Error()
		return probe
	}
	req.Header.Set("User-Agent", ua)
	req.Header.Set("Accept", "text/html,application/xhtml+xml;q=0.9,*/*;q=0.5")
	start := time.Now()
	resp, err := client.Do(req)
	probe.ElapsedMS = time.Since(start).Milliseconds()
	if err != nil {
		probe.Error = err.Error()
		// The dialer wrapper sets tcpConnected the moment a TCP
		// connection is established, so we can attribute the failure
		// to a post-TCP layer (TLS, HTTP, redirect policy) without
		// any error-string heuristics.
		probe.TCPConnected = tcpConnected.Load()
		probe.RedirectChain = redirectChain
		return probe
	}
	defer resp.Body.Close()
	probe.TCPConnected = true
	probe.StatusCode = resp.StatusCode
	if resp.Request != nil && resp.Request.URL != nil {
		probe.FinalURL = resp.Request.URL.String()
	}
	// Per RFC 7230 §3.2.2, repeated headers (other than Set-Cookie) are
	// semantically equivalent to a single header whose value is the
	// comma-joined list; folding here preserves directives like a second
	// CSP or HSTS header that would otherwise be dropped. Set-Cookie is
	// excluded from the map since cookies are surfaced via resp.Cookies().
	probe.Headers = make(map[string]string, len(resp.Header))
	for k, v := range resp.Header {
		if len(v) == 0 {
			continue
		}
		lk := strings.ToLower(k)
		if lk == "set-cookie" {
			continue
		}
		probe.Headers[lk] = strings.Join(v, ", ")
	}
	for _, c := range resp.Cookies() {
		probe.Cookies = append(probe.Cookies, CookieInfo{
			Name:      c.Name,
			Domain:    c.Domain,
			Path:      c.Path,
			Secure:    c.Secure,
			HttpOnly:  c.HttpOnly,
			SameSite:  sameSiteString(c.SameSite),
			HasExpiry: !c.Expires.IsZero() || c.MaxAge > 0,
		})
	}
	probe.RedirectChain = redirectChain
	// Read one extra byte to detect whether we hit the cap. Anything
	// beyond MaxBodyBytes is dropped, but the probe surfaces
	// BodyTruncated so callers know SRI/HTML rules saw a partial view.
	body, readErr := io.ReadAll(io.LimitReader(resp.Body, MaxBodyBytes+1))
	if len(body) > MaxBodyBytes {
		body = body[:MaxBodyBytes]
		probe.BodyTruncated = true
	}
	// A mid-body read failure is deliberately non-fatal: the status
	// line, headers, and cookies above are still valid observations.
	// Record however many bytes arrived (previously a read error
	// silently discarded the count), but skip HTML parsing on an
	// errored, partial body.
	probe.HTMLBytes = len(body)
	if readErr == nil && parseHTML && isHTMLContent(probe.Headers["content-type"]) {
		probe.Resources = extractResources(body, host)
	}
	return probe
}
func sameSiteString(s http.SameSite) string {
switch s {
case http.SameSiteLaxMode:
return "Lax"
case http.SameSiteStrictMode:
return "Strict"
case http.SameSiteNoneMode:
return "None"
default:
return ""
}
}
// isHTMLContent reports whether a Content-Type header value denotes an
// HTML or XHTML document, ignoring case and any parameters (charset, ...).
func isHTMLContent(ct string) bool {
	lowered := strings.ToLower(ct)
	for _, marker := range []string{"text/html", "application/xhtml"} {
		if strings.Contains(lowered, marker) {
			return true
		}
	}
	return false
}
// extractResources walks the parsed HTML body and collects external
// references from <script src=...> and asset-flavored <link href=...>
// (stylesheet/preload/modulepreload), marking each as cross-origin when
// its host differs from the page's; SRI is only meaningful in that case.
// A body that fails to parse yields nil.
func extractResources(body []byte, pageHost string) []HTMLResource {
	root, err := html.Parse(bytes.NewReader(body))
	if err != nil {
		return nil
	}
	var found []HTMLResource
	var visit func(*html.Node)
	visit = func(node *html.Node) {
		if node.Type == html.ElementNode {
			switch node.Data {
			case "script":
				if src, ok := attr(node, "src"); ok && src != "" {
					found = append(found, mkResource("script", src, node, pageHost))
				}
			case "link":
				rel, _ := attr(node, "rel")
				href, ok := attr(node, "href")
				if ok && href != "" && relIsAsset(rel) {
					res := mkResource("link", href, node, pageHost)
					res.Rel = rel
					found = append(found, res)
				}
			}
		}
		for child := node.FirstChild; child != nil; child = child.NextSibling {
			visit(child)
		}
	}
	visit(root)
	return found
}
// relIsAsset reports whether a link's rel attribute marks it as a
// fetchable sub-resource (stylesheet, preload, or modulepreload) — the
// rel kinds relevant to SRI checking. Matching is case-insensitive and
// tolerates multi-token rel values.
func relIsAsset(rel string) bool {
	lowered := strings.ToLower(rel)
	for _, kind := range []string{"stylesheet", "preload", "modulepreload"} {
		if strings.Contains(lowered, kind) {
			return true
		}
	}
	return false
}
// mkResource builds an HTMLResource for one tag/reference pair. It
// copies the element's integrity attribute when present and flags the
// resource as cross-origin when its URL parses with a host different
// from the page's (relative URLs carry no host and are never flagged).
func mkResource(tag, ref string, n *html.Node, pageHost string) HTMLResource {
	res := HTMLResource{Tag: tag, URL: ref}
	if integrity, ok := attr(n, "integrity"); ok && integrity != "" {
		res.Integrity = integrity
	}
	parsed, err := url.Parse(ref)
	if err == nil && parsed.Host != "" && !strings.EqualFold(parsed.Host, pageHost) {
		res.CrossOrigin = true
	}
	return res
}
// attr returns the value of the named attribute on n (matched
// case-insensitively) and whether the attribute was present at all.
func attr(n *html.Node, key string) (string, bool) {
	for _, candidate := range n.Attr {
		if !strings.EqualFold(candidate.Key, key) {
			continue
		}
		return candidate.Val, true
	}
	return "", false
}