// checker-http/checker/collector_root.go
// This file is part of the happyDomain (R) project.
// Copyright (c) 2020-2026 happyDomain
// Authors: Pierre-Olivier Mercier, et al.
package checker
import (
"context"
"log"
"sync"
"time"
)
// rootCollector implements the original Collect() behaviour from before
// the Collector interface existed: it probes the target host on HTTP/80
// and HTTPS/443 for every known IP, recording headers, cookies and
// redirect chains per probe, and parses the HTML body of the first
// successful HTTPS probe so that SRI-style rules have references to
// evaluate.
type rootCollector struct{}

// Key reports the observation key under which this collector's data is
// stored.
func (rootCollector) Key() string {
	return ObservationKeyHTTP
}
// Collect probes every known IP of the target over plain HTTP (port 80)
// and HTTPS (port 443), concurrently but capped at MaxConcurrentProbes.
// Each probe captures status, headers, cookies and the redirect chain.
// Exactly one HTTPS probe — the one for the first IP — also parses the
// HTML body, so rules that need document references have something to
// evaluate while the payload size stays bounded.
func (rootCollector) Collect(ctx context.Context, t Target) (any, error) {
	result := &HTTPData{
		Domain:      t.Host,
		CollectedAt: time.Now(),
	}

	// task describes a single scheme/port/IP combination to probe.
	type task struct {
		scheme string
		port   uint16
		ip     string
		// parseHTML is set on exactly one HTTPS task so that at most
		// one probe carries a parsed document.
		parseHTML bool
	}

	// Two tasks per IP: one HTTP, one HTTPS. The HTTPS task for the
	// first IP is the one whose body gets parsed.
	tasks := make([]task, 0, 2*len(t.IPs))
	for i, ip := range t.IPs {
		tasks = append(tasks,
			task{scheme: "http", port: DefaultHTTPPort, ip: ip},
			task{scheme: "https", port: DefaultHTTPSPort, ip: ip, parseHTML: i == 0},
		)
	}

	var (
		mu      sync.Mutex // guards result.Probes
		wg      sync.WaitGroup
		tickets = make(chan struct{}, MaxConcurrentProbes)
	)
	for _, tk := range tasks {
		wg.Add(1)
		tickets <- struct{}{} // acquire a concurrency slot before spawning
		go func(tk task) {
			defer wg.Done()
			defer func() { <-tickets }()

			probe := runProbe(ctx, t.Host, tk.ip, tk.scheme, tk.port, t.Timeout, t.MaxRedirects, t.UserAgent, tk.parseHTML)
			if verboseLogging {
				log.Printf("checker-http: %s ip=%s status=%d redirects=%d err=%q",
					tk.scheme, tk.ip, probe.StatusCode, len(probe.RedirectChain), probe.Error)
			}

			mu.Lock()
			result.Probes = append(result.Probes, probe)
			mu.Unlock()
		}(tk)
	}
	wg.Wait()

	return result, nil
}