commit 2d98ed1b5d5ed969cced112b72ebc7360049549f Author: Pierre-Olivier Mercier Date: Sun Apr 26 11:49:13 2026 +0700 Initial commit diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..1b7ee08 --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +checker-resolver-propagation +checker-resolver-propagation.so diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..ba87aa6 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,17 @@ +FROM golang:1.25-alpine AS builder + +ARG CHECKER_VERSION=custom-build + +WORKDIR /src +COPY go.mod go.sum ./ +RUN go mod download +COPY . . +RUN CGO_ENABLED=0 go build -tags standalone -ldflags "-X main.Version=${CHECKER_VERSION}" -o /checker-resolver-propagation . + +FROM scratch +COPY --from=builder /checker-resolver-propagation /checker-resolver-propagation +USER 65534:65534 +EXPOSE 8080 +HEALTHCHECK --interval=30s --timeout=3s --start-period=5s --retries=3 \ + CMD ["/checker-resolver-propagation", "-healthcheck"] +ENTRYPOINT ["/checker-resolver-propagation"] diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..07d44d8 --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2026 The happyDomain Authors + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the “Software”), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..4422ea4 --- /dev/null +++ b/Makefile @@ -0,0 +1,28 @@ +CHECKER_NAME := checker-resolver-propagation +CHECKER_IMAGE := happydomain/$(CHECKER_NAME) +CHECKER_VERSION ?= custom-build + +CHECKER_SOURCES := main.go $(wildcard checker/*.go) + +GO_LDFLAGS := -X main.Version=$(CHECKER_VERSION) + +.PHONY: all plugin docker test clean + +all: $(CHECKER_NAME) + +$(CHECKER_NAME): $(CHECKER_SOURCES) + go build -tags standalone -ldflags "$(GO_LDFLAGS)" -o $@ . + +plugin: $(CHECKER_NAME).so + +$(CHECKER_NAME).so: $(CHECKER_SOURCES) $(wildcard plugin/*.go) + go build -buildmode=plugin -ldflags "$(GO_LDFLAGS)" -o $@ ./plugin/ + +docker: + docker build --build-arg CHECKER_VERSION=$(CHECKER_VERSION) -t $(CHECKER_IMAGE) . + +test: + go test -tags standalone ./... + +clean: + rm -f $(CHECKER_NAME) $(CHECKER_NAME).so diff --git a/README.md b/README.md new file mode 100644 index 0000000..66f6de4 --- /dev/null +++ b/README.md @@ -0,0 +1,106 @@ +# checker-resolver-propagation + +Worldwide DNS propagation checker for [happyDomain](https://www.happydomain.org/). + +Probes a curated catalog of public recursive resolvers (Cloudflare, +Google, Quad9, OpenDNS, Yandex, regional ISPs, …) across multiple +transports (UDP, TCP, DoT, DoH) and regions, then compares their +answers to the zone's authoritative nameservers to detect propagation +gaps, regional splits, SOA serial drift, stale caches, DNSSEC +validation failures, SERVFAIL/NXDOMAIN inconsistencies, and resolver +filtering. 
+ +## Usage + +### Standalone HTTP server + +```bash +# Build and run +make +./checker-resolver-propagation -listen :8080 +``` + +The server exposes: + +- `GET /health`: health check +- `POST /collect`: collect propagation observations (happyDomain external checker protocol) +- `POST /evaluate`: run the evaluation rules against an observation +- `POST /report`: extract metrics / HTML report from an observation + +### Docker + +```bash +make docker +docker run -p 8080:8080 happydomain/checker-resolver-propagation +``` + +### happyDomain plugin + +```bash +make plugin +# produces checker-resolver-propagation.so, loadable by happyDomain as a Go plugin +``` + +The plugin exposes a `NewCheckerPlugin` symbol returning the checker +definition and observation provider, which happyDomain registers in its +global registries at load time. + +### Versioning + +The binary, plugin, and Docker image embed a version string overridable +at build time: + +```bash +make CHECKER_VERSION=1.2.3 +make plugin CHECKER_VERSION=1.2.3 +make docker CHECKER_VERSION=1.2.3 +``` + +### happyDomain remote endpoint + +Set the `endpoint` admin option for the resolver-propagation checker to +the URL of the running checker-resolver-propagation server (e.g., +`http://checker-resolver-propagation:8080`). happyDomain will delegate +observation collection to this endpoint. + +This checker applies to **service**-level checks and is restricted to +the `abstract.Origin` and `abstract.NSOnlyOrigin` services (the zone +apex / NS configuration). + +## Options + +| Id | Type | Default | Description | +|-----------------------|--------|-------------------------------|------------------------------------------------------------------------------------------------------------------------| +| `recordTypes` | string | `SOA,NS,A,AAAA,MX,TXT,CAA` | Comma-separated list of RR types to probe at the apex (and at each `subdomains` entry). 
| +| `subdomains` | string | `www` | Comma-separated list of owner names to probe in addition to the apex (e.g. `www,mail,@`). Empty = apex only. | +| `includeFiltered` | bool | `false` | Probe filtering resolvers (malware/family/adblock). Their answers routinely diverge by design. | +| `region` | string | `all` | Restrict to a region: `all`, `global`, `na`, `eu`, `asia`, `ru`, `me`. | +| `transports` | string | `udp` | Comma-separated transports to probe: `udp`, `tcp`, `dot`, `doh`. Encrypted transports are only used where published. | +| `resolverAllowlist` | string | | Comma-separated resolver IDs or IPs to probe exclusively (e.g. `cloudflare,google,9.9.9.9`). Empty = catalog selection.| +| `latencyThresholdMs` | uint | `500` | Resolvers averaging above this value emit an info finding. | +| `runTimeoutSeconds` | uint | `30` | Hard wall-clock budget for one propagation run. Slower resolvers report as unreachable. | + +## Rules + +Each rule emits a finding code. Severity can be affected by the options above. + +| Code | Default severity | Condition | +|-------------------------------|------------------|-----------| +| `rprop_no_resolvers` | critical | The current option set selects no resolver from the catalog. | +| `rprop_all_resolvers_down` | critical | Every selected resolver failed to answer (likely no DNS connectivity from the checker host). | +| `rprop_resolver_unreachable` | warning | An individual resolver failed to answer within the run budget. | +| `rprop_resolver_high_latency` | info | A resolver's average response time exceeds `latencyThresholdMs`. | +| `rprop_resolver_filtered_hit` | info | A filtered resolver returned a different answer than the consensus (typical blocklist behaviour). Only when `includeFiltered` is enabled. | +| `rprop_partial_propagation` | warning | Public resolvers disagree on the answer for a probed RRset. 
| +| `rprop_answer_drift` | critical | The public consensus differs from the answer served by the zone's authoritative nameservers. | +| `rprop_unexpected_nxdomain` | critical | Some resolvers return NXDOMAIN while others return NOERROR for the same RRset. | +| `rprop_unexpected_servfail` | critical | A resolver returns SERVFAIL (usually a DNSSEC or reachability failure). | +| `rprop_regional_split` | warning | Every resolver of a region agrees on an answer that differs from the global consensus. | +| `rprop_serial_drift` | warning | Unfiltered resolvers disagree on the SOA serial. | +| `rprop_stale_cache` | info | A resolver still serves an SOA serial below the one last observed by happyDomain. | +| `rprop_dnssec_failure` | critical | A validating resolver fails to validate the zone's DNSSEC chain (returns SERVFAIL with AD/CD semantics). | +| `rprop_dnssec_not_validated` | info | A validating resolver answered without setting AD on a signed zone. | + +## License + +Licensed under the **MIT License** (see `LICENSE`). diff --git a/checker/collect.go b/checker/collect.go new file mode 100644 index 0000000..defe6dd --- /dev/null +++ b/checker/collect.go @@ -0,0 +1,422 @@ +package checker + +import ( + "context" + "encoding/json" + "fmt" + "log" + "net" + "strconv" + "strings" + "sync" + "time" + + "github.com/miekg/dns" + + sdk "git.happydns.org/checker-sdk-go/checker" +) + +// Collect gathers raw DNS answers from each selected public resolver plus the +// zone's own authoritative ground-truth. It performs no judgement: rules +// derive consensus, drift, splits, latency, and DNSSEC verdicts from the +// observation. 
+func (p *resolverPropagationProvider) Collect(ctx context.Context, opts sdk.CheckerOptions) (any, error) { + svc, err := loadService(opts) + if err != nil { + return nil, err + } + + zone, err := loadZone(opts, svc) + if err != nil { + return nil, err + } + + includeFiltered := sdk.GetBoolOption(opts, "includeFiltered", false) + region := getStringOpt(opts, "region", "all") + transportsOpt := getStringOpt(opts, "transports", "udp") + recordTypesOpt := getStringOpt(opts, "recordTypes", "SOA,NS,A,AAAA,MX,TXT,CAA") + subdomainsOpt := getStringOpt(opts, "subdomains", "") + runTimeoutS := sdk.GetIntOption(opts, "runTimeoutSeconds", 30) + allowlistOpt := getStringOpt(opts, "resolverAllowlist", "") + + // Parse options. + transports := parseCSV(transportsOpt) + if len(transports) == 0 { + transports = []string{string(TransportUDP)} + } + qtypes := parseQTypes(recordTypesOpt) + if len(qtypes) == 0 { + return nil, fmt.Errorf("no valid record types in %q", recordTypesOpt) + } + extraNames := parseCSV(subdomainsOpt) + allowlist := parseCSV(allowlistOpt) + + // Build the list of owner names to probe. + names := []string{dns.Fqdn(zone)} + seenName := map[string]bool{names[0]: true} + for _, sd := range extraNames { + full := joinSubdomain(sd, zone) + if !seenName[full] { + seenName[full] = true + names = append(names, full) + } + } + + resolvers := selectedResolvers(includeFiltered, region, allowlist) + + data := &ResolverPropagationData{ + Zone: dns.Fqdn(zone), + Names: names, + Types: qtypeNames(qtypes), + Resolvers: map[string]*ResolverView{}, + RRsets: map[string]*RRsetView{}, + } + if svc.SOA != nil { + data.DeclaredSerial = svc.SOA.Serial + } + + // If the selection matches no resolvers, simply return the (empty) + // payload. Rules classify "no resolvers matched" as their own concern. 
+ if len(resolvers) == 0 { + data.Stats = computeBasicStats(data) + return data, nil + } + + runCtx, cancel := context.WithTimeout(ctx, time.Duration(runTimeoutS)*time.Second) + defer cancel() + + started := time.Now() + + // Ground truth from the zone's own authoritative servers. + expected := collectExpected(runCtx, zone, svc, names, qtypes) + + for _, n := range names { + for _, qt := range qtypes { + key := rrsetKey(n, dns.TypeToString[qt]) + v := &RRsetView{ + Name: strings.ToLower(dns.Fqdn(n)), + Type: dns.TypeToString[qt], + } + if e, ok := expected[key]; ok { + v.Expected = e.sig + v.ExpectedRecords = e.records + } + data.RRsets[key] = v + } + } + + // Fan out probes across resolvers × transports × RRsets. + type probeJob struct { + r Resolver + tr Transport + } + var jobs []probeJob + for _, r := range resolvers { + for _, tname := range transports { + tr := Transport(strings.ToLower(strings.TrimSpace(tname))) + switch tr { + case TransportUDP, TransportTCP: + jobs = append(jobs, probeJob{r: r, tr: tr}) + case TransportDoT: + if r.DoTHost != "" { + jobs = append(jobs, probeJob{r: r, tr: tr}) + } + case TransportDoH: + if r.DoHURL != "" { + jobs = append(jobs, probeJob{r: r, tr: tr}) + } + } + } + } + + const maxConcurrent = 32 + sem := make(chan struct{}, maxConcurrent) + + var wg sync.WaitGroup + var mu sync.Mutex + for _, job := range jobs { + job := job + wg.Add(1) + sem <- struct{}{} + go func() { + defer wg.Done() + defer func() { <-sem }() + + rid := job.r.ID + if job.tr != TransportUDP { + rid = fmt.Sprintf("%s|%s", job.r.ID, job.tr) + } + + view := &ResolverView{ + ID: rid, + Name: job.r.Name, + IP: job.r.IP, + Region: job.r.Region, + Filtered: job.r.Filtered, + Transport: job.tr, + Probes: map[string]*RRProbe{}, + } + + for _, n := range names { + for _, qt := range qtypes { + probe := runProbe(runCtx, job.r, job.tr, n, qt) + key := rrsetKey(n, dns.TypeToString[qt]) + view.Probes[key] = probe + if probe.Error == "" { + view.Reachable = true + } + 
} + } + + mu.Lock() + data.Resolvers[rid] = view + mu.Unlock() + }() + } + wg.Wait() + + data.RunDurationMs = time.Since(started).Milliseconds() + data.Stats = computeBasicStats(data) + + return data, nil +} + +func runProbe(ctx context.Context, r Resolver, tr Transport, name string, qtype uint16) *RRProbe { + p := &RRProbe{Transport: tr} + + res, err := queryResolver(ctx, r, tr, name, qtype) + if err != nil { + p.Error = err.Error() + return p + } + p.Rcode = rcodeToString(res.Rcode) + p.AD = res.AD + p.LatencyMs = res.Latency.Milliseconds() + + if res.Rcode == dns.RcodeSuccess { + sig, recs, ttl := signatureFromRRs(res.Answer, name, qtype) + p.Signature = sig + p.Records = recs + p.MinTTL = ttl + } + return p +} + +type expectedEntry struct { + sig string + records []string +} + +func collectExpected(ctx context.Context, zone string, svc *originService, names []string, qtypes []uint16) map[string]*expectedEntry { + out := map[string]*expectedEntry{} + + var nsHosts []string + for _, n := range svc.NameServers { + if n == nil { + continue + } + nsHosts = append(nsHosts, strings.ToLower(dns.Fqdn(n.Ns))) + } + if len(nsHosts) == 0 { + var resolver net.Resolver + nss, err := resolver.LookupNS(ctx, strings.TrimSuffix(zone, ".")) + if err != nil { + log.Printf("collectExpected: NS lookup failed for %q: %v", zone, err) + return out + } + for _, ns := range nss { + nsHosts = append(nsHosts, strings.ToLower(dns.Fqdn(ns.Host))) + } + } + + var resolver net.Resolver + var authAddrs []string + for _, ns := range nsHosts { + addrs, err := resolver.LookupHost(ctx, strings.TrimSuffix(ns, ".")) + if err != nil { + continue + } + for _, a := range addrs { + authAddrs = append(authAddrs, net.JoinHostPort(a, "53")) + } + } + if len(authAddrs) == 0 { + return out + } + + for _, n := range names { + for _, qt := range qtypes { + key := rrsetKey(n, dns.TypeToString[qt]) + if e := queryAuthoritative(ctx, authAddrs, n, qt); e != nil { + out[key] = e + } + } + } + return out +} + +func 
queryAuthoritative(ctx context.Context, servers []string, name string, qtype uint16) *expectedEntry { + q := dns.Question{Name: dns.Fqdn(name), Qtype: qtype, Qclass: dns.ClassINET} + m := new(dns.Msg) + m.Id = dns.Id() + m.Question = []dns.Question{q} + m.RecursionDesired = false + m.SetEdns0(ednsUDPSize, false) + + client := dns.Client{Timeout: dnsTimeout} + for _, srv := range servers { + r, _, err := client.ExchangeContext(ctx, m, srv) + if err != nil || r == nil { + continue + } + if !r.Authoritative { + continue + } + if r.Rcode != dns.RcodeSuccess { + return &expectedEntry{} + } + sig, recs, _ := signatureFromRRs(r.Answer, name, qtype) + return &expectedEntry{sig: sig, records: recs} + } + return nil +} + +// computeBasicStats returns the raw rollup that Collect can produce without +// judgement: simple counts. "Agreement" (UnfilteredAgreeing) is a derived +// metric computed by deriveView once consensus has been established. +func computeBasicStats(data *ResolverPropagationData) Stats { + s := Stats{TotalResolvers: len(data.Resolvers)} + regions := map[string]bool{} + for _, rv := range data.Resolvers { + if rv.Reachable { + s.ReachableResolvers++ + } + if rv.Filtered { + s.FilteredProbed++ + } else { + s.UnfilteredProbed++ + } + regions[rv.Region] = true + } + s.CountriesCovered = len(regions) + return s +} + +func loadService(opts sdk.CheckerOptions) (*originService, error) { + svc, ok := sdk.GetOption[serviceMessage](opts, "service") + if !ok { + // Standalone / interactive use: no service was attached. Fall back + // to an empty payload; collectExpected will look up NS via the + // system resolver. 
+ return &originService{}, nil + } + switch svc.Type { + case "", "abstract.Origin", "abstract.NSOnlyOrigin": + default: + return nil, fmt.Errorf("service is %s, expected abstract.Origin or abstract.NSOnlyOrigin", svc.Type) + } + var d originService + if err := json.Unmarshal(svc.Service, &d); err != nil { + return nil, fmt.Errorf("decoding origin service: %w", err) + } + return &d, nil +} + +func loadZone(opts sdk.CheckerOptions, svc *originService) (string, error) { + if v, ok := sdk.GetOption[string](opts, "domain_name"); ok && v != "" { + return dns.Fqdn(v), nil + } + if svc.SOA != nil && svc.SOA.Header().Name != "" { + return dns.Fqdn(svc.SOA.Header().Name), nil + } + return "", fmt.Errorf("no zone name provided (missing 'domain_name' option and SOA header)") +} + +func getStringOpt(opts sdk.CheckerOptions, key, dflt string) string { + if v, ok := sdk.GetOption[string](opts, key); ok && v != "" { + return v + } + return dflt +} + +func parseCSV(s string) []string { + if s == "" { + return nil + } + parts := strings.Split(s, ",") + out := make([]string, 0, len(parts)) + for _, p := range parts { + p = strings.TrimSpace(p) + if p != "" { + out = append(out, p) + } + } + return out +} + +func parseQTypes(s string) []uint16 { + seen := map[uint16]bool{} + var out []uint16 + for _, t := range parseCSV(s) { + if q, ok := dns.StringToType[strings.ToUpper(t)]; ok && !seen[q] { + seen[q] = true + out = append(out, q) + } + } + return out +} + +func qtypeNames(qtypes []uint16) []string { + out := make([]string, len(qtypes)) + for i, q := range qtypes { + out[i] = dns.TypeToString[q] + } + return out +} + +func joinSubdomain(sd, zone string) string { + sd = strings.TrimSpace(sd) + zone = dns.Fqdn(zone) + if sd == "" || sd == "@" { + return zone + } + if strings.HasSuffix(sd, ".") { + return strings.ToLower(sd) + } + return strings.ToLower(sd + "." 
+ zone) +} + +func extractSerial(records []string) uint32 { + if len(records) == 0 { + return 0 + } + fields := strings.Fields(records[0]) + if len(fields) < 7 { + return 0 + } + s, err := strconv.ParseUint(fields[2], 10, 32) + if err != nil { + return 0 + } + return uint32(s) +} + +// Hardcoded allowlist; only these resolvers' AD bit is trustworthy. +func isValidatingResolver(id string) bool { + switch strings.SplitN(id, "|", 2)[0] { + case "cloudflare", "cloudflare-malware", "cloudflare-family", + "google", "quad9", "quad9-unfiltered", + "adguard", "adguard-unfiltered", "adguard-family", + "cleanbrowsing-family", "cleanbrowsing-adult": + return true + } + return false +} + +// firstN returns a short "x, y, z (+N more)" display list. +func firstN(items []string, n int) string { + if len(items) <= n { + return strings.Join(items, ", ") + } + return strings.Join(items[:n], ", ") + fmt.Sprintf(" (+%d more)", len(items)-n) +} diff --git a/checker/collect_test.go b/checker/collect_test.go new file mode 100644 index 0000000..45966f9 --- /dev/null +++ b/checker/collect_test.go @@ -0,0 +1,243 @@ +package checker + +import ( + "encoding/json" + "reflect" + "sort" + "testing" + + "github.com/miekg/dns" + + sdk "git.happydns.org/checker-sdk-go/checker" +) + +func TestParseCSV(t *testing.T) { + cases := []struct { + in string + want []string + }{ + {"", nil}, + {"a", []string{"a"}}, + {"a,b,c", []string{"a", "b", "c"}}, + {" a , ,b ,", []string{"a", "b"}}, + {",,,", []string{}}, + } + for _, c := range cases { + got := parseCSV(c.in) + if len(got) != len(c.want) { + t.Errorf("parseCSV(%q) len = %d, want %d", c.in, len(got), len(c.want)) + continue + } + for i := range got { + if got[i] != c.want[i] { + t.Errorf("parseCSV(%q)[%d] = %q, want %q", c.in, i, got[i], c.want[i]) + } + } + } +} + +func TestParseQTypes(t *testing.T) { + got := parseQTypes("a,aaaa,MX,TxT,bogus,A") // A duplicated; bogus skipped + want := []uint16{dns.TypeA, dns.TypeAAAA, dns.TypeMX, dns.TypeTXT} + if 
!reflect.DeepEqual(got, want) { + t.Errorf("parseQTypes = %v, want %v", got, want) + } + + if got := parseQTypes(""); got != nil { + t.Errorf("parseQTypes(\"\") = %v, want nil", got) + } + if got := parseQTypes("nope,onlybad"); got != nil { + t.Errorf("parseQTypes(bad) = %v, want nil", got) + } +} + +func TestQtypeNames(t *testing.T) { + got := qtypeNames([]uint16{dns.TypeA, dns.TypeMX}) + want := []string{"A", "MX"} + if !reflect.DeepEqual(got, want) { + t.Errorf("qtypeNames = %v, want %v", got, want) + } +} + +func TestJoinSubdomain(t *testing.T) { + cases := []struct { + sd, zone, want string + }{ + {"", "example.com", "example.com."}, + {"@", "example.com.", "example.com."}, + {"www", "example.com", "www.example.com."}, + {"WWW", "Example.Com", "www.example.com."}, + {"foo.example.org.", "example.com", "foo.example.org."}, // already FQDN: used as-is + {" www ", "example.com", "www.example.com."}, + } + for _, c := range cases { + if got := joinSubdomain(c.sd, c.zone); got != c.want { + t.Errorf("joinSubdomain(%q,%q) = %q, want %q", c.sd, c.zone, got, c.want) + } + } +} + +func TestExtractSerial(t *testing.T) { + cases := []struct { + in []string + want uint32 + }{ + {nil, 0}, + {[]string{"ns. hostmaster. 2024010101 7200 3600 1209600 3600"}, 2024010101}, + {[]string{"too few fields"}, 0}, + {[]string{"ns. hm. notanumber 1 2 3 4"}, 0}, + {[]string{"ns. hm. 
99999999999999999 1 2 3 4"}, 0}, // overflow uint32 + } + for _, c := range cases { + if got := extractSerial(c.in); got != c.want { + t.Errorf("extractSerial(%v) = %d, want %d", c.in, got, c.want) + } + } +} + +func TestFirstN(t *testing.T) { + if got := firstN([]string{"a", "b"}, 5); got != "a, b" { + t.Errorf("under: %q", got) + } + if got := firstN([]string{"a", "b", "c", "d"}, 2); got != "a, b (+2 more)" { + t.Errorf("over: %q", got) + } + if got := firstN(nil, 3); got != "" { + t.Errorf("nil: %q", got) + } +} + +func TestIsValidatingResolver(t *testing.T) { + for _, id := range []string{"cloudflare", "google", "quad9", "adguard"} { + if !isValidatingResolver(id) { + t.Errorf("%s should validate", id) + } + } + for _, id := range []string{"opendns", "yandex", "ntt-jp", ""} { + if isValidatingResolver(id) { + t.Errorf("%s should NOT validate", id) + } + } + // transport-suffixed IDs (e.g. "cloudflare|tcp") should still match. + if !isValidatingResolver("cloudflare|tcp") { + t.Errorf("transport-suffixed ID should still validate") + } +} + +func TestComputeBasicStats(t *testing.T) { + data := &ResolverPropagationData{ + Resolvers: map[string]*ResolverView{ + "a": {Region: "eu", Reachable: true}, + "b": {Region: "eu", Reachable: false, Filtered: true}, + "c": {Region: "global", Reachable: true}, + "d": {Region: "na", Reachable: true, Filtered: true}, + }, + } + s := computeBasicStats(data) + if s.TotalResolvers != 4 { + t.Errorf("total = %d", s.TotalResolvers) + } + if s.ReachableResolvers != 3 { + t.Errorf("reachable = %d", s.ReachableResolvers) + } + if s.FilteredProbed != 2 || s.UnfilteredProbed != 2 { + t.Errorf("split filtered=%d unfiltered=%d", s.FilteredProbed, s.UnfilteredProbed) + } + if s.CountriesCovered != 3 { + t.Errorf("regions = %d", s.CountriesCovered) + } +} + +func TestGetStringOpt(t *testing.T) { + opts := sdk.CheckerOptions{"a": "x", "b": ""} + if got := getStringOpt(opts, "a", "d"); got != "x" { + t.Errorf("a = %q", got) + } + if got := 
getStringOpt(opts, "b", "d"); got != "d" { + t.Errorf("b = %q", got) + } + if got := getStringOpt(opts, "missing", "d"); got != "d" { + t.Errorf("missing = %q", got) + } +} + +func TestLoadService(t *testing.T) { + // Missing service: tolerated (standalone / interactive use). Returns + // an empty payload so collectExpected falls back to the system resolver. + if svc, err := loadService(sdk.CheckerOptions{}); err != nil { + t.Errorf("unexpected error for missing service: %v", err) + } else if svc == nil || svc.SOA != nil || len(svc.NameServers) != 0 { + t.Errorf("want empty service, got %+v", svc) + } + + // Wrong type. + bad := serviceMessage{Type: "abstract.NotOrigin", Service: json.RawMessage(`{}`)} + if _, err := loadService(sdk.CheckerOptions{"service": bad}); err == nil { + t.Errorf("want error for wrong service type") + } + + // Valid Origin payload. + msg := serviceMessage{ + Type: "abstract.Origin", + Service: json.RawMessage(`{"soa":{"Hdr":{"Name":"example.com.","Rrtype":6,"Class":1,"Ttl":3600},"Ns":"ns.example.com.","Mbox":"hm.example.com.","Serial":42,"Refresh":3600,"Retry":600,"Expire":86400,"Minttl":300}}`), + } + svc, err := loadService(sdk.CheckerOptions{"service": msg}) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if svc.SOA == nil || svc.SOA.Serial != 42 { + t.Errorf("got SOA = %+v", svc.SOA) + } + + // Empty type is accepted. + emptyType := serviceMessage{Type: "", Service: json.RawMessage(`{}`)} + if _, err := loadService(sdk.CheckerOptions{"service": emptyType}); err != nil { + t.Errorf("empty type should be allowed: %v", err) + } + + // Malformed JSON in Service. + bad2 := serviceMessage{Type: "abstract.Origin", Service: json.RawMessage(`not-json`)} + if _, err := loadService(sdk.CheckerOptions{"service": bad2}); err == nil { + t.Errorf("want decode error") + } +} + +func TestLoadZone(t *testing.T) { + // From explicit option. 
+ z, err := loadZone(sdk.CheckerOptions{"domain_name": "example.com"}, &originService{}) + if err != nil || z != "example.com." { + t.Errorf("explicit: %q %v", z, err) + } + + // Fallback to SOA header. + soa := &dns.SOA{Hdr: dns.RR_Header{Name: "fallback.test."}} + z, err = loadZone(sdk.CheckerOptions{}, &originService{SOA: soa}) + if err != nil || z != "fallback.test." { + t.Errorf("fallback: %q %v", z, err) + } + + // No source available. + if _, err := loadZone(sdk.CheckerOptions{}, &originService{}); err == nil { + t.Errorf("want error when nothing supplies a zone") + } +} + +func TestNamesAreDeduplicated(t *testing.T) { + // Smoke test for the dedup loop in Collect: build the same names slice + // the way Collect does and confirm extras don't double-up. + zone := dns.Fqdn("example.com") + names := []string{zone} + seen := map[string]bool{names[0]: true} + for _, sd := range []string{"@", "www", "www", "mail"} { + full := joinSubdomain(sd, zone) + if !seen[full] { + seen[full] = true + names = append(names, full) + } + } + sort.Strings(names) + want := []string{"example.com.", "mail.example.com.", "www.example.com."} + if !reflect.DeepEqual(names, want) { + t.Errorf("names = %v, want %v", names, want) + } +} diff --git a/checker/consensus.go b/checker/consensus.go new file mode 100644 index 0000000..062101a --- /dev/null +++ b/checker/consensus.go @@ -0,0 +1,123 @@ +package checker + +import ( + "sort" +) + +// Idempotent: rules and report both call it; both must see the same grouping. +func deriveView(data *ResolverPropagationData) { + if data == nil { + return + } + + for key, view := range data.RRsets { + // Reset derived fields so repeated calls stay idempotent. 
+ view.Groups = nil + view.ConsensusSig = "" + view.Agreeing = nil + view.Dissenting = nil + view.MatchesExpected = false + + voteCount := map[string]int{} + type group struct { + rcode string + records []string + resolvers []string + } + groups := map[string]*group{} + + for _, rv := range data.Resolvers { + p := rv.Probes[key] + if p == nil || p.Error != "" { + continue + } + g := groups[p.Signature] + if g == nil { + g = &group{rcode: p.Rcode, records: p.Records} + groups[p.Signature] = g + } + g.resolvers = append(g.resolvers, rv.ID) + if !rv.Filtered { + voteCount[p.Signature]++ + } + } + + // Pick the winning signature, preferring NOERROR responses. + var winSig string + var winVotes int + for sig, g := range groups { + if g.rcode != "NOERROR" && winSig != "" { + continue + } + if voteCount[sig] > winVotes { + winSig = sig + winVotes = voteCount[sig] + } + } + if winSig == "" { + for sig := range groups { + winSig = sig + break + } + } + view.ConsensusSig = winSig + + type gEntry struct { + sig string + g *group + } + var entries []gEntry + for s, g := range groups { + sort.Strings(g.resolvers) + entries = append(entries, gEntry{sig: s, g: g}) + } + sort.Slice(entries, func(i, j int) bool { + return len(entries[i].g.resolvers) > len(entries[j].g.resolvers) + }) + for _, e := range entries { + view.Groups = append(view.Groups, SignatureGroup{ + Signature: e.sig, + Records: e.g.records, + Resolvers: e.g.resolvers, + Rcode: e.g.rcode, + }) + if e.sig == winSig { + view.Agreeing = append(view.Agreeing, e.g.resolvers...) + } else { + view.Dissenting = append(view.Dissenting, e.g.resolvers...) + } + } + sort.Strings(view.Agreeing) + sort.Strings(view.Dissenting) + + if view.Expected != "" { + view.MatchesExpected = view.ConsensusSig == view.Expected + } + } + + // Recompute UnfilteredAgreeing from the consensus we just built. 
+ agree := 0 + for _, rv := range data.Resolvers { + if rv.Filtered || !rv.Reachable { + continue + } + ok := true + for key, p := range rv.Probes { + if p == nil || p.Error != "" { + continue + } + v := data.RRsets[key] + if v == nil || v.ConsensusSig == "" { + continue + } + if p.Signature != v.ConsensusSig { + ok = false + break + } + } + if ok { + agree++ + } + } + data.Stats.UnfilteredAgreeing = agree +} diff --git a/checker/consensus_test.go b/checker/consensus_test.go new file mode 100644 index 0000000..6ad6d89 --- /dev/null +++ b/checker/consensus_test.go @@ -0,0 +1,146 @@ +package checker + +import ( + "reflect" + "testing" +) + +func TestDeriveView_Nil(t *testing.T) { + deriveView(nil) // must not panic +} + +func TestDeriveView_PicksMajoritySignature(t *testing.T) { + key := "example.com./A" + data := &ResolverPropagationData{ + Resolvers: map[string]*ResolverView{ + "a": mkResolver("a", "eu", false, true, map[string]*RRProbe{key: mkProbe("NOERROR", "1.1.1.1")}), + "b": mkResolver("b", "global", false, true, map[string]*RRProbe{key: mkProbe("NOERROR", "1.1.1.1")}), + "c": mkResolver("c", "na", false, true, map[string]*RRProbe{key: mkProbe("NOERROR", "9.9.9.9")}), + }, + RRsets: map[string]*RRsetView{ + key: {Name: "example.com.", Type: "A"}, + }, + } + deriveView(data) + v := data.RRsets[key] + if v.ConsensusSig != "1.1.1.1" { + t.Errorf("consensus = %q", v.ConsensusSig) + } + if !reflect.DeepEqual(v.Agreeing, []string{"a", "b"}) { + t.Errorf("agreeing = %v", v.Agreeing) + } + if !reflect.DeepEqual(v.Dissenting, []string{"c"}) { + t.Errorf("dissenting = %v", v.Dissenting) + } + if data.Stats.UnfilteredAgreeing != 2 { + t.Errorf("unfilteredAgreeing = %d", data.Stats.UnfilteredAgreeing) + } +} + +func TestDeriveView_FilteredResolverDoesNotVote(t *testing.T) { + key := "example.com./A" + data := &ResolverPropagationData{ + Resolvers: map[string]*ResolverView{ + "good": mkResolver("good", "eu", false, true, map[string]*RRProbe{key: mkProbe("NOERROR", 
"1.1.1.1")}), + "filt": mkResolver("filt", "eu", true, true, map[string]*RRProbe{key: mkProbe("NOERROR", "0.0.0.0")}), + "filt2": mkResolver("filt2", "eu", true, true, map[string]*RRProbe{key: mkProbe("NOERROR", "0.0.0.0")}), + "filt3": mkResolver("filt3", "eu", true, true, map[string]*RRProbe{key: mkProbe("NOERROR", "0.0.0.0")}), + }, + RRsets: map[string]*RRsetView{ + key: {Name: "example.com.", Type: "A"}, + }, + } + deriveView(data) + if data.RRsets[key].ConsensusSig != "1.1.1.1" { + t.Errorf("filtered resolvers should not win: %q", data.RRsets[key].ConsensusSig) + } +} + +func TestDeriveView_ExpectedMatch(t *testing.T) { + key := "example.com./A" + data := &ResolverPropagationData{ + Resolvers: map[string]*ResolverView{ + "a": mkResolver("a", "eu", false, true, map[string]*RRProbe{key: mkProbe("NOERROR", "1.1.1.1")}), + }, + RRsets: map[string]*RRsetView{ + key: {Name: "example.com.", Type: "A", Expected: "1.1.1.1"}, + }, + } + deriveView(data) + if !data.RRsets[key].MatchesExpected { + t.Errorf("expected match should be true") + } + + // Drift case. 
+ data.RRsets[key].Expected = "9.9.9.9" + data.RRsets[key].MatchesExpected = false + deriveView(data) + if data.RRsets[key].MatchesExpected { + t.Errorf("expected match should be false on drift") + } +} + +func TestDeriveView_Idempotent(t *testing.T) { + key := "example.com./A" + data := &ResolverPropagationData{ + Resolvers: map[string]*ResolverView{ + "a": mkResolver("a", "eu", false, true, map[string]*RRProbe{key: mkProbe("NOERROR", "1.1.1.1")}), + "b": mkResolver("b", "eu", false, true, map[string]*RRProbe{key: mkProbe("NOERROR", "1.1.1.1")}), + }, + RRsets: map[string]*RRsetView{key: {Name: "example.com.", Type: "A"}}, + } + deriveView(data) + first := *data.RRsets[key] + deriveView(data) + second := *data.RRsets[key] + if !reflect.DeepEqual(first.Groups, second.Groups) || + first.ConsensusSig != second.ConsensusSig || + !reflect.DeepEqual(first.Agreeing, second.Agreeing) { + t.Errorf("deriveView is not idempotent: %+v vs %+v", first, second) + } +} + +func TestDeriveView_SkipsErrorProbes(t *testing.T) { + key := "example.com./A" + data := &ResolverPropagationData{ + Resolvers: map[string]*ResolverView{ + "a": mkResolver("a", "eu", false, true, map[string]*RRProbe{key: {Error: "timeout", Transport: TransportUDP}}), + "b": mkResolver("b", "eu", false, true, map[string]*RRProbe{key: mkProbe("NOERROR", "1.1.1.1")}), + }, + RRsets: map[string]*RRsetView{key: {Name: "example.com.", Type: "A"}}, + } + deriveView(data) + if data.RRsets[key].ConsensusSig != "1.1.1.1" { + t.Errorf("err probe shouldn't be counted: %q", data.RRsets[key].ConsensusSig) + } +} + +func TestDeriveView_DissenterDoesNotAgree(t *testing.T) { + // Resolver "c" probes two RRsets and disagrees on one ⇒ should not be + // counted in UnfilteredAgreeing. 
+ k1, k2 := "ex./A", "ex./MX" + data := &ResolverPropagationData{ + Resolvers: map[string]*ResolverView{ + "a": mkResolver("a", "eu", false, true, map[string]*RRProbe{ + k1: mkProbe("NOERROR", "1.1.1.1"), + k2: mkProbe("NOERROR", "10 mx."), + }), + "b": mkResolver("b", "eu", false, true, map[string]*RRProbe{ + k1: mkProbe("NOERROR", "1.1.1.1"), + k2: mkProbe("NOERROR", "10 mx."), + }), + "c": mkResolver("c", "eu", false, true, map[string]*RRProbe{ + k1: mkProbe("NOERROR", "1.1.1.1"), + k2: mkProbe("NOERROR", "20 nope."), // dissents + }), + }, + RRsets: map[string]*RRsetView{ + k1: {Name: "ex.", Type: "A"}, + k2: {Name: "ex.", Type: "MX"}, + }, + } + deriveView(data) + if data.Stats.UnfilteredAgreeing != 2 { + t.Errorf("UnfilteredAgreeing = %d, want 2", data.Stats.UnfilteredAgreeing) + } +} diff --git a/checker/definition.go b/checker/definition.go new file mode 100644 index 0000000..99376a0 --- /dev/null +++ b/checker/definition.go @@ -0,0 +1,111 @@ +package checker + +import ( + "time" + + sdk "git.happydns.org/checker-sdk-go/checker" +) + +// Version is the checker version reported in CheckerDefinition.Version. +var Version = "built-in" + +// Definition returns the CheckerDefinition for the resolver-propagation +// checker. +func (p *resolverPropagationProvider) Definition() *sdk.CheckerDefinition { + return &sdk.CheckerDefinition{ + ID: "resolver-propagation", + Name: "Worldwide DNS propagation", + Version: Version, + Availability: sdk.CheckerAvailability{ + ApplyToService: true, + LimitToServices: []string{ + "abstract.Origin", + "abstract.NSOnlyOrigin", + }, + }, + ObservationKeys: []sdk.ObservationKey{ObservationKeyResolverPropagation}, + HasHTMLReport: true, + HasMetrics: true, + Options: sdk.CheckerOptionsDocumentation{ + UserOpts: []sdk.CheckerOptionDocumentation{ + { + Id: "recordTypes", + Type: "string", + Label: "Record types to probe", + Description: "Comma-separated list of RR types. 
The checker probes every listed type at the zone apex (and at each 'subdomains' entry).", + Default: "SOA,NS,A,AAAA,MX,TXT,CAA", + }, + { + Id: "subdomains", + Type: "string", + Label: "Extra subdomains to probe", + Description: "Comma-separated list of owner names to probe in addition to the zone apex (e.g. \"www,mail,@\"). Leave empty to only probe the apex.", + Default: "www", + }, + { + Id: "includeFiltered", + Type: "bool", + Label: "Include filtered resolvers", + Description: "Probe filtering resolvers (malware/family/adblock). Their answers routinely disagree with the consensus by design; enable only when diagnosing a blocklist hit.", + Default: false, + }, + { + Id: "region", + Type: "string", + Label: "Restrict to region", + Description: "Only probe resolvers from the given region. Use 'all' for a worldwide run.", + Choices: []string{"all", "global", "na", "eu", "asia", "ru", "me"}, + Default: "all", + }, + { + Id: "transports", + Type: "string", + Label: "Transports", + Description: "Comma-separated list of transports to probe. 'udp' is the baseline; 'tcp', 'dot' and 'doh' add coverage. Encrypted transports are only used for resolvers that publish an endpoint.", + Default: "udp", + }, + { + Id: "resolverAllowlist", + Type: "string", + Label: "Resolver allowlist (advanced)", + Description: "Comma-separated list of resolver IDs or IPs to probe exclusively. Leave empty to use the catalog selection. Example: \"cloudflare,google,9.9.9.9\".", + Default: "", + }, + { + Id: "latencyThresholdMs", + Type: "uint", + Label: "Latency warning threshold (ms)", + Description: "Resolvers averaging above this value produce an info finding.", + Default: float64(500), + }, + { + Id: "runTimeoutSeconds", + Type: "uint", + Label: "Run timeout (seconds)", + Description: "Hard wall-clock budget for one propagation run. 
Slow resolvers beyond this simply report as unreachable.", + Default: float64(30), + }, + }, + DomainOpts: []sdk.CheckerOptionDocumentation{ + { + Id: "domain_name", + Label: "Zone name", + AutoFill: sdk.AutoFillDomainName, + }, + }, + ServiceOpts: []sdk.CheckerOptionDocumentation{ + { + Id: "service", + Label: "Origin service", + AutoFill: sdk.AutoFillService, + }, + }, + }, + Rules: Rules(), + Interval: &sdk.CheckIntervalSpec{ + Min: 5 * time.Minute, + Max: 24 * time.Hour, + Default: 30 * time.Minute, + }, + } +} diff --git a/checker/definition_test.go b/checker/definition_test.go new file mode 100644 index 0000000..5001264 --- /dev/null +++ b/checker/definition_test.go @@ -0,0 +1,49 @@ +package checker + +import ( + "testing" + + sdk "git.happydns.org/checker-sdk-go/checker" +) + +func TestDefinitionSmoke(t *testing.T) { + prov, ok := Provider().(sdk.CheckerDefinitionProvider) + if !ok { + t.Fatalf("Provider does not implement CheckerDefinitionProvider") + } + def := prov.Definition() + if def == nil { + t.Fatalf("nil definition") + } + if def.ID == "" || def.Name == "" { + t.Errorf("missing ID/Name: %+v", def) + } + if !def.HasHTMLReport || !def.HasMetrics { + t.Errorf("expected HasHTMLReport and HasMetrics: %+v", def) + } + if len(def.Rules) == 0 { + t.Errorf("definition exposes no rules") + } + // Recordtype default option must be present for users. + var has bool + for _, opt := range def.Options.UserOpts { + if opt.Id == "recordTypes" { + has = true + break + } + } + if !has { + t.Errorf("missing recordTypes user option") + } + + // Service restriction. 
+ if len(def.Availability.LimitToServices) == 0 { + t.Errorf("expected LimitToServices to be set") + } +} + +func TestProviderKey(t *testing.T) { + if Provider().Key() != ObservationKeyResolverPropagation { + t.Errorf("unexpected observation key") + } +} diff --git a/checker/dns.go b/checker/dns.go new file mode 100644 index 0000000..db6c5c2 --- /dev/null +++ b/checker/dns.go @@ -0,0 +1,245 @@ +package checker + +import ( + "bytes" + "context" + "crypto/tls" + "encoding/base64" + "fmt" + "io" + "net" + "net/http" + "net/url" + "sort" + "strings" + "time" + + "github.com/miekg/dns" +) + +// Slower than this, a public resolver is unreachable or too flaky to be useful. +const dnsTimeout = 5 * time.Second + +// 4096 is the de-facto ceiling for unfragmented EDNS0 responses on the public Internet. +const ednsUDPSize = 4096 + +// Bound DoH reads so a hostile server can't stream junk indefinitely. +const maxDoHResponseBytes = 64 * 1024 + +// Shared so concurrent probes reuse connections and TLS state. +var dohClient = &http.Client{ + Timeout: dnsTimeout + 2*time.Second, + Transport: &http.Transport{ + TLSClientConfig: &tls.Config{ + MinVersion: tls.VersionTLS12, + }, + TLSHandshakeTimeout: dnsTimeout, + ResponseHeaderTimeout: dnsTimeout, + ExpectContinueTimeout: 1 * time.Second, + DisableKeepAlives: false, + MaxIdleConnsPerHost: 4, + }, +} + +// Flatter than *dns.Msg so the collector stays protocol-agnostic. +type queryResult struct { + Rcode int + Answer []dns.RR + AD bool + Latency time.Duration +} + +// Forces RD=1 (recurse), CD=0 (let resolver validate DNSSEC), AD=1 (signal validation back). 
+func queryResolver(ctx context.Context, r Resolver, tr Transport, name string, qtype uint16) (*queryResult, error) { + q := dns.Question{Name: dns.Fqdn(name), Qtype: qtype, Qclass: dns.ClassINET} + + m := new(dns.Msg) + m.Id = dns.Id() + m.Question = []dns.Question{q} + m.RecursionDesired = true + m.CheckingDisabled = false + m.AuthenticatedData = true + m.SetEdns0(ednsUDPSize, true) + + switch tr { + case TransportUDP: + return exchangeUDPOrTCP(ctx, m, r.IP+":53", "udp") + case TransportTCP: + return exchangeUDPOrTCP(ctx, m, r.IP+":53", "tcp") + case TransportDoT: + if r.DoTHost == "" { + return nil, fmt.Errorf("no DoT endpoint for %s", r.ID) + } + return exchangeDoT(ctx, m, r.IP, r.DoTHost) + case TransportDoH: + if r.DoHURL == "" { + return nil, fmt.Errorf("no DoH endpoint for %s", r.ID) + } + return exchangeDoH(ctx, m, r.DoHURL) + default: + return nil, fmt.Errorf("unknown transport %q", tr) + } +} + +func exchangeUDPOrTCP(ctx context.Context, m *dns.Msg, server, proto string) (*queryResult, error) { + client := dns.Client{Net: proto, Timeout: dnsTimeout} + if deadline, ok := ctx.Deadline(); ok { + if d := time.Until(deadline); d > 0 && d < client.Timeout { + client.Timeout = d + } + } + + r, rtt, err := client.ExchangeContext(ctx, m, server) + if err != nil { + return nil, err + } + if r == nil { + return nil, fmt.Errorf("nil response from %s", server) + } + + // Truncated UDP answers force a retry over TCP per RFC 5966. + if proto == "udp" && r.Truncated { + tcpClient := dns.Client{Net: "tcp", Timeout: dnsTimeout} + if r2, rtt2, err2 := tcpClient.ExchangeContext(ctx, m, server); err2 == nil && r2 != nil { + return &queryResult{ + Rcode: r2.Rcode, Answer: r2.Answer, + AD: r2.AuthenticatedData, Latency: rtt2, + }, nil + } + } + + return &queryResult{ + Rcode: r.Rcode, Answer: r.Answer, + AD: r.AuthenticatedData, Latency: rtt, + }, nil +} + +// sni validates the certificate; the IP is what we actually dial. 
+func exchangeDoT(ctx context.Context, m *dns.Msg, ip, sni string) (*queryResult, error) { + client := dns.Client{ + Net: "tcp-tls", + Timeout: dnsTimeout, + TLSConfig: &tls.Config{ + ServerName: sni, + MinVersion: tls.VersionTLS12, + }, + } + if deadline, ok := ctx.Deadline(); ok { + if d := time.Until(deadline); d > 0 && d < client.Timeout { + client.Timeout = d + } + } + r, rtt, err := client.ExchangeContext(ctx, m, net.JoinHostPort(ip, "853")) + if err != nil { + return nil, err + } + if r == nil { + return nil, fmt.Errorf("nil response from %s", ip) + } + return &queryResult{ + Rcode: r.Rcode, Answer: r.Answer, + AD: r.AuthenticatedData, Latency: rtt, + }, nil +} + +// GET (per RFC 8484) so HTTP caches can merge equivalent queries. +func exchangeDoH(ctx context.Context, m *dns.Msg, endpoint string) (*queryResult, error) { + // Id=0 lets HTTP caches merge equivalent queries. + m.Id = 0 + packed, err := m.Pack() + if err != nil { + return nil, fmt.Errorf("packing message: %w", err) + } + + u, err := url.Parse(endpoint) + if err != nil { + return nil, fmt.Errorf("invalid DoH endpoint %q: %w", endpoint, err) + } + q := u.Query() + q.Set("dns", base64.RawURLEncoding.EncodeToString(packed)) + u.RawQuery = q.Encode() + + req, err := http.NewRequestWithContext(ctx, http.MethodGet, u.String(), nil) + if err != nil { + return nil, err + } + req.Header.Set("Accept", "application/dns-message") + req.Header.Set("User-Agent", "happyDomain-checker-resolver-propagation/"+Version) + + start := time.Now() + resp, err := dohClient.Do(req) + if err != nil { + return nil, err + } + defer resp.Body.Close() + + if resp.StatusCode != http.StatusOK { + return nil, fmt.Errorf("DoH HTTP %d", resp.StatusCode) + } + ct := resp.Header.Get("Content-Type") + if !strings.HasPrefix(ct, "application/dns-message") { + return nil, fmt.Errorf("DoH unexpected content-type %q", ct) + } + + var buf bytes.Buffer + if _, err := io.Copy(&buf, io.LimitReader(resp.Body, maxDoHResponseBytes)); err != nil { 
+ return nil, err + } + latency := time.Since(start) + + r := new(dns.Msg) + if err := r.Unpack(buf.Bytes()); err != nil { + return nil, fmt.Errorf("unpacking DoH response: %w", err) + } + return &queryResult{ + Rcode: r.Rcode, Answer: r.Answer, + AD: r.AuthenticatedData, Latency: latency, + }, nil +} + +// Strips the "owner TTL class type" header from miekg's zone-file form to leave RDATA. +func canonicalRR(rr dns.RR) string { + if rr == nil { + return "" + } + fields := strings.Fields(rr.String()) + if len(fields) <= 4 { + return "" + } + rdata := strings.Join(fields[4:], " ") + // Lowercase so case-only drift in hostnames doesn't read as disagreement. + return strings.ToLower(strings.TrimSpace(rdata)) +} + +// Deterministic signature for cross-resolver comparison; sort-then-join keeps RRset order irrelevant. +func signatureFromRRs(rrs []dns.RR, owner string, qtype uint16) (sig string, records []string, minTTL uint32) { + ownerL := strings.ToLower(dns.Fqdn(owner)) + for _, rr := range rrs { + h := rr.Header() + if h == nil { + continue + } + if !strings.EqualFold(dns.Fqdn(h.Name), ownerL) { + continue + } + if h.Rrtype != qtype { + continue + } + if c := canonicalRR(rr); c != "" { + records = append(records, c) + if minTTL == 0 || h.Ttl < minTTL { + minTTL = h.Ttl + } + } + } + sort.Strings(records) + sig = strings.Join(records, "|") + return sig, records, minTTL +} + +func rcodeToString(c int) string { + if s, ok := dns.RcodeToString[c]; ok { + return s + } + return fmt.Sprintf("RCODE%d", c) +} diff --git a/checker/dns_test.go b/checker/dns_test.go new file mode 100644 index 0000000..d0d27d5 --- /dev/null +++ b/checker/dns_test.go @@ -0,0 +1,305 @@ +package checker + +import ( + "context" + "net" + "strings" + "sync" + "testing" + "time" + + "github.com/miekg/dns" +) + +func mustRR(t *testing.T, s string) dns.RR { + t.Helper() + rr, err := dns.NewRR(s) + if err != nil { + t.Fatalf("dns.NewRR(%q): %v", s, err) + } + return rr +} + +func TestCanonicalRR(t 
*testing.T) { + if got := canonicalRR(nil); got != "" { + t.Errorf("nil RR: want empty, got %q", got) + } + + cases := []struct { + rr string + want string + }{ + {"example.com. 300 IN A 192.0.2.1", "192.0.2.1"}, + {"Example.Com. 300 IN NS Ns1.Example.Com.", "ns1.example.com."}, + {"example.com. 60 IN MX 10 mail.example.com.", "10 mail.example.com."}, + {"example.com. 30 IN TXT \"v=spf1 -all\"", "\"v=spf1 -all\""}, + } + for _, c := range cases { + if got := canonicalRR(mustRR(t, c.rr)); got != c.want { + t.Errorf("canonicalRR(%q) = %q, want %q", c.rr, got, c.want) + } + } +} + +func TestSignatureFromRRs(t *testing.T) { + rrs := []dns.RR{ + mustRR(t, "example.com. 300 IN A 192.0.2.2"), + mustRR(t, "example.com. 60 IN A 192.0.2.1"), + mustRR(t, "example.com. 300 IN AAAA 2001:db8::1"), // wrong type + mustRR(t, "other.example.com. 300 IN A 198.51.100.1"), // wrong owner + } + sig, recs, ttl := signatureFromRRs(rrs, "example.com", dns.TypeA) + if sig != "192.0.2.1|192.0.2.2" { + t.Errorf("sig = %q", sig) + } + if len(recs) != 2 || recs[0] != "192.0.2.1" || recs[1] != "192.0.2.2" { + t.Errorf("records = %v", recs) + } + if ttl != 60 { + t.Errorf("minTTL = %d, want 60", ttl) + } + + // Owner case-insensitivity. + sig2, _, _ := signatureFromRRs(rrs, "EXAMPLE.com.", dns.TypeA) + if sig2 != sig { + t.Errorf("owner case sensitivity: %q vs %q", sig2, sig) + } + + // Empty input. + if s, r, ttl := signatureFromRRs(nil, "x", dns.TypeA); s != "" || r != nil || ttl != 0 { + t.Errorf("empty input: %q %v %d", s, r, ttl) + } +} + +func TestSignatureDeterministic(t *testing.T) { + a := []dns.RR{ + mustRR(t, "x. 30 IN A 1.1.1.1"), + mustRR(t, "x. 30 IN A 2.2.2.2"), + } + b := []dns.RR{ + mustRR(t, "x. 30 IN A 2.2.2.2"), + mustRR(t, "x. 
30 IN A 1.1.1.1"), + } + sa, _, _ := signatureFromRRs(a, "x", dns.TypeA) + sb, _, _ := signatureFromRRs(b, "x", dns.TypeA) + if sa != sb { + t.Errorf("ordering changed sig: %q vs %q", sa, sb) + } +} + +func TestRcodeToString(t *testing.T) { + cases := []struct { + in int + want string + }{ + {dns.RcodeSuccess, "NOERROR"}, + {dns.RcodeNameError, "NXDOMAIN"}, + {dns.RcodeServerFailure, "SERVFAIL"}, + {42, "RCODE42"}, + } + for _, c := range cases { + if got := rcodeToString(c.in); got != c.want { + t.Errorf("rcodeToString(%d) = %q, want %q", c.in, got, c.want) + } + } +} + +// startUDPServer brings up a tiny miekg/dns UDP server bound to a free port, +// returning its address and a stop func. The handler is called for every +// query and decides what to write back. +func startUDPServer(t *testing.T, handler dns.HandlerFunc) (string, func()) { + t.Helper() + pc, err := net.ListenPacket("udp", "127.0.0.1:0") + if err != nil { + t.Fatalf("listen: %v", err) + } + srv := &dns.Server{PacketConn: pc, Handler: handler} + done := make(chan struct{}) + go func() { + _ = srv.ActivateAndServe() + close(done) + }() + // give the server a moment + time.Sleep(20 * time.Millisecond) + return pc.LocalAddr().String(), func() { + _ = srv.Shutdown() + <-done + } +} + +func TestExchangeUDPOrTCP_Success(t *testing.T) { + addr, stop := startUDPServer(t, func(w dns.ResponseWriter, m *dns.Msg) { + resp := new(dns.Msg) + resp.SetReply(m) + resp.Authoritative = true + resp.Answer = []dns.RR{mustRR(t, "example.com. 
60 IN A 192.0.2.10")} + _ = w.WriteMsg(resp) + }) + defer stop() + + ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second) + defer cancel() + + m := new(dns.Msg) + m.SetQuestion("example.com.", dns.TypeA) + res, err := exchangeUDPOrTCP(ctx, m, addr, "udp") + if err != nil { + t.Fatalf("exchange: %v", err) + } + if res.Rcode != dns.RcodeSuccess { + t.Errorf("rcode = %d", res.Rcode) + } + if len(res.Answer) != 1 { + t.Fatalf("answers: %v", res.Answer) + } +} + +func TestQueryResolver_UnknownTransport(t *testing.T) { + _, err := queryResolver(context.Background(), Resolver{ID: "x", IP: "127.0.0.1"}, Transport("xyz"), "x.", dns.TypeA) + if err == nil || !strings.Contains(err.Error(), "unknown transport") { + t.Errorf("want unknown transport error, got %v", err) + } +} + +func TestQueryResolver_MissingDoTEndpoint(t *testing.T) { + _, err := queryResolver(context.Background(), Resolver{ID: "x", IP: "127.0.0.1"}, TransportDoT, "x.", dns.TypeA) + if err == nil || !strings.Contains(err.Error(), "no DoT endpoint") { + t.Errorf("want missing DoT err, got %v", err) + } +} + +func TestQueryResolver_MissingDoHEndpoint(t *testing.T) { + _, err := queryResolver(context.Background(), Resolver{ID: "x", IP: "127.0.0.1"}, TransportDoH, "x.", dns.TypeA) + if err == nil || !strings.Contains(err.Error(), "no DoH endpoint") { + t.Errorf("want missing DoH err, got %v", err) + } +} + +func TestRunProbe_TransportError(t *testing.T) { + // Missing DoT host on the resolver: queryResolver returns an error, + // runProbe converts it into RRProbe.Error. 
+ p := runProbe(context.Background(), Resolver{ID: "x", IP: "127.0.0.1"}, TransportDoT, "ex.", dns.TypeA) + if p.Error == "" { + t.Errorf("expected error for missing DoT host") + } + if p.Transport != TransportDoT { + t.Errorf("transport = %v", p.Transport) + } +} + +func TestQueryAuthoritative(t *testing.T) { + addr, stop := startUDPServer(t, func(w dns.ResponseWriter, m *dns.Msg) { + resp := new(dns.Msg) + resp.SetReply(m) + resp.Authoritative = true + resp.Answer = []dns.RR{mustRR(t, "ex. 60 IN A 5.6.7.8")} + _ = w.WriteMsg(resp) + }) + defer stop() + + ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second) + defer cancel() + e := queryAuthoritative(ctx, []string{addr}, "ex.", dns.TypeA) + if e == nil { + t.Fatal("nil entry") + } + if e.sig != "5.6.7.8" { + t.Errorf("sig = %q", e.sig) + } +} + +func TestQueryAuthoritative_NotAuthoritative(t *testing.T) { + addr, stop := startUDPServer(t, func(w dns.ResponseWriter, m *dns.Msg) { + resp := new(dns.Msg) + resp.SetReply(m) + resp.Authoritative = false + resp.Answer = []dns.RR{mustRR(t, "ex. 
60 IN A 5.6.7.8")} + _ = w.WriteMsg(resp) + }) + defer stop() + ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second) + defer cancel() + if e := queryAuthoritative(ctx, []string{addr}, "ex.", dns.TypeA); e != nil { + t.Errorf("non-authoritative answer should be ignored, got %+v", e) + } +} + +func TestQueryAuthoritative_NXDOMAIN(t *testing.T) { + addr, stop := startUDPServer(t, func(w dns.ResponseWriter, m *dns.Msg) { + resp := new(dns.Msg) + resp.SetReply(m) + resp.Authoritative = true + resp.Rcode = dns.RcodeNameError + _ = w.WriteMsg(resp) + }) + defer stop() + ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second) + defer cancel() + e := queryAuthoritative(ctx, []string{addr}, "ex.", dns.TypeA) + if e == nil { + t.Fatal("want non-nil entry for NXDOMAIN") + } + if e.sig != "" { + t.Errorf("NXDOMAIN should give empty sig: %q", e.sig) + } +} + +func TestExchangeUDP_TruncationFallsBackToTCP(t *testing.T) { + // UDP returns truncated; we also start a TCP listener that returns the full + // answer. miekg/dns ServeMux supports both via a single Server, but we + // keep it explicit here. + pcUDP, err := net.ListenPacket("udp", "127.0.0.1:0") + if err != nil { + t.Fatalf("udp listen: %v", err) + } + defer pcUDP.Close() + addr := pcUDP.LocalAddr().String() + host, port, err := net.SplitHostPort(addr) + if err != nil { + t.Fatalf("split: %v", err) + } + // TCP needs to share the same port; bind a TCP listener on it. + tcpL, err := net.Listen("tcp", net.JoinHostPort(host, port)) + if err != nil { + t.Fatalf("tcp listen: %v", err) + } + defer tcpL.Close() + + udpHandler := dns.HandlerFunc(func(w dns.ResponseWriter, m *dns.Msg) { + resp := new(dns.Msg) + resp.SetReply(m) + resp.Truncated = true + _ = w.WriteMsg(resp) + }) + tcpHandler := dns.HandlerFunc(func(w dns.ResponseWriter, m *dns.Msg) { + resp := new(dns.Msg) + resp.SetReply(m) + resp.Answer = []dns.RR{mustRR(t, "ex. 
60 IN A 1.2.3.4")} + _ = w.WriteMsg(resp) + }) + + udpSrv := &dns.Server{PacketConn: pcUDP, Handler: udpHandler} + tcpSrv := &dns.Server{Listener: tcpL, Handler: tcpHandler} + var wg sync.WaitGroup + wg.Add(2) + go func() { defer wg.Done(); _ = udpSrv.ActivateAndServe() }() + go func() { defer wg.Done(); _ = tcpSrv.ActivateAndServe() }() + defer func() { + _ = udpSrv.Shutdown() + _ = tcpSrv.Shutdown() + wg.Wait() + }() + time.Sleep(30 * time.Millisecond) + + ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second) + defer cancel() + m := new(dns.Msg) + m.SetQuestion("ex.", dns.TypeA) + res, err := exchangeUDPOrTCP(ctx, m, addr, "udp") + if err != nil { + t.Fatalf("exchange: %v", err) + } + if len(res.Answer) != 1 { + t.Fatalf("expected TCP fallback to populate answer, got %v", res.Answer) + } +} diff --git a/checker/interactive.go b/checker/interactive.go new file mode 100644 index 0000000..2ea5b67 --- /dev/null +++ b/checker/interactive.go @@ -0,0 +1,123 @@ +//go:build standalone + +package checker + +import ( + "errors" + "net/http" + "strconv" + "strings" + + sdk "git.happydns.org/checker-sdk-go/checker" +) + +func (p *resolverPropagationProvider) RenderForm() []sdk.CheckerOptionField { + return []sdk.CheckerOptionField{ + { + Id: "domain_name", + Type: "string", + Label: "Zone name", + Placeholder: "example.com", + Required: true, + Description: "Apex of the zone to probe across public resolvers.", + }, + { + Id: "recordTypes", + Type: "string", + Label: "Record types to probe", + Placeholder: "SOA,NS,A,AAAA,MX,TXT,CAA", + Description: "Comma-separated list of RR types. Probed at the apex (and at each 'subdomains' entry).", + }, + { + Id: "subdomains", + Type: "string", + Label: "Extra subdomains to probe", + Placeholder: "www", + Description: "Comma-separated list of owner names to probe in addition to the apex (e.g. 
\"www,mail,@\").", + }, + { + Id: "includeFiltered", + Type: "bool", + Label: "Include filtered resolvers", + Description: "Probe filtering resolvers (malware/family/adblock). Their answers routinely disagree with the consensus by design.", + }, + { + Id: "region", + Type: "string", + Label: "Restrict to region", + Placeholder: "all", + Description: "One of: all, global, na, eu, asia, ru, me.", + Choices: []string{"all", "global", "na", "eu", "asia", "ru", "me"}, + }, + { + Id: "transports", + Type: "string", + Label: "Transports", + Placeholder: "udp", + Description: "Comma-separated list of transports to probe: udp, tcp, dot, doh.", + }, + { + Id: "resolverAllowlist", + Type: "string", + Label: "Resolver allowlist (advanced)", + Placeholder: "cloudflare,google,9.9.9.9", + Description: "Comma-separated list of resolver IDs or IPs to probe exclusively. Leave empty to use the catalog selection.", + }, + { + Id: "latencyThresholdMs", + Type: "uint", + Label: "Latency warning threshold (ms)", + Placeholder: "500", + Description: "Resolvers averaging above this value produce an info finding.", + }, + { + Id: "runTimeoutSeconds", + Type: "uint", + Label: "Run timeout (seconds)", + Placeholder: "30", + Description: "Hard wall-clock budget for one propagation run.", + }, + } +} + +func (p *resolverPropagationProvider) ParseForm(r *http.Request) (sdk.CheckerOptions, error) { + name := strings.TrimSpace(r.FormValue("domain_name")) + if name == "" { + return nil, errors.New("domain_name is required") + } + name = strings.TrimSuffix(name, ".") + + opts := sdk.CheckerOptions{ + "domain_name": name, + } + + for _, key := range []string{ + "recordTypes", "subdomains", "region", + "transports", "resolverAllowlist", + } { + if v := strings.TrimSpace(r.FormValue(key)); v != "" { + opts[key] = v + } + } + + if v := strings.TrimSpace(r.FormValue("includeFiltered")); v != "" { + switch strings.ToLower(v) { + case "1", "true", "on", "yes": + opts["includeFiltered"] = true + case "0", 
"false", "off", "no": + opts["includeFiltered"] = false + } + } + + for _, key := range []string{"latencyThresholdMs", "runTimeoutSeconds"} { + if v := strings.TrimSpace(r.FormValue(key)); v != "" { + n, err := strconv.ParseUint(v, 10, 32) + if err != nil { + return nil, errors.New(key + " must be a non-negative integer") + } + opts[key] = float64(n) + } + } + + return opts, nil +} diff --git a/checker/metrics.go b/checker/metrics.go new file mode 100644 index 0000000..cf14af0 --- /dev/null +++ b/checker/metrics.go @@ -0,0 +1,137 @@ +package checker + +import ( + "encoding/json" + "fmt" + "time" + + sdk "git.happydns.org/checker-sdk-go/checker" +) + +// ExtractMetrics implements sdk.CheckerMetricsReporter. It consumes the raw +// observation, derives consensus on the fly, and emits time-series for +// dashboards. Severity counters are computed from the rule states carried +// in the ReportContext rather than re-derived from raw data. +func (p *resolverPropagationProvider) ExtractMetrics(ctx sdk.ReportContext, collectedAt time.Time) ([]sdk.CheckMetric, error) { + var data ResolverPropagationData + if err := json.Unmarshal(ctx.Data(), &data); err != nil { + return nil, fmt.Errorf("resolver-propagation: decoding observation: %w", err) + } + deriveView(&data) + + var out []sdk.CheckMetric + zone := data.Zone + + rollups := []struct { + name string + val float64 + }{ + {"resolver_propagation_resolvers_total", float64(data.Stats.TotalResolvers)}, + {"resolver_propagation_resolvers_reachable", float64(data.Stats.ReachableResolvers)}, + {"resolver_propagation_unfiltered_agreeing", float64(data.Stats.UnfilteredAgreeing)}, + {"resolver_propagation_regions_covered", float64(data.Stats.CountriesCovered)}, + {"resolver_propagation_run_duration_ms", float64(data.RunDurationMs)}, + } + for _, r := range rollups { + out = append(out, sdk.CheckMetric{ + Name: r.name, + Value: r.val, + Labels: map[string]string{"zone": zone}, + Timestamp: collectedAt, + }) + } + + if 
data.DeclaredSerial != 0 { + out = append(out, sdk.CheckMetric{ + Name: "resolver_propagation_declared_serial", + Value: float64(data.DeclaredSerial), + Labels: map[string]string{"zone": zone}, + Timestamp: collectedAt, + }) + } + + soaKey := rrsetKey(zone, "SOA") + var staleResolvers int + for id, rv := range data.Resolvers { + if rv.Filtered { + continue + } + p := rv.Probes[soaKey] + if p == nil || p.Error != "" || p.Rcode != "NOERROR" { + continue + } + s := extractSerial(p.Records) + if s == 0 { + continue + } + out = append(out, sdk.CheckMetric{ + Name: "resolver_propagation_observed_serial", + Value: float64(s), + Labels: map[string]string{ + "zone": zone, + "resolver": id, + }, + Timestamp: collectedAt, + }) + if data.DeclaredSerial != 0 && s < data.DeclaredSerial { + staleResolvers++ + } + } + if data.DeclaredSerial != 0 { + out = append(out, sdk.CheckMetric{ + Name: "resolver_propagation_serial_drift_resolvers", + Value: float64(staleResolvers), + Labels: map[string]string{"zone": zone}, + Timestamp: collectedAt, + }) + } + + for id, rv := range data.Resolvers { + labels := map[string]string{ + "zone": zone, + "resolver": id, + "ip": rv.IP, + "region": rv.Region, + "transport": string(rv.Transport), + } + + up := float64(0) + if rv.Reachable { + up = 1 + } + out = append(out, sdk.CheckMetric{ + Name: "resolver_propagation_resolver_up", + Value: up, Labels: labels, Timestamp: collectedAt, + }) + + var total, n int64 + for _, p := range rv.Probes { + if p.Error != "" { + continue + } + total += p.LatencyMs + n++ + } + if n > 0 { + out = append(out, sdk.CheckMetric{ + Name: "resolver_propagation_resolver_latency_ms", + Value: float64(total) / float64(n), Unit: "ms", + Labels: labels, Timestamp: collectedAt, + }) + } + } + + for key, v := range data.RRsets { + out = append(out, sdk.CheckMetric{ + Name: "resolver_propagation_rrset_signatures", + Value: float64(len(v.Groups)), + Labels: map[string]string{ + "zone": zone, + "rrset": key, + }, + Timestamp: 
collectedAt, + }) + } + + return out, nil +} diff --git a/checker/metrics_test.go b/checker/metrics_test.go new file mode 100644 index 0000000..a0c254b --- /dev/null +++ b/checker/metrics_test.go @@ -0,0 +1,97 @@ +package checker + +import ( + "encoding/json" + "testing" + "time" + + sdk "git.happydns.org/checker-sdk-go/checker" +) + +func TestExtractMetrics(t *testing.T) { + key := "ex./A" + soaKey := "ex./SOA" + data := &ResolverPropagationData{ + Zone: "ex.", + Names: []string{"ex."}, + Types: []string{"A", "SOA"}, + RunDurationMs: 1234, + DeclaredSerial: 100, + Resolvers: map[string]*ResolverView{ + "a": mkResolver("a", "eu", false, true, map[string]*RRProbe{ + key: {Rcode: "NOERROR", Signature: "1.1.1.1", LatencyMs: 50, Transport: TransportUDP}, + soaKey: {Rcode: "NOERROR", Records: []string{"ns.ex. hm.ex. 100 3600 600 86400 300"}, LatencyMs: 50, Transport: TransportUDP}, + }), + "b": mkResolver("b", "eu", false, false, map[string]*RRProbe{ + key: {Error: "timeout", Transport: TransportUDP}, + soaKey: {Rcode: "NOERROR", Records: []string{"ns.ex. hm.ex. 
90 3600 600 86400 300"}, LatencyMs: 80, Transport: TransportUDP}, + }), + }, + RRsets: map[string]*RRsetView{ + key: {Name: "ex.", Type: "A"}, + soaKey: {Name: "ex.", Type: "SOA"}, + }, + Stats: Stats{ + TotalResolvers: 2, + ReachableResolvers: 1, + CountriesCovered: 1, + }, + } + raw, err := json.Marshal(data) + if err != nil { + t.Fatalf("marshal: %v", err) + } + + states := []sdk.CheckState{ + {Status: sdk.StatusCrit, Code: "x"}, + {Status: sdk.StatusWarn, Code: "y"}, + {Status: sdk.StatusInfo, Code: "z"}, + {Status: sdk.StatusInfo, Code: "z2"}, + } + ctx := sdk.NewReportContext(raw, nil, states) + + prov := &resolverPropagationProvider{} + metrics, err := prov.ExtractMetrics(ctx, time.Unix(0, 0)) + if err != nil { + t.Fatalf("ExtractMetrics: %v", err) + } + + want := map[string]float64{ + "resolver_propagation_resolvers_total": 2, + "resolver_propagation_resolvers_reachable": 1, + "resolver_propagation_run_duration_ms": 1234, + "resolver_propagation_declared_serial": 100, + "resolver_propagation_serial_drift_resolvers": 1, + } + got := map[string]float64{} + for _, m := range metrics { + // Keep the first sample per name (most are zone-only labels). + if _, ok := got[m.Name]; !ok { + got[m.Name] = m.Value + } + } + for name, v := range want { + if got[name] != v { + t.Errorf("metric %s = %v, want %v", name, got[name], v) + } + } + + // resolver_up should appear once per resolver. 
+ var ups int + for _, m := range metrics { + if m.Name == "resolver_propagation_resolver_up" { + ups++ + } + } + if ups != 2 { + t.Errorf("resolver_up samples = %d, want 2", ups) + } +} + +func TestExtractMetrics_BadPayload(t *testing.T) { + ctx := sdk.StaticReportContext(json.RawMessage(`not-json`)) + prov := &resolverPropagationProvider{} + if _, err := prov.ExtractMetrics(ctx, time.Now()); err == nil { + t.Errorf("want decode error") + } +} diff --git a/checker/provider.go b/checker/provider.go new file mode 100644 index 0000000..c73105d --- /dev/null +++ b/checker/provider.go @@ -0,0 +1,16 @@ +package checker + +import ( + sdk "git.happydns.org/checker-sdk-go/checker" +) + +// Provider returns a new resolver-propagation observation provider. +func Provider() sdk.ObservationProvider { + return &resolverPropagationProvider{} +} + +type resolverPropagationProvider struct{} + +func (p *resolverPropagationProvider) Key() sdk.ObservationKey { + return ObservationKeyResolverPropagation +} diff --git a/checker/report.go b/checker/report.go new file mode 100644 index 0000000..ed7d851 --- /dev/null +++ b/checker/report.go @@ -0,0 +1,733 @@ +package checker + +import ( + "bytes" + "encoding/json" + "fmt" + "html/template" + "sort" + "strings" + + sdk "git.happydns.org/checker-sdk-go/checker" +) + +// GetHTMLReport implements sdk.CheckerHTMLReporter. +// +// The report is laid out top-down by decreasing importance: +// 1. a "Fix these first" banner listing the common failures (drift, +// DNSSEC, NXDOMAIN, SERVFAIL, regional split, etc.) with a plain-English +// remediation for each; +// 2. a per-RRset consensus table that shows which answers dominate and +// which resolvers disagree: the meat of the check; +// 3. a per-region matrix (consensus / drift / error per region × RRset); +// 4. a detailed per-resolver table for operators who want the raw data. 
+func (p *resolverPropagationProvider) GetHTMLReport(ctx sdk.ReportContext) (string, error) { + var data ResolverPropagationData + if raw := ctx.Data(); len(raw) > 0 { + if err := json.Unmarshal(raw, &data); err != nil { + return "", fmt.Errorf("parse resolver-propagation data: %w", err) + } + } + + deriveView(&data) + findings := statesToFindings(ctx.States()) + view := buildReportView(&data, findings) + + buf := &bytes.Buffer{} + if err := reportTmpl.Execute(buf, view); err != nil { + return "", err + } + return buf.String(), nil +} + +// topFailureOrder is the priority used by the "Fix these first" banner. +// Items at the top reflect more impactful / more actionable issues so the +// reader has a triage path. +var topFailureOrder = []string{ + CodeAllResolversDown, + CodeUnexpectedSERVFAIL, + CodeDNSSECFailure, + CodeAnswerDrift, + CodeUnexpectedNXDOMAIN, + CodeSerialDrift, + CodeRegionalSplit, + CodePartialPropagation, + CodeDNSSECUnvalidated, + CodeStaleCache, + CodeResolverRewrote, + CodeResolverUnreachable, + CodeResolverHighLatency, + CodeResolverFilteredHit, + CodeNoResolvers, +} + +// reportView is the flattened shape the HTML template consumes. 
+type reportView struct { + Zone string + OverallStatus string + OverallClass string + OverallMessage string + Stats Stats + + TopFailures []topFailure + OtherFindings []Finding + + RRsets []rrsetRow + Regions []regionRow + + Resolvers []resolverRow +} + +type topFailure struct { + Code string + Severity string + Message string + Remedy string + Count int + Class string + Headline string // short, human-readable label for the card +} + +type rrsetRow struct { + Key string + Name string + Type string + MatchesExpected bool + Expected []string + HasExpected bool + Groups []groupRow + Agreeing int + Dissenting int + StatusClass string + StatusLabel string +} + +type groupRow struct { + Rcode string + Records []string + Resolvers []string + IsConsensus bool +} + +type regionRow struct { + Region string + Label string + Resolvers int + Reachable int + Agreeing int + Disagreeing int + Errored int +} + +type resolverRow struct { + ID string + Name string + IP string + Region string + Transport string + Filtered bool + Reachable bool + AvgMs int64 + Probes []probeRow +} + +type probeRow struct { + Key string + Rcode string + Records []string + MinTTL uint32 + AD bool + AgreesWithConsensus bool + Error string + LatencyMs int64 +} + +func buildReportView(d *ResolverPropagationData, findings []Finding) *reportView { + v := &reportView{ + Zone: d.Zone, + Stats: d.Stats, + } + + // Overall banner: worst severity drives colour. 
+ worst := "" + for _, f := range findings { + switch f.Severity { + case SeverityCrit: + worst = "crit" + case SeverityWarn: + if worst == "" { + worst = "warn" + } + case SeverityInfo: + if worst == "" { + worst = "info" + } + } + if worst == "crit" { + break + } + } + switch worst { + case "crit": + v.OverallStatus = "Critical issues" + v.OverallClass = "banner-crit" + v.OverallMessage = fmt.Sprintf("%s is not propagating correctly across public resolvers.", d.Zone) + case "warn": + v.OverallStatus = "Warnings" + v.OverallClass = "banner-warn" + v.OverallMessage = fmt.Sprintf("%s is propagating, but some resolvers or resource sets disagree.", d.Zone) + case "info": + v.OverallStatus = "Informational" + v.OverallClass = "banner-info" + v.OverallMessage = fmt.Sprintf("%s looks healthy; a few advisory notes below.", d.Zone) + default: + v.OverallStatus = "OK" + v.OverallClass = "banner-ok" + v.OverallMessage = fmt.Sprintf("%s is propagated consistently across %d of %d unfiltered resolvers.", + d.Zone, d.Stats.UnfilteredAgreeing, d.Stats.UnfilteredProbed) + } + + // Top failures: bucket findings by code, keep each code's most severe + // occurrence, render in topFailureOrder. 
+ byCode := map[string][]Finding{} + for _, f := range findings { + byCode[f.Code] = append(byCode[f.Code], f) + } + order := map[string]int{} + for i, c := range topFailureOrder { + order[c] = i + 1 + } + used := map[string]bool{} + for _, code := range topFailureOrder { + list, ok := byCode[code] + if !ok { + continue + } + used[code] = true + f := list[0] + tf := topFailure{ + Code: code, + Severity: string(f.Severity), + Message: f.Message, + Remedy: f.Remedy, + Count: len(list), + Class: "severity-" + string(f.Severity), + Headline: headlineFor(code), + } + v.TopFailures = append(v.TopFailures, tf) + } + // Anything else → "Other findings" + for code, list := range byCode { + if used[code] { + continue + } + for _, f := range list { + v.OtherFindings = append(v.OtherFindings, f) + } + } + sort.SliceStable(v.OtherFindings, func(i, j int) bool { + return severityRank(v.OtherFindings[i].Severity) > severityRank(v.OtherFindings[j].Severity) + }) + + // RRset rows, sorted by "name/type". 
+ keys := make([]string, 0, len(d.RRsets)) + for k := range d.RRsets { + keys = append(keys, k) + } + sort.Strings(keys) + for _, k := range keys { + rv := d.RRsets[k] + row := rrsetRow{ + Key: k, + Name: rv.Name, + Type: rv.Type, + MatchesExpected: rv.MatchesExpected, + Expected: rv.ExpectedRecords, + HasExpected: rv.Expected != "", + Agreeing: len(rv.Agreeing), + Dissenting: len(rv.Dissenting), + } + for _, g := range rv.Groups { + row.Groups = append(row.Groups, groupRow{ + Rcode: g.Rcode, + Records: g.Records, + Resolvers: g.Resolvers, + IsConsensus: g.Signature == rv.ConsensusSig, + }) + } + switch { + case rv.Expected != "" && !rv.MatchesExpected: + row.StatusClass = "pill-crit" + row.StatusLabel = "drift" + case len(rv.Groups) > 1: + row.StatusClass = "pill-warn" + row.StatusLabel = "partial" + case len(rv.Groups) == 1: + row.StatusClass = "pill-ok" + row.StatusLabel = "consensus" + default: + row.StatusClass = "pill-info" + row.StatusLabel = "no data" + } + v.RRsets = append(v.RRsets, row) + } + + // Per-region rollup. + byRegion := map[string]*regionRow{} + for _, rv := range d.Resolvers { + r, ok := byRegion[rv.Region] + if !ok { + r = ®ionRow{Region: rv.Region, Label: regionLabel(rv.Region)} + byRegion[rv.Region] = r + } + r.Resolvers++ + if rv.Reachable { + r.Reachable++ + } + if rv.Reachable && !rv.Filtered { + ok := true + for key, p := range rv.Probes { + if p == nil || p.Error != "" { + r.Errored++ + ok = false + break + } + cv := d.RRsets[key] + if cv == nil || cv.ConsensusSig == "" { + continue + } + if p.Signature != cv.ConsensusSig { + ok = false + break + } + } + if ok { + r.Agreeing++ + } else { + r.Disagreeing++ + } + } + } + for _, r := range byRegion { + v.Regions = append(v.Regions, *r) + } + sort.Slice(v.Regions, func(i, j int) bool { return v.Regions[i].Label < v.Regions[j].Label }) + + // Per-resolver rows. 
+ rids := make([]string, 0, len(d.Resolvers)) + for k := range d.Resolvers { + rids = append(rids, k) + } + sort.Strings(rids) + for _, rid := range rids { + rv := d.Resolvers[rid] + var total, n int64 + probes := []probeRow{} + pkeys := make([]string, 0, len(rv.Probes)) + for k := range rv.Probes { + pkeys = append(pkeys, k) + } + sort.Strings(pkeys) + for _, k := range pkeys { + p := rv.Probes[k] + pr := probeRow{ + Key: k, + Rcode: p.Rcode, + Records: p.Records, + MinTTL: p.MinTTL, + AD: p.AD, + Error: p.Error, + LatencyMs: p.LatencyMs, + } + if cv := d.RRsets[k]; cv != nil && cv.ConsensusSig != "" { + pr.AgreesWithConsensus = p.Signature == cv.ConsensusSig + } + if p.Error == "" { + total += p.LatencyMs + n++ + } + probes = append(probes, pr) + } + avg := int64(0) + if n > 0 { + avg = total / n + } + v.Resolvers = append(v.Resolvers, resolverRow{ + ID: rv.ID, + Name: rv.Name, + IP: rv.IP, + Region: regionLabel(rv.Region), + Transport: string(rv.Transport), + Filtered: rv.Filtered, + Reachable: rv.Reachable, + AvgMs: avg, + Probes: probes, + }) + } + + return v +} + +// Kept here (not in rules) so user-facing wording lives in one layer. 
+func headlineFor(code string) string { + switch code { + case CodeAllResolversDown: + return "No resolver could be reached" + case CodeUnexpectedSERVFAIL: + return "A resolver returns SERVFAIL" + case CodeDNSSECFailure: + return "DNSSEC validation fails" + case CodeAnswerDrift: + return "Public resolvers disagree with your authoritative answer" + case CodeUnexpectedNXDOMAIN: + return "A resolver sees your zone as non-existent" + case CodeSerialDrift: + return "SOA serial differs between resolvers" + case CodeRegionalSplit: + return "A whole region sees a different answer" + case CodePartialPropagation: + return "Change is mid-propagation" + case CodeDNSSECUnvalidated: + return "Validating resolver did not set AD" + case CodeStaleCache: + return "Resolvers still serve the previous SOA serial" + case CodeResolverRewrote: + return "Resolver rewrote the answer" + case CodeResolverUnreachable: + return "Resolver unreachable from the checker" + case CodeResolverHighLatency: + return "Slow resolver" + case CodeResolverFilteredHit: + return "Filtered resolver is blocking your zone" + case CodeNoResolvers: + return "No resolver matched the current selection" + default: + return code + } +} + +// View-layer translation only: rules own severity/code/message, report adds remedy + subject scoping. 
+func statesToFindings(states []sdk.CheckState) []Finding { + if len(states) == 0 { + return nil + } + var out []Finding + for _, st := range states { + sev, ok := severityFromStatus(st.Status) + if !ok { + continue + } + f := Finding{ + Code: st.Code, + Severity: sev, + Message: st.Message, + Remedy: remedyFor(st.Code), + } + if isResolverScopedCode(st.Code) { + f.Resolver = st.Subject + } else if st.Subject != "" && strings.Contains(st.Subject, "/") { + f.RRset = st.Subject + } + out = append(out, f) + } + sort.SliceStable(out, func(i, j int) bool { + if a, b := severityRank(out[i].Severity), severityRank(out[j].Severity); a != b { + return a > b + } + if out[i].Code != out[j].Code { + return out[i].Code < out[j].Code + } + if out[i].RRset != out[j].RRset { + return out[i].RRset < out[j].RRset + } + return out[i].Resolver < out[j].Resolver + }) + return out +} + +func severityFromStatus(s sdk.Status) (Severity, bool) { + switch s { + case sdk.StatusCrit: + return SeverityCrit, true + case sdk.StatusWarn: + return SeverityWarn, true + case sdk.StatusInfo: + return SeverityInfo, true + } + return "", false +} + +func isResolverScopedCode(code string) bool { + switch code { + case CodeResolverUnreachable, CodeResolverTimeout, CodeResolverRewrote, + CodeResolverFilteredHit, CodeResolverHighLatency, + CodeDNSSECFailure, CodeDNSSECUnvalidated: + return true + } + return false +} + +// Wording lives here, not in rules: severity is judgment, copy is presentation. 
+func remedyFor(code string) string { + switch code { + case CodeNoResolvers: + return "loosen the region filter or reset the allowlist in the checker options" + case CodeAllResolversDown: + return "retry later, or verify the checker host's outgoing UDP/53 connectivity" + case CodeSerialDrift: + return "usually transient caching right after a zone push" + case CodeStaleCache: + return "the resolvers cached the previous zone version" + case CodeDNSSECFailure: + return "check that the DS record at the parent matches the DNSKEY at the zone apex" + case CodeDNSSECUnvalidated: + return "enable DNSSEC signing at your provider to get full validation downstream" + case CodeRegionalSplit: + return "possible GeoDNS misconfiguration or regional censorship" + case CodePartialPropagation: + return "wait up to the previous TTL for the old cached answer to expire everywhere" + case CodeAnswerDrift: + return "wait for the old TTL to expire or force a flush on the affected resolvers" + case CodeUnexpectedNXDOMAIN: + return "a resolver returning NXDOMAIN while others return NOERROR usually means a poisoned cache or lame delegation" + case CodeUnexpectedSERVFAIL: + return "check DNSSEC signatures and that every authoritative NS is reachable over UDP and TCP" + case CodeResolverUnreachable: + return "the resolver might be blocking the checker's traffic, firewalled, or temporarily down" + case CodeResolverRewrote: + return "the resolver appears to rewrite answers; users relying on it will see a different zone" + case CodeResolverFilteredHit: + return "normal for a filtered resolver when the zone is on a blocklist" + case CodeResolverHighLatency: + return "usually reflects the checker-to-resolver network path" + } + return "" +} + +// severityRank orders severities for sorting; higher = more severe. 
+func severityRank(s Severity) int { + switch s { + case SeverityCrit: + return 3 + case SeverityWarn: + return 2 + case SeverityInfo: + return 1 + } + return 0 +} + +// reportFuncs exposes small helpers to the template so it can stay concise. +var reportFuncs = template.FuncMap{ + "join": func(sep string, s []string) string { return strings.Join(s, sep) }, + "len": func(s []string) int { return len(s) }, +} + +var reportTmpl = template.Must(template.New("report").Funcs(reportFuncs).Parse(reportTemplateHTML)) + +const reportTemplateHTML = ` + + + +Resolver propagation report — {{.Zone}} + + + + +

Worldwide DNS propagation — {{.Zone}}

+
Probe across public recursive resolvers; consensus compared to the zone's own authoritative answer.
+ + + +
+
{{.Stats.ReachableResolvers}} / {{.Stats.TotalResolvers}}
Resolvers reachable
+
{{.Stats.UnfilteredAgreeing}} / {{.Stats.UnfilteredProbed}}
Unfiltered agreeing
+
{{.Stats.CountriesCovered}}
Regions covered
+ {{if .Stats.FilteredProbed}}
{{.Stats.FilteredProbed}}
Filtered probed
{{end}} +
+ +{{if .TopFailures}} +

Fix these first

+{{range .TopFailures}} +
+
+ {{.Headline}} + {{.Count}}× · {{.Severity}} +
+
{{.Message}}
+ {{if .Remedy}}
What to do: {{.Remedy}}
{{end}} +
+{{end}} +{{end}} + +

Per-RRset consensus

+ + + +{{range .RRsets}} + + + + + + +{{end}} + +
RecordStatusExpected (authoritative)What resolvers see
{{.Name}}
{{.Type}}
{{.StatusLabel}}
+ {{.Agreeing}} ok · {{.Dissenting}} diff
+ {{if .HasExpected}} + {{if .Expected}}
    {{range .Expected}}
  • {{.}}
  • {{end}}
{{else}}(no data / NODATA){{end}} + {{else}} + (auth unreachable) + {{end}} +
+ {{range .Groups}} +
+ {{.Rcode}} + {{if .IsConsensus}}consensus{{end}} + {{if .Records}}
    {{range .Records}}
  • {{.}}
  • {{end}}
{{else}}(empty){{end}} +
{{len .Resolvers}} resolver(s): {{join ", " .Resolvers}}
+
+ {{end}} +
+ +

Per-region view

+ + + +{{range .Regions}} + + + + + + + +{{end}} + +
RegionReachableAgreeingDisagreeingErrored
{{.Label}}{{.Reachable}} / {{.Resolvers}}{{.Agreeing}}{{if .Disagreeing}}{{.Disagreeing}}{{else}}0{{end}}{{if .Errored}}{{.Errored}}{{else}}0{{end}}
+ +

Per-resolver details

+ + + +{{range .Resolvers}} + + + + + + + +{{end}} + +
ResolverRegionTransportAvg msAnswers
+ {{.Name}}{{if .Filtered}} filtered{{end}}
+ {{.IP}} · {{.ID}} +
{{.Region}}{{.Transport}}{{if .Reachable}}{{.AvgMs}}{{else}}unreachable{{end}} + {{range .Probes}} +
+ + {{.Key}} + + {{if .Error}}error{{else}}{{.Rcode}}{{if .AgreesWithConsensus}} · ✓{{else}} · ≠{{end}}{{end}} + + {{if .AD}}AD{{end}} + {{.LatencyMs}}ms{{if .MinTTL}} · TTL {{.MinTTL}}{{end}} + + {{if .Error}}
{{.Error}}
{{else if .Records}}
    {{range .Records}}
  • {{.}}
  • {{end}}
{{else}}(empty answer){{end}} +
+ {{end}} +
+ +{{if .OtherFindings}} +

Other findings

+ + + +{{range .OtherFindings}} + + + + + + +{{end}} + +
SeverityCodeMessageRemedy
{{.Severity}}{{.Code}}{{.Message}}{{.Remedy}}
+{{end}} + + + +` diff --git a/checker/report_test.go b/checker/report_test.go new file mode 100644 index 0000000..699accf --- /dev/null +++ b/checker/report_test.go @@ -0,0 +1,128 @@ +package checker + +import ( + "encoding/json" + "strings" + "testing" + + sdk "git.happydns.org/checker-sdk-go/checker" +) + +func TestStatesToFindings(t *testing.T) { + states := []sdk.CheckState{ + {Status: sdk.StatusOK, Code: "ok"}, + {Status: sdk.StatusInfo, Code: "i", Subject: "ex./A", Message: "info"}, + {Status: sdk.StatusWarn, Code: "w", Subject: "ex./A", Message: "warn"}, + {Status: sdk.StatusCrit, Code: "c", Subject: "ex./A", Message: "crit"}, + } + got := statesToFindings(states) + if len(got) != 3 { + t.Errorf("expected 3 findings (OK skipped), got %d: %+v", len(got), got) + } + severities := map[Severity]bool{} + for _, f := range got { + severities[f.Severity] = true + } + if !severities[SeverityCrit] || !severities[SeverityWarn] || !severities[SeverityInfo] { + t.Errorf("missing some severities: %+v", got) + } +} + +func TestSeverityFromStatus(t *testing.T) { + cases := []struct { + in sdk.Status + want Severity + wantOK bool + }{ + {sdk.StatusCrit, SeverityCrit, true}, + {sdk.StatusWarn, SeverityWarn, true}, + {sdk.StatusInfo, SeverityInfo, true}, + {sdk.StatusOK, "", false}, + } + for _, c := range cases { + got, ok := severityFromStatus(c.in) + if got != c.want || ok != c.wantOK { + t.Errorf("severityFromStatus(%v) = (%v,%v), want (%v,%v)", c.in, got, ok, c.want, c.wantOK) + } + } +} + +func TestSeverityRank(t *testing.T) { + if severityRank(SeverityCrit) <= severityRank(SeverityWarn) { + t.Errorf("crit should outrank warn") + } + if severityRank(SeverityWarn) <= severityRank(SeverityInfo) { + t.Errorf("warn should outrank info") + } +} + +func TestRemedyFor(t *testing.T) { + // known code returns a non-empty hint + if r := remedyFor(CodeAnswerDrift); r == "" { + t.Errorf("expected remedy for %q", CodeAnswerDrift) + } + // unknown code is allowed to be empty + _ = 
remedyFor("totally-bogus-code-xyz") +} + +func TestIsResolverScopedCode(t *testing.T) { + if !isResolverScopedCode(CodeResolverUnreachable) { + t.Errorf("resolver code should be scoped") + } + if isResolverScopedCode(CodeAnswerDrift) { + t.Errorf("rrset code should not be scoped") + } +} + +func TestGetHTMLReport_Smoke(t *testing.T) { + key := "ex./A" + data := &ResolverPropagationData{ + Zone: "ex.", + Names: []string{"ex."}, + Types: []string{"A"}, + Resolvers: map[string]*ResolverView{ + "a": mkResolver("a", "eu", false, true, map[string]*RRProbe{ + key: {Rcode: "NOERROR", Signature: "1.1.1.1", Records: []string{"1.1.1.1"}, LatencyMs: 30, Transport: TransportUDP}, + }), + "b": mkResolver("b", "global", false, true, map[string]*RRProbe{ + key: {Rcode: "NOERROR", Signature: "1.1.1.1", Records: []string{"1.1.1.1"}, LatencyMs: 40, Transport: TransportUDP}, + }), + }, + RRsets: map[string]*RRsetView{key: {Name: "ex.", Type: "A"}}, + Stats: Stats{TotalResolvers: 2, ReachableResolvers: 2, UnfilteredProbed: 2}, + } + raw, err := json.Marshal(data) + if err != nil { + t.Fatalf("marshal: %v", err) + } + ctx := sdk.NewReportContext(raw, nil, []sdk.CheckState{ + {Status: sdk.StatusOK, Code: "ok", Message: "fine"}, + }) + + prov := &resolverPropagationProvider{} + html, err := prov.GetHTMLReport(ctx) + if err != nil { + t.Fatalf("GetHTMLReport: %v", err) + } + for _, want := range []string{"ex.", "1.1.1.1", "Europe", "Global"} { + if !strings.Contains(html, want) { + t.Errorf("HTML report missing %q", want) + } + } +} + +func TestGetHTMLReport_BadPayload(t *testing.T) { + ctx := sdk.StaticReportContext(json.RawMessage(`not-json`)) + prov := &resolverPropagationProvider{} + if _, err := prov.GetHTMLReport(ctx); err == nil { + t.Errorf("want decode error") + } +} + +func TestGetHTMLReport_EmptyPayload(t *testing.T) { + ctx := sdk.StaticReportContext(nil) + prov := &resolverPropagationProvider{} + if _, err := prov.GetHTMLReport(ctx); err != nil { + t.Errorf("empty payload 
should not error, got %v", err) + } +} diff --git a/checker/resolvers.go b/checker/resolvers.go new file mode 100644 index 0000000..d3fcc76 --- /dev/null +++ b/checker/resolvers.go @@ -0,0 +1,174 @@ +package checker + +// Region is a coarse bucket for the report, not a geolocation claim (Anycast → "global"). +type Resolver struct { + // ID is a stable identifier, exposed in JSON/metrics. + ID string + + // Name is the human-readable provider + flavor (shown in the report). + Name string + + // IP is the plain-text UDP/TCP address (without port). + IP string + + // Region tags the resolver geographically. One of: global, na, eu, asia, + // ru, me, oceania, sa, africa. + Region string + + // Filtered marks resolvers that intentionally rewrite or block answers + // (malware / adult / ad / family filters). These are expected to differ + // from the consensus on some zones and therefore kept out of the default + // unfiltered probe set. + Filtered bool + + // DoHURL is the RFC 8484 endpoint, when the provider publishes one. + DoHURL string + + // DoTHost is the DNS-over-TLS server name (SNI target), when available. + // DoT always runs on port 853 against this same hostname. + DoTHost string +} + +// Derived from happydomain3/web/src/lib/resolver.ts; regions are best-effort from AS paths and provider docs. 
+var allResolvers = []Resolver{ + // ── Unfiltered / Anycast global ────────────────────────────────────── + {ID: "cloudflare", Name: "Cloudflare DNS", IP: "1.1.1.1", Region: "global", + DoHURL: "https://cloudflare-dns.com/dns-query", DoTHost: "cloudflare-dns.com"}, + {ID: "google", Name: "Google Public DNS", IP: "8.8.8.8", Region: "global", + DoHURL: "https://dns.google/dns-query", DoTHost: "dns.google"}, + {ID: "quad9-unfiltered", Name: "Quad9 (no blocklist)", IP: "9.9.9.10", Region: "global", + DoHURL: "https://dns10.quad9.net/dns-query", DoTHost: "dns10.quad9.net"}, + {ID: "opendns", Name: "OpenDNS", IP: "208.67.222.222", Region: "global"}, + {ID: "he", Name: "Hurricane Electric", IP: "74.82.42.42", Region: "global"}, + {ID: "dns-sb", Name: "DNS.SB", IP: "185.222.222.222", Region: "global", + DoHURL: "https://doh.dns.sb/dns-query", DoTHost: "dns.sb"}, + {ID: "adguard-unfiltered", Name: "AdGuard (non-filtering)", IP: "94.140.14.140", Region: "global", + DoHURL: "https://unfiltered.adguard-dns.com/dns-query", DoTHost: "unfiltered.adguard-dns.com"}, + + // ── North America ── + {ID: "level3", Name: "Level3", IP: "4.2.2.1", Region: "na"}, + {ID: "verisign", Name: "Verisign", IP: "64.6.64.6", Region: "na"}, + {ID: "comodo", Name: "Comodo Secure DNS", IP: "8.26.56.26", Region: "na"}, + {ID: "norton", Name: "Norton ConnectSafe", IP: "199.85.126.10", Region: "na"}, + {ID: "safeserve", Name: "Namecheap SafeServe", IP: "198.54.117.10", Region: "na"}, + {ID: "dyn", Name: "Dyn", IP: "216.146.35.35", Region: "na"}, + {ID: "neustar", Name: "Neustar / DNS Advantage", IP: "156.154.70.1", Region: "na"}, + {ID: "smartviper", Name: "SmartViper", IP: "208.76.50.50", Region: "na"}, + {ID: "alternate", Name: "Alternate DNS", IP: "23.253.163.53", Region: "na"}, + {ID: "strongdns", Name: "StrongDNS", IP: "216.131.65.63", Region: "na"}, + + // ── Europe ── + {ID: "dns-watch", Name: "DNS.WATCH (DE)", IP: "84.200.69.80", Region: "eu"}, + {ID: "freedns", Name: "FreeDNS (AT)", IP: 
"37.235.1.174", Region: "eu"}, + {ID: "freenom", Name: "Freenom World (NL)", IP: "80.80.80.80", Region: "eu"}, + {ID: "uncensored", Name: "UncensoredDNS (DK)", IP: "91.239.100.100", Region: "eu"}, + {ID: "fdn", Name: "French Data Network (FR)", IP: "80.67.169.12", Region: "eu"}, + {ID: "fooldns", Name: "FoolDNS (IT)", IP: "87.118.111.215", Region: "eu"}, + {ID: "puntcat", Name: "puntCAT (ES)", IP: "109.69.8.51", Region: "eu"}, + {ID: "opennic", Name: "OpenNIC", IP: "185.121.177.177", Region: "eu"}, + {ID: "dns4eu-unfiltered", Name: "DNS4EU (unfiltered)", IP: "86.54.11.100", Region: "eu", + DoHURL: "https://unfiltered.joindns4.eu/dns-query", DoTHost: "unfiltered.joindns4.eu"}, + {ID: "dns4all", Name: "DNS4ALL", IP: "194.0.5.3", Region: "eu"}, + + // ── Asia (East & SE) ── + {ID: "ntt-jp", Name: "NTT (JP)", IP: "129.250.35.250", Region: "asia"}, + {ID: "alidns", Name: "AliDNS (CN)", IP: "223.5.5.5", Region: "asia"}, + {ID: "cnnic-sdns", Name: "CNNIC SDNS (CN)", IP: "1.2.4.8", Region: "asia"}, + {ID: "dnspod", Name: "DNSPod (CN)", IP: "119.29.29.29", Region: "asia"}, + {ID: "onedns", Name: "oneDNS (CN)", IP: "114.215.126.16", Region: "asia"}, + {ID: "cloudxns", Name: "CloudXNS (CN)", IP: "124.251.124.251", Region: "asia"}, + {ID: "114dns", Name: "114DNS (CN)", IP: "114.114.114.114", Region: "asia"}, + {ID: "dnspai", Name: "DNSpai (CN)", IP: "101.226.4.6", Region: "asia"}, + {ID: "quad101", Name: "Quad101 (TW)", IP: "101.101.101.101", Region: "asia"}, + {ID: "hinet", Name: "HiNet (TW)", IP: "168.95.1.1", Region: "asia"}, + + // ── Russia ── + {ID: "yandex", Name: "Yandex.DNS", IP: "77.88.8.8", Region: "ru", + DoTHost: "common.dot.dns.yandex.net"}, + + // ── Middle East ── + {ID: "greenteam", Name: "GreenTeam DNS (IL)", IP: "81.218.119.11", Region: "me"}, + + // ── Filtered (opt-in) ───────────────────────────────────────────────── + {ID: "cloudflare-malware", Name: "Cloudflare (malware blocking)", IP: "1.1.1.2", Region: "global", Filtered: true, + DoHURL: 
"https://security.cloudflare-dns.com/dns-query", DoTHost: "security.cloudflare-dns.com"}, + {ID: "cloudflare-family", Name: "Cloudflare (malware + adult)", IP: "1.1.1.3", Region: "global", Filtered: true, + DoHURL: "https://family.cloudflare-dns.com/dns-query", DoTHost: "family.cloudflare-dns.com"}, + {ID: "quad9", Name: "Quad9 (blocklist)", IP: "9.9.9.9", Region: "global", Filtered: true, + DoHURL: "https://dns.quad9.net/dns-query", DoTHost: "dns.quad9.net"}, + {ID: "adguard", Name: "AdGuard (default)", IP: "94.140.14.14", Region: "global", Filtered: true, + DoHURL: "https://dns.adguard-dns.com/dns-query", DoTHost: "dns.adguard-dns.com"}, + {ID: "adguard-family", Name: "AdGuard (family protection)", IP: "94.140.14.15", Region: "global", Filtered: true, + DoHURL: "https://family.adguard-dns.com/dns-query", DoTHost: "family.adguard-dns.com"}, + {ID: "yandex-safe", Name: "Yandex Safe", IP: "77.88.8.2", Region: "ru", Filtered: true, + DoTHost: "common.dot.dns.yandex.net"}, + {ID: "yandex-family", Name: "Yandex Family", IP: "77.88.8.3", Region: "ru", Filtered: true, + DoTHost: "common.dot.dns.yandex.net"}, + {ID: "dns-advantage-threat", Name: "DNS Advantage Threat", IP: "156.154.70.2", Region: "na", Filtered: true}, + {ID: "dns-advantage-family", Name: "DNS Advantage Family", IP: "156.154.70.3", Region: "na", Filtered: true}, + {ID: "dns-advantage-business", Name: "DNS Advantage Business", IP: "156.154.70.4", Region: "na", Filtered: true}, + {ID: "cleanbrowsing-family", Name: "CleanBrowsing Family", IP: "185.228.168.168", Region: "global", Filtered: true, + DoHURL: "https://doh.cleanbrowsing.org/doh/family-filter/", DoTHost: "family-filter-dns.cleanbrowsing.org"}, + {ID: "cleanbrowsing-adult", Name: "CleanBrowsing Adult", IP: "185.228.168.10", Region: "global", Filtered: true, + DoHURL: "https://doh.cleanbrowsing.org/doh/adult-filter/", DoTHost: "adult-filter-dns.cleanbrowsing.org"}, + {ID: "dns4eu-protective", Name: "DNS4EU Protective", IP: "86.54.11.1", Region: "eu", 
Filtered: true, + DoHURL: "https://protective.joindns4.eu/dns-query", DoTHost: "protective.joindns4.eu"}, + {ID: "dns4eu-child", Name: "DNS4EU Child Protection", IP: "86.54.11.12", Region: "eu", Filtered: true, + DoHURL: "https://child.joindns4.eu/dns-query", DoTHost: "child.joindns4.eu"}, + {ID: "dns4eu-adblock", Name: "DNS4EU Ad-blocking", IP: "86.54.11.13", Region: "eu", Filtered: true, + DoHURL: "https://ads.joindns4.eu/dns-query", DoTHost: "ads.joindns4.eu"}, +} + +// A non-empty allowlist takes precedence; filter and region knobs are then ignored. +func selectedResolvers(includeFiltered bool, region string, allowlist []string) []Resolver { + if len(allowlist) > 0 { + allow := make(map[string]bool, len(allowlist)) + for _, a := range allowlist { + allow[a] = true + } + var out []Resolver + for _, r := range allResolvers { + if allow[r.ID] || allow[r.IP] { + out = append(out, r) + } + } + return out + } + + var out []Resolver + for _, r := range allResolvers { + if r.Filtered && !includeFiltered { + continue + } + if region != "" && region != "all" && region != r.Region { + continue + } + out = append(out, r) + } + return out +} + +func regionLabel(region string) string { + switch region { + case "global": + return "Global / Anycast" + case "na": + return "North America" + case "eu": + return "Europe" + case "asia": + return "Asia" + case "ru": + return "Russia" + case "me": + return "Middle East" + case "oceania": + return "Oceania" + case "sa": + return "South America" + case "africa": + return "Africa" + default: + return region + } +} diff --git a/checker/resolvers_test.go b/checker/resolvers_test.go new file mode 100644 index 0000000..1a9013e --- /dev/null +++ b/checker/resolvers_test.go @@ -0,0 +1,105 @@ +package checker + +import ( + "strings" + "testing" +) + +func TestSelectedResolvers_DefaultExcludesFiltered(t *testing.T) { + out := selectedResolvers(false, "all", nil) + if len(out) == 0 { + t.Fatalf("default selection is empty") + } + for _, r := 
range out { + if r.Filtered { + t.Errorf("filtered resolver %q leaked into default selection", r.ID) + } + } +} + +func TestSelectedResolvers_IncludeFiltered(t *testing.T) { + withF := selectedResolvers(true, "all", nil) + withoutF := selectedResolvers(false, "all", nil) + if len(withF) <= len(withoutF) { + t.Errorf("includeFiltered=true should add resolvers, got %d vs %d", len(withF), len(withoutF)) + } +} + +func TestSelectedResolvers_RegionFilter(t *testing.T) { + out := selectedResolvers(false, "eu", nil) + if len(out) == 0 { + t.Fatalf("eu selection is empty") + } + for _, r := range out { + if r.Region != "eu" { + t.Errorf("non-eu resolver %q (%s) leaked in", r.ID, r.Region) + } + } +} + +func TestSelectedResolvers_AllowlistByID(t *testing.T) { + out := selectedResolvers(false, "all", []string{"cloudflare", "9.9.9.10"}) + ids := make(map[string]bool) + for _, r := range out { + ids[r.ID] = true + } + if !ids["cloudflare"] || !ids["quad9-unfiltered"] { + t.Errorf("allowlist failed: %v", ids) + } + if len(out) != 2 { + t.Errorf("expected exactly 2 resolvers, got %d", len(out)) + } +} + +func TestSelectedResolvers_AllowlistOverridesFilteredAndRegion(t *testing.T) { + // quad9 is Filtered + global; allowlist must still pick it. + out := selectedResolvers(false, "eu", []string{"quad9"}) + if len(out) != 1 || out[0].ID != "quad9" { + t.Errorf("allowlist should override filtered/region, got %v", out) + } +} + +func TestRegionLabel(t *testing.T) { + cases := map[string]string{ + "global": "Global / Anycast", + "na": "North America", + "eu": "Europe", + "asia": "Asia", + "ru": "Russia", + "me": "Middle East", + "oceania": "Oceania", + "sa": "South America", + "africa": "Africa", + "unknown": "unknown", + "": "", + } + for in, want := range cases { + if got := regionLabel(in); got != want { + t.Errorf("regionLabel(%q) = %q, want %q", in, got, want) + } + } +} + +func TestAllResolversCatalogIntegrity(t *testing.T) { + // Catch typos / duplicates in the static catalog. 
+ ids := map[string]bool{} + for _, r := range allResolvers { + if r.ID == "" { + t.Errorf("resolver with empty ID: %+v", r) + } + if r.IP == "" { + t.Errorf("resolver %q has empty IP", r.ID) + } + if strings.Contains(r.ID, "|") { + t.Errorf("resolver ID %q contains reserved separator '|'", r.ID) + } + if ids[r.ID] { + t.Errorf("duplicate resolver ID %q", r.ID) + } + ids[r.ID] = true + if r.DoTHost == "" && r.DoHURL != "" { + // DoH-only is acceptable but log it for visibility. + t.Logf("resolver %q has DoH but no DoT", r.ID) + } + } +} diff --git a/checker/rules.go b/checker/rules.go new file mode 100644 index 0000000..9ef20bf --- /dev/null +++ b/checker/rules.go @@ -0,0 +1,59 @@ +package checker + +import ( + "context" + "fmt" + + sdk "git.happydns.org/checker-sdk-go/checker" +) + +// Rules returns every CheckRule exposed by the resolver-propagation checker. +// Each rule covers one concern so the UI and metrics consumers can reason +// about them independently. +func Rules() []sdk.CheckRule { + return []sdk.CheckRule{ + &resolverSelectionRule{}, + &resolversReachableRule{}, + &consensusRule{}, + &authoritativeMatchRule{}, + &nxdomainRule{}, + &servfailRule{}, + ®ionalSplitRule{}, + &serialDriftRule{}, + &staleCacheRule{}, + &dnssecRule{}, + &resolverLatencyRule{}, + &filteredHitRule{}, + } +} + +// loadData fetches the observation and returns an error state on failure. +// It also runs deriveView so every rule sees a ready-to-use consensus. 
+func loadData(ctx context.Context, obs sdk.ObservationGetter) (*ResolverPropagationData, *sdk.CheckState) { + var data ResolverPropagationData + if err := obs.Get(ctx, ObservationKeyResolverPropagation, &data); err != nil { + return nil, &sdk.CheckState{ + Status: sdk.StatusError, + Message: fmt.Sprintf("failed to load resolver-propagation observation: %v", err), + Code: "resolver_propagation_error", + } + } + deriveView(&data) + return &data, nil +} + +func passState(code, message string) sdk.CheckState { + return sdk.CheckState{Status: sdk.StatusOK, Message: message, Code: code} +} + +func infoState(code, subject, message string) sdk.CheckState { + return sdk.CheckState{Status: sdk.StatusInfo, Message: message, Code: code, Subject: subject} +} + +func warnState(code, subject, message string) sdk.CheckState { + return sdk.CheckState{Status: sdk.StatusWarn, Message: message, Code: code, Subject: subject} +} + +func critState(code, subject, message string) sdk.CheckState { + return sdk.CheckState{Status: sdk.StatusCrit, Message: message, Code: code, Subject: subject} +} diff --git a/checker/rules_consensus.go b/checker/rules_consensus.go new file mode 100644 index 0000000..eb45834 --- /dev/null +++ b/checker/rules_consensus.go @@ -0,0 +1,245 @@ +package checker + +import ( + "context" + "fmt" + "sort" + + sdk "git.happydns.org/checker-sdk-go/checker" +) + +// consensusRule emits one state per RRset summarising how much of the probed +// resolver set agrees on its answer. It covers the "partial propagation" +// case (several distinct NOERROR signatures observed). +type consensusRule struct{} + +func (r *consensusRule) Name() string { return "resolver_propagation.consensus" } +func (r *consensusRule) Description() string { + return "Checks that public resolvers agree on a single answer for each probed RRset." 
+} +func (r *consensusRule) Evaluate(ctx context.Context, obs sdk.ObservationGetter, _ sdk.CheckerOptions) []sdk.CheckState { + data, errSt := loadData(ctx, obs) + if errSt != nil { + return []sdk.CheckState{*errSt} + } + if len(data.Resolvers) == 0 || len(data.RRsets) == 0 { + return []sdk.CheckState{{Status: sdk.StatusUnknown, + Code: "resolver_propagation.consensus.skipped", + Message: "no resolver probes available"}} + } + + keys := sortedRRsetKeys(data) + var states []sdk.CheckState + for _, key := range keys { + v := data.RRsets[key] + + unfilteredNOERRORSigs := map[string]bool{} + for _, g := range v.Groups { + if g.Rcode != "NOERROR" { + continue + } + for _, rid := range g.Resolvers { + rv := data.Resolvers[rid] + if rv != nil && !rv.Filtered { + unfilteredNOERRORSigs[g.Signature] = true + break + } + } + } + + switch { + case v.ConsensusSig == "" && len(v.Groups) == 0: + states = append(states, infoState("resolver_propagation.consensus.no_data", key, + fmt.Sprintf("no resolver returned a usable answer for %s", key))) + case len(unfilteredNOERRORSigs) > 1: + states = append(states, warnState(CodePartialPropagation, key, + fmt.Sprintf("%d distinct answers seen across public resolvers for %s, change is mid-propagation", + len(unfilteredNOERRORSigs), key))) + default: + states = append(states, sdk.CheckState{ + Status: sdk.StatusOK, + Code: "resolver_propagation.consensus.ok", + Subject: key, + Message: fmt.Sprintf("all %d probed resolver(s) agree on %s", len(v.Agreeing), key), + }) + } + } + return states +} + +// authoritativeMatchRule checks the consensus against the answer served by +// the zone's own authoritative servers. +type authoritativeMatchRule struct{} + +func (r *authoritativeMatchRule) Name() string { return "resolver_propagation.matches_authoritative" } +func (r *authoritativeMatchRule) Description() string { + return "Checks that the public consensus matches the answer served by the zone's authoritative nameservers." 
+} +func (r *authoritativeMatchRule) Evaluate(ctx context.Context, obs sdk.ObservationGetter, _ sdk.CheckerOptions) []sdk.CheckState { + data, errSt := loadData(ctx, obs) + if errSt != nil { + return []sdk.CheckState{*errSt} + } + + var states []sdk.CheckState + anyExpected := false + for _, key := range sortedRRsetKeys(data) { + v := data.RRsets[key] + if v.Expected == "" { + continue + } + anyExpected = true + switch { + case v.ConsensusSig == "": + states = append(states, critState("resolver_propagation.matches_authoritative.no_consensus", key, + fmt.Sprintf("no public resolver returned a usable answer for %s (authoritative answer is known)", key))) + case !v.MatchesExpected: + states = append(states, critState(CodeAnswerDrift, key, + fmt.Sprintf("consensus of public resolvers for %s differs from the authoritative answer, wait for TTL expiry or force a flush", key))) + default: + states = append(states, sdk.CheckState{ + Status: sdk.StatusOK, Code: "resolver_propagation.matches_authoritative.ok", Subject: key, + Message: fmt.Sprintf("public consensus for %s matches the authoritative answer", key), + }) + } + } + if !anyExpected { + return []sdk.CheckState{{Status: sdk.StatusUnknown, + Code: "resolver_propagation.matches_authoritative.skipped", + Message: "authoritative nameservers were unreachable; cannot compare consensus to ground truth"}} + } + return states +} + +// nxdomainRule flags RRsets returning NXDOMAIN on some (but not all) resolvers. +type nxdomainRule struct{} + +func (r *nxdomainRule) Name() string { return "resolver_propagation.nxdomain" } +func (r *nxdomainRule) Description() string { + return "Flags RRsets for which some resolvers return NXDOMAIN while others return NOERROR." 
+} +func (r *nxdomainRule) Evaluate(ctx context.Context, obs sdk.ObservationGetter, _ sdk.CheckerOptions) []sdk.CheckState { + data, errSt := loadData(ctx, obs) + if errSt != nil { + return []sdk.CheckState{*errSt} + } + var states []sdk.CheckState + for _, key := range sortedRRsetKeys(data) { + v := data.RRsets[key] + var nxList []string + for _, g := range v.Groups { + if g.Rcode == "NXDOMAIN" { + nxList = append(nxList, g.Resolvers...) + } + } + if len(nxList) > 0 && len(nxList) < len(data.Resolvers) { + states = append(states, critState(CodeUnexpectedNXDOMAIN, key, + fmt.Sprintf("%s resolved as NXDOMAIN on %d resolver(s): %s", key, len(nxList), firstN(nxList, 6)))) + } + } + if len(states) == 0 { + return []sdk.CheckState{passState("resolver_propagation.nxdomain.ok", + "No resolver unexpectedly returns NXDOMAIN.")} + } + return states +} + +// servfailRule flags RRsets returning SERVFAIL on any resolver. +type servfailRule struct{} + +func (r *servfailRule) Name() string { return "resolver_propagation.servfail" } +func (r *servfailRule) Description() string { + return "Flags RRsets for which any resolver returns SERVFAIL (usually DNSSEC or reachability failure)." +} +func (r *servfailRule) Evaluate(ctx context.Context, obs sdk.ObservationGetter, _ sdk.CheckerOptions) []sdk.CheckState { + data, errSt := loadData(ctx, obs) + if errSt != nil { + return []sdk.CheckState{*errSt} + } + var states []sdk.CheckState + for _, key := range sortedRRsetKeys(data) { + v := data.RRsets[key] + var sfList []string + for _, g := range v.Groups { + if g.Rcode == "SERVFAIL" { + sfList = append(sfList, g.Resolvers...) 
+ } + } + if len(sfList) > 0 { + states = append(states, critState(CodeUnexpectedSERVFAIL, key, + fmt.Sprintf("%s returned SERVFAIL on %d resolver(s): %s", key, len(sfList), firstN(sfList, 6)))) + } + } + if len(states) == 0 { + return []sdk.CheckState{passState("resolver_propagation.servfail.ok", + "No resolver returns SERVFAIL.")} + } + return states +} + +// regionalSplitRule flags regions in which all resolvers agree on an answer +// that diverges from the global consensus. +type regionalSplitRule struct{} + +func (r *regionalSplitRule) Name() string { return "resolver_propagation.regional_split" } +func (r *regionalSplitRule) Description() string { + return "Flags regions in which every resolver agrees on an answer that differs from the global consensus." +} +func (r *regionalSplitRule) Evaluate(ctx context.Context, obs sdk.ObservationGetter, _ sdk.CheckerOptions) []sdk.CheckState { + data, errSt := loadData(ctx, obs) + if errSt != nil { + return []sdk.CheckState{*errSt} + } + + var states []sdk.CheckState + for _, key := range sortedRRsetKeys(data) { + v := data.RRsets[key] + region2sig := map[string]map[string]int{} + for _, g := range v.Groups { + for _, rid := range g.Resolvers { + rv := data.Resolvers[rid] + if rv == nil || rv.Filtered { + continue + } + if region2sig[rv.Region] == nil { + region2sig[rv.Region] = map[string]int{} + } + region2sig[rv.Region][g.Signature]++ + } + } + regions := make([]string, 0, len(region2sig)) + for r := range region2sig { + regions = append(regions, r) + } + sort.Strings(regions) + for _, region := range regions { + sigs := region2sig[region] + if len(sigs) != 1 { + continue + } + var only string + for s := range sigs { + only = s + } + if only != "" && only != v.ConsensusSig { + states = append(states, warnState(CodeRegionalSplit, region+" "+key, + fmt.Sprintf("all %s resolvers agree on an answer that differs from the global consensus for %s", + regionLabel(region), key))) + } + } + } + if len(states) == 0 { + return 
[]sdk.CheckState{passState("resolver_propagation.regional_split.ok", + "No region is split from the global consensus.")} + } + return states +} + +func sortedRRsetKeys(data *ResolverPropagationData) []string { + keys := make([]string, 0, len(data.RRsets)) + for k := range data.RRsets { + keys = append(keys, k) + } + sort.Strings(keys) + return keys +} diff --git a/checker/rules_resolvers.go b/checker/rules_resolvers.go new file mode 100644 index 0000000..8fa8b17 --- /dev/null +++ b/checker/rules_resolvers.go @@ -0,0 +1,139 @@ +package checker + +import ( + "context" + "fmt" + + sdk "git.happydns.org/checker-sdk-go/checker" +) + +// resolverSelectionRule flags an empty selection (nothing to probe). +type resolverSelectionRule struct{} + +func (r *resolverSelectionRule) Name() string { return "resolver_propagation.selection" } +func (r *resolverSelectionRule) Description() string { + return "Checks that the current option set selects at least one public resolver." +} +func (r *resolverSelectionRule) Evaluate(ctx context.Context, obs sdk.ObservationGetter, _ sdk.CheckerOptions) []sdk.CheckState { + data, errSt := loadData(ctx, obs) + if errSt != nil { + return []sdk.CheckState{*errSt} + } + if len(data.Resolvers) == 0 { + return []sdk.CheckState{critState(CodeNoResolvers, data.Zone, + "no resolvers match the current selection (region / filtered / allowlist), loosen the region filter or reset the allowlist")} + } + return []sdk.CheckState{passState("resolver_propagation.selection.ok", + fmt.Sprintf("%d resolver(s) selected for probing", len(data.Resolvers)))} +} + +// resolversReachableRule flags the "no resolver answered" case. +type resolversReachableRule struct{} + +func (r *resolversReachableRule) Name() string { return "resolver_propagation.reachable" } +func (r *resolversReachableRule) Description() string { + return "Checks that at least one selected resolver answered a query (detects a checker host with no DNS connectivity)." 
+} +func (r *resolversReachableRule) Evaluate(ctx context.Context, obs sdk.ObservationGetter, _ sdk.CheckerOptions) []sdk.CheckState { + data, errSt := loadData(ctx, obs) + if errSt != nil { + return []sdk.CheckState{*errSt} + } + if len(data.Resolvers) == 0 { + return []sdk.CheckState{{Status: sdk.StatusUnknown, Code: "resolver_propagation.reachable.skipped", + Message: "no resolver in selection"}} + } + for _, rv := range data.Resolvers { + if rv.Reachable { + return []sdk.CheckState{passState("resolver_propagation.reachable.ok", + fmt.Sprintf("%d/%d resolver(s) answered at least one query", + data.Stats.ReachableResolvers, data.Stats.TotalResolvers))} + } + } + return []sdk.CheckState{critState(CodeAllResolversDown, data.Zone, + "no public resolver answered, the checker host may be offline, or DNS traffic is blocked on its network")} +} + +// resolverLatencyRule flags resolvers with high average latency. +type resolverLatencyRule struct{} + +func (r *resolverLatencyRule) Name() string { return "resolver_propagation.latency" } +func (r *resolverLatencyRule) Description() string { + return "Flags resolvers whose average response time exceeds the configured threshold." 
+} +func (r *resolverLatencyRule) Evaluate(ctx context.Context, obs sdk.ObservationGetter, opts sdk.CheckerOptions) []sdk.CheckState { + data, errSt := loadData(ctx, obs) + if errSt != nil { + return []sdk.CheckState{*errSt} + } + threshold := int64(sdk.GetIntOption(opts, "latencyThresholdMs", 500)) + + var states []sdk.CheckState + for _, rv := range data.Resolvers { + if !rv.Reachable { + states = append(states, warnState(CodeResolverUnreachable, rv.ID, + fmt.Sprintf("resolver %s (%s, %s) did not answer any query", rv.Name, rv.IP, rv.Transport))) + continue + } + var total, n int64 + for _, p := range rv.Probes { + if p.Error != "" { + continue + } + total += p.LatencyMs + n++ + } + if n > 0 { + avg := total / n + if avg > threshold { + states = append(states, infoState(CodeResolverHighLatency, rv.ID, + fmt.Sprintf("%s answered in %d ms on average (threshold %d ms)", rv.Name, avg, threshold))) + } + } + } + + if len(states) == 0 { + return []sdk.CheckState{passState("resolver_propagation.latency.ok", + "All reachable resolvers respond within the latency threshold.")} + } + return states +} + +// filteredHitRule notes when a filtered resolver returns a different answer +// than the consensus (i.e. a likely blocklist hit). +type filteredHitRule struct{} + +func (r *filteredHitRule) Name() string { return "resolver_propagation.filtered_hit" } +func (r *filteredHitRule) Description() string { + return "Reports filtered resolvers returning a different answer than the consensus (typical blocklist behaviour)." 
+} +func (r *filteredHitRule) Evaluate(ctx context.Context, obs sdk.ObservationGetter, _ sdk.CheckerOptions) []sdk.CheckState { + data, errSt := loadData(ctx, obs) + if errSt != nil { + return []sdk.CheckState{*errSt} + } + var states []sdk.CheckState + for _, rv := range data.Resolvers { + if !rv.Filtered { + continue + } + for key, p := range rv.Probes { + if p == nil || p.Error != "" || p.Rcode != "NOERROR" { + continue + } + rv2 := data.RRsets[key] + if rv2 == nil || rv2.ConsensusSig == "" { + continue + } + if p.Signature != rv2.ConsensusSig { + states = append(states, infoState(CodeResolverFilteredHit, rv.ID+" "+key, + fmt.Sprintf("%s (filtered) returned a different answer than the consensus for %s, likely a blocklist hit", rv.Name, key))) + } + } + } + if len(states) == 0 { + return []sdk.CheckState{passState("resolver_propagation.filtered_hit.ok", + "No filtered resolver deviates from the consensus (no blocklist hit detected).")} + } + return states +} diff --git a/checker/rules_soa.go b/checker/rules_soa.go new file mode 100644 index 0000000..0d21840 --- /dev/null +++ b/checker/rules_soa.go @@ -0,0 +1,144 @@ +package checker + +import ( + "context" + "fmt" + "sort" + "strings" + + sdk "git.happydns.org/checker-sdk-go/checker" +) + +// serialDriftRule flags disagreement between resolvers on the SOA serial. +type serialDriftRule struct{} + +func (r *serialDriftRule) Name() string { return "resolver_propagation.serial_drift" } +func (r *serialDriftRule) Description() string { + return "Flags disagreement on the SOA serial across unfiltered resolvers." 
+} +func (r *serialDriftRule) Evaluate(ctx context.Context, obs sdk.ObservationGetter, _ sdk.CheckerOptions) []sdk.CheckState { + data, errSt := loadData(ctx, obs) + if errSt != nil { + return []sdk.CheckState{*errSt} + } + soaKey := rrsetKey(data.Zone, "SOA") + if data.RRsets[soaKey] == nil { + return []sdk.CheckState{{Status: sdk.StatusUnknown, + Code: "resolver_propagation.serial_drift.skipped", + Message: "SOA was not probed"}} + } + serials := map[uint32][]string{} + for _, rv := range data.Resolvers { + if rv.Filtered { + continue + } + p := rv.Probes[soaKey] + if p == nil || p.Error != "" || p.Rcode != "NOERROR" { + continue + } + if s := extractSerial(p.Records); s != 0 { + serials[s] = append(serials[s], rv.ID) + } + } + if len(serials) < 2 { + return []sdk.CheckState{passState("resolver_propagation.serial_drift.ok", + "SOA serial is consistent across unfiltered resolvers.")} + } + var parts []string + for s, rs := range serials { + sort.Strings(rs) + parts = append(parts, fmt.Sprintf("serial %d on %s", s, firstN(rs, 6))) + } + sort.Strings(parts) + return []sdk.CheckState{warnState(CodeSerialDrift, soaKey, + "SOA serial differs across resolvers, "+strings.Join(parts, "; "))} +} + +// staleCacheRule flags resolvers still serving a serial below the declared one. +type staleCacheRule struct{} + +func (r *staleCacheRule) Name() string { return "resolver_propagation.stale_cache" } +func (r *staleCacheRule) Description() string { + return "Flags resolvers still serving an SOA serial below the one saved by happyDomain." 
+} +func (r *staleCacheRule) Evaluate(ctx context.Context, obs sdk.ObservationGetter, _ sdk.CheckerOptions) []sdk.CheckState { + data, errSt := loadData(ctx, obs) + if errSt != nil { + return []sdk.CheckState{*errSt} + } + if data.DeclaredSerial == 0 { + return []sdk.CheckState{{Status: sdk.StatusUnknown, + Code: "resolver_propagation.stale_cache.skipped", + Message: "no declared SOA serial available for comparison"}} + } + soaKey := rrsetKey(data.Zone, "SOA") + if data.RRsets[soaKey] == nil { + return []sdk.CheckState{{Status: sdk.StatusUnknown, + Code: "resolver_propagation.stale_cache.skipped", + Message: "SOA was not probed"}} + } + var below []string + for _, rv := range data.Resolvers { + if rv.Filtered { + continue + } + p := rv.Probes[soaKey] + if p == nil || p.Error != "" || p.Rcode != "NOERROR" { + continue + } + s := extractSerial(p.Records) + if s != 0 && s < data.DeclaredSerial { + below = append(below, rv.ID) + } + } + if len(below) == 0 { + return []sdk.CheckState{passState("resolver_propagation.stale_cache.ok", + "No resolver is still serving an outdated SOA serial.")} + } + sort.Strings(below) + return []sdk.CheckState{infoState(CodeStaleCache, soaKey, + fmt.Sprintf("%d resolver(s) still return a serial below the declared one (%d): %s", + len(below), data.DeclaredSerial, firstN(below, 6)))} +} + +// dnssecRule flags DNSSEC failures (SERVFAIL or missing AD) at the zone apex +// on resolvers known to validate. +type dnssecRule struct{} + +func (r *dnssecRule) Name() string { return "resolver_propagation.dnssec" } +func (r *dnssecRule) Description() string { + return "Checks that validating resolvers successfully validate the zone's DNSSEC chain." 
+} +func (r *dnssecRule) Evaluate(ctx context.Context, obs sdk.ObservationGetter, _ sdk.CheckerOptions) []sdk.CheckState { + data, errSt := loadData(ctx, obs) + if errSt != nil { + return []sdk.CheckState{*errSt} + } + soaKey := rrsetKey(data.Zone, "SOA") + + var states []sdk.CheckState + for _, rv := range data.Resolvers { + if rv.Filtered || !isValidatingResolver(rv.ID) { + continue + } + soa := rv.Probes[soaKey] + if soa == nil || soa.Error != "" { + continue + } + switch soa.Rcode { + case "SERVFAIL": + states = append(states, critState(CodeDNSSECFailure, rv.ID, + fmt.Sprintf("%s returned SERVFAIL for %s, typically a broken DNSSEC chain", rv.Name, data.Zone))) + case "NOERROR": + if !soa.AD { + states = append(states, infoState(CodeDNSSECUnvalidated, rv.ID, + fmt.Sprintf("%s did not set AD=1 for %s, zone may not be DNSSEC-signed, or signature is broken", rv.Name, data.Zone))) + } + } + } + if len(states) == 0 { + return []sdk.CheckState{passState("resolver_propagation.dnssec.ok", + "Validating resolvers report no DNSSEC issue.")} + } + return states +} diff --git a/checker/rules_test.go b/checker/rules_test.go new file mode 100644 index 0000000..e14d999 --- /dev/null +++ b/checker/rules_test.go @@ -0,0 +1,348 @@ +package checker + +import ( + "context" + "errors" + "strings" + "testing" + + sdk "git.happydns.org/checker-sdk-go/checker" +) + +func TestRules_AllAreUniqueAndNamed(t *testing.T) { + seen := map[string]bool{} + for _, r := range Rules() { + if r.Name() == "" { + t.Errorf("rule with empty name: %T", r) + } + if r.Description() == "" { + t.Errorf("rule %s has empty description", r.Name()) + } + if seen[r.Name()] { + t.Errorf("duplicate rule name: %s", r.Name()) + } + seen[r.Name()] = true + } + if len(seen) < 10 { + t.Errorf("expected many rules, got %d", len(seen)) + } +} + +func TestLoadData_ObsError(t *testing.T) { + obs := &errObs{err: errors.New("boom")} + data, st := loadData(context.Background(), obs) + if data != nil { + t.Errorf("data should 
be nil on error") + } + if st == nil || st.Status != sdk.StatusError { + t.Errorf("want error state, got %+v", st) + } +} + +// runRule is a tiny helper to evaluate a CheckRule with a payload. +func runRule(t *testing.T, r sdk.CheckRule, data *ResolverPropagationData, opts sdk.CheckerOptions) []sdk.CheckState { + t.Helper() + return r.Evaluate(context.Background(), newFakeObs(data), opts) +} + +func TestResolverSelectionRule(t *testing.T) { + // Empty resolver map → crit. + st := runRule(t, &resolverSelectionRule{}, &ResolverPropagationData{Zone: "ex."}, nil) + if len(st) != 1 || st[0].Status != sdk.StatusCrit || st[0].Code != CodeNoResolvers { + t.Errorf("empty: %+v", st) + } + + // Non-empty → ok. + data := &ResolverPropagationData{Resolvers: map[string]*ResolverView{"a": {ID: "a"}}} + st = runRule(t, &resolverSelectionRule{}, data, nil) + if len(st) != 1 || st[0].Status != sdk.StatusOK { + t.Errorf("ok: %+v", st) + } +} + +func TestResolversReachableRule(t *testing.T) { + // No resolvers → unknown. + st := runRule(t, &resolversReachableRule{}, &ResolverPropagationData{}, nil) + if len(st) != 1 || st[0].Status != sdk.StatusUnknown { + t.Errorf("empty: %+v", st) + } + + // All unreachable → crit. + data := &ResolverPropagationData{ + Zone: "ex.", + Resolvers: map[string]*ResolverView{ + "a": {ID: "a", Reachable: false}, + }, + } + st = runRule(t, &resolversReachableRule{}, data, nil) + if len(st) != 1 || st[0].Status != sdk.StatusCrit || st[0].Code != CodeAllResolversDown { + t.Errorf("all-down: %+v", st) + } + + // One reachable → ok. 
+ data.Resolvers["a"].Reachable = true + data.Stats.ReachableResolvers = 1 + data.Stats.TotalResolvers = 1 + st = runRule(t, &resolversReachableRule{}, data, nil) + if len(st) != 1 || st[0].Status != sdk.StatusOK { + t.Errorf("reach: %+v", st) + } +} + +func TestConsensusRule_PartialPropagation(t *testing.T) { + key := "ex./A" + data := &ResolverPropagationData{ + Resolvers: map[string]*ResolverView{ + "a": mkResolver("a", "eu", false, true, map[string]*RRProbe{key: mkProbe("NOERROR", "1.1.1.1")}), + "b": mkResolver("b", "na", false, true, map[string]*RRProbe{key: mkProbe("NOERROR", "9.9.9.9")}), + }, + RRsets: map[string]*RRsetView{key: {Name: "ex.", Type: "A"}}, + } + st := runRule(t, &consensusRule{}, data, nil) + codes := statesByCode(st) + if _, ok := codes[CodePartialPropagation]; !ok { + t.Errorf("want partial propagation, got %+v", st) + } +} + +func TestConsensusRule_AllAgree(t *testing.T) { + key := "ex./A" + data := &ResolverPropagationData{ + Resolvers: map[string]*ResolverView{ + "a": mkResolver("a", "eu", false, true, map[string]*RRProbe{key: mkProbe("NOERROR", "1.1.1.1")}), + "b": mkResolver("b", "na", false, true, map[string]*RRProbe{key: mkProbe("NOERROR", "1.1.1.1")}), + }, + RRsets: map[string]*RRsetView{key: {Name: "ex.", Type: "A"}}, + } + st := runRule(t, &consensusRule{}, data, nil) + if len(st) != 1 || st[0].Status != sdk.StatusOK { + t.Errorf("want OK, got %+v", st) + } +} + +func TestAuthoritativeMatchRule(t *testing.T) { + key := "ex./A" + mkData := func(expected, returned string) *ResolverPropagationData { + return &ResolverPropagationData{ + Resolvers: map[string]*ResolverView{ + "a": mkResolver("a", "eu", false, true, map[string]*RRProbe{key: mkProbe("NOERROR", returned)}), + }, + RRsets: map[string]*RRsetView{key: {Name: "ex.", Type: "A", Expected: expected}}, + } + } + + // Match. 
+ st := runRule(t, &authoritativeMatchRule{}, mkData("1.1.1.1", "1.1.1.1"), nil) + if len(st) != 1 || st[0].Status != sdk.StatusOK { + t.Errorf("match: %+v", st) + } + + // Drift. + st = runRule(t, &authoritativeMatchRule{}, mkData("1.1.1.1", "9.9.9.9"), nil) + if len(st) != 1 || st[0].Code != CodeAnswerDrift { + t.Errorf("drift: %+v", st) + } + + // No expected anywhere → skipped. + skipped := &ResolverPropagationData{ + Resolvers: map[string]*ResolverView{"a": mkResolver("a", "eu", false, true, map[string]*RRProbe{key: mkProbe("NOERROR", "1")})}, + RRsets: map[string]*RRsetView{key: {Name: "ex.", Type: "A"}}, // no Expected + } + st = runRule(t, &authoritativeMatchRule{}, skipped, nil) + if len(st) != 1 || st[0].Status != sdk.StatusUnknown { + t.Errorf("skipped: %+v", st) + } +} + +func TestNXDOMAINRule(t *testing.T) { + key := "ex./A" + // Some resolvers say NXDOMAIN, others NOERROR. + data := &ResolverPropagationData{ + Resolvers: map[string]*ResolverView{ + "a": mkResolver("a", "eu", false, true, map[string]*RRProbe{key: mkProbe("NOERROR", "1.1.1.1")}), + "nx": mkResolver("nx", "eu", false, true, map[string]*RRProbe{key: mkProbe("NXDOMAIN", "")}), + }, + RRsets: map[string]*RRsetView{key: {Name: "ex.", Type: "A"}}, + } + st := runRule(t, &nxdomainRule{}, data, nil) + if _, ok := statesByCode(st)[CodeUnexpectedNXDOMAIN]; !ok { + t.Errorf("want NXDOMAIN finding, got %+v", st) + } + + // All same NXDOMAIN ⇒ rule does NOT fire (it's an "unexpected" rule). 
+ for _, rv := range data.Resolvers { + rv.Probes[key] = mkProbe("NXDOMAIN", "") + } + st = runRule(t, &nxdomainRule{}, data, nil) + if _, ok := statesByCode(st)[CodeUnexpectedNXDOMAIN]; ok { + t.Errorf("uniform NXDOMAIN should not trigger, got %+v", st) + } +} + +func TestSERVFAILRule(t *testing.T) { + key := "ex./A" + data := &ResolverPropagationData{ + Resolvers: map[string]*ResolverView{ + "a": mkResolver("a", "eu", false, true, map[string]*RRProbe{key: mkProbe("NOERROR", "1.1.1.1")}), + "sf": mkResolver("sf", "eu", false, true, map[string]*RRProbe{key: mkProbe("SERVFAIL", "")}), + }, + RRsets: map[string]*RRsetView{key: {Name: "ex.", Type: "A"}}, + } + st := runRule(t, &servfailRule{}, data, nil) + if _, ok := statesByCode(st)[CodeUnexpectedSERVFAIL]; !ok { + t.Errorf("want SERVFAIL finding, got %+v", st) + } +} + +func TestRegionalSplitRule(t *testing.T) { + key := "ex./A" + // EU resolvers all see "9.9.9.9", global resolvers see "1.1.1.1" (consensus). + data := &ResolverPropagationData{ + Resolvers: map[string]*ResolverView{ + "g1": mkResolver("g1", "global", false, true, map[string]*RRProbe{key: mkProbe("NOERROR", "1.1.1.1")}), + "g2": mkResolver("g2", "global", false, true, map[string]*RRProbe{key: mkProbe("NOERROR", "1.1.1.1")}), + "g3": mkResolver("g3", "global", false, true, map[string]*RRProbe{key: mkProbe("NOERROR", "1.1.1.1")}), + "eu1": mkResolver("eu1", "eu", false, true, map[string]*RRProbe{key: mkProbe("NOERROR", "9.9.9.9")}), + "eu2": mkResolver("eu2", "eu", false, true, map[string]*RRProbe{key: mkProbe("NOERROR", "9.9.9.9")}), + }, + RRsets: map[string]*RRsetView{key: {Name: "ex.", Type: "A"}}, + } + st := runRule(t, ®ionalSplitRule{}, data, nil) + if _, ok := statesByCode(st)[CodeRegionalSplit]; !ok { + t.Errorf("want regional split, got %+v", st) + } +} + +func TestSerialDriftRule(t *testing.T) { + soaKey := rrsetKey("ex.", "SOA") + mk := func(serial string) *RRProbe { + return &RRProbe{Rcode: "NOERROR", Records: []string{"ns. hm. 
" + serial + " 1 2 3 4"}, Transport: TransportUDP} + } + data := &ResolverPropagationData{ + Zone: "ex.", + Resolvers: map[string]*ResolverView{ + "a": mkResolver("a", "eu", false, true, map[string]*RRProbe{soaKey: mk("100")}), + "b": mkResolver("b", "eu", false, true, map[string]*RRProbe{soaKey: mk("100")}), + "c": mkResolver("c", "eu", false, true, map[string]*RRProbe{soaKey: mk("99")}), + }, + RRsets: map[string]*RRsetView{soaKey: {Name: "ex.", Type: "SOA"}}, + } + st := runRule(t, &serialDriftRule{}, data, nil) + if len(st) != 1 || st[0].Code != CodeSerialDrift { + t.Errorf("want serial drift, got %+v", st) + } + + // All same → ok. + data.Resolvers["c"].Probes[soaKey] = mk("100") + st = runRule(t, &serialDriftRule{}, data, nil) + if len(st) != 1 || st[0].Status != sdk.StatusOK { + t.Errorf("want ok, got %+v", st) + } + + // SOA not probed → skipped. + delete(data.RRsets, soaKey) + st = runRule(t, &serialDriftRule{}, data, nil) + if len(st) != 1 || st[0].Status != sdk.StatusUnknown { + t.Errorf("want skipped, got %+v", st) + } +} + +func TestStaleCacheRule(t *testing.T) { + soaKey := rrsetKey("ex.", "SOA") + mk := func(serial string) *RRProbe { + return &RRProbe{Rcode: "NOERROR", Records: []string{"ns. hm. " + serial + " 1 2 3 4"}, Transport: TransportUDP} + } + + // No declared serial → skipped. + data := &ResolverPropagationData{Zone: "ex.", RRsets: map[string]*RRsetView{soaKey: {Type: "SOA"}}} + st := runRule(t, &staleCacheRule{}, data, nil) + if st[0].Status != sdk.StatusUnknown { + t.Errorf("no declared: %+v", st) + } + + // Below declared → info. 
+ data = &ResolverPropagationData{ + Zone: "ex.", + DeclaredSerial: 100, + Resolvers: map[string]*ResolverView{ + "old": mkResolver("old", "eu", false, true, map[string]*RRProbe{soaKey: mk("99")}), + "new": mkResolver("new", "eu", false, true, map[string]*RRProbe{soaKey: mk("100")}), + }, + RRsets: map[string]*RRsetView{soaKey: {Type: "SOA"}}, + } + st = runRule(t, &staleCacheRule{}, data, nil) + if len(st) != 1 || st[0].Code != CodeStaleCache { + t.Errorf("stale: %+v", st) + } + if !strings.Contains(st[0].Message, "old") { + t.Errorf("stale msg should name resolver: %q", st[0].Message) + } + + // All up-to-date. + data.Resolvers["old"].Probes[soaKey] = mk("100") + st = runRule(t, &staleCacheRule{}, data, nil) + if st[0].Status != sdk.StatusOK { + t.Errorf("ok: %+v", st) + } +} + +func TestDNSSECRule(t *testing.T) { + soaKey := rrsetKey("ex.", "SOA") + data := &ResolverPropagationData{ + Zone: "ex.", + Resolvers: map[string]*ResolverView{ + // validating + AD set → no finding + "cloudflare": mkResolver("cloudflare", "global", false, true, map[string]*RRProbe{soaKey: {Rcode: "NOERROR", AD: true}}), + // validating + SERVFAIL → DNSSEC failure + "google": mkResolver("google", "global", false, true, map[string]*RRProbe{soaKey: {Rcode: "SERVFAIL"}}), + // validating + NOERROR + AD=false → unvalidated info + "quad9": mkResolver("quad9", "global", false, true, map[string]*RRProbe{soaKey: {Rcode: "NOERROR", AD: false}}), + // non-validating: ignored + "opendns": mkResolver("opendns", "global", false, true, map[string]*RRProbe{soaKey: {Rcode: "SERVFAIL"}}), + }, + RRsets: map[string]*RRsetView{soaKey: {Type: "SOA"}}, + } + st := runRule(t, &dnssecRule{}, data, nil) + codes := statesByCode(st) + if _, ok := codes[CodeDNSSECFailure]; !ok { + t.Errorf("want DNSSEC failure, got %+v", st) + } + if _, ok := codes[CodeDNSSECUnvalidated]; !ok { + t.Errorf("want DNSSEC unvalidated, got %+v", st) + } +} + +func TestResolverLatencyRule(t *testing.T) { + key := "ex./A" + data := 
&ResolverPropagationData{ + Resolvers: map[string]*ResolverView{ + "slow": mkResolver("slow", "eu", false, true, map[string]*RRProbe{key: {Rcode: "NOERROR", LatencyMs: 1500, Transport: TransportUDP}}), + "fast": mkResolver("fast", "eu", false, true, map[string]*RRProbe{key: {Rcode: "NOERROR", LatencyMs: 30, Transport: TransportUDP}}), + "absent": mkResolver("absent", "eu", false, false, map[string]*RRProbe{key: {Error: "timeout", Transport: TransportUDP}}), + }, + } + st := runRule(t, &resolverLatencyRule{}, data, sdk.CheckerOptions{"latencyThresholdMs": 500}) + codes := statesByCode(st) + if _, ok := codes[CodeResolverHighLatency]; !ok { + t.Errorf("want high latency for 'slow', got %+v", st) + } + if _, ok := codes[CodeResolverUnreachable]; !ok { + t.Errorf("want unreachable for 'absent', got %+v", st) + } +} + +func TestFilteredHitRule(t *testing.T) { + key := "ex./A" + data := &ResolverPropagationData{ + Resolvers: map[string]*ResolverView{ + "clean1": mkResolver("clean1", "eu", false, true, map[string]*RRProbe{key: mkProbe("NOERROR", "1.1.1.1")}), + "clean2": mkResolver("clean2", "eu", false, true, map[string]*RRProbe{key: mkProbe("NOERROR", "1.1.1.1")}), + "filt": mkResolver("filt", "eu", true, true, map[string]*RRProbe{key: mkProbe("NOERROR", "0.0.0.0")}), + }, + RRsets: map[string]*RRsetView{key: {Name: "ex.", Type: "A"}}, + } + st := runRule(t, &filteredHitRule{}, data, nil) + if _, ok := statesByCode(st)[CodeResolverFilteredHit]; !ok { + t.Errorf("want filtered hit, got %+v", st) + } +} diff --git a/checker/testhelpers_test.go b/checker/testhelpers_test.go new file mode 100644 index 0000000..a58162f --- /dev/null +++ b/checker/testhelpers_test.go @@ -0,0 +1,65 @@ +package checker + +import ( + "context" + "encoding/json" + "fmt" + + sdk "git.happydns.org/checker-sdk-go/checker" +) + +// fakeObs is a tiny ObservationGetter that returns a single payload regardless +// of the key requested. Tests use it to feed canned ResolverPropagationData +// into rules. 
+type fakeObs struct { + payload any + err error +} + +func newFakeObs(payload any) *fakeObs { return &fakeObs{payload: payload} } + +func (f *fakeObs) Get(_ context.Context, _ sdk.ObservationKey, dest any) error { + if f.err != nil { + return f.err + } + raw, err := json.Marshal(f.payload) + if err != nil { + return fmt.Errorf("fakeObs marshal: %w", err) + } + return json.Unmarshal(raw, dest) +} + +func (f *fakeObs) GetRelated(_ context.Context, _ sdk.ObservationKey) ([]sdk.RelatedObservation, error) { + return nil, nil +} + +// errObs always fails Get; used to verify error-path branches in rules. +type errObs struct{ err error } + +func (e *errObs) Get(_ context.Context, _ sdk.ObservationKey, _ any) error { return e.err } +func (e *errObs) GetRelated(_ context.Context, _ sdk.ObservationKey) ([]sdk.RelatedObservation, error) { + return nil, nil +} + +// mkProbe is a small constructor used in many tests. +func mkProbe(rcode, sig string, records ...string) *RRProbe { + return &RRProbe{Rcode: rcode, Signature: sig, Records: records, Transport: TransportUDP} +} + +// mkResolver builds a ResolverView with a single probe. +func mkResolver(id, region string, filtered, reachable bool, probes map[string]*RRProbe) *ResolverView { + return &ResolverView{ + ID: id, Name: id, IP: "0.0.0.0", Region: region, + Filtered: filtered, Reachable: reachable, Transport: TransportUDP, + Probes: probes, + } +} + +// statesByCode reorganises a slice of CheckState by Code for easy lookup. 
func statesByCode(states []sdk.CheckState) map[string][]sdk.CheckState {
	out := map[string][]sdk.CheckState{}
	// append preserves the original ordering of states sharing a code.
	for _, s := range states {
		out[s.Code] = append(out[s.Code], s)
	}
	return out
}
diff --git a/checker/types.go b/checker/types.go
new file mode 100644
index 0000000..d16dc5b
--- /dev/null
+++ b/checker/types.go
@@ -0,0 +1,215 @@
package checker

import (
	"encoding/json"

	"github.com/miekg/dns"
)

// ObservationKeyResolverPropagation is the observation key used to store data
// produced by this checker.
const ObservationKeyResolverPropagation = "resolver_propagation"

// Severity classifies a finding.
type Severity string

const (
	SeverityInfo Severity = "info"
	SeverityWarn Severity = "warn"
	SeverityCrit Severity = "crit"
)

// Finding codes: stable machine-readable identifiers surfaced in the UI.
// These strings are part of the external contract; never renumber or rename.
const (
	// Zone-wide.
	CodeNoResolvers        = "rprop_no_resolvers"
	CodeAllResolversDown   = "rprop_all_resolvers_down"
	CodeSerialDrift        = "rprop_serial_drift"
	CodeStaleCache         = "rprop_stale_cache"
	CodeDNSSECFailure      = "rprop_dnssec_failure"
	CodeDNSSECUnvalidated  = "rprop_dnssec_not_validated"
	CodeRegionalSplit      = "rprop_regional_split"
	CodePartialPropagation = "rprop_partial_propagation"
	CodeAnswerDrift        = "rprop_answer_drift"
	CodeUnexpectedNXDOMAIN = "rprop_unexpected_nxdomain"
	CodeUnexpectedSERVFAIL = "rprop_unexpected_servfail"

	// Per-resolver.
	CodeResolverUnreachable = "rprop_resolver_unreachable"
	CodeResolverTimeout     = "rprop_resolver_timeout"
	CodeResolverRewrote     = "rprop_resolver_rewrote_answer"
	CodeResolverFilteredHit = "rprop_resolver_filtered_hit"
	CodeResolverHighLatency = "rprop_resolver_high_latency"
)

// Transport identifies the protocol used to reach a resolver.
type Transport string

const (
	TransportUDP Transport = "udp"
	TransportTCP Transport = "tcp"
	TransportDoT Transport = "dot"
	TransportDoH Transport = "doh"
)

// Finding is a single observation produced during collection.
type Finding struct {
	Code     string   `json:"code"`
	Severity Severity `json:"severity"`
	Message  string   `json:"message"`

	// Resolver is the resolver ID when the finding is scoped to one.
	Resolver string `json:"resolver,omitempty"`

	// RRset is "name/TYPE" when the finding is scoped to one RR set.
	RRset string `json:"rrset,omitempty"`

	// Remedy is a short, user-facing sentence describing what to do.
	Remedy string `json:"remedy,omitempty"`
}

// RRProbe is the observation for a single (resolver, RRset) pair.
type RRProbe struct {
	// Rcode is the response rcode in text form (NOERROR / NXDOMAIN /
	// SERVFAIL / REFUSED / …). Empty when the probe failed before a
	// response was parsed.
	Rcode string `json:"rcode,omitempty"`

	// Signature is the sorted, TTL-stripped RDATA joined with "|". Two
	// resolvers agree on an answer iff their signatures are equal.
	Signature string `json:"signature,omitempty"`

	// Records is the list of record RDATA strings as returned by the
	// resolver, sorted.
	Records []string `json:"records,omitempty"`

	// MinTTL is the smallest TTL across the RRset. Useful to spot stale
	// caches: cached TTLs count down, so a value near the zone's original
	// TTL means a recent refresh, while a value near 0 means the entry is
	// about to expire.
	MinTTL uint32 `json:"min_ttl,omitempty"`

	// AD indicates the resolver set the AD bit on the response (DNSSEC
	// validated). Only meaningful on AD-capable resolvers.
	AD bool `json:"ad,omitempty"`

	// LatencyMs is the observed round-trip time in milliseconds.
	LatencyMs int64 `json:"latency_ms,omitempty"`

	// Transport is the protocol used for this probe.
	Transport Transport `json:"transport,omitempty"`

	// Error describes a transport/protocol failure. Set means the probe
	// did not complete and Rcode/Signature are empty.
	Error string `json:"error,omitempty"`
}

// ResolverView aggregates every probe performed against a single resolver.
type ResolverView struct {
	ID        string    `json:"id"`
	Name      string    `json:"name"`
	IP        string    `json:"ip"`
	Region    string    `json:"region"`
	Filtered  bool      `json:"filtered,omitempty"`
	Transport Transport `json:"transport"`

	// Reachable is true when at least one probe against this resolver
	// produced a valid response (any rcode, including NXDOMAIN).
	Reachable bool `json:"reachable"`

	// Probes is one RRProbe per "name/TYPE" string.
	Probes map[string]*RRProbe `json:"probes,omitempty"`
}

// RRsetView is the cross-resolver picture of a single (name, type): which
// signatures were seen, which resolvers returned each signature, and which
// one we pick as "consensus". The consensus is the most-returned signature
// from unfiltered, reachable resolvers.
type RRsetView struct {
	Name string `json:"name"`
	Type string `json:"type"`

	// Expected is the signature computed from the user's declared zone. Used
	// to distinguish "resolvers disagree with each other" from "resolvers
	// agree but are wrong".
	Expected        string   `json:"expected,omitempty"`
	ExpectedRecords []string `json:"expected_records,omitempty"`

	// Groups buckets resolvers by signature.
	Groups []SignatureGroup `json:"groups,omitempty"`

	// ConsensusSig is the signature returned by the majority of unfiltered
	// reachable resolvers.
	ConsensusSig string `json:"consensus_sig,omitempty"`

	// Agreeing / Dissenting are resolver IDs relative to the consensus.
	Agreeing   []string `json:"agreeing,omitempty"`
	Dissenting []string `json:"dissenting,omitempty"`

	// MatchesExpected is true when the consensus matches the expected
	// signature. When Expected is empty we skip this check.
	MatchesExpected bool `json:"matches_expected"`
}

// SignatureGroup is one bucket in RRsetView: a signature + its records + the
// resolvers that returned it.
type SignatureGroup struct {
	Signature string   `json:"signature"`
	Records   []string `json:"records,omitempty"`
	Resolvers []string `json:"resolvers,omitempty"`
	Rcode     string   `json:"rcode,omitempty"`
}

// ResolverPropagationData is the top-level observation payload.
type ResolverPropagationData struct {
	Zone string `json:"zone"`

	// Names lists the owner names probed: apex + user-provided subdomains.
	Names []string `json:"names"`

	// Types lists the RR types probed (text: "A", "AAAA", "MX", …).
	Types []string `json:"types"`

	// Resolvers is the per-resolver view, keyed by resolver ID.
	Resolvers map[string]*ResolverView `json:"resolvers,omitempty"`

	// RRsets is the per-RRset cross-resolver view, keyed by "name/TYPE".
	RRsets map[string]*RRsetView `json:"rrsets,omitempty"`

	// DeclaredSerial is the SOA serial saved by happyDomain (when available).
	DeclaredSerial uint32 `json:"declared_serial,omitempty"`

	// RunDurationMs is the wall-clock duration of the probe round.
	RunDurationMs int64 `json:"run_duration_ms,omitempty"`

	// Stats summarizes the run.
	Stats Stats `json:"stats"`
}

// Stats is a rollup of resolver health, useful for the dashboard.
type Stats struct {
	TotalResolvers     int `json:"total_resolvers"`
	ReachableResolvers int `json:"reachable_resolvers"`
	UnfilteredProbed   int `json:"unfiltered_probed"`
	FilteredProbed     int `json:"filtered_probed"`
	CountriesCovered   int `json:"countries_covered"`
	UnfilteredAgreeing int `json:"unfiltered_agreeing"`
}

// originService mirrors happyDomain's abstract.Origin payload (same shape as
// checker-propagation). We only need the NS list + SOA to detect "this zone
// is supposed to exist".
type originService struct {
	SOA         *dns.SOA  `json:"soa,omitempty"`
	NameServers []*dns.NS `json:"ns"`
}

// serviceMessage mirrors happyDomain's ServiceMessage envelope.
type serviceMessage struct {
	Type    string          `json:"_svctype"`
	Domain  string          `json:"_domain"`
	Service json.RawMessage `json:"Service"`
}

// rrsetKey builds the "name/TYPE" identifier used to index RRsets.
// dns.Fqdn only appends a trailing dot when missing; it does not change case.
func rrsetKey(name, typ string) string {
	return dns.Fqdn(name) + "/" + typ
}
diff --git a/checker/types_test.go b/checker/types_test.go
new file mode 100644
index 0000000..23a8453
--- /dev/null
+++ b/checker/types_test.go
@@ -0,0 +1,20 @@
package checker

import "testing"

// TestRRsetKey pins the "name/TYPE" key format, including FQDN normalisation.
func TestRRsetKey(t *testing.T) {
	cases := []struct {
		name, typ, want string
	}{
		{"example.com", "A", "example.com./A"},
		{"example.com.", "A", "example.com./A"},
		{"WWW.Example.Com", "AAAA", "WWW.Example.Com./AAAA"}, // case is preserved (Fqdn doesn't downcase)
		{".", "SOA", "./SOA"},
		{"sub.example.com", "MX", "sub.example.com./MX"},
	}
	for _, c := range cases {
		if got := rrsetKey(c.name, c.typ); got != c.want {
			t.Errorf("rrsetKey(%q,%q) = %q, want %q", c.name, c.typ, got, c.want)
		}
	}
}
diff --git a/go.mod b/go.mod
new file mode 100644
index 0000000..d409a7c
--- /dev/null
+++ b/go.mod
@@ -0,0 +1,16 @@
module git.happydns.org/checker-resolver-propagation

go 1.25.0

require (
	git.happydns.org/checker-sdk-go v1.5.0
	github.com/miekg/dns v1.1.72
)

require (
	golang.org/x/mod v0.31.0 // indirect
	golang.org/x/net v0.48.0 // indirect
	golang.org/x/sync v0.19.0 // indirect
	golang.org/x/sys v0.39.0 // indirect
	golang.org/x/tools v0.40.0 // indirect
)
diff --git a/go.sum b/go.sum
new file mode 100644
index 0000000..2a80023
--- /dev/null
+++ b/go.sum
@@ -0,0 +1,16 @@
git.happydns.org/checker-sdk-go v1.5.0 h1:5uD5Cm6xJ+lwnhbJ09iCXGHbYS9zRh+Yh0NeBHkAPBY=
git.happydns.org/checker-sdk-go v1.5.0/go.mod h1:aNAcfYFfbhvH9kJhE0Njp5GX0dQbxdRB0rJ0KvSC5nI=
github.com/google/go-cmp
v0.6.0 h1:ofyhxvXcZhMsU5ulbFiLKl/XBFqE1GSq7atu8tAmTRI= +github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= +github.com/miekg/dns v1.1.72 h1:vhmr+TF2A3tuoGNkLDFK9zi36F2LS+hKTRW0Uf8kbzI= +github.com/miekg/dns v1.1.72/go.mod h1:+EuEPhdHOsfk6Wk5TT2CzssZdqkmFhf8r+aVyDEToIs= +golang.org/x/mod v0.31.0 h1:HaW9xtz0+kOcWKwli0ZXy79Ix+UW/vOfmWI5QVd2tgI= +golang.org/x/mod v0.31.0/go.mod h1:43JraMp9cGx1Rx3AqioxrbrhNsLl2l/iNAvuBkrezpg= +golang.org/x/net v0.48.0 h1:zyQRTTrjc33Lhh0fBgT/H3oZq9WuvRR5gPC70xpDiQU= +golang.org/x/net v0.48.0/go.mod h1:+ndRgGjkh8FGtu1w1FGbEC31if4VrNVMuKTgcAAnQRY= +golang.org/x/sync v0.19.0 h1:vV+1eWNmZ5geRlYjzm2adRgW2/mcpevXNg50YZtPCE4= +golang.org/x/sync v0.19.0/go.mod h1:9KTHXmSnoGruLpwFjVSX0lNNA75CykiMECbovNTZqGI= +golang.org/x/sys v0.39.0 h1:CvCKL8MeisomCi6qNZ+wbb0DN9E5AATixKsvNtMoMFk= +golang.org/x/sys v0.39.0/go.mod h1:OgkHotnGiDImocRcuBABYBEXf8A9a87e/uXjp9XT3ks= +golang.org/x/tools v0.40.0 h1:yLkxfA+Qnul4cs9QA3KnlFu0lVmd8JJfoq+E41uSutA= +golang.org/x/tools v0.40.0/go.mod h1:Ik/tzLRlbscWpqqMRjyWYDisX8bG13FrdXp3o4Sr9lc= diff --git a/main.go b/main.go new file mode 100644 index 0000000..1b68c5d --- /dev/null +++ b/main.go @@ -0,0 +1,28 @@ +package main + +import ( + "flag" + "log" + + resolverpropagation "git.happydns.org/checker-resolver-propagation/checker" + "git.happydns.org/checker-sdk-go/checker/server" +) + +var listenAddr = flag.String("listen", ":8080", "HTTP listen address") + +// Version is the standalone binary's version. It defaults to "custom-build" +// and is meant to be overridden by the CI at link time: +// +// go build -ldflags "-X main.Version=1.2.3" . 
+var Version = "custom-build" + +func main() { + flag.Parse() + + resolverpropagation.Version = Version + + srv := server.New(resolverpropagation.Provider()) + if err := srv.ListenAndServe(*listenAddr); err != nil { + log.Fatalf("server error: %v", err) + } +} diff --git a/plugin/plugin.go b/plugin/plugin.go new file mode 100644 index 0000000..8cef7f3 --- /dev/null +++ b/plugin/plugin.go @@ -0,0 +1,18 @@ +// Command plugin is the happyDomain plugin entrypoint for the +// resolver-propagation checker. +package main + +import ( + resolverpropagation "git.happydns.org/checker-resolver-propagation/checker" + sdk "git.happydns.org/checker-sdk-go/checker" +) + +// Version is the plugin's version. +var Version = "custom-build" + +// NewCheckerPlugin is the symbol resolved by happyDomain when loading the .so. +func NewCheckerPlugin() (*sdk.CheckerDefinition, sdk.ObservationProvider, error) { + resolverpropagation.Version = Version + prvd := resolverpropagation.Provider() + return prvd.(sdk.CheckerDefinitionProvider).Definition(), prvd, nil +}