checker-dane/checker/report.go

package checker

import (
	"bytes"
	"encoding/json"
	"fmt"
	"html"
	"sort"
	"strings"

	sdk "git.happydns.org/checker-sdk-go/checker"
	tls "git.happydns.org/checker-tls/checker"
)

// GetHTMLReport implements sdk.CheckerHTMLReporter. The report opens with a
// diagnosis-first section that lists the DANE failure modes actually detected
// on the user's targets, each with a one-shot remediation snippet when one is
// available; a per-target table follows for reference.
//
// It decodes the DANE observation from ctx.Data(), indexes every related TLS
// probe by its map key (targets look theirs up through their Ref), and
// returns a complete, self-contained HTML document. The only error returned
// is a failure to decode the DANE payload.
func (p *daneProvider) GetHTMLReport(ctx sdk.ReportContext) (string, error) {
	var data DANEData
	if err := json.Unmarshal(ctx.Data(), &data); err != nil {
		return "", fmt.Errorf("decode DANE data: %w", err)
	}

	probes := map[string]*tls.TLSProbe{}
	for _, ro := range ctx.Related(tls.ObservationKeyTLSProbes) {
		for k, v := range parseTLSProbeMap(ro.Data) {
			// Store the address of a per-iteration copy. With pre-1.22 loop
			// semantics the range variable is shared across iterations, so
			// taking &v directly would make every entry point at the last
			// probe seen.
			probe := v
			probes[k] = &probe
		}
	}

	var b bytes.Buffer
	fmt.Fprint(&b, `<!DOCTYPE html><html><head><meta charset="utf-8"><title>DANE report</title>`)
	fmt.Fprint(&b, reportCSS)
	fmt.Fprint(&b, `</head><body><main>`)
	fmt.Fprintf(&b, `<h1>DANE / TLSA</h1><p class="meta">Collected %s · %d endpoint(s).</p>`,
		html.EscapeString(data.CollectedAt.Format("2006-01-02 15:04 MST")), len(data.Targets))

	// Diagnosis cards come first; the section is omitted entirely when there
	// is nothing to report.
	diag := diagnose(data, probes)
	if len(diag) > 0 {
		fmt.Fprint(&b, `<section class="diagnosis"><h2>Action required</h2>`)
		for _, d := range diag {
			fmt.Fprintf(&b,
				`<article class="finding sev-%s"><h3>%s</h3><p>%s</p>`,
				html.EscapeString(d.Severity),
				html.EscapeString(d.Title),
				html.EscapeString(d.Detail))
			if d.Fix != "" {
				fmt.Fprintf(&b, `<pre class="fix">%s</pre>`, html.EscapeString(d.Fix))
			}
			fmt.Fprint(&b, `</article>`)
		}
		fmt.Fprint(&b, `</section>`)
	}

	// Reference table: one row per target, in the order they were collected.
	fmt.Fprint(&b, `<section class="targets"><h2>Endpoints</h2><table><thead><tr><th>Endpoint</th><th>Status</th><th>Records</th><th>Observed leaf</th></tr></thead><tbody>`)
	for _, t := range data.Targets {
		probe := probes[t.Ref]
		status, cls := targetStatus(t, probe)
		leaf := "—"
		if probe != nil && len(probe.Chain) > 0 {
			leaf = probe.Chain[0].Subject
		} else if probe != nil && probe.Error != "" {
			leaf = "handshake error"
		}
		fmt.Fprintf(&b,
			`<tr class="status-%s"><td><code>%s</code><br><small>%s → %s:%d%s</small></td><td>%s</td><td>%d</td><td>%s</td></tr>`,
			html.EscapeString(cls),
			html.EscapeString(t.Owner),
			html.EscapeString(t.Proto),
			html.EscapeString(t.Host),
			t.Port,
			// starttlsLabel escapes its argument itself, so it is not
			// wrapped in html.EscapeString here.
			starttlsLabel(t.STARTTLS),
			html.EscapeString(status),
			len(t.Records),
			html.EscapeString(leaf),
		)
	}
	fmt.Fprint(&b, `</tbody></table></section>`)

	fmt.Fprint(&b, `</main></body></html>`)
	return b.String(), nil
}

// diagnosis is a single actionable hint surfaced at the top of the report.
// GetHTMLReport renders each one as a card: Severity picks the CSS class
// ("sev-" + Severity), Title becomes the heading, Detail the paragraph, and
// Fix, when non-empty, a preformatted snippet.
type diagnosis struct {
	Severity string // crit | warn | info
	Title    string
	Detail   string
	Fix      string // ready-to-apply snippet (shell or zone fragment)
}

// countMatched returns the number of TLSA records in t that match probe's
// chain, as decided by matchRecord. A nil probe matches nothing.
func countMatched(t TargetResult, p *tls.TLSProbe) int {
	matched := 0
	if p == nil {
		return matched
	}
	for i := range t.Records {
		ok, _ := matchRecord(t.Records[i], p)
		if ok {
			matched++
		}
	}
	return matched
}

// diagnose scans every target and produces the minimum set of high-signal
// cards users need to act on. The failure modes it covers are:
//
//  1. no_match: TLSA records do not cover the live cert (post-rotation miss).
//  2. handshake_failed: endpoint unreachable or TLS broken, so DANE can't be
//     validated at all.
//  3. pkix_chain_invalid: usage 0/1 published but the public chain is not
//     reported as valid.
//  4. usage_3_matches_issuer: an end-entity record (usage 1 or 3) matches a
//     non-leaf certificate; the record is probably miscategorized (a
//     trust-anchor usage was intended).
//  5. no_probe_yet: quiet informational card to avoid false alarms on the
//     first run.
//
// A target without a probe, or whose probe failed, yields exactly one card;
// a target with a usable chain may yield any combination of cards 1, 3 and 4.
// The result is ordered by severity (crit, warn, info) and is nil when
// nothing was found.
func diagnose(data DANEData, probes map[string]*tls.TLSProbe) []diagnosis {
	var cards []diagnosis

	for _, t := range data.Targets {
		probe := probes[t.Ref]

		// No probe at all: nothing can be said yet.
		if probe == nil {
			cards = append(cards, diagnosis{
				Severity: SeverityInfo,
				Title:    fmt.Sprintf("Waiting for first TLS probe on %s:%d", t.Host, t.Port),
				Detail:   "checker-tls has not yet probed this endpoint. This is normal immediately after publishing a new TLSA record; status will clear on the next cycle.",
			})
			continue
		}

		// Probe ran but produced no usable chain: the remaining checks
		// would be meaningless.
		if probe.Error != "" || len(probe.Chain) == 0 {
			cards = append(cards, diagnosis{
				Severity: SeverityCrit,
				Title:    fmt.Sprintf("Cannot reach %s:%d to validate DANE", t.Host, t.Port),
				Detail:   "TLS handshake failed — DANE publishes hashes for a certificate nobody can see. Either the service is down, the port is blocked, or STARTTLS negotiation is broken.",
				Fix:      handshakeFix(t),
			})
			continue
		}

		if len(t.Records) > 0 && countMatched(t, probe) == 0 {
			cards = append(cards, diagnosis{
				Severity: SeverityCrit,
				Title:    fmt.Sprintf("No TLSA record matches the live certificate on %s:%d", t.Host, t.Port),
				Detail:   "This is the most common DANE outage cause: the certificate was rotated without rolling over the TLSA RRset, and validating resolvers are now rejecting the connection. Publish a TLSA record for the new certificate before removing the old one.",
				Fix:      proposedTLSA(t, probe),
			})
		}

		// An unset ChainValid is treated the same as an invalid chain.
		chainOK := probe.ChainValid != nil && *probe.ChainValid
		if hasPKIXUsage(t) && !chainOK {
			cards = append(cards, diagnosis{
				Severity: SeverityCrit,
				Title:    fmt.Sprintf("Usage 0/1 needs a publicly-trusted chain on %s:%d", t.Host, t.Port),
				Detail:   "TLSA usages 0 (PKIX-TA) and 1 (PKIX-EE) require the certificate chain to validate against system roots. Either re-issue through a publicly-trusted CA or switch to usage 2 / 3, which skip PKIX.",
			})
		}

		if hint := suspiciousUsage(t, probe); hint != "" {
			cards = append(cards, diagnosis{
				Severity: SeverityWarn,
				Title:    fmt.Sprintf("Suspicious TLSA usage on %s:%d", t.Host, t.Port),
				Detail:   hint,
			})
		}
	}

	// Stable sort: crit first, then warn, then info. Encounter order is
	// preserved within each group, which keeps the cards aligned with the
	// endpoint table.
	sort.SliceStable(cards, func(a, b int) bool {
		return sevRank(cards[a].Severity) < sevRank(cards[b].Severity)
	})
	return cards
}

// sevRank maps a severity string to its sort position: SeverityCrit sorts
// first (0), SeverityWarn second (1), and every other value last (2).
func sevRank(s string) int {
	if s == SeverityCrit {
		return 0
	}
	if s == SeverityWarn {
		return 1
	}
	return 2
}

// hasPKIXUsage reports whether at least one TLSA record of the target
// declares a PKIX-constrained usage, i.e. UsagePKIXTA or UsagePKIXEE
// (usage 0 or 1).
func hasPKIXUsage(t TargetResult) bool {
	for i := range t.Records {
		usage := t.Records[i].Usage
		if usage == UsagePKIXTA || usage == UsagePKIXEE {
			return true
		}
	}
	return false
}

// suspiciousUsage returns a human-readable hint when a record hash matches a
// chain slot that contradicts its declared usage (e.g. usage 3 whose hash
// actually matches the intermediate), almost always a publisher error.
//
// Only end-entity records (UsageDANEEE and UsagePKIXEE) are examined, and
// they are compared against every certificate after the leaf. The empty
// string is returned when nothing looks wrong, when the probe is nil, or
// when the chain has fewer than two certificates.
func suspiciousUsage(t TargetResult, p *tls.TLSProbe) string {
	// Guard against a nil probe like the other probe-consuming helpers in
	// this file do, instead of panicking on p.Chain.
	if p == nil || len(p.Chain) < 2 {
		return ""
	}
	for _, r := range t.Records {
		if r.Usage != UsageDANEEE && r.Usage != UsagePKIXEE {
			continue
		}
		// Compare against non-leaf certs; any match there means the user
		// published the wrong usage.
		for _, c := range p.Chain[1:] {
			cand, err := recordCandidate(r, c)
			if err != nil {
				// No candidate could be derived for this record/cert pair;
				// it simply cannot match, so move on.
				continue
			}
			if strings.EqualFold(cand, r.Certificate) {
				return "A record declared with usage 1/3 (end-entity) actually matches an intermediate certificate. It should probably use usage 0 or 2 (trust-anchor) instead."
			}
		}
	}
	return ""
}

// proposedTLSA builds a ready-to-paste TLSA resource record for the target's
// owner name, using the "3 1 1" triplet (DANE-EE, SPKI selector, SHA-256)
// and the SPKI hash of the first certificate in the probed chain. Pinning
// the public key rather than the certificate keeps the record valid across
// renewals that reuse the same key pair. It returns the empty string when
// there is no probe or the probe has no chain.
func proposedTLSA(t TargetResult, p *tls.TLSProbe) string {
	if p != nil && len(p.Chain) > 0 {
		spki := p.Chain[0].SPKISHA256
		return fmt.Sprintf("%s IN TLSA 3 1 1 %s", t.Owner, spki)
	}
	return ""
}

// handshakeFix returns an openssl s_client command line the user can run to
// reproduce a failed probe by hand. When the target declares a STARTTLS
// protocol, the matching -starttls option is included.
func handshakeFix(t TargetResult) string {
	if t.STARTTLS == "" {
		return fmt.Sprintf("openssl s_client -connect %s:%d -servername %s", t.Host, t.Port, t.Host)
	}
	return fmt.Sprintf("openssl s_client -connect %s:%d -starttls %s -servername %s", t.Host, t.Port, t.STARTTLS, t.Host)
}

// targetStatus summarizes one target for the endpoint table. It returns the
// human-readable label and the CSS class suffix used for the table row:
//
//   - no probe:                    "Waiting for probe", "unknown"
//   - probe error or empty chain:  "Handshake failed",  "crit"
//   - target has no TLSA records:  "No records",        "info"
//   - no record matches the chain: "No match",          "crit"
//   - otherwise:                   "<matched>/<total> match", "ok"
func targetStatus(t TargetResult, p *tls.TLSProbe) (label, class string) {
	switch {
	case p == nil:
		return "Waiting for probe", "unknown"
	case p.Error != "" || len(p.Chain) == 0:
		return "Handshake failed", "crit"
	case len(t.Records) == 0:
		return "No records", "info"
	}

	hits := countMatched(t, p)
	if hits == 0 {
		return "No match", "crit"
	}
	return fmt.Sprintf("%d/%d match", hits, len(t.Records)), "ok"
}

// starttlsLabel returns the HTML fragment appended to an endpoint line when
// the target uses STARTTLS: " · STARTTLS " followed by the HTML-escaped
// protocol name. It returns the empty string when s is empty. The result is
// already escaped and must not be escaped again by the caller.
func starttlsLabel(s string) string {
	if len(s) > 0 {
		return " · STARTTLS " + html.EscapeString(s)
	}
	return ""
}

// reportCSS is the inline <style> element that GetHTMLReport writes into the
// <head> of the report, keeping the document self-contained. It colors the
// finding cards by severity (.sev-crit, .sev-warn, .sev-info) and the status
// column of the endpoint table by row class (status-crit, status-ok,
// status-unknown).
const reportCSS = `<style>
body{font-family:system-ui,sans-serif;margin:0;background:#fafbfc;color:#1b1f23;}
main{max-width:980px;margin:0 auto;padding:1.5rem;}
h1{margin:0 0 .25rem 0;}
.meta{color:#586069;margin:0 0 1.5rem 0;}
section{margin-bottom:2rem;}
h2{border-bottom:1px solid #e1e4e8;padding-bottom:.25rem;}
.finding{border-left:4px solid;padding:.75rem 1rem;margin:.75rem 0;background:#fff;border-radius:4px;}
.finding h3{margin:0 0 .25rem 0;font-size:1rem;}
.finding.sev-crit{border-color:#d73a49;}
.finding.sev-warn{border-color:#dbab09;}
.finding.sev-info{border-color:#0366d6;}
.fix{background:#1b1f23;color:#fafbfc;padding:.5rem .75rem;border-radius:4px;overflow-x:auto;font-size:.85rem;}
table{width:100%;border-collapse:collapse;background:#fff;}
th,td{padding:.5rem .75rem;border-bottom:1px solid #e1e4e8;text-align:left;vertical-align:top;}
tr.status-crit td:nth-child(2){color:#d73a49;font-weight:600;}
tr.status-ok td:nth-child(2){color:#22863a;font-weight:600;}
tr.status-unknown td:nth-child(2){color:#586069;}
code{font-size:.85rem;}
small{color:#586069;}
</style>`