checker-caa/checker/ccadb.go

package checker

import (
	"bytes"
	_ "embed"
	"encoding/csv"
	"fmt"
	"io"
	"sort"
	"strings"
	"sync"
)

// ccadbCSV holds the raw bytes of AllCAAIdentifiersReport.csv, compiled
// into the binary by the go:embed directive below. The go:generate
// directive re-downloads that file from CCADB with wget.
//
//go:generate wget -O AllCAAIdentifiersReport.csv https://ccadb.my.salesforce-sites.com/ccadb/AllCAAIdentifiersReportCSVV2
//go:embed AllCAAIdentifiersReport.csv
var ccadbCSV []byte

// ccadbIndex is the in-memory representation of AllCAAIdentifiersReport.csv.
// Two indexes are maintained because CCADB rows sometimes have an empty
// Subject Key Identifier column (very rare; a handful of legacy entries)
// and we still want to resolve those via Subject DN.
//
// Both maps hold the lower-cased CAA identifier domains of the matching
// rows as their values.
type ccadbIndex struct {
	bySKI map[string][]string // keyed by upper-cased, whitespace-trimmed hex SKI
	byDN  map[string][]string // keyed by the normalizeDN form of the Subject
}

// Lazily initialised cache of the parsed embedded report; these are
// written once inside loadCCADB and only read afterwards.
var (
	ccadbOnce sync.Once   // guards the one-time parse of ccadbCSV
	ccadb     *ccadbIndex // parsed index; nil before the first load or when parsing failed
	ccadbErr  error       // error produced by the one-time parse, if any
)

// loadCCADB returns the lookup index built from the embedded CCADB
// report. The CSV is parsed on the first call only; every later call
// hands back the cached index and error unchanged. Because the CSV is
// shipped inside the binary, a parse error points at a bug or a
// corrupted build rather than a runtime condition; tests assert that the
// checked-in file parses.
func loadCCADB() (*ccadbIndex, error) {
	parseEmbedded := func() {
		src := bytes.NewReader(ccadbCSV)
		ccadb, ccadbErr = parseCCADB(src)
	}
	ccadbOnce.Do(parseEmbedded)
	return ccadb, ccadbErr
}

// parseCCADB builds the SKI and DN lookup indexes from a CCADB "All CAA
// Identifiers" CSV report read from r. It is exposed for testing with
// alternate CSV inputs.
//
// The first record must be a header that contains the "Subject",
// "Subject Key Identifier (Hex)" and "Recognized CAA Domains" columns, in
// any position; otherwise an error is returned. Rows that list no CAA
// domain are skipped. Rows may carry a varying number of fields: a cell
// that is absent from a short row is treated as empty instead of causing
// an index-out-of-range panic.
//
// On any header or read error the returned index is nil.
func parseCCADB(r io.Reader) (*ccadbIndex, error) {
	reader := csv.NewReader(r)
	reader.FieldsPerRecord = -1 // some rows carry a trailing empty field

	header, err := reader.Read()
	if err != nil {
		return nil, fmt.Errorf("read header: %w", err)
	}

	idxSubject, idxSKI, idxDomains := -1, -1, -1
	for i, h := range header {
		if i == 0 {
			// encoding/csv does not strip a UTF-8 byte-order mark, which
			// would otherwise stop the first column name from matching.
			h = strings.TrimPrefix(h, "\ufeff")
		}
		switch strings.TrimSpace(h) {
		case "Subject":
			idxSubject = i
		case "Subject Key Identifier (Hex)":
			idxSKI = i
		case "Recognized CAA Domains":
			idxDomains = i
		}
	}
	if idxSubject < 0 || idxSKI < 0 || idxDomains < 0 {
		return nil, fmt.Errorf("unexpected CCADB header: %v", header)
	}

	// cell returns column i of row, or "" when the row is too short to
	// have that column. Needed because FieldsPerRecord = -1 disables the
	// reader's own field-count validation.
	cell := func(row []string, i int) string {
		if i < len(row) {
			return row[i]
		}
		return ""
	}

	idx := &ccadbIndex{
		bySKI: map[string][]string{},
		byDN:  map[string][]string{},
	}
	for {
		row, err := reader.Read()
		if err == io.EOF {
			break
		}
		if err != nil {
			return nil, fmt.Errorf("read row: %w", err)
		}

		domains := splitCAADomains(cell(row, idxDomains))
		if len(domains) == 0 {
			continue
		}

		// SKIs are keyed upper-case so lookups are case-insensitive.
		if ski := strings.ToUpper(strings.TrimSpace(cell(row, idxSKI))); ski != "" {
			idx.bySKI[ski] = mergeDomains(idx.bySKI[ski], domains)
		}
		if dn := normalizeDN(cell(row, idxSubject)); dn != "" {
			idx.byDN[dn] = mergeDomains(idx.byDN[dn], domains)
		}
	}
	return idx, nil
}

// Lookup maps an observed certificate issuer to the CAA identifier
// domains the issuing CA publishes in its CPS.
//
// aki is the hex Authority Key Identifier of the leaf (that is, the
// issuer's SKI); it is trimmed and upper-cased before the lookup. dn is
// the RFC 2253 subject DN of the issuer (leaf.Issuer.String() in Go) and
// is canonicalised with normalizeDN.
//
// The AKI is tried first because CCADB keys by it; the DN is only a
// fallback for the rare rows whose SKI column is empty.
//
// ok is false when neither key resolves or when the embedded report
// could not be loaded. The returned slice is a fresh copy, so callers
// may retain or mutate it.
func Lookup(aki, dn string) ([]string, bool) {
	index, loadErr := loadCCADB()
	if loadErr != nil || index == nil {
		return nil, false
	}

	var hit []string
	if aki != "" {
		key := strings.ToUpper(strings.TrimSpace(aki))
		hit = index.bySKI[key]
	}
	if len(hit) == 0 && dn != "" {
		hit = index.byDN[normalizeDN(dn)]
	}
	if len(hit) == 0 {
		return nil, false
	}

	domains := append([]string(nil), hit...)
	return domains, true
}

// splitCAADomains breaks up the contents of CCADB's "Recognized CAA
// Domains" cell, which may hold a comma-separated list (DigiCert rows,
// for example, list ~20 domains). Every entry is lowercased, because CAA
// identifiers are case-insensitive, and stripped of surrounding
// whitespace; entries that end up empty are discarded. The result is nil
// when no entry survives.
func splitCAADomains(raw string) []string {
	var domains []string
	rest := raw
	for {
		cell, tail, found := strings.Cut(rest, ",")
		if name := strings.TrimSpace(strings.ToLower(cell)); name != "" {
			domains = append(domains, name)
		}
		if !found {
			break
		}
		rest = tail
	}
	return domains
}

// mergeDomains appends the entries of add that are not yet present to
// existing and returns the result, preserving first-seen order. CCADB
// occasionally lists the same CA twice (cross-signs, re-issues); we
// don't want that to bloat the lookup result.
//
// De-duplication covers add itself as well, so a repeated entry inside a
// single CCADB cell is collapsed even when existing is empty. Entries
// already in existing are kept untouched.
//
// When existing is empty the result is built in freshly allocated
// storage, so it never aliases add. When existing is non-empty the
// result may share its backing array, as with append.
func mergeDomains(existing, add []string) []string {
	seen := make(map[string]struct{}, len(existing)+len(add))
	for _, d := range existing {
		seen[d] = struct{}{}
	}

	merged := existing
	if len(merged) == 0 {
		// Start from nil so append allocates instead of reusing a
		// caller-owned zero-length buffer.
		merged = nil
	}
	for _, d := range add {
		if _, dup := seen[d]; dup {
			continue
		}
		seen[d] = struct{}{}
		merged = append(merged, d)
	}
	return merged
}

// normalizeDN produces a canonical key from a subject DN so that DNs
// produced by Go's pkix.Name.String (comma-joined) compare equal to
// DNs produced by CCADB (semicolon-joined) when their RDN sets match.
//
// Rules:
//   - split on ',' or ';' (see splitRDNs);
//   - trim each RDN and drop RDNs that are empty after trimming, so a
//     trailing or doubled separator does not change the key;
//   - uppercase the RDN type (left of the first '=') because RFC 4514
//     types are case-insensitive, and trim whitespace on both sides of
//     that '='; values are otherwise left as-is;
//   - sort the RDNs alphabetically so reordering does not break
//     comparison;
//   - join the RDNs with ','.
//
// The result is "" when dn is empty or contains no non-blank RDN.
//
// This is intentionally permissive; escaping differences between
// implementations are ignored. Good enough for CCADB fallbacks, and
// the common path is the AKI lookup anyway.
func normalizeDN(dn string) string {
	if dn == "" {
		return ""
	}
	parts := splitRDNs(dn)
	rdns := parts[:0] // filter in place; parts is owned by this call
	for _, p := range parts {
		p = strings.TrimSpace(p)
		if p == "" {
			continue
		}
		// eq > 0 skips fragments that have no type before the '='.
		if eq := strings.IndexByte(p, '='); eq > 0 {
			attrType := strings.ToUpper(strings.TrimSpace(p[:eq]))
			p = attrType + "=" + strings.TrimSpace(p[eq+1:])
		}
		rdns = append(rdns, p)
	}
	sort.Strings(rdns)
	return strings.Join(rdns, ",")
}

// splitRDNs splits a DN string on either ',' or ';', respecting
// backslash escapes: the byte following a backslash is copied verbatim
// and never treated as a separator, and the backslash itself is kept in
// the output. Most RDN values in CCADB do not contain escaped
// separators, but a handful (paths in OU values) do.
//
// Segments are returned untrimmed. An empty segment before a separator
// is kept, while an empty segment after the final separator is omitted;
// callers that care should filter blanks themselves.
func splitRDNs(dn string) []string {
	var out []string
	var cur strings.Builder
	escape := false
	for i := 0; i < len(dn); i++ {
		c := dn[i]
		if escape {
			// Previous byte was a backslash: take this one literally.
			cur.WriteByte(c)
			escape = false
			continue
		}
		switch c {
		case '\\':
			cur.WriteByte(c)
			escape = true
		case ',', ';':
			out = append(out, cur.String())
			cur.Reset()
		default:
			cur.WriteByte(c)
		}
	}
	if cur.Len() > 0 {
		out = append(out, cur.String())
	}
	return out
}