checker: add Janitor goroutine to enforce retention policy

The Janitor periodically walks every CheckPlan, loads its executions, and deletes the ones that the tiered RetentionPolicy says to drop. Per-user overrides are honoured: if a user's UserQuota.RetentionDays is set, that horizon replaces the system default for the user's plans. User lookups are cached per sweep to avoid repeated storage hits. The janitor is the long-tail counterpart of the (still TODO) cheap hard cap that will be applied at execution-creation time. It runs immediately on Start() and then every configured interval (default 6h).
2026-04-08 11:51:52 +07:00 · 2026-04-08 11:51:52 +07:00 · f626e814c7
commit f626e814c7
parent f54a1bea70
3 changed files with 928 additions and 0 deletions
--- a/internal/app/app.go
+++ b/internal/app/app.go
@ -273,6 +273,17 @@ func (app *App) initUsecases() {
 	)
 	app.usecases.checkerScheduler = checkerUC.NewScheduler(app.usecases.checkerEngine, app.cfg.CheckerMaxConcurrency, app.store, app.store, app.store, app.store)

+	// Retention janitor.
+	app.usecases.checkerJanitor = checkerUC.NewJanitor(
+		app.store,
+		app.store,
+		app.store,
+		app.store,
+		app.store,
+		checkerUC.DefaultRetentionPolicy(app.cfg.CheckerRetentionDays),
+		app.cfg.CheckerJanitorInterval,
+	)
+
 	// Wire scheduler notifications for incremental queue updates.
 	domainService.SetSchedulerNotifier(app.usecases.checkerScheduler)
 	app.usecases.orchestrator.SetSchedulerNotifier(app.usecases.checkerScheduler)
@ -348,6 +359,10 @@ func (app *App) Start() {
 		app.usecases.checkerScheduler.Start(context.Background())
 	}

+	if app.usecases.checkerJanitor != nil {
+		app.usecases.checkerJanitor.Start(context.Background())
+	}
+
 	log.Printf("Public interface listening on %s\n", app.cfg.Bind)
 	if err := app.srv.ListenAndServe(); err != nil && err != http.ErrServerClosed {
 		log.Fatalf("listen: %s\n", err)
@ -365,6 +380,10 @@ func (app *App) Stop() {
 		app.usecases.checkerScheduler.Stop()
 	}

+	if app.usecases.checkerJanitor != nil {
+		app.usecases.checkerJanitor.Stop()
+	}
+
 	// Close storage
 	if app.store != nil {
 		app.store.Close()
--- a/internal/usecase/checker/janitor.go
+++ b/internal/usecase/checker/janitor.go
@ -0,0 +1,241 @@
+// This file is part of the happyDomain (R) project.
+// Copyright (c) 2020-2026 happyDomain
+// Authors: Pierre-Olivier Mercier, et al.
+//
+// This program is offered under a commercial and under the AGPL license.
+// For commercial licensing, contact us at <contact@happydomain.org>.
+//
+// For AGPL licensing:
+// This program is free software: you can redistribute it and/or modify
+// it under the terms of the GNU Affero General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU Affero General Public License for more details.
+//
+// You should have received a copy of the GNU Affero General Public License
+// along with this program.  If not, see <https://www.gnu.org/licenses/>.
+
+package checker
+
+import (
+	"context"
+	"log"
+	"sync"
+	"time"
+
+	"git.happydns.org/happyDomain/model"
+)
+
+// JanitorUserResolver resolves a user from a CheckTarget so the janitor can
+// honour per-user retention overrides stored in UserQuota.
+type JanitorUserResolver interface {
+	GetUser(id happydns.Identifier) (*happydns.User, error)
+}
+
+// Janitor periodically prunes old check executions and evaluations according
+// to the tiered RetentionPolicy. It is the long-tail enforcement counterpart
+// of the cheap hard cap applied at execution-creation time.
+type Janitor struct {
+	planStore     CheckPlanStorage
+	execStore     ExecutionStorage
+	evalStore     CheckEvaluationStorage
+	snapStore     ObservationSnapshotStorage
+	userResolver  JanitorUserResolver
+	defaultPolicy RetentionPolicy
+	interval      time.Duration
+
+	mu      sync.Mutex
+	cancel  context.CancelFunc
+	done    chan struct{}
+	running bool
+}
+
+// NewJanitor builds a Janitor that runs every `interval`. The defaultPolicy
+// is applied to executions of users that did not customize their retention
+// horizon via UserQuota. evalStore and snapStore may be nil if evaluation
+// pruning is not desired.
+func NewJanitor(planStore CheckPlanStorage, execStore ExecutionStorage, evalStore CheckEvaluationStorage, snapStore ObservationSnapshotStorage, userResolver JanitorUserResolver, defaultPolicy RetentionPolicy, interval time.Duration) *Janitor {
+	if interval <= 0 {
+		interval = 6 * time.Hour
+	}
+	return &Janitor{
+		planStore:     planStore,
+		execStore:     execStore,
+		evalStore:     evalStore,
+		snapStore:     snapStore,
+		userResolver:  userResolver,
+		defaultPolicy: defaultPolicy,
+		interval:      interval,
+	}
+}
+
+// Start launches the janitor loop in a goroutine. It runs an immediate sweep
+// once the loop is up.
+func (j *Janitor) Start(ctx context.Context) {
+	j.mu.Lock()
+	if j.running {
+		j.mu.Unlock()
+		return
+	}
+	ctx, cancel := context.WithCancel(ctx)
+	j.cancel = cancel
+	j.done = make(chan struct{})
+	j.running = true
+	j.mu.Unlock()
+
+	go j.loop(ctx)
+}
+
+// Stop halts the janitor and waits for the current sweep to finish.
+func (j *Janitor) Stop() {
+	j.mu.Lock()
+	cancel := j.cancel
+	done := j.done
+	j.mu.Unlock()
+	if cancel != nil {
+		cancel()
+	}
+	if done != nil {
+		<-done
+	}
+	j.mu.Lock()
+	j.running = false
+	j.mu.Unlock()
+}
+
+func (j *Janitor) loop(ctx context.Context) {
+	defer close(j.done)
+
+	// Run immediately, then on the configured interval.
+	j.RunOnce(ctx)
+
+	ticker := time.NewTicker(j.interval)
+	defer ticker.Stop()
+	for {
+		select {
+		case <-ctx.Done():
+			return
+		case <-ticker.C:
+			j.RunOnce(ctx)
+		}
+	}
+}
+
+// RunOnce performs a single sweep over all check plans, applying the per-user
+// retention policy to both executions and evaluations. Returns the total
+// number of records deleted (executions + evaluations).
+func (j *Janitor) RunOnce(ctx context.Context) int {
+	iter, err := j.planStore.ListAllCheckPlans()
+	if err != nil {
+		log.Printf("Janitor: failed to list check plans: %v", err)
+		return 0
+	}
+	defer iter.Close()
+
+	now := time.Now()
+	deleted := 0
+
+	// Cache user policies to avoid resolving the same user repeatedly.
+	policyByUser := map[string]RetentionPolicy{}
+
+	for iter.Next() {
+		select {
+		case <-ctx.Done():
+			return deleted
+		default:
+		}
+
+		plan := iter.Item()
+		if plan == nil {
+			continue
+		}
+
+		policy := j.policyForTarget(plan.Target, policyByUser)
+		hardCutoff := now.AddDate(0, 0, -policy.RetentionDays)
+
+		// Prune executions using the tiered retention policy.
+		execs, err := j.execStore.ListExecutionsByPlan(plan.Id)
+		if err != nil {
+			log.Printf("Janitor: failed to list executions for plan %s: %v", plan.Id.String(), err)
+		} else if len(execs) > 0 {
+			// All executions share the same (CheckerID, Target) since they come
+			// from a single plan, so Decide's internal grouping is a no-op here.
+			_, drop := policy.Decide(execs, now)
+
+			for _, id := range drop {
+				if err := j.execStore.DeleteExecution(id); err != nil {
+					log.Printf("Janitor: failed to delete execution %s: %v", id.String(), err)
+					continue
+				}
+				deleted++
+			}
+		}
+
+		// Prune evaluations older than the hard cutoff.
+		if j.evalStore != nil {
+			deleted += j.pruneEvaluations(plan.Id, hardCutoff)
+		}
+	}
+
+	if err := iter.Err(); err != nil {
+		log.Printf("Janitor: iterator error while walking check plans: %v", err)
+	}
+
+	if deleted > 0 {
+		log.Printf("Janitor: pruned %d records", deleted)
+	}
+	return deleted
+}
+
+// pruneEvaluations deletes evaluations for the given plan that are older than
+// the cutoff, along with their associated snapshots.
+func (j *Janitor) pruneEvaluations(planID happydns.Identifier, cutoff time.Time) int {
+	evals, err := j.evalStore.ListEvaluationsByPlan(planID)
+	if err != nil {
+		log.Printf("Janitor: failed to list evaluations for plan %s: %v", planID.String(), err)
+		return 0
+	}
+
+	deleted := 0
+	for _, eval := range evals {
+		if eval.EvaluatedAt.Before(cutoff) {
+			// Delete the associated snapshot first.
+			if j.snapStore != nil && !eval.SnapshotID.IsEmpty() {
+				if err := j.snapStore.DeleteSnapshot(eval.SnapshotID); err != nil {
+					log.Printf("Janitor: failed to delete snapshot %s: %v", eval.SnapshotID.String(), err)
+				}
+			}
+			if err := j.evalStore.DeleteEvaluation(eval.Id); err != nil {
+				log.Printf("Janitor: failed to delete evaluation %s: %v", eval.Id.String(), err)
+				continue
+			}
+			deleted++
+		}
+	}
+	return deleted
+}
+
+func (j *Janitor) policyForTarget(target happydns.CheckTarget, cache map[string]RetentionPolicy) RetentionPolicy {
+	uid := target.UserId
+	if uid == "" || j.userResolver == nil {
+		return j.defaultPolicy
+	}
+	if p, ok := cache[uid]; ok {
+		return p
+	}
+	policy := j.defaultPolicy
+	id, err := happydns.NewIdentifierFromString(uid)
+	if err == nil {
+		if user, err := j.userResolver.GetUser(id); err == nil && user != nil {
+			if user.Quota.RetentionDays > 0 {
+				policy = DefaultRetentionPolicy(user.Quota.RetentionDays)
+			}
+		}
+	}
+	cache[uid] = policy
+	return policy
+}
--- a/internal/usecase/checker/janitor_test.go
+++ b/internal/usecase/checker/janitor_test.go
@ -0,0 +1,668 @@
+// This file is part of the happyDomain (R) project.
+// Copyright (c) 2020-2026 happyDomain
+// Authors: Pierre-Olivier Mercier, et al.
+//
+// This program is offered under a commercial and under the AGPL license.
+// For commercial licensing, contact us at <contact@happydomain.org>.
+//
+// For AGPL licensing:
+// This program is free software: you can redistribute it and/or modify
+// it under the terms of the GNU Affero General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU Affero General Public License for more details.
+//
+// You should have received a copy of the GNU Affero General Public License
+// along with this program.  If not, see <https://www.gnu.org/licenses/>.
+
+package checker
+
+import (
+	"context"
+	"errors"
+	"fmt"
+	"sync"
+	"testing"
+	"time"
+
+	"git.happydns.org/happyDomain/model"
+)
+
+// --- mock execution store for janitor tests ---
+
+type mockExecStore struct {
+	mu    sync.Mutex
+	execs map[string][]*happydns.Execution // planID (base64) -> executions
+	errs  map[string]error                 // planID (base64) -> error
+}
+
+func newMockExecStore() *mockExecStore {
+	return &mockExecStore{
+		execs: make(map[string][]*happydns.Execution),
+		errs:  make(map[string]error),
+	}
+}
+
+func (s *mockExecStore) addExec(planID happydns.Identifier, exec *happydns.Execution) {
+	s.mu.Lock()
+	defer s.mu.Unlock()
+	key := planID.String()
+	s.execs[key] = append(s.execs[key], exec)
+}
+
+func (s *mockExecStore) setListError(planID happydns.Identifier, err error) {
+	s.mu.Lock()
+	defer s.mu.Unlock()
+	s.errs[planID.String()] = err
+}
+
+func (s *mockExecStore) ListExecutionsByPlan(planID happydns.Identifier) ([]*happydns.Execution, error) {
+	s.mu.Lock()
+	defer s.mu.Unlock()
+	key := planID.String()
+	if err, ok := s.errs[key]; ok {
+		return nil, err
+	}
+	return s.execs[key], nil
+}
+
+func (s *mockExecStore) DeleteExecution(execID happydns.Identifier) error {
+	s.mu.Lock()
+	defer s.mu.Unlock()
+	for planKey, execs := range s.execs {
+		for i, e := range execs {
+			if e.Id.Equals(execID) {
+				s.execs[planKey] = append(execs[:i], execs[i+1:]...)
+				return nil
+			}
+		}
+	}
+	return fmt.Errorf("execution %s not found", execID.String())
+}
+
+// Unused interface methods.
+func (s *mockExecStore) ListAllExecutions() (happydns.Iterator[happydns.Execution], error) {
+	return nil, nil
+}
+func (s *mockExecStore) ListExecutionsByChecker(string, happydns.CheckTarget, int) ([]*happydns.Execution, error) {
+	return nil, nil
+}
+func (s *mockExecStore) ListExecutionsByUser(happydns.Identifier, int) ([]*happydns.Execution, error) {
+	return nil, nil
+}
+func (s *mockExecStore) ListExecutionsByDomain(happydns.Identifier, int) ([]*happydns.Execution, error) {
+	return nil, nil
+}
+func (s *mockExecStore) GetExecution(happydns.Identifier) (*happydns.Execution, error) {
+	return nil, nil
+}
+func (s *mockExecStore) CreateExecution(*happydns.Execution) error                          { return nil }
+func (s *mockExecStore) UpdateExecution(*happydns.Execution) error                          { return nil }
+func (s *mockExecStore) DeleteExecutionsByChecker(string, happydns.CheckTarget) error       { return nil }
+func (s *mockExecStore) TidyExecutionIndexes() error                                        { return nil }
+func (s *mockExecStore) ClearExecutions() error                                             { return nil }
+
+// --- mock user resolver ---
+
+type mockUserResolver struct {
+	users map[string]*happydns.User
+}
+
+func (r *mockUserResolver) GetUser(id happydns.Identifier) (*happydns.User, error) {
+	if u, ok := r.users[id.String()]; ok {
+		return u, nil
+	}
+	return nil, fmt.Errorf("user %s not found", id.String())
+}
+
+// --- counting wrapper ---
+
+type countingUserResolver struct {
+	inner JanitorUserResolver
+	calls *int
+}
+
+func (r *countingUserResolver) GetUser(id happydns.Identifier) (*happydns.User, error) {
+	*r.calls++
+	return r.inner.GetUser(id)
+}
+
+// --- failing plan store ---
+
+type failingPlanStore struct {
+	mockPlanStore
+	err error
+}
+
+func (s *failingPlanStore) ListAllCheckPlans() (happydns.Iterator[happydns.CheckPlan], error) {
+	return nil, s.err
+}
+
+// --- helpers ---
+
+func makePlan(id string, userID string) *happydns.CheckPlan {
+	return &happydns.CheckPlan{
+		Id:        happydns.Identifier(id),
+		CheckerID: "ping",
+		Target: happydns.CheckTarget{
+			UserId:   userID,
+			DomainId: "example.com",
+		},
+	}
+}
+
+func makeExec(id string, age time.Duration, now time.Time) *happydns.Execution {
+	return &happydns.Execution{
+		Id:        happydns.Identifier(id),
+		CheckerID: "ping",
+		Target:    happydns.CheckTarget{DomainId: "example.com"},
+		StartedAt: now.Add(-age),
+	}
+}
+
+// --- tests ---
+
+func TestJanitor_RunOnce_NoPlans(t *testing.T) {
+	ps := &mockPlanStore{}
+	es := newMockExecStore()
+	j := NewJanitor(ps, es, nil, nil, nil, DefaultRetentionPolicy(30), time.Hour)
+
+	deleted := j.RunOnce(context.Background())
+	if deleted != 0 {
+		t.Fatalf("expected 0 deletions, got %d", deleted)
+	}
+}
+
+func TestJanitor_RunOnce_NoExecutions(t *testing.T) {
+	plan := makePlan("plan1", "")
+	ps := &mockPlanStore{plans: []*happydns.CheckPlan{plan}}
+	es := newMockExecStore()
+	j := NewJanitor(ps, es, nil, nil, nil, DefaultRetentionPolicy(30), time.Hour)
+
+	deleted := j.RunOnce(context.Background())
+	if deleted != 0 {
+		t.Fatalf("expected 0 deletions, got %d", deleted)
+	}
+}
+
+func TestJanitor_RunOnce_PrunesExpiredExecutions(t *testing.T) {
+	now := time.Now()
+	plan := makePlan("plan1", "")
+	ps := &mockPlanStore{plans: []*happydns.CheckPlan{plan}}
+	es := newMockExecStore()
+
+	// One recent execution (1 hour old) and one expired (100 days old with a 30-day policy).
+	es.addExec(plan.Id, makeExec("recent", 1*time.Hour, now))
+	es.addExec(plan.Id, makeExec("old", 100*24*time.Hour, now))
+
+	j := NewJanitor(ps, es, nil, nil, nil, DefaultRetentionPolicy(30), time.Hour)
+	deleted := j.RunOnce(context.Background())
+
+	if deleted != 1 {
+		t.Fatalf("expected 1 deletion, got %d", deleted)
+	}
+
+	// Verify the old execution was deleted.
+	remaining, _ := es.ListExecutionsByPlan(plan.Id)
+	if len(remaining) != 1 {
+		t.Fatalf("expected 1 remaining execution, got %d", len(remaining))
+	}
+	if !remaining[0].Id.Equals(happydns.Identifier("recent")) {
+		t.Fatalf("expected 'recent' to survive, got %s", remaining[0].Id.String())
+	}
+}
+
+func TestJanitor_RunOnce_PerUserRetentionOverride(t *testing.T) {
+	now := time.Now()
+	userID := happydns.Identifier("user1")
+	plan := makePlan("plan1", userID.String())
+	ps := &mockPlanStore{plans: []*happydns.CheckPlan{plan}}
+	es := newMockExecStore()
+
+	// Execution 20 days old. System default is 30 days (would keep), but user override is 10 days (should drop).
+	es.addExec(plan.Id, makeExec("exec1", 20*24*time.Hour, now))
+
+	resolver := &mockUserResolver{
+		users: map[string]*happydns.User{
+			userID.String(): {
+				Id:    userID,
+				Quota: happydns.UserQuota{RetentionDays: 10},
+			},
+		},
+	}
+
+	j := NewJanitor(ps, es, nil, nil, resolver, DefaultRetentionPolicy(30), time.Hour)
+	deleted := j.RunOnce(context.Background())
+
+	if deleted != 1 {
+		t.Fatalf("expected 1 deletion (user retention=10d), got %d", deleted)
+	}
+}
+
+func TestJanitor_RunOnce_UserCacheAvoidsRepeatedLookups(t *testing.T) {
+	now := time.Now()
+	userID := happydns.Identifier("user1")
+
+	// Two plans for the same user.
+	plan1 := makePlan("plan1", userID.String())
+	plan2 := makePlan("plan2", userID.String())
+	ps := &mockPlanStore{plans: []*happydns.CheckPlan{plan1, plan2}}
+	es := newMockExecStore()
+
+	es.addExec(plan1.Id, makeExec("e1", 20*24*time.Hour, now))
+	es.addExec(plan2.Id, makeExec("e2", 20*24*time.Hour, now))
+
+	calls := 0
+	resolver := &countingUserResolver{
+		inner: &mockUserResolver{
+			users: map[string]*happydns.User{
+				userID.String(): {
+					Id:    userID,
+					Quota: happydns.UserQuota{RetentionDays: 10},
+				},
+			},
+		},
+		calls: &calls,
+	}
+
+	j := NewJanitor(ps, es, nil, nil, resolver, DefaultRetentionPolicy(30), time.Hour)
+	j.RunOnce(context.Background())
+
+	if calls != 1 {
+		t.Fatalf("expected user resolver to be called once (cached), got %d", calls)
+	}
+}
+
+func TestJanitor_RunOnce_NilUserResolverUsesDefault(t *testing.T) {
+	now := time.Now()
+	plan := makePlan("plan1", "user1")
+	ps := &mockPlanStore{plans: []*happydns.CheckPlan{plan}}
+	es := newMockExecStore()
+
+	// 20 days old with a 30-day default policy: should be kept.
+	es.addExec(plan.Id, makeExec("exec1", 20*24*time.Hour, now))
+
+	j := NewJanitor(ps, es, nil, nil, nil, DefaultRetentionPolicy(30), time.Hour)
+	deleted := j.RunOnce(context.Background())
+
+	if deleted != 0 {
+		t.Fatalf("expected 0 deletions (within default 30d retention), got %d", deleted)
+	}
+}
+
+func TestJanitor_RunOnce_ListPlanError(t *testing.T) {
+	ps := &failingPlanStore{err: errors.New("storage down")}
+	es := newMockExecStore()
+	j := NewJanitor(ps, es, nil, nil, nil, DefaultRetentionPolicy(30), time.Hour)
+
+	deleted := j.RunOnce(context.Background())
+	if deleted != 0 {
+		t.Fatalf("expected 0 on plan listing error, got %d", deleted)
+	}
+}
+
+func TestJanitor_RunOnce_ListExecErrorContinues(t *testing.T) {
+	now := time.Now()
+	plan1 := makePlan("plan1", "")
+	plan2 := makePlan("plan2", "")
+	ps := &mockPlanStore{plans: []*happydns.CheckPlan{plan1, plan2}}
+	es := newMockExecStore()
+
+	// plan1 returns an error; plan2 has a deletable execution.
+	es.setListError(plan1.Id, errors.New("corrupt index"))
+	es.addExec(plan2.Id, makeExec("old", 100*24*time.Hour, now))
+
+	j := NewJanitor(ps, es, nil, nil, nil, DefaultRetentionPolicy(30), time.Hour)
+	deleted := j.RunOnce(context.Background())
+
+	if deleted != 1 {
+		t.Fatalf("expected 1 deletion (plan1 error should be skipped), got %d", deleted)
+	}
+}
+
+func TestJanitor_RunOnce_ContextCancellation(t *testing.T) {
+	now := time.Now()
+	var plans []*happydns.CheckPlan
+	es := newMockExecStore()
+
+	// Create many plans with expired executions.
+	for i := 0; i < 100; i++ {
+		id := fmt.Sprintf("plan%d", i)
+		plan := makePlan(id, "")
+		plans = append(plans, plan)
+		es.addExec(plan.Id, makeExec(fmt.Sprintf("exec%d", i), 100*24*time.Hour, now))
+	}
+	ps := &mockPlanStore{plans: plans}
+
+	ctx, cancel := context.WithCancel(context.Background())
+	cancel() // cancel immediately
+
+	j := NewJanitor(ps, es, nil, nil, nil, DefaultRetentionPolicy(30), time.Hour)
+	deleted := j.RunOnce(ctx)
+
+	// Should have stopped early - not all 100 should be deleted.
+	if deleted >= 100 {
+		t.Fatalf("expected early exit from cancellation, but all %d were deleted", deleted)
+	}
+}
+
+func TestJanitor_StartStop(t *testing.T) {
+	ps := &mockPlanStore{}
+	es := newMockExecStore()
+	j := NewJanitor(ps, es, nil, nil, nil, DefaultRetentionPolicy(30), 50*time.Millisecond)
+
+	ctx := context.Background()
+	j.Start(ctx)
+
+	// Let it run a couple of ticks.
+	time.Sleep(150 * time.Millisecond)
+
+	j.Stop()
+
+	// Verify it actually stopped by checking that Stop doesn't hang.
+}
+
+func TestJanitor_DoubleStartIsNoop(t *testing.T) {
+	ps := &mockPlanStore{}
+	es := newMockExecStore()
+	j := NewJanitor(ps, es, nil, nil, nil, DefaultRetentionPolicy(30), time.Hour)
+
+	ctx := context.Background()
+	j.Start(ctx)
+	j.Start(ctx) // should not panic or start a second goroutine
+
+	j.Stop()
+}
+
+func TestJanitor_StopBeforeStartIsNoop(t *testing.T) {
+	ps := &mockPlanStore{}
+	es := newMockExecStore()
+	j := NewJanitor(ps, es, nil, nil, nil, DefaultRetentionPolicy(30), time.Hour)
+
+	// Should not panic or hang.
+	j.Stop()
+}
+
+func TestJanitor_DefaultInterval(t *testing.T) {
+	ps := &mockPlanStore{}
+	es := newMockExecStore()
+	j := NewJanitor(ps, es, nil, nil, nil, DefaultRetentionPolicy(30), 0)
+
+	if j.interval != 6*time.Hour {
+		t.Fatalf("expected default interval 6h, got %v", j.interval)
+	}
+}
+
+func TestJanitor_RunOnce_MultiplePlansMultipleUsers(t *testing.T) {
+	now := time.Now()
+	user1 := happydns.Identifier("user1")
+	user2 := happydns.Identifier("user2")
+
+	plan1 := makePlan("plan1", user1.String())
+	plan2 := makePlan("plan2", user2.String())
+	ps := &mockPlanStore{plans: []*happydns.CheckPlan{plan1, plan2}}
+	es := newMockExecStore()
+
+	// user1 has retention=10d, exec at 15 days -> should be pruned.
+	es.addExec(plan1.Id, makeExec("u1_exec", 15*24*time.Hour, now))
+
+	// user2 has retention=30d, exec at 15 days -> should be kept.
+	es.addExec(plan2.Id, makeExec("u2_exec", 15*24*time.Hour, now))
+
+	resolver := &mockUserResolver{
+		users: map[string]*happydns.User{
+			user1.String(): {Id: user1, Quota: happydns.UserQuota{RetentionDays: 10}},
+			user2.String(): {Id: user2, Quota: happydns.UserQuota{RetentionDays: 30}},
+		},
+	}
+
+	j := NewJanitor(ps, es, nil, nil, resolver, DefaultRetentionPolicy(365), time.Hour)
+	deleted := j.RunOnce(context.Background())
+
+	if deleted != 1 {
+		t.Fatalf("expected 1 deletion (user1 only), got %d", deleted)
+	}
+
+	remaining1, _ := es.ListExecutionsByPlan(plan1.Id)
+	if len(remaining1) != 0 {
+		t.Fatalf("expected user1's exec to be deleted, got %d remaining", len(remaining1))
+	}
+
+	remaining2, _ := es.ListExecutionsByPlan(plan2.Id)
+	if len(remaining2) != 1 {
+		t.Fatalf("expected user2's exec to be kept, got %d remaining", len(remaining2))
+	}
+}
+
+// --- mock evaluation store for janitor tests ---
+
+type mockEvalStore struct {
+	mu    sync.Mutex
+	evals map[string][]*happydns.CheckEvaluation // planID (base64) -> evaluations
+}
+
+func newMockEvalStore() *mockEvalStore {
+	return &mockEvalStore{
+		evals: make(map[string][]*happydns.CheckEvaluation),
+	}
+}
+
+func (s *mockEvalStore) addEval(planID happydns.Identifier, eval *happydns.CheckEvaluation) {
+	s.mu.Lock()
+	defer s.mu.Unlock()
+	key := planID.String()
+	s.evals[key] = append(s.evals[key], eval)
+}
+
+func (s *mockEvalStore) ListEvaluationsByPlan(planID happydns.Identifier) ([]*happydns.CheckEvaluation, error) {
+	s.mu.Lock()
+	defer s.mu.Unlock()
+	return s.evals[planID.String()], nil
+}
+
+func (s *mockEvalStore) DeleteEvaluation(evalID happydns.Identifier) error {
+	s.mu.Lock()
+	defer s.mu.Unlock()
+	for planKey, evals := range s.evals {
+		for i, e := range evals {
+			if e.Id.Equals(evalID) {
+				s.evals[planKey] = append(evals[:i], evals[i+1:]...)
+				return nil
+			}
+		}
+	}
+	return fmt.Errorf("evaluation %s not found", evalID.String())
+}
+
+// Unused interface methods.
+func (s *mockEvalStore) ListAllEvaluations() (happydns.Iterator[happydns.CheckEvaluation], error) {
+	return nil, nil
+}
+func (s *mockEvalStore) ListEvaluationsByChecker(string, happydns.CheckTarget, int) ([]*happydns.CheckEvaluation, error) {
+	return nil, nil
+}
+func (s *mockEvalStore) GetEvaluation(happydns.Identifier) (*happydns.CheckEvaluation, error) {
+	return nil, nil
+}
+func (s *mockEvalStore) GetLatestEvaluation(happydns.Identifier) (*happydns.CheckEvaluation, error) {
+	return nil, nil
+}
+func (s *mockEvalStore) CreateEvaluation(*happydns.CheckEvaluation) error                    { return nil }
+func (s *mockEvalStore) DeleteEvaluationsByChecker(string, happydns.CheckTarget) error       { return nil }
+func (s *mockEvalStore) TidyEvaluationIndexes() error                                        { return nil }
+func (s *mockEvalStore) ClearEvaluations() error                                             { return nil }
+
+// --- mock snapshot store for janitor tests ---
+
+type mockSnapStore struct {
+	mu       sync.Mutex
+	deleted  []string // snapshot IDs that were deleted
+	failNext bool
+}
+
+func newMockSnapStore() *mockSnapStore {
+	return &mockSnapStore{}
+}
+
+func (s *mockSnapStore) DeleteSnapshot(snapID happydns.Identifier) error {
+	s.mu.Lock()
+	defer s.mu.Unlock()
+	if s.failNext {
+		s.failNext = false
+		return fmt.Errorf("snapshot %s delete failed", snapID.String())
+	}
+	s.deleted = append(s.deleted, snapID.String())
+	return nil
+}
+
+func (s *mockSnapStore) deletedCount() int {
+	s.mu.Lock()
+	defer s.mu.Unlock()
+	return len(s.deleted)
+}
+
+// Unused interface methods.
+func (s *mockSnapStore) ListAllSnapshots() (happydns.Iterator[happydns.ObservationSnapshot], error) {
+	return nil, nil
+}
+func (s *mockSnapStore) GetSnapshot(happydns.Identifier) (*happydns.ObservationSnapshot, error) {
+	return nil, nil
+}
+func (s *mockSnapStore) CreateSnapshot(*happydns.ObservationSnapshot) error { return nil }
+func (s *mockSnapStore) ClearSnapshots() error                              { return nil }
+
+// --- helpers ---
+
+func makeEval(id string, snapID string, age time.Duration, now time.Time, planID happydns.Identifier) *happydns.CheckEvaluation {
+	pid := planID
+	return &happydns.CheckEvaluation{
+		Id:          happydns.Identifier(id),
+		PlanID:      &pid,
+		CheckerID:   "ping",
+		Target:      happydns.CheckTarget{DomainId: "example.com"},
+		SnapshotID:  happydns.Identifier(snapID),
+		EvaluatedAt: now.Add(-age),
+	}
+}
+
+// --- evaluation pruning tests ---
+
+func TestJanitor_RunOnce_PrunesExpiredEvaluations(t *testing.T) {
+	now := time.Now()
+	plan := makePlan("plan1", "")
+	ps := &mockPlanStore{plans: []*happydns.CheckPlan{plan}}
+	es := newMockExecStore()
+	evs := newMockEvalStore()
+	ss := newMockSnapStore()
+
+	evs.addEval(plan.Id, makeEval("recent_eval", "snap1", 1*time.Hour, now, plan.Id))
+	evs.addEval(plan.Id, makeEval("old_eval", "snap2", 100*24*time.Hour, now, plan.Id))
+
+	j := NewJanitor(ps, es, evs, ss, nil, DefaultRetentionPolicy(30), time.Hour)
+	deleted := j.RunOnce(context.Background())
+
+	if deleted != 1 {
+		t.Fatalf("expected 1 deletion, got %d", deleted)
+	}
+
+	remaining, _ := evs.ListEvaluationsByPlan(plan.Id)
+	if len(remaining) != 1 {
+		t.Fatalf("expected 1 remaining evaluation, got %d", len(remaining))
+	}
+	if !remaining[0].Id.Equals(happydns.Identifier("recent_eval")) {
+		t.Fatalf("expected 'recent_eval' to survive, got %s", remaining[0].Id.String())
+	}
+
+	if ss.deletedCount() != 1 {
+		t.Fatalf("expected 1 snapshot deleted, got %d", ss.deletedCount())
+	}
+}
+
+func TestJanitor_RunOnce_PrunesBothExecutionsAndEvaluations(t *testing.T) {
+	now := time.Now()
+	plan := makePlan("plan1", "")
+	ps := &mockPlanStore{plans: []*happydns.CheckPlan{plan}}
+	es := newMockExecStore()
+	evs := newMockEvalStore()
+	ss := newMockSnapStore()
+
+	es.addExec(plan.Id, makeExec("old_exec", 100*24*time.Hour, now))
+	evs.addEval(plan.Id, makeEval("old_eval", "snap1", 100*24*time.Hour, now, plan.Id))
+
+	j := NewJanitor(ps, es, evs, ss, nil, DefaultRetentionPolicy(30), time.Hour)
+	deleted := j.RunOnce(context.Background())
+
+	if deleted != 2 {
+		t.Fatalf("expected 2 deletions (1 exec + 1 eval), got %d", deleted)
+	}
+}
+
+func TestJanitor_RunOnce_EvalPruningRespectsPerUserRetention(t *testing.T) {
+	now := time.Now()
+	userID := happydns.Identifier("user1")
+	plan := makePlan("plan1", userID.String())
+	ps := &mockPlanStore{plans: []*happydns.CheckPlan{plan}}
+	es := newMockExecStore()
+	evs := newMockEvalStore()
+	ss := newMockSnapStore()
+
+	// Evaluation 20 days old. System default is 30 days (would keep), but user override is 10 days (should drop).
+	evs.addEval(plan.Id, makeEval("eval1", "snap1", 20*24*time.Hour, now, plan.Id))
+
+	resolver := &mockUserResolver{
+		users: map[string]*happydns.User{
+			userID.String(): {
+				Id:    userID,
+				Quota: happydns.UserQuota{RetentionDays: 10},
+			},
+		},
+	}
+
+	j := NewJanitor(ps, es, evs, ss, resolver, DefaultRetentionPolicy(30), time.Hour)
+	deleted := j.RunOnce(context.Background())
+
+	if deleted != 1 {
+		t.Fatalf("expected 1 deletion (user retention=10d), got %d", deleted)
+	}
+}
+
+func TestJanitor_RunOnce_NilEvalStoreSkipsEvalPruning(t *testing.T) {
+	now := time.Now()
+	plan := makePlan("plan1", "")
+	ps := &mockPlanStore{plans: []*happydns.CheckPlan{plan}}
+	es := newMockExecStore()
+
+	es.addExec(plan.Id, makeExec("old", 100*24*time.Hour, now))
+
+	j := NewJanitor(ps, es, nil, nil, nil, DefaultRetentionPolicy(30), time.Hour)
+	deleted := j.RunOnce(context.Background())
+
+	// Should only delete the execution, not panic on nil evalStore.
+	if deleted != 1 {
+		t.Fatalf("expected 1 deletion, got %d", deleted)
+	}
+}
+
+func TestJanitor_RunOnce_SnapshotDeleteFailureContinues(t *testing.T) {
+	now := time.Now()
+	plan := makePlan("plan1", "")
+	ps := &mockPlanStore{plans: []*happydns.CheckPlan{plan}}
+	es := newMockExecStore()
+	evs := newMockEvalStore()
+	ss := newMockSnapStore()
+	ss.failNext = true
+
+	evs.addEval(plan.Id, makeEval("old_eval", "snap1", 100*24*time.Hour, now, plan.Id))
+
+	j := NewJanitor(ps, es, evs, ss, nil, DefaultRetentionPolicy(30), time.Hour)
+	deleted := j.RunOnce(context.Background())
+
+	// Evaluation should still be deleted even if snapshot deletion fails.
+	if deleted != 1 {
+		t.Fatalf("expected 1 deletion despite snapshot failure, got %d", deleted)
+	}
+}