checker-blacklist/checker/feedcache.go

89 lines
2.1 KiB
Go

package checker
import (
"context"
"net/url"
"strings"
"sync"
"time"
)
// feedCache is a generic URL-feed cache shared between phishing-feed
// sources (OpenPhish, PhishTank).
//
// It keeps one snapshot of the feed, indexed by hostname, and considers the
// snapshot stale once it is older than ttl. At most one refresh runs at a
// time: the lookup call that starts it waits for the fetch, while concurrent
// lookups keep answering from the snapshot that is already there.
type feedCache struct {
	// mu guards the mutable fields below. fetchFn is set by the constructor
	// and is invoked without holding mu.
	mu sync.Mutex

	// urls is the full URL list from the last successful fetch; lookup
	// reports its length as the feed size.
	urls []string
	// byHost maps a hostname to the feed URLs on that host. It stays nil
	// until the first successful fetch, which lookup treats as "stale".
	byHost map[string][]string
	// fetchedAt is when the current snapshot was installed (zero if never).
	fetchedAt time.Time

	// lastAttemptAt is when the most recent refresh attempt finished,
	// whether it succeeded or failed.
	lastAttemptAt time.Time
	// refreshing is true while a lookup call is running fetchFn.
	refreshing bool

	// ttl is the maximum snapshot age before a refresh is wanted.
	ttl time.Duration
	// failBackoff is the minimum time that must pass after lastAttemptAt
	// before another refresh is started.
	failBackoff time.Duration

	// fetchFn downloads the feed and returns the URL list together with its
	// hostname index, or an error.
	fetchFn func(context.Context) ([]string, map[string][]string, error)
}
// newFeedCache builds an empty cache around fetch.
//
// ttl is the snapshot lifetime; a zero or negative value selects the default
// of one hour. The refresh back-off is fixed at one minute. Nothing is
// fetched here: the cache starts without a snapshot.
func newFeedCache(ttl time.Duration, fetch func(context.Context) ([]string, map[string][]string, error)) *feedCache {
	cache := &feedCache{
		ttl:         time.Hour,
		failBackoff: time.Minute,
		fetchFn:     fetch,
	}
	if ttl > 0 {
		cache.ttl = ttl
	}
	return cache
}
// setTTL replaces the snapshot lifetime used by later lookups.
//
// The value is stored as given: unlike newFeedCache, no default is applied,
// so a zero or negative d makes every snapshot count as stale.
func (c *feedCache) setTTL(d time.Duration) {
	c.mu.Lock()
	defer c.mu.Unlock()
	c.ttl = d
}
// lookup returns the feed URLs hosted on domain or on any of its subdomains.
//
// domain is compared case-insensitively and one trailing dot is ignored. A
// domain that is empty after normalization matches nothing.
//
// If the snapshot is missing or older than ttl, no refresh is already in
// flight, and failBackoff has elapsed since the previous attempt, this call
// runs fetchFn with ctx before answering. Concurrent callers do not wait for
// that fetch; they answer from whichever snapshot is current.
//
// Results:
//   - urls: the matching URLs, in unspecified order (map iteration); nil when
//     nothing matches.
//   - size: the number of URLs in the current snapshot.
//   - fetchedAt: when the current snapshot was installed; zero if no fetch
//     has succeeded yet.
//   - err: the fetch error when this call attempted a refresh that failed.
//     The other results are still filled in from the existing snapshot.
func (c *feedCache) lookup(ctx context.Context, domain string) (urls []string, size int, fetchedAt time.Time, err error) {
	domain = strings.ToLower(strings.TrimSuffix(domain, "."))

	// Decide under the lock whether this call is the one that refreshes, and
	// claim the slot so concurrent callers skip straight to the snapshot.
	c.mu.Lock()
	stale := c.byHost == nil || time.Since(c.fetchedAt) > c.ttl
	doRefresh := stale && !c.refreshing && time.Since(c.lastAttemptAt) > c.failBackoff
	if doRefresh {
		c.refreshing = true
	}
	c.mu.Unlock()

	if doRefresh {
		err = c.refresh(ctx)
	}

	c.mu.Lock()
	defer c.mu.Unlock()

	// Without this guard an empty domain would produce the suffix "." and
	// match hosts keyed "" or written with a trailing dot.
	if domain != "" {
		suffix := "." + domain
		for host, hostURLs := range c.byHost {
			if host == domain || strings.HasSuffix(host, suffix) {
				urls = append(urls, hostURLs...)
			}
		}
	}
	return urls, len(c.urls), c.fetchedAt, err
}

// refresh runs fetchFn once and, on success, installs its result as the new
// snapshot. The caller must already have set c.refreshing under c.mu.
//
// The bookkeeping runs in a deferred function so that refreshing is cleared
// and lastAttemptAt is stamped even if fetchFn panics; otherwise a single
// crashed fetch would leave the flag set and block every later refresh.
func (c *feedCache) refresh(ctx context.Context) (err error) {
	var (
		newURLs   []string
		newByHost map[string][]string
		returned  bool // false while unwinding from a panic in fetchFn
	)
	defer func() {
		c.mu.Lock()
		defer c.mu.Unlock()
		c.refreshing = false
		c.lastAttemptAt = time.Now()
		if returned && err == nil {
			c.urls = newURLs
			c.byHost = newByHost
			c.fetchedAt = c.lastAttemptAt
		}
	}()

	newURLs, newByHost, err = c.fetchFn(ctx)
	returned = true
	return err
}
// hostOfURL returns the host name of the URL s, normalized the same way
// lookup normalizes the domain it is asked about: lower-cased and without a
// trailing root dot. The port and any IPv6 brackets are dropped by
// url.URL.Hostname.
//
// Surrounding whitespace (including a stray CR from CRLF feeds) is removed
// before parsing. It returns "" when s cannot be parsed or carries no host,
// for example when the scheme is missing.
//
// NOTE(review): presumably feed fetchers use this to build the byHost keys;
// confirm against the callers.
func hostOfURL(s string) string {
	u, err := url.Parse(strings.TrimSpace(s))
	if err != nil {
		return ""
	}
	// "example.com." and "example.com" are the same host; lookup strips the
	// dot from its query, so strip it here too or such entries never match.
	host := strings.TrimSuffix(u.Hostname(), ".")
	return strings.ToLower(host)
}