Compare commits

...
Sign in to create a new pull request.

3 commits

Author SHA1 Message Date
43aec8fdc0 Add multilingual unsubscribe keywords for link detection
All checks were successful
continuous-integration/drone/push Build is passing
The list comes from github.com/knadh/listmonk i18n strings

Bug: https://github.com/happyDomain/happydeliver/issues/8
2026-02-22 15:20:17 +07:00
1c1d474870 Use List-Unsubscribe header URLs for unsubscribe link detection
All checks were successful
continuous-integration/drone/push Build is passing
Bug: https://github.com/happyDomain/happydeliver/issues/8
2026-02-20 00:21:02 +07:00
521d5da84c Use modern Go slices.Contains and switch instead of if/else if 2026-02-19 23:15:07 +07:00
2 changed files with 39 additions and 14 deletions

View file

@ -27,6 +27,7 @@ import (
"net/http" "net/http"
"net/url" "net/url"
"regexp" "regexp"
"slices"
"strings" "strings"
"time" "time"
"unicode" "unicode"
@ -37,8 +38,9 @@ import (
// ContentAnalyzer analyzes email content (HTML, links, images) // ContentAnalyzer analyzes email content (HTML, links, images)
type ContentAnalyzer struct { type ContentAnalyzer struct {
Timeout time.Duration Timeout time.Duration
httpClient *http.Client httpClient *http.Client
listUnsubscribeURLs []string // URLs from List-Unsubscribe header
} }
// NewContentAnalyzer creates a new content analyzer with configurable timeout // NewContentAnalyzer creates a new content analyzer with configurable timeout
@ -110,6 +112,9 @@ func (c *ContentAnalyzer) AnalyzeContent(email *EmailMessage) *ContentResults {
results.IsMultipart = len(email.Parts) > 1 results.IsMultipart = len(email.Parts) > 1
// Parse List-Unsubscribe header URLs for use in link detection
c.listUnsubscribeURLs = email.GetListUnsubscribeURLs()
// Get HTML and text parts // Get HTML and text parts
htmlParts := email.GetHTMLParts() htmlParts := email.GetHTMLParts()
textParts := email.GetTextParts() textParts := email.GetTextParts()
@ -331,9 +336,14 @@ func (c *ContentAnalyzer) getAttr(n *html.Node, key string) string {
// isUnsubscribeLink checks if a link is an unsubscribe link // isUnsubscribeLink checks if a link is an unsubscribe link
func (c *ContentAnalyzer) isUnsubscribeLink(href string, node *html.Node) bool { func (c *ContentAnalyzer) isUnsubscribeLink(href string, node *html.Node) bool {
// First check: does the href match a URL from the List-Unsubscribe header?
if slices.Contains(c.listUnsubscribeURLs, href) {
return true
}
// Check href for unsubscribe keywords // Check href for unsubscribe keywords
lowerHref := strings.ToLower(href) lowerHref := strings.ToLower(href)
unsubKeywords := []string{"unsubscribe", "opt-out", "optout", "remove", "list-unsubscribe"} unsubKeywords := []string{"unsubscribe", "opt-out", "optout", "remove", "list-unsubscribe", "отписване", "desubscripció", "zrušit odběr", "dad-danysgrifio", "afmeld", "abmelden", "διαγραφή", "darse de baja", "poistu postituslistalta", "se désabonner", "ביטול רישום", "leiratkozás", "cancella iscrizione", "登録を取り消す", "구독 해지", "വരിക്കാരനല്ലാതാകുക", "uitschrijven", "meld av", "odsubskrybuj", "cancelar assinatura", "cancelar subscrição", "dezabonare", "отписаться", "avsluta prenumeration", "zrušiť odber", "odjava", "üyeliği sonlandır", "відписатися", "hủy đăng ký", "退订", "退訂"}
for _, keyword := range unsubKeywords { for _, keyword := range unsubKeywords {
if strings.Contains(lowerHref, keyword) { if strings.Contains(lowerHref, keyword) {
return true return true
@ -439,7 +449,8 @@ func (c *ContentAnalyzer) hasDomainMisalignment(href, linkText string) bool {
// Extract the actual destination domain/email based on scheme // Extract the actual destination domain/email based on scheme
var actualDomain string var actualDomain string
if parsedURL.Scheme == "mailto" { switch parsedURL.Scheme {
case "mailto":
// Extract email address from mailto: URL // Extract email address from mailto: URL
// Format can be: mailto:user@domain.com or mailto:user@domain.com?subject=... // Format can be: mailto:user@domain.com or mailto:user@domain.com?subject=...
mailtoAddr := parsedURL.Opaque mailtoAddr := parsedURL.Opaque
@ -457,7 +468,8 @@ func (c *ContentAnalyzer) hasDomainMisalignment(href, linkText string) bool {
} else { } else {
return false // Invalid mailto return false // Invalid mailto
} }
} else if parsedURL.Scheme == "http" || parsedURL.Scheme == "https" { case "http":
case "https":
// Check if URL has a host // Check if URL has a host
if parsedURL.Host == "" { if parsedURL.Host == "" {
return false return false
@ -469,7 +481,7 @@ func (c *ContentAnalyzer) hasDomainMisalignment(href, linkText string) bool {
actualDomain = actualDomain[:idx] actualDomain = actualDomain[:idx]
} }
actualDomain = strings.ToLower(actualDomain) actualDomain = strings.ToLower(actualDomain)
} else { default:
// Skip checks for other URL schemes (tel, etc.) // Skip checks for other URL schemes (tel, etc.)
return false return false
} }
@ -492,10 +504,8 @@ func (c *ContentAnalyzer) hasDomainMisalignment(href, linkText string) bool {
"email us", "contact us", "send email", "get in touch", "reach out", "email us", "contact us", "send email", "get in touch", "reach out",
"contact", "email", "write to us", "contact", "email", "write to us",
} }
for _, generic := range genericTexts { if slices.Contains(genericTexts, linkText) {
if linkText == generic { return false
return false
}
} }
// Extract domain-like patterns from link text using regex // Extract domain-like patterns from link text using regex
@ -562,10 +572,8 @@ func (c *ContentAnalyzer) isSuspiciousURL(urlStr string, parsedURL *url.URL) boo
"bit.ly", "tinyurl.com", "goo.gl", "ow.ly", "t.co", "bit.ly", "tinyurl.com", "goo.gl", "ow.ly", "t.co",
"buff.ly", "is.gd", "bl.ink", "short.io", "buff.ly", "is.gd", "bl.ink", "short.io",
} }
for _, shortener := range shorteners { if slices.Contains(shorteners, strings.ToLower(parsedURL.Host)) {
if strings.ToLower(parsedURL.Host) == shortener { return true
return true
}
} }
// Check for excessive subdomains (possible obfuscation) // Check for excessive subdomains (possible obfuscation)

View file

@ -301,3 +301,20 @@ func (e *EmailMessage) GetHeaderValue(key string) string {
func (e *EmailMessage) HasHeader(key string) bool { func (e *EmailMessage) HasHeader(key string) bool {
return e.Header.Get(key) != "" return e.Header.Get(key) != ""
} }
// GetListUnsubscribeURLs parses the List-Unsubscribe header and returns all URLs.
// The header format is: <url1>, <url2>, ...
//
// URLs are extracted by scanning for angle-bracket pairs instead of splitting
// on commas, so that:
//   - a URL containing a literal comma (e.g. in a query string) stays intact;
//   - text following a closing bracket, such as a "(comment)", is ignored;
//   - whitespace inside the brackets (e.g. left over from header folding) is
//     removed from the returned URL.
//
// Returned URLs appear in header order, without the enclosing brackets. The
// result is nil when the header is empty or contains no complete <...> pair;
// empty entries ("<>") are skipped.
func (e *EmailMessage) GetListUnsubscribeURLs() []string {
	remaining := e.Header.Get("List-Unsubscribe")

	var urls []string
	for {
		// Skip everything up to the next opening bracket.
		_, afterOpen, found := strings.Cut(remaining, "<")
		if !found {
			break
		}

		// An opening bracket without a closing one is malformed: stop here.
		inner, afterClose, closed := strings.Cut(afterOpen, ">")
		if !closed {
			break
		}
		remaining = afterClose

		// Drop any whitespace inside the brackets; a URL cannot contain raw
		// whitespace, so this only removes folding artifacts.
		if u := strings.Join(strings.Fields(inner), ""); u != "" {
			urls = append(urls, u)
		}
	}

	return urls
}