Compare commits

...

2 commits

Author SHA1 Message Date
1c1d474870 Use List-Unsubscribe header URLs for unsubscribe link detection
All checks were successful
continuous-integration/drone/push Build is passing
Bug: https://github.com/happyDomain/happydeliver/issues/8
2026-02-20 00:21:02 +07:00
521d5da84c Use modern Go slices.Contains and switch instead of if/else if 2026-02-19 23:15:07 +07:00
2 changed files with 38 additions and 13 deletions

View file

@ -27,6 +27,7 @@ import (
"net/http" "net/http"
"net/url" "net/url"
"regexp" "regexp"
"slices"
"strings" "strings"
"time" "time"
"unicode" "unicode"
@ -37,8 +38,9 @@ import (
// ContentAnalyzer analyzes email content (HTML, links, images) // ContentAnalyzer analyzes email content (HTML, links, images)
type ContentAnalyzer struct { type ContentAnalyzer struct {
Timeout time.Duration Timeout time.Duration
httpClient *http.Client httpClient *http.Client
listUnsubscribeURLs []string // URLs from List-Unsubscribe header
} }
// NewContentAnalyzer creates a new content analyzer with configurable timeout // NewContentAnalyzer creates a new content analyzer with configurable timeout
@ -110,6 +112,9 @@ func (c *ContentAnalyzer) AnalyzeContent(email *EmailMessage) *ContentResults {
results.IsMultipart = len(email.Parts) > 1 results.IsMultipart = len(email.Parts) > 1
// Parse List-Unsubscribe header URLs for use in link detection
c.listUnsubscribeURLs = email.GetListUnsubscribeURLs()
// Get HTML and text parts // Get HTML and text parts
htmlParts := email.GetHTMLParts() htmlParts := email.GetHTMLParts()
textParts := email.GetTextParts() textParts := email.GetTextParts()
@ -331,6 +336,11 @@ func (c *ContentAnalyzer) getAttr(n *html.Node, key string) string {
// isUnsubscribeLink checks if a link is an unsubscribe link // isUnsubscribeLink checks if a link is an unsubscribe link
func (c *ContentAnalyzer) isUnsubscribeLink(href string, node *html.Node) bool { func (c *ContentAnalyzer) isUnsubscribeLink(href string, node *html.Node) bool {
// First check: does the href match a URL from the List-Unsubscribe header?
if slices.Contains(c.listUnsubscribeURLs, href) {
return true
}
// Check href for unsubscribe keywords // Check href for unsubscribe keywords
lowerHref := strings.ToLower(href) lowerHref := strings.ToLower(href)
unsubKeywords := []string{"unsubscribe", "opt-out", "optout", "remove", "list-unsubscribe"} unsubKeywords := []string{"unsubscribe", "opt-out", "optout", "remove", "list-unsubscribe"}
@ -439,7 +449,8 @@ func (c *ContentAnalyzer) hasDomainMisalignment(href, linkText string) bool {
// Extract the actual destination domain/email based on scheme // Extract the actual destination domain/email based on scheme
var actualDomain string var actualDomain string
if parsedURL.Scheme == "mailto" { switch parsedURL.Scheme {
case "mailto":
// Extract email address from mailto: URL // Extract email address from mailto: URL
// Format can be: mailto:user@domain.com or mailto:user@domain.com?subject=... // Format can be: mailto:user@domain.com or mailto:user@domain.com?subject=...
mailtoAddr := parsedURL.Opaque mailtoAddr := parsedURL.Opaque
@ -457,7 +468,8 @@ func (c *ContentAnalyzer) hasDomainMisalignment(href, linkText string) bool {
} else { } else {
return false // Invalid mailto return false // Invalid mailto
} }
} else if parsedURL.Scheme == "http" || parsedURL.Scheme == "https" { case "http":
case "https":
// Check if URL has a host // Check if URL has a host
if parsedURL.Host == "" { if parsedURL.Host == "" {
return false return false
@ -469,7 +481,7 @@ func (c *ContentAnalyzer) hasDomainMisalignment(href, linkText string) bool {
actualDomain = actualDomain[:idx] actualDomain = actualDomain[:idx]
} }
actualDomain = strings.ToLower(actualDomain) actualDomain = strings.ToLower(actualDomain)
} else { default:
// Skip checks for other URL schemes (tel, etc.) // Skip checks for other URL schemes (tel, etc.)
return false return false
} }
@ -492,10 +504,8 @@ func (c *ContentAnalyzer) hasDomainMisalignment(href, linkText string) bool {
"email us", "contact us", "send email", "get in touch", "reach out", "email us", "contact us", "send email", "get in touch", "reach out",
"contact", "email", "write to us", "contact", "email", "write to us",
} }
for _, generic := range genericTexts { if slices.Contains(genericTexts, linkText) {
if linkText == generic { return false
return false
}
} }
// Extract domain-like patterns from link text using regex // Extract domain-like patterns from link text using regex
@ -562,10 +572,8 @@ func (c *ContentAnalyzer) isSuspiciousURL(urlStr string, parsedURL *url.URL) boo
"bit.ly", "tinyurl.com", "goo.gl", "ow.ly", "t.co", "bit.ly", "tinyurl.com", "goo.gl", "ow.ly", "t.co",
"buff.ly", "is.gd", "bl.ink", "short.io", "buff.ly", "is.gd", "bl.ink", "short.io",
} }
for _, shortener := range shorteners { if slices.Contains(shorteners, strings.ToLower(parsedURL.Host)) {
if strings.ToLower(parsedURL.Host) == shortener { return true
return true
}
} }
// Check for excessive subdomains (possible obfuscation) // Check for excessive subdomains (possible obfuscation)

View file

@ -301,3 +301,20 @@ func (e *EmailMessage) GetHeaderValue(key string) string {
func (e *EmailMessage) HasHeader(key string) bool { func (e *EmailMessage) HasHeader(key string) bool {
return e.Header.Get(key) != "" return e.Header.Get(key) != ""
} }
// GetListUnsubscribeURLs parses the List-Unsubscribe header and returns every
// angle-bracket enclosed URL it contains, in header order.
//
// The header format (RFC 2369) is a comma-separated list of bracketed URLs:
//
//	<url1>, <url2>, ...
//
// Parsing is deliberately lenient and driven by the brackets rather than by
// the commas, so that:
//   - URLs that themselves contain commas are returned intact;
//   - parenthesised comments before, between or after the URLs are skipped,
//     including any brackets that appear inside such a comment;
//   - whitespace inside the brackets (e.g. left over from header folding) is
//     removed, as RFC 2369 asks clients to ignore it;
//   - empty entries ("<>") and an unterminated "<" are dropped.
//
// It returns nil when the header is absent or holds no usable URL.
func (e *EmailMessage) GetListUnsubscribeURLs() []string {
	value := e.Header.Get("List-Unsubscribe")
	if value == "" {
		return nil
	}

	var urls []string
	commentDepth := 0 // nesting level of "(...)" comments outside brackets
	start := -1       // index just past the current '<', or -1 when outside brackets

	for i := 0; i < len(value); i++ {
		ch := value[i]

		// Inside <...>: everything up to the closing '>' belongs to the URL,
		// including commas and parentheses.
		if start >= 0 {
			if ch == '>' {
				// Drop any whitespace that slipped inside the brackets.
				if u := strings.Join(strings.Fields(value[start:i]), ""); u != "" {
					urls = append(urls, u)
				}
				start = -1
			}
			continue
		}

		switch ch {
		case '(':
			commentDepth++
		case ')':
			if commentDepth > 0 {
				commentDepth--
			}
		case '<':
			// Brackets inside a comment are part of the comment text.
			if commentDepth == 0 {
				start = i + 1
			}
		}
	}

	return urls
}