Compare commits
3 commits
master
...
f/unsubscr
| Author | SHA1 | Date | |
|---|---|---|---|
| | 43aec8fdc0 | | |
| | 1c1d474870 | | |
| | 521d5da84c | | |
2 changed files with 39 additions and 14 deletions
|
|
@ -27,6 +27,7 @@ import (
|
|||
"net/http"
|
||||
"net/url"
|
||||
"regexp"
|
||||
"slices"
|
||||
"strings"
|
||||
"time"
|
||||
"unicode"
|
||||
|
|
@ -37,8 +38,9 @@ import (
|
|||
|
||||
// ContentAnalyzer analyzes email content (HTML, links, images).
type ContentAnalyzer struct {
	// Timeout is the analyzer's configurable timeout.
	// NOTE(review): presumably applied to outbound HTTP checks — confirm
	// against NewContentAnalyzer.
	Timeout time.Duration

	// httpClient is the HTTP client held by the analyzer.
	httpClient *http.Client

	// listUnsubscribeURLs holds the URLs taken from the List-Unsubscribe
	// header. AnalyzeContent overwrites it for every message it analyzes and
	// isUnsubscribeLink reads it to recognise unsubscribe links.
	// NOTE(review): this is per-message state stored on the analyzer, so
	// concurrent AnalyzeContent calls on one analyzer look unsafe — confirm.
	listUnsubscribeURLs []string
}
|
||||
|
||||
// NewContentAnalyzer creates a new content analyzer with configurable timeout
|
||||
|
|
@ -110,6 +112,9 @@ func (c *ContentAnalyzer) AnalyzeContent(email *EmailMessage) *ContentResults {
|
|||
|
||||
results.IsMultipart = len(email.Parts) > 1
|
||||
|
||||
// Parse List-Unsubscribe header URLs for use in link detection
|
||||
c.listUnsubscribeURLs = email.GetListUnsubscribeURLs()
|
||||
|
||||
// Get HTML and text parts
|
||||
htmlParts := email.GetHTMLParts()
|
||||
textParts := email.GetTextParts()
|
||||
|
|
@ -331,9 +336,14 @@ func (c *ContentAnalyzer) getAttr(n *html.Node, key string) string {
|
|||
|
||||
// isUnsubscribeLink checks if a link is an unsubscribe link
|
||||
func (c *ContentAnalyzer) isUnsubscribeLink(href string, node *html.Node) bool {
|
||||
// First check: does the href match a URL from the List-Unsubscribe header?
|
||||
if slices.Contains(c.listUnsubscribeURLs, href) {
|
||||
return true
|
||||
}
|
||||
|
||||
// Check href for unsubscribe keywords
|
||||
lowerHref := strings.ToLower(href)
|
||||
unsubKeywords := []string{"unsubscribe", "opt-out", "optout", "remove", "list-unsubscribe"}
|
||||
unsubKeywords := []string{"unsubscribe", "opt-out", "optout", "remove", "list-unsubscribe", "отписване", "desubscripció", "zrušit odběr", "dad-danysgrifio", "afmeld", "abmelden", "διαγραφή", "darse de baja", "poistu postituslistalta", "se désabonner", "ביטול רישום", "leiratkozás", "cancella iscrizione", "登録を取り消す", "구독 해지", "വരിക്കാരനല്ലാതാകുക", "uitschrijven", "meld av", "odsubskrybuj", "cancelar assinatura", "cancelar subscrição", "dezabonare", "отписаться", "avsluta prenumeration", "zrušiť odber", "odjava", "üyeliği sonlandır", "відписатися", "hủy đăng ký", "退订", "退訂"}
|
||||
for _, keyword := range unsubKeywords {
|
||||
if strings.Contains(lowerHref, keyword) {
|
||||
return true
|
||||
|
|
@ -439,7 +449,8 @@ func (c *ContentAnalyzer) hasDomainMisalignment(href, linkText string) bool {
|
|||
// Extract the actual destination domain/email based on scheme
|
||||
var actualDomain string
|
||||
|
||||
if parsedURL.Scheme == "mailto" {
|
||||
switch parsedURL.Scheme {
|
||||
case "mailto":
|
||||
// Extract email address from mailto: URL
|
||||
// Format can be: mailto:user@domain.com or mailto:user@domain.com?subject=...
|
||||
mailtoAddr := parsedURL.Opaque
|
||||
|
|
@ -457,7 +468,8 @@ func (c *ContentAnalyzer) hasDomainMisalignment(href, linkText string) bool {
|
|||
} else {
|
||||
return false // Invalid mailto
|
||||
}
|
||||
} else if parsedURL.Scheme == "http" || parsedURL.Scheme == "https" {
|
||||
case "http":
|
||||
case "https":
|
||||
// Check if URL has a host
|
||||
if parsedURL.Host == "" {
|
||||
return false
|
||||
|
|
@ -469,7 +481,7 @@ func (c *ContentAnalyzer) hasDomainMisalignment(href, linkText string) bool {
|
|||
actualDomain = actualDomain[:idx]
|
||||
}
|
||||
actualDomain = strings.ToLower(actualDomain)
|
||||
} else {
|
||||
default:
|
||||
// Skip checks for other URL schemes (tel, etc.)
|
||||
return false
|
||||
}
|
||||
|
|
@ -492,10 +504,8 @@ func (c *ContentAnalyzer) hasDomainMisalignment(href, linkText string) bool {
|
|||
"email us", "contact us", "send email", "get in touch", "reach out",
|
||||
"contact", "email", "write to us",
|
||||
}
|
||||
for _, generic := range genericTexts {
|
||||
if linkText == generic {
|
||||
return false
|
||||
}
|
||||
if slices.Contains(genericTexts, linkText) {
|
||||
return false
|
||||
}
|
||||
|
||||
// Extract domain-like patterns from link text using regex
|
||||
|
|
@ -562,10 +572,8 @@ func (c *ContentAnalyzer) isSuspiciousURL(urlStr string, parsedURL *url.URL) boo
|
|||
"bit.ly", "tinyurl.com", "goo.gl", "ow.ly", "t.co",
|
||||
"buff.ly", "is.gd", "bl.ink", "short.io",
|
||||
}
|
||||
for _, shortener := range shorteners {
|
||||
if strings.ToLower(parsedURL.Host) == shortener {
|
||||
return true
|
||||
}
|
||||
if slices.Contains(shorteners, strings.ToLower(parsedURL.Host)) {
|
||||
return true
|
||||
}
|
||||
|
||||
// Check for excessive subdomains (possible obfuscation)
|
||||
|
|
|
|||
|
|
@ -301,3 +301,20 @@ func (e *EmailMessage) GetHeaderValue(key string) string {
|
|||
func (e *EmailMessage) HasHeader(key string) bool {
|
||||
return e.Header.Get(key) != ""
|
||||
}
|
||||
|
||||
// GetListUnsubscribeURLs parses the List-Unsubscribe header and returns all URLs.
|
||||
// The header format is: <url1>, <url2>, ...
|
||||
func (e *EmailMessage) GetListUnsubscribeURLs() []string {
|
||||
value := e.Header.Get("List-Unsubscribe")
|
||||
if value == "" {
|
||||
return nil
|
||||
}
|
||||
var urls []string
|
||||
for _, part := range strings.Split(value, ",") {
|
||||
part = strings.TrimSpace(part)
|
||||
if strings.HasPrefix(part, "<") && strings.HasSuffix(part, ">") {
|
||||
urls = append(urls, part[1:len(part)-1])
|
||||
}
|
||||
}
|
||||
return urls
|
||||
}
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue