// This file is part of the happyDeliver (R) project. // Copyright (c) 2025 happyDomain // Authors: Pierre-Olivier Mercier, et al. // // This program is offered under a commercial and under the AGPL license. // For commercial licensing, contact us at . // // For AGPL licensing: // This program is free software: you can redistribute it and/or modify // it under the terms of the GNU Affero General Public License as published by // the Free Software Foundation, either version 3 of the License, or // (at your option) any later version. // // This program is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Affero General Public License for more details. // // You should have received a copy of the GNU Affero General Public License // along with this program. If not, see . package analyzer import ( "net/mail" "net/url" "strings" "testing" "time" "golang.org/x/net/html" ) func TestNewContentAnalyzer(t *testing.T) { tests := []struct { name string timeout time.Duration expectedTimeout time.Duration }{ { name: "Default timeout", timeout: 0, expectedTimeout: 10 * time.Second, }, { name: "Custom timeout", timeout: 5 * time.Second, expectedTimeout: 5 * time.Second, }, } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { analyzer := NewContentAnalyzer(tt.timeout) if analyzer.Timeout != tt.expectedTimeout { t.Errorf("Timeout = %v, want %v", analyzer.Timeout, tt.expectedTimeout) } if analyzer.httpClient == nil { t.Error("httpClient should not be nil") } }) } } func TestExtractTextFromHTML(t *testing.T) { tests := []struct { name string html string expectedText string }{ { name: "Simple text", html: "

Hello World

", expectedText: "Hello World", }, { name: "Multiple elements", html: "

Title

Paragraph

", expectedText: "Title Paragraph", }, { name: "With script tag", html: "

Text

", expectedText: "Text More", }, { name: "With style tag", html: "

Text

", expectedText: "Text More", }, { name: "Empty HTML", html: "", expectedText: "", }, } analyzer := NewContentAnalyzer(5 * time.Second) for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { text := analyzer.extractTextFromHTML(tt.html) if text != tt.expectedText { t.Errorf("extractTextFromHTML() = %q, want %q", text, tt.expectedText) } }) } } func TestIsUnsubscribeLink(t *testing.T) { tests := []struct { name string href string linkText string expected bool }{ { name: "Unsubscribe in URL", href: "https://example.com/unsubscribe?id=123", linkText: "Click here", expected: true, }, { name: "Unsubscribe in text", href: "https://example.com/action?id=123", linkText: "Unsubscribe from this list", expected: true, }, { name: "Opt-out in URL", href: "https://example.com/optout", linkText: "Click here", expected: true, }, { name: "Remove in text", href: "https://example.com/action", linkText: "Remove me from list", expected: true, }, { name: "Normal link", href: "https://example.com/article", linkText: "Read more", expected: false, }, // Multilingual keyword detection - URL path { name: "German abmelden in URL", href: "https://example.com/abmelden?id=42", linkText: "Click here", expected: true, }, { name: "French se-desabonner slug in URL (no accent/space - not detected by keyword)", href: "https://example.com/se-desabonner?id=42", linkText: "Click here", expected: false, }, // Multilingual keyword detection - link text { name: "German Abmelden in link text", href: "https://example.com/manage?id=42&lang=de", linkText: "Abmelden", expected: true, }, { name: "French Se désabonner in link text", href: "https://example.com/manage?id=42&lang=fr", linkText: "Se désabonner", expected: true, }, { name: "Russian Отписаться in link text", href: "https://example.com/manage?id=42&lang=ru", linkText: "Отписаться", expected: true, }, { name: "Chinese 退订 in link text", href: "https://example.com/manage?id=42&lang=zh", linkText: "退订", expected: true, }, { name: "Japanese 登録を取り消す in link text", href: "https://example.com/manage?id=42&lang=ja", linkText: "登録を取り消す", expected: true, }, { name: "Korean 구독 해지 in link text", href: "https://example.com/manage?id=42&lang=ko", linkText: "구독 해지", expected: true, }, { name: "Dutch Uitschrijven in link text", href: "https://example.com/manage?id=42&lang=nl", linkText: "Uitschrijven", expected: true, }, { name: "Polish Odsubskrybuj in link text", href: "https://example.com/manage?id=42&lang=pl", linkText: "Odsubskrybuj", expected: true, }, { name: "Turkish Üyeliği sonlandır in link text", href: "https://example.com/manage?id=42&lang=tr", linkText: "Üyeliği sonlandır", expected: true, }, } analyzer := NewContentAnalyzer(5 * time.Second) for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { // Create a simple text node for testing html := "" + tt.linkText + "" doc, _ := parseHTML(html) linkNode := findFirstLink(doc) if linkNode == nil { t.Fatal("Failed to parse test HTML") } result := analyzer.isUnsubscribeLink(tt.href, linkNode) if result != tt.expected { t.Errorf("isUnsubscribeLink(%q, %q) = %v, want %v", tt.href, tt.linkText, result, tt.expected) } }) } } func TestIsSuspiciousURL(t *testing.T) { tests := []struct { name string url string expected bool }{ { name: "Normal HTTPS URL", url: "https://example.com/page", expected: false, }, { name: "URL with IP address", url: "https://192.168.1.1/page", expected: true, }, { name: "URL with IPv6", url: "https://[2001:db8::1]/page", expected: true, }, { name: "URL shortener - bit.ly", url: "https://bit.ly/abc123", expected: true, }, { name: "URL shortener - tinyurl", url: "https://tinyurl.com/abc123", expected: true, }, { name: "Excessive subdomains", url: "https://a.b.c.d.e.example.com/page", expected: true, }, { name: "URL with @ (phishing)", url: "https://user@example.com/page", expected: true, }, { name: "Normal subdomain", url: "https://mail.example.com/page", expected: false, }, { name: "Mailto with @ symbol", url: "mailto:support@example.com", expected: false, }, { name: "Mailto with multiple @ symbols", url: "mailto:user@subdomain@example.com", expected: false, }, } analyzer := NewContentAnalyzer(5 * time.Second) for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { parsedURL, err := parseURL(tt.url) if err != nil { t.Fatalf("Failed to parse URL: %v", err) } result := analyzer.isSuspiciousURL(tt.url, parsedURL) if result != tt.expected { t.Errorf("isSuspiciousURL(%q) = %v, want %v", tt.url, result, tt.expected) } }) } } func TestIsIPAddress(t *testing.T) { tests := []struct { name string host string expected bool }{ { name: "IPv4 address", host: "192.168.1.1", expected: true, }, { name: "IPv4 with port", host: "192.168.1.1:8080", expected: true, }, { name: "IPv6 address", host: "2001:db8::1", expected: true, }, { name: "Domain name", host: "example.com", expected: false, }, { name: "Subdomain", host: "mail.example.com", expected: false, }, { name: "Localhost", host: "localhost", expected: false, }, } analyzer := NewContentAnalyzer(5 * time.Second) for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { result := analyzer.isIPAddress(tt.host) if result != tt.expected { t.Errorf("isIPAddress(%q) = %v, want %v", tt.host, result, tt.expected) } }) } } func TestCalculateTextPlainConsistency(t *testing.T) { tests := []struct { name string plainText string htmlText string expectedMinRatio float32 expectedMaxRatio float32 }{ { name: "Identical content", plainText: "Hello World Test", htmlText: "

Hello World Test

", expectedMinRatio: 0.8, expectedMaxRatio: 1.0, }, { name: "Similar content", plainText: "Hello World", htmlText: "

Hello World Extra

", expectedMinRatio: 0.3, expectedMaxRatio: 0.8, }, { name: "Different content", plainText: "Completely different", htmlText: "

Nothing alike here

", expectedMinRatio: 0.0, expectedMaxRatio: 0.3, }, { name: "Empty plain text", plainText: "", htmlText: "

Some text

", expectedMinRatio: 0.0, expectedMaxRatio: 0.0, }, } analyzer := NewContentAnalyzer(5 * time.Second) for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { ratio := analyzer.calculateTextPlainConsistency(tt.plainText, tt.htmlText) if ratio < tt.expectedMinRatio || ratio > tt.expectedMaxRatio { t.Errorf("calculateTextPlainConsistency() = %v, want between %v and %v", ratio, tt.expectedMinRatio, tt.expectedMaxRatio) } }) } } func TestNormalizeText(t *testing.T) { tests := []struct { name string input string expected string }{ { name: "Uppercase to lowercase", input: "Hello WORLD", expected: "hello world", }, { name: "Multiple spaces", input: "Hello World", expected: "hello world", }, { name: "Tabs and newlines", input: "Hello\t\nWorld", expected: "hello world", }, { name: "Leading and trailing spaces", input: " Hello World ", expected: "hello world", }, } analyzer := NewContentAnalyzer(5 * time.Second) for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { result := analyzer.normalizeText(tt.input) if result != tt.expected { t.Errorf("normalizeText(%q) = %q, want %q", tt.input, result, tt.expected) } }) } } func TestAnalyzeContent_HTMLParsing(t *testing.T) { tests := []struct { name string email *EmailMessage expectValid bool expectLinks int expectImages int }{ { name: "Valid HTML with links and images", email: &EmailMessage{ Header: make(mail.Header), Parts: []MessagePart{ { ContentType: "text/html", IsHTML: true, Content: `

Hello World

Link

`, }, }, }, expectValid: true, expectLinks: 1, expectImages: 1, }, { name: "Multiple links", email: &EmailMessage{ Header: make(mail.Header), Parts: []MessagePart{ { ContentType: "text/html", IsHTML: true, Content: ` Link 1 Link 2 Link 3 `, }, }, }, expectValid: true, expectLinks: 3, expectImages: 0, }, { name: "Plain text only", email: &EmailMessage{ Header: make(mail.Header), Parts: []MessagePart{ { ContentType: "text/plain", IsText: true, Content: "Plain text email", }, }, }, expectValid: false, expectLinks: 0, expectImages: 0, }, } analyzer := NewContentAnalyzer(5 * time.Second) for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { results := analyzer.AnalyzeContent(tt.email) if results == nil { t.Fatal("Expected results, got nil") } if results.HTMLValid != tt.expectValid { t.Errorf("HTMLValid = %v, want %v", results.HTMLValid, tt.expectValid) } if len(results.Links) != tt.expectLinks { t.Errorf("Got %d links, want %d", len(results.Links), tt.expectLinks) } if len(results.Images) != tt.expectImages { t.Errorf("Got %d images, want %d", len(results.Images), tt.expectImages) } }) } } func TestAnalyzeContent_UnsubscribeDetection(t *testing.T) { tests := []struct { name string html string expectUnsubscribe bool expectCount int }{ { name: "With unsubscribe link", html: `

Email content

Unsubscribe `, expectUnsubscribe: true, expectCount: 1, }, { name: "Multiple unsubscribe links", html: ` Unsubscribe Opt out `, expectUnsubscribe: true, expectCount: 2, }, { name: "No unsubscribe link", html: `

Email content

Read more `, expectUnsubscribe: false, expectCount: 0, }, } analyzer := NewContentAnalyzer(5 * time.Second) for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { email := &EmailMessage{ Header: make(mail.Header), Parts: []MessagePart{ { ContentType: "text/html", IsHTML: true, Content: tt.html, }, }, } results := analyzer.AnalyzeContent(email) if results.HasUnsubscribe != tt.expectUnsubscribe { t.Errorf("HasUnsubscribe = %v, want %v", results.HasUnsubscribe, tt.expectUnsubscribe) } if len(results.UnsubscribeLinks) != tt.expectCount { t.Errorf("Got %d unsubscribe links, want %d", len(results.UnsubscribeLinks), tt.expectCount) } }) } } func TestAnalyzeContent_ImageAltAttributes(t *testing.T) { tests := []struct { name string html string expectImages int expectWithAlt int }{ { name: "Images with alt", html: ` Description 1

`, expectImages: 2, expectWithAlt: 2, }, { name: "Images without alt", html: `

`, expectImages: 2, expectWithAlt: 0, }, { name: "Mixed images", html: ` Description

`, expectImages: 2, expectWithAlt: 1, }, } analyzer := NewContentAnalyzer(5 * time.Second) for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { email := &EmailMessage{ Header: make(mail.Header), Parts: []MessagePart{ { ContentType: "text/html", IsHTML: true, Content: tt.html, }, }, } results := analyzer.AnalyzeContent(email) if len(results.Images) != tt.expectImages { t.Errorf("Got %d images, want %d", len(results.Images), tt.expectImages) } withAlt := 0 for _, img := range results.Images { if img.HasAlt { withAlt++ } } if withAlt != tt.expectWithAlt { t.Errorf("Got %d images with alt, want %d", withAlt, tt.expectWithAlt) } }) } } // Helper functions for testing func parseHTML(htmlStr string) (*html.Node, error) { return html.Parse(strings.NewReader(htmlStr)) } func findFirstLink(n *html.Node) *html.Node { if n.Type == html.ElementNode && n.Data == "a" { return n } for c := n.FirstChild; c != nil; c = c.NextSibling { if result := findFirstLink(c); result != nil { return result } } return nil } func parseURL(urlStr string) (*url.URL, error) { return url.Parse(urlStr) } func TestHasDomainMisalignment(t *testing.T) { tests := []struct { name string href string linkText string expected bool reason string }{ // Phishing cases - should return true { name: "Obvious phishing - different domains", href: "https://evil.com/page", linkText: "Click here to verify your paypal.com account", expected: true, reason: "Link text shows 'paypal.com' but URL points to 'evil.com'", }, { name: "Domain in link text differs from URL", href: "http://attacker.net", linkText: "Visit google.com for more info", expected: true, reason: "Link text shows 'google.com' but URL points to 'attacker.net'", }, { name: "URL shown in text differs from actual URL", href: "https://phishing-site.xyz/login", linkText: "https://www.bank.example.com/secure", expected: true, reason: "Full URL in text doesn't match actual destination", }, { name: "Similar but different domain", href: "https://paypa1.com/login", linkText: "Login to your paypal.com account", expected: true, reason: "Typosquatting: 'paypa1.com' vs 'paypal.com'", }, { name: "Subdomain spoofing", href: "https://paypal.com.evil.com/login", linkText: "Verify your paypal.com account", expected: true, reason: "Domain is 'evil.com', not 'paypal.com'", }, { name: "Multiple domains in text, none match", href: "https://badsite.com", linkText: "Transfer from bank.com to paypal.com", expected: true, reason: "Neither 'bank.com' nor 'paypal.com' matches 'badsite.com'", }, // Legitimate cases - should return false { name: "Exact domain match", href: "https://example.com/page", linkText: "Visit example.com for more information", expected: false, reason: "Domains match exactly", }, { name: "Legitimate subdomain", href: "https://mail.google.com/inbox", linkText: "Check your google.com email", expected: false, reason: "Subdomain of the mentioned domain", }, { name: "www prefix variation", href: "https://www.example.com/page", linkText: "Visit example.com", expected: false, reason: "www prefix is acceptable variation", }, { name: "Generic link text - click here", href: "https://anywhere.com", linkText: "click here", expected: false, reason: "Generic text doesn't contain a domain", }, { name: "Generic link text - read more", href: "https://example.com/article", linkText: "Read more", expected: false, reason: "Generic text doesn't contain a domain", }, { name: "Generic link text - learn more", href: "https://example.com/info", linkText: "Learn More", expected: false, reason: "Generic text doesn't contain a domain (case insensitive)", }, { name: "No domain in link text", href: "https://example.com/page", linkText: "Click to continue", expected: false, reason: "Link text has no domain reference", }, { name: "Short link text", href: "https://example.com", linkText: "Go", expected: false, reason: "Text too short to contain meaningful domain", }, { name: "Empty link text", href: "https://example.com", linkText: "", expected: false, reason: "Empty text cannot contain domain", }, { name: "Mailto link - matching domain", href: "mailto:support@example.com", linkText: "Email support@example.com", expected: false, reason: "Mailto email matches text email", }, { name: "Mailto link - domain mismatch (phishing)", href: "mailto:attacker@evil.com", linkText: "Contact support@paypal.com for help", expected: true, reason: "Mailto domain 'evil.com' doesn't match text domain 'paypal.com'", }, { name: "Mailto link - generic text", href: "mailto:info@example.com", linkText: "Contact us", expected: false, reason: "Generic text without domain reference", }, { name: "Mailto link - same domain different user", href: "mailto:sales@example.com", linkText: "Email support@example.com", expected: false, reason: "Both emails share the same domain", }, { name: "Mailto link - text shows only domain", href: "mailto:info@example.com", linkText: "Write to example.com", expected: false, reason: "Text domain matches mailto domain", }, { name: "Mailto link - domain in text doesn't match", href: "mailto:scam@phishing.net", linkText: "Reply to customer-service@amazon.com", expected: true, reason: "Mailto domain 'phishing.net' doesn't match 'amazon.com' in text", }, { name: "Tel link", href: "tel:+1234567890", linkText: "Call example.com support", expected: false, reason: "Non-HTTP(S) links are excluded", }, { name: "Same base domain with different subdomains", href: "https://www.example.com/page", linkText: "Visit blog.example.com", expected: false, reason: "Both share same base domain 'example.com'", }, { name: "URL with path matches domain in text", href: "https://example.com/section/page", linkText: "Go to example.com", expected: false, reason: "Domain matches, path doesn't matter", }, { name: "Generic text - subscribe", href: "https://newsletter.example.com/signup", linkText: "Subscribe", expected: false, reason: "Generic call-to-action text", }, { name: "Generic text - unsubscribe", href: "https://example.com/unsubscribe?id=123", linkText: "Unsubscribe", expected: false, reason: "Generic unsubscribe text", }, { name: "Generic text - download", href: "https://files.example.com/document.pdf", linkText: "Download", expected: false, reason: "Generic action text", }, { name: "Descriptive text without domain", href: "https://shop.example.com/products", linkText: "View our latest products", expected: false, reason: "No domain mentioned in text", }, // Edge cases { name: "Domain-like text but not valid domain", href: "https://example.com", linkText: "Save up to 50.00 dollars", expected: false, reason: "50.00 looks like domain but isn't", }, { name: "Text with http prefix matching domain", href: "https://example.com/page", linkText: "Visit http://example.com", expected: false, reason: "Domains match despite different protocols in display", }, { name: "Port in URL should not affect matching", href: "https://example.com:8080/page", linkText: "Go to example.com", expected: false, reason: "Port number doesn't affect domain matching", }, { name: "Whitespace in link text", href: "https://example.com", linkText: " example.com ", expected: false, reason: "Whitespace should be trimmed", }, { name: "Multiple spaces in generic text", href: "https://example.com", linkText: "click here", expected: false, reason: "Generic text with extra spaces", }, { name: "Anchor fragment in URL", href: "https://example.com/page#section", linkText: "example.com section", expected: false, reason: "Fragment doesn't affect domain matching", }, { name: "Query parameters in URL", href: "https://example.com/page?utm_source=email", linkText: "Visit example.com", expected: false, reason: "Query params don't affect domain matching", }, } analyzer := NewContentAnalyzer(5 * time.Second) for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { result := analyzer.hasDomainMisalignment(tt.href, tt.linkText) if result != tt.expected { t.Errorf("hasDomainMisalignment(%q, %q) = %v, want %v\nReason: %s", tt.href, tt.linkText, result, tt.expected, tt.reason) } }) } }