happyDeliver/pkg/analyzer/content_test.go
2026-02-23 00:15:17 +07:00

981 lines
24 KiB
Go
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

// This file is part of the happyDeliver (R) project.
// Copyright (c) 2025 happyDomain
// Authors: Pierre-Olivier Mercier, et al.
//
// This program is offered under a commercial and under the AGPL license.
// For commercial licensing, contact us at <contact@happydomain.org>.
//
// For AGPL licensing:
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU Affero General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Affero General Public License for more details.
//
// You should have received a copy of the GNU Affero General Public License
// along with this program. If not, see <https://www.gnu.org/licenses/>.
package analyzer
import (
"net/mail"
"net/url"
"strings"
"testing"
"time"
"golang.org/x/net/html"
)
// TestNewContentAnalyzer checks the Timeout that NewContentAnalyzer stores for
// a zero and a non-zero argument (the table expects 10s for zero), and that the
// returned analyzer always carries a non-nil httpClient.
func TestNewContentAnalyzer(t *testing.T) {
	type timeoutCase struct {
		name            string
		timeout         time.Duration
		expectedTimeout time.Duration
	}

	cases := []timeoutCase{
		{name: "Default timeout", timeout: 0, expectedTimeout: 10 * time.Second},
		{name: "Custom timeout", timeout: 5 * time.Second, expectedTimeout: 5 * time.Second},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			got := NewContentAnalyzer(tc.timeout)

			if got.Timeout != tc.expectedTimeout {
				t.Errorf("Timeout = %v, want %v", got.Timeout, tc.expectedTimeout)
			}
			if got.httpClient == nil {
				t.Error("httpClient should not be nil")
			}
		})
	}
}
// TestExtractTextFromHTML feeds small HTML fragments to extractTextFromHTML and
// compares the returned text with the expected visible text. The table expects
// <script> and <style> bodies to be absent from the result and an empty input
// to yield an empty string.
func TestExtractTextFromHTML(t *testing.T) {
	cases := []struct {
		name         string
		html         string
		expectedText string
	}{
		{name: "Simple text", html: "<p>Hello World</p>", expectedText: "Hello World"},
		{name: "Multiple elements", html: "<div><h1>Title</h1><p>Paragraph</p></div>", expectedText: "Title Paragraph"},
		{name: "With script tag", html: "<p>Text</p><script>alert('hi')</script><p>More</p>", expectedText: "Text More"},
		{name: "With style tag", html: "<p>Text</p><style>.class { color: red; }</style><p>More</p>", expectedText: "Text More"},
		{name: "Empty HTML", html: "", expectedText: ""},
	}

	ca := NewContentAnalyzer(5 * time.Second)

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			if got := ca.extractTextFromHTML(tc.html); got != tc.expectedText {
				t.Errorf("extractTextFromHTML() = %q, want %q", got, tc.expectedText)
			}
		})
	}
}
// TestIsUnsubscribeLink verifies unsubscribe detection on a single link.
//
// Each case builds an <a> element from href and linkText, parses it, and passes
// both the raw href and the parsed node to isUnsubscribeLink. The table covers
// English keywords in the URL or the text, plus keywords in several other
// languages. One French case documents that an unaccented URL slug is expected
// NOT to match.
func TestIsUnsubscribeLink(t *testing.T) {
	tests := []struct {
		name     string
		href     string
		linkText string
		expected bool
	}{
		{name: "Unsubscribe in URL", href: "https://example.com/unsubscribe?id=123", linkText: "Click here", expected: true},
		{name: "Unsubscribe in text", href: "https://example.com/action?id=123", linkText: "Unsubscribe from this list", expected: true},
		{name: "Opt-out in URL", href: "https://example.com/optout", linkText: "Click here", expected: true},
		{name: "Remove in text", href: "https://example.com/action", linkText: "Remove me from list", expected: true},
		{name: "Normal link", href: "https://example.com/article", linkText: "Read more", expected: false},

		// Multilingual keyword detection - URL path
		{name: "German abmelden in URL", href: "https://example.com/abmelden?id=42", linkText: "Click here", expected: true},
		{name: "French se-desabonner slug in URL (no accent/space - not detected by keyword)", href: "https://example.com/se-desabonner?id=42", linkText: "Click here", expected: false},

		// Multilingual keyword detection - link text
		{name: "German Abmelden in link text", href: "https://example.com/manage?id=42&lang=de", linkText: "Abmelden", expected: true},
		{name: "French Se désabonner in link text", href: "https://example.com/manage?id=42&lang=fr", linkText: "Se désabonner", expected: true},
		{name: "Russian Отписаться in link text", href: "https://example.com/manage?id=42&lang=ru", linkText: "Отписаться", expected: true},
		{name: "Chinese 退订 in link text", href: "https://example.com/manage?id=42&lang=zh", linkText: "退订", expected: true},
		{name: "Japanese 登録を取り消す in link text", href: "https://example.com/manage?id=42&lang=ja", linkText: "登録を取り消す", expected: true},
		{name: "Korean 구독 해지 in link text", href: "https://example.com/manage?id=42&lang=ko", linkText: "구독 해지", expected: true},
		{name: "Dutch Uitschrijven in link text", href: "https://example.com/manage?id=42&lang=nl", linkText: "Uitschrijven", expected: true},
		{name: "Polish Odsubskrybuj in link text", href: "https://example.com/manage?id=42&lang=pl", linkText: "Odsubskrybuj", expected: true},
		{name: "Turkish Üyeliği sonlandır in link text", href: "https://example.com/manage?id=42&lang=tr", linkText: "Üyeliği sonlandır", expected: true},
	}

	analyzer := NewContentAnalyzer(5 * time.Second)

	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			// Build a one-link document. The variable is deliberately not
			// called "html" so the imported html package stays usable here.
			markup := "<a href=\"" + tt.href + "\">" + tt.linkText + "</a>"

			doc, err := parseHTML(markup)
			if err != nil {
				// Fail with the real cause instead of a later nil dereference.
				t.Fatalf("parseHTML(%q) failed: %v", markup, err)
			}

			linkNode := findFirstLink(doc)
			if linkNode == nil {
				t.Fatal("Failed to parse test HTML")
			}

			result := analyzer.isUnsubscribeLink(tt.href, linkNode)
			if result != tt.expected {
				t.Errorf("isUnsubscribeLink(%q, %q) = %v, want %v", tt.href, tt.linkText, result, tt.expected)
			}
		})
	}
}
// TestIsSuspiciousURL parses each URL and passes both the raw string and the
// parsed form to isSuspiciousURL. The table expects IP-literal hosts, the
// bit.ly and tinyurl.com shorteners, deep subdomain chains and userinfo
// ("user@host") URLs to be flagged, and ordinary hosts and mailto: addresses
// to pass.
func TestIsSuspiciousURL(t *testing.T) {
	cases := []struct {
		name     string
		url      string
		expected bool
	}{
		{name: "Normal HTTPS URL", url: "https://example.com/page", expected: false},
		{name: "URL with IP address", url: "https://192.168.1.1/page", expected: true},
		{name: "URL with IPv6", url: "https://[2001:db8::1]/page", expected: true},
		{name: "URL shortener - bit.ly", url: "https://bit.ly/abc123", expected: true},
		{name: "URL shortener - tinyurl", url: "https://tinyurl.com/abc123", expected: true},
		{name: "Excessive subdomains", url: "https://a.b.c.d.e.example.com/page", expected: true},
		{name: "URL with @ (phishing)", url: "https://user@example.com/page", expected: true},
		{name: "Normal subdomain", url: "https://mail.example.com/page", expected: false},
		{name: "Mailto with @ symbol", url: "mailto:support@example.com", expected: false},
		{name: "Mailto with multiple @ symbols", url: "mailto:user@subdomain@example.com", expected: false},
	}

	ca := NewContentAnalyzer(5 * time.Second)

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			parsed, err := parseURL(tc.url)
			if err != nil {
				t.Fatalf("Failed to parse URL: %v", err)
			}

			if got := ca.isSuspiciousURL(tc.url, parsed); got != tc.expected {
				t.Errorf("isSuspiciousURL(%q) = %v, want %v", tc.url, got, tc.expected)
			}
		})
	}
}
// TestIsIPAddress checks isIPAddress against IPv4 (with and without a port)
// and IPv6 literals, which the table expects to be accepted, and against
// host names including "localhost", which it expects to be rejected.
func TestIsIPAddress(t *testing.T) {
	cases := []struct {
		name     string
		host     string
		expected bool
	}{
		{name: "IPv4 address", host: "192.168.1.1", expected: true},
		{name: "IPv4 with port", host: "192.168.1.1:8080", expected: true},
		{name: "IPv6 address", host: "2001:db8::1", expected: true},
		{name: "Domain name", host: "example.com", expected: false},
		{name: "Subdomain", host: "mail.example.com", expected: false},
		{name: "Localhost", host: "localhost", expected: false},
	}

	ca := NewContentAnalyzer(5 * time.Second)

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			if got := ca.isIPAddress(tc.host); got != tc.expected {
				t.Errorf("isIPAddress(%q) = %v, want %v", tc.host, got, tc.expected)
			}
		})
	}
}
// TestCalculateTextPlainConsistency checks that the ratio returned by
// calculateTextPlainConsistency for a plain-text/HTML pair falls inside an
// inclusive [min, max] band. Bands are used instead of exact values; the only
// exact expectation is 0 for an empty plain-text part.
func TestCalculateTextPlainConsistency(t *testing.T) {
	cases := []struct {
		name             string
		plainText        string
		htmlText         string
		expectedMinRatio float32
		expectedMaxRatio float32
	}{
		{name: "Identical content", plainText: "Hello World Test", htmlText: "<p>Hello World Test</p>", expectedMinRatio: 0.8, expectedMaxRatio: 1.0},
		{name: "Similar content", plainText: "Hello World", htmlText: "<p>Hello World Extra</p>", expectedMinRatio: 0.3, expectedMaxRatio: 0.8},
		{name: "Different content", plainText: "Completely different", htmlText: "<p>Nothing alike here</p>", expectedMinRatio: 0.0, expectedMaxRatio: 0.3},
		{name: "Empty plain text", plainText: "", htmlText: "<p>Some text</p>", expectedMinRatio: 0.0, expectedMaxRatio: 0.0},
	}

	ca := NewContentAnalyzer(5 * time.Second)

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			got := ca.calculateTextPlainConsistency(tc.plainText, tc.htmlText)

			belowBand := got < tc.expectedMinRatio
			aboveBand := got > tc.expectedMaxRatio
			if belowBand || aboveBand {
				t.Errorf("calculateTextPlainConsistency() = %v, want between %v and %v",
					got, tc.expectedMinRatio, tc.expectedMaxRatio)
			}
		})
	}
}
// TestNormalizeText verifies that normalizeText lower-cases its input,
// collapses every run of whitespace (spaces, tabs, newlines) into a single
// space, and trims leading and trailing whitespace.
//
// The "Multiple spaces" and "Leading and trailing spaces" inputs contain real
// runs of several spaces. With only single spaces those cases would pass even
// if run-collapsing were broken.
func TestNormalizeText(t *testing.T) {
	tests := []struct {
		name     string
		input    string
		expected string
	}{
		{
			name:     "Uppercase to lowercase",
			input:    "Hello WORLD",
			expected: "hello world",
		},
		{
			name:     "Multiple spaces",
			input:    "Hello    World",
			expected: "hello world",
		},
		{
			name:     "Tabs and newlines",
			input:    "Hello\t\nWorld",
			expected: "hello world",
		},
		{
			name:     "Leading and trailing spaces",
			input:    "  Hello World  ",
			expected: "hello world",
		},
	}

	analyzer := NewContentAnalyzer(5 * time.Second)

	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			result := analyzer.normalizeText(tt.input)
			if result != tt.expected {
				t.Errorf("normalizeText(%q) = %q, want %q", tt.input, result, tt.expected)
			}
		})
	}
}
// TestAnalyzeContent_HTMLParsing runs AnalyzeContent on single-part messages
// and checks the HTMLValid flag and the number of links and images reported.
// A plain-text-only message is expected to report HTMLValid == false and no
// links or images.
func TestAnalyzeContent_HTMLParsing(t *testing.T) {
	// The HTML bodies are raw strings whose lines start at column 0, so the
	// content handed to the analyzer carries no leading indentation.
	const linkAndImageHTML = `
<html>
<body>
<p>Hello World</p>
<a href="https://example.com">Link</a>
<img src="https://example.com/image.jpg" alt="Test">
</body>
</html>
`
	const threeLinksHTML = `
<html>
<body>
<a href="https://example.com">Link 1</a>
<a href="https://example.org">Link 2</a>
<a href="https://example.net">Link 3</a>
</body>
</html>
`

	cases := []struct {
		name         string
		email        *EmailMessage
		expectValid  bool
		expectLinks  int
		expectImages int
	}{
		{
			name: "Valid HTML with links and images",
			email: &EmailMessage{
				Header: make(mail.Header),
				Parts: []MessagePart{
					{ContentType: "text/html", IsHTML: true, Content: linkAndImageHTML},
				},
			},
			expectValid:  true,
			expectLinks:  1,
			expectImages: 1,
		},
		{
			name: "Multiple links",
			email: &EmailMessage{
				Header: make(mail.Header),
				Parts: []MessagePart{
					{ContentType: "text/html", IsHTML: true, Content: threeLinksHTML},
				},
			},
			expectValid:  true,
			expectLinks:  3,
			expectImages: 0,
		},
		{
			name: "Plain text only",
			email: &EmailMessage{
				Header: make(mail.Header),
				Parts: []MessagePart{
					{ContentType: "text/plain", IsText: true, Content: "Plain text email"},
				},
			},
			expectValid:  false,
			expectLinks:  0,
			expectImages: 0,
		},
	}

	ca := NewContentAnalyzer(5 * time.Second)

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			got := ca.AnalyzeContent(tc.email)
			if got == nil {
				t.Fatal("Expected results, got nil")
			}

			if got.HTMLValid != tc.expectValid {
				t.Errorf("HTMLValid = %v, want %v", got.HTMLValid, tc.expectValid)
			}
			if n := len(got.Links); n != tc.expectLinks {
				t.Errorf("Got %d links, want %d", n, tc.expectLinks)
			}
			if n := len(got.Images); n != tc.expectImages {
				t.Errorf("Got %d images, want %d", n, tc.expectImages)
			}
		})
	}
}
// TestAnalyzeContent_UnsubscribeDetection wraps each HTML body in a single
// text/html part, runs AnalyzeContent, and checks both the HasUnsubscribe flag
// and the number of entries in UnsubscribeLinks.
func TestAnalyzeContent_UnsubscribeDetection(t *testing.T) {
	tests := []struct {
		name              string
		html              string
		expectUnsubscribe bool
		expectCount       int
	}{
		{
			name: "With unsubscribe link",
			html: `<html><body>
<p>Email content</p>
<a href="https://example.com/unsubscribe">Unsubscribe</a>
</body></html>`,
			expectUnsubscribe: true,
			expectCount:       1,
		},
		{
			name: "Multiple unsubscribe links",
			html: `<html><body>
<a href="https://example.com/unsubscribe">Unsubscribe</a>
<a href="https://example.com/optout">Opt out</a>
</body></html>`,
			expectUnsubscribe: true,
			expectCount:       2,
		},
		{
			name: "No unsubscribe link",
			html: `<html><body>
<p>Email content</p>
<a href="https://example.com/article">Read more</a>
</body></html>`,
			expectUnsubscribe: false,
			expectCount:       0,
		},
	}

	analyzer := NewContentAnalyzer(5 * time.Second)

	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			email := &EmailMessage{
				Header: make(mail.Header),
				Parts: []MessagePart{
					{
						ContentType: "text/html",
						IsHTML:      true,
						Content:     tt.html,
					},
				},
			}

			results := analyzer.AnalyzeContent(email)
			// Same guard as TestAnalyzeContent_HTMLParsing: report a clean
			// failure rather than panicking on a nil dereference below.
			if results == nil {
				t.Fatal("Expected results, got nil")
			}

			if results.HasUnsubscribe != tt.expectUnsubscribe {
				t.Errorf("HasUnsubscribe = %v, want %v", results.HasUnsubscribe, tt.expectUnsubscribe)
			}
			if len(results.UnsubscribeLinks) != tt.expectCount {
				t.Errorf("Got %d unsubscribe links, want %d", len(results.UnsubscribeLinks), tt.expectCount)
			}
		})
	}
}
// TestAnalyzeContent_ImageAltAttributes wraps each HTML body in a single
// text/html part, runs AnalyzeContent, and checks how many images were found
// and how many of them are reported with HasAlt set.
func TestAnalyzeContent_ImageAltAttributes(t *testing.T) {
	tests := []struct {
		name          string
		html          string
		expectImages  int
		expectWithAlt int
	}{
		{
			name: "Images with alt",
			html: `<html><body>
<img src="image1.jpg" alt="Description 1">
<img src="image2.jpg" alt="Description 2">
</body></html>`,
			expectImages:  2,
			expectWithAlt: 2,
		},
		{
			name: "Images without alt",
			html: `<html><body>
<img src="image1.jpg">
<img src="image2.jpg">
</body></html>`,
			expectImages:  2,
			expectWithAlt: 0,
		},
		{
			name: "Mixed images",
			html: `<html><body>
<img src="image1.jpg" alt="Description">
<img src="image2.jpg">
</body></html>`,
			expectImages:  2,
			expectWithAlt: 1,
		},
	}

	analyzer := NewContentAnalyzer(5 * time.Second)

	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			email := &EmailMessage{
				Header: make(mail.Header),
				Parts: []MessagePart{
					{
						ContentType: "text/html",
						IsHTML:      true,
						Content:     tt.html,
					},
				},
			}

			results := analyzer.AnalyzeContent(email)
			// Same guard as TestAnalyzeContent_HTMLParsing: report a clean
			// failure rather than panicking on a nil dereference below.
			if results == nil {
				t.Fatal("Expected results, got nil")
			}

			if len(results.Images) != tt.expectImages {
				t.Errorf("Got %d images, want %d", len(results.Images), tt.expectImages)
			}

			// Count the images the analyzer flagged as having an alt attribute.
			withAlt := 0
			for _, img := range results.Images {
				if img.HasAlt {
					withAlt++
				}
			}
			if withAlt != tt.expectWithAlt {
				t.Errorf("Got %d images with alt, want %d", withAlt, tt.expectWithAlt)
			}
		})
	}
}
// Helper functions for testing
// parseHTML parses htmlStr with html.Parse and returns the resulting root node
// together with any parse error.
func parseHTML(htmlStr string) (*html.Node, error) {
	src := strings.NewReader(htmlStr)
	root, err := html.Parse(src)
	return root, err
}
// findFirstLink returns the first <a> element encountered in a depth-first,
// document-order walk of the tree rooted at n (n itself included), or nil if
// the tree contains no such element. A nil n yields nil instead of panicking,
// so a caller that hands over the result of a failed parse gets a clean
// "not found" rather than a crash.
func findFirstLink(n *html.Node) *html.Node {
	if n == nil {
		return nil
	}
	if n.Type == html.ElementNode && n.Data == "a" {
		return n
	}
	for child := n.FirstChild; child != nil; child = child.NextSibling {
		if found := findFirstLink(child); found != nil {
			return found
		}
	}
	return nil
}
// parseURL parses urlStr with url.Parse and returns the parsed URL together
// with any parse error.
func parseURL(urlStr string) (*url.URL, error) {
	parsed, err := url.Parse(urlStr)
	return parsed, err
}
// TestHasDomainMisalignment verifies hasDomainMisalignment, which is given a
// link's href and its visible text and reports whether they disagree.
//
// The table is split into phishing-style cases (text names one domain, the
// href points at another; expected true), legitimate cases (matching domain,
// subdomain, or text with no domain at all; expected false) and edge cases.
// The reason field is only used in the failure message to explain the
// expectation.
//
// The whitespace cases contain real runs of several spaces. With single spaces
// they would not exercise the trimming/collapsing their names describe.
func TestHasDomainMisalignment(t *testing.T) {
	tests := []struct {
		name     string
		href     string
		linkText string
		expected bool
		reason   string
	}{
		// Phishing cases - should return true
		{
			name:     "Obvious phishing - different domains",
			href:     "https://evil.com/page",
			linkText: "Click here to verify your paypal.com account",
			expected: true,
			reason:   "Link text shows 'paypal.com' but URL points to 'evil.com'",
		},
		{
			name:     "Domain in link text differs from URL",
			href:     "http://attacker.net",
			linkText: "Visit google.com for more info",
			expected: true,
			reason:   "Link text shows 'google.com' but URL points to 'attacker.net'",
		},
		{
			name:     "URL shown in text differs from actual URL",
			href:     "https://phishing-site.xyz/login",
			linkText: "https://www.bank.example.com/secure",
			expected: true,
			reason:   "Full URL in text doesn't match actual destination",
		},
		{
			name:     "Similar but different domain",
			href:     "https://paypa1.com/login",
			linkText: "Login to your paypal.com account",
			expected: true,
			reason:   "Typosquatting: 'paypa1.com' vs 'paypal.com'",
		},
		{
			name:     "Subdomain spoofing",
			href:     "https://paypal.com.evil.com/login",
			linkText: "Verify your paypal.com account",
			expected: true,
			reason:   "Domain is 'evil.com', not 'paypal.com'",
		},
		{
			name:     "Multiple domains in text, none match",
			href:     "https://badsite.com",
			linkText: "Transfer from bank.com to paypal.com",
			expected: true,
			reason:   "Neither 'bank.com' nor 'paypal.com' matches 'badsite.com'",
		},

		// Legitimate cases - should return false
		{
			name:     "Exact domain match",
			href:     "https://example.com/page",
			linkText: "Visit example.com for more information",
			expected: false,
			reason:   "Domains match exactly",
		},
		{
			name:     "Legitimate subdomain",
			href:     "https://mail.google.com/inbox",
			linkText: "Check your google.com email",
			expected: false,
			reason:   "Subdomain of the mentioned domain",
		},
		{
			name:     "www prefix variation",
			href:     "https://www.example.com/page",
			linkText: "Visit example.com",
			expected: false,
			reason:   "www prefix is acceptable variation",
		},
		{
			name:     "Generic link text - click here",
			href:     "https://anywhere.com",
			linkText: "click here",
			expected: false,
			reason:   "Generic text doesn't contain a domain",
		},
		{
			name:     "Generic link text - read more",
			href:     "https://example.com/article",
			linkText: "Read more",
			expected: false,
			reason:   "Generic text doesn't contain a domain",
		},
		{
			name:     "Generic link text - learn more",
			href:     "https://example.com/info",
			linkText: "Learn More",
			expected: false,
			reason:   "Generic text doesn't contain a domain (case insensitive)",
		},
		{
			name:     "No domain in link text",
			href:     "https://example.com/page",
			linkText: "Click to continue",
			expected: false,
			reason:   "Link text has no domain reference",
		},
		{
			name:     "Short link text",
			href:     "https://example.com",
			linkText: "Go",
			expected: false,
			reason:   "Text too short to contain meaningful domain",
		},
		{
			name:     "Empty link text",
			href:     "https://example.com",
			linkText: "",
			expected: false,
			reason:   "Empty text cannot contain domain",
		},
		{
			name:     "Mailto link - matching domain",
			href:     "mailto:support@example.com",
			linkText: "Email support@example.com",
			expected: false,
			reason:   "Mailto email matches text email",
		},
		{
			name:     "Mailto link - domain mismatch (phishing)",
			href:     "mailto:attacker@evil.com",
			linkText: "Contact support@paypal.com for help",
			expected: true,
			reason:   "Mailto domain 'evil.com' doesn't match text domain 'paypal.com'",
		},
		{
			name:     "Mailto link - generic text",
			href:     "mailto:info@example.com",
			linkText: "Contact us",
			expected: false,
			reason:   "Generic text without domain reference",
		},
		{
			name:     "Mailto link - same domain different user",
			href:     "mailto:sales@example.com",
			linkText: "Email support@example.com",
			expected: false,
			reason:   "Both emails share the same domain",
		},
		{
			name:     "Mailto link - text shows only domain",
			href:     "mailto:info@example.com",
			linkText: "Write to example.com",
			expected: false,
			reason:   "Text domain matches mailto domain",
		},
		{
			name:     "Mailto link - domain in text doesn't match",
			href:     "mailto:scam@phishing.net",
			linkText: "Reply to customer-service@amazon.com",
			expected: true,
			reason:   "Mailto domain 'phishing.net' doesn't match 'amazon.com' in text",
		},
		{
			name:     "Tel link",
			href:     "tel:+1234567890",
			linkText: "Call example.com support",
			expected: false,
			reason:   "Non-HTTP(S) links are excluded",
		},
		{
			name:     "Same base domain with different subdomains",
			href:     "https://www.example.com/page",
			linkText: "Visit blog.example.com",
			expected: false,
			reason:   "Both share same base domain 'example.com'",
		},
		{
			name:     "URL with path matches domain in text",
			href:     "https://example.com/section/page",
			linkText: "Go to example.com",
			expected: false,
			reason:   "Domain matches, path doesn't matter",
		},
		{
			name:     "Generic text - subscribe",
			href:     "https://newsletter.example.com/signup",
			linkText: "Subscribe",
			expected: false,
			reason:   "Generic call-to-action text",
		},
		{
			name:     "Generic text - unsubscribe",
			href:     "https://example.com/unsubscribe?id=123",
			linkText: "Unsubscribe",
			expected: false,
			reason:   "Generic unsubscribe text",
		},
		{
			name:     "Generic text - download",
			href:     "https://files.example.com/document.pdf",
			linkText: "Download",
			expected: false,
			reason:   "Generic action text",
		},
		{
			name:     "Descriptive text without domain",
			href:     "https://shop.example.com/products",
			linkText: "View our latest products",
			expected: false,
			reason:   "No domain mentioned in text",
		},

		// Edge cases
		{
			name:     "Domain-like text but not valid domain",
			href:     "https://example.com",
			linkText: "Save up to 50.00 dollars",
			expected: false,
			reason:   "50.00 looks like domain but isn't",
		},
		{
			name:     "Text with http prefix matching domain",
			href:     "https://example.com/page",
			linkText: "Visit http://example.com",
			expected: false,
			reason:   "Domains match despite different protocols in display",
		},
		{
			name:     "Port in URL should not affect matching",
			href:     "https://example.com:8080/page",
			linkText: "Go to example.com",
			expected: false,
			reason:   "Port number doesn't affect domain matching",
		},
		{
			name:     "Whitespace in link text",
			href:     "https://example.com",
			linkText: "  example.com  ",
			expected: false,
			reason:   "Whitespace should be trimmed",
		},
		{
			name:     "Multiple spaces in generic text",
			href:     "https://example.com",
			linkText: "click  here",
			expected: false,
			reason:   "Generic text with extra spaces",
		},
		{
			name:     "Anchor fragment in URL",
			href:     "https://example.com/page#section",
			linkText: "example.com section",
			expected: false,
			reason:   "Fragment doesn't affect domain matching",
		},
		{
			name:     "Query parameters in URL",
			href:     "https://example.com/page?utm_source=email",
			linkText: "Visit example.com",
			expected: false,
			reason:   "Query params don't affect domain matching",
		},
	}

	analyzer := NewContentAnalyzer(5 * time.Second)

	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			result := analyzer.hasDomainMisalignment(tt.href, tt.linkText)
			if result != tt.expected {
				t.Errorf("hasDomainMisalignment(%q, %q) = %v, want %v\nReason: %s",
					tt.href, tt.linkText, result, tt.expected, tt.reason)
			}
		})
	}
}