All checks were successful
continuous-integration/drone/push Build is passing
Detect the List-Unsubscribe-Post: List-Unsubscribe=One-Click header (RFC 8058) and expose it as the 'one-click' unsubscribe method in the content analysis. When unsubscribe methods are present but one-click is absent, the summary card now shows a warning nudging senders to adopt it.
986 lines
28 KiB
Go
986 lines
28 KiB
Go
// This file is part of the happyDeliver (R) project.
|
||
// Copyright (c) 2025 happyDomain
|
||
// Authors: Pierre-Olivier Mercier, et al.
|
||
//
|
||
// This program is offered under a commercial and under the AGPL license.
|
||
// For commercial licensing, contact us at <contact@happydomain.org>.
|
||
//
|
||
// For AGPL licensing:
|
||
// This program is free software: you can redistribute it and/or modify
|
||
// it under the terms of the GNU Affero General Public License as published by
|
||
// the Free Software Foundation, either version 3 of the License, or
|
||
// (at your option) any later version.
|
||
//
|
||
// This program is distributed in the hope that it will be useful,
|
||
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||
// GNU Affero General Public License for more details.
|
||
//
|
||
// You should have received a copy of the GNU Affero General Public License
|
||
// along with this program. If not, see <https://www.gnu.org/licenses/>.
|
||
|
||
package analyzer
|
||
|
||
import (
|
||
"context"
|
||
"fmt"
|
||
"net/http"
|
||
"net/url"
|
||
"regexp"
|
||
"slices"
|
||
"strings"
|
||
"time"
|
||
"unicode"
|
||
|
||
"git.happydns.org/happyDeliver/internal/api"
|
||
"golang.org/x/net/html"
|
||
)
|
||
|
||
// ContentAnalyzer analyzes email content (HTML, links, images)
type ContentAnalyzer struct {
	// Timeout bounds every outbound HTTP check (client timeout and the
	// per-request context in validateLink).
	Timeout time.Duration
	// httpClient is configured once in NewContentAnalyzer and reused for
	// all link validations (redirects capped at 10).
	httpClient *http.Client
	// Per-message state populated by AnalyzeContent before traversal.
	listUnsubscribeURLs    []string // URLs from List-Unsubscribe header
	hasOneClickUnsubscribe bool     // True if List-Unsubscribe-Post: List-Unsubscribe=One-Click
}
|
||
|
||
// NewContentAnalyzer creates a new content analyzer with configurable timeout
|
||
func NewContentAnalyzer(timeout time.Duration) *ContentAnalyzer {
|
||
if timeout == 0 {
|
||
timeout = 10 * time.Second // Default timeout
|
||
}
|
||
return &ContentAnalyzer{
|
||
Timeout: timeout,
|
||
httpClient: &http.Client{
|
||
Timeout: timeout,
|
||
CheckRedirect: func(req *http.Request, via []*http.Request) error {
|
||
// Allow up to 10 redirects
|
||
if len(via) >= 10 {
|
||
return fmt.Errorf("too many redirects")
|
||
}
|
||
return nil
|
||
},
|
||
},
|
||
}
|
||
}
|
||
|
||
// ContentResults represents content analysis results
type ContentResults struct {
	IsMultipart      bool         // Message contains more than one MIME part
	HTMLValid        bool         // HTML parsed without error
	HTMLErrors       []string     // Parse error messages when HTMLValid is false
	Links            []LinkCheck  // Links collected from HTML and plain text
	Images           []ImageCheck // <img> tags collected from HTML
	HasUnsubscribe   bool         // An unsubscribe link was detected in the body
	UnsubscribeLinks []string     // hrefs of the detected unsubscribe links
	TextContent      string       // Concatenation of all plain text parts
	HTMLContent      string       // HTML body (last analyzed part wins)
	TextPlainRatio   float32      // Ratio of plain text to HTML consistency
	ImageTextRatio   float32      // Ratio of images to text
	SuspiciousURLs   []string     // URLs flagged unsafe (shorteners, IPs, misaligned domains)
	ContentIssues    []string     // Low-severity findings (e.g. external stylesheets)
	// HarmfullIssues lists dangerous-tag findings (script, iframe, form, ...).
	// The misspelled name is kept as-is for compatibility with existing callers.
	HarmfullIssues []string
}
|
||
|
||
// HasPlaintext returns true if the email has plain text content
|
||
func (r *ContentResults) HasPlaintext() bool {
|
||
return r.TextContent != ""
|
||
}
|
||
|
||
// LinkCheck represents a link validation result
type LinkCheck struct {
	URL     string // Link target as found in the message
	Valid   bool   // URL parsed and (for http/https) the request could be attempted
	Status  int    // HTTP status from the HEAD probe; 0 when unreachable/unchecked
	Error   string // Hard failure description (bad URL, >=400 status)
	IsSafe  bool   // False when the URL looks suspicious or misaligned with its text
	Warning string // Soft finding (unverifiable link, phishing suspicion)
}
|
||
|
||
// ImageCheck represents an image validation result
type ImageCheck struct {
	Src      string // Value of the src attribute (may be empty)
	HasAlt   bool   // An alt attribute with non-empty text is present
	AltText  string // The alt attribute content
	Valid    bool   // src attribute is non-empty
	Error    string // Set when the image is malformed (e.g. missing src)
	IsBroken bool   // Reserved for fetch-based checks; not set by traverseHTML
}
|
||
|
||
// AnalyzeContent performs content analysis on email message
|
||
func (c *ContentAnalyzer) AnalyzeContent(email *EmailMessage) *ContentResults {
|
||
results := &ContentResults{}
|
||
|
||
results.IsMultipart = len(email.Parts) > 1
|
||
|
||
// Parse List-Unsubscribe header URLs for use in link detection
|
||
c.listUnsubscribeURLs = email.GetListUnsubscribeURLs()
|
||
|
||
// Check for one-click unsubscribe support
|
||
listUnsubscribePost := email.Header.Get("List-Unsubscribe-Post")
|
||
c.hasOneClickUnsubscribe = strings.EqualFold(strings.TrimSpace(listUnsubscribePost), "List-Unsubscribe=One-Click")
|
||
|
||
// Get HTML and text parts
|
||
htmlParts := email.GetHTMLParts()
|
||
textParts := email.GetTextParts()
|
||
|
||
// Analyze HTML parts
|
||
if len(htmlParts) > 0 {
|
||
for _, part := range htmlParts {
|
||
c.analyzeHTML(part.Content, results)
|
||
}
|
||
}
|
||
|
||
// Analyze text parts
|
||
if len(textParts) > 0 {
|
||
for _, part := range textParts {
|
||
results.TextContent += part.Content
|
||
}
|
||
// Extract and validate links from plain text
|
||
c.analyzeTextLinks(results.TextContent, results)
|
||
}
|
||
|
||
// Check plain text/HTML consistency
|
||
if len(htmlParts) > 0 && len(textParts) > 0 {
|
||
results.TextPlainRatio = c.calculateTextPlainConsistency(results.TextContent, results.HTMLContent)
|
||
} else if !results.IsMultipart {
|
||
results.TextPlainRatio = 1.0
|
||
}
|
||
|
||
return results
|
||
}
|
||
|
||
// analyzeTextLinks extracts and validates URLs from plain text
|
||
func (c *ContentAnalyzer) analyzeTextLinks(textContent string, results *ContentResults) {
|
||
// Regular expression to match URLs in plain text
|
||
// Matches http://, https://, and www. URLs
|
||
urlRegex := regexp.MustCompile(`(?i)\b(?:https?://|www\.)[^\s<>"{}|\\^\[\]` + "`" + `]+`)
|
||
|
||
matches := urlRegex.FindAllString(textContent, -1)
|
||
|
||
for _, match := range matches {
|
||
// Normalize URL (add http:// if missing)
|
||
urlStr := match
|
||
if strings.HasPrefix(strings.ToLower(urlStr), "www.") {
|
||
urlStr = "http://" + urlStr
|
||
}
|
||
|
||
// Check if this URL already exists in results.Links (from HTML analysis)
|
||
exists := false
|
||
for _, link := range results.Links {
|
||
if link.URL == urlStr {
|
||
exists = true
|
||
break
|
||
}
|
||
}
|
||
|
||
// Only validate if not already checked
|
||
if !exists {
|
||
linkCheck := c.validateLink(urlStr)
|
||
results.Links = append(results.Links, linkCheck)
|
||
|
||
// Check for suspicious URLs
|
||
if !linkCheck.IsSafe {
|
||
results.SuspiciousURLs = append(results.SuspiciousURLs, urlStr)
|
||
}
|
||
}
|
||
}
|
||
}
|
||
|
||
// analyzeHTML parses and analyzes HTML content
|
||
func (c *ContentAnalyzer) analyzeHTML(htmlContent string, results *ContentResults) {
|
||
results.HTMLContent = htmlContent
|
||
|
||
// Parse HTML
|
||
doc, err := html.Parse(strings.NewReader(htmlContent))
|
||
if err != nil {
|
||
results.HTMLValid = false
|
||
results.HTMLErrors = append(results.HTMLErrors, fmt.Sprintf("Failed to parse HTML: %v", err))
|
||
return
|
||
}
|
||
|
||
results.HTMLValid = true
|
||
|
||
// Traverse HTML tree
|
||
c.traverseHTML(doc, results)
|
||
|
||
// Calculate image-to-text ratio
|
||
if results.HTMLContent != "" {
|
||
textLength := len(c.extractTextFromHTML(htmlContent))
|
||
imageCount := len(results.Images)
|
||
if textLength > 0 {
|
||
results.ImageTextRatio = float32(imageCount) / float32(textLength) * 1000 // Images per 1000 chars
|
||
}
|
||
}
|
||
}
|
||
|
||
// traverseHTML recursively traverses HTML nodes, collecting links and images
// into results and flagging tags that are dangerous or problematic in email
// (script, iframe, object/embed/applet, form, base, meta refresh, external
// stylesheets).
func (c *ContentAnalyzer) traverseHTML(n *html.Node, results *ContentResults) {
	if n.Type == html.ElementNode {
		switch n.Data {
		case "a":
			// Extract and validate links
			href := c.getAttr(n, "href")
			if href != "" {
				// Check for unsubscribe links (header match or keyword in
				// href/anchor text).
				if c.isUnsubscribeLink(href, n) {
					results.HasUnsubscribe = true
					results.UnsubscribeLinks = append(results.UnsubscribeLinks, href)
				}

				// Validate link (parse + optional HEAD probe).
				linkCheck := c.validateLink(href)

				// Check for domain misalignment (phishing detection): the
				// anchor text shows one domain but the href points elsewhere.
				linkText := c.getNodeText(n)
				if c.hasDomainMisalignment(href, linkText) {
					linkCheck.IsSafe = false
					if linkCheck.Warning == "" {
						linkCheck.Warning = "Link text domain does not match actual URL domain (possible phishing)"
					} else {
						linkCheck.Warning += "; Link text domain does not match actual URL domain (possible phishing)"
					}
				}

				results.Links = append(results.Links, linkCheck)

				// Check for suspicious URLs
				if !linkCheck.IsSafe {
					results.SuspiciousURLs = append(results.SuspiciousURLs, href)
				}
			}

		case "img":
			// Extract and validate images; record alt-text presence for the
			// accessibility score.
			src := c.getAttr(n, "src")
			alt := c.getAttr(n, "alt")

			imageCheck := ImageCheck{
				Src:     src,
				HasAlt:  alt != "",
				AltText: alt,
				Valid:   src != "",
			}

			if src == "" {
				imageCheck.Error = "Image missing src attribute"
			}

			results.Images = append(results.Images, imageCheck)

		case "script":
			// JavaScript in emails is a security risk and typically blocked
			results.HarmfullIssues = append(results.HarmfullIssues, "Dangerous <script> tag detected - JavaScript is blocked by most email clients")

		case "iframe":
			// Iframes are security risks and blocked by most email clients
			src := c.getAttr(n, "src")
			issue := "Dangerous <iframe> tag detected"
			if src != "" {
				issue += fmt.Sprintf(" with src='%s'", src)
			}
			results.HarmfullIssues = append(results.HarmfullIssues, issue+" - iframes are blocked by most email clients")

		case "object", "embed", "applet":
			// Legacy embedding tags, security risks
			results.HarmfullIssues = append(results.HarmfullIssues, fmt.Sprintf("Dangerous <%s> tag detected - legacy embedding tags are security risks and blocked by email clients", n.Data))

		case "form":
			// Forms in emails can be phishing vectors
			action := c.getAttr(n, "action")
			issue := "Suspicious <form> tag detected"
			if action != "" {
				issue += fmt.Sprintf(" with action='%s'", action)
			}
			results.HarmfullIssues = append(results.HarmfullIssues, issue+" - forms can be phishing vectors and are often blocked")

		case "base":
			// Base tag can be used for phishing by redirecting relative URLs
			href := c.getAttr(n, "href")
			issue := "Potentially dangerous <base> tag detected"
			if href != "" {
				issue += fmt.Sprintf(" with href='%s'", href)
			}
			results.HarmfullIssues = append(results.HarmfullIssues, issue+" - can redirect all relative URLs")

		case "meta":
			// Check for suspicious meta redirects
			httpEquiv := c.getAttr(n, "http-equiv")
			if strings.ToLower(httpEquiv) == "refresh" {
				content := c.getAttr(n, "content")
				results.HarmfullIssues = append(results.HarmfullIssues, fmt.Sprintf("Suspicious <meta http-equiv='refresh'> tag detected with content='%s' - can be used for phishing redirects", content))
			}

		case "link":
			// Check for external stylesheet links (potential privacy/tracking concerns)
			rel := c.getAttr(n, "rel")
			href := c.getAttr(n, "href")
			if strings.Contains(strings.ToLower(rel), "stylesheet") && href != "" {
				if strings.HasPrefix(href, "http://") || strings.HasPrefix(href, "https://") {
					results.ContentIssues = append(results.ContentIssues, fmt.Sprintf("External stylesheet link detected: %s - may cause rendering issues or privacy concerns", href))
				}
			}
		}
	}

	// Traverse children (depth-first over the whole tree).
	for child := n.FirstChild; child != nil; child = child.NextSibling {
		c.traverseHTML(child, results)
	}
}
|
||
|
||
// getAttr gets an attribute value from an HTML node
|
||
func (c *ContentAnalyzer) getAttr(n *html.Node, key string) string {
|
||
for _, attr := range n.Attr {
|
||
if attr.Key == key {
|
||
return attr.Val
|
||
}
|
||
}
|
||
return ""
|
||
}
|
||
|
||
// isUnsubscribeLink checks if a link is an unsubscribe link
|
||
func (c *ContentAnalyzer) isUnsubscribeLink(href string, node *html.Node) bool {
|
||
// First check: does the href match a URL from the List-Unsubscribe header?
|
||
if slices.Contains(c.listUnsubscribeURLs, href) {
|
||
return true
|
||
}
|
||
|
||
// Check href for unsubscribe keywords
|
||
lowerHref := strings.ToLower(href)
|
||
unsubKeywords := []string{"unsubscribe", "opt-out", "optout", "remove", "list-unsubscribe", "отписване", "desubscripció", "zrušit odběr", "dad-danysgrifio", "afmeld", "abmelden", "διαγραφή", "darse de baja", "poistu postituslistalta", "se désabonner", "ביטול רישום", "leiratkozás", "cancella iscrizione", "登録を取り消す", "구독 해지", "വരിക്കാരനല്ലാതാകുക", "uitschrijven", "meld av", "odsubskrybuj", "cancelar assinatura", "cancelar subscrição", "dezabonare", "отписаться", "avsluta prenumeration", "zrušiť odber", "odjava", "üyeliği sonlandır", "відписатися", "hủy đăng ký", "退订", "退訂"}
|
||
for _, keyword := range unsubKeywords {
|
||
if strings.Contains(lowerHref, keyword) {
|
||
return true
|
||
}
|
||
}
|
||
|
||
// Check link text for unsubscribe keywords
|
||
text := c.getNodeText(node)
|
||
lowerText := strings.ToLower(text)
|
||
for _, keyword := range unsubKeywords {
|
||
if strings.Contains(lowerText, keyword) {
|
||
return true
|
||
}
|
||
}
|
||
|
||
return false
|
||
}
|
||
|
||
// getNodeText extracts text content from a node
|
||
func (c *ContentAnalyzer) getNodeText(n *html.Node) string {
|
||
if n.Type == html.TextNode {
|
||
return n.Data
|
||
}
|
||
var text string
|
||
for child := n.FirstChild; child != nil; child = child.NextSibling {
|
||
text += c.getNodeText(child)
|
||
}
|
||
return text
|
||
}
|
||
|
||
// validateLink validates a URL and checks if it's accessible.
// Non-http(s) schemes are accepted without a network probe; http(s) URLs get
// a HEAD request bounded by c.Timeout. Network failures are reported as a
// warning rather than an error, since external links may legitimately block
// automated probes.
func (c *ContentAnalyzer) validateLink(urlStr string) LinkCheck {
	check := LinkCheck{
		URL:    urlStr,
		IsSafe: true,
	}

	// Parse URL
	parsedURL, err := url.Parse(urlStr)
	if err != nil {
		check.Valid = false
		check.Error = fmt.Sprintf("Invalid URL: %v", err)
		return check
	}

	// Check URL safety (shorteners, IP hosts, obfuscation)
	if c.isSuspiciousURL(urlStr, parsedURL) {
		check.IsSafe = false
		check.Warning = "URL appears suspicious (obfuscated, shortened, or unusual)"
	}

	// Only check HTTP/HTTPS links; mailto:, tel:, etc. are valid as-is.
	if parsedURL.Scheme != "http" && parsedURL.Scheme != "https" {
		check.Valid = true
		return check
	}

	// Check if link is accessible (with timeout)
	ctx, cancel := context.WithTimeout(context.Background(), c.Timeout)
	defer cancel()

	req, err := http.NewRequestWithContext(ctx, "HEAD", urlStr, nil)
	if err != nil {
		check.Valid = false
		check.Error = fmt.Sprintf("Failed to create request: %v", err)
		return check
	}

	// Set a reasonable user agent
	req.Header.Set("User-Agent", "happyDeliver/1.0 (Email Deliverability Tester)")

	resp, err := c.httpClient.Do(req)
	if err != nil {
		// Don't fail on timeout/connection errors for external links.
		// Just mark as warning; Status 0 signals "not verified".
		check.Valid = true
		check.Status = 0
		check.Warning = fmt.Sprintf("Could not verify link: %v", err)
		return check
	}
	defer resp.Body.Close()

	check.Status = resp.StatusCode
	check.Valid = true

	// Check for error status codes
	if resp.StatusCode >= 400 {
		check.Error = fmt.Sprintf("Link returns %d status", resp.StatusCode)
	}

	return check
}
|
||
|
||
// hasDomainMisalignment checks if the link text contains a different domain than the actual URL
|
||
// This is a common phishing technique (e.g., text shows "paypal.com" but links to "evil.com")
|
||
func (c *ContentAnalyzer) hasDomainMisalignment(href, linkText string) bool {
|
||
// Parse the actual URL
|
||
parsedURL, err := url.Parse(href)
|
||
if err != nil {
|
||
return false
|
||
}
|
||
|
||
// Extract the actual destination domain/email based on scheme
|
||
var actualDomain string
|
||
|
||
switch parsedURL.Scheme {
|
||
case "mailto":
|
||
// Extract email address from mailto: URL
|
||
// Format can be: mailto:user@domain.com or mailto:user@domain.com?subject=...
|
||
mailtoAddr := parsedURL.Opaque
|
||
|
||
// Remove query parameters if present
|
||
if idx := strings.Index(mailtoAddr, "?"); idx != -1 {
|
||
mailtoAddr = mailtoAddr[:idx]
|
||
}
|
||
|
||
mailtoAddr = strings.TrimSpace(strings.ToLower(mailtoAddr))
|
||
|
||
// Extract domain from email address
|
||
if idx := strings.Index(mailtoAddr, "@"); idx != -1 {
|
||
actualDomain = mailtoAddr[idx+1:]
|
||
} else {
|
||
return false // Invalid mailto
|
||
}
|
||
case "http":
|
||
case "https":
|
||
// Check if URL has a host
|
||
if parsedURL.Host == "" {
|
||
return false
|
||
}
|
||
|
||
// Extract the actual URL's domain (remove port if present)
|
||
actualDomain = parsedURL.Host
|
||
if idx := strings.LastIndex(actualDomain, ":"); idx != -1 {
|
||
actualDomain = actualDomain[:idx]
|
||
}
|
||
actualDomain = strings.ToLower(actualDomain)
|
||
default:
|
||
// Skip checks for other URL schemes (tel, etc.)
|
||
return false
|
||
}
|
||
|
||
// Normalize link text
|
||
linkText = strings.TrimSpace(linkText)
|
||
linkText = strings.ToLower(linkText)
|
||
|
||
// Skip if link text is empty, too short, or just generic text like "click here"
|
||
if linkText == "" || len(linkText) < 4 {
|
||
return false
|
||
}
|
||
|
||
// Common generic link texts that shouldn't trigger warnings
|
||
genericTexts := []string{
|
||
"click here", "read more", "learn more", "download", "subscribe",
|
||
"unsubscribe", "view online", "view in browser", "click", "here",
|
||
"update", "verify", "confirm", "continue", "get started",
|
||
// mailto-specific generic texts
|
||
"email us", "contact us", "send email", "get in touch", "reach out",
|
||
"contact", "email", "write to us",
|
||
}
|
||
if slices.Contains(genericTexts, linkText) {
|
||
return false
|
||
}
|
||
|
||
// Extract domain-like patterns from link text using regex
|
||
// Matches patterns like "example.com", "www.example.com", "http://example.com"
|
||
domainRegex := regexp.MustCompile(`(?i)(?:https?://)?(?:www\.)?([a-z0-9][-a-z0-9]*\.)+[a-z]{2,}`)
|
||
matches := domainRegex.FindAllString(linkText, -1)
|
||
|
||
if len(matches) == 0 {
|
||
return false
|
||
}
|
||
|
||
// Check each domain-like pattern found in the text
|
||
for _, textDomain := range matches {
|
||
// Normalize the text domain
|
||
textDomain = strings.ToLower(textDomain)
|
||
textDomain = strings.TrimPrefix(textDomain, "http://")
|
||
textDomain = strings.TrimPrefix(textDomain, "https://")
|
||
textDomain = strings.TrimPrefix(textDomain, "www.")
|
||
|
||
// Remove trailing slashes and paths
|
||
if idx := strings.Index(textDomain, "/"); idx != -1 {
|
||
textDomain = textDomain[:idx]
|
||
}
|
||
|
||
// Compare domains - they should match or the actual URL should be a subdomain of the text domain
|
||
if textDomain != actualDomain {
|
||
// Check if actual domain is a subdomain of text domain
|
||
if !strings.HasSuffix(actualDomain, "."+textDomain) && !strings.HasSuffix(actualDomain, textDomain) {
|
||
// Check if they share the same base domain (last 2 parts)
|
||
textParts := strings.Split(textDomain, ".")
|
||
actualParts := strings.Split(actualDomain, ".")
|
||
|
||
if len(textParts) >= 2 && len(actualParts) >= 2 {
|
||
textBase := strings.Join(textParts[len(textParts)-2:], ".")
|
||
actualBase := strings.Join(actualParts[len(actualParts)-2:], ".")
|
||
|
||
if textBase != actualBase {
|
||
return true // Domain mismatch detected!
|
||
}
|
||
} else {
|
||
return true // Domain mismatch detected!
|
||
}
|
||
}
|
||
}
|
||
}
|
||
|
||
return false
|
||
}
|
||
|
||
// isSuspiciousURL checks if a URL looks suspicious
|
||
func (c *ContentAnalyzer) isSuspiciousURL(urlStr string, parsedURL *url.URL) bool {
|
||
// Skip checks for mailto: URLs
|
||
if parsedURL.Scheme == "mailto" {
|
||
return false
|
||
}
|
||
|
||
// Check for IP address instead of domain
|
||
if c.isIPAddress(parsedURL.Host) {
|
||
return true
|
||
}
|
||
|
||
// Check for URL shorteners (common ones)
|
||
shorteners := []string{
|
||
"bit.ly", "tinyurl.com", "goo.gl", "ow.ly", "t.co",
|
||
"buff.ly", "is.gd", "bl.ink", "short.io",
|
||
}
|
||
if slices.Contains(shorteners, strings.ToLower(parsedURL.Host)) {
|
||
return true
|
||
}
|
||
|
||
// Check for excessive subdomains (possible obfuscation)
|
||
parts := strings.Split(parsedURL.Host, ".")
|
||
if len(parts) > 4 {
|
||
return true
|
||
}
|
||
|
||
// Check for URL obfuscation techniques
|
||
if strings.Count(urlStr, "@") > 0 { // @ in URL (possible phishing)
|
||
return true
|
||
}
|
||
|
||
// Check for suspicious characters in domain
|
||
if strings.ContainsAny(parsedURL.Host, "[]()<>") {
|
||
return true
|
||
}
|
||
|
||
return false
|
||
}
|
||
|
||
// isIPAddress checks if a string is an IP address
|
||
func (c *ContentAnalyzer) isIPAddress(host string) bool {
|
||
// Remove port if present
|
||
if idx := strings.LastIndex(host, ":"); idx != -1 {
|
||
host = host[:idx]
|
||
}
|
||
|
||
// Simple check for IPv4
|
||
parts := strings.Split(host, ".")
|
||
if len(parts) == 4 {
|
||
for _, part := range parts {
|
||
// Check if all characters are digits
|
||
for _, ch := range part {
|
||
if !unicode.IsDigit(ch) {
|
||
return false
|
||
}
|
||
}
|
||
}
|
||
return true
|
||
}
|
||
|
||
// Check for IPv6 (contains colons)
|
||
if strings.Contains(host, ":") {
|
||
return true
|
||
}
|
||
|
||
return false
|
||
}
|
||
|
||
// extractTextFromHTML extracts plain text from HTML
|
||
func (c *ContentAnalyzer) extractTextFromHTML(htmlContent string) string {
|
||
doc, err := html.Parse(strings.NewReader(htmlContent))
|
||
if err != nil {
|
||
return ""
|
||
}
|
||
|
||
var text strings.Builder
|
||
var extract func(*html.Node)
|
||
extract = func(n *html.Node) {
|
||
if n.Type == html.TextNode {
|
||
text.WriteString(" " + n.Data)
|
||
}
|
||
// Skip script and style tags
|
||
if n.Type == html.ElementNode && (n.Data == "script" || n.Data == "style") {
|
||
return
|
||
}
|
||
for child := n.FirstChild; child != nil; child = child.NextSibling {
|
||
extract(child)
|
||
}
|
||
}
|
||
extract(doc)
|
||
|
||
return strings.TrimSpace(text.String())
|
||
}
|
||
|
||
// calculateTextPlainConsistency compares plain text and HTML versions
|
||
func (c *ContentAnalyzer) calculateTextPlainConsistency(plainText, htmlText string) float32 {
|
||
// Extract text from HTML
|
||
htmlPlainText := c.extractTextFromHTML(htmlText)
|
||
|
||
// Normalize both texts
|
||
plainNorm := c.normalizeText(plainText)
|
||
htmlNorm := c.normalizeText(htmlPlainText)
|
||
|
||
// Calculate similarity using simple word overlap
|
||
plainWords := strings.Fields(plainNorm)
|
||
htmlWords := strings.Fields(htmlNorm)
|
||
|
||
if len(plainWords) == 0 || len(htmlWords) == 0 {
|
||
return 0.0
|
||
}
|
||
|
||
// Count common words by building sets
|
||
plainWordSet := make(map[string]int)
|
||
for _, word := range plainWords {
|
||
plainWordSet[word]++
|
||
}
|
||
|
||
htmlWordSet := make(map[string]int)
|
||
for _, word := range htmlWords {
|
||
htmlWordSet[word]++
|
||
}
|
||
|
||
// Count matches: for each unique word, count minimum occurrences in both texts
|
||
commonWords := 0
|
||
for word, plainCount := range plainWordSet {
|
||
if htmlCount, exists := htmlWordSet[word]; exists {
|
||
// Count the minimum occurrences between both texts
|
||
if plainCount < htmlCount {
|
||
commonWords += plainCount
|
||
} else {
|
||
commonWords += htmlCount
|
||
}
|
||
}
|
||
}
|
||
|
||
// Calculate ratio using total words from both texts (union approach)
|
||
// This provides a balanced measure: perfect match = 1.0, partial overlap = 0.3-0.8
|
||
totalWords := len(plainWords) + len(htmlWords)
|
||
if totalWords == 0 {
|
||
return 0.0
|
||
}
|
||
|
||
// Divide by average word count for better scoring
|
||
avgWords := float32(totalWords) / 2.0
|
||
ratio := float32(commonWords) / avgWords
|
||
|
||
// Cap at 1.0 for perfect matches
|
||
if ratio > 1.0 {
|
||
ratio = 1.0
|
||
}
|
||
|
||
return ratio
|
||
}
|
||
|
||
// normalizeText normalizes text for comparison
|
||
func (c *ContentAnalyzer) normalizeText(text string) string {
|
||
// Convert to lowercase
|
||
text = strings.ToLower(text)
|
||
|
||
// Remove extra whitespace
|
||
text = strings.TrimSpace(text)
|
||
text = regexp.MustCompile(`\s+`).ReplaceAllString(text, " ")
|
||
|
||
return text
|
||
}
|
||
|
||
// GenerateContentAnalysis creates structured content analysis from results,
// mapping internal ContentResults into the public api.ContentAnalysis shape:
// issue lists, link/image checks, and detected unsubscribe methods (including
// RFC 8058 one-click). Returns nil when results is nil.
func (c *ContentAnalyzer) GenerateContentAnalysis(results *ContentResults) *api.ContentAnalysis {
	if results == nil {
		return nil
	}

	analysis := &api.ContentAnalysis{
		HasHtml:            api.PtrTo(results.HTMLContent != ""),
		HasPlaintext:       api.PtrTo(results.TextContent != ""),
		HasUnsubscribeLink: api.PtrTo(results.HasUnsubscribe),
		UnsubscribeMethods: &[]api.ContentAnalysisUnsubscribeMethods{},
	}

	// Calculate text-to-image ratio (inverse of image-to-text)
	if len(results.Images) > 0 && results.HTMLContent != "" {
		textLen := float32(len(c.extractTextFromHTML(results.HTMLContent)))
		if textLen > 0 {
			ratio := textLen / float32(len(results.Images))
			analysis.TextToImageRatio = &ratio
		}
	}

	// Build HTML issues
	htmlIssues := []api.ContentIssue{}

	// Add HTML parsing errors
	if !results.HTMLValid && len(results.HTMLErrors) > 0 {
		for _, errMsg := range results.HTMLErrors {
			htmlIssues = append(htmlIssues, api.ContentIssue{
				Type:     api.BrokenHtml,
				Severity: api.ContentIssueSeverityHigh,
				Message:  errMsg,
				Advice:   api.PtrTo("Fix HTML structure errors to improve email rendering across clients"),
			})
		}
	}

	// Add missing alt text issues (aggregated into a single issue)
	if len(results.Images) > 0 {
		missingAltCount := 0
		for _, img := range results.Images {
			if !img.HasAlt {
				missingAltCount++
			}
		}
		if missingAltCount > 0 {
			htmlIssues = append(htmlIssues, api.ContentIssue{
				Type:     api.MissingAlt,
				Severity: api.ContentIssueSeverityMedium,
				Message:  fmt.Sprintf("%d image(s) missing alt attributes", missingAltCount),
				Advice:   api.PtrTo("Add descriptive alt text to all images for better accessibility and deliverability"),
			})
		}
	}

	// Add excessive images issue (threshold: >10 images per 1000 chars)
	if results.ImageTextRatio > 10.0 {
		htmlIssues = append(htmlIssues, api.ContentIssue{
			Type:     api.ExcessiveImages,
			Severity: api.ContentIssueSeverityMedium,
			Message:  "Email is excessively image-heavy",
			Advice:   api.PtrTo("Reduce the number of images relative to text content"),
		})
	}

	// Add suspicious URL issues, one per flagged URL.
	// NOTE(review): &suspURL takes the loop variable's address; this is only
	// correct on Go 1.22+ per-iteration loop variables — on earlier
	// toolchains every issue would point at the last URL. Confirm go.mod.
	for _, suspURL := range results.SuspiciousURLs {
		htmlIssues = append(htmlIssues, api.ContentIssue{
			Type:     api.SuspiciousLink,
			Severity: api.ContentIssueSeverityHigh,
			Message:  "Suspicious URL detected",
			Location: &suspURL,
			Advice:   api.PtrTo("Avoid URL shorteners, IP addresses, and obfuscated URLs in emails"),
		})
	}

	// Add harmful HTML tag issues
	for _, harmfulIssue := range results.HarmfullIssues {
		htmlIssues = append(htmlIssues, api.ContentIssue{
			Type:     api.DangerousHtml,
			Severity: api.ContentIssueSeverityCritical,
			Message:  harmfulIssue,
			Advice:   api.PtrTo("Remove dangerous HTML tags like <script>, <iframe>, <object>, <embed>, <applet>, <form>, and <base> from email content"),
		})
	}

	// Add general content issues (like external stylesheets)
	for _, contentIssue := range results.ContentIssues {
		htmlIssues = append(htmlIssues, api.ContentIssue{
			Type:     api.BrokenHtml,
			Severity: api.ContentIssueSeverityLow,
			Message:  contentIssue,
			Advice:   api.PtrTo("Use inline CSS instead of external stylesheets for better email compatibility"),
		})
	}

	if len(htmlIssues) > 0 {
		analysis.HtmlIssues = &htmlIssues
	}

	// Convert links to their API representation
	if len(results.Links) > 0 {
		links := make([]api.LinkCheck, 0, len(results.Links))
		for _, link := range results.Links {
			// Map internal state onto a single status enum: broken beats
			// suspicious beats timeout (warning with no error) beats valid.
			status := api.Valid
			if link.Status >= 400 {
				status = api.Broken
			} else if !link.IsSafe {
				status = api.Suspicious
			} else if link.Warning != "" {
				status = api.Timeout
			}

			apiLink := api.LinkCheck{
				Url:    link.URL,
				Status: status,
			}

			// Status 0 means "never probed"; omit the HTTP code then.
			if link.Status > 0 {
				apiLink.HttpCode = api.PtrTo(link.Status)
			}

			// Check if it's a URL shortener (reuses the general suspicion
			// heuristic, so IsShortened is true for any suspicious URL).
			parsedURL, err := url.Parse(link.URL)
			if err == nil {
				isShortened := c.isSuspiciousURL(link.URL, parsedURL)
				apiLink.IsShortened = api.PtrTo(isShortened)
			}

			links = append(links, apiLink)
		}
		analysis.Links = &links
	}

	// Convert images to their API representation
	if len(results.Images) > 0 {
		images := make([]api.ImageCheck, 0, len(results.Images))
		for _, img := range results.Images {
			apiImg := api.ImageCheck{
				HasAlt: img.HasAlt,
			}
			if img.Src != "" {
				apiImg.Src = &img.Src
			}
			if img.AltText != "" {
				apiImg.AltText = &img.AltText
			}
			// Simple heuristic: tracking pixels are typically 1x1.
			// NOTE(review): dimension detection is not implemented, so this
			// is currently always false — a stub, not a real check.
			apiImg.IsTrackingPixel = api.PtrTo(false)

			images = append(images, apiImg)
		}
		analysis.Images = &images
	}

	// Unsubscribe methods: body link, mailto and/or header URL.
	if results.HasUnsubscribe {
		*analysis.UnsubscribeMethods = append(*analysis.UnsubscribeMethods, api.Link)
	}

	for _, url := range c.listUnsubscribeURLs {
		if strings.HasPrefix(url, "mailto:") {
			*analysis.UnsubscribeMethods = append(*analysis.UnsubscribeMethods, api.Mailto)
		} else if strings.HasPrefix(url, "http:") || strings.HasPrefix(url, "https:") {
			*analysis.UnsubscribeMethods = append(*analysis.UnsubscribeMethods, api.ListUnsubscribeHeader)
		}
	}

	// RFC 8058 one-click requires an http(s) List-Unsubscribe URL plus the
	// List-Unsubscribe-Post header detected in AnalyzeContent.
	if slices.Contains(*analysis.UnsubscribeMethods, api.ListUnsubscribeHeader) && c.hasOneClickUnsubscribe {
		*analysis.UnsubscribeMethods = append(*analysis.UnsubscribeMethods, api.OneClick)
	}

	return analysis
}
|
||
|
||
// CalculateContentScore calculates the content score on a 0-100 scale and
// returns it with its letter grade. (The previous doc comment claimed 0-20
// points; the code clamps to 0-100.) Returns (0, "") for nil results.
//
// Breakdown: 10 base + 10 valid HTML (or text-only) + 10 plain text part +
// up to 25 for links + up to 15 for image alt text + 15 text consistency +
// up to 15 image/text ratio, minus penalties for suspicious URLs and
// dangerous HTML tags.
func (c *ContentAnalyzer) CalculateContentScore(results *ContentResults) (int, string) {
	if results == nil {
		return 0, ""
	}

	// Base score every analyzed message starts from.
	var score int = 10

	// HTML validity or text alone (10 points)
	if results.HTMLValid || (!results.IsMultipart && results.HasPlaintext()) {
		score += 10
	}

	// Requires plain text alternative (10 points)
	if results.HasPlaintext() {
		score += 10
	}

	// Links (25 points)
	// NOTE(review): a link-free email earns 25 while an email whose links are
	// all valid earns only 20 — confirm this asymmetry is intentional.
	if len(results.Links) > 0 {
		brokenLinks := 0
		for _, link := range results.Links {
			if link.Status >= 400 {
				brokenLinks++
			}
		}
		// Proportional credit for working links (integer division).
		score += 20 * (len(results.Links) - brokenLinks) / len(results.Links)
		// Too much links, 10 points penalty
		if len(results.Links) > 30 {
			score -= 10
		}
	} else {
		// No links is better, less suspiscous
		score += 25
	}

	// Images (15 points) — proportional credit for images with alt text.
	if len(results.Images) > 0 {
		noAltCount := 0
		for _, img := range results.Images {
			if !img.HasAlt {
				noAltCount++
			}
		}
		score += 15 * (len(results.Images) - noAltCount) / len(results.Images)
	} else {
		// No images is Ok
		score += 15
	}

	// Text consistency (15 points) — plain text matches the HTML version.
	if results.TextPlainRatio >= 0.3 {
		score += 15
	}

	// Image ratio (15 points) — images per 1000 chars of visible text.
	if results.ImageTextRatio <= 5.0 {
		score += 15
	} else if results.ImageTextRatio <= 10.0 {
		score += 7
	}

	// Penalize suspicious URLs (deduct up to 5 points)
	if len(results.SuspiciousURLs) > 0 {
		score -= min(len(results.SuspiciousURLs), 5)
	}

	// Penalize harmful HTML tags (deduct 20 points per harmful tag, max 40 points)
	if len(results.HarmfullIssues) > 0 {
		score -= min(len(results.HarmfullIssues)*20, 40)
	}

	// Ensure score is between 0 and 100
	if score < 0 {
		score = 0
	}
	if score > 100 {
		score = 100
	}

	return score, ScoreToGrade(score)
}
|