// This file is part of the happyDeliver (R) project. // Copyright (c) 2025 happyDomain // Authors: Pierre-Olivier Mercier, et al. // // This program is offered under a commercial and under the AGPL license. // For commercial licensing, contact us at . // // For AGPL licensing: // This program is free software: you can redistribute it and/or modify // it under the terms of the GNU Affero General Public License as published by // the Free Software Foundation, either version 3 of the License, or // (at your option) any later version. // // This program is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Affero General Public License for more details. // // You should have received a copy of the GNU Affero General Public License // along with this program. If not, see . package analyzer import ( "context" "fmt" "net/http" "net/url" "regexp" "slices" "strings" "time" "unicode" "git.happydns.org/happyDeliver/internal/api" "golang.org/x/net/html" ) // ContentAnalyzer analyzes email content (HTML, links, images) type ContentAnalyzer struct { Timeout time.Duration httpClient *http.Client listUnsubscribeURLs []string // URLs from List-Unsubscribe header hasOneClickUnsubscribe bool // True if List-Unsubscribe-Post: List-Unsubscribe=One-Click } // NewContentAnalyzer creates a new content analyzer with configurable timeout func NewContentAnalyzer(timeout time.Duration) *ContentAnalyzer { if timeout == 0 { timeout = 10 * time.Second // Default timeout } return &ContentAnalyzer{ Timeout: timeout, httpClient: &http.Client{ Timeout: timeout, CheckRedirect: func(req *http.Request, via []*http.Request) error { // Allow up to 10 redirects if len(via) >= 10 { return fmt.Errorf("too many redirects") } return nil }, }, } } // ContentResults represents content analysis results type ContentResults struct { IsMultipart bool HTMLValid bool HTMLErrors []string Links []LinkCheck Images []ImageCheck HasUnsubscribe bool UnsubscribeLinks []string TextContent string HTMLContent string TextPlainRatio float32 // Ratio of plain text to HTML consistency ImageTextRatio float32 // Ratio of images to text SuspiciousURLs []string ContentIssues []string HarmfullIssues []string } // HasPlaintext returns true if the email has plain text content func (r *ContentResults) HasPlaintext() bool { return r.TextContent != "" } // LinkCheck represents a link validation result type LinkCheck struct { URL string Valid bool Status int Error string IsSafe bool Warning string } // ImageCheck represents an image validation result type ImageCheck struct { Src string HasAlt bool AltText string Valid bool Error string IsBroken bool } // AnalyzeContent performs content analysis on email message func (c *ContentAnalyzer) AnalyzeContent(email *EmailMessage) *ContentResults { results := &ContentResults{} results.IsMultipart = len(email.Parts) > 1 // Parse List-Unsubscribe header URLs for use in link detection c.listUnsubscribeURLs = email.GetListUnsubscribeURLs() // Check for one-click unsubscribe support listUnsubscribePost := email.Header.Get("List-Unsubscribe-Post") c.hasOneClickUnsubscribe = strings.EqualFold(strings.TrimSpace(listUnsubscribePost), "List-Unsubscribe=One-Click") // Get HTML and text parts htmlParts := email.GetHTMLParts() textParts := email.GetTextParts() // Analyze HTML parts if len(htmlParts) > 0 { for _, part := range htmlParts { c.analyzeHTML(part.Content, results) } } // Analyze text parts if len(textParts) > 0 { for _, part := range textParts { results.TextContent += part.Content } // Extract and validate links from plain text c.analyzeTextLinks(results.TextContent, results) } // Check plain text/HTML consistency if len(htmlParts) > 0 && len(textParts) > 0 { results.TextPlainRatio = c.calculateTextPlainConsistency(results.TextContent, results.HTMLContent) } else if !results.IsMultipart { results.TextPlainRatio = 1.0 } return results } // analyzeTextLinks extracts and validates URLs from plain text func (c *ContentAnalyzer) analyzeTextLinks(textContent string, results *ContentResults) { // Regular expression to match URLs in plain text // Matches http://, https://, and www. URLs urlRegex := regexp.MustCompile(`(?i)\b(?:https?://|www\.)[^\s<>"{}|\\^\[\]` + "`" + `]+`) matches := urlRegex.FindAllString(textContent, -1) for _, match := range matches { // Normalize URL (add http:// if missing) urlStr := match if strings.HasPrefix(strings.ToLower(urlStr), "www.") { urlStr = "http://" + urlStr } // Check if this URL already exists in results.Links (from HTML analysis) exists := false for _, link := range results.Links { if link.URL == urlStr { exists = true break } } // Only validate if not already checked if !exists { linkCheck := c.validateLink(urlStr) results.Links = append(results.Links, linkCheck) // Check for suspicious URLs if !linkCheck.IsSafe { results.SuspiciousURLs = append(results.SuspiciousURLs, urlStr) } } } } // analyzeHTML parses and analyzes HTML content func (c *ContentAnalyzer) analyzeHTML(htmlContent string, results *ContentResults) { results.HTMLContent = htmlContent // Parse HTML doc, err := html.Parse(strings.NewReader(htmlContent)) if err != nil { results.HTMLValid = false results.HTMLErrors = append(results.HTMLErrors, fmt.Sprintf("Failed to parse HTML: %v", err)) return } results.HTMLValid = true // Traverse HTML tree c.traverseHTML(doc, results) // Calculate image-to-text ratio if results.HTMLContent != "" { textLength := len(c.extractTextFromHTML(htmlContent)) imageCount := len(results.Images) if textLength > 0 { results.ImageTextRatio = float32(imageCount) / float32(textLength) * 1000 // Images per 1000 chars } } } // traverseHTML recursively traverses HTML nodes func (c *ContentAnalyzer) traverseHTML(n *html.Node, results *ContentResults) { if n.Type == html.ElementNode { switch n.Data { case "a": // Extract and validate links href := c.getAttr(n, "href") if href != "" { // Check for unsubscribe links if c.isUnsubscribeLink(href, n) { results.HasUnsubscribe = true results.UnsubscribeLinks = append(results.UnsubscribeLinks, href) } // Validate link linkCheck := c.validateLink(href) // Check for domain misalignment (phishing detection) linkText := c.getNodeText(n) if c.hasDomainMisalignment(href, linkText) { linkCheck.IsSafe = false if linkCheck.Warning == "" { linkCheck.Warning = "Link text domain does not match actual URL domain (possible phishing)" } else { linkCheck.Warning += "; Link text domain does not match actual URL domain (possible phishing)" } } results.Links = append(results.Links, linkCheck) // Check for suspicious URLs if !linkCheck.IsSafe { results.SuspiciousURLs = append(results.SuspiciousURLs, href) } } case "img": // Extract and validate images src := c.getAttr(n, "src") alt := c.getAttr(n, "alt") imageCheck := ImageCheck{ Src: src, HasAlt: alt != "", AltText: alt, Valid: src != "", } if src == "" { imageCheck.Error = "Image missing src attribute" } results.Images = append(results.Images, imageCheck) case "script": // JavaScript in emails is a security risk and typically blocked results.HarmfullIssues = append(results.HarmfullIssues, "Dangerous