// happyDeliver/pkg/analyzer/content.go
// This file is part of the happyDeliver (R) project.
// Copyright (c) 2025 happyDomain
// Authors: Pierre-Olivier Mercier, et al.
//
// This program is offered under a commercial and under the AGPL license.
// For commercial licensing, contact us at <contact@happydomain.org>.
//
// For AGPL licensing:
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU Affero General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Affero General Public License for more details.
//
// You should have received a copy of the GNU Affero General Public License
// along with this program. If not, see <https://www.gnu.org/licenses/>.
package analyzer
import (
"context"
"fmt"
"net/http"
"net/url"
"regexp"
"strings"
"time"
"unicode"
"git.happydns.org/happyDeliver/internal/api"
"golang.org/x/net/html"
)
// ContentAnalyzer analyzes email content (HTML, links, images).
type ContentAnalyzer struct {
	// Timeout bounds each outbound HTTP request made while validating links.
	Timeout time.Duration
	// httpClient is the shared client used for link probing; NewContentAnalyzer
	// configures it with Timeout and a 10-redirect limit.
	httpClient *http.Client
}
// NewContentAnalyzer creates a new content analyzer with a configurable
// timeout. A zero timeout selects the 10-second default.
func NewContentAnalyzer(timeout time.Duration) *ContentAnalyzer {
	if timeout == 0 {
		timeout = 10 * time.Second // Default timeout
	}
	client := &http.Client{
		Timeout: timeout,
		// Cap redirect chains at 10 hops to avoid loops.
		CheckRedirect: func(req *http.Request, via []*http.Request) error {
			if len(via) >= 10 {
				return fmt.Errorf("too many redirects")
			}
			return nil
		},
	}
	return &ContentAnalyzer{
		Timeout:    timeout,
		httpClient: client,
	}
}
// ContentResults represents content analysis results.
type ContentResults struct {
	IsMultipart      bool        // true when the email has more than one MIME part
	HTMLValid        bool        // true when the HTML part parsed without error
	HTMLErrors       []string    // HTML parsing error messages
	Links            []LinkCheck // validation results for every link found
	Images           []ImageCheck
	HasUnsubscribe   bool     // true when at least one unsubscribe link was found
	UnsubscribeLinks []string // hrefs of detected unsubscribe links
	TextContent      string   // concatenated plain text parts
	HTMLContent      string   // raw HTML content (last analyzed HTML part)
	TextPlainRatio   float32  // Ratio of plain text to HTML consistency
	ImageTextRatio   float32  // Ratio of images to text (images per 1000 chars)
	SuspiciousURLs   []string // URLs flagged as unsafe
	ContentIssues    []string // low-severity findings (e.g. external stylesheets)
	// HarmfullIssues holds dangerous-HTML findings (script/iframe/form/...).
	// NOTE(review): exported misspelling kept for compatibility with callers.
	HarmfullIssues []string
}
// HasPlaintext returns true if the email has plain text content.
func (r *ContentResults) HasPlaintext() bool {
	return len(r.TextContent) > 0
}
// LinkCheck represents a link validation result.
type LinkCheck struct {
	URL     string // the link as found (possibly normalized with http://)
	Valid   bool   // false only when the URL could not be parsed or requested
	Status  int    // HTTP status code; 0 when the link was not reachable
	Error   string // hard failure description (parse error, 4xx/5xx status)
	IsSafe  bool   // false when the URL matched a suspicion heuristic
	Warning string // soft finding (unverifiable link, phishing suspicion)
}
// ImageCheck represents an image validation result.
type ImageCheck struct {
	Src      string // the image's src attribute (may be empty)
	HasAlt   bool   // true when a non-empty alt attribute is present
	AltText  string // the alt attribute's value
	Valid    bool   // false when src is missing
	Error    string // description of the problem, if any
	IsBroken bool   // reserved: set when the image is known unreachable
}
// AnalyzeContent performs content analysis on an email message: HTML
// validation, link/image checks, and plain-text/HTML consistency scoring.
func (c *ContentAnalyzer) AnalyzeContent(email *EmailMessage) *ContentResults {
	results := &ContentResults{
		IsMultipart: len(email.Parts) > 1,
	}

	htmlParts := email.GetHTMLParts()
	textParts := email.GetTextParts()

	// Analyze every HTML part.
	for _, part := range htmlParts {
		c.analyzeHTML(part.Content, results)
	}

	// Concatenate plain text parts, then extract and validate links from them.
	if len(textParts) > 0 {
		var body strings.Builder
		for _, part := range textParts {
			body.WriteString(part.Content)
		}
		results.TextContent = body.String()
		c.analyzeTextLinks(results.TextContent, results)
	}

	// Score plain text/HTML consistency when both versions exist; a
	// single-part message is trivially consistent with itself.
	switch {
	case len(htmlParts) > 0 && len(textParts) > 0:
		results.TextPlainRatio = c.calculateTextPlainConsistency(results.TextContent, results.HTMLContent)
	case !results.IsMultipart:
		results.TextPlainRatio = 1.0
	}

	return results
}
// textURLRegexp matches http://, https:// and bare www. URLs in plain text.
// Compiled once at package scope instead of on every call.
var textURLRegexp = regexp.MustCompile(`(?i)\b(?:https?://|www\.)[^\s<>"{}|\\^\[\]` + "`" + `]+`)

// analyzeTextLinks extracts URLs from plain text, validates any that were not
// already checked during HTML analysis, and records suspicious ones.
func (c *ContentAnalyzer) analyzeTextLinks(textContent string, results *ContentResults) {
	// Index URLs already validated (e.g. during HTML analysis) so we never
	// issue a duplicate HTTP check. Replaces the previous O(n²) linear scan.
	seen := make(map[string]struct{}, len(results.Links))
	for _, link := range results.Links {
		seen[link.URL] = struct{}{}
	}

	for _, match := range textURLRegexp.FindAllString(textContent, -1) {
		// Normalize scheme-less "www." URLs so url.Parse treats them as absolute.
		urlStr := match
		if strings.HasPrefix(strings.ToLower(urlStr), "www.") {
			urlStr = "http://" + urlStr
		}

		if _, dup := seen[urlStr]; dup {
			continue
		}
		seen[urlStr] = struct{}{}

		linkCheck := c.validateLink(urlStr)
		results.Links = append(results.Links, linkCheck)
		if !linkCheck.IsSafe {
			results.SuspiciousURLs = append(results.SuspiciousURLs, urlStr)
		}
	}
}
// analyzeHTML parses one HTML part, walks it for links/images/dangerous tags,
// and updates the image-to-text ratio.
func (c *ContentAnalyzer) analyzeHTML(htmlContent string, results *ContentResults) {
	results.HTMLContent = htmlContent

	doc, parseErr := html.Parse(strings.NewReader(htmlContent))
	if parseErr != nil {
		results.HTMLValid = false
		results.HTMLErrors = append(results.HTMLErrors, fmt.Sprintf("Failed to parse HTML: %v", parseErr))
		return
	}
	results.HTMLValid = true

	// Walk the parsed tree collecting links, images, and harmful tags.
	c.traverseHTML(doc, results)

	if results.HTMLContent == "" {
		return
	}

	// Express image density as images per 1000 characters of rendered text.
	textLength := len(c.extractTextFromHTML(htmlContent))
	if textLength > 0 {
		results.ImageTextRatio = float32(len(results.Images)) / float32(textLength) * 1000
	}
}
// traverseHTML recursively walks the HTML tree, collecting link and image
// checks, unsubscribe links, and dangerous or questionable tags into results.
func (c *ContentAnalyzer) traverseHTML(n *html.Node, results *ContentResults) {
	if n.Type == html.ElementNode {
		switch n.Data {
		case "a":
			// Extract and validate links.
			href := c.getAttr(n, "href")
			if href != "" {
				// Record unsubscribe links (matched by href or anchor text).
				if c.isUnsubscribeLink(href, n) {
					results.HasUnsubscribe = true
					results.UnsubscribeLinks = append(results.UnsubscribeLinks, href)
				}
				// Validate the link (may issue a HEAD request).
				linkCheck := c.validateLink(href)
				// Phishing heuristic: link text showing a different domain
				// than the actual destination.
				linkText := c.getNodeText(n)
				if c.hasDomainMisalignment(href, linkText) {
					linkCheck.IsSafe = false
					if linkCheck.Warning == "" {
						linkCheck.Warning = "Link text domain does not match actual URL domain (possible phishing)"
					} else {
						linkCheck.Warning += "; Link text domain does not match actual URL domain (possible phishing)"
					}
				}
				results.Links = append(results.Links, linkCheck)
				// Track suspicious URLs separately for scoring/reporting.
				if !linkCheck.IsSafe {
					results.SuspiciousURLs = append(results.SuspiciousURLs, href)
				}
			}
		case "img":
			// Extract and validate images; an empty src is recorded as invalid.
			src := c.getAttr(n, "src")
			alt := c.getAttr(n, "alt")
			imageCheck := ImageCheck{
				Src:     src,
				HasAlt:  alt != "",
				AltText: alt,
				Valid:   src != "",
			}
			if src == "" {
				imageCheck.Error = "Image missing src attribute"
			}
			results.Images = append(results.Images, imageCheck)
		case "script":
			// JavaScript in emails is a security risk and typically blocked.
			results.HarmfullIssues = append(results.HarmfullIssues, "Dangerous <script> tag detected - JavaScript is blocked by most email clients")
		case "iframe":
			// Iframes are security risks and blocked by most email clients.
			src := c.getAttr(n, "src")
			issue := "Dangerous <iframe> tag detected"
			if src != "" {
				issue += fmt.Sprintf(" with src='%s'", src)
			}
			results.HarmfullIssues = append(results.HarmfullIssues, issue+" - iframes are blocked by most email clients")
		case "object", "embed", "applet":
			// Legacy embedding tags, security risks.
			results.HarmfullIssues = append(results.HarmfullIssues, fmt.Sprintf("Dangerous <%s> tag detected - legacy embedding tags are security risks and blocked by email clients", n.Data))
		case "form":
			// Forms in emails can be phishing vectors.
			action := c.getAttr(n, "action")
			issue := "Suspicious <form> tag detected"
			if action != "" {
				issue += fmt.Sprintf(" with action='%s'", action)
			}
			results.HarmfullIssues = append(results.HarmfullIssues, issue+" - forms can be phishing vectors and are often blocked")
		case "base":
			// Base tag can be used for phishing by redirecting relative URLs.
			href := c.getAttr(n, "href")
			issue := "Potentially dangerous <base> tag detected"
			if href != "" {
				issue += fmt.Sprintf(" with href='%s'", href)
			}
			results.HarmfullIssues = append(results.HarmfullIssues, issue+" - can redirect all relative URLs")
		case "meta":
			// Check for suspicious meta refresh redirects.
			httpEquiv := c.getAttr(n, "http-equiv")
			if strings.ToLower(httpEquiv) == "refresh" {
				content := c.getAttr(n, "content")
				results.HarmfullIssues = append(results.HarmfullIssues, fmt.Sprintf("Suspicious <meta http-equiv='refresh'> tag detected with content='%s' - can be used for phishing redirects", content))
			}
		case "link":
			// External stylesheets: rendering/privacy concern, not dangerous,
			// so recorded in ContentIssues rather than HarmfullIssues.
			rel := c.getAttr(n, "rel")
			href := c.getAttr(n, "href")
			if strings.Contains(strings.ToLower(rel), "stylesheet") && href != "" {
				if strings.HasPrefix(href, "http://") || strings.HasPrefix(href, "https://") {
					results.ContentIssues = append(results.ContentIssues, fmt.Sprintf("External stylesheet link detected: %s - may cause rendering issues or privacy concerns", href))
				}
			}
		}
	}
	// Recurse into children regardless of the current node's type.
	for child := n.FirstChild; child != nil; child = child.NextSibling {
		c.traverseHTML(child, results)
	}
}
// getAttr returns the value of the named attribute on n, or "" when absent.
func (c *ContentAnalyzer) getAttr(n *html.Node, key string) string {
	for i := range n.Attr {
		if n.Attr[i].Key == key {
			return n.Attr[i].Val
		}
	}
	return ""
}
// isUnsubscribeLink reports whether a link looks like an unsubscribe link,
// judged by keywords in either its href or its visible anchor text.
func (c *ContentAnalyzer) isUnsubscribeLink(href string, node *html.Node) bool {
	keywords := []string{"unsubscribe", "opt-out", "optout", "remove", "list-unsubscribe"}
	matchesAny := func(haystack string) bool {
		for _, kw := range keywords {
			if strings.Contains(haystack, kw) {
				return true
			}
		}
		return false
	}

	// Check the href first; only extract the anchor text when needed.
	if matchesAny(strings.ToLower(href)) {
		return true
	}
	return matchesAny(strings.ToLower(c.getNodeText(node)))
}
// getNodeText extracts the concatenated text content of a node and its
// descendants. Uses a strings.Builder instead of repeated string
// concatenation, which was quadratic on deeply nested nodes.
func (c *ContentAnalyzer) getNodeText(n *html.Node) string {
	var sb strings.Builder
	var walk func(*html.Node)
	walk = func(node *html.Node) {
		if node.Type == html.TextNode {
			sb.WriteString(node.Data)
			return
		}
		for child := node.FirstChild; child != nil; child = child.NextSibling {
			walk(child)
		}
	}
	walk(n)
	return sb.String()
}
// validateLink parses urlStr, applies the suspicion heuristics, and probes
// HTTP(S) links with a HEAD request to record their status. Network failures
// are reported as warnings rather than hard errors.
func (c *ContentAnalyzer) validateLink(urlStr string) LinkCheck {
	result := LinkCheck{
		URL:    urlStr,
		IsSafe: true,
	}

	parsed, parseErr := url.Parse(urlStr)
	if parseErr != nil {
		result.Valid = false
		result.Error = fmt.Sprintf("Invalid URL: %v", parseErr)
		return result
	}

	// Safety heuristics run for every parseable URL, whatever its scheme.
	if c.isSuspiciousURL(urlStr, parsed) {
		result.IsSafe = false
		result.Warning = "URL appears suspicious (obfuscated, shortened, or unusual)"
	}

	// Non-HTTP(S) schemes (mailto:, tel:, ...) are accepted without probing.
	if parsed.Scheme != "http" && parsed.Scheme != "https" {
		result.Valid = true
		return result
	}

	// Probe the link with a bounded-time HEAD request.
	ctx, cancel := context.WithTimeout(context.Background(), c.Timeout)
	defer cancel()

	req, reqErr := http.NewRequestWithContext(ctx, "HEAD", urlStr, nil)
	if reqErr != nil {
		result.Valid = false
		result.Error = fmt.Sprintf("Failed to create request: %v", reqErr)
		return result
	}
	// Identify ourselves politely.
	req.Header.Set("User-Agent", "happyDeliver/1.0 (Email Deliverability Tester)")

	resp, doErr := c.httpClient.Do(req)
	if doErr != nil {
		// External hosts may legitimately time out or refuse connections, so
		// this is a warning, not a hard failure.
		result.Valid = true
		result.Status = 0
		result.Warning = fmt.Sprintf("Could not verify link: %v", doErr)
		return result
	}
	defer resp.Body.Close()

	result.Status = resp.StatusCode
	result.Valid = true
	if resp.StatusCode >= 400 {
		result.Error = fmt.Sprintf("Link returns %d status", resp.StatusCode)
	}
	return result
}
// linkTextDomainRegexp finds domain-like patterns in link text, e.g.
// "example.com", "www.example.com", "http://example.com". Package scope so
// it is compiled once instead of on every call.
var linkTextDomainRegexp = regexp.MustCompile(`(?i)(?:https?://)?(?:www\.)?([a-z0-9][-a-z0-9]*\.)+[a-z]{2,}`)

// genericLinkTexts are common anchor texts that should never trigger a
// domain-misalignment warning.
var genericLinkTexts = []string{
	"click here", "read more", "learn more", "download", "subscribe",
	"unsubscribe", "view online", "view in browser", "click", "here",
	"update", "verify", "confirm", "continue", "get started",
	// mailto-specific generic texts
	"email us", "contact us", "send email", "get in touch", "reach out",
	"contact", "email", "write to us",
}

// hasDomainMisalignment checks if the link text contains a different domain
// than the actual URL. This is a common phishing technique (e.g. the text
// shows "paypal.com" but the link goes to "evil.com").
//
// Fix over the previous version: the subdomain comparison now requires a "."
// label boundary. The old bare strings.HasSuffix(actualDomain, textDomain)
// accepted "evilpaypal.com" as a match for text "paypal.com".
func (c *ContentAnalyzer) hasDomainMisalignment(href, linkText string) bool {
	parsedURL, err := url.Parse(href)
	if err != nil {
		return false
	}

	// Extract the actual destination domain based on scheme.
	var actualDomain string
	switch parsedURL.Scheme {
	case "mailto":
		// Format: mailto:user@domain.com or mailto:user@domain.com?subject=...
		mailtoAddr := parsedURL.Opaque
		if idx := strings.Index(mailtoAddr, "?"); idx != -1 {
			mailtoAddr = mailtoAddr[:idx]
		}
		mailtoAddr = strings.TrimSpace(strings.ToLower(mailtoAddr))
		idx := strings.Index(mailtoAddr, "@")
		if idx == -1 {
			return false // Invalid mailto
		}
		actualDomain = mailtoAddr[idx+1:]
	case "http", "https":
		if parsedURL.Host == "" {
			return false
		}
		// Remove the port, if present.
		actualDomain = parsedURL.Host
		if idx := strings.LastIndex(actualDomain, ":"); idx != -1 {
			actualDomain = actualDomain[:idx]
		}
		actualDomain = strings.ToLower(actualDomain)
	default:
		// Skip checks for other URL schemes (tel:, etc.).
		return false
	}

	// Normalize the link text; very short or empty text cannot hold a domain.
	linkText = strings.ToLower(strings.TrimSpace(linkText))
	if linkText == "" || len(linkText) < 4 {
		return false
	}
	for _, generic := range genericLinkTexts {
		if linkText == generic {
			return false
		}
	}

	matches := linkTextDomainRegexp.FindAllString(linkText, -1)
	if len(matches) == 0 {
		return false
	}

	// Compare every domain-like pattern found in the text with the actual
	// destination domain.
	for _, textDomain := range matches {
		textDomain = strings.ToLower(textDomain)
		textDomain = strings.TrimPrefix(textDomain, "http://")
		textDomain = strings.TrimPrefix(textDomain, "https://")
		textDomain = strings.TrimPrefix(textDomain, "www.")
		if idx := strings.Index(textDomain, "/"); idx != -1 {
			textDomain = textDomain[:idx]
		}

		if textDomain == actualDomain {
			continue
		}
		// Accept when the actual URL is a subdomain of the domain shown in
		// the text (text "example.com" linking to "mail.example.com"). The
		// "." prefix enforces a label boundary.
		if strings.HasSuffix(actualDomain, "."+textDomain) {
			continue
		}
		// Otherwise compare registrable base domains (last two labels).
		textParts := strings.Split(textDomain, ".")
		actualParts := strings.Split(actualDomain, ".")
		if len(textParts) >= 2 && len(actualParts) >= 2 {
			textBase := strings.Join(textParts[len(textParts)-2:], ".")
			actualBase := strings.Join(actualParts[len(actualParts)-2:], ".")
			if textBase != actualBase {
				return true // Domain mismatch detected!
			}
			continue
		}
		return true // Domain mismatch detected!
	}
	return false
}
// isSuspiciousURL applies heuristics to spot obfuscated or shady URLs:
// raw IP hosts, known URL shorteners, deep subdomain nesting, "@" tricks,
// and characters that never belong in a hostname.
func (c *ContentAnalyzer) isSuspiciousURL(urlStr string, parsedURL *url.URL) bool {
	// mailto: links carry no host worth checking.
	if parsedURL.Scheme == "mailto" {
		return false
	}

	host := parsedURL.Host
	switch {
	case c.isIPAddress(host):
		// Raw IP address instead of a domain name.
		return true
	case len(strings.Split(host, ".")) > 4:
		// Excessive subdomain nesting (possible obfuscation).
		return true
	case strings.Contains(urlStr, "@"):
		// user@host URL form, a classic phishing trick.
		return true
	case strings.ContainsAny(host, "[]()<>"):
		// Characters that should never appear in a hostname.
		return true
	}

	// Well-known URL shortener services.
	knownShorteners := []string{
		"bit.ly", "tinyurl.com", "goo.gl", "ow.ly", "t.co",
		"buff.ly", "is.gd", "bl.ink", "short.io",
	}
	for _, shortener := range knownShorteners {
		if strings.EqualFold(host, shortener) {
			return true
		}
	}
	return false
}
// isIPAddress checks if a host string is an IP address (IPv4 dotted quad or,
// heuristically, anything containing a colon for IPv6).
//
// Fix over the previous version: each IPv4 octet must now be 1-3 digits and
// within 0-255. The old code accepted empty octets ("..." was an "IP") and
// out-of-range values like "999.999.999.999" or "1234.5.6.7".
func (c *ContentAnalyzer) isIPAddress(host string) bool {
	// Remove the port, if present.
	if idx := strings.LastIndex(host, ":"); idx != -1 {
		host = host[:idx]
	}
	// IPv4: exactly four octets, each 1-3 digits with value <= 255.
	parts := strings.Split(host, ".")
	if len(parts) == 4 {
		for _, part := range parts {
			if len(part) == 0 || len(part) > 3 {
				return false
			}
			value := 0
			for _, ch := range part {
				if !unicode.IsDigit(ch) {
					return false
				}
				value = value*10 + int(ch-'0')
			}
			if value > 255 {
				return false
			}
		}
		return true
	}
	// Heuristic for IPv6: a remaining colon implies an IPv6 literal.
	if strings.Contains(host, ":") {
		return true
	}
	return false
}
// extractTextFromHTML renders HTML to plain text, skipping the contents of
// <script> and <style> elements. Returns "" when the HTML cannot be parsed.
func (c *ContentAnalyzer) extractTextFromHTML(htmlContent string) string {
	root, err := html.Parse(strings.NewReader(htmlContent))
	if err != nil {
		return ""
	}

	var sb strings.Builder
	var visit func(*html.Node)
	visit = func(node *html.Node) {
		if node.Type == html.TextNode {
			sb.WriteString(" ")
			sb.WriteString(node.Data)
		}
		// Do not descend into script or style elements.
		if node.Type == html.ElementNode && (node.Data == "script" || node.Data == "style") {
			return
		}
		for child := node.FirstChild; child != nil; child = child.NextSibling {
			visit(child)
		}
	}
	visit(root)

	return strings.TrimSpace(sb.String())
}
// calculateTextPlainConsistency scores how similar the plain text part is to
// the text rendered from the HTML part, as a word-overlap ratio in [0, 1].
func (c *ContentAnalyzer) calculateTextPlainConsistency(plainText, htmlText string) float32 {
	// Normalize both versions and split into words.
	plainWords := strings.Fields(c.normalizeText(plainText))
	htmlWords := strings.Fields(c.normalizeText(c.extractTextFromHTML(htmlText)))
	if len(plainWords) == 0 || len(htmlWords) == 0 {
		return 0.0
	}

	// Multiset word counts for each side.
	plainCounts := make(map[string]int, len(plainWords))
	for _, w := range plainWords {
		plainCounts[w]++
	}
	htmlCounts := make(map[string]int, len(htmlWords))
	for _, w := range htmlWords {
		htmlCounts[w]++
	}

	// Multiset intersection size: per word, the smaller of the two counts.
	overlap := 0
	for w, pc := range plainCounts {
		if hc, ok := htmlCounts[w]; ok {
			overlap += min(pc, hc)
		}
	}

	// Normalize by the average word count of the two texts, capped at 1.0
	// so a perfect match scores exactly 1.0.
	avg := float32(len(plainWords)+len(htmlWords)) / 2.0
	ratio := float32(overlap) / avg
	if ratio > 1.0 {
		ratio = 1.0
	}
	return ratio
}
// whitespaceRegexp collapses runs of whitespace. Compiled once at package
// scope; the previous version recompiled it on every normalizeText call.
var whitespaceRegexp = regexp.MustCompile(`\s+`)

// normalizeText normalizes text for comparison: lowercase, trimmed, and with
// internal whitespace runs collapsed to single spaces.
func (c *ContentAnalyzer) normalizeText(text string) string {
	text = strings.ToLower(text)
	text = strings.TrimSpace(text)
	return whitespaceRegexp.ReplaceAllString(text, " ")
}
// GenerateContentAnalysis creates structured content analysis from results.
// Returns nil when results is nil.
//
// Fix over the previous version: pointers stored in the API structs
// (Location, Src, AltText) now point at per-iteration copies. The old code
// took the address of the range loop variable, which before Go 1.22 is a
// single reused variable — every stored pointer aliased the last element.
func (c *ContentAnalyzer) GenerateContentAnalysis(results *ContentResults) *api.ContentAnalysis {
	if results == nil {
		return nil
	}
	analysis := &api.ContentAnalysis{
		HasHtml:            api.PtrTo(results.HTMLContent != ""),
		HasPlaintext:       api.PtrTo(results.TextContent != ""),
		HasUnsubscribeLink: api.PtrTo(results.HasUnsubscribe),
	}

	// Text-to-image ratio (inverse of the internal image-to-text measure).
	if len(results.Images) > 0 && results.HTMLContent != "" {
		textLen := float32(len(c.extractTextFromHTML(results.HTMLContent)))
		if textLen > 0 {
			ratio := textLen / float32(len(results.Images))
			analysis.TextToImageRatio = &ratio
		}
	}

	// Build HTML issues.
	htmlIssues := []api.ContentIssue{}

	// HTML parsing errors.
	if !results.HTMLValid && len(results.HTMLErrors) > 0 {
		for _, errMsg := range results.HTMLErrors {
			htmlIssues = append(htmlIssues, api.ContentIssue{
				Type:     api.BrokenHtml,
				Severity: api.ContentIssueSeverityHigh,
				Message:  errMsg,
				Advice:   api.PtrTo("Fix HTML structure errors to improve email rendering across clients"),
			})
		}
	}

	// Missing alt text.
	if len(results.Images) > 0 {
		missingAltCount := 0
		for _, img := range results.Images {
			if !img.HasAlt {
				missingAltCount++
			}
		}
		if missingAltCount > 0 {
			htmlIssues = append(htmlIssues, api.ContentIssue{
				Type:     api.MissingAlt,
				Severity: api.ContentIssueSeverityMedium,
				Message:  fmt.Sprintf("%d image(s) missing alt attributes", missingAltCount),
				Advice:   api.PtrTo("Add descriptive alt text to all images for better accessibility and deliverability"),
			})
		}
	}

	// Excessive images.
	if results.ImageTextRatio > 10.0 {
		htmlIssues = append(htmlIssues, api.ContentIssue{
			Type:     api.ExcessiveImages,
			Severity: api.ContentIssueSeverityMedium,
			Message:  "Email is excessively image-heavy",
			Advice:   api.PtrTo("Reduce the number of images relative to text content"),
		})
	}

	// Suspicious URLs.
	for _, suspURL := range results.SuspiciousURLs {
		// Copy before taking the address (see the function comment).
		loc := suspURL
		htmlIssues = append(htmlIssues, api.ContentIssue{
			Type:     api.SuspiciousLink,
			Severity: api.ContentIssueSeverityHigh,
			Message:  "Suspicious URL detected",
			Location: &loc,
			Advice:   api.PtrTo("Avoid URL shorteners, IP addresses, and obfuscated URLs in emails"),
		})
	}

	// Harmful HTML tags.
	for _, harmfulIssue := range results.HarmfullIssues {
		htmlIssues = append(htmlIssues, api.ContentIssue{
			Type:     api.DangerousHtml,
			Severity: api.ContentIssueSeverityCritical,
			Message:  harmfulIssue,
			Advice:   api.PtrTo("Remove dangerous HTML tags like <script>, <iframe>, <object>, <embed>, <applet>, <form>, and <base> from email content"),
		})
	}

	// General content issues (like external stylesheets).
	for _, contentIssue := range results.ContentIssues {
		htmlIssues = append(htmlIssues, api.ContentIssue{
			Type:     api.BrokenHtml,
			Severity: api.ContentIssueSeverityLow,
			Message:  contentIssue,
			Advice:   api.PtrTo("Use inline CSS instead of external stylesheets for better email compatibility"),
		})
	}

	if len(htmlIssues) > 0 {
		analysis.HtmlIssues = &htmlIssues
	}

	// Convert links.
	if len(results.Links) > 0 {
		links := make([]api.LinkCheck, 0, len(results.Links))
		for _, link := range results.Links {
			status := api.Valid
			if link.Status >= 400 {
				status = api.Broken
			} else if !link.IsSafe {
				status = api.Suspicious
			} else if link.Warning != "" {
				status = api.Timeout
			}
			apiLink := api.LinkCheck{
				Url:    link.URL,
				Status: status,
			}
			if link.Status > 0 {
				apiLink.HttpCode = api.PtrTo(link.Status)
			}
			// NOTE(review): this reuses the generic suspicion heuristic, so
			// IsShortened is also true for e.g. IP-hosted URLs, not only
			// shorteners — confirm whether a dedicated check is wanted.
			parsedURL, err := url.Parse(link.URL)
			if err == nil {
				isShortened := c.isSuspiciousURL(link.URL, parsedURL)
				apiLink.IsShortened = api.PtrTo(isShortened)
			}
			links = append(links, apiLink)
		}
		analysis.Links = &links
	}

	// Convert images.
	if len(results.Images) > 0 {
		images := make([]api.ImageCheck, 0, len(results.Images))
		for _, img := range results.Images {
			apiImg := api.ImageCheck{
				HasAlt: img.HasAlt,
			}
			// Copy before taking addresses (see the function comment).
			if img.Src != "" {
				src := img.Src
				apiImg.Src = &src
			}
			if img.AltText != "" {
				alt := img.AltText
				apiImg.AltText = &alt
			}
			// Simple heuristic placeholder: tracking pixels are typically
			// 1x1, but dimensions are not inspected here.
			apiImg.IsTrackingPixel = api.PtrTo(false)
			images = append(images, apiImg)
		}
		analysis.Images = &images
	}

	// Unsubscribe methods.
	if results.HasUnsubscribe {
		methods := []api.ContentAnalysisUnsubscribeMethods{api.Link}
		analysis.UnsubscribeMethods = &methods
	}

	return analysis
}
// CalculateContentScore calculates the content score, clamped to 0-100, and
// returns it with its letter grade. Budget: 10 base + 10 HTML/text validity
// + 10 plain-text alternative + 20-25 links + 15 images + 15 text consistency
// + 15 image ratio, minus penalties for suspicious URLs and harmful tags.
func (c *ContentAnalyzer) CalculateContentScore(results *ContentResults) (int, string) {
	if results == nil {
		return 0, ""
	}
	// Base score.
	var score int = 10
	// Valid HTML, or a non-multipart plain-text-only email (10 points).
	if results.HTMLValid || (!results.IsMultipart && results.HasPlaintext()) {
		score += 10
	}
	// Plain text alternative present (10 points).
	if results.HasPlaintext() {
		score += 10
	}
	// Links: up to 20 points scaled by the fraction of working links, or the
	// full 25 when the email has no links at all.
	if len(results.Links) > 0 {
		brokenLinks := 0
		for _, link := range results.Links {
			if link.Status >= 400 {
				brokenLinks++
			}
		}
		score += 20 * (len(results.Links) - brokenLinks) / len(results.Links)
		// Too many links: 10 point penalty.
		if len(results.Links) > 30 {
			score -= 10
		}
	} else {
		// No links is better, less suspicious.
		score += 25
	}
	// Images: up to 15 points scaled by the fraction carrying alt text.
	if len(results.Images) > 0 {
		noAltCount := 0
		for _, img := range results.Images {
			if !img.HasAlt {
				noAltCount++
			}
		}
		score += 15 * (len(results.Images) - noAltCount) / len(results.Images)
	} else {
		// No images is OK.
		score += 15
	}
	// Text consistency (15 points).
	if results.TextPlainRatio >= 0.3 {
		score += 15
	}
	// Image ratio: 15 points when low, 7 when moderate, 0 when image-heavy.
	if results.ImageTextRatio <= 5.0 {
		score += 15
	} else if results.ImageTextRatio <= 10.0 {
		score += 7
	}
	// Penalize suspicious URLs (deduct up to 5 points).
	if len(results.SuspiciousURLs) > 0 {
		score -= min(len(results.SuspiciousURLs), 5)
	}
	// Penalize harmful HTML tags (20 points per harmful tag, capped at 40).
	if len(results.HarmfullIssues) > 0 {
		score -= min(len(results.HarmfullIssues)*20, 40)
	}
	// Ensure score is between 0 and 100.
	if score < 0 {
		score = 0
	}
	if score > 100 {
		score = 100
	}
	return score, ScoreToGrade(score)
}