Improve content analyzing and reporting

2025-10-22 16:16:32 +07:00 · 2025-10-22 16:16:32 +07:00 · 8247e74dd5
commit 8247e74dd5
parent ebb2d992d7
2 changed files with 151 additions and 15 deletions
--- a/api/openapi.yaml
+++ b/api/openapi.yaml
@ -392,7 +392,7 @@ components:
      properties:
        type:
          type: string
-          enum: [broken_html, missing_alt, excessive_images, obfuscated_url, suspicious_link]
+          enum: [broken_html, missing_alt, excessive_images, obfuscated_url, suspicious_link, dangerous_html]
          description: Type of content issue
          example: "missing_alt"
        severity:
--- a/pkg/analyzer/content.go
+++ b/pkg/analyzer/content.go
@ -63,6 +63,7 @@ func NewContentAnalyzer(timeout time.Duration) *ContentAnalyzer {

 // ContentResults represents content analysis results
 type ContentResults struct {
+	IsMultipart      bool
 	HTMLValid        bool
 	HTMLErrors       []string
 	Links            []LinkCheck
@ -75,6 +76,12 @@ type ContentResults struct {
 	ImageTextRatio   float32 // Ratio of images to text
 	SuspiciousURLs   []string
 	ContentIssues    []string
+	HarmfullIssues   []string
+}
+
+// HasPlaintext reports whether any plain-text content was collected in TextContent.
+func (r *ContentResults) HasPlaintext() bool {
+	return len(r.TextContent) > 0
 }

 // LinkCheck represents a link validation result
@ -101,6 +108,8 @@ type ImageCheck struct {
 func (c *ContentAnalyzer) AnalyzeContent(email *EmailMessage) *ContentResults {
 	results := &ContentResults{}

+	results.IsMultipart = len(email.Parts) > 1
+
 	// Get HTML and text parts
 	htmlParts := email.GetHTMLParts()
 	textParts := email.GetTextParts()
@ -117,16 +126,57 @@ func (c *ContentAnalyzer) AnalyzeContent(email *EmailMessage) *ContentResults {
 		for _, part := range textParts {
 			results.TextContent += part.Content
 		}
+		// Extract and validate links from plain text
+		c.analyzeTextLinks(results.TextContent, results)
 	}

 	// Check plain text/HTML consistency
 	if len(htmlParts) > 0 && len(textParts) > 0 {
 		results.TextPlainRatio = c.calculateTextPlainConsistency(results.TextContent, results.HTMLContent)
+	} else if !results.IsMultipart {
+		results.TextPlainRatio = 1.0
 	}

 	return results
 }

+// textURLPattern matches http://, https:// and bare www. URLs embedded in
+// plain text. It is compiled once at package scope instead of on every call.
+var textURLPattern = regexp.MustCompile(`(?i)\b(?:https?://|www\.)[^\s<>"{}|\\^\[\]` + "`" + `]+`)
+
+// trimTrailingURLPunctuation strips sentence punctuation that directly follows
+// a URL in running text. A closing parenthesis is removed only when the URL has
+// no matching opening one, so "https://en.wikipedia.org/wiki/Go_(game)" is kept.
+func trimTrailingURLPunctuation(raw string) string {
+	for len(raw) > 0 {
+		switch raw[len(raw)-1] {
+		case '.', ',', ';', ':', '!', '?', '\'':
+			// Always treated as surrounding prose.
+		case ')':
+			if strings.Count(raw, "(") >= strings.Count(raw, ")") {
+				return raw
+			}
+		default:
+			return raw
+		}
+		raw = raw[:len(raw)-1]
+	}
+	return raw
+}
+
+// analyzeTextLinks extracts URLs from plain-text content, validates each one
+// that is not already present in results.Links, and appends the outcome to
+// results.Links. URLs whose check comes back with IsSafe == false are also
+// recorded in results.SuspiciousURLs.
+//
+// textContent is the plain-text body to scan; results is updated in place.
+func (c *ContentAnalyzer) analyzeTextLinks(textContent string, results *ContentResults) {
+	// Index the links already checked (e.g. by the HTML analysis) so each URL
+	// is validated at most once without rescanning the slice for every match.
+	seen := make(map[string]struct{}, len(results.Links))
+	for _, link := range results.Links {
+		seen[link.URL] = struct{}{}
+	}
+
+	for _, match := range textURLPattern.FindAllString(textContent, -1) {
+		// "see https://example.com." matches with the final dot included;
+		// drop such punctuation so the URL is checked as the author wrote it
+		// and compares equal to the same link found in the HTML part.
+		urlStr := trimTrailingURLPunctuation(match)
+
+		lower := strings.ToLower(urlStr)
+		switch {
+		case strings.HasPrefix(lower, "www."):
+			// Normalize bare www. URLs by adding a scheme.
+			urlStr = "http://" + urlStr
+		case !strings.HasPrefix(lower, "http://") && !strings.HasPrefix(lower, "https://"):
+			// Trimming left no usable URL (e.g. the text was "www...").
+			continue
+		}
+
+		if _, dup := seen[urlStr]; dup {
+			continue
+		}
+
+		linkCheck := c.validateLink(urlStr)
+		results.Links = append(results.Links, linkCheck)
+
+		// Remember both the requested URL and the one stored on the check so
+		// later occurrences of either form are skipped.
+		seen[urlStr] = struct{}{}
+		seen[linkCheck.URL] = struct{}{}
+
+		if !linkCheck.IsSafe {
+			results.SuspiciousURLs = append(results.SuspiciousURLs, urlStr)
+		}
+	}
+}
+
 // analyzeHTML parses and analyzes HTML content
 func (c *ContentAnalyzer) analyzeHTML(htmlContent string, results *ContentResults) {
 	results.HTMLContent = htmlContent
@ -195,6 +245,59 @@ func (c *ContentAnalyzer) traverseHTML(n *html.Node, results *ContentResults) {
 			}

 			results.Images = append(results.Images, imageCheck)
+
+		case "script":
+			// JavaScript in emails is a security risk and typically blocked
+			results.HarmfullIssues = append(results.HarmfullIssues, "Dangerous <script> tag detected - JavaScript is blocked by most email clients")
+
+		case "iframe":
+			// Iframes are security risks and blocked by most email clients
+			src := c.getAttr(n, "src")
+			issue := "Dangerous <iframe> tag detected"
+			if src != "" {
+				issue += fmt.Sprintf(" with src='%s'", src)
+			}
+			results.HarmfullIssues = append(results.HarmfullIssues, issue+" - iframes are blocked by most email clients")
+
+		case "object", "embed", "applet":
+			// Legacy embedding tags, security risks
+			results.HarmfullIssues = append(results.HarmfullIssues, fmt.Sprintf("Dangerous <%s> tag detected - legacy embedding tags are security risks and blocked by email clients", n.Data))
+
+		case "form":
+			// Forms in emails can be phishing vectors
+			action := c.getAttr(n, "action")
+			issue := "Suspicious <form> tag detected"
+			if action != "" {
+				issue += fmt.Sprintf(" with action='%s'", action)
+			}
+			results.HarmfullIssues = append(results.HarmfullIssues, issue+" - forms can be phishing vectors and are often blocked")
+
+		case "base":
+			// Base tag can be used for phishing by redirecting relative URLs
+			href := c.getAttr(n, "href")
+			issue := "Potentially dangerous <base> tag detected"
+			if href != "" {
+				issue += fmt.Sprintf(" with href='%s'", href)
+			}
+			results.HarmfullIssues = append(results.HarmfullIssues, issue+" - can redirect all relative URLs")
+
+		case "meta":
+			// Check for suspicious meta redirects
+			httpEquiv := c.getAttr(n, "http-equiv")
+			if strings.ToLower(httpEquiv) == "refresh" {
+				content := c.getAttr(n, "content")
+				results.HarmfullIssues = append(results.HarmfullIssues, fmt.Sprintf("Suspicious <meta http-equiv='refresh'> tag detected with content='%s' - can be used for phishing redirects", content))
+			}
+
+		case "link":
+			// Check for external stylesheet links (potential privacy/tracking concerns)
+			rel := c.getAttr(n, "rel")
+			href := c.getAttr(n, "href")
+			if strings.Contains(strings.ToLower(rel), "stylesheet") && href != "" {
+				if strings.HasPrefix(href, "http://") || strings.HasPrefix(href, "https://") {
+					results.ContentIssues = append(results.ContentIssues, fmt.Sprintf("External stylesheet link detected: %s - may cause rendering issues or privacy concerns", href))
+				}
+			}
 		}
 	}

@ -288,7 +391,7 @@ func (c *ContentAnalyzer) validateLink(urlStr string) LinkCheck {
 	}

 	// Set a reasonable user agent
-	req.Header.Set("User-Agent", "HappyDeliver/1.0 (Email Deliverability Tester)")
+	req.Header.Set("User-Agent", "happyDeliver/1.0 (Email Deliverability Tester)")

 	resp, err := c.httpClient.Do(req)
 	if err != nil {
@ -325,7 +428,7 @@ func (c *ContentAnalyzer) isSuspiciousURL(urlStr string, parsedURL *url.URL) boo
 		"buff.ly", "is.gd", "bl.ink", "short.io",
 	}
 	for _, shortener := range shorteners {
-		if strings.Contains(strings.ToLower(parsedURL.Host), shortener) {
+		if strings.ToLower(parsedURL.Host) == shortener {
 			return true
 		}
 	}
@ -534,6 +637,26 @@ func (c *ContentAnalyzer) GenerateContentAnalysis(results *ContentResults) *api.
 		})
 	}

+	// Add harmful HTML tag issues
+	for _, harmfulIssue := range results.HarmfullIssues {
+		htmlIssues = append(htmlIssues, api.ContentIssue{
+			Type:     api.DangerousHtml,
+			Severity: api.ContentIssueSeverityCritical,
+			Message:  harmfulIssue,
+			Advice:   api.PtrTo("Remove dangerous HTML tags like <script>, <iframe>, <object>, <embed>, <applet>, <form>, and <base> from email content"),
+		})
+	}
+
+	// Add general content issues (like external stylesheets)
+	for _, contentIssue := range results.ContentIssues {
+		htmlIssues = append(htmlIssues, api.ContentIssue{
+			Type:     api.BrokenHtml,
+			Severity: api.ContentIssueSeverityLow,
+			Message:  contentIssue,
+			Advice:   api.PtrTo("Use inline CSS instead of external stylesheets for better email compatibility"),
+		})
+	}
+
 	if len(htmlIssues) > 0 {
 		analysis.HtmlIssues = &htmlIssues
 	}
@ -608,14 +731,19 @@ func (c *ContentAnalyzer) CalculateContentScore(results *ContentResults) int {
 		return 0
 	}

-	var score int = 0
+	var score int = 10

-	// HTML validity (10 points)
-	if results.HTMLValid {
+	// HTML validity or text alone (10 points)
+	if results.HTMLValid || (!results.IsMultipart && results.HasPlaintext()) {
 		score += 10
 	}

-	// Links (20 points)
+	// Requires plain text alternative (10 points)
+	if results.HasPlaintext() {
+		score += 10
+	}
+
+	// Links (25 points)
 	if len(results.Links) > 0 {
 		brokenLinks := 0
 		for _, link := range results.Links {
@ -626,9 +754,13 @@ func (c *ContentAnalyzer) CalculateContentScore(results *ContentResults) int {
 		if brokenLinks == 0 {
 			score += 20
 		}
+		// Too many links: 10-point penalty
+		if len(results.Links) > 30 {
+			score -= 10
+		}
 	} else {
-		// No links is neutral, give partial score
-		score += 10
+		// No links is better: less suspicious
+		score += 25
 	}

 	// Images (15 points)
@ -645,12 +777,7 @@ func (c *ContentAnalyzer) CalculateContentScore(results *ContentResults) int {
 			score += 7
 		}
 	} else {
-		// No images is neutral
-		score += 7
-	}
-
-	// Unsubscribe link (15 points)
-	if results.HasUnsubscribe {
+		// No images is Ok
 		score += 15
 	}

@ -675,6 +802,15 @@ func (c *ContentAnalyzer) CalculateContentScore(results *ContentResults) int {
 		score -= penalty
 	}

+	// Penalize harmful HTML tags (deduct 20 points per harmful tag, max 40 points)
+	if len(results.HarmfullIssues) > 0 {
+		penalty := len(results.HarmfullIssues) * 20
+		if penalty > 40 {
+			penalty = 40
+		}
+		score -= penalty
+	}
+
 	// Ensure score is between 0 and 100
 	if score < 0 {
 		score = 0