diff --git a/api/openapi.yaml b/api/openapi.yaml index 2be8dca..4f9a39d 100644 --- a/api/openapi.yaml +++ b/api/openapi.yaml @@ -392,7 +392,7 @@ components: properties: type: type: string - enum: [broken_html, missing_alt, excessive_images, obfuscated_url, suspicious_link] + enum: [broken_html, missing_alt, excessive_images, obfuscated_url, suspicious_link, dangerous_html] description: Type of content issue example: "missing_alt" severity: diff --git a/pkg/analyzer/content.go b/pkg/analyzer/content.go index 7964693..74f6b2a 100644 --- a/pkg/analyzer/content.go +++ b/pkg/analyzer/content.go @@ -63,6 +63,7 @@ func NewContentAnalyzer(timeout time.Duration) *ContentAnalyzer { // ContentResults represents content analysis results type ContentResults struct { + IsMultipart bool HTMLValid bool HTMLErrors []string Links []LinkCheck @@ -75,6 +76,12 @@ type ContentResults struct { ImageTextRatio float32 // Ratio of images to text SuspiciousURLs []string ContentIssues []string + HarmfullIssues []string +} + +// HasPlaintext returns true if the email has plain text content +func (r *ContentResults) HasPlaintext() bool { + return r.TextContent != "" } // LinkCheck represents a link validation result @@ -101,6 +108,8 @@ type ImageCheck struct { func (c *ContentAnalyzer) AnalyzeContent(email *EmailMessage) *ContentResults { results := &ContentResults{} + results.IsMultipart = len(email.Parts) > 1 + // Get HTML and text parts htmlParts := email.GetHTMLParts() textParts := email.GetTextParts() @@ -117,16 +126,57 @@ func (c *ContentAnalyzer) AnalyzeContent(email *EmailMessage) *ContentResults { for _, part := range textParts { results.TextContent += part.Content } + // Extract and validate links from plain text + c.analyzeTextLinks(results.TextContent, results) } // Check plain text/HTML consistency if len(htmlParts) > 0 && len(textParts) > 0 { results.TextPlainRatio = c.calculateTextPlainConsistency(results.TextContent, results.HTMLContent) + } else if !results.IsMultipart { + results.TextPlainRatio = 1.0 } return results } +// analyzeTextLinks extracts and validates URLs from plain text +func (c *ContentAnalyzer) analyzeTextLinks(textContent string, results *ContentResults) { + // Regular expression to match URLs in plain text + // Matches http://, https://, and www. URLs + urlRegex := regexp.MustCompile(`(?i)\b(?:https?://|www\.)[^\s<>"{}|\\^\[\]` + "`" + `]+`) + + matches := urlRegex.FindAllString(textContent, -1) + + for _, match := range matches { + // Normalize URL (add http:// if missing) + urlStr := match + if strings.HasPrefix(strings.ToLower(urlStr), "www.") { + urlStr = "http://" + urlStr + } + + // Check if this URL already exists in results.Links (from HTML analysis) + exists := false + for _, link := range results.Links { + if link.URL == urlStr { + exists = true + break + } + } + + // Only validate if not already checked + if !exists { + linkCheck := c.validateLink(urlStr) + results.Links = append(results.Links, linkCheck) + + // Check for suspicious URLs + if !linkCheck.IsSafe { + results.SuspiciousURLs = append(results.SuspiciousURLs, urlStr) + } + } + } +} + // analyzeHTML parses and analyzes HTML content func (c *ContentAnalyzer) analyzeHTML(htmlContent string, results *ContentResults) { results.HTMLContent = htmlContent @@ -195,6 +245,59 @@ func (c *ContentAnalyzer) traverseHTML(n *html.Node, results *ContentResults) { } results.Images = append(results.Images, imageCheck) + + case "script": + // JavaScript in emails is a security risk and typically blocked + results.HarmfullIssues = append(results.HarmfullIssues, "Dangerous