diff --git a/pkg/analyzer/content.go b/pkg/analyzer/content.go index 87c423f..8f5d059 100644 --- a/pkg/analyzer/content.go +++ b/pkg/analyzer/content.go @@ -659,30 +659,47 @@ func (c *ContentAnalyzer) calculateTextPlainConsistency(plainText, htmlText stri return 0.0 } - // Count common words - commonWords := 0 - plainWordSet := make(map[string]bool) + // Count common words by building sets + plainWordSet := make(map[string]int) for _, word := range plainWords { - plainWordSet[word] = true + plainWordSet[word]++ } + htmlWordSet := make(map[string]int) for _, word := range htmlWords { - if plainWordSet[word] { - commonWords++ + htmlWordSet[word]++ + } + + // Count matches: for each unique word, count minimum occurrences in both texts + commonWords := 0 + for word, plainCount := range plainWordSet { + if htmlCount, exists := htmlWordSet[word]; exists { + // Count the minimum occurrences between both texts + if plainCount < htmlCount { + commonWords += plainCount + } else { + commonWords += htmlCount + } } } - // Calculate ratio (Jaccard similarity approximation) - maxWords := len(plainWords) - if len(htmlWords) > maxWords { - maxWords = len(htmlWords) - } - - if maxWords == 0 { + // Calculate ratio using total words from both texts (union approach) + // This provides a balanced measure: perfect match = 1.0, partial overlap = 0.3-0.8 + totalWords := len(plainWords) + len(htmlWords) + if totalWords == 0 { return 0.0 } - return float32(commonWords) / float32(maxWords) + // Divide by average word count for better scoring + avgWords := float32(totalWords) / 2.0 + ratio := float32(commonWords) / avgWords + + // Cap at 1.0 for perfect matches + if ratio > 1.0 { + ratio = 1.0 + } + + return ratio } // normalizeText normalizes text for comparison diff --git a/web/package-lock.json b/web/package-lock.json index 01d6a6d..9062418 100644 --- a/web/package-lock.json +++ b/web/package-lock.json @@ -2776,9 +2776,9 @@ } }, "node_modules/globals": { - "version": "16.4.0", - "resolved": "https://registry.npmjs.org/globals/-/globals-16.4.0.tgz", - "integrity": "sha512-ob/2LcVVaVGCYN+r14cnwnoDPUufjiYgSqRhiFD0Q1iI4Odora5RE8Iv1D24hAz5oMophRGkGz+yuvQmmUMnMw==", + "version": "16.5.0", + "resolved": "https://registry.npmjs.org/globals/-/globals-16.5.0.tgz", + "integrity": "sha512-c/c15i26VrJ4IRt5Z89DnIzCGDn9EcebibhAOjw5ibqEHsE1wLUgkPn9RDmNcUKyU87GeaL633nyJ+pplFR2ZQ==", "dev": true, "license": "MIT", "engines": {