Fix calculateTextPlainConsistency algorithm

This commit is contained in:
nemunaire 2025-11-01 18:08:06 +07:00 committed by nemunaire
commit e05c6d0bc2

View file

@ -659,30 +659,47 @@ func (c *ContentAnalyzer) calculateTextPlainConsistency(plainText, htmlText stri
return 0.0 return 0.0
} }
// Count common words // Count common words by building sets
commonWords := 0 plainWordSet := make(map[string]int)
plainWordSet := make(map[string]bool)
for _, word := range plainWords { for _, word := range plainWords {
plainWordSet[word] = true plainWordSet[word]++
} }
htmlWordSet := make(map[string]int)
for _, word := range htmlWords { for _, word := range htmlWords {
if plainWordSet[word] { htmlWordSet[word]++
commonWords++ }
// Count matches: for each unique word, count minimum occurrences in both texts
commonWords := 0
for word, plainCount := range plainWordSet {
if htmlCount, exists := htmlWordSet[word]; exists {
// Count the minimum occurrences between both texts
if plainCount < htmlCount {
commonWords += plainCount
} else {
commonWords += htmlCount
}
} }
} }
// Calculate ratio (Jaccard similarity approximation) // Calculate ratio as the Sørensen–Dice coefficient: shared words (multiset intersection) over the average word count of both texts
maxWords := len(plainWords) // This provides a balanced measure: perfect match = 1.0, partial overlap = 0.3-0.8
if len(htmlWords) > maxWords { totalWords := len(plainWords) + len(htmlWords)
maxWords = len(htmlWords) if totalWords == 0 {
}
if maxWords == 0 {
return 0.0 return 0.0
} }
return float32(commonWords) / float32(maxWords) // Divide by average word count for better scoring
avgWords := float32(totalWords) / 2.0
ratio := float32(commonWords) / avgWords
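// Defensive cap at 1.0: commonWords sums per-word minimum counts, so a perfect match yields exactly 1.0 and the ratio should never exceed it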
if ratio > 1.0 {
ratio = 1.0
}
return ratio
} }
// normalizeText normalizes text for comparison // normalizeText normalizes text for comparison