cli + tui to publish to leaflet (wip) & manage tasks, notes & watch/read lists 🍃
charm
leaflet
readability
golang
1package articles
2
import (
	"math"
	"regexp"
	"sort"
	"strings"

	"golang.org/x/net/html"
)
10
// ContentScore represents the score and metadata for a content node.
type ContentScore struct {
	Node            *html.Node // the DOM element this score describes
	Score           float64    // accumulated heuristic score; higher means more likely main content
	TextLength      int        // trimmed length (bytes) of all text under Node
	LinkDensity     float64    // ratio of anchor text length to total text length, in [0, 1]
	ParagraphCount  int        // number of <p> elements under Node (including Node itself)
	AncestorDepth   int        // number of parents between Node and the tree root
	ConfidenceLevel float64    // heuristic confidence in [0, 1]; see calculateConfidence
}
21
// Scorer implements Readability-style heuristic scoring for content extraction.
type Scorer struct {
	linkDensityWeight   float64 // multiplier applied to link density (negative: links lower the score)
	classWeightPositive float64 // bonus when class/id matches positivePattern
	classWeightNegative float64 // penalty when class/id matches negativePattern
	paragraphWeight     float64 // score contribution per contained paragraph
	ancestorDecayFactor float64 // per-level decay base used when propagating scores to ancestors
	minContentLength    int     // minimum trimmed text length for a candidate node
	minScore            float64 // minimum score for a candidate node
	positivePattern     *regexp.Regexp // class/id tokens suggesting main content
	negativePattern     *regexp.Regexp // class/id tokens suggesting chrome, ads, or navigation
	unlikelyPattern     *regexp.Regexp // class/id tokens that disqualify a node outright
}
35
36// NewScorer creates a new Scorer with default Readability.js-inspired weights.
37func NewScorer() *Scorer {
38 return &Scorer{
39 linkDensityWeight: -1.0,
40 classWeightPositive: 25.0,
41 classWeightNegative: -25.0,
42 paragraphWeight: 1.0,
43 ancestorDecayFactor: 0.5,
44 minContentLength: 140,
45 minScore: 20.0,
46
47 positivePattern: regexp.MustCompile(`(?i)(article|body|content|entry|hentry|h-entry|main|page|pagination|post|text|blog|story)`),
48 negativePattern: regexp.MustCompile(`(?i)(combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|outbrain|promo|related|scroll|share|shoutbox|sidebar|skyscraper|sponsor|shopping|tags|tool|widget|ad-|advertisement|breadcrumb|hidden|nav|menu|header)`),
49 unlikelyPattern: regexp.MustCompile(`(?i)(banner|cookie|popup|modal)`),
50 }
51}
52
53// ScoreNode calculates a content score for the given node based on multiple heuristics.
54// This implements the core Readability scoring algorithm.
55func (s *Scorer) ScoreNode(node *html.Node) *ContentScore {
56 if node == nil || node.Type != html.ElementNode {
57 return nil
58 }
59
60 score := &ContentScore{
61 Node: node,
62 Score: 0.0,
63 AncestorDepth: s.calculateDepth(node),
64 }
65
66 score.Score = s.getTagScore(node.Data)
67 score.Score += s.getClassIdScore(node)
68
69 score.TextLength = s.calculateTextLength(node)
70 score.LinkDensity = s.calculateLinkDensity(node)
71 score.ParagraphCount = s.countParagraphs(node)
72
73 score.Score += score.LinkDensity * s.linkDensityWeight
74 score.Score += float64(score.ParagraphCount) * s.paragraphWeight
75 score.Score += s.getTextLengthScore(score.TextLength)
76
77 score.ConfidenceLevel = s.calculateConfidence(score)
78 return score
79}
80
81// getTagScore returns a base score based on the HTML tag type.
82// Some tags are more likely to contain main content than others.
83func (s *Scorer) getTagScore(tagName string) float64 {
84 switch strings.ToLower(tagName) {
85 case "article":
86 return 30.0
87 case "section":
88 return 15.0
89 case "div":
90 return 5.0
91 case "main":
92 return 40.0
93 case "p":
94 return 3.0
95 case "pre", "td", "blockquote":
96 return 3.0
97 case "address", "ol", "ul", "dl", "dd", "dt", "li", "form":
98 return -3.0
99 case "h1", "h2", "h3", "h4", "h5", "h6", "th":
100 return -5.0
101 default:
102 return 0.0
103 }
104}
105
106// getClassIdScore analyzes class and ID attributes for positive/negative indicators.
107// Returns a positive score for content-like names, negative for navigation/ads.
108func (s *Scorer) getClassIdScore(node *html.Node) float64 {
109 score := 0.0
110 classID := s.getClassAndID(node)
111
112 if classID == "" {
113 return 0.0
114 }
115
116 if s.unlikelyPattern.MatchString(classID) {
117 return -50.0
118 }
119
120 if s.negativePattern.MatchString(classID) {
121 score += s.classWeightNegative
122 }
123
124 if s.positivePattern.MatchString(classID) {
125 score += s.classWeightPositive
126 }
127
128 return score
129}
130
131// getClassAndID concatenates class and ID attributes for pattern matching.
132func (s *Scorer) getClassAndID(node *html.Node) string {
133 var parts []string
134
135 for _, attr := range node.Attr {
136 if attr.Key == "class" || attr.Key == "id" {
137 parts = append(parts, attr.Val)
138 }
139 }
140
141 return strings.Join(parts, " ")
142}
143
144// calculateTextLength returns the total text length within the node.
145func (s *Scorer) calculateTextLength(node *html.Node) int {
146 text := s.getInnerText(node)
147 return len(strings.TrimSpace(text))
148}
149
150// calculateLinkDensity calculates the ratio of link text to total text.
151// Higher link density indicates navigation or related links, not main content.
152func (s *Scorer) calculateLinkDensity(node *html.Node) float64 {
153 totalText := s.getInnerText(node)
154 linkText := s.getLinkText(node)
155
156 totalLen := len(strings.TrimSpace(totalText))
157 linkLen := len(strings.TrimSpace(linkText))
158
159 if totalLen == 0 {
160 return 0.0
161 }
162
163 return float64(linkLen) / float64(totalLen)
164}
165
166// getInnerText extracts all text content from a node and its descendants.
167func (s *Scorer) getInnerText(node *html.Node) string {
168 var buf strings.Builder
169 s.extractText(node, &buf)
170 return buf.String()
171}
172
173// extractText recursively extracts text from a node tree.
174func (s *Scorer) extractText(node *html.Node, buf *strings.Builder) {
175 if node == nil {
176 return
177 }
178
179 if node.Type == html.TextNode {
180 buf.WriteString(node.Data)
181 buf.WriteString(" ")
182 return
183 }
184
185 if node.Type == html.ElementNode {
186 tag := strings.ToLower(node.Data)
187 if tag == "script" || tag == "style" || tag == "noscript" {
188 return
189 }
190 }
191
192 for child := node.FirstChild; child != nil; child = child.NextSibling {
193 s.extractText(child, buf)
194 }
195}
196
197// getLinkText extracts text from anchor tags only.
198func (s *Scorer) getLinkText(node *html.Node) string {
199 var buf strings.Builder
200 s.extractLinkText(node, &buf)
201 return buf.String()
202}
203
204// extractLinkText recursively extracts text from anchor tags.
205func (s *Scorer) extractLinkText(node *html.Node, buf *strings.Builder) {
206 if node == nil {
207 return
208 }
209
210 if node.Type == html.ElementNode && strings.ToLower(node.Data) == "a" {
211 s.extractText(node, buf)
212 return
213 }
214
215 for child := node.FirstChild; child != nil; child = child.NextSibling {
216 s.extractLinkText(child, buf)
217 }
218}
219
220// countParagraphs counts paragraph elements within the node.
221func (s *Scorer) countParagraphs(node *html.Node) int {
222 count := 0
223 s.walkParagraphs(node, &count)
224 return count
225}
226
227// walkParagraphs recursively counts paragraph elements.
228func (s *Scorer) walkParagraphs(node *html.Node, count *int) {
229 if node == nil {
230 return
231 }
232
233 if node.Type == html.ElementNode && strings.ToLower(node.Data) == "p" {
234 *count++
235 }
236
237 for child := node.FirstChild; child != nil; child = child.NextSibling {
238 s.walkParagraphs(child, count)
239 }
240}
241
242// getTextLengthScore provides a bonus for nodes with substantial text content.
243func (s *Scorer) getTextLengthScore(textLen int) float64 {
244 if textLen < 25 {
245 return 0.0
246 }
247 return math.Log10(float64(textLen)) * 2.0
248}
249
250// calculateDepth calculates how deep in the DOM tree this node is.
251func (s *Scorer) calculateDepth(node *html.Node) int {
252 depth := 0
253 for n := node.Parent; n != nil; n = n.Parent {
254 depth++
255 }
256 return depth
257}
258
259// ScoreAncestors propagates scores up the DOM tree with decay.
260// This implements the Readability algorithm's ancestor scoring.
261func (s *Scorer) ScoreAncestors(scores map[*html.Node]*ContentScore, node *html.Node, baseScore float64) {
262 if node == nil || baseScore <= 0 {
263 return
264 }
265
266 currentScore := baseScore
267 level := 0
268
269 for parent := node.Parent; parent != nil && level < 5; parent = parent.Parent {
270 if parent.Type != html.ElementNode {
271 continue
272 }
273
274 if _, exists := scores[parent]; !exists {
275 scores[parent] = s.ScoreNode(parent)
276 if scores[parent] == nil {
277 continue
278 }
279 }
280
281 decayedScore := currentScore * math.Pow(s.ancestorDecayFactor, float64(level+1))
282 scores[parent].Score += decayedScore
283 level++
284 }
285}
286
287// FindTopCandidates identifies the N highest-scoring content candidates.
288func (s *Scorer) FindTopCandidates(root *html.Node, n int) []*ContentScore {
289 if root == nil || n <= 0 {
290 return nil
291 }
292
293 scores := make(map[*html.Node]*ContentScore)
294 s.scoreTree(root, scores)
295
296 var candidates []*ContentScore
297 for _, score := range scores {
298 if score.Score >= s.minScore && score.TextLength >= s.minContentLength {
299 candidates = append(candidates, score)
300 }
301 }
302
303 for i := 0; i < len(candidates); i++ {
304 for j := i + 1; j < len(candidates); j++ {
305 if candidates[j].Score > candidates[i].Score {
306 candidates[i], candidates[j] = candidates[j], candidates[i]
307 }
308 }
309 }
310
311 if len(candidates) > n {
312 candidates = candidates[:n]
313 }
314
315 return candidates
316}
317
318// scoreTree recursively scores all nodes in the tree.
319func (s *Scorer) scoreTree(node *html.Node, scores map[*html.Node]*ContentScore) {
320 if node == nil {
321 return
322 }
323
324 if node.Type == html.ElementNode {
325 tag := strings.ToLower(node.Data)
326 if tag != "script" && tag != "style" && tag != "noscript" {
327 score := s.ScoreNode(node)
328 if score != nil && score.Score > 0 {
329 scores[node] = score
330 s.ScoreAncestors(scores, node, score.Score)
331 }
332 }
333 }
334
335 for child := node.FirstChild; child != nil; child = child.NextSibling {
336 s.scoreTree(child, scores)
337 }
338}
339
340// calculateConfidence estimates how confident we are in this content selection (between 0 & 1).
341func (s *Scorer) calculateConfidence(score *ContentScore) float64 {
342 if score == nil {
343 return 0.0
344 }
345
346 confidence := 0.0
347
348 if score.Score > s.minScore*2 {
349 confidence += 0.3
350 } else if score.Score > s.minScore {
351 confidence += 0.15
352 }
353
354 if score.TextLength > s.minContentLength*3 {
355 confidence += 0.3
356 } else if score.TextLength > s.minContentLength {
357 confidence += 0.15
358 }
359
360 if score.LinkDensity < 0.2 {
361 confidence += 0.2
362 } else if score.LinkDensity < 0.4 {
363 confidence += 0.1
364 }
365
366 if score.ParagraphCount >= 3 {
367 confidence += 0.2
368 } else if score.ParagraphCount >= 1 {
369 confidence += 0.1
370 }
371
372 if confidence > 1.0 {
373 confidence = 1.0
374 }
375
376 return confidence
377}
378
// IsProbablyReadable determines if a document is likely to have extractable content.
// This is inspired by Readability.js's isProbablyReaderable function.
// It requires at least three <p> elements and minContentLength bytes of
// trimmed text anywhere under doc; a nil document is never readable.
func (s *Scorer) IsProbablyReadable(doc *html.Node) bool {
	if doc == nil {
		return false
	}

	paragraphCount := s.countParagraphs(doc)
	textLength := s.calculateTextLength(doc)
	return paragraphCount >= 3 && textLength >= s.minContentLength
389}