internal/articles/scorer_test.go at main · desertthunder.dev/noteleaf

cli + tui to publish to leaflet (wip) & manage tasks, notes & watch/read lists 🍃
charm leaflet readability golang
noteleaf / internal / articles / scorer_test.go
at main 421 lines 11 kB view raw
  1package articles
  2
  3import (
  4	"strings"
  5	"testing"
  6
  7	"golang.org/x/net/html"
  8)
  9
 10func parseHTML(htmlStr string) *html.Node {
 11	doc, err := html.Parse(strings.NewReader(htmlStr))
 12	if err != nil {
 13		return nil
 14	}
 15	return doc
 16}
 17
 18func findElement(node *html.Node, tagName string) *html.Node {
 19	if node == nil {
 20		return nil
 21	}
 22
 23	if node.Type == html.ElementNode && strings.EqualFold(node.Data, tagName) {
 24		return node
 25	}
 26
 27	for child := node.FirstChild; child != nil; child = child.NextSibling {
 28		if result := findElement(child, tagName); result != nil {
 29			return result
 30		}
 31	}
 32
 33	return nil
 34}
 35
 36func findElementWithClass(node *html.Node, className string) *html.Node {
 37	if node == nil {
 38		return nil
 39	}
 40
 41	if node.Type == html.ElementNode {
 42		for _, attr := range node.Attr {
 43			if attr.Key == "class" && strings.Contains(attr.Val, className) {
 44				return node
 45			}
 46		}
 47	}
 48
 49	for child := node.FirstChild; child != nil; child = child.NextSibling {
 50		if result := findElementWithClass(child, className); result != nil {
 51			return result
 52		}
 53	}
 54
 55	return nil
 56}
 57
 58func TestScorer(t *testing.T) {
 59	t.Run("NewScorer", func(t *testing.T) {
 60		t.Run("creates scorer with default weights", func(t *testing.T) {
 61			scorer := NewScorer()
 62
 63			if scorer == nil {
 64				t.Fatal("Expected scorer to be created, got nil")
 65			}
 66
 67			if scorer.minContentLength != 140 {
 68				t.Errorf("Expected minContentLength 140, got %d", scorer.minContentLength)
 69			}
 70
 71			if scorer.minScore != 20.0 {
 72				t.Errorf("Expected minScore 20.0, got %f", scorer.minScore)
 73			}
 74		})
 75	})
 76
 77	t.Run("ScoreNode", func(t *testing.T) {
 78		scorer := NewScorer()
 79
 80		t.Run("scores article tag highly", func(t *testing.T) {
 81			htmlStr := `<html><body><article class="main-content">Article content</article></body></html>`
 82			doc := parseHTML(htmlStr)
 83			article := findElement(doc, "article")
 84
 85			score := scorer.ScoreNode(article)
 86
 87			if score == nil {
 88				t.Fatal("Expected score, got nil")
 89			}
 90
 91			if score.Score <= 0 {
 92				t.Errorf("Expected positive score for article tag, got %f", score.Score)
 93			}
 94		})
 95
 96		t.Run("penalizes navigation elements", func(t *testing.T) {
 97			htmlStr := `<html><body><div class="navigation sidebar">Nav</div></body></html>`
 98			doc := parseHTML(htmlStr)
 99			nav := findElementWithClass(doc, "navigation")
100
101			score := scorer.ScoreNode(nav)
102
103			if score == nil {
104				t.Fatal("Expected score, got nil")
105			}
106
107			if score.Score >= 0 {
108				t.Errorf("Expected negative score for navigation, got %f", score.Score)
109			}
110		})
111
112		t.Run("calculates text length", func(t *testing.T) {
113			htmlStr := `<html><body><div>This is some test content with multiple words</div></body></html>`
114			doc := parseHTML(htmlStr)
115			div := findElement(doc, "div")
116
117			score := scorer.ScoreNode(div)
118
119			if score == nil {
120				t.Fatal("Expected score, got nil")
121			}
122
123			if score.TextLength == 0 {
124				t.Error("Expected non-zero text length")
125			}
126		})
127
128		t.Run("returns nil for text nodes", func(t *testing.T) {
129			textNode := &html.Node{Type: html.TextNode, Data: "text"}
130			score := scorer.ScoreNode(textNode)
131
132			if score != nil {
133				t.Error("Expected nil score for text node")
134			}
135		})
136	})
137
138	t.Run("calculateLinkDensity", func(t *testing.T) {
139		scorer := NewScorer()
140
141		t.Run("calculates high link density", func(t *testing.T) {
142			htmlStr := `<html><body><div><a href="#">link1</a> <a href="#">link2</a></div></body></html>`
143			doc := parseHTML(htmlStr)
144			div := findElement(doc, "div")
145
146			density := scorer.calculateLinkDensity(div)
147
148			if density < 0.5 {
149				t.Errorf("Expected high link density (>0.5), got %f", density)
150			}
151		})
152
153		t.Run("calculates low link density", func(t *testing.T) {
154			htmlStr := `<html><body><div>Lots of regular text content here with just <a href="#">one link</a> in it</div></body></html>`
155			doc := parseHTML(htmlStr)
156			div := findElement(doc, "div")
157
158			density := scorer.calculateLinkDensity(div)
159
160			if density > 0.3 {
161				t.Errorf("Expected low link density (<0.3), got %f", density)
162			}
163		})
164
165		t.Run("returns zero for empty content", func(t *testing.T) {
166			htmlStr := `<html><body><div></div></body></html>`
167			doc := parseHTML(htmlStr)
168			div := findElement(doc, "div")
169
170			density := scorer.calculateLinkDensity(div)
171
172			if density != 0.0 {
173				t.Errorf("Expected zero density for empty content, got %f", density)
174			}
175		})
176	})
177
178	t.Run("getClassIdScore", func(t *testing.T) {
179		scorer := NewScorer()
180
181		t.Run("positive score for content class", func(t *testing.T) {
182			node := &html.Node{
183				Type: html.ElementNode,
184				Data: "div",
185				Attr: []html.Attribute{{Key: "class", Val: "article-content"}},
186			}
187
188			score := scorer.getClassIdScore(node)
189
190			if score <= 0 {
191				t.Errorf("Expected positive score for content class, got %f", score)
192			}
193		})
194
195		t.Run("negative score for sidebar class", func(t *testing.T) {
196			node := &html.Node{
197				Type: html.ElementNode,
198				Data: "div",
199				Attr: []html.Attribute{{Key: "class", Val: "sidebar"}},
200			}
201
202			score := scorer.getClassIdScore(node)
203
204			if score >= 0 {
205				t.Errorf("Expected negative score for sidebar class, got %f", score)
206			}
207		})
208
209		t.Run("strong negative score for banner", func(t *testing.T) {
210			node := &html.Node{
211				Type: html.ElementNode,
212				Data: "div",
213				Attr: []html.Attribute{{Key: "id", Val: "banner"}},
214			}
215
216			score := scorer.getClassIdScore(node)
217
218			if score > -30 {
219				t.Errorf("Expected strong negative score for banner, got %f", score)
220			}
221		})
222	})
223
224	t.Run("countParagraphs", func(t *testing.T) {
225		scorer := NewScorer()
226
227		t.Run("counts multiple paragraphs", func(t *testing.T) {
228			htmlStr := `<html><body><div><p>First</p><p>Second</p><p>Third</p></div></body></html>`
229			doc := parseHTML(htmlStr)
230			div := findElement(doc, "div")
231
232			count := scorer.countParagraphs(div)
233
234			if count != 3 {
235				t.Errorf("Expected 3 paragraphs, got %d", count)
236			}
237		})
238
239		t.Run("returns zero for no paragraphs", func(t *testing.T) {
240			htmlStr := `<html><body><div>Just text</div></body></html>`
241			doc := parseHTML(htmlStr)
242			div := findElement(doc, "div")
243
244			count := scorer.countParagraphs(div)
245
246			if count != 0 {
247				t.Errorf("Expected 0 paragraphs, got %d", count)
248			}
249		})
250	})
251
252	t.Run("FindTopCandidates", func(t *testing.T) {
253		scorer := NewScorer()
254
255		t.Run("finds article with substantial content", func(t *testing.T) {
256			htmlStr := `<html><body>
257				<article class="main-content">
258					<p>This is a long paragraph with substantial content that should score well in the readability algorithm.</p>
259					<p>This is another paragraph with more content to increase the score.</p>
260					<p>And a third paragraph to ensure we have enough text and structure.</p>
261				</article>
262				<aside class="sidebar">
263					<a href="#">Link</a>
264				</aside>
265			</body></html>`
266			doc := parseHTML(htmlStr)
267
268			candidates := scorer.FindTopCandidates(doc, 5)
269
270			if len(candidates) == 0 {
271				t.Fatal("Expected to find candidates")
272			}
273
274			topScore := candidates[0]
275			if topScore.Score <= 0 {
276				t.Errorf("Expected positive score for top candidate, got %f", topScore.Score)
277			}
278
279			if topScore.ParagraphCount < 3 {
280				t.Errorf("Expected top candidate to contain paragraphs, got %d", topScore.ParagraphCount)
281			}
282		})
283
284		t.Run("filters out low-scoring nodes", func(t *testing.T) {
285			htmlStr := `<html><body>
286				<div class="ad">Short ad</div>
287				<nav class="menu"><a href="#">Link</a></nav>
288			</body></html>`
289			doc := parseHTML(htmlStr)
290
291			candidates := scorer.FindTopCandidates(doc, 5)
292
293			for _, candidate := range candidates {
294				if candidate.Score < scorer.minScore {
295					t.Errorf("Expected all candidates to meet minimum score, got %f", candidate.Score)
296				}
297				if candidate.TextLength < scorer.minContentLength {
298					t.Errorf("Expected all candidates to meet minimum length, got %d", candidate.TextLength)
299				}
300			}
301		})
302
303		t.Run("returns empty for nil root", func(t *testing.T) {
304			candidates := scorer.FindTopCandidates(nil, 5)
305
306			if candidates != nil {
307				t.Error("Expected nil for nil root")
308			}
309		})
310	})
311
312	t.Run("calculateConfidence", func(t *testing.T) {
313		scorer := NewScorer()
314
315		t.Run("high confidence for good content", func(t *testing.T) {
316			score := &ContentScore{
317				Score:          60.0,
318				TextLength:     500,
319				LinkDensity:    0.1,
320				ParagraphCount: 5,
321			}
322
323			confidence := scorer.calculateConfidence(score)
324
325			if confidence < 0.5 {
326				t.Errorf("Expected high confidence (>0.5) for good content, got %f", confidence)
327			}
328
329			if confidence > 1.0 {
330				t.Errorf("Expected confidence <= 1.0, got %f", confidence)
331			}
332		})
333
334		t.Run("low confidence for poor content", func(t *testing.T) {
335			score := &ContentScore{
336				Score:          10.0,
337				TextLength:     50,
338				LinkDensity:    0.8,
339				ParagraphCount: 0,
340			}
341
342			confidence := scorer.calculateConfidence(score)
343
344			if confidence > 0.3 {
345				t.Errorf("Expected low confidence (<0.3) for poor content, got %f", confidence)
346			}
347		})
348
349		t.Run("returns zero for nil score", func(t *testing.T) {
350			confidence := scorer.calculateConfidence(nil)
351
352			if confidence != 0.0 {
353				t.Errorf("Expected 0.0 for nil score, got %f", confidence)
354			}
355		})
356	})
357
358	t.Run("IsProbablyReadable", func(t *testing.T) {
359		scorer := NewScorer()
360
361		t.Run("returns true for readable document", func(t *testing.T) {
362			htmlStr := `<html><body>
363				<article>
364					<p>First paragraph with sufficient text content to be considered readable.</p>
365					<p>Second paragraph with more text.</p>
366					<p>Third paragraph with additional content.</p>
367				</article>
368			</body></html>`
369			doc := parseHTML(htmlStr)
370
371			readable := scorer.IsProbablyReadable(doc)
372
373			if !readable {
374				t.Error("Expected document to be readable")
375			}
376		})
377
378		t.Run("returns false for short document", func(t *testing.T) {
379			htmlStr := `<html><body><div>Short</div></body></html>`
380			doc := parseHTML(htmlStr)
381
382			readable := scorer.IsProbablyReadable(doc)
383
384			if readable {
385				t.Error("Expected document to not be readable")
386			}
387		})
388
389		t.Run("returns false for nil document", func(t *testing.T) {
390			readable := scorer.IsProbablyReadable(nil)
391
392			if readable {
393				t.Error("Expected nil document to not be readable")
394			}
395		})
396	})
397
398	t.Run("ScoreAncestors", func(t *testing.T) {
399		scorer := NewScorer()
400
401		t.Run("propagates score to parent nodes", func(t *testing.T) {
402			htmlStr := `<html><body><div><article><p>Content</p></article></div></body></html>`
403			doc := parseHTML(htmlStr)
404			p := findElement(doc, "p")
405
406			scores := make(map[*html.Node]*ContentScore)
407			scores[p] = &ContentScore{Node: p, Score: 10.0}
408
409			scorer.ScoreAncestors(scores, p, 100.0)
410
411			article := findElement(doc, "article")
412			if scores[article] == nil {
413				t.Error("Expected article to receive propagated score")
414			}
415
416			if scores[article].Score <= 0 {
417				t.Errorf("Expected positive propagated score, got %f", scores[article].Score)
418			}
419		})
420	})
421}