cli + tui to publish to leaflet (wip) & manage tasks, notes & watch/read lists 🍃
charm
leaflet
readability
golang
1package articles
2
3import (
4 "strings"
5 "testing"
6
7 "golang.org/x/net/html"
8)
9
10func parseHTML(htmlStr string) *html.Node {
11 doc, err := html.Parse(strings.NewReader(htmlStr))
12 if err != nil {
13 return nil
14 }
15 return doc
16}
17
18func findElement(node *html.Node, tagName string) *html.Node {
19 if node == nil {
20 return nil
21 }
22
23 if node.Type == html.ElementNode && strings.EqualFold(node.Data, tagName) {
24 return node
25 }
26
27 for child := node.FirstChild; child != nil; child = child.NextSibling {
28 if result := findElement(child, tagName); result != nil {
29 return result
30 }
31 }
32
33 return nil
34}
35
36func findElementWithClass(node *html.Node, className string) *html.Node {
37 if node == nil {
38 return nil
39 }
40
41 if node.Type == html.ElementNode {
42 for _, attr := range node.Attr {
43 if attr.Key == "class" && strings.Contains(attr.Val, className) {
44 return node
45 }
46 }
47 }
48
49 for child := node.FirstChild; child != nil; child = child.NextSibling {
50 if result := findElementWithClass(child, className); result != nil {
51 return result
52 }
53 }
54
55 return nil
56}
57
58func TestScorer(t *testing.T) {
59 t.Run("NewScorer", func(t *testing.T) {
60 t.Run("creates scorer with default weights", func(t *testing.T) {
61 scorer := NewScorer()
62
63 if scorer == nil {
64 t.Fatal("Expected scorer to be created, got nil")
65 }
66
67 if scorer.minContentLength != 140 {
68 t.Errorf("Expected minContentLength 140, got %d", scorer.minContentLength)
69 }
70
71 if scorer.minScore != 20.0 {
72 t.Errorf("Expected minScore 20.0, got %f", scorer.minScore)
73 }
74 })
75 })
76
77 t.Run("ScoreNode", func(t *testing.T) {
78 scorer := NewScorer()
79
80 t.Run("scores article tag highly", func(t *testing.T) {
81 htmlStr := `<html><body><article class="main-content">Article content</article></body></html>`
82 doc := parseHTML(htmlStr)
83 article := findElement(doc, "article")
84
85 score := scorer.ScoreNode(article)
86
87 if score == nil {
88 t.Fatal("Expected score, got nil")
89 }
90
91 if score.Score <= 0 {
92 t.Errorf("Expected positive score for article tag, got %f", score.Score)
93 }
94 })
95
96 t.Run("penalizes navigation elements", func(t *testing.T) {
97 htmlStr := `<html><body><div class="navigation sidebar">Nav</div></body></html>`
98 doc := parseHTML(htmlStr)
99 nav := findElementWithClass(doc, "navigation")
100
101 score := scorer.ScoreNode(nav)
102
103 if score == nil {
104 t.Fatal("Expected score, got nil")
105 }
106
107 if score.Score >= 0 {
108 t.Errorf("Expected negative score for navigation, got %f", score.Score)
109 }
110 })
111
112 t.Run("calculates text length", func(t *testing.T) {
113 htmlStr := `<html><body><div>This is some test content with multiple words</div></body></html>`
114 doc := parseHTML(htmlStr)
115 div := findElement(doc, "div")
116
117 score := scorer.ScoreNode(div)
118
119 if score == nil {
120 t.Fatal("Expected score, got nil")
121 }
122
123 if score.TextLength == 0 {
124 t.Error("Expected non-zero text length")
125 }
126 })
127
128 t.Run("returns nil for text nodes", func(t *testing.T) {
129 textNode := &html.Node{Type: html.TextNode, Data: "text"}
130 score := scorer.ScoreNode(textNode)
131
132 if score != nil {
133 t.Error("Expected nil score for text node")
134 }
135 })
136 })
137
138 t.Run("calculateLinkDensity", func(t *testing.T) {
139 scorer := NewScorer()
140
141 t.Run("calculates high link density", func(t *testing.T) {
142 htmlStr := `<html><body><div><a href="#">link1</a> <a href="#">link2</a></div></body></html>`
143 doc := parseHTML(htmlStr)
144 div := findElement(doc, "div")
145
146 density := scorer.calculateLinkDensity(div)
147
148 if density < 0.5 {
149 t.Errorf("Expected high link density (>0.5), got %f", density)
150 }
151 })
152
153 t.Run("calculates low link density", func(t *testing.T) {
154 htmlStr := `<html><body><div>Lots of regular text content here with just <a href="#">one link</a> in it</div></body></html>`
155 doc := parseHTML(htmlStr)
156 div := findElement(doc, "div")
157
158 density := scorer.calculateLinkDensity(div)
159
160 if density > 0.3 {
161 t.Errorf("Expected low link density (<0.3), got %f", density)
162 }
163 })
164
165 t.Run("returns zero for empty content", func(t *testing.T) {
166 htmlStr := `<html><body><div></div></body></html>`
167 doc := parseHTML(htmlStr)
168 div := findElement(doc, "div")
169
170 density := scorer.calculateLinkDensity(div)
171
172 if density != 0.0 {
173 t.Errorf("Expected zero density for empty content, got %f", density)
174 }
175 })
176 })
177
178 t.Run("getClassIdScore", func(t *testing.T) {
179 scorer := NewScorer()
180
181 t.Run("positive score for content class", func(t *testing.T) {
182 node := &html.Node{
183 Type: html.ElementNode,
184 Data: "div",
185 Attr: []html.Attribute{{Key: "class", Val: "article-content"}},
186 }
187
188 score := scorer.getClassIdScore(node)
189
190 if score <= 0 {
191 t.Errorf("Expected positive score for content class, got %f", score)
192 }
193 })
194
195 t.Run("negative score for sidebar class", func(t *testing.T) {
196 node := &html.Node{
197 Type: html.ElementNode,
198 Data: "div",
199 Attr: []html.Attribute{{Key: "class", Val: "sidebar"}},
200 }
201
202 score := scorer.getClassIdScore(node)
203
204 if score >= 0 {
205 t.Errorf("Expected negative score for sidebar class, got %f", score)
206 }
207 })
208
209 t.Run("strong negative score for banner", func(t *testing.T) {
210 node := &html.Node{
211 Type: html.ElementNode,
212 Data: "div",
213 Attr: []html.Attribute{{Key: "id", Val: "banner"}},
214 }
215
216 score := scorer.getClassIdScore(node)
217
218 if score > -30 {
219 t.Errorf("Expected strong negative score for banner, got %f", score)
220 }
221 })
222 })
223
224 t.Run("countParagraphs", func(t *testing.T) {
225 scorer := NewScorer()
226
227 t.Run("counts multiple paragraphs", func(t *testing.T) {
228 htmlStr := `<html><body><div><p>First</p><p>Second</p><p>Third</p></div></body></html>`
229 doc := parseHTML(htmlStr)
230 div := findElement(doc, "div")
231
232 count := scorer.countParagraphs(div)
233
234 if count != 3 {
235 t.Errorf("Expected 3 paragraphs, got %d", count)
236 }
237 })
238
239 t.Run("returns zero for no paragraphs", func(t *testing.T) {
240 htmlStr := `<html><body><div>Just text</div></body></html>`
241 doc := parseHTML(htmlStr)
242 div := findElement(doc, "div")
243
244 count := scorer.countParagraphs(div)
245
246 if count != 0 {
247 t.Errorf("Expected 0 paragraphs, got %d", count)
248 }
249 })
250 })
251
252 t.Run("FindTopCandidates", func(t *testing.T) {
253 scorer := NewScorer()
254
255 t.Run("finds article with substantial content", func(t *testing.T) {
256 htmlStr := `<html><body>
257 <article class="main-content">
258 <p>This is a long paragraph with substantial content that should score well in the readability algorithm.</p>
259 <p>This is another paragraph with more content to increase the score.</p>
260 <p>And a third paragraph to ensure we have enough text and structure.</p>
261 </article>
262 <aside class="sidebar">
263 <a href="#">Link</a>
264 </aside>
265 </body></html>`
266 doc := parseHTML(htmlStr)
267
268 candidates := scorer.FindTopCandidates(doc, 5)
269
270 if len(candidates) == 0 {
271 t.Fatal("Expected to find candidates")
272 }
273
274 topScore := candidates[0]
275 if topScore.Score <= 0 {
276 t.Errorf("Expected positive score for top candidate, got %f", topScore.Score)
277 }
278
279 if topScore.ParagraphCount < 3 {
280 t.Errorf("Expected top candidate to contain paragraphs, got %d", topScore.ParagraphCount)
281 }
282 })
283
284 t.Run("filters out low-scoring nodes", func(t *testing.T) {
285 htmlStr := `<html><body>
286 <div class="ad">Short ad</div>
287 <nav class="menu"><a href="#">Link</a></nav>
288 </body></html>`
289 doc := parseHTML(htmlStr)
290
291 candidates := scorer.FindTopCandidates(doc, 5)
292
293 for _, candidate := range candidates {
294 if candidate.Score < scorer.minScore {
295 t.Errorf("Expected all candidates to meet minimum score, got %f", candidate.Score)
296 }
297 if candidate.TextLength < scorer.minContentLength {
298 t.Errorf("Expected all candidates to meet minimum length, got %d", candidate.TextLength)
299 }
300 }
301 })
302
303 t.Run("returns empty for nil root", func(t *testing.T) {
304 candidates := scorer.FindTopCandidates(nil, 5)
305
306 if candidates != nil {
307 t.Error("Expected nil for nil root")
308 }
309 })
310 })
311
312 t.Run("calculateConfidence", func(t *testing.T) {
313 scorer := NewScorer()
314
315 t.Run("high confidence for good content", func(t *testing.T) {
316 score := &ContentScore{
317 Score: 60.0,
318 TextLength: 500,
319 LinkDensity: 0.1,
320 ParagraphCount: 5,
321 }
322
323 confidence := scorer.calculateConfidence(score)
324
325 if confidence < 0.5 {
326 t.Errorf("Expected high confidence (>0.5) for good content, got %f", confidence)
327 }
328
329 if confidence > 1.0 {
330 t.Errorf("Expected confidence <= 1.0, got %f", confidence)
331 }
332 })
333
334 t.Run("low confidence for poor content", func(t *testing.T) {
335 score := &ContentScore{
336 Score: 10.0,
337 TextLength: 50,
338 LinkDensity: 0.8,
339 ParagraphCount: 0,
340 }
341
342 confidence := scorer.calculateConfidence(score)
343
344 if confidence > 0.3 {
345 t.Errorf("Expected low confidence (<0.3) for poor content, got %f", confidence)
346 }
347 })
348
349 t.Run("returns zero for nil score", func(t *testing.T) {
350 confidence := scorer.calculateConfidence(nil)
351
352 if confidence != 0.0 {
353 t.Errorf("Expected 0.0 for nil score, got %f", confidence)
354 }
355 })
356 })
357
358 t.Run("IsProbablyReadable", func(t *testing.T) {
359 scorer := NewScorer()
360
361 t.Run("returns true for readable document", func(t *testing.T) {
362 htmlStr := `<html><body>
363 <article>
364 <p>First paragraph with sufficient text content to be considered readable.</p>
365 <p>Second paragraph with more text.</p>
366 <p>Third paragraph with additional content.</p>
367 </article>
368 </body></html>`
369 doc := parseHTML(htmlStr)
370
371 readable := scorer.IsProbablyReadable(doc)
372
373 if !readable {
374 t.Error("Expected document to be readable")
375 }
376 })
377
378 t.Run("returns false for short document", func(t *testing.T) {
379 htmlStr := `<html><body><div>Short</div></body></html>`
380 doc := parseHTML(htmlStr)
381
382 readable := scorer.IsProbablyReadable(doc)
383
384 if readable {
385 t.Error("Expected document to not be readable")
386 }
387 })
388
389 t.Run("returns false for nil document", func(t *testing.T) {
390 readable := scorer.IsProbablyReadable(nil)
391
392 if readable {
393 t.Error("Expected nil document to not be readable")
394 }
395 })
396 })
397
398 t.Run("ScoreAncestors", func(t *testing.T) {
399 scorer := NewScorer()
400
401 t.Run("propagates score to parent nodes", func(t *testing.T) {
402 htmlStr := `<html><body><div><article><p>Content</p></article></div></body></html>`
403 doc := parseHTML(htmlStr)
404 p := findElement(doc, "p")
405
406 scores := make(map[*html.Node]*ContentScore)
407 scores[p] = &ContentScore{Node: p, Score: 10.0}
408
409 scorer.ScoreAncestors(scores, p, 100.0)
410
411 article := findElement(doc, "article")
412 if scores[article] == nil {
413 t.Error("Expected article to receive propagated score")
414 }
415
416 if scores[article].Score <= 0 {
417 t.Errorf("Expected positive propagated score, got %f", scores[article].Score)
418 }
419 })
420 })
421}