cli + tui to publish to leaflet (wip) & manage tasks, notes & watch/read lists 🍃
charm leaflet readability golang
at main 421 lines 11 kB view raw
1package articles 2 3import ( 4 "strings" 5 "testing" 6 7 "golang.org/x/net/html" 8) 9 10func parseHTML(htmlStr string) *html.Node { 11 doc, err := html.Parse(strings.NewReader(htmlStr)) 12 if err != nil { 13 return nil 14 } 15 return doc 16} 17 18func findElement(node *html.Node, tagName string) *html.Node { 19 if node == nil { 20 return nil 21 } 22 23 if node.Type == html.ElementNode && strings.EqualFold(node.Data, tagName) { 24 return node 25 } 26 27 for child := node.FirstChild; child != nil; child = child.NextSibling { 28 if result := findElement(child, tagName); result != nil { 29 return result 30 } 31 } 32 33 return nil 34} 35 36func findElementWithClass(node *html.Node, className string) *html.Node { 37 if node == nil { 38 return nil 39 } 40 41 if node.Type == html.ElementNode { 42 for _, attr := range node.Attr { 43 if attr.Key == "class" && strings.Contains(attr.Val, className) { 44 return node 45 } 46 } 47 } 48 49 for child := node.FirstChild; child != nil; child = child.NextSibling { 50 if result := findElementWithClass(child, className); result != nil { 51 return result 52 } 53 } 54 55 return nil 56} 57 58func TestScorer(t *testing.T) { 59 t.Run("NewScorer", func(t *testing.T) { 60 t.Run("creates scorer with default weights", func(t *testing.T) { 61 scorer := NewScorer() 62 63 if scorer == nil { 64 t.Fatal("Expected scorer to be created, got nil") 65 } 66 67 if scorer.minContentLength != 140 { 68 t.Errorf("Expected minContentLength 140, got %d", scorer.minContentLength) 69 } 70 71 if scorer.minScore != 20.0 { 72 t.Errorf("Expected minScore 20.0, got %f", scorer.minScore) 73 } 74 }) 75 }) 76 77 t.Run("ScoreNode", func(t *testing.T) { 78 scorer := NewScorer() 79 80 t.Run("scores article tag highly", func(t *testing.T) { 81 htmlStr := `<html><body><article class="main-content">Article content</article></body></html>` 82 doc := parseHTML(htmlStr) 83 article := findElement(doc, "article") 84 85 score := scorer.ScoreNode(article) 86 87 if score == nil { 88 
t.Fatal("Expected score, got nil") 89 } 90 91 if score.Score <= 0 { 92 t.Errorf("Expected positive score for article tag, got %f", score.Score) 93 } 94 }) 95 96 t.Run("penalizes navigation elements", func(t *testing.T) { 97 htmlStr := `<html><body><div class="navigation sidebar">Nav</div></body></html>` 98 doc := parseHTML(htmlStr) 99 nav := findElementWithClass(doc, "navigation") 100 101 score := scorer.ScoreNode(nav) 102 103 if score == nil { 104 t.Fatal("Expected score, got nil") 105 } 106 107 if score.Score >= 0 { 108 t.Errorf("Expected negative score for navigation, got %f", score.Score) 109 } 110 }) 111 112 t.Run("calculates text length", func(t *testing.T) { 113 htmlStr := `<html><body><div>This is some test content with multiple words</div></body></html>` 114 doc := parseHTML(htmlStr) 115 div := findElement(doc, "div") 116 117 score := scorer.ScoreNode(div) 118 119 if score == nil { 120 t.Fatal("Expected score, got nil") 121 } 122 123 if score.TextLength == 0 { 124 t.Error("Expected non-zero text length") 125 } 126 }) 127 128 t.Run("returns nil for text nodes", func(t *testing.T) { 129 textNode := &html.Node{Type: html.TextNode, Data: "text"} 130 score := scorer.ScoreNode(textNode) 131 132 if score != nil { 133 t.Error("Expected nil score for text node") 134 } 135 }) 136 }) 137 138 t.Run("calculateLinkDensity", func(t *testing.T) { 139 scorer := NewScorer() 140 141 t.Run("calculates high link density", func(t *testing.T) { 142 htmlStr := `<html><body><div><a href="#">link1</a> <a href="#">link2</a></div></body></html>` 143 doc := parseHTML(htmlStr) 144 div := findElement(doc, "div") 145 146 density := scorer.calculateLinkDensity(div) 147 148 if density < 0.5 { 149 t.Errorf("Expected high link density (>0.5), got %f", density) 150 } 151 }) 152 153 t.Run("calculates low link density", func(t *testing.T) { 154 htmlStr := `<html><body><div>Lots of regular text content here with just <a href="#">one link</a> in it</div></body></html>` 155 doc := 
parseHTML(htmlStr) 156 div := findElement(doc, "div") 157 158 density := scorer.calculateLinkDensity(div) 159 160 if density > 0.3 { 161 t.Errorf("Expected low link density (<0.3), got %f", density) 162 } 163 }) 164 165 t.Run("returns zero for empty content", func(t *testing.T) { 166 htmlStr := `<html><body><div></div></body></html>` 167 doc := parseHTML(htmlStr) 168 div := findElement(doc, "div") 169 170 density := scorer.calculateLinkDensity(div) 171 172 if density != 0.0 { 173 t.Errorf("Expected zero density for empty content, got %f", density) 174 } 175 }) 176 }) 177 178 t.Run("getClassIdScore", func(t *testing.T) { 179 scorer := NewScorer() 180 181 t.Run("positive score for content class", func(t *testing.T) { 182 node := &html.Node{ 183 Type: html.ElementNode, 184 Data: "div", 185 Attr: []html.Attribute{{Key: "class", Val: "article-content"}}, 186 } 187 188 score := scorer.getClassIdScore(node) 189 190 if score <= 0 { 191 t.Errorf("Expected positive score for content class, got %f", score) 192 } 193 }) 194 195 t.Run("negative score for sidebar class", func(t *testing.T) { 196 node := &html.Node{ 197 Type: html.ElementNode, 198 Data: "div", 199 Attr: []html.Attribute{{Key: "class", Val: "sidebar"}}, 200 } 201 202 score := scorer.getClassIdScore(node) 203 204 if score >= 0 { 205 t.Errorf("Expected negative score for sidebar class, got %f", score) 206 } 207 }) 208 209 t.Run("strong negative score for banner", func(t *testing.T) { 210 node := &html.Node{ 211 Type: html.ElementNode, 212 Data: "div", 213 Attr: []html.Attribute{{Key: "id", Val: "banner"}}, 214 } 215 216 score := scorer.getClassIdScore(node) 217 218 if score > -30 { 219 t.Errorf("Expected strong negative score for banner, got %f", score) 220 } 221 }) 222 }) 223 224 t.Run("countParagraphs", func(t *testing.T) { 225 scorer := NewScorer() 226 227 t.Run("counts multiple paragraphs", func(t *testing.T) { 228 htmlStr := `<html><body><div><p>First</p><p>Second</p><p>Third</p></div></body></html>` 229 doc := 
parseHTML(htmlStr) 230 div := findElement(doc, "div") 231 232 count := scorer.countParagraphs(div) 233 234 if count != 3 { 235 t.Errorf("Expected 3 paragraphs, got %d", count) 236 } 237 }) 238 239 t.Run("returns zero for no paragraphs", func(t *testing.T) { 240 htmlStr := `<html><body><div>Just text</div></body></html>` 241 doc := parseHTML(htmlStr) 242 div := findElement(doc, "div") 243 244 count := scorer.countParagraphs(div) 245 246 if count != 0 { 247 t.Errorf("Expected 0 paragraphs, got %d", count) 248 } 249 }) 250 }) 251 252 t.Run("FindTopCandidates", func(t *testing.T) { 253 scorer := NewScorer() 254 255 t.Run("finds article with substantial content", func(t *testing.T) { 256 htmlStr := `<html><body> 257 <article class="main-content"> 258 <p>This is a long paragraph with substantial content that should score well in the readability algorithm.</p> 259 <p>This is another paragraph with more content to increase the score.</p> 260 <p>And a third paragraph to ensure we have enough text and structure.</p> 261 </article> 262 <aside class="sidebar"> 263 <a href="#">Link</a> 264 </aside> 265 </body></html>` 266 doc := parseHTML(htmlStr) 267 268 candidates := scorer.FindTopCandidates(doc, 5) 269 270 if len(candidates) == 0 { 271 t.Fatal("Expected to find candidates") 272 } 273 274 topScore := candidates[0] 275 if topScore.Score <= 0 { 276 t.Errorf("Expected positive score for top candidate, got %f", topScore.Score) 277 } 278 279 if topScore.ParagraphCount < 3 { 280 t.Errorf("Expected top candidate to contain paragraphs, got %d", topScore.ParagraphCount) 281 } 282 }) 283 284 t.Run("filters out low-scoring nodes", func(t *testing.T) { 285 htmlStr := `<html><body> 286 <div class="ad">Short ad</div> 287 <nav class="menu"><a href="#">Link</a></nav> 288 </body></html>` 289 doc := parseHTML(htmlStr) 290 291 candidates := scorer.FindTopCandidates(doc, 5) 292 293 for _, candidate := range candidates { 294 if candidate.Score < scorer.minScore { 295 t.Errorf("Expected all 
candidates to meet minimum score, got %f", candidate.Score) 296 } 297 if candidate.TextLength < scorer.minContentLength { 298 t.Errorf("Expected all candidates to meet minimum length, got %d", candidate.TextLength) 299 } 300 } 301 }) 302 303 t.Run("returns empty for nil root", func(t *testing.T) { 304 candidates := scorer.FindTopCandidates(nil, 5) 305 306 if candidates != nil { 307 t.Error("Expected nil for nil root") 308 } 309 }) 310 }) 311 312 t.Run("calculateConfidence", func(t *testing.T) { 313 scorer := NewScorer() 314 315 t.Run("high confidence for good content", func(t *testing.T) { 316 score := &ContentScore{ 317 Score: 60.0, 318 TextLength: 500, 319 LinkDensity: 0.1, 320 ParagraphCount: 5, 321 } 322 323 confidence := scorer.calculateConfidence(score) 324 325 if confidence < 0.5 { 326 t.Errorf("Expected high confidence (>0.5) for good content, got %f", confidence) 327 } 328 329 if confidence > 1.0 { 330 t.Errorf("Expected confidence <= 1.0, got %f", confidence) 331 } 332 }) 333 334 t.Run("low confidence for poor content", func(t *testing.T) { 335 score := &ContentScore{ 336 Score: 10.0, 337 TextLength: 50, 338 LinkDensity: 0.8, 339 ParagraphCount: 0, 340 } 341 342 confidence := scorer.calculateConfidence(score) 343 344 if confidence > 0.3 { 345 t.Errorf("Expected low confidence (<0.3) for poor content, got %f", confidence) 346 } 347 }) 348 349 t.Run("returns zero for nil score", func(t *testing.T) { 350 confidence := scorer.calculateConfidence(nil) 351 352 if confidence != 0.0 { 353 t.Errorf("Expected 0.0 for nil score, got %f", confidence) 354 } 355 }) 356 }) 357 358 t.Run("IsProbablyReadable", func(t *testing.T) { 359 scorer := NewScorer() 360 361 t.Run("returns true for readable document", func(t *testing.T) { 362 htmlStr := `<html><body> 363 <article> 364 <p>First paragraph with sufficient text content to be considered readable.</p> 365 <p>Second paragraph with more text.</p> 366 <p>Third paragraph with additional content.</p> 367 </article> 368 
</body></html>` 369 doc := parseHTML(htmlStr) 370 371 readable := scorer.IsProbablyReadable(doc) 372 373 if !readable { 374 t.Error("Expected document to be readable") 375 } 376 }) 377 378 t.Run("returns false for short document", func(t *testing.T) { 379 htmlStr := `<html><body><div>Short</div></body></html>` 380 doc := parseHTML(htmlStr) 381 382 readable := scorer.IsProbablyReadable(doc) 383 384 if readable { 385 t.Error("Expected document to not be readable") 386 } 387 }) 388 389 t.Run("returns false for nil document", func(t *testing.T) { 390 readable := scorer.IsProbablyReadable(nil) 391 392 if readable { 393 t.Error("Expected nil document to not be readable") 394 } 395 }) 396 }) 397 398 t.Run("ScoreAncestors", func(t *testing.T) { 399 scorer := NewScorer() 400 401 t.Run("propagates score to parent nodes", func(t *testing.T) { 402 htmlStr := `<html><body><div><article><p>Content</p></article></div></body></html>` 403 doc := parseHTML(htmlStr) 404 p := findElement(doc, "p") 405 406 scores := make(map[*html.Node]*ContentScore) 407 scores[p] = &ContentScore{Node: p, Score: 10.0} 408 409 scorer.ScoreAncestors(scores, p, 100.0) 410 411 article := findElement(doc, "article") 412 if scores[article] == nil { 413 t.Error("Expected article to receive propagated score") 414 } 415 416 if scores[article].Score <= 0 { 417 t.Errorf("Expected positive propagated score, got %f", scores[article].Score) 418 } 419 }) 420 }) 421}