cli + tui to publish to leaflet (wip) & manage tasks, notes & watch/read lists 🍃
charm leaflet readability golang
at main 443 lines 12 kB view raw
1package articles 2 3import ( 4 "strings" 5 "testing" 6 7 "golang.org/x/net/html" 8) 9 10func TestHeuristicExtractor(t *testing.T) { 11 t.Run("NewHeuristicExtractor", func(t *testing.T) { 12 t.Run("creates extractor with scorer", func(t *testing.T) { 13 extractor := NewHeuristicExtractor() 14 15 if extractor == nil { 16 t.Fatal("Expected extractor to be created, got nil") 17 } 18 19 if extractor.scorer == nil { 20 t.Error("Expected extractor to have scorer") 21 } 22 }) 23 }) 24 25 t.Run("ExtractContent", func(t *testing.T) { 26 extractor := NewHeuristicExtractor() 27 28 t.Run("extracts content from article", func(t *testing.T) { 29 htmlStr := `<html><body> 30 <article class="main-content"> 31 <p>This is the first paragraph of the article with substantial content.</p> 32 <p>This is the second paragraph with more information and details.</p> 33 <p>And this is the third paragraph to ensure sufficient content.</p> 34 </article> 35 <aside class="sidebar"><a href="#">Sidebar link</a></aside> 36 </body></html>` 37 doc := parseHTML(htmlStr) 38 39 result := extractor.ExtractContent(doc) 40 41 if result == nil { 42 t.Fatal("Expected extraction result, got nil") 43 } 44 45 if result.Content == "" { 46 t.Error("Expected content to be extracted") 47 } 48 49 if result.Confidence == 0.0 { 50 t.Error("Expected non-zero confidence") 51 } 52 53 if !strings.Contains(result.Content, "first paragraph") { 54 t.Error("Expected content to contain article text") 55 } 56 }) 57 58 t.Run("returns low confidence for unreadable document", func(t *testing.T) { 59 htmlStr := `<html><body><div>Short</div></body></html>` 60 doc := parseHTML(htmlStr) 61 62 result := extractor.ExtractContent(doc) 63 64 if result == nil { 65 t.Fatal("Expected extraction result, got nil") 66 } 67 68 if result.Confidence > 0.3 { 69 t.Errorf("Expected low confidence for short document, got %f", result.Confidence) 70 } 71 }) 72 73 t.Run("returns nil for nil document", func(t *testing.T) { 74 result := extractor.ExtractContent(nil) 75 76 if result != nil { 77 t.Error("Expected nil for nil document") 78 } 79 }) 80 }) 81 82 t.Run("cleanDocument", func(t *testing.T) { 83 extractor := NewHeuristicExtractor() 84 85 t.Run("removes script and style tags", func(t *testing.T) { 86 htmlStr := `<html><body> 87 <script>alert('test');</script> 88 <style>.test { color: red; }</style> 89 <p>Content</p> 90 </body></html>` 91 doc := parseHTML(htmlStr) 92 93 cleaned := extractor.cleanDocument(doc) 94 95 script := findElement(cleaned, "script") 96 style := findElement(cleaned, "style") 97 98 if script != nil { 99 t.Error("Expected script tag to be removed") 100 } 101 102 if style != nil { 103 t.Error("Expected style tag to be removed") 104 } 105 }) 106 107 t.Run("removes hidden elements", func(t *testing.T) { 108 htmlStr := `<html><body> 109 <div style="display:none">Hidden</div> 110 <div hidden>Also hidden</div> 111 <p>Visible</p> 112 </body></html>` 113 doc := parseHTML(htmlStr) 114 115 cleaned := extractor.cleanDocument(doc) 116 117 // Count divs - should only have visible ones 118 divCount := 0 119 var countDivs func(*html.Node) 120 countDivs = func(node *html.Node) { 121 if node.Type == html.ElementNode && node.Data == "div" { 122 divCount++ 123 } 124 for child := node.FirstChild; child != nil; child = child.NextSibling { 125 countDivs(child) 126 } 127 } 128 countDivs(cleaned) 129 130 if divCount > 0 { 131 t.Errorf("Expected hidden divs to be removed, found %d", divCount) 132 } 133 }) 134 135 t.Run("removes high link density elements", func(t *testing.T) { 136 htmlStr := `<html><body> 137 <div class="links"> 138 <a href="#">Link1</a> 139 <a href="#">Link2</a> 140 <a href="#">Link3</a> 141 </div> 142 <p>Regular paragraph with actual content that should remain.</p> 143 </body></html>` 144 doc := parseHTML(htmlStr) 145 146 cleaned := extractor.cleanDocument(doc) 147 148 p := findElement(cleaned, "p") 149 if p == nil { 150 t.Error("Expected paragraph to remain") 151 } 152 }) 153 }) 154 155 t.Run("extractTextContent", func(t *testing.T) { 156 extractor := NewHeuristicExtractor() 157 158 t.Run("extracts text with basic formatting", func(t *testing.T) { 159 htmlStr := `<html><body><div> 160 <p>First paragraph</p> 161 <p>Second paragraph</p> 162 </div></body></html>` 163 doc := parseHTML(htmlStr) 164 div := findElement(doc, "div") 165 166 text := extractor.extractTextContent(div) 167 168 if !strings.Contains(text, "First paragraph") { 169 t.Error("Expected text to contain first paragraph") 170 } 171 172 if !strings.Contains(text, "Second paragraph") { 173 t.Error("Expected text to contain second paragraph") 174 } 175 }) 176 177 t.Run("formats list items with bullets", func(t *testing.T) { 178 htmlStr := `<html><body><ul> 179 <li>Item 1</li> 180 <li>Item 2</li> 181 </ul></body></html>` 182 doc := parseHTML(htmlStr) 183 ul := findElement(doc, "ul") 184 185 text := extractor.extractTextContent(ul) 186 187 if !strings.Contains(text, "•") { 188 t.Error("Expected text to contain bullet points") 189 } 190 }) 191 192 t.Run("returns empty string for nil node", func(t *testing.T) { 193 text := extractor.extractTextContent(nil) 194 195 if text != "" { 196 t.Error("Expected empty string for nil node") 197 } 198 }) 199 }) 200 201 t.Run("CompareWithXPath", func(t *testing.T) { 202 extractor := NewHeuristicExtractor() 203 204 t.Run("high confidence when XPath and heuristics agree", func(t *testing.T) { 205 htmlStr := `<html><body> 206 <article> 207 <p>This is substantial content that both methods should find.</p> 208 <p>Another paragraph with more details and information.</p> 209 <p>And a third paragraph for good measure and completeness.</p> 210 </article> 211 </body></html>` 212 doc := parseHTML(htmlStr) 213 article := findElement(doc, "article") 214 215 result := extractor.CompareWithXPath(doc, article) 216 217 if result == nil { 218 t.Fatal("Expected result, got nil") 219 } 220 221 if result.Confidence < 0.8 { 222 t.Errorf("Expected high confidence when methods agree, got %f", result.Confidence) 223 } 224 225 if !strings.Contains(result.ExtractionMethod, "dual") && !strings.Contains(result.ExtractionMethod, "validated") { 226 t.Errorf("Expected dual validation method, got %s", result.ExtractionMethod) 227 } 228 }) 229 230 t.Run("prefers XPath when it extracts more content", func(t *testing.T) { 231 htmlStr := `<html><body> 232 <div class="content"> 233 <p>Short content</p> 234 </div> 235 <div class="more"> 236 <p>This is additional content that XPath found but heuristics might miss.</p> 237 <p>Even more content here to make a significant difference in length.</p> 238 <p>And yet another paragraph to ensure XPath extraction is substantially longer.</p> 239 </div> 240 </body></html>` 241 doc := parseHTML(htmlStr) 242 243 // XPath would get more content 244 body := findElement(doc, "body") 245 246 result := extractor.CompareWithXPath(doc, body) 247 248 if result == nil { 249 t.Fatal("Expected result, got nil") 250 } 251 252 // Should prefer one method over the other 253 if result.ExtractionMethod == "heuristic" { 254 t.Errorf("Expected method preference, got %s", result.ExtractionMethod) 255 } 256 }) 257 258 t.Run("uses heuristics when XPath node is nil", func(t *testing.T) { 259 htmlStr := `<html><body> 260 <article> 261 <p>Content that heuristics should find on its own.</p> 262 <p>Additional paragraph for sufficient content length.</p> 263 <p>Third paragraph to meet minimum requirements.</p> 264 </article> 265 </body></html>` 266 doc := parseHTML(htmlStr) 267 268 result := extractor.CompareWithXPath(doc, nil) 269 270 if result == nil { 271 t.Fatal("Expected result, got nil") 272 } 273 274 if result.ExtractionMethod != "heuristic" { 275 t.Errorf("Expected heuristic method when XPath is nil, got %s", result.ExtractionMethod) 276 } 277 }) 278 279 t.Run("returns nil for nil document", func(t *testing.T) { 280 result := extractor.CompareWithXPath(nil, nil) 281 282 if result != nil { 283 t.Error("Expected nil for nil document") 284 } 285 }) 286 }) 287 288 t.Run("calculateSimilarity", func(t *testing.T) { 289 extractor := NewHeuristicExtractor() 290 291 t.Run("returns high similarity for identical text", func(t *testing.T) { 292 text := "This is some test content" 293 294 similarity := extractor.calculateSimilarity(text, text) 295 296 if similarity < 0.9 { 297 t.Errorf("Expected high similarity for identical text, got %f", similarity) 298 } 299 }) 300 301 t.Run("returns low similarity for different text", func(t *testing.T) { 302 text1 := "This is the first piece of content" 303 text2 := "Completely different words and phrases" 304 305 similarity := extractor.calculateSimilarity(text1, text2) 306 307 if similarity > 0.3 { 308 t.Errorf("Expected low similarity for different text, got %f", similarity) 309 } 310 }) 311 312 t.Run("returns zero for empty strings", func(t *testing.T) { 313 similarity := extractor.calculateSimilarity("text", "") 314 315 if similarity != 0.0 { 316 t.Errorf("Expected zero similarity for empty string, got %f", similarity) 317 } 318 }) 319 320 t.Run("returns one for both empty", func(t *testing.T) { 321 similarity := extractor.calculateSimilarity("", "") 322 323 if similarity != 1.0 { 324 t.Errorf("Expected 1.0 similarity for both empty, got %f", similarity) 325 } 326 }) 327 }) 328 329 t.Run("ExtractWithSemanticHTML", func(t *testing.T) { 330 extractor := NewHeuristicExtractor() 331 332 t.Run("extracts from article tag", func(t *testing.T) { 333 htmlStr := `<html><body> 334 <nav>Navigation</nav> 335 <article> 336 <p>This is the main article content that should be extracted.</p> 337 <p>Second paragraph of the article with more information.</p> 338 <p>Third paragraph to provide sufficient content length.</p> 339 </article> 340 <aside>Sidebar</aside> 341 </body></html>` 342 doc := parseHTML(htmlStr) 343 344 result := extractor.ExtractWithSemanticHTML(doc) 345 346 if result == nil { 347 t.Fatal("Expected result, got nil") 348 } 349 350 if result.ExtractionMethod != "semantic-html" { 351 t.Errorf("Expected semantic-html method, got %s", result.ExtractionMethod) 352 } 353 354 if !strings.Contains(result.Content, "main article content") { 355 t.Error("Expected content from article tag") 356 } 357 358 if result.Confidence < 0.85 { 359 t.Errorf("Expected high confidence for semantic HTML, got %f", result.Confidence) 360 } 361 }) 362 363 t.Run("extracts from main tag", func(t *testing.T) { 364 htmlStr := `<html><body> 365 <header>Header</header> 366 <main> 367 <p>This is the main content area with sufficient text.</p> 368 <p>Additional content paragraph with more details.</p> 369 <p>Third paragraph for completeness and length.</p> 370 </main> 371 <footer>Footer</footer> 372 </body></html>` 373 doc := parseHTML(htmlStr) 374 375 result := extractor.ExtractWithSemanticHTML(doc) 376 377 if result == nil { 378 t.Fatal("Expected result, got nil") 379 } 380 381 if result.ExtractionMethod != "semantic-html" { 382 t.Errorf("Expected semantic-html method, got %s", result.ExtractionMethod) 383 } 384 385 if !strings.Contains(result.Content, "main content area") { 386 t.Error("Expected content from main tag") 387 } 388 }) 389 390 t.Run("falls back to heuristics without semantic tags", func(t *testing.T) { 391 htmlStr := `<html><body> 392 <div class="content"> 393 <p>Content in a regular div without semantic HTML tags.</p> 394 <p>Second paragraph with additional information.</p> 395 <p>Third paragraph for sufficient content.</p> 396 </div> 397 </body></html>` 398 doc := parseHTML(htmlStr) 399 400 result := extractor.ExtractWithSemanticHTML(doc) 401 402 if result == nil { 403 t.Fatal("Expected result, got nil") 404 } 405 406 if result.ExtractionMethod == "semantic-html" { 407 t.Error("Should not use semantic-html method without semantic tags") 408 } 409 }) 410 411 t.Run("returns nil for nil document", func(t *testing.T) { 412 result := extractor.ExtractWithSemanticHTML(nil) 413 414 if result != nil { 415 t.Error("Expected nil for nil document") 416 } 417 }) 418 }) 419 420 t.Run("isBlockElement", func(t *testing.T) { 421 extractor := NewHeuristicExtractor() 422 423 t.Run("identifies block elements", func(t *testing.T) { 424 blockTags := []string{"p", "div", "article", "h1", "section"} 425 426 for _, tag := range blockTags { 427 if !extractor.isBlockElement(tag) { 428 t.Errorf("Expected %s to be a block element", tag) 429 } 430 } 431 }) 432 433 t.Run("identifies non-block elements", func(t *testing.T) { 434 inlineTags := []string{"span", "a", "em", "strong", "code"} 435 436 for _, tag := range inlineTags { 437 if extractor.isBlockElement(tag) { 438 t.Errorf("Expected %s to not be a block element", tag) 439 } 440 } 441 }) 442 }) 443}