cli + tui to publish to leaflet (wip) & manage tasks, notes & watch/read lists 🍃
charm leaflet readability golang

feat: add more xpath rules for wikipedia

+83
+71
internal/articles/parser_test.go
··· 263 263 t.Error("Expected edit section markers to be removed from markdown") 264 264 } 265 265 }) 266 + 267 + t.Run("strips Wikipedia navigation boxes and metadata", func(t *testing.T) { 268 + htmlContent := `<html> 269 + <head><title>Test Navigation Article</title></head> 270 + <body> 271 + <h1 id="firstHeading">Test Navigation Article</h1> 272 + <div id="bodyContent"> 273 + <p>Main article content goes here.</p> 274 + <h2>Section One<span class="mw-editsection">[edit]</span></h2> 275 + <p>Section content.</p> 276 + <table class="navbox" role="navigation"> 277 + <tr><td>Navigation item 1</td></tr> 278 + <tr><td>Navigation item 2</td></tr> 279 + </table> 280 + <div class="navbox"> 281 + <p>Another navigation box</p> 282 + </div> 283 + <table class="vertical-navbox"> 284 + <tr><td>Vertical nav item</td></tr> 285 + </table> 286 + <p>More article content.</p> 287 + <div role="navigation"> 288 + <p>Navigation content</p> 289 + </div> 290 + <div id="catlinks"> 291 + <p>Categories: Test Category</p> 292 + </div> 293 + <div id="footer"> 294 + <p>Retrieved from Wikipedia</p> 295 + </div> 296 + </div> 297 + </body> 298 + </html>` 299 + 300 + markdown, err := parser.Convert(htmlContent, ".wikipedia.org", "https://en.wikipedia.org/wiki/Test_Navigation") 301 + if err != nil { 302 + t.Fatalf("Expected no error, got %v", err) 303 + } 304 + 305 + if !strings.Contains(markdown, "Main article content") { 306 + t.Error("Expected markdown to contain main article content") 307 + } 308 + if !strings.Contains(markdown, "Section content") { 309 + t.Error("Expected markdown to contain section content") 310 + } 311 + if !strings.Contains(markdown, "More article content") { 312 + t.Error("Expected markdown to contain additional content") 313 + } 314 + 315 + if strings.Contains(markdown, "Navigation item") { 316 + t.Error("Expected navbox table content to be stripped") 317 + } 318 + if strings.Contains(markdown, "Another navigation box") { 319 + t.Error("Expected navbox div content to be 
stripped") 320 + } 321 + if strings.Contains(markdown, "Vertical nav item") { 322 + t.Error("Expected vertical-navbox content to be stripped") 323 + } 324 + if strings.Contains(markdown, "[edit]") { 325 + t.Error("Expected edit section markers to be stripped") 326 + } 327 + if strings.Contains(markdown, "Navigation content") { 328 + t.Error("Expected role=navigation content to be stripped") 329 + } 330 + if strings.Contains(markdown, "Categories:") { 331 + t.Error("Expected category links to be stripped") 332 + } 333 + if strings.Contains(markdown, "Retrieved from") { 334 + t.Error("Expected footer content to be stripped") 335 + } 336 + }) 266 337 }) 267 338 268 339 t.Run("ParseURL", func(t *testing.T) {
+12
internal/articles/rules/.wikipedia.org.txt
··· 1 1 title: //h1[@id='firstHeading'] 2 2 body: //div[@id = 'bodyContent'] 3 3 strip_id_or_class: editsection 4 + strip_id_or_class: mw-editsection 4 5 #strip_id_or_class: toc 5 6 strip_id_or_class: vertical-navbox 7 + strip_id_or_class: navbox 6 8 strip: //*[@id='toc'] 7 9 strip: //div[@id='catlinks'] 8 10 strip: //div[@id='jump-to-nav'] 9 11 strip: //div[@class='thumbcaption']//div[@class='magnify'] 10 12 strip: //table[@class='navbox'] 13 + strip: //table[contains(@class, 'navbox')] 14 + strip: //div[contains(@class, 'navbox')] 15 + strip: //*[@role='navigation'] 11 16 #strip: //table[contains(@class, 'infobox')] 12 17 strip: //div[@class='dablink'] 13 18 strip: //div[@id='contentSub'] ··· 15 20 strip: //*[contains(@class, 'noprint')] 16 21 strip: //span[@class='noexcerpt'] 17 22 strip: //math 23 + strip: //div[@id='footer'] 24 + strip: //div[@id='footer-info'] 25 + strip: //*[contains(@class, 'footer')] 26 + strip: //span[@class='mw-editsection'] 27 + strip: //*[contains(@class, 'mw-editsection')] 28 + strip: //*[@id='External_links']//following-sibling::* 29 + strip: //*[@id='See_also']//following-sibling::* 18 30 19 31 http_header(user-agent): Mozilla/5.2 20 32