tangled
alpha
login
or
join now
desertthunder.dev
/
noteleaf
cli + tui to publish to leaflet (wip) & manage tasks, notes & watch/read lists
charm
leaflet
readability
golang
29
fork
atom
overview
issues
2
pulls
pipelines
feat: add more xpath rules for wikipedia
desertthunder.dev
3 months ago
b5f5e49c
02361b92
+83
2 changed files
expand all
collapse all
unified
split
internal
articles
parser_test.go
rules
.wikipedia.org.txt
+71
internal/articles/parser_test.go
···
263
263
t.Error("Expected edit section markers to be removed from markdown")
264
264
}
265
265
})
266
266
+
267
267
+
t.Run("strips Wikipedia navigation boxes and metadata", func(t *testing.T) {
268
268
+
htmlContent := `<html>
269
269
+
<head><title>Test Navigation Article</title></head>
270
270
+
<body>
271
271
+
<h1 id="firstHeading">Test Navigation Article</h1>
272
272
+
<div id="bodyContent">
273
273
+
<p>Main article content goes here.</p>
274
274
+
<h2>Section One<span class="mw-editsection">[edit]</span></h2>
275
275
+
<p>Section content.</p>
276
276
+
<table class="navbox" role="navigation">
277
277
+
<tr><td>Navigation item 1</td></tr>
278
278
+
<tr><td>Navigation item 2</td></tr>
279
279
+
</table>
280
280
+
<div class="navbox">
281
281
+
<p>Another navigation box</p>
282
282
+
</div>
283
283
+
<table class="vertical-navbox">
284
284
+
<tr><td>Vertical nav item</td></tr>
285
285
+
</table>
286
286
+
<p>More article content.</p>
287
287
+
<div role="navigation">
288
288
+
<p>Navigation content</p>
289
289
+
</div>
290
290
+
<div id="catlinks">
291
291
+
<p>Categories: Test Category</p>
292
292
+
</div>
293
293
+
<div id="footer">
294
294
+
<p>Retrieved from Wikipedia</p>
295
295
+
</div>
296
296
+
</div>
297
297
+
</body>
298
298
+
</html>`
299
299
+
300
300
+
markdown, err := parser.Convert(htmlContent, ".wikipedia.org", "https://en.wikipedia.org/wiki/Test_Navigation")
301
301
+
if err != nil {
302
302
+
t.Fatalf("Expected no error, got %v", err)
303
303
+
}
304
304
+
305
305
+
if !strings.Contains(markdown, "Main article content") {
306
306
+
t.Error("Expected markdown to contain main article content")
307
307
+
}
308
308
+
if !strings.Contains(markdown, "Section content") {
309
309
+
t.Error("Expected markdown to contain section content")
310
310
+
}
311
311
+
if !strings.Contains(markdown, "More article content") {
312
312
+
t.Error("Expected markdown to contain additional content")
313
313
+
}
314
314
+
315
315
+
if strings.Contains(markdown, "Navigation item") {
316
316
+
t.Error("Expected navbox table content to be stripped")
317
317
+
}
318
318
+
if strings.Contains(markdown, "Another navigation box") {
319
319
+
t.Error("Expected navbox div content to be stripped")
320
320
+
}
321
321
+
if strings.Contains(markdown, "Vertical nav item") {
322
322
+
t.Error("Expected vertical-navbox content to be stripped")
323
323
+
}
324
324
+
if strings.Contains(markdown, "[edit]") {
325
325
+
t.Error("Expected edit section markers to be stripped")
326
326
+
}
327
327
+
if strings.Contains(markdown, "Navigation content") {
328
328
+
t.Error("Expected role=navigation content to be stripped")
329
329
+
}
330
330
+
if strings.Contains(markdown, "Categories:") {
331
331
+
t.Error("Expected category links to be stripped")
332
332
+
}
333
333
+
if strings.Contains(markdown, "Retrieved from") {
334
334
+
t.Error("Expected footer content to be stripped")
335
335
+
}
336
336
+
})
266
337
})
267
338
268
339
t.Run("ParseURL", func(t *testing.T) {
+12
internal/articles/rules/.wikipedia.org.txt
···
1
1
title: //h1[@id='firstHeading']
2
2
body: //div[@id = 'bodyContent']
3
3
strip_id_or_class: editsection
4
4
+
strip_id_or_class: mw-editsection
4
5
#strip_id_or_class: toc
5
6
strip_id_or_class: vertical-navbox
7
7
+
strip_id_or_class: navbox
6
8
strip: //*[@id='toc']
7
9
strip: //div[@id='catlinks']
8
10
strip: //div[@id='jump-to-nav']
9
11
strip: //div[@class='thumbcaption']//div[@class='magnify']
10
12
strip: //table[@class='navbox']
13
13
+
strip: //table[contains(@class, 'navbox')]
14
14
+
strip: //div[contains(@class, 'navbox')]
15
15
+
strip: //*[@role='navigation']
11
16
#strip: //table[contains(@class, 'infobox')]
12
17
strip: //div[@class='dablink']
13
18
strip: //div[@id='contentSub']
···
15
20
strip: //*[contains(@class, 'noprint')]
16
21
strip: //span[@class='noexcerpt']
17
22
strip: //math
23
23
+
strip: //div[@id='footer']
24
24
+
strip: //div[@id='footer-info']
25
25
+
strip: //*[contains(@class, 'footer')]
26
26
+
strip: //span[@class='mw-editsection']
27
27
+
strip: //*[contains(@class, 'mw-editsection')]
28
28
+
strip: //*[@id='External_links']//following-sibling::*
29
29
+
strip: //*[@id='See_also']//following-sibling::*
18
30
19
31
http_header(user-agent): Mozilla/5.2
20
32